#!/bin/bash

# Script to control an ADF scanner
# - start scanning and create a single pdf file
# - with empty page and orientation detection
# - tested with Fujitsu SP-1120
#
# ... excessively borrowed from https://github.com/rocketraman/sane-scan-pdf
#
# Version: 0.1
# Date:    2021-06-16
# License: GNU General Public License
# Author:  Eric Scheibler
# E-Mail:  email [at] eric-scheibler [dot] de
# URL:     http://eric-scheibler.de/en/blog/2015/04/script-to-extract-text-from-images-and-scanned-pdf-files/
#
# Install:
#   sudo apt install imagemagick poppler-utils sane tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng unpaper

# ----- general defaults (may be overridden on the command line) -----

# name of the resulting pdf file (-o / --output)
OUTPUT='scan.pdf'
# program that is offered for opening the finished pdf; empty = never ask
PDF_VIEWER="${HOME}/bin/ocr"
# program that displays the OCR preview of the first page
TEXT_EDITOR='/usr/bin/vim'
# flags, switched to 1 by -v / -h
VERBOSE=0
HELP=0

# ----- scanner params -----
# values handed to scanadf as --device-name, --mode and --resolution
DEVICE='pfusp'
MODE='Lineart'
RESOLUTION=400

# ----- ocr params -----
# language passed to tesseract via -l
OCR_LANGUAGE='deu'
# 1 = replace an already existing output file (-w)
OVERWRITE_OUTPUT_FILE=0
# 1 = OCR the first page and show it before processing the rest (-p)
OCR_PREVIEW_FIRST_PAGE=0


#####

# Private working directory for the raw scans and every intermediate file.
# With GNU mktemp an empty -p argument means: use $TMPDIR if set, else /tmp.
# Abort right away when it cannot be created - with an empty TMP_DIR all later
# paths of the form "$TMP_DIR/scan-..." would point into the root directory.
if ! TMP_DIR=$(mktemp -d -p "" scan.XXXXXXXXXX); then
    echo >&2 "Could not create temporary directory. Aborting."
    exit 1
fi

# cleanup
#   Delete the working directory including its content.
#   Installed as EXIT trap below, so it runs on every way out of the script.
cleanup() {
    # never hand an empty path to "rm -rf"; "--" protects odd directory names
    if [[ -n "$TMP_DIR" ]]; then
        rm -rf -- "$TMP_DIR"
    fi
}
trap cleanup EXIT

# yes_or_no PROMPT...
#   Ask a yes/no question on the terminal and repeat it until the answer
#   starts with y/Y or n/N.
#   Arguments: the words of the question, " [y/n]: " is appended
#   Outputs:   "Aborted" on stdout when the answer is no or input has ended
#   Returns:   0 for yes, 1 for no or end of input
function yes_or_no {
    local yn eof
    while true; do
        eof=0
        # -r: keep backslashes literal. A failing read means that stdin is
        # exhausted or closed; remember that, otherwise the loop never ends.
        read -r -p "$* [y/n]: " yn || eof=1
        case "$yn" in
            [Yy]*) return 0 ;;
            [Nn]*) echo "Aborted" ; return 1 ;;
        esac
        if (( eof )); then
            # no usable answer can arrive anymore, treat it like "no"
            echo "Aborted"
            return 1
        fi
    done
}


# Parse command-line options
#
# parse_args [ARG...]
#   Walk over the given arguments and store the recognised options in the
#   global settings: HELP, VERBOSE, OUTPUT, DEVICE, MODE, RESOLUTION,
#   OCR_LANGUAGE, OCR_PREVIEW_FIRST_PAGE and OVERWRITE_OUTPUT_FILE.
#   Unknown arguments are reported on stderr and skipped.
#   Exits with status 1 when an option that needs a value is the last argument.
parse_args() {
    # arithmetic test; ">" inside [[ ]] would compare the operands as strings
    while (( $# > 0 )); do
        case "$1" in
        -h|--help) HELP=1 ;;
        -v|--verbose) VERBOSE=1 ;;
        -p|--preview-first-page) OCR_PREVIEW_FIRST_PAGE=1 ;;
        -w|--overwrite-output-file) OVERWRITE_OUTPUT_FILE=1 ;;
        -o|--output|-x|--device|-m|--mode|-r|--resolution|-l|--language)
            # all of these consume the following argument as their value
            if (( $# < 2 )); then
                echo >&2 "Option $1 requires an argument. Aborting."
                exit 1
            fi
            case "$1" in
            -o|--output) OUTPUT=$2 ;;
            -x|--device) DEVICE=$2 ;;
            -m|--mode) MODE=$2 ;;
            -r|--resolution) RESOLUTION=$2 ;;
            -l|--language) OCR_LANGUAGE=$2 ;;
            esac
            shift     # drop the option name, its value goes with the shift below
            ;;
        *)
            echo >&2 "Ignoring unknown argument: $1"
            ;;
        esac
        shift     # next option
    done
}
parse_args "$@"

# print_help
#   Write the usage text to stdout.
#   Reads $0 for the program name and TEXT_EDITOR for the preview description.
print_help() {
    echo "$(basename -- "$0") [OPTIONS]... [OUTPUT]"
    echo ""
    echo "OPTIONS"
    echo " -x, --device"
    echo "    Override scanner device name, defaulting to \"pfusp\""
    echo " -m, --mode"
    echo "     Mode e.g. Lineart (default), Halftone, Gray, Color, etc."
    echo " -r, --resolution"
    # the built-in default resolution is 400, not 300
    echo "     Resolution e.g 300 (default: 400)"
    echo " -l, --language <lang>"
    echo "     which language to use for OCR"
    echo " -p, --preview-first-page"
    echo "     OCR first page and preview in $TEXT_EDITOR"
    echo ""
    echo "OUTPUT"
    echo " -o, --output <outputfile>"
    echo "     Output to named file default=scan.pdf"
    echo " -w, --overwrite-output-file"
    echo "     Overwrite the output pdf file, if it already exists"
    echo " -v, --verbose"
    echo "     Do not suppress the messages of unpaper and tesseract"
    echo " -h, --help"
    echo "     Show this help and exit"
}

# the help request wins over every other option: print and stop
if [[ $HELP == 1 ]]; then
    print_help
    exit 0
fi

# Quiet mode (VERBOSE is 0): prepare the pieces that silence the helper tools.
#   suppress_error_messages - redirection text, appended to eval'ed commands
#   quiet_paran             - extra option handed to unpaper
# In verbose mode both stay unset and therefore expand to nothing.
case "$VERBOSE" in
    0)
        suppress_error_messages="2> /dev/null"
        quiet_paran="--quiet"
        ;;
esac

# Validate the output path before any paper is fed through the scanner.
if [[ -z "$OUTPUT" ]]; then
    echo >&2 "Output file must be specified. Aborting."
    exit 1
fi

# An existing file is only replaced when -w / --overwrite-output-file was given.
# It is removed up front because the check at the end of the script takes the
# mere existence of $OUTPUT as proof that this run produced a pdf.
if [[ -f "$OUTPUT" ]]; then
    if [[ $OVERWRITE_OUTPUT_FILE == 0 ]]; then
        echo >&2 "Output file $OUTPUT already exists. Aborting."
        exit 1
    fi
    # "--" keeps a file name that starts with "-" from being read as an option;
    # stop when the file cannot be removed instead of scanning in vain
    if ! rm -- "$OUTPUT"; then
        echo >&2 "Could not remove existing output file $OUTPUT. Aborting."
        exit 1
    fi
fi


# Scan all sheets from the document feeder (both sides) into numbered image
# files "$TMP_DIR/scan-0001", "$TMP_DIR/scan-0002", ...
# Every expansion is quoted so that a temporary directory or option value
# containing spaces reaches scanadf as one single argument.
echo >&2 "Scanning..."
if ! scanadf --device-name "$DEVICE" --source Adf-duplex \
        --resolution "$RESOLUTION" --mode "$MODE" \
        -o "$TMP_DIR/scan-%04d"; then
    echo >&2 "Scanning failed. Aborting."
    exit 1
fi
echo ""


# nullglob: a pattern without match expands to nothing, which is what makes
# the "no scans" / "no pdfs" counts below come out as 0
shopt -s extglob nullglob

# run_tool CMD [ARG...]
#   Run CMD with its arguments passed through unchanged. While VERBOSE is 0
#   the stderr of CMD is discarded, otherwise it is left alone.
#   The arguments are never re-parsed by the shell, so paths and option values
#   may contain spaces or shell metacharacters.
#   Returns: exit status of CMD
run_tool() {
    if [[ $VERBOSE == 0 ]]; then
        "$@" 2> /dev/null
    else
        "$@"
    fi
}

# is_blank_page PERCENT_WHITE [THRESHOLD]
#   Arguments: PERCENT_WHITE - share of white pixels in percent
#              THRESHOLD     - minimum share that counts as blank, default 99.8
#   Returns:   0 if PERCENT_WHITE >= THRESHOLD, 1 otherwise
#   A non-numeric value is treated as 0. awk is used for the floating point
#   comparison; LC_ALL=C pins "." as the decimal separator.
is_blank_page() {
    LC_ALL=C awk -v p="$1" -v t="${2:-99.8}" \
        'BEGIN { exit !((p + 0) >= (t + 0)) }'
}

image_files=("$TMP_DIR"/scan-[0-9]*)
num_scans=${#image_files[@]}
if (( num_scans > 0 )); then

    if [[ $OCR_PREVIEW_FIRST_PAGE == 1 ]]; then
        echo "Creating preview..."
        preview_image_file="${image_files[0]}"
        preview_text_file="$TMP_DIR/preview_first_page.txt"
        # ocr; tesseract appends ".txt" to the output base on its own
        run_tool tesseract "$preview_image_file" "${preview_text_file%.*}" \
            -l "$OCR_LANGUAGE" --psm 12
        # show; TEXT_EDITOR stays unquoted so that it may carry arguments
        $TEXT_EDITOR "$preview_text_file"
        if ! yes_or_no "Proceed?"; then
            exit 0
        fi
        # remove preview text file (-f: it is missing if the OCR run failed)
        rm -f -- "$preview_text_file"
        echo ""
    fi

    echo "Processing $num_scans pages"
    for image_file in "${image_files[@]}"; do
        page_name=${image_file##*/}
        tiff_file="${image_file}.tiff"
        echo "Process $page_name"

        # unpaper, in place; the --quiet option is only present in quiet mode
        run_tool unpaper ${quiet_paran:+"$quiet_paran"} --overwrite \
            --dpi "$RESOLUTION" "$image_file" "$image_file"

        # convert to tiff
        if ! convert -density "${RESOLUTION}x${RESOLUTION}" -units PixelsPerInch \
                "$image_file" "$tiff_file"; then
            # without the tiff none of the following steps can work
            echo >&2 "Could not convert $page_name to tiff, page skipped"
            echo ""
            continue
        fi
        rm -- "$image_file"

        # orientation detection; an empty result means tesseract failed,
        # which is used below as a hint that the page carries no text
        orientation_result=$(run_tool tesseract "$tiff_file" - --psm 0) || orientation_result=
        if [[ $orientation_result == *"Rotate: 180"* ]]; then
            echo "Image orientation is upside down, rotate"
            convert -rotate 180 "$tiff_file" "$tiff_file"
        fi

        # empty page detection: share of pure white pixels in percent,
        # 0 (= not empty) if it cannot be determined
        percentage_white=$(convert "$tiff_file" -fuzz 0% -negate -threshold 0 -negate -format "%[fx:100*mean]" info:) || percentage_white=0
        if is_blank_page "$percentage_white" && [[ -z $orientation_result ]]; then
            echo "Empty page removed"
        else
            # writes "${image_file}.pdf"
            if ! run_tool tesseract "$tiff_file" "$image_file" -l "$OCR_LANGUAGE" pdf; then
                echo >&2 "OCR failed for $page_name, page is missing in the result"
            fi
        fi
        rm -f -- "$tiff_file"

        echo ""
    done

    # rename or unite created pdf(s)
    pdf_files=("$TMP_DIR"/scan-[0-9]*.pdf)
    num_pdf_files=${#pdf_files[@]}
    if (( num_pdf_files == 1 )); then
        echo "Renaming..."
        mv -- "${pdf_files[0]}" "$OUTPUT"
    elif (( num_pdf_files > 1 )); then
        echo "Concatenating pdfs..."
        pdfunite "${pdf_files[@]}" "$OUTPUT"
    fi
fi

# report_result
#   Tell the user how the run ended. If the file named by OUTPUT exists,
#   "Done." is printed and - unless PDF_VIEWER is empty - the user is asked
#   whether the file should be opened with it. Otherwise "No scans found."
#   is printed.
report_result() {
    if [[ -f "$OUTPUT" ]]; then
        echo "Done."
        if [[ $PDF_VIEWER != "" ]]; then
            if yes_or_no "Open ${OUTPUT}?"; then
                # PDF_VIEWER stays unquoted so that it may carry arguments;
                # the file name is quoted to survive spaces
                $PDF_VIEWER "$OUTPUT"
            fi
        fi
    else
        echo "No scans found."
    fi
}
report_result

