Script to extract text from images and scanned PDF files

Posted by Eric Scheibler on April 13, 2015

For friends of the text console, I’ve created a small shell script that extracts text from images and scanned PDF files. You can specify as many input files as you want. The results are merged into a single text file, which you can open in your favorite text editor or pipe to stdout. The program Tesseract is used for the text recognition.

Some examples:

ocr image.jpg
ocr image.png document.pdf
ocr http://example.org/image.jpg
ocr -l eng -r layout -o image*
ocr document.pdf | grep -i search

Download: ocr

#!/bin/bash

# This script uses Tesseract to extract text from images and PDF files
# Supports local files and URLs
#
# Version: 0.6
# Date:    2024-04-29
# License: GNU General Public License
# Author:  Eric Scheibler
# E-Mail:  email [at] eric-scheibler [dot] de
# URL:     http://eric-scheibler.de/en/blog/2015/04/script-to-extract-text-from-images-and-scanned-pdf-files/
#
# To use it, you have to install curl, imagemagick, pdftk, poppler-utils and tesseract ...
#       sudo apt install curl imagemagick pdftk poppler-utils tesseract-ocr
# ... as well as packages for your desired languages
#       sudo apt install tesseract-ocr-deu tesseract-ocr-deu-frak tesseract-ocr-eng

# define some variables (defaults, changed by the command line options)
exclude_page_numbers=0  # -x sets this to 1
force_ocr=0             # -f sets this to 1
open_in_editor=0        # -o sets this to 1
pdf_reading_order=''    # -r stores "-raw" or "-layout" here, handed to pdftotext
ocr_language="deu"      # -l, handed to tesseract -l
psm=12                  # -p, handed to tesseract --psm

# temp folder and file
# all intermediate files and the merged result live in a private folder
temp_folder=$(mktemp -d) || {
    echo "Could not create a temporary folder" >&2
    exit 1
}
result_text_file="$temp_folder/result.txt"
# single quotes: the folder name is expanded (and quoted) when the trap fires,
# so a temp path containing spaces is removed as one argument
trap 'rm -rf -- "$temp_folder"' EXIT

# cleanup PATH
# Remove PATH when it names an existing regular file; anything else
# (empty argument, missing file, directory) is left untouched.
cleanup() {
    [[ -f "$1" ]] || return 0
    rm "$1"
}


# check parameters

# print_usage
# Write the help text shown for -h to stdout.
print_usage() {
    cat <<'EOF'
ocr [-f] [-l language] [-o] [-p psm] [-r mode] [-x] file_1 [file_2 ... file_n]
This script extracts text from image and PDF files

General options:
    -l lang    3-letter language code (deu, eng, ...)
    -p psm     psm value for tesseract (default is 12)
    -o         Open results in the default text editor
    -x         Exclude page numbers

PDF document options:
    -f         force ocr in already tagged pdf documents
    -r mode    pdf document reading order (layout or raw)

examples
    ocr image1.png image2.jpg
    ocr -l eng document.pdf
    ocr -x https://path_to_image
EOF
}

# parse_options ARG...
# Parse the command line options and store them in the global settings
# (force_ocr, open_in_editor, ocr_language, psm, pdf_reading_order,
# exclude_page_numbers). OPTIND is left pointing at the first non-option
# argument. Invalid input prints a message to stderr and exits with status 1;
# -h prints the usage and exits with status 0.
parse_options() {
    local opt
    OPTIND=1
    while getopts ":hfl:p:r:ox" opt; do
        case "$opt" in
            h)
                print_usage
                exit 0
                ;;
            f)
                force_ocr=1
                ;;
            o)
                open_in_editor=1
                ;;
            l)
                ocr_language=$OPTARG
                if (( ${#ocr_language} != 3 )); then
                    echo "Invalid language parameter $ocr_language" >&2
                    exit 1
                fi
                ;;
            p)
                # accept digits only before doing arithmetic: an arbitrary
                # string inside (( )) is evaluated as an expression and may
                # even run embedded command substitutions
                if [[ ! $OPTARG =~ ^[0-9]+$ ]] || (( 10#$OPTARG < 1 || 10#$OPTARG > 13 )); then
                    echo "Invalid value for psm: number between 1 and 13" >&2
                    exit 1
                fi
                # 10# forces base 10, so "08" is read as 8 instead of bad octal
                psm=$(( 10#$OPTARG ))
                ;;
            r)
                pdf_reading_order=$OPTARG
                if [[ "$pdf_reading_order" != "raw" && "$pdf_reading_order" != "layout" ]]; then
                    echo "Invalid pdftotext layout parameter $pdf_reading_order. Choose between 'layout' and 'raw'" >&2
                    exit 1
                fi
                # prepare for pdftotext command below
                pdf_reading_order="-$pdf_reading_order"
                ;;
            x)
                exclude_page_numbers=1
                ;;
            \?)
                echo "Invalid option -$OPTARG" >&2
                exit 1
                ;;
            :)
                echo "Option -$OPTARG requires an argument" >&2
                exit 1
                ;;
        esac
    done
}

parse_options "$@"
# drop the parsed options, only the input files remain
shift $((OPTIND - 1))

# at least one input file has to be left once the options are consumed
[[ $# -gt 0 ]] || {
    echo "Missing file(s)"
    exit 1
}

# pdf_has_text TEXT
# Succeed if TEXT contains at least one character that is neither whitespace
# nor a control character (form feeds and blank lines alone are not text).
pdf_has_text() {
    [[ $1 == *[![:space:][:cntrl:]]* ]]
}

# count_form_feeds TEXT
# Print the number of form feed characters in TEXT. pdftotext ends every page
# with a form feed, so this is the page count of its output.
count_form_feeds() {
    local count
    count=$(printf '%s' "$1" | tr -cd '\f' | wc -c)
    printf '%d\n' "$((count))"
}

# add_page_headers FIRST_PAGE FILE_NAME
# Read form feed separated pages from stdin and write them to stdout, each one
# preceded by a "-- Page N  --  File: FILE_NAME --" header. N starts at
# FIRST_PAGE and grows by one per page.
add_page_headers() {
    # the name travels through the environment because awk -v would
    # interpret backslash sequences in it
    OCR_HEADER_FILE_NAME=$2 awk -v page="$1" '
        BEGIN { RS = "\f"; ORS = "" }
        { printf "-- Page %d  --  File: %s --\n\n%s\n", page++, ENVIRON["OCR_HEADER_FILE_NAME"], $0 }
    '
}

# tif_path_for PATH
# Print PATH with the extension of its file name replaced by ".tif". Only the
# last path component is inspected, so dots in directory names are ignored.
tif_path_for() {
    local path=$1 dir="" name stem
    if [[ $path == */* ]]; then
        dir=${path%/*}/
    fi
    name=${path##*/}
    stem=${name%.*}
    # a name like ".scan" has nothing in front of the dot: keep it whole
    [[ -n $stem ]] || stem=$name
    printf '%s\n' "$dir$stem.tif"
}

page_number=1
for file in "$@"; do
    echo "Processing file $file" >&2
    # strip path
    base_filename=${file##*/}

    # download
    downloaded_into=""
    if [[ $file == http://* || $file == https://* ]]; then
        if [[ -z $base_filename ]]; then
            # the URL ends with a slash, there is no name to reuse
            base_filename="downloaded_by_ocr_script"
        fi
        # store the download inside the private temp folder, a fixed name
        # in /tmp could overwrite (and later delete) somebody else's file
        mkdir -p -- "$temp_folder/downloads"
        downloaded_into="$temp_folder/downloads/$base_filename"
        if curl --fail --silent --show-error -o "$downloaded_into" "$file"; then
            echo "Downloaded $downloaded_into from $file" >&2
        else
            echo "Download of $file failed" >&2
            rm -f -- "$downloaded_into"
        fi
        file="$downloaded_into"
    fi

    # check if file exists
    if [[ ! -f "$file" ]]; then
        printf -- '-- Page %s --\n%s does not exist or is not a file.\n\n' \
            "$page_number" "$file" >> "$result_text_file"
        page_number=$(( page_number + 1 ))
        continue
    fi

    # if it's a pdf file, check if it already contains text
    # if so, copy the text into the result file and continue with the next one
    if [[ $force_ocr -eq 0 && "${file,,}" == *.pdf ]]; then
        # the reading order option is only passed when -r was given
        pdf_contents=$(pdftotext -q ${pdf_reading_order:+"$pdf_reading_order"} "$file" -)
        if pdf_has_text "$pdf_contents"; then
            # found some text
            if [[ $exclude_page_numbers -eq 0 ]]; then
                printf '%s' "$pdf_contents" \
                    | add_page_headers "$page_number" "${file##*/}" >> "$result_text_file"
            else
                printf '%s\n' "$pdf_contents" >> "$result_text_file"
            fi
            page_number=$(( page_number + $(count_form_feeds "$pdf_contents") ))
            cleanup "$downloaded_into"
            continue
        fi
    fi

    # every input gets its own working folder, so the globs below can never
    # pick up the result file or leftovers of another input
    work_dir=$(mktemp -d "$temp_folder/work.XXXXXX") || {
        echo "Could not create a working folder for $file" >&2
        cleanup "$downloaded_into"
        continue
    }

    # if it's a pdf file without text, split it into single pages and convert them to ppm format
    # otherwise copy the file into the working folder
    if [[ "${file,,}" == *.pdf ]]; then
        pdftk "$file" burst output "$work_dir/page-%04d.pdf"
        rm -f -- "$work_dir/doc_data.txt"
        for i in "$work_dir"/*.pdf; do
            [[ -e $i ]] || continue
            pdftoppm -r 600 "$i" "${i%.pdf}"
            rm -f -- "$i"
        done
    else
        cp -- "$file" "$work_dir/"
    fi

    # convert to tif
    for i in "$work_dir"/*; do
        [[ -e $i ]] || continue
        tif_file=$(tif_path_for "$i")
        convert "$i" -type Grayscale "$tif_file"
        # an input that already is a .tif was converted in place: keep it
        [[ $i == "$tif_file" ]] || rm -f -- "$i"
    done

    # start tesseract
    for i in "$work_dir"/*.tif; do
        [[ -e $i ]] || continue
        tesseract -l "$ocr_language" --psm "$psm" "$i" "${i%.tif}" 2> /dev/null
        rm -f -- "$i"
    done

    # concatenate text files
    for i in "$work_dir"/*.txt; do
        [[ -e $i ]] || continue
        if [[ $exclude_page_numbers -eq 0 ]]; then
            printf -- '-- Page %s  --  File: %s --\n\n' "$page_number" "${file##*/}" >> "$result_text_file"
        else
            printf '\n\n' >> "$result_text_file"
        fi
        cat -- "$i" >> "$result_text_file"
        page_number=$(( page_number + 1 ))
    done
    rm -rf -- "$work_dir"

    # cleanup download from above
    cleanup "$downloaded_into"
done

# hand over the merged text: to stdout by default, to $EDITOR when -o was given
if [[ $open_in_editor -ne 1 ]]; then
    # pipe to stdout
    cat "$result_text_file"
elif [[ -n "$EDITOR" ]]; then
    "$EDITOR" "$result_text_file"
else
    echo "EDITOR not set. Please set your preferred editor in the EDITOR environment variable."
    exit 1
fi