2006-10-29 15:59:09 UTC
updated script for tesseract
converts and scans one or more files.
***
#!/bin/bash
# Converts image files to TIFF with ImageMagick, runs tesseract on them and
# emits the recognised text.

# Defaults; the option parser overrides them.
DELT=TRUE         # TRUE: remove tesseract's output files after use (-o sets FALSE)
DELG=TRUE         # TRUE: remove the converted TIFF after use (-t sets FALSE)
TARGETDIR=/tmp/   # where converted TIFFs are written; must end in "/"
OUTPUT=/tmp/      # where tesseract writes its files; must end in "/"

#######################################
# Print the help screen.
# Outputs: usage text on stdout
#######################################
usage() {
  # Unquoted delimiter on purpose: $0 must expand to the script name.
  cat <<EOF
Usage: $0 [-c] [-s] [-t TIF-DIR] [-o TESS-DIR] [-f FILE] files_to_OCR
scans all files with tesseract and outputs the text on STDOUT
OPTIONS:
-t TIF-DIR saves converted TIF images in target-directory TIF-DIR
-o TESS-DIR saves files created by tesseract in TESS-DIR
-f FILE saves all text output in file FILE
-s suppress output of text on STDOUT (only useful with -t or -o;
do not use with -f)
-c convert with fill white, resize, sigmoidal-contrast, etc..
-p accepted for backward compatibility; has no effect
Examples:
$0 file1 file2 ...
scans the files and displays all the text.
When STDOUT is used for the scanned text, progress and error
messages are redirected to tes.error.log. -f redirects output
to file; STDOUT will show progress and error messages (no log file).
Scanning takes time... be patient :)

EOF
}

# Called without arguments (or with an empty first argument): show the help.
if [[ -z "$1" ]]; then
  usage
  # Exit status stays 0, as before: showing the help is not treated as an error.
  exit 0
fi
# Initial file-descriptor layout (the -f and -s options re-point both later):
#   fd 5 - channel for the recognised text; starts out as a copy of stdout
#   fd 2 - progress and error messages; collected in tes.error.log
exec 5>&1
exec 2>tes.error.log
#######################################
# Parse the command-line options and apply them.
# Globals:
#   TARGETDIR, DELG  written by -t
#   OUTPUT, DELT     written by -o
#   FILE             written by -f
#   CONV             written by -c
#   OPTIND           advanced by getopts; deliberately NOT local, because the
#                    caller shifts the parsed options away with it
# Arguments: the script's command-line arguments
# Side effects: -f and -s re-point fd 2 and fd 5 of the running shell
# Exits: status 1 on an unknown option, a missing option argument, a
#        non-existent directory or an unwritable -f file
#######################################
parse_options() {
  local opt
  # Leading ':' selects getopts' silent mode: problems are reported through
  # the ':' and '?' cases below instead of getopts' own messages.
  # Messages of the failure branches are written to stdout, like the original
  # "no such folder" messages, because stderr may point at a log file here.
  while getopts ":t:o:f:spc" opt; do
    case "$opt" in
      t) # save converted tif files in directory
        if [[ -d "$OPTARG" ]]; then
          TARGETDIR="$OPTARG"/
          DELG=FALSE
        else
          echo "-t: $OPTARG : no such folder"
          exit 1
        fi
        ;;
      o) # save tesseract files in directory
        if [[ -d "$OPTARG" ]]; then
          OUTPUT="$OPTARG"/
          DELT=FALSE
        else
          echo "-o: $OPTARG : no such folder"
          exit 1
        fi
        ;;
      f) # no text on stdout; save it in a file and show messages on stdout
        FILE="$OPTARG"
        if ! exec 5>"$FILE"; then
          echo "-f: $FILE : cannot open for writing"
          exit 1
        fi
        exec 2>&1
        ;;
      s) # suppress the text output; show messages on stdout
        exec 2>&1
        exec 5>/dev/null
        ;;
      c) # convert picture extensively
        CONV=TRUE
        ;;
      p) # accepted for backward compatibility; has no effect
        ;;
      :)
        echo "-$OPTARG: option requires an argument"
        exit 1
        ;;
      \?)
        echo "-$OPTARG: unknown option"
        exit 1
        ;;
    esac
  done
}

parse_options "$@"
echo "TIF-output directory=$TARGETDIR" >&2
echo "tesseract output directory=$OUTPUT" >&2
# Drop the parsed options; only the files to process remain in "$@".
shift $((OPTIND - 1))
#######################################
# Convert one image to TIFF, run tesseract on it and emit the recognised text.
# Globals (read):
#   TARGETDIR  directory prefix (with trailing "/") for the converted TIFF
#   OUTPUT     directory prefix (with trailing "/") for tesseract's files
#   CONV       "TRUE" selects the extensive ImageMagick clean-up
#   DELG       "TRUE" removes the converted TIFF afterwards
#   DELT       "TRUE" removes tesseract's output files afterwards
# Arguments: $1 - path of the image file
# Outputs:   recognised text on fd 5; progress and error messages on stderr
# Returns:   0 on success; 1 if the file is missing, is not an image that
#            identify accepts, cannot be converted, or yields no text file
#######################################
ocr_one_file() {
  local src=$1
  local name stem newtif t_file status=0

  if [[ ! -f "$src" ]]; then
    echo "$src: file not found" >&2
    return 1
  fi
  # identify is only a probe here: all of its output is discarded.
  if ! identify "$src" >/dev/null 2>&1; then
    echo "$src: not convertable" >&2
    return 1
  fi
  echo "processeing: $src ..." >&2

  name=${src##*/}   # file name without any leading directories
  stem=${name%.*}   # ... and without its last extension
  newtif="${TARGETDIR}${stem}.tif"
  t_file="${OUTPUT}${stem}"

  # converting the graphic file (tool output goes to the message channel)
  if [[ $CONV == "TRUE" ]]; then
    convert "$src" -density 150x150 -resize 200% -fill white -tint 50 \
      -level 20%,80%,1.0 -sigmoidal-contrast 30,50% -sharpen 0x2 \
      -compress none -monochrome "$newtif" 1>&2 || status=1
  else
    convert "$src" -density 150x150 -compress none "$newtif" 1>&2 || status=1
  fi

  if (( status != 0 )); then
    echo "$src: conversion to TIFF failed" >&2
  # scanning the newly created tif; tesseract appends ".txt" to the base name
  elif tesseract "$newtif" "$t_file" 1>&2 && [[ -f "$t_file.txt" ]]; then
    # fd 5 is stdout, the -f file or /dev/null, depending on the options
    cat "$t_file.txt" >&5
  else
    echo "$src: tesseract produced no text output" >&2
    status=1
  fi

  # Clean up on success and on failure alike.
  # The TIFF is kept when it is the input file itself (an input that already
  # is <TARGETDIR>/<name>.tif), so the user's original is never deleted.
  if [[ $DELG == "TRUE" && ! "$newtif" -ef "$src" ]]; then
    rm -f -- "$newtif"
  fi
  # -f: not every tesseract version writes .map/.raw files
  if [[ $DELT == "TRUE" ]]; then
    rm -f -- "$t_file.map" "$t_file.raw" "$t_file.txt"
  fi

  return "$status"
}

# "$@" (not $*) keeps file names containing blanks or glob characters intact.
failures=0
for i in "$@"; do
  ocr_one_file "$i" || failures=$((failures + 1))
done
# Status of this test is non-zero when at least one file could not be processed.
(( failures == 0 ))