#!/bin/sh
#
# Breaks simple captchas.
#
# Usage: <script> [-d] [-v] [file]
#   -d  display the assembled image that is handed to the OCR engine
#   -v  debug mode: additionally display every intermediate image
#   -h  print usage and exit
#
# The recognised text is written to stdout, upper-cased and with spaces
# removed.
#
# deps: gocr, imagemagick, multicrop, textcleaner
#
# -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*

# Set here rather than in the shebang so it also applies when the script is
# started as `sh script`.
set -e

NUM_CHARS=4
CHARSET="[0-9A-z]"

# lower number = more dense to search = more rigorous detection
DENSITY=3

# 0-100, higher numbers will force strict matches
CERTAINTY=0

# 16: do not divide overlapping chars
# 32: do not context correct
MODE='-m 16 -m 32'

IMAGE_PROG=mpvimg

# -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*

# Print a usage line to stderr and exit with status 1.
usage() {
    >&2 printf "Usage: %s [-d] [-v] [file]\n" "${0##*/}"
    exit 1
}

# Create a private scratch directory and register its removal.
# Globals written:
#   dir     - scratch directory holding all temporary files
#   tmp     - path of the pre-processed captcha image inside $dir
#   viewers - PIDs of image viewers started by view_image
init() {
    # The template avoids dots so the directory name can never be mistaken
    # for an image file extension.
    dir=$(mktemp -d "${TMPDIR:-/tmp}/captcha-XXXXXX")
    tmp=$dir/captcha
    viewers=

    # The EXIT trap does the cleanup; the signal traps only turn the signal
    # into a regular exit so the EXIT trap runs in every POSIX shell.
    trap 'clean' EXIT
    trap 'exit 130' INT
    trap 'exit 143' TERM
}

# Remove the scratch directory, if one was created.
clean() {
    if [ -n "${dir:-}" ] && [ -d "$dir" ] ; then
        rm -rf -- "$dir"
    fi
}

# Abort with an error unless $1 names an existing regular file.
check() {
    if [ ! -f "$1" ] ; then
        >&2 echo "No such image: $1"
        exit 1
    fi
}

# If $1 is a JPEG, convert it to a PNG inside the scratch directory and point
# $image at the copy. The caller's original file is left untouched.
# Requires init to have been called ($dir must exist).
jpg2png() {
    # -b drops the file name from the output so only the detected type is
    # matched, never a path that happens to contain "jpg".
    case $(file -b "$1") in
        *jpg*|*JPG*|*jpeg*|*JPEG*)
            convert "$1" "$dir/input.png"
            image=$dir/input.png
            ;;
    esac
}

# Open $1 in $IMAGE_PROG in the background and remember the viewer's PID so
# the script can wait for it before the scratch directory is removed.
view_image() {
    # $IMAGE_PROG is intentionally unquoted so it may carry arguments.
    # shellcheck disable=2086
    $IMAGE_PROG "$1" &
    viewers="$viewers $!"
}

# Run the whole pipeline on the image file $1 and print the recognised text.
main() {
    image=$1
    check "$image"
    init

    # convert to png if necessary
    jpg2png "$image"

    # clean up text and make it black on a white background
    # text will be evenly spaced but not aligned
    convert \
        -quality 100 \
        -shave 1x1 \
        -resize 500%x500% \
        -threshold 50% \
        -deskew 50 \
        -trim \
        -bordercolor white \
        -border 20 \
        "$image" "$tmp"

    if [ "$DEBUG" = true ] ; then
        view_image "$tmp"
    fi

    # IMPORTANT: multicrop reads from top-to-bottom, while
    # we need to read left-to-right as we are reading text.
    #
    # Without rotation the resulting letters are mixed at random.
    convert -rotate 90 "$tmp" "$tmp"

    multicrop -u 1 -f 20 -g "${DENSITY:-5}" \
        -b white "$tmp" "$dir/multicut.png" >/dev/null

    # Fail with a clear message when nothing was cropped; otherwise the
    # unexpanded glob would be handed to the tools below.
    set -- "$dir"/multicut-*.png
    if [ ! -e "$1" ] ; then
        >&2 echo "No characters found in image: $image"
        exit 1
    fi

    # expand borders a little to help with ocr; one background job per crop
    workers=
    for i in "$dir"/multicut-*.png ; do
        {
            # and rotate 270 to put us back where we were
            convert -rotate 270 "$i" "$i"

            textcleaner -g -e stretch -f 25 -o 10 -s 1 "$i" "$i"

            convert \
                -quality 100 \
                -monochrome \
                -trim \
                -bordercolor white \
                -border 100 \
                "$i" "$i"
        } &
        workers="$workers $!"
    done

    # Wait only for the crop workers: a bare `wait` would also block on a
    # debug viewer that is still open. A failed worker stays non-fatal.
    for pid in $workers ; do
        wait "$pid" || :
    done

    if [ "$DEBUG" = true ] ; then
        for i in "$dir"/multicut-*.png ; do
            view_image "$i"
        done
    fi

    montage "$dir"/multicut-*.png -tile "${NUM_CHARS:-4}"x1 "$dir/out.png"

    if [ "$DEBUG" = true ] || [ "$DISPLAY_RESULT" = true ] ; then
        view_image "$dir/out.png"
    fi

    # $MODE is intentionally unquoted: it holds several separate options.
    # CHARSET must be quoted, otherwise the shell treats it as a glob and
    # replaces it with any matching one-character file name in the cwd.
    # NOTE(review): gocr's manual appears to describe -C as the recognition
    # filter and -c as a debug character list - confirm which was intended.
    # shellcheck disable=2086
    gocr -u '?' $MODE \
        -a "${CERTAINTY:-0}" \
        -c "${CHARSET:-[0-9A-z]}" \
        "$dir/out.png" 2>/dev/null |
        sed 's/ //g' | tr '[:lower:]' '[:upper:]'

    # Viewers read their files asynchronously, so keep the scratch directory
    # alive until every viewer has been closed; the EXIT trap then removes it.
    for pid in $viewers ; do
        wait "$pid" || :
    done
}

while [ "$1" ] ; do
    case $1 in
        -h) usage ;;
        -v) DEBUG=true ; shift ;;
        -d) DISPLAY_RESULT=true ; shift ;;
        *) break ;;
    esac
done

main "$@"