#!/bin/sh -e
#
# Breaks simple captchas.
#
# deps: gocr, imagemagick, multicrop, textcleaner
#

# -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
# Tile count used when the cropped characters are joined into one row.
NUM_CHARS=4
# Character list handed to gocr via -c.
CHARSET="[0-9A-z]"

# lower number = more dense to search = more rigorous detection
DENSITY=3

# 0-100, higher numbers will force strict matches
CERTAINTY=0

# 16: do not divide overlapping chars
# 32: do not context correct
MODE='-m 16 -m 32'

# Viewer started on intermediate/final images when -v or -d is given.
IMAGE_PROG=mpvimg
# -*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*

# Print a one-line usage summary on stderr and terminate with status 1.
# The program name shown is the basename of $0.
usage() {
  printf 'Usage: %s [-d] [-v] [file]\n' "${0##*/}" >&2
  exit 1
}

# Create a private scratch directory for intermediate images.
#
# Globals written:
#   dir - newly created temporary directory
#   tmp - path inside $dir where the preprocessed image will be written
#         (no extension, so ImageMagick keeps the input image's format)
#
# mktemp -d creates a unique directory atomically. The previous scheme
# derived the name from the clock and the PID, which is predictable, and
# removed whatever already existed at that path before reusing it.
# Exits with status 1 when the directory cannot be created.
init() {
  dir=$(mktemp -d "${TMPDIR:-/tmp}/captcha.XXXXXX") || {
    >&2 echo "Cannot create temporary directory"
    exit 1
  }
  tmp=$dir/image
}

# Remove the scratch directory named by $dir, if one has been set.
#
# Best-effort: always returns 0 and prints nothing.
# When $dir is unset or empty nothing is removed. The earlier
# "${dir:-?}" expanded to a literal "?" in that case and would delete a
# file of that name in the current directory.
clean() {
  [ -n "${dir:-}" ] || return 0
  # -f: never prompt (stderr is discarded, so a prompt would be invisible)
  rm -rf -- "$dir" 2>/dev/null || :
}

# Verify that $1 names an existing regular file.
# Returns 0 when it does; otherwise reports the path on stderr and
# terminates the script with status 1.
check() {
  [ -f "$1" ] && return 0
  printf 'No such image: %s\n' "$1" >&2
  exit 1
}

# Convert a JPEG image to PNG, replacing the file on disk.
#
# Arguments:
#   $1 - path of the image to inspect
# Globals written:
#   image - set to the path of the PNG, only when a conversion happened
#
# Nothing is done unless file(1) describes the content as JPEG.
# "file -b" leaves the file name out of the description, so a path that
# merely contains "jpg" no longer triggers a conversion.
# The source file is removed after conversion, except when the PNG path
# is the source path itself (JPEG data stored under a *.png name), where
# removing it would delete the freshly written result.
jpg2png() {
  case $(file -b -- "$1") in
    *jpg* | *JPG* | *jpeg* | *JPEG*) ;;
    *) return 0 ;;
  esac

  # Swap the extension of the last path component only; a dot that
  # belongs to a directory name must not be treated as an extension.
  case ${1##*/} in
    ?*.*) jpg2png_out=${1%.*}.png ;;
    *) jpg2png_out=$1.png ;;
  esac

  convert "$1" "$jpg2png_out"
  [ "$jpg2png_out" = "$1" ] || rm -- "$1"

  image=$jpg2png_out
  unset jpg2png_out
}

# Run the whole recognition pipeline on one captcha image.
#
# Arguments:
#   $1 - path to the captcha image
# Globals read:
#   DEBUG, DISPLAY_RESULT, IMAGE_PROG, DENSITY, NUM_CHARS, MODE,
#   CERTAINTY, CHARSET, and dir/tmp as set by init
# Globals written:
#   image
# Output:
#   the text reported by gocr on stdout, spaces removed and upper-cased
#
# Steps: validate the input, create the scratch directory, normalise the
# image, split it into single characters with multicrop, clean each
# character, join them into one row and hand that row to gocr.
main() {
  image=$1
  check "$image"

  init
  # Remove the scratch directory on every exit path. The signal traps
  # only call exit so that the EXIT trap performs the cleanup.
  trap clean EXIT
  trap 'exit 1' HUP INT TERM

  # convert to png if necessary (pass the path, not the word "image")
  jpg2png "$image"

  # clean up text and make it black on a white background
  # text will be evenly spaced but not aligned
  convert \
    -quality 100 \
    -shave 1x1 \
    -resize 500%x500% \
    -threshold 50% \
    -deskew 50 \
    -trim \
    -bordercolor white \
    -border 20 \
    "$image" "$tmp"

  if [ "$DEBUG" = true ]; then
    $IMAGE_PROG "$tmp" &
  fi

  # IMPORTANT: multicrop reads from top-to-bottom, while
  # we need to read left-to-right as we are reading text.
  #
  # Without rotation the resulting letters are mixed at random.
  convert -rotate 90 "$tmp" "$tmp"
  multicrop -u 1 -f 20 -g "${DENSITY:-5}" \
    -b white "$tmp" "$dir/multicut.png" >/dev/null

  # Collect the cropped characters once; the positional parameters are
  # reused for the loops and for montage below.
  set -- "$dir"/multicut-*.png
  if [ ! -e "$1" ]; then
    >&2 echo "No characters could be cropped from: $image"
    exit 1
  fi

  # expand borders a little to help with ocr
  crop_jobs=
  for piece in "$@"; do
    {
      # and rotate 270 to put us back where we were
      convert -rotate 270 "$piece" "$piece"

      textcleaner -g -e stretch -f 25 -o 10 -s 1 "$piece" "$piece"

      convert \
        -quality 100 \
        -monochrome \
        -trim \
        -bordercolor white \
        -border 100 \
        "$piece" "$piece"
    } &
    crop_jobs="$crop_jobs $!"
  done
  # Wait for the clean-up jobs only; a bare `wait` would also block on
  # the debug viewer started above. Failures stay non-fatal.
  for job in $crop_jobs; do
    wait "$job" || >&2 echo "warning: cleaning a cropped character failed"
  done

  if [ "$DEBUG" = true ]; then
    for piece in "$@"; do
      $IMAGE_PROG "$piece" &
    done
  fi

  montage "$@" -tile "${NUM_CHARS:-4}"x1 "$dir/out.png"

  if [ "$DEBUG" = true ] || [ "$DISPLAY_RESULT" = true ]; then
    $IMAGE_PROG "$dir/out.png" &
  fi

  # $MODE holds several options and is split into words on purpose;
  # CERTAINTY and CHARSET are quoted so that a value such as [0-9A-z]
  # is not expanded against file names in the current directory.
  # shellcheck disable=2086
  gocr -u '?' $MODE \
    -a "${CERTAINTY:-0}" \
    -c "${CHARSET:-[0-9A-z]}" \
    "$dir/out.png" 2>/dev/null |
    sed 's/ //g' | tr '[:lower:]' '[:upper:]'

  # Keep the scratch files in place until any viewers have been closed;
  # returns immediately when none were started.
  wait
}

# Keep exit-on-error active even when the script is started as
# `sh script`, where the -e on the shebang line is not applied.
set -e

# Leading options: -h prints usage, -v enables DEBUG, -d enables
# DISPLAY_RESULT. The first other argument ends option parsing.
while [ $# -gt 0 ]; do
  case $1 in
    -h) usage ;;
    -v) DEBUG=true ;;
    -d) DISPLAY_RESULT=true ;;
    *) break ;;
  esac
  shift
done

# An image path is required; show the usage line when it is missing.
[ $# -gt 0 ] || usage

main "$@"