src/simple-scan-postprocessing.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83

#!/bin/bash
# SPDX-License-Identifier: GPL-3.0-or-later
# Copyright (C) 2022 Alexander Vogt
# Author: Alexander Vogt <a.vogt@fulguritus.com>
#
# Sample postprocessing script for gnome-simple-scan for OCR in PDFs
#
# This script first identifies a suitable instance of ocrmypdf
# (https://github.com/ocrmypdf/OCRmyPDF) and then applies this as a
# postprocessing step to PDFs generated by simple-scan.
#
# Usage:
# =====
# simple-scan-postprocessing mime-type keep-origin input-file args
#
# Currently, only the mime-type "application/pdf" is processed. The script
# exits without an error if "image/jpeg", "image/png", or "image/webp" is
# provided; any other mime-type results in an error.
# All args are passed on to ocrmypdf.
# If keep-origin is set to "true", the unprocessed input file is kept next to
# the result as "<name>_orig.<extension>"; otherwise it is removed after OCR.
#
# Example:
# =======
# simple-scan-postprocessing application/pdf true scan.pdf -l eng+deu
# simple-scan-postprocessing application/pdf true scan.pdf -rcd --jbig2-lossy -l deu
#
# Abort on the first failing command (-e) and switch job control off (+m).
set -e
set +m

# Globals
_ocrmypdfcontainer="jbarlow83/ocrmypdf"

# Arguments
mime_type=$1
keep_original=$2
target=$3
# Everything from the fourth argument onwards, joined into a single
# blank-separated string.
remainder="${*:4}"

# Name of the backup copy: the target with "_orig" inserted before its
# extension (e.g. scan.pdf -> scan_orig.pdf).
source="${target%.*}_orig.${target##*.}"

# Helper functions
function findOcrMyPdf() {
	# Locates a usable ocrmypdf and stores the command prefix to run it in
	# the global ${_ocrmypdf}. Candidates are tried in the following order:
	#   1. ocrmypdf from the $PATH (local installation)
	#   2. ocrmypdf through podman (if podman in $PATH)
	#   3. ocrmypdf through docker (if docker in $PATH)
	#
	# Globals:
	#   _ocrmypdfcontainer (read)    - container image used with podman/docker
	#   _ocrmypdf          (written) - command prefix, possibly several words
	# Outputs:
	#   An error message on stderr if no candidate is available.
	# Returns:
	#   0 on success; terminates the script with status 1 otherwise.
	local engine engine_path

	# Every lookup is tested explicitly with "if": a bare failing assignment
	# would abort the script under "set -e" before the error message below
	# could be printed.
	if _ocrmypdf=$(command -v ocrmypdf); then
		return 0
	fi

	for engine in podman docker; do
		if engine_path=$(command -v "${engine}"); then
			# The trailing blank is part of the established value format.
			_ocrmypdf="${engine_path} run --rm -i ${_ocrmypdfcontainer} "
			return 0
		fi
	done

	echo "No suitable instance of ocrmypdf found. Please check your setup. " >&2
	exit 1
}

# Dispatch on the mime-type of the scanned file. Only PDFs are processed;
# the known image types are accepted without doing anything.
case "${mime_type}" in
	"application/pdf")
		# Determine the version of ocrmypdf to use. This happens before
		# the scan is renamed, so that a missing ocrmypdf does not leave
		# the target file moved away.
		findOcrMyPdf

		mv "$target" "$source" # create a backup

		# Execute OCR, reading the backup on stdin and writing the result
		# to the original file name via stdout.
		# ${_ocrmypdf} and ${remainder} are word-split on purpose: the
		# former may hold a multi-word container invocation, the latter
		# holds all extra options joined into one string.
		ocr_status=0
		${_ocrmypdf} ${remainder} - - <"$source" >"$target" || ocr_status=$?
		if [ "${ocr_status}" -ne 0 ]; then
			# The redirection has already truncated the target; put the
			# untouched scan back and report the failure of ocrmypdf.
			mv -f "$source" "$target"
			exit "${ocr_status}"
		fi
		;;
	"image/jpeg")
		exit 0 # Nothing implemented
		;;
	"image/png")
		exit 0 # Nothing implemented
		;;
	"image/webp")
		exit 0 # Nothing implemented
		;;
	*)
		echo "Unsupported mime-type \"${mime_type}\"" >&2
		exit 1
		;;
esac

# Clean up: the backup copy is removed unless the caller asked to keep it.
case "$keep_original" in
	true)
		exit 0
		;;
	*)
		rm "$source"
		;;
esac