The tutorial outlines a complete workflow for converting scanned documents into searchable PDF/A files using OCRmyPDF. It includes sidecar text extraction and batch processing capabilities. The guide walks through installing dependencies, creating synthetic test images, running the conversion, and tuning settings for noisy scans.
Installing OCRmyPDF System Dependencies
The setup process begins by importing standard libraries and defining a small helper that runs shell commands, which the installer uses to manage system and Python packages. The script installs the Tesseract OCR engine with language data for English, German, and French (plus orientation and script detection data), along with Ghostscript, Unpaper, PNGQuant, poppler-utils, and qpdf. It also pulls in Python packages such as OCRmyPDF, img2pdf, and Pillow (pinned below version 12). An optional step builds jbig2enc from source to enable advanced PDF compression for scanned files.
import io
import os
import re
import sys
import time
import shutil
import logging
import textwrap
import subprocess
from pathlib import Path
# Set to False to skip the optional jbig2enc source build in install_dependencies().
INSTALL_JBIG2 = True
def sh(cmd: str, check: bool = True) -> int:
    """Run a shell command, echo it, and show the tail of its output.

    The command line is handed to the shell verbatim (so ``&&``,
    redirections and ``VAR=value`` prefixes work) and stderr is merged
    into stdout. Only build ``cmd`` from trusted text.

    Args:
        cmd: Shell command line to execute.
        check: When true, a non-zero exit status raises ``RuntimeError``.

    Returns:
        The exit status of the command.

    Raises:
        RuntimeError: If ``check`` is true and the command exits non-zero.
    """
    print(f" $ {cmd}")
    result = subprocess.run(
        cmd,
        shell=True,
        text=True,
        # Tool output is not guaranteed to be valid in the locale encoding;
        # substitute undecodable bytes instead of dying with
        # UnicodeDecodeError in the middle of an install.
        errors="replace",
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
    )
    output = result.stdout.strip() if result.stdout else ""
    # Echo only the last 12 lines so long installer logs stay readable.
    for line in output.splitlines()[-12:]:
        print(" " + line)
    if check and result.returncode != 0:
        raise RuntimeError(f"Command failed ({result.returncode}): {cmd}")
    return result.returncode
def install_dependencies() -> None:
    """Install OCRmyPDF's system + Python dependencies for Colab/Ubuntu.

    Runs ``apt-get`` for the system packages, then ``pip`` for the Python
    packages. When ``INSTALL_JBIG2`` is set and no ``jbig2`` binary is on
    PATH, it additionally tries to build jbig2enc from source; any failure
    in that optional step is reported and otherwise ignored.
    """
    apt_install = "DEBIAN_FRONTEND=noninteractive apt-get install -y -qq"
    runtime_pkgs = " ".join((
        "tesseract-ocr", "tesseract-ocr-eng", "tesseract-ocr-osd",
        "tesseract-ocr-deu", "tesseract-ocr-fra",
        "ghostscript", "unpaper", "pngquant", "poppler-utils", "qpdf",
    ))

    # A failed index refresh is tolerated; the install below is not.
    sh("apt-get update -qq", check=False)
    sh(f"{apt_install} {runtime_pkgs}")
    sh(f'"{sys.executable}" -m pip install -q --upgrade ocrmypdf img2pdf "pillow<12"')

    # Nothing more to do unless the optional encoder is wanted and missing.
    if not INSTALL_JBIG2 or shutil.which("jbig2") is not None:
        return

    try:
        toolchain = " ".join((
            "autoconf", "automake", "libtool", "pkg-config",
            "libleptonica-dev", "zlib1g-dev", "build-essential", "git",
        ))
        sh(f"{apt_install} {toolchain}")
        sh("rm -rf /tmp/jbig2enc && "
           "git clone -q https://github.com/agl/jbig2enc.git /tmp/jbig2enc")
        build_steps = (
            "cd /tmp/jbig2enc",
            "./autogen.sh >/dev/null 2>&1",
            "./configure >/dev/null 2>&1",
            "make -j2 >/dev/null 2>&1",
            "make install >/dev/null 2>&1",
            "ldconfig",
        )
        sh(" && ".join(build_steps))
        if shutil.which("jbig2"):
            outcome = "installed"
        else:
            outcome = "built, but binary not on PATH"
        print(" jbig2enc:", outcome)
    except Exception as e:
        # Best effort: the encoder is optional, so report and carry on.
        print(" jbig2enc build skipped (optional):", e)
def ensure_installed() -> None:
    """Install dependencies unless the tools and Python packages are present.

    The check passes when both ``tesseract`` and ``gs`` are found on PATH
    and ``ocrmypdf``, ``img2pdf`` and ``PIL.Image`` can all be imported.
    Otherwise ``install_dependencies()`` is run.
    """
    tools_ready = all(shutil.which(tool) for tool in ("tesseract", "gs"))

    try:
        # Imported only to probe availability; the names are not used here.
        import ocrmypdf
        import img2pdf
        from PIL import Image
    except Exception:
        libs_ready = False
    else:
        libs_ready = True

    if not (tools_ready and libs_ready):
        print("Installing dependencies (first run can take a few minutes)...")
        install_dependencies()
        return
    print("Dependencies already present — skipping installation.")
# Run the dependency check (installing whatever is missing) as soon as this script executes.
ensure_installed()
The script checks for existing tools before running the installation. If Tesseract and Ghostscript are found, and the Python libraries load correctly, the process skips the install step. Otherwise, it proceeds with the package manager commands. The first execution may take several minutes to complete.
Loading OCRmyPDF and Building Synthetic Scans
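The next section loads the OCRmyPDF library and handles potential compatibility issues with Pillow: if the import fails because of an incompatible Pillow build, the script force-reinstalls a Pillow release below version 12, purges the cached modules, and retries the import; if that still fails, it asks the user to restart the runtime. It imports specific exception classes like PriorOcrFoundError and EncryptedPdfError for error handling. The code defines sample text pages to simulate a realistic document containing information about optical character recognition and archive statistics dated 14 March 2026.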
Helper functions generate synthetic scanned pages. One function adds random speckles to imitate scanner noise, which helps test the cleaning features. Another creates an A4 page at approximately 200 DPI with dark text on a white background. The script can rotate the image by six degrees and apply a slight Gaussian blur to simulate a skewed, imperfect scan.
def _purge(*prefixes):
for name in [m for m in list(sys.modules)
if any(m == p or m.startswith(p + ".") for p in prefixes)]:
del sys.modules[name]
def _load_ocrmypdf():
    """Import ``ocrmypdf`` from scratch and return the module object.

    Cached ``PIL`` and ``ocrmypdf`` modules are purged first so that the
    import below re-executes them instead of reusing stale entries.
    """
    _purge("PIL", "ocrmypdf")
    import ocrmypdf as fresh_module
    return fresh_module
# Load OCRmyPDF; on a Pillow-related import failure, reinstall Pillow once
# and retry before giving up with instructions to restart the session.
try:
    ocrmypdf = _load_ocrmypdf()
except ImportError as import_err:
    # Anything that does not look Pillow-related is passed through untouched.
    if "_Ink" not in str(import_err) and "PIL" not in str(import_err):
        raise
    print("Repairing an incompatible Pillow (reinstalling pillow<12)...")
    sh(f'"{sys.executable}" -m pip install -q --force-reinstall "pillow<12"')
    try:
        ocrmypdf = _load_ocrmypdf()
        print("Pillow repaired — continuing without a restart.")
    except Exception:
        raise RuntimeError(
            "Pillow is still incompatible in this session. Use the Colab menu: "
            "Runtime > Restart session, then run this cell again."
        )
from ocrmypdf.exceptions import (
ExitCode,
PriorOcrFoundError,
EncryptedPdfError,
MissingDependencyError,
TaggedPDFError,
DigitalSignatureError,
DpiError,
InputFileError,
UnsupportedImageFormatError,
)
from ocrmypdf.helpers import check_pdf
from ocrmypdf.pdfa import file_claims_pdfa
import img2pdf
from PIL import Image, ImageDraw, ImageFont, ImageFilter
# Root logger: warnings and above, in a compact "LEVEL: message" format.
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.WARNING)
# Per-library thresholds; pdfminer is limited to errors only.
logging.getLogger("PIL").setLevel(logging.WARNING)
logging.getLogger("pdfminer").setLevel(logging.ERROR)
logging.getLogger("ocrmypdf").setLevel(logging.WARNING)
# Sample prose used as the content of the synthetic scans: three entries,
# each built from adjacent string literals (presumably one entry per page).
SAMPLE_TEXT_PAGES = [
"Optical Character Recognition, commonly abbreviated as OCR, is the "
"process of converting images of typed or printed text into machine "
"encoded text. This page was generated as a synthetic scan so that the "
"OCRmyPDF pipeline has something realistic to recognize and search.",
"On 14 March 2026 the archive contained 1,482 pages across 37 folders. "
"Roughly 92 percent of those pages were scanned at 200 to 300 dots per "
"inch. The remaining 8 percent were skewed and required deskewing before "
"any reliable recognition was possible.",
"After OCRmyPDF finishes, the output is a searchable PDF/A file. You can "
"select text, copy it, and run full text search across thousands of "
"documents. The original image resolution is preserved while a hidden "
"text layer is placed accurately underneath the page image.",
]
def _find_font():
for cand in (
"/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
"/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
):
if os.path.exists(cand):
return cand
return None
# Resolve the drawing font once: a 40-point TrueType face when one was
# found on disk, otherwise Pillow's built-in default font.
_FONT_PATH = _find_font()
if _FONT_PATH:
    FONT = ImageFont.truetype(_FONT_PATH, 40)
else:
    FONT = ImageFont.load_default()
def _add_speckle(img, n=6000, dark=60):
"""Sprinkle light dark specks to imitate scanner noise (motivates --clean)."""
import random
px = img.load()
w, h = img.size
for _ in range(n):
px[random.randint(0, w - 1), random.randint(0, h - 1)] = random.randint(0, dark)
return img
def render_page(text, skew=False):
"""Render one A4 page (1654x2339 px ≈ 200 DPI) of dark text on white."""
W, H = 1654, 2339
img = Image.new("L", (W, H), 255)
draw = ImageDraw.Draw(img)
draw.multiline_text((150, 180), textwrap.fill(text, width=58),
fill=25, font=FONT, spacing=18)
if skew:
img = img.rotate(Source Read original →



