diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json new file mode 100644 index 000000000..32e9e65e5 --- /dev/null +++ b/.claude-plugin/marketplace.json @@ -0,0 +1,22 @@ +{ + "name": "opendataloader-pdf", + "owner": { + "name": "OpenDataLoader Project" + }, + "metadata": { + "description": "AI-powered PDF extraction guidance and automation", + "version": "0.1.0" + }, + "plugins": [ + { + "name": "odl-pdf-skills", + "version": "0.1.0", + "description": "Expert guidance for opendataloader-pdf — environment detection, option recommendations, hybrid mode setup, quality diagnostics, and direct conversion execution", + "homepage": "https://github.com/opendataloader-project/opendataloader-pdf/tree/main/skills/odl-pdf", + "source": "./", + "skills": [ + "./skills/odl-pdf" + ] + } + ] +} diff --git a/.github/workflows/skill-drift-check.yml b/.github/workflows/skill-drift-check.yml new file mode 100644 index 000000000..3cf26d12a --- /dev/null +++ b/.github/workflows/skill-drift-check.yml @@ -0,0 +1,45 @@ +# skill-drift-check.yml +# Ensures skill references stay in sync with options.json when CLI options change. +# Runs sync-skill-refs.py and fails the check if drift is detected (exit code 1). + +name: Skill Drift Check + +on: + push: + paths: + - 'options.json' + - 'skills/odl-pdf/scripts/sync-skill-refs.py' + pull_request: + paths: + - 'options.json' + - 'skills/odl-pdf/scripts/sync-skill-refs.py' + workflow_dispatch: + +permissions: + contents: read + +jobs: + check-drift: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Check skill drift + run: | + set +e + python skills/odl-pdf/scripts/sync-skill-refs.py + EXIT_CODE=$? + if [ $EXIT_CODE -eq 1 ]; then + echo "" + echo "Drift detected: skill references are out of sync with options.json." + echo "Update skills/odl-pdf/references/options-matrix.md to match options.json." 
+ exit 1 + elif [ $EXIT_CODE -ne 0 ]; then + echo "" + echo "Drift check failed due to an input/script error (exit $EXIT_CODE)." + exit $EXIT_CODE + fi diff --git a/.github/workflows/skill-smoke-test.yml b/.github/workflows/skill-smoke-test.yml new file mode 100644 index 000000000..7d4693df9 --- /dev/null +++ b/.github/workflows/skill-smoke-test.yml @@ -0,0 +1,106 @@ +# skill-smoke-test.yml +# Cross-platform smoke test for the odl-pdf skill's executable assets. +# Runs the shell scripts and Python scripts on ubuntu / windows / macos +# to catch platform-specific regressions (line endings, console encoding, +# shell portability) BEFORE a PR merges. Does NOT hit any external API. + +name: Skill Smoke Test + +on: + push: + paths: + - 'skills/odl-pdf/scripts/**' + - 'skills/odl-pdf/SKILL.md' + - 'skills/odl-pdf/references/**' + - '.github/workflows/skill-smoke-test.yml' + pull_request: + paths: + - 'skills/odl-pdf/scripts/**' + - 'skills/odl-pdf/SKILL.md' + - 'skills/odl-pdf/references/**' + - '.github/workflows/skill-smoke-test.yml' + workflow_dispatch: + +permissions: + contents: read + +jobs: + smoke-test: + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + runs-on: ${{ matrix.os }} + timeout-minutes: 10 + + defaults: + run: + # Use bash on every platform. Windows runners have Git Bash pre-installed. 
+ shell: bash + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Show runner info + run: | + echo "OS: ${{ matrix.os }}" + bash --version | head -1 + python --version + + # --- detect-env.sh ------------------------------------------------- + - name: detect-env.sh emits all 7 keys + run: | + out=$(bash skills/odl-pdf/scripts/detect-env.sh) + echo "$out" + for key in OS JAVA PYTHON NODE ODL_INSTALLED ODL_VERSION HYBRID_EXTRAS; do + echo "$out" | grep -q "^${key}=" \ + || { echo "MISSING KEY: $key"; exit 1; } + done + echo "all 7 keys present" + + # --- hybrid-health.sh (no server running is expected) -------------- + - name: hybrid-health.sh handles no-server gracefully + run: | + out=$(bash skills/odl-pdf/scripts/hybrid-health.sh) + echo "$out" + echo "$out" | grep -q "HYBRID_SERVER=" \ + || { echo "missing HYBRID_SERVER key"; exit 1; } + + # --- quick-eval.py ------------------------------------------------- + - name: quick-eval.py --help + run: python skills/odl-pdf/scripts/quick-eval.py --help + + - name: quick-eval.py identical files -> PASS + run: | + tmp=$(mktemp -d) + printf '# Test\n\nSample paragraph one.\nSample paragraph two.\n' > "$tmp/a.md" + cp "$tmp/a.md" "$tmp/b.md" + python skills/odl-pdf/scripts/quick-eval.py "$tmp/a.md" "$tmp/b.md" + rm -rf "$tmp" + + - name: quick-eval.py different files -> FAIL (exit 1) + run: | + tmp=$(mktemp -d) + printf 'apple pie recipe\n' > "$tmp/a.md" + printf 'quantum physics lecture\n' > "$tmp/b.md" + set +e + python skills/odl-pdf/scripts/quick-eval.py "$tmp/a.md" "$tmp/b.md" + rc=$? 
+ set -e + rm -rf "$tmp" + [ "$rc" = "1" ] || { echo "expected exit 1, got $rc"; exit 1; } + + - name: quick-eval.py prints em-dash-free output on cp1252 locale (Windows regression) + if: matrix.os == 'windows-latest' + shell: cmd + run: | + chcp 1252 + python skills\odl-pdf\scripts\quick-eval.py skills\odl-pdf\evals\evals.json skills\odl-pdf\evals\evals.json + + # --- sync-skill-refs.py -------------------------------------------- + - name: sync-skill-refs.py reports no drift + run: python skills/odl-pdf/scripts/sync-skill-refs.py diff --git a/.gitignore b/.gitignore index ec08c76b1..78e7fcbda 100644 --- a/.gitignore +++ b/.gitignore @@ -78,4 +78,4 @@ content/docs/ # Configuration files .claude/settings.local.json .claude/plans/ - +.claude/review-rounds.md diff --git a/CLAUDE.md b/CLAUDE.md index 6c02df0e6..d7378e7ea 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -21,3 +21,80 @@ Manual docs live in opendataloader.org repo. Reference docs (CLI options, JSON s - `./scripts/bench.sh --check-regression` — CI mode with threshold check - Benchmark code lives in [opendataloader-bench](https://github.com/opendataloader-project/opendataloader-bench) - Metrics: **NID** (reading order), **TEDS** (table structure), **MHS** (heading structure), **Table Detection F1**, **Speed** + +## Agent Skills + +`skills/odl-pdf/` contains the public agent skill shipped with this project. + +When adding or changing CLI options in Java, the following files may need +manual updates. The drift CI (`skill-drift-check.yml`) only enforces step 2; +the others are NOT auto-checked and will silently go stale if missed: + +1. Run `npm run sync` (regenerates `options.json` + Python/Node bindings) +2. **Always**: update `skills/odl-pdf/references/options-matrix.md` to add / + rename / remove the row matching `options.json`. Drift CI enforces option + **names** here; description text is not auto-checked. +3. 
**If the option is hybrid-related** (`--hybrid-*`, server flags like + `--enrich-*`, `--force-ocr`, `--ocr-lang`): also update + `skills/odl-pdf/references/hybrid-guide.md` — Client Options table, Server + Configuration table, or both. +4. **If the option is a new output format or affects format selection** (touches + the `--format` enum, image handling, page separators): also update + `skills/odl-pdf/references/format-guide.md` and the Output Pipeline section + of `skills/odl-pdf/references/integration-examples.md`. +5. **If the option introduces a silent failure mode, an unsafe default, or a + prerequisite**: also add it to the **Critical Gotchas** section of + `skills/odl-pdf/SKILL.md`. Silent failures (e.g., enrichments skipped in + `--hybrid-mode auto`, JVM cold-start cost on per-file calls) are the class + of issue the skill exists to surface — keep the gotchas list current. +6. **If the option changes the recommended escalation path** for a quality + metric (NID / TEDS / MHS / Table Detection F1): also update the + corresponding Low-* section of `skills/odl-pdf/references/eval-metrics.md`. + +The `skill-smoke-test.yml` workflow runs automatically on push and +verifies cross-platform shell and Python script behavior on +ubuntu/windows/macos; it does not exercise model behavior. 
+
+When bumping the minimum Java version (raising
+`<maven.compiler.source>` / `<maven.compiler.target>` in `java/pom.xml`),
+also update every explicit "Java 11" / "Java 11+" mention in these
+skill files — pip-installed users do not have `java/pom.xml` on disk
+and rely on the skill to state the concrete minimum:
+
+- `skills/odl-pdf/SKILL.md` — Persona, Phase 2A prerequisite and the
+  user-facing message, Action Mode A1 environment check, Gotcha 1
+  (title, body, Resolution, user-facing message), Session Checklist
+- `skills/odl-pdf/references/installation-matrix.md` — Prerequisites
+  paragraph and the Version Compatibility table's footer note
+- `skills/odl-pdf/references/integration-examples.md` — opening
+  requirement line
+- `skills/odl-pdf/evals/evals.json` — eval-006 `must_mention` array
+  (currently `"Java 11"`)
+
+The same pattern applies when the Python or Node.js runtime floor
+bumps, though the urgency is asymmetric:
+
+- **Node.js — peer to Java in silent-failure terms.** `npm` treats
+  `engines.node` in `node/opendataloader-pdf/package.json` as advisory
+  by default (`npm warn EBADENGINE` then installs anyway), so a user
+  below the floor gets a cryptic runtime error rather than a blocked
+  install. When bumping `engines.node`, grep for the current value
+  (e.g. `Node.js 20.19+`) across `skills/odl-pdf/` and update every
+  match. `pnpm` is strict by default, but the skill cannot assume the
+  user's package manager.
+- **Python — loud install failure.** Modern `pip` strictly enforces
+  `requires-python` in `python/opendataloader-pdf/pyproject.toml` and
+  refuses to install with a clear error. Surfacing the floor still
+  saves an agent-user round-trip, so when bumping `requires-python`,
+  grep for the current value (e.g. `Python 3.10+`) across
+  `skills/odl-pdf/` and update every match.
+ +Current Python/Node.js floor mentions live in the same skill locations +as the Java ones: `SKILL.md` Persona, Phase 2A decision tree and +default note, Session Checklist; `installation-matrix.md` Decision +Tree and Prerequisites; `integration-examples.md` opening line. Grep +is the authoritative discovery method for either bump — the file list +above is a navigation aid, not a substitute for a fresh grep. + +The skill is written in English for external users. Do not include internal +team terminology or company-specific policies. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ba6d27e09..47117d76f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -134,5 +134,19 @@ git commit -s -m "your message" Make sure your Git config contains your real name and email. +## Agent Skills Maintenance + +This project ships a built-in agent skill at `skills/odl-pdf/`. When you add +or modify CLI options: + +1. Run `npm run sync` as usual +2. Update `skills/odl-pdf/references/options-matrix.md` — add the new option + to the appropriate category with its type, default, and description +3. If the new option has interaction rules with existing options (e.g., requires + another option to be set), document the rule in the "Interaction Rules" section + +The CI workflow `skill-drift-check.yml` will flag any mismatch between +`options.json` and `options-matrix.md`. + Thank you again for helping us improve this project! 🙌 If you have any questions, open an issue or join the discussion. diff --git a/README.md b/README.md index 937983022..52a14713e 100644 --- a/README.md +++ b/README.md @@ -451,6 +451,35 @@ Existing PDFs (untagged) [PDF Accessibility Guide](https://opendataloader.org/docs/accessibility-compliance) +## Agent Skills + +Your AI coding agent knows how to use opendataloader-pdf — optimal options, +hybrid mode setup, and quality diagnostics, all handled automatically. + +Follows the [Agent Skills](https://agentskills.io) open format. 
Native support in **Claude Code** via the included [`.claude-plugin/marketplace.json`](.claude-plugin/marketplace.json). + +### What the Skill Does + +| Phase | Description | +|-------|-------------| +| **Discover** | Detects your OS, Java, Python, Node.js, and ODL installation | +| **Prescribe** | Recommends optimal install method, options, format, and mode | +| **Execute** | Generates ready-to-run commands or runs conversions directly | +| **Diagnose** | Identifies quality issues and escalates (local → cluster → hybrid) | +| **Optimize** | Tunes batch processing, RAG integration, and performance | + +### Install + +Requires Java 11+ and Python 3.10+ with `opendataloader-pdf >= 2.2.0` (Node.js 20.19+ or Java SDK also supported). + +```bash +npx skills add opendataloader-project/opendataloader-pdf --skill odl-pdf +``` + +After installation, invoke with `/odl-pdf` in Claude Code. + +For clients without a skills installer, copy [`skills/odl-pdf/`](skills/odl-pdf/) into the client's skills directory (location varies by client — see its docs). + ## Roadmap | Feature | Timeline | Tier | diff --git a/skills/README.md b/skills/README.md new file mode 100644 index 000000000..60c730429 --- /dev/null +++ b/skills/README.md @@ -0,0 +1,168 @@ +# Agent Skills + +opendataloader-pdf ships built-in agent skills that help AI coding assistants use this project effectively. Skills follow the [Agent Skills](https://agentskills.io) open format. This repository is packaged for Claude Code via [`.claude-plugin/marketplace.json`](../.claude-plugin/marketplace.json). 
+ +## Directory Structure + +``` +skills/ +├── README.md ← You are here +└── odl-pdf/ ← One skill per directory + ├── SKILL.md ← Main skill file (loaded when activated) + ├── references/ ← Deep-dive docs (loaded on demand) + │ ├── options-matrix.md + │ ├── hybrid-guide.md + │ ├── format-guide.md + │ ├── installation-matrix.md + │ ├── integration-examples.md + │ └── eval-metrics.md + ├── scripts/ ← Executable helpers + │ ├── detect-env.sh + │ ├── hybrid-health.sh + │ ├── quick-eval.py + │ └── sync-skill-refs.py + └── evals/ ← Quality test cases + └── evals.json +``` + +## How Skills Work + +### Progressive Disclosure (3 Levels) + +| Level | Content | When Loaded | +|-------|---------|-------------| +| **L1** | `description` field in SKILL.md frontmatter | Always visible to skill router | +| **L2** | SKILL.md body — persona, workflows, decision trees, gotchas | When skill is activated | +| **L3** | `references/*` files — detailed option matrices, guides, metrics | When the user enters that topic | + +This design minimizes token usage. The AI agent only loads what it needs for the current task. + +### Dual-Path Option Reference + +Skills must work for **both** source-code users and pip-install users: + +- **Built-in summaries** (`references/options-matrix.md`): Always available, even without source code +- **Dynamic reference** (`options.json`): Authoritative source when the source repo is available + +SKILL.md instructs the AI: "If `options.json` exists in this project, it is the source of truth. Options in `options.json` not found in `options-matrix.md` are newly added." + +## Creating a New Skill + +### 1. Create the Directory + +``` +skills/my-skill/ +├── SKILL.md +├── references/ (optional) +├── scripts/ (optional) +└── evals/ (optional) +``` + +### 2. Write SKILL.md + +The SKILL.md file has two parts: + +**Frontmatter** (YAML between `---` markers): + +```yaml +--- +name: my-skill +description: > + One paragraph (~100 words) explaining what this skill does. 
+ Include trigger keywords so the skill router knows when to activate. + Include "Do NOT use for:" to prevent false activations. +--- +``` + +**Body** (Markdown): + +- Define a persona (who the AI becomes when this skill is active) +- Define a workflow (numbered phases the AI follows) +- Include decision trees for common choices +- List critical gotchas the AI must always warn about +- Reference deeper docs with: "See `references/filename.md` for details" + +### 3. Write Evals + +Create `evals/evals.json` with test scenarios: + +```json +{ + "version": "1.0", + "skill": "my-skill", + "evals": [ + { + "id": "eval-001", + "scenario": "Description of the user's situation", + "user_input": "What the user says", + "expected_recommendations": ["What the AI should recommend"], + "must_mention": ["Required terms in the response"], + "must_not_mention": ["Forbidden terms"] + } + ] +} +``` + +### 4. Register in marketplace.json + +Add your skill to `.claude-plugin/marketplace.json`: + +```json +{ + "plugins": [{ + "skills": ["./skills/odl-pdf", "./skills/my-skill"] + }] +} +``` + +### 5. Test + +Test by spawning an AI agent that knows nothing about the project, loading only your SKILL.md, and asking it the eval scenarios. All `must_mention` terms should appear; no `must_not_mention` terms should appear. + +## Modifying the Existing Skill + +### When CLI Options Change + +1. Run `npm run sync` (regenerates `options.json`) +2. Update `skills/odl-pdf/references/options-matrix.md` — add the new option to the appropriate category +3. If the option has interaction rules, document them in the "Interaction Rules" section +4. CI (`skill-drift-check.yml`) will catch any mismatch you miss + +### When Adding a New Hybrid Backend + +1. Update `skills/odl-pdf/references/hybrid-guide.md` — add to the Backend Registry table +2. SKILL.md's decision tree says "check `options.json` for allowed hybrid values" — new backends are auto-discovered + +### When Adding a New Output Format + +1. 
Update `skills/odl-pdf/references/format-guide.md` — add to the format table with downstream use mapping +2. The format list in `options.json` is auto-discovered by the skill + +## CI Integration + +### Drift Check (`skill-drift-check.yml`) + +Runs automatically when `options.json` changes. Compares option names in `options.json` against `options-matrix.md` and fails if they diverge. + +Run manually: + +```bash +python skills/odl-pdf/scripts/sync-skill-refs.py +``` + +## Writing Guidelines + +- **Language**: English only (external open-source users) +- **No internal terminology**: No company names, team names, or internal tool references +- **Tone**: Senior engineer pair-programming — diagnose first, prescribe later +- **Java guidance**: Always mention Java 11+ requirement. Never recommend specific JDK distributions or download links. +- **Gotchas**: Only include gotchas that affect external users. Internal development gotchas belong in CLAUDE.md. + +## References + +- [Agent Skills](https://agentskills.io) — Open format spec for agent skills +- [`skills` CLI](https://skills.sh) — CLI that installs Agent Skills (`vercel-labs/skills`); used by the `npx skills add ...` command in the root README's install section +- [Claude Code Skills](https://docs.anthropic.com/en/docs/claude-code) — Claude Code skill documentation +- `.claude-plugin/marketplace.json` — Plugin registration for this project +- `CLAUDE.md` — Internal development notes (not for the skill) +- `CONTRIBUTING.md` — Contributor guidelines including skill maintenance diff --git a/skills/odl-pdf/SKILL.md b/skills/odl-pdf/SKILL.md new file mode 100644 index 000000000..70be7f9ea --- /dev/null +++ b/skills/odl-pdf/SKILL.md @@ -0,0 +1,612 @@ +--- +name: odl-pdf +description: > + Expert PDF extraction guidance for opendataloader-pdf. 
For developers picking + install path, mode, format, and option combinations, diagnosing extraction + quality, and avoiding silent failure modes (enrichments skipped without + --hybrid-mode full, slow batches from per-file JVM startup) that the README + does not surface up-front. Detects your environment, recommends optimal options, + runs hybrid mode setup, diagnoses quality issues, and executes conversions + directly. Use when: 'PDF extraction', 'PDF parser', 'PDF parsing', + 'open source PDF parser', 'extract text from PDF', 'PDF to text', + 'PDF to markdown', 'PDF to JSON', 'PDF to HTML', 'opendataloader', 'ODL', + 'hybrid mode', 'scanned PDF', 'OCR', 'PDF tables', 'PDF table extraction', + 'PDF chunking', 'PDF for LLM', 'PDF bounding boxes', 'RAG pipeline with PDF'. + Do NOT use for: PDF merge/split/rotate, Word/Excel conversion, PDF form filling, + PDF/UA generation, PDF accessibility tagging. +license: Apache-2.0 +--- + +# Targets: opendataloader-pdf >= 2.2.0 +# Documented against: 2.2.1 (features added in 2.3.0+ are not yet covered) + +--- + +## Persona + +You are a **Document Intelligence Engineer** — not merely a PDF expert, but an engineer who understands the full extraction pipeline from raw PDF bytes to downstream consumption. + +**What that means in practice:** + +- You understand PDF internals: structure trees, bounding boxes, content streams, reading order algorithms, and the difference between digital and scanned PDFs. +- You understand real-world extraction workflows: batch processing patterns, error triage, quality measurement with NID/TEDS/MHS metrics. +- You are aware of downstream systems: RAG chunking strategies, LLM context window constraints, LangChain document loaders, vector store ingestion. +- You understand cross-platform deployment: per-runtime version floors (Java 11+ per `java/pom.xml`, Python 3.10+ per `pyproject.toml`, Node.js 20.19+ per `package.json`), OS-specific quirks, server/client architecture for hybrid mode. 
+ +**Interaction style:** Diagnose first, prescribe later. Like a senior engineer pair programming — ask probing questions to understand the user's actual situation before recommending options. Evidence-based recommendations grounded in benchmarks, not guesswork. + +--- + +## Five-Phase Workflow + +Every session follows this sequence. Never skip Phase 1. Phases 3-5 are entered as needed. + +``` +Phase 1: DISCOVER → Understand environment and requirements +Phase 2: PRESCRIBE → Recommend installation, options, and architecture +Phase 3: EXECUTE → Generate or run commands +Phase 4: DIAGNOSE → Identify and fix quality problems +Phase 5: OPTIMIZE → Tune for production at scale +``` + +--- + +## Phase 1: DISCOVER + +**Always run this phase first, regardless of what the user asked.** + +### 1A. Environment Detection + +If `scripts/detect-env.sh` is available in the project, run it first: + +```bash +bash skills/odl-pdf/scripts/detect-env.sh +``` + +The script outputs key=value pairs. Parse these fields: + +| Key | Meaning | +|-----|---------| +| `OS` | Operating system (linux, macos, windows) | +| `JAVA` | Java major version (e.g., `21`) or `none` | +| `PYTHON` | Python version (e.g., `3.12.4`) or `none` | +| `NODE` | Node.js version (e.g., `20.19.0`) or `none` | +| `ODL_INSTALLED` | `true` or `false` | +| `ODL_VERSION` | Installed version (e.g., `2.3.1`) or `none` | +| `HYBRID_EXTRAS` | `true` if `[hybrid]` extras are installed | + +If the script is not available, ask the user directly: +- What OS are you on? (Linux / macOS / Windows) +- Do you have Java installed? Run: `java -version` +- Which languages/runtimes are available? (Python, Node.js, Java project) +- Is opendataloader-pdf already installed? + +### 1B. Requirements Gathering + +Ask these four questions (can be combined in one message): + +1. **PDF type** — Are these digital PDFs (text selectable), scanned/image-only PDFs, or mixed? Do they contain complex tables, formulas, or charts? +2. 
**Volume** — How many PDFs, and roughly how many pages each? One-off or ongoing batch? +3. **Downstream use** — Where does the extracted content go? (RAG system, LangChain, web display, search index, manual review, LLM input) +4. **Quality requirements** — Is this best-effort extraction or does accuracy matter critically? Are there specific elements (tables, headings, reading order) that must be correct? + +**Do not proceed to Phase 2 without answers to at least questions 1 and 3.** + +--- + +## Phase 2: PRESCRIBE + +Based on Phase 1 findings, make specific recommendations across four dimensions. + +### 2A. Installation + +> Load `references/installation-matrix.md` when advising on installation for a specific environment. + +**Decision tree:** + +``` +Environment detection: +├── Python 3.10+ available? +│ ├── Complex tables / OCR / formulas needed? +│ │ └── pip install "opendataloader-pdf[hybrid]" +│ ├── LangChain RAG pipeline? +│ │ └── pip install langchain-opendataloader-pdf +│ └── Simple extraction (digital PDFs, standard tables) +│ └── pip install opendataloader-pdf +├── Python present but below 3.10? +│ └── Upgrade Python to 3.10+, or use the Node.js / Java path below +│ (pip will refuse to install the current package with the +│ actual Python-version error if you try) +├── Node.js 20.19+ only? +│ └── npm install @opendataloader/pdf +├── Java project (Maven/Gradle)? +│ └── Add Maven dependency (see references/installation-matrix.md) +└── Unsure / getting started? + └── pip install opendataloader-pdf (simplest path; requires Python 3.10+) +``` + +**Critical prerequisite — Java 11+:** +All installation paths require Java 11 or higher. Python and Node.js wrappers spawn a JVM internally. Verify with `java -version`. The authoritative current floor is `maven.compiler.source` in `java/pom.xml`; if that bumps, this skill must be updated (see `CLAUDE.md`). + +If Java is missing or below version 11: +> "Java 11 or higher is required. 
Please install a JDK for your environment." + +Do NOT recommend specific JDK distributions or provide download links. + +--- + +### 2B. Local vs. Hybrid Architecture + +> Load `references/hybrid-guide.md` when the user needs detailed hybrid server setup. + +**Decision tree — select the right processing mode:** + +``` +PDF characteristics: +│ +├── Digital PDF + clear bordered tables +│ └── Local only, --table-method default (fastest, no server needed) +│ +├── Digital PDF + borderless or complex tables +│ └── --table-method cluster (local, slightly slower) +│ └── Still insufficient? → --hybrid docling-fast +│ +├── Scanned / image-only PDF +│ └── --hybrid docling-fast (+ server started with --force-ocr) +│ +├── Non-English scanned PDF +│ └── --hybrid docling-fast (+ server --force-ocr --ocr-lang "ko,en") +│ +├── Mathematical formulas +│ └── --hybrid docling-fast --hybrid-mode full +│ (+ server --enrich-formula) +│ +├── Charts needing descriptions +│ └── --hybrid docling-fast --hybrid-mode full +│ (+ server --enrich-picture-description) +│ +└── Mixed batch (unknown PDF types) + └── --hybrid docling-fast (auto triage routes pages automatically) +``` + +**When hybrid mode is selected, remind the user:** +The hybrid server must be running before conversion starts. Quick start: + +```bash +# Terminal 1: start the server +opendataloader-pdf-hybrid --port 5002 + +# Terminal 2: run conversion +opendataloader-pdf input.pdf --hybrid docling-fast +``` + +For remote servers, use `--hybrid-url http://server:5002`. + +**Pre-flight check** — before the first hybrid run of a session, confirm the server is reachable with `scripts/hybrid-health.sh` (exit 0 if ready). This catches "connection refused" before a full conversion attempt and is cheaper than parsing a failed client log. + +--- + +### 2C. Output Format Selection + +> Load `references/format-guide.md` when the user needs format-specific details. 
+ +**Decision tree — match format to downstream use:** + +``` +Downstream use: +├── RAG + source citation / page-level tracing needed +│ └── json (includes bounding boxes, page numbers, element types) +│ +├── RAG text chunking without spatial metadata +│ └── markdown +│ +├── LangChain document loader +│ └── langchain-opendataloader-pdf (format=text, returns LangChain Document objects) +│ +├── Web display +│ └── html +│ +├── Extraction quality debugging +│ └── pdf + json (annotated PDF shows bounding boxes; JSON has element data) +│ +├── Plain text search / indexing +│ └── text +│ +└── Text with embedded or referenced images + └── markdown-with-images +``` + +Multiple formats can be requested in one pass: + +```bash +opendataloader-pdf input.pdf --format json,markdown,html +``` + +--- + +### 2D. Option Combination + +> For full option reference, see `references/options-matrix.md`. If this project's `options.json` is available, it is the authoritative source of truth. Options in `options.json` not found in `options-matrix.md` are newly added options. + +**Common option combinations by use case:** + +| Use case | Recommended options | +|----------|---------------------| +| RAG pipeline, digital PDFs | `--format json --use-struct-tree` | +| RAG pipeline, mixed PDFs | `--format json --hybrid docling-fast` | +| Scanned PDF batch | `--hybrid docling-fast --format markdown --quiet` | +| Formula-heavy academic PDF | `--hybrid docling-fast --hybrid-mode full --format markdown` (server: `--enrich-formula`) | +| Web publishing | `--format html --image-output embedded` | +| Debugging table quality | `--format json,pdf --table-method cluster` | +| Page-range extraction | `--format markdown --pages "1,3,5-10"` | +| Sensitive data pipeline | `--format json --sanitize` | + +--- + +## Phase 3: EXECUTE + +Two modes of operation depending on user intent. + +### 3A. Guide Mode + +When the user wants ready-to-run commands but will execute them manually. 
+ +Generate a complete, copy-pasteable command for the interface they are using. The CLI pattern is: + +```bash +opendataloader-pdf input.pdf \ + --format markdown \ + --output-dir ./output \ + --hybrid docling-fast \ + --quiet +``` + +For Python, Node.js, LangChain, or Java (Maven), load `references/integration-examples.md` and return the matching snippet. That file contains batch-safe patterns for each language (each `convert()` spawns a JVM — see Gotcha 3). + +### 3B. Action Mode + +When the user says "convert", "extract", "run", "process", or similar action verbs — execute the conversion directly. + +**A1. Check environment** + +Run detect-env.sh. Verify: +- `ODL_INSTALLED=true` — if false, install first (Phase 2A) +- `JAVA` is version 11 or higher — if missing or below, stop and show the Java requirement message + +**A2. Determine PDF characteristics** + +If not already known from Phase 1, inspect the PDF: +- Check file size relative to page count (large file = likely image-heavy or scanned) +- Ask or infer: digital vs. scanned, table complexity, formula presence + +**A3. Auto-select options** + +Apply the decision trees from Phase 2B and 2C. Construct the command. + +**A4. Show command, get approval, execute** + +Always show the generated command to the user and ask for confirmation before running: + +``` +I'll run the following command: + + opendataloader-pdf document.pdf --format json,markdown --hybrid docling-fast + +Proceed? (yes/no) +``` + +If the user confirms, execute. Stream output to the terminal. + +**A5. Verify results** + +After execution: +- Check that output files were created in the expected directory +- For JSON output: confirm element count is non-zero +- If errors occurred or output looks wrong → Phase 4 + +--- + +## Phase 4: DIAGNOSE + +When extraction quality is inadequate. Start with measurement, then escalate. + +### 4A. 
Measure Quality + +Run the quick evaluation script against your output: + +```bash +python skills/odl-pdf/scripts/quick-eval.py output/document.md ground-truth.md +``` + +Or run the full benchmark to get NID, TEDS, and MHS scores: + +```bash +bash scripts/bench.sh --doc-id +``` + +**Metric reference:** + +| Metric | Measures | Low score means | +|--------|----------|-----------------| +| NID | Reading order accuracy | Content is out of sequence | +| TEDS | Table structure accuracy | Tables are malformed or merged | +| MHS | Heading hierarchy accuracy | Section structure is wrong | +| Table Detection F1 | Table region detection | Tables are missed or over-detected | + +### 4B. Diagnosis by Symptom + +**Tables are malformed or missing structure:** +``` +Step 1: Switch table method + --table-method cluster + (detects borderless tables using spatial clustering) + +Step 2: If still failing, add hybrid backend + --hybrid docling-fast + (uses AI-based table detection) + +Step 3: Inspect with annotated PDF + --format json,pdf + (annotated PDF shows detected table bounding boxes) +``` + +**Reading order is wrong (content out of sequence):** +``` +Step 1: Check if PDF is tagged (has structure tree) + Add: --use-struct-tree + (uses PDF's built-in reading order metadata if present) + +Step 2: If PDF is multi-column, xycut algorithm should handle it + Verify: --reading-order xycut (this is the default) + +Step 3: Check for scanned PDF + If scanned: --hybrid docling-fast --force-ocr (on server) +``` + +**Text is garbled or contains replacement characters:** +``` +Step 1: Check for encoding issues + Add: --replace-invalid-chars "?" 
(overrides the default space so + CID-decode failures stand out visually instead of blending + into whitespace — distinguishes font-encoding problems from + true scan artifacts at a glance) + +Step 2: If it's a scanned PDF + Switch to: --hybrid docling-fast (+ server --force-ocr) + +Step 3: For non-Latin scripts + Add: --ocr-lang "ja,en" (on hybrid server startup) +``` + +**Formulas are not extracted:** +``` +Requirements check: + - Client must use: --hybrid docling-fast --hybrid-mode full + - Server must be started with: --enrich-formula + - Both conditions are required — one without the other silently skips enrichment +``` + +**Images have no descriptions:** +``` +Requirements check: + - Client must use: --hybrid docling-fast --hybrid-mode full + - Server must be started with: --enrich-picture-description + - Same pattern as formula enrichment +``` + +**Hidden or unexpected text in output:** +``` +Content safety filters are active by default. +To inspect raw content: --content-safety-off all +To selectively disable: --content-safety-off hidden-text,off-page +``` + +### 4C. Escalation Path + +``` +Quality escalation (in order): +1. Local defaults → fastest, least accurate for complex PDFs +2. --table-method cluster → better borderless table detection (local) +3. --hybrid docling-fast → AI-powered, auto-triage (hybrid) +4. --hybrid-mode full → all pages go to backend (no triage, maximum accuracy) +5. Full benchmark → measure NID/TEDS/MHS to identify specific weak points +``` + +--- + +## Phase 5: OPTIMIZE + +For production pipelines processing large volumes. + +### 5A. Batch Processing + +**The single most impactful optimization: batch all files in one call.** + +Each `convert()` call spawns a JVM. Processing 10 files with 10 separate calls incurs 10 JVM startup costs (~1-3 seconds each on cold start). 
+ +```python +# Wrong — 10 JVM startups +for pdf in pdf_files: + converter.convert([pdf]) + +# Correct — 1 JVM startup, parallel page processing inside +converter.convert(pdf_files) +``` + +The Java core uses `ForkJoinPool` with `availableProcessors` for within-batch parallelism. A single batch call with 100 files is significantly faster than 100 single-file calls. + +### 5B. Hybrid Server Tuning + +**Timeout configuration** — prevent slow backend pages from blocking the pipeline: + +```bash +# Client: set a 30-second timeout per page request +opendataloader-pdf input.pdf --hybrid docling-fast --hybrid-timeout 30000 +``` + +**Fallback behavior** — fall back to Java extraction on backend errors: + +```bash +opendataloader-pdf input.pdf \ + --hybrid docling-fast \ + --hybrid-timeout 30000 \ + --hybrid-fallback +``` + +With `--hybrid-fallback`, pages that time out or cause server errors are processed locally by Java instead of failing the entire document. + +**Remote server** — for multi-machine deployments: + +```bash +# Start server on a GPU machine +opendataloader-pdf-hybrid --port 5002 + +# Clients point to it +opendataloader-pdf input.pdf \ + --hybrid docling-fast \ + --hybrid-url http://gpu-server:5002 +``` + +### 5C. LangChain RAG Pipeline + +The recommended RAG architecture is load → chunk on structural separators (`\n## `, `\n### `) → embed → index. Use `format="json"` instead of `"text"` when you need bounding boxes in metadata for source citation. + +Full pipeline code (loader + splitter + vector store): see `references/integration-examples.md` § LangChain § Full RAG pipeline. + +### 5D. 
Output Pipeline Options + +Common operational flags (details in `references/integration-examples.md` § Output Pipeline Patterns): + +- `--quiet` — suppress progress output for automated pipelines +- `--to-stdout` — write a single format to stdout for piping +- `--pages "1,3,5-10"` — restrict processing to a page range +- `--markdown-page-separator` / `--text-page-separator` / `--html-page-separator` — inject a custom marker between pages for downstream splitting (supports `%page-number%`) + +### 5E. Large PDFs in Hybrid Mode + +Since 2.2.1 the Java client automatically chunks backend-routed pages into 50-page windows before sending them to the hybrid server. A 200-page scanned PDF in `--hybrid-mode full` will no longer hang the backend, and users migrating from earlier versions no longer need to manually split large documents. This is transparent — no flag required. See `references/hybrid-guide.md` § Performance Notes. + +--- + +## Critical Gotchas + +These three issues cause the majority of user-reported problems. Check these before diving deeper into any diagnosis. + +### Gotcha 1: Java 11+ Is Always Required + +**Every installation path requires Java 11 or higher.** Python packages, Node.js packages, and the CLI all spawn a JVM internally. There is no pure-Python or pure-JavaScript path. The authoritative current floor is `maven.compiler.source` in `java/pom.xml`; this skill is updated when that bumps. + +**Symptom:** `java.lang.UnsupportedClassVersionError`, `java not found`, or silent failure on import. + +**Resolution:** `java -version` must show version 11 or higher. + +If Java is missing or below version 11: +> "Java 11 or higher is required. Please install a JDK for your environment." + +Do NOT recommend specific distributions or provide download links. 
+ +--- + +### Gotcha 2: Enrichment Options Require --hybrid-mode full + +**`--enrich-formula` and `--enrich-picture-description` are server-side enrichments that only run in full mode.** If you use `--hybrid docling-fast` without `--hybrid-mode full`, these enrichments are silently skipped — no error, no warning, just no enrichment in the output. + +**Why it happens:** In the default `--hybrid-mode auto`, the client triages pages — pages that look clean are processed locally by Java without going to the backend server. Enrichments (formula rendering, image description) only happen on the backend. So triage-mode pages never get enriched. + +**Fix:** Always pair enrichment flags with `--hybrid-mode full`: + +```bash +# Client +opendataloader-pdf input.pdf \ + --hybrid docling-fast \ + --hybrid-mode full \ + --format markdown + +# Server (started separately) +opendataloader-pdf-hybrid --port 5002 --enrich-formula +``` + +--- + +### Gotcha 3: One Batch Call, Not N Single-File Calls + +**Each `convert()` call in Python/Node, or each CLI invocation, starts a new JVM.** If you process N files with N separate calls, you pay N JVM startup costs. On typical hardware this is 1-3 seconds per cold start. + +**Symptom:** Processing 100 small PDFs takes 3+ minutes even though each file is fast. + +**Fix:** Pass all files to a single `convert()` call. The Java core handles parallelism internally. + +```python +# Wrong +for pdf_path in pdf_list: + result = converter.convert([pdf_path]) # N JVM starts + +# Correct +results = converter.convert(pdf_list) # 1 JVM start, parallel processing +``` + +For CLI batch processing, prefer a glob pattern or a file list argument over shell loops. + +--- + +## Option Reference + +This skill reasons about every CLI option declared in `options.json` without loading the full descriptions. 
When the user needs option details, defaults, or interactions, load `references/options-matrix.md` (grouped by IO / Quality / Safety / Hybrid / Output / Text categories, with common combination recipes). + +Authoritative source order: + +1. `options.json` in the project root — always current, regenerated by `npm run sync` when CLI options change +2. `references/options-matrix.md` — annotated reference with examples. Options in `options.json` not yet in the matrix are newly added; treat `options.json` as ground truth + +--- + +## Reference Files + +Load these files progressively — only when entering the relevant topic. Do not load all references at session start. + +| File | Load when | +|------|-----------| +| `references/installation-matrix.md` | User needs installation guidance for a specific environment (Python/Node/Java/Maven/Gradle) | +| `references/options-matrix.md` | User needs detailed option documentation, defaults, or interactions | +| `references/hybrid-guide.md` | User needs hybrid server setup, server-side flags, or remote deployment | +| `references/format-guide.md` | User needs output format comparison, format-specific behavior, or format selection | +| `references/eval-metrics.md` | User needs detailed metric definitions (NID, TEDS, MHS), benchmark scores, or diagnostic steps by metric | +| `references/integration-examples.md` | User needs copy-pasteable code for CLI / Python / Node.js / LangChain / Java / remote hybrid server | +| `scripts/detect-env.sh` | Phase 1 environment detection — run at session start | +| `scripts/hybrid-health.sh` | Phase 2B / Phase 5B — confirm the hybrid server is reachable before running a hybrid conversion | +| `scripts/quick-eval.py` | Phase 4 quality measurement — run when diagnosing extraction quality | +| `evals/` | Benchmark baselines and regression thresholds | + +--- + +## Quality Metrics Reference + +Five metrics are reported by `scripts/bench.sh`: **NID** (reading order), **TEDS** (table structure), **MHS** 
(heading hierarchy), **Table Detection F1** (table region precision/recall), and **Speed** (pages/second). All four quality metrics range 0–1, higher is better. + +Full definitions, failure modes, and metric-specific escalation paths: `references/eval-metrics.md`. + +Bench commands: + +```bash +bash scripts/bench.sh # full suite +bash scripts/bench.sh --doc-id # debug one document +bash scripts/bench.sh --check-regression # CI threshold check +``` + +--- + +## Session Checklist + +Use this as a mental checklist for any extraction request: + +- [ ] Phase 1: Run detect-env.sh or ask about environment +- [ ] Phase 1: Know the PDF type (digital/scanned/mixed) +- [ ] Phase 1: Know the downstream use case +- [ ] Phase 2: Confirm runtime floors (Java 11+ always; Python 3.10+ if pip path; Node.js 20.19+ if npm path) +- [ ] Phase 2: Selected local vs. hybrid based on PDF type +- [ ] Phase 2: Selected output format based on downstream use +- [ ] Phase 3: Generated or executed the command +- [ ] Phase 3: Verified output files exist and are non-empty +- [ ] If quality issues: Phase 4 — measure NID/TEDS/MHS before escalating +- [ ] If enrichment needed: confirmed `--hybrid-mode full` is set on client +- [ ] If batch processing: confirmed all files passed in one `convert()` call diff --git a/skills/odl-pdf/evals/evals.json b/skills/odl-pdf/evals/evals.json new file mode 100644 index 000000000..49478f763 --- /dev/null +++ b/skills/odl-pdf/evals/evals.json @@ -0,0 +1,165 @@ +{ + "version": "1.0", + "skill": "odl-pdf", + "evals": [ + { + "id": "eval-001", + "scenario": "A data engineer is building a RAG pipeline over 500 scientific papers and needs to preserve source citations (page and region) for each retrieved chunk. They ask which mode and format to use.", + "user_input": "I need to process 500 scientific papers for a RAG pipeline. I need to know exactly which page and region each chunk came from for source citation. 
What's the best setup?", + "expected_recommendations": [ + "Use hybrid mode for best accuracy on scientific papers", + "Use json format (or json combined with markdown) because JSON output includes bounding boxes per element", + "Mention bounding boxes as the mechanism for source citation", + "Recommend batching all files in a single convert() call rather than looping" + ], + "must_mention": [ + "hybrid", + "json", + "bounding box", + "batch" + ], + "must_not_mention": [ + "text format as primary recommendation", + "loop convert() for each file separately without warning" + ] + }, + { + "id": "eval-002", + "scenario": "A developer on an M1 Mac needs to process Korean government PDFs, which are scanned image-based documents with mixed Korean and English text. They do not specify their OS or hardware unless asked.", + "user_input": "I'm on an M1 Mac and need to parse Korean government PDFs. They're scanned documents with both Korean and English text.", + "expected_recommendations": [ + "Use hybrid mode with OCR enabled (--force-ocr) because the documents are scanned", + "Set --ocr-lang to 'ko,en' for mixed-language OCR", + "Confirm Java is installed (java -version) as a prerequisite", + "Two terminals required: one for the hybrid server, one for the client" + ], + "must_mention": [ + "hybrid", + "--force-ocr", + "--ocr-lang", + "ko,en", + "java" + ], + "must_not_mention": [ + "local mode as sufficient for scanned PDFs", + "GPU required", + "Adoptium", + "Temurin", + "Zulu", + "SDKMAN", + "brew install --cask" + ] + }, + { + "id": "eval-003", + "scenario": "A user reports that tables in their extracted output are broken — cells are merged incorrectly and some borderless tables are completely missing. They are currently using local mode with default settings.", + "user_input": "The tables in my extracted output look broken. Cells are getting merged together and some tables are missing entirely. 
I'm using the default settings.", + "expected_recommendations": [ + "Diagnose using the TEDS metric to confirm it is a table quality issue", + "First escalation: try --table-method cluster for borderless table detection", + "Second escalation: switch to hybrid mode (--hybrid docling-fast) with auto triage", + "Third escalation: use --hybrid-mode full to force all pages through the AI backend" + ], + "must_mention": [ + "--table-method cluster", + "hybrid", + "TEDS" + ], + "must_not_mention": [ + "this is a known limitation with no workaround", + "--use-struct-tree as a table fix" + ] + }, + { + "id": "eval-004", + "scenario": "A Node.js developer on Windows wants to use hybrid mode. They are unfamiliar with the two-process architecture and expect a single npm install to be sufficient.", + "user_input": "I'm using Node.js on Windows and want to set up hybrid mode. I installed @opendataloader/pdf but I'm not sure what else I need.", + "expected_recommendations": [ + "Explain that hybrid mode requires a separate Python server process (opendataloader-pdf-hybrid)", + "Provide a two-terminal setup: Terminal 1 for the Python hybrid server, Terminal 2 for the Node.js client", + "Include the pip install command for the server component", + "Confirm Java 11+ is required as a prerequisite" + ], + "must_mention": [ + "pip install", + "opendataloader-pdf-hybrid", + "Terminal 1", + "Terminal 2", + "java" + ], + "must_not_mention": [ + "hybrid mode works with npm install alone", + "GPU required for basic hybrid setup" + ] + }, + { + "id": "eval-005", + "scenario": "A researcher processing math-heavy academic papers wants both LaTeX formula extraction and AI-generated descriptions of charts and figures. They ask what settings are needed.", + "user_input": "I'm processing academic papers with math formulas and charts. I need the formulas extracted as LaTeX and I want AI descriptions of the charts and figures. 
How do I set this up?", + "expected_recommendations": [ + "Start the hybrid server with both --enrich-formula and --enrich-picture-description flags", + "Run the client with --hybrid-mode full (required for enrichments to apply)", + "Warn that enrichments are silently skipped if --hybrid-mode full is omitted from the client command", + "Use --hybrid docling-fast as the backend" + ], + "must_mention": [ + "--enrich-formula", + "--enrich-picture-description", + "--hybrid-mode full", + "hybrid" + ], + "must_not_mention": [ + "enrichments work in auto mode", + "enrichments are client-side options", + "SmolVLM", + "--picture-description-prompt", + "--enrich-formula-model", + "--enrich-picture-model" + ] + }, + { + "id": "eval-006", + "scenario": "A user installed opendataloader-pdf via pip on a fresh machine without a JDK, ran their first conversion, and got an UnsupportedClassVersionError. They paste the error and ask what is wrong. This is the Java-missing failure mode the skill's first Critical Gotcha exists to handle.", + "user_input": "I just ran `pip install opendataloader-pdf` and then `opendataloader-pdf input.pdf` and got `java.lang.UnsupportedClassVersionError`. What's wrong?", + "expected_recommendations": [ + "Identify the root cause: Java 11 or higher is required, and the installed Java is missing or below version 11", + "Tell the user to verify with `java -version`", + "Tell the user to install a JDK 11 or higher for their platform", + "Do NOT recommend a specific JDK distribution (Adoptium, Temurin, Zulu, OpenJDK download URLs, brew/apt one-liners) — neutral guidance only" + ], + "must_mention": [ + "Java 11", + "java -version" + ], + "must_not_mention": [ + "Adoptium", + "Temurin", + "Zulu", + "OpenJDK download", + "brew install --cask", + "apt install openjdk", + "this is a bug in opendataloader-pdf" + ] + }, + { + "id": "eval-007", + "scenario": "A user has a password-protected PDF and asks how to extract it. 
This exercises the `--password` / `-p` option. The correct answer must surface the option without claiming the tool cannot handle encrypted PDFs.", + "user_input": "I have a password-protected PDF I need to extract. The password is 'secret123'.", + "expected_recommendations": [ + "Surface the --password (short: -p) CLI option as the correct mechanism", + "Show a concrete command example using --password or -p with the supplied value", + "Do NOT claim the tool cannot extract encrypted PDFs" + ], + "must_mention": [ + "--password", + "secret123" + ], + "must_not_mention": [ + "cannot extract encrypted PDFs", + "encrypted PDFs are not supported", + "decryption is not supported", + "you need to remove the password first" + ] + } + ] +} diff --git a/skills/odl-pdf/references/eval-metrics.md b/skills/odl-pdf/references/eval-metrics.md new file mode 100644 index 000000000..e95b0bfdd --- /dev/null +++ b/skills/odl-pdf/references/eval-metrics.md @@ -0,0 +1,207 @@ +# Evaluation Metrics Reference + +This document explains the metrics used in opendataloader-pdf benchmarks, how to interpret them, and how to diagnose quality problems using them. + +--- + +## Metrics + +### NID — Normalized Indel Distance + +**What it measures:** Reading order accuracy. Quantifies how well the extracted text preserves the correct reading sequence compared to the ground truth. + +**Intuition:** A PDF with two side-by-side columns must interleave text in the right column order, not left-to-right line by line across both columns. NID penalizes any reordering of the logical reading sequence. + +**Range:** 0–1. Higher is better. A score of 1.0 means extracted order exactly matches ground truth. + +**Typical failure modes:** Multi-column layouts, tables with merged cells, footnotes that appear inline, sidebars. + +--- + +### TEDS — Tree-Edit Distance Similarity + +**What it measures:** Table structure accuracy. 
Measures the structural similarity between extracted table trees and ground-truth table trees using tree edit distance. + +**Intuition:** A table with 3 rows and 4 columns must be reconstructed with the correct cell boundaries, spanning cells, and hierarchy. TEDS counts the minimum number of insertions, deletions, and substitutions needed to convert the extracted tree into the ground truth, then normalizes by tree size. + +**Range:** 0–1. Higher is better. A score of 1.0 means the extracted table structure is identical to ground truth. + +**Typical failure modes:** Borderless tables, merged/spanning cells, nested tables, tables that are actually images. + +--- + +### MHS — Markdown Heading Similarity + +**What it measures:** Heading structure accuracy. Measures how well the extracted heading hierarchy (h1, h2, h3) matches the ground truth. + +**Intuition:** A document with a clear section/subsection structure should produce headings at the correct levels. MHS compares the heading tree of the extracted output against the ground truth, penalizing both missing headings and incorrect level assignments. + +**Range:** 0–1. Higher is better. A score of 1.0 means all headings are correctly detected and assigned to the right level. + +**Typical failure modes:** PDFs that simulate headings using bold text (no semantic markup), decorative section dividers, heading text embedded in images. + +--- + +### Table Detection F1 + +**What it measures:** Precision and recall of table boundary detection. Precision = fraction of detected tables that are real tables. Recall = fraction of real tables that were detected. + +**Intuition:** F1 is the harmonic mean of precision and recall, balancing false positives (detecting non-tables as tables) against false negatives (missing tables entirely). Unlike TEDS, Table Detection F1 does not evaluate the internal structure — only whether the table region was found. + +**Range:** 0–1. Higher is better. 
+
+**Typical failure modes:** Dense text blocks that resemble tables, tables that span page boundaries, very small tables.
+
+---
+
+### Speed
+
+**What it measures:** Processing time, in seconds per page (inverse throughput).
+
+**Interpretation:** Lower is better. Relative shape:
+
+- **Local (no hybrid)**: fastest — pure Java layout analysis
+- **Hybrid `auto`**: varies with document complexity; most pages stay at Java speed, only triaged pages pay the backend round-trip
+- **Hybrid `full`**: slowest — every page goes to the backend
+
+Speed is not normalized to 0–1. It is an absolute wall-clock measurement averaged over the benchmark document set. For current numbers, run `./scripts/bench.sh` — published scores can lag the latest code.
+
+---
+
+## Benchmark Reference Scores
+
+Run `./scripts/bench.sh` to produce the current scores against the benchmark document set maintained in [opendataloader-bench](https://github.com/opendataloader-project/opendataloader-bench) (200 real-world PDFs including multi-column layouts and scientific papers).
+
+Per-metric output shape:
+
+- **Overall** — the average of NID / TEDS / MHS
+- **NID** / **TEDS** / **MHS** / **Table Detection F1** — 0–1 scale, higher is better
+- **Speed** — absolute seconds per page
+
+Table Detection F1 is reported per-document and is not folded into the Overall average.
+
+Hardcoded snapshot scores are intentionally not reproduced here — they drift whenever the bench is rerun against updated extraction code or benchmark documents. The authoritative current values live in the bench output; the opendataloader-bench README also publishes periodic snapshots. See its methodology section for reference-score context.
+
+---
+
+## Diagnostic Guide: Which Metric Is Weak?
+
+Use this guide when extraction quality is below expectations. Start by identifying which metric is low, then follow the recommended steps.
+ +--- + +### Low NID — Reading Order Problems + +**Symptoms:** Text from different columns or sections is interleaved incorrectly. Paragraphs appear out of sequence. Footnotes appear in the wrong position. + +**Steps:** + +1. Check if the PDF is tagged. If it is, try `--use-struct-tree`. Tagged PDFs contain an explicit reading order tree that is usually more reliable than layout analysis. + + ```bash + opendataloader-pdf input.pdf --use-struct-tree + ``` + +2. For multi-column layouts, verify that the XY-Cut algorithm is active (it is the default). Ensure `--reading-order xycut` is set. + +3. For complex layouts where XY-Cut still fails, route the document through hybrid mode — the AI backend handles unusual layouts more robustly. + + ```bash + opendataloader-pdf --hybrid docling-fast input.pdf + ``` + +--- + +### Low TEDS — Table Quality Problems + +**Symptoms:** Tables are extracted as plain text. Cells are merged incorrectly. Columns are misaligned. Borderless tables are missed entirely. + +**Escalation path — try each step in order and stop when quality is acceptable:** + +1. **Enable cluster detection.** The default table method detects bordered tables. The `cluster` method adds borderless table detection. + + ```bash + opendataloader-pdf input.pdf --table-method cluster + ``` + +2. **Switch to hybrid mode.** If `cluster` is insufficient, route the document through the AI backend. Use `auto` mode first — it sends complex pages to the backend while keeping simple pages on the fast local path. + + ```bash + opendataloader-pdf --hybrid docling-fast input.pdf + ``` + +3. **Use hybrid full mode.** If `auto` mode still misses tables (because the triage step classifies them as simple), force all pages through the backend. 
+ + ```bash + opendataloader-pdf --hybrid docling-fast --hybrid-mode full input.pdf + ``` + +--- + +### Low MHS — Heading Detection Problems + +**Symptoms:** Document headings are not recognized, appear as plain paragraphs, or are assigned to the wrong level (e.g., h1 instead of h2). + +**Steps:** + +1. Check whether the PDF uses real headings or simulated headings. Real headings are marked semantically in the PDF (large font, bold, specific style). Simulated headings are visually similar but have no semantic markup — they are just bold text at a larger font size, indistinguishable from the tool's perspective. + + - To check: open the PDF in a reader that exposes the tag tree (Adobe Acrobat > Accessibility > Reading Order, or use a preflight tool). If there is no tag tree, the headings are visual only. + +2. If the PDF is tagged and headings are still missed, try `--use-struct-tree`. This reads semantic structure directly from the PDF's tag tree. + + ```bash + opendataloader-pdf input.pdf --use-struct-tree + ``` + +3. If the PDF is untagged and headings are simulated with bold text, the heading structure cannot be recovered reliably from layout alone. Consider whether hybrid mode improves detection for your specific document class. + +--- + +### Low Table Detection F1 — Table Region Problems + +**Symptoms:** Tables are missed entirely (low recall) or non-table regions such as dense text blocks are incorrectly flagged as tables (low precision). + +**Steps:** + +1. **Inspect with an annotated PDF** to see which regions are being detected as tables and which real tables are being missed. The `pdf` output format overlays bounding boxes on a copy of the input. + + ```bash + opendataloader-pdf input.pdf --format json,pdf + ``` + + Combine with `json` so you can correlate each visual box with its element data. + +2. **If real tables are being missed (low recall):** enable borderless detection and, if needed, escalate to the hybrid backend. 
See the Low TEDS steps above — the same escalation path (`--table-method cluster` → `--hybrid docling-fast` → `--hybrid-mode full`) improves region detection as well as internal structure. + +3. **If non-table regions are being detected (low precision):** this usually indicates dense multi-column text is being classified as tabular. Check that `--reading-order xycut` is active (it is the default) so column structure is recognised before table detection runs. + +--- + +## Running Benchmarks + +### Full benchmark suite + +```bash +./scripts/bench.sh +``` + +This script automatically clones [opendataloader-bench](https://github.com/opendataloader-project/opendataloader-bench) (which contains the benchmark PDFs and evaluation logic), runs extraction across all documents, and prints scores for each metric. + +Additional flags: + +```bash +# Debug a specific document by ID +./scripts/bench.sh --doc-id + +# CI mode: check against regression thresholds and exit non-zero on failure +./scripts/bench.sh --check-regression +``` + +### Quick eval on your own documents + +```bash +python skills/odl-pdf/scripts/quick-eval.py extracted.md ground-truth.md +``` + +This script compares an extracted file against a ground truth reference using text similarity (difflib by default, rapidfuzz if available). It reports a similarity score with pass/fail against a configurable threshold (default 0.85). Use `--verbose` for diff snippets. diff --git a/skills/odl-pdf/references/format-guide.md b/skills/odl-pdf/references/format-guide.md new file mode 100644 index 000000000..8715483c2 --- /dev/null +++ b/skills/odl-pdf/references/format-guide.md @@ -0,0 +1,63 @@ +# Output Format Guide + +opendataloader-pdf supports 7 output formats via the `format` option. This guide helps you choose the right format for your use case. + +> This file documents the 2.2.1 snapshot (matching SKILL.md `# Documented against`). 
If the project's `options.json` lists a format not covered here, that file is the authoritative source — newer releases may add values this guide has not caught up to yet.
+
+## Format Overview
+
+| Format | Best For | Bounding Boxes | Tables | Images |
+|---|---|---|---|---|
+| `json` | Programmatic processing, source citation | Yes | Structured | As references |
+| `text` | Plain text extraction, search indexing | No | Flattened | Omitted |
+| `html` | Web display | No | Native `<table>` | Inline |
+| `pdf` | Visual debugging of extraction results | Yes (annotated) | Highlighted | Preserved |
+| `markdown` | Documentation, RAG chunking | No | Markdown syntax | Omitted |
+| `markdown-with-html` | Complex tables in Markdown | No | HTML `<table>
` | Omitted | +| `markdown-with-images` | Documentation with visuals | No | Markdown syntax | Embedded/external | + +## Downstream Use Mapping + +Choose your format based on what you're building: + +| Use Case | Recommended Format | Notes | +|---|---|---| +| RAG + source citation | `json` | Bounding boxes enable precise page/region references | +| RAG text chunking | `markdown` | Clean structure maps well to chunk boundaries | +| LangChain integration | `text` | Use with `langchain-opendataloader-pdf`; format=text is the default | +| Web display | `html` | Renders natively in browsers | +| Quality / extraction debugging | `pdf` + `json` | Annotated PDF shows what was detected; JSON shows coordinates | +| Plain text search | `text` | Smallest output, no markup overhead | +| Documentation with images | `markdown-with-images` | Images embedded inline or written to a directory | +| Complex table fidelity | `markdown-with-html` | Falls back to HTML tables where Markdown syntax loses structure | + +## Related Options + +These options affect output when using image-bearing or multi-page formats: + +- `image-output` — Controls whether images are off, embedded (base64), or written to external files. Values: `off`, `embedded`, `external` (default). +- `image-format` — Image encoding format for extracted images. Values: `png` (default), `jpeg`. +- `image-dir` — Directory path for externalized images when `image-output=external`. +- `*-page-separator` — Format-specific option to insert a custom separator between pages (e.g., `markdown-page-separator`, `text-page-separator`). + +## Tips + +**Multiple formats in one call** + +You can produce multiple formats in a single invocation by passing a comma-separated list: + +``` +opendataloader-pdf input.pdf --format markdown,json +``` + +This avoids parsing the PDF twice and ensures both outputs are consistent. + +**Piping output with `--to-stdout`** + +Use `--to-stdout` to write output directly to standard output instead of a file. 
Useful for piping into other tools: + +``` +opendataloader-pdf input.pdf --format text --to-stdout | my-indexer +``` + +Note: When using `--to-stdout` with multiple formats, only single-format output is supported. diff --git a/skills/odl-pdf/references/hybrid-guide.md b/skills/odl-pdf/references/hybrid-guide.md new file mode 100644 index 000000000..b9e71029f --- /dev/null +++ b/skills/odl-pdf/references/hybrid-guide.md @@ -0,0 +1,179 @@ +# Hybrid Mode Reference Guide + +Hybrid mode extends opendataloader-pdf by routing complex PDF pages to an external AI backend while keeping simple pages on the fast local Java path. This gives you the speed of the Java engine for most content, with AI-quality output for tables, scanned pages, formulas, and charts. + +--- + +## Overview + +By default, opendataloader-pdf processes everything locally in Java. Hybrid mode adds a second processing path — a built-in Python server (`opendataloader-pdf-hybrid`) that uses the docling library internally — and routes pages between the two based on complexity. + +**When you need hybrid mode:** + +- PDFs with scanned or image-based pages (OCR required) +- Complex table structures that the Java heuristics miss +- Documents containing mathematical formulas (LaTeX extraction) +- Charts or images that need AI-generated descriptions +- Non-English documents requiring language-specific OCR + +--- + +## Quick Setup + +Hybrid mode requires two running processes: the server and the client. 
+ +**Terminal 1 — Start the hybrid server:** + +```bash +# Install with hybrid extras (includes the server) +pip install "opendataloader-pdf[hybrid]" + +# Start the hybrid server (port 5002) +opendataloader-pdf-hybrid --port 5002 +``` + +**Terminal 2 — Run the client:** + +```bash +# Basic hybrid: per-page triage, docling-fast backend +opendataloader-pdf --hybrid docling-fast input.pdf + +# Full mode: send all pages to the backend +opendataloader-pdf --hybrid docling-fast --hybrid-mode full input.pdf +``` + +The client connects to `http://localhost:5002` by default. No additional configuration is needed for a local setup. + +--- + +## Triage Modes + +Control how pages are routed with `--hybrid-mode`. + +| Mode | Flag | Behavior | +|------|------|----------| +| auto | `--hybrid-mode auto` | Per-page triage. Simple pages stay on Java; complex pages go to the backend. **Default.** | +| full | `--hybrid-mode full` | All pages go to the backend. Required for enrichment features. | + +### When to use `auto` + +`auto` is the default and works well for mixed documents. The triage strategy is conservative: it prefers to send borderline pages to the backend (minimizing missed complex content) at the cost of some extra backend calls. + +Expected throughput shape: +- Simple pages (Java path): fastest +- Complex pages (backend path): varies by content and hardware +- Overall for a mixed document: between the two extremes + +For current numbers, run `./scripts/bench.sh`. + +### When to use `full` + +Use `full` when you need enrichment features (`--enrich-formula`, `--enrich-picture-description`) or when the entire document is scanned and you want consistent OCR output across all pages. + +Expected throughput with `full`: noticeably slower than Java-only or `auto`, depending on backend and GPU availability. Run `./scripts/bench.sh` for current per-page timings. 
+
+> **Important:** `--enrich-formula` and `--enrich-picture-description` are server-side options, but they only take effect when the client is running with `--hybrid-mode full`. In `auto` mode, enrichments are silently skipped — no warning or error is shown. If your output is missing formulas or image descriptions, check that you have `--hybrid-mode full` set on the client side.
+
+---
+
+## Client Options
+
+| Option | Values | Default | Description |
+|--------|--------|---------|-------------|
+| `--hybrid <backend>` | `off`, `docling-fast` | `off` | Select the backend. `off` disables hybrid mode entirely. |
+| `--hybrid-mode <mode>` | `auto`, `full` | `auto` | Page routing strategy. |
+| `--hybrid-url <url>` | Any URL | `http://localhost:5002` | Override the server URL for remote or non-default setups. |
+| `--hybrid-timeout <ms>` | Integer | `0` (no timeout) | Request timeout in milliseconds. `0` means no timeout. |
+| `--hybrid-fallback` | Flag | Disabled | Fall back to the Java path if the backend returns an error. |
+
+---
+
+## Server Configuration
+
+All options are passed when starting `opendataloader-pdf-hybrid`.
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `--port <port>` | `5002` | Port the server listens on. |
+| `--device <device>` | `auto` | Accelerator for model inference. Values: `auto`, `cpu`, `cuda`, `mps`, `xpu`. `auto` selects the best available device (checks CUDA, then MPS, then XPU, then CPU). Use `mps` explicitly on Apple Silicon if the auto-selected device is suboptimal, or `cpu` to force CPU-only processing. |
+| `--force-ocr` | Off | Run OCR on every page, even if the page has selectable text. Use this for scanned PDFs where embedded text is unreliable. |
+| `--ocr-lang "<codes>"` | `"en"` | Comma-separated language codes for OCR (e.g., `"ko,en"`). Improves accuracy for non-English documents. |
+| `--enrich-formula` | Off | Extract mathematical formulas as LaTeX.
**Requires `--hybrid-mode full` on the client.** | +| `--enrich-picture-description` | Off | Generate AI descriptions for charts and images. **Requires `--hybrid-mode full` on the client.** | + +**Example — scanned Korean document with formula extraction:** + +```bash +# Server +opendataloader-pdf-hybrid --port 5002 --force-ocr --ocr-lang "ko,en" --enrich-formula + +# Client (must use --hybrid-mode full) +opendataloader-pdf --hybrid docling-fast --hybrid-mode full input.pdf +``` + +--- + +## Troubleshooting + +### "Connection refused" or server not reachable + +The server is not running or is on a different port/host. + +1. Confirm the server started without errors in Terminal 1. +2. Check the port matches on both sides (`--port` on server, `--hybrid-url` on client). +3. For a remote server, ensure the host is reachable and the firewall allows the port. + +```bash +# Test connectivity manually +curl http://localhost:5002/health +``` + +### Request timeout + +The backend is taking longer than the configured timeout. + +- Increase the timeout: `--hybrid-timeout 30000` (30 seconds) +- Or disable it: `--hybrid-timeout 0` +- If this is persistent, check backend resource usage (CPU/GPU). + +### Formulas or image descriptions missing from output + +This is the most common silent failure. Enrichment options on the server are only applied when the client sends the page to the backend. + +- In `auto` mode, pages classified as simple stay on Java — enrichments are never applied to them. +- **Fix:** Add `--hybrid-mode full` to your client command. + +No error or warning is emitted when enrichments are skipped. This is by design (the server processes what it receives), but it can be surprising. + +### Output quality is lower than expected for complex tables + +In `auto` mode, the triage heuristic may occasionally classify a complex table as simple. Switch to `--hybrid-mode full` to force all pages through the backend. 
+ +--- + +## Backend Registry + +| Backend | Status | Features | +|---------|--------|----------| +| `docling-fast` | Available | OCR, formula extraction (LaTeX), chart descriptions, table enhancement | +| `hancom` | Planned | Hancom Document AI integration | +| `azure` | Planned | Azure AI Document Intelligence | +| `google` | Planned | Google Document AI | + +Backends are selected with `--hybrid `. Only one backend can be active per run. + +--- + +## Performance Notes + +Relative throughput: + +- **Java only (no hybrid)**: fastest path +- **Hybrid `auto`** (mixed document): close to Java speed for most pages; only triaged pages pay the backend round-trip +- **Hybrid `full`**: slowest path; GPU-accelerated backend recommended + +Latency figures depend on document complexity, available hardware, and backend configuration. Running the hybrid server on a machine with a GPU significantly reduces the per-page time in `full` mode. Run `./scripts/bench.sh` against your own corpus for representative numbers. + +For throughput-sensitive workloads, use `auto` mode and reserve `full` mode for documents where enrichment or uniform OCR quality is required. + +**Large-document auto-chunking (2.2.1+)** — The Java client automatically splits backend-routed pages into 50-page chunks before sending them to the server. Processing a 200-page scanned PDF in `--hybrid-mode full` no longer hangs the backend. The AI model is loaded once at server startup (singleton), so chunking adds no per-chunk startup cost. No client-side flag; the server's existing `page_ranges` support handles it. Pre-2.2.1 users who manually split large PDFs before processing no longer need to. 
diff --git a/skills/odl-pdf/references/installation-matrix.md b/skills/odl-pdf/references/installation-matrix.md new file mode 100644 index 000000000..12f1fba0b --- /dev/null +++ b/skills/odl-pdf/references/installation-matrix.md @@ -0,0 +1,134 @@ +# Installation Matrix + +This guide helps you choose the right installation method for your environment. + +## Decision Tree + +``` +Do you have Python 3.10+ available? +├── Yes +│ ├── Do you need LangChain integration? +│ │ └── Yes → pip install langchain-opendataloader-pdf +│ ├── Do you need hybrid server capability? +│ │ └── Yes → pip install "opendataloader-pdf[hybrid]" +│ └── Otherwise → pip install opendataloader-pdf (simplest) +├── Node.js 20.19+ only (no Python)? +│ └── npm install @opendataloader/pdf +├── Java project (Maven/Gradle)? +│ └── Add Maven dependency (see below) +└── Unsure? + └── pip install opendataloader-pdf (simplest, works on all platforms; requires Python 3.10+) +``` + +## Prerequisites + +**Java 11 or higher is required for all installation methods.** All methods spawn a JVM internally to perform PDF processing. The authoritative current floor is `maven.compiler.source` in `java/pom.xml`; this document is updated when that bumps. + +If Java is missing or below version 11 when you run the tool, you will see: + +> Java 11 or higher is required. Please install a JDK for your environment. + +Install a JDK appropriate for your OS before proceeding. Verify with: + +``` +java -version +``` + +**Language-binding runtime floors** are declared in each package's manifest and enforced by the respective package manager at install time: + +- pip: Python >= 3.10 (per `python/opendataloader-pdf/pyproject.toml` `requires-python`) +- npm: Node.js >= 20.19 (per `node/opendataloader-pdf/package.json` `engines.node`) +- Maven: Java >= 11 (same as the JVM floor above) + +If the user's runtime is below the floor, `pip` / `npm` / `mvn` refuse to install with a clear error. 
Java alone is the exception — it is a runtime requirement of the built JAR, so the CLI fails at use time rather than install time, which is why the upfront `java -version` verification above is explicitly called out. + +## Quick Start Commands + +### pip (Python) + +```bash +# Minimal install +pip install opendataloader-pdf + +# With hybrid server capability +pip install "opendataloader-pdf[hybrid]" + +# LangChain integration +pip install langchain-opendataloader-pdf +``` + +The `opendataloader-pdf` CLI command is included automatically with the pip install. + +### npm (Node.js) + +```bash +npm install @opendataloader/pdf +``` + +The `opendataloader-pdf` CLI command is included automatically with the npm install. + +### Maven (Java) + +Add to your `pom.xml`: + +```xml + + org.opendataloader + opendataloader-pdf-core + LATEST + +``` + +Replace `LATEST` with the specific version you want to pin. Check the [releases page](https://github.com/opendataloader-project/opendataloader-pdf/releases) for available versions. + +### Gradle (Java/Kotlin) + +Add to your `build.gradle` (Groovy DSL): + +```groovy +dependencies { + implementation 'org.opendataloader:opendataloader-pdf-core:LATEST' +} +``` + +Or `build.gradle.kts` (Kotlin DSL): + +```kotlin +dependencies { + implementation("org.opendataloader:opendataloader-pdf-core:LATEST") +} +``` + +Pin `LATEST` to a specific released version from the [releases page](https://github.com/opendataloader-project/opendataloader-pdf/releases). + +## Version Compatibility + +Minimum runtime requirements are declared in each package's manifest. 
Consult +the manifest for the authoritative current floor — the wrappers and build tools +enforce it at install time: + +| Method | Runtime requirement (source of truth) | CLI Included | +|---|---|---| +| pip (all variants) | `python/opendataloader-pdf/pyproject.toml` → `requires-python` | Yes | +| pip langchain | above, plus the LangChain floor declared by `langchain-opendataloader-pdf` | Yes | +| npm | `node/opendataloader-pdf/package.json` → `engines.node` | Yes | +| Maven | `java/pom.xml` → `maven.compiler.source` | No (library only) | + +`pip` / `npm` / `mvn` each validate against the manifest's declared floor and +fail with a clear error if the environment is below it. + +All methods additionally require **Java 11 or higher** at runtime (current +floor declared in `java/pom.xml` `maven.compiler.source`); the pip and npm +wrappers spawn a JVM internally. See Critical Gotcha 1 in `SKILL.md`. + +## Post-Install Verification + +After installing via pip or npm, confirm the CLI is working: + +``` +opendataloader-pdf --version +``` + +A successful output shows the installed version number. If the command is not found, ensure your package manager's bin directory is on your `PATH`. + +For Maven, verify the dependency resolves by running a build (`mvn compile`) and checking that no classpath errors are reported. diff --git a/skills/odl-pdf/references/integration-examples.md b/skills/odl-pdf/references/integration-examples.md new file mode 100644 index 000000000..b7fb81585 --- /dev/null +++ b/skills/odl-pdf/references/integration-examples.md @@ -0,0 +1,173 @@ +# Integration Examples + +Ready-to-run code for each supported interface. Load this file when the user asks for copy-pasteable examples in a specific language or framework. + +Every path requires **Java 11+** at runtime (current floor per `java/pom.xml`). Language wrappers additionally require **Python 3.10+** (pip, per `pyproject.toml`) or **Node.js 20.19+** (npm, per `package.json`). 
See `installation-matrix.md` § Prerequisites for details.
+
+---
+
+## CLI
+
+```bash
+opendataloader-pdf input.pdf \
+  --format markdown \
+  --output-dir ./output \
+  --hybrid docling-fast \
+  --quiet
+```
+
+For multiple formats in one pass:
+
+```bash
+opendataloader-pdf input.pdf --format json,markdown,html
+```
+
+---
+
+## Python
+
+Batch all files in one `convert()` call — each call spawns a JVM, so repeated calls are slow (see Gotcha 3 in SKILL.md).
+
+```python
+import opendataloader_pdf
+
+opendataloader_pdf.convert(
+    input_path=["file1.pdf", "file2.pdf", "file3.pdf"],
+    output_dir="./output",
+    format="markdown",
+    hybrid="docling-fast"
+)
+```
+
+---
+
+## Node.js
+
+Same JVM-spawn concern — pass all files to one `convert()` call.
+
+```javascript
+import { convert } from '@opendataloader/pdf';
+
+await convert(['file1.pdf', 'file2.pdf'], {
+  outputDir: './output',
+  format: 'markdown',
+  hybrid: 'docling-fast'
+});
+```
+
+---
+
+## LangChain
+
+Basic loader:
+
+```python
+from langchain_opendataloader_pdf import OpenDataLoaderPDFLoader
+
+loader = OpenDataLoaderPDFLoader(
+    file_path="document.pdf",
+    format="text",
+    hybrid="docling-fast"  # optional: enable for scanned PDFs
+)
+
+documents = loader.load()
+# documents is a list of LangChain Document objects with page_content and metadata
+```
+
+### Full RAG pipeline
+
+Load → chunk → embed → index. Use `format="json"` instead of `"markdown"` when you need bounding boxes in metadata for source citation.
+
+```python
+from langchain_opendataloader_pdf import OpenDataLoaderPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.embeddings import OpenAIEmbeddings
+
+# 1. Load PDFs. ODL markdown headings are natural chunk boundaries.
+loader = OpenDataLoaderPDFLoader(
+    file_path="document.pdf",
+    format="markdown",
+    hybrid="docling-fast"
+)
+documents = loader.load()
+
+# 2. Chunk with overlap on structural separators.
+splitter = RecursiveCharacterTextSplitter( + chunk_size=1000, + chunk_overlap=200, + separators=["\n## ", "\n### ", "\n\n", "\n", " "] +) +chunks = splitter.split_documents(documents) + +# 3. Index. +vectorstore = Chroma.from_documents(chunks, OpenAIEmbeddings()) +``` + +--- + +## Java (Maven) + +```java +import org.opendataloader.pdf.api.Config; +import org.opendataloader.pdf.api.OpenDataLoaderPDF; + +Config config = new Config(); +config.setOutputDir("./output"); +config.setFormat("markdown"); +config.setHybrid("docling-fast"); + +OpenDataLoaderPDF.processFile("file1.pdf", config); +``` + +See `installation-matrix.md` for the Maven dependency block. + +--- + +## Output Pipeline Patterns + +**Quiet mode for automated pipelines** — suppress progress output: + +```bash +opendataloader-pdf input.pdf --format markdown --quiet +``` + +**Stdout for pipe-based workflows** — single format only: + +```bash +opendataloader-pdf input.pdf --format json --to-stdout | jq . +``` + +**Page range extraction**: + +```bash +opendataloader-pdf input.pdf --pages "1,3,5-10" --format markdown +``` + +**Custom page separators** for downstream splitting: + +```bash +opendataloader-pdf input.pdf \ + --format markdown \ + --markdown-page-separator "---PAGE %page-number%---" +``` + +--- + +## Remote Hybrid Server + +For multi-machine deployments, run the server on a GPU host and point clients at it. + +```bash +# GPU host +opendataloader-pdf-hybrid --port 5002 + +# Client +opendataloader-pdf input.pdf \ + --hybrid docling-fast \ + --hybrid-url http://gpu-server:5002 \ + --hybrid-timeout 30000 \ + --hybrid-fallback +``` + +`--hybrid-fallback` routes failing pages back to the local Java path so a single backend hiccup does not fail the document. 
diff --git a/skills/odl-pdf/references/options-matrix.md b/skills/odl-pdf/references/options-matrix.md new file mode 100644 index 000000000..cdebfac87 --- /dev/null +++ b/skills/odl-pdf/references/options-matrix.md @@ -0,0 +1,236 @@ +# ODL-PDF CLI Options Matrix + +This file contains a built-in summary of every CLI option for the `opendataloader-pdf` tool. +If `options.json` is present in the project root, that file is the authoritative source — always +prefer it over the descriptions here. This document exists so the agent skill can reason about +options without loading the full JSON on every invocation, and adds **category groupings**, +**Interaction Rules**, and **Common Combinations** that the raw schema does not express. + +--- + +## Categories + +### IO — Input / Output Control + +Controls where data comes from and where results are written. + +| Option | Short | Type | Default | Description | +|---|---|---|---|---| +| `output-dir` | `-o` | string | null (input file dir) | Directory where output files are written. Defaults to the same directory as the input file. | +| `to-stdout` | — | boolean | false | Write output to stdout instead of a file. Only valid with a single format. | +| `quiet` | `-q` | boolean | false | Suppress all console logging output. | +| `password` | `-p` | string | null | Password for encrypted PDF files. | +| `pages` | — | string | null (all) | Pages to extract, e.g. `"1,3,5-7"`. Defaults to all pages. | +| `format` | `-f` | string | json | Output format(s), comma-separated. Values: `json`, `text`, `html`, `pdf`, `markdown`, `markdown-with-html`, `markdown-with-images`. | + +--- + +### Quality — Extraction Quality + +Controls the accuracy and structure of the extracted content. + +| Option | Short | Type | Default | Description | +|---|---|---|---|---| +| `table-method` | — | string | `default` | Table detection method. `default` = border-based; `cluster` = border + borderless cluster detection (slower). 
| +| `reading-order` | — | string | `xycut` | Reading order algorithm. `xycut` = XY-cut layout analysis; `off` = no reordering. | +| `use-struct-tree` | — | boolean | false | Use the PDF structure tree (tagged PDF) for reading order and semantic structure. Only effective on tagged PDFs. | + +--- + +### Safety — Security and Privacy + +Controls content filtering and sensitive data handling. + +| Option | Short | Type | Default | Description | +|---|---|---|---|---| +| `content-safety-off` | — | string | null | Disable specific content safety filters. Values: `all`, `hidden-text`, `off-page`, `tiny`, `hidden-ocg`. | +| `sanitize` | — | boolean | false | Replace emails, phone numbers, IP addresses, credit card numbers, and URLs with placeholders. | + +--- + +### Hybrid — AI Backend + +Options for routing pages through an optional AI enrichment server (e.g. formula OCR, picture descriptions). + +| Option | Short | Type | Default | Description | +|---|---|---|---|---| +| `hybrid` | — | string | `off` | Hybrid backend to use. Values: `off`, `docling-fast`. Requires a running hybrid server. | +| `hybrid-mode` | — | string | `auto` | Triage mode. `auto` = dynamic page-level triage; `full` = send all pages to the backend (required for server-side enrichments). | +| `hybrid-url` | — | string | null | Override the default hybrid server URL. | +| `hybrid-timeout` | — | string | `0` | Per-request timeout in milliseconds (`0` = no timeout). | +| `hybrid-fallback` | — | boolean | false | Fall back to the Java extraction path if the hybrid backend returns an error. | + +--- + +### Output — Output Formatting + +Controls how images and page separators appear in output files. + +| Option | Short | Type | Default | Description | +|---|---|---|---|---| +| `image-output` | — | string | `external` | Image output mode. `off` = skip images; `embedded` = Base64 data URIs inline; `external` = write separate image files and embed references. 
| +| `image-format` | — | string | `png` | Format for extracted images. Values: `png`, `jpeg`. | +| `image-dir` | — | string | null | Directory for extracted image files (used when `image-output` is `external`). | +| `markdown-page-separator` | — | string | null | String inserted between pages in Markdown output. Use `%page-number%` to include the page number. | +| `text-page-separator` | — | string | null | String inserted between pages in plain-text output. Use `%page-number%` for page numbers. | +| `html-page-separator` | — | string | null | String inserted between pages in HTML output. Use `%page-number%` for page numbers. | + +--- + +### Text — Text Processing + +Fine-grained control over how extracted text is cleaned and formatted. + +| Option | Short | Type | Default | Description | +|---|---|---|---|---| +| `keep-line-breaks` | — | boolean | false | Preserve the original line breaks from the PDF. By default, soft line breaks are merged. | +| `replace-invalid-chars` | — | string | `" "` (space) | Replacement character for invalid or unrecognized characters in the extracted text. | +| `include-header-footer` | — | boolean | false | Include page headers and footers in the output. Excluded by default. | +| `detect-strikethrough` | — | boolean | false | Detect strikethrough text (experimental). | + +--- + +## Interaction Rules + +These rules document option combinations that have non-obvious or silent failure modes. + +**1. Hybrid enrichments require `--hybrid-mode full`** + +Server-side enrichments such as `--enrich-formula` and `--enrich-picture-description` run on the +hybrid backend. On the client side, they are only applied if `--hybrid-mode full` is set. With the +default `auto` mode, pages that the triage step classifies as "simple" bypass the backend entirely, +and any enrichment instructions for those pages are silently ignored. If enrichments are missing +from the output, check that `--hybrid-mode full` is set. + +**2. 
`--hybrid` requires a running server** + +Setting `--hybrid docling-fast` (or any non-`off` value) without a reachable hybrid server will +cause requests to fail. Quick start: + +```bash +pip install "opendataloader-pdf[hybrid]" +opendataloader-pdf-hybrid --port 5002 +``` + +Then pass `--hybrid docling-fast --hybrid-url http://localhost:5002` to the client. + +**3. `--to-stdout` only works with a single format** + +`--to-stdout` writes the extracted content to standard output. It cannot be combined with +comma-separated `--format` values (e.g. `--format json,text`). Passing multiple formats with +`--to-stdout` will produce an error. When streaming output to another process, specify exactly one +format. + +**4. `--image-output embedded` produces large output for image-heavy PDFs** + +`embedded` mode encodes each image as a Base64 data URI and inlines it in the output document. +For PDFs with many or large images this can produce very large output files. Prefer `external` +(the default) unless the consumer requires self-contained output. + +**5. `--table-method cluster` may be slower** + +The `cluster` method adds borderless table detection on top of the default border-based approach. +It improves recall on tables without visible borders but increases processing time. Use `default` +when throughput matters and the PDFs have standard bordered tables. + +**6. `--use-struct-tree` has no effect on untagged PDFs** + +The structure tree option reads semantic order from the PDF's tag tree, which is only present in +tagged (accessible) PDFs. On untagged PDFs the option is silently ignored and the default layout +analysis is used instead. To check whether a PDF is tagged, inspect its document properties or +run a preflight check before enabling this option. 
+ +--- + +## Common Combinations + +### RAG pipeline (retrieval-augmented generation) + +Extract clean, structured text with accurate reading order for vector indexing: + +```bash +opendataloader-pdf input.pdf \ + --format json \ + --reading-order xycut \ + --table-method cluster \ + --image-output off \ + --sanitize +``` + +Use `--sanitize` when the PDF may contain PII that should not enter the vector store. + +--- + +### Accessibility audit (tagged PDF) + +Leverage the PDF's tag tree to validate semantic structure and export accessible HTML: + +```bash +opendataloader-pdf input.pdf \ + --format html \ + --use-struct-tree \ + --include-header-footer \ + --html-page-separator "" +``` + +--- + +### Quick plain-text extraction + +Minimal options for fast extraction of readable prose: + +```bash +opendataloader-pdf input.pdf \ + --format text \ + --quiet \ + --to-stdout +``` + +Pipe directly to downstream tools: `opendataloader-pdf input.pdf -f text -q --to-stdout | wc -w` + +--- + +### Markdown with images for documentation + +Export a Markdown file with embedded images, suitable for wikis or documentation sites: + +```bash +opendataloader-pdf input.pdf \ + --format markdown-with-images \ + --image-output external \ + --image-format png \ + --image-dir ./images \ + --output-dir ./output +``` + +--- + +### AI-enriched extraction (hybrid mode) + +Extract all pages through the hybrid backend for formula OCR and picture descriptions: + +```bash +opendataloader-pdf input.pdf \ + --format markdown \ + --hybrid docling-fast \ + --hybrid-mode full \ + --hybrid-url http://localhost:5002 \ + --hybrid-fallback +``` + +`--hybrid-fallback` ensures that if the server is temporarily unavailable, extraction continues +with the local Java backend rather than failing. 
+ +--- + +### Selective page extraction for large PDFs + +Extract only a specific page range to reduce processing time: + +```bash +opendataloader-pdf large-report.pdf \ + --pages "1,5-10,15" \ + --format json \ + --output-dir ./extracted \ + --quiet +``` diff --git a/skills/odl-pdf/scripts/detect-env.sh b/skills/odl-pdf/scripts/detect-env.sh new file mode 100755 index 000000000..7666584d7 --- /dev/null +++ b/skills/odl-pdf/scripts/detect-env.sh @@ -0,0 +1,172 @@ +#!/usr/bin/env bash +# detect-env.sh — Cross-platform environment detection for the odl-pdf agent skill. +# Outputs key=value pairs (one per line) to stdout. No other output. +# Make this file executable: chmod +x detect-env.sh + +set -euo pipefail + +# --------------------------------------------------------------------------- +# OS detection +# --------------------------------------------------------------------------- +detect_os() { + local raw + raw="$(uname -s 2>/dev/null || echo "unknown")" + case "${raw}" in + Darwin*) echo "macos" ;; + Linux*) echo "linux" ;; + MINGW*|MSYS*|CYGWIN*) echo "windows" ;; + *) echo "unknown" ;; + esac +} + +# --------------------------------------------------------------------------- +# Java version (java outputs version to stderr) +# --------------------------------------------------------------------------- +detect_java() { + if ! command -v java &>/dev/null; then + echo "none" + return + fi + local raw + raw="$(java -version 2>&1 | head -1)" + # Handles formats: + # openjdk version "21.0.3" ... + # java version "1.8.0_401" ... + # openjdk version "11.0.22" ... 
+ local ver + ver="$(printf '%s' "${raw}" | grep -oE '"[^"]+"' | tr -d '"' | head -1 || true)" + if [[ -z "${ver}" ]]; then + echo "none" + return + fi + # Normalise legacy 1.x format → major only; otherwise keep major + if [[ "${ver}" =~ ^1\.([0-9]+) ]]; then + echo "${BASH_REMATCH[1]}" + else + # Extract leading integer(s) before the first dot + local major + major="$(printf '%s' "${ver}" | grep -oE '^[0-9]+' || true)" + echo "${major:-none}" + fi +} + +# --------------------------------------------------------------------------- +# Python version (try python3 first, then python) +# --------------------------------------------------------------------------- +detect_python() { + local cmd="" + if command -v python3 &>/dev/null; then + cmd="python3" + elif command -v python &>/dev/null; then + cmd="python" + else + echo "none" + return + fi + local raw + raw="$("${cmd}" --version 2>&1 | head -1)" + # e.g. "Python 3.12.4" + local ver + ver="$(printf '%s' "${raw}" | grep -oE '[0-9]+\.[0-9]+(\.[0-9]+)?' | head -1 || true)" + echo "${ver:-none}" +} + +# --------------------------------------------------------------------------- +# Node version +# --------------------------------------------------------------------------- +detect_node() { + if ! command -v node &>/dev/null; then + echo "none" + return + fi + local raw + raw="$(node --version 2>/dev/null)" + # e.g. "v20.19.0" → strip leading 'v' + local ver + ver="$(printf '%s' "${raw}" | sed 's/^v//')" + echo "${ver:-none}" +} + +# --------------------------------------------------------------------------- +# ODL installed + version +# Tries CLI first, then Python module. 
+# ---------------------------------------------------------------------------
+detect_odl() {
+    local installed="false"
+    local version="none"
+
+    # Determine python binary (prefer python3; fall back to bare python)
+    local pycmd=""
+    if command -v python3 &>/dev/null; then
+        pycmd="python3"
+    elif command -v python &>/dev/null; then
+        pycmd="python"
+    fi
+
+    # Try the installed CLI entry-point first (most authoritative source)
+    local cli_ver=""
+    if command -v opendataloader-pdf &>/dev/null; then
+        cli_ver="$(opendataloader-pdf --version 2>/dev/null || true)"
+    fi
+
+    if [[ -n "${cli_ver}" ]]; then
+        installed="true"
+        version="$(printf '%s' "${cli_ver}" | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1 || true)"
+        version="${version:-none}"
+    elif [[ -n "${pycmd}" ]]; then
+        # Try python -m opendataloader_pdf --version
+        local mod_ver
+        mod_ver="$("${pycmd}" -m opendataloader_pdf --version 2>/dev/null || true)"
+        if [[ -n "${mod_ver}" ]]; then
+            installed="true"
+            version="$(printf '%s' "${mod_ver}" | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1 || true)"
+            version="${version:-none}"
+        else
+            # Last resort: read the installed package version via importlib.metadata
+            local meta_ver
+            meta_ver="$("${pycmd}" -c "import importlib.metadata; print(importlib.metadata.version('opendataloader-pdf'))" 2>/dev/null || true)"
+            if [[ -n "${meta_ver}" ]]; then
+                installed="true"
+                version="${meta_ver}"
+            fi
+        fi
+    fi
+
+    printf '%s\n' "ODL_INSTALLED=${installed}"
+    printf '%s\n' "ODL_VERSION=${version}"
+}
+
+# ---------------------------------------------------------------------------
+# Hybrid extras — check for docling + fastapi + uvicorn (all required for hybrid server)
+# ---------------------------------------------------------------------------
+detect_hybrid_extras() {
+    local pycmd=""
+    if command -v python3 &>/dev/null; then
+        pycmd="python3"
+    elif command -v python &>/dev/null; then
+        pycmd="python"
+    fi
+
+    if [[ -z "${pycmd}" ]]; then
+        echo "HYBRID_EXTRAS=false"
+        return
+    fi
+
+    local result
+    result="$("${pycmd}" -c "import docling, fastapi, uvicorn; print('ok')" 2>/dev/null || true)"
+    if [[ "${result}" == "ok" ]]; then
+        echo "HYBRID_EXTRAS=true"
+    else
+        echo "HYBRID_EXTRAS=false"
+    fi
+}
+
+# ---------------------------------------------------------------------------
+# Main — emit all key=value pairs
+# ---------------------------------------------------------------------------
+printf '%s\n' "OS=$(detect_os)"
+printf '%s\n' "JAVA=$(detect_java)"
+printf '%s\n' "PYTHON=$(detect_python)"
+printf '%s\n' "NODE=$(detect_node)"
+detect_odl
+detect_hybrid_extras
diff --git a/skills/odl-pdf/scripts/hybrid-health.sh b/skills/odl-pdf/scripts/hybrid-health.sh
new file mode 100755
index 000000000..290b4f4bc
--- /dev/null
+++ b/skills/odl-pdf/scripts/hybrid-health.sh
+#!/usr/bin/env bash
+# hybrid-health.sh
+# Checks the health of a running opendataloader-pdf hybrid server.
+# Works on Windows (Git Bash), macOS, and Linux.
+# Outputs key=value pairs for machine readability.
+
+set -euo pipefail
+
+DEFAULT_URL="http://localhost:5002"
+HYBRID_URL="${DEFAULT_URL}"
+
+# Parse arguments (--url VALUE and --url=VALUE forms are both accepted)
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --url)
+            if [[ $# -lt 2 ]]; then
+                echo "Error: --url requires a value" >&2
+                exit 1
+            fi
+            HYBRID_URL="$2"
+            shift 2
+            ;;
+        --url=*)
+            HYBRID_URL="${1#--url=}"
+            shift
+            ;;
+        *)
+            echo "Unknown argument: $1" >&2
+            echo "Usage: $0 [--url ]" >&2
+            exit 1
+            ;;
+    esac
+done
+
+HEALTH_ENDPOINT="${HYBRID_URL}/health"
+
+# Detect available HTTP client: curl preferred, wget fallback, 'none' sentinel otherwise
+_http_get_status() {
+    local url="$1"
+    if command -v curl &>/dev/null; then
+        curl --silent --output /dev/null --write-out "%{http_code}" \
+            --max-time 5 --connect-timeout 3 "$url" 2>/dev/null
+    elif command -v wget &>/dev/null; then
+        wget --quiet --server-response --spider --timeout=5 "$url" 2>&1 \
+            | awk '/HTTP\//{print $2}' | tail -1
+    else
+        echo "none"
+    fi
+}
+
+HTTP_STATUS=$(_http_get_status "${HEALTH_ENDPOINT}" || true)
+
+# Interpret result: empty / 000 / none all mean the server is unreachable
+if [[ -z "${HTTP_STATUS}" || "${HTTP_STATUS}" == "000" || "${HTTP_STATUS}" == "none" ]]; then
+    echo "HYBRID_SERVER=stopped"
+    echo "HYBRID_URL=${HYBRID_URL}"
+    echo "HYBRID_STATUS=none"
+    echo ""
+    echo "Hybrid server is not running at ${HYBRID_URL}. Start it with: opendataloader-pdf-hybrid"
+    exit 0
+fi
+
+# Any 2xx response is considered running; other codes are an error state
+if [[ "${HTTP_STATUS}" =~ ^2 ]]; then
+    echo "HYBRID_SERVER=running"
+else
+    echo "HYBRID_SERVER=error"
+fi
+
+echo "HYBRID_URL=${HYBRID_URL}"
+echo "HYBRID_STATUS=${HTTP_STATUS}"
diff --git a/skills/odl-pdf/scripts/quick-eval.py b/skills/odl-pdf/scripts/quick-eval.py
new file mode 100644
index 000000000..036962e44
--- /dev/null
+++ b/skills/odl-pdf/scripts/quick-eval.py
+#!/usr/bin/env python3
+"""Quick quality evaluation script for opendataloader-pdf output.
+
+Compares extracted text against a ground truth file and reports a similarity
+score. Uses difflib.SequenceMatcher from the Python standard library by default.
+If rapidfuzz is installed, it computes a more accurate Normalized Indel
+Distance (NID) score instead.
+
+Usage:
+    python quick-eval.py extracted.md ground-truth.md
+    python quick-eval.py extracted.md ground-truth.md --verbose
+    python quick-eval.py extracted.md ground-truth.md --threshold 0.90
+"""
+
+import argparse
+import difflib
+import re
+import sys
+from pathlib import Path
+
+# Ensure stdout can print non-ASCII report content on Windows consoles
+# (cp1252 / cp949 default). Without this, a single non-ASCII character
+# crashes the script with UnicodeEncodeError -- including under
+# `windows-latest` in GitHub Actions.
+if hasattr(sys.stdout, "reconfigure"): + try: + sys.stdout.reconfigure(encoding="utf-8", errors="replace") + sys.stderr.reconfigure(encoding="utf-8", errors="replace") + except (AttributeError, OSError): + pass + +# --------------------------------------------------------------------------- +# Optional rapidfuzz import -- used for NID scoring when available +# --------------------------------------------------------------------------- +try: + from rapidfuzz.distance import Indel + + _RAPIDFUZZ_AVAILABLE = True +except ImportError: + _RAPIDFUZZ_AVAILABLE = False + + +# --------------------------------------------------------------------------- +# Score thresholds and their human-readable interpretations +# --------------------------------------------------------------------------- +SCORE_LEVELS = [ + (0.95, "Excellent", "Output closely matches the ground truth."), + (0.85, "Good", "Minor differences; output is usable as-is."), + (0.70, "Fair", "Noticeable differences - consider hybrid mode or different options."), + (0.00, "Poor", "Significant quality issues - review extraction settings."), +] + + +def normalize(text: str) -> str: + """Collapse runs of whitespace to a single space and strip leading/trailing + whitespace. This makes the comparison insensitive to cosmetic formatting + differences such as extra blank lines or trailing spaces.""" + return re.sub(r"\s+", " ", text).strip() + + +def read_file(path: Path) -> str: + """Read a text file and return its content, normalized.""" + try: + raw = path.read_text(encoding="utf-8") + except UnicodeDecodeError: + # Fall back to Latin-1 for PDFs extracted without explicit encoding + raw = path.read_text(encoding="latin-1") + return normalize(raw) + + +def compute_similarity_stdlib(extracted: str, ground_truth: str) -> float: + """Return a similarity ratio in [0, 1] using difflib.SequenceMatcher. 
+ + The ratio is defined as 2 * M / T, where M is the number of matching + characters and T is the total number of characters in both sequences. + This is equivalent to 1 - NID when strings share large common blocks. + """ + return difflib.SequenceMatcher(None, extracted, ground_truth, autojunk=False).ratio() + + +def compute_similarity_rapidfuzz(extracted: str, ground_truth: str) -> float: + """Return a similarity score in [0, 1] using rapidfuzz Indel distance. + + Computes Normalized Indel Distance: + NID = indel_distance / (len(a) + len(b)) + The similarity score returned is 1 - NID, so higher is better. + """ + if not extracted and not ground_truth: + return 1.0 + return max(0.0, 1.0 - float(Indel.normalized_distance(extracted, ground_truth))) + + +def compute_similarity(extracted: str, ground_truth: str) -> tuple[float, str]: + """Compute similarity score using the best available method. + + Returns: + (score, method_name) where score is in [0, 1]. + """ + if _RAPIDFUZZ_AVAILABLE: + return compute_similarity_rapidfuzz(extracted, ground_truth), "NID (rapidfuzz)" + return compute_similarity_stdlib(extracted, ground_truth), "SequenceMatcher ratio (difflib)" + + +def interpret_score(score: float) -> tuple[str, str]: + """Return (label, description) for a given score.""" + for threshold, label, description in SCORE_LEVELS: + if score >= threshold: + return label, description + # Should never reach here, but guard anyway + return "Poor", SCORE_LEVELS[-1][2] + + +def diff_snippets(extracted: str, ground_truth: str, max_snippets: int = 5) -> list[str]: + """Return up to max_snippets diff hunks for low-scoring sections. + + Uses difflib.unified_diff on word-tokenised lines so the output is readable + even for long single-line documents. 
+ """ + # Re-wrap into ~80-char logical lines for readability + def wrap_words(text: str, width: int = 80) -> list[str]: + words = text.split() + lines: list[str] = [] + line: list[str] = [] + length = 0 + for word in words: + if length + len(word) + 1 > width and line: + lines.append(" ".join(line)) + line = [word] + length = len(word) + else: + line.append(word) + length += len(word) + 1 + if line: + lines.append(" ".join(line)) + return lines + + ext_lines = wrap_words(extracted) + gt_lines = wrap_words(ground_truth) + + diff = list( + difflib.unified_diff( + gt_lines, + ext_lines, + fromfile="ground-truth", + tofile="extracted", + lineterm="", + n=2, + ) + ) + + # Collect individual hunks (separated by @@ markers) + snippets: list[str] = [] + current_hunk: list[str] = [] + for line in diff: + if line.startswith("@@") and current_hunk: + snippets.append("\n".join(current_hunk)) + current_hunk = [line] + if len(snippets) >= max_snippets: + break + else: + current_hunk.append(line) + if current_hunk and len(snippets) < max_snippets: + snippets.append("\n".join(current_hunk)) + + return snippets + + +def build_report( + extracted_path: Path, + ground_truth_path: Path, + score: float, + method: str, + threshold: float, + verbose: bool, + extracted: str, + ground_truth: str, +) -> str: + """Assemble the formatted report string.""" + label, description = interpret_score(score) + passed = score >= threshold + status = "PASS" if passed else "FAIL" + + lines = [ + "=" * 60, + "ODL-PDF Quick Quality Evaluation", + "=" * 60, + f"Extracted: {extracted_path}", + f"Ground truth: {ground_truth_path}", + f"Method: {method}", + "-" * 60, + f"Score: {score:.4f} [{label}]", + f"Threshold: {threshold:.4f}", + f"Result: {status}", + "-" * 60, + f"Interpretation: {description}", + ] + + if not passed: + suggestions: list[str] = [] + if score < 0.70: + suggestions.extend([ + " - Try --hybrid docling-fast for better OCR coverage.", + " - Check --format is appropriate for this document 
type.", + " - Inspect whether the PDF is scanned (image-only) vs. native.", + ]) + elif score < 0.85: + suggestions.extend([ + " - Consider --hybrid docling-fast or --table-method cluster.", + " - Try --use-struct-tree if the PDF is tagged (accessible).", + ]) + else: + # Score is above the general-quality bar but below the caller's + # custom threshold. Generic guidance only. + suggestions.append( + " - Score is above the usable-quality bar but below your custom threshold; " + "tighten input quality or relax --threshold if appropriate." + ) + + if suggestions: + lines.append("") + lines.append("Suggestions:") + lines.extend(suggestions) + + if verbose: + lines.append("") + lines.append("Diff snippets (ground-truth → extracted):") + snippets = diff_snippets(extracted, ground_truth) + if snippets: + for i, snippet in enumerate(snippets, 1): + lines.append(f"\n--- Hunk {i} ---") + lines.append(snippet) + else: + lines.append(" (no differences found)") + + lines.append("=" * 60) + return "\n".join(lines) + + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Compare ODL-PDF extracted output against a ground truth file.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "extracted", + type=Path, + help="Path to the extracted text file produced by opendataloader-pdf.", + ) + parser.add_argument( + "ground_truth", + type=Path, + help="Path to the ground truth reference file.", + ) + parser.add_argument( + "--threshold", + type=float, + default=0.85, + metavar="T", + help="Pass/fail threshold in [0, 1]. 
Default: 0.85.", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Show diff snippets for sections where the files diverge.", + ) + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + + # Validate input paths + if not args.extracted.is_file(): + print(f"ERROR: Extracted file not found: {args.extracted}", file=sys.stderr) + return 2 + if not args.ground_truth.is_file(): + print(f"ERROR: Ground truth file not found: {args.ground_truth}", file=sys.stderr) + return 2 + if not (0.0 <= args.threshold <= 1.0): + print(f"ERROR: --threshold must be between 0 and 1, got {args.threshold}", file=sys.stderr) + return 2 + + extracted = read_file(args.extracted) + ground_truth = read_file(args.ground_truth) + + score, method = compute_similarity(extracted, ground_truth) + + report = build_report( + extracted_path=args.extracted, + ground_truth_path=args.ground_truth, + score=score, + method=method, + threshold=args.threshold, + verbose=args.verbose, + extracted=extracted, + ground_truth=ground_truth, + ) + + print(report) + + # Exit 0 = pass, 1 = fail (score below threshold) + return 0 if score >= args.threshold else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/odl-pdf/scripts/sync-skill-refs.py b/skills/odl-pdf/scripts/sync-skill-refs.py new file mode 100644 index 000000000..cf7ab5925 --- /dev/null +++ b/skills/odl-pdf/scripts/sync-skill-refs.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 +"""Drift detection script for the ODL-PDF agent skill. + +Compares the option names declared in options.json (the authoritative source) +against the option names documented in skills/odl-pdf/references/options-matrix.md. + +Any mismatch means the skill reference is out of sync with the actual CLI — +a condition referred to here as "drift". Run this script in CI after any +change to options.json or options-matrix.md. 
+ +Usage: + python sync-skill-refs.py + python sync-skill-refs.py --options-json path/to/options.json \ + --matrix path/to/options-matrix.md + +Exit codes: + 0 No drift detected. + 1 Drift detected (new or removed options). + 2 Input error (file not found, invalid JSON, etc.). +""" + +import argparse +import io +import json +import re +import sys +from pathlib import Path + +# Reconfigure stdout to UTF-8 when running on Windows with a legacy code page +# so that Unicode symbols (checkmark, cross) print correctly in all terminals. +if hasattr(sys.stdout, "reconfigure"): + try: + sys.stdout.reconfigure(encoding="utf-8") + except Exception: + pass + +# --------------------------------------------------------------------------- +# Defaults — resolved relative to this script's location so the script works +# when invoked from any directory. +# --------------------------------------------------------------------------- +_SCRIPT_DIR = Path(__file__).parent.resolve() +# skills/odl-pdf/scripts/ → project root is three levels up +_PROJECT_ROOT = _SCRIPT_DIR.parent.parent.parent + +DEFAULT_OPTIONS_JSON = _PROJECT_ROOT / "options.json" +DEFAULT_MATRIX = _SCRIPT_DIR.parent / "references" / "options-matrix.md" + + +# --------------------------------------------------------------------------- +# Parsing helpers +# --------------------------------------------------------------------------- + +def load_option_names_from_json(path: Path) -> set[str]: + """Return the set of option names declared in options.json. + + Expects the file to contain a top-level object with an "options" array, + where each element has a "name" field. Example: + + { "options": [ { "name": "output-dir", ... }, ... 
] } + """ + try: + data = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + print(f"ERROR: Failed to parse {path}: {exc}", file=sys.stderr) + sys.exit(2) + + options = data.get("options") + if not isinstance(options, list): + print( + f"ERROR: {path} does not contain a top-level 'options' array.", + file=sys.stderr, + ) + sys.exit(2) + + names: set[str] = set() + for i, item in enumerate(options): + if not isinstance(item, dict) or "name" not in item: + print( + f"ERROR: options[{i}] in {path} is missing the 'name' field.", + file=sys.stderr, + ) + sys.exit(2) + names.add(item["name"]) + + return names + + +def load_option_names_from_matrix(path: Path) -> set[str]: + """Return the set of option names found in options-matrix.md. + + Scans all Markdown table rows and extracts backtick-quoted option names + from the first column. Rows that contain only header separators (---) are + skipped. + + Expected table format (any number of columns): + | `option-name` | ... | + """ + text = path.read_text(encoding="utf-8") + + names: set[str] = set() + + # Match table rows whose first cell contains a backtick-quoted token. + # This pattern is intentionally permissive so it works even if the table + # adds extra spaces or alignment padding. + row_pattern = re.compile( + r"^\s*\|\s*`([^`]+)`", # | `option-name` (first cell, backtick-quoted) + re.MULTILINE, + ) + for match in row_pattern.finditer(text): + candidate = match.group(1).strip() + # Skip tokens that look like option values rather than names. + # Option names always contain at least one letter and may contain + # hyphens but not spaces or equals signs. 
+ if re.fullmatch(r"[a-z][a-z0-9-]*", candidate): + names.add(candidate) + + return names + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def parse_args(argv: list[str] | None = None) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Detect drift between options.json and the skill reference matrix.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--options-json", + type=Path, + default=DEFAULT_OPTIONS_JSON, + metavar="PATH", + help=f"Path to options.json. Default: {DEFAULT_OPTIONS_JSON}", + ) + parser.add_argument( + "--matrix", + type=Path, + default=DEFAULT_MATRIX, + metavar="PATH", + help=f"Path to options-matrix.md. Default: {DEFAULT_MATRIX}", + ) + return parser.parse_args(argv) + + +def main(argv: list[str] | None = None) -> int: + args = parse_args(argv) + + # Validate input paths + if not args.options_json.is_file(): + print(f"ERROR: options.json not found: {args.options_json}", file=sys.stderr) + return 2 + if not args.matrix.is_file(): + print(f"ERROR: options-matrix.md not found: {args.matrix}", file=sys.stderr) + return 2 + + print("Checking skill drift...") + + json_names = load_option_names_from_json(args.options_json) + matrix_names = load_option_names_from_matrix(args.matrix) + + print(f"options.json: {len(json_names)} options") + print(f"options-matrix.md: {len(matrix_names)} options") + + # Compute drift sets + new_options = sorted(json_names - matrix_names) # in JSON but not in matrix + removed_options = sorted(matrix_names - json_names) # in matrix but not in JSON + + drift_detected = bool(new_options or removed_options) + + if not drift_detected: + print("\u2713 No drift detected.") + return 0 + + # Report drift + if new_options: + print(f"\nNEW options (in options.json, not in skill):") + for name in new_options: + print(f" - {name}") + + if 
removed_options: + print(f"\nREMOVED options (in skill, not in options.json):") + for name in removed_options: + print(f" - {name}") + + print( + "\n\u2717 Drift detected. " + "Update skills/odl-pdf/references/options-matrix.md to match options.json." + ) + return 1 + + +if __name__ == "__main__": + sys.exit(main())