opendataloader-project · hyunhee-jo · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
@@ -0,0 +1,22 @@
+{
+  "name": "opendataloader-pdf",
+  "owner": {
+    "name": "OpenDataLoader Project"
+  },
+  "metadata": {
+    "description": "AI-powered PDF extraction guidance and automation",
+    "version": "0.1.0"
+  },
+  "plugins": [
+    {
+      "name": "odl-pdf-skills",
+      "version": "0.1.0",
+      "description": "Expert guidance for opendataloader-pdf — environment detection, option recommendations, hybrid mode setup, quality diagnostics, and direct conversion execution",
+      "homepage": "https://github.com/opendataloader-project/opendataloader-pdf/tree/main/skills/odl-pdf",
+      "source": "./",
+      "skills": [
+        "./skills/odl-pdf"
+      ]
+    }
+  ]
+}
@@ -0,0 +1,45 @@
+# skill-drift-check.yml
+# Ensures skill references stay in sync with options.json when CLI options change.
+# Runs sync-skill-refs.py and fails the check if drift is detected (exit code 1).
+
+name: Skill Drift Check
+
+# Run whenever either side of the comparison changes (options.json or the
+# hand-maintained options-matrix.md), when the checker script changes, or
+# when this workflow itself is edited. Without the options-matrix.md entry,
+# a change that only touches the matrix would never be drift-checked here.
+on:
+  push:
+    paths:
+      - 'options.json'
+      - 'skills/odl-pdf/references/options-matrix.md'
+      - 'skills/odl-pdf/scripts/sync-skill-refs.py'
+      - '.github/workflows/skill-drift-check.yml'
+  pull_request:
+    paths:
+      - 'options.json'
+      - 'skills/odl-pdf/references/options-matrix.md'
+      - 'skills/odl-pdf/scripts/sync-skill-refs.py'
+      - '.github/workflows/skill-drift-check.yml'
+  workflow_dispatch:
+
+# The job only reads the repository, so no write scopes are granted.
+permissions:
+  contents: read
+
+jobs:
+  check-drift:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      # Maps the checker's exit status onto the job result:
+      # 0 passes silently, 1 is reported as drift, any other status is
+      # reported as an input/script error and propagated unchanged.
+      - name: Check skill drift
+        run: |
+          # Turn errexit off so the status can be inspected instead of
+          # aborting the step immediately.
+          set +e
+          python skills/odl-pdf/scripts/sync-skill-refs.py
+          EXIT_CODE=$?
+          case "$EXIT_CODE" in
+            0)
+              ;;
+            1)
+              echo ""
+              echo "Drift detected: skill references are out of sync with options.json."
+              echo "Update skills/odl-pdf/references/options-matrix.md to match options.json."
+              exit 1
+              ;;
+            *)
+              echo ""
+              echo "Drift check failed due to an input/script error (exit $EXIT_CODE)."
+              exit $EXIT_CODE
+              ;;
+          esac
@@ -0,0 +1,106 @@
+# skill-smoke-test.yml
+# Cross-platform smoke test for the odl-pdf skill's executable assets.
+# Runs the shell scripts and Python scripts on ubuntu / windows / macos
+# to catch platform-specific regressions (line endings, console encoding,
+# shell portability) BEFORE a PR merges. Does NOT hit any external API.
+
+name: Skill Smoke Test
+
+# Trigger on every path the steps below read: the scripts under test,
+# SKILL.md, the reference docs, the eval fixtures (the Windows cp1252 step
+# feeds evals/evals.json to quick-eval.py), and this workflow file.
+on:
+  push:
+    paths:
+      - 'skills/odl-pdf/scripts/**'
+      - 'skills/odl-pdf/SKILL.md'
+      - 'skills/odl-pdf/references/**'
+      - 'skills/odl-pdf/evals/**'
+      - '.github/workflows/skill-smoke-test.yml'
+  pull_request:
+    paths:
+      - 'skills/odl-pdf/scripts/**'
+      - 'skills/odl-pdf/SKILL.md'
+      - 'skills/odl-pdf/references/**'
+      - 'skills/odl-pdf/evals/**'
+      - '.github/workflows/skill-smoke-test.yml'
+  workflow_dispatch:
+
+# The job only reads the repository, so no write scopes are granted.
+permissions:
+  contents: read
+
+jobs:
+  smoke-test:
+    # One leg per runner OS; fail-fast is off so a failure on one OS does
+    # not cancel the remaining legs.
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - ubuntu-latest
+          - windows-latest
+          - macos-latest
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 10
+
+    defaults:
+      run:
+        # Every step runs under bash unless it overrides `shell`.
+        # On Windows runners this resolves to the pre-installed Git Bash.
+        shell: bash
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+
+      # Log the OS label plus the bash and Python versions for debugging.
+      - name: Show runner info
+        run: |
+          echo "OS: ${{ matrix.os }}"
+          bash --version | head -1
+          python --version
+
+      # --- detect-env.sh -------------------------------------------------
+      # Runs detect-env.sh once, echoes its output to the log, then requires
+      # a line starting with `<KEY>=` for each of the seven expected keys.
+      - name: detect-env.sh emits all 7 keys
+        run: |
+          out=$(bash skills/odl-pdf/scripts/detect-env.sh)
+          echo "$out"
+          # Feed grep from a here-string instead of a pipe. This step runs
+          # under `bash -eo pipefail`, so with `echo ... | grep -q` an early
+          # grep exit can kill echo with SIGPIPE and make the pipeline fail
+          # even though the key was found.
+          for key in OS JAVA PYTHON NODE ODL_INSTALLED ODL_VERSION HYBRID_EXTRAS; do
+            grep -q "^${key}=" <<< "$out" \
+              || { echo "MISSING KEY: $key"; exit 1; }
+          done
+          echo "all 7 keys present"
+
+      # --- hybrid-health.sh (no server running is expected) --------------
+      # The script must exit 0 (the step runs with errexit) and its output
+      # must contain a HYBRID_SERVER= entry.
+      - name: hybrid-health.sh handles no-server gracefully
+        run: |
+          out=$(bash skills/odl-pdf/scripts/hybrid-health.sh)
+          echo "$out"
+          # Here-string instead of `echo ... | grep -q`: under
+          # `bash -eo pipefail` an early grep exit can SIGPIPE the echo and
+          # turn a successful match into a failed pipeline.
+          grep -q "HYBRID_SERVER=" <<< "$out" \
+            || { echo "missing HYBRID_SERVER key"; exit 1; }
+
+      # --- quick-eval.py -------------------------------------------------
+      # The CLI must at least start and print its usage text.
+      - name: quick-eval.py --help
+        run: python skills/odl-pdf/scripts/quick-eval.py --help
+
+      # Two byte-identical Markdown files: the script must exit 0.
+      - name: quick-eval.py identical files -> PASS
+        run: |
+          workdir=$(mktemp -d)
+          printf '# Test\n\nSample paragraph one.\nSample paragraph two.\n' > "$workdir/a.md"
+          cp "$workdir/a.md" "$workdir/b.md"
+          python skills/odl-pdf/scripts/quick-eval.py "$workdir/a.md" "$workdir/b.md"
+          rm -rf "$workdir"
+
+      # Two unrelated one-line files: the script must exit with status 1.
+      - name: quick-eval.py different files -> FAIL (exit 1)
+        run: |
+          workdir=$(mktemp -d)
+          printf 'apple pie recipe\n' > "$workdir/a.md"
+          printf 'quantum physics lecture\n' > "$workdir/b.md"
+          # Capture the status through `||` so errexit does not abort the
+          # step on the non-zero exit this test expects.
+          rc=0
+          python skills/odl-pdf/scripts/quick-eval.py "$workdir/a.md" "$workdir/b.md" || rc=$?
+          rm -rf "$workdir"
+          if [ "$rc" != "1" ]; then
+            echo "expected exit 1, got $rc"
+            exit 1
+          fi
+
+      # Windows-only leg: overrides the job's default bash shell with cmd,
+      # switches the active code page to 1252, then runs quick-eval.py with
+      # evals.json as both inputs.
+      # NOTE(review): presumably guards against an encoding error when
+      # printing non-cp1252 characters; confirm against the original
+      # regression report.
+      - name: quick-eval.py prints em-dash-free output on cp1252 locale (Windows regression)
+        if: matrix.os == 'windows-latest'
+        shell: cmd
+        run: |
+          chcp 1252
+          python skills\odl-pdf\scripts\quick-eval.py skills\odl-pdf\evals\evals.json skills\odl-pdf\evals\evals.json
+
+      # --- sync-skill-refs.py --------------------------------------------
+      # Runs the drift checker on every matrix OS; any non-zero exit fails
+      # the step.
+      - name: sync-skill-refs.py reports no drift
+        run: python skills/odl-pdf/scripts/sync-skill-refs.py
@@ -78,4 +78,4 @@ content/docs/
 # Configuration files
 .claude/settings.local.json
 .claude/plans/
-
+.claude/review-rounds.md
@@ -21,3 +21,80 @@ Manual docs live in opendataloader.org repo. Reference docs (CLI options, JSON s
 - `./scripts/bench.sh --check-regression` — CI mode with threshold check
 - Benchmark code lives in [opendataloader-bench](https://github.com/opendataloader-project/opendataloader-bench)
 - Metrics: **NID** (reading order), **TEDS** (table structure), **MHS** (heading structure), **Table Detection F1**, **Speed**
+
+## Agent Skills
+
+`skills/odl-pdf/` contains the public agent skill shipped with this project.
+
+When adding or changing CLI options in Java, the following files may need
+manual updates. The drift CI (`skill-drift-check.yml`) only enforces step 2;
+the others are NOT auto-checked and will silently go stale if missed:
+
+1. Run `npm run sync` (regenerates `options.json` + Python/Node bindings)
+2. **Always**: update `skills/odl-pdf/references/options-matrix.md` to add /
+   rename / remove the row matching `options.json`. Drift CI enforces option
+   **names** here; description text is not auto-checked.
+3. **If the option is hybrid-related** (`--hybrid-*`, server flags like
+   `--enrich-*`, `--force-ocr`, `--ocr-lang`): also update
+   `skills/odl-pdf/references/hybrid-guide.md` — Client Options table, Server
+   Configuration table, or both.
+4. **If the option is a new output format or affects format selection** (touches
+   the `--format` enum, image handling, page separators): also update
+   `skills/odl-pdf/references/format-guide.md` and the Output Pipeline section
+   of `skills/odl-pdf/references/integration-examples.md`.
+5. **If the option introduces a silent failure mode, an unsafe default, or a
+   prerequisite**: also add it to the **Critical Gotchas** section of
+   `skills/odl-pdf/SKILL.md`. Silent failures (e.g., enrichments skipped in
+   `--hybrid-mode auto`, JVM cold-start cost on per-file calls) are the class
+   of issue the skill exists to surface — keep the gotchas list current.
+6. **If the option changes the recommended escalation path** for a quality
+   metric (NID / TEDS / MHS / Table Detection F1): also update the
+   corresponding Low-* section of `skills/odl-pdf/references/eval-metrics.md`.
+
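+The `skill-smoke-test.yml` workflow runs automatically on pushes and pull
+requests that touch the skill's scripts, `SKILL.md`, or reference docs (see
+the workflow's `paths` filters). It verifies cross-platform shell and Python
+script behavior on ubuntu/windows/macos; it does not exercise model behavior.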
+
+When bumping the minimum Java version (raising
+`<maven.compiler.source>` / `<maven.compiler.target>` in `java/pom.xml`),
+also update every explicit "Java 11" / "Java 11+" mention in these
+skill files — pip-installed users do not have `java/pom.xml` on disk
+and rely on the skill to state the concrete minimum:
+
+- `skills/odl-pdf/SKILL.md` — Persona, Phase 2A prerequisite and the
+  user-facing message, Action Mode A1 environment check, Gotcha 1
+  (title, body, Resolution, user-facing message), Session Checklist
+- `skills/odl-pdf/references/installation-matrix.md` — Prerequisites
+  paragraph and the Version Compatibility table's footer note
+- `skills/odl-pdf/references/integration-examples.md` — opening
+  requirement line
+- `skills/odl-pdf/evals/evals.json` — eval-006 `must_mention` array
+  (currently `"Java 11"`)
+
+The same pattern applies when the Python or Node.js runtime floor
+bumps, though the urgency is asymmetric:
+
+- **Node.js — peer to Java in silent-failure terms.** `npm` treats
+  `engines.node` in `node/opendataloader-pdf/package.json` as advisory
+  by default (`npm warn EBADENGINE` then installs anyway), so a user
+  below the floor gets a cryptic runtime error rather than a blocked
+  install. When bumping `engines.node`, grep for the current value
+  (e.g. `Node.js 20.19+`) across `skills/odl-pdf/` and update every
+  match. `pnpm` is strict by default, but the skill cannot assume the
+  user's package manager.
+- **Python — loud install failure.** Modern `pip` strictly enforces
+  `requires-python` in `python/opendataloader-pdf/pyproject.toml` and
+  refuses to install with a clear error. Surfacing the floor still
+  saves an agent-user round-trip, so when bumping `requires-python`,
+  grep for the current value (e.g. `Python 3.10+`) across
+  `skills/odl-pdf/` and update every match.
+
+Current Python/Node.js floor mentions live in the same skill locations
+as the Java ones: `SKILL.md` Persona, Phase 2A decision tree and
+default note, Session Checklist; `installation-matrix.md` Decision
+Tree and Prerequisites; `integration-examples.md` opening line. Grep
+is the authoritative discovery method for either bump — the file list
+above is a navigation aid, not a substitute for a fresh grep.
+
+The skill is written in English for external users. Do not include internal
+team terminology or company-specific policies.
@@ -134,5 +134,19 @@ git commit -s -m "your message"
 
 Make sure your Git config contains your real name and email.
 
+## Agent Skills Maintenance
+
+This project ships a built-in agent skill at `skills/odl-pdf/`. When you add
+or modify CLI options:
+
+1. Run `npm run sync` as usual
+2. Update `skills/odl-pdf/references/options-matrix.md` — add the new option
+   to the appropriate category with its type, default, and description
+3. If the new option has interaction rules with existing options (e.g., requires
+   another option to be set), document the rule in the "Interaction Rules" section
+
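+The CI workflow `skill-drift-check.yml` flags mismatches in option **names**
+between `options.json` and `options-matrix.md`. Other columns, such as
+descriptions, are not checked automatically, so review them by hand.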
+
 Thank you again for helping us improve this project! 🙌
 If you have any questions, open an issue or join the discussion.
@@ -451,6 +451,35 @@ Existing PDFs (untagged)
 
 [PDF Accessibility Guide](https://opendataloader.org/docs/accessibility-compliance)
 
+## Agent Skills
+
+Your AI coding agent knows how to use opendataloader-pdf — optimal options,
+hybrid mode setup, and quality diagnostics, all handled automatically.
+
+The skill follows the [Agent Skills](https://agentskills.io) open format and is natively supported in **Claude Code** via the included [`.claude-plugin/marketplace.json`](.claude-plugin/marketplace.json).
+
+### What the Skill Does
+
+| Phase | Description |
+|-------|-------------|
+| **Discover** | Detects your OS, Java, Python, Node.js, and ODL installation |
+| **Prescribe** | Recommends optimal install method, options, format, and mode |
+| **Execute** | Generates ready-to-run commands or runs conversions directly |
+| **Diagnose** | Identifies quality issues and escalates (local → cluster → hybrid) |
+| **Optimize** | Tunes batch processing, RAG integration, and performance |
+
+### Install
+
+Requires Java 11+ and Python 3.10+ with `opendataloader-pdf >= 2.2.0` (Node.js 20.19+ or Java SDK also supported).
+
+```bash
+npx skills add opendataloader-project/opendataloader-pdf --skill odl-pdf
+```
+
+After installation, invoke with `/odl-pdf` in Claude Code.
+
+For clients without a skills installer, copy [`skills/odl-pdf/`](skills/odl-pdf/) into the client's skills directory (location varies by client — see its docs).
+
 ## Roadmap
 
 | Feature | Timeline | Tier |