Skip to content

Commit bbdb64a

Browse files
authored
Merge pull request #1 from datopian/feat/gen-target-schema
CKAN Harvester implementation
2 parents 7843b42 + 07bb18e commit bbdb64a

16 files changed

Lines changed: 473 additions & 111 deletions

.env.example

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# CKAN source
2+
SOURCE_CKAN_URL=https://ckan.com
3+
SOURCE_CKAN_API_KEY=
4+
SOURCE_CKAN_ORG_ID=
5+
6+
# PortalJS Cloud target
7+
PORTALJS_CKAN_URL=https://my-org.portaljs.com
8+
PORTALJS_CKAN_API_KEY=xyz
9+
PORTALJS_ORG_ID=my-org
10+
11+
# Harvest behavior
12+
CONCURRENCY=4
13+
RATE_LIMIT_RPS=2
14+
RETRY_MAX_ATTEMPTS=2
15+
RETRY_BASE_MS=500
16+
17+
# Incremental window
18+
# If set, harvest only datasets with metadata_modified >= SINCE_ISO
19+
SINCE_ISO=2025-02-01T00:00:00Z
20+
# Alternatively, roll-forward state (persisted between runs)
21+
STATE_FILE=.harvest_state.json
22+

.harvest_state.json

Lines changed: 0 additions & 3 deletions
This file was deleted.

README.md

Lines changed: 117 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,134 @@
1-
Open-source framework and scripts for harvesting datasets into [PortalJS](https://portaljs.com).
2-
This repo is designed as a **template** — fork or clone it to quickly set up your own dataset harvesting pipelines.
1+
# PortalJS CKAN Harvester
32

4-
It includes:
3+
A template harvester that pulls datasets from a **CKAN source** and upserts them into a **PortalJS CKAN target**.
54

6-
* Reusable scripts for extracting datasets from common sources (APIs, CSVs, spreadsheets, etc.)
7-
* A plug-and-play **ETL framework** for transforming and publishing datasets
8-
* GitHub Actions workflow for automated harvesting
9-
* Config-driven setup — no need to hard-wire pipelines
5+
**fetch → map → upsert**
106

11-
## 🚀 Quickstart
7+
---
128

13-
1. **Use this template**
14-
Click **“Use this template”** on GitHub to bootstrap your own repo.
9+
## Quick Start
1510

16-
2. **Configure harvesters**
17-
Edit `config.yml` to define dataset sources and pipelines:
11+
```bash
12+
npm install
13+
cp .env.example .env # or edit the existing .env
14+
npm start # run the harvester
15+
```
1816

19-
```yaml
20-
sources:
21-
- name: world-bank
22-
type: api
23-
url: https://api.worldbank.org/v2/
24-
format: json
25-
```
17+
---
2618

27-
3. **Run**
19+
## Environment Variables (.env)
2820

29-
TODO
21+
Use these exact names. Example values are placeholders:
3022

31-
4. **Automate with GitHub Actions**
32-
Push your repo — harvesting will run on schedule using the included workflow (`.github/workflows/harvest.yml`).
23+
```env
24+
# CKAN source
25+
SOURCE_CKAN_URL=<https://source-ckan.example.org>
26+
SOURCE_CKAN_API_KEY=<source-api-key-or-empty>
27+
SOURCE_CKAN_ORG_ID=<org-slug-or-empty>
3328
34-
## 🛠 Features
29+
# PortalJS Cloud target
30+
PORTALJS_CKAN_URL=<http://localhost:5000>
31+
PORTALJS_CKAN_API_KEY=<target-api-key>
32+
PORTALJS_ORG_ID=<target-org-id>
3533
36-
* **Modular scripts** – add your own connectors or reuse provided ones
37-
* **Config-driven** – no need to edit code for new datasets
38-
* **CI/CD ready** – run pipelines directly in GitHub Actions
39-
* **Extensible** – works with PortalJS or standalone
34+
# Harvest behavior
35+
CONCURRENCY=4
36+
RATE_LIMIT_RPS=2
37+
RETRY_MAX_ATTEMPTS=2
38+
RETRY_BASE_MS=500
4039
41-
## 📦 Repo Structure
40+
# Incremental window
41+
SINCE_ISO=2025-02-01T00:00:00Z
42+
STATE_FILE=.harvest_state.json
4243
43-
TODO
44+
```
4445

45-
## 🤝 Contributing
46+
* **`SOURCE_CKAN_URL`** – source CKAN base URL
4647

47-
PRs and new connectors welcome!
48-
Please open an issue if you’d like to propose a new feature or source integration.
48+
* **`SOURCE_CKAN_API_KEY`** – source API key (optional)
4949

50-
## 📄 License
50+
* **`SOURCE_CKAN_ORG_ID`** – restrict harvest to one org (optional, empty = harvest all)
5151

52-
MIT License. See [LICENSE](./LICENSE) for details.
52+
* **`PORTALJS_CKAN_URL`** – target CKAN base URL
53+
54+
* **`PORTALJS_CKAN_API_KEY`** – target API key (**required**)
55+
56+
* **`PORTALJS_ORG_ID`** – target org where datasets will be created (must exist first)
57+
58+
* **`CONCURRENCY`** – how many datasets to process in parallel (optional, default 4)
59+
60+
* **`RATE_LIMIT_RPS`** – max HTTP requests per second (optional, default 2)
61+
62+
* **`RETRY_MAX_ATTEMPTS`** – number of retry attempts on failure (optional, default 2)
63+
64+
* **`RETRY_BASE_MS`** – base delay (ms) for exponential backoff (optional, default 500)
65+
66+
* **`SINCE_ISO`** – harvest only datasets modified after this date (overrides state file) (optional)
67+
68+
* **`STATE_FILE`** – JSON file used to track last run. Stores `lastRunISO`. Lets the harvester run incrementally instead of fetching everything every time.
69+
70+
---
71+
72+
## How It Works
73+
74+
1. **Discover** datasets from source CKAN (`package_search`), filtered by org and/or date.
75+
2. **Map** each dataset from source schema → target schema.
76+
3. **Upsert** into target CKAN (update if exists, create if not).
77+
4. **Persist state** in `STATE_FILE` for the next incremental run.
78+
79+
---
80+
81+
## Project Structure
82+
83+
84+
85+
* **`index.ts`** – main entry. Loads env + state, chooses full vs incremental run, loops datasets, maps, upserts, logs results, updates state.
86+
* **`config.ts`** – loads `.env` with `dotenv` and validates using **Zod**.
87+
* **`gen-schema.ts`** – generates `schemas/target-schema.d.ts` from target CKAN scheming API.
88+
* **`.github/workflows/run-index.yml`** – GitHub Action to run on schedule or manual trigger.
89+
90+
* **`schemas/`**
91+
92+
* **`source-schema.d.ts`** – interface for source datasets.
93+
* **`target-schema.d.ts`** – auto-generated interface for target datasets.
94+
95+
* **`src/`**
96+
97+
* **`source.ts`** – source CKAN client.
98+
99+
* `iterSourcePackages()` async generator over `package_search`.
100+
* Supports org filter and incremental filtering (`metadata_modified >= …`).
101+
102+
* **`target.ts`** – target CKAN helpers.
103+
104+
* Preloads dataset list with `package_list`.
105+
* `upsertPortalDataset()` creates or updates dataset with API key.
106+
107+
* **`map.ts`** – mapping logic.
108+
109+
* Sets `owner_org` to `PORTALJS_ORG_ID`.
110+
* Prefixes dataset `name` with `<owner_org>--` (unique, PortalJS-friendly).
111+
* Maps `title`, `notes`, resources, and ensures defaults (language = EN, description fallback, etc.).
112+
113+
* **`state.ts`** – reads/writes the `STATE_FILE` JSON.
114+
115+
* **`utils.ts`** – small helpers (`withRetry()`, `sleep()`, etc.).
116+
117+
---
118+
119+
## Running
120+
121+
1. Edit `.env`.
122+
2. Run `npm start`.
123+
3. Logs will show:
124+
125+
* “Full harvest mode” or “Incremental mode since <ISO>”
126+
* Final summary: `total=… upserts=… failures=…`
127+
128+
---
129+
130+
## Extending
131+
132+
* **Mapping** – extend `src/map.ts` to add fields (tags, extras, licenses, etc.).
133+
* **Filters** – extend `iterSourcePackages()` to filter by groups, tags, etc.
134+
* **Retries** – tweak retry/backoff logic in `utils.ts`.

config.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,19 @@ config(); // loads .env
55
import { z } from "zod";
66

77
const EnvSchema = z.object({
8-
CKAN_BASE_URL: z.string().url(),
9-
CKAN_API_KEY: z.string().optional(),
10-
CKAN_ORG_ID: z.string().optional(),
8+
SOURCE_CKAN_URL: z.string().url(),
9+
SOURCE_CKAN_API_KEY: z.string().optional(),
10+
SOURCE_CKAN_ORG_ID: z.string().optional(),
1111

12-
PORTALJS_BASE_URL: z.string().url(),
13-
PORTALJS_API_TOKEN: z.string().min(1),
12+
PORTALJS_CKAN_URL: z.string().url(),
13+
PORTALJS_CKAN_API_KEY: z.string().min(1),
1414
PORTALJS_ORG_ID: z.string().min(1),
1515

1616
CONCURRENCY: z.coerce.number().default(4),
1717
RATE_LIMIT_RPS: z.coerce.number().default(2),
18-
RETRY_MAX_ATTEMPTS: z.coerce.number().default(5),
18+
RETRY_MAX_ATTEMPTS: z.coerce.number().default(2),
1919
RETRY_BASE_MS: z.coerce.number().default(500),
20-
DRY_RUN: z.preprocess(v => String(v).toLowerCase() === "true", z.boolean()).default(false),
20+
DRY_RUN: z.coerce.boolean().default(false),
2121

2222
SINCE_ISO: z.string().optional(),
2323
STATE_FILE: z.string().default(".harvest_state.json"),

gen-schema.ts

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// generate-types.ts
2+
import fs from "fs";
3+
import { env } from "./config";
4+
import { capitalize } from "./src/utils";
5+
6+
interface SchemaField {
7+
field_name: string;
8+
choices?: Array<{ value: string; label: string }>;
9+
validators?: string | string[];
10+
}
11+
12+
async function generateTypes(type: string, schemaUrl: string) {
13+
const res = await fetch(schemaUrl);
14+
if (!res.ok) {
15+
const text = await res.text().catch(() => "");
16+
throw new Error(
17+
`Failed to fetch schema from ${schemaUrl}: ${res.status} ${
18+
res.statusText
19+
} ${text.slice(0, 300)}`
20+
);
21+
}
22+
const data = await res.json();
23+
const { dataset_fields, resource_fields } = data.result;
24+
25+
const datasetProps = dataset_fields.map((f: SchemaField) => {
26+
let tsType = "string";
27+
28+
if (f.choices) {
29+
tsType = f.choices.map((c) => JSON.stringify(String(c.value))).join(" | ");
30+
}
31+
32+
const validators = Array.isArray(f.validators) ? f.validators : (f.validators ? [f.validators] : []);
33+
const isRequired = validators.some(v => v.includes("not_empty") || v.includes("scheming_required"));
34+
35+
return ` ${f.field_name}${isRequired ? "" : "?"}: ${tsType};`;
36+
});
37+
38+
const resourceProps = resource_fields.map((f: SchemaField) => {
39+
return ` ${f.field_name}?: string;`;
40+
});
41+
42+
const content = `// Auto-generated from CKAN schema
43+
export interface ${capitalize(type)}Schema {
44+
${datasetProps.join("\n")}
45+
resources?: CkanResource[];
46+
}
47+
48+
export interface CkanResource {
49+
${resourceProps.join("\n")}
50+
}
51+
`;
52+
53+
fs.mkdirSync("schemas", { recursive: true });
54+
fs.writeFileSync(`schemas/${type}-schema.d.ts`, content);
55+
console.log(`✅ Types generated in schemas/${type}-schema.d.ts`);
56+
}
57+
58+
generateTypes(
59+
"target",
60+
env.PORTALJS_CKAN_URL +
61+
"/api/3/action/scheming_dataset_schema_show?type=dataset"
62+
);

index.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import Bottleneck from "bottleneck";
22
import { env } from "./config";
3-
import { iterSourcePackages } from "./src/ckan";
3+
import { iterSourcePackages } from "./src/source";
44
import { mapCkanToPortalJS } from "./src/map";
5-
import { upsertPortalDataset } from "./src/cloud";
5+
import { upsertPortalDataset } from "./src/target";
66
import { readState, writeState } from "./src/state";
77
import { withRetry } from "./src/utils";
88

@@ -29,7 +29,6 @@ async function main() {
2929
const job = async () => {
3030
try {
3131
const payload = mapCkanToPortalJS(ds, env.PORTALJS_ORG_ID);
32-
3332
await withRetry(() => upsertPortalDataset(payload), `upsert ${ds.name}`);
3433
upserts++;
3534
} catch (err: any) {

0 commit comments

Comments
 (0)