Skip to content

Commit 943be3d

Browse files
committed
fix tags ARRAY column (#212)
1 parent d856ec8 commit 943be3d

5 files changed

Lines changed: 120 additions & 36 deletions

File tree

.github/workflows/update-db.yaml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,6 @@
11
name: Update DB
22

33
on:
4-
push:
5-
branches:
6-
- main
7-
pull_request:
8-
branches:
9-
- main
104
workflow_dispatch:
115
repository_dispatch:
126
types: [update-db] # This is the event type we'll trigger from the API. It is currently used in carbonplan/offsets-db-download/.github/workflows

offsets_db_api/tasks.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import csv
22
import datetime
33
import io
4+
import json
45
import random
56
import time
67
import traceback
@@ -179,13 +180,23 @@ def process_dataframe(
179180
for col_name, dtype in dtype_dict.items():
180181
if 'ARRAY' in str(dtype) and col_name in df.columns:
181182
logger.info(f'Converting column {col_name} to PostgreSQL array format')
182-
df[col_name] = df[col_name].apply(
183-
lambda x: (
184-
'{' + ','.join(str(i) for i in x) + '}'
185-
if (hasattr(x, '__iter__') and not isinstance(x, str))
186-
else x
187-
)
188-
)
183+
184+
def _to_pg_array(x):
185+
if x is None:
186+
return x
187+
if isinstance(x, str):
188+
try:
189+
parsed = json.loads(x)
190+
if isinstance(parsed, list):
191+
return '{' + ','.join(str(i) for i in parsed) + '}'
192+
except (json.JSONDecodeError, ValueError):
193+
pass
194+
return x
195+
if hasattr(x, '__iter__'):
196+
return '{' + ','.join(str(i) for i in x) + '}'
197+
return x
198+
199+
df[col_name] = df[col_name].apply(_to_pg_array)
189200

190201
with engine.begin() as conn:
191202
if table_name == 'credit':
@@ -525,6 +536,11 @@ def truncate_clip_tables():
525536
for _, row in df.iterrows():
526537
clip_id = row['id'] # Assuming 'id' is the primary key in Clip model
527538
project_ids = row['project_ids']
539+
if isinstance(project_ids, str):
540+
try:
541+
project_ids = json.loads(project_ids)
542+
except (json.JSONDecodeError, ValueError):
543+
project_ids = [project_ids]
528544
for project_id in project_ids:
529545
clip_projects_data.append({'id': index, 'clip_id': clip_id, 'project_id': project_id})
530546
index += 1

pixi.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,12 @@
134134
db-down = "docker compose down"
135135
db-logs = "docker compose logs -f db"
136136
db-up = "docker compose up -d --wait db"
137+
# Wipe data volumes and start a completely fresh DB (WARNING: destroys all local data).
138+
db-reset = "docker compose down -v && docker compose up -d --wait db"
137139
# Start DB, wait until healthy, then apply migrations (one-time setup or after a schema change).
138140
dev-setup = { depends-on = ["db-up", "migrate"] }
141+
# Wipe DB, recreate, and apply migrations — use when the schema is broken or migrations are out of sync.
142+
dev-reset = { depends-on = ["db-reset", "migrate"] }
139143
# Start DB then launch the dev server (daily startup).
140144
dev = { depends-on = ["db-up", "serve"] }
141145

@@ -149,6 +153,8 @@
149153
# ── Database updates ──────────────────────────────────────────────────────
150154
update-db-production = "python scripts/update_database.py production --url https://offsets-db.fly.dev/files/"
151155
update-db-staging = "python scripts/update_database.py staging --url https://offsets-db-staging.fly.dev/files/"
156+
# Seed local DB from staging-files.json (requires `pixi run serve` to be running in another terminal).
157+
seed-local = "OFFSETS_DB_API_KEY_STAGING=local-dev-key python scripts/update_database.py staging --url http://127.0.0.1:8000/files/"
152158

153159
# ── Load testing ──────────────────────────────────────────────────────────
154160
loadtest = "locust -f load-testing/locustfile.py"

scripts/update_database.py

Lines changed: 89 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
import json
44
import os
55
import sys
6+
import time
67

78
import fsspec
9+
import httpx
810
import pandas as pd
9-
import requests
1011

1112

1213
def generate_path(*, date: datetime.date, bucket: str, category: str) -> str:
@@ -77,43 +78,110 @@ def load_files_from_json(file_path: str) -> list[dict[str, str]]:
7778
sys.exit(1)
7879

7980

81+
def _get_api_key(env: str) -> str:
82+
var = 'OFFSETS_DB_API_KEY_PRODUCTION' if env == 'production' else 'OFFSETS_DB_API_KEY_STAGING'
83+
key = os.environ.get(var)
84+
if key is None:
85+
raise ValueError(f'{var} environment variable not set')
86+
return key
87+
88+
89+
def _request(method: str, url: str, headers: dict, timeout: float = 30, **kwargs) -> httpx.Response:
    """Issue an HTTP request via httpx, exiting the process (status 1) with a
    friendly message on connection failure or timeout."""
    try:
        response = httpx.request(method, url, headers=headers, timeout=timeout, **kwargs)
    except httpx.ConnectError:
        print(f'Error: could not connect to {url}')
        print('Is the API server running?')
        sys.exit(1)
    except httpx.TimeoutException:
        print(f'Error: request to {url} timed out after {timeout:.0f}s')
        sys.exit(1)
    return response
99+
100+
101+
def _poll_until_complete(
    *,
    base_url: str,
    file_ids: list[int],
    headers: dict,
    initial_delay: float = 2.0,
    max_delay: float = 30.0,
    timeout: float = 600.0,
) -> list[dict]:
    """Poll each file's status endpoint with exponential backoff until every
    file leaves the 'pending' state, returning the final status payloads.

    Exits the process (status 1) if *timeout* seconds elapse before all files
    complete. Per-file HTTP errors while polling are logged and retried on
    the next round rather than treated as terminal.
    """
    remaining = set(file_ids)
    finished: dict[int, dict] = {}
    deadline = time.monotonic() + timeout
    wait = initial_delay
    # Status URL root is invariant across iterations; compute once.
    status_root = base_url.rstrip('/')

    print(f'\nPolling status for {len(file_ids)} file(s)...')

    while remaining:
        if time.monotonic() > deadline:
            timed_out = [str(i) for i in remaining]
            print(f'Timed out waiting for file(s): {", ".join(timed_out)}')
            sys.exit(1)

        # Back off exponentially, capped at max_delay.
        time.sleep(wait)
        wait = min(wait * 2, max_delay)

        for file_id in list(remaining):
            resp = _request('GET', f'{status_root}/{file_id}', headers=headers)
            if not resp.is_success:
                print(f' [{file_id}] HTTP {resp.status_code} polling status — skipping')
                continue
            file = resp.json()
            if file['status'] == 'pending':
                continue
            remaining.discard(file_id)
            finished[file_id] = file
            icon = '✓' if file['status'] == 'success' else '✗'
            error = f' error: {file["error"]}' if file.get('error') else ''
            print(
                f' {icon} [{file_id}] {file["category"]:8s} {file["status"]:8s} {file["url"]}{error}'
            )

    return list(finished.values())
143+
144+
80145
def post_data_to_environment(
    *,
    env: str,
    url: str,
    files: list[dict[str, str]],
    post_timeout: float = 300,
) -> None:
    """Post file definitions to the API and poll until all are processed.

    Exits the process with status 1 if the POST fails, polling times out,
    or any file finishes with status 'failure'.
    """
    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json',
        'X-API-KEY': _get_api_key(env),
    }

    print(f'\nSending {len(files)} file(s) to {url}:')
    for file in files:
        print(f'- {file["category"]}: {file["url"]}')

    response = _request('POST', url, headers=headers, json=files, timeout=post_timeout)
    if not response.is_success:
        print(f'\nFailed in {env}: HTTP {response.status_code} {response.reason_phrase}')
        if body := response.text.strip():
            print(body)
        sys.exit(1)

    # The API echoes back the queued file records; collect their ids for polling.
    file_ids = [item['id'] for item in response.json()]
    print(f'Queued {len(file_ids)} file(s) with ids: {file_ids}')

    results = _poll_until_complete(base_url=url, file_ids=file_ids, headers=headers)

    failures = [item for item in results if item['status'] == 'failure']
    if failures:
        print(f'\n{len(failures)} file(s) failed in {env}:')
        for item in failures:
            print(f' - [{item["id"]}] {item["url"]}')
            if item.get('error'):
                print(f' {item["error"]}')
        sys.exit(1)

    print(f'\nAll {len(results)} file(s) processed successfully in {env}.')

118186
def main():
119187
parser = argparse.ArgumentParser(

0 commit comments

Comments
 (0)