40 changes: 40 additions & 0 deletions ingestion/examples/README.md
# OpenMetadata AI SDK Recipes
### By Baibhav Prateek | Hackathon 2026

## Overview
This is my submission for the OpenMetadata Hackathon 2026.
I built three notebooks that showcase how to use AI with OpenMetadata.

## Notebooks
### 1. Metadata Health Report
- Connects to OpenMetadata
- Analyzes table and column documentation quality
- Generates health score and visual charts
- Saves results to CSV files

### 2. LangChain OpenMetadata Template
- Reusable template connecting AI to OpenMetadata
- Ask questions about your data in plain English
- Uses Groq AI (LLaMA 3) for natural language processing

### 3. OpenMetadata AI Agent
- Intelligent agent that decides how to search automatically
- Uses multiple tools to fetch the right data
- Most advanced of the three notebooks

## How to Run

### Prerequisites
pip install openmetadata-ingestion groq google-genai requests pandas matplotlib

Comment on lines +80 to +84
Copilot AI Apr 18, 2026

The README lists dependencies that are not used by these notebooks (e.g., google-genai, and openmetadata-ingestion even though the notebooks call the REST API via requests). Please align the installation instructions with the actual imports/usage, or refactor the notebooks to use the OpenMetadata Python SDK (openmetadata-ingestion / metadata.sdk) as advertised in the PR description.

### Setup
1. Get your OpenMetadata token from sandbox.open-metadata.org
2. Get your free Groq API key from console.groq.com
3. Replace the placeholder keys in Cell 1 of each notebook
4. Run all cells in order

## Technologies Used
- OpenMetadata API
- Groq AI (LLaMA 3.3 70b)
- Python, Pandas, Matplotlib
Copilot AI Apr 18, 2026

The PR description/issue mention using the OpenMetadata Python SDK, but the notebooks/README currently use raw REST calls via requests (and don’t demonstrate metadata.sdk). Please either update the examples to use the SDK client APIs, or adjust the README to clearly state these are REST-based examples.
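For reference, the SDK-based approach the reviewer mentions might look roughly like the following. This is a sketch based on the `openmetadata-ingestion` package, not code from this PR; the sandbox URL and token placeholder mirror the notebook's setup cell, and it needs a live server to actually run.

```python
# Hypothetical sketch: listing tables via the OpenMetadata Python SDK
# instead of raw REST calls. Requires `pip install openmetadata-ingestion`.
from metadata.generated.schema.entity.data.table import Table
from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
    OpenMetadataConnection,
)
from metadata.generated.schema.security.client.openMetadataJWTClientConfig import (
    OpenMetadataJWTClientConfig,
)
from metadata.ingestion.ometa.ometa_api import OpenMetadata

server_config = OpenMetadataConnection(
    hostPort="https://sandbox.open-metadata.org/api",
    authProvider="openmetadata",
    securityConfig=OpenMetadataJWTClientConfig(jwtToken="your_openmetadata_token_here"),
)
metadata = OpenMetadata(server_config)

# list_entities returns a paged response; .entities holds the Table objects
tables = metadata.list_entities(entity=Table, limit=10).entities
print([t.name for t in tables])
```

Whether the notebooks switch to this or stay REST-based, the README's dependency list should match the choice.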
- Jupyter Notebooks
gitar-bot[bot] marked this conversation as resolved.
261 changes: 261 additions & 0 deletions ingestion/examples/langchain_openmetadata_template.ipynb
{
"cells": [
{
"cell_type": "markdown",
"id": "2d542bc1-1752-4bbe-9cac-c88548ce6393",
"metadata": {},
"source": [
"# LangChain + OpenMetadata Template\n",
"### Built by Baibhav Prateek | OpenMetadata Hackathon 2026\n",
"\n",
"## What is this?\n",
"A reusable template that connects AI to OpenMetadata.\n",
Comment on lines +8 to +12
Copilot AI Apr 18, 2026

This notebook is titled “LangChain + OpenMetadata Template”, but the code shown here uses requests + groq directly and does not use LangChain. Please either implement a minimal LangChain chain/agent example (and add the needed dependency) or rename the notebook so the title matches the contents.

"Anyone can use this as a starting point for their own\n",
"AI-powered data catalog applications.\n",
"\n",
"## How to use this template:\n",
"1) Add your API keys\n",
"2) Run all cells in order\n",
"3) Ask your own questions\n",
"4) Customize the questions for your use case\n",
"\n",
"## Technologies used:\n",
"1) OpenMetadata API for metadata\n",
"2) Groq AI (LLaMA 3) for natural language processing\n",
"3) Python requests for API calls"
Comment on lines +8 to +25
Copilot AI Apr 18, 2026

The title/README call this a “LangChain + OpenMetadata Template”, but the notebook does not import or use LangChain at all (it directly calls Groq and requests). Either update the implementation to actually use LangChain primitives (e.g., LLM/Prompt/Tool abstractions), or rename the notebook and its description to avoid misleading users.

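For illustration, a minimal LangChain wiring along the lines the reviewer suggests might look like this. It is a hypothetical sketch assuming the `langchain-core` and `langchain-groq` packages (neither is part of this PR), and it needs a real Groq API key to run:

```python
# Hypothetical sketch: the notebook's ask-a-question flow expressed with
# LangChain primitives. Requires `pip install langchain-core langchain-groq`.
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama-3.3-70b-versatile", api_key="your_groq_api_key_here")

prompt = ChatPromptTemplate.from_template(
    "You are a helpful data catalog assistant.\n"
    "You have access to OpenMetadata with these tables: {tables}\n\n"
    "User question: {question}\n\nAnswer helpfully and concisely."
)

# Compose prompt -> model into a runnable chain (LCEL pipe syntax)
chain = prompt | llm

# table_names would come from the notebook's get_tables() helper
answer = chain.invoke({
    "tables": ["ACCOUNTS", "_airbyte_raw_customers"],
    "question": "Which tables relate to customers?",
})
print(answer.content)
```

If the notebook keeps its direct `groq` client instead, renaming the notebook would resolve the mismatch with less work.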
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac2f9ec7-80b3-4b2d-89ae-3bf237059733",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ Setup complete!\n"
]
}
],
"source": [
"import requests\n",
"import json\n",
"from groq import Groq\n",
"\n",
"# Your credentials\n",
"GROQ_API_KEY = \"your_groq_api_key_here\"\n",
"BASE_URL = \"https://sandbox.open-metadata.org\"\n",
"TOKEN = \"your_openmetadata_token_here\"\n",
"\n",
"HEADERS = {\n",
" \"Authorization\": f\"Bearer {TOKEN}\",\n",
" \"Content-Type\": \"application/json\"\n",
"}\n",
"\n",
"# Initialize Groq client\n",
"client = Groq(api_key=GROQ_API_KEY)\n",
"\n",
"print(\"✅ Setup complete!\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "cfa44929-6991-432b-8455-071cf8a12fe0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ Helper functions ready!\n"
]
}
Copilot AI Apr 18, 2026

The committed cells include execution outputs and non-null execution_count values. This makes diffs noisy and can go stale quickly (especially for API-driven responses). Please clear outputs/reset execution counts before committing, or ensure the outputs are intentionally kept and match the current code.

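One way to do this before committing, using only the standard library (`jupyter nbconvert --clear-output --inplace notebook.ipynb` would also work), is a small script along these lines; the file name in the usage comment is the notebook from this PR:

```python
import json

def clear_notebook_outputs(nb: dict) -> dict:
    """Strip outputs and reset execution counts on every code cell."""
    for cell in nb.get("cells", []):
        if cell.get("cell_type") == "code":
            cell["outputs"] = []
            cell["execution_count"] = None
    return nb

# Usage on a notebook file:
#   with open("langchain_openmetadata_template.ipynb") as f:
#       nb = json.load(f)
#   with open("langchain_openmetadata_template.ipynb", "w") as f:
#       json.dump(clear_notebook_outputs(nb), f, indent=1)
```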
],
"source": [
"# Functions to fetch data from OpenMetadata\n",
"def get_tables(limit=10):\n",
" response = requests.get(\n",
" f\"{BASE_URL}/api/v1/tables\",\n",
" headers=HEADERS,\n",
" params={\"limit\": limit}\n",
" )\n",
" return response.json().get(\"data\", [])\n",
"\n",
"def get_databases():\n",
" response = requests.get(\n",
" f\"{BASE_URL}/api/v1/databases\",\n",
" headers=HEADERS,\n",
" params={\"limit\": 20}\n",
" )\n",
" return response.json().get(\"data\", [])\n",
"\n",
"def search_assets(query):\n",
" response = requests.get(\n",
" f\"{BASE_URL}/api/v1/search/query\",\n",
" headers=HEADERS,\n",
" params={\"q\": query, \"index\": \"table_search_index\", \"limit\": 5}\n",
" )\n",
" return response.json().get(\"hits\", {}).get(\"hits\", [])\n",
Copilot AI Apr 18, 2026

This recipe makes HTTP calls to OpenMetadata but does not check for non-2xx responses before calling response.json(). If authentication fails or the server returns an error, this will raise confusing exceptions. Please add response.raise_for_status() (or explicit status checks) and surface a clear error message when the API call fails.

Suggested change
"def get_tables(limit=10):\n",
" response = requests.get(\n",
" f\"{BASE_URL}/api/v1/tables\",\n",
" headers=HEADERS,\n",
" params={\"limit\": limit}\n",
" )\n",
" return response.json().get(\"data\", [])\n",
"\n",
"def get_databases():\n",
" response = requests.get(\n",
" f\"{BASE_URL}/api/v1/databases\",\n",
" headers=HEADERS,\n",
" params={\"limit\": 20}\n",
" )\n",
" return response.json().get(\"data\", [])\n",
"\n",
"def search_assets(query):\n",
" response = requests.get(\n",
" f\"{BASE_URL}/api/v1/search/query\",\n",
" headers=HEADERS,\n",
" params={\"q\": query, \"index\": \"table_search_index\", \"limit\": 5}\n",
" )\n",
" return response.json().get(\"hits\", {}).get(\"hits\", [])\n",
"def openmetadata_get(path, params=None):\n",
" url = f\"{BASE_URL}{path}\"\n",
" response = requests.get(url, headers=HEADERS, params=params)\n",
" try:\n",
" response.raise_for_status()\n",
" except requests.HTTPError as exc:\n",
" error_body = response.text.strip()\n",
" raise RuntimeError(\n",
" f\"OpenMetadata API request failed for {url} with status \"\n",
" f\"{response.status_code}: {error_body or 'No response body returned.'}\"\n",
" ) from exc\n",
"\n",
" try:\n",
" return response.json()\n",
" except ValueError as exc:\n",
" raise RuntimeError(\n",
" f\"OpenMetadata API request to {url} returned a non-JSON response.\"\n",
" ) from exc\n",
"\n",
"def get_tables(limit=10):\n",
" response_json = openmetadata_get(\n",
" \"/api/v1/tables\",\n",
" params={\"limit\": limit}\n",
" )\n",
" return response_json.get(\"data\", [])\n",
"\n",
"def get_databases():\n",
" response_json = openmetadata_get(\n",
" \"/api/v1/databases\",\n",
" params={\"limit\": 20}\n",
" )\n",
" return response_json.get(\"data\", [])\n",
"\n",
"def search_assets(query):\n",
" response_json = openmetadata_get(\n",
" \"/api/v1/search/query\",\n",
" params={\"q\": query, \"index\": \"table_search_index\", \"limit\": 5}\n",
" )\n",
" return response_json.get(\"hits\", {}).get(\"hits\", [])\n",

"\n",
"def get_table_details(table_name):\n",
" results = search_assets(table_name)\n",
" if results:\n",
" return results[0].get(\"_source\", {})\n",
" return {}\n",
"\n",
"print(\"✅ Helper functions ready!\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ddbb5ecf-d621-43a7-a5b7-03ac2cdec978",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"🤖 AI says:\n",
"We have 9 tables in total. Some of the table names include 'ACCOUNTS', 'acct_issue_table', '_airbyte_raw_customers', '_airbyte_raw_orders', and others related to Airbyte raw data and a cdc table.\n"
]
}
],
"source": [
"# This function connects AI with OpenMetadata\n",
"# Step 1: Fetch real tables from OpenMetadata\n",
"# Step 2: Give that information to the AI as context\n",
"# Step 3: The AI uses that context to answer the question\n",
"# This way the AI always has up-to-date information\n",
"\n",
"def ask_ai(question):\n",
" # Fetch context from OpenMetadata\n",
" tables = get_tables(limit=10)\n",
" table_names = [t.get(\"name\", \"\") for t in tables]\n",
" \n",
" # Build prompt\n",
" prompt = f\"\"\"You are a helpful data catalog assistant.\n",
"You have access to OpenMetadata with these tables: {table_names}\n",
"\n",
"User question: {question}\n",
"\n",
"Answer helpfully and concisely.\"\"\"\n",
"\n",
" response = client.chat.completions.create(\n",
" model=\"llama-3.3-70b-versatile\",\n",
" messages=[{\"role\": \"user\", \"content\": prompt}]\n",
" )\n",
" return response.choices[0].message.content\n",
"\n",
"# Test it!\n",
"answer = ask_ai(\"How many tables do we have and what are some of their names?\")\n",
Copilot AI Apr 18, 2026

ask_ai() only fetches limit=10 tables and then prompts the model with that subset, but the demo question asks “How many tables do we have…”. With the current logic the answer can never reflect the full catalog and may be misleading. Please either (a) change the question/output wording to reflect the limited sample, or (b) fetch/paginate all tables (or at least a larger configurable sample) when the question is about totals.

Suggested change
"answer = ask_ai(\"How many tables do we have and what are some of their names?\")\n",
"answer = ask_ai(\"From the fetched sample of up to 10 tables, how many tables are listed and what are some of their names?\")\n",

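Option (b) could be sketched as a cursor-pagination helper like the one below. This is illustrative, not code from the PR: the page-fetching function is injected so the paging logic is shown independently of `requests`, and the `paging.after` cursor shape follows the OpenMetadata list endpoints:

```python
def fetch_all_tables(fetch_page, page_size=100):
    """Collect every table by following OpenMetadata's `paging.after` cursor.

    `fetch_page(params)` should GET /api/v1/tables with the given params and
    return the decoded JSON body ({"data": [...], "paging": {...}}).
    """
    tables, after = [], None
    while True:
        params = {"limit": page_size}
        if after:
            params["after"] = after
        body = fetch_page(params)
        tables.extend(body.get("data", []))
        after = body.get("paging", {}).get("after")
        if not after:  # no cursor means this was the last page
            return tables

# With the notebook's requests setup it could be wired up roughly as:
#   fetch_all_tables(lambda p: requests.get(f"{BASE_URL}/api/v1/tables",
#                                           headers=HEADERS, params=p).json())
```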
"print(\"🤖 AI says:\")\n",
"print(answer)"
]
},
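The context-injection step inside `ask_ai` can also be isolated as a pure function, which makes the pattern easy to unit-test without calling Groq. This is a sketch derived from the cell above, not code from the notebook:

```python
def build_prompt(table_names, question):
    """Assemble the context-injection prompt used by the ask_ai pattern."""
    return (
        "You are a helpful data catalog assistant.\n"
        f"You have access to OpenMetadata with these tables: {table_names}\n\n"
        f"User question: {question}\n\n"
        "Answer helpfully and concisely."
    )
```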
{
"cell_type": "code",
"execution_count": 16,
"id": "72b75316-6857-4018-99b9-c75f29071e4a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"============================================================\n",
" 🤖 OpenMetadata AI Assistant Demo\n",
"============================================================\n",
"\n",
"❓ Question: Which tables seem to be related to customers?\n",
"----------------------------------------\n",
"🤖 Answer: The table that seems to be related to customers is '_airbyte_raw_customers'.\n",
"\n",
"\n",
"❓ Question: Which tables look like they contain financial data?\n",
"----------------------------------------\n",
"🤖 Answer: The tables that appear to contain financial data are: \n",
"\n",
"1. 'ACCOUNTS' (multiple instances)\n",
"2. 'acct_issue_table'\n",
"3. '_airbyte_raw_order_items' \n",
"4. '_airbyte_raw_orders' \n",
"\n",
"These tables have names that suggest they may contain information related to financial transactions, accounts, or orders.\n",
"\n",
"\n",
"❓ Question: What would you recommend to improve the data catalog?\n",
"----------------------------------------\n",
"🤖 Answer: To improve the data catalog, I recommend:\n",
"\n",
"1. **Data deduplication**: Remove duplicate 'ACCOUNTS' tables to avoid confusion.\n",
"2. **Table naming conventions**: Rename tables with underscores and prefixes (e.g., '_airbyte_raw_') to more descriptive names for better understanding.\n",
"3. **Data standardization**: Standardize column names and data types across similar tables (e.g., 'orders' and 'order_items') for easier data integration.\n",
"4. **Data documentation**: Add descriptions and metadata to each table to provide context and facilitate discovery.\n",
"5. **Categorization and tagging**: Organize tables into categories (e.g., 'customers', 'orders', 'staff') and apply relevant tags for efficient searching and filtering.\n",
"\n",
"============================================================\n",
"✅ Template demo complete!\n",
"============================================================\n"
]
}
],
"source": [
"# Interactive Q&A session\n",
"questions = [\n",
" \"Which tables look incomplete or poorly documented?\",\n",
" \"What kind of organization does this data belong to?\",\n",
" \"If you were a new data analyst, which tables would you explore first?\",\n",
"]\n",
"\n",
"print(\"=\" * 60)\n",
"print(\" 🤖 OpenMetadata AI Assistant Demo\")\n",
"print(\"=\" * 60)\n",
"\n",
"for question in questions:\n",
" print(f\"\\n❓ Question: {question}\")\n",
" print(\"-\" * 40)\n",
" answer = ask_ai(question)\n",
" print(f\"🤖 Answer: {answer}\")\n",
" print()\n",
"\n",
"print(\"=\" * 60)\n",
"print(\" 🤖 OpenMetadata AI Template Demo\")\n",
"print(\" Built for OpenMetadata Hackathon 2026\")\n",
"print(\"=\" * 60)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbcaac82-2132-47e9-8721-f384270685ad",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.9"
Copilot AI Apr 18, 2026

Notebook metadata declares Python 3.13.9, but this repository explicitly supports Python 3.9–3.11 for ingestion/SDK code. Please update the notebook kernel/language metadata to a supported version (e.g., 3.11) to avoid misleading users and compatibility issues.

Suggested change
"version": "3.13.9"
"version": "3.11"

Copilot AI Apr 18, 2026

Notebook metadata lists Python 3.13.9, which is outside the ingestion module’s supported versions (e.g., ingestion/noxfile.py lists 3.10–3.12). Please update the kernel/language metadata to a supported version to reduce confusion when users run these notebooks.

Suggested change
"version": "3.13.9"
"version": "3.11.0"

}
Comment on lines +200 to +206
Copilot AI Apr 18, 2026

Notebook metadata indicates it was created with Python 3.13.9. OpenMetadata ingestion/examples are expected to run on supported Python versions (e.g., repo notebooks under examples/python-sdk/... use 3.11.x), so this kernel/version metadata is likely to mislead users and can break dependencies. Please re-save the notebook using a supported Python kernel (3.9–3.11) so language_info.version matches the supported runtime.

},
"nbformat": 4,
"nbformat_minor": 5
}