From b0fa59c861278ea8be04eb6340ee1780afd4865b Mon Sep 17 00:00:00 2001 From: Sriharsha Chintalapani Date: Tue, 31 Mar 2026 20:03:24 -0700 Subject: [PATCH 01/42] RDF, cleanup relations and remove unnecessary bindings, add distributed mode for RDF reindex --- .../native/1.13.0/mysql/schemaChanges.sql | 79 ++ .../native/1.13.0/postgres/schemaChanges.sql | 82 +++ docker/development/docker-compose-fuseki.yml | 37 + docker/run_local_docker.sh | 42 +- docker/run_local_docker_rdf.sh | 222 +----- docs/rdf-local-development.md | 22 +- .../service/OpenMetadataApplication.java | 13 + .../apps/bundles/rdf/RdfBatchProcessor.java | 211 ++++++ .../service/apps/bundles/rdf/RdfIndexApp.java | 399 +++++------ .../DistributedRdfIndexCoordinator.java | 604 ++++++++++++++++ .../DistributedRdfIndexExecutor.java | 297 ++++++++ .../RdfDistributedJobParticipant.java | 136 ++++ .../RdfDistributedJobStatsAggregator.java | 45 ++ .../bundles/rdf/distributed/RdfIndexJob.java | 70 ++ .../rdf/distributed/RdfIndexPartition.java | 35 + .../distributed/RdfPartitionCalculator.java | 101 +++ .../rdf/distributed/RdfPartitionWorker.java | 128 ++++ .../service/jdbi3/CollectionDAO.java | 675 ++++++++++++++++++ .../service/rdf/RdfRepository.java | 445 ++++++++++-- .../rdf/translator/RdfPropertyMapper.java | 127 +--- .../service/resources/rdf/RdfResource.java | 31 +- .../service/util/ODCSConverter.java | 2 +- .../resources/json/data/app/RdfIndexApp.json | 10 +- .../appMarketPlaceDefinition/RdfIndexApp.json | 10 +- .../apps/bundles/rdf/RdfIndexAppTest.java | 195 +++++ .../DistributedRdfIndexCoordinatorTest.java | 137 ++++ .../service/rdf/RdfPropertyMapperTest.java | 109 +-- .../internal/rdfIndexingAppConfig.json | 153 ++++ .../schema/entity/applications/jobStatus.json | 5 +- .../KnowledgeGraph.interface.ts | 17 + .../KnowledgeGraph/KnowledgeGraph.tsx | 74 +- .../AppDetails/AppDetails.component.tsx | 14 +- .../AppDetails/ApplicationsClassBase.test.ts | 9 + .../AppRunsHistory.component.tsx | 46 +- 
.../AppRunsHistory.interface.ts | 2 +- .../AppRunsHistory/AppRunsHistory.test.tsx | 67 ++ .../AppSchedule/AppScheduleProps.interface.ts | 2 +- .../resources/ui/src/constants/constants.ts | 1 + .../AirflowStatusProvider.test.tsx | 49 +- .../AirflowStatusProvider.tsx | 29 +- .../src/main/resources/ui/src/rest/rdfAPI.ts | 35 +- .../utils/ApplicationSchemas/RdfIndexApp.json | 142 ++++ 42 files changed, 4129 insertions(+), 780 deletions(-) mode change 100755 => 100644 docker/run_local_docker_rdf.sh create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/RdfBatchProcessor.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexCoordinator.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexExecutor.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfDistributedJobParticipant.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfDistributedJobStatsAggregator.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfIndexJob.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfIndexPartition.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfPartitionCalculator.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfPartitionWorker.java create mode 100644 openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexCoordinatorTest.java create mode 100644 openmetadata-spec/src/main/resources/json/schema/entity/applications/configuration/internal/rdfIndexingAppConfig.json 
create mode 100644 openmetadata-ui/src/main/resources/ui/src/utils/ApplicationSchemas/RdfIndexApp.json diff --git a/bootstrap/sql/migrations/native/1.13.0/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.13.0/mysql/schemaChanges.sql index 306703ef6fb2..fb5bbdaa6fa7 100644 --- a/bootstrap/sql/migrations/native/1.13.0/mysql/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.13.0/mysql/schemaChanges.sql @@ -84,3 +84,82 @@ SELECT ue.id, re.id, 'user', 'role', 10 FROM user_entity ue, role_entity re WHERE ue.name = 'mcpapplicationbot' AND re.name = 'ApplicationBotImpersonationRole'; + +-- RDF distributed indexing state tables +CREATE TABLE IF NOT EXISTS rdf_index_job ( + id VARCHAR(36) NOT NULL, + status VARCHAR(32) NOT NULL, + jobConfiguration JSON NOT NULL, + totalRecords BIGINT NOT NULL DEFAULT 0, + processedRecords BIGINT NOT NULL DEFAULT 0, + successRecords BIGINT NOT NULL DEFAULT 0, + failedRecords BIGINT NOT NULL DEFAULT 0, + stats JSON, + createdBy VARCHAR(256) NOT NULL, + createdAt BIGINT NOT NULL, + startedAt BIGINT, + completedAt BIGINT, + updatedAt BIGINT NOT NULL, + errorMessage TEXT, + PRIMARY KEY (id), + INDEX idx_rdf_index_job_status (status), + INDEX idx_rdf_index_job_created (createdAt DESC) +); + +CREATE TABLE IF NOT EXISTS rdf_index_partition ( + id VARCHAR(36) NOT NULL, + jobId VARCHAR(36) NOT NULL, + entityType VARCHAR(128) NOT NULL, + partitionIndex INT NOT NULL, + rangeStart BIGINT NOT NULL, + rangeEnd BIGINT NOT NULL, + estimatedCount BIGINT NOT NULL, + workUnits BIGINT NOT NULL, + priority INT NOT NULL DEFAULT 50, + status VARCHAR(32) NOT NULL DEFAULT 'PENDING', + processingCursor BIGINT NOT NULL DEFAULT 0, + processedCount BIGINT NOT NULL DEFAULT 0, + successCount BIGINT NOT NULL DEFAULT 0, + failedCount BIGINT NOT NULL DEFAULT 0, + assignedServer VARCHAR(255), + claimedAt BIGINT, + startedAt BIGINT, + completedAt BIGINT, + lastUpdateAt BIGINT, + lastError TEXT, + retryCount INT NOT NULL DEFAULT 0, + claimableAt BIGINT NOT 
NULL DEFAULT 0, + PRIMARY KEY (id), + UNIQUE KEY uk_rdf_partition_job_entity_idx (jobId, entityType, partitionIndex), + INDEX idx_rdf_partition_job (jobId), + INDEX idx_rdf_partition_status_priority (status, priority DESC), + INDEX idx_rdf_partition_claimable (jobId, status, claimableAt), + INDEX idx_rdf_partition_assigned_server (jobId, assignedServer), + CONSTRAINT fk_rdf_partition_job FOREIGN KEY (jobId) REFERENCES rdf_index_job(id) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS rdf_reindex_lock ( + lockKey VARCHAR(64) NOT NULL, + jobId VARCHAR(36) NOT NULL, + serverId VARCHAR(255) NOT NULL, + acquiredAt BIGINT NOT NULL, + lastHeartbeat BIGINT NOT NULL, + expiresAt BIGINT NOT NULL, + PRIMARY KEY (lockKey) +); + +CREATE TABLE IF NOT EXISTS rdf_index_server_stats ( + id VARCHAR(36) NOT NULL, + jobId VARCHAR(36) NOT NULL, + serverId VARCHAR(256) NOT NULL, + entityType VARCHAR(128) NOT NULL, + processedRecords BIGINT DEFAULT 0, + successRecords BIGINT DEFAULT 0, + failedRecords BIGINT DEFAULT 0, + partitionsCompleted INT DEFAULT 0, + partitionsFailed INT DEFAULT 0, + lastUpdatedAt BIGINT NOT NULL, + PRIMARY KEY (id), + UNIQUE INDEX idx_rdf_index_server_stats_job_server_entity (jobId, serverId, entityType), + INDEX idx_rdf_index_server_stats_job_id (jobId) +); diff --git a/bootstrap/sql/migrations/native/1.13.0/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.13.0/postgres/schemaChanges.sql index 6fd046c7924b..400bfbc95200 100644 --- a/bootstrap/sql/migrations/native/1.13.0/postgres/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.13.0/postgres/schemaChanges.sql @@ -96,3 +96,85 @@ FROM user_entity ue, role_entity re WHERE ue.name = 'mcpapplicationbot' AND re.name = 'ApplicationBotImpersonationRole' ON CONFLICT DO NOTHING; + +-- RDF distributed indexing state tables +CREATE TABLE IF NOT EXISTS rdf_index_job ( + id VARCHAR(36) NOT NULL, + status VARCHAR(32) NOT NULL, + jobConfiguration JSONB NOT NULL, + totalRecords BIGINT NOT NULL DEFAULT 
0, + processedRecords BIGINT NOT NULL DEFAULT 0, + successRecords BIGINT NOT NULL DEFAULT 0, + failedRecords BIGINT NOT NULL DEFAULT 0, + stats JSONB, + createdBy VARCHAR(256) NOT NULL, + createdAt BIGINT NOT NULL, + startedAt BIGINT, + completedAt BIGINT, + updatedAt BIGINT NOT NULL, + errorMessage TEXT, + PRIMARY KEY (id) +); + +CREATE INDEX IF NOT EXISTS idx_rdf_index_job_status ON rdf_index_job(status); +CREATE INDEX IF NOT EXISTS idx_rdf_index_job_created ON rdf_index_job(createdAt DESC); + +CREATE TABLE IF NOT EXISTS rdf_index_partition ( + id VARCHAR(36) NOT NULL, + jobId VARCHAR(36) NOT NULL, + entityType VARCHAR(128) NOT NULL, + partitionIndex INT NOT NULL, + rangeStart BIGINT NOT NULL, + rangeEnd BIGINT NOT NULL, + estimatedCount BIGINT NOT NULL, + workUnits BIGINT NOT NULL, + priority INT NOT NULL DEFAULT 50, + status VARCHAR(32) NOT NULL DEFAULT 'PENDING', + processingCursor BIGINT NOT NULL DEFAULT 0, + processedCount BIGINT NOT NULL DEFAULT 0, + successCount BIGINT NOT NULL DEFAULT 0, + failedCount BIGINT NOT NULL DEFAULT 0, + assignedServer VARCHAR(255), + claimedAt BIGINT, + startedAt BIGINT, + completedAt BIGINT, + lastUpdateAt BIGINT, + lastError TEXT, + retryCount INT NOT NULL DEFAULT 0, + claimableAt BIGINT NOT NULL DEFAULT 0, + PRIMARY KEY (id), + UNIQUE (jobId, entityType, partitionIndex), + CONSTRAINT fk_rdf_partition_job FOREIGN KEY (jobId) REFERENCES rdf_index_job(id) ON DELETE CASCADE +); + +CREATE INDEX IF NOT EXISTS idx_rdf_partition_job ON rdf_index_partition(jobId); +CREATE INDEX IF NOT EXISTS idx_rdf_partition_status_priority ON rdf_index_partition(status, priority DESC); +CREATE INDEX IF NOT EXISTS idx_rdf_partition_claimable ON rdf_index_partition(jobId, status, claimableAt); +CREATE INDEX IF NOT EXISTS idx_rdf_partition_assigned_server ON rdf_index_partition(jobId, assignedServer); + +CREATE TABLE IF NOT EXISTS rdf_reindex_lock ( + lockKey VARCHAR(64) NOT NULL, + jobId VARCHAR(36) NOT NULL, + serverId VARCHAR(255) NOT NULL, + 
acquiredAt BIGINT NOT NULL, + lastHeartbeat BIGINT NOT NULL, + expiresAt BIGINT NOT NULL, + PRIMARY KEY (lockKey) +); + +CREATE TABLE IF NOT EXISTS rdf_index_server_stats ( + id VARCHAR(36) NOT NULL, + jobId VARCHAR(36) NOT NULL, + serverId VARCHAR(256) NOT NULL, + entityType VARCHAR(128) NOT NULL, + processedRecords BIGINT DEFAULT 0, + successRecords BIGINT DEFAULT 0, + failedRecords BIGINT DEFAULT 0, + partitionsCompleted INT DEFAULT 0, + partitionsFailed INT DEFAULT 0, + lastUpdatedAt BIGINT NOT NULL, + PRIMARY KEY (id), + UNIQUE (jobId, serverId, entityType) +); + +CREATE INDEX IF NOT EXISTS idx_rdf_index_server_stats_job_id ON rdf_index_server_stats(jobId); diff --git a/docker/development/docker-compose-fuseki.yml b/docker/development/docker-compose-fuseki.yml index 14d7195a33a7..8de830822f3d 100644 --- a/docker/development/docker-compose-fuseki.yml +++ b/docker/development/docker-compose-fuseki.yml @@ -1,12 +1,44 @@ version: "3.9" services: + execute-migrate-all: + environment: + RDF_ENABLED: ${RDF_ENABLED:-true} + RDF_STORAGE_TYPE: ${RDF_STORAGE_TYPE:-FUSEKI} + RDF_ENDPOINT: ${RDF_ENDPOINT:-http://fuseki:3030/openmetadata} + RDF_REMOTE_USERNAME: ${RDF_REMOTE_USERNAME:-admin} + RDF_REMOTE_PASSWORD: ${RDF_REMOTE_PASSWORD:-admin} + RDF_BASE_URI: ${RDF_BASE_URI:-https://open-metadata.org/} + RDF_JSONLD_ENABLED: ${RDF_JSONLD_ENABLED:-true} + RDF_SPARQL_ENABLED: ${RDF_SPARQL_ENABLED:-true} + RDF_DATASET: ${RDF_DATASET:-openmetadata} + depends_on: + fuseki: + condition: service_healthy + + openmetadata-server: + environment: + RDF_ENABLED: ${RDF_ENABLED:-true} + RDF_STORAGE_TYPE: ${RDF_STORAGE_TYPE:-FUSEKI} + RDF_ENDPOINT: ${RDF_ENDPOINT:-http://fuseki:3030/openmetadata} + RDF_REMOTE_USERNAME: ${RDF_REMOTE_USERNAME:-admin} + RDF_REMOTE_PASSWORD: ${RDF_REMOTE_PASSWORD:-admin} + RDF_BASE_URI: ${RDF_BASE_URI:-https://open-metadata.org/} + RDF_JSONLD_ENABLED: ${RDF_JSONLD_ENABLED:-true} + RDF_SPARQL_ENABLED: ${RDF_SPARQL_ENABLED:-true} + RDF_DATASET: 
${RDF_DATASET:-openmetadata} + depends_on: + fuseki: + condition: service_healthy + fuseki: image: stain/jena-fuseki:5.0.0 container_name: openmetadata-fuseki hostname: fuseki ports: - "3030:3030" + networks: + - local_app_net environment: - ADMIN_PASSWORD=admin - JVM_ARGS=-Xmx4g -Xms2g @@ -19,6 +51,11 @@ services: memory: 4G reservations: memory: 2G + healthcheck: + test: "curl -s -f http://localhost:3030/\\$/ping > /dev/null || exit 1" + interval: 15s + timeout: 10s + retries: 20 # Create the database directory before starting Fuseki entrypoint: /bin/sh -c "mkdir -p /fuseki/databases/openmetadata && exec /docker-entrypoint.sh /jena-fuseki/fuseki-server --update --loc=/fuseki/databases/openmetadata /openmetadata" diff --git a/docker/run_local_docker.sh b/docker/run_local_docker.sh index cec66a41114e..94c3519e7f1b 100755 --- a/docker/run_local_docker.sh +++ b/docker/run_local_docker.sh @@ -57,6 +57,8 @@ cd ../ echo "Stopping any previous Local Docker Containers" docker compose -f docker/development/docker-compose-postgres.yml down --remove-orphans docker compose -f docker/development/docker-compose.yml down --remove-orphans +docker compose -f docker/development/docker-compose-postgres.yml -f docker/development/docker-compose-fuseki.yml down --remove-orphans +docker compose -f docker/development/docker-compose.yml -f docker/development/docker-compose-fuseki.yml down --remove-orphans if [[ $skipMaven == "false" ]]; then if [[ $mode == "no-ui" ]]; then @@ -80,6 +82,14 @@ if [[ $debugOM == "true" ]]; then export OPENMETADATA_DEBUG=true fi +export RDF_ENABLED=true +export RDF_STORAGE_TYPE=FUSEKI +export RDF_ENDPOINT="${RDF_ENDPOINT:-http://fuseki:3030/openmetadata}" +export RDF_REMOTE_USERNAME="${RDF_REMOTE_USERNAME:-admin}" +export RDF_REMOTE_PASSWORD="${RDF_REMOTE_PASSWORD:-admin}" +export RDF_BASE_URI="${RDF_BASE_URI:-https://open-metadata.org/}" +export RDF_DATASET="${RDF_DATASET:-openmetadata}" + if [[ $cleanDbVolumes == "true" ]] then if [[ -d 
"$PWD/docker/development/docker-volume/" ]] @@ -116,13 +126,16 @@ else exit 1 fi +RDF_COMPOSE_FILE="docker/development/docker-compose-fuseki.yml" +COMPOSE_ARGS=(-f "$COMPOSE_FILE" -f "$RDF_COMPOSE_FILE") + if [[ $includeIngestion == "true" ]]; then echo "Building all services including ingestion (dependency: ${INGESTION_DEPENDENCY:-all})" - docker compose -f $COMPOSE_FILE build --build-arg INGESTION_DEPENDENCY="${INGESTION_DEPENDENCY:-all}" && docker compose -f $COMPOSE_FILE up -d + docker compose "${COMPOSE_ARGS[@]}" build --build-arg INGESTION_DEPENDENCY="${INGESTION_DEPENDENCY:-all}" && docker compose "${COMPOSE_ARGS[@]}" up -d else echo "Building services without ingestion" - docker compose -f $COMPOSE_FILE build $SEARCH_SERVICE $DB_SERVICE execute-migrate-all openmetadata-server && \ - docker compose -f $COMPOSE_FILE up -d $SEARCH_SERVICE $DB_SERVICE execute-migrate-all openmetadata-server + docker compose "${COMPOSE_ARGS[@]}" build $SEARCH_SERVICE $DB_SERVICE execute-migrate-all openmetadata-server && \ + docker compose "${COMPOSE_ARGS[@]}" up -d fuseki $SEARCH_SERVICE $DB_SERVICE execute-migrate-all openmetadata-server fi RESULT=$? 
@@ -136,6 +149,11 @@ until curl -s -f "http://localhost:9200/_cat/indices/openmetadata_team_search_in sleep 5 done +until curl -s -f "http://localhost:3030/\$/ping" > /dev/null 2>&1; do + echo 'Checking if Fuseki is reachable...\n' + sleep 5 +done + if [[ $includeIngestion == "true" ]]; then # Function to get OAuth access token for Airflow API get_airflow_token() { @@ -288,6 +306,22 @@ curl --location --request POST 'http://localhost:8585/api/v1/apps/trigger/Search --header 'Authorization: Bearer eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg' sleep 60 # Sleep for 60 seconds to make sure the elasticsearch reindexing from UI finishes + +echo "✔running RDF reindexing" +curl --location --request POST 'http://localhost:8585/api/v1/apps/trigger/RdfIndexApp' \ +--header 'Authorization: Bearer eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg' \ +--header 'Content-Type: application/json' \ +--data-raw '{ + "entities": ["all"], + 
"recreateIndex": true, + "batchSize": 100, + "useDistributedIndexing": true, + "partitionSize": 10000 +}' + +sleep 30 tput setaf 2 echo "✔ OpenMetadata is up and running" - +echo "✔ RDF/Knowledge Graph support is enabled" +echo " - Fuseki UI: http://localhost:3030" +echo " - SPARQL endpoint: http://localhost:3030/openmetadata/sparql" diff --git a/docker/run_local_docker_rdf.sh b/docker/run_local_docker_rdf.sh old mode 100755 new mode 100644 index 9f7aef4e14d8..0a11edf11c45 --- a/docker/run_local_docker_rdf.sh +++ b/docker/run_local_docker_rdf.sh @@ -10,216 +10,24 @@ # See the License for the specific language governing permissions and # limitations under the License. -cd "$(dirname "${BASH_SOURCE[0]}")" || exit - -helpFunction() -{ - echo "" - echo "Usage: $0 -m mode -d database" - echo "\t-m Running mode: [ui, no-ui]. Default [ui]" - echo "\t-d Database: [mysql, postgresql]. Default [mysql]" - echo "\t-s Skip maven build: [true, false]. Default [false]" - echo "\t-x Open JVM debug port on 5005: [true, false]. Default [false]" - echo "\t-h For usage help" - echo "\t-r For Cleaning DB Volumes. [true, false]. Default [true]" - echo "\t-f Start Fuseki for RDF support: [true, false]. Default [true]" - exit 1 # Exit script after printing help -} - -while getopts "m:d:s:x:r:f:h" opt -do - case "$opt" in - m ) mode="$OPTARG" ;; - d ) database="$OPTARG" ;; - s ) skipMaven="$OPTARG" ;; - x ) debugOM="$OPTARG" ;; - r ) cleanDbVolumes="$OPTARG" ;; - f ) startFuseki="$OPTARG" ;; - h ) helpFunction ;; - ? 
) helpFunction ;; - esac -done - -mode="${mode:=ui}" -database="${database:=mysql}" -skipMaven="${skipMaven:=false}" -debugOM="${debugOM:=false}" -authorizationToken="eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg" -cleanDbVolumes="${cleanDbVolumes:=true}" -startFuseki="${startFuseki:=true}" - -echo "Running local docker using mode [$mode] database [$database] and skipping maven build [$skipMaven] with cleanDB as [$cleanDbVolumes] and Fuseki [$startFuseki]" - -cd ../ - -echo "Stopping any previous Local Docker Containers" -docker compose -f docker/development/docker-compose-postgres.yml down --remove-orphans -docker compose -f docker/development/docker-compose.yml down --remove-orphans -docker compose -f docker/development/docker-compose-fuseki.yml down --remove-orphans - -if [[ $skipMaven == "false" ]]; then - if [[ $mode == "no-ui" ]]; then - echo "Maven Build - Skipping Tests and UI" - mvn -DskipTests -DonlyBackend clean package -pl !openmetadata-ui - else - echo "Maven Build - Skipping Tests" - mvn -DskipTests clean package - fi -else - echo "Skipping Maven Build" -fi - -RESULT=$? -if [ $RESULT -ne 0 ]; then - echo "Failed to run Maven build!" 
- exit 1 -fi - -if [[ $debugOM == "true" ]]; then - export OPENMETADATA_DEBUG=true -fi - -if [[ $cleanDbVolumes == "true" ]] -then - if [[ -d "$PWD/docker/development/docker-volume/" ]] - then - rm -rf $PWD/docker/development/docker-volume - fi -fi - -if [[ $VIRTUAL_ENV == "" ]]; -then - echo "Please Use Virtual Environment and make sure to generate Pydantic Models"; -else - echo "Generating Pydantic Models"; - make install_dev generate -fi +set -euo pipefail -# Start Fuseki if requested -if [[ $startFuseki == "true" ]]; then - echo "Starting Apache Jena Fuseki for RDF support" - docker compose -f docker/development/docker-compose-fuseki.yml up -d - - # Wait for Fuseki to be ready - until curl -s -f "http://localhost:3030/$/ping" > /dev/null 2>&1; do - echo 'Waiting for Fuseki to start...' - sleep 5 - done - echo "✔ Fuseki is ready" - - # Set RDF environment variables - export RDF_ENABLED=true - export RDF_STORAGE_TYPE=FUSEKI - export RDF_BASE_URI="https://open-metadata.org/" - export RDF_ENDPOINT="http://localhost:3030/openmetadata" - export RDF_REMOTE_USERNAME="admin" - export RDF_REMOTE_PASSWORD="admin" - export RDF_DATASET="openmetadata" -fi - -echo "Starting Local Docker Containers" -echo "Using ingestion dependency: ${INGESTION_DEPENDENCY:-all}" - -if [[ $database == "postgresql" ]]; then - docker compose -f docker/development/docker-compose-postgres.yml build --build-arg INGESTION_DEPENDENCY="${INGESTION_DEPENDENCY:-all}" && docker compose -f docker/development/docker-compose-postgres.yml up -d -elif [[ $database == "mysql" ]]; then - docker compose -f docker/development/docker-compose.yml build --build-arg INGESTION_DEPENDENCY="${INGESTION_DEPENDENCY:-all}" && docker compose -f docker/development/docker-compose.yml up -d -else - echo "Invalid database type: $database" - exit 1 -fi +cd "$(dirname "${BASH_SOURCE[0]}")" || exit -RESULT=$? -if [ $RESULT -ne 0 ]; then - echo "Failed to start Docker instances!" 
- exit 1 -fi +filtered_args=() +skip_next=false +for arg in "$@"; do + if [[ "$skip_next" == "true" ]]; then + skip_next=false + continue + fi -until curl -s -f "http://localhost:9200/_cat/indices/openmetadata_team_search_index"; do - echo 'Checking if Elastic Search instance is up...' - sleep 5 -done + if [[ "$arg" == "-f" ]]; then + skip_next=true + continue + fi -until curl -s -f --header 'Authorization: Basic YWRtaW46YWRtaW4=' "http://localhost:8080/api/v1/dags/sample_data"; do - echo 'Checking if Sample Data DAG is reachable...' - sleep 5 + filtered_args+=("$arg") done -until curl -s -f --header "Authorization: Bearer $authorizationToken" "http://localhost:8585/api/v1/tables"; do - echo 'Checking if OM Server is reachable...' - sleep 5 -done - -curl --location --request PATCH 'localhost:8080/api/v1/dags/sample_data' \ - --header 'Authorization: Basic YWRtaW46YWRtaW4=' \ - --header 'Content-Type: application/json' \ - --data-raw '{ - "is_paused": false - }' - -curl --location --request PATCH 'localhost:8080/api/v1/dags/extended_sample_data' \ - --header 'Authorization: Basic YWRtaW46YWRtaW4=' \ - --header 'Content-Type: application/json' \ - --data-raw '{ - "is_paused": false - }' - -echo 'Validate sample data DAG...' 
-sleep 5 -# This validates the sample data DAG flow -make install -python docker/validate_compose.py - -sleep 5 -curl --location --request PATCH 'localhost:8080/api/v1/dags/sample_usage' \ - --header 'Authorization: Basic YWRtaW46YWRtaW4=' \ - --header 'Content-Type: application/json' \ - --data-raw '{ - "is_paused": false - }' -sleep 5 -curl --location --request PATCH 'localhost:8080/api/v1/dags/index_metadata' \ - --header 'Authorization: Basic YWRtaW46YWRtaW4=' \ - --header 'Content-Type: application/json' \ - --data-raw '{ - "is_paused": false - }' -sleep 2 -curl --location --request PATCH 'localhost:8080/api/v1/dags/sample_lineage' \ - --header 'Authorization: Basic YWRtaW46YWRtaW4=' \ - --header 'Content-Type: application/json' \ - --data-raw '{ - "is_paused": false - }' - -echo "✔running reindexing" -# Trigger ElasticSearch ReIndexing from UI -curl --location --request POST 'http://localhost:8585/api/v1/apps/trigger/SearchIndexingApplication' \ ---header 'Authorization: Bearer eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg' - -sleep 60 # Sleep for 60 seconds to make sure the elasticsearch reindexing from UI finishes - -# If RDF is enabled, trigger RDF indexing -if [[ $startFuseki == "true" ]]; then - echo "✔running RDF reindexing" - # Trigger RDF ReIndexing from UI - curl --location --request POST 'http://localhost:8585/api/v1/apps/trigger/RdfIndexApp' \ - --header 'Authorization: Bearer 
eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg' \ - --header 'Content-Type: application/json' \ - --data-raw '{ - "entities": ["all"], - "recreateIndex": true, - "batchSize": 100 - }' - - sleep 30 # Wait for RDF indexing to complete -fi - -tput setaf 2 -echo "✔ OpenMetadata is up and running" -if [[ $startFuseki == "true" ]]; then - echo "✔ RDF/Knowledge Graph support is enabled" - echo " - Fuseki UI: http://localhost:3030" - echo " - SPARQL endpoint: http://localhost:3030/openmetadata/sparql" -fi -echo "" \ No newline at end of file +exec ./run_local_docker.sh "${filtered_args[@]}" diff --git a/docs/rdf-local-development.md b/docs/rdf-local-development.md index 44b166eaea2f..0bb5b0782400 100644 --- a/docs/rdf-local-development.md +++ b/docs/rdf-local-development.md @@ -1,6 +1,6 @@ # RDF/Apache Jena Local Development Guide -This guide documents how to set up RDF/Knowledge Graph support for local development with OpenMetadata running in IntelliJ IDEA and Apache Jena Fuseki running in Docker. +This guide documents how to set up RDF/Knowledge Graph support for local development with OpenMetadata and Apache Jena Fuseki. 
## Overview @@ -28,21 +28,29 @@ OpenMetadata supports RDF (Resource Description Framework) for knowledge graph c ## Quick Start -### Step 1: Start Apache Jena Fuseki +### Step 1: Start the Default Local Docker Stack -Start the Fuseki triple store using Docker Compose: +The default local Docker flow now starts Fuseki alongside OpenMetadata: ```bash cd /path/to/OpenMetadata -docker compose -f docker/development/docker-compose-fuseki.yml up -d +./docker/run_local_docker.sh -d mysql ``` -This starts Fuseki with: +For PostgreSQL-based development: + +```bash +./docker/run_local_docker.sh -d postgresql +``` + +This starts OpenMetadata, the backing database, search, ingestion services, and Fuseki with: - **Port**: 3030 - **Admin Password**: admin - **Dataset**: openmetadata - **Memory**: 2-4GB allocated +`./docker/run_local_docker_rdf.sh` remains available as a compatibility wrapper, but it now delegates to `./docker/run_local_docker.sh`. + ### Step 2: Verify Fuseki is Running ```bash @@ -59,7 +67,9 @@ The Fuseki web UI is available at `http://localhost:3030` with credentials: ### Step 3: Configure IntelliJ Run Configuration -Create or modify your IntelliJ run configuration for `OpenMetadataApplication` with these environment variables: +If you are using `run_local_docker.sh`, the Docker services already receive the RDF environment variables automatically. 
+ +Create or modify your IntelliJ run configuration for `OpenMetadataApplication` with these environment variables only when you want to run the OpenMetadata server directly from IntelliJ while keeping Fuseki in Docker: ``` RDF_ENABLED=true diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/OpenMetadataApplication.java b/openmetadata-service/src/main/java/org/openmetadata/service/OpenMetadataApplication.java index 232ce1f536fb..74aa860dc387 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/OpenMetadataApplication.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/OpenMetadataApplication.java @@ -83,6 +83,7 @@ import org.openmetadata.service.apps.ApplicationContext; import org.openmetadata.service.apps.ApplicationHandler; import org.openmetadata.service.apps.McpServerProvider; +import org.openmetadata.service.apps.bundles.rdf.distributed.RdfDistributedJobParticipant; import org.openmetadata.service.apps.bundles.searchIndex.distributed.DistributedJobParticipant; import org.openmetadata.service.apps.bundles.searchIndex.distributed.ServerIdentityResolver; import org.openmetadata.service.apps.scheduler.AppScheduler; @@ -377,6 +378,7 @@ public void run(OpenMetadataApplicationConfig catalogConfig, Environment environ // Register Distributed Job Participant for distributed search indexing registerDistributedJobParticipant(environment, jdbi, catalogConfig.getCacheConfig()); + registerDistributedRdfJobParticipant(environment, jdbi); // Register Event publishers registerEventPublisher(catalogConfig); @@ -1125,6 +1127,17 @@ protected void registerDistributedJobParticipant( } } + protected void registerDistributedRdfJobParticipant(Environment environment, Jdbi jdbi) { + try { + CollectionDAO collectionDAO = jdbi.onDemand(CollectionDAO.class); + RdfDistributedJobParticipant participant = new RdfDistributedJobParticipant(collectionDAO); + environment.lifecycle().manage(participant); + LOG.info("Registered 
RdfDistributedJobParticipant for distributed RDF indexing"); + } catch (Exception e) { + LOG.warn("Failed to register RdfDistributedJobParticipant: {}", e.getMessage()); + } + } + public static void main(String[] args) throws Exception { OpenMetadataApplication openMetadataApplication = new OpenMetadataApplication(); openMetadataApplication.run(args); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/RdfBatchProcessor.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/RdfBatchProcessor.java new file mode 100644 index 000000000000..ccb406d91ed1 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/RdfBatchProcessor.java @@ -0,0 +1,211 @@ +package org.openmetadata.service.apps.bundles.rdf; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.UUID; +import java.util.function.BooleanSupplier; +import java.util.stream.Collectors; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.EntityInterface; +import org.openmetadata.schema.entity.data.GlossaryTerm; +import org.openmetadata.schema.type.LineageDetails; +import org.openmetadata.schema.type.Relationship; +import org.openmetadata.schema.type.TermRelation; +import org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.jdbi3.CollectionDAO; +import org.openmetadata.service.jdbi3.CollectionDAO.EntityRelationshipObject; +import org.openmetadata.service.rdf.RdfRepository; + +@Slf4j +public class RdfBatchProcessor { + public static final List ALL_RELATIONSHIPS = + java.util.Arrays.stream(Relationship.values()).map(Relationship::ordinal).toList(); + + public static final Set EXCLUDED_RELATIONSHIP_ENTITY_TYPES = + Set.of( + "changeEvent", + "auditLog", + "webAnalyticEvent", + "entityUsage", + "eventSubscription", + "vote", + "THREAD"); + + public static final Set EXCLUDED_RELATIONSHIP_TYPES = + Set.of(Relationship.VOTED.ordinal(), 
Relationship.FOLLOWS.ordinal()); + + private final CollectionDAO collectionDAO; + private final RdfRepository rdfRepository; + + public RdfBatchProcessor(CollectionDAO collectionDAO, RdfRepository rdfRepository) { + this.collectionDAO = collectionDAO; + this.rdfRepository = rdfRepository; + } + + public BatchProcessingResult processEntities( + String entityType, List entities, BooleanSupplier stopRequested) { + if (entities == null || entities.isEmpty()) { + return new BatchProcessingResult(0, 0); + } + + BooleanSupplier effectiveStopRequested = stopRequested != null ? stopRequested : () -> false; + int successCount = 0; + int failedCount = 0; + + for (EntityInterface entity : entities) { + if (effectiveStopRequested.getAsBoolean()) { + break; + } + try { + rdfRepository.createOrUpdate(entity); + successCount++; + } catch (Exception e) { + LOG.error("Failed to index entity {} to RDF", entity.getId(), e); + failedCount++; + } + } + + processBatchRelationships(entityType, entities); + if ("glossaryTerm".equals(entityType)) { + processGlossaryTermRelations(entities, effectiveStopRequested); + } + + return new BatchProcessingResult(successCount, failedCount); + } + + public void processBatchRelationships( + String entityType, List entities) { + if (entities == null || entities.isEmpty()) { + return; + } + + try { + List entityIds = + entities.stream().map(entity -> entity.getId().toString()).collect(Collectors.toList()); + + List outgoingRelationships = + collectionDAO + .relationshipDAO() + .findToBatchWithRelations(entityIds, entityType, ALL_RELATIONSHIPS); + + List incomingLineage = + collectionDAO + .relationshipDAO() + .findFromBatch( + entityIds, + Relationship.UPSTREAM.ordinal(), + org.openmetadata.schema.type.Include.ALL); + + List allRelationships = new ArrayList<>(); + + for (EntityRelationshipObject rel : outgoingRelationships) { + if (shouldSkipRelationship(rel)) { + continue; + } + + if (rel.getRelation() == Relationship.UPSTREAM.ordinal() && rel.getJson() 
!= null) { + processLineageRelationship(rel); + } else { + if ("glossaryTerm".equals(entityType) + && rel.getRelation() == Relationship.RELATED_TO.ordinal() + && "glossaryTerm".equals(rel.getToEntity())) { + continue; + } + allRelationships.add(convertToEntityRelationship(rel)); + } + } + + for (EntityRelationshipObject rel : incomingLineage) { + if (shouldSkipRelationship(rel)) { + continue; + } + + if (rel.getJson() != null) { + processLineageRelationship(rel); + } else { + allRelationships.add(convertToEntityRelationship(rel)); + } + } + + if (!allRelationships.isEmpty()) { + rdfRepository.bulkAddRelationships(allRelationships); + } + } catch (Exception e) { + LOG.error("Failed to process batch relationships for entity type {}", entityType, e); + } + } + + public org.openmetadata.schema.type.EntityRelationship convertToEntityRelationship( + EntityRelationshipObject rel) { + return new org.openmetadata.schema.type.EntityRelationship() + .withFromEntity(rel.getFromEntity()) + .withFromId(UUID.fromString(rel.getFromId())) + .withToEntity(rel.getToEntity()) + .withToId(UUID.fromString(rel.getToId())) + .withRelation(rel.getRelation()) + .withRelationshipType(Relationship.values()[rel.getRelation()]); + } + + private boolean shouldSkipRelationship(EntityRelationshipObject rel) { + return EXCLUDED_RELATIONSHIP_ENTITY_TYPES.contains(rel.getToEntity()) + || EXCLUDED_RELATIONSHIP_ENTITY_TYPES.contains(rel.getFromEntity()) + || EXCLUDED_RELATIONSHIP_TYPES.contains(rel.getRelation()); + } + + void processLineageRelationship(EntityRelationshipObject rel) { + try { + UUID fromId = UUID.fromString(rel.getFromId()); + UUID toId = UUID.fromString(rel.getToId()); + LineageDetails lineageDetails = JsonUtils.readValue(rel.getJson(), LineageDetails.class); + rdfRepository.addLineageWithDetails( + rel.getFromEntity(), fromId, rel.getToEntity(), toId, lineageDetails); + } catch (Exception e) { + LOG.debug("Failed to parse lineage details, falling back to basic relationship", e); + 
try { + rdfRepository.addRelationship(convertToEntityRelationship(rel)); + } catch (Exception ex) { + LOG.debug("Failed to add basic lineage relationship", ex); + } + } + } + + void processGlossaryTermRelations( + List entities, BooleanSupplier stopRequested) { + List relations = new ArrayList<>(); + + for (EntityInterface entity : entities) { + if (stopRequested.getAsBoolean()) { + break; + } + + if (!(entity instanceof GlossaryTerm glossaryTerm)) { + continue; + } + + List relatedTerms = glossaryTerm.getRelatedTerms(); + if (relatedTerms == null || relatedTerms.isEmpty()) { + continue; + } + + UUID fromTermId = glossaryTerm.getId(); + for (TermRelation termRelation : relatedTerms) { + if (termRelation.getTerm() == null || termRelation.getTerm().getId() == null) { + continue; + } + + String relationType = + termRelation.getRelationType() != null ? termRelation.getRelationType() : "relatedTo"; + relations.add( + new RdfRepository.GlossaryTermRelationData( + fromTermId, termRelation.getTerm().getId(), relationType)); + } + } + + if (!relations.isEmpty()) { + rdfRepository.bulkAddGlossaryTermRelations(relations); + } + } + + public record BatchProcessingResult(int successCount, int failedCount) {} +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/RdfIndexApp.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/RdfIndexApp.java index 139261709d53..e5a966df79ad 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/RdfIndexApp.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/RdfIndexApp.java @@ -10,19 +10,22 @@ import jakarta.ws.rs.core.Response; import java.util.ArrayList; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.UUID; import java.util.concurrent.BlockingQueue; +import java.util.concurrent.CancellationException; import 
java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.Future; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; -import java.util.stream.Collectors; import lombok.Getter; import lombok.extern.slf4j.Slf4j; import org.openmetadata.schema.EntityInterface; @@ -30,20 +33,19 @@ import org.openmetadata.schema.entity.app.AppRunRecord; import org.openmetadata.schema.entity.app.FailureContext; import org.openmetadata.schema.entity.app.SuccessContext; -import org.openmetadata.schema.entity.data.GlossaryTerm; import org.openmetadata.schema.system.EntityStats; import org.openmetadata.schema.system.EventPublisherJob; import org.openmetadata.schema.system.IndexingError; import org.openmetadata.schema.system.Stats; import org.openmetadata.schema.system.StepStats; import org.openmetadata.schema.type.Include; -import org.openmetadata.schema.type.LineageDetails; -import org.openmetadata.schema.type.Relationship; -import org.openmetadata.schema.type.TermRelation; import org.openmetadata.schema.utils.JsonUtils; import org.openmetadata.schema.utils.ResultList; import org.openmetadata.service.Entity; import org.openmetadata.service.apps.AbstractNativeApplication; +import org.openmetadata.service.apps.bundles.rdf.distributed.DistributedRdfIndexExecutor; +import org.openmetadata.service.apps.bundles.rdf.distributed.RdfDistributedJobStatsAggregator; +import org.openmetadata.service.apps.bundles.rdf.distributed.RdfIndexJob; import org.openmetadata.service.exception.AppException; import org.openmetadata.service.jdbi3.CollectionDAO; import org.openmetadata.service.jdbi3.CollectionDAO.EntityRelationshipObject; @@ -66,29 +68,14 @@ public class RdfIndexApp extends AbstractNativeApplication { private static final int 
MAX_CONSUMER_THREADS = 5; private static final long WEBSOCKET_UPDATE_INTERVAL_MS = 2000; - private static final List ALL_RELATIONSHIPS = - java.util.Arrays.stream(Relationship.values()) - .map(Relationship::ordinal) - .collect(Collectors.toList()); - - // Entity types that should be excluded from RDF relationships as they don't provide - // meaningful semantic value (operational/audit entities) + private static final List ALL_RELATIONSHIPS = RdfBatchProcessor.ALL_RELATIONSHIPS; private static final Set EXCLUDED_RELATIONSHIP_ENTITY_TYPES = - Set.of( - "changeEvent", - "auditLog", - "webAnalyticEvent", - "entityUsage", - "eventSubscription", - "vote", - "THREAD"); - - // Relationship types that should be excluded from RDF as they don't provide - // meaningful semantic relationships (user interactions, not data relationships) + RdfBatchProcessor.EXCLUDED_RELATIONSHIP_ENTITY_TYPES; private static final Set EXCLUDED_RELATIONSHIP_TYPES = - Set.of(Relationship.VOTED.ordinal(), Relationship.FOLLOWS.ordinal()); + RdfBatchProcessor.EXCLUDED_RELATIONSHIP_TYPES; private final RdfRepository rdfRepository; + private final RdfBatchProcessor batchProcessor; private volatile boolean stopped = false; private volatile long lastWebSocketUpdate = 0; @@ -100,6 +87,7 @@ public class RdfIndexApp extends AbstractNativeApplication { private final AtomicReference rdfIndexStats = new AtomicReference<>(); private final AtomicBoolean producersDone = new AtomicBoolean(false); private BlockingQueue taskQueue; + private volatile DistributedRdfIndexExecutor distributedExecutor; record IndexingTask( String entityType, List entities, int offset, int retryCount) { @@ -115,6 +103,7 @@ boolean isPoisonPill() { public RdfIndexApp(CollectionDAO collectionDAO, SearchRepository searchRepository) { super(collectionDAO, searchRepository); this.rdfRepository = RdfRepository.getInstance(); + this.batchProcessor = new RdfBatchProcessor(collectionDAO, rdfRepository); } @Override @@ -160,9 +149,10 @@ public void 
execute(JobExecutionContext jobExecutionContext) { } try { - boolean containsAll = jobData.getEntities().contains(ALL); - if (containsAll) { - jobData.setEntities(getAll()); + jobData.setEntities(resolveEntityTypes(jobData.getEntities())); + if (jobData.getEntities().isEmpty()) { + throw new IllegalStateException( + "No repository-backed entity types configured for RDF indexing"); } LOG.info( @@ -178,7 +168,11 @@ public void execute(JobExecutionContext jobExecutionContext) { } updateJobStatus(EventPublisherJob.Status.RUNNING); - reIndexFromStartToEnd(); + if (Boolean.TRUE.equals(jobData.getUseDistributedIndexing())) { + reIndexDistributed(); + } else { + reIndexFromStartToEnd(); + } if (stopped) { updateJobStatus(EventPublisherJob.Status.STOPPED); @@ -208,6 +202,11 @@ private void initializeJob(JobExecutionContext jobExecutionContext) { rdfIndexStats.set(initializeTotalRecords(jobData.getEntities())); jobData.setStats(rdfIndexStats.get()); + if (Boolean.TRUE.equals(jobData.getUseDistributedIndexing())) { + sendUpdates(jobExecutionContext, true); + return; + } + int queueSize = jobData.getQueueSize() != null ? jobData.getQueueSize() : DEFAULT_QUEUE_SIZE; int effectiveQueueSize = calculateMemoryAwareQueueSize(queueSize); taskQueue = new LinkedBlockingQueue<>(effectiveQueueSize); @@ -236,6 +235,35 @@ private void clearRdfData() { } } + private void reIndexDistributed() throws InterruptedException { + int partitionSize = jobData.getPartitionSize() != null ? jobData.getPartitionSize() : 10000; + String createdBy = + getApp() != null && getApp().getName() != null ? 
getApp().getName() : "system"; + + distributedExecutor = new DistributedRdfIndexExecutor(collectionDAO, partitionSize); + distributedExecutor.performStartupRecovery(); + + RdfIndexJob distributedJob = + distributedExecutor.createJob(jobData.getEntities(), jobData, createdBy); + + ExecutorService distributedExecutionExecutor = + Executors.newSingleThreadExecutor( + Thread.ofVirtual().name("rdf-distributed-execution-", 0).factory()); + Future distributedExecution = + distributedExecutionExecutor.submit( + () -> { + distributedExecutor.execute(jobData); + return null; + }); + + try { + monitorDistributedJob(distributedJob.getId(), distributedExecution); + awaitDistributedExecution(distributedExecution); + } finally { + distributedExecutionExecutor.shutdownNow(); + } + } + private void reIndexFromStartToEnd() throws InterruptedException { long totalEntities = rdfIndexStats.get().getJobStats().getTotalRecords(); int numProducers = Math.clamp((int) (totalEntities / 5000), 2, MAX_PRODUCER_THREADS); @@ -266,10 +294,6 @@ private void reIndexFromStartToEnd() throws InterruptedException { } try { - // Clear entire RDF store before re-indexing to remove stale data - LOG.info("Clearing RDF store before re-indexing"); - rdfRepository.clearAll(); - processEntityTypes(); signalConsumersToStop(numConsumers); consumerLatch.await(); @@ -282,6 +306,73 @@ private void reIndexFromStartToEnd() throws InterruptedException { } } + private void monitorDistributedJob(UUID jobId, Future distributedExecution) + throws InterruptedException { + RdfDistributedJobStatsAggregator statsAggregator = new RdfDistributedJobStatsAggregator(); + + while (!stopped) { + RdfIndexJob latestJob = + distributedExecutor != null ? 
distributedExecutor.getJobWithFreshStats() : null; + if (latestJob != null) { + Stats aggregatedStats = statsAggregator.toStats(latestJob); + rdfIndexStats.set(aggregatedStats); + jobData.setStats(aggregatedStats); + sendUpdates(jobExecutionContext, false); + + if (latestJob.isTerminal()) { + if (latestJob.getStatus() + == org.openmetadata + .service + .apps + .bundles + .searchIndex + .distributed + .IndexJobStatus + .STOPPED) { + stopped = true; + } else if (latestJob.getStatus() + == org.openmetadata + .service + .apps + .bundles + .searchIndex + .distributed + .IndexJobStatus + .FAILED) { + jobData.setFailure( + new IndexingError() + .withErrorSource(IndexingError.ErrorSource.JOB) + .withMessage(latestJob.getErrorMessage())); + } + return; + } + } + + if (distributedExecution.isDone()) { + return; + } + + TimeUnit.SECONDS.sleep(2); + } + } + + private void awaitDistributedExecution(Future distributedExecution) + throws InterruptedException { + try { + distributedExecution.get(); + } catch (CancellationException e) { + if (!stopped) { + throw new RuntimeException("Distributed RDF execution was cancelled unexpectedly", e); + } + } catch (ExecutionException e) { + Throwable cause = e.getCause(); + if (cause instanceof RuntimeException runtimeException) { + throw runtimeException; + } + throw new RuntimeException("Distributed RDF execution failed", cause); + } + } + private void runConsumer(int consumerId, CountDownLatch consumerLatch) { LOG.info("Consumer {} started", consumerId); try { @@ -310,220 +401,40 @@ private void processTask(IndexingTask task) { return; } - int successCount = 0; - int failedCount = 0; - try { - for (EntityInterface entity : entities) { - if (stopped) { - break; - } - try { - rdfRepository.createOrUpdate(entity); - successCount++; - } catch (Exception e) { - LOG.error("Failed to index entity {} to RDF", entity.getId(), e); - failedCount++; - } - } - - processBatchRelationships(entityType, entities); - - // Process glossary term relations 
if this is a glossaryTerm batch - if ("glossaryTerm".equals(entityType)) { - processGlossaryTermRelations(entities); - } + RdfBatchProcessor.BatchProcessingResult result = + batchProcessor.processEntities(entityType, entities, () -> stopped); StepStats currentStats = - new StepStats().withSuccessRecords(successCount).withFailedRecords(failedCount); + new StepStats() + .withSuccessRecords(result.successCount()) + .withFailedRecords(result.failedCount()); updateEntityStats(entityType, currentStats); sendUpdates(jobExecutionContext, false); } catch (Exception e) { LOG.error("Error processing batch for entity type {}", entityType, e); updateEntityStats( - entityType, - new StepStats() - .withSuccessRecords(successCount) - .withFailedRecords(entities.size() - successCount)); + entityType, new StepStats().withSuccessRecords(0).withFailedRecords(entities.size())); } } private void processBatchRelationships( String entityType, List entities) { - if (entities.isEmpty()) { - return; - } - - List entityIds = - entities.stream().map(e -> e.getId().toString()).collect(Collectors.toList()); - - try { - List outgoingRelationships = - collectionDAO - .relationshipDAO() - .findToBatchWithRelations(entityIds, entityType, ALL_RELATIONSHIPS); - - List incomingLineage = - collectionDAO - .relationshipDAO() - .findFromBatch(entityIds, Relationship.UPSTREAM.ordinal(), Include.ALL); - - List allRelationships = new ArrayList<>(); - - for (EntityRelationshipObject rel : outgoingRelationships) { - // Skip relationships to/from excluded entity types (changeEvent, auditLog, vote, etc.) 
- // These don't provide meaningful semantic value in the knowledge graph - if (EXCLUDED_RELATIONSHIP_ENTITY_TYPES.contains(rel.getToEntity()) - || EXCLUDED_RELATIONSHIP_ENTITY_TYPES.contains(rel.getFromEntity())) { - LOG.debug( - "Skipping relationship {} -> {} (excluded entity type: {} or {})", - rel.getFromId(), - rel.getToId(), - rel.getFromEntity(), - rel.getToEntity()); - continue; - } - - // Skip excluded relationship types (VOTED, FOLLOWS, etc.) - if (EXCLUDED_RELATIONSHIP_TYPES.contains(rel.getRelation())) { - LOG.debug( - "Skipping relationship {} -> {} (excluded relationship type: {})", - rel.getFromId(), - rel.getToId(), - rel.getRelation()); - continue; - } - - if (rel.getRelation() == Relationship.UPSTREAM.ordinal() && rel.getJson() != null) { - processLineageRelationship(rel); - } else { - // Skip glossary term RELATED_TO relationships - they're handled separately - // by processGlossaryTermRelations() with typed predicates - if ("glossaryTerm".equals(entityType) - && rel.getRelation() == Relationship.RELATED_TO.ordinal() - && "glossaryTerm".equals(rel.getToEntity())) { - LOG.debug( - "Skipping glossary term relation {} -> {} (handled by processGlossaryTermRelations)", - rel.getFromId(), - rel.getToId()); - continue; - } - allRelationships.add(convertToEntityRelationship(rel)); - } - } - - for (EntityRelationshipObject rel : incomingLineage) { - // Skip relationships to/from excluded entity types - if (EXCLUDED_RELATIONSHIP_ENTITY_TYPES.contains(rel.getToEntity()) - || EXCLUDED_RELATIONSHIP_ENTITY_TYPES.contains(rel.getFromEntity())) { - continue; - } - - // Skip excluded relationship types - if (EXCLUDED_RELATIONSHIP_TYPES.contains(rel.getRelation())) { - continue; - } - - if (rel.getJson() != null) { - processLineageRelationship(rel); - } else { - allRelationships.add(convertToEntityRelationship(rel)); - } - } - - if (!allRelationships.isEmpty()) { - rdfRepository.bulkAddRelationships(allRelationships); - LOG.debug( - "Bulk added {} relationships 
for {} entities", - allRelationships.size(), - entities.size()); - } - - } catch (Exception e) { - LOG.error("Failed to process batch relationships for entity type {}", entityType, e); - } + batchProcessor.processBatchRelationships(entityType, entities); } private void processLineageRelationship(EntityRelationshipObject rel) { - try { - UUID fromId = UUID.fromString(rel.getFromId()); - UUID toId = UUID.fromString(rel.getToId()); - LineageDetails lineageDetails = JsonUtils.readValue(rel.getJson(), LineageDetails.class); - rdfRepository.addLineageWithDetails( - rel.getFromEntity(), fromId, rel.getToEntity(), toId, lineageDetails); - LOG.debug( - "Added lineage with details from {}/{} to {}/{}", - rel.getFromEntity(), - fromId, - rel.getToEntity(), - toId); - } catch (Exception e) { - LOG.debug("Failed to parse lineage details, falling back to basic relationship", e); - try { - rdfRepository.addRelationship(convertToEntityRelationship(rel)); - } catch (Exception ex) { - LOG.debug("Failed to add basic lineage relationship", ex); - } - } + batchProcessor.processLineageRelationship(rel); } private void processGlossaryTermRelations(List entities) { - List relations = new ArrayList<>(); - - for (EntityInterface entity : entities) { - if (stopped) { - break; - } - - if (entity instanceof GlossaryTerm glossaryTerm) { - List relatedTerms = glossaryTerm.getRelatedTerms(); - if (relatedTerms != null && !relatedTerms.isEmpty()) { - UUID fromTermId = glossaryTerm.getId(); - LOG.info( - "Processing glossary term {} ({}) with {} relations", - glossaryTerm.getName(), - fromTermId, - relatedTerms.size()); - - for (TermRelation termRelation : relatedTerms) { - if (termRelation.getTerm() != null && termRelation.getTerm().getId() != null) { - UUID toTermId = termRelation.getTerm().getId(); - String relationType = - termRelation.getRelationType() != null - ? 
termRelation.getRelationType() - : "relatedTo"; - - LOG.info( - " Relation: {} -> {} (type: {}, raw: {})", - glossaryTerm.getName(), - termRelation.getTerm().getName(), - relationType, - termRelation.getRelationType()); - - relations.add( - new RdfRepository.GlossaryTermRelationData(fromTermId, toTermId, relationType)); - } - } - } - } - } - - if (!relations.isEmpty()) { - rdfRepository.bulkAddGlossaryTermRelations(relations); - LOG.info("Added {} glossary term relations to RDF store", relations.size()); - } + batchProcessor.processGlossaryTermRelations(entities, () -> stopped); } private org.openmetadata.schema.type.EntityRelationship convertToEntityRelationship( EntityRelationshipObject rel) { - return new org.openmetadata.schema.type.EntityRelationship() - .withFromEntity(rel.getFromEntity()) - .withFromId(UUID.fromString(rel.getFromId())) - .withToEntity(rel.getToEntity()) - .withToId(UUID.fromString(rel.getToId())) - .withRelation(rel.getRelation()) - .withRelationshipType(Relationship.values()[rel.getRelation()]); + return batchProcessor.convertToEntityRelationship(rel); } private void processEntityTypes() throws InterruptedException { @@ -755,6 +666,7 @@ public void updateRecordToDbAndNotify(JobExecutionContext jobExecutionContext) { appRecord.setSuccessContext( new SuccessContext().withAdditionalProperty("stats", jobData.getStats())); } + pushAppStatusUpdates(jobExecutionContext, appRecord, true); if (WebSocketManager.getInstance() != null) { String messageJson = JsonUtils.pojoToJson(appRecord); @@ -823,6 +735,9 @@ public void stop() { if (jobExecutor != null) { jobExecutor.shutdownNow(); } + if (distributedExecutor != null) { + distributedExecutor.stop(); + } LOG.info("RDF indexing job stopped successfully."); } @@ -837,6 +752,44 @@ protected void validateConfig(Map appConfig) { } private Set getAll() { - return new HashSet<>(Entity.getEntityList()); + return resolveEntityTypes(new HashSet<>(Entity.getEntityList())); + } + + private Set 
resolveEntityTypes(Set requestedEntities) { + Set resolvedEntities = new LinkedHashSet<>(); + if (requestedEntities == null || requestedEntities.isEmpty()) { + return resolvedEntities; + } + + if (requestedEntities.contains(ALL)) { + requestedEntities = new HashSet<>(Entity.getEntityList()); + } + + List skippedEntities = new ArrayList<>(); + for (String entityType : requestedEntities) { + if (entityType == null || entityType.isBlank() || ALL.equals(entityType)) { + continue; + } + if (isIndexableEntityType(entityType)) { + resolvedEntities.add(entityType); + } else { + skippedEntities.add(entityType); + } + } + + if (!skippedEntities.isEmpty()) { + LOG.info("Skipping RDF indexing for non repository-backed entity types: {}", skippedEntities); + } + + return resolvedEntities; + } + + private boolean isIndexableEntityType(String entityType) { + try { + Entity.getEntityRepository(entityType); + return true; + } catch (Exception e) { + return false; + } } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexCoordinator.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexCoordinator.java new file mode 100644 index 000000000000..b336f4eac7d8 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexCoordinator.java @@ -0,0 +1,604 @@ +package org.openmetadata.service.apps.bundles.rdf.distributed; + +import com.fasterxml.jackson.core.type.TypeReference; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.system.EventPublisherJob; +import org.openmetadata.schema.utils.JsonUtils; +import 
org.openmetadata.service.apps.bundles.searchIndex.distributed.IndexJobStatus; +import org.openmetadata.service.apps.bundles.searchIndex.distributed.PartitionStatus; +import org.openmetadata.service.apps.bundles.searchIndex.distributed.ServerIdentityResolver; +import org.openmetadata.service.jdbi3.CollectionDAO; +import org.openmetadata.service.jdbi3.CollectionDAO.RdfIndexJobDAO.RdfIndexJobRecord; +import org.openmetadata.service.jdbi3.CollectionDAO.RdfIndexPartitionDAO.RdfAggregatedStatsRecord; +import org.openmetadata.service.jdbi3.CollectionDAO.RdfIndexPartitionDAO.RdfEntityStatsRecord; +import org.openmetadata.service.jdbi3.CollectionDAO.RdfIndexPartitionDAO.RdfIndexPartitionRecord; +import org.openmetadata.service.jdbi3.CollectionDAO.RdfIndexPartitionDAO.RdfServerPartitionStatsRecord; + +@Slf4j +public class DistributedRdfIndexCoordinator { + private static final String REINDEX_LOCK_KEY = "RDF_REINDEX_LOCK"; + private static final long LOCK_TIMEOUT_MS = TimeUnit.MINUTES.toMillis(5); + private static final long PARTITION_STALE_TIMEOUT_MS = TimeUnit.MINUTES.toMillis(3); + private static final int MAX_PARTITION_RETRIES = 3; + private static final double IMMEDIATE_CLAIMABLE_PERCENT = 0.50; + private static final long PARTITION_RELEASE_WINDOW_MS = TimeUnit.SECONDS.toMillis(5); + + private final CollectionDAO collectionDAO; + private final RdfPartitionCalculator partitionCalculator; + private final String serverId; + private final AtomicLong claimCounter = new AtomicLong(0); + + public DistributedRdfIndexCoordinator(CollectionDAO collectionDAO) { + this(collectionDAO, new RdfPartitionCalculator()); + } + + public DistributedRdfIndexCoordinator( + CollectionDAO collectionDAO, RdfPartitionCalculator partitionCalculator) { + this.collectionDAO = collectionDAO; + this.partitionCalculator = partitionCalculator; + this.serverId = ServerIdentityResolver.getInstance().getServerId(); + } + + public CollectionDAO getCollectionDAO() { + return collectionDAO; + } + + public 
boolean tryAcquireReindexLock(UUID jobId) { + long now = System.currentTimeMillis(); + return collectionDAO + .rdfReindexLockDAO() + .tryAcquireLock(REINDEX_LOCK_KEY, jobId.toString(), serverId, now, now + LOCK_TIMEOUT_MS); + } + + public boolean transferReindexLock(UUID fromJobId, UUID toJobId) { + long now = System.currentTimeMillis(); + return collectionDAO + .rdfReindexLockDAO() + .transferLock( + REINDEX_LOCK_KEY, + fromJobId.toString(), + toJobId.toString(), + serverId, + now, + now + LOCK_TIMEOUT_MS); + } + + public void refreshReindexLock(UUID jobId) { + long now = System.currentTimeMillis(); + collectionDAO + .rdfReindexLockDAO() + .updateHeartbeat(REINDEX_LOCK_KEY, jobId.toString(), now, now + LOCK_TIMEOUT_MS); + } + + public void releaseReindexLock(UUID jobId) { + collectionDAO.rdfReindexLockDAO().releaseLock(REINDEX_LOCK_KEY, jobId.toString()); + } + + public Optional getJob(UUID jobId) { + RdfIndexJobRecord record = collectionDAO.rdfIndexJobDAO().findById(jobId.toString()); + return Optional.ofNullable(record).map(this::toJob); + } + + public List getRecentJobs(List statuses, int limit) { + List statusNames = statuses.stream().map(Enum::name).toList(); + return collectionDAO.rdfIndexJobDAO().findByStatusesWithLimit(statusNames, limit).stream() + .map(this::toJob) + .toList(); + } + + public Optional getBlockingJob() { + List jobs = + getRecentJobs( + List.of(IndexJobStatus.READY, IndexJobStatus.RUNNING, IndexJobStatus.STOPPING), 1); + return jobs.stream().findFirst(); + } + + public RdfIndexJob createJob( + Set entities, EventPublisherJob jobConfiguration, String createdBy) { + UUID jobId = UUID.randomUUID(); + long now = System.currentTimeMillis(); + + Map entityStats = new HashMap<>(); + long totalRecords = 0; + for (String entityType : entities) { + long count = partitionCalculator.getEntityCount(entityType); + totalRecords += count; + entityStats.put( + entityType, + RdfIndexJob.EntityTypeStats.builder() + .entityType(entityType) + 
.totalRecords(count) + .processedRecords(0) + .successRecords(0) + .failedRecords(0) + .totalPartitions(0) + .completedPartitions(0) + .failedPartitions(0) + .build()); + } + + RdfIndexJob job = + RdfIndexJob.builder() + .id(jobId) + .status(IndexJobStatus.INITIALIZING) + .jobConfiguration(jobConfiguration) + .totalRecords(totalRecords) + .processedRecords(0) + .successRecords(0) + .failedRecords(0) + .entityStats(entityStats) + .createdBy(createdBy) + .createdAt(now) + .updatedAt(now) + .build(); + + collectionDAO + .rdfIndexJobDAO() + .insert( + jobId.toString(), + job.getStatus().name(), + JsonUtils.pojoToJson(jobConfiguration), + job.getTotalRecords(), + 0, + 0, + 0, + serializeEntityStats(entityStats), + createdBy, + now, + now); + return job; + } + + public RdfIndexJob initializePartitions(UUID jobId) { + RdfIndexJob job = + getJob(jobId).orElseThrow(() -> new IllegalStateException("RDF job not found: " + jobId)); + Set entityTypes = Set.copyOf(job.getJobConfiguration().getEntities()); + List partitions = + partitionCalculator.calculatePartitions(jobId, entityTypes); + long now = System.currentTimeMillis(); + int immediateCount = + Math.max(1, (int) Math.ceil(partitions.size() * IMMEDIATE_CLAIMABLE_PERCENT)); + + for (int i = 0; i < partitions.size(); i++) { + RdfIndexPartition partition = partitions.get(i); + long claimableAt; + if (i < immediateCount) { + claimableAt = now; + } else { + int remainingIndex = i - immediateCount; + int remainingCount = Math.max(1, partitions.size() - immediateCount); + claimableAt = now + (remainingIndex * PARTITION_RELEASE_WINDOW_MS) / remainingCount; + } + insertPartition(partition.withClaimableAt(claimableAt)); + } + + Map entityStats = new HashMap<>(job.getEntityStats()); + for (String entityType : entityTypes) { + int totalPartitions = + (int) + partitions.stream() + .filter(partition -> entityType.equals(partition.getEntityType())) + .count(); + RdfIndexJob.EntityTypeStats existing = entityStats.get(entityType); + if 
(existing != null) { + entityStats.put(entityType, existing.toBuilder().totalPartitions(totalPartitions).build()); + } + } + + long totalRecords = partitions.stream().mapToLong(RdfIndexPartition::getEstimatedCount).sum(); + RdfIndexJob updated = + job.toBuilder() + .status(IndexJobStatus.READY) + .totalRecords(totalRecords) + .entityStats(entityStats) + .updatedAt(System.currentTimeMillis()) + .build(); + updateJob(updated); + return updated; + } + + public RdfIndexPartition claimNextPartition(UUID jobId) { + long claimAt = (System.currentTimeMillis() * 1000) + claimCounter.incrementAndGet(); + int updated = + collectionDAO + .rdfIndexPartitionDAO() + .claimNextPartitionAtomic(jobId.toString(), serverId, claimAt); + if (updated <= 0) { + return null; + } + + RdfIndexPartitionRecord record = + collectionDAO + .rdfIndexPartitionDAO() + .findLatestClaimedPartition(jobId.toString(), serverId, claimAt); + return record != null ? toPartition(record) : null; + } + + public void updatePartitionProgress(RdfIndexPartition partition) { + collectionDAO + .rdfIndexPartitionDAO() + .updateProgress( + partition.getId().toString(), + partition.getCursor(), + partition.getProcessedCount(), + partition.getSuccessCount(), + partition.getFailedCount(), + System.currentTimeMillis()); + } + + public void completePartition( + UUID partitionId, long cursor, long processedCount, long successCount, long failedCount) { + RdfIndexPartition partition = getPartition(partitionId); + long now = System.currentTimeMillis(); + collectionDAO + .rdfIndexPartitionDAO() + .update( + partitionId.toString(), + PartitionStatus.COMPLETED.name(), + cursor, + processedCount, + successCount, + failedCount, + partition.getAssignedServer(), + partition.getClaimedAt(), + partition.getStartedAt(), + now, + now, + null, + partition.getRetryCount()); + incrementServerStats(partition, processedCount, successCount, failedCount, 1, 0); + refreshAggregatedJob(jobIdFrom(partition)); + } + + public void failPartition( + 
UUID partitionId, + long cursor, + long processedCount, + long successCount, + long failedCount, + String errorMessage) { + RdfIndexPartition partition = getPartition(partitionId); + long now = System.currentTimeMillis(); + collectionDAO + .rdfIndexPartitionDAO() + .update( + partitionId.toString(), + PartitionStatus.FAILED.name(), + cursor, + processedCount, + successCount, + failedCount, + partition.getAssignedServer(), + partition.getClaimedAt(), + partition.getStartedAt(), + now, + now, + errorMessage, + partition.getRetryCount() + 1); + incrementServerStats(partition, processedCount, successCount, failedCount, 0, 1); + refreshAggregatedJob(jobIdFrom(partition)); + } + + public int reclaimStalePartitions(UUID jobId) { + long staleThreshold = System.currentTimeMillis() - PARTITION_STALE_TIMEOUT_MS; + int reclaimed = + collectionDAO + .rdfIndexPartitionDAO() + .reclaimStalePartitionsForRetry( + jobId.toString(), staleThreshold, MAX_PARTITION_RETRIES); + int failed = + collectionDAO + .rdfIndexPartitionDAO() + .failStalePartitionsExceedingRetries( + jobId.toString(), + staleThreshold, + MAX_PARTITION_RETRIES, + System.currentTimeMillis()); + if (reclaimed > 0 || failed > 0) { + LOG.info( + "Recovered RDF job {} partitions: reclaimed={}, failed={}", jobId, reclaimed, failed); + refreshAggregatedJob(jobId); + } + return reclaimed + failed; + } + + public void cancelPendingPartitions(UUID jobId) { + collectionDAO.rdfIndexPartitionDAO().cancelPendingPartitions(jobId.toString()); + refreshAggregatedJob(jobId); + } + + public void releaseServerPartitions(UUID jobId, String serverId, boolean stopJob, String reason) { + long now = System.currentTimeMillis(); + collectionDAO + .rdfIndexPartitionDAO() + .releaseProcessingPartitions( + jobId.toString(), + serverId, + stopJob ? PartitionStatus.CANCELLED.name() : PartitionStatus.PENDING.name(), + reason, + now, + stopJob ? 
now : null); + refreshAggregatedJob(jobId); + } + + public void updateJobStatus(UUID jobId, IndexJobStatus status, String errorMessage) { + RdfIndexJob job = + getJob(jobId).orElseThrow(() -> new IllegalStateException("RDF job not found: " + jobId)); + long now = System.currentTimeMillis(); + Long startedAt = job.getStartedAt(); + Long completedAt = job.getCompletedAt(); + + if (status == IndexJobStatus.RUNNING && startedAt == null) { + startedAt = now; + } + if (status == IndexJobStatus.STOPPED + || status == IndexJobStatus.COMPLETED + || status == IndexJobStatus.COMPLETED_WITH_ERRORS + || status == IndexJobStatus.FAILED) { + completedAt = now; + } + + collectionDAO + .rdfIndexJobDAO() + .update( + jobId.toString(), + status.name(), + job.getProcessedRecords(), + job.getSuccessRecords(), + job.getFailedRecords(), + serializeEntityStats(job.getEntityStats()), + startedAt, + completedAt, + now, + errorMessage); + } + + public RdfIndexJob getJobWithAggregatedStats(UUID jobId) { + return refreshAggregatedJob(jobId); + } + + public boolean hasClaimableWork(UUID jobId) { + RdfIndexJob job = refreshAggregatedJob(jobId); + if (job == null || job.isTerminal()) { + return false; + } + return collectionDAO.rdfIndexPartitionDAO().findByJobId(jobId.toString()).stream() + .anyMatch( + partition -> + partition.status().equals(PartitionStatus.PENDING.name()) + || partition.status().equals(PartitionStatus.PROCESSING.name())); + } + + public void performStartupRecovery() { + for (RdfIndexJob job : + getRecentJobs( + List.of(IndexJobStatus.READY, IndexJobStatus.RUNNING, IndexJobStatus.STOPPING), 20)) { + reclaimStalePartitions(job.getId()); + refreshAggregatedJob(job.getId()); + } + } + + private RdfIndexJob refreshAggregatedJob(UUID jobId) { + RdfIndexJob existing = getJob(jobId).orElse(null); + if (existing == null) { + return null; + } + + RdfAggregatedStatsRecord aggregate = + collectionDAO.rdfIndexPartitionDAO().getAggregatedStats(jobId.toString()); + Map entityStats = + 
collectionDAO.rdfIndexPartitionDAO().getEntityStats(jobId.toString()).stream() + .collect( + Collectors.toMap( + RdfEntityStatsRecord::entityType, + record -> + RdfIndexJob.EntityTypeStats.builder() + .entityType(record.entityType()) + .totalRecords(record.totalRecords()) + .processedRecords(record.processedRecords()) + .successRecords(record.successRecords()) + .failedRecords(record.failedRecords()) + .totalPartitions(record.totalPartitions()) + .completedPartitions(record.completedPartitions()) + .failedPartitions(record.failedPartitions()) + .build(), + (left, right) -> right, + HashMap::new)); + Map serverStats = + collectionDAO.rdfIndexPartitionDAO().getServerStats(jobId.toString()).stream() + .collect( + Collectors.toMap( + RdfServerPartitionStatsRecord::serverId, + record -> + RdfIndexJob.ServerStats.builder() + .serverId(record.serverId()) + .processedRecords(record.processedRecords()) + .successRecords(record.successRecords()) + .failedRecords(record.failedRecords()) + .totalPartitions(record.totalPartitions()) + .completedPartitions(record.completedPartitions()) + .processingPartitions(record.processingPartitions()) + .build(), + (left, right) -> right, + HashMap::new)); + + IndexJobStatus status = existing.getStatus(); + String errorMessage = existing.getErrorMessage(); + if (aggregate.pendingPartitions() == 0 && aggregate.processingPartitions() == 0) { + if (status == IndexJobStatus.STOPPING) { + status = IndexJobStatus.STOPPED; + } else if (aggregate.failedPartitions() > 0 || aggregate.failedRecords() > 0) { + status = IndexJobStatus.COMPLETED_WITH_ERRORS; + } else if (status == IndexJobStatus.READY || status == IndexJobStatus.RUNNING) { + status = IndexJobStatus.COMPLETED; + } + } else if (status == IndexJobStatus.READY) { + status = IndexJobStatus.RUNNING; + } + + Long completedAt = existing.getCompletedAt(); + if (status == IndexJobStatus.COMPLETED + || status == IndexJobStatus.COMPLETED_WITH_ERRORS + || status == IndexJobStatus.FAILED + || status 
== IndexJobStatus.STOPPED) { + completedAt = System.currentTimeMillis(); + } + + RdfIndexJob refreshed = + existing.toBuilder() + .status(status) + .processedRecords(aggregate.processedRecords()) + .successRecords(aggregate.successRecords()) + .failedRecords(aggregate.failedRecords()) + .entityStats(entityStats) + .serverStats(serverStats) + .updatedAt(System.currentTimeMillis()) + .errorMessage(errorMessage) + .completedAt(completedAt) + .build(); + + updateJob(refreshed); + return refreshed; + } + + private void incrementServerStats( + RdfIndexPartition partition, + long processedCount, + long successCount, + long failedCount, + int partitionsCompleted, + int partitionsFailed) { + String assignedServer = + partition.getAssignedServer() != null ? partition.getAssignedServer() : serverId; + collectionDAO + .rdfIndexServerStatsDAO() + .incrementStats( + UUID.randomUUID().toString(), + partition.getJobId().toString(), + assignedServer, + partition.getEntityType(), + processedCount, + successCount, + failedCount, + partitionsCompleted, + partitionsFailed, + System.currentTimeMillis()); + } + + private void insertPartition(RdfIndexPartition partition) { + collectionDAO + .rdfIndexPartitionDAO() + .insert( + partition.getId().toString(), + partition.getJobId().toString(), + partition.getEntityType(), + partition.getPartitionIndex(), + partition.getRangeStart(), + partition.getRangeEnd(), + partition.getEstimatedCount(), + partition.getWorkUnits(), + partition.getPriority(), + partition.getStatus().name(), + partition.getCursor(), + partition.getClaimableAt()); + } + + private void updateJob(RdfIndexJob job) { + collectionDAO + .rdfIndexJobDAO() + .update( + job.getId().toString(), + job.getStatus().name(), + job.getProcessedRecords(), + job.getSuccessRecords(), + job.getFailedRecords(), + serializeEntityStats(job.getEntityStats()), + job.getStartedAt(), + job.getCompletedAt(), + job.getUpdatedAt(), + job.getErrorMessage()); + } + + private RdfIndexPartition 
getPartition(UUID partitionId) { + RdfIndexPartitionRecord record = + collectionDAO.rdfIndexPartitionDAO().findById(partitionId.toString()); + if (record == null) { + throw new IllegalStateException("RDF partition not found: " + partitionId); + } + return toPartition(record); + } + + private UUID jobIdFrom(RdfIndexPartition partition) { + return partition.getJobId(); + } + + private String serializeEntityStats(Map entityStats) { + return JsonUtils.pojoToJson(entityStats != null ? entityStats : Map.of()); + } + + private RdfIndexJob toJob(RdfIndexJobRecord record) { + Map entityStats = + record.stats() != null && !record.stats().isBlank() + ? JsonUtils.readValue( + record.stats(), new TypeReference>() {}) + : new HashMap<>(); + + EventPublisherJob jobConfiguration = + record.jobConfiguration() != null + ? JsonUtils.readValue(record.jobConfiguration(), EventPublisherJob.class) + : new EventPublisherJob(); + + return RdfIndexJob.builder() + .id(UUID.fromString(record.id())) + .status(IndexJobStatus.valueOf(record.status())) + .jobConfiguration(jobConfiguration) + .totalRecords(record.totalRecords()) + .processedRecords(record.processedRecords()) + .successRecords(record.successRecords()) + .failedRecords(record.failedRecords()) + .entityStats(entityStats) + .createdBy(record.createdBy()) + .createdAt(record.createdAt()) + .startedAt(record.startedAt()) + .completedAt(record.completedAt()) + .updatedAt(record.updatedAt()) + .errorMessage(record.errorMessage()) + .build(); + } + + private RdfIndexPartition toPartition(RdfIndexPartitionRecord record) { + return RdfIndexPartition.builder() + .id(UUID.fromString(record.id())) + .jobId(UUID.fromString(record.jobId())) + .entityType(record.entityType()) + .partitionIndex(record.partitionIndex()) + .rangeStart(record.rangeStart()) + .rangeEnd(record.rangeEnd()) + .estimatedCount(record.estimatedCount()) + .workUnits(record.workUnits()) + .priority(record.priority()) + .status(PartitionStatus.valueOf(record.status())) + 
.cursor(record.cursor()) + .processedCount(record.processedCount()) + .successCount(record.successCount()) + .failedCount(record.failedCount()) + .assignedServer(record.assignedServer()) + .claimedAt(record.claimedAt()) + .startedAt(record.startedAt()) + .completedAt(record.completedAt()) + .lastUpdateAt(record.lastUpdateAt()) + .lastError(record.lastError()) + .retryCount(record.retryCount()) + .claimableAt(record.claimableAt()) + .build(); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexExecutor.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexExecutor.java new file mode 100644 index 000000000000..0455ad10524c --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexExecutor.java @@ -0,0 +1,297 @@ +package org.openmetadata.service.apps.bundles.rdf.distributed; + +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.UUID; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.system.EventPublisherJob; +import org.openmetadata.service.apps.bundles.rdf.RdfBatchProcessor; +import org.openmetadata.service.apps.bundles.searchIndex.distributed.IndexJobStatus; +import org.openmetadata.service.apps.bundles.searchIndex.distributed.ServerIdentityResolver; +import org.openmetadata.service.jdbi3.CollectionDAO; +import org.openmetadata.service.rdf.RdfRepository; + +@Slf4j +public class DistributedRdfIndexExecutor { + private static final Set COORDINATED_JOBS = ConcurrentHashMap.newKeySet(); + private static final long LOCK_REFRESH_INTERVAL_MS 
= TimeUnit.MINUTES.toMillis(1); + private static final long STALE_CHECK_INTERVAL_MS = TimeUnit.SECONDS.toMillis(30); + private static final long CLAIM_RETRY_SLEEP_MS = 1000; + private static final long SHUTDOWN_TIMEOUT_SECONDS = 30; + + private final CollectionDAO collectionDAO; + private final DistributedRdfIndexCoordinator coordinator; + private final String serverId; + private final AtomicBoolean stopped = new AtomicBoolean(false); + private final List activeWorkers = new ArrayList<>(); + + @Getter private RdfIndexJob currentJob; + private volatile ExecutorService workerExecutor; + private volatile Thread lockRefreshThread; + private volatile Thread staleReclaimerThread; + private volatile boolean coordinatorOwnedJob; + + public DistributedRdfIndexExecutor(CollectionDAO collectionDAO, int partitionSize) { + this.collectionDAO = collectionDAO; + this.coordinator = + new DistributedRdfIndexCoordinator( + collectionDAO, new RdfPartitionCalculator(partitionSize)); + this.serverId = ServerIdentityResolver.getInstance().getServerId(); + } + + public static boolean isCoordinatingJob(UUID jobId) { + return COORDINATED_JOBS.contains(jobId); + } + + public void performStartupRecovery() { + coordinator.performStartupRecovery(); + } + + public RdfIndexJob createJob( + Set entities, EventPublisherJob jobConfiguration, String createdBy) { + Optional blockingJob = coordinator.getBlockingJob(); + if (blockingJob.isPresent()) { + throw new IllegalStateException( + "Another RDF reindex job is already active: " + blockingJob.get().getId()); + } + + UUID tempJobId = UUID.randomUUID(); + if (!coordinator.tryAcquireReindexLock(tempJobId)) { + throw new IllegalStateException("Failed to acquire RDF reindex lock"); + } + + try { + currentJob = coordinator.createJob(entities, jobConfiguration, createdBy); + currentJob = coordinator.initializePartitions(currentJob.getId()); + if (!coordinator.transferReindexLock(tempJobId, currentJob.getId())) { + throw new IllegalStateException("Failed 
to transfer RDF reindex lock to job"); + } + coordinatorOwnedJob = true; + return currentJob; + } catch (Exception e) { + coordinator.releaseReindexLock(tempJobId); + throw e; + } + } + + public void execute(EventPublisherJob jobConfiguration) throws InterruptedException { + if (currentJob == null) { + throw new IllegalStateException("RDF distributed job must be created before execution"); + } + + stopped.set(false); + COORDINATED_JOBS.add(currentJob.getId()); + coordinator.updateJobStatus(currentJob.getId(), IndexJobStatus.RUNNING, null); + currentJob = coordinator.getJobWithAggregatedStats(currentJob.getId()); + + startCoordinatorThreads(); + runWorkers(jobConfiguration, true); + } + + public void joinJob(RdfIndexJob job, EventPublisherJob jobConfiguration) + throws InterruptedException { + currentJob = job; + coordinatorOwnedJob = false; + stopped.set(false); + runWorkers(jobConfiguration, false); + } + + public RdfIndexJob getJobWithFreshStats() { + if (currentJob == null) { + return null; + } + currentJob = coordinator.getJobWithAggregatedStats(currentJob.getId()); + return currentJob; + } + + public void stop() { + stopped.set(true); + + if (currentJob != null) { + if (coordinatorOwnedJob) { + coordinator.updateJobStatus(currentJob.getId(), IndexJobStatus.STOPPING, null); + coordinator.cancelPendingPartitions(currentJob.getId()); + coordinator.releaseServerPartitions(currentJob.getId(), serverId, true, "Stopped by user"); + } else { + coordinator.releaseServerPartitions( + currentJob.getId(), serverId, false, "Worker server stopped participating"); + } + } + + for (RdfPartitionWorker worker : activeWorkers) { + worker.stop(); + } + + shutdownWorkerExecutor(); + interruptThread(lockRefreshThread); + interruptThread(staleReclaimerThread); + } + + private void runWorkers(EventPublisherJob jobConfiguration, boolean coordinatorMode) + throws InterruptedException { + int workerCount = + Math.max( + 1, + Math.min( + jobConfiguration.getConsumerThreads() != null + ? 
jobConfiguration.getConsumerThreads() + : Runtime.getRuntime().availableProcessors(), + Runtime.getRuntime().availableProcessors() * 2)); + int batchSize = jobConfiguration.getBatchSize() != null ? jobConfiguration.getBatchSize() : 100; + RdfBatchProcessor batchProcessor = + new RdfBatchProcessor(collectionDAO, RdfRepository.getInstance()); + + workerExecutor = + Executors.newFixedThreadPool( + workerCount, + Thread.ofPlatform() + .name( + coordinatorMode + ? "rdf-distributed-coordinator-" + : "rdf-distributed-participant-", + 0) + .factory()); + + for (int i = 0; i < workerCount; i++) { + RdfPartitionWorker worker = new RdfPartitionWorker(coordinator, batchProcessor, batchSize); + activeWorkers.add(worker); + workerExecutor.submit(() -> workerLoop(worker)); + } + + workerExecutor.shutdown(); + workerExecutor.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS); + + if (coordinatorMode) { + finalizeCoordinatorJob(); + } + } + + private void workerLoop(RdfPartitionWorker worker) { + while (!stopped.get() && !Thread.currentThread().isInterrupted()) { + RdfIndexJob latestJob = getJobWithFreshStats(); + if (latestJob == null + || latestJob.isTerminal() + || latestJob.getStatus() == IndexJobStatus.STOPPING) { + return; + } + + RdfIndexPartition partition = coordinator.claimNextPartition(latestJob.getId()); + if (partition == null) { + try { + TimeUnit.MILLISECONDS.sleep(CLAIM_RETRY_SLEEP_MS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + continue; + } + + worker.processPartition(partition); + } + } + + private void finalizeCoordinatorJob() { + try { + currentJob = coordinator.getJobWithAggregatedStats(currentJob.getId()); + if (currentJob == null) { + return; + } + + if (stopped.get()) { + coordinator.updateJobStatus(currentJob.getId(), IndexJobStatus.STOPPED, null); + } else if (!currentJob.isTerminal()) { + IndexJobStatus terminalStatus = + currentJob.getFailedRecords() > 0 + ? 
IndexJobStatus.COMPLETED_WITH_ERRORS + : IndexJobStatus.COMPLETED; + coordinator.updateJobStatus( + currentJob.getId(), terminalStatus, currentJob.getErrorMessage()); + } + + currentJob = coordinator.getJobWithAggregatedStats(currentJob.getId()); + } finally { + if (currentJob != null && coordinatorOwnedJob) { + coordinator.releaseReindexLock(currentJob.getId()); + } + if (currentJob != null) { + COORDINATED_JOBS.remove(currentJob.getId()); + } + interruptThread(lockRefreshThread); + interruptThread(staleReclaimerThread); + } + } + + private void startCoordinatorThreads() { + lockRefreshThread = + Thread.ofVirtual() + .name("rdf-lock-refresh-" + currentJob.getId().toString().substring(0, 8)) + .start( + () -> { + while (!stopped.get() && !Thread.currentThread().isInterrupted()) { + try { + coordinator.refreshReindexLock(currentJob.getId()); + TimeUnit.MILLISECONDS.sleep(LOCK_REFRESH_INTERVAL_MS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } catch (Exception e) { + LOG.warn("Failed to refresh RDF reindex lock for {}", currentJob.getId(), e); + } + } + }); + + staleReclaimerThread = + Thread.ofVirtual() + .name("rdf-stale-reclaimer-" + currentJob.getId().toString().substring(0, 8)) + .start( + () -> { + while (!stopped.get() && !Thread.currentThread().isInterrupted()) { + try { + coordinator.reclaimStalePartitions(currentJob.getId()); + TimeUnit.MILLISECONDS.sleep(STALE_CHECK_INTERVAL_MS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } catch (Exception e) { + LOG.warn( + "Failed to reclaim stale RDF partitions for {}", currentJob.getId(), e); + } + } + }); + } + + private void shutdownWorkerExecutor() { + if (workerExecutor == null || workerExecutor.isShutdown()) { + return; + } + + workerExecutor.shutdownNow(); + try { + if (!workerExecutor.awaitTermination(SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS)) { + LOG.warn("Timed out waiting for RDF distributed workers to stop"); + } + } 
catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + private void interruptThread(Thread thread) { + if (thread == null) { + return; + } + thread.interrupt(); + try { + thread.join(5_000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfDistributedJobParticipant.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfDistributedJobParticipant.java new file mode 100644 index 000000000000..43e2e7b7d9b6 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfDistributedJobParticipant.java @@ -0,0 +1,136 @@ +package org.openmetadata.service.apps.bundles.rdf.distributed; + +import io.dropwizard.lifecycle.Managed; +import java.util.List; +import java.util.UUID; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.service.apps.bundles.searchIndex.distributed.IndexJobStatus; +import org.openmetadata.service.apps.bundles.searchIndex.distributed.ServerIdentityResolver; +import org.openmetadata.service.jdbi3.CollectionDAO; +import org.openmetadata.service.rdf.RdfRepository; + +@Slf4j +public class RdfDistributedJobParticipant implements Managed { + private static final long POLL_INTERVAL_MS = TimeUnit.SECONDS.toMillis(15); + + private final CollectionDAO collectionDAO; + private final String serverId; + private final DistributedRdfIndexCoordinator coordinator; + private final AtomicBoolean running = new AtomicBoolean(false); + private final AtomicBoolean participating = new AtomicBoolean(false); + + @Getter private UUID currentJobId; + + private volatile Thread pollThread; + private volatile Thread participantThread; + + public RdfDistributedJobParticipant(CollectionDAO collectionDAO) { + 
this.collectionDAO = collectionDAO; + this.serverId = ServerIdentityResolver.getInstance().getServerId(); + this.coordinator = new DistributedRdfIndexCoordinator(collectionDAO); + } + + @Override + public void start() { + if (!RdfRepository.getInstance().isEnabled()) { + LOG.info("Skipping RDF distributed participant registration because RDF is disabled"); + return; + } + + if (running.compareAndSet(false, true)) { + pollThread = + Thread.ofVirtual() + .name("rdf-distributed-participant-poll") + .start( + () -> { + while (running.get() && !Thread.currentThread().isInterrupted()) { + try { + pollForJobs(); + TimeUnit.MILLISECONDS.sleep(POLL_INTERVAL_MS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } catch (Exception e) { + LOG.warn("Error polling for distributed RDF jobs", e); + } + } + }); + LOG.info("Started RDF distributed job participant on server {}", serverId); + } + } + + @Override + public void stop() { + if (running.compareAndSet(true, false)) { + interruptThread(pollThread); + interruptThread(participantThread); + LOG.info("Stopped RDF distributed job participant on server {}", serverId); + } + } + + private void pollForJobs() { + if (participating.get()) { + return; + } + + List activeJobs = + coordinator.getRecentJobs(List.of(IndexJobStatus.RUNNING, IndexJobStatus.STOPPING), 10); + for (RdfIndexJob job : activeJobs) { + if (job.isTerminal() + || job.getStatus() != IndexJobStatus.RUNNING + || DistributedRdfIndexExecutor.isCoordinatingJob(job.getId()) + || !coordinator.hasClaimableWork(job.getId())) { + continue; + } + + joinJob(job); + return; + } + } + + private void joinJob(RdfIndexJob job) { + if (!participating.compareAndSet(false, true)) { + return; + } + + currentJobId = job.getId(); + participantThread = + Thread.ofVirtual() + .name("rdf-distributed-participant-" + job.getId().toString().substring(0, 8)) + .start( + () -> { + try { + int partitionSize = + job.getJobConfiguration().getPartitionSize() != 
null + ? job.getJobConfiguration().getPartitionSize() + : 10000; + DistributedRdfIndexExecutor executor = + new DistributedRdfIndexExecutor(collectionDAO, partitionSize); + executor.joinJob(job, job.getJobConfiguration()); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } catch (Exception e) { + LOG.warn("Failed to participate in RDF job {}", job.getId(), e); + } finally { + currentJobId = null; + participating.set(false); + participantThread = null; + } + }); + } + + private void interruptThread(Thread thread) { + if (thread == null) { + return; + } + thread.interrupt(); + try { + thread.join(5_000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfDistributedJobStatsAggregator.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfDistributedJobStatsAggregator.java new file mode 100644 index 000000000000..68209f27d05c --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfDistributedJobStatsAggregator.java @@ -0,0 +1,45 @@ +package org.openmetadata.service.apps.bundles.rdf.distributed; + +import org.openmetadata.schema.system.EntityStats; +import org.openmetadata.schema.system.Stats; +import org.openmetadata.schema.system.StepStats; + +public class RdfDistributedJobStatsAggregator { + public Stats toStats(RdfIndexJob job) { + Stats stats = new Stats(); + stats.setEntityStats(new EntityStats()); + + StepStats jobStats = + new StepStats() + .withTotalRecords(safeToInt(job.getTotalRecords())) + .withSuccessRecords(safeToInt(job.getSuccessRecords())) + .withFailedRecords(safeToInt(job.getFailedRecords())); + stats.setJobStats(jobStats); + + if (job.getEntityStats() != null) { + job.getEntityStats() + .forEach( + (entityType, entityStats) -> + stats + .getEntityStats() + .setAdditionalProperty( + 
entityType, + new StepStats() + .withTotalRecords(safeToInt(entityStats.getTotalRecords())) + .withSuccessRecords(safeToInt(entityStats.getSuccessRecords())) + .withFailedRecords(safeToInt(entityStats.getFailedRecords())))); + } + + return stats; + } + + private int safeToInt(long value) { + if (value > Integer.MAX_VALUE) { + return Integer.MAX_VALUE; + } + if (value < Integer.MIN_VALUE) { + return Integer.MIN_VALUE; + } + return (int) value; + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfIndexJob.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfIndexJob.java new file mode 100644 index 000000000000..764e1fd42146 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfIndexJob.java @@ -0,0 +1,70 @@ +package org.openmetadata.service.apps.bundles.rdf.distributed; + +import java.util.Map; +import java.util.UUID; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; +import lombok.With; +import org.openmetadata.schema.system.EventPublisherJob; +import org.openmetadata.service.apps.bundles.searchIndex.distributed.IndexJobStatus; + +@Data +@Builder(toBuilder = true) +@NoArgsConstructor +@AllArgsConstructor +@With +public class RdfIndexJob { + private UUID id; + private IndexJobStatus status; + private EventPublisherJob jobConfiguration; + private long totalRecords; + private long processedRecords; + private long successRecords; + private long failedRecords; + private Map entityStats; + private Map serverStats; + private String createdBy; + private long createdAt; + private Long startedAt; + private Long completedAt; + private long updatedAt; + private String errorMessage; + + public boolean isTerminal() { + return status == IndexJobStatus.COMPLETED + || status == IndexJobStatus.COMPLETED_WITH_ERRORS + || status == IndexJobStatus.FAILED + || 
status == IndexJobStatus.STOPPED; + } + + @Data + @Builder(toBuilder = true) + @NoArgsConstructor + @AllArgsConstructor + public static class EntityTypeStats { + private String entityType; + private long totalRecords; + private long processedRecords; + private long successRecords; + private long failedRecords; + private int totalPartitions; + private int completedPartitions; + private int failedPartitions; + } + + @Data + @Builder(toBuilder = true) + @NoArgsConstructor + @AllArgsConstructor + public static class ServerStats { + private String serverId; + private long processedRecords; + private long successRecords; + private long failedRecords; + private int totalPartitions; + private int completedPartitions; + private int processingPartitions; + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfIndexPartition.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfIndexPartition.java new file mode 100644 index 000000000000..1d271e62a2b2 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfIndexPartition.java @@ -0,0 +1,35 @@ +package org.openmetadata.service.apps.bundles.rdf.distributed; + +import java.util.UUID; +import lombok.Builder; +import lombok.Data; +import lombok.With; +import org.openmetadata.service.apps.bundles.searchIndex.distributed.PartitionStatus; + +@Data +@Builder(toBuilder = true) +@With +public class RdfIndexPartition { + private UUID id; + private UUID jobId; + private String entityType; + private int partitionIndex; + private long rangeStart; + private long rangeEnd; + private long estimatedCount; + private long workUnits; + private int priority; + private PartitionStatus status; + private long cursor; + private long processedCount; + private long successCount; + private long failedCount; + private String assignedServer; + private Long claimedAt; + private Long startedAt; + private 
Long completedAt; + private Long lastUpdateAt; + private String lastError; + private int retryCount; + private long claimableAt; +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfPartitionCalculator.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfPartitionCalculator.java new file mode 100644 index 000000000000..acc3b4538ce1 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfPartitionCalculator.java @@ -0,0 +1,101 @@ +package org.openmetadata.service.apps.bundles.rdf.distributed; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.service.Entity; +import org.openmetadata.service.apps.bundles.searchIndex.EntityPriority; +import org.openmetadata.service.apps.bundles.searchIndex.distributed.PartitionStatus; + +@Slf4j +public class RdfPartitionCalculator { + + private static final int DEFAULT_PARTITION_SIZE = 10000; + private static final int MIN_PARTITION_SIZE = 1000; + private static final int MAX_PARTITION_SIZE = 50000; + + private static final Map ENTITY_COMPLEXITY_FACTORS = + Map.of( + "table", 1.5, + "dashboard", 1.3, + "pipeline", 1.2, + "mlmodel", 1.3, + "glossaryTerm", 1.1); + + private final int partitionSize; + + public RdfPartitionCalculator() { + this(DEFAULT_PARTITION_SIZE); + } + + public RdfPartitionCalculator(int partitionSize) { + this.partitionSize = Math.clamp(partitionSize, MIN_PARTITION_SIZE, MAX_PARTITION_SIZE); + } + + public List calculatePartitions(UUID jobId, Set entityTypes) { + List partitions = new ArrayList<>(); + for (String entityType : entityTypes) { + partitions.addAll(calculatePartitionsForEntity(jobId, entityType)); + } + return partitions; + } + + public List calculatePartitionsForEntity(UUID jobId, String entityType) { + long 
totalCount = getEntityCount(entityType); + if (totalCount <= 0) { + return List.of(); + } + + double complexityFactor = ENTITY_COMPLEXITY_FACTORS.getOrDefault(entityType, 1.0); + long adjustedPartitionSize = + Math.max(MIN_PARTITION_SIZE, (long) (partitionSize / complexityFactor)); + int priority = EntityPriority.getNumericPriority(entityType); + long numPartitions = (totalCount + adjustedPartitionSize - 1) / adjustedPartitionSize; + + List partitions = new ArrayList<>(); + for (int index = 0; index < numPartitions; index++) { + long rangeStart = index * adjustedPartitionSize; + long rangeEnd = Math.min(rangeStart + adjustedPartitionSize, totalCount); + long estimatedCount = rangeEnd - rangeStart; + partitions.add( + RdfIndexPartition.builder() + .id(UUID.randomUUID()) + .jobId(jobId) + .entityType(entityType) + .partitionIndex(index) + .rangeStart(rangeStart) + .rangeEnd(rangeEnd) + .estimatedCount(estimatedCount) + .workUnits((long) (estimatedCount * complexityFactor)) + .priority(priority) + .status(PartitionStatus.PENDING) + .cursor(rangeStart) + .processedCount(0) + .successCount(0) + .failedCount(0) + .retryCount(0) + .claimableAt(0) + .build()); + } + + LOG.info( + "Calculated {} RDF partitions for {} (totalRecords={}, partitionSize={})", + partitions.size(), + entityType, + totalCount, + adjustedPartitionSize); + return partitions; + } + + public long getEntityCount(String entityType) { + try { + return Entity.getEntityRepository(entityType).getDao().listTotalCount(); + } catch (Exception e) { + LOG.warn("Failed to fetch entity count for {}", entityType, e); + return 0; + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfPartitionWorker.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfPartitionWorker.java new file mode 100644 index 000000000000..dc85e8a2025e --- /dev/null +++ 
b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/rdf/distributed/RdfPartitionWorker.java @@ -0,0 +1,128 @@ +package org.openmetadata.service.apps.bundles.rdf.distributed; + +import static org.openmetadata.common.utils.CommonUtil.listOrEmpty; + +import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.EntityInterface; +import org.openmetadata.schema.type.Include; +import org.openmetadata.schema.utils.ResultList; +import org.openmetadata.service.Entity; +import org.openmetadata.service.apps.bundles.rdf.RdfBatchProcessor; +import org.openmetadata.service.exception.SearchIndexException; +import org.openmetadata.service.jdbi3.ListFilter; +import org.openmetadata.service.workflows.searchIndex.PaginatedEntitiesSource; + +@Slf4j +public class RdfPartitionWorker { + private static final int PROGRESS_UPDATE_INTERVAL = 100; + + private final DistributedRdfIndexCoordinator coordinator; + private final RdfBatchProcessor batchProcessor; + private final int batchSize; + private final AtomicBoolean stopped = new AtomicBoolean(false); + + public RdfPartitionWorker( + DistributedRdfIndexCoordinator coordinator, RdfBatchProcessor batchProcessor, int batchSize) { + this.coordinator = coordinator; + this.batchProcessor = batchProcessor; + this.batchSize = batchSize; + } + + public PartitionResult processPartition(RdfIndexPartition partition) { + String entityType = partition.getEntityType(); + long currentOffset = Math.max(partition.getCursor(), partition.getRangeStart()); + long processedCount = partition.getProcessedCount(); + long successCount = partition.getSuccessCount(); + long failedCount = partition.getFailedCount(); + + try { + String keysetCursor = initializeKeysetCursor(entityType, currentOffset); + while (currentOffset < partition.getRangeEnd() + && !stopped.get() + && !Thread.currentThread().isInterrupted()) { + int currentBatchSize = (int) Math.min(batchSize, 
partition.getRangeEnd() - currentOffset); + ResultList resultList = + readEntitiesKeyset(entityType, keysetCursor, currentBatchSize); + + if (resultList == null || listOrEmpty(resultList.getData()).isEmpty()) { + break; + } + + RdfBatchProcessor.BatchProcessingResult batchResult = + batchProcessor.processEntities(entityType, resultList.getData(), stopped::get); + int readerErrors = listOrEmpty(resultList.getErrors()).size(); + long batchProcessed = resultList.getData().size() + readerErrors; + + processedCount += batchProcessed; + successCount += batchResult.successCount(); + failedCount += batchResult.failedCount() + readerErrors; + currentOffset += batchProcessed; + + if (processedCount % PROGRESS_UPDATE_INTERVAL < batchProcessed) { + coordinator.updatePartitionProgress( + partition.toBuilder() + .cursor(currentOffset) + .processedCount(processedCount) + .successCount(successCount) + .failedCount(failedCount) + .build()); + } + + keysetCursor = resultList.getPaging() != null ? resultList.getPaging().getAfter() : null; + if (keysetCursor == null && currentOffset < partition.getRangeEnd()) { + keysetCursor = initializeKeysetCursor(entityType, currentOffset); + if (keysetCursor == null) { + break; + } + } + } + + if (stopped.get() || Thread.currentThread().isInterrupted()) { + return new PartitionResult(processedCount, successCount, failedCount, true, null); + } + + coordinator.completePartition( + partition.getId(), currentOffset, processedCount, successCount, failedCount); + return new PartitionResult(processedCount, successCount, failedCount, false, null); + } catch (Exception e) { + LOG.error("Failed to process RDF partition {}", partition.getId(), e); + coordinator.failPartition( + partition.getId(), + currentOffset, + processedCount, + successCount, + failedCount, + e.getMessage()); + return new PartitionResult(processedCount, successCount, failedCount, false, e.getMessage()); + } + } + + public void stop() { + stopped.set(true); + } + + private ResultList 
readEntitiesKeyset( + String entityType, String keysetCursor, int limit) throws SearchIndexException { + PaginatedEntitiesSource source = + new PaginatedEntitiesSource(entityType, limit, List.of("*"), 0); + return source.readNextKeyset(keysetCursor); + } + + private String initializeKeysetCursor(String entityType, long offset) { + if (offset <= 0) { + return null; + } + int cursorOffset = (int) offset - 1; + return Entity.getEntityRepository(entityType) + .getCursorAtOffset(new ListFilter(Include.ALL), cursorOffset); + } + + public record PartitionResult( + long processedCount, + long successCount, + long failedCount, + boolean stopped, + String errorMessage) {} +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/jdbi3/CollectionDAO.java b/openmetadata-service/src/main/java/org/openmetadata/service/jdbi3/CollectionDAO.java index e0c85928ba19..8e7e9e54fa58 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/jdbi3/CollectionDAO.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/jdbi3/CollectionDAO.java @@ -489,6 +489,18 @@ public interface CollectionDAO { @CreateSqlObject SearchIndexServerStatsDAO searchIndexServerStatsDAO(); + @CreateSqlObject + RdfIndexJobDAO rdfIndexJobDAO(); + + @CreateSqlObject + RdfIndexPartitionDAO rdfIndexPartitionDAO(); + + @CreateSqlObject + RdfReindexLockDAO rdfReindexLockDAO(); + + @CreateSqlObject + RdfIndexServerStatsDAO rdfIndexServerStatsDAO(); + @CreateSqlObject AuditLogDAO auditLogDAO(); @@ -11221,6 +11233,669 @@ public EntityStats map(ResultSet rs, StatementContext ctx) throws SQLException { } } + /** DAO for distributed RDF index jobs. 
*/ + interface RdfIndexJobDAO { + + @ConnectionAwareSqlUpdate( + value = + "INSERT INTO rdf_index_job (id, status, jobConfiguration, totalRecords, processedRecords, " + + "successRecords, failedRecords, stats, createdBy, createdAt, updatedAt) " + + "VALUES (:id, :status, :jobConfiguration, :totalRecords, :processedRecords, " + + ":successRecords, :failedRecords, :stats, :createdBy, :createdAt, :updatedAt)", + connectionType = MYSQL) + @ConnectionAwareSqlUpdate( + value = + "INSERT INTO rdf_index_job (id, status, jobConfiguration, totalRecords, processedRecords, " + + "successRecords, failedRecords, stats, createdBy, createdAt, updatedAt) " + + "VALUES (:id, :status, :jobConfiguration::jsonb, :totalRecords, :processedRecords, " + + ":successRecords, :failedRecords, :stats::jsonb, :createdBy, :createdAt, :updatedAt)", + connectionType = POSTGRES) + void insert( + @Bind("id") String id, + @Bind("status") String status, + @Bind("jobConfiguration") String jobConfiguration, + @Bind("totalRecords") long totalRecords, + @Bind("processedRecords") long processedRecords, + @Bind("successRecords") long successRecords, + @Bind("failedRecords") long failedRecords, + @Bind("stats") String stats, + @Bind("createdBy") String createdBy, + @Bind("createdAt") long createdAt, + @Bind("updatedAt") long updatedAt); + + @ConnectionAwareSqlUpdate( + value = + "UPDATE rdf_index_job SET status = :status, processedRecords = :processedRecords, " + + "successRecords = :successRecords, failedRecords = :failedRecords, stats = :stats, " + + "startedAt = :startedAt, completedAt = :completedAt, updatedAt = :updatedAt, " + + "errorMessage = :errorMessage WHERE id = :id", + connectionType = MYSQL) + @ConnectionAwareSqlUpdate( + value = + "UPDATE rdf_index_job SET status = :status, processedRecords = :processedRecords, " + + "successRecords = :successRecords, failedRecords = :failedRecords, stats = :stats::jsonb, " + + "startedAt = :startedAt, completedAt = :completedAt, updatedAt = :updatedAt, " + + 
"errorMessage = :errorMessage WHERE id = :id", + connectionType = POSTGRES) + void update( + @Bind("id") String id, + @Bind("status") String status, + @Bind("processedRecords") long processedRecords, + @Bind("successRecords") long successRecords, + @Bind("failedRecords") long failedRecords, + @Bind("stats") String stats, + @Bind("startedAt") Long startedAt, + @Bind("completedAt") Long completedAt, + @Bind("updatedAt") long updatedAt, + @Bind("errorMessage") String errorMessage); + + @SqlUpdate("UPDATE rdf_index_job SET updatedAt = :updatedAt WHERE id = :id") + void touchJob(@Bind("id") String id, @Bind("updatedAt") long updatedAt); + + @SqlQuery("SELECT * FROM rdf_index_job WHERE id = :id") + @RegisterRowMapper(RdfIndexJobMapper.class) + RdfIndexJobRecord findById(@Bind("id") String id); + + @SqlQuery("SELECT * FROM rdf_index_job WHERE status IN (<statuses>) ORDER BY createdAt DESC") + @RegisterRowMapper(RdfIndexJobMapper.class) + List<RdfIndexJobRecord> findByStatuses(@BindList("statuses") List<String> statuses); + + @SqlQuery( + "SELECT * FROM rdf_index_job WHERE status IN (<statuses>) ORDER BY createdAt DESC LIMIT :limit") + @RegisterRowMapper(RdfIndexJobMapper.class) + List<RdfIndexJobRecord> findByStatusesWithLimit( + @BindList("statuses") List<String> statuses, @Bind("limit") int limit); + + @SqlQuery("SELECT id FROM rdf_index_job WHERE status IN ('READY', 'RUNNING', 'STOPPING')") + List<String> getRunningJobIds(); + + @SqlUpdate("DELETE FROM rdf_index_job") + void deleteAll(); + + class RdfIndexJobMapper implements RowMapper<RdfIndexJobRecord> { + @Override + public RdfIndexJobRecord map(ResultSet rs, StatementContext ctx) throws SQLException { + return new RdfIndexJobRecord( + rs.getString("id"), + rs.getString("status"), + rs.getString("jobConfiguration"), + rs.getLong("totalRecords"), + rs.getLong("processedRecords"), + rs.getLong("successRecords"), + rs.getLong("failedRecords"), + rs.getString("stats"), + rs.getString("createdBy"), + rs.getLong("createdAt"), + (Long) rs.getObject("startedAt"), + (Long) rs.getObject("completedAt"), +
rs.getLong("updatedAt"), + rs.getString("errorMessage")); + } + } + + record RdfIndexJobRecord( + String id, + String status, + String jobConfiguration, + long totalRecords, + long processedRecords, + long successRecords, + long failedRecords, + String stats, + String createdBy, + long createdAt, + Long startedAt, + Long completedAt, + long updatedAt, + String errorMessage) {} + } + + /** DAO for distributed RDF partitions. */ + interface RdfIndexPartitionDAO { + + @ConnectionAwareSqlUpdate( + value = + "INSERT INTO rdf_index_partition (id, jobId, entityType, partitionIndex, rangeStart, rangeEnd, " + + "estimatedCount, workUnits, priority, status, processingCursor, claimableAt) " + + "VALUES (:id, :jobId, :entityType, :partitionIndex, :rangeStart, :rangeEnd, " + + ":estimatedCount, :workUnits, :priority, :status, :cursor, :claimableAt)", + connectionType = MYSQL) + @ConnectionAwareSqlUpdate( + value = + "INSERT INTO rdf_index_partition (id, jobId, entityType, partitionIndex, rangeStart, rangeEnd, " + + "estimatedCount, workUnits, priority, status, processingCursor, claimableAt) " + + "VALUES (:id, :jobId, :entityType, :partitionIndex, :rangeStart, :rangeEnd, " + + ":estimatedCount, :workUnits, :priority, :status, :cursor, :claimableAt)", + connectionType = POSTGRES) + void insert( + @Bind("id") String id, + @Bind("jobId") String jobId, + @Bind("entityType") String entityType, + @Bind("partitionIndex") int partitionIndex, + @Bind("rangeStart") long rangeStart, + @Bind("rangeEnd") long rangeEnd, + @Bind("estimatedCount") long estimatedCount, + @Bind("workUnits") long workUnits, + @Bind("priority") int priority, + @Bind("status") String status, + @Bind("cursor") long cursor, + @Bind("claimableAt") long claimableAt); + + @SqlUpdate( + "UPDATE rdf_index_partition SET status = :status, processingCursor = :cursor, " + + "processedCount = :processedCount, successCount = :successCount, failedCount = :failedCount, " + + "assignedServer = :assignedServer, claimedAt = 
:claimedAt, startedAt = :startedAt, " + + "completedAt = :completedAt, lastUpdateAt = :lastUpdateAt, lastError = :lastError, " + + "retryCount = :retryCount WHERE id = :id") + void update( + @Bind("id") String id, + @Bind("status") String status, + @Bind("cursor") long cursor, + @Bind("processedCount") long processedCount, + @Bind("successCount") long successCount, + @Bind("failedCount") long failedCount, + @Bind("assignedServer") String assignedServer, + @Bind("claimedAt") Long claimedAt, + @Bind("startedAt") Long startedAt, + @Bind("completedAt") Long completedAt, + @Bind("lastUpdateAt") Long lastUpdateAt, + @Bind("lastError") String lastError, + @Bind("retryCount") int retryCount); + + @SqlUpdate( + "UPDATE rdf_index_partition SET processingCursor = :cursor, processedCount = :processedCount, " + + "successCount = :successCount, failedCount = :failedCount, lastUpdateAt = :lastUpdateAt " + + "WHERE id = :id") + void updateProgress( + @Bind("id") String id, + @Bind("cursor") long cursor, + @Bind("processedCount") long processedCount, + @Bind("successCount") long successCount, + @Bind("failedCount") long failedCount, + @Bind("lastUpdateAt") long lastUpdateAt); + + @SqlUpdate("UPDATE rdf_index_partition SET lastUpdateAt = :lastUpdateAt WHERE id = :id") + void updateHeartbeat(@Bind("id") String id, @Bind("lastUpdateAt") long lastUpdateAt); + + @SqlQuery("SELECT * FROM rdf_index_partition WHERE id = :id") + @RegisterRowMapper(RdfIndexPartitionMapper.class) + RdfIndexPartitionRecord findById(@Bind("id") String id); + + @SqlQuery( + "SELECT * FROM rdf_index_partition WHERE jobId = :jobId ORDER BY priority DESC, entityType, partitionIndex") + @RegisterRowMapper(RdfIndexPartitionMapper.class) + List findByJobId(@Bind("jobId") String jobId); + + @ConnectionAwareSqlUpdate( + value = + "UPDATE rdf_index_partition p " + + "JOIN (SELECT id FROM rdf_index_partition WHERE jobId = :jobId AND status = 'PENDING' " + + "AND claimableAt <= :now " + + "ORDER BY priority DESC, 
entityType, partitionIndex LIMIT 1 FOR UPDATE SKIP LOCKED) t ON p.id = t.id " + + "SET p.status = 'PROCESSING', p.assignedServer = :serverId, p.claimedAt = :now, " + + "p.startedAt = :now, p.lastUpdateAt = :now", + connectionType = MYSQL) + @ConnectionAwareSqlUpdate( + value = + "UPDATE rdf_index_partition SET status = 'PROCESSING', " + + "assignedServer = :serverId, claimedAt = :now, startedAt = :now, lastUpdateAt = :now " + + "WHERE id = (SELECT id FROM rdf_index_partition WHERE jobId = :jobId AND status = 'PENDING' " + + "AND claimableAt <= :now " + + "ORDER BY priority DESC, entityType, partitionIndex LIMIT 1 FOR UPDATE SKIP LOCKED)", + connectionType = POSTGRES) + int claimNextPartitionAtomic( + @Bind("jobId") String jobId, @Bind("serverId") String serverId, @Bind("now") long now); + + @SqlQuery( + "SELECT * FROM rdf_index_partition WHERE jobId = :jobId AND status = 'PROCESSING' " + + "AND assignedServer = :serverId AND claimedAt = :claimedAt " + + "ORDER BY priority DESC, entityType, partitionIndex LIMIT 1") + @RegisterRowMapper(RdfIndexPartitionMapper.class) + RdfIndexPartitionRecord findLatestClaimedPartition( + @Bind("jobId") String jobId, + @Bind("serverId") String serverId, + @Bind("claimedAt") long claimedAt); + + @SqlUpdate( + "UPDATE rdf_index_partition SET status = 'PENDING', assignedServer = NULL, claimedAt = NULL, " + + "retryCount = retryCount + 1, lastError = 'Reclaimed due to stale heartbeat' " + + "WHERE jobId = :jobId AND status = 'PROCESSING' AND lastUpdateAt < :staleThreshold " + + "AND retryCount < :maxRetries") + int reclaimStalePartitionsForRetry( + @Bind("jobId") String jobId, + @Bind("staleThreshold") long staleThreshold, + @Bind("maxRetries") int maxRetries); + + @SqlUpdate( + "UPDATE rdf_index_partition SET status = 'FAILED', " + + "lastError = 'Exceeded max retries after stale heartbeat', completedAt = :now " + + "WHERE jobId = :jobId AND status = 'PROCESSING' AND lastUpdateAt < :staleThreshold " + + "AND retryCount >= :maxRetries") 
+ int failStalePartitionsExceedingRetries( + @Bind("jobId") String jobId, + @Bind("staleThreshold") long staleThreshold, + @Bind("maxRetries") int maxRetries, + @Bind("now") long now); + + @SqlUpdate( + "UPDATE rdf_index_partition SET status = 'CANCELLED' WHERE jobId = :jobId AND status = 'PENDING'") + int cancelPendingPartitions(@Bind("jobId") String jobId); + + @SqlUpdate( + "UPDATE rdf_index_partition SET status = :status, assignedServer = NULL, claimedAt = NULL, " + + "lastError = :reason, lastUpdateAt = :updatedAt, completedAt = :completedAt " + + "WHERE jobId = :jobId AND status = 'PROCESSING' AND assignedServer = :serverId") + int releaseProcessingPartitions( + @Bind("jobId") String jobId, + @Bind("serverId") String serverId, + @Bind("status") String status, + @Bind("reason") String reason, + @Bind("updatedAt") long updatedAt, + @Bind("completedAt") Long completedAt); + + @SqlQuery( + "SELECT entityType, " + + "SUM(estimatedCount) as totalRecords, " + + "SUM(processedCount) as processedRecords, " + + "SUM(successCount) as successRecords, " + + "SUM(failedCount) as failedRecords, " + + "COUNT(*) as totalPartitions, " + + "SUM(CASE WHEN status = 'COMPLETED' THEN 1 ELSE 0 END) as completedPartitions, " + + "SUM(CASE WHEN status = 'FAILED' THEN 1 ELSE 0 END) as failedPartitions " + + "FROM rdf_index_partition WHERE jobId = :jobId GROUP BY entityType") + @RegisterRowMapper(RdfEntityStatsMapper.class) + List getEntityStats(@Bind("jobId") String jobId); + + @SqlQuery( + "SELECT " + + "SUM(estimatedCount) as totalRecords, " + + "SUM(processedCount) as processedRecords, " + + "SUM(successCount) as successRecords, " + + "SUM(failedCount) as failedRecords, " + + "COUNT(*) as totalPartitions, " + + "SUM(CASE WHEN status = 'COMPLETED' THEN 1 ELSE 0 END) as completedPartitions, " + + "SUM(CASE WHEN status = 'FAILED' THEN 1 ELSE 0 END) as failedPartitions, " + + "SUM(CASE WHEN status = 'PENDING' THEN 1 ELSE 0 END) as pendingPartitions, " + + "SUM(CASE WHEN status = 
'PROCESSING' THEN 1 ELSE 0 END) as processingPartitions " + + "FROM rdf_index_partition WHERE jobId = :jobId") + @RegisterRowMapper(RdfAggregatedStatsMapper.class) + RdfAggregatedStatsRecord getAggregatedStats(@Bind("jobId") String jobId); + + @SqlQuery( + "SELECT assignedServer, " + + "SUM(processedCount) as processedRecords, " + + "SUM(successCount) as successRecords, " + + "SUM(failedCount) as failedRecords, " + + "COUNT(*) as totalPartitions, " + + "SUM(CASE WHEN status = 'COMPLETED' THEN 1 ELSE 0 END) as completedPartitions, " + + "SUM(CASE WHEN status = 'PROCESSING' THEN 1 ELSE 0 END) as processingPartitions " + + "FROM rdf_index_partition WHERE jobId = :jobId AND assignedServer IS NOT NULL " + + "GROUP BY assignedServer") + @RegisterRowMapper(RdfServerStatsMapper.class) + List getServerStats(@Bind("jobId") String jobId); + + @SqlQuery( + "SELECT DISTINCT assignedServer FROM rdf_index_partition " + + "WHERE jobId = :jobId AND assignedServer IS NOT NULL") + List getAssignedServers(@Bind("jobId") String jobId); + + @SqlUpdate("DELETE FROM rdf_index_partition") + void deleteAll(); + + class RdfIndexPartitionMapper implements RowMapper { + @Override + public RdfIndexPartitionRecord map(ResultSet rs, StatementContext ctx) throws SQLException { + return new RdfIndexPartitionRecord( + rs.getString("id"), + rs.getString("jobId"), + rs.getString("entityType"), + rs.getInt("partitionIndex"), + rs.getLong("rangeStart"), + rs.getLong("rangeEnd"), + rs.getLong("estimatedCount"), + rs.getLong("workUnits"), + rs.getInt("priority"), + rs.getString("status"), + rs.getLong("processingCursor"), + rs.getLong("processedCount"), + rs.getLong("successCount"), + rs.getLong("failedCount"), + rs.getString("assignedServer"), + (Long) rs.getObject("claimedAt"), + (Long) rs.getObject("startedAt"), + (Long) rs.getObject("completedAt"), + (Long) rs.getObject("lastUpdateAt"), + rs.getString("lastError"), + rs.getInt("retryCount"), + rs.getLong("claimableAt")); + } + } + + class 
RdfEntityStatsMapper implements RowMapper { + @Override + public RdfEntityStatsRecord map(ResultSet rs, StatementContext ctx) throws SQLException { + return new RdfEntityStatsRecord( + rs.getString("entityType"), + rs.getLong("totalRecords"), + rs.getLong("processedRecords"), + rs.getLong("successRecords"), + rs.getLong("failedRecords"), + rs.getInt("totalPartitions"), + rs.getInt("completedPartitions"), + rs.getInt("failedPartitions")); + } + } + + class RdfAggregatedStatsMapper implements RowMapper { + @Override + public RdfAggregatedStatsRecord map(ResultSet rs, StatementContext ctx) throws SQLException { + return new RdfAggregatedStatsRecord( + rs.getLong("totalRecords"), + rs.getLong("processedRecords"), + rs.getLong("successRecords"), + rs.getLong("failedRecords"), + rs.getInt("totalPartitions"), + rs.getInt("completedPartitions"), + rs.getInt("failedPartitions"), + rs.getInt("pendingPartitions"), + rs.getInt("processingPartitions")); + } + } + + class RdfServerStatsMapper implements RowMapper { + @Override + public RdfServerPartitionStatsRecord map(ResultSet rs, StatementContext ctx) + throws SQLException { + return new RdfServerPartitionStatsRecord( + rs.getString("assignedServer"), + rs.getLong("processedRecords"), + rs.getLong("successRecords"), + rs.getLong("failedRecords"), + rs.getInt("totalPartitions"), + rs.getInt("completedPartitions"), + rs.getInt("processingPartitions")); + } + } + + record RdfIndexPartitionRecord( + String id, + String jobId, + String entityType, + int partitionIndex, + long rangeStart, + long rangeEnd, + long estimatedCount, + long workUnits, + int priority, + String status, + long cursor, + long processedCount, + long successCount, + long failedCount, + String assignedServer, + Long claimedAt, + Long startedAt, + Long completedAt, + Long lastUpdateAt, + String lastError, + int retryCount, + long claimableAt) {} + + record RdfEntityStatsRecord( + String entityType, + long totalRecords, + long processedRecords, + long 
successRecords, + long failedRecords, + int totalPartitions, + int completedPartitions, + int failedPartitions) {} + + record RdfAggregatedStatsRecord( + long totalRecords, + long processedRecords, + long successRecords, + long failedRecords, + int totalPartitions, + int completedPartitions, + int failedPartitions, + int pendingPartitions, + int processingPartitions) {} + + record RdfServerPartitionStatsRecord( + String serverId, + long processedRecords, + long successRecords, + long failedRecords, + int totalPartitions, + int completedPartitions, + int processingPartitions) {} + } + + /** DAO for RDF distributed reindex lock. */ + interface RdfReindexLockDAO { + + @ConnectionAwareSqlUpdate( + value = + "INSERT IGNORE INTO rdf_reindex_lock (lockKey, jobId, serverId, acquiredAt, lastHeartbeat, expiresAt) " + + "VALUES (:lockKey, :jobId, :serverId, :acquiredAt, :lastHeartbeat, :expiresAt)", + connectionType = MYSQL) + @ConnectionAwareSqlUpdate( + value = + "INSERT INTO rdf_reindex_lock (lockKey, jobId, serverId, acquiredAt, lastHeartbeat, expiresAt) " + + "VALUES (:lockKey, :jobId, :serverId, :acquiredAt, :lastHeartbeat, :expiresAt) " + + "ON CONFLICT (lockKey) DO NOTHING", + connectionType = POSTGRES) + int insertIfNotExists( + @Bind("lockKey") String lockKey, + @Bind("jobId") String jobId, + @Bind("serverId") String serverId, + @Bind("acquiredAt") long acquiredAt, + @Bind("lastHeartbeat") long lastHeartbeat, + @Bind("expiresAt") long expiresAt); + + @SqlUpdate( + "UPDATE rdf_reindex_lock SET lastHeartbeat = :lastHeartbeat, expiresAt = :expiresAt " + + "WHERE lockKey = :lockKey AND jobId = :jobId") + int updateHeartbeat( + @Bind("lockKey") String lockKey, + @Bind("jobId") String jobId, + @Bind("lastHeartbeat") long lastHeartbeat, + @Bind("expiresAt") long expiresAt); + + @SqlQuery("SELECT * FROM rdf_reindex_lock WHERE lockKey = :lockKey") + @RegisterRowMapper(RdfReindexLockMapper.class) + RdfReindexLockRecord findByKey(@Bind("lockKey") String lockKey); + + 
@SqlUpdate("DELETE FROM rdf_reindex_lock WHERE lockKey = :lockKey") + void delete(@Bind("lockKey") String lockKey); + + @SqlUpdate("DELETE FROM rdf_reindex_lock WHERE lockKey = :lockKey AND jobId = :jobId") + int deleteByKeyAndJob(@Bind("lockKey") String lockKey, @Bind("jobId") String jobId); + + @SqlUpdate("DELETE FROM rdf_reindex_lock WHERE expiresAt < :now") + int deleteExpiredLocks(@Bind("now") long now); + + @SqlUpdate( + "UPDATE rdf_reindex_lock SET jobId = :toJobId, serverId = :serverId, " + + "lastHeartbeat = :heartbeat, expiresAt = :expiresAt " + + "WHERE lockKey = :lockKey AND jobId = :fromJobId") + int updateLockOwner( + @Bind("lockKey") String lockKey, + @Bind("fromJobId") String fromJobId, + @Bind("toJobId") String toJobId, + @Bind("serverId") String serverId, + @Bind("heartbeat") long heartbeat, + @Bind("expiresAt") long expiresAt); + + default boolean tryAcquireLock( + String lockKey, String jobId, String serverId, long acquiredAt, long expiresAt) { + deleteExpiredLocks(System.currentTimeMillis()); + int inserted = insertIfNotExists(lockKey, jobId, serverId, acquiredAt, acquiredAt, expiresAt); + if (inserted > 0) { + return true; + } + + RdfReindexLockRecord existing = findByKey(lockKey); + if (existing != null && existing.isExpired()) { + delete(lockKey); + inserted = insertIfNotExists(lockKey, jobId, serverId, acquiredAt, acquiredAt, expiresAt); + return inserted > 0; + } + return false; + } + + default void releaseLock(String lockKey, String jobId) { + deleteByKeyAndJob(lockKey, jobId); + } + + default boolean transferLock( + String lockKey, + String fromJobId, + String toJobId, + String serverId, + long heartbeat, + long expiresAt) { + return updateLockOwner(lockKey, fromJobId, toJobId, serverId, heartbeat, expiresAt) > 0; + } + + class RdfReindexLockMapper implements RowMapper { + @Override + public RdfReindexLockRecord map(ResultSet rs, StatementContext ctx) throws SQLException { + return new RdfReindexLockRecord( + rs.getString("lockKey"), + 
rs.getString("jobId"), + rs.getString("serverId"), + rs.getLong("acquiredAt"), + rs.getLong("lastHeartbeat"), + rs.getLong("expiresAt")); + } + } + + record RdfReindexLockRecord( + String lockKey, + String jobId, + String serverId, + long acquiredAt, + long lastHeartbeat, + long expiresAt) { + + public boolean isExpired() { + return System.currentTimeMillis() > expiresAt; + } + } + } + + /** DAO for RDF per-server distributed stats. */ + interface RdfIndexServerStatsDAO { + + record ServerStatsRecord( + String id, + String jobId, + String serverId, + String entityType, + long processedRecords, + long successRecords, + long failedRecords, + int partitionsCompleted, + int partitionsFailed, + long lastUpdatedAt) {} + + record AggregatedServerStats( + long processedRecords, + long successRecords, + long failedRecords, + int partitionsCompleted, + int partitionsFailed) {} + + @ConnectionAwareSqlUpdate( + value = + "INSERT INTO rdf_index_server_stats (id, jobId, serverId, entityType, processedRecords, " + + "successRecords, failedRecords, partitionsCompleted, partitionsFailed, lastUpdatedAt) " + + "VALUES (:id, :jobId, :serverId, :entityType, :processedRecords, :successRecords, " + + ":failedRecords, :partitionsCompleted, :partitionsFailed, :lastUpdatedAt) " + + "ON DUPLICATE KEY UPDATE " + + "processedRecords = processedRecords + VALUES(processedRecords), " + + "successRecords = successRecords + VALUES(successRecords), " + + "failedRecords = failedRecords + VALUES(failedRecords), " + + "partitionsCompleted = partitionsCompleted + VALUES(partitionsCompleted), " + + "partitionsFailed = partitionsFailed + VALUES(partitionsFailed), " + + "lastUpdatedAt = VALUES(lastUpdatedAt)", + connectionType = MYSQL) + @ConnectionAwareSqlUpdate( + value = + "INSERT INTO rdf_index_server_stats (id, jobId, serverId, entityType, processedRecords, " + + "successRecords, failedRecords, partitionsCompleted, partitionsFailed, lastUpdatedAt) " + + "VALUES (:id, :jobId, :serverId, :entityType, 
:processedRecords, :successRecords, " + + ":failedRecords, :partitionsCompleted, :partitionsFailed, :lastUpdatedAt) " + + "ON CONFLICT (jobId, serverId, entityType) DO UPDATE SET " + + "processedRecords = rdf_index_server_stats.processedRecords + EXCLUDED.processedRecords, " + + "successRecords = rdf_index_server_stats.successRecords + EXCLUDED.successRecords, " + + "failedRecords = rdf_index_server_stats.failedRecords + EXCLUDED.failedRecords, " + + "partitionsCompleted = rdf_index_server_stats.partitionsCompleted + EXCLUDED.partitionsCompleted, " + + "partitionsFailed = rdf_index_server_stats.partitionsFailed + EXCLUDED.partitionsFailed, " + + "lastUpdatedAt = EXCLUDED.lastUpdatedAt", + connectionType = POSTGRES) + void incrementStats( + @Bind("id") String id, + @Bind("jobId") String jobId, + @Bind("serverId") String serverId, + @Bind("entityType") String entityType, + @Bind("processedRecords") long processedRecords, + @Bind("successRecords") long successRecords, + @Bind("failedRecords") long failedRecords, + @Bind("partitionsCompleted") int partitionsCompleted, + @Bind("partitionsFailed") int partitionsFailed, + @Bind("lastUpdatedAt") long lastUpdatedAt); + + @SqlQuery("SELECT * FROM rdf_index_server_stats WHERE jobId = :jobId") + @RegisterRowMapper(RdfServerStatsRecordMapper.class) + List findByJobId(@Bind("jobId") String jobId); + + @SqlQuery( + "SELECT " + + "COALESCE(SUM(processedRecords), 0) as processedRecords, " + + "COALESCE(SUM(successRecords), 0) as successRecords, " + + "COALESCE(SUM(failedRecords), 0) as failedRecords, " + + "COALESCE(SUM(partitionsCompleted), 0) as partitionsCompleted, " + + "COALESCE(SUM(partitionsFailed), 0) as partitionsFailed " + + "FROM rdf_index_server_stats WHERE jobId = :jobId") + @RegisterRowMapper(RdfAggregatedServerStatsMapper.class) + AggregatedServerStats getAggregatedStats(@Bind("jobId") String jobId); + + @SqlUpdate("DELETE FROM rdf_index_server_stats") + void deleteAll(); + + class RdfServerStatsRecordMapper 
implements RowMapper { + @Override + public ServerStatsRecord map(ResultSet rs, StatementContext ctx) throws SQLException { + return new ServerStatsRecord( + rs.getString("id"), + rs.getString("jobId"), + rs.getString("serverId"), + rs.getString("entityType"), + rs.getLong("processedRecords"), + rs.getLong("successRecords"), + rs.getLong("failedRecords"), + rs.getInt("partitionsCompleted"), + rs.getInt("partitionsFailed"), + rs.getLong("lastUpdatedAt")); + } + } + + class RdfAggregatedServerStatsMapper implements RowMapper { + @Override + public AggregatedServerStats map(ResultSet rs, StatementContext ctx) throws SQLException { + return new AggregatedServerStats( + rs.getLong("processedRecords"), + rs.getLong("successRecords"), + rs.getLong("failedRecords"), + rs.getInt("partitionsCompleted"), + rs.getInt("partitionsFailed")); + } + } + } + @RegisterRowMapper(AuditLogRecordMapper.class) interface AuditLogDAO { @ConnectionAwareSqlUpdate( diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/RdfRepository.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/RdfRepository.java index 308645ca3130..e5ac76e09cb9 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/RdfRepository.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/RdfRepository.java @@ -4,9 +4,12 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.UUID; @@ -754,7 +757,13 @@ private String getJenaFormat(String mimeType) { return "TURTLE"; // default } - public String getEntityGraph(UUID entityId, String entityType, int depth) throws IOException { + public String getEntityGraph( + UUID entityId, + String entityType, + int depth, + Set entityTypes, + Set 
relationshipTypes) + throws IOException { if (!isEnabled()) { throw new IllegalStateException("RDF Repository is not enabled"); } @@ -762,35 +771,21 @@ public String getEntityGraph(UUID entityId, String entityType, int depth) throws String entityUri = config.getBaseUri().toString() + "entity/" + entityType + "/" + entityId; try { - Set visitedNodes = new HashSet<>(); - Set currentLevelNodes = new HashSet<>(); - List allEdges = new ArrayList<>(); - - currentLevelNodes.add(entityUri); - visitedNodes.add(entityUri); - - for (int currentDepth = 0; - currentDepth < depth && !currentLevelNodes.isEmpty(); - currentDepth++) { - Set nextLevelNodes = new HashSet<>(); - - // For each node at current level, get its relationships - for (String nodeUri : currentLevelNodes) { - String sparql = buildSingleNodeQuery(nodeUri); - String results = - storageService.executeSparqlQuery(sparql, "application/sparql-results+json"); - - if (results != null && !results.trim().isEmpty()) { - List edges = parseEdgesFromResults(results, visitedNodes, nextLevelNodes); - allEdges.addAll(edges); - } - } - - currentLevelNodes = nextLevelNodes; - visitedNodes.addAll(nextLevelNodes); - } - - return convertEdgesToGraphData(allEdges); + EntityGraphTraversalResult traversalResult = traverseEntityGraph(entityUri, depth); + FilteredEntityGraph filteredGraph = + applyGraphFilters( + entityUri, + traversalResult.nodeUris(), + traversalResult.edges(), + entityTypes, + relationshipTypes); + + return convertEdgesToGraphData( + entityUri, + filteredGraph.nodeUris(), + filteredGraph.edges(), + buildEntityTypeFilterOptions(traversalResult.nodeUris()), + buildRelationshipFilterOptions(traversalResult.edges())); } catch (Exception e) { LOG.error("Error getting entity graph for {}", entityUri, e); throw new IOException("Failed to get entity graph", e); @@ -1494,56 +1489,126 @@ private String formatRelationshipLabel(String relationship) { }; } - private String buildSingleNodeQuery(String nodeUri) { + private 
EntityGraphTraversalResult traverseEntityGraph(String rootUri, int depth) { + Set visitedNodes = new HashSet<>(); + Set currentLevelNodes = new HashSet<>(); + Set discoveredNodes = new HashSet<>(); + Set edgeKeys = new HashSet<>(); + List allEdges = new ArrayList<>(); + + currentLevelNodes.add(rootUri); + visitedNodes.add(rootUri); + discoveredNodes.add(rootUri); + + for (int currentDepth = 0; + currentDepth < depth && !currentLevelNodes.isEmpty(); + currentDepth++) { + String sparql = buildEntityGraphBatchQuery(currentLevelNodes); + String results = storageService.executeSparqlQuery(sparql, "application/sparql-results+json"); + + Set nextLevelNodes = new HashSet<>(); + if (results != null && !results.trim().isEmpty()) { + allEdges.addAll( + parseEntityGraphEdgesFromResults( + results, visitedNodes, nextLevelNodes, discoveredNodes, edgeKeys)); + } + + nextLevelNodes.removeAll(visitedNodes); + visitedNodes.addAll(nextLevelNodes); + currentLevelNodes = nextLevelNodes; + } + + return new EntityGraphTraversalResult(discoveredNodes, allEdges); + } + + private String buildEntityGraphBatchQuery(Set nodeUris) { + String entityPrefix = config.getBaseUri().toString() + "entity/"; + String valuesClause = + nodeUris.stream() + .sorted() + .map(uri -> "<" + uri + ">") + .collect(java.util.stream.Collectors.joining(" ")); + return "PREFIX om: " + "PREFIX rdfs: " + "PREFIX rdf: " + "SELECT DISTINCT ?subject ?predicate ?object WHERE { " + " { " - + " GRAPH ?g { <" - + nodeUri - + "> ?predicate ?object . " - + " FILTER(isIRI(?object) && " - + " ?predicate != rdf:type && " - + " ?predicate != rdfs:label) } " - + " BIND(<" - + nodeUri - + "> AS ?subject) " + + " VALUES ?frontier { " + + valuesClause + + " } " + + " GRAPH ?g { " + + " ?frontier ?predicate ?object . 
" + + " FILTER(isIRI(?object) && " + + " STRSTARTS(STR(?object), \"" + + entityPrefix + + "\") && " + + " ?predicate != rdf:type && " + + " ?predicate != rdfs:label) " + + " } " + + " BIND(?frontier AS ?subject) " + " } UNION { " - + " GRAPH ?g { ?subject ?predicate <" - + nodeUri - + "> . " - + " FILTER(isIRI(?subject) && " - + " ?predicate != rdf:type && " - + " ?predicate != rdfs:label) } " - + " BIND(<" - + nodeUri - + "> AS ?object) " + + " VALUES ?frontier { " + + valuesClause + + " } " + + " GRAPH ?g { " + + " ?subject ?predicate ?frontier . " + + " FILTER(isIRI(?subject) && " + + " STRSTARTS(STR(?subject), \"" + + entityPrefix + + "\") && " + + " ?predicate != rdf:type && " + + " ?predicate != rdfs:label) " + + " } " + + " BIND(?frontier AS ?object) " + " } " - + "} LIMIT 200"; + + "} LIMIT 5000"; } - private List parseEdgesFromResults( - String sparqlResults, Set visitedNodes, Set nextLevelNodes) { + private List parseEntityGraphEdgesFromResults( + String sparqlResults, + Set visitedNodes, + Set nextLevelNodes, + Set discoveredNodes, + Set edgeKeys) { List edges = new ArrayList<>(); com.fasterxml.jackson.databind.JsonNode resultsJson = JsonUtils.readTree(sparqlResults); if (resultsJson.has("results") && resultsJson.get("results").has("bindings")) { for (com.fasterxml.jackson.databind.JsonNode binding : resultsJson.get("results").get("bindings")) { - String subjectUri = binding.get("subject").get("value").asText(); - String objectUri = binding.get("object").get("value").asText(); - String predicate = binding.get("predicate").get("value").asText(); + String subjectUri = + binding.has("subject") ? binding.get("subject").get("value").asText() : null; + String objectUri = + binding.has("object") ? binding.get("object").get("value").asText() : null; + String predicate = + binding.has("predicate") ? 
binding.get("predicate").get("value").asText() : null; + + if (!isEntityUri(subjectUri) || !isEntityUri(objectUri)) { + continue; + } - EdgeInfo edge = new EdgeInfo(subjectUri, objectUri, extractPredicateName(predicate)); - edges.add(edge); + String relationType = extractEntityRelationType(predicate); + if (relationType == null || relationType.isBlank()) { + continue; + } - if (!visitedNodes.contains(objectUri)) { - nextLevelNodes.add(objectUri); + String edgeKey = subjectUri + "|" + relationType + "|" + objectUri; + if (!edgeKeys.add(edgeKey)) { + continue; } + + EdgeInfo edge = new EdgeInfo(subjectUri, objectUri, relationType); + edges.add(edge); + discoveredNodes.add(subjectUri); + discoveredNodes.add(objectUri); + if (!visitedNodes.contains(subjectUri)) { nextLevelNodes.add(subjectUri); } + if (!visitedNodes.contains(objectUri)) { + nextLevelNodes.add(objectUri); + } } } @@ -1562,52 +1627,274 @@ private static class EdgeInfo { } } - private String convertEdgesToGraphData(List edges) { + private FilteredEntityGraph applyGraphFilters( + String rootUri, + Set nodeUris, + List edges, + Set entityTypeFilters, + Set relationshipTypeFilters) { + if ((entityTypeFilters == null || entityTypeFilters.isEmpty()) + && (relationshipTypeFilters == null || relationshipTypeFilters.isEmpty())) { + return new FilteredEntityGraph(new HashSet<>(nodeUris), edges); + } + + Set normalizedEntityFilters = new HashSet<>(); + if (entityTypeFilters != null) { + entityTypeFilters.stream() + .map(this::normalizeEntityTypeFilter) + .filter(value -> !value.isBlank()) + .forEach(normalizedEntityFilters::add); + } + + Set normalizedRelationshipFilters = new HashSet<>(); + if (relationshipTypeFilters != null) { + relationshipTypeFilters.stream() + .map(this::normalizeRelationTypeFilter) + .filter(value -> !value.isBlank()) + .forEach(normalizedRelationshipFilters::add); + } + + Set allowedNodes = new HashSet<>(); + for (String nodeUri : nodeUris) { + if (rootUri.equals(nodeUri) + || 
normalizedEntityFilters.isEmpty() + || normalizedEntityFilters.contains( + normalizeEntityTypeFilter(extractEntityTypeFromUri(nodeUri)))) { + allowedNodes.add(nodeUri); + } + } + + List filteredEdges = new ArrayList<>(); + Set connectedNodes = new HashSet<>(); + connectedNodes.add(rootUri); + + for (EdgeInfo edge : edges) { + boolean relationshipAllowed = + normalizedRelationshipFilters.isEmpty() + || normalizedRelationshipFilters.contains(normalizeRelationTypeFilter(edge.relation)); + if (!relationshipAllowed) { + continue; + } + + if (!allowedNodes.contains(edge.fromUri) || !allowedNodes.contains(edge.toUri)) { + continue; + } + + filteredEdges.add(edge); + connectedNodes.add(edge.fromUri); + connectedNodes.add(edge.toUri); + } + + Set filteredNodes = new HashSet<>(); + for (String nodeUri : allowedNodes) { + if (rootUri.equals(nodeUri) || connectedNodes.contains(nodeUri)) { + filteredNodes.add(nodeUri); + } + } + filteredNodes.add(rootUri); + + return new FilteredEntityGraph(filteredNodes, filteredEdges); + } + + private List buildEntityTypeFilterOptions(Set nodeUris) { + Map counts = new LinkedHashMap<>(); + for (String nodeUri : nodeUris) { + String entityType = extractEntityTypeFromUri(nodeUri); + counts.merge(entityType, 1, Integer::sum); + } + return buildFilterOptions(counts); + } + + private List buildRelationshipFilterOptions(List edges) { + Map counts = new LinkedHashMap<>(); + for (EdgeInfo edge : edges) { + counts.merge(edge.relation, 1, Integer::sum); + } + return buildFilterOptions(counts); + } + + private List buildFilterOptions(Map counts) { + return counts.entrySet().stream() + .map( + entry -> + new FilterOptionInfo( + entry.getKey(), formatRelationshipLabel(entry.getKey()), entry.getValue())) + .sorted( + Comparator.comparingInt(FilterOptionInfo::count) + .reversed() + .thenComparing(FilterOptionInfo::label)) + .toList(); + } + + private String convertEdgesToGraphData( + String rootUri, + Set nodeUris, + List edges, + List entityTypeOptions, + 
List relationshipTypeOptions) { com.fasterxml.jackson.databind.node.ObjectNode graphData = JsonUtils.getObjectMapper().createObjectNode(); com.fasterxml.jackson.databind.node.ArrayNode nodes = JsonUtils.getObjectMapper().createArrayNode(); com.fasterxml.jackson.databind.node.ArrayNode graphEdges = JsonUtils.getObjectMapper().createArrayNode(); + com.fasterxml.jackson.databind.node.ObjectNode filterOptions = + JsonUtils.getObjectMapper().createObjectNode(); + com.fasterxml.jackson.databind.node.ArrayNode entityTypeFilterOptions = + JsonUtils.getObjectMapper().createArrayNode(); + com.fasterxml.jackson.databind.node.ArrayNode relationshipTypeFilterOptions = + JsonUtils.getObjectMapper().createArrayNode(); - Set addedNodes = new HashSet<>(); Map nodeMap = new HashMap<>(); - for (EdgeInfo edge : edges) { - String fromUri = edge.fromUri; - String toUri = edge.toUri; - - if (!addedNodes.contains(fromUri)) { - com.fasterxml.jackson.databind.node.ObjectNode fromNode = createNodeFromUri(fromUri); - nodes.add(fromNode); - nodeMap.put(fromUri, fromNode); - addedNodes.add(fromUri); - } + List orderedNodeUris = + nodeUris.stream() + .sorted( + Comparator.comparing((String uri) -> !rootUri.equals(uri)) + .thenComparing(this::extractEntityTypeFromUri) + .thenComparing(uri -> uri)) + .toList(); - if (!addedNodes.contains(toUri)) { - com.fasterxml.jackson.databind.node.ObjectNode toNode = createNodeFromUri(toUri); - nodes.add(toNode); - nodeMap.put(toUri, toNode); - addedNodes.add(toUri); - } + for (String nodeUri : orderedNodeUris) { + com.fasterxml.jackson.databind.node.ObjectNode node = createNodeFromUri(nodeUri); + nodes.add(node); + nodeMap.put(nodeUri, node); + } + for (EdgeInfo edge : edges) { com.fasterxml.jackson.databind.node.ObjectNode graphEdge = JsonUtils.getObjectMapper().createObjectNode(); - graphEdge.put("from", fromUri); - graphEdge.put("to", toUri); + graphEdge.put("from", edge.fromUri); + graphEdge.put("to", edge.toUri); graphEdge.put("label", 
formatRelationshipLabel(edge.relation)); + graphEdge.put("relationType", edge.relation); graphEdge.put("arrows", "to"); graphEdges.add(graphEdge); } + for (FilterOptionInfo filterOption : entityTypeOptions) { + entityTypeFilterOptions.add(createFilterOptionNode(filterOption)); + } + + for (FilterOptionInfo filterOption : relationshipTypeOptions) { + relationshipTypeFilterOptions.add(createFilterOptionNode(filterOption)); + } + enhanceNodesWithEntityDetails(nodeMap); graphData.set("nodes", nodes); graphData.set("edges", graphEdges); + graphData.put("totalNodes", nodes.size()); + graphData.put("totalEdges", graphEdges.size()); + graphData.put("source", "rdf"); + + filterOptions.set("entityTypes", entityTypeFilterOptions); + filterOptions.set("relationshipTypes", relationshipTypeFilterOptions); + graphData.set("filterOptions", filterOptions); return JsonUtils.pojoToJson(graphData); } + private com.fasterxml.jackson.databind.node.ObjectNode createFilterOptionNode( + FilterOptionInfo filterOption) { + com.fasterxml.jackson.databind.node.ObjectNode option = + JsonUtils.getObjectMapper().createObjectNode(); + option.put("id", filterOption.id()); + option.put("label", filterOption.label()); + option.put("count", filterOption.count()); + return option; + } + + private boolean isEntityUri(String uri) { + if (uri == null || !uri.startsWith(config.getBaseUri().toString() + "entity/")) { + return false; + } + String[] parts = uri.split("/entity/")[1].split("/"); + return parts.length >= 2 && !parts[0].isBlank() && !parts[1].isBlank(); + } + + private String extractEntityRelationType(String predicateUri) { + if (predicateUri == null || predicateUri.isBlank()) { + return null; + } + + String localName = extractUriLocalName(predicateUri); + if (localName == null || localName.isBlank()) { + return null; + } + + String normalized = localName.replaceAll("[^A-Za-z0-9]", "").toLowerCase(Locale.ROOT); + return switch (normalized) { + case "used" -> "uses"; + case "wasderivedfrom", 
"upstream" -> "upstream"; + case "wasinfluencedby", "downstream" -> "downstream"; + case "wasgeneratedby" -> "processedBy"; + default -> toCanonicalIdentifier(localName); + }; + } + + private String normalizeEntityTypeFilter(String entityType) { + return entityType == null ? "" : entityType.trim().toLowerCase(Locale.ROOT); + } + + private String normalizeRelationTypeFilter(String relationType) { + String canonical = toCanonicalIdentifier(relationType); + return canonical == null ? "" : canonical.toLowerCase(Locale.ROOT); + } + + private String extractUriLocalName(String uri) { + if (uri == null || uri.isBlank()) { + return null; + } + if (uri.contains("#")) { + return uri.substring(uri.lastIndexOf('#') + 1); + } + if (uri.contains("/")) { + return uri.substring(uri.lastIndexOf('/') + 1); + } + return uri; + } + + private String toCanonicalIdentifier(String value) { + String localName = extractUriLocalName(value); + if (localName == null || localName.isBlank()) { + return null; + } + + if (localName.equals(localName.toUpperCase(Locale.ROOT))) { + return localName.toLowerCase(Locale.ROOT); + } + + if (localName.matches("[a-z]+([A-Z][a-z0-9]+)+")) { + return Character.toLowerCase(localName.charAt(0)) + localName.substring(1); + } + + String spaced = + localName.replaceAll("([a-z0-9])([A-Z])", "$1 $2").replace('_', ' ').replace('-', ' '); + String[] parts = spaced.trim().split("\\s+"); + if (parts.length == 0) { + return localName; + } + + StringBuilder builder = new StringBuilder(parts[0].toLowerCase(Locale.ROOT)); + for (int i = 1; i < parts.length; i++) { + if (parts[i].isBlank()) { + continue; + } + String normalizedPart = parts[i].toLowerCase(Locale.ROOT); + builder + .append(Character.toUpperCase(normalizedPart.charAt(0))) + .append(normalizedPart.substring(1)); + } + return builder.toString(); + } + + private record EntityGraphTraversalResult(Set nodeUris, List edges) {} + + private record FilteredEntityGraph(Set nodeUris, List edges) {} + + private record 
FilterOptionInfo(String id, String label, int count) {} + public void executeSparqlUpdate(String update) { if (!isEnabled()) { throw new IllegalStateException("RDF not enabled"); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfPropertyMapper.java b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfPropertyMapper.java index b69dd1a3d363..810b6cb5d3e5 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfPropertyMapper.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/rdf/translator/RdfPropertyMapper.java @@ -43,7 +43,10 @@ public class RdfPropertyMapper { // Properties that should be mapped to structured RDF instead of JSON literals private static final Set STRUCTURED_PROPERTIES = - Set.of("changeDescription", "votes", "lifeCycle", "customProperties", "extension"); + Set.of("votes", "lifeCycle", "customProperties", "extension"); + + // Properties that should be omitted from RDF because they are audit/helper data. + private static final Set IGNORED_PROPERTIES = Set.of("changeDescription"); // Lineage properties that need special handling private static final Set LINEAGE_PROPERTIES = @@ -119,7 +122,7 @@ private void processContextMappings( // Skip fields that are handled separately with typed predicates // (e.g., relatedTerms which use typed relations like broader, synonym, etc.) 
- if (TYPED_RELATION_FIELDS.contains(fieldName)) { + if (TYPED_RELATION_FIELDS.contains(fieldName) || IGNORED_PROPERTIES.contains(fieldName)) { continue; } @@ -400,7 +403,6 @@ private UUID tryResolveGlossaryTermIdFromHref(JsonNode tagLabel) { private void addStructuredProperty( String fieldName, JsonNode value, Resource entityResource, Model model) { switch (fieldName) { - case "changeDescription" -> addChangeDescription(value, entityResource, model); case "votes" -> addVotes(value, entityResource, model); case "lifeCycle" -> addLifeCycle(value, entityResource, model); case "extension" -> addExtension(value, entityResource, model); @@ -421,102 +423,9 @@ private void addStructuredArrayProperty( } } - /** - * Converts ChangeDescription to structured RDF triples. Enables SPARQL queries like: "Find all - * entities where description was changed by user X after date Y" - * - *

Structure: entity -> om:hasChangeDescription -> _:changeNode _:changeNode a - * om:ChangeDescription _:changeNode om:previousVersion "1.0" _:changeNode om:fieldsAdded -> - * _:fieldChange1 - */ - private void addChangeDescription(JsonNode changeDesc, Resource entityResource, Model model) { - if (changeDesc == null || changeDesc.isNull()) { - return; - } - - // Create a blank node for the change description - String changeNodeUri = - baseUri + "change/" + entityResource.getLocalName() + "/" + UUID.randomUUID(); - Resource changeNode = model.createResource(changeNodeUri); - - // Link entity to change description - Property hasChangeDesc = model.createProperty(OM_NS, "hasChangeDescription"); - entityResource.addProperty(hasChangeDesc, changeNode); - - // Add type - changeNode.addProperty(RDF.type, model.createResource(OM_NS + "ChangeDescription")); - - // Add previous version - if (changeDesc.has("previousVersion")) { - changeNode.addProperty( - model.createProperty(OM_NS, "previousVersion"), - model.createTypedLiteral(changeDesc.get("previousVersion").asDouble())); - } - - // Add fields added - if (changeDesc.has("fieldsAdded") && changeDesc.get("fieldsAdded").isArray()) { - addFieldChanges( - changeDesc.get("fieldsAdded"), changeNode, "fieldsAdded", entityResource, model); - } - - // Add fields updated - if (changeDesc.has("fieldsUpdated") && changeDesc.get("fieldsUpdated").isArray()) { - addFieldChanges( - changeDesc.get("fieldsUpdated"), changeNode, "fieldsUpdated", entityResource, model); - } - - // Add fields deleted - if (changeDesc.has("fieldsDeleted") && changeDesc.get("fieldsDeleted").isArray()) { - addFieldChanges( - changeDesc.get("fieldsDeleted"), changeNode, "fieldsDeleted", entityResource, model); - } - } - - /** - * Adds field change details as structured RDF - */ - private void addFieldChanges( - JsonNode fieldsArray, - Resource changeNode, - String changeType, - Resource entityResource, - Model model) { - Property changeProp = 
model.createProperty(OM_NS, changeType); - - for (JsonNode fieldChange : fieldsArray) { - // Create a blank node for each field change - String fieldChangeUri = - baseUri + "fieldChange/" + entityResource.getLocalName() + "/" + UUID.randomUUID(); - Resource fieldChangeNode = model.createResource(fieldChangeUri); - - changeNode.addProperty(changeProp, fieldChangeNode); - fieldChangeNode.addProperty(RDF.type, model.createResource(OM_NS + "FieldChange")); - - // Add field name - if (fieldChange.has("name")) { - fieldChangeNode.addProperty( - model.createProperty(OM_NS, "fieldName"), fieldChange.get("name").asText()); - } - - // Add old value (as string representation for queryability) - if (fieldChange.has("oldValue") && !fieldChange.get("oldValue").isNull()) { - JsonNode oldVal = fieldChange.get("oldValue"); - String oldValueStr = oldVal.isTextual() ? oldVal.asText() : oldVal.toString(); - fieldChangeNode.addProperty(model.createProperty(OM_NS, "oldValue"), oldValueStr); - } - - // Add new value (as string representation for queryability) - if (fieldChange.has("newValue") && !fieldChange.get("newValue").isNull()) { - JsonNode newVal = fieldChange.get("newValue"); - String newValueStr = newVal.isTextual() ? newVal.asText() : newVal.toString(); - fieldChangeNode.addProperty(model.createProperty(OM_NS, "newValue"), newValueStr); - } - } - } - /** * Converts Votes to structured RDF triples. Enables SPARQL queries like: "Find all entities with - * more than 10 upvotes" or "Find entities upvoted by user X" + * more than 10 upvotes" without exposing individual voter identities as graph edges. 
*/ private void addVotes(JsonNode votes, Resource entityResource, Model model) { if (votes == null || votes.isNull()) { @@ -547,30 +456,6 @@ private void addVotes(JsonNode votes, Resource entityResource, Model model) { model.createProperty(OM_NS, "downVotes"), model.createTypedLiteral(votes.get("downVotes").asInt())); } - - // Add upVoters as entity references - if (votes.has("upVoters") && votes.get("upVoters").isArray()) { - Property upVotersProp = model.createProperty(OM_NS, "upVoters"); - for (JsonNode voter : votes.get("upVoters")) { - if (voter.has("id") && voter.has("type")) { - String voterUri = - baseUri + "entity/" + voter.get("type").asText() + "/" + voter.get("id").asText(); - votesNode.addProperty(upVotersProp, model.createResource(voterUri)); - } - } - } - - // Add downVoters as entity references - if (votes.has("downVoters") && votes.get("downVoters").isArray()) { - Property downVotersProp = model.createProperty(OM_NS, "downVoters"); - for (JsonNode voter : votes.get("downVoters")) { - if (voter.has("id") && voter.has("type")) { - String voterUri = - baseUri + "entity/" + voter.get("type").asText() + "/" + voter.get("id").asText(); - votesNode.addProperty(downVotersProp, model.createResource(voterUri)); - } - } - } } /** diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/RdfResource.java b/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/RdfResource.java index 56e7bbb379df..297ba4122a2e 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/RdfResource.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/resources/rdf/RdfResource.java @@ -20,6 +20,9 @@ import jakarta.ws.rs.core.SecurityContext; import jakarta.ws.rs.core.UriInfo; import java.io.IOException; +import java.util.Arrays; +import java.util.LinkedHashSet; +import java.util.Set; import java.util.UUID; import javax.validation.constraints.NotEmpty; import lombok.extern.slf4j.Slf4j; @@ 
-240,7 +243,13 @@ public Response exploreEntityGraph( @Parameter(description = "Depth of relationships to explore") @QueryParam("depth") @DefaultValue("2") - int depth) { + int depth, + @Parameter(description = "Comma-separated entity types to keep in the graph") + @QueryParam("entityTypes") + String entityTypes, + @Parameter(description = "Comma-separated relationship types to keep in the graph") + @QueryParam("relationshipTypes") + String relationshipTypes) { authorizer.authorizeAdmin(securityContext); try { if (getRdfRepository() == null || !getRdfRepository().isEnabled()) { @@ -249,7 +258,14 @@ public Response exploreEntityGraph( .build(); } - String graphData = getRdfRepository().getEntityGraph(entityId, entityType, depth); + String graphData = + getRdfRepository() + .getEntityGraph( + entityId, + entityType, + depth, + parseCsvFilter(entityTypes), + parseCsvFilter(relationshipTypes)); return Response.ok(graphData, MediaType.APPLICATION_JSON).build(); } catch (Exception e) { @@ -258,6 +274,17 @@ public Response exploreEntityGraph( } } + private Set parseCsvFilter(String values) { + if (values == null || values.isBlank()) { + return Set.of(); + } + + return Arrays.stream(values.split(",")) + .map(String::trim) + .filter(value -> !value.isEmpty()) + .collect(java.util.stream.Collectors.toCollection(LinkedHashSet::new)); + } + @GET @Path("/sparql") @Operation( diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/util/ODCSConverter.java b/openmetadata-service/src/main/java/org/openmetadata/service/util/ODCSConverter.java index 3be0a07736f2..a8439f264e7f 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/util/ODCSConverter.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/util/ODCSConverter.java @@ -330,8 +330,8 @@ private static ODCSDataContract.OdcsStatus mapContractStatusToODCS(EntityStatus if (status == null) return ODCSDataContract.OdcsStatus.DRAFT; return switch (status) { case APPROVED -> 
ODCSDataContract.OdcsStatus.ACTIVE; - case DEPRECATED -> ODCSDataContract.OdcsStatus.DEPRECATED; case ARCHIVED -> ODCSDataContract.OdcsStatus.RETIRED; + case DEPRECATED -> ODCSDataContract.OdcsStatus.DEPRECATED; case DRAFT, IN_REVIEW, REJECTED, UNPROCESSED -> ODCSDataContract.OdcsStatus.DRAFT; }; } diff --git a/openmetadata-service/src/main/resources/json/data/app/RdfIndexApp.json b/openmetadata-service/src/main/resources/json/data/app/RdfIndexApp.json index 3695ec8e493a..d34cd25357e9 100644 --- a/openmetadata-service/src/main/resources/json/data/app/RdfIndexApp.json +++ b/openmetadata-service/src/main/resources/json/data/app/RdfIndexApp.json @@ -1,16 +1,22 @@ { "name": "RdfIndexApp", "displayName": "RDF Knowledge Graph Indexing", + "allowConfiguration": true, "appConfiguration": { "entities": [ "all" ], "recreateIndex": false, - "batchSize": "100" + "batchSize": 100, + "producerThreads": 2, + "consumerThreads": 3, + "queueSize": 5000, + "useDistributedIndexing": true, + "partitionSize": 10000 }, "appSchedule": { "scheduleTimeline": "Custom", "cronExpression": "0 0 * * *" }, "supportsInterrupt": true -} \ No newline at end of file +} diff --git a/openmetadata-service/src/main/resources/json/data/appMarketPlaceDefinition/RdfIndexApp.json b/openmetadata-service/src/main/resources/json/data/appMarketPlaceDefinition/RdfIndexApp.json index 74ee1963e36e..90a37b9f096f 100644 --- a/openmetadata-service/src/main/resources/json/data/appMarketPlaceDefinition/RdfIndexApp.json +++ b/openmetadata-service/src/main/resources/json/data/appMarketPlaceDefinition/RdfIndexApp.json @@ -12,6 +12,7 @@ "scheduleType": "ScheduledOrManual", "permission": "All", "className": "org.openmetadata.service.apps.bundles.rdf.RdfIndexApp", + "allowConfiguration": true, "runtime": { "enabled": true }, @@ -21,6 +22,11 @@ "all" ], "recreateIndex": false, - "batchSize": "100" + "batchSize": 100, + "producerThreads": 2, + "consumerThreads": 3, + "queueSize": 5000, + "useDistributedIndexing": true, + 
"partitionSize": 10000 } -} \ No newline at end of file +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/rdf/RdfIndexAppTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/rdf/RdfIndexAppTest.java index 3476da14b28f..1470e08acdb6 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/rdf/RdfIndexAppTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/rdf/RdfIndexAppTest.java @@ -3,10 +3,13 @@ import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.*; +import com.fasterxml.jackson.core.type.TypeReference; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.UUID; +import java.util.concurrent.CompletableFuture; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -18,14 +21,25 @@ import org.mockito.MockedStatic; import org.mockito.junit.jupiter.MockitoExtension; import org.openmetadata.schema.EntityInterface; +import org.openmetadata.schema.entity.app.AppRunRecord; import org.openmetadata.schema.system.EventPublisherJob; +import org.openmetadata.schema.system.IndexingError; +import org.openmetadata.schema.system.Stats; +import org.openmetadata.schema.system.StepStats; import org.openmetadata.schema.type.Include; import org.openmetadata.schema.type.Relationship; +import org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.Entity; +import org.openmetadata.service.apps.bundles.rdf.distributed.RdfIndexJob; import org.openmetadata.service.jdbi3.CollectionDAO; import org.openmetadata.service.jdbi3.CollectionDAO.EntityRelationshipDAO; import org.openmetadata.service.jdbi3.CollectionDAO.EntityRelationshipObject; +import org.openmetadata.service.jdbi3.EntityRepository; import org.openmetadata.service.rdf.RdfRepository; import 
org.openmetadata.service.search.SearchRepository; +import org.quartz.JobDataMap; +import org.quartz.JobDetail; +import org.quartz.JobExecutionContext; @ExtendWith(MockitoExtension.class) @DisplayName("RdfIndexApp Tests") @@ -39,6 +53,30 @@ class RdfIndexAppTest { private static RdfRepository mockRdfRepository; private RdfIndexApp rdfIndexApp; + private static class TestableRdfIndexApp extends RdfIndexApp { + private AppRunRecord appRunRecord; + private JobExecutionContext pushedContext; + private AppRunRecord pushedRecord; + private boolean pushedUpdate; + + TestableRdfIndexApp(CollectionDAO collectionDAO, SearchRepository searchRepository) { + super(collectionDAO, searchRepository); + } + + @Override + protected AppRunRecord getJobRecord(JobExecutionContext jobExecutionContext) { + return appRunRecord; + } + + @Override + protected void pushAppStatusUpdates( + JobExecutionContext jobExecutionContext, AppRunRecord appRecord, boolean update) { + this.pushedContext = jobExecutionContext; + this.pushedRecord = appRecord; + this.pushedUpdate = update; + } + } + @BeforeAll static void setUpClass() { mockRdfRepository = mock(RdfRepository.class); @@ -383,6 +421,163 @@ void testValidateValidConfig() { } } + @Nested + @DisplayName("Distributed Job Serialization Tests") + class DistributedJobSerializationTests { + + @Test + @DisplayName("Should deserialize distributed RDF entity stats from JSON") + void testDistributedEntityStatsJsonRoundTrip() { + Map original = + Map.of( + "app", + RdfIndexJob.EntityTypeStats.builder() + .entityType("app") + .totalRecords(12) + .processedRecords(4) + .successRecords(3) + .failedRecords(1) + .totalPartitions(2) + .completedPartitions(1) + .failedPartitions(0) + .build()); + + String json = JsonUtils.pojoToJson(original); + Map roundTrip = + JsonUtils.readValue( + json, new TypeReference>() {}); + + assertEquals(12, roundTrip.get("app").getTotalRecords()); + assertEquals(3, roundTrip.get("app").getSuccessRecords()); + assertEquals(2, 
roundTrip.get("app").getTotalPartitions()); + } + } + + @Nested + @DisplayName("Entity Selection Tests") + class EntitySelectionTests { + + @Test + @DisplayName("Should skip entity types without repositories") + void testResolveEntityTypesSkipsUnsupportedEntities() throws Exception { + try (MockedStatic entityMock = mockStatic(Entity.class)) { + EntityRepository mockRepository = mock(EntityRepository.class); + entityMock.when(() -> Entity.getEntityRepository("table")).thenReturn(mockRepository); + entityMock + .when(() -> Entity.getEntityRepository("queryCostRecord")) + .thenThrow(new IllegalStateException("Unsupported entity")); + + var method = RdfIndexApp.class.getDeclaredMethod("resolveEntityTypes", Set.class); + method.setAccessible(true); + + @SuppressWarnings("unchecked") + Set result = + (Set) method.invoke(rdfIndexApp, Set.of("table", "queryCostRecord")); + + assertEquals(Set.of("table"), result); + } + } + } + + @Nested + @DisplayName("Run Record Update Tests") + class RunRecordUpdateTests { + + @Test + @DisplayName("Should persist updated app run record state during execution") + void testUpdateRecordToDbAndNotifyPersistsRunRecord() throws Exception { + TestableRdfIndexApp testApp = new TestableRdfIndexApp(collectionDAO, searchRepository); + + EventPublisherJob jobConfig = new EventPublisherJob(); + jobConfig.setStatus(EventPublisherJob.Status.FAILED); + jobConfig.setFailure( + new IndexingError().withMessage("distributed rdf job initialization failed")); + Stats stats = new Stats(); + stats.setJobStats(new StepStats().withTotalRecords(5).withFailedRecords(5)); + jobConfig.setStats(stats); + + var jobDataField = RdfIndexApp.class.getDeclaredField("jobData"); + jobDataField.setAccessible(true); + jobDataField.set(testApp, jobConfig); + + JobExecutionContext context = mock(JobExecutionContext.class); + testApp.appRunRecord = new AppRunRecord().withStatus(AppRunRecord.Status.RUNNING); + + testApp.updateRecordToDbAndNotify(context); + + 
assertEquals(AppRunRecord.Status.FAILED, testApp.appRunRecord.getStatus()); + assertNotNull(testApp.appRunRecord.getFailureContext()); + assertNotNull(testApp.appRunRecord.getSuccessContext()); + assertSame(context, testApp.pushedContext); + assertSame(testApp.appRunRecord, testApp.pushedRecord); + assertTrue(testApp.pushedUpdate); + } + + @Test + @DisplayName("Should publish distributed RDF progress before job completion") + void testMonitorDistributedJobPublishesRunningProgress() throws Exception { + TestableRdfIndexApp testApp = new TestableRdfIndexApp(collectionDAO, searchRepository); + + EventPublisherJob jobConfig = new EventPublisherJob(); + jobConfig.setStatus(EventPublisherJob.Status.RUNNING); + + var jobDataField = RdfIndexApp.class.getDeclaredField("jobData"); + jobDataField.setAccessible(true); + jobDataField.set(testApp, jobConfig); + + JobExecutionContext context = mock(JobExecutionContext.class); + JobDetail jobDetail = mock(JobDetail.class); + JobDataMap jobDataMap = new JobDataMap(); + when(context.getJobDetail()).thenReturn(jobDetail); + when(jobDetail.getJobDataMap()).thenReturn(jobDataMap); + + var jobExecutionContextField = RdfIndexApp.class.getDeclaredField("jobExecutionContext"); + jobExecutionContextField.setAccessible(true); + jobExecutionContextField.set(testApp, context); + + testApp.appRunRecord = new AppRunRecord().withStatus(AppRunRecord.Status.RUNNING); + + var distributedExecutorField = RdfIndexApp.class.getDeclaredField("distributedExecutor"); + distributedExecutorField.setAccessible(true); + var mockDistributedExecutor = + mock( + org.openmetadata.service.apps.bundles.rdf.distributed.DistributedRdfIndexExecutor + .class); + when(mockDistributedExecutor.getJobWithFreshStats()) + .thenReturn( + RdfIndexJob.builder() + .id(UUID.randomUUID()) + .status( + org.openmetadata + .service + .apps + .bundles + .searchIndex + .distributed + .IndexJobStatus + .RUNNING) + .totalRecords(10) + .processedRecords(7) + .successRecords(7) + 
.failedRecords(0) + .build()); + distributedExecutorField.set(testApp, mockDistributedExecutor); + + var method = + RdfIndexApp.class.getDeclaredMethod( + "monitorDistributedJob", UUID.class, java.util.concurrent.Future.class); + method.setAccessible(true); + method.invoke(testApp, UUID.randomUUID(), CompletableFuture.completedFuture(null)); + + assertNotNull(testApp.getJobData().getStats()); + assertEquals(10, testApp.getJobData().getStats().getJobStats().getTotalRecords()); + assertEquals(7, testApp.getJobData().getStats().getJobStats().getSuccessRecords()); + assertSame(context, testApp.pushedContext); + assertNotNull(testApp.appRunRecord.getSuccessContext()); + assertEquals(AppRunRecord.Status.RUNNING, testApp.appRunRecord.getStatus()); + } + } + @Nested @DisplayName("IndexingTask Record Tests") class IndexingTaskTests { diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexCoordinatorTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexCoordinatorTest.java new file mode 100644 index 000000000000..d9b8f211e9c3 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/rdf/distributed/DistributedRdfIndexCoordinatorTest.java @@ -0,0 +1,137 @@ +/* + * Copyright 2024 Collate + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.openmetadata.service.apps.bundles.rdf.distributed; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.ArgumentMatchers.isNull; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.UUID; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.MockedStatic; +import org.mockito.junit.jupiter.MockitoExtension; +import org.openmetadata.schema.system.EventPublisherJob; +import org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.apps.bundles.searchIndex.distributed.IndexJobStatus; +import org.openmetadata.service.apps.bundles.searchIndex.distributed.ServerIdentityResolver; +import org.openmetadata.service.jdbi3.CollectionDAO; +import org.openmetadata.service.jdbi3.CollectionDAO.RdfIndexJobDAO; +import org.openmetadata.service.jdbi3.CollectionDAO.RdfIndexJobDAO.RdfIndexJobRecord; +import org.openmetadata.service.jdbi3.CollectionDAO.RdfIndexPartitionDAO; +import org.openmetadata.service.jdbi3.CollectionDAO.RdfIndexPartitionDAO.RdfAggregatedStatsRecord; + +@ExtendWith(MockitoExtension.class) +class DistributedRdfIndexCoordinatorTest { + + private static final String TEST_SERVER_ID = "rdf-test-server"; + + @Mock private CollectionDAO collectionDAO; + @Mock private RdfIndexJobDAO jobDAO; + @Mock private RdfIndexPartitionDAO partitionDAO; + @Mock private RdfPartitionCalculator partitionCalculator; + + private 
DistributedRdfIndexCoordinator coordinator; + private MockedStatic serverIdentityMock; + + @BeforeEach + void setUp() { + ServerIdentityResolver resolver = mock(ServerIdentityResolver.class); + when(resolver.getServerId()).thenReturn(TEST_SERVER_ID); + + serverIdentityMock = mockStatic(ServerIdentityResolver.class); + serverIdentityMock.when(ServerIdentityResolver::getInstance).thenReturn(resolver); + + when(collectionDAO.rdfIndexJobDAO()).thenReturn(jobDAO); + when(collectionDAO.rdfIndexPartitionDAO()).thenReturn(partitionDAO); + + coordinator = new DistributedRdfIndexCoordinator(collectionDAO, partitionCalculator); + } + + @AfterEach + void tearDown() { + if (serverIdentityMock != null) { + serverIdentityMock.close(); + } + } + + @Test + void getJobWithAggregatedStatsKeepsCompletedAtNullForNonTerminalJob() { + UUID jobId = UUID.randomUUID(); + EventPublisherJob jobConfiguration = new EventPublisherJob().withEntities(Set.of("table")); + RdfIndexJobRecord jobRecord = + new RdfIndexJobRecord( + jobId.toString(), + IndexJobStatus.READY.name(), + JsonUtils.pojoToJson(jobConfiguration), + 25L, + 0L, + 0L, + 0L, + JsonUtils.pojoToJson( + Map.of( + "table", + RdfIndexJob.EntityTypeStats.builder() + .entityType("table") + .totalRecords(25) + .build())), + "admin", + System.currentTimeMillis(), + null, + null, + System.currentTimeMillis(), + null); + + when(jobDAO.findById(jobId.toString())).thenReturn(jobRecord); + when(partitionDAO.getAggregatedStats(jobId.toString())) + .thenReturn(new RdfAggregatedStatsRecord(25L, 0L, 0L, 0L, 1, 0, 0, 1, 0)); + when(partitionDAO.getEntityStats(jobId.toString())) + .thenReturn( + List.of( + new CollectionDAO.RdfIndexPartitionDAO.RdfEntityStatsRecord( + "table", 25L, 0L, 0L, 0L, 1, 0, 0))); + when(partitionDAO.getServerStats(jobId.toString())).thenReturn(List.of()); + + RdfIndexJob refreshed = coordinator.getJobWithAggregatedStats(jobId); + + assertEquals(IndexJobStatus.RUNNING, refreshed.getStatus()); + 
assertNull(refreshed.getCompletedAt()); + + verify(jobDAO) + .update( + eq(jobId.toString()), + eq(IndexJobStatus.RUNNING.name()), + eq(0L), + eq(0L), + eq(0L), + anyString(), + isNull(), + isNull(), + anyLong(), + isNull()); + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfPropertyMapperTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfPropertyMapperTest.java index bae9ceebe335..b59adc5e5ca0 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfPropertyMapperTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/rdf/RdfPropertyMapperTest.java @@ -61,85 +61,42 @@ void setUp() { class ChangeDescriptionTests { @Test - @DisplayName("ChangeDescription should be stored as structured RDF, not JSON literal") - void testChangeDescriptionStructured() throws Exception { + @DisplayName("ChangeDescription should be ignored during RDF field processing") + void testChangeDescriptionIsIgnored() throws Exception { ObjectNode changeDesc = objectMapper.createObjectNode(); changeDesc.put("previousVersion", 1.0); - ArrayNode fieldsAdded = objectMapper.createArrayNode(); - ObjectNode addedField = objectMapper.createObjectNode(); - addedField.put("name", "description"); - addedField.put("newValue", "New description value"); - fieldsAdded.add(addedField); - changeDesc.set("fieldsAdded", fieldsAdded); - - ArrayNode fieldsUpdated = objectMapper.createArrayNode(); - ObjectNode updatedField = objectMapper.createObjectNode(); - updatedField.put("name", "tags"); - updatedField.put("oldValue", "[]"); - updatedField.put("newValue", "[\"PII\"]"); - fieldsUpdated.add(updatedField); - changeDesc.set("fieldsUpdated", fieldsUpdated); - - // Use reflection to call the private method - java.lang.reflect.Method method = - RdfPropertyMapper.class.getDeclaredMethod( - "addChangeDescription", JsonNode.class, Resource.class, Model.class); - method.setAccessible(true); - 
method.invoke(propertyMapper, changeDesc, entityResource, model); - - // Verify structured RDF was created - Property hasChangeDesc = model.createProperty(OM_NS, "hasChangeDescription"); - assertTrue( - model.contains(entityResource, hasChangeDesc), - "Entity should have hasChangeDescription property"); - - // Find the change description resource - Resource changeDescResource = - model.listObjectsOfProperty(entityResource, hasChangeDesc).next().asResource(); - - // Verify type - assertTrue( - model.contains( - changeDescResource, RDF.type, model.createResource(OM_NS + "ChangeDescription")), - "ChangeDescription should have correct type"); + ObjectNode entityJson = objectMapper.createObjectNode(); + entityJson.set("changeDescription", changeDesc); - // Verify previousVersion is stored as a typed literal, not JSON - Property prevVersion = model.createProperty(OM_NS, "previousVersion"); - assertTrue( - model.contains(changeDescResource, prevVersion), - "ChangeDescription should have previousVersion"); - - // Verify fieldsAdded are stored as structured nodes - Property fieldsAddedProp = model.createProperty(OM_NS, "fieldsAdded"); - assertTrue( - model.contains(changeDescResource, fieldsAddedProp), - "ChangeDescription should have fieldsAdded"); + invokePrivate( + "processContextMappings", + new Class[] {Map.class, JsonNode.class, Resource.class, Model.class}, + Map.of("changeDescription", Map.of("@id", "om:hasChangeDescription", "@type", "@json")), + entityJson, + entityResource, + model); - // Verify the field change has a name property (not stored as JSON blob) - Resource fieldChangeResource = - model.listObjectsOfProperty(changeDescResource, fieldsAddedProp).next().asResource(); - Property fieldNameProp = model.createProperty(OM_NS, "fieldName"); - assertTrue( - model.contains(fieldChangeResource, fieldNameProp), - "FieldChange should have fieldName property"); + assertFalse( + model.contains(entityResource, model.createProperty(OM_NS, "hasChangeDescription")), + 
"ChangeDescription helper nodes should not be emitted into RDF"); } @Test - @DisplayName("Empty ChangeDescription should not create any triples") - void testEmptyChangeDescription() throws Exception { + @DisplayName("Structured property dispatch should ignore changeDescription") + void testStructuredDispatchIgnoresChangeDescription() throws Exception { ObjectNode changeDesc = objectMapper.createObjectNode(); - java.lang.reflect.Method method = - RdfPropertyMapper.class.getDeclaredMethod( - "addChangeDescription", JsonNode.class, Resource.class, Model.class); - method.setAccessible(true); - method.invoke(propertyMapper, changeDesc, entityResource, model); + invokePrivate( + "addStructuredProperty", + new Class[] {String.class, JsonNode.class, Resource.class, Model.class}, + "changeDescription", + changeDesc, + entityResource, + model); - Property hasChangeDesc = model.createProperty(OM_NS, "hasChangeDescription"); - assertTrue( - model.contains(entityResource, hasChangeDesc), - "Entity should still have hasChangeDescription for empty change"); + assertFalse( + model.contains(entityResource, model.createProperty(OM_NS, "hasChangeDescription"))); } } @@ -148,7 +105,7 @@ void testEmptyChangeDescription() throws Exception { class VotesTests { @Test - @DisplayName("Votes should be stored as structured RDF with upVotes/downVotes as integers") + @DisplayName("Votes should keep counts but omit voter relationship edges") void testVotesStructured() throws Exception { ObjectNode votes = objectMapper.createObjectNode(); votes.put("upVotes", 10); @@ -192,9 +149,12 @@ void testVotesStructured() throws Exception { stmt = model.getProperty(votesResource, downVotesProp); assertEquals(2, stmt.getInt(), "downVotes should be 2"); - // Verify upVoters are stored as entity references + // Verify individual voter references are not stored as graph edges Property upVotersProp = model.createProperty(OM_NS, "upVoters"); - assertTrue(model.contains(votesResource, upVotersProp), "Votes should 
have upVoters"); + assertFalse(model.contains(votesResource, upVotersProp), "Votes should not expose upVoters"); + assertFalse( + model.contains(votesResource, model.createProperty(OM_NS, "downVoters")), + "Votes should not expose downVoters"); } } @@ -823,11 +783,12 @@ void testContainerVotesAndExtensionHelpersCoverRemainingBranches() throws Except .listObjectsOfProperty(entityResource, model.createProperty(OM_NS, "hasVotes")) .next() .asResource(); - assertTrue( + assertFalse( model.contains( votesResource, model.createProperty(OM_NS, "downVoters"), - model.createResource(BASE_URI + "entity/user/" + reviewerId))); + model.createResource(BASE_URI + "entity/user/" + reviewerId)), + "Vote helpers should not emit voter references"); ObjectNode extension = objectMapper.createObjectNode(); extension.put("threshold", 2.5); @@ -868,7 +829,7 @@ void testStructuredPropertyDispatchAndCustomProperties() throws Exception { changeDescription, entityResource, model); - assertTrue( + assertFalse( model.contains(entityResource, model.createProperty(OM_NS, "hasChangeDescription"))); ObjectNode votes = objectMapper.createObjectNode(); diff --git a/openmetadata-spec/src/main/resources/json/schema/entity/applications/configuration/internal/rdfIndexingAppConfig.json b/openmetadata-spec/src/main/resources/json/schema/entity/applications/configuration/internal/rdfIndexingAppConfig.json new file mode 100644 index 000000000000..e3939bf1a70b --- /dev/null +++ b/openmetadata-spec/src/main/resources/json/schema/entity/applications/configuration/internal/rdfIndexingAppConfig.json @@ -0,0 +1,153 @@ +{ + "$id": "https://open-metadata.org/schema/entity/applications/configuration/rdfIndexingApp.json", + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "RdfIndexingApp", + "type": "object", + "description": "RDF indexing application configuration.", + "definitions": { + "rdfIndexingType": { + "description": "Application type.", + "type": "string", + "enum": ["RdfIndexing"], + 
"default": "RdfIndexing" + } + }, + "properties": { + "type": { + "title": "Application Type", + "description": "Application Type", + "$ref": "#/definitions/rdfIndexingType", + "default": "RdfIndexing" + }, + "entities": { + "title": "Entities", + "description": "List of entities that you need to reindex.", + "type": "array", + "items": { + "type": "string", + "enum": [ + "aiApplication", + "aiGovernancePolicy", + "apiCollection", + "apiEndpoint", + "apiService", + "app", + "appMarketPlaceDefinition", + "bot", + "chart", + "classification", + "container", + "dashboard", + "dashboardDataModel", + "dashboardService", + "dataContract", + "dataInsightChart", + "dataInsightCustomChart", + "dataProduct", + "database", + "databaseSchema", + "databaseService", + "directory", + "document", + "domain", + "driveService", + "eventsubscription", + "file", + "glossary", + "glossaryTerm", + "ingestionPipeline", + "kpi", + "learningResource", + "llmModel", + "llmService", + "messagingService", + "metadataService", + "metric", + "mlmodel", + "mlmodelService", + "notificationTemplate", + "persona", + "pipeline", + "pipelineService", + "policy", + "promptTemplate", + "query", + "report", + "role", + "searchIndex", + "searchService", + "securityService", + "spreadsheet", + "storageService", + "storedProcedure", + "table", + "tag", + "team", + "testCase", + "testConnectionDefinition", + "testDefinition", + "testSuite", + "topic", + "type", + "user", + "webAnalyticEvent", + "workflow", + "workflowDefinition", + "worksheet" + ] + }, + "default": ["all"], + "uiFieldType": "treeSelect", + "uniqueItems": true + }, + "recreateIndex": { + "title": "Recreate RDF Store", + "description": "Recreate the RDF store before indexing.", + "type": "boolean", + "default": false + }, + "batchSize": { + "title": "Batch Size", + "description": "Maximum number of entities processed in a batch.", + "type": "integer", + "default": 100, + "minimum": 1 + }, + "producerThreads": { + "title": "Number of Producer 
Threads", + "description": "Number of producer threads to use for non-distributed RDF reindexing", + "type": "integer", + "default": 2, + "minimum": 1 + }, + "consumerThreads": { + "title": "Number of Consumer Threads", + "description": "Number of consumer threads to use for non-distributed RDF reindexing", + "type": "integer", + "default": 3, + "minimum": 1 + }, + "queueSize": { + "title": "Queue Size", + "description": "Queue size to use internally for non-distributed RDF reindexing.", + "type": "integer", + "default": 5000, + "minimum": 1 + }, + "useDistributedIndexing": { + "title": "Use Distributed Indexing", + "description": "Enable distributed RDF indexing across multiple servers with partition coordination and recovery.", + "type": "boolean", + "default": true + }, + "partitionSize": { + "title": "Partition Size", + "description": "Number of entities per partition for distributed RDF indexing. Smaller values create more partitions for better distribution across servers.", + "type": "integer", + "default": 10000, + "minimum": 1000, + "maximum": 50000 + } + }, + "additionalProperties": false +} diff --git a/openmetadata-spec/src/main/resources/json/schema/entity/applications/jobStatus.json b/openmetadata-spec/src/main/resources/json/schema/entity/applications/jobStatus.json index f8ba929466fd..8a0f40b96ba0 100644 --- a/openmetadata-spec/src/main/resources/json/schema/entity/applications/jobStatus.json +++ b/openmetadata-spec/src/main/resources/json/schema/entity/applications/jobStatus.json @@ -11,8 +11,11 @@ "oneOf": [ { "$ref": "configuration/internal/searchIndexingAppConfig.json" + }, + { + "$ref": "configuration/internal/rdfIndexingAppConfig.json" } ] } } -} \ No newline at end of file +} diff --git a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.interface.ts b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.interface.ts index 2b91dfa86786..a044ea7412e6 100644 --- 
a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.interface.ts +++ b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.interface.ts @@ -35,10 +35,27 @@ export interface GraphEdge { from: string; to: string; label: string; + relationType?: string; arrows?: string; } +export interface GraphFilterOption { + id: string; + label: string; + count: number; +} + +export interface GraphFilterOptions { + entityTypes: GraphFilterOption[]; + relationshipTypes: GraphFilterOption[]; +} + export interface GraphData { nodes: GraphNode[]; edges: GraphEdge[]; + filterOptions?: GraphFilterOptions; + totalNodes?: number; + totalEdges?: number; + source?: string; + error?: string; } diff --git a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx index b1883dcd1b4d..f9460f485c20 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx @@ -72,6 +72,10 @@ const KnowledgeGraph: React.FC = ({ const [layout, setLayout] = useState<'hierarchical' | 'force'>( 'hierarchical' ); + const [selectedEntityTypes, setSelectedEntityTypes] = useState([]); + const [selectedRelationshipTypes, setSelectedRelationshipTypes] = useState< + string[] + >([]); const [, setHoveredNode] = useState(null); const [selectedNode, setSelectedNode] = useState(null); @@ -370,18 +374,27 @@ const KnowledgeGraph: React.FC = ({ setLoading(true); try { - const data = await getEntityGraphData( - entity.id, + const data = await getEntityGraphData({ + entityId: entity.id, entityType, - selectedDepth - ); + depth: selectedDepth, + entityTypes: selectedEntityTypes, + relationshipTypes: selectedRelationshipTypes, + }); setGraphData(data); } catch (error) { showErrorToast(error as AxiosError, 
t('server.entity-graph-fetch-error')); } finally { setLoading(false); } - }, [entity?.id, entityType, selectedDepth, t]); + }, [ + entity?.id, + entityType, + selectedDepth, + selectedEntityTypes, + selectedRelationshipTypes, + t, + ]); useEffect(() => { fetchGraphData(); @@ -540,13 +553,6 @@ const KnowledgeGraph: React.FC = ({ }; }, [graphData, loading, networkOptions, entity?.id]); - // Re-fetch data when depth changes - useEffect(() => { - if (entity?.id) { - fetchGraphData(); - } - }, [selectedDepth]); - const handleFit = () => { networkRef.current?.fit({ animation: { @@ -564,6 +570,24 @@ const KnowledgeGraph: React.FC = ({ setSelectedDepth(value); }; + const entityTypeOptions = useMemo( + () => + graphData?.filterOptions?.entityTypes.map((option) => ({ + label: `${option.label} (${option.count})`, + value: option.id, + })) ?? [], + [graphData?.filterOptions] + ); + + const relationshipTypeOptions = useMemo( + () => + graphData?.filterOptions?.relationshipTypes.map((option) => ({ + label: `${option.label} (${option.count})`, + value: option.id, + })) ?? 
[], + [graphData?.filterOptions] + ); + if (!entity) { return ; } @@ -611,6 +635,32 @@ const KnowledgeGraph: React.FC = ({ onChange={handleDepthChange} /> + } + value={searchQuery} + onChange={(event) => setSearchQuery(event.target.value)} + /> + + void handleExportGraph(key as EntityGraphExportFormat), + }} + onClick={() => void handleExportGraph('turtle')}> + + + {t('label.export-graph')} + + + {/* Selected node details */} {selectedNode && ( diff --git a/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/ExportGraphPanel.tsx b/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/ExportGraphPanel.tsx index 248510aba9a0..6e7389ef7ab0 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/ExportGraphPanel.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/ExportGraphPanel.tsx @@ -34,7 +34,7 @@ const ExportGraphPanel: React.FC = ({ const items = [ { id: EXPORT_PNG, label: t('label.png-uppercase') }, - { id: EXPORT_SVG, label: t('label.svg-uppercase') }, + { id: EXPORT_SVG, label: `${t('label.svg-uppercase')} (raster)` }, ]; const handleAction = async (key: Key) => { diff --git a/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/OntologyGraphG6.tsx b/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/OntologyGraphG6.tsx index ddbd92534dc8..06f60099c83a 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/OntologyGraphG6.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/OntologyGraphG6.tsx @@ -158,6 +158,7 @@ const OntologyGraph = forwardRef( if (!graph) { return; } + // G6 only supports raster image export here; keep the SVG wrapper explicit. 
const dataUrl = await graph.toDataURL({ mode: 'overall', type: 'image/png', @@ -170,12 +171,12 @@ const OntologyGraph = forwardRef( const url = URL.createObjectURL(blob); const a = document.createElement('a'); a.href = url; - a.download = 'ontology-graph.svg'; + a.download = 'ontology-graph-raster.svg'; a.click(); URL.revokeObjectURL(url); }, }), - [extractNodePositions, graphRef] + [explorationMode, extractNodePositions, graphRef] ); return ( diff --git a/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/hooks/useGraphData.ts b/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/hooks/useGraphData.ts index 5860e41e6451..b2ec5f7f047a 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/hooks/useGraphData.ts +++ b/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/hooks/useGraphData.ts @@ -695,9 +695,9 @@ export function useGraphDataBuilder({ const assetToTermMap = useMemo(() => { if (explorationMode !== 'data') { - return {} as Record; + return {} as Record; } - const map: Record = {}; + const map: Record = {}; const allAssetIds = new Set( inputNodes .filter((n) => n.type === 'dataAsset' || n.type === 'metric') @@ -708,9 +708,17 @@ export function useGraphDataBuilder({ ); mergedEdgesList.forEach((edge) => { if (allTermIds.has(edge.from) && allAssetIds.has(edge.to)) { - map[edge.to] = edge.from; + const existing = map[edge.to] ?? []; + if (!existing.includes(edge.from)) { + existing.push(edge.from); + map[edge.to] = existing; + } } else if (allAssetIds.has(edge.from) && allTermIds.has(edge.to)) { - map[edge.from] = edge.to; + const existing = map[edge.from] ?? 
[]; + if (!existing.includes(edge.to)) { + existing.push(edge.to); + map[edge.from] = existing; + } } }); diff --git a/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/hooks/useOntologyGraph.ts b/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/hooks/useOntologyGraph.ts index 8d83a8d7344a..2447436d9713 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/hooks/useOntologyGraph.ts +++ b/openmetadata-ui/src/main/resources/ui/src/components/OntologyExplorer/hooks/useOntologyGraph.ts @@ -162,7 +162,7 @@ interface UseOntologyGraphProps { neighborSet: Set; glossaryColorMap: Record; computeNodeColor: (node: OntologyNode) => string; - assetToTermMap: Record; + assetToTermMap: Record; } export function useOntologyGraph({ @@ -231,36 +231,77 @@ export function useOntologyGraph({ /** Places asset nodes in concentric rings around their parent term's current drawn position. */ const positionAssetNodes = useCallback((graph: Graph) => { const map = assetToTermMapRef.current; - const assetsByTerm = new Map(); - Object.entries(map).forEach(([assetId, termId]) => { - const list = assetsByTerm.get(termId) ?? []; - list.push(assetId); - assetsByTerm.set(termId, list); + const updates: NodeData[] = []; + const assignRingPositions = (anchorX: number, anchorY: number, assetIds: string[]) => { + const ringPositions = computeAssetRingPositions(anchorX, anchorY, assetIds); + Object.entries(ringPositions).forEach(([assetId, pos]) => { + const nodeData = graph.getNodeData(assetId); + if (nodeData) { + updates.push({ + id: assetId, + style: { ...(nodeData.style ?? 
{}), x: pos.x, y: pos.y }, + }); + } + }); + }; + + const singleTermAssets = new Map(); + const multiTermAssets = new Map(); + + Object.entries(map).forEach(([assetId, connectedTermIds]) => { + const uniqueTermIds = [...new Set(connectedTermIds)]; + if (uniqueTermIds.length <= 1) { + const termId = uniqueTermIds[0]; + if (!termId) { + return; + } + const assetIds = singleTermAssets.get(termId) ?? []; + assetIds.push(assetId); + singleTermAssets.set(termId, assetIds); + return; + } + + const sortedTermIds = [...uniqueTermIds].sort(); + const key = sortedTermIds.join('|'); + const group = multiTermAssets.get(key) ?? { + termIds: sortedTermIds, + assetIds: [], + }; + group.assetIds.push(assetId); + multiTermAssets.set(key, group); }); - const updates: NodeData[] = []; - assetsByTerm.forEach((assetIds, termId) => { + singleTermAssets.forEach((assetIds, termId) => { try { const termPos = graph.getElementPosition(termId); if (!termPos) { return; } - const [termX, termY] = termPos; - const ringPositions = computeAssetRingPositions(termX, termY, assetIds); - Object.entries(ringPositions).forEach(([assetId, pos]) => { - const nodeData = graph.getNodeData(assetId); - if (nodeData) { - updates.push({ - id: assetId, - style: { ...(nodeData.style ?? 
{}), x: pos.x, y: pos.y }, - }); - } - }); + assignRingPositions(termPos[0], termPos[1], assetIds); } catch { // term not yet in graph } }); + multiTermAssets.forEach(({ termIds, assetIds }) => { + try { + const termPositions = termIds + .map((termId) => graph.getElementPosition(termId)) + .filter((position): position is [number, number] => Array.isArray(position)); + if (termPositions.length === 0) { + return; + } + + const centerX = + termPositions.reduce((sum, [x]) => sum + x, 0) / termPositions.length; + const centerY = + termPositions.reduce((sum, [, y]) => sum + y, 0) / termPositions.length; + assignRingPositions(centerX, centerY, assetIds); + } catch { + // one or more terms are not yet in the graph + } + }); + if (updates.length > 0) { graph.updateNodeData(updates); } diff --git a/openmetadata-ui/src/main/resources/ui/src/rest/rdfAPI.ts b/openmetadata-ui/src/main/resources/ui/src/rest/rdfAPI.ts index 99df88d27a46..e7a4b86fd90b 100644 --- a/openmetadata-ui/src/main/resources/ui/src/rest/rdfAPI.ts +++ b/openmetadata-ui/src/main/resources/ui/src/rest/rdfAPI.ts @@ -43,6 +43,8 @@ export interface EntityGraphParams { relationshipTypes?: string[]; } +export type EntityGraphExportFormat = 'turtle' | 'jsonld'; + export interface GlossaryGraphParams { glossaryId?: string; relationTypes?: string; @@ -92,6 +94,68 @@ export const getEntityGraphData = async ( return response.data; }; +export const exportEntityGraph = async ( + params: EntityGraphParams & { + format?: EntityGraphExportFormat; + } +): Promise => { + const { + entityId, + entityType, + depth = 2, + entityTypes, + relationshipTypes, + format = 'turtle', + } = params; + + const response = await APIClient.get('/rdf/graph/explore/export', { + params: { + entityId, + entityType, + depth, + entityTypes: entityTypes?.length ? entityTypes.join(',') : undefined, + relationshipTypes: relationshipTypes?.length + ? 
relationshipTypes.join(',') + : undefined, + format, + }, + responseType: 'blob', + headers: { + Accept: format === 'jsonld' ? 'application/ld+json' : 'text/turtle', + }, + }); + + return response.data; +}; + +export const downloadEntityGraph = async ( + params: EntityGraphParams & { + entityName: string; + format?: EntityGraphExportFormat; + } +): Promise => { + const { entityName, format = 'turtle', ...graphParams } = params; + const blob = await exportEntityGraph({ ...graphParams, format }); + const safeFilename = entityName.replace(/[^a-zA-Z0-9-_]/g, '_'); + const extension = format === 'jsonld' ? 'jsonld' : 'ttl'; + const filename = `${safeFilename}_knowledge_graph.${extension}`; + const downloadBlob = + blob instanceof Blob ? blob : new Blob([blob], { type: 'text/plain' }); + + const url = window.URL.createObjectURL(downloadBlob); + const link = document.createElement('a'); + link.href = url; + link.download = filename; + link.style.display = 'none'; + document.body.appendChild(link); + link.click(); + + setTimeout(() => { + document.body.removeChild(link); + window.URL.revokeObjectURL(url); + }, 100); +}; + export const getGlossaryTermGraph = async ( params: GlossaryGraphParams = {} ): Promise => { From 2841a12c038110417d3159ca9c05a37243dd75d9 Mon Sep 17 00:00:00 2001 From: Aniket Katkar Date: Mon, 6 Apr 2026 15:41:58 +0530 Subject: [PATCH 06/42] Fix minor UI bugs --- .../src/components/base/tooltip/tooltip.tsx | 5 ++- .../src/components/foundations/typography.tsx | 9 ++++- .../GraphElements/CustomNode.tsx | 25 ++++++++----- .../GraphElements/custom-node.less | 36 +++--------------- .../KnowledgeGraph/KnowledgeGraph.tsx | 37 ++++++++++++------- .../resources/ui/src/utils/TableUtils.tsx | 30 ++++++++------- 6 files changed, 71 insertions(+), 71 deletions(-) diff --git a/openmetadata-ui-core-components/src/main/resources/ui/src/components/base/tooltip/tooltip.tsx b/openmetadata-ui-core-components/src/main/resources/ui/src/components/base/tooltip/tooltip.tsx 
index 443cb91ebba9..11f7dbeed144 100644 --- a/openmetadata-ui-core-components/src/main/resources/ui/src/components/base/tooltip/tooltip.tsx +++ b/openmetadata-ui-core-components/src/main/resources/ui/src/components/base/tooltip/tooltip.tsx @@ -1,3 +1,4 @@ +import { cx } from '@/utils/cx'; import type { ReactNode } from 'react'; import type { ButtonProps as AriaButtonProps, @@ -10,7 +11,6 @@ import { Tooltip as AriaTooltip, TooltipTrigger as AriaTooltipTrigger, } from 'react-aria-components'; -import { cx } from '@/utils/cx'; interface TooltipProps extends AriaTooltipTriggerComponentProps, @@ -97,7 +97,8 @@ export const Tooltip = ({ className={({ isEntering, isExiting }) => cx( isEntering && 'tw:ease-out tw:animate-in', - isExiting && 'tw:ease-in tw:animate-out' + isExiting && 'tw:ease-in tw:animate-out', + 'tw:break-all' // Ensure long words in the tooltip wrap instead of overflowing. ) } crossOffset={crossOffset ?? calculatedCrossOffset} diff --git a/openmetadata-ui-core-components/src/main/resources/ui/src/components/foundations/typography.tsx b/openmetadata-ui-core-components/src/main/resources/ui/src/components/foundations/typography.tsx index e85a6ff73718..c7d813f57c2c 100644 --- a/openmetadata-ui-core-components/src/main/resources/ui/src/components/foundations/typography.tsx +++ b/openmetadata-ui-core-components/src/main/resources/ui/src/components/foundations/typography.tsx @@ -134,7 +134,12 @@ export const Typography = (props: TypographyProps) => { return ( -

+
{children} @@ -145,7 +150,7 @@ export const Typography = (props: TypographyProps) => { } return ( -
+
{children} diff --git a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/GraphElements/CustomNode.tsx b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/GraphElements/CustomNode.tsx index d2fa5e874623..5248a9bfaf69 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/GraphElements/CustomNode.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/GraphElements/CustomNode.tsx @@ -12,6 +12,7 @@ */ import { NodeData } from '@antv/g6'; +import { Box, Typography } from '@openmetadata/ui-core-components'; import { getEntityIcon } from '../../../utils/TableUtils'; import './custom-node.less'; @@ -31,30 +32,34 @@ function CustomNode({ nodeData }: Readonly) { }`} data-node-id={nodeData.id} data-testid={`node-${nodeData.data?.label as string}`}> -
-
+ + {getEntityIcon(nodeData.data?.type as string, '', { width: 12, height: 12, })} -
-
+ + ellipsis={{ + tooltip: nodeData.data?.label as string, + rows: 1, + }} + weight="semibold"> {nodeData.data?.label as string} -
-
-
+ + {nodeData.data?.type as string} -
+
); } diff --git a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/GraphElements/custom-node.less b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/GraphElements/custom-node.less index 33ec86ae96c3..3d848bae1d91 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/GraphElements/custom-node.less +++ b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/GraphElements/custom-node.less @@ -11,6 +11,8 @@ * limitations under the License. */ +@import (reference) '../../../styles/variables.less'; + .knowledge-graph-custom-node { display: flex; align-items: center; @@ -18,49 +20,21 @@ gap: 8px; padding: 8px; border-radius: 10px; - background-color: #ffffff; - border: 1px solid #e9eaeb; + background-color: @background-color; + border: 1px solid @border-color; box-sizing: border-box; width: 100%; height: 100%; cursor: pointer; - .entity-name-container { - display: flex; - align-items: center; - gap: 8px; - white-space: nowrap; - overflow: hidden; - } - - .icon-container { - display: flex; - align-items: center; - justify-content: center; - color: #717680; - } - - .asset-name { - font-size: 14px; - font-weight: 600; - color: #181d27; - overflow: hidden; - text-overflow: ellipsis; - flex: 1; - min-width: 0; - } - .asset-type-tag { - font-size: 12px; padding: 2px 4px; border-radius: 4px; border-width: 1px; border-style: solid; - border-color: #e9eaeb; } &.highlighted { - border-color: #1677ff; - box-shadow: 0 0 0 2px rgba(22, 119, 255, 0.15); + border-color: @primary-color; } } diff --git a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx index 497045aba575..d4702e4ed018 100644 --- a/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/components/KnowledgeGraph/KnowledgeGraph.tsx @@ -51,7 
+51,11 @@ import { ReactComponent as LineageIcon } from '../../assets/svg/ic-platform-line import { ReactComponent as ZoomInIcon } from '../../assets/svg/ic-zoom-in.svg'; import { ReactComponent as ZoomOutIcon } from '../../assets/svg/ic-zoom-out.svg'; import { ReactComponent as RefreshIcon } from '../../assets/svg/reload.svg'; -import { FULLSCREEN_QUERY_PARAM_KEY } from '../../constants/constants'; +import { + FULLSCREEN_QUERY_PARAM_KEY, + LITE_GRAY_COLOR, + WHITE_COLOR, +} from '../../constants/constants'; import { useTheme } from '../../context/UntitledUIThemeProvider/theme-provider'; import { ERROR_PLACEHOLDER_TYPE, SIZE } from '../../enums/common.enum'; import { EntityType } from '../../enums/entity.enum'; @@ -283,8 +287,8 @@ const KnowledgeGraph: React.FC = ({ key: 'left', placement: [-0.04, 0.5] as [number, number], r: 6, - fill: '#ffffff', - stroke: '#d9d9d9', + fill: WHITE_COLOR, + stroke: LITE_GRAY_COLOR, lineWidth: 1.5, }; @@ -292,8 +296,8 @@ const KnowledgeGraph: React.FC = ({ key: 'right', placement: [1.04, 0.5] as [number, number], r: 6, - fill: '#ffffff', - stroke: '#d9d9d9', + fill: WHITE_COLOR, + stroke: LITE_GRAY_COLOR, lineWidth: 1.5, }; @@ -489,17 +493,24 @@ const KnowledgeGraph: React.FC = ({ networkRef.current = graph; + const resizeObserver = new ResizeObserver(() => { + if (containerRef.current && networkRef.current) { + networkRef.current.resize( + containerRef.current.offsetWidth, + containerRef.current.offsetHeight + ); + } + }); + resizeObserver.observe(containerRef.current); + return () => { + if (networkRef.current === graph) { + networkRef.current = null; + } graph.destroy(); + resizeObserver.disconnect(); }; - }, [ - graphData, - loading, - layout, - entity?.id, - isFullscreen, - transformToG6Format, - ]); + }, [graphData, loading, layout, entity?.id, isFullscreen]); useEffect(() => { if (entity?.id) { diff --git a/openmetadata-ui/src/main/resources/ui/src/utils/TableUtils.tsx 
b/openmetadata-ui/src/main/resources/ui/src/utils/TableUtils.tsx index ba597e0687bd..6aa877895381 100644 --- a/openmetadata-ui/src/main/resources/ui/src/utils/TableUtils.tsx +++ b/openmetadata-ui/src/main/resources/ui/src/utils/TableUtils.tsx @@ -141,7 +141,6 @@ import SchemaTable from '../components/Database/SchemaTable/SchemaTable.componen import TableQueries from '../components/Database/TableQueries/TableQueries'; import { ContractTab } from '../components/DataContract/ContractTab/ContractTab'; import { useEntityExportModalProvider } from '../components/Entity/EntityExportModalProvider/EntityExportModalProvider.component'; -import KnowledgeGraph from '../components/KnowledgeGraph/KnowledgeGraph'; import { SourceType } from '../components/SearchedData/SearchedData.interface'; import { NON_SERVICE_TYPE_ASSETS } from '../constants/Assets.constants'; import { FQN_SEPARATOR_CHAR } from '../constants/char.constants'; @@ -204,6 +203,9 @@ import { ordinalize } from './StringsUtils'; import { TableDetailPageTabProps } from './TableClassBase'; import { TableFieldsInfoCommonEntities } from './TableUtils.interface'; import { extractTopicFields } from './TopicDetailsUtils'; +const KnowledgeGraph = lazy( + () => import('../components/KnowledgeGraph/KnowledgeGraph') +); const EntityLineageTab = lazy(() => import('../components/Lineage/EntityLineageTab/EntityLineageTab').then( @@ -980,18 +982,20 @@ export const getTableDetailPageBaseTabs = ({ ), key: EntityTabs.KNOWLEDGE_GRAPH, children: ( - + }> + + ), isHidden: !useApplicationStore.getState().rdfEnabled, }, From 88672fcad30045a1753331487fb196d29c370570 Mon Sep 17 00:00:00 2001 From: Aniket Katkar Date: Mon, 6 Apr 2026 18:03:38 +0530 Subject: [PATCH 07/42] Add the missing filters --- .../src/components/base/dropdown/dropdown.tsx | 22 +- .../KnowledgeGraph/KnowledgeGraph.tsx | 314 +++++++++++++++--- 2 files changed, 282 insertions(+), 54 deletions(-) diff --git 
a/openmetadata-ui-core-components/src/main/resources/ui/src/components/base/dropdown/dropdown.tsx b/openmetadata-ui-core-components/src/main/resources/ui/src/components/base/dropdown/dropdown.tsx index 51144adfb8c5..829db4711c97 100644 --- a/openmetadata-ui-core-components/src/main/resources/ui/src/components/base/dropdown/dropdown.tsx +++ b/openmetadata-ui-core-components/src/main/resources/ui/src/components/base/dropdown/dropdown.tsx @@ -17,6 +17,7 @@ import { Popover as AriaPopover, Separator as AriaSeparator, } from 'react-aria-components'; +import { CheckboxBase } from '@/components/base/checkbox/checkbox'; import { cx } from '@/utils/cx'; interface DropdownItemProps extends AriaMenuItemProps { @@ -28,6 +29,8 @@ interface DropdownItemProps extends AriaMenuItemProps { unstyled?: boolean; /** An icon to be displayed on the left side of the item. */ icon?: FC<{ className?: string }>; + /** If true, shows a checkbox on the left to indicate selection state. */ + showCheckbox?: boolean; } const DropdownItem = ({ @@ -36,6 +39,7 @@ const DropdownItem = ({ addon, icon: Icon, unstyled, + showCheckbox, ...props }: DropdownItemProps) => { if (unstyled) { @@ -57,16 +61,26 @@ const DropdownItem = ({ {(state) => (
+ {showCheckbox && ( + + )} + {Icon && (