Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@

logger = ingestion_logger()

ischema_names = update_mssql_ischema_names(ischema_names)
update_mssql_ischema_names(ischema_names)

MSDialect.get_table_comment = get_table_comment
MSDialect.get_view_definition = get_view_definition
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@
"UInt8": SMALLINT,
"IPv4": create_sqlalchemy_type("IPv4"),
"IPv6": create_sqlalchemy_type("IPv6"),
# ClickHouse geo types (v21+)
"Point": create_sqlalchemy_type("Point"),
"Ring": create_sqlalchemy_type("Ring"),
"Polygon": create_sqlalchemy_type("Polygon"),
"MultiPolygon": create_sqlalchemy_type("MultiPolygon"),
"LineString": create_sqlalchemy_type("LineString"),
"MultiLineString": create_sqlalchemy_type("MultiLineString"),
}
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
"pg_snapshot": create_sqlalchemy_type("PG_SNAPSHOT"),
"tsquery": create_sqlalchemy_type("TSQUERY"),
"txid_snapshot": create_sqlalchemy_type("TXID_SNAPSHOT"),
"tid": SqlAlchemyString,
"xid": SqlAlchemyString,
"xml": create_sqlalchemy_type("XML"),
Comment thread
SumanMaharana marked this conversation as resolved.
# PostgreSQL range types (used by TimescaleDB for chunk boundaries)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
# Avoid using these data types in new development work, and plan to modify applications that currently use them.
# Use nvarchar(max), varchar(max), and varbinary(max) instead.
# ref: https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql?view=sql-server-ver16
ischema_names = update_mssql_ischema_names(ischema_names)
update_mssql_ischema_names(ischema_names)

MSDialect.get_table_comment = get_table_comment
MSDialect.get_view_definition = get_view_definition
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,12 @@ def _get_sqlalchemy_type(type_str):
type_mapping = {
"VARCHAR": sqltypes.VARCHAR,
"CHAR": sqltypes.CHAR,
"TINYINT": sqltypes.SMALLINT,
"SMALLINT": sqltypes.SMALLINT,
"INT": sqltypes.INT,
"INTEGER": sqltypes.INTEGER,
"BIGINT": sqltypes.BIGINT,
"LARGEINT": sqltypes.BIGINT,
"FLOAT": sqltypes.FLOAT,
"DOUBLE": sqltypes.FLOAT,
"DECIMAL": sqltypes.DECIMAL,
Expand All @@ -103,11 +107,17 @@ def _get_sqlalchemy_type(type_str):
"TIMESTAMP": sqltypes.TIMESTAMP,
"BOOLEAN": sqltypes.BOOLEAN,
"ARRAY": sqltypes.ARRAY,
"MAP": sqltypes.TEXT,
"STRUCT": sqltypes.TEXT,
"JSON": sqltypes.JSON,
"STRING": sqltypes.TEXT,
"BINARY": sqltypes.BINARY,
"VARBINARY": sqltypes.VARBINARY,
"TEXT": sqltypes.TEXT,
# StarRocks specialised analytics types — no SQL equivalent; store as TEXT
"BITMAP": sqltypes.TEXT,
"HLL": sqltypes.TEXT,
"PERCENTILE": sqltypes.TEXT,
"UNKNOWN": sqltypes.NullType,
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,18 @@
{
"UUID": create_sqlalchemy_type("UUID"),
"GEOGRAPHY": create_sqlalchemy_type("GEOGRAPHY"),
"GEOMETRY": create_sqlalchemy_type("GEOMETRY"),
# Binary types
"BINARY": sqltypes.LargeBinary,
"VARBINARY": sqltypes.LargeBinary,
"LONG VARBINARY": sqltypes.LargeBinary,
# Long string
"LONG VARCHAR": sqltypes.Text,
# Complex / semi-structured types (Vertica v11+)
Comment thread
SumanMaharana marked this conversation as resolved.
"ARRAY": create_sqlalchemy_type("ARRAY"),
"NATIVE ARRAY": create_sqlalchemy_type("ARRAY"),
"ROW": create_sqlalchemy_type("ROW"),
"SET": create_sqlalchemy_type("SET"),
}
)

Expand Down
4 changes: 2 additions & 2 deletions ingestion/src/metadata/utils/sqa_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,8 @@ def is_array(kwargs: Dict) -> bool:
return False


def update_mssql_ischema_names(ischema_names):
return ischema_names.update(
def update_mssql_ischema_names(ischema_names: dict) -> None:
ischema_names.update(
{
"nvarchar": create_sqlalchemy_type("NVARCHAR"),
"nchar": create_sqlalchemy_type("NCHAR"),
Expand Down
66 changes: 66 additions & 0 deletions ingestion/tests/unit/topology/database/test_clickhouse_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,69 @@ def test_nullable_unwraps_to_inner_type(self):
def test_unknown_type_returns_null_type(self):
result = self.dialect._get_column_type("col", "SomeUnknownType")
assert result is sqltypes.NullType


class TestClickhouseGeoTypes:
    """Verify that ClickHouse geo types are registered in ischema_names
    and resolved correctly by _get_column_type."""

    def setup_method(self):
        # Fresh mock dialect for every test so no state leaks between cases.
        self.dialect = MockDialect()

    # --- Registration checks ---

    def test_geo_types_registered_in_ischema_names(self):
        expected = (
            "Point",
            "Ring",
            "Polygon",
            "MultiPolygon",
            "LineString",
            "MultiLineString",
        )
        for name in expected:
            assert name in ch_ischema_names, f"{name} not found in ischema_names"

    # --- Resolution via _get_column_type ---

    def _resolve(self, type_name):
        # Resolve a raw ClickHouse type string through the dialect under test.
        return self.dialect._get_column_type("col", type_name)

    def test_point_type_resolves(self):
        assert self._resolve("Point") == ch_ischema_names["Point"]

    def test_ring_type_resolves(self):
        assert self._resolve("Ring") == ch_ischema_names["Ring"]

    def test_polygon_type_resolves(self):
        assert self._resolve("Polygon") == ch_ischema_names["Polygon"]

    def test_multipolygon_type_resolves(self):
        assert self._resolve("MultiPolygon") == ch_ischema_names["MultiPolygon"]

    def test_linestring_type_resolves(self):
        assert self._resolve("LineString") == ch_ischema_names["LineString"]

    def test_multilinestring_type_resolves(self):
        assert self._resolve("MultiLineString") == ch_ischema_names["MultiLineString"]
Comment thread
SumanMaharana marked this conversation as resolved.

def test_geo_types_are_distinct(self):
    """Each geo type should resolve to a different object (and never NullType)."""
    types = {
        name: ch_ischema_names[name]
        for name in (
            "Point",
            "Ring",
            "Polygon",
            "MultiPolygon",
            "LineString",
            "MultiLineString",
        )
    }
    # No geo type may fall back to the generic NullType.
    for name, t in types.items():
        assert t is not sqltypes.NullType, f"{name} resolved to NullType"
    # And they must all map to distinct objects — this is the distinctness
    # claim in the docstring, previously asserted nowhere.
    assert len(set(types.values())) == len(
        types
    ), "Geo types do not resolve to distinct objects"
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_geo_types_are_distinct claims each geo type should resolve to a different object, but the assertions only check the entries are not NullType and never verify distinctness across the mapped values. Either add an explicit uniqueness assertion (e.g., all mapped values are unique) or update the docstring/test name to match the current check.

Suggested change
assert t is not sqltypes.NullType, f"{name} resolved to NullType"
assert t is not sqltypes.NullType, f"{name} resolved to NullType"
# And all geo types should map to distinct objects
assert len(set(types.values())) == len(
types
), "Geo types do not resolve to distinct objects"

Copilot uses AI. Check for mistakes.
50 changes: 50 additions & 0 deletions ingestion/tests/unit/topology/database/test_mssql.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
MSSQL_GET_DATABASE,
MSSQL_TEST_GET_QUERIES,
)
from metadata.utils.sqa_utils import update_mssql_ischema_names

mock_mssql_config = {
"source": {
Expand Down Expand Up @@ -333,6 +334,55 @@ def test_get_stored_procedures(self):
self.assertEqual(len(results), 1)
self.assertEqual(results[0].name, "sp_include")


class TestUpdateMssqlIschemaNames:
    """Verify update_mssql_ischema_names mutates the dict in-place and returns None."""

    # Keys the helper is expected to register for MSSQL-specific types.
    EXPECTED_MSSQL_TYPES = [
        "nvarchar",
        "nchar",
        "ntext",
        "bit",
        "image",
        "binary",
        "smallmoney",
        "money",
        "real",
        "smalldatetime",
        "datetime2",
        "datetimeoffset",
        "sql_variant",
        "uniqueidentifier",
        "xml",
        "hierarchyid",
        "geography",
        "geometry",
    ]

    def test_returns_none(self):
        """In-place mutators follow the stdlib convention and return None."""
        result = update_mssql_ischema_names({})
        assert result is None

    def test_mutates_dict_in_place(self):
        """Every expected MSSQL type key is added to the passed-in dict."""
        target = {}
        update_mssql_ischema_names(target)
        for type_key in self.EXPECTED_MSSQL_TYPES:
            assert (
                type_key in target
            ), f"'{type_key}' was not added by update_mssql_ischema_names"

    def test_all_added_types_are_not_none(self):
        """No registered type may map to None."""
        target = {}
        update_mssql_ischema_names(target)
        for type_key in self.EXPECTED_MSSQL_TYPES:
            assert target[type_key] is not None, f"'{type_key}' was mapped to None"

    def test_does_not_overwrite_existing_entries(self):
        """Non-colliding entries are preserved; colliding keys are refreshed.

        The helper uses ``dict.update``, so a key it registers (e.g. ``xml``)
        replaces any pre-existing value, while unrelated keys are untouched.
        The previous version of this test only used a non-colliding key and
        therefore never exercised the collision behavior.
        """
        sentinel = object()
        colliding = object()
        target = {"existing_key": sentinel, "xml": colliding}
        update_mssql_ischema_names(target)
        # Unrelated keys survive untouched.
        assert target["existing_key"] is sentinel
        # Colliding keys are replaced by the MSSQL mapping.
        assert target["xml"] is not colliding
Comment thread
gitar-bot[bot] marked this conversation as resolved.
Comment on lines +380 to +384
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_does_not_overwrite_existing_entries doesn’t actually verify non-overwrite for keys that update_mssql_ischema_names updates (it uses a non-colliding key). Either change the test to pre-populate one of the keys being added (e.g., xml/geography) and assert it’s preserved, or rename the test to reflect what it checks. If the intent is truly “don’t overwrite”, the helper should use setdefault/conditional updates instead of dict.update().

Copilot uses AI. Check for mistakes.

@patch(
"metadata.ingestion.source.database.mssql.connection.test_connection_db_common"
)
Expand Down
27 changes: 27 additions & 0 deletions ingestion/tests/unit/topology/database/test_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,3 +923,30 @@ def test_mark_deleted_databases_with_multiple_databases(self):
self.assertEqual(
call_args[1]["entity_source_state"], expected_source_state
)


class TestPostgresCommonMappings(TestCase):
    """Verify extended type entries in the shared PostgreSQL ischema_names map."""

    @staticmethod
    def _pg_ischema_names():
        """Import common_pg_mappings (registers types as an import side-effect)
        and return SQLAlchemy's PostgreSQL ischema_names dict."""
        from sqlalchemy.dialects.postgresql.base import (
            ischema_names as pg_ischema_names,
        )

        import metadata.ingestion.source.database.common_pg_mappings  # noqa: F401

        return pg_ischema_names

    def test_tid_type_registered(self):
        """'tid' must be present in the PostgreSQL ischema_names after common_pg_mappings is loaded."""
        self.assertIn("tid", self._pg_ischema_names())

    def test_tid_maps_to_string(self):
        """'tid' must map to a String-compatible SQLAlchemy type."""
        from sqlalchemy import String as SqlAlchemyString

        self.assertIs(self._pg_ischema_names()["tid"], SqlAlchemyString)
Comment thread
SumanMaharana marked this conversation as resolved.
46 changes: 45 additions & 1 deletion ingestion/tests/unit/topology/database/test_starrocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,16 @@
from unittest import TestCase
from unittest.mock import patch

import pytest
from sqlalchemy import types as sqltypes

from metadata.generated.schema.metadataIngestion.workflow import (
OpenMetadataWorkflowConfig,
)
from metadata.ingestion.source.database.starrocks.metadata import StarRocksSource
from metadata.ingestion.source.database.starrocks.metadata import (
StarRocksSource,
_get_sqlalchemy_type,
)

mock_starrocks_config = {
"source": {
Expand Down Expand Up @@ -129,6 +135,44 @@ def test_ssl_manager_initialized(self):
self.assertIsNotNone(self.starrocks_source.ssl_manager)


class TestStarRocksTypeMappings:
    """Verify _get_sqlalchemy_type returns the correct SQLAlchemy type for every
    type added in the recent type-mapping expansion."""

    @pytest.mark.parametrize(
        "type_str, expected_sqa_class",
        [
            # Integer family
            ("TINYINT", sqltypes.SMALLINT),
            ("SMALLINT", sqltypes.SMALLINT),
            ("INTEGER", sqltypes.INTEGER),
            ("INT", sqltypes.INT),
            ("BIGINT", sqltypes.BIGINT),
            # LARGEINT has no direct SQLAlchemy equivalent; mapped to BIGINT
            ("LARGEINT", sqltypes.BIGINT),
            # Analytics / semi-structured types stored as TEXT
            ("MAP", sqltypes.TEXT),
            ("STRUCT", sqltypes.TEXT),
            ("BITMAP", sqltypes.TEXT),
            ("HLL", sqltypes.TEXT),
            ("PERCENTILE", sqltypes.TEXT),
            # Existing string types (regression guard)
            ("STRING", sqltypes.TEXT),
            ("TEXT", sqltypes.TEXT),
            ("JSON", sqltypes.JSON),
        ],
    )
    def test_type_resolves_to_expected_class(self, type_str, expected_sqa_class):
        # isinstance (not identity) — presumably the helper returns a type
        # *instance* rather than the class itself; confirm against metadata.py.
        result = _get_sqlalchemy_type(type_str)
        assert isinstance(result, expected_sqa_class), (
            f"_get_sqlalchemy_type('{type_str}') returned {type(result).__name__}, "
            f"expected {expected_sqa_class.__name__}"
        )

    def test_unknown_type_returns_null_type(self):
        # Unmapped type strings must fall back to NullType instead of raising.
        result = _get_sqlalchemy_type("UNKNOWN_CUSTOM_TYPE")
        assert isinstance(result, sqltypes.NullType)


class TestStarRocksIcebergMapping(TestCase):
def test_iceberg_relkind_mapping(self):
from metadata.generated.schema.entity.data.table import TableType
Expand Down
Loading
Loading