Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@

logger = ingestion_logger()

ischema_names = update_mssql_ischema_names(ischema_names)
update_mssql_ischema_names(ischema_names)

MSDialect.get_table_comment = get_table_comment
MSDialect.get_view_definition = get_view_definition
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@
"UInt8": SMALLINT,
"IPv4": create_sqlalchemy_type("IPv4"),
"IPv6": create_sqlalchemy_type("IPv6"),
# ClickHouse geo types (v21+)
"Point": create_sqlalchemy_type("Point"),
"Ring": create_sqlalchemy_type("Ring"),
"Polygon": create_sqlalchemy_type("Polygon"),
"MultiPolygon": create_sqlalchemy_type("MultiPolygon"),
"LineString": create_sqlalchemy_type("LineString"),
"MultiLineString": create_sqlalchemy_type("MultiLineString"),
}
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
"pg_snapshot": create_sqlalchemy_type("PG_SNAPSHOT"),
"tsquery": create_sqlalchemy_type("TSQUERY"),
"txid_snapshot": create_sqlalchemy_type("TXID_SNAPSHOT"),
"tid": SqlAlchemyString,
"xid": SqlAlchemyString,
"xml": create_sqlalchemy_type("XML"),
Comment thread
SumanMaharana marked this conversation as resolved.
# PostgreSQL range types (used by TimescaleDB for chunk boundaries)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
# Avoid using these data types in new development work, and plan to modify applications that currently use them.
# Use nvarchar(max), varchar(max), and varbinary(max) instead.
# ref: https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql?view=sql-server-ver16
ischema_names = update_mssql_ischema_names(ischema_names)
update_mssql_ischema_names(ischema_names)

MSDialect.get_table_comment = get_table_comment
MSDialect.get_view_definition = get_view_definition
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,12 @@ def _get_sqlalchemy_type(type_str):
type_mapping = {
"VARCHAR": sqltypes.VARCHAR,
"CHAR": sqltypes.CHAR,
"TINYINT": sqltypes.SMALLINT,
"SMALLINT": sqltypes.SMALLINT,
"INT": sqltypes.INT,
"INTEGER": sqltypes.INTEGER,
"BIGINT": sqltypes.BIGINT,
"LARGEINT": sqltypes.BIGINT,
"FLOAT": sqltypes.FLOAT,
"DOUBLE": sqltypes.FLOAT,
"DECIMAL": sqltypes.DECIMAL,
Expand All @@ -103,11 +107,17 @@ def _get_sqlalchemy_type(type_str):
"TIMESTAMP": sqltypes.TIMESTAMP,
"BOOLEAN": sqltypes.BOOLEAN,
"ARRAY": sqltypes.ARRAY,
"MAP": sqltypes.TEXT,
"STRUCT": sqltypes.TEXT,
"JSON": sqltypes.JSON,
"STRING": sqltypes.TEXT,
"BINARY": sqltypes.BINARY,
"VARBINARY": sqltypes.VARBINARY,
"TEXT": sqltypes.TEXT,
# StarRocks specialised analytics types — no SQL equivalent; store as TEXT
"BITMAP": sqltypes.TEXT,
"HLL": sqltypes.TEXT,
"PERCENTILE": sqltypes.TEXT,
"UNKNOWN": sqltypes.NullType,
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,18 @@
{
"UUID": create_sqlalchemy_type("UUID"),
"GEOGRAPHY": create_sqlalchemy_type("GEOGRAPHY"),
"GEOMETRY": create_sqlalchemy_type("GEOMETRY"),
# Binary types
"BINARY": sqltypes.LargeBinary,
"VARBINARY": sqltypes.LargeBinary,
"LONG VARBINARY": sqltypes.LargeBinary,
# Long string
"LONG VARCHAR": sqltypes.Text,
# Complex / semi-structured types (Vertica v11+)
Comment thread
SumanMaharana marked this conversation as resolved.
"ARRAY": create_sqlalchemy_type("ARRAY"),
"NATIVE ARRAY": create_sqlalchemy_type("ARRAY"),
"ROW": create_sqlalchemy_type("ROW"),
"SET": create_sqlalchemy_type("SET"),
}
)

Expand Down
4 changes: 2 additions & 2 deletions ingestion/src/metadata/utils/sqa_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,8 @@ def is_array(kwargs: Dict) -> bool:
return False


def update_mssql_ischema_names(ischema_names):
return ischema_names.update(
def update_mssql_ischema_names(ischema_names: dict) -> None:
ischema_names.update(
{
"nvarchar": create_sqlalchemy_type("NVARCHAR"),
"nchar": create_sqlalchemy_type("NCHAR"),
Expand Down
66 changes: 66 additions & 0 deletions ingestion/tests/unit/topology/database/test_clickhouse_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,69 @@ def test_nullable_unwraps_to_inner_type(self):
def test_unknown_type_returns_null_type(self):
result = self.dialect._get_column_type("col", "SomeUnknownType")
assert result is sqltypes.NullType


class TestClickhouseGeoTypes:
    """Verify that ClickHouse geo types are registered in ischema_names
    and resolved correctly by _get_column_type."""

    def setup_method(self):
        # Fresh mock dialect for every test so no state leaks between cases.
        self.dialect = MockDialect()

    # --- Registration checks ---

    def test_geo_types_registered_in_ischema_names(self):
        expected = (
            "Point",
            "Ring",
            "Polygon",
            "MultiPolygon",
            "LineString",
            "MultiLineString",
        )
        for name in expected:
            assert name in ch_ischema_names, f"{name} not found in ischema_names"

    # --- Resolution via _get_column_type ---

    def _resolve(self, type_name):
        # Resolve a raw ClickHouse type string through the dialect under test.
        return self.dialect._get_column_type("col", type_name)

    def test_point_type_resolves(self):
        assert self._resolve("Point") == ch_ischema_names["Point"]

    def test_ring_type_resolves(self):
        assert self._resolve("Ring") == ch_ischema_names["Ring"]

    def test_polygon_type_resolves(self):
        assert self._resolve("Polygon") == ch_ischema_names["Polygon"]

    def test_multipolygon_type_resolves(self):
        assert self._resolve("MultiPolygon") == ch_ischema_names["MultiPolygon"]

    def test_linestring_type_resolves(self):
        assert self._resolve("LineString") == ch_ischema_names["LineString"]

    def test_multilinestring_type_resolves(self):
        assert self._resolve("MultiLineString") == ch_ischema_names["MultiLineString"]
Comment thread
SumanMaharana marked this conversation as resolved.

def test_geo_types_are_distinct(self):
    """Each geo type should resolve to a different object (and never NullType)."""
    types = {
        name: ch_ischema_names[name]
        for name in (
            "Point",
            "Ring",
            "Polygon",
            "MultiPolygon",
            "LineString",
            "MultiLineString",
        )
    }
    # No geo type may fall back to the generic NullType.
    for name, t in types.items():
        assert t is not sqltypes.NullType, f"{name} resolved to NullType"
    # And they must all map to distinct objects — this is the distinctness
    # claim in the docstring, previously asserted nowhere.
    assert len(set(types.values())) == len(
        types
    ), "Geo types do not resolve to distinct objects"
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_geo_types_are_distinct claims each geo type should resolve to a different object, but the assertions only check the entries are not NullType and never verify distinctness across the mapped values. Either add an explicit uniqueness assertion (e.g., all mapped values are unique) or update the docstring/test name to match the current check.

Suggested change
assert t is not sqltypes.NullType, f"{name} resolved to NullType"
assert t is not sqltypes.NullType, f"{name} resolved to NullType"
# And all geo types should map to distinct objects
assert len(set(types.values())) == len(
types
), "Geo types do not resolve to distinct objects"

Copilot uses AI. Check for mistakes.
50 changes: 50 additions & 0 deletions ingestion/tests/unit/topology/database/test_mssql.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
MSSQL_GET_DATABASE,
MSSQL_TEST_GET_QUERIES,
)
from metadata.utils.sqa_utils import update_mssql_ischema_names

mock_mssql_config = {
"source": {
Expand Down Expand Up @@ -333,6 +334,55 @@ def test_get_stored_procedures(self):
self.assertEqual(len(results), 1)
self.assertEqual(results[0].name, "sp_include")


class TestUpdateMssqlIschemaNames:
    """Verify update_mssql_ischema_names mutates the dict in-place and returns None."""

    # Keys the helper is expected to register for MSSQL-specific types.
    EXPECTED_MSSQL_TYPES = [
        "nvarchar",
        "nchar",
        "ntext",
        "bit",
        "image",
        "binary",
        "smallmoney",
        "money",
        "real",
        "smalldatetime",
        "datetime2",
        "datetimeoffset",
        "sql_variant",
        "uniqueidentifier",
        "xml",
        "hierarchyid",
        "geography",
        "geometry",
    ]

    def test_returns_none(self):
        """In-place mutators follow the stdlib convention and return None."""
        result = update_mssql_ischema_names({})
        assert result is None

    def test_mutates_dict_in_place(self):
        """Every expected MSSQL type key is added to the passed-in dict."""
        target = {}
        update_mssql_ischema_names(target)
        for type_key in self.EXPECTED_MSSQL_TYPES:
            assert (
                type_key in target
            ), f"'{type_key}' was not added by update_mssql_ischema_names"

    def test_all_added_types_are_not_none(self):
        """No registered type may map to None."""
        target = {}
        update_mssql_ischema_names(target)
        for type_key in self.EXPECTED_MSSQL_TYPES:
            assert target[type_key] is not None, f"'{type_key}' was mapped to None"

    def test_does_not_overwrite_existing_entries(self):
        """Non-colliding entries are preserved; colliding keys are refreshed.

        The helper uses ``dict.update``, so a key it registers (e.g. ``xml``)
        replaces any pre-existing value, while unrelated keys are untouched.
        The previous version of this test only used a non-colliding key and
        therefore never exercised the collision behavior.
        """
        sentinel = object()
        colliding = object()
        target = {"existing_key": sentinel, "xml": colliding}
        update_mssql_ischema_names(target)
        # Unrelated keys survive untouched.
        assert target["existing_key"] is sentinel
        # Colliding keys are replaced by the MSSQL mapping.
        assert target["xml"] is not colliding
Comment thread
gitar-bot[bot] marked this conversation as resolved.
Comment on lines +380 to +384
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test_does_not_overwrite_existing_entries doesn’t actually verify non-overwrite for keys that update_mssql_ischema_names updates (it uses a non-colliding key). Either change the test to pre-populate one of the keys being added (e.g., xml/geography) and assert it’s preserved, or rename the test to reflect what it checks. If the intent is truly “don’t overwrite”, the helper should use setdefault/conditional updates instead of dict.update().

Copilot uses AI. Check for mistakes.

@patch(
"metadata.ingestion.source.database.mssql.connection.test_connection_db_common"
)
Expand Down
27 changes: 27 additions & 0 deletions ingestion/tests/unit/topology/database/test_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -923,3 +923,30 @@ def test_mark_deleted_databases_with_multiple_databases(self):
self.assertEqual(
call_args[1]["entity_source_state"], expected_source_state
)


class TestPostgresCommonMappings(TestCase):
    """Verify extended type entries in the shared PostgreSQL ischema_names map."""

    @staticmethod
    def _pg_ischema_names():
        """Import common_pg_mappings (registers types as an import side-effect)
        and return SQLAlchemy's PostgreSQL ischema_names dict."""
        from sqlalchemy.dialects.postgresql.base import (
            ischema_names as pg_ischema_names,
        )

        import metadata.ingestion.source.database.common_pg_mappings  # noqa: F401

        return pg_ischema_names

    def test_tid_type_registered(self):
        """'tid' must be present in the PostgreSQL ischema_names after common_pg_mappings is loaded."""
        self.assertIn("tid", self._pg_ischema_names())

    def test_tid_maps_to_string(self):
        """'tid' must map to a String-compatible SQLAlchemy type."""
        from sqlalchemy import String as SqlAlchemyString

        self.assertIs(self._pg_ischema_names()["tid"], SqlAlchemyString)
Comment thread
SumanMaharana marked this conversation as resolved.
46 changes: 45 additions & 1 deletion ingestion/tests/unit/topology/database/test_starrocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,16 @@
from unittest import TestCase
from unittest.mock import patch

import pytest
from sqlalchemy import types as sqltypes

from metadata.generated.schema.metadataIngestion.workflow import (
OpenMetadataWorkflowConfig,
)
from metadata.ingestion.source.database.starrocks.metadata import StarRocksSource
from metadata.ingestion.source.database.starrocks.metadata import (
StarRocksSource,
_get_sqlalchemy_type,
)

mock_starrocks_config = {
"source": {
Expand Down Expand Up @@ -129,6 +135,44 @@ def test_ssl_manager_initialized(self):
self.assertIsNotNone(self.starrocks_source.ssl_manager)


class TestStarRocksTypeMappings:
    """Verify _get_sqlalchemy_type returns the correct SQLAlchemy type for every
    type added in the recent type-mapping expansion."""

    @pytest.mark.parametrize(
        "type_str, expected_sqa_class",
        [
            # Integer family
            ("TINYINT", sqltypes.SMALLINT),
            ("SMALLINT", sqltypes.SMALLINT),
            ("INTEGER", sqltypes.INTEGER),
            ("INT", sqltypes.INT),
            ("BIGINT", sqltypes.BIGINT),
            # LARGEINT has no direct SQLAlchemy equivalent; mapped to BIGINT
            ("LARGEINT", sqltypes.BIGINT),
            # Analytics / semi-structured types stored as TEXT
            ("MAP", sqltypes.TEXT),
            ("STRUCT", sqltypes.TEXT),
            ("BITMAP", sqltypes.TEXT),
            ("HLL", sqltypes.TEXT),
            ("PERCENTILE", sqltypes.TEXT),
            # Existing string types (regression guard)
            ("STRING", sqltypes.TEXT),
            ("TEXT", sqltypes.TEXT),
            ("JSON", sqltypes.JSON),
        ],
    )
    def test_type_resolves_to_expected_class(self, type_str, expected_sqa_class):
        # isinstance (not identity) — presumably the helper returns a type
        # *instance* rather than the class itself; confirm against metadata.py.
        result = _get_sqlalchemy_type(type_str)
        assert isinstance(result, expected_sqa_class), (
            f"_get_sqlalchemy_type('{type_str}') returned {type(result).__name__}, "
            f"expected {expected_sqa_class.__name__}"
        )

    def test_unknown_type_returns_null_type(self):
        # Unmapped type strings must fall back to NullType instead of raising.
        result = _get_sqlalchemy_type("UNKNOWN_CUSTOM_TYPE")
        assert isinstance(result, sqltypes.NullType)


class TestStarRocksIcebergMapping(TestCase):
def test_iceberg_relkind_mapping(self):
from metadata.generated.schema.entity.data.table import TableType
Expand Down
Loading
Loading