open-metadata · TeddyCr · Apr 14, 2026 · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026
diff --git a/bootstrap/sql/migrations/native/1.13.0/mysql/postDataMigrationSQLScript.sql b/bootstrap/sql/migrations/native/1.13.0/mysql/postDataMigrationSQLScript.sql
@@ -3,6 +3,24 @@ SET json = JSON_REMOVE(json, '$.sourceConfig.config.computeMetrics')
 WHERE JSON_EXTRACT(json, '$.sourceConfig.config.computeMetrics') IS NOT NULL
 AND pipelineType = 'profiler';
 
+-- Reset randomizedSample to false wherever it is stored as true (true was the old default value)
+UPDATE ingestion_pipeline_entity
+SET json = JSON_SET(json, '$.sourceConfig.config.randomizedSample', false)
+WHERE JSON_EXTRACT(json, '$.sourceConfig.config.randomizedSample') = true
+AND pipelineType = 'profiler';
+
+UPDATE table_entity
+SET json = JSON_SET(json, '$.tableProfilerConfig.randomizedSample', false)
+WHERE JSON_EXTRACT(json, '$.tableProfilerConfig.randomizedSample') = true;
+
+UPDATE database_entity
+SET json = JSON_SET(json, '$.databaseProfilerConfig.randomizedSample', false)
+WHERE JSON_EXTRACT(json, '$.databaseProfilerConfig.randomizedSample') = true;
+
+UPDATE database_schema_entity
+SET json = JSON_SET(json, '$.databaseSchemaProfilerConfig.randomizedSample', false)
+WHERE JSON_EXTRACT(json, '$.databaseSchemaProfilerConfig.randomizedSample') = true;
+
-- Set randomizedSample to false where it was true (old default behavior)
-UPDATE ingestion_pipeline_entity
-SET json = JSON_SET(json, '$.sourceConfig.config.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.sourceConfig.config.randomizedSample') = true
-AND pipelineType = 'profiler';
-
-UPDATE table_entity
-SET json = JSON_SET(json, '$.tableProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.tableProfilerConfig.randomizedSample') = true;
-
-UPDATE database_entity
-SET json = JSON_SET(json, '$.databaseProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.databaseProfilerConfig.randomizedSample') = true;
-
-UPDATE database_schema_entity
-SET json = JSON_SET(json, '$.databaseSchemaProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.databaseSchemaProfilerConfig.randomizedSample') = true;
+-- Preserve existing randomizedSample values during migration.
+-- Any new default for randomizedSample must be applied only to missing/null
+-- fields in application logic or a separate, explicitly documented migration.
-- Set randomizedSample to false where it was true (old default behavior)
-UPDATE ingestion_pipeline_entity
-SET json = JSON_SET(json, '$.sourceConfig.config.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.sourceConfig.config.randomizedSample') = true
-AND pipelineType = 'profiler';
-
-UPDATE table_entity
-SET json = JSON_SET(json, '$.tableProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.tableProfilerConfig.randomizedSample') = true;
-
-UPDATE database_entity
-SET json = JSON_SET(json, '$.databaseProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.databaseProfilerConfig.randomizedSample') = true;
-
-UPDATE database_schema_entity
-SET json = JSON_SET(json, '$.databaseSchemaProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.databaseSchemaProfilerConfig.randomizedSample') = true;
+-- Preserve existing randomizedSample values during migration.
+-- Any new default for randomizedSample must be applied only to missing/null
+-- fields in application logic or a separate, explicitly documented migration.
 -- Hard-delete ingestion pipelines for Iceberg services (must run before service migration)
 DELETE ipe FROM ingestion_pipeline_entity ipe
 JOIN dbservice_entity dse

diff --git a/bootstrap/sql/migrations/native/1.13.0/postgres/postDataMigrationSQLScript.sql b/bootstrap/sql/migrations/native/1.13.0/postgres/postDataMigrationSQLScript.sql
@@ -3,6 +3,24 @@ SET json = (json::jsonb #- '{sourceConfig,config,computeMetrics}')::json
 WHERE json::jsonb -> 'sourceConfig' -> 'config' -> 'computeMetrics' IS NOT NULL
 AND pipelineType = 'profiler';
 
+-- Reset randomizedSample to false wherever it is stored as true (true was the old default value)
+UPDATE ingestion_pipeline_entity
+SET json = jsonb_set(json::jsonb, '{sourceConfig,config,randomizedSample}', 'false'::jsonb)::json
+WHERE json::jsonb #>> '{sourceConfig,config,randomizedSample}' = 'true'
+AND pipelineType = 'profiler';
+
+UPDATE table_entity
+SET json = jsonb_set(json::jsonb, '{tableProfilerConfig,randomizedSample}', 'false'::jsonb)::json
+WHERE json::jsonb #>> '{tableProfilerConfig,randomizedSample}' = 'true';
+
+UPDATE database_entity
+SET json = jsonb_set(json::jsonb, '{databaseProfilerConfig,randomizedSample}', 'false'::jsonb)::json
+WHERE json::jsonb #>> '{databaseProfilerConfig,randomizedSample}' = 'true';
+
+UPDATE database_schema_entity
+SET json = jsonb_set(json::jsonb, '{databaseSchemaProfilerConfig,randomizedSample}', 'false'::jsonb)::json
+WHERE json::jsonb #>> '{databaseSchemaProfilerConfig,randomizedSample}' = 'true';
+
 -- Hard-delete ingestion pipelines for Iceberg services (must run before service migration)
 DELETE FROM ingestion_pipeline_entity ipe
 USING dbservice_entity dse

@@ -11,6 +11,7 @@
 """
 Sampling Models
 """
+
 from typing import Any, List, Optional, Union
 
 from pydantic import Field, model_validator
@@ -42,7 +43,7 @@ class BaseProfileConfig(ConfigModel):
     profileSampleType: Optional[ProfileSampleType] = None
     samplingMethodType: Optional[SamplingMethodType] = None
     sampleDataCount: Optional[int] = 100
-    randomizedSample: Optional[bool] = True
+    randomizedSample: Optional[bool] = False
 
 
 class ColumnConfig(ConfigModel):
@@ -58,7 +59,7 @@ class TableConfig(BaseProfileConfig):
     profileQuery: Optional[str] = None
     partitionConfig: Optional[PartitionProfilerConfig] = None
     columnConfig: Optional[ColumnConfig] = None
-    randomizedSample: Optional[bool] = True
+    randomizedSample: Optional[bool] = False
 
     @classmethod
     def from_database_and_schema_config(
@@ -127,4 +128,4 @@ class SampleConfig(ConfigModel):
     profileSample: Optional[Union[float, int]] = None
     profileSampleType: Optional[ProfileSampleType] = ProfileSampleType.PERCENTAGE
     samplingMethodType: Optional[SamplingMethodType] = None
-    randomizedSample: Optional[bool] = True
+    randomizedSample: Optional[bool] = False
@@ -107,9 +107,13 @@ def get_dataset(self, **__):
         if self.partition_details:
             raw_dataset = self._partitioned_table()
 
-        if not self.sample_config.profileSample or (
+        if not self.sample_config.profileSample:
+            return raw_dataset
+
+        if (
             self.sample_config.profileSample == 100
             and self.sample_config.profileSampleType == ProfileSampleType.PERCENTAGE
+            and self.sample_config.randomizedSample is not True
         ):
             return raw_dataset
         return self.get_sampled_dataframe(raw_dataset, self.sample_config)

@@ -164,22 +164,25 @@ def get_sample_query(self, *, column=None) -> Query:
                     (ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL),
                 ).cte(f"{self.get_sampler_table_name()}_rnd")
                 session_query = client.query(rnd)
-                return session_query.where(
+                query = session_query.where(
                     rnd.c.random <= self.sample_config.profileSample
-                ).cte(f"{self.get_sampler_table_name()}_sample")
+                )
+                if self.sample_config.randomizedSample is True:
+                    query = query.order_by(rnd.c.random)
+                return query.cte(f"{self.get_sampler_table_name()}_sample")
 
             table_query = client.query(self.raw_dataset)
             if self.partition_details:
                 table_query = self.get_partitioned_query(table_query)
             session_query = self._base_sample_query(
                 column,
                 (ModuloFn(RandomNumFn(), table_query.count())).label(RANDOM_LABEL)
-                if self.sample_config.randomizedSample
+                if self.sample_config.randomizedSample is True
                 else None,
             )
             query = (
                 session_query.order_by(RANDOM_LABEL)
-                if self.sample_config.randomizedSample
+                if self.sample_config.randomizedSample is True
                 else session_query
             )
             return query.limit(self.sample_config.profileSample).cte(
@@ -194,9 +197,16 @@ def get_dataset(self, column=None, **__) -> Union[type, AliasedClass]:
         if self.sample_query:
             return self._rdn_sample_from_user_query()
 
-        if not self.sample_config.profileSample or (
+        if not self.sample_config.profileSample:
+            if self.partition_details:
+                return self._partitioned_table()
+
+            return self.raw_dataset
+
+        if (
             self.sample_config.profileSampleType == ProfileSampleType.PERCENTAGE
             and self.sample_config.profileSample == 100
+            and self.sample_config.randomizedSample is not True
         ):
             if self.partition_details:
                 return self._partitioned_table()
@@ -217,7 +227,6 @@ def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData
         if self.sample_query:
             return self._fetch_sample_data_from_user_query()
 
-        # Add new RandomNumFn column
         ds = self.get_dataset()
         if not columns:
             sqa_columns = [col for col in inspect(ds).c if col.name != RANDOM_LABEL]

@@ -21,7 +21,12 @@
 from sqlalchemy.orm import DeclarativeBase
 
 from metadata.generated.schema.entity.data.table import Column as EntityColumn
-from metadata.generated.schema.entity.data.table import ColumnName, DataType, Table
+from metadata.generated.schema.entity.data.table import (
+    ColumnName,
+    DataType,
+    ProfileSampleType,
+    Table,
+)
 from metadata.generated.schema.entity.services.connections.database.sqliteConnection import (
     SQLiteConnection,
     SQLiteScheme,
@@ -361,6 +366,114 @@ def test_sample_from_user_query(self, sampler_mock):
         names = [col.root for col in sample_data.columns]
         assert names == ["id", "name"]
 
+    def test_full_percentage_randomized_uses_sample_query(self, sampler_mock):
+        """100% PERCENTAGE + randomizedSample=True should go through
+        get_sample_query which adds ORDER BY on the random column."""
+        with patch.object(SQASampler, "build_table_orm", return_value=User):
+            sampler = SQASampler(
+                service_connection_config=self.sqlite_conn,
+                ometa_client=None,
+                entity=None,
+                sample_config=SampleConfig(
+                    profileSampleType=ProfileSampleType.PERCENTAGE,
+                    profileSample=100,
+                    randomizedSample=True,
+                ),
+                sample_data_count=5,
+            )
+
+        with patch.object(
+            sampler, "get_sample_query", wraps=sampler.get_sample_query
+        ) as mock_gsq:
+            sampler.fetch_sample_data()
+            assert mock_gsq.called
+
+    def test_full_percentage_not_randomized_skips_sample_query(self, sampler_mock):
+        """100% PERCENTAGE + randomizedSample=False should short-circuit
+        to raw dataset and NOT call get_sample_query."""
+        with patch.object(SQASampler, "build_table_orm", return_value=User):
+            sampler = SQASampler(
+                service_connection_config=self.sqlite_conn,
+                ometa_client=None,
+                entity=None,
+                sample_config=SampleConfig(
+                    profileSampleType=ProfileSampleType.PERCENTAGE,
+                    profileSample=100,
+                    randomizedSample=False,
+                ),
+                sample_data_count=5,
+            )
+
+        with patch.object(
+            sampler, "get_sample_query", wraps=sampler.get_sample_query
+        ) as mock_gsq:
+            sampler.fetch_sample_data()
+            assert not mock_gsq.called
+
+    def test_full_percentage_none_randomized_skips_sample_query(self, sampler_mock):
+        """100% PERCENTAGE + randomizedSample=None should short-circuit
+        (only explicit True enables randomization)."""
+        with patch.object(SQASampler, "build_table_orm", return_value=User):
+            sampler = SQASampler(
+                service_connection_config=self.sqlite_conn,
+                ometa_client=None,
+                entity=None,
+                sample_config=SampleConfig(
+                    profileSampleType=ProfileSampleType.PERCENTAGE,
+                    profileSample=100,
+                    randomizedSample=None,
+                ),
+                sample_data_count=5,
+            )
+
+        with patch.object(
+            sampler, "get_sample_query", wraps=sampler.get_sample_query
+        ) as mock_gsq:
+            sampler.fetch_sample_data()
+            assert not mock_gsq.called
+
+    def test_randomized_true_produces_non_deterministic_rows(self, sampler_mock):
+        """With randomizedSample=True at 100% PERCENTAGE, multiple
+        fetch_sample_data calls should not all return identical row lists."""
+        with patch.object(SQASampler, "build_table_orm", return_value=User):
+            sampler = SQASampler(
+                service_connection_config=self.sqlite_conn,
+                ometa_client=None,
+                entity=None,
+                sample_config=SampleConfig(
+                    profileSampleType=ProfileSampleType.PERCENTAGE,
+                    profileSample=100,
+                    randomizedSample=True,
+                ),
+                sample_data_count=5,
+            )
+
+        results = [sampler.fetch_sample_data().rows for _ in range(20)]
+        assert any(
+            results[i] != results[0] for i in range(1, len(results))
+        ), "Expected non-deterministic row ordering with randomizedSample=True"
+
+    def test_randomized_false_produces_deterministic_rows(self, sampler_mock):
+        """With randomizedSample=False at 100% PERCENTAGE, multiple
+        fetch_sample_data calls should return rows in the same order."""
+        with patch.object(SQASampler, "build_table_orm", return_value=User):
+            sampler = SQASampler(
+                service_connection_config=self.sqlite_conn,
+                ometa_client=None,
+                entity=None,
+                sample_config=SampleConfig(
+                    profileSampleType=ProfileSampleType.PERCENTAGE,
+                    profileSample=100,
+                    randomizedSample=False,
+                ),
+                sample_data_count=5,
+            )
+
+        results = [sampler.fetch_sample_data().rows for _ in range(5)]
+        assert all(
+            results[i] == results[0] for i in range(1, len(results))
+        ), "Expected deterministic row ordering with randomizedSample=False"
+
     @classmethod
     def tearDownClass(cls) -> None:
         os.remove(cls.db_path)