open-metadata · TeddyCr · Apr 14, 2026 · Apr 2, 2026 · Apr 2, 2026 · Apr 2, 2026
diff --git a/bootstrap/sql/migrations/native/1.13.0/mysql/postDataMigrationSQLScript.sql b/bootstrap/sql/migrations/native/1.13.0/mysql/postDataMigrationSQLScript.sql
@@ -3,6 +3,24 @@ SET json = JSON_REMOVE(json, '$.sourceConfig.config.computeMetrics')
 WHERE JSON_EXTRACT(json, '$.sourceConfig.config.computeMetrics') IS NOT NULL
 AND pipelineType = 'profiler';
 
+-- Reset randomizedSample to false wherever it is stored as true (true was the old default); values set to true explicitly are reset as well
+UPDATE ingestion_pipeline_entity
+SET json = JSON_SET(json, '$.sourceConfig.config.randomizedSample', false)
+WHERE JSON_EXTRACT(json, '$.sourceConfig.config.randomizedSample') = true
+AND pipelineType = 'profiler';
+
+UPDATE table_entity
+SET json = JSON_SET(json, '$.tableProfilerConfig.randomizedSample', false)
+WHERE JSON_EXTRACT(json, '$.tableProfilerConfig.randomizedSample') = true;
+
+UPDATE database_entity
+SET json = JSON_SET(json, '$.databaseProfilerConfig.randomizedSample', false)
+WHERE JSON_EXTRACT(json, '$.databaseProfilerConfig.randomizedSample') = true;
+
+UPDATE database_schema_entity
+SET json = JSON_SET(json, '$.databaseSchemaProfilerConfig.randomizedSample', false)
+WHERE JSON_EXTRACT(json, '$.databaseSchemaProfilerConfig.randomizedSample') = true;
+
-- Set randomizedSample to false where it was true (old default behavior)
-UPDATE ingestion_pipeline_entity
-SET json = JSON_SET(json, '$.sourceConfig.config.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.sourceConfig.config.randomizedSample') = true
-AND pipelineType = 'profiler';
-
-UPDATE table_entity
-SET json = JSON_SET(json, '$.tableProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.tableProfilerConfig.randomizedSample') = true;
-
-UPDATE database_entity
-SET json = JSON_SET(json, '$.databaseProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.databaseProfilerConfig.randomizedSample') = true;
-
-UPDATE database_schema_entity
-SET json = JSON_SET(json, '$.databaseSchemaProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.databaseSchemaProfilerConfig.randomizedSample') = true;
+-- Preserve existing randomizedSample values during migration.
+-- Any new default for randomizedSample must be applied only to missing/null
+-- fields in application logic or a separate, explicitly documented migration.
-- Set randomizedSample to false where it was true (old default behavior)
-UPDATE ingestion_pipeline_entity
-SET json = JSON_SET(json, '$.sourceConfig.config.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.sourceConfig.config.randomizedSample') = true
-AND pipelineType = 'profiler';
-
-UPDATE table_entity
-SET json = JSON_SET(json, '$.tableProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.tableProfilerConfig.randomizedSample') = true;
-
-UPDATE database_entity
-SET json = JSON_SET(json, '$.databaseProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.databaseProfilerConfig.randomizedSample') = true;
-
-UPDATE database_schema_entity
-SET json = JSON_SET(json, '$.databaseSchemaProfilerConfig.randomizedSample', false)
-WHERE JSON_EXTRACT(json, '$.databaseSchemaProfilerConfig.randomizedSample') = true;
+-- Preserve existing randomizedSample values during migration.
+-- Any new default for randomizedSample must be applied only to missing/null
+-- fields in application logic or a separate, explicitly documented migration.
 -- Hard-delete ingestion pipelines for Iceberg services (must run before service migration)
 DELETE ipe FROM ingestion_pipeline_entity ipe
 JOIN dbservice_entity dse

diff --git a/bootstrap/sql/migrations/native/1.13.0/postgres/postDataMigrationSQLScript.sql b/bootstrap/sql/migrations/native/1.13.0/postgres/postDataMigrationSQLScript.sql
@@ -3,6 +3,24 @@ SET json = (json::jsonb #- '{sourceConfig,config,computeMetrics}')::json
 WHERE json::jsonb -> 'sourceConfig' -> 'config' -> 'computeMetrics' IS NOT NULL
 AND pipelineType = 'profiler';
 
+-- Reset randomizedSample to false wherever it is stored as true (true was the old default); values set to true explicitly are reset as well
+UPDATE ingestion_pipeline_entity
+SET json = jsonb_set(json::jsonb, '{sourceConfig,config,randomizedSample}', 'false'::jsonb)::json
+WHERE json::jsonb #>> '{sourceConfig,config,randomizedSample}' = 'true'
+AND pipelineType = 'profiler';
+
+UPDATE table_entity
+SET json = jsonb_set(json::jsonb, '{tableProfilerConfig,randomizedSample}', 'false'::jsonb)::json
+WHERE json::jsonb #>> '{tableProfilerConfig,randomizedSample}' = 'true';
+
+UPDATE database_entity
+SET json = jsonb_set(json::jsonb, '{databaseProfilerConfig,randomizedSample}', 'false'::jsonb)::json
+WHERE json::jsonb #>> '{databaseProfilerConfig,randomizedSample}' = 'true';
+
+UPDATE database_schema_entity
+SET json = jsonb_set(json::jsonb, '{databaseSchemaProfilerConfig,randomizedSample}', 'false'::jsonb)::json
+WHERE json::jsonb #>> '{databaseSchemaProfilerConfig,randomizedSample}' = 'true';
+
 -- Hard-delete ingestion pipelines for Iceberg services (must run before service migration)
 DELETE FROM ingestion_pipeline_entity ipe
 USING dbservice_entity dse

@@ -107,9 +107,13 @@ def get_dataset(self, **__):
         if self.partition_details:
             raw_dataset = self._partitioned_table()
 
-        if not self.sample_config.profileSample or (
+        if not self.sample_config.profileSample:
+            return raw_dataset
+
+        if (
             self.sample_config.profileSample == 100
             and self.sample_config.profileSampleType == ProfileSampleType.PERCENTAGE
+            and self.sample_config.randomizedSample is not True
         ):
             return raw_dataset
         return self.get_sampled_dataframe(raw_dataset, self.sample_config)

@@ -164,22 +164,25 @@ def get_sample_query(self, *, column=None) -> Query:
                     (ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL),
                 ).cte(f"{self.get_sampler_table_name()}_rnd")
                 session_query = client.query(rnd)
-                return session_query.where(
+                query = session_query.where(
                     rnd.c.random <= self.sample_config.profileSample
-                ).cte(f"{self.get_sampler_table_name()}_sample")
+                )
+                if self.sample_config.randomizedSample is True:
+                    query = query.order_by(rnd.c.random)
+                return query.cte(f"{self.get_sampler_table_name()}_sample")
 
             table_query = client.query(self.raw_dataset)
             if self.partition_details:
                 table_query = self.get_partitioned_query(table_query)
             session_query = self._base_sample_query(
                 column,
                 (ModuloFn(RandomNumFn(), table_query.count())).label(RANDOM_LABEL)
-                if self.sample_config.randomizedSample
+                if self.sample_config.randomizedSample is True
                 else None,
             )
             query = (
                 session_query.order_by(RANDOM_LABEL)
-                if self.sample_config.randomizedSample
+                if self.sample_config.randomizedSample is True
                 else session_query
             )
             return query.limit(self.sample_config.profileSample).cte(
@@ -194,9 +197,16 @@ def get_dataset(self, column=None, **__) -> Union[type, AliasedClass]:
         if self.sample_query:
             return self._rdn_sample_from_user_query()
 
-        if not self.sample_config.profileSample or (
+        if not self.sample_config.profileSample:
+            if self.partition_details:
+                return self._partitioned_table()
+
+            return self.raw_dataset
+
+        if (
             self.sample_config.profileSampleType == ProfileSampleType.PERCENTAGE
             and self.sample_config.profileSample == 100
+            and self.sample_config.randomizedSample is not True
         ):
             if self.partition_details:
                 return self._partitioned_table()
@@ -217,7 +227,6 @@ def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData
         if self.sample_query:
             return self._fetch_sample_data_from_user_query()
 
-        # Add new RandomNumFn column
         ds = self.get_dataset()
         if not columns:
             sqa_columns = [col for col in inspect(ds).c if col.name != RANDOM_LABEL]

diff --git a/ingestion/tests/unit/observability/profiler/sqlalchemy/bigquery/test_bigquery_sampling.py b/ingestion/tests/unit/observability/profiler/sqlalchemy/bigquery/test_bigquery_sampling.py
@@ -158,6 +158,7 @@ def test_sampling_for_views(self, sampler_mock):
             'WITH "9bc65c2abec141778ffaa729489f3e87_rnd" AS \n(SELECT users.id AS id, ABS(RANDOM()) * 100 %% 100 AS random \n'
             'FROM users)\n SELECT "9bc65c2abec141778ffaa729489f3e87_rnd".id, "9bc65c2abec141778ffaa729489f3e87_rnd".random \n'
             'FROM "9bc65c2abec141778ffaa729489f3e87_rnd" \nWHERE "9bc65c2abec141778ffaa729489f3e87_rnd".random <= 50.0'
+            ' ORDER BY "9bc65c2abec141778ffaa729489f3e87_rnd".random'
         )
         assert (
             expected_query.casefold()
@@ -201,6 +202,7 @@ def test_sampling_view_with_partition(self, sampler_mock):
             'WITH "9bc65c2abec141778ffaa729489f3e87_rnd" AS \n(SELECT users.id AS id, ABS(RANDOM()) * 100 %% 100 AS random \n'
             "FROM users \nWHERE id in ('1', '2'))\n SELECT \"9bc65c2abec141778ffaa729489f3e87_rnd\".id, \"9bc65c2abec141778ffaa729489f3e87_rnd\".random \n"
             'FROM "9bc65c2abec141778ffaa729489f3e87_rnd" \nWHERE "9bc65c2abec141778ffaa729489f3e87_rnd".random <= 50.0'
+            ' ORDER BY "9bc65c2abec141778ffaa729489f3e87_rnd".random'
         )
         assert (
             expected_query.casefold()

@@ -21,7 +21,12 @@
 from sqlalchemy.orm import DeclarativeBase
 
 from metadata.generated.schema.entity.data.table import Column as EntityColumn
-from metadata.generated.schema.entity.data.table import ColumnName, DataType, Table
+from metadata.generated.schema.entity.data.table import (
+    ColumnName,
+    DataType,
+    ProfileSampleType,
+    Table,
+)
 from metadata.generated.schema.entity.services.connections.database.sqliteConnection import (
     SQLiteConnection,
     SQLiteScheme,
@@ -361,6 +366,114 @@ def test_sample_from_user_query(self, sampler_mock):
         names = [col.root for col in sample_data.columns]
         assert names == ["id", "name"]
 
+    def test_full_percentage_randomized_uses_sample_query(self, sampler_mock):
+        """100% PERCENTAGE + randomizedSample=True should go through
+        get_sample_query so fetch_sample_data can ORDER BY the random column."""
+        with patch.object(SQASampler, "build_table_orm", return_value=User):
+            sampler = SQASampler(
+                service_connection_config=self.sqlite_conn,
+                ometa_client=None,
+                entity=None,
+                sample_config=SampleConfig(
+                    profileSampleType=ProfileSampleType.PERCENTAGE,
+                    profileSample=100,
+                    randomizedSample=True,
+                ),
+                sample_data_count=5,
+            )
+
+        with patch.object(
+            sampler, "get_sample_query", wraps=sampler.get_sample_query
+        ) as mock_gsq:
+            sampler.fetch_sample_data()
+            assert mock_gsq.called
+
+    def test_full_percentage_not_randomized_skips_sample_query(self, sampler_mock):
+        """100% PERCENTAGE + randomizedSample=False should short-circuit
+        to raw dataset and NOT call get_sample_query."""
+        with patch.object(SQASampler, "build_table_orm", return_value=User):
+            sampler = SQASampler(
+                service_connection_config=self.sqlite_conn,
+                ometa_client=None,
+                entity=None,
+                sample_config=SampleConfig(
+                    profileSampleType=ProfileSampleType.PERCENTAGE,
+                    profileSample=100,
+                    randomizedSample=False,
+                ),
+                sample_data_count=5,
+            )
+
+        with patch.object(
+            sampler, "get_sample_query", wraps=sampler.get_sample_query
+        ) as mock_gsq:
+            sampler.fetch_sample_data()
+            assert not mock_gsq.called
+
+    def test_full_percentage_none_randomized_skips_sample_query(self, sampler_mock):
+        """100% PERCENTAGE + randomizedSample=None should short-circuit
+        (only explicit True enables randomization)."""
+        with patch.object(SQASampler, "build_table_orm", return_value=User):
+            sampler = SQASampler(
+                service_connection_config=self.sqlite_conn,
+                ometa_client=None,
+                entity=None,
+                sample_config=SampleConfig(
+                    profileSampleType=ProfileSampleType.PERCENTAGE,
+                    profileSample=100,
+                    randomizedSample=None,
+                ),
+                sample_data_count=5,
+            )
+
+        with patch.object(
+            sampler, "get_sample_query", wraps=sampler.get_sample_query
+        ) as mock_gsq:
+            sampler.fetch_sample_data()
+            assert not mock_gsq.called
+
+    def test_randomized_true_produces_non_deterministic_rows(self, sampler_mock):
+        """With randomizedSample=True at 100% PERCENTAGE, at least one of ten
+        fetch_sample_data calls should return rows that differ from the first."""
+        with patch.object(SQASampler, "build_table_orm", return_value=User):
+            sampler = SQASampler(
+                service_connection_config=self.sqlite_conn,
+                ometa_client=None,
+                entity=None,
+                sample_config=SampleConfig(
+                    profileSampleType=ProfileSampleType.PERCENTAGE,
+                    profileSample=100,
+                    randomizedSample=True,
+                ),
+                sample_data_count=5,
+            )
+
+        results = [sampler.fetch_sample_data().rows for _ in range(10)]
+        assert any(
+            results[i] != results[0] for i in range(1, len(results))
+        ), "Expected non-deterministic row ordering with randomizedSample=True"
+
+    def test_randomized_false_produces_deterministic_rows(self, sampler_mock):
+        """With randomizedSample=False at 100% PERCENTAGE, multiple
+        fetch_sample_data calls should return rows in the same order."""
+        with patch.object(SQASampler, "build_table_orm", return_value=User):
+            sampler = SQASampler(
+                service_connection_config=self.sqlite_conn,
+                ometa_client=None,
+                entity=None,
+                sample_config=SampleConfig(
+                    profileSampleType=ProfileSampleType.PERCENTAGE,
+                    profileSample=100,
+                    randomizedSample=False,
+                ),
+                sample_data_count=5,
+            )
+
+        results = [sampler.fetch_sample_data().rows for _ in range(5)]
+        assert all(
+            results[i] == results[0] for i in range(1, len(results))
+        ), "Expected deterministic row ordering with randomizedSample=False"
+
     @classmethod
     def tearDownClass(cls) -> None:
         os.remove(cls.db_path)

@@ -0,0 +1,120 @@
+#  Copyright 2025 Collate
+#  Licensed under the Collate Community License, Version 1.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#  https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""
+Tests for 100% PERCENTAGE sampling edge case (#21304).
+
+Verifies that the get_dataset() short-circuit at 100% correctly
+respects the randomizedSample flag. Only an explicit True enables
+randomization; None and False both skip randomization.
+"""
+from unittest.mock import MagicMock, patch
+
+from metadata.generated.schema.entity.data.table import ProfileSampleType
+from metadata.sampler.models import SampleConfig
+
+
+class TestSQASampler100Pct:
+    """Test SQASampler.get_dataset() at 100% PERCENTAGE sampling."""
+
+    def _make_sampler(self, randomized_sample):
+        """Build a stubbed SQASampler (no __init__) with this randomizedSample."""
+        with patch(
+            "metadata.sampler.sqlalchemy.sampler.SQASampler.__init__",
+            return_value=None,
+        ):
+            from metadata.sampler.sqlalchemy.sampler import SQASampler
+
+            sampler = SQASampler()
+            sampler.sample_config = SampleConfig(
+                profileSample=100,
+                profileSampleType=ProfileSampleType.PERCENTAGE,
+                randomizedSample=randomized_sample,
+            )
+            sampler.sample_query = None
+            sampler.partition_details = None
+            sampler._table = MagicMock(name="raw_table")
+            sampler.get_sample_query = MagicMock(
+                name="get_sample_query", return_value=MagicMock(name="sample_cte")
+            )
+            return sampler
+
+    def test_100_pct_randomized_true_delegates_to_sample_query(self):
+        """100% + randomizedSample=True should NOT short-circuit."""
+        sampler = self._make_sampler(randomized_sample=True)
+        result = sampler.get_dataset()
+        sampler.get_sample_query.assert_called_once()
+        assert result == sampler.get_sample_query.return_value
+
+    def test_100_pct_randomized_false_returns_raw_dataset(self):
+        """100% + randomizedSample=False should short-circuit to raw dataset."""
+        sampler = self._make_sampler(randomized_sample=False)
+        result = sampler.get_dataset()
+        sampler.get_sample_query.assert_not_called()
+        assert result == sampler._table
+
+    def test_100_pct_randomized_none_returns_raw_dataset(self):
+        """100% + randomizedSample=None should short-circuit (only explicit True randomizes)."""
+        sampler = self._make_sampler(randomized_sample=None)
+        result = sampler.get_dataset()
+        sampler.get_sample_query.assert_not_called()
+        assert result == sampler._table
+
+
+class TestDatalakeSampler100Pct:
+    """Test DatalakeSampler.get_dataset() at 100% PERCENTAGE sampling."""
+
+    def _make_sampler(self, randomized_sample):
+        """Build a stubbed DatalakeSampler (no __init__) with this randomizedSample."""
+        with patch(
+            "metadata.sampler.pandas.sampler.DatalakeSampler.__init__",
+            return_value=None,
+        ):
+            from metadata.sampler.pandas.sampler import DatalakeSampler
+
+            sampler = DatalakeSampler()
+            sampler.sample_config = SampleConfig(
+                profileSample=100,
+                profileSampleType=ProfileSampleType.PERCENTAGE,
+                randomizedSample=randomized_sample,
+            )
+            sampler.sample_query = None
+            sampler.partition_details = None
+            table_mock = MagicMock(name="table_wrapper")
+            table_mock.dataframes = MagicMock(name="raw_dataframes")
+            sampler._table = table_mock
+            sampler.get_sampled_dataframe = MagicMock(
+                name="get_sampled_dataframe",
+                return_value=MagicMock(name="sampled_df"),
+            )
+            sampler.service_connection_config = MagicMock()
+            sampler.connection = MagicMock()
+            return sampler
+
+    def test_100_pct_randomized_true_delegates_to_sampled_dataframe(self):
+        """100% + randomizedSample=True should NOT short-circuit."""
+        sampler = self._make_sampler(randomized_sample=True)
+        result = sampler.get_dataset()
+        sampler.get_sampled_dataframe.assert_called_once()
+        assert result == sampler.get_sampled_dataframe.return_value
+
+    def test_100_pct_randomized_false_returns_raw_dataset(self):
+        """100% + randomizedSample=False should short-circuit to raw dataset."""
+        sampler = self._make_sampler(randomized_sample=False)
+        result = sampler.get_dataset()
+        sampler.get_sampled_dataframe.assert_not_called()
+        assert result == sampler._table.dataframes
+
+    def test_100_pct_randomized_none_returns_raw_dataset(self):
+        """100% + randomizedSample=None should short-circuit (only explicit True randomizes)."""
+        sampler = self._make_sampler(randomized_sample=None)
+        result = sampler.get_dataset()
+        sampler.get_sampled_dataframe.assert_not_called()
+        assert result == sampler._table.dataframes
diff --git a/openmetadata-spec/src/main/resources/json/schema/entity/data/database.json b/openmetadata-spec/src/main/resources/json/schema/entity/data/database.json
@@ -175,7 +175,7 @@
         "randomizedSample": {
           "description": "Whether to randomize the sample data or not.",
           "type": "boolean",
-          "default": true
+          "default": false
         }
       }
     },