-
Notifications
You must be signed in to change notification settings - Fork 2k
fix(sampler): Respect randomizedSample flag at 100% percentage sampling #26966
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7054e63
389974e
b665d8a
34a6c8d
f746c76
6197729
68118b1
0180946
8324863
fb8c05a
cf253ac
53df3cb
bfd09e8
692f27b
e31a335
3909d3c
a084392
61b0678
7636fe3
3dcfe85
06f70ea
8667db8
b3bd457
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -3,6 +3,24 @@ SET json = JSON_REMOVE(json, '$.sourceConfig.config.computeMetrics') | |||||||||||||||||||||||||||||||||||||||||
| WHERE JSON_EXTRACT(json, '$.sourceConfig.config.computeMetrics') IS NOT NULL | ||||||||||||||||||||||||||||||||||||||||||
| AND pipelineType = 'profiler'; | ||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||
| -- Set randomizedSample to false where it was true (old default behavior) | ||||||||||||||||||||||||||||||||||||||||||
| UPDATE ingestion_pipeline_entity | ||||||||||||||||||||||||||||||||||||||||||
| SET json = JSON_SET(json, '$.sourceConfig.config.randomizedSample', false) | ||||||||||||||||||||||||||||||||||||||||||
| WHERE JSON_EXTRACT(json, '$.sourceConfig.config.randomizedSample') = true | ||||||||||||||||||||||||||||||||||||||||||
| AND pipelineType = 'profiler'; | ||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||
| UPDATE table_entity | ||||||||||||||||||||||||||||||||||||||||||
| SET json = JSON_SET(json, '$.tableProfilerConfig.randomizedSample', false) | ||||||||||||||||||||||||||||||||||||||||||
| WHERE JSON_EXTRACT(json, '$.tableProfilerConfig.randomizedSample') = true; | ||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||
| UPDATE database_entity | ||||||||||||||||||||||||||||||||||||||||||
| SET json = JSON_SET(json, '$.databaseProfilerConfig.randomizedSample', false) | ||||||||||||||||||||||||||||||||||||||||||
| WHERE JSON_EXTRACT(json, '$.databaseProfilerConfig.randomizedSample') = true; | ||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||
| UPDATE database_schema_entity | ||||||||||||||||||||||||||||||||||||||||||
| SET json = JSON_SET(json, '$.databaseSchemaProfilerConfig.randomizedSample', false) | ||||||||||||||||||||||||||||||||||||||||||
| WHERE JSON_EXTRACT(json, '$.databaseSchemaProfilerConfig.randomizedSample') = true; | ||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+6
to
+23
|
||||||||||||||||||||||||||||||||||||||||||
| -- Set randomizedSample to false where it was true (old default behavior) | |
| UPDATE ingestion_pipeline_entity | |
| SET json = JSON_SET(json, '$.sourceConfig.config.randomizedSample', false) | |
| WHERE JSON_EXTRACT(json, '$.sourceConfig.config.randomizedSample') = true | |
| AND pipelineType = 'profiler'; | |
| UPDATE table_entity | |
| SET json = JSON_SET(json, '$.tableProfilerConfig.randomizedSample', false) | |
| WHERE JSON_EXTRACT(json, '$.tableProfilerConfig.randomizedSample') = true; | |
| UPDATE database_entity | |
| SET json = JSON_SET(json, '$.databaseProfilerConfig.randomizedSample', false) | |
| WHERE JSON_EXTRACT(json, '$.databaseProfilerConfig.randomizedSample') = true; | |
| UPDATE database_schema_entity | |
| SET json = JSON_SET(json, '$.databaseSchemaProfilerConfig.randomizedSample', false) | |
| WHERE JSON_EXTRACT(json, '$.databaseSchemaProfilerConfig.randomizedSample') = true; | |
| -- Preserve existing randomizedSample values during migration. | |
| -- Any new default for randomizedSample must be applied only to missing/null | |
| -- fields in application logic or a separate, explicitly documented migration. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,12 @@ | |
| from sqlalchemy.orm import DeclarativeBase | ||
|
|
||
| from metadata.generated.schema.entity.data.table import Column as EntityColumn | ||
| from metadata.generated.schema.entity.data.table import ColumnName, DataType, Table | ||
| from metadata.generated.schema.entity.data.table import ( | ||
| ColumnName, | ||
| DataType, | ||
| ProfileSampleType, | ||
| Table, | ||
| ) | ||
| from metadata.generated.schema.entity.services.connections.database.sqliteConnection import ( | ||
| SQLiteConnection, | ||
| SQLiteScheme, | ||
|
|
@@ -361,6 +366,114 @@ def test_sample_from_user_query(self, sampler_mock): | |
| names = [col.root for col in sample_data.columns] | ||
| assert names == ["id", "name"] | ||
|
|
||
| def test_full_percentage_randomized_uses_sample_query(self, sampler_mock): | ||
| """100% PERCENTAGE + randomizedSample=True should go through | ||
| get_sample_query which adds ORDER BY on the random column.""" | ||
| with patch.object(SQASampler, "build_table_orm", return_value=User): | ||
| sampler = SQASampler( | ||
| service_connection_config=self.sqlite_conn, | ||
| ometa_client=None, | ||
| entity=None, | ||
| sample_config=SampleConfig( | ||
| profileSampleType=ProfileSampleType.PERCENTAGE, | ||
| profileSample=100, | ||
| randomizedSample=True, | ||
| ), | ||
| sample_data_count=5, | ||
| ) | ||
|
|
||
| with patch.object( | ||
| sampler, "get_sample_query", wraps=sampler.get_sample_query | ||
| ) as mock_gsq: | ||
| sampler.fetch_sample_data() | ||
| assert mock_gsq.called | ||
|
Comment on lines
+369
to
+389
|
||
|
|
||
| def test_full_percentage_not_randomized_skips_sample_query(self, sampler_mock): | ||
| """100% PERCENTAGE + randomizedSample=False should short-circuit | ||
| to raw dataset and NOT call get_sample_query.""" | ||
| with patch.object(SQASampler, "build_table_orm", return_value=User): | ||
| sampler = SQASampler( | ||
| service_connection_config=self.sqlite_conn, | ||
| ometa_client=None, | ||
| entity=None, | ||
| sample_config=SampleConfig( | ||
| profileSampleType=ProfileSampleType.PERCENTAGE, | ||
| profileSample=100, | ||
| randomizedSample=False, | ||
| ), | ||
| sample_data_count=5, | ||
| ) | ||
|
|
||
| with patch.object( | ||
| sampler, "get_sample_query", wraps=sampler.get_sample_query | ||
| ) as mock_gsq: | ||
| sampler.fetch_sample_data() | ||
| assert not mock_gsq.called | ||
|
|
||
| def test_full_percentage_none_randomized_skips_sample_query(self, sampler_mock): | ||
| """100% PERCENTAGE + randomizedSample=None should short-circuit | ||
| (only explicit True enables randomization).""" | ||
| with patch.object(SQASampler, "build_table_orm", return_value=User): | ||
| sampler = SQASampler( | ||
| service_connection_config=self.sqlite_conn, | ||
| ometa_client=None, | ||
| entity=None, | ||
| sample_config=SampleConfig( | ||
| profileSampleType=ProfileSampleType.PERCENTAGE, | ||
| profileSample=100, | ||
| randomizedSample=None, | ||
| ), | ||
|
Comment on lines
+413
to
+425
|
||
| sample_data_count=5, | ||
| ) | ||
|
Comment on lines
+413
to
+427
|
||
|
|
||
| with patch.object( | ||
| sampler, "get_sample_query", wraps=sampler.get_sample_query | ||
| ) as mock_gsq: | ||
| sampler.fetch_sample_data() | ||
| assert not mock_gsq.called | ||
|
|
||
| def test_randomized_true_produces_non_deterministic_rows(self, sampler_mock): | ||
| """With randomizedSample=True at 100% PERCENTAGE, multiple | ||
| fetch_sample_data calls should return rows in different orders.""" | ||
| with patch.object(SQASampler, "build_table_orm", return_value=User): | ||
| sampler = SQASampler( | ||
| service_connection_config=self.sqlite_conn, | ||
| ometa_client=None, | ||
| entity=None, | ||
| sample_config=SampleConfig( | ||
| profileSampleType=ProfileSampleType.PERCENTAGE, | ||
| profileSample=100, | ||
| randomizedSample=True, | ||
| ), | ||
| sample_data_count=5, | ||
| ) | ||
|
|
||
| results = [sampler.fetch_sample_data().rows for _ in range(20)] | ||
| assert any( | ||
| results[i] != results[0] for i in range(1, len(results)) | ||
| ), "Expected non-deterministic row ordering with randomizedSample=True" | ||
|
|
||
|
RajdeepKushwaha5 marked this conversation as resolved.
|
||
| def test_randomized_false_produces_deterministic_rows(self, sampler_mock): | ||
| """With randomizedSample=False at 100% PERCENTAGE, multiple | ||
| fetch_sample_data calls should return rows in the same order.""" | ||
| with patch.object(SQASampler, "build_table_orm", return_value=User): | ||
| sampler = SQASampler( | ||
| service_connection_config=self.sqlite_conn, | ||
| ometa_client=None, | ||
| entity=None, | ||
| sample_config=SampleConfig( | ||
| profileSampleType=ProfileSampleType.PERCENTAGE, | ||
| profileSample=100, | ||
| randomizedSample=False, | ||
| ), | ||
| sample_data_count=5, | ||
| ) | ||
|
|
||
| results = [sampler.fetch_sample_data().rows for _ in range(5)] | ||
| assert all( | ||
| results[i] == results[0] for i in range(1, len(results)) | ||
| ), "Expected deterministic row ordering with randomizedSample=False" | ||
|
|
||
| @classmethod | ||
| def tearDownClass(cls) -> None: | ||
| os.remove(cls.db_path) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.