Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7054e63
fix(sampler): respect randomizedSample flag at 100% percentage sampling
RajdeepKushwaha5 Apr 2, 2026
389974e
Address review: use 'is False' for Optional[bool] and add unit tests
RajdeepKushwaha5 Apr 2, 2026
b665d8a
Add ORDER BY on random column in fetch_sample_data for true randomiza…
RajdeepKushwaha5 Apr 2, 2026
34a6c8d
Address review: remove stale comment, unused import, add return asser…
RajdeepKushwaha5 Apr 2, 2026
f746c76
Apply suggestion from @Copilot
TeddyCr Apr 2, 2026
6197729
Address review: move ORDER BY to get_sample_query, clean up fetch_sam…
RajdeepKushwaha5 Apr 2, 2026
68118b1
Address review: None defaults to False for randomizedSample
RajdeepKushwaha5 Apr 2, 2026
0180946
Fix integration test: None should skip sample_query (matches truthine…
RajdeepKushwaha5 Apr 2, 2026
8324863
fix(tests): update BigQuery view sampling expected queries with ORDER BY
RajdeepKushwaha5 Apr 2, 2026
fb8c05a
refactor: use explicit is False for randomizedSample checks
RajdeepKushwaha5 Apr 2, 2026
cf253ac
ci: re-trigger checks after SIGSEGV flake
RajdeepKushwaha5 Apr 2, 2026
53df3cb
refactor: only explicit True randomizes, add non-determinism tests
RajdeepKushwaha5 Apr 3, 2026
bfd09e8
test: increase non-determinism iterations to reduce flakiness
RajdeepKushwaha5 Apr 3, 2026
692f27b
chore: added randomize as false
TeddyCr Apr 3, 2026
e31a335
Merge remote-tracking branch 'upstream/main' into fix/21304-sampler-r…
TeddyCr Apr 3, 2026
3909d3c
Merge remote-tracking branch 'upstream/main' into fix/21304-sampler-r…
TeddyCr Apr 3, 2026
a084392
fix: align randomizedSample defaults with schema (false)
RajdeepKushwaha5 Apr 3, 2026
61b0678
Merge branch 'main' into fix/21304-sampler-randomization
TeddyCr Apr 7, 2026
7636fe3
Merge branch 'main' into fix/21304-sampler-randomization
RajdeepKushwaha5 Apr 7, 2026
3dcfe85
Merge branch 'main' into fix/21304-sampler-randomization
RajdeepKushwaha5 Apr 11, 2026
06f70ea
fix: remove ORDER BY from BigQuery test expectations
RajdeepKushwaha5 Apr 11, 2026
8667db8
Merge branch 'main' into fix/21304-sampler-randomization
RajdeepKushwaha5 Apr 11, 2026
b3bd457
test: increase non-determinism test iterations to reduce flakiness
RajdeepKushwaha5 Apr 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,24 @@ SET json = JSON_REMOVE(json, '$.sourceConfig.config.computeMetrics')
WHERE JSON_EXTRACT(json, '$.sourceConfig.config.computeMetrics') IS NOT NULL
AND pipelineType = 'profiler';

-- Set randomizedSample to false where it was true (old default behavior)
UPDATE ingestion_pipeline_entity
SET json = JSON_SET(json, '$.sourceConfig.config.randomizedSample', false)
WHERE JSON_EXTRACT(json, '$.sourceConfig.config.randomizedSample') = true
AND pipelineType = 'profiler';

UPDATE table_entity
SET json = JSON_SET(json, '$.tableProfilerConfig.randomizedSample', false)
WHERE JSON_EXTRACT(json, '$.tableProfilerConfig.randomizedSample') = true;

UPDATE database_entity
SET json = JSON_SET(json, '$.databaseProfilerConfig.randomizedSample', false)
WHERE JSON_EXTRACT(json, '$.databaseProfilerConfig.randomizedSample') = true;

UPDATE database_schema_entity
SET json = JSON_SET(json, '$.databaseSchemaProfilerConfig.randomizedSample', false)
WHERE JSON_EXTRACT(json, '$.databaseSchemaProfilerConfig.randomizedSample') = true;

Comment thread
TeddyCr marked this conversation as resolved.
Comment on lines +6 to +23
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This migration flips any stored randomizedSample=true to false, which will override existing user choices and silently change runtime behavior. If you only want a new default, avoid rewriting explicit values; migrate only missing/null fields if needed.

If the intent is to globally disable randomization, please treat this as a breaking change and document it clearly instead of silently mutating config JSON.

Suggested change
-- Set randomizedSample to false where it was true (old default behavior)
UPDATE ingestion_pipeline_entity
SET json = JSON_SET(json, '$.sourceConfig.config.randomizedSample', false)
WHERE JSON_EXTRACT(json, '$.sourceConfig.config.randomizedSample') = true
AND pipelineType = 'profiler';
UPDATE table_entity
SET json = JSON_SET(json, '$.tableProfilerConfig.randomizedSample', false)
WHERE JSON_EXTRACT(json, '$.tableProfilerConfig.randomizedSample') = true;
UPDATE database_entity
SET json = JSON_SET(json, '$.databaseProfilerConfig.randomizedSample', false)
WHERE JSON_EXTRACT(json, '$.databaseProfilerConfig.randomizedSample') = true;
UPDATE database_schema_entity
SET json = JSON_SET(json, '$.databaseSchemaProfilerConfig.randomizedSample', false)
WHERE JSON_EXTRACT(json, '$.databaseSchemaProfilerConfig.randomizedSample') = true;
-- Preserve existing randomizedSample values during migration.
-- Any new default for randomizedSample must be applied only to missing/null
-- fields in application logic or a separate, explicitly documented migration.

Copilot uses AI. Check for mistakes.
-- Hard-delete ingestion pipelines for Iceberg services (must run before service migration)
DELETE ipe FROM ingestion_pipeline_entity ipe
JOIN dbservice_entity dse
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,24 @@ SET json = (json::jsonb #- '{sourceConfig,config,computeMetrics}')::json
WHERE json::jsonb -> 'sourceConfig' -> 'config' -> 'computeMetrics' IS NOT NULL
AND pipelineType = 'profiler';

-- Set randomizedSample to false where it was true (old default behavior)
UPDATE ingestion_pipeline_entity
SET json = jsonb_set(json::jsonb, '{sourceConfig,config,randomizedSample}', 'false'::jsonb)::json
WHERE json::jsonb #>> '{sourceConfig,config,randomizedSample}' = 'true'
AND pipelineType = 'profiler';

UPDATE table_entity
SET json = jsonb_set(json::jsonb, '{tableProfilerConfig,randomizedSample}', 'false'::jsonb)::json
WHERE json::jsonb #>> '{tableProfilerConfig,randomizedSample}' = 'true';

UPDATE database_entity
SET json = jsonb_set(json::jsonb, '{databaseProfilerConfig,randomizedSample}', 'false'::jsonb)::json
WHERE json::jsonb #>> '{databaseProfilerConfig,randomizedSample}' = 'true';

UPDATE database_schema_entity
SET json = jsonb_set(json::jsonb, '{databaseSchemaProfilerConfig,randomizedSample}', 'false'::jsonb)::json
WHERE json::jsonb #>> '{databaseSchemaProfilerConfig,randomizedSample}' = 'true';

Comment thread
TeddyCr marked this conversation as resolved.
-- Hard-delete ingestion pipelines for Iceberg services (must run before service migration)
DELETE FROM ingestion_pipeline_entity ipe
USING dbservice_entity dse
Expand Down
7 changes: 4 additions & 3 deletions ingestion/src/metadata/sampler/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"""
Sampling Models
"""

from typing import Any, List, Optional, Union

from pydantic import Field, model_validator
Expand Down Expand Up @@ -42,7 +43,7 @@ class BaseProfileConfig(ConfigModel):
profileSampleType: Optional[ProfileSampleType] = None
samplingMethodType: Optional[SamplingMethodType] = None
sampleDataCount: Optional[int] = 100
randomizedSample: Optional[bool] = True
randomizedSample: Optional[bool] = False


class ColumnConfig(ConfigModel):
Expand All @@ -58,7 +59,7 @@ class TableConfig(BaseProfileConfig):
profileQuery: Optional[str] = None
partitionConfig: Optional[PartitionProfilerConfig] = None
columnConfig: Optional[ColumnConfig] = None
randomizedSample: Optional[bool] = True
randomizedSample: Optional[bool] = False
Comment thread
RajdeepKushwaha5 marked this conversation as resolved.

@classmethod
def from_database_and_schema_config(
Expand Down Expand Up @@ -127,4 +128,4 @@ class SampleConfig(ConfigModel):
profileSample: Optional[Union[float, int]] = None
profileSampleType: Optional[ProfileSampleType] = ProfileSampleType.PERCENTAGE
samplingMethodType: Optional[SamplingMethodType] = None
randomizedSample: Optional[bool] = True
randomizedSample: Optional[bool] = False
6 changes: 5 additions & 1 deletion ingestion/src/metadata/sampler/pandas/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,13 @@ def get_dataset(self, **__):
if self.partition_details:
raw_dataset = self._partitioned_table()

if not self.sample_config.profileSample or (
if not self.sample_config.profileSample:
return raw_dataset

if (
self.sample_config.profileSample == 100
and self.sample_config.profileSampleType == ProfileSampleType.PERCENTAGE
and self.sample_config.randomizedSample is not True
Comment thread
RajdeepKushwaha5 marked this conversation as resolved.
):
return raw_dataset
return self.get_sampled_dataframe(raw_dataset, self.sample_config)
Expand Down
21 changes: 15 additions & 6 deletions ingestion/src/metadata/sampler/sqlalchemy/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,22 +164,25 @@ def get_sample_query(self, *, column=None) -> Query:
(ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL),
).cte(f"{self.get_sampler_table_name()}_rnd")
session_query = client.query(rnd)
return session_query.where(
query = session_query.where(
rnd.c.random <= self.sample_config.profileSample
).cte(f"{self.get_sampler_table_name()}_sample")
)
if self.sample_config.randomizedSample is True:
query = query.order_by(rnd.c.random)
Comment thread
TeddyCr marked this conversation as resolved.
return query.cte(f"{self.get_sampler_table_name()}_sample")
Comment thread
TeddyCr marked this conversation as resolved.

table_query = client.query(self.raw_dataset)
if self.partition_details:
table_query = self.get_partitioned_query(table_query)
session_query = self._base_sample_query(
column,
(ModuloFn(RandomNumFn(), table_query.count())).label(RANDOM_LABEL)
if self.sample_config.randomizedSample
if self.sample_config.randomizedSample is True
else None,
)
query = (
session_query.order_by(RANDOM_LABEL)
if self.sample_config.randomizedSample
if self.sample_config.randomizedSample is True
else session_query
)
Comment thread
RajdeepKushwaha5 marked this conversation as resolved.
return query.limit(self.sample_config.profileSample).cte(
Expand All @@ -194,9 +197,16 @@ def get_dataset(self, column=None, **__) -> Union[type, AliasedClass]:
if self.sample_query:
return self._rdn_sample_from_user_query()

if not self.sample_config.profileSample or (
if not self.sample_config.profileSample:
if self.partition_details:
return self._partitioned_table()

return self.raw_dataset

if (
self.sample_config.profileSampleType == ProfileSampleType.PERCENTAGE
and self.sample_config.profileSample == 100
and self.sample_config.randomizedSample is not True
):
Comment thread
TeddyCr marked this conversation as resolved.
if self.partition_details:
return self._partitioned_table()
Expand All @@ -217,7 +227,6 @@ def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData
if self.sample_query:
return self._fetch_sample_data_from_user_query()

# Add new RandomNumFn column
ds = self.get_dataset()
if not columns:
sqa_columns = [col for col in inspect(ds).c if col.name != RANDOM_LABEL]
Comment thread
RajdeepKushwaha5 marked this conversation as resolved.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@
from sqlalchemy.orm import DeclarativeBase

from metadata.generated.schema.entity.data.table import Column as EntityColumn
from metadata.generated.schema.entity.data.table import ColumnName, DataType, Table
from metadata.generated.schema.entity.data.table import (
ColumnName,
DataType,
ProfileSampleType,
Table,
)
from metadata.generated.schema.entity.services.connections.database.sqliteConnection import (
SQLiteConnection,
SQLiteScheme,
Expand Down Expand Up @@ -361,6 +366,114 @@ def test_sample_from_user_query(self, sampler_mock):
names = [col.root for col in sample_data.columns]
assert names == ["id", "name"]

def test_full_percentage_randomized_uses_sample_query(self, sampler_mock):
"""100% PERCENTAGE + randomizedSample=True should go through
get_sample_query which adds ORDER BY on the random column."""
with patch.object(SQASampler, "build_table_orm", return_value=User):
sampler = SQASampler(
service_connection_config=self.sqlite_conn,
ometa_client=None,
entity=None,
sample_config=SampleConfig(
profileSampleType=ProfileSampleType.PERCENTAGE,
profileSample=100,
randomizedSample=True,
),
sample_data_count=5,
)

with patch.object(
sampler, "get_sample_query", wraps=sampler.get_sample_query
) as mock_gsq:
sampler.fetch_sample_data()
assert mock_gsq.called
Comment on lines +369 to +389
Copy link

Copilot AI Apr 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new “full percentage randomized” tests only assert that get_sample_query was invoked, but they don’t validate the actual bug report: that fetch_sample_data() returns a non-deterministic subset (or at least that the generated SQL applies ORDER BY on the random column before LIMIT). Consider asserting on the compiled query (contains ORDER BY random) or verifying that repeated calls can yield different first rows under SQLite to ensure the regression is truly covered.

Copilot uses AI. Check for mistakes.

def test_full_percentage_not_randomized_skips_sample_query(self, sampler_mock):
"""100% PERCENTAGE + randomizedSample=False should short-circuit
to raw dataset and NOT call get_sample_query."""
with patch.object(SQASampler, "build_table_orm", return_value=User):
sampler = SQASampler(
service_connection_config=self.sqlite_conn,
ometa_client=None,
entity=None,
sample_config=SampleConfig(
profileSampleType=ProfileSampleType.PERCENTAGE,
profileSample=100,
randomizedSample=False,
),
sample_data_count=5,
)

with patch.object(
sampler, "get_sample_query", wraps=sampler.get_sample_query
) as mock_gsq:
sampler.fetch_sample_data()
assert not mock_gsq.called

def test_full_percentage_none_randomized_skips_sample_query(self, sampler_mock):
"""100% PERCENTAGE + randomizedSample=None should short-circuit
(only explicit True enables randomization)."""
with patch.object(SQASampler, "build_table_orm", return_value=User):
sampler = SQASampler(
service_connection_config=self.sqlite_conn,
ometa_client=None,
entity=None,
sample_config=SampleConfig(
profileSampleType=ProfileSampleType.PERCENTAGE,
profileSample=100,
randomizedSample=None,
),
Comment on lines +413 to +425
Copy link

Copilot AI Apr 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These new tests assert that randomizedSample=None should short-circuit (i.e., behave like False). This conflicts with the PR description’s stated intent of handling Optional[bool] correctly by only skipping randomization when randomizedSample is False (so None follows the default behavior).

Either adjust the implementation to treat None as “use default” (and update these tests accordingly), or explicitly document and enforce the new “opt-in randomization” contract across schema defaults, models, and migration strategy.

Copilot uses AI. Check for mistakes.
sample_data_count=5,
)
Comment on lines +413 to +427
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These tests expect randomizedSample=None to behave like False (short-circuit). Since the schema defines randomizedSample as boolean default true (non-null), consider removing the None scenario or asserting it follows the default behavior (randomize) to avoid locking in an inconsistent contract.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@TeddyCr explicitly requested this behavior: "None or False should not randomize. only explicit True should randomize." The None test validates that edge case. The schema default is True, so in practice users always get True unless they explicitly pass None or False.


with patch.object(
sampler, "get_sample_query", wraps=sampler.get_sample_query
) as mock_gsq:
sampler.fetch_sample_data()
assert not mock_gsq.called

def test_randomized_true_produces_non_deterministic_rows(self, sampler_mock):
"""With randomizedSample=True at 100% PERCENTAGE, multiple
fetch_sample_data calls should return rows in different orders."""
with patch.object(SQASampler, "build_table_orm", return_value=User):
sampler = SQASampler(
service_connection_config=self.sqlite_conn,
ometa_client=None,
entity=None,
sample_config=SampleConfig(
profileSampleType=ProfileSampleType.PERCENTAGE,
profileSample=100,
randomizedSample=True,
),
sample_data_count=5,
)

results = [sampler.fetch_sample_data().rows for _ in range(20)]
assert any(
results[i] != results[0] for i in range(1, len(results))
), "Expected non-deterministic row ordering with randomizedSample=True"

Comment thread
RajdeepKushwaha5 marked this conversation as resolved.
def test_randomized_false_produces_deterministic_rows(self, sampler_mock):
"""With randomizedSample=False at 100% PERCENTAGE, multiple
fetch_sample_data calls should return rows in the same order."""
with patch.object(SQASampler, "build_table_orm", return_value=User):
sampler = SQASampler(
service_connection_config=self.sqlite_conn,
ometa_client=None,
entity=None,
sample_config=SampleConfig(
profileSampleType=ProfileSampleType.PERCENTAGE,
profileSample=100,
randomizedSample=False,
),
sample_data_count=5,
)

results = [sampler.fetch_sample_data().rows for _ in range(5)]
assert all(
results[i] == results[0] for i in range(1, len(results))
), "Expected deterministic row ordering with randomizedSample=False"

@classmethod
def tearDownClass(cls) -> None:
os.remove(cls.db_path)
Expand Down
Loading
Loading