Skip to content

Commit 6639d4e

Browse files
authored
feat!: reject non-null columns in CREATE TABLE unconditionally (#2404)
## What changes are proposed in this pull request? Gates non-null columns (nullable: false) on the invariants writer feature during CREATE TABLE. Delta Spark treats non-nullable schema columns as implicit invariants, requiring the invariants writer feature in the protocol. Tables created by kernel with non-null columns but without the feature cause Spark read failures. Since invariants is not in ALLOWED_DELTA_FEATURES and is not auto-enabled by any property or schema-driven check, non-null columns are effectively rejected unconditionally today. The invariants gate ensures this relaxes automatically when invariants support is added in the future. Core changes: 1/ Added invariants_enabled parameter to validate_schema_for_create with recursive non-null rejection in SchemaValidator 2/ Overrides transform_variant to skip protocol-mandated non-null Variant internal fields (metadata, value) 3/ Passes the invariants feature check from validated.writer_features into schema validation during build() 4 Converted incidental non-null fields to nullable in test schemas that flow through create_table but are not testing nullability behavior ## How was this change tested? 1/ Added rstest unit coverage in schema/validation.rs for non-null rejection across schema shapes: top-level, nested struct, array-nested, map-nested, and invariants-enabled pass cases 2/ Added integration test in create_table/main.rs asserting CREATE TABLE rejects non-null schemas with top-level and nested cases 3/ Full cargo test -p delta_kernel --all-features --locked passes (0 failures) 4/ cargo clippy and cargo fmt clean --------- Co-authored-by: Sanuj Basu <sanujbasu@users.noreply.github.com>
1 parent b4280c6 commit 6639d4e

16 files changed

Lines changed: 235 additions & 59 deletions

File tree

delta-kernel-unity-catalog/src/utils/create_table.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ mod tests {
171171
let table_path = "memory:///test_table/";
172172
let schema = Arc::new(
173173
StructType::try_new(vec![
174-
StructField::new("id", DataType::INTEGER, false),
174+
StructField::new("id", DataType::INTEGER, true),
175175
StructField::new("region", DataType::STRING, true),
176176
])
177177
.unwrap(),
@@ -232,7 +232,7 @@ mod tests {
232232
]);
233233
let schema = Arc::new(
234234
StructType::try_new(vec![
235-
StructField::new("id", DataType::INTEGER, false),
235+
StructField::new("id", DataType::INTEGER, true),
236236
StructField::new("region", DataType::STRING, true),
237237
StructField::new("address", DataType::Struct(Box::new(address_struct)), true),
238238
])
@@ -278,7 +278,7 @@ mod tests {
278278
let engine = DefaultEngineBuilder::new(storage).build();
279279
let table_path = "memory:///test_version_check/";
280280
let schema = Arc::new(
281-
StructType::try_new(vec![StructField::new("id", DataType::INTEGER, false)]).unwrap(),
281+
StructType::try_new(vec![StructField::new("id", DataType::INTEGER, true)]).unwrap(),
282282
);
283283

284284
// Create a table (version 0) and append (version 1)

kernel/src/schema/validation.rs

Lines changed: 135 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,16 @@ const INVALID_PARQUET_CHARS: &[char] = &[' ', ',', ';', '{', '}', '(', ')', '\n'
2020
/// 1. Schema is non-empty
2121
/// 2. No duplicate column names (case-insensitive, including nested fields)
2222
/// 3. Column names contain only valid characters
23+
/// 4. Rejects non-null columns when the `invariants` writer feature is not enabled
2324
pub(crate) fn validate_schema_for_create(
2425
schema: &StructType,
2526
column_mapping_mode: ColumnMappingMode,
27+
invariants_enabled: bool,
2628
) -> DeltaResult<()> {
2729
if schema.num_fields() == 0 {
2830
return Err(Error::generic("Schema cannot be empty"));
2931
}
30-
let mut validator = SchemaValidator::new(column_mapping_mode);
32+
let mut validator = SchemaValidator::new(column_mapping_mode, invariants_enabled);
3133
// We reuse the SchemaTransform trait for its recursive traversal machinery.
3234
// The validator never transforms the schema -- it only inspects fields and
3335
// collects errors. The return value is intentionally discarded.
@@ -46,15 +48,17 @@ pub(crate) fn validate_schema_for_create(
4648
/// built with `new_unchecked`.
4749
struct SchemaValidator {
4850
cm_enabled: bool,
51+
invariants_enabled: bool,
4952
seen_paths: HashSet<String>,
5053
current_path: Vec<String>,
5154
errors: Vec<String>,
5255
}
5356

5457
impl SchemaValidator {
55-
fn new(column_mapping_mode: ColumnMappingMode) -> Self {
58+
fn new(column_mapping_mode: ColumnMappingMode, invariants_enabled: bool) -> Self {
5659
Self {
5760
cm_enabled: !matches!(column_mapping_mode, ColumnMappingMode::None),
61+
invariants_enabled,
5862
seen_paths: HashSet::new(),
5963
current_path: Vec::new(),
6064
errors: Vec::new(),
@@ -74,6 +78,15 @@ impl SchemaValidator {
7478
}
7579

7680
impl<'a> SchemaTransform<'a> for SchemaValidator {
81+
/// The default `transform_variant` recurses into the variant's struct fields
82+
/// (metadata, value) via `recurse_into_struct`. Those fields are protocol-defined
83+
/// and must be non-null -- they are not user-controlled schema columns. We override
84+
/// to return the variant struct unchanged, skipping recursion so the non-null check
85+
/// in `transform_struct_field` does not reject these fixed internal fields.
86+
fn transform_variant(&mut self, stype: &'a StructType) -> Option<Cow<'a, StructType>> {
87+
Some(Cow::Borrowed(stype))
88+
}
89+
7790
fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
7891
if let Err(e) = validate_field_name(field.name(), self.cm_enabled) {
7992
self.errors.push(e.to_string());
@@ -84,6 +97,13 @@ impl<'a> SchemaTransform<'a> for SchemaValidator {
8497
// separator would make column "a.b" indistinguishable from nested field b in
8598
// struct a. Null bytes cannot appear in column names, so they are safe to use.
8699
self.current_path.push(field.name().to_ascii_lowercase());
100+
if !self.invariants_enabled && !field.is_nullable() {
101+
self.errors.push(format!(
102+
"Non-null column '{}' is not supported during CREATE TABLE unless \
103+
writer feature 'invariants' is enabled",
104+
self.current_path.join(".")
105+
));
106+
}
87107
let key = self.current_path.join("\0");
88108
if !self.seen_paths.insert(key) {
89109
self.errors.push(format!(
@@ -274,6 +294,57 @@ mod tests {
274294
])
275295
}
276296

297+
fn schema_all_nullable() -> StructType {
298+
StructType::new_unchecked(vec![
299+
StructField::new("id", DataType::INTEGER, true),
300+
StructField::new("name", DataType::STRING, true),
301+
])
302+
}
303+
304+
fn schema_top_level_non_null() -> StructType {
305+
StructType::new_unchecked(vec![
306+
StructField::new("id", DataType::INTEGER, false),
307+
StructField::new("name", DataType::STRING, true),
308+
])
309+
}
310+
311+
fn schema_nested_non_null() -> StructType {
312+
let nested =
313+
StructType::new_unchecked(vec![StructField::new("child", DataType::INTEGER, false)]);
314+
StructType::new_unchecked(vec![StructField::new(
315+
"parent",
316+
DataType::Struct(Box::new(nested)),
317+
true,
318+
)])
319+
}
320+
321+
fn schema_array_nested_non_null() -> StructType {
322+
let nested =
323+
StructType::new_unchecked(vec![StructField::new("child", DataType::INTEGER, false)]);
324+
StructType::new_unchecked(vec![StructField::new(
325+
"arr",
326+
DataType::Array(Box::new(ArrayType::new(
327+
DataType::Struct(Box::new(nested)),
328+
true,
329+
))),
330+
true,
331+
)])
332+
}
333+
334+
fn schema_map_nested_non_null() -> StructType {
335+
let nested =
336+
StructType::new_unchecked(vec![StructField::new("child", DataType::INTEGER, false)]);
337+
StructType::new_unchecked(vec![StructField::new(
338+
"map",
339+
DataType::Map(Box::new(MapType::new(
340+
DataType::STRING,
341+
DataType::Struct(Box::new(nested)),
342+
true,
343+
))),
344+
true,
345+
)])
346+
}
347+
277348
// === Valid schemas ===
278349

279350
#[rstest]
@@ -283,7 +354,7 @@ mod tests {
283354
#[case::dot_in_name_with_cm(schema_with_dot(), ColumnMappingMode::Name)]
284355
#[case::different_struct_children(schema_different_struct_children(), ColumnMappingMode::None)]
285356
fn valid_schema_accepted(#[case] schema: StructType, #[case] cm: ColumnMappingMode) {
286-
assert!(validate_schema_for_create(&schema, cm).is_ok());
357+
assert!(validate_schema_for_create(&schema, cm, true).is_ok());
287358
}
288359

289360
// === Invalid schemas ===
@@ -305,7 +376,7 @@ mod tests {
305376
#[case] cm: ColumnMappingMode,
306377
#[case] expected_errs: &[&str],
307378
) {
308-
let result = validate_schema_for_create(&schema, cm);
379+
let result = validate_schema_for_create(&schema, cm, true);
309380
assert!(result.is_err());
310381
let err = result.unwrap_err().to_string();
311382
for expected in expected_errs {
@@ -315,4 +386,64 @@ mod tests {
315386
);
316387
}
317388
}
389+
390+
#[rstest]
391+
#[case::all_nullable_invariants_disabled(schema_all_nullable(), false, true, None)]
392+
#[case::top_level_non_null_invariants_disabled(
393+
schema_top_level_non_null(),
394+
false,
395+
false,
396+
Some("id")
397+
)]
398+
#[case::nested_non_null_invariants_disabled(
399+
schema_nested_non_null(),
400+
false,
401+
false,
402+
Some("parent.child")
403+
)]
404+
#[case::array_nested_non_null_invariants_disabled(
405+
schema_array_nested_non_null(),
406+
false,
407+
false,
408+
Some("arr.child")
409+
)]
410+
#[case::map_nested_non_null_invariants_disabled(
411+
schema_map_nested_non_null(),
412+
false,
413+
false,
414+
Some("map.child")
415+
)]
416+
#[case::top_level_non_null_invariants_enabled(schema_top_level_non_null(), true, true, None)]
417+
#[case::nested_non_null_invariants_enabled(schema_nested_non_null(), true, true, None)]
418+
fn non_null_columns_require_invariants_feature(
419+
#[case] schema: StructType,
420+
#[case] invariants_enabled: bool,
421+
#[case] expect_ok: bool,
422+
#[case] expected_path: Option<&str>,
423+
) {
424+
let result =
425+
validate_schema_for_create(&schema, ColumnMappingMode::None, invariants_enabled);
426+
427+
if expect_ok {
428+
assert!(result.is_ok(), "expected success, got {result:?}");
429+
return;
430+
}
431+
432+
let err = result.expect_err("expected non-null validation error");
433+
let msg = err.to_string();
434+
assert!(
435+
msg.contains("Non-null column"),
436+
"Expected non-null validation error, got: {msg}"
437+
);
438+
if let Some(path) = expected_path {
439+
assert!(
440+
msg.contains(path),
441+
"Expected path '{path}' in error message, got: {msg}"
442+
);
443+
}
444+
assert!(
445+
msg.contains("writer feature 'invariants'"),
446+
"Expected invariants guidance in error message, got: {msg}"
447+
);
448+
}
318449
}

kernel/src/snapshot/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3030,7 +3030,7 @@ mod tests {
30303030
let engine = DefaultEngineBuilder::new(storage).build();
30313031
let schema = Arc::new(
30323032
crate::schema::StructType::try_new(vec![
3033-
crate::schema::StructField::new("id", crate::schema::DataType::INTEGER, false),
3033+
crate::schema::StructField::new("id", crate::schema::DataType::INTEGER, true),
30343034
crate::schema::StructField::new("region", crate::schema::DataType::STRING, true),
30353035
])
30363036
.unwrap(),

kernel/src/transaction/builder/create_table.rs

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use crate::clustering::{create_clustering_domain_metadata, validate_clustering_c
1818
use crate::committer::Committer;
1919
use crate::expressions::ColumnName;
2020
use crate::log_segment::LogSegment;
21+
use crate::schema::validation::validate_schema_for_create;
2122
use crate::schema::variant_utils::schema_contains_variant_type;
2223
use crate::schema::{normalize_column_names_to_schema_casing, DataType, SchemaRef, StructType};
2324
use crate::snapshot::Snapshot;
@@ -648,7 +649,7 @@ impl CreateTableTransactionBuilder {
648649
/// # use delta_kernel::schema::{StructType, DataType, StructField};
649650
/// # use std::sync::Arc;
650651
/// # fn example() -> delta_kernel::DeltaResult<()> {
651-
/// # let schema = Arc::new(StructType::try_new(vec![StructField::new("id", DataType::INTEGER, false)])?);
652+
/// # let schema = Arc::new(StructType::try_new(vec![StructField::new("id", DataType::INTEGER, true)])?);
652653
/// let builder = create_table("/path/to/table", schema, "MyApp/1.0")
653654
/// .with_table_properties([
654655
/// ("myapp.version", "1.0"),
@@ -693,8 +694,8 @@ impl CreateTableTransactionBuilder {
693694
/// # use std::sync::Arc;
694695
/// # fn example() -> delta_kernel::DeltaResult<()> {
695696
/// # let schema = Arc::new(StructType::try_new(vec![
696-
/// # StructField::new("id", DataType::INTEGER, false),
697-
/// # StructField::new("date", DataType::STRING, false),
697+
/// # StructField::new("id", DataType::INTEGER, true),
698+
/// # StructField::new("date", DataType::STRING, true),
698699
/// # ])?);
699700
/// // Clustered layout:
700701
/// let builder = create_table("/path/to/table", schema.clone(), "MyApp/1.0")
@@ -721,6 +722,7 @@ impl CreateTableTransactionBuilder {
721722
/// - Checks that the table path is valid
722723
/// - Verifies the table doesn't already exist
723724
/// - Validates the schema is non-empty
725+
/// - Rejects non-null columns unless `invariants` writer feature is enabled
724726
/// - Validates the data layout is valid
725727
/// - Validates table properties against the allow list
726728
///
@@ -735,6 +737,7 @@ impl CreateTableTransactionBuilder {
735737
/// - The table path is invalid
736738
/// - A table already exists at the given path
737739
/// - The schema is empty
740+
/// - The schema has non-null columns without the `invariants` writer feature
738741
/// - The data layout is invalid
739742
/// - Unsupported delta properties or feature flags are specified
740743
pub fn build(
@@ -761,9 +764,12 @@ impl CreateTableTransactionBuilder {
761764
maybe_apply_column_mapping_for_table_create(&self.schema, &mut validated)?;
762765

763766
// Validate schema (non-empty, column names, duplicates)
764-
crate::schema::validation::validate_schema_for_create(
767+
validate_schema_for_create(
765768
&effective_schema,
766769
column_mapping_mode,
770+
validated
771+
.writer_features
772+
.contains(&TableFeature::Invariants),
767773
)?;
768774

769775
// Validate data layout and resolve column names (physical for clustering, logical

kernel/src/transaction/create_table.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
//! # fn example(engine: &dyn Engine) -> delta_kernel::DeltaResult<()> {
1616
//!
1717
//! let schema = Arc::new(StructType::try_new(vec![
18-
//! StructField::new("id", DataType::INTEGER, false),
18+
//! StructField::new("id", DataType::INTEGER, true),
1919
//! ])?);
2020
//!
2121
//! let result = create_table("/path/to/table", schema, "MyApp/1.0")
@@ -72,7 +72,7 @@ use crate::DeltaResult;
7272
/// # fn example(engine: &dyn Engine) -> delta_kernel::DeltaResult<()> {
7373
///
7474
/// let schema = Arc::new(StructType::try_new(vec![
75-
/// StructField::new("id", DataType::INTEGER, false),
75+
/// StructField::new("id", DataType::INTEGER, true),
7676
/// ])?);
7777
///
7878
/// let result = create_table("/path/to/table", schema, "MyApp/1.0")
@@ -106,7 +106,7 @@ pub type CreateTableTransaction = Transaction<CreateTable>;
106106
///
107107
/// # fn main() -> delta_kernel::DeltaResult<()> {
108108
/// let schema = Arc::new(StructType::new_unchecked(vec![
109-
/// StructField::new("id", DataType::INTEGER, false),
109+
/// StructField::new("id", DataType::INTEGER, true),
110110
/// StructField::new("name", DataType::STRING, true),
111111
/// ]));
112112
///

kernel/src/transaction/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2469,7 +2469,7 @@ mod tests {
24692469

24702470
// Create a non-catalog-managed table using a catalog committer
24712471
let schema = Arc::new(crate::schema::StructType::new_unchecked(vec![
2472-
crate::schema::StructField::new("id", crate::schema::DataType::INTEGER, false),
2472+
crate::schema::StructField::new("id", crate::schema::DataType::INTEGER, true),
24732473
]));
24742474
let committer = Box::new(MockCatalogCommitter);
24752475
let err = create_table("memory:///", schema, "test-engine")

0 commit comments

Comments
 (0)