Skip to content

Commit 136e975

Browse files
fix: propagate null bitmap in evaluate_map_to_struct (#2419)
## What changes are proposed in this pull request? `evaluate_map_to_struct` creates the output `StructArray` with `None` as the null buffer, even when the input `MapArray` has null rows. This causes Arrow validation to reject the array with `"Found unmasked nulls for non-nullable StructArray field"` when any output field is non-nullable. This manifests during checkpoint creation for partitioned tables where: 1. A partition column is declared `NOT NULL` in the table schema 2. `delta.checkpoint.writeStatsAsStruct = true` 3. The delta log contains non-add actions (remove/metadata/protocol) where `add` is null The `COALESCE(partitionValues_parsed, MAP_TO_STRUCT(partitionValues))` expression evaluates `MAP_TO_STRUCT` for all rows. For rows where the map is null, child builders get null values appended but the output struct has no null mask to cover them. **Context:** `MapToStruct` was introduced in #1895 and used for `partitionValues_parsed` in #1932. ## How was this change tested? --------- Co-authored-by: Momcilo Mrkaic <momcilomrk-db@users.noreply.github.com>
1 parent 8d3e10a commit 136e975

1 file changed

Lines changed: 107 additions & 8 deletions

File tree

kernel/src/engine/arrow_expression/evaluate_expression.rs

Lines changed: 107 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -902,10 +902,22 @@ fn evaluate_map_to_struct(
902902
.map(|f| ArrowField::try_from_kernel(*f))
903903
.try_collect()?;
904904

905+
// Propagate the input map's null bitmap to the output struct. This is critical:
906+
// when a map row is null, the loop above appends null to every child builder
907+
// (since no keys match). Without this null bitmap, the output struct row appears
908+
// valid (non-null) to Arrow, but its children contain nulls. If any child field
909+
// is non-nullable, Arrow rejects this as "Found unmasked nulls for non-nullable
910+
// StructArray field". With the bitmap, the struct row is marked null, which masks
911+
// the child nulls and satisfies Arrow's validation.
912+
//
913+
// This matters during checkpoint creation: the COALESCE expression evaluates
914+
// MAP_TO_STRUCT for all rows including non-add actions (remove, metadata, protocol)
915+
// where the partition values map is null. Partition columns declared NOT NULL would
916+
// cause the checkpoint to fail without this propagation.
905917
Ok(StructArray::try_new(
906918
arrow_fields.into(),
907919
output_columns,
908-
None,
920+
map_array.nulls().cloned(),
909921
)?)
910922
}
911923

@@ -924,7 +936,8 @@ mod tests {
924936

925937
use super::*;
926938
use crate::arrow::array::{
927-
ArrayRef, BooleanArray, Int32Array, Int64Array, StringArray, StructArray,
939+
ArrayRef, BooleanArray, Int32Array, Int64Array, MapBuilder, StringArray, StringBuilder,
940+
StructArray,
928941
};
929942
use crate::arrow::datatypes::{
930943
DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema,
@@ -1971,8 +1984,6 @@ mod tests {
19711984

19721985
/// Helper: creates a RecordBatch with a `pv` column of type Map<String, String>.
19731986
fn create_partition_map_batch() -> RecordBatch {
1974-
use crate::arrow::array::{MapBuilder, StringBuilder};
1975-
19761987
let mut builder = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
19771988

19781989
// Row 0: {"date": "2024-01-15", "region": "us", "id": "42"}
@@ -2073,8 +2084,6 @@ mod tests {
20732084

20742085
#[test]
20752086
fn test_map_to_struct_parse_error() {
2076-
use crate::arrow::array::{MapBuilder, StringBuilder};
2077-
20782087
let mut builder = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
20792088
builder.keys().append_value("count");
20802089
builder.values().append_value("not_a_number");
@@ -2098,8 +2107,6 @@ mod tests {
20982107

20992108
#[test]
21002109
fn test_map_to_struct_duplicate_keys() {
2101-
use crate::arrow::array::{MapBuilder, StringBuilder};
2102-
21032110
let mut builder = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
21042111
builder.keys().append_value("x");
21052112
builder.values().append_value("first");
@@ -2130,6 +2137,98 @@ mod tests {
21302137
assert_eq!(col.value(0), "last");
21312138
}
21322139

2140+
#[rstest]
2141+
#[case::mixed_nulls(
2142+
vec![
2143+
Some(vec![("region", "us"), ("id", "42")]),
2144+
None,
2145+
Some(vec![("region", "eu"), ("id", "7")]),
2146+
],
2147+
vec![true, false, true],
2148+
)]
2149+
#[case::all_nulls(vec![None, None], vec![false, false])]
2150+
fn test_map_to_struct_null_propagation_with_non_nullable_fields(
2151+
#[case] rows: Vec<Option<Vec<(&str, &str)>>>,
2152+
#[case] expected_validity: Vec<bool>,
2153+
) {
2154+
let mut builder = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
2155+
for row in &rows {
2156+
match row {
2157+
Some(entries) => {
2158+
for (k, v) in entries {
2159+
builder.keys().append_value(k);
2160+
builder.values().append_value(v);
2161+
}
2162+
builder.append(true).unwrap();
2163+
}
2164+
None => {
2165+
builder.append(false).unwrap();
2166+
}
2167+
}
2168+
}
2169+
let map_array = builder.finish();
2170+
let schema = ArrowSchema::new(vec![ArrowField::new(
2171+
"pv",
2172+
map_array.data_type().clone(),
2173+
true,
2174+
)]);
2175+
let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]).unwrap();
2176+
2177+
let output_schema = StructType::new_unchecked(vec![
2178+
StructField::new("region", DataType::STRING, false),
2179+
StructField::new("id", DataType::INTEGER, false),
2180+
]);
2181+
let result_type = DataType::Struct(Box::new(output_schema));
2182+
let expr = Expr::map_to_struct(column_expr!("pv"));
2183+
let result = evaluate_expression(&expr, &batch, Some(&result_type)).unwrap();
2184+
let structs = result.as_any().downcast_ref::<StructArray>().unwrap();
2185+
2186+
assert_eq!(structs.len(), expected_validity.len());
2187+
for (i, &valid) in expected_validity.iter().enumerate() {
2188+
assert_eq!(structs.is_valid(i), valid, "row {i} validity mismatch");
2189+
}
2190+
}
2191+
2192+
#[test]
2193+
fn test_coalesce_map_to_struct_with_null_map_non_nullable_fields() {
2194+
let mut builder = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new());
2195+
builder.keys().append_value("date");
2196+
builder.values().append_value("2024-01-15");
2197+
builder.append(true).unwrap();
2198+
builder.append(false).unwrap();
2199+
2200+
let map_array = builder.finish();
2201+
let map_type = map_array.data_type().clone();
2202+
let schema = Arc::new(ArrowSchema::new(vec![
2203+
ArrowField::new(
2204+
"pv_parsed",
2205+
ArrowDataType::Struct(
2206+
vec![ArrowField::new("date", ArrowDataType::Date32, false)].into(),
2207+
),
2208+
true,
2209+
),
2210+
ArrowField::new("pv", map_type, true),
2211+
]));
2212+
2213+
let pv_parsed = new_null_array(schema.field(0).data_type(), 2);
2214+
let batch = RecordBatch::try_new(schema, vec![pv_parsed, Arc::new(map_array)]).unwrap();
2215+
2216+
let output_schema =
2217+
StructType::new_unchecked(vec![StructField::new("date", DataType::DATE, false)]);
2218+
let result_type = DataType::Struct(Box::new(output_schema));
2219+
let expr = Expr::coalesce([
2220+
Expr::column(["pv_parsed"]),
2221+
Expr::map_to_struct(column_expr!("pv")),
2222+
]);
2223+
let result = evaluate_expression(&expr, &batch, Some(&result_type)).unwrap();
2224+
let structs = result.as_any().downcast_ref::<StructArray>().unwrap();
2225+
2226+
// Row 0: pv_parsed null, MAP_TO_STRUCT succeeds
2227+
assert!(structs.is_valid(0));
2228+
// Row 1: pv_parsed null, map null → null struct
2229+
assert!(structs.is_null(1));
2230+
}
2231+
21332232
#[test]
21342233
fn test_map_to_struct_non_map_input() {
21352234
let schema = ArrowSchema::new(vec![ArrowField::new("s", ArrowDataType::Utf8, true)]);

0 commit comments

Comments
 (0)