Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 66 additions & 3 deletions datafusion/functions-nested/src/resize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,14 @@ fn general_list_resize<O: OffsetSizeTrait + TryInto<i64>>(
}
}

if output_values_len > max_resize_values(&data_type)
|| O::from_usize(output_values_len).is_none()
{
return exec_err!(
"array_resize: resulting array of {output_values_len} elements exceeds the maximum array size"
);
}

// The fast path is valid when at least one row grows and every row would
// use the same fill value.
let use_bulk_fill = max_extra > 0
Expand Down Expand Up @@ -342,12 +350,27 @@ where
)?))
}

/// Largest element count whose eager value buffer stays within `isize::MAX`
/// bytes, so `array_resize` rejects oversized results instead of panicking.
/// Only primitive and `FixedSizeBinary` leaves are byte-exact.
fn max_resize_values(value_type: &DataType) -> usize {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the compiler prob materialize it, but would it be better to prematerialize it manually through

static LazyLock<HashMap<DataType, usize>> ?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since there are parameterized types I would need to keep match right? I'm not sure it would be better but can't say I've a strong opinion. If you say go for it I can change it that way

let element_width = match value_type {
DataType::FixedSizeBinary(size) if *size > 0 => {
(*size as usize).saturating_mul(u8::BITS as usize)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need .saturating_mul(u8::BITS as usize)?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we don't actually I actually mixed size of bytes and bits there. It just narrowed the limit 1/8 but size is already byte and we should return it there. changing this and rerunning tests now

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for the heads up 🙏

}
_ => value_type.primitive_width().unwrap_or(size_of::<u128>()),
};
(isize::MAX as usize) / element_width.max(1)
}

#[cfg(test)]
mod tests {
use super::array_resize_inner;
use arrow::array::{ArrayRef, AsArray, Int64Array, ListArray};
use arrow::buffer::{NullBuffer, ScalarBuffer};
use arrow::datatypes::Int32Type;
use arrow::array::{
ArrayRef, AsArray, FixedSizeBinaryArray, Int64Array, LargeListArray, ListArray,
};
use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer};
use arrow::datatypes::{DataType, Field, Int32Type, Int64Type};
use datafusion_common::Result;
use std::sync::Arc;

Expand All @@ -373,4 +396,44 @@ mod tests {

Ok(())
}

#[test]
fn test_array_resize_large_size_errors_without_panicking() {
let array: ArrayRef =
Arc::new(ListArray::from_iter_primitive::<Int64Type, _, _>(vec![
Some(vec![Some(1)]),
]));
let size: ArrayRef = Arc::new(Int64Array::from(vec![i64::MAX]));
let fill: ArrayRef = Arc::new(Int64Array::from(vec![0]));

let err = array_resize_inner(&[array, size, fill]).unwrap_err();
assert!(
err.to_string().contains("exceeds the maximum array size"),
"unexpected error: {err}"
);
}

#[test]
fn test_array_resize_fixed_size_binary_large_size_errors_without_panicking() {
// FixedSizeBinary is byte-accounted separately from the generic fallback.
let values =
FixedSizeBinaryArray::try_from_iter(vec![vec![0u8; 32]].into_iter()).unwrap();
let elem_field =
Arc::new(Field::new_list_field(DataType::FixedSizeBinary(32), true));
let offsets = OffsetBuffer::<i64>::new(vec![0i64, 1].into());
let array: ArrayRef = Arc::new(LargeListArray::new(
elem_field,
offsets,
Arc::new(values) as ArrayRef,
None,
));
// Passes the width-16 bound (isize::MAX / 16) but overflows at width 32.
let size: ArrayRef = Arc::new(Int64Array::from(vec![400_000_000_000_000_000i64]));

let err = array_resize_inner(&[array, size]).unwrap_err();
assert!(
err.to_string().contains("exceeds the maximum array size"),
"unexpected error: {err}"
);
}
}
12 changes: 12 additions & 0 deletions datafusion/sqllogictest/test_files/array/array_resize.slt
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,18 @@ select array_resize(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 5, 4);
query error
select array_resize(make_array(1, 2, 3), -5, 2);

# array_resize with a very large size should error instead of panicking (capacity overflow)
query error DataFusion error: Execution error: array_resize: resulting array of 9223372036854775807 elements exceeds the maximum array size
select array_resize(make_array(1), 9223372036854775807, 0);

query error DataFusion error: Execution error: array_resize: resulting array of 9223372036854775807 elements exceeds the maximum array size
select array_resize(arrow_cast(make_array(1), 'LargeList(Int64)'), 9223372036854775807, 0);

# List size above i32::MAX must error via the offset-type guard instead of
# silently truncating the offsets or attempting a multi-GB allocation
query error DataFusion error: Execution error: array_resize: resulting array of 3000000000 elements exceeds the maximum array size
select array_resize(make_array(1), 3000000000, 0);

# array_resize scalar function #5
query ?
select array_resize(make_array(1.1, 2.2, 3.3), 10, 9.9);
Expand Down
Loading