Skip to content

Commit 365a9ec

Browse files
klion26 and alamb authored
[Variant] Add variant to arrow for Date64/Timestamp(Second/Millisecond)/Time32/Time64 (#8950)
# Which issue does this PR close? - Closes #8805. # What changes are included in this PR? Add support for converting variant values to the remaining Arrow primitive types, and some tests to cover them. For the conversions that can't be cast safely, I'll continue to track them in #8086 and #8873. > Self::make_value will return the native value for the given timestamp type > Date64Type::from_naive_date(v) will return the milliseconds elapsed since UNIX epoch | VariantType | Arrow Type | Logic | | -- | -- | -- | | Date | Date64 | datatypes::Date64Type::from_naive_date(v) | | Timestamp\[_ntz\](Micro/Nano) | Timestamp\[_ntz\](Second) | - if (timestamp.nano == 0) Self::make_value(timestamp) <br> - else None | | Timestamp\[_ntz\](Micro/Nano) | Timestamp\[_ntz\](Millisecond) | - if (timestamp.nano % 1_000_000 == 0) Self::make_value(timestamp) <br> - else None | | Time | Time32(Second) | - if (v.nano == 0) v.num_seconds_from_midnight() <br> - else None | | Time | Time32(Millisecond) | - if (v.nano % 1_000_000 == 0) v.num_seconds_from_midnight() * 1000 + v.nano / 1_000_000 <br> - else None | | Time | Time64(Nano) | v.num_seconds_from_midnight() * 1_000_000_000 + v.nano | # Are these changes tested? Added some new tests # Are there any user-facing changes? No --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 42b690c commit 365a9ec

File tree

3 files changed

+357
-32
lines changed

3 files changed

+357
-32
lines changed

parquet-variant-compute/src/type_conversion.rs

Lines changed: 80 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ macro_rules! impl_primitive_from_variant {
5656
impl PrimitiveFromVariant for $arrow_type {
5757
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native> {
5858
let value = variant.$variant_method();
59-
$( let value = value.map($cast_fn); )?
59+
$( let value = value.and_then($cast_fn); )?
6060
value
6161
}
6262
}
@@ -84,14 +84,87 @@ impl_primitive_from_variant!(datatypes::UInt64Type, as_u64);
8484
impl_primitive_from_variant!(datatypes::Float16Type, as_f16);
8585
impl_primitive_from_variant!(datatypes::Float32Type, as_f32);
8686
impl_primitive_from_variant!(datatypes::Float64Type, as_f64);
87-
impl_primitive_from_variant!(
88-
datatypes::Date32Type,
89-
as_naive_date,
90-
datatypes::Date32Type::from_naive_date
91-
);
87+
impl_primitive_from_variant!(datatypes::Date32Type, as_naive_date, |v| {
88+
Some(datatypes::Date32Type::from_naive_date(v))
89+
});
90+
impl_primitive_from_variant!(datatypes::Date64Type, as_naive_date, |v| {
91+
Some(datatypes::Date64Type::from_naive_date(v))
92+
});
93+
impl_primitive_from_variant!(datatypes::Time32SecondType, as_time_utc, |v| {
94+
// Return None if there are leftover nanoseconds
95+
if v.nanosecond() != 0 {
96+
None
97+
} else {
98+
Some(v.num_seconds_from_midnight() as i32)
99+
}
100+
});
101+
impl_primitive_from_variant!(datatypes::Time32MillisecondType, as_time_utc, |v| {
102+
// Return None if there is any sub-millisecond remainder (leftover micro- or nanoseconds)
103+
if v.nanosecond() % 1_000_000 != 0 {
104+
None
105+
} else {
106+
Some((v.num_seconds_from_midnight() * 1_000) as i32 + (v.nanosecond() / 1_000_000) as i32)
107+
}
108+
});
92109
impl_primitive_from_variant!(datatypes::Time64MicrosecondType, as_time_utc, |v| {
93-
(v.num_seconds_from_midnight() * 1_000_000 + v.nanosecond() / 1_000) as i64
110+
Some((v.num_seconds_from_midnight() * 1_000_000 + v.nanosecond() / 1_000) as i64)
94111
});
112+
impl_primitive_from_variant!(datatypes::Time64NanosecondType, as_time_utc, |v| {
113+
// Total nanoseconds since midnight: whole seconds scaled to nanoseconds (in i64) plus the sub-second nanoseconds
114+
Some(v.num_seconds_from_midnight() as i64 * 1_000_000_000 + v.nanosecond() as i64)
115+
});
116+
impl_timestamp_from_variant!(
117+
datatypes::TimestampSecondType,
118+
as_timestamp_ntz_nanos,
119+
ntz = true,
120+
|timestamp| {
121+
// Return None if there are leftover nanoseconds
122+
if timestamp.nanosecond() != 0 {
123+
None
124+
} else {
125+
Self::make_value(timestamp)
126+
}
127+
}
128+
);
129+
impl_timestamp_from_variant!(
130+
datatypes::TimestampSecondType,
131+
as_timestamp_nanos,
132+
ntz = false,
133+
|timestamp| {
134+
// Return None if there are leftover nanoseconds
135+
if timestamp.nanosecond() != 0 {
136+
None
137+
} else {
138+
Self::make_value(timestamp.naive_utc())
139+
}
140+
}
141+
);
142+
impl_timestamp_from_variant!(
143+
datatypes::TimestampMillisecondType,
144+
as_timestamp_ntz_nanos,
145+
ntz = true,
146+
|timestamp| {
147+
// Return None if there is any sub-millisecond remainder (leftover micro- or nanoseconds)
148+
if timestamp.nanosecond() % 1_000_000 != 0 {
149+
None
150+
} else {
151+
Self::make_value(timestamp)
152+
}
153+
}
154+
);
155+
impl_timestamp_from_variant!(
156+
datatypes::TimestampMillisecondType,
157+
as_timestamp_nanos,
158+
ntz = false,
159+
|timestamp| {
160+
// Return None if there is any sub-millisecond remainder (leftover micro- or nanoseconds)
161+
if timestamp.nanosecond() % 1_000_000 != 0 {
162+
None
163+
} else {
164+
Self::make_value(timestamp.naive_utc())
165+
}
166+
}
167+
);
95168
impl_timestamp_from_variant!(
96169
datatypes::TimestampMicrosecondType,
97170
as_timestamp_ntz_micros,

parquet-variant-compute/src/variant_get.rs

Lines changed: 202 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -337,10 +337,10 @@ mod test {
337337
use crate::{VariantArray, VariantArrayBuilder, json_to_variant};
338338
use arrow::array::{
339339
Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, Date32Array,
340-
Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, Float32Array,
341-
Float64Array, Int8Array, Int16Array, Int32Array, Int64Array, LargeBinaryArray,
342-
LargeStringArray, NullBuilder, StringArray, StringViewArray, StructArray,
343-
Time64MicrosecondArray,
340+
Date64Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array,
341+
Float32Array, Float64Array, Int8Array, Int16Array, Int32Array, Int64Array,
342+
LargeBinaryArray, LargeStringArray, NullBuilder, StringArray, StringViewArray, StructArray,
343+
Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
344344
};
345345
use arrow::buffer::NullBuffer;
346346
use arrow::compute::CastOptions;
@@ -1000,6 +1000,152 @@ mod test {
10001000
}
10011001
);
10021002

1003+
perfectly_shredded_variant_array_fn!(
1004+
perfectly_shredded_timestamp_micro_variant_array_for_second_and_milli_second,
1005+
|| {
1006+
arrow::array::TimestampMicrosecondArray::from(vec![
1007+
Some(1234), // can't be cast to second & millisecond
1008+
Some(1234000), // can be cast to millisecond, but not second
1009+
Some(1234000000), // can be cast to second & millisecond
1010+
])
1011+
.with_timezone("+00:00")
1012+
}
1013+
);
1014+
1015+
// The following two tests cover the micro (with timezone) -> milli/second cases.
1016+
// There are three test items: some can be cast safely and some can't.
1017+
perfectly_shredded_to_arrow_primitive_test!(
1018+
get_variant_perfectly_shredded_timestamp_micro_as_timestamp_second,
1019+
DataType::Timestamp(TimeUnit::Second, Some(Arc::from("+00:00"))),
1020+
perfectly_shredded_timestamp_micro_variant_array_for_second_and_milli_second,
1021+
arrow::array::TimestampSecondArray::from(vec![
1022+
None,
1023+
None, // Return None if can't be cast to second safely
1024+
Some(1234)
1025+
])
1026+
.with_timezone("+00:00")
1027+
);
1028+
1029+
perfectly_shredded_to_arrow_primitive_test!(
1030+
get_variant_perfectly_shredded_timestamp_micro_as_timestamp_milli,
1031+
DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::from("+00:00"))),
1032+
perfectly_shredded_timestamp_micro_variant_array_for_second_and_milli_second,
1033+
arrow::array::TimestampMillisecondArray::from(vec![
1034+
None, // Return None if can't be cast to millisecond safely
1035+
Some(1234),
1036+
Some(1234000)
1037+
])
1038+
.with_timezone("+00:00")
1039+
);
1040+
1041+
perfectly_shredded_variant_array_fn!(
1042+
perfectly_shredded_timestamp_micro_ntz_variant_array_for_second_and_milli_second,
1043+
|| {
1044+
arrow::array::TimestampMicrosecondArray::from(vec![
1045+
Some(1234), // can't be cast to second & millisecond
1046+
Some(1234000), // can be cast to millisecond, but not second
1047+
Some(1234000000), // can be cast to second & millisecond
1048+
])
1049+
}
1050+
);
1051+
1052+
// The following two tests cover the micro_ntz -> milli/second cases.
1053+
// There are three test items: some can be cast safely and some can't.
1054+
perfectly_shredded_to_arrow_primitive_test!(
1055+
get_variant_perfectly_shredded_timestamp_micro_ntz_as_timestamp_second,
1056+
DataType::Timestamp(TimeUnit::Second, None),
1057+
perfectly_shredded_timestamp_micro_ntz_variant_array_for_second_and_milli_second,
1058+
arrow::array::TimestampSecondArray::from(vec![
1059+
None,
1060+
None, // Return None if can't be cast to second safely
1061+
Some(1234)
1062+
])
1063+
);
1064+
1065+
perfectly_shredded_to_arrow_primitive_test!(
1066+
get_variant_perfectly_shredded_timestamp_micro_ntz_as_timestamp_milli,
1067+
DataType::Timestamp(TimeUnit::Millisecond, None),
1068+
perfectly_shredded_timestamp_micro_ntz_variant_array_for_second_and_milli_second,
1069+
arrow::array::TimestampMillisecondArray::from(vec![
1070+
None, // Return None if can't be cast to millisecond safely
1071+
Some(1234),
1072+
Some(1234000)
1073+
])
1074+
);
1075+
1076+
perfectly_shredded_variant_array_fn!(
1077+
perfectly_shredded_timestamp_nano_variant_array_for_second_and_milli_second,
1078+
|| {
1079+
arrow::array::TimestampNanosecondArray::from(vec![
1080+
Some(1234000), // can't be cast to second & millisecond
1081+
Some(1234000000), // can be cast to millisecond, but not second
1082+
Some(1234000000000), // can be cast to second & millisecond
1083+
])
1084+
.with_timezone("+00:00")
1085+
}
1086+
);
1087+
1088+
// The following two tests cover the nano (with timezone) -> milli/second cases.
1089+
// There are three test items: some can be cast safely and some can't.
1090+
perfectly_shredded_to_arrow_primitive_test!(
1091+
get_variant_perfectly_shredded_timestamp_nano_as_timestamp_second,
1092+
DataType::Timestamp(TimeUnit::Second, Some(Arc::from("+00:00"))),
1093+
perfectly_shredded_timestamp_nano_variant_array_for_second_and_milli_second,
1094+
arrow::array::TimestampSecondArray::from(vec![
1095+
None,
1096+
None, // Return None if can't be cast to second safely
1097+
Some(1234)
1098+
])
1099+
.with_timezone("+00:00")
1100+
);
1101+
1102+
perfectly_shredded_to_arrow_primitive_test!(
1103+
get_variant_perfectly_shredded_timestamp_nano_as_timestamp_milli,
1104+
DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::from("+00:00"))),
1105+
perfectly_shredded_timestamp_nano_variant_array_for_second_and_milli_second,
1106+
arrow::array::TimestampMillisecondArray::from(vec![
1107+
None, // Return None if can't be cast to millisecond safely
1108+
Some(1234),
1109+
Some(1234000)
1110+
])
1111+
.with_timezone("+00:00")
1112+
);
1113+
1114+
perfectly_shredded_variant_array_fn!(
1115+
perfectly_shredded_timestamp_nano_ntz_variant_array_for_second_and_milli_second,
1116+
|| {
1117+
arrow::array::TimestampNanosecondArray::from(vec![
1118+
Some(1234000), // can't be cast to second & millisecond
1119+
Some(1234000000), // can be cast to millisecond, but not second
1120+
Some(1234000000000), // can be cast to second & millisecond
1121+
])
1122+
}
1123+
);
1124+
1125+
// The following two tests cover the nano_ntz -> milli/second cases.
1126+
// There are three test items: some can be cast safely and some can't.
1127+
perfectly_shredded_to_arrow_primitive_test!(
1128+
get_variant_perfectly_shredded_timestamp_nano_ntz_as_timestamp_second,
1129+
DataType::Timestamp(TimeUnit::Second, None),
1130+
perfectly_shredded_timestamp_nano_ntz_variant_array_for_second_and_milli_second,
1131+
arrow::array::TimestampSecondArray::from(vec![
1132+
None,
1133+
None, // Return None if can't be cast to second safely
1134+
Some(1234)
1135+
])
1136+
);
1137+
1138+
perfectly_shredded_to_arrow_primitive_test!(
1139+
get_variant_perfectly_shredded_timestamp_nano_ntz_as_timestamp_milli,
1140+
DataType::Timestamp(TimeUnit::Millisecond, None),
1141+
perfectly_shredded_timestamp_nano_ntz_variant_array_for_second_and_milli_second,
1142+
arrow::array::TimestampMillisecondArray::from(vec![
1143+
None, // Return None if can't be cast to millisecond safely
1144+
Some(1234),
1145+
Some(1234000)
1146+
])
1147+
);
1148+
10031149
perfectly_shredded_to_arrow_primitive_test!(
10041150
get_variant_perfectly_shredded_timestamp_nano_ntz_as_timestamp_nano_ntz,
10051151
DataType::Timestamp(TimeUnit::Nanosecond, None),
@@ -1043,6 +1189,17 @@ mod test {
10431189
Date32Array::from(vec![Some(-12345), Some(17586), Some(20000)])
10441190
);
10451191

1192+
perfectly_shredded_to_arrow_primitive_test!(
1193+
get_variant_perfectly_shredded_date_as_date64,
1194+
DataType::Date64,
1195+
perfectly_shredded_date_variant_array,
1196+
Date64Array::from(vec![
1197+
Some(-1066608000000),
1198+
Some(1519430400000),
1199+
Some(1728000000000)
1200+
])
1201+
);
1202+
10461203
perfectly_shredded_variant_array_fn!(perfectly_shredded_time_variant_array, || {
10471204
Time64MicrosecondArray::from(vec![Some(12345000), Some(87654000), Some(135792000)])
10481205
});
@@ -1054,6 +1211,47 @@ mod test {
10541211
Time64MicrosecondArray::from(vec![Some(12345000), Some(87654000), Some(135792000)])
10551212
);
10561213

1214+
perfectly_shredded_to_arrow_primitive_test!(
1215+
get_variant_perfectly_shredded_time_as_time64_nano,
1216+
DataType::Time64(TimeUnit::Nanosecond),
1217+
perfectly_shredded_time_variant_array,
1218+
Time64NanosecondArray::from(vec![
1219+
Some(12345000000),
1220+
Some(87654000000),
1221+
Some(135792000000)
1222+
])
1223+
);
1224+
1225+
perfectly_shredded_variant_array_fn!(perfectly_shredded_time_variant_array_for_time32, || {
1226+
Time64MicrosecondArray::from(vec![
1227+
Some(1234), // This can't be cast to Time32 losslessly
1228+
Some(7654000), // This can be cast to Time32(Millisecond), but not Time32(Second)
1229+
Some(35792000000), // This can be cast to Time32(Second) & Time32(Millisecond)
1230+
])
1231+
});
1232+
1233+
perfectly_shredded_to_arrow_primitive_test!(
1234+
get_variant_perfectly_shredded_time_as_time32_second,
1235+
DataType::Time32(TimeUnit::Second),
1236+
perfectly_shredded_time_variant_array_for_time32,
1237+
Time32SecondArray::from(vec![
1238+
None,
1239+
None, // Return None if can't be cast to Time32(Second) safely
1240+
Some(35792)
1241+
])
1242+
);
1243+
1244+
perfectly_shredded_to_arrow_primitive_test!(
1245+
get_variant_perfectly_shredded_time_as_time32_milli,
1246+
DataType::Time32(TimeUnit::Millisecond),
1247+
perfectly_shredded_time_variant_array_for_time32,
1248+
Time32MillisecondArray::from(vec![
1249+
None, // Return None if can't be cast to Time32(Millisecond) safely
1250+
Some(7654),
1251+
Some(35792000)
1252+
])
1253+
);
1254+
10571255
perfectly_shredded_variant_array_fn!(perfectly_shredded_null_variant_array, || {
10581256
let mut builder = NullBuilder::new();
10591257
builder.append_nulls(3);

0 commit comments

Comments
 (0)