Skip to content

Commit 9c9a699

Browse files
committed
struct parsing for duckdb working!
1 parent 6539830 commit 9c9a699

File tree

2 files changed

+79
-5
lines changed
  • dataframe-jdbc/src
    • main/kotlin/org/jetbrains/kotlinx/dataframe/io/db
    • test/kotlin/org/jetbrains/kotlinx/dataframe/io/local

2 files changed

+79
-5
lines changed

dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt

Lines changed: 73 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,21 @@ import org.duckdb.DuckDBColumnType.UUID
4242
import org.duckdb.DuckDBColumnType.VARCHAR
4343
import org.duckdb.DuckDBResultSetMetaData
4444
import org.duckdb.JsonNode
45+
import org.jetbrains.kotlinx.dataframe.AnyFrame
46+
import org.jetbrains.kotlinx.dataframe.AnyRow
47+
import org.jetbrains.kotlinx.dataframe.DataColumn
4548
import org.jetbrains.kotlinx.dataframe.DataFrame
49+
import org.jetbrains.kotlinx.dataframe.DataRow
50+
import org.jetbrains.kotlinx.dataframe.api.Infer
51+
import org.jetbrains.kotlinx.dataframe.api.asColumnGroup
52+
import org.jetbrains.kotlinx.dataframe.api.asDataColumn
53+
import org.jetbrains.kotlinx.dataframe.api.cast
54+
import org.jetbrains.kotlinx.dataframe.api.castToNotNullable
55+
import org.jetbrains.kotlinx.dataframe.api.first
56+
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
57+
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
58+
import org.jetbrains.kotlinx.dataframe.impl.DataCollector
59+
import org.jetbrains.kotlinx.dataframe.impl.schema.DataFrameSchemaImpl
4660
import org.jetbrains.kotlinx.dataframe.io.DbConnectionConfig
4761
import org.jetbrains.kotlinx.dataframe.io.readAllSqlTables
4862
import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
@@ -56,6 +70,7 @@ import java.sql.ResultSet
5670
import java.sql.Struct
5771
import java.util.Properties
5872
import kotlin.collections.toList
73+
import kotlin.reflect.KClass
5974
import kotlin.reflect.KTypeProjection
6075
import kotlin.reflect.full.createType
6176
import kotlin.reflect.full.withNullability
@@ -100,7 +115,7 @@ public object DuckDb : DbType("duckdb") {
100115
*/
101116
internal fun parseDuckDbType(sqlTypeName: String, isNullable: Boolean): AnyTypeInformation =
102117
duckDbTypeCache.getOrPut(Pair(sqlTypeName, isNullable)) {
103-
when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) {
118+
return@getOrPut when (DuckDBResultSetMetaData.TypeNameToType(sqlTypeName)) {
104119
BOOLEAN -> typeInformationForValueColumnOf<Boolean>(isNullable)
105120

106121
TINYINT -> typeInformationForValueColumnOf<Byte>(isNullable)
@@ -208,9 +223,45 @@ public object DuckDb : DbType("duckdb") {
208223

209224
// TODO requires #1266 for specific types
210225
STRUCT -> {
211-
val structTypes = parseStructType(sqlTypeName)
226+
val structEntries = parseStructType(sqlTypeName)
227+
val parsedStructEntries = structEntries.mapValues { (_, type) ->
228+
parseDuckDbType(sqlTypeName = type, isNullable = true)
229+
}
212230

213-
typeInformationForValueColumnOf<Struct>(isNullable)
231+
val targetSchema = ColumnSchema.Group(
232+
schema = DataFrameSchemaImpl(parsedStructEntries.mapValues { it.value.targetSchema }),
233+
contentType = typeOf<Any?>(),
234+
)
235+
236+
typeInformationWithProcessingFor<Struct, Map<String, Any?>, DataRow<*>>(
237+
jdbcSourceType = typeOf<Struct>().withNullability(isNullable),
238+
targetSchema = targetSchema,
239+
valuePreprocessor = { struct, _ ->
240+
// NOTE: DataRows cannot be `null` in DataFrame; instead, all of their fields become `null`
241+
if (struct == null) {
242+
parsedStructEntries.mapValues { null }
243+
} else {
244+
// read data from the struct
245+
val attrs = struct.getAttributes(
246+
parsedStructEntries.mapValues {
247+
(it.value.jdbcSourceType.classifier!! as KClass<*>).java
248+
},
249+
)
250+
251+
// and potentially, preprocess each value individually
252+
parsedStructEntries.entries.withIndex().associate { (i, entry) ->
253+
entry.key to entry.value.castToAny().preprocess(attrs[i])
254+
}
255+
}
256+
},
257+
columnPostprocessor = { col, _ ->
258+
col.castToNotNullable()
259+
.values()
260+
.toDataFrame()
261+
.asColumnGroup(col.name())
262+
.asDataColumn()
263+
},
264+
)
214265
}
215266

216267
// Cannot handle this in Kotlin
@@ -222,6 +273,25 @@ public object DuckDb : DbType("duckdb") {
222273
}
223274
}
224275

276+
// Overriding buildDataColumn behavior so we can create the column group in post-processing for efficiency
277+
/**
 * Builds the [DataColumn] called [name] from [values], dispatching on `typeInformation.targetSchema`.
 *
 * When the target schema is a [ColumnSchema.Group], the values are wrapped in a plain value column
 * typed with the schema's `type` rather than being handed to the superclass implementation.
 * [Infer.Nulls] is passed when [inferNullability] is `true`, otherwise [Infer.None].
 * Every other kind of target schema is delegated unchanged to `super.buildDataColumn`.
 */
override fun <D : Any> buildDataColumn(
278+
name: String,
279+
values: List<D?>,
280+
typeInformation: TypeInformation<*, D, *>,
281+
inferNullability: Boolean,
282+
): DataColumn<D?> =
283+
when (val schema = typeInformation.targetSchema) {
284+
// Group schema: keep the raw values in a value column here; no column group is built in this function.
is ColumnSchema.Group ->
285+
DataColumn.createValueColumn(
286+
name = name,
287+
values = values,
288+
infer = if (inferNullability) Infer.Nulls else Infer.None,
289+
type = schema.type,
290+
)
291+
292+
// Any non-group schema keeps the default behavior of the parent type.
else -> super.buildDataColumn(name, values, typeInformation, inferNullability)
293+
}
294+
225295
private fun SqlArray.toList(): List<Any?> =
226296
when (val array = this.array) {
227297
is IntArray -> array.toList()

dataframe-jdbc/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/local/duckDbTest.kt

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,10 +255,13 @@ class DuckDbTest {
255255
}
256256
}
257257

258+
// Row shape used as the type of `NestedTypes.ijstructCol` (column "ijstruct_col"): an Int field `i` and a String field `j`.
@DataSchema
259+
data class NestedEntry(val i: Int, val j: String)
260+
258261
@DataSchema
259262
data class NestedTypes(
260263
@ColumnName("ijstruct_col")
261-
val ijstructCol: java.sql.Struct, // TODO
264+
val ijstructCol: NestedEntry, // TODO
262265
@ColumnName("intarray_col")
263266
val intarrayCol: List<Int?>,
264267
@ColumnName("intlist_col")
@@ -646,7 +649,8 @@ class DuckDbTest {
646649
1 to mapOf("value1" to "a", "value2" to "b"),
647650
200 to mapOf("value1" to "c", "value2" to "d"),
648651
)
649-
it[{ "ijstruct_col"<java.sql.Struct>() }].attributes shouldBe arrayOf<Any>(42, "answer")
652+
it[{ "ijstruct_col"["i"]<Int>() }] shouldBe 42
653+
it[{ "ijstruct_col"["j"]<String>() }] shouldBe "answer"
650654
it["union_col"] shouldBe 2
651655
}
652656
}

0 commit comments

Comments
 (0)