From 0156765f658fd67fa6d1473aab97b07ee2c2b065 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Thu, 7 Nov 2024 21:40:10 +0100 Subject: [PATCH] added describe support for incomparable number types (converting them to either double or bigdecimal) and added tests --- .../kotlinx/dataframe/api/DataColumnType.kt | 4 ++ .../kotlinx/dataframe/impl/TypeUtils.kt | 18 ++++++ .../kotlinx/dataframe/impl/api/describe.kt | 55 ++++++++++++++++-- .../kotlinx/dataframe/api/describe.kt | 58 +++++++++++++++++++ 4 files changed, 131 insertions(+), 4 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/DataColumnType.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/DataColumnType.kt index 2c7887ab3..91cd02eb4 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/DataColumnType.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/DataColumnType.kt @@ -11,6 +11,8 @@ import org.jetbrains.kotlinx.dataframe.impl.isNothing import org.jetbrains.kotlinx.dataframe.impl.projectTo import org.jetbrains.kotlinx.dataframe.type import org.jetbrains.kotlinx.dataframe.typeClass +import java.math.BigDecimal +import java.math.BigInteger import kotlin.contracts.ExperimentalContracts import kotlin.contracts.contract import kotlin.reflect.KClass @@ -44,6 +46,8 @@ public inline fun AnyCol.isType(): Boolean = type() == typeOf() public fun AnyCol.isNumber(): Boolean = isSubtypeOf() +public fun AnyCol.isBigNumber(): Boolean = isSubtypeOf() || isSubtypeOf() + public fun AnyCol.isList(): Boolean = typeClass == List::class /** diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/TypeUtils.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/TypeUtils.kt index 199568630..54c12362c 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/TypeUtils.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/TypeUtils.kt @@ -11,6 +11,8 @@ import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.api.Infer import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType import org.jetbrains.kotlinx.dataframe.util.GUESS_VALUE_TYPE +import java.math.BigDecimal +import java.math.BigInteger import kotlin.reflect.KClass import kotlin.reflect.KType import kotlin.reflect.KTypeParameter @@ -29,6 +31,7 @@ import kotlin.reflect.full.superclasses import kotlin.reflect.full.withNullability import kotlin.reflect.jvm.jvmErasure import kotlin.reflect.typeOf +import kotlin.toBigDecimal as toBigDecimalKotlin internal inline fun KClass<*>.createTypeUsing() = typeOf().projectTo(this) @@ -646,3 +649,18 @@ internal fun Any.asArrayAsListOrNull(): List<*>? = is Array<*> -> asList() else -> null } + +internal fun Any.isBigNumber(): Boolean = this is BigInteger || this is BigDecimal + +internal fun Number.toBigDecimal(): BigDecimal = + when (this) { + is BigDecimal -> this + is BigInteger -> this.toBigDecimalKotlin() + is Int -> this.toBigDecimalKotlin() + is Byte -> this.toInt().toBigDecimalKotlin() + is Short -> this.toInt().toBigDecimalKotlin() + is Long -> this.toBigDecimalKotlin() + is Float -> this.toBigDecimalKotlin() + is Double -> this.toBigDecimalKotlin() + else -> BigDecimal(this.toString()) + } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/describe.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/describe.kt index dad18f490..946ddc8a2 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/describe.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/describe.kt @@ -5,13 +5,16 @@ import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.api.ColumnDescription import org.jetbrains.kotlinx.dataframe.api.add import org.jetbrains.kotlinx.dataframe.api.after +import org.jetbrains.kotlinx.dataframe.api.any import org.jetbrains.kotlinx.dataframe.api.asColumnGroup import org.jetbrains.kotlinx.dataframe.api.asComparable import org.jetbrains.kotlinx.dataframe.api.asNumbers import org.jetbrains.kotlinx.dataframe.api.cast import org.jetbrains.kotlinx.dataframe.api.concat +import org.jetbrains.kotlinx.dataframe.api.isBigNumber import org.jetbrains.kotlinx.dataframe.api.isComparable import org.jetbrains.kotlinx.dataframe.api.isNumber +import org.jetbrains.kotlinx.dataframe.api.map import org.jetbrains.kotlinx.dataframe.api.maxOrNull import org.jetbrains.kotlinx.dataframe.api.mean import org.jetbrains.kotlinx.dataframe.api.medianOrNull @@ -25,7 +28,9 @@ import org.jetbrains.kotlinx.dataframe.columns.size import org.jetbrains.kotlinx.dataframe.columns.values import org.jetbrains.kotlinx.dataframe.impl.columns.addPath import org.jetbrains.kotlinx.dataframe.impl.columns.asAnyFrameColumn +import org.jetbrains.kotlinx.dataframe.impl.isBigNumber import org.jetbrains.kotlinx.dataframe.impl.renderType +import org.jetbrains.kotlinx.dataframe.impl.toBigDecimal import org.jetbrains.kotlinx.dataframe.index import org.jetbrains.kotlinx.dataframe.kind import org.jetbrains.kotlinx.dataframe.type @@ -78,12 +83,54 @@ internal fun describeImpl(cols: List): DataFrame { ColumnDescription::mean from { if (it.isNumber()) it.asNumbers().mean() else null } ColumnDescription::std from { if (it.isNumber()) it.asNumbers().std() else null } } - if (hasComparable) { - ColumnDescription::min from inferType { if (it.isComparable()) it.asComparable().minOrNull() else null } + if (hasComparable || hasNumeric) { + ColumnDescription::min from inferType { + when { + it.isComparable() -> + it.asComparable().minOrNull() + + // Found incomparable number types, convert all to Double or BigDecimal first + it.isNumber() -> + if (it.any { it?.isBigNumber() == true }) { + it.map { (it as Number?)?.toBigDecimal() }.minOrNull() + } else { + it.map { (it as Number?)?.toDouble() }.minOrNull() + } + + else -> null + } + } ColumnDescription::median from inferType { - if (it.isComparable()) it.asComparable().medianOrNull() else null + when { + it.isComparable() -> + it.asComparable().medianOrNull() + + // Found incomparable number types, convert all to Double or BigDecimal first + it.isNumber() -> + if (it.any { it?.isBigNumber() == true }) { + it.map { (it as Number?)?.toBigDecimal() }.medianOrNull() + } else { + it.map { (it as Number?)?.toDouble() }.medianOrNull() + } + + else -> null + } + } + ColumnDescription::max from inferType { + when { + it.isComparable() -> it.asComparable().maxOrNull() + + // Found incomparable number types, convert all to Double or BigDecimal first + it.isNumber() -> + if (it.any { it?.isBigNumber() == true }) { + it.map { (it as Number?)?.toBigDecimal() }.maxOrNull() + } else { + it.map { (it as Number?)?.toDouble() }.maxOrNull() + } + + else -> null + } } - ColumnDescription::max from inferType { if (it.isComparable()) it.asComparable().maxOrNull() else null } } } df = df.add(ColumnDescription::freq) { diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/describe.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/describe.kt index a418e8744..6f6366a60 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/describe.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/describe.kt @@ -1,7 +1,9 @@ package org.jetbrains.kotlinx.dataframe.api import io.kotest.matchers.shouldBe +import org.jetbrains.kotlinx.dataframe.alsoDebug import org.junit.Test +import kotlin.reflect.typeOf class DescribeTests { @@ -11,4 +13,60 @@ class DescribeTests { val df = dataFrameOf(a).drop(1) df.describe()["min"][0] shouldBe null } + + @Test + fun `describe nullable Number column`() { + val a by columnOf( + 1, + 2.0, + 3f, + 4L, + 5.toShort(), + 6.toByte(), + 7.toBigInteger(), + 8.toBigDecimal(), + null, + ) + val df = dataFrameOf(a) + val describe = df.describe() + .alsoDebug() + .single() + with(describe) { + name shouldBe "a" + type shouldBe "Number?" + count shouldBe 9 + unique shouldBe 9 + nulls shouldBe 1 + top shouldBe 1 + freq shouldBe 1 + mean shouldBe 4.5 + std shouldBe 2.449489742783178 + min shouldBe 1.toBigDecimal() + median shouldBe 4.toBigDecimal() + max shouldBe 8.toBigDecimal() + } + } + + @Test + fun `describe with NaNs`() { + val a by columnOf(1.0, 2.0, Double.NaN, 4.0) + val df = dataFrameOf(a) + val describe = df.describe() + .alsoDebug() + .single() + with(describe) { + name shouldBe "a" + type shouldBe "Double" + count shouldBe 4 + unique shouldBe 4 + nulls shouldBe 0 + top shouldBe 1 + freq shouldBe 1 + mean.isNaN() shouldBe true + std.isNaN() shouldBe true + min shouldBe 1.0 // TODO should be NaN too? + median shouldBe 3.0 + max.isNaN shouldBe true + } + } }