Skip to content

Commit

Permalink
added describe support for incomparable number types (converting them…
Browse files Browse the repository at this point in the history
… to either double or bigdecimal) and added tests
  • Loading branch information
Jolanrensen committed Nov 7, 2024
1 parent ac3eb9a commit 0156765
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import org.jetbrains.kotlinx.dataframe.impl.isNothing
import org.jetbrains.kotlinx.dataframe.impl.projectTo
import org.jetbrains.kotlinx.dataframe.type
import org.jetbrains.kotlinx.dataframe.typeClass
import java.math.BigDecimal
import java.math.BigInteger
import kotlin.contracts.ExperimentalContracts
import kotlin.contracts.contract
import kotlin.reflect.KClass
Expand Down Expand Up @@ -44,6 +46,8 @@ public inline fun <reified T> AnyCol.isType(): Boolean = type() == typeOf<T>()

/**
 * Tells whether this column is a number column.
 *
 * @return the result of checking this column with `isSubtypeOf<Number?>()`, so the check is made
 *   against the nullable [Number] type.
 */
public fun AnyCol.isNumber(): Boolean {
    return isSubtypeOf<Number?>()
}

/**
 * Tells whether this column is a "big number" column.
 *
 * @return `true` when `isSubtypeOf` succeeds for either the nullable [BigInteger] type or the
 *   nullable [BigDecimal] type; the [BigInteger] check is evaluated first.
 */
public fun AnyCol.isBigNumber(): Boolean {
    if (isSubtypeOf<BigInteger?>()) return true
    return isSubtypeOf<BigDecimal?>()
}

/**
 * Tells whether this column is a list column.
 *
 * @return `true` when this column's `typeClass` is exactly [List]`::class`.
 */
public fun AnyCol.isList(): Boolean {
    return typeClass == List::class
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import org.jetbrains.kotlinx.dataframe.DataRow
import org.jetbrains.kotlinx.dataframe.api.Infer
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
import org.jetbrains.kotlinx.dataframe.util.GUESS_VALUE_TYPE
import java.math.BigDecimal
import java.math.BigInteger
import kotlin.reflect.KClass
import kotlin.reflect.KType
import kotlin.reflect.KTypeParameter
Expand All @@ -29,6 +31,7 @@ import kotlin.reflect.full.superclasses
import kotlin.reflect.full.withNullability
import kotlin.reflect.jvm.jvmErasure
import kotlin.reflect.typeOf
import kotlin.toBigDecimal as toBigDecimalKotlin

/**
 * Obtains the type of the reified parameter [T] via `typeOf<T>()` and passes it, together with
 * this class, to `projectTo`.
 *
 * NOTE(review): presumably this yields a type for the receiver class derived from [T]'s type
 * arguments — confirm against `projectTo`, which is defined elsewhere.
 */
internal inline fun <reified T> KClass<*>.createTypeUsing() = typeOf<T>().projectTo(this)

Expand Down Expand Up @@ -646,3 +649,18 @@ internal fun Any.asArrayAsListOrNull(): List<*>? =
is Array<*> -> asList()
else -> null
}

/** Returns `true` when this value is a [BigInteger] or a [BigDecimal], `false` for anything else. */
internal fun Any.isBigNumber(): Boolean =
    when (this) {
        is BigInteger, is BigDecimal -> true
        else -> false
    }

/**
 * Converts this [Number] to a [BigDecimal].
 *
 * - A [BigDecimal] receiver is returned unchanged.
 * - [BigInteger], [Int], [Long], [Float] and [Double] are converted with the Kotlin standard-library
 *   `toBigDecimal()` extension for that type (referenced through the `toBigDecimalKotlin` alias so it
 *   is not confused with this function).
 * - [Byte] and [Short] are widened to [Int] before being converted.
 * - Any other [Number] implementation is converted by parsing its `toString()` representation.
 *
 * NOTE(review): non-finite [Float]/[Double] values (NaN, infinities) have no [BigDecimal]
 * representation; presumably the conversion throws [NumberFormatException] for them — confirm.
 */
internal fun Number.toBigDecimal(): BigDecimal {
    return when (val number = this) {
        is Byte, is Short -> number.toInt().toBigDecimalKotlin()
        is Int -> number.toBigDecimalKotlin()
        is Long -> number.toBigDecimalKotlin()
        is Float -> number.toBigDecimalKotlin()
        is Double -> number.toBigDecimalKotlin()
        is BigInteger -> number.toBigDecimalKotlin()
        is BigDecimal -> number
        // Fallback for Number implementations without a dedicated conversion.
        else -> BigDecimal(number.toString())
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@ import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.ColumnDescription
import org.jetbrains.kotlinx.dataframe.api.add
import org.jetbrains.kotlinx.dataframe.api.after
import org.jetbrains.kotlinx.dataframe.api.any
import org.jetbrains.kotlinx.dataframe.api.asColumnGroup
import org.jetbrains.kotlinx.dataframe.api.asComparable
import org.jetbrains.kotlinx.dataframe.api.asNumbers
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.concat
import org.jetbrains.kotlinx.dataframe.api.isBigNumber
import org.jetbrains.kotlinx.dataframe.api.isComparable
import org.jetbrains.kotlinx.dataframe.api.isNumber
import org.jetbrains.kotlinx.dataframe.api.map
import org.jetbrains.kotlinx.dataframe.api.maxOrNull
import org.jetbrains.kotlinx.dataframe.api.mean
import org.jetbrains.kotlinx.dataframe.api.medianOrNull
Expand All @@ -25,7 +28,9 @@ import org.jetbrains.kotlinx.dataframe.columns.size
import org.jetbrains.kotlinx.dataframe.columns.values
import org.jetbrains.kotlinx.dataframe.impl.columns.addPath
import org.jetbrains.kotlinx.dataframe.impl.columns.asAnyFrameColumn
import org.jetbrains.kotlinx.dataframe.impl.isBigNumber
import org.jetbrains.kotlinx.dataframe.impl.renderType
import org.jetbrains.kotlinx.dataframe.impl.toBigDecimal
import org.jetbrains.kotlinx.dataframe.index
import org.jetbrains.kotlinx.dataframe.kind
import org.jetbrains.kotlinx.dataframe.type
Expand Down Expand Up @@ -78,12 +83,54 @@ internal fun describeImpl(cols: List<AnyCol>): DataFrame<ColumnDescription> {
ColumnDescription::mean from { if (it.isNumber()) it.asNumbers().mean() else null }
ColumnDescription::std from { if (it.isNumber()) it.asNumbers().std() else null }
}
if (hasComparable) {
ColumnDescription::min from inferType { if (it.isComparable()) it.asComparable().minOrNull() else null }
if (hasComparable || hasNumeric) {
ColumnDescription::min from inferType {
when {
it.isComparable() ->
it.asComparable().minOrNull()

// Found incomparable number types, convert all to Double or BigDecimal first
it.isNumber() ->
if (it.any { it?.isBigNumber() == true }) {
it.map { (it as Number?)?.toBigDecimal() }.minOrNull()
} else {
it.map { (it as Number?)?.toDouble() }.minOrNull()
}

else -> null
}
}
ColumnDescription::median from inferType {
if (it.isComparable()) it.asComparable().medianOrNull() else null
when {
it.isComparable() ->
it.asComparable().medianOrNull()

// Found incomparable number types, convert all to Double or BigDecimal first
it.isNumber() ->
if (it.any { it?.isBigNumber() == true }) {
it.map { (it as Number?)?.toBigDecimal() }.medianOrNull()
} else {
it.map { (it as Number?)?.toDouble() }.medianOrNull()
}

else -> null
}
}
ColumnDescription::max from inferType {
when {
it.isComparable() -> it.asComparable().maxOrNull()

// Found incomparable number types, convert all to Double or BigDecimal first
it.isNumber() ->
if (it.any { it?.isBigNumber() == true }) {
it.map { (it as Number?)?.toBigDecimal() }.maxOrNull()
} else {
it.map { (it as Number?)?.toDouble() }.maxOrNull()
}

else -> null
}
}
ColumnDescription::max from inferType { if (it.isComparable()) it.asComparable().maxOrNull() else null }
}
}
df = df.add(ColumnDescription::freq) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package org.jetbrains.kotlinx.dataframe.api

import io.kotest.matchers.shouldBe
import org.jetbrains.kotlinx.dataframe.alsoDebug
import org.junit.Test
import kotlin.reflect.typeOf

class DescribeTests {

Expand All @@ -11,4 +13,60 @@ class DescribeTests {
val df = dataFrameOf(a).drop(1)
df.describe()["min"][0] shouldBe null
}

// Describes a single column that mixes Int, Double, Float, Long, Short, Byte, BigInteger,
// BigDecimal and a null, then checks every statistic reported for it.
@Test
fun `describe nullable Number column`() {
// The values 1 through 8, each as a different Number type, followed by one null.
val a by columnOf(
1,
2.0,
3f,
4L,
5.toShort(),
6.toByte(),
7.toBigInteger(),
8.toBigDecimal(),
null,
)
val df = dataFrameOf(a)
// Only one column is described, so the description has exactly one row.
val describe = df.describe()
.alsoDebug()
.single()
with(describe) {
name shouldBe "a"
type shouldBe "Number?"
// count covers all 9 entries including the null; nulls reports the single null separately.
count shouldBe 9
unique shouldBe 9
nulls shouldBe 1
top shouldBe 1
freq shouldBe 1
// Mean of 1..8 is 4.5; the expected std equals sqrt(6), i.e. the (n - 1)-normalised
// standard deviation of 1..8.
mean shouldBe 4.5
std shouldBe 2.449489742783178
// The column contains BigInteger/BigDecimal values, so min, median and max are expected
// as BigDecimal.
min shouldBe 1.toBigDecimal()
median shouldBe 4.toBigDecimal()
max shouldBe 8.toBigDecimal()
}
}

// Describes a Double column that contains one NaN and pins the statistics currently reported
// for it.
@Test
fun `describe with NaNs`() {
val a by columnOf(1.0, 2.0, Double.NaN, 4.0)
val df = dataFrameOf(a)
// Only one column is described, so the description has exactly one row.
val describe = df.describe()
.alsoDebug()
.single()
with(describe) {
name shouldBe "a"
type shouldBe "Double"
// NaN is counted as a regular, non-null value.
count shouldBe 4
unique shouldBe 4
nulls shouldBe 0
top shouldBe 1
freq shouldBe 1
// mean and std are expected to be NaN.
mean.isNaN() shouldBe true
std.isNaN() shouldBe true
// min is expected to ignore the NaN while max is expected to be NaN; the TODO below
// records that this asymmetry is still an open question.
min shouldBe 1.0 // TODO should be NaN too?
median shouldBe 3.0
// NOTE(review): max is checked through a property-style `isNaN`, unlike the `isNaN()` calls
// on mean/std — presumably because max is not statically typed as Double; confirm.
max.isNaN shouldBe true
}
}
}

0 comments on commit 0156765

Please sign in to comment.