From 0b0776c65158db3b07590479d54e10727f2d140b Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Fri, 1 Nov 2024 12:39:39 +0100 Subject: [PATCH] added OOM message pointing to new csv implementation --- .../org/jetbrains/kotlinx/dataframe/io/csv.kt | 99 ++++++++++--------- 1 file changed, 54 insertions(+), 45 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt index 605f9a3bf..a22c60ce0 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt @@ -351,61 +351,70 @@ public fun DataFrame.Companion.readDelim( readLines: Int? = null, parserOptions: ParserOptions? = null, ): AnyFrame { - var reader = reader - if (skipLines > 0) { - reader = BufferedReader(reader) - repeat(skipLines) { reader.readLine() } - } - - val csvParser = format.parse(reader) - val records = if (readLines == null) { - csvParser.records - } else { - require(readLines >= 0) { "`readLines` must not be negative" } - val records = ArrayList(readLines) - val iter = csvParser.iterator() - var count = readLines ?: 0 - while (iter.hasNext() && 0 < count--) { - records.add(iter.next()) + try { + var reader = reader + if (skipLines > 0) { + reader = BufferedReader(reader) + repeat(skipLines) { reader.readLine() } } - records - } - - val columnNames = csvParser.headerNames.takeIf { it.isNotEmpty() } - ?: (1..(records.firstOrNull()?.count() ?: 0)).map { index -> "X$index" } - val generator = ColumnNameGenerator() - val uniqueNames = columnNames.map { generator.addUnique(it) } + val csvParser = format.parse(reader) + val records = if (readLines == null) { + csvParser.records + } else { + require(readLines >= 0) { "`readLines` must not be negative" } + val records = ArrayList(readLines) + val iter = csvParser.iterator() + var count = readLines ?: 0 + while (iter.hasNext() && 0 < count--) { + records.add(iter.next()) + } + records + } - val cols = uniqueNames.mapIndexed { colIndex, colName -> - val defaultColType = colTypes[".default"] - val colType = colTypes[colName] ?: defaultColType - var hasNulls = false - val values = records.map { - if (it.isSet(colIndex)) { - it[colIndex].ifEmpty { + val columnNames = csvParser.headerNames.takeIf { it.isNotEmpty() } + ?: (1..(records.firstOrNull()?.count() ?: 0)).map { index -> "X$index" } + + val generator = ColumnNameGenerator() + val uniqueNames = columnNames.map { generator.addUnique(it) } + + val cols = uniqueNames.mapIndexed { colIndex, colName -> + val defaultColType = colTypes[".default"] + val colType = colTypes[colName] ?: defaultColType + var hasNulls = false + val values = records.map { + if (it.isSet(colIndex)) { + it[colIndex].ifEmpty { + hasNulls = true + null + } + } else { hasNulls = true null } - } else { - hasNulls = true - null } - } - val column = DataColumn.createValueColumn(colName, values, typeOf().withNullability(hasNulls)) - when (colType) { - null -> column.tryParse(parserOptions) - - else -> { - column.tryParse( - (parserOptions ?: ParserOptions()).copy( - skipTypes = ParserOptions.allTypesExcept(colType.toKType()), - ), - ) + val column = DataColumn.createValueColumn(colName, values, typeOf().withNullability(hasNulls)) + when (colType) { + null -> column.tryParse(parserOptions) + + else -> { + column.tryParse( + (parserOptions ?: ParserOptions()).copy( + skipTypes = ParserOptions.allTypesExcept(colType.toKType()), + ), + ) + } } } + return cols.toDataFrame() + } catch (e: OutOfMemoryError) { + throw OutOfMemoryError( + "Ran out of memory reading this CSV-like file. " + + "You can try our new experimental CSV reader by adding the dependency " + + "\"org.jetbrains.kotlinx:dataframe-csv:{VERSION}\" and using `DataFrame.readCsv()` instead of " + + "`DataFrame.readCSV()`. This requires `@OptIn(ExperimentalCsv::class)`.", + ) } - return cols.toDataFrame() } public fun AnyFrame.writeCSV(file: File, format: CSVFormat = CSVFormat.DEFAULT): Unit =