From acd7f59fc0764017a7339ea054d5bc7a8764a34f Mon Sep 17 00:00:00 2001
From: Balazs Kossovics
Date: Tue, 4 Apr 2017 18:09:46 +0200
Subject: [PATCH] Fix partition num when paths contain directories

The current method of calculating the number of partitions didn't take
into account the real size of the files inside a directory; it used the
size of the directory entry itself, i.e. 4k bytes. Directory sizes are
now computed by recursively listing the files they contain, so files in
nested subdirectories are counted as well.
---
 .../io/druid/indexer/spark/SparkDruidIndexer.scala | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/main/scala/io/druid/indexer/spark/SparkDruidIndexer.scala b/src/main/scala/io/druid/indexer/spark/SparkDruidIndexer.scala
index cd0e384..901d135 100644
--- a/src/main/scala/io/druid/indexer/spark/SparkDruidIndexer.scala
+++ b/src/main/scala/io/druid/indexer/spark/SparkDruidIndexer.scala
@@ -86,7 +86,18 @@ object SparkDruidIndexer {
       s => {
         val p = new Path(s)
         val fs = p.getFileSystem(sc.hadoopConfiguration)
-        fs.getFileStatus(p).getLen
+        if (fs.getFileStatus(p).isDirectory) {
+          // A directory's own FileStatus length is a nominal value (e.g. 4 KB),
+          // not the size of its contents, so sum the contained files instead.
+          var sum = 0L
+          val children = fs.listFiles(p, true)
+          while (children.hasNext) {
+            sum += children.next().getLen
+          }
+          sum
+        } else {
+          fs.getFileStatus(p).getLen
+        }
       }
     ).sum
     val startingPartitions = (totalGZSize / (100L << 20)).toInt + 1