From 22b6e707a7a95faa5f1c34aa398ac9b1b66ebef2 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 17 Dec 2024 23:54:43 -0500 Subject: [PATCH] Add more comments. Rename test orc data name. Replace the incorrectly selected snappy data with the correct uncompressed version --- cpp/src/io/orc/stripe_data.cu | 10 +++++----- ...cFile.timestamp.desynced.snappy.RLEv2.orc} | Bin ...timestamp.desynced.uncompressed.RLEv2.orc} | Bin 5832 -> 5814 bytes python/cudf/cudf/tests/test_orc.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) rename python/cudf/cudf/tests/data/orc/{TestOrcFile.timestampDesynced.orc => TestOrcFile.timestamp.desynced.snappy.RLEv2.orc} (100%) rename python/cudf/cudf/tests/data/orc/{TestOrcFile.timestampDesyncedSnappy.orc => TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc} (93%) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 938cde6cddf..3499c7ad89b 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -138,8 +138,8 @@ struct orcdec_state_s { * This class is used to address a special case, where the first run spans two adjacent row groups * and its length is greater than the maximum length allowed to be consumed. This limit is imposed * by the decoder when processing the SECONDARY stream. This class shall be instantiated in the - * shared memory. As an optimization, the actual cache is a local variable and does not reside in - * the shared memory. + * shared memory, and be used to cache the DATA stream with a decoded data type of `int64_t`. As an + * optimization, the actual cache is a local variable and does not reside in the shared memory. */ class run_cache_manager { private: @@ -187,7 +187,7 @@ class run_cache_manager { * @brief Adjust the maximum length allowed to be consumed when the length of the first run is * greater than it. * - * @param[in] max_length The maximum length allowed to be consumed. + * @param[in] max_length The maximum length allowed to be consumed for the DATA stream. * @return A new maximum length. */ __device__ uint32_t adjust_max_length(uint32_t max_length) @@ -203,7 +203,7 @@ class run_cache_manager { * @brief Copy the excess data from the intermediate buffer for the DATA stream to the cache. * * @param[in] src Intermediate buffer for the DATA stream. - * @param[out] cache Local variable serving as the cache. + * @param[out] cache Local variable serving as the cache for the DATA stream. */ __device__ void write_to_cache(int64_t* src, int64_t& cache) { @@ -235,7 +235,7 @@ class run_cache_manager { * * @param[in,out] dst Intermediate buffer for the DATA stream. * @param[in,out] rle Run length decoder state object. - * @param[in] cache Local variable serving as the cache. + * @param[in] cache Local variable serving as the cache for the DATA stream. */ __device__ void read_from_cache(int64_t* dst, orc_rlev2_state_s* rle, int64_t cache) { diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestampDesynced.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc similarity index 100% rename from python/cudf/cudf/tests/data/orc/TestOrcFile.timestampDesynced.orc rename to python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestampDesyncedSnappy.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc similarity index 93% rename from python/cudf/cudf/tests/data/orc/TestOrcFile.timestampDesyncedSnappy.orc rename to python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc index a0ea4fbbfc2fdc0fc9f7b1adbdf4ec6fef161de8..8a7969cdbbb380dc92c2f5fc72642a990fb204f0 100644 GIT binary patch delta 255 zcmX@1yG^&=KggMjU5J&VfhT|g%y_~9WGHg6FaUv&Bv9N-NTT8Qj}^1b6*jHjvTByO z!OhDGKqcy2Y!WHQ_~tMR$#Oj5@Dh_~+6|Ib_;Pgb>RIL*2X8AF?3|+!z;N(?|J$br znVEp}Q!qWd|1N}o{o2!mLLiCj=Rg7nj$c0pqBqX+6E)&u<6skDkPrgXj1oRTno)pJ z!hnmNgB3`h(-LCgU=Uycl1u^&Qq1nYA(IuvR0KGf1^RTPcz{%##6K+sqsif7QR10E YgBluGl$aQJG*k@On!YeI1AQ$H0BKT9fdBvi delta 248 zcmdm{dqP*$Kgd~?fq{!%h?S#(Cji2D!V$nQkAWdTfJ>Q+je(W%z(iS@NKGyd28k4g zV|;U%g=9INaCnJHH0@?oa8USibnog}<{AfYD;VsYqY=Om!oXmlbMSxv+ouPanSk_D zRt}EY{da-n|LfOyISkH$#19<5ehfs<)neE<(NEMO73c;w4mJS>2~jZ3C=mpt83h<6 zbh+3$Sb_9KEg=>T1_1^j$t1ua#q91I;>o};Sx`(x_%Z_n2eZH|9Vs3ll`ip5OTl!q et5}qHjsTNHLj#Kv69bQix&d3$7iMOlcf|o5+&zx~ diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 47c067e1c0b..c2be65bce74 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1975,8 +1975,8 @@ def test_row_group_alignment(datadir): @pytest.mark.parametrize( "inputfile", [ - "TestOrcFile.timestampDesynced.orc", - "TestOrcFile.timestampDesyncedSnappy.orc", + "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc", + "TestOrcFile.timestamp.desynced.snappy.RLEv2.orc", ], ) def test_orc_reader_desynced_timestamp(datadir, inputfile):