From 22b6e707a7a95faa5f1c34aa398ac9b1b66ebef2 Mon Sep 17 00:00:00 2001
From: Tianyu Liu <kingcrimsontianyu@gmail.com>
Date: Tue, 17 Dec 2024 23:54:43 -0500
Subject: [PATCH] Add more comments. Rename the ORC test data files. Replace the
 incorrectly selected snappy data with the correct uncompressed version

---
 cpp/src/io/orc/stripe_data.cu                 |  10 +++++-----
 ...cFile.timestamp.desynced.snappy.RLEv2.orc} | Bin
 ...timestamp.desynced.uncompressed.RLEv2.orc} | Bin 5832 -> 5814 bytes
 python/cudf/cudf/tests/test_orc.py            |   4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)
 rename python/cudf/cudf/tests/data/orc/{TestOrcFile.timestampDesynced.orc => TestOrcFile.timestamp.desynced.snappy.RLEv2.orc} (100%)
 rename python/cudf/cudf/tests/data/orc/{TestOrcFile.timestampDesyncedSnappy.orc => TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc} (93%)

diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 938cde6cddf..3499c7ad89b 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -138,8 +138,8 @@ struct orcdec_state_s {
  * This class is used to address a special case, where the first run spans two adjacent row groups
  * and its length is greater than the maximum length allowed to be consumed. This limit is imposed
  * by the decoder when processing the SECONDARY stream. This class shall be instantiated in the
- * shared memory. As an optimization, the actual cache is a local variable and does not reside in
- * the shared memory.
+ * shared memory, and be used to cache the DATA stream with a decoded data type of `int64_t`. As an
+ * optimization, the actual cache is a local variable and does not reside in the shared memory.
  */
 class run_cache_manager {
  private:
@@ -187,7 +187,7 @@ class run_cache_manager {
    * @brief Adjust the maximum length allowed to be consumed when the length of the first run is
    * greater than it.
    *
-   * @param[in] max_length The maximum length allowed to be consumed.
+   * @param[in] max_length The maximum length allowed to be consumed for the DATA stream.
    * @return A new maximum length.
    */
   __device__ uint32_t adjust_max_length(uint32_t max_length)
@@ -203,7 +203,7 @@ class run_cache_manager {
    * @brief Copy the excess data from the intermediate buffer for the DATA stream to the cache.
    *
    * @param[in] src Intermediate buffer for the DATA stream.
-   * @param[out] cache Local variable serving as the cache.
+   * @param[out] cache Local variable serving as the cache for the DATA stream.
    */
   __device__ void write_to_cache(int64_t* src, int64_t& cache)
   {
@@ -235,7 +235,7 @@ class run_cache_manager {
    *
    * @param[in,out] dst Intermediate buffer for the DATA stream.
    * @param[in,out] rle Run length decoder state object.
-   * @param[in] cache Local variable serving as the cache.
+   * @param[in] cache Local variable serving as the cache for the DATA stream.
    */
   __device__ void read_from_cache(int64_t* dst, orc_rlev2_state_s* rle, int64_t cache)
   {
diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestampDesynced.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc
similarity index 100%
rename from python/cudf/cudf/tests/data/orc/TestOrcFile.timestampDesynced.orc
rename to python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc
diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestampDesyncedSnappy.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc
similarity index 93%
rename from python/cudf/cudf/tests/data/orc/TestOrcFile.timestampDesyncedSnappy.orc
rename to python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc
index a0ea4fbbfc2fdc0fc9f7b1adbdf4ec6fef161de8..8a7969cdbbb380dc92c2f5fc72642a990fb204f0 100644
GIT binary patch
delta 255
zcmX@1yG^&=KggMjU5J&VfhT|g%y_~9WGHg6FaUv&Bv9N-NTT8Qj}^1b6*jHjvTByO
z!OhDGKqcy2Y!WHQ_~tMR$#Oj5@Dh_~+6|Ib_;Pgb>RIL*2X8AF?3|+!z;N(?|J$br
znVEp}Q!qWd|1N}o{o2!mLLiCj=Rg7nj$c0pqBqX+6E)&u<6skDkPrgXj1oRTno)pJ
z!hnmNgB3`h(-LCgU=Uycl1u^&Qq1nYA(IuvR0KGf1^RTPcz{%##6K+sqsif7QR10E
YgBluGl$aQJG*k@On!YeI1AQ$H0BKT9fdBvi

delta 248
zcmdm{dqP*$Kgd~?fq{!%h?S#(Cji2D!V$nQkAWdTfJ>Q+je(W%z(iS@NKGyd28k4g
zV|;U%g=9INaCnJHH0@?oa8USibnog}<{AfYD;VsYqY=Om!oXmlbMSxv+ouPanSk_D
zRt}EY{da-n|LfOyISkH$#19<5ehfs<)neE<(NEMO73c;w4mJS>2~jZ3C=mpt83h<6
zbh+3$Sb_9KEg=>T1_1^j$t1ua#q91I;>o};Sx`(x_%Z_n2eZH|9Vs3ll`ip5OTl!q
et5}qHjsTNHLj#Kv69bQix&d3$7iMOlcf|o5+&zx~

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 47c067e1c0b..c2be65bce74 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1975,8 +1975,8 @@ def test_row_group_alignment(datadir):
 @pytest.mark.parametrize(
     "inputfile",
     [
-        "TestOrcFile.timestampDesynced.orc",
-        "TestOrcFile.timestampDesyncedSnappy.orc",
+        "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc",
+        "TestOrcFile.timestamp.desynced.snappy.RLEv2.orc",
     ],
 )
 def test_orc_reader_desynced_timestamp(datadir, inputfile):