Skip to content

Commit

Permalink
Merge pull request #168 from jeffbrennan/update_column_to_list
Browse files Browse the repository at this point in the history
utilize more performant column to list implementation
  • Loading branch information
jeffbrennan authored Jan 21, 2024
2 parents 26c3422 + 05c5dfb commit ac5958c
Showing 1 changed file with 13 additions and 1 deletion.
14 changes: 13 additions & 1 deletion quinn/dataframe_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

if TYPE_CHECKING:
from pyspark.sql import DataFrame, SparkSession
import sys
from typing import Any

from pyspark.sql.types import StructField, StructType
Expand All @@ -19,7 +20,18 @@ def column_to_list(df: DataFrame, col_name: str) -> list[Any]:
:return: List of values
:rtype: List[Any]
"""
return [x[col_name] for x in df.select(col_name).collect()]
pyarrow_kv = ("spark.sql.execution.arrow.pyspark.enabled", "true")
spark_config = df.sparkSession.sparkContext.getConf().getAll()
pyarrow_enabled: bool = pyarrow_kv in spark_config
pyarrow_valid = pyarrow_enabled and sys.modules["pyarrow"] >= "0.17.0"

pandas_exists = "pandas" in sys.modules
pandas_valid = pandas_exists and sys.modules["pandas"].__version__ >= "0.24.2"

if pyarrow_valid and pandas_valid:
return df.select(col_name).toPandas()[col_name].tolist()

return df.select(col_name).rdd.flatMap(lambda x: x).collect()


def two_columns_to_dictionary(
Expand Down

0 comments on commit ac5958c

Please sign in to comment.