Quinn
+ + +Pyspark helper methods to maximize developer productivity.
+Quinn provides DataFrame validation functions, useful column functions / DataFrame transformations, and performant helper functions.
Quinn is uploaded to PyPI and can be installed with this command:
+pip install quinn
+
+import quinn
+
+validate_presence_of_columns()
+quinn.validate_presence_of_columns(source_df, ["name", "age", "fun"])
+
+Raises an exception unless source_df
contains the name
, age
, and fun
column.
validate_schema()
+quinn.validate_schema(source_df, required_schema)
+
+Raises an exception unless source_df
contains all the StructFields
defined in the required_schema
.
validate_absence_of_columns()
+quinn.validate_absence_of_columns(source_df, ["age", "cool"])
+
+Raises an exception if source_df
contains age
or cool
columns.
single_space()
+actual_df = source_df.withColumn(
+ "words_single_spaced",
+ quinn.single_space(col("words"))
+)
+
Replaces all multispaces with single spaces (e.g. changes "this   has     some" to "this has some").
remove_all_whitespace()
+actual_df = source_df.withColumn(
+ "words_without_whitespace",
+ quinn.remove_all_whitespace(col("words"))
+)
+
Removes all whitespace in a string (e.g. changes "this has some" to "thishassome").
anti_trim()
+actual_df = source_df.withColumn(
+ "words_anti_trimmed",
+ quinn.anti_trim(col("words"))
+)
+
Removes all inner whitespace, but doesn't delete leading or trailing whitespace (e.g. changes " this has some " to " thishassome ").
remove_non_word_characters()
+actual_df = source_df.withColumn(
+ "words_without_nonword_chars",
+ quinn.remove_non_word_characters(col("words"))
+)
+
Removes all non-word characters from a string (e.g. changes "si%$#@!#$!@#mpsons" to "simpsons").
multi_equals()
+source_df.withColumn(
+ "are_s1_and_s2_cat",
+ quinn.multi_equals("cat")(col("s1"), col("s2"))
+)
+
+multi_equals
returns true if s1
and s2
are both equal to "cat"
.
approx_equal()
This function takes three arguments: two PySpark Columns and a numeric threshold. It returns a Boolean column that indicates whether the two columns are equal within the threshold.
For example, let the columns be:
+col1 = [1.2, 2.5, 3.1, 4.0, 5.5]
+col2 = [1.3, 2.3, 3.0, 3.9, 5.6]
+threshold = 0.2
+
+result = approx_equal(col("col1"), col("col2"), threshold)
+result.show()
+
++-----+
+|value|
++-----+
+| true|
+|false|
+| true|
+| true|
+| true|
++-----+
+
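For reproducibility, here is a minimal sketch of how the example above could be set up; the DataFrame and column names are illustrative and it assumes an active SparkSession named spark and that approx_equal is exported at the package level like the other quinn functions.

from pyspark.sql.functions import col
import quinn

# Hypothetical DataFrame matching the example values above.
df = spark.createDataFrame(
    [(1.2, 1.3), (2.5, 2.3), (3.1, 3.0), (4.0, 3.9), (5.5, 5.6)],
    ["col1", "col2"],
)
result = df.select(quinn.approx_equal(col("col1"), col("col2"), 0.2).alias("value"))
result.show()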
+array_choice()
This function takes a Column as a parameter and returns a PySpark Column containing a random value drawn from the input column.
+df = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,)], ["values"])
+result = df.select(array_choice(col("values")))
+
The output is:
++--------------+
+|array_choice()|
++--------------+
+| 2|
++--------------+
+
+
+regexp_extract_all()
regexp_extract_all takes two parameters: a string s and a regular expression regexp. It finds all matches of the regular expression in the string.
print(regexp_extract_all("this is a example text message for testing application",r"\b\w*a\w*\b"))
+
The output is:
+['a', 'example', 'message', 'application']
+
+
Here the r"\b\w*a\w*\b" pattern matches words containing the letter a.
week_start_date()
It takes two parameters, a column and week_start_day, and returns a Spark DataFrame column containing the start date of the week. By default week_start_day is set to "Sun".
For input ["2023-03-05", "2023-03-06", "2023-03-07", "2023-03-08"] the output is:
result = df.select("date", week_start_date(col("date"), "Sun"))
+result.show()
++----------+----------------+
+| date|week_start_date |
++----------+----------------+
+|2023-03-05| 2023-03-05|
+|2023-03-07| 2023-03-05|
+|2023-03-08| 2023-03-05|
++----------+----------------+
+
+week_end_date()
It also takes two parameters, a column and week_end_day, and returns a DataFrame column containing the end date of the week. By default week_end_day is set to "Sat".
+----------+-------------+
|      date|week_end_date|
+----------+-------------+
|2023-03-05|   2023-03-05|
|2023-03-07|   2023-03-12|
|2023-03-08|   2023-03-12|
+----------+-------------+
+
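A sketch that would produce a table like the one above; the DataFrame is hypothetical, an active SparkSession named spark is assumed, and the week is ended on Sunday to match the dates shown rather than the default "Sat".

from pyspark.sql.functions import col
import quinn

# Hypothetical DataFrame of date strings cast to dates.
df = spark.createDataFrame(
    [("2023-03-05",), ("2023-03-07",), ("2023-03-08",)], ["date"]
).withColumn("date", col("date").cast("date"))

result = df.select("date", quinn.week_end_date(col("date"), "Sun").alias("week_end_date"))
result.show()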
+
+uuid5()
This function generates a UUIDv5 string from the passed column, with an optional namespace and an optional extra salt. By default the namespace is the NAMESPACE_DNS UUID and no extra string is used to reduce hash collisions.
+
+df = spark.createDataFrame([("lorem",), ("ipsum",)], ["values"])
+result = df.select(quinn.uuid5(F.col("values")).alias("uuid5"))
+result.show(truncate=False)
+
The output is:
++------------------------------------+
+|uuid5 |
++------------------------------------+
+|35482fda-c10a-5076-8da2-dc7bf22d6be4|
+|51b79c1d-d06c-5b30-a5c6-1fadcd3b2103|
++------------------------------------+
+
+
+snake_case_col_names()
+quinn.snake_case_col_names(source_df)
+
+Converts all the column names in a DataFrame to snake_case. It's annoying to write SQL queries when columns aren't snake cased.
+sort_columns()
+quinn.sort_columns(source_df, "asc")
+
+Sorts the DataFrame columns in alphabetical order. Wide DataFrames are easier to navigate when they're sorted alphabetically.
+column_to_list()
+quinn.column_to_list(source_df, "name")
+
+Converts a column in a DataFrame to a list of values.
+two_columns_to_dictionary()
+quinn.two_columns_to_dictionary(source_df, "name", "age")
+
+Converts two columns of a DataFrame into a dictionary. In this example, name
is the key and age
is the value.
to_list_of_dictionaries()
+quinn.to_list_of_dictionaries(source_df)
+
+Converts an entire DataFrame into a list of dictionaries.
+from quinn.extensions import *
+
+create_df()
+spark.create_df(
+ [("jose", "a"), ("li", "b"), ("sam", "c")],
+ [("name", StringType(), True), ("blah", StringType(), True)]
+)
+
+Creates DataFrame with a syntax that's less verbose than the built-in createDataFrame
method.
isFalsy()
+source_df.withColumn("is_stuff_falsy", F.col("has_stuff").isFalsy())
+
+Returns True
if has_stuff
is None
or False
.
isTruthy()
+source_df.withColumn("is_stuff_truthy", F.col("has_stuff").isTruthy())
+
+Returns True
unless has_stuff
is None
or False
.
isNullOrBlank()
+source_df.withColumn("is_blah_null_or_blank", F.col("blah").isNullOrBlank())
+
+Returns True
if blah
is null
or blank (the empty string or a string that only contains whitespace).
isNotIn()
+source_df.withColumn("is_not_bobs_hobby", F.col("fun_thing").isNotIn(bobs_hobbies))
+
+Returns True
if fun_thing
is not included in the bobs_hobbies
list.
nullBetween()
+source_df.withColumn("is_between", F.col("age").nullBetween(F.col("lower_age"), F.col("upper_age")))
+
+Returns True
if age
is between lower_age
and upper_age
. If lower_age
is populated and upper_age
is null
, it will return True
if age
is greater than or equal to lower_age
. If lower_age
is null
and upper_age
is populated, it will return True
if age
is lower than or equal to upper_age
.
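To make the null-handling rules concrete, here is a hedged sketch; the rows are illustrative and an active SparkSession named spark is assumed, with the Column extensions imported as described above.

import pyspark.sql.functions as F
from quinn.extensions import *  # attaches nullBetween to Column

# Illustrative rows: (age, lower_age, upper_age).
source_df = spark.createDataFrame(
    [(17, 10, 20), (17, None, 20), (17, 10, None), (5, 10, None)],
    ["age", "lower_age", "upper_age"],
)
source_df.withColumn(
    "is_between", F.col("age").nullBetween(F.col("lower_age"), F.col("upper_age"))
).show()
# Expected: True (within both bounds), True (only upper bound, 17 <= 20),
# True (only lower bound, 17 >= 10), False (only lower bound, 5 < 10)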
We are actively looking for feature requests, pull requests, and bug fixes.
+Any developer that demonstrates excellence will be invited to be a maintainer of the project.
We are using the PySpark code style and sphinx as the docstring format. For more details about the sphinx format, see this tutorial. A short example of a sphinx-formatted docstring is shown below:
"""[Summary]
+
+:param [ParamName]: [ParamDescription], defaults to [DefaultParamVal]
+:type [ParamName]: [ParamType](, optional)
+...
+:raises [ErrorType]: [ErrorDescription]
+...
+:return: [ReturnDescription]
+:rtype: [ReturnType]
+"""
+
+
+ SchemaMismatchError
+
+
+
+ Bases: ValueError
raise this when there's a schema mismatch between source & target schema
+ + +quinn/append_if_schema_identical.py
class SchemaMismatchError(ValueError):
+ """raise this when there's a schema mismatch between source & target schema"""
+ append_if_schema_identical(source_df, target_df)
+
Compares the schemas of the source and target DataFrames.
Parameters:

Name | Type | Description | Default
---|---|---|---
source_df | DataFrame | Input DataFrame | required
target_df | DataFrame | Input DataFrame | required

Returns:

Type | Description
---|---
pyspark.sql.DataFrame | dataframe
quinn/append_if_schema_identical.py
def append_if_schema_identical(source_df: DataFrame, target_df: DataFrame) -> DataFrame:
+ """Compares the schema of source & target dataframe .
+ :param source_df: Input DataFrame
+ :type source_df: pyspark.sql.DataFrame
+ :param target_df: Input DataFrame
+ :type target_df: pyspark.sql.DataFrame
+ :return: dataframe
+ :rtype: pyspark.sql.DataFrame
+ """
+ # Retrieve the schemas of the source and target dataframes
+ source_schema = source_df.schema
+ target_schema = target_df.schema
+
+ # Convert the schemas to a list of tuples
+ source_schema_list = [(field.name, str(field.dataType)) for field in source_schema]
+ target_schema_list = [(field.name, str(field.dataType)) for field in target_schema]
+
+ unmatched_cols = [col for col in source_schema_list if col not in target_schema_list]
+ error_message = f"The schemas of the source and target dataframes are not identical." \
+ f"From source schema column {unmatched_cols} is missing in target schema"
+ # Check if the column names in the source and target schemas are the same, regardless of their order
+ if set(source_schema.fieldNames()) != set(target_schema.fieldNames()):
+ raise SchemaMismatchError(error_message)
+ # Check if the column names and data types in the source and target schemas are the same, in the same order
+ if sorted(source_schema_list) != sorted(target_schema_list):
+ raise SchemaMismatchError(error_message)
+
+ # Append the dataframes if the schemas are identical
+ appended_df = target_df.unionByName(source_df)
+ return appended_df
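A hedged usage sketch of the success and failure paths described above; the DataFrames are hypothetical and an active SparkSession named spark is assumed.

from quinn.append_if_schema_identical import append_if_schema_identical, SchemaMismatchError

# Hypothetical DataFrames with identical schemas.
target_df = spark.createDataFrame([(1, "a")], ["id", "label"])
source_df = spark.createDataFrame([(2, "b")], ["id", "label"])
appended = append_if_schema_identical(source_df, target_df)  # succeeds, 2 rows

mismatched_df = spark.createDataFrame([("c", 3)], ["label", "extra"])
# append_if_schema_identical(mismatched_df, target_df)  # would raise SchemaMismatchError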
+ column_to_list(df, col_name)
+
+Collect column to list of values.
Parameters:

Name | Type | Description | Default
---|---|---|---
df | DataFrame | Input DataFrame | required
col_name | str | Column to collect | required

Returns:

Type | Description
---|---
List[Any] | List of values
quinn/dataframe_helpers.py
def column_to_list(df: DataFrame, col_name: str) -> List[Any]:
+ """Collect column to list of values.
+
+ :param df: Input DataFrame
+ :type df: pyspark.sql.DataFrame
+ :param col_name: Column to collect
+ :type col_name: str
+ :return: List of values
+ :rtype: List[Any]
+ """
+ return [x[col_name] for x in df.select(col_name).collect()]
+ print_athena_create_table(df, athena_table_name, s3location)
+
+Generates the Athena create table statement for a given DataFrame
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df |
+
+ DataFrame
+ |
+ The pyspark.sql.DataFrame to use |
+ + required + | +
athena_table_name |
+
+ str
+ |
+ The name of the athena table to generate |
+ + required + | +
s3location |
+
+ str
+ |
+ The S3 location of the parquet data |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ None
+ |
+ None |
+
quinn/dataframe_helpers.py
def print_athena_create_table(
+ df: DataFrame, athena_table_name: str, s3location: str
+) -> None:
+ """Generates the Athena create table statement for a given DataFrame
+
+ :param df: The pyspark.sql.DataFrame to use
+ :param athena_table_name: The name of the athena table to generate
+ :param s3location: The S3 location of the parquet data
+ :return: None
+ """
+ fields = df.schema
+
+ print(f"CREATE EXTERNAL TABLE IF NOT EXISTS `{athena_table_name}` ( ")
+
+ for field in fields.fieldNames()[:-1]:
+ print("\t", f"`{fields[field].name}` {fields[field].dataType.simpleString()}, ")
+ last = fields[fields.fieldNames()[-1]]
+ print("\t", f"`{last.name}` {last.dataType.simpleString()} ")
+
+ print(")")
+ print("STORED AS PARQUET")
+ print(f"LOCATION '{s3location}'\n")
+ show_output_to_df(show_output, spark)
+
+Show output as spark DataFrame
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
show_output |
+
+ str
+ |
+ String representing output of 'show' command in spark |
+ + required + | +
spark |
+
+ SparkSession
+ |
+ SparkSession object |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Dataframe
+ |
+ DataFrame object containing output of a show command in spark |
+
quinn/dataframe_helpers.py
def show_output_to_df(show_output: str, spark: SparkSession) -> DataFrame:
+ """Show output as spark DataFrame
+
+ :param show_output: String representing output of 'show' command in spark
+ :type show_output: str
+ :param spark: SparkSession object
+ :type spark: SparkSession
+ :return: DataFrame object containing output of a show command in spark
+ :rtype: Dataframe
+ """
+ l = show_output.split("\n")
+ ugly_column_names = l[1]
+ pretty_column_names = [i.strip() for i in ugly_column_names[1:-1].split("|")]
+ pretty_data = []
+ ugly_data = l[3:-1]
+ for row in ugly_data:
+ r = [i.strip() for i in row[1:-1].split("|")]
+ pretty_data.append(tuple(r))
+ return spark.createDataFrame(pretty_data, pretty_column_names)
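A hedged sketch of round-tripping a captured show() string back into a DataFrame; the input string is illustrative, an active SparkSession named spark is assumed, and it is assumed the function is exported at the package level.

import quinn

# Illustrative string in the format produced by DataFrame.show().
show_output = """+----+-----+
|  id|label|
+----+-----+
|   1|    a|
|   2|    b|
+----+-----+"""
df = quinn.show_output_to_df(show_output, spark)
df.show()  # note: all values come back as strings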
+ to_list_of_dictionaries(df)
+
+Convert a Spark DataFrame to a list of dictionaries.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df |
+
+ DataFrame
+ |
+ The Spark DataFrame to convert. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ List[Dict[str, Any]]
+ |
+ A list of dictionaries representing the rows in the DataFrame. |
+
quinn/dataframe_helpers.py
def to_list_of_dictionaries(df: DataFrame) -> List[Dict[str, Any]]:
+ """Convert a Spark DataFrame to a list of dictionaries.
+
+ :param df: The Spark DataFrame to convert.
+ :type df: :py:class:`pyspark.sql.DataFrame`
+ :return: A list of dictionaries representing the rows in the DataFrame.
+ :rtype: List[Dict[str, Any]]
+ """
+ return list(map(lambda r: r.asDict(), df.collect()))
+ two_columns_to_dictionary(df, key_col_name, value_col_name)
+
+Collect two columns as dictionary when first column is key and second is value.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df |
+
+ DataFrame
+ |
+ Input DataFrame |
+ + required + | +
key_col_name |
+
+ str
+ |
+ Key-column |
+ + required + | +
value_col_name |
+
+ str
+ |
+ Value-column |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Dict[str, Any]
+ |
+ Dictionary with values |
+
quinn/dataframe_helpers.py
def two_columns_to_dictionary(
+ df: DataFrame, key_col_name: str, value_col_name: str
+) -> Dict[str, Any]:
+ """Collect two columns as dictionary when first column is key and second is value.
+
+ :param df: Input DataFrame
+ :type df: pyspark.sql.DataFrame
+ :param key_col_name: Key-column
+ :type key_col_name: str
+ :param value_col_name: Value-column
+ :type value_col_name: str
+ :return: Dictionary with values
+ :rtype: Dict[str, Any]
+ """
+ k, v = key_col_name, value_col_name
+ return {x[k]: x[v] for x in df.select(k, v).collect()}
+ DataFrameMissingColumnError
+
+
+
+ Bases: ValueError
raise this when there's a DataFrame column error
+ + +quinn/dataframe_validator.py
class DataFrameMissingColumnError(ValueError):
+ """raise this when there's a DataFrame column error"""
+ DataFrameMissingStructFieldError
+
+
+
+ Bases: ValueError
raise this when there's a DataFrame column error
+ + +quinn/dataframe_validator.py
class DataFrameMissingStructFieldError(ValueError):
+ """raise this when there's a DataFrame column error"""
+ DataFrameProhibitedColumnError
+
+
+
+ Bases: ValueError
raise this when a DataFrame includes prohibited columns
+ + +quinn/dataframe_validator.py
class DataFrameProhibitedColumnError(ValueError):
+ """raise this when a DataFrame includes prohibited columns"""
+ validate_absence_of_columns(df, prohibited_col_names)
+
+Validate that none of the prohibited column names are present among +specified DataFrame columns.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df |
+
+ DataFrame
+ |
+ DataFrame containing columns to be checked. |
+ + required + | +
prohibited_col_names |
+
+ List[str]
+ |
+ List of prohibited column names. |
+ + required + | +
Raises:
+Type | +Description | +
---|---|
+ DataFrameProhibitedColumnError
+ |
+ If the prohibited column names are present among the specified DataFrame columns. |
+
quinn/dataframe_validator.py
def validate_absence_of_columns(df: DataFrame, prohibited_col_names: List[str]) -> None:
+ """
+ Validate that none of the prohibited column names are present among
+ specified DataFrame columns.
+
+ :param df: DataFrame containing columns to be checked.
+ :param prohibited_col_names: List of prohibited column names.
+ :raises DataFrameProhibitedColumnError: If the prohibited column names are
+ present among the specified DataFrame columns.
+ """
+ all_col_names = df.columns
+ extra_col_names = [x for x in all_col_names if x in prohibited_col_names]
+ error_message = "The {extra_col_names} columns are not allowed to be included in the DataFrame with the following columns {all_col_names}".format(
+ extra_col_names=extra_col_names, all_col_names=all_col_names
+ )
+ if extra_col_names:
+ raise DataFrameProhibitedColumnError(error_message)
+ validate_presence_of_columns(df, required_col_names)
+
+Validates the presence of column names in a DataFrame.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df |
+
+ DataFrame
+ |
+ A spark DataFrame. |
+ + required + | +
required_col_names |
+
+ List[str]
+ |
+ List of the required column names for the DataFrame. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ None
+ |
+ None. |
+
Raises:
+Type | +Description | +
---|---|
+ DataFrameMissingColumnError
+ |
+ if any of the requested column names are not present in the DataFrame. |
+
quinn/dataframe_validator.py
def validate_presence_of_columns(df: DataFrame, required_col_names: List[str]) -> None:
+ """Validates the presence of column names in a DataFrame.
+
+ :param df: A spark DataFrame.
+ :type df: DataFrame`
+ :param required_col_names: List of the required column names for the DataFrame.
+ :type required_col_names: :py:class:`list` of :py:class:`str`
+ :return: None.
+ :raises DataFrameMissingColumnError: if any of the requested column names are
+ not present in the DataFrame.
+ """
+ all_col_names = df.columns
+ missing_col_names = [x for x in required_col_names if x not in all_col_names]
+ error_message = "The {missing_col_names} columns are not included in the DataFrame with the following columns {all_col_names}".format(
+ missing_col_names=missing_col_names, all_col_names=all_col_names
+ )
+ if missing_col_names:
+ raise DataFrameMissingColumnError(error_message)
+ validate_schema(df, required_schema, ignore_nullable=False)
+
+This function will validate that a given DataFrame has a given StructType as its +schema.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df |
+
+ DataFrame
+ |
+ DataFrame to validate |
+ + required + | +
required_schema |
+
+ StructType
+ |
+ StructType required for the DataFrame |
+ + required + | +
ignore_nullable |
+
+ bool
+ |
+ (Optional) A flag for if nullable fields should be ignored during validation |
+
+ False
+ |
+
Raises:
+Type | +Description | +
---|---|
+ DataFrameMissingStructFieldError
+ |
+ if any StructFields from the required schema are not included in the DataFrame schema |
+
quinn/dataframe_validator.py
def validate_schema(df: DataFrame, required_schema: StructType, ignore_nullable: bool=False) -> None:
+ """
+ This function will validate that a given DataFrame has a given StructType as its
+ schema.
+
+ :param df: DataFrame to validate
+ :type df: DataFrame
+ :param required_schema: StructType required for the DataFrame
+ :type required_schema: StructType
+ :param ignore_nullable: (Optional) A flag for if nullable fields should be
+ ignored during validation
+ :type ignore_nullable: bool, optional
+
+ :raises DataFrameMissingStructFieldError: if any StructFields from the required
+ schema are not included in the DataFrame schema
+ """
+ _all_struct_fields = copy.deepcopy(df.schema)
+ _required_schema = copy.deepcopy(required_schema)
+
+ if ignore_nullable:
+ for x in _all_struct_fields:
+ x.nullable = None
+
+ for x in _required_schema:
+ x.nullable = None
+
+ missing_struct_fields = [x for x in _required_schema if x not in _all_struct_fields]
+ error_message = "The {missing_struct_fields} StructFields are not included in the DataFrame with the following StructFields {all_struct_fields}".format(
+ missing_struct_fields=missing_struct_fields,
+ all_struct_fields=_all_struct_fields,
+ )
+ if missing_struct_fields:
+ raise DataFrameMissingStructFieldError(error_message)
+ isFalse(self)
+
+This function checks if the column is equal to False and returns the column.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
self |
+
+ Column
+ |
+ Column |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ Column |
+
quinn/extensions/column_ext.py
def isFalse(self: Column) -> Column:
+ """This function checks if the column is equal to False and returns the column.
+
+ :param self: Column
+ :return: Column
+ :rtype: Column
+ """
+    return self == lit(False)  # note: `self is False` would compare Python object identity and always be False for a Column
+ isFalsy(self)
+
Returns a Column indicating, element-wise, whether a value is NULL or False (falsy). Each element in the resulting column is True if the corresponding input value is NULL or False, and False otherwise. This is accomplished by performing a bitwise or of the isNull condition and an equality check against a literal False value, then wrapping the result in a when statement.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
self |
+
+ Column
+ |
+ Column object |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ Column object |
+
quinn/extensions/column_ext.py
def isFalsy(self: Column) -> Column:
+ """Returns a Column indicating whether all values in the Column are False or NULL
+ (**falsy**). Each element in the resulting column is True if all the elements in the
+ Column are either NULL or False, or False otherwise. This is accomplished by
+ performing a bitwise or of the ``isNull`` condition and a literal False value and
+ then wrapping the result in a **when** statement.
+
+ :param self: Column object
+ :returns: Column object
+ :rtype: Column
+ """
+ return when(self.isNull() | (self == lit(False)), True).otherwise(False)
+ isNullOrBlank(self)
+
+Returns a Boolean value which expresses whether a given column is null
or
+contains only blank characters.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
\*\*self |
+ + | +The :class: |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ A |
+
quinn/extensions/column_ext.py
def isNullOrBlank(self: Column) -> Column:
+ """Returns a Boolean value which expresses whether a given column is ``null`` or
+ contains only blank characters.
+
+ :param \*\*self: The :class:`Column` to check.
+
+ :returns: A `Column` containing ``True`` if the column is ``null`` or only contains
+ blank characters, or ``False`` otherwise.
+ :rtype: Column
+ """
+
+ return (self.isNull()) | (trim(self) == "")
+ isTrue(self)
+
+This function takes a column of type Column as an argument and returns a column +of type Column.
+It evaluates whether each element in the column argument is equal to True, and +if so will return True, otherwise False.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
self |
+
+ Column
+ |
+ Column object |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ Column object |
+
quinn/extensions/column_ext.py
def isTrue(self: Column) -> Column:
+ """
+ This function takes a column of type Column as an argument and returns a column
+ of type Column.
+
+ It evaluates whether each element in the column argument is equal to True, and
+ if so will return True, otherwise False.
+
+ :param self: Column object
+ :returns: Column object
+ :rtype: Column
+ """
+    return self == lit(True)  # note: `self is True` would compare Python object identity and always be False for a Column
+ isTruthy(self)
+
+Calculates a boolean expression that is the opposite of isFalsy for the given
+Column
self.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
self |
+
+ Column
+ |
+ The |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ A |
+
quinn/extensions/column_ext.py
def isTruthy(self: Column) -> Column:
+ """Calculates a boolean expression that is the opposite of isFalsy for the given
+ ``Column`` self.
+
+ :param Column self: The ``Column`` to calculate the opposite of isFalsy for.
+ :returns: A ``Column`` with the results of the calculation.
+ :rtype: Column
+ """
+ return ~(self.isFalsy())
+ create_df(self, rows_data, col_specs)
+
+Creates a new DataFrame from the given data and column specs. The returned +DataFrame is created using the StructType and StructField classes provided by +PySpark.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
rows_data |
+
+ array-like
+ |
+ the data used to create the DataFrame |
+ + required + | +
col_specs |
+
+ list of tuples
+ |
+ list of tuples containing the name and type of the field |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ DataFrame
+ |
+ a new DataFrame |
+
quinn/extensions/spark_session_ext.py
def create_df(self, rows_data, col_specs):
+ """Creates a new DataFrame from the given data and column specs. The returned
+ DataFrame is created using the StructType and StructField classes provided by
+ PySpark.
+
+ :param rows_data: the data used to create the DataFrame
+ :type rows_data: array-like
+ :param col_specs: list of tuples containing the name and type of the field
+ :type col_specs: list of tuples
+ :return: a new DataFrame
+ :rtype: DataFrame
+ """
+ struct_fields = list(map(lambda x: StructField(*x), col_specs))
+ return self.createDataFrame(data=rows_data, schema=StructType(struct_fields))
+ anti_trim(col)
+
+Removes whitespace from the boundaries of col
using the regexp_replace
+function.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col |
+
+ Column
+ |
+ Column on which to perform the regexp_replace. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ A new Column with all whitespace removed from the boundaries. |
+
quinn/functions.py
def anti_trim(col: Column) -> Column:
+ """Removes whitespace from the boundaries of ``col`` using the regexp_replace
+ function.
+
+ :param col: Column on which to perform the regexp_replace.
+ :type col: Column
+ :return: A new Column with all whitespace removed from the boundaries.
+ :rtype: Column
+ """
+ return F.regexp_replace(col, "\\b\\s+\\b", "")
+ approx_equal(col1, col2, threshold)
+
+Compares two Column
objects by checking if the difference between them
+is less than a specified threshold
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col1 |
+
+ Column
+ |
+ the first |
+ + required + | +
col2 |
+
+ Column
+ |
+ the second |
+ + required + | +
threshold |
+
+ Number
+ |
+ value to compare with |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ Boolean |
+
quinn/functions.py
def approx_equal(col1: Column, col2: Column, threshold: Number) -> Column:
+ """Compares two ``Column`` objects by checking if the difference between them
+ is less than a specified ``threshold``.
+
+ :param col1: the first ``Column``
+ :type col1: Column
+ :param col2: the second ``Column``
+ :type col2: Column
+ :param threshold: value to compare with
+ :type threshold: Number
+ :return: Boolean ``Column`` with ``True`` indicating that ``abs(col1 -
+ col2)`` is less than ``threshold``
+ """
+ return F.abs(col1 - col2) < threshold
+ array_choice(col)
+
+Returns one random element from the given column.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col |
+
+ Column
+ |
+ Column from which element is chosen |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ random element from the given column |
+
quinn/functions.py
def array_choice(col: Column) -> Column:
+ """Returns one random element from the given column.
+
+ :param col: Column from which element is chosen
+ :type col: Column
+ :return: random element from the given column
+ :rtype: Column
+ """
+ index = (F.rand() * F.size(col)).cast("int")
+ return col[index]
+ exists(f)
+
+Create a user-defined function that takes a list expressed as a column of
+type ArrayType(AnyType)
as an argument and returns a boolean value indicating
+whether any element in the list is true according to the argument f
of the
+exists()
function.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
f |
+
+ Callable[[Any], bool]
+ |
+ Callable function - A callable function that takes an element of type Any and returns a boolean value. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ UserDefinedFunction
+ |
+ A user-defined function that takes a list expressed as a column of type ArrayType(AnyType) as an argument and returns a boolean value indicating whether any element in the list is true according to the argument |
+
quinn/functions.py
def exists(f: Callable[[Any], bool]):
+ """
+ Create a user-defined function that takes a list expressed as a column of
+ type ``ArrayType(AnyType)`` as an argument and returns a boolean value indicating
+ whether any element in the list is true according to the argument ``f`` of the
+ ``exists()`` function.
+
+ :param f: Callable function - A callable function that takes an element of
+ type Any and returns a boolean value.
+ :return: A user-defined function that takes
+ a list expressed as a column of type ArrayType(AnyType) as an argument and
+ returns a boolean value indicating whether any element in the list is true
+ according to the argument ``f`` of the ``exists()`` function.
+ :rtype: UserDefinedFunction
+ """
+
+ def temp_udf(l):
+ return any(map(f, l))
+
+ return F.udf(temp_udf, BooleanType())
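A hedged usage sketch; the DataFrame and predicate are illustrative, an active SparkSession named spark is assumed, and it is assumed exists is exported at the package level like the other quinn functions.

import pyspark.sql.functions as F
import quinn

# Hypothetical DataFrame with an array column.
df = spark.createDataFrame([([1, 3, 5],), ([2, 4, 6],)], ["nums"])
df.withColumn("any_even", quinn.exists(lambda n: n % 2 == 0)(F.col("nums"))).show()
# any_even: False for the first row, True for the second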
+ forall(f)
+
+The forall function allows for mapping a given boolean function to a list of +arguments and return a single boolean value as the result of applying the +boolean function to each element of the list. It does this by creating a Spark +UDF which takes in a list of arguments, applying the given boolean function to +each element of the list and returning a single boolean value if all the +elements pass through the given boolean function.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
f |
+
+ Callable[[Any], bool]
+ |
+ A callable function |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ UserDefinedFunction
+ |
+ A spark UDF which accepts a list of arguments and returns True if all elements pass through the given boolean function, False otherwise. |
+
quinn/functions.py
def forall(f: Callable[[Any], bool]):
+ """The **forall** function allows for mapping a given boolean function to a list of
+ arguments and return a single boolean value as the result of applying the
+ boolean function to each element of the list. It does this by creating a Spark
+ UDF which takes in a list of arguments, applying the given boolean function to
+ each element of the list and returning a single boolean value if all the
+ elements pass through the given boolean function.
+
+ :param f: A callable function ``f`` which takes in any type and returns a boolean
+ :return: A spark UDF which accepts a list of arguments and returns True if all
+ elements pass through the given boolean function, False otherwise.
+ :rtype: UserDefinedFunction
+ """
+
+ def temp_udf(l):
+ return all(map(f, l))
+
+ return F.udf(temp_udf, BooleanType())
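A hedged usage sketch mirroring the exists example; the data is illustrative and the same assumptions apply.

import pyspark.sql.functions as F
import quinn

# Hypothetical DataFrame with an array column.
df = spark.createDataFrame([([2, 4, 6],), ([2, 3, 4],)], ["nums"])
df.withColumn("all_even", quinn.forall(lambda n: n % 2 == 0)(F.col("nums"))).show()
# all_even: True for the first row, False for the second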
+ multi_equals(value)
+
+Create a user-defined function that checks if all the given columns have the +designated value.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
value |
+
+ Any
+ |
+ The designated value. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ UserDifinedFunction
+ |
+ A user-defined function of type BooleanType(). |
+
quinn/functions.py
def multi_equals(value: Any):
+ """Create a user-defined function that checks if all the given columns have the
+ designated value.
+
+ :param value: The designated value.
+ :type value: Any
+ :return: A user-defined function of type BooleanType().
+ :rtype: UserDifinedFunction
+ """
+
+ def temp_udf(*cols):
+ return all(map(lambda col: col == value, cols))
+
+ return F.udf(temp_udf, BooleanType())
+ regexp_extract_all(s, regexp)
+
+This function uses the Python re
library to extract regular expressions from a
+string (s
) using a regex pattern (regexp
). It returns a list of all matches, or None
if s
is None
.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
s |
+
+ str
+ |
+ input string ( |
+ + required + | +
regexp |
+
+ str
+ |
+ string |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Optional[List[re.Match]]
+ |
+ List of matches |
+
quinn/functions.py
@F.udf(returnType=ArrayType(StringType()))
+def regexp_extract_all(s: str, regexp: str) -> Optional[List[re.Match]]:
+ """This function uses the Python `re` library to extract regular expressions from a
+ string (`s`) using a regex pattern (`regexp`). It returns a list of all matches, or `None` if `s` is `None`.
+
+ :param s: input string (`Column`)
+ :type s: str
+ :param regexp: string `re` pattern
+ :return: List of matches
+ """
+ return None if s is None else re.findall(regexp, s)
+ remove_all_whitespace(col)
+
+This function takes a Column
object as a parameter and returns a Column
object
+with all white space removed. It does this using the regexp_replace function
+from F, which replaces all whitespace with an empty string.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col |
+
+ Column
+ |
+ a |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ a |
+
quinn/functions.py
def remove_all_whitespace(col: Column) -> Column:
+ """This function takes a `Column` object as a parameter and returns a `Column` object
+ with all white space removed. It does this using the regexp_replace function
+ from F, which replaces all whitespace with an empty string.
+ :param col: a `Column` object
+ :type col: Column
+ :returns: a `Column` object with all white space removed
+ :rtype: Column
+ """
+ return F.regexp_replace(col, "\\s+", "")
+ remove_non_word_characters(col)
+
+Removes non-word characters from a column.
+The non-word characters which will be removed are those identified by the
+regular expression "[^\w\s]+"
. This expression represents any character
+that is not a word character (e.g. \w
) or whitespace (\s
).
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col |
+
+ Column
+ |
+ A Column object. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ A Column object with non-word characters removed. |
+
quinn/functions.py
def remove_non_word_characters(col: Column) -> Column:
+ """Removes non-word characters from a column.
+
+ The non-word characters which will be removed are those identified by the
+ regular expression ``"[^\\w\\s]+"``. This expression represents any character
+ that is not a word character (e.g. `\w`) or whitespace (`\s`).
+
+ :param col: A Column object.
+ :return: A Column object with non-word characters removed.
+
+ """
+ return F.regexp_replace(col, "[^\\w\\s]+", "")
+ single_space(col)
+
+This function takes a column and replaces all the multiple white spaces with a +single space. It then trims the column to make all the texts consistent.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col |
+
+ Column
+ |
+ The column which needs to be spaced |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ A trimmed column with single space |
+
quinn/functions.py
def single_space(col: Column) -> Column:
+ """This function takes a column and replaces all the multiple white spaces with a
+ single space. It then trims the column to make all the texts consistent.
+ :param col: The column which needs to be spaced
+ :type col: Column
+ :returns: A trimmed column with single space
+ :rtype: Column
+ """
+ return F.trim(F.regexp_replace(col, " +", " "))
+ uuid5(col, namespace=uuid.NAMESPACE_DNS, extra_string='')
+
+This function generates UUIDv5 from col
and namespace
, optionally prepending an extra string to col
.
Sets variant to RFC 4122 one.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col |
+
+ Column
+ |
+ Column that will be hashed. |
+ + required + | +
namespace |
+
+ uuid.UUID
+ |
+ Namespace to be used. (default: |
+
+ uuid.NAMESPACE_DNS
+ |
+
extra_string |
+
+ str
+ |
+ In case of collisions one can pass an extra string to hash on. |
+
+ ''
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ String representation of generated UUIDv5 |
+
quinn/functions.py
def uuid5(col: Column, namespace: uuid.UUID = uuid.NAMESPACE_DNS, extra_string: str = "") -> Column:
+ """This function generates UUIDv5 from ``col`` and ``namespace``, optionally prepending an extra string to ``col``.
+
+ Sets variant to RFC 4122 one.
+
+ :param col: Column that will be hashed.
+ :type col: Column
+ :param namespace: Namespace to be used. (default: `uuid.NAMESPACE_DNS`)
+ :type namespace: str
+ :param extra_string: In case of collisions one can pass an extra string to hash on.
+ :type extra_string: str
+ :return: String representation of generated UUIDv5
+ :rtype: Column
+ """
+ ns = F.lit(namespace.bytes)
+ salted_col = F.concat(F.lit(extra_string), col)
+ encoded = F.encode(salted_col, "utf-8")
+ encoded_with_ns = F.concat(ns, encoded)
+ hashed = F.sha1(encoded_with_ns)
+ variant_part = F.substring(hashed, 17, 4)
+ variant_part = F.conv(variant_part, 16, 2)
+ variant_part = F.lpad(variant_part, 16, "0")
+ variant_part = F.concat(F.lit("10"), F.substring(variant_part, 3, 16)) # RFC 4122 variant.
+ variant_part = F.lower(F.conv(variant_part, 2, 16))
+ return F.concat_ws(
+ "-",
+ F.substring(hashed, 1, 8),
+ F.substring(hashed, 9, 4),
+ F.concat(F.lit("5"), F.substring(hashed, 14, 3)), # Set version.
+ variant_part,
+ F.substring(hashed, 21, 12),
+ )
+ week_end_date(col, week_end_day='Sat')
+
+Returns a date column for the end of week for a given day.
+The Spark function dayofweek
considers Sunday as the first day of the week, and
+uses the default value of 1 to indicate Sunday. Usage of the when
and otherwise
+functions allow a comparison between the end of week day indicated and the day
+of week computed, and the return of the reference date if they match or the the
+addition of one week to the reference date otherwise.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col |
+
+ Column
+ |
+ The reference date column. |
+ + required + | +
week_end_day |
+
+ str
+ |
+ The week end day (default: 'Sat') |
+
+ 'Sat'
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ A Column of end of the week dates. |
+
quinn/functions.py
def week_end_date(col: Column, week_end_day: str = "Sat") -> Column:
+ """
+ Returns a date column for the end of week for a given day.
+
+ The Spark function `dayofweek` considers Sunday as the first day of the week, and
+ uses the default value of 1 to indicate Sunday. Usage of the `when` and `otherwise`
+ functions allow a comparison between the end of week day indicated and the day
+ of week computed, and the return of the reference date if they match or the the
+ addition of one week to the reference date otherwise.
+
+ :param col: The reference date column.
+ :type col: Column
+ :param week_end_day: The week end day (default: 'Sat')
+ :type week_end_day: str
+ :return: A Column of end of the week dates.
+ :rtype: Column
+ """
+ _raise_if_invalid_day(week_end_day)
+ # these are the default Spark mappings. Spark considers Sunday the first day of the week.
+ day_of_week_mapping = {
+ "Sun": 1,
+ "Mon": 2,
+ "Tue": 3,
+ "Wed": 4,
+ "Thu": 5,
+ "Fri": 6,
+ "Sat": 7,
+ }
+ return F.when(
+ F.dayofweek(col).eqNullSafe(F.lit(day_of_week_mapping[week_end_day])), col
+ ).otherwise(F.next_day(col, week_end_day))
+ week_start_date(col, week_start_day='Sun')
+
+This function takes a Spark Column
and an optional week_start_day
string
+argument and returns a Column
with the corresponding start of week dates. The
+"standard week" in Spark starts on Sunday, however an optional argument can be
+used to start the week from a different day, e.g. Monday. The week_start_day
+argument is a string corresponding to the day of the week to start the week
+from, e.g. "Mon"
, "Tue"
, and must be in the set: {"Sun", "Mon", "Tue", "Wed",
+"Thu", "Fri", "Sat"}
. If the argument given is not a valid day then a ValueError
+will be raised.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
col |
+
+ Column
+ |
+ The column to determine start of week dates on |
+ + required + | +
week_start_day |
+
+ str
+ |
+ The day to start the week on |
+
+ 'Sun'
+ |
+
Returns:
+Type | +Description | +
---|---|
+ Column
+ |
+ A Column with start of week dates |
+
quinn/functions.py
def week_start_date(col: Column, week_start_day: str = "Sun") -> Column:
+ """This function takes a Spark `Column` and an optional `week_start_day` string
+ argument and returns a `Column` with the corresponding start of week dates. The
+ "standard week" in Spark starts on Sunday, however an optional argument can be
+ used to start the week from a different day, e.g. Monday. The `week_start_day`
+ argument is a string corresponding to the day of the week to start the week
+ from, e.g. `"Mon"`, `"Tue"`, and must be in the set: `{"Sun", "Mon", "Tue", "Wed",
+ "Thu", "Fri", "Sat"}`. If the argument given is not a valid day then a `ValueError`
+ will be raised.
+
+ :param col: The column to determine start of week dates on
+ :type col: Column
+ :param week_start_day: The day to start the week on
+ :type week_start_day: str
+ :returns: A Column with start of week dates
+ :rtype: Column
+ """
+ _raise_if_invalid_day(week_start_day)
+ # the "standard week" in Spark is from Sunday to Saturday
+ mapping = {
+ "Sun": "Sat",
+ "Mon": "Sun",
+ "Tue": "Mon",
+ "Wed": "Tue",
+ "Thu": "Wed",
+ "Fri": "Thu",
+ "Sat": "Fri",
+ }
+ end = week_end_date(col, mapping[week_start_day])
+ return F.date_add(end, -6)
+ print_schema_as_code(dtype)
+
+Represent DataType (including StructType) as valid Python code.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
dtype |
+
+ T.DataType
+ |
+ The input DataType or Schema object |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ str
+ |
+ A valid python code which generate the same schema. |
+
quinn/schema_helpers.py
def print_schema_as_code(dtype: T.DataType) -> str:
+ """Represent DataType (including StructType) as valid Python code.
+
+ :param dtype: The input DataType or Schema object
+ :type dtype: pyspark.sql.types.DataType
+ :return: A valid python code which generate the same schema.
+ :rtype: str
+ """
+ res = []
+ if isinstance(dtype, T.StructType):
+ res.append("StructType(\n\tfields=[")
+ for field in dtype.fields:
+ for line in _repr_column(field).split("\n"):
+ res.append("\n\t\t")
+ res.append(line)
+ res.append(",")
+ res.append("\n\t]\n)")
+
+ elif isinstance(dtype, T.ArrayType):
+ res.append("ArrayType(")
+ res.append(print_schema_as_code(dtype.elementType))
+ res.append(")")
+
+ elif isinstance(dtype, T.MapType):
+ res.append("MapType(")
+ res.append(f"\n\t{print_schema_as_code(dtype.keyType)},")
+ for line in print_schema_as_code(dtype.valueType).split("\n"):
+ res.append("\n\t")
+ res.append(line)
+ res.append(",")
+ res.append(f"\n\t{dtype.valueContainsNull},")
+ res.append("\n)")
+
+ elif isinstance(dtype, T.DecimalType):
+ res.append(f"DecimalType({dtype.precision}, {dtype.scale})")
+
+ else:
+ if str(dtype).endswith("()"): # PySpark 3.3+
+ res.append(str(dtype))
+ else:
+ res.append(f"{dtype}()")
+
+ return "".join(res)
+ snake_case_col_names(df)
+
+This function takes a DataFrame
instance and returns the
+same DataFrame
instance with all column names converted to snake case
+(e.g. col_name_1
). It uses the to_snake_case
function in conjunction with
+the with_columns_renamed
function to achieve this.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df |
+
+ DataFrame
+ |
+ A |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ ``DataFrame``
+ |
+ A |
+
quinn/transformations.py
def snake_case_col_names(df: DataFrame) -> DataFrame:
+ """This function takes a ``DataFrame`` instance and returns the
+ same ``DataFrame`` instance with all column names converted to snake case
+ (e.g. ``col_name_1``). It uses the ``to_snake_case`` function in conjunction with
+ the ``with_columns_renamed`` function to achieve this.
+ :param df: A ``DataFrame`` instance to process
+ :type df: ``DataFrame``
+ :return: A ``DataFrame`` instance with column names converted to snake case
+ :rtype: ``DataFrame``
+ """
+ return with_columns_renamed(to_snake_case)(df)
+ sort_columns(df, sort_order)
+
+This function sorts the columns of a given DataFrame based on a given sort
+order. The sort_order
parameter can either be asc
or desc
, which correspond to
+ascending and descending order, respectively. If any other value is provided for
+the sort_order
parameter, a ValueError
will be raised.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
df |
+
+ DataFrame
+ |
+ A DataFrame |
+ + required + | +
sort_order |
+
+ str
+ |
+ The order in which to sort the columns in the DataFrame |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ pandas.DataFrame
+ |
+ A DataFrame with the columns sorted in the chosen order |
+
quinn/transformations.py
def sort_columns(df: DataFrame, sort_order: str) -> DataFrame:
+ """This function sorts the columns of a given DataFrame based on a given sort
+ order. The ``sort_order`` parameter can either be ``asc`` or ``desc``, which correspond to
+ ascending and descending order, respectively. If any other value is provided for
+ the ``sort_order`` parameter, a ``ValueError`` will be raised.
+
+ :param df: A DataFrame
+ :type df: pandas.DataFrame
+ :param sort_order: The order in which to sort the columns in the DataFrame
+ :type sort_order: str
+ :return: A DataFrame with the columns sorted in the chosen order
+ :rtype: pandas.DataFrame
+ """
+ sorted_col_names = None
+ if sort_order == "asc":
+ sorted_col_names = sorted(df.columns)
+ elif sort_order == "desc":
+ sorted_col_names = sorted(df.columns, reverse=True)
+ else:
+ raise ValueError(
+ "['asc', 'desc'] are the only valid sort orders and you entered a sort order of '{sort_order}'".format(
+ sort_order=sort_order
+ )
+ )
+ return df.select(*sorted_col_names)
+ to_snake_case(s)
+
+Takes a string and converts it to snake case format.
+ +Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
s |
+
+ str
+ |
+ The string to be converted. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ str
+ |
+ The string in snake case format. |
+
quinn/transformations.py
def to_snake_case(s: str) -> str:
+ """Takes a string and converts it to snake case format.
+
+ :param s: The string to be converted.
+ :type s: str
+ :return: The string in snake case format.
+ :rtype: str
+ """
+ return s.lower().replace(" ", "_")
+ with_columns_renamed(fun)
+
+This is a function designed to rename the columns of a
+Spark DataFrame
.
It takes a Callable[[str], str]
object as an argument (fun
) and returns a
+Callable[[DataFrame], DataFrame]
object.
When _()
is called on a DataFrame
, it creates a list of column names,
+applying the argument fun()
to each of them, and returning a new DataFrame
+with the new column names.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fun |
+
+ Callable[[str], str]
+ |
+ Renaming function |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ Callable[[DataFrame], DataFrame]
+ |
+ Function which takes DataFrame as parameter. |
+
quinn/transformations.py
def with_columns_renamed(fun: Callable[[str], str]) -> Callable[[DataFrame], DataFrame]:
+ """This is a function designed to rename the columns of a
+ `Spark DataFrame`.
+
+ It takes a `Callable[[str], str]` object as an argument (``fun``) and returns a
+ `Callable[[DataFrame], DataFrame]` object.
+
+ When `_()` is called on a `DataFrame`, it creates a list of column names,
+ applying the argument `fun()` to each of them, and returning a new `DataFrame`
+ with the new column names.
+
+ :param fun: Renaming function
+ :returns: Function which takes DataFrame as parameter.
+ """
+ def _(df: DataFrame) -> DataFrame:
+ cols = list(
+ map(
+ lambda col_name: F.col("`{0}`".format(col_name)).alias(fun(col_name)),
+ df.columns,
+ )
+ )
+ return df.select(*cols)
+
+ return _
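A hedged sketch of applying a renaming function to every column; the renaming rule and column names are illustrative, an active SparkSession named spark is assumed, and it is assumed the function is exported at the package level like the other transformations.

import quinn

# Hypothetical DataFrame with dotted column names (the backtick quoting above makes these safe).
df = spark.createDataFrame([(1, 2)], ["user.id", "user.score"])
renamed_df = quinn.with_columns_renamed(lambda name: name.replace(".", "_"))(df)
# renamed_df.columns == ["user_id", "user_score"]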
+ with_some_columns_renamed(fun, change_col_name)
+
+A function that takes a Callable[[str], str]
and a Callable[[str], str]
+and returns a Callable[[DataFrame], DataFrame]
, which in turn takes a
+DataFrame
and returns a DataFrame
with some of its columns renamed.
Parameters:
+Name | +Type | +Description | +Default | +
---|---|---|---|
fun |
+
+ Callable[[str], str]
+ |
+ A function that takes a column name as a string and returns a new name as a string. |
+ + required + | +
change_col_name |
+
+ Callable[[str], str]
+ |
+ A function that takes a column name as a string and returns a boolean. |
+ + required + | +
Returns:
+Type | +Description | +
---|---|
+ `Callable[[DataFrame], DataFrame]`
+ |
+ A |
+
quinn/transformations.py
def with_some_columns_renamed(
+ fun: Callable[[str], str], change_col_name: Callable[[str], str]
+) -> Callable[[DataFrame], DataFrame]:
+ """A function that takes a `Callable[[str], str]` and a `Callable[[str], str]`
+ and returns a `Callable[[DataFrame], DataFrame]`, which in turn takes a
+ `DataFrame` and returns a `DataFrame` with some of its columns renamed.
+
+ :param fun: A function that takes a column name as a string and returns a
+ new name as a string.
+ :type fun: `Callable[[str], str]`
+ :param change_col_name: A function that takes a column name as a string and
+ returns a boolean.
+ :type change_col_name: `Callable[[str], str]`
+ :return: A `Callable[[DataFrame], DataFrame]`, which takes a
+ `DataFrame` and returns a `DataFrame` with some of its columns renamed.
+ :rtype: `Callable[[DataFrame], DataFrame]`
+ """
+ def _(df):
+ cols = list(
+ map(
+ lambda col_name: F.col("`{0}`".format(col_name)).alias(fun(col_name))
+ if change_col_name(col_name)
+ else F.col("`{0}`".format(col_name)),
+ df.columns,
+ )
+ )
+ return df.select(*cols)
+
+ return _
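A hedged sketch that renames only the columns matching a predicate; the names and rules are illustrative and the same assumptions as the previous sketch apply.

import quinn

# Hypothetical DataFrame with mixed column naming.
df = spark.createDataFrame([(1, "a", "b")], ["ID", "raw label", "notes"])
renamed_df = quinn.with_some_columns_renamed(
    lambda name: name.lower().replace(" ", "_"),        # how to rename
    lambda name: name != name.lower() or " " in name,   # which columns to rename
)(df)
# renamed_df.columns == ["id", "raw_label", "notes"]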
+ ' + escapeHtml(summary) +'
' + noResultsText + '
'); + } +} + +function doSearch () { + var query = document.getElementById('mkdocs-search-query').value; + if (query.length > min_search_length) { + if (!window.Worker) { + displayResults(search(query)); + } else { + searchWorker.postMessage({query: query}); + } + } else { + // Clear results for short queries + displayResults([]); + } +} + +function initSearch () { + var search_input = document.getElementById('mkdocs-search-query'); + if (search_input) { + search_input.addEventListener("keyup", doSearch); + } + var term = getSearchTermFromLocation(); + if (term) { + search_input.value = term; + doSearch(); + } +} + +function onWorkerMessage (e) { + if (e.data.allowSearch) { + initSearch(); + } else if (e.data.results) { + var results = e.data.results; + displayResults(results); + } else if (e.data.config) { + min_search_length = e.data.config.min_search_length-1; + } +} + +if (!window.Worker) { + console.log('Web Worker API not supported'); + // load index in main thread + $.getScript(joinUrl(base_url, "search/worker.js")).done(function () { + console.log('Loaded worker'); + init(); + window.postMessage = function (msg) { + onWorkerMessage({data: msg}); + }; + }).fail(function (jqxhr, settings, exception) { + console.error('Could not load worker.js'); + }); +} else { + // Wrap search in a web worker + var searchWorker = new Worker(joinUrl(base_url, "search/worker.js")); + searchWorker.postMessage({init: true}); + searchWorker.onmessage = onWorkerMessage; +} diff --git a/search/search_index.json b/search/search_index.json new file mode 100644 index 00000000..c1b12924 --- /dev/null +++ b/search/search_index.json @@ -0,0 +1 @@ +{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Quinn Pyspark helper methods to maximize developer productivity. Quinn provides DataFrame validation functions, useful column functions / DataFrame transformations, and performant helper functions. Setup Quinn is uploaded to PyPi and can be installed with this command: pip install quinn Quinn Helper Functions import quinn DataFrame Validations validate_presence_of_columns() quinn.validate_presence_of_columns(source_df, [\"name\", \"age\", \"fun\"]) Raises an exception unless source_df contains the name , age , and fun column. validate_schema() quinn.validate_schema(source_df, required_schema) Raises an exception unless source_df contains all the StructFields defined in the required_schema . validate_absence_of_columns() quinn.validate_absence_of_columns(source_df, [\"age\", \"cool\"]) Raises an exception if source_df contains age or cool columns. Functions single_space() actual_df = source_df.withColumn( \"words_single_spaced\", quinn.single_space(col(\"words\")) ) Replaces all multispaces with single spaces (e.g. changes \"this has some\" to \"this has some\" . remove_all_whitespace() actual_df = source_df.withColumn( \"words_without_whitespace\", quinn.remove_all_whitespace(col(\"words\")) ) Removes all whitespace in a string (e.g. changes \"this has some\" to \"thishassome\" . anti_trim() actual_df = source_df.withColumn( \"words_anti_trimmed\", quinn.anti_trim(col(\"words\")) ) Removes all inner whitespace, but doesn't delete leading or trailing whitespace (e.g. changes \" this has some \" to \" thishassome \" . remove_non_word_characters() actual_df = source_df.withColumn( \"words_without_nonword_chars\", quinn.remove_non_word_characters(col(\"words\")) ) Removes all non-word characters from a string (e.g. 
changes \"si%$#@!#$!@#mpsons\" to \"simpsons\" . multi_equals() source_df.withColumn( \"are_s1_and_s2_cat\", quinn.multi_equals(\"cat\")(col(\"s1\"), col(\"s2\")) ) multi_equals returns true if s1 and s2 are both equal to \"cat\" . approx_equal() This function takes 3 arguments which are 2 Pyspark DataFrames and one integer values as threshold, and returns the Boolean column which tells if the columns are equal in the threshold. let the columns be col1 = [1.2, 2.5, 3.1, 4.0, 5.5] col2 = [1.3, 2.3, 3.0, 3.9, 5.6] threshold = 0.2 result = approx_equal(col(\"col1\"), col(\"col2\"), threshold) result.show() +-----+ |value| +-----+ | true| |false| | true| | true| | true| +-----+ array_choice() This function takes a Column as a parameter and returns a PySpark column that contains a random value from the input column parameter df = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,)], [\"values\"]) result = df.select(array_choice(col(\"values\"))) The output is := +--------------+ |array_choice()| +--------------+ | 2| +--------------+ regexp_extract_all() The regexp_extract_all takes 2 parameters String s and regexp which is a regular expression. This function finds all the matches for the string which satisfies the regular expression. print(regexp_extract_all(\"this is a example text message for testing application\",r\"\\b\\w*a\\w*\\b\")) The output is := ['a', 'example', 'message', 'application'] Where r\"\\b\\w*a\\w*\\b\" pattern checks for words containing letter a week_start_date() It takes 2 parameters, column and week_start_day. It returns a Spark Dataframe column which contains the start date of the week. By default the week_start_day is set to \"Sun\". For input [\"2023-03-05\", \"2023-03-06\", \"2023-03-07\", \"2023-03-08\"] the Output is result = df.select(\"date\", week_start_date(col(\"date\"), \"Sun\")) result.show() +----------+----------------+ | date|week_start_date | +----------+----------------+ |2023-03-05| 2023-03-05| |2023-03-07| 2023-03-05| |2023-03-08| 2023-03-05| +----------+----------------+ week_end_date() It also takes 2 Paramters as Column and week_end_day, and returns the dateframe column which contains the end date of the week. By default the week_end_day is set to \"sat\" +---------+-------------+ date|week_end_date| +---------+-------------+ 2023-03-05| 2023-03-05| 2023-03-07| 2023-03-12| 2023-03-08| 2023-03-12| +---------+-------------+ uuid5() This function generates UUIDv5 in string form from the passed column and optionally namespace and optional extra salt. By default namespace is NAMESPACE_DNS UUID and no extra string used to reduce hash collisions. df = spark.createDataFrame([(\"lorem\",), (\"ipsum\",)], [\"values\"]) result = df.select(quinn.uuid5(F.col(\"values\")).alias(\"uuid5\")) result.show(truncate=False) The output is := +------------------------------------+ |uuid5 | +------------------------------------+ |35482fda-c10a-5076-8da2-dc7bf22d6be4| |51b79c1d-d06c-5b30-a5c6-1fadcd3b2103| +------------------------------------+ Transformations snake_case_col_names() quinn.snake_case_col_names(source_df) Converts all the column names in a DataFrame to snake_case. It's annoying to write SQL queries when columns aren't snake cased. sort_columns() quinn.sort_columns(source_df, \"asc\") Sorts the DataFrame columns in alphabetical order. Wide DataFrames are easier to navigate when they're sorted alphabetically. DataFrame Helpers column_to_list() quinn.column_to_list(source_df, \"name\") Converts a column in a DataFrame to a list of values. 
two_columns_to_dictionary() quinn.two_columns_to_dictionary(source_df, \"name\", \"age\") Converts two columns of a DataFrame into a dictionary. In this example, name is the key and age is the value. to_list_of_dictionaries() quinn.to_list_of_dictionaries(source_df) Converts an entire DataFrame into a list of dictionaries. Pyspark Core Class Extensions from quinn.extensions import * SparkSession Extensions create_df() spark.create_df( [(\"jose\", \"a\"), (\"li\", \"b\"), (\"sam\", \"c\")], [(\"name\", StringType(), True), (\"blah\", StringType(), True)] ) Creates a DataFrame with a syntax that's less verbose than the built-in createDataFrame method. Column Extensions isFalsy() source_df.withColumn(\"is_stuff_falsy\", F.col(\"has_stuff\").isFalsy()) Returns True if has_stuff is None or False . isTruthy() source_df.withColumn(\"is_stuff_truthy\", F.col(\"has_stuff\").isTruthy()) Returns True unless has_stuff is None or False . isNullOrBlank() source_df.withColumn(\"is_blah_null_or_blank\", F.col(\"blah\").isNullOrBlank()) Returns True if blah is null or blank (the empty string or a string that only contains whitespace). isNotIn() source_df.withColumn(\"is_not_bobs_hobby\", F.col(\"fun_thing\").isNotIn(bobs_hobbies)) Returns True if fun_thing is not included in the bobs_hobbies list. nullBetween() source_df.withColumn(\"is_between\", F.col(\"age\").nullBetween(F.col(\"lower_age\"), F.col(\"upper_age\"))) Returns True if age is between lower_age and upper_age . If lower_age is populated and upper_age is null , it will return True if age is greater than or equal to lower_age . If lower_age is null and upper_age is populated, it will return True if age is lower than or equal to upper_age . Contributing We are actively looking for feature requests, pull requests, and bug fixes. Any developer that demonstrates excellence will be invited to be a maintainer of the project. Code Style We are using PySpark code-style and sphinx as the docstring format. For more details about the sphinx format see this tutorial . A short example of a sphinx -formatted docstring is placed below: \"\"\"[Summary] :param [ParamName]: [ParamDescription], defaults to [DefaultParamVal] :type [ParamName]: [ParamType](, optional) ... :raises [ErrorType]: [ErrorDescription] ... :return: [ReturnDescription] :rtype: [ReturnType] \"\"\"","title":"Quinn"},{"location":"#quinn","text":"Pyspark helper methods to maximize developer productivity. Quinn provides DataFrame validation functions, useful column functions / DataFrame transformations, and performant helper functions.","title":"Quinn"},{"location":"#setup","text":"Quinn is uploaded to PyPi and can be installed with this command: pip install quinn","title":"Setup"},{"location":"#quinn-helper-functions","text":"import quinn","title":"Quinn Helper Functions"},{"location":"#dataframe-validations","text":"validate_presence_of_columns() quinn.validate_presence_of_columns(source_df, [\"name\", \"age\", \"fun\"]) Raises an exception unless source_df contains the name , age , and fun columns. validate_schema() quinn.validate_schema(source_df, required_schema) Raises an exception unless source_df contains all the StructFields defined in the required_schema .
validate_absence_of_columns() quinn.validate_absence_of_columns(source_df, [\"age\", \"cool\"]) Raises an exception if source_df contains age or cool columns.","title":"DataFrame Validations"},{"location":"#functions","text":"single_space() actual_df = source_df.withColumn( \"words_single_spaced\", quinn.single_space(col(\"words\")) ) Replaces all multispaces with single spaces (e.g. changes \"this has some\" to \"this has some\" ). remove_all_whitespace() actual_df = source_df.withColumn( \"words_without_whitespace\", quinn.remove_all_whitespace(col(\"words\")) ) Removes all whitespace in a string (e.g. changes \"this has some\" to \"thishassome\" ). anti_trim() actual_df = source_df.withColumn( \"words_anti_trimmed\", quinn.anti_trim(col(\"words\")) ) Removes all inner whitespace, but doesn't delete leading or trailing whitespace (e.g. changes \" this has some \" to \" thishassome \" ). remove_non_word_characters() actual_df = source_df.withColumn( \"words_without_nonword_chars\", quinn.remove_non_word_characters(col(\"words\")) ) Removes all non-word characters from a string (e.g. changes \"si%$#@!#$!@#mpsons\" to \"simpsons\" ). multi_equals() source_df.withColumn( \"are_s1_and_s2_cat\", quinn.multi_equals(\"cat\")(col(\"s1\"), col(\"s2\")) ) multi_equals returns true if s1 and s2 are both equal to \"cat\" . approx_equal() This function takes 3 arguments, which are 2 PySpark Columns and a numeric threshold, and returns a Boolean column which tells whether the two columns are equal within the threshold. Let the columns be col1 = [1.2, 2.5, 3.1, 4.0, 5.5] col2 = [1.3, 2.3, 3.0, 3.9, 5.6] threshold = 0.2 result = approx_equal(col(\"col1\"), col(\"col2\"), threshold) result.show() +-----+ |value| +-----+ | true| |false| | true| | true| | true| +-----+ array_choice() This function takes a Column as a parameter and returns a PySpark column that contains a random value from the input column. df = spark.createDataFrame([(1,), (2,), (3,), (4,), (5,)], [\"values\"]) result = df.select(array_choice(col(\"values\"))) The output is: +--------------+ |array_choice()| +--------------+ | 2| +--------------+ regexp_extract_all() The regexp_extract_all takes 2 parameters, a String s and a regexp which is a regular expression. This function finds all the matches in the string that satisfy the regular expression. print(regexp_extract_all(\"this is a example text message for testing application\",r\"\\b\\w*a\\w*\\b\")) The output is: ['a', 'example', 'message', 'application'] Where the r\"\\b\\w*a\\w*\\b\" pattern checks for words containing the letter a . week_start_date() It takes 2 parameters, column and week_start_day. It returns a Spark DataFrame column which contains the start date of the week. By default the week_start_day is set to \"Sun\". For input [\"2023-03-05\", \"2023-03-06\", \"2023-03-07\", \"2023-03-08\"] the output is result = df.select(\"date\", week_start_date(col(\"date\"), \"Sun\")) result.show() +----------+----------------+ | date|week_start_date | +----------+----------------+ |2023-03-05| 2023-03-05| |2023-03-06| 2023-03-05| |2023-03-07| 2023-03-05| |2023-03-08| 2023-03-05| +----------+----------------+ week_end_date() It also takes 2 parameters, a Column and week_end_day, and returns a DataFrame column which contains the end date of the week.
By default the week_end_day is set to \"Sat\" +---------+-------------+ | date|week_end_date| +---------+-------------+ |2023-03-05| 2023-03-05| |2023-03-07| 2023-03-12| |2023-03-08| 2023-03-12| +---------+-------------+ uuid5() This function generates UUIDv5 in string form from the passed column and, optionally, a namespace and an extra salt. By default the namespace is the NAMESPACE_DNS UUID and no extra string is used to reduce hash collisions. df = spark.createDataFrame([(\"lorem\",), (\"ipsum\",)], [\"values\"]) result = df.select(quinn.uuid5(F.col(\"values\")).alias(\"uuid5\")) result.show(truncate=False) The output is: +------------------------------------+ |uuid5 | +------------------------------------+ |35482fda-c10a-5076-8da2-dc7bf22d6be4| |51b79c1d-d06c-5b30-a5c6-1fadcd3b2103| +------------------------------------+","title":"Functions"},{"location":"#transformations","text":"snake_case_col_names() quinn.snake_case_col_names(source_df) Converts all the column names in a DataFrame to snake_case. It's annoying to write SQL queries when columns aren't snake cased. sort_columns() quinn.sort_columns(source_df, \"asc\") Sorts the DataFrame columns in alphabetical order. Wide DataFrames are easier to navigate when they're sorted alphabetically.","title":"Transformations"},{"location":"#dataframe-helpers","text":"column_to_list() quinn.column_to_list(source_df, \"name\") Converts a column in a DataFrame to a list of values. two_columns_to_dictionary() quinn.two_columns_to_dictionary(source_df, \"name\", \"age\") Converts two columns of a DataFrame into a dictionary. In this example, name is the key and age is the value. to_list_of_dictionaries() quinn.to_list_of_dictionaries(source_df) Converts an entire DataFrame into a list of dictionaries.","title":"DataFrame Helpers"},{"location":"#pyspark-core-class-extensions","text":"from quinn.extensions import *","title":"Pyspark Core Class Extensions"},{"location":"#sparksession-extensions","text":"create_df() spark.create_df( [(\"jose\", \"a\"), (\"li\", \"b\"), (\"sam\", \"c\")], [(\"name\", StringType(), True), (\"blah\", StringType(), True)] ) Creates a DataFrame with a syntax that's less verbose than the built-in createDataFrame method.","title":"SparkSession Extensions"},{"location":"#column-extensions","text":"isFalsy() source_df.withColumn(\"is_stuff_falsy\", F.col(\"has_stuff\").isFalsy()) Returns True if has_stuff is None or False . isTruthy() source_df.withColumn(\"is_stuff_truthy\", F.col(\"has_stuff\").isTruthy()) Returns True unless has_stuff is None or False . isNullOrBlank() source_df.withColumn(\"is_blah_null_or_blank\", F.col(\"blah\").isNullOrBlank()) Returns True if blah is null or blank (the empty string or a string that only contains whitespace). isNotIn() source_df.withColumn(\"is_not_bobs_hobby\", F.col(\"fun_thing\").isNotIn(bobs_hobbies)) Returns True if fun_thing is not included in the bobs_hobbies list. nullBetween() source_df.withColumn(\"is_between\", F.col(\"age\").nullBetween(F.col(\"lower_age\"), F.col(\"upper_age\"))) Returns True if age is between lower_age and upper_age . If lower_age is populated and upper_age is null , it will return True if age is greater than or equal to lower_age . If lower_age is null and upper_age is populated, it will return True if age is lower than or equal to upper_age .","title":"Column Extensions"},{"location":"#contributing","text":"We are actively looking for feature requests, pull requests, and bug fixes.
Any developer that demonstrates excellence will be invited to be a maintainer of the project.","title":"Contributing"},{"location":"#code-style","text":"We are using PySpark code-style and sphinx as docstrings format. For more details about sphinx format see this tutorial . A short example of sphinx -formated docstring is placed below: \"\"\"[Summary] :param [ParamName]: [ParamDescription], defaults to [DefaultParamVal] :type [ParamName]: [ParamType](, optional) ... :raises [ErrorType]: [ErrorDescription] ... :return: [ReturnDescription] :rtype: [ReturnType] \"\"\"","title":"Code Style"},{"location":"reference/SUMMARY/","text":"quinn append_if_schema_identical dataframe_helpers dataframe_validator extensions column_ext dataframe_ext spark_session_ext functions scala_to_pyspark schema_helpers spark transformations","title":"API Docs"},{"location":"reference/quinn/","text":"","title":"Index"},{"location":"reference/quinn/append_if_schema_identical/","text":"SchemaMismatchError Bases: ValueError raise this when there's a schema mismatch between source & target schema Source code in quinn/append_if_schema_identical.py class SchemaMismatchError(ValueError): \"\"\"raise this when there's a schema mismatch between source & target schema\"\"\" append_if_schema_identical(source_df, target_df) Compares the schema of source & target dataframe . Parameters: Name Type Description Default source_df DataFrame Input DataFrame required target_df DataFrame Input DataFrame required Returns: Type Description pyspark.sql.DataFrame dataframe Source code in quinn/append_if_schema_identical.py def append_if_schema_identical(source_df: DataFrame, target_df: DataFrame) -> DataFrame: \"\"\"Compares the schema of source & target dataframe . :param source_df: Input DataFrame :type source_df: pyspark.sql.DataFrame :param target_df: Input DataFrame :type target_df: pyspark.sql.DataFrame :return: dataframe :rtype: pyspark.sql.DataFrame \"\"\" # Retrieve the schemas of the source and target dataframes source_schema = source_df.schema target_schema = target_df.schema # Convert the schemas to a list of tuples source_schema_list = [(field.name, str(field.dataType)) for field in source_schema] target_schema_list = [(field.name, str(field.dataType)) for field in target_schema] unmatched_cols = [col for col in source_schema_list if col not in target_schema_list] error_message = f\"The schemas of the source and target dataframes are not identical.\" \\ f\"From source schema column {unmatched_cols} is missing in target schema\" # Check if the column names in the source and target schemas are the same, regardless of their order if set(source_schema.fieldNames()) != set(target_schema.fieldNames()): raise SchemaMismatchError(error_message) # Check if the column names and data types in the source and target schemas are the same, in the same order if sorted(source_schema_list) != sorted(target_schema_list): raise SchemaMismatchError(error_message) # Append the dataframes if the schemas are identical appended_df = target_df.unionByName(source_df) return appended_df","title":"Append if schema identical"},{"location":"reference/quinn/append_if_schema_identical/#quinn.append_if_schema_identical.SchemaMismatchError","text":"Bases: ValueError raise this when there's a schema mismatch between source & target schema Source code in quinn/append_if_schema_identical.py class SchemaMismatchError(ValueError): \"\"\"raise this when there's a schema mismatch between source & target 
schema\"\"\"","title":"SchemaMismatchError"},{"location":"reference/quinn/append_if_schema_identical/#quinn.append_if_schema_identical.append_if_schema_identical","text":"Compares the schema of source & target dataframe . Parameters: Name Type Description Default source_df DataFrame Input DataFrame required target_df DataFrame Input DataFrame required Returns: Type Description pyspark.sql.DataFrame dataframe Source code in quinn/append_if_schema_identical.py def append_if_schema_identical(source_df: DataFrame, target_df: DataFrame) -> DataFrame: \"\"\"Compares the schema of source & target dataframe . :param source_df: Input DataFrame :type source_df: pyspark.sql.DataFrame :param target_df: Input DataFrame :type target_df: pyspark.sql.DataFrame :return: dataframe :rtype: pyspark.sql.DataFrame \"\"\" # Retrieve the schemas of the source and target dataframes source_schema = source_df.schema target_schema = target_df.schema # Convert the schemas to a list of tuples source_schema_list = [(field.name, str(field.dataType)) for field in source_schema] target_schema_list = [(field.name, str(field.dataType)) for field in target_schema] unmatched_cols = [col for col in source_schema_list if col not in target_schema_list] error_message = f\"The schemas of the source and target dataframes are not identical.\" \\ f\"From source schema column {unmatched_cols} is missing in target schema\" # Check if the column names in the source and target schemas are the same, regardless of their order if set(source_schema.fieldNames()) != set(target_schema.fieldNames()): raise SchemaMismatchError(error_message) # Check if the column names and data types in the source and target schemas are the same, in the same order if sorted(source_schema_list) != sorted(target_schema_list): raise SchemaMismatchError(error_message) # Append the dataframes if the schemas are identical appended_df = target_df.unionByName(source_df) return appended_df","title":"append_if_schema_identical()"},{"location":"reference/quinn/dataframe_helpers/","text":"column_to_list(df, col_name) Collect column to list of values. Parameters: Name Type Description Default df DataFrame Input DataFrame required col_name str Column to collect required Returns: Type Description List[Any] List of values Source code in quinn/dataframe_helpers.py def column_to_list(df: DataFrame, col_name: str) -> List[Any]: \"\"\"Collect column to list of values. 
:param df: Input DataFrame :type df: pyspark.sql.DataFrame :param col_name: Column to collect :type col_name: str :return: List of values :rtype: List[Any] \"\"\" return [x[col_name] for x in df.select(col_name).collect()] print_athena_create_table(df, athena_table_name, s3location) Generates the Athena create table statement for a given DataFrame Parameters: Name Type Description Default df DataFrame The pyspark.sql.DataFrame to use required athena_table_name str The name of the athena table to generate required s3location str The S3 location of the parquet data required Returns: Type Description None None Source code in quinn/dataframe_helpers.py def print_athena_create_table( df: DataFrame, athena_table_name: str, s3location: str ) -> None: \"\"\"Generates the Athena create table statement for a given DataFrame :param df: The pyspark.sql.DataFrame to use :param athena_table_name: The name of the athena table to generate :param s3location: The S3 location of the parquet data :return: None \"\"\" fields = df.schema print(f\"CREATE EXTERNAL TABLE IF NOT EXISTS `{athena_table_name}` ( \") for field in fields.fieldNames()[:-1]: print(\"\\t\", f\"`{fields[field].name}` {fields[field].dataType.simpleString()}, \") last = fields[fields.fieldNames()[-1]] print(\"\\t\", f\"`{last.name}` {last.dataType.simpleString()} \") print(\")\") print(\"STORED AS PARQUET\") print(f\"LOCATION '{s3location}'\\n\") show_output_to_df(show_output, spark) Show output as spark DataFrame Parameters: Name Type Description Default show_output str String representing output of 'show' command in spark required spark SparkSession SparkSession object required Returns: Type Description Dataframe DataFrame object containing output of a show command in spark Source code in quinn/dataframe_helpers.py def show_output_to_df(show_output: str, spark: SparkSession) -> DataFrame: \"\"\"Show output as spark DataFrame :param show_output: String representing output of 'show' command in spark :type show_output: str :param spark: SparkSession object :type spark: SparkSession :return: DataFrame object containing output of a show command in spark :rtype: Dataframe \"\"\" l = show_output.split(\"\\n\") ugly_column_names = l[1] pretty_column_names = [i.strip() for i in ugly_column_names[1:-1].split(\"|\")] pretty_data = [] ugly_data = l[3:-1] for row in ugly_data: r = [i.strip() for i in row[1:-1].split(\"|\")] pretty_data.append(tuple(r)) return spark.createDataFrame(pretty_data, pretty_column_names) to_list_of_dictionaries(df) Convert a Spark DataFrame to a list of dictionaries. Parameters: Name Type Description Default df DataFrame The Spark DataFrame to convert. required Returns: Type Description List[Dict[str, Any]] A list of dictionaries representing the rows in the DataFrame. Source code in quinn/dataframe_helpers.py def to_list_of_dictionaries(df: DataFrame) -> List[Dict[str, Any]]: \"\"\"Convert a Spark DataFrame to a list of dictionaries. :param df: The Spark DataFrame to convert. :type df: :py:class:`pyspark.sql.DataFrame` :return: A list of dictionaries representing the rows in the DataFrame. :rtype: List[Dict[str, Any]] \"\"\" return list(map(lambda r: r.asDict(), df.collect())) two_columns_to_dictionary(df, key_col_name, value_col_name) Collect two columns as dictionary when first column is key and second is value. 
Parameters: Name Type Description Default df DataFrame Input DataFrame required key_col_name str Key-column required value_col_name str Value-column required Returns: Type Description Dict[str, Any] Dictionary with values Source code in quinn/dataframe_helpers.py def two_columns_to_dictionary( df: DataFrame, key_col_name: str, value_col_name: str ) -> Dict[str, Any]: \"\"\"Collect two columns as dictionary when first column is key and second is value. :param df: Input DataFrame :type df: pyspark.sql.DataFrame :param key_col_name: Key-column :type key_col_name: str :param value_col_name: Value-column :type value_col_name: str :return: Dictionary with values :rtype: Dict[str, Any] \"\"\" k, v = key_col_name, value_col_name return {x[k]: x[v] for x in df.select(k, v).collect()}","title":"Dataframe helpers"},{"location":"reference/quinn/dataframe_helpers/#quinn.dataframe_helpers.column_to_list","text":"Collect column to list of values. Parameters: Name Type Description Default df DataFrame Input DataFrame required col_name str Column to collect required Returns: Type Description List[Any] List of values Source code in quinn/dataframe_helpers.py def column_to_list(df: DataFrame, col_name: str) -> List[Any]: \"\"\"Collect column to list of values. :param df: Input DataFrame :type df: pyspark.sql.DataFrame :param col_name: Column to collect :type col_name: str :return: List of values :rtype: List[Any] \"\"\" return [x[col_name] for x in df.select(col_name).collect()]","title":"column_to_list()"},{"location":"reference/quinn/dataframe_helpers/#quinn.dataframe_helpers.print_athena_create_table","text":"Generates the Athena create table statement for a given DataFrame Parameters: Name Type Description Default df DataFrame The pyspark.sql.DataFrame to use required athena_table_name str The name of the athena table to generate required s3location str The S3 location of the parquet data required Returns: Type Description None None Source code in quinn/dataframe_helpers.py def print_athena_create_table( df: DataFrame, athena_table_name: str, s3location: str ) -> None: \"\"\"Generates the Athena create table statement for a given DataFrame :param df: The pyspark.sql.DataFrame to use :param athena_table_name: The name of the athena table to generate :param s3location: The S3 location of the parquet data :return: None \"\"\" fields = df.schema print(f\"CREATE EXTERNAL TABLE IF NOT EXISTS `{athena_table_name}` ( \") for field in fields.fieldNames()[:-1]: print(\"\\t\", f\"`{fields[field].name}` {fields[field].dataType.simpleString()}, \") last = fields[fields.fieldNames()[-1]] print(\"\\t\", f\"`{last.name}` {last.dataType.simpleString()} \") print(\")\") print(\"STORED AS PARQUET\") print(f\"LOCATION '{s3location}'\\n\")","title":"print_athena_create_table()"},{"location":"reference/quinn/dataframe_helpers/#quinn.dataframe_helpers.show_output_to_df","text":"Show output as spark DataFrame Parameters: Name Type Description Default show_output str String representing output of 'show' command in spark required spark SparkSession SparkSession object required Returns: Type Description Dataframe DataFrame object containing output of a show command in spark Source code in quinn/dataframe_helpers.py def show_output_to_df(show_output: str, spark: SparkSession) -> DataFrame: \"\"\"Show output as spark DataFrame :param show_output: String representing output of 'show' command in spark :type show_output: str :param spark: SparkSession object :type spark: SparkSession :return: DataFrame object containing output of a 
show command in spark :rtype: Dataframe \"\"\" l = show_output.split(\"\\n\") ugly_column_names = l[1] pretty_column_names = [i.strip() for i in ugly_column_names[1:-1].split(\"|\")] pretty_data = [] ugly_data = l[3:-1] for row in ugly_data: r = [i.strip() for i in row[1:-1].split(\"|\")] pretty_data.append(tuple(r)) return spark.createDataFrame(pretty_data, pretty_column_names)","title":"show_output_to_df()"},{"location":"reference/quinn/dataframe_helpers/#quinn.dataframe_helpers.to_list_of_dictionaries","text":"Convert a Spark DataFrame to a list of dictionaries. Parameters: Name Type Description Default df DataFrame The Spark DataFrame to convert. required Returns: Type Description List[Dict[str, Any]] A list of dictionaries representing the rows in the DataFrame. Source code in quinn/dataframe_helpers.py def to_list_of_dictionaries(df: DataFrame) -> List[Dict[str, Any]]: \"\"\"Convert a Spark DataFrame to a list of dictionaries. :param df: The Spark DataFrame to convert. :type df: :py:class:`pyspark.sql.DataFrame` :return: A list of dictionaries representing the rows in the DataFrame. :rtype: List[Dict[str, Any]] \"\"\" return list(map(lambda r: r.asDict(), df.collect()))","title":"to_list_of_dictionaries()"},{"location":"reference/quinn/dataframe_helpers/#quinn.dataframe_helpers.two_columns_to_dictionary","text":"Collect two columns as dictionary when first column is key and second is value. Parameters: Name Type Description Default df DataFrame Input DataFrame required key_col_name str Key-column required value_col_name str Value-column required Returns: Type Description Dict[str, Any] Dictionary with values Source code in quinn/dataframe_helpers.py def two_columns_to_dictionary( df: DataFrame, key_col_name: str, value_col_name: str ) -> Dict[str, Any]: \"\"\"Collect two columns as dictionary when first column is key and second is value. :param df: Input DataFrame :type df: pyspark.sql.DataFrame :param key_col_name: Key-column :type key_col_name: str :param value_col_name: Value-column :type value_col_name: str :return: Dictionary with values :rtype: Dict[str, Any] \"\"\" k, v = key_col_name, value_col_name return {x[k]: x[v] for x in df.select(k, v).collect()}","title":"two_columns_to_dictionary()"},{"location":"reference/quinn/dataframe_validator/","text":"DataFrameMissingColumnError Bases: ValueError raise this when there's a DataFrame column error Source code in quinn/dataframe_validator.py class DataFrameMissingColumnError(ValueError): \"\"\"raise this when there's a DataFrame column error\"\"\" DataFrameMissingStructFieldError Bases: ValueError raise this when there's a DataFrame column error Source code in quinn/dataframe_validator.py class DataFrameMissingStructFieldError(ValueError): \"\"\"raise this when there's a DataFrame column error\"\"\" DataFrameProhibitedColumnError Bases: ValueError raise this when a DataFrame includes prohibited columns Source code in quinn/dataframe_validator.py class DataFrameProhibitedColumnError(ValueError): \"\"\"raise this when a DataFrame includes prohibited columns\"\"\" validate_absence_of_columns(df, prohibited_col_names) Validate that none of the prohibited column names are present among specified DataFrame columns. Parameters: Name Type Description Default df DataFrame DataFrame containing columns to be checked. required prohibited_col_names List [ str ] List of prohibited column names. required Raises: Type Description DataFrameProhibitedColumnError If the prohibited column names are present among the specified DataFrame columns. 
Source code in quinn/dataframe_validator.py def validate_absence_of_columns(df: DataFrame, prohibited_col_names: List[str]) -> None: \"\"\" Validate that none of the prohibited column names are present among specified DataFrame columns. :param df: DataFrame containing columns to be checked. :param prohibited_col_names: List of prohibited column names. :raises DataFrameProhibitedColumnError: If the prohibited column names are present among the specified DataFrame columns. \"\"\" all_col_names = df.columns extra_col_names = [x for x in all_col_names if x in prohibited_col_names] error_message = \"The {extra_col_names} columns are not allowed to be included in the DataFrame with the following columns {all_col_names}\".format( extra_col_names=extra_col_names, all_col_names=all_col_names ) if extra_col_names: raise DataFrameProhibitedColumnError(error_message) validate_presence_of_columns(df, required_col_names) Validates the presence of column names in a DataFrame. Parameters: Name Type Description Default df DataFrame A spark DataFrame. required required_col_names List [ str ] List of the required column names for the DataFrame. required Returns: Type Description None None. Raises: Type Description DataFrameMissingColumnError if any of the requested column names are not present in the DataFrame. Source code in quinn/dataframe_validator.py def validate_presence_of_columns(df: DataFrame, required_col_names: List[str]) -> None: \"\"\"Validates the presence of column names in a DataFrame. :param df: A spark DataFrame. :type df: DataFrame` :param required_col_names: List of the required column names for the DataFrame. :type required_col_names: :py:class:`list` of :py:class:`str` :return: None. :raises DataFrameMissingColumnError: if any of the requested column names are not present in the DataFrame. \"\"\" all_col_names = df.columns missing_col_names = [x for x in required_col_names if x not in all_col_names] error_message = \"The {missing_col_names} columns are not included in the DataFrame with the following columns {all_col_names}\".format( missing_col_names=missing_col_names, all_col_names=all_col_names ) if missing_col_names: raise DataFrameMissingColumnError(error_message) validate_schema(df, required_schema, ignore_nullable=False) This function will validate that a given DataFrame has a given StructType as its schema. Parameters: Name Type Description Default df DataFrame DataFrame to validate required required_schema StructType StructType required for the DataFrame required ignore_nullable bool (Optional) A flag for if nullable fields should be ignored during validation False Raises: Type Description DataFrameMissingStructFieldError if any StructFields from the required schema are not included in the DataFrame schema Source code in quinn/dataframe_validator.py def validate_schema(df: DataFrame, required_schema: StructType, ignore_nullable: bool=False) -> None: \"\"\" This function will validate that a given DataFrame has a given StructType as its schema. 
:param df: DataFrame to validate :type df: DataFrame :param required_schema: StructType required for the DataFrame :type required_schema: StructType :param ignore_nullable: (Optional) A flag for if nullable fields should be ignored during validation :type ignore_nullable: bool, optional :raises DataFrameMissingStructFieldError: if any StructFields from the required schema are not included in the DataFrame schema \"\"\" _all_struct_fields = copy.deepcopy(df.schema) _required_schema = copy.deepcopy(required_schema) if ignore_nullable: for x in _all_struct_fields: x.nullable = None for x in _required_schema: x.nullable = None missing_struct_fields = [x for x in _required_schema if x not in _all_struct_fields] error_message = \"The {missing_struct_fields} StructFields are not included in the DataFrame with the following StructFields {all_struct_fields}\".format( missing_struct_fields=missing_struct_fields, all_struct_fields=_all_struct_fields, ) if missing_struct_fields: raise DataFrameMissingStructFieldError(error_message)","title":"Dataframe validator"},{"location":"reference/quinn/dataframe_validator/#quinn.dataframe_validator.DataFrameMissingColumnError","text":"Bases: ValueError raise this when there's a DataFrame column error Source code in quinn/dataframe_validator.py class DataFrameMissingColumnError(ValueError): \"\"\"raise this when there's a DataFrame column error\"\"\"","title":"DataFrameMissingColumnError"},{"location":"reference/quinn/dataframe_validator/#quinn.dataframe_validator.DataFrameMissingStructFieldError","text":"Bases: ValueError raise this when there's a DataFrame column error Source code in quinn/dataframe_validator.py class DataFrameMissingStructFieldError(ValueError): \"\"\"raise this when there's a DataFrame column error\"\"\"","title":"DataFrameMissingStructFieldError"},{"location":"reference/quinn/dataframe_validator/#quinn.dataframe_validator.DataFrameProhibitedColumnError","text":"Bases: ValueError raise this when a DataFrame includes prohibited columns Source code in quinn/dataframe_validator.py class DataFrameProhibitedColumnError(ValueError): \"\"\"raise this when a DataFrame includes prohibited columns\"\"\"","title":"DataFrameProhibitedColumnError"},{"location":"reference/quinn/dataframe_validator/#quinn.dataframe_validator.validate_absence_of_columns","text":"Validate that none of the prohibited column names are present among specified DataFrame columns. Parameters: Name Type Description Default df DataFrame DataFrame containing columns to be checked. required prohibited_col_names List [ str ] List of prohibited column names. required Raises: Type Description DataFrameProhibitedColumnError If the prohibited column names are present among the specified DataFrame columns. Source code in quinn/dataframe_validator.py def validate_absence_of_columns(df: DataFrame, prohibited_col_names: List[str]) -> None: \"\"\" Validate that none of the prohibited column names are present among specified DataFrame columns. :param df: DataFrame containing columns to be checked. :param prohibited_col_names: List of prohibited column names. :raises DataFrameProhibitedColumnError: If the prohibited column names are present among the specified DataFrame columns. 
\"\"\" all_col_names = df.columns extra_col_names = [x for x in all_col_names if x in prohibited_col_names] error_message = \"The {extra_col_names} columns are not allowed to be included in the DataFrame with the following columns {all_col_names}\".format( extra_col_names=extra_col_names, all_col_names=all_col_names ) if extra_col_names: raise DataFrameProhibitedColumnError(error_message)","title":"validate_absence_of_columns()"},{"location":"reference/quinn/dataframe_validator/#quinn.dataframe_validator.validate_presence_of_columns","text":"Validates the presence of column names in a DataFrame. Parameters: Name Type Description Default df DataFrame A spark DataFrame. required required_col_names List [ str ] List of the required column names for the DataFrame. required Returns: Type Description None None. Raises: Type Description DataFrameMissingColumnError if any of the requested column names are not present in the DataFrame. Source code in quinn/dataframe_validator.py def validate_presence_of_columns(df: DataFrame, required_col_names: List[str]) -> None: \"\"\"Validates the presence of column names in a DataFrame. :param df: A spark DataFrame. :type df: DataFrame` :param required_col_names: List of the required column names for the DataFrame. :type required_col_names: :py:class:`list` of :py:class:`str` :return: None. :raises DataFrameMissingColumnError: if any of the requested column names are not present in the DataFrame. \"\"\" all_col_names = df.columns missing_col_names = [x for x in required_col_names if x not in all_col_names] error_message = \"The {missing_col_names} columns are not included in the DataFrame with the following columns {all_col_names}\".format( missing_col_names=missing_col_names, all_col_names=all_col_names ) if missing_col_names: raise DataFrameMissingColumnError(error_message)","title":"validate_presence_of_columns()"},{"location":"reference/quinn/dataframe_validator/#quinn.dataframe_validator.validate_schema","text":"This function will validate that a given DataFrame has a given StructType as its schema. Parameters: Name Type Description Default df DataFrame DataFrame to validate required required_schema StructType StructType required for the DataFrame required ignore_nullable bool (Optional) A flag for if nullable fields should be ignored during validation False Raises: Type Description DataFrameMissingStructFieldError if any StructFields from the required schema are not included in the DataFrame schema Source code in quinn/dataframe_validator.py def validate_schema(df: DataFrame, required_schema: StructType, ignore_nullable: bool=False) -> None: \"\"\" This function will validate that a given DataFrame has a given StructType as its schema. 
:param df: DataFrame to validate :type df: DataFrame :param required_schema: StructType required for the DataFrame :type required_schema: StructType :param ignore_nullable: (Optional) A flag for if nullable fields should be ignored during validation :type ignore_nullable: bool, optional :raises DataFrameMissingStructFieldError: if any StructFields from the required schema are not included in the DataFrame schema \"\"\" _all_struct_fields = copy.deepcopy(df.schema) _required_schema = copy.deepcopy(required_schema) if ignore_nullable: for x in _all_struct_fields: x.nullable = None for x in _required_schema: x.nullable = None missing_struct_fields = [x for x in _required_schema if x not in _all_struct_fields] error_message = \"The {missing_struct_fields} StructFields are not included in the DataFrame with the following StructFields {all_struct_fields}\".format( missing_struct_fields=missing_struct_fields, all_struct_fields=_all_struct_fields, ) if missing_struct_fields: raise DataFrameMissingStructFieldError(error_message)","title":"validate_schema()"},{"location":"reference/quinn/functions/","text":"anti_trim(col) Removes whitespace from the boundaries of col using the regexp_replace function. Parameters: Name Type Description Default col Column Column on which to perform the regexp_replace. required Returns: Type Description Column A new Column with all whitespace removed from the boundaries. Source code in quinn/functions.py def anti_trim(col: Column) -> Column: \"\"\"Removes whitespace from the boundaries of ``col`` using the regexp_replace function. :param col: Column on which to perform the regexp_replace. :type col: Column :return: A new Column with all whitespace removed from the boundaries. :rtype: Column \"\"\" return F.regexp_replace(col, \"\\\\b\\\\s+\\\\b\", \"\") approx_equal(col1, col2, threshold) Compares two Column objects by checking if the difference between them is less than a specified threshold . Parameters: Name Type Description Default col1 Column the first Column required col2 Column the second Column required threshold Number value to compare with required Returns: Type Description Column Boolean Column with True indicating that abs(col1 - col2) is less than threshold Source code in quinn/functions.py def approx_equal(col1: Column, col2: Column, threshold: Number) -> Column: \"\"\"Compares two ``Column`` objects by checking if the difference between them is less than a specified ``threshold``. :param col1: the first ``Column`` :type col1: Column :param col2: the second ``Column`` :type col2: Column :param threshold: value to compare with :type threshold: Number :return: Boolean ``Column`` with ``True`` indicating that ``abs(col1 - col2)`` is less than ``threshold`` \"\"\" return F.abs(col1 - col2) < threshold array_choice(col) Returns one random element from the given column. Parameters: Name Type Description Default col Column Column from which element is chosen required Returns: Type Description Column random element from the given column Source code in quinn/functions.py def array_choice(col: Column) -> Column: \"\"\"Returns one random element from the given column. 
:param col: Column from which element is chosen :type col: Column :return: random element from the given column :rtype: Column \"\"\" index = (F.rand() * F.size(col)).cast(\"int\") return col[index] exists(f) Create a user-defined function that takes a list expressed as a column of type ArrayType(AnyType) as an argument and returns a boolean value indicating whether any element in the list is true according to the argument f of the exists() function. Parameters: Name Type Description Default f Callable [[ Any ], bool ] Callable function - A callable function that takes an element of type Any and returns a boolean value. required Returns: Type Description UserDefinedFunction A user-defined function that takes a list expressed as a column of type ArrayType(AnyType) as an argument and returns a boolean value indicating whether any element in the list is true according to the argument f of the exists() function. Source code in quinn/functions.py def exists(f: Callable[[Any], bool]): \"\"\" Create a user-defined function that takes a list expressed as a column of type ``ArrayType(AnyType)`` as an argument and returns a boolean value indicating whether any element in the list is true according to the argument ``f`` of the ``exists()`` function. :param f: Callable function - A callable function that takes an element of type Any and returns a boolean value. :return: A user-defined function that takes a list expressed as a column of type ArrayType(AnyType) as an argument and returns a boolean value indicating whether any element in the list is true according to the argument ``f`` of the ``exists()`` function. :rtype: UserDefinedFunction \"\"\" def temp_udf(l): return any(map(f, l)) return F.udf(temp_udf, BooleanType()) forall(f) The forall function allows for mapping a given boolean function to a list of arguments and return a single boolean value as the result of applying the boolean function to each element of the list. It does this by creating a Spark UDF which takes in a list of arguments, applying the given boolean function to each element of the list and returning a single boolean value if all the elements pass through the given boolean function. Parameters: Name Type Description Default f Callable [[ Any ], bool ] A callable function f which takes in any type and returns a boolean required Returns: Type Description UserDefinedFunction A spark UDF which accepts a list of arguments and returns True if all elements pass through the given boolean function, False otherwise. Source code in quinn/functions.py def forall(f: Callable[[Any], bool]): \"\"\"The **forall** function allows for mapping a given boolean function to a list of arguments and return a single boolean value as the result of applying the boolean function to each element of the list. It does this by creating a Spark UDF which takes in a list of arguments, applying the given boolean function to each element of the list and returning a single boolean value if all the elements pass through the given boolean function. :param f: A callable function ``f`` which takes in any type and returns a boolean :return: A spark UDF which accepts a list of arguments and returns True if all elements pass through the given boolean function, False otherwise. :rtype: UserDefinedFunction \"\"\" def temp_udf(l): return all(map(f, l)) return F.udf(temp_udf, BooleanType()) multi_equals(value) Create a user-defined function that checks if all the given columns have the designated value. Parameters: Name Type Description Default value Any The designated value. 
required Returns: Type Description UserDifinedFunction A user-defined function of type BooleanType(). Source code in quinn/functions.py def multi_equals(value: Any): \"\"\"Create a user-defined function that checks if all the given columns have the designated value. :param value: The designated value. :type value: Any :return: A user-defined function of type BooleanType(). :rtype: UserDifinedFunction \"\"\" def temp_udf(*cols): return all(map(lambda col: col == value, cols)) return F.udf(temp_udf, BooleanType()) regexp_extract_all(s, regexp) This function uses the Python re library to extract regular expressions from a string ( s ) using a regex pattern ( regexp ). It returns a list of all matches, or None if s is None . Parameters: Name Type Description Default s str input string ( Column ) required regexp str string re pattern required Returns: Type Description Optional [ List [ re . Match ]] List of matches Source code in quinn/functions.py @F.udf(returnType=ArrayType(StringType())) def regexp_extract_all(s: str, regexp: str) -> Optional[List[re.Match]]: \"\"\"This function uses the Python `re` library to extract regular expressions from a string (`s`) using a regex pattern (`regexp`). It returns a list of all matches, or `None` if `s` is `None`. :param s: input string (`Column`) :type s: str :param regexp: string `re` pattern :return: List of matches \"\"\" return None if s is None else re.findall(regexp, s) remove_all_whitespace(col) This function takes a Column object as a parameter and returns a Column object with all white space removed. It does this using the regexp_replace function from F, which replaces all whitespace with an empty string. Parameters: Name Type Description Default col Column a Column object required Returns: Type Description Column a Column object with all white space removed Source code in quinn/functions.py def remove_all_whitespace(col: Column) -> Column: \"\"\"This function takes a `Column` object as a parameter and returns a `Column` object with all white space removed. It does this using the regexp_replace function from F, which replaces all whitespace with an empty string. :param col: a `Column` object :type col: Column :returns: a `Column` object with all white space removed :rtype: Column \"\"\" return F.regexp_replace(col, \"\\\\s+\", \"\") remove_non_word_characters(col) Removes non-word characters from a column. The non-word characters which will be removed are those identified by the regular expression \"[^\\w\\s]+\" . This expression represents any character that is not a word character (e.g. \\w ) or whitespace ( \\s ). Parameters: Name Type Description Default col Column A Column object. required Returns: Type Description Column A Column object with non-word characters removed. Source code in quinn/functions.py def remove_non_word_characters(col: Column) -> Column: \"\"\"Removes non-word characters from a column. The non-word characters which will be removed are those identified by the regular expression ``\"[^\\\\w\\\\s]+\"``. This expression represents any character that is not a word character (e.g. `\\w`) or whitespace (`\\s`). :param col: A Column object. :return: A Column object with non-word characters removed. \"\"\" return F.regexp_replace(col, \"[^\\\\w\\\\s]+\", \"\") single_space(col) This function takes a column and replaces all the multiple white spaces with a single space. It then trims the column to make all the texts consistent. 
Parameters: Name Type Description Default col Column The column which needs to be spaced required Returns: Type Description Column A trimmed column with single space Source code in quinn/functions.py def single_space(col: Column) -> Column: \"\"\"This function takes a column and replaces all the multiple white spaces with a single space. It then trims the column to make all the texts consistent. :param col: The column which needs to be spaced :type col: Column :returns: A trimmed column with single space :rtype: Column \"\"\" return F.trim(F.regexp_replace(col, \" +\", \" \")) uuid5(col, namespace=uuid.NAMESPACE_DNS, extra_string='') This function generates UUIDv5 from col and namespace , optionally prepending an extra string to col . Sets variant to RFC 4122 one. Parameters: Name Type Description Default col Column Column that will be hashed. required namespace uuid . UUID Namespace to be used. (default: uuid.NAMESPACE_DNS ) uuid.NAMESPACE_DNS extra_string str In case of collisions one can pass an extra string to hash on. '' Returns: Type Description Column String representation of generated UUIDv5 Source code in quinn/functions.py def uuid5(col: Column, namespace: uuid.UUID = uuid.NAMESPACE_DNS, extra_string: str = \"\") -> Column: \"\"\"This function generates UUIDv5 from ``col`` and ``namespace``, optionally prepending an extra string to ``col``. Sets variant to RFC 4122 one. :param col: Column that will be hashed. :type col: Column :param namespace: Namespace to be used. (default: `uuid.NAMESPACE_DNS`) :type namespace: str :param extra_string: In case of collisions one can pass an extra string to hash on. :type extra_string: str :return: String representation of generated UUIDv5 :rtype: Column \"\"\" ns = F.lit(namespace.bytes) salted_col = F.concat(F.lit(extra_string), col) encoded = F.encode(salted_col, \"utf-8\") encoded_with_ns = F.concat(ns, encoded) hashed = F.sha1(encoded_with_ns) variant_part = F.substring(hashed, 17, 4) variant_part = F.conv(variant_part, 16, 2) variant_part = F.lpad(variant_part, 16, \"0\") variant_part = F.concat(F.lit(\"10\"), F.substring(variant_part, 3, 16)) # RFC 4122 variant. variant_part = F.lower(F.conv(variant_part, 2, 16)) return F.concat_ws( \"-\", F.substring(hashed, 1, 8), F.substring(hashed, 9, 4), F.concat(F.lit(\"5\"), F.substring(hashed, 14, 3)), # Set version. variant_part, F.substring(hashed, 21, 12), ) week_end_date(col, week_end_day='Sat') Returns a date column for the end of week for a given day. The Spark function dayofweek considers Sunday as the first day of the week, and uses the default value of 1 to indicate Sunday. Usage of the when and otherwise functions allow a comparison between the end of week day indicated and the day of week computed, and the return of the reference date if they match or the the addition of one week to the reference date otherwise. Parameters: Name Type Description Default col Column The reference date column. required week_end_day str The week end day (default: 'Sat') 'Sat' Returns: Type Description Column A Column of end of the week dates. Source code in quinn/functions.py def week_end_date(col: Column, week_end_day: str = \"Sat\") -> Column: \"\"\" Returns a date column for the end of week for a given day. The Spark function `dayofweek` considers Sunday as the first day of the week, and uses the default value of 1 to indicate Sunday. 
Usage of the `when` and `otherwise` functions allow a comparison between the end of week day indicated and the day of week computed, and the return of the reference date if they match or the the addition of one week to the reference date otherwise. :param col: The reference date column. :type col: Column :param week_end_day: The week end day (default: 'Sat') :type week_end_day: str :return: A Column of end of the week dates. :rtype: Column \"\"\" _raise_if_invalid_day(week_end_day) # these are the default Spark mappings. Spark considers Sunday the first day of the week. day_of_week_mapping = { \"Sun\": 1, \"Mon\": 2, \"Tue\": 3, \"Wed\": 4, \"Thu\": 5, \"Fri\": 6, \"Sat\": 7, } return F.when( F.dayofweek(col).eqNullSafe(F.lit(day_of_week_mapping[week_end_day])), col ).otherwise(F.next_day(col, week_end_day)) week_start_date(col, week_start_day='Sun') This function takes a Spark Column and an optional week_start_day string argument and returns a Column with the corresponding start of week dates. The \"standard week\" in Spark starts on Sunday, however an optional argument can be used to start the week from a different day, e.g. Monday. The week_start_day argument is a string corresponding to the day of the week to start the week from, e.g. \"Mon\" , \"Tue\" , and must be in the set: {\"Sun\", \"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sat\"} . If the argument given is not a valid day then a ValueError will be raised. Parameters: Name Type Description Default col Column The column to determine start of week dates on required week_start_day str The day to start the week on 'Sun' Returns: Type Description Column A Column with start of week dates Source code in quinn/functions.py def week_start_date(col: Column, week_start_day: str = \"Sun\") -> Column: \"\"\"This function takes a Spark `Column` and an optional `week_start_day` string argument and returns a `Column` with the corresponding start of week dates. The \"standard week\" in Spark starts on Sunday, however an optional argument can be used to start the week from a different day, e.g. Monday. The `week_start_day` argument is a string corresponding to the day of the week to start the week from, e.g. `\"Mon\"`, `\"Tue\"`, and must be in the set: `{\"Sun\", \"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sat\"}`. If the argument given is not a valid day then a `ValueError` will be raised. :param col: The column to determine start of week dates on :type col: Column :param week_start_day: The day to start the week on :type week_start_day: str :returns: A Column with start of week dates :rtype: Column \"\"\" _raise_if_invalid_day(week_start_day) # the \"standard week\" in Spark is from Sunday to Saturday mapping = { \"Sun\": \"Sat\", \"Mon\": \"Sun\", \"Tue\": \"Mon\", \"Wed\": \"Tue\", \"Thu\": \"Wed\", \"Fri\": \"Thu\", \"Sat\": \"Fri\", } end = week_end_date(col, mapping[week_start_day]) return F.date_add(end, -6)","title":"Functions"},{"location":"reference/quinn/functions/#quinn.functions.anti_trim","text":"Removes whitespace from the boundaries of col using the regexp_replace function. Parameters: Name Type Description Default col Column Column on which to perform the regexp_replace. required Returns: Type Description Column A new Column with all whitespace removed from the boundaries. Source code in quinn/functions.py def anti_trim(col: Column) -> Column: \"\"\"Removes whitespace from the boundaries of ``col`` using the regexp_replace function. :param col: Column on which to perform the regexp_replace. 
:type col: Column :return: A new Column with all whitespace removed from the boundaries. :rtype: Column \"\"\" return F.regexp_replace(col, \"\\\\b\\\\s+\\\\b\", \"\")","title":"anti_trim()"},{"location":"reference/quinn/functions/#quinn.functions.approx_equal","text":"Compares two Column objects by checking if the difference between them is less than a specified threshold . Parameters: Name Type Description Default col1 Column the first Column required col2 Column the second Column required threshold Number value to compare with required Returns: Type Description Column Boolean Column with True indicating that abs(col1 - col2) is less than threshold Source code in quinn/functions.py def approx_equal(col1: Column, col2: Column, threshold: Number) -> Column: \"\"\"Compares two ``Column`` objects by checking if the difference between them is less than a specified ``threshold``. :param col1: the first ``Column`` :type col1: Column :param col2: the second ``Column`` :type col2: Column :param threshold: value to compare with :type threshold: Number :return: Boolean ``Column`` with ``True`` indicating that ``abs(col1 - col2)`` is less than ``threshold`` \"\"\" return F.abs(col1 - col2) < threshold","title":"approx_equal()"},{"location":"reference/quinn/functions/#quinn.functions.array_choice","text":"Returns one random element from the given column. Parameters: Name Type Description Default col Column Column from which element is chosen required Returns: Type Description Column random element from the given column Source code in quinn/functions.py def array_choice(col: Column) -> Column: \"\"\"Returns one random element from the given column. :param col: Column from which element is chosen :type col: Column :return: random element from the given column :rtype: Column \"\"\" index = (F.rand() * F.size(col)).cast(\"int\") return col[index]","title":"array_choice()"},{"location":"reference/quinn/functions/#quinn.functions.exists","text":"Create a user-defined function that takes a list expressed as a column of type ArrayType(AnyType) as an argument and returns a boolean value indicating whether any element in the list is true according to the argument f of the exists() function. Parameters: Name Type Description Default f Callable [[ Any ], bool ] Callable function - A callable function that takes an element of type Any and returns a boolean value. required Returns: Type Description UserDefinedFunction A user-defined function that takes a list expressed as a column of type ArrayType(AnyType) as an argument and returns a boolean value indicating whether any element in the list is true according to the argument f of the exists() function. Source code in quinn/functions.py def exists(f: Callable[[Any], bool]): \"\"\" Create a user-defined function that takes a list expressed as a column of type ``ArrayType(AnyType)`` as an argument and returns a boolean value indicating whether any element in the list is true according to the argument ``f`` of the ``exists()`` function. :param f: Callable function - A callable function that takes an element of type Any and returns a boolean value. :return: A user-defined function that takes a list expressed as a column of type ArrayType(AnyType) as an argument and returns a boolean value indicating whether any element in the list is true according to the argument ``f`` of the ``exists()`` function. 
:rtype: UserDefinedFunction \"\"\" def temp_udf(l): return any(map(f, l)) return F.udf(temp_udf, BooleanType())","title":"exists()"},{"location":"reference/quinn/functions/#quinn.functions.forall","text":"The forall function allows for mapping a given boolean function to a list of arguments and return a single boolean value as the result of applying the boolean function to each element of the list. It does this by creating a Spark UDF which takes in a list of arguments, applying the given boolean function to each element of the list and returning a single boolean value if all the elements pass through the given boolean function. Parameters: Name Type Description Default f Callable [[ Any ], bool ] A callable function f which takes in any type and returns a boolean required Returns: Type Description UserDefinedFunction A spark UDF which accepts a list of arguments and returns True if all elements pass through the given boolean function, False otherwise. Source code in quinn/functions.py def forall(f: Callable[[Any], bool]): \"\"\"The **forall** function allows for mapping a given boolean function to a list of arguments and return a single boolean value as the result of applying the boolean function to each element of the list. It does this by creating a Spark UDF which takes in a list of arguments, applying the given boolean function to each element of the list and returning a single boolean value if all the elements pass through the given boolean function. :param f: A callable function ``f`` which takes in any type and returns a boolean :return: A spark UDF which accepts a list of arguments and returns True if all elements pass through the given boolean function, False otherwise. :rtype: UserDefinedFunction \"\"\" def temp_udf(l): return all(map(f, l)) return F.udf(temp_udf, BooleanType())","title":"forall()"},{"location":"reference/quinn/functions/#quinn.functions.multi_equals","text":"Create a user-defined function that checks if all the given columns have the designated value. Parameters: Name Type Description Default value Any The designated value. required Returns: Type Description UserDifinedFunction A user-defined function of type BooleanType(). Source code in quinn/functions.py def multi_equals(value: Any): \"\"\"Create a user-defined function that checks if all the given columns have the designated value. :param value: The designated value. :type value: Any :return: A user-defined function of type BooleanType(). :rtype: UserDifinedFunction \"\"\" def temp_udf(*cols): return all(map(lambda col: col == value, cols)) return F.udf(temp_udf, BooleanType())","title":"multi_equals()"},{"location":"reference/quinn/functions/#quinn.functions.regexp_extract_all","text":"This function uses the Python re library to extract regular expressions from a string ( s ) using a regex pattern ( regexp ). It returns a list of all matches, or None if s is None . Parameters: Name Type Description Default s str input string ( Column ) required regexp str string re pattern required Returns: Type Description Optional [ List [ re . Match ]] List of matches Source code in quinn/functions.py @F.udf(returnType=ArrayType(StringType())) def regexp_extract_all(s: str, regexp: str) -> Optional[List[re.Match]]: \"\"\"This function uses the Python `re` library to extract regular expressions from a string (`s`) using a regex pattern (`regexp`). It returns a list of all matches, or `None` if `s` is `None`. 
:param s: input string (`Column`) :type s: str :param regexp: string `re` pattern :return: List of matches \"\"\" return None if s is None else re.findall(regexp, s)","title":"regexp_extract_all()"},{"location":"reference/quinn/functions/#quinn.functions.remove_all_whitespace","text":"This function takes a Column object as a parameter and returns a Column object with all white space removed. It does this using the regexp_replace function from F, which replaces all whitespace with an empty string. Parameters: Name Type Description Default col Column a Column object required Returns: Type Description Column a Column object with all white space removed Source code in quinn/functions.py def remove_all_whitespace(col: Column) -> Column: \"\"\"This function takes a `Column` object as a parameter and returns a `Column` object with all white space removed. It does this using the regexp_replace function from F, which replaces all whitespace with an empty string. :param col: a `Column` object :type col: Column :returns: a `Column` object with all white space removed :rtype: Column \"\"\" return F.regexp_replace(col, \"\\\\s+\", \"\")","title":"remove_all_whitespace()"},{"location":"reference/quinn/functions/#quinn.functions.remove_non_word_characters","text":"Removes non-word characters from a column. The non-word characters which will be removed are those identified by the regular expression \"[^\\w\\s]+\" . This expression represents any character that is not a word character (e.g. \\w ) or whitespace ( \\s ). Parameters: Name Type Description Default col Column A Column object. required Returns: Type Description Column A Column object with non-word characters removed. Source code in quinn/functions.py def remove_non_word_characters(col: Column) -> Column: \"\"\"Removes non-word characters from a column. The non-word characters which will be removed are those identified by the regular expression ``\"[^\\\\w\\\\s]+\"``. This expression represents any character that is not a word character (e.g. `\\w`) or whitespace (`\\s`). :param col: A Column object. :return: A Column object with non-word characters removed. \"\"\" return F.regexp_replace(col, \"[^\\\\w\\\\s]+\", \"\")","title":"remove_non_word_characters()"},{"location":"reference/quinn/functions/#quinn.functions.single_space","text":"This function takes a column and replaces all the multiple white spaces with a single space. It then trims the column to make all the texts consistent. Parameters: Name Type Description Default col Column The column which needs to be spaced required Returns: Type Description Column A trimmed column with single space Source code in quinn/functions.py def single_space(col: Column) -> Column: \"\"\"This function takes a column and replaces all the multiple white spaces with a single space. It then trims the column to make all the texts consistent. :param col: The column which needs to be spaced :type col: Column :returns: A trimmed column with single space :rtype: Column \"\"\" return F.trim(F.regexp_replace(col, \" +\", \" \"))","title":"single_space()"},{"location":"reference/quinn/functions/#quinn.functions.uuid5","text":"This function generates UUIDv5 from col and namespace , optionally prepending an extra string to col . Sets variant to RFC 4122 one. Parameters: Name Type Description Default col Column Column that will be hashed. required namespace uuid . UUID Namespace to be used. (default: uuid.NAMESPACE_DNS ) uuid.NAMESPACE_DNS extra_string str In case of collisions one can pass an extra string to hash on. 
'' Returns: Type Description Column String representation of generated UUIDv5 Source code in quinn/functions.py def uuid5(col: Column, namespace: uuid.UUID = uuid.NAMESPACE_DNS, extra_string: str = \"\") -> Column: \"\"\"This function generates UUIDv5 from ``col`` and ``namespace``, optionally prepending an extra string to ``col``. Sets variant to RFC 4122 one. :param col: Column that will be hashed. :type col: Column :param namespace: Namespace to be used. (default: `uuid.NAMESPACE_DNS`) :type namespace: uuid.UUID :param extra_string: In case of collisions one can pass an extra string to hash on. :type extra_string: str :return: String representation of generated UUIDv5 :rtype: Column \"\"\" ns = F.lit(namespace.bytes) salted_col = F.concat(F.lit(extra_string), col) encoded = F.encode(salted_col, \"utf-8\") encoded_with_ns = F.concat(ns, encoded) hashed = F.sha1(encoded_with_ns) variant_part = F.substring(hashed, 17, 4) variant_part = F.conv(variant_part, 16, 2) variant_part = F.lpad(variant_part, 16, \"0\") variant_part = F.concat(F.lit(\"10\"), F.substring(variant_part, 3, 16)) # RFC 4122 variant. variant_part = F.lower(F.conv(variant_part, 2, 16)) return F.concat_ws( \"-\", F.substring(hashed, 1, 8), F.substring(hashed, 9, 4), F.concat(F.lit(\"5\"), F.substring(hashed, 14, 3)), # Set version. variant_part, F.substring(hashed, 21, 12), )","title":"uuid5()"},{"location":"reference/quinn/functions/#quinn.functions.week_end_date","text":"Returns a date column for the end of week for a given day. The Spark function dayofweek considers Sunday as the first day of the week, and uses the default value of 1 to indicate Sunday. Usage of the when and otherwise functions allows a comparison between the end of week day indicated and the day of week computed, and the return of the reference date if they match or the addition of one week to the reference date otherwise. Parameters: Name Type Description Default col Column The reference date column. required week_end_day str The week end day (default: 'Sat') 'Sat' Returns: Type Description Column A Column of end of the week dates. Source code in quinn/functions.py def week_end_date(col: Column, week_end_day: str = \"Sat\") -> Column: \"\"\" Returns a date column for the end of week for a given day. The Spark function `dayofweek` considers Sunday as the first day of the week, and uses the default value of 1 to indicate Sunday. Usage of the `when` and `otherwise` functions allows a comparison between the end of week day indicated and the day of week computed, and the return of the reference date if they match or the addition of one week to the reference date otherwise. :param col: The reference date column. :type col: Column :param week_end_day: The week end day (default: 'Sat') :type week_end_day: str :return: A Column of end of the week dates. :rtype: Column \"\"\" _raise_if_invalid_day(week_end_day) # these are the default Spark mappings. Spark considers Sunday the first day of the week. day_of_week_mapping = { \"Sun\": 1, \"Mon\": 2, \"Tue\": 3, \"Wed\": 4, \"Thu\": 5, \"Fri\": 6, \"Sat\": 7, } return F.when( F.dayofweek(col).eqNullSafe(F.lit(day_of_week_mapping[week_end_day])), col ).otherwise(F.next_day(col, week_end_day))","title":"week_end_date()"},{"location":"reference/quinn/functions/#quinn.functions.week_start_date","text":"This function takes a Spark Column and an optional week_start_day string argument and returns a Column with the corresponding start of week dates. 
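As a rough sketch of how uuid5() and week_end_date() might be called — the DataFrame, column names, and sample dates below are invented for illustration, and a running SparkSession is assumed:

```python
import uuid
from pyspark.sql import SparkSession, functions as F
import quinn

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("jose", "2023-03-01"), ("li", "2023-03-04")], ["name", "day"])

# Deterministic UUIDv5 per name; extra_string can be supplied to avoid collisions.
df = df.withColumn("name_uuid", quinn.uuid5(F.col("name"), namespace=uuid.NAMESPACE_DNS))
# End of the week containing each date, with Saturday as the last day.
df = df.withColumn("week_end", quinn.week_end_date(F.to_date("day"), "Sat"))
df.show(truncate=False)
```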
The \"standard week\" in Spark starts on Sunday, however an optional argument can be used to start the week from a different day, e.g. Monday. The week_start_day argument is a string corresponding to the day of the week to start the week from, e.g. \"Mon\" , \"Tue\" , and must be in the set: {\"Sun\", \"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sat\"} . If the argument given is not a valid day then a ValueError will be raised. Parameters: Name Type Description Default col Column The column to determine start of week dates on required week_start_day str The day to start the week on 'Sun' Returns: Type Description Column A Column with start of week dates Source code in quinn/functions.py def week_start_date(col: Column, week_start_day: str = \"Sun\") -> Column: \"\"\"This function takes a Spark `Column` and an optional `week_start_day` string argument and returns a `Column` with the corresponding start of week dates. The \"standard week\" in Spark starts on Sunday, however an optional argument can be used to start the week from a different day, e.g. Monday. The `week_start_day` argument is a string corresponding to the day of the week to start the week from, e.g. `\"Mon\"`, `\"Tue\"`, and must be in the set: `{\"Sun\", \"Mon\", \"Tue\", \"Wed\", \"Thu\", \"Fri\", \"Sat\"}`. If the argument given is not a valid day then a `ValueError` will be raised. :param col: The column to determine start of week dates on :type col: Column :param week_start_day: The day to start the week on :type week_start_day: str :returns: A Column with start of week dates :rtype: Column \"\"\" _raise_if_invalid_day(week_start_day) # the \"standard week\" in Spark is from Sunday to Saturday mapping = { \"Sun\": \"Sat\", \"Mon\": \"Sun\", \"Tue\": \"Mon\", \"Wed\": \"Tue\", \"Thu\": \"Wed\", \"Fri\": \"Thu\", \"Sat\": \"Fri\", } end = week_end_date(col, mapping[week_start_day]) return F.date_add(end, -6)","title":"week_start_date()"},{"location":"reference/quinn/scala_to_pyspark/","text":"","title":"Scala to pyspark"},{"location":"reference/quinn/schema_helpers/","text":"print_schema_as_code(dtype) Represent DataType (including StructType) as valid Python code. Parameters: Name Type Description Default dtype T . DataType The input DataType or Schema object required Returns: Type Description str A valid python code which generate the same schema. Source code in quinn/schema_helpers.py def print_schema_as_code(dtype: T.DataType) -> str: \"\"\"Represent DataType (including StructType) as valid Python code. :param dtype: The input DataType or Schema object :type dtype: pyspark.sql.types.DataType :return: A valid python code which generate the same schema. 
:rtype: str \"\"\" res = [] if isinstance(dtype, T.StructType): res.append(\"StructType(\\n\\tfields=[\") for field in dtype.fields: for line in _repr_column(field).split(\"\\n\"): res.append(\"\\n\\t\\t\") res.append(line) res.append(\",\") res.append(\"\\n\\t]\\n)\") elif isinstance(dtype, T.ArrayType): res.append(\"ArrayType(\") res.append(print_schema_as_code(dtype.elementType)) res.append(\")\") elif isinstance(dtype, T.MapType): res.append(\"MapType(\") res.append(f\"\\n\\t{print_schema_as_code(dtype.keyType)},\") for line in print_schema_as_code(dtype.valueType).split(\"\\n\"): res.append(\"\\n\\t\") res.append(line) res.append(\",\") res.append(f\"\\n\\t{dtype.valueContainsNull},\") res.append(\"\\n)\") elif isinstance(dtype, T.DecimalType): res.append(f\"DecimalType({dtype.precision}, {dtype.scale})\") else: if str(dtype).endswith(\"()\"): # PySpark 3.3+ res.append(str(dtype)) else: res.append(f\"{dtype}()\") return \"\".join(res)","title":"Schema helpers"},{"location":"reference/quinn/schema_helpers/#quinn.schema_helpers.print_schema_as_code","text":"Represent DataType (including StructType) as valid Python code. Parameters: Name Type Description Default dtype T . DataType The input DataType or Schema object required Returns: Type Description str A valid python code which generate the same schema. Source code in quinn/schema_helpers.py def print_schema_as_code(dtype: T.DataType) -> str: \"\"\"Represent DataType (including StructType) as valid Python code. :param dtype: The input DataType or Schema object :type dtype: pyspark.sql.types.DataType :return: A valid python code which generate the same schema. :rtype: str \"\"\" res = [] if isinstance(dtype, T.StructType): res.append(\"StructType(\\n\\tfields=[\") for field in dtype.fields: for line in _repr_column(field).split(\"\\n\"): res.append(\"\\n\\t\\t\") res.append(line) res.append(\",\") res.append(\"\\n\\t]\\n)\") elif isinstance(dtype, T.ArrayType): res.append(\"ArrayType(\") res.append(print_schema_as_code(dtype.elementType)) res.append(\")\") elif isinstance(dtype, T.MapType): res.append(\"MapType(\") res.append(f\"\\n\\t{print_schema_as_code(dtype.keyType)},\") for line in print_schema_as_code(dtype.valueType).split(\"\\n\"): res.append(\"\\n\\t\") res.append(line) res.append(\",\") res.append(f\"\\n\\t{dtype.valueContainsNull},\") res.append(\"\\n)\") elif isinstance(dtype, T.DecimalType): res.append(f\"DecimalType({dtype.precision}, {dtype.scale})\") else: if str(dtype).endswith(\"()\"): # PySpark 3.3+ res.append(str(dtype)) else: res.append(f\"{dtype}()\") return \"\".join(res)","title":"print_schema_as_code()"},{"location":"reference/quinn/spark/","text":"","title":"Spark"},{"location":"reference/quinn/transformations/","text":"snake_case_col_names(df) This function takes a DataFrame instance and returns the same DataFrame instance with all column names converted to snake case (e.g. col_name_1 ). It uses the to_snake_case function in conjunction with the with_columns_renamed function to achieve this. Parameters: Name Type Description Default df DataFrame A DataFrame instance to process required Returns: Type Description ``DataFrame`` A DataFrame instance with column names converted to snake case Source code in quinn/transformations.py def snake_case_col_names(df: DataFrame) -> DataFrame: \"\"\"This function takes a ``DataFrame`` instance and returns the same ``DataFrame`` instance with all column names converted to snake case (e.g. ``col_name_1``). 
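A short sketch of print_schema_as_code() in use — the example schema is made up, and the function is imported from quinn.schema_helpers, the module named in the source reference above:

```python
from pyspark.sql import types as T
from quinn import schema_helpers

schema = T.StructType([
    T.StructField("name", T.StringType(), True),
    T.StructField("scores", T.ArrayType(T.DecimalType(10, 2)), True),
])

# Prints Python source (StructType/StructField calls) that rebuilds the same schema.
print(schema_helpers.print_schema_as_code(schema))
```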
It uses the ``to_snake_case`` function in conjunction with the ``with_columns_renamed`` function to achieve this. :param df: A ``DataFrame`` instance to process :type df: ``DataFrame`` :return: A ``DataFrame`` instance with column names converted to snake case :rtype: ``DataFrame`` \"\"\" return with_columns_renamed(to_snake_case)(df) sort_columns(df, sort_order) This function sorts the columns of a given DataFrame based on a given sort order. The sort_order parameter can either be asc or desc , which correspond to ascending and descending order, respectively. If any other value is provided for the sort_order parameter, a ValueError will be raised. Parameters: Name Type Description Default df DataFrame A DataFrame required sort_order str The order in which to sort the columns in the DataFrame required Returns: Type Description pandas.DataFrame A DataFrame with the columns sorted in the chosen order Source code in quinn/transformations.py def sort_columns(df: DataFrame, sort_order: str) -> DataFrame: \"\"\"This function sorts the columns of a given DataFrame based on a given sort order. The ``sort_order`` parameter can either be ``asc`` or ``desc``, which correspond to ascending and descending order, respectively. If any other value is provided for the ``sort_order`` parameter, a ``ValueError`` will be raised. :param df: A DataFrame :type df: pandas.DataFrame :param sort_order: The order in which to sort the columns in the DataFrame :type sort_order: str :return: A DataFrame with the columns sorted in the chosen order :rtype: pandas.DataFrame \"\"\" sorted_col_names = None if sort_order == \"asc\": sorted_col_names = sorted(df.columns) elif sort_order == \"desc\": sorted_col_names = sorted(df.columns, reverse=True) else: raise ValueError( \"['asc', 'desc'] are the only valid sort orders and you entered a sort order of '{sort_order}'\".format( sort_order=sort_order ) ) return df.select(*sorted_col_names) to_snake_case(s) Takes a string and converts it to snake case format. Parameters: Name Type Description Default s str The string to be converted. required Returns: Type Description str The string in snake case format. Source code in quinn/transformations.py def to_snake_case(s: str) -> str: \"\"\"Takes a string and converts it to snake case format. :param s: The string to be converted. :type s: str :return: The string in snake case format. :rtype: str \"\"\" return s.lower().replace(\" \", \"_\") with_columns_renamed(fun) This is a function designed to rename the columns of a Spark DataFrame . It takes a Callable[[str], str] object as an argument ( fun ) and returns a Callable[[DataFrame], DataFrame] object. When _() is called on a DataFrame , it creates a list of column names, applying the argument fun() to each of them, and returning a new DataFrame with the new column names. Parameters: Name Type Description Default fun Callable [[ str ], str ] Renaming function required Returns: Type Description Callable [[ DataFrame ], DataFrame ] Function which takes DataFrame as parameter. Source code in quinn/transformations.py def with_columns_renamed(fun: Callable[[str], str]) -> Callable[[DataFrame], DataFrame]: \"\"\"This is a function designed to rename the columns of a `Spark DataFrame`. It takes a `Callable[[str], str]` object as an argument (``fun``) and returns a `Callable[[DataFrame], DataFrame]` object. When `_()` is called on a `DataFrame`, it creates a list of column names, applying the argument `fun()` to each of them, and returning a new `DataFrame` with the new column names. 
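A brief sketch pairing snake_case_col_names() and sort_columns() as documented above; the column names are placeholders and a running SparkSession is assumed:

```python
from pyspark.sql import SparkSession
import quinn

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a", True)], ["Zip Code", "First Name", "active"])

snaked = quinn.snake_case_col_names(df)      # "Zip Code" -> "zip_code", "First Name" -> "first_name"
ordered = quinn.sort_columns(snaked, "asc")  # columns re-selected in alphabetical order
ordered.printSchema()
```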
:param fun: Renaming function :returns: Function which takes DataFrame as parameter. \"\"\" def _(df: DataFrame) -> DataFrame: cols = list( map( lambda col_name: F.col(\"`{0}`\".format(col_name)).alias(fun(col_name)), df.columns, ) ) return df.select(*cols) return _ with_some_columns_renamed(fun, change_col_name) A function that takes a Callable[[str], str] and a Callable[[str], str] and returns a Callable[[DataFrame], DataFrame] , which in turn takes a DataFrame and returns a DataFrame with some of its columns renamed. Parameters: Name Type Description Default fun Callable [[ str ], str ] A function that takes a column name as a string and returns a new name as a string. required change_col_name Callable [[ str ], str ] A function that takes a column name as a string and returns a boolean. required Returns: Type Description `Callable[[DataFrame], DataFrame]` A Callable[[DataFrame], DataFrame] , which takes a DataFrame and returns a DataFrame with some of its columns renamed. Source code in quinn/transformations.py def with_some_columns_renamed( fun: Callable[[str], str], change_col_name: Callable[[str], str] ) -> Callable[[DataFrame], DataFrame]: \"\"\"A function that takes a `Callable[[str], str]` and a `Callable[[str], str]` and returns a `Callable[[DataFrame], DataFrame]`, which in turn takes a `DataFrame` and returns a `DataFrame` with some of its columns renamed. :param fun: A function that takes a column name as a string and returns a new name as a string. :type fun: `Callable[[str], str]` :param change_col_name: A function that takes a column name as a string and returns a boolean. :type change_col_name: `Callable[[str], str]` :return: A `Callable[[DataFrame], DataFrame]`, which takes a `DataFrame` and returns a `DataFrame` with some of its columns renamed. :rtype: `Callable[[DataFrame], DataFrame]` \"\"\" def _(df): cols = list( map( lambda col_name: F.col(\"`{0}`\".format(col_name)).alias(fun(col_name)) if change_col_name(col_name) else F.col(\"`{0}`\".format(col_name)), df.columns, ) ) return df.select(*cols) return _","title":"Transformations"},{"location":"reference/quinn/transformations/#quinn.transformations.snake_case_col_names","text":"This function takes a DataFrame instance and returns the same DataFrame instance with all column names converted to snake case (e.g. col_name_1 ). It uses the to_snake_case function in conjunction with the with_columns_renamed function to achieve this. Parameters: Name Type Description Default df DataFrame A DataFrame instance to process required Returns: Type Description ``DataFrame`` A DataFrame instance with column names converted to snake case Source code in quinn/transformations.py def snake_case_col_names(df: DataFrame) -> DataFrame: \"\"\"This function takes a ``DataFrame`` instance and returns the same ``DataFrame`` instance with all column names converted to snake case (e.g. ``col_name_1``). It uses the ``to_snake_case`` function in conjunction with the ``with_columns_renamed`` function to achieve this. :param df: A ``DataFrame`` instance to process :type df: ``DataFrame`` :return: A ``DataFrame`` instance with column names converted to snake case :rtype: ``DataFrame`` \"\"\" return with_columns_renamed(to_snake_case)(df)","title":"snake_case_col_names()"},{"location":"reference/quinn/transformations/#quinn.transformations.sort_columns","text":"This function sorts the columns of a given DataFrame based on a given sort order. 
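To ground the two renaming helpers described above, here is a hedged sketch; the renaming rules, column names, and sample row are assumptions for illustration only:

```python
from pyspark.sql import SparkSession
import quinn

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "1990-01-01")], ["first-name", "dob"])

# Rename every column with the given function.
underscored = quinn.with_columns_renamed(lambda name: name.replace("-", "_"))(df)

# Rename only the columns for which the predicate returns True.
upper_d_cols = quinn.with_some_columns_renamed(
    lambda name: name.upper(),          # how to rename
    lambda name: name.startswith("d"),  # which columns to rename
)(df)
```

Note that both helpers are curried: they return a function that is then applied to the DataFrame, which makes them easy to use with DataFrame.transform.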
The sort_order parameter can either be asc or desc , which correspond to ascending and descending order, respectively. If any other value is provided for the sort_order parameter, a ValueError will be raised. Parameters: Name Type Description Default df DataFrame A DataFrame required sort_order str The order in which to sort the columns in the DataFrame required Returns: Type Description pandas.DataFrame A DataFrame with the columns sorted in the chosen order Source code in quinn/transformations.py def sort_columns(df: DataFrame, sort_order: str) -> DataFrame: \"\"\"This function sorts the columns of a given DataFrame based on a given sort order. The ``sort_order`` parameter can either be ``asc`` or ``desc``, which correspond to ascending and descending order, respectively. If any other value is provided for the ``sort_order`` parameter, a ``ValueError`` will be raised. :param df: A DataFrame :type df: pandas.DataFrame :param sort_order: The order in which to sort the columns in the DataFrame :type sort_order: str :return: A DataFrame with the columns sorted in the chosen order :rtype: pandas.DataFrame \"\"\" sorted_col_names = None if sort_order == \"asc\": sorted_col_names = sorted(df.columns) elif sort_order == \"desc\": sorted_col_names = sorted(df.columns, reverse=True) else: raise ValueError( \"['asc', 'desc'] are the only valid sort orders and you entered a sort order of '{sort_order}'\".format( sort_order=sort_order ) ) return df.select(*sorted_col_names)","title":"sort_columns()"},{"location":"reference/quinn/transformations/#quinn.transformations.to_snake_case","text":"Takes a string and converts it to snake case format. Parameters: Name Type Description Default s str The string to be converted. required Returns: Type Description str The string in snake case format. Source code in quinn/transformations.py def to_snake_case(s: str) -> str: \"\"\"Takes a string and converts it to snake case format. :param s: The string to be converted. :type s: str :return: The string in snake case format. :rtype: str \"\"\" return s.lower().replace(\" \", \"_\")","title":"to_snake_case()"},{"location":"reference/quinn/transformations/#quinn.transformations.with_columns_renamed","text":"This is a function designed to rename the columns of a Spark DataFrame . It takes a Callable[[str], str] object as an argument ( fun ) and returns a Callable[[DataFrame], DataFrame] object. When _() is called on a DataFrame , it creates a list of column names, applying the argument fun() to each of them, and returning a new DataFrame with the new column names. Parameters: Name Type Description Default fun Callable [[ str ], str ] Renaming function required Returns: Type Description Callable [[ DataFrame ], DataFrame ] Function which takes DataFrame as parameter. Source code in quinn/transformations.py def with_columns_renamed(fun: Callable[[str], str]) -> Callable[[DataFrame], DataFrame]: \"\"\"This is a function designed to rename the columns of a `Spark DataFrame`. It takes a `Callable[[str], str]` object as an argument (``fun``) and returns a `Callable[[DataFrame], DataFrame]` object. When `_()` is called on a `DataFrame`, it creates a list of column names, applying the argument `fun()` to each of them, and returning a new `DataFrame` with the new column names. :param fun: Renaming function :returns: Function which takes DataFrame as parameter. 
\"\"\" def _(df: DataFrame) -> DataFrame: cols = list( map( lambda col_name: F.col(\"`{0}`\".format(col_name)).alias(fun(col_name)), df.columns, ) ) return df.select(*cols) return _","title":"with_columns_renamed()"},{"location":"reference/quinn/transformations/#quinn.transformations.with_some_columns_renamed","text":"A function that takes a Callable[[str], str] and a Callable[[str], str] and returns a Callable[[DataFrame], DataFrame] , which in turn takes a DataFrame and returns a DataFrame with some of its columns renamed. Parameters: Name Type Description Default fun Callable [[ str ], str ] A function that takes a column name as a string and returns a new name as a string. required change_col_name Callable [[ str ], str ] A function that takes a column name as a string and returns a boolean. required Returns: Type Description `Callable[[DataFrame], DataFrame]` A Callable[[DataFrame], DataFrame] , which takes a DataFrame and returns a DataFrame with some of its columns renamed. Source code in quinn/transformations.py def with_some_columns_renamed( fun: Callable[[str], str], change_col_name: Callable[[str], str] ) -> Callable[[DataFrame], DataFrame]: \"\"\"A function that takes a `Callable[[str], str]` and a `Callable[[str], str]` and returns a `Callable[[DataFrame], DataFrame]`, which in turn takes a `DataFrame` and returns a `DataFrame` with some of its columns renamed. :param fun: A function that takes a column name as a string and returns a new name as a string. :type fun: `Callable[[str], str]` :param change_col_name: A function that takes a column name as a string and returns a boolean. :type change_col_name: `Callable[[str], str]` :return: A `Callable[[DataFrame], DataFrame]`, which takes a `DataFrame` and returns a `DataFrame` with some of its columns renamed. :rtype: `Callable[[DataFrame], DataFrame]` \"\"\" def _(df): cols = list( map( lambda col_name: F.col(\"`{0}`\".format(col_name)).alias(fun(col_name)) if change_col_name(col_name) else F.col(\"`{0}`\".format(col_name)), df.columns, ) ) return df.select(*cols) return _","title":"with_some_columns_renamed()"},{"location":"reference/quinn/extensions/","text":"","title":"Index"},{"location":"reference/quinn/extensions/column_ext/","text":"isFalse(self) This function checks if the column is equal to False and returns the column. Parameters: Name Type Description Default self Column Column required Returns: Type Description Column Column Source code in quinn/extensions/column_ext.py def isFalse(self: Column) -> Column: \"\"\"This function checks if the column is equal to False and returns the column. :param self: Column :return: Column :rtype: Column \"\"\" return self is False isFalsy(self) Returns a Column indicating whether all values in the Column are False or NULL ( falsy ). Each element in the resulting column is True if all the elements in the Column are either NULL or False, or False otherwise. This is accomplished by performing a bitwise or of the isNull condition and a literal False value and then wrapping the result in a when statement. Parameters: Name Type Description Default self Column Column object required Returns: Type Description Column Column object Source code in quinn/extensions/column_ext.py def isFalsy(self: Column) -> Column: \"\"\"Returns a Column indicating whether all values in the Column are False or NULL (**falsy**). Each element in the resulting column is True if all the elements in the Column are either NULL or False, or False otherwise. 
This is accomplished by performing a bitwise or of the ``isNull`` condition and a literal False value and then wrapping the result in a **when** statement. :param self: Column object :returns: Column object :rtype: Column \"\"\" return when(self.isNull() | (self == lit(False)), True).otherwise(False) isNullOrBlank(self) Returns a Boolean value which expresses whether a given column is null or contains only blank characters. Parameters: Name Type Description Default \\*\\*self The :class: Column to check. required Returns: Type Description Column A Column containing True if the column is null or only contains blank characters, or False otherwise. Source code in quinn/extensions/column_ext.py def isNullOrBlank(self: Column) -> Column: \"\"\"Returns a Boolean value which expresses whether a given column is ``null`` or contains only blank characters. :param \\*\\*self: The :class:`Column` to check. :returns: A `Column` containing ``True`` if the column is ``null`` or only contains blank characters, or ``False`` otherwise. :rtype: Column \"\"\" return (self.isNull()) | (trim(self) == \"\") isTrue(self) This function takes a column of type Column as an argument and returns a column of type Column. It evaluates whether each element in the column argument is equal to True, and if so will return True, otherwise False. Parameters: Name Type Description Default self Column Column object required Returns: Type Description Column Column object Source code in quinn/extensions/column_ext.py def isTrue(self: Column) -> Column: \"\"\" This function takes a column of type Column as an argument and returns a column of type Column. It evaluates whether each element in the column argument is equal to True, and if so will return True, otherwise False. :param self: Column object :returns: Column object :rtype: Column \"\"\" return self is True isTruthy(self) Calculates a boolean expression that is the opposite of isFalsy for the given Column self. Parameters: Name Type Description Default self Column The Column to calculate the opposite of isFalsy for. required Returns: Type Description Column A Column with the results of the calculation. Source code in quinn/extensions/column_ext.py def isTruthy(self: Column) -> Column: \"\"\"Calculates a boolean expression that is the opposite of isFalsy for the given ``Column`` self. :param Column self: The ``Column`` to calculate the opposite of isFalsy for. :returns: A ``Column`` with the results of the calculation. :rtype: Column \"\"\" return ~(self.isFalsy())","title":"Column ext"},{"location":"reference/quinn/extensions/column_ext/#quinn.extensions.column_ext.isFalse","text":"This function checks if the column is equal to False and returns the column. Parameters: Name Type Description Default self Column Column required Returns: Type Description Column Column Source code in quinn/extensions/column_ext.py def isFalse(self: Column) -> Column: \"\"\"This function checks if the column is equal to False and returns the column. :param self: Column :return: Column :rtype: Column \"\"\" return self is False","title":"isFalse()"},{"location":"reference/quinn/extensions/column_ext/#quinn.extensions.column_ext.isFalsy","text":"Returns a Column indicating whether all values in the Column are False or NULL ( falsy ). Each element in the resulting column is True if all the elements in the Column are either NULL or False, or False otherwise. 
This is accomplished by performing a bitwise or of the isNull condition and a literal False value and then wrapping the result in a when statement. Parameters: Name Type Description Default self Column Column object required Returns: Type Description Column Column object Source code in quinn/extensions/column_ext.py def isFalsy(self: Column) -> Column: \"\"\"Returns a Column indicating whether all values in the Column are False or NULL (**falsy**). Each element in the resulting column is True if all the elements in the Column are either NULL or False, or False otherwise. This is accomplished by performing a bitwise or of the ``isNull`` condition and a literal False value and then wrapping the result in a **when** statement. :param self: Column object :returns: Column object :rtype: Column \"\"\" return when(self.isNull() | (self == lit(False)), True).otherwise(False)","title":"isFalsy()"},{"location":"reference/quinn/extensions/column_ext/#quinn.extensions.column_ext.isNullOrBlank","text":"Returns a Boolean value which expresses whether a given column is null or contains only blank characters. Parameters: Name Type Description Default \\*\\*self The :class: Column to check. required Returns: Type Description Column A Column containing True if the column is null or only contains blank characters, or False otherwise. Source code in quinn/extensions/column_ext.py def isNullOrBlank(self: Column) -> Column: \"\"\"Returns a Boolean value which expresses whether a given column is ``null`` or contains only blank characters. :param \\*\\*self: The :class:`Column` to check. :returns: A `Column` containing ``True`` if the column is ``null`` or only contains blank characters, or ``False`` otherwise. :rtype: Column \"\"\" return (self.isNull()) | (trim(self) == \"\")","title":"isNullOrBlank()"},{"location":"reference/quinn/extensions/column_ext/#quinn.extensions.column_ext.isTrue","text":"This function takes a column of type Column as an argument and returns a column of type Column. It evaluates whether each element in the column argument is equal to True, and if so will return True, otherwise False. Parameters: Name Type Description Default self Column Column object required Returns: Type Description Column Column object Source code in quinn/extensions/column_ext.py def isTrue(self: Column) -> Column: \"\"\" This function takes a column of type Column as an argument and returns a column of type Column. It evaluates whether each element in the column argument is equal to True, and if so will return True, otherwise False. :param self: Column object :returns: Column object :rtype: Column \"\"\" return self is True","title":"isTrue()"},{"location":"reference/quinn/extensions/column_ext/#quinn.extensions.column_ext.isTruthy","text":"Calculates a boolean expression that is the opposite of isFalsy for the given Column self. Parameters: Name Type Description Default self Column The Column to calculate the opposite of isFalsy for. required Returns: Type Description Column A Column with the results of the calculation. Source code in quinn/extensions/column_ext.py def isTruthy(self: Column) -> Column: \"\"\"Calculates a boolean expression that is the opposite of isFalsy for the given ``Column`` self. :param Column self: The ``Column`` to calculate the opposite of isFalsy for. :returns: A ``Column`` with the results of the calculation. 
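The Column extension methods above (isFalsy, isTruthy, isNullOrBlank, and friends) are assumed, for the purposes of this sketch, to be attached to pyspark's Column class when the extensions module is imported; the sample data and column names are invented:

```python
from pyspark.sql import SparkSession, functions as F
import quinn.extensions  # assumed to monkey-patch the helpers onto Column

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("hi", True), ("  ", False), (None, None)], ["word", "flag"])

df = df.withColumn("word_is_blank", F.col("word").isNullOrBlank())
df = df.withColumn("flag_is_falsy", F.col("flag").isFalsy())
df = df.withColumn("flag_is_truthy", F.col("flag").isTruthy())
df.show()
```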
:rtype: Column \"\"\" return ~(self.isFalsy())","title":"isTruthy()"},{"location":"reference/quinn/extensions/dataframe_ext/","text":"","title":"Dataframe ext"},{"location":"reference/quinn/extensions/spark_session_ext/","text":"create_df(self, rows_data, col_specs) Creates a new DataFrame from the given data and column specs. The returned DataFrame is created using the StructType and StructField classes provided by PySpark. Parameters: Name Type Description Default rows_data array-like the data used to create the DataFrame required col_specs list of tuples list of tuples containing the name and type of the field required Returns: Type Description DataFrame a new DataFrame Source code in quinn/extensions/spark_session_ext.py def create_df(self, rows_data, col_specs): \"\"\"Creates a new DataFrame from the given data and column specs. The returned DataFrame is created using the StructType and StructField classes provided by PySpark. :param rows_data: the data used to create the DataFrame :type rows_data: array-like :param col_specs: list of tuples containing the name and type of the field :type col_specs: list of tuples :return: a new DataFrame :rtype: DataFrame \"\"\" struct_fields = list(map(lambda x: StructField(*x), col_specs)) return self.createDataFrame(data=rows_data, schema=StructType(struct_fields))","title":"Spark session ext"},{"location":"reference/quinn/extensions/spark_session_ext/#quinn.extensions.spark_session_ext.create_df","text":"Creates a new DataFrame from the given data and column specs. The returned DataFrame is created using the StructType and StructField classes provided by PySpark. Parameters: Name Type Description Default rows_data array-like the data used to create the DataFrame required col_specs list of tuples list of tuples containing the name and type of the field required Returns: Type Description DataFrame a new DataFrame Source code in quinn/extensions/spark_session_ext.py def create_df(self, rows_data, col_specs): \"\"\"Creates a new DataFrame from the given data and column specs. The returned DataFrame is created using the StructType and StructField classes provided by PySpark. :param rows_data: the data used to create the DataFrame :type rows_data: array-like :param col_specs: list of tuples containing the name and type of the field :type col_specs: list of tuples :return: a new DataFrame :rtype: DataFrame \"\"\" struct_fields = list(map(lambda x: StructField(*x), col_specs)) return self.createDataFrame(data=rows_data, schema=StructType(struct_fields))","title":"create_df()"}]} \ No newline at end of file diff --git a/search/worker.js b/search/worker.js new file mode 100644 index 00000000..8628dbce --- /dev/null +++ b/search/worker.js @@ -0,0 +1,133 @@ +var base_path = 'function' === typeof importScripts ? '.' 
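Finally, a sketch of the create_df() SparkSession extension documented above. The column specs and rows are illustrative, and it is assumed that importing quinn's extensions attaches create_df to SparkSession:

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType
import quinn.extensions  # assumed to attach create_df to SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.create_df(
    [("jose", 1), ("li", 2)],
    [("name", StringType(), True), ("age", IntegerType(), True)],
)
df.show()
```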