-
Notifications
You must be signed in to change notification settings - Fork 0
/
helper_functions.py
160 lines (123 loc) · 4.54 KB
/
helper_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from datetime import date, datetime
from typing import Callable, Dict, List
import numpy as np
import pandas as pd
# Helper function to iterate over dictionary of dataframes
def reduce_df_dict(
    df_dict: Dict[str, pd.DataFrame], function: Callable[[pd.DataFrame], pd.DataFrame]
) -> Dict[str, pd.DataFrame]:
    """
    Applies a transformation to every DataFrame held in a dictionary.

    Parameters:
        df_dict (dict): A dictionary where keys are identifiers and values are pandas DataFrames.
        function (callable): Called once per DataFrame with that DataFrame as its only
            argument; whatever it returns is stored under the same key.

    Returns:
        dict: A new dictionary with the same keys as `df_dict` (in the same order) whose
            values are the results of calling `function`.
    """
    # Build a fresh mapping; the dictionary passed in is not modified by this function.
    return {name: function(frame) for name, frame in df_dict.items()}
def parse_mors_datestring(datestring: str) -> date:
"""
Helper function to convert a date string in the format "07nov2019" into a date object.
Parameters:
datestring (str): The date string to convert, expected to be in the format "%d%b%Y".
Returns:
date: A date object corresponding to the date string provided.
"""
if isinstance(datestring, str):
date_object = datetime.strptime(str(datestring), "%d%b%Y")
return date_object
else:
pass
def parse_binary_to_boolean(value) -> bool | None:
"""
Converts a value to boolean. `None` or NaN values return `None`.
Non-zero numbers are considered `True`.
Parameters:
value: Value to convert, expected to be interpretable as a float.
Returns:
True if non-zero, False if zero, `None` if input is `None` or cannot be converted.
"""
if pd.isnull(value):
return None
try:
float_cast = float(value)
return bool(float_cast)
except ValueError:
# Handle the case where conversion fails
return None
def parse_float_to_int(value) -> int | None:
"""
Converts a value to an integer. Handles `None` or NaN by returning `None`.
Parameters:
value: Value to convert, expected to be convertible to a float.
Returns:
The integer part of the value, or `None` if input is `None` or cannot be converted.
"""
if pd.isnull(value):
return None
try:
# Convert to float to handle string representations of floats
float_cast = float(value)
# Convert the float to an integer
int_cast = int(float_cast)
return int_cast
except ValueError:
# Handle the case where conversion fails due to an invalid value (e.g., a non-numeric string)
return None
except TypeError:
# Handle other types of conversion errors
return None
def map_dataframe_dtypes(df: pd.DataFrame, dtype_map: Dict[str, str]) -> pd.DataFrame:
    """
    Changes column data types in a dataframe.

    Parameters:
        df: Dataframe to convert; it is left unmodified.
        dtype_map: Dictionary mapping column names to new data types.

    Returns:
        New dataframe with updated column data types.
    """
    # `astype` returns a new object by default, so no separate copy of `df` is needed.
    return df.astype(dtype_map)
def categorise_columns(df: pd.DataFrame, column_names: List[str]):
    """
    Returns a copy of a dataframe in which the named columns are categorical.

    Parameters:
        df: Source dataframe; it is copied, not modified.
        column_names: Names of the columns to turn into categorical columns.

    Returns:
        New dataframe with specified columns as categorical types.
    """
    converted = df.copy()
    # Build every categorical first, then write them back onto the copy.
    as_categorical = {name: pd.Categorical(converted[name]) for name in column_names}
    for name, values in as_categorical.items():
        converted[name] = values
    return converted
def drop_columns(df: pd.DataFrame, column_names: List[str]) -> pd.DataFrame:
    """
    Returns a copy of a dataframe with the named columns removed.

    Parameters:
        df: Source dataframe; it is copied, not modified.
        column_names: Names of columns to remove. Names that are not present in the
            dataframe are ignored.

    Returns:
        New dataframe without the specified columns.
    """
    duplicate = df.copy()
    # errors="ignore" makes missing column names a no-op instead of a KeyError.
    trimmed = duplicate.drop(columns=column_names, errors="ignore")
    return trimmed
def replace_negative_values_with_nan(
    df: pd.DataFrame, column_names: List[str]
) -> pd.DataFrame:
    """
    Replaces negative values with NaN in specified columns.

    Note that the dataframe passed in is modified in place and the same object is
    returned; pass a copy if the original must be preserved.

    Parameters:
        df: Dataframe to modify.
        column_names: Columns to check for negative values. Names that are not columns
            of `df` are skipped.

    Returns:
        The same dataframe object, with negative values replaced by NaN in the
        specified columns.
    """
    for col in column_names:
        if col in df.columns:
            # Vectorised replacement: entries where the comparison is True become NaN,
            # everything else (including existing NaN) is kept as it is.
            df[col] = df[col].mask(df[col] < 0)
    return df