-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_wide.py
44 lines (35 loc) · 1.72 KB
/
clean_wide.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pandas as pd
data_path = "data"
file_path = "data_wide.csv"
cleaned_path = "data_wide_cleaned.csv"
df_wide = pd.read_csv(f"{data_path}/{file_path}",
parse_dates=['date'], dayfirst=True)
# We fill NAs by 0
df_wide = df_wide.fillna(0)
# We will not allow for negative values. These are defined as corrections due
# to cases that were subsequently declared negative. So, we will go backwards
# to subtract these corrections from the most recent value above. If this value
# is not high enough, we propagate backwards until no more negative values
# exist
neg_cols = (df_wide.iloc[:, 1:] < 0).any(axis=0)
neg_cols = neg_cols.index[neg_cols]
while len(neg_cols) > 0:
for col in neg_cols:
# Find rows in which the value is negative
indices = df_wide.index[df_wide[col] < 0]
# If the negative value is at the start of the data, then we cannot
# propagate it backwards more. In that case, we set it equal to zero.
# Note that it does not logically make sense that this happens, as
# negative numbers correct values in the past. However, it may
# sometimes happen so our code should take this into account.
if 0 in indices:
df_wide.loc[0, col] = 0
indices = df_wide.index[df_wide[col] < 0]
# Look at indices in reverse
for index in indices[::-1]:
df_wide.loc[index-1, col] = df_wide.loc[index-1, col] + \
df_wide.loc[index, col]
df_wide.loc[index, col] = 0
neg_cols = (df_wide.iloc[:, 1:] < 0).any(axis=0)
neg_cols = neg_cols.index[neg_cols]
df_wide.to_csv(f"{data_path}/{cleaned_path}", index=False)