# Program_Code_python.py

# -*- coding: utf-8 -*-
"""FTDA.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/14Kwru_VSekaXpEN1hUzkXXsXy0uaGmW4

# Importing Required Libraries
"""

from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
from pathlib import Path
# import plotly.offline as py
import plotly.express as px

"""### DRIVE IMPORT"""

# Load the COVID-19 case table (data1) and the daily weather table (data2).
#
# The notebook offered two alternative cells: read from Google Drive, or read
# files uploaded to the runtime. Executed top to bottom, both ran, so every
# CSV was parsed twice and the run failed unless the files existed in BOTH
# places. Choose a single source instead: the working directory when the
# files are present there, otherwise the mounted Drive folder.
covid_csv = Path("covid_19_data.csv")
weather_csv = Path("daily_weather_2020.csv")

if not (covid_csv.exists() and weather_csv.exists()):
    # Imported lazily so the script can also run outside Colab when the
    # CSVs are available locally.
    from google.colab import drive
    drive.mount('/content/drive')
    drive_dir = Path("/content/drive/My Drive/FTDA")
    covid_csv = drive_dir / covid_csv.name
    weather_csv = drive_dir / weather_csv.name

data1 = pd.read_csv(covid_csv)
data2 = pd.read_csv(weather_csv)

"""## Data Cleaning and Transformation"""

# Give both tables a datetime-typed 'ObservationDate' column so they can be
# joined on it later.
data2['ObservationDate'] = pd.to_datetime(data2['time'])
data1['ObservationDate'] = pd.to_datetime(data1['ObservationDate'])

# Fold the Chinese territories into a single 'China' label.
# 'Mainland China' is replaced wherever it appears as a whole cell value
# (any column of data1).
data1.replace(to_replace='Mainland China', value='China', inplace=True)

# Assign the result back instead of calling replace(inplace=True) on
# data1['Country/Region']: that form is a chained in-place update on a
# selected column, which pandas 2.x warns about and which leaves data1
# untouched once Copy-on-Write is enabled.
data1['Country/Region'] = data1['Country/Region'].replace(
    {'Hong Kong': 'China', 'Macau': 'China'})
data1

import missingno as msno

# missingno bar chart for each raw table, case data first.
for raw_table in (data1, data2):
    msno.bar(raw_table)

"""We must remove the precipAccumulation column as it has 80% missing values

---
"""

# Drop the sparse column, then re-draw the chart for the weather table.
data2 = data2.drop(columns='precipAccumulation')
msno.bar(data2)

"""### Merging both tables by State/Province after removing the NA valued rows in both. Removing categorical variables as they are not relevant."""

# Rows without a province/state cannot take part in the join, so filter
# them out of both tables first.
d1 = data1.loc[data1['Province/State'].notna()]
d2 = data2.loc[data2['Province/State'].notna()]

# Join on country, province and date.
join_keys = ['Country/Region', 'Province/State', 'ObservationDate']
data4 = d1.merge(d2, left_on=join_keys, right_on=join_keys)

# Discard the bookkeeping and categorical columns.
unwanted_columns = ['time', 'summary', 'icon', 'Unnamed: 0', 'SNo',
                    'Last Update', 'precipType']
data5 = data4.drop(columns=unwanted_columns)

data5.info()

"""## To save merged dataset if needed for further analysis"""

from google.colab import files

# Write the merged table to the working directory, then pass the same file
# to Colab's download helper.
merged_csv_name = 'merged_data.csv'
data5.to_csv(merged_csv_name)
files.download(merged_csv_name)

"""### Moving key variables (humidity, wind speed, etc.) to the front"""

# (column to move, name it gets at its new position). Only temperatureMax
# is relabelled on the way.
front_columns = [
    ('humidity', 'humidity'),
    ('pressure', 'pressure'),
    ('windSpeed', 'windSpeed'),
    ('temperatureMax', 'temperature(F)'),
    ('precipIntensity', 'precipIntensity'),
    ('dewPoint', 'dewPoint'),
]

# Remove all six columns first, then re-insert them at positions 6..11 in
# the listed order.
moved = [(new_name, data5.pop(old_name)) for old_name, new_name in front_columns]
for position, (new_name, values) in enumerate(moved, start=6):
    data5.insert(position, new_name, values)

msno.bar(data5)

"""## Number of Confirmed Cases vs Date"""

# Total confirmed cases per observation date over every merged row.
timex = data5[['ObservationDate', 'Confirmed']]
# Name the aggregation as a string: passing the builtin `sum` to agg() is
# deprecated in recent pandas releases.
timex = timex.groupby('ObservationDate').agg('sum').reset_index()

# Draw on a dedicated figure and flush it. Without this, when the file runs
# as a plain script the line lands on whichever figure is current and the
# later plt.scatter calls are drawn on top of it.
plt.figure()
plt.plot(timex['ObservationDate'], timex['Confirmed'], color='red')
plt.title('Date vs Total Confirmed in World')
plt.show()
data5.describe()

"""## Considering values till March 15"""

# Single cut-off date shared by the "till" and "after" subsets.
cutoff = pd.Timestamp(2020, 3, 15)

starting = data5[data5['ObservationDate'] <= cutoff]
# 'sum' is passed by name; handing the builtin `sum` to agg() is deprecated
# in recent pandas releases.
perctry = starting.groupby('Country/Region')['Confirmed'].agg('sum').reset_index()
perctry

"""We can see that China has very high number of cases compared to rest"""

# .copy() gives `starting` its own data, so the column edits made to it
# later in the script do not act on a slice of data5.
starting = starting[starting['Country/Region'] == 'China'].copy()
starting

"""## Considering values after March 15"""

rest = data5[data5['ObservationDate'] > cutoff]
perctry = rest.groupby('Country/Region')['Confirmed'].agg('sum').reset_index()
perctry

"""We can remove Denmark and Netherlands"""

# Drop both countries in one pass.
rest = rest[~rest['Country/Region'].isin(['Denmark', 'Netherlands'])].copy()
rest

"""## Correlation

### Considering only China till March 15
"""

# Weather variables plus the target, shared by every correlation table below.
corr_columns = ['humidity', 'pressure', 'windSpeed', 'temperature(F)',
                'precipIntensity', 'dewPoint', 'Confirmed']

starting_corr = starting[corr_columns]
starting_corr.corr().style.background_gradient('RdPu')

"""### Considering all countries except Denmark and Netherlands after March 15"""

rest_corr = rest[corr_columns]
rest_corr.corr().style.background_gradient('magma')

"""### Considering full dataset"""

full_corr = data5[corr_columns]
full_corr.corr().style.background_gradient('viridis')

"""### Correlation matrix for all days after Feb"""

# Keep the date only long enough to filter on it, then discard it so the
# correlation runs over the numeric columns alone.
data_for_corr = data5[['ObservationDate'] + corr_columns]
on_or_after_feb = data_for_corr['ObservationDate'] >= pd.Timestamp(2020, 2, 1)
data_for_corr = data_for_corr[on_or_after_feb].drop(columns='ObservationDate')
data_for_corr.describe()

data_for_corr.corr().style.background_gradient('viridis')

"""# PLOTS

## Pressure Values vs Confirmed Cases and Precipitation Intensity vs Confirmed
"""

# One scatter per weather variable: confirmed cases summed over all rows
# that share the same reading of that variable.
scatter_specs = (
    ('pressure', 'Pressure - Confirmed'),
    ('precipIntensity', 'Precipitation Instensity - Confirmed'),
)
for column, heading in scatter_specs:
    totals = data5.groupby(column, as_index=False)['Confirmed'].sum()
    plt.scatter(totals[column], totals['Confirmed'])
    plt.title(heading)
    plt.show()

"""## Choose a Month to draw Graphs on"""

#@title Choose Month Start and End. For eg if start_month = 2 month of Feb will be considered
start_month =  2#@param {type:"integer"}

# Month number -> label used in the plot titles of the following cells.
d = {2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'jun', 7: 'Jul', 8: 'Aug'}

month = d[start_month]

# Half-open window [first day of the month, first day of the next month).
# The lower bound is inclusive so that observations dated the 1st belong to
# the selected month; a strict '>' would silently drop them. The upper bound
# is derived with a DateOffset so it stays valid when the month is December.
month_start = pd.Timestamp(2020, start_month, 1)
month_end = month_start + pd.DateOffset(months=1)
in_month = (data5['ObservationDate'] >= month_start) & (data5['ObservationDate'] < month_end)
data6 = data5[in_month]
print(data6)

month_corr = data6[['humidity','pressure','windSpeed','temperature(F)','precipIntensity','dewPoint','Confirmed']]
month_corr.corr().style.background_gradient('copper')

"""# Plots for the selected month, without taking into account the low number of cases in certain countries

## Plotting Pressure vs Wind Speed having Intensity values as Confirmed Cases
"""

# (x column, y column, title prefix) for each scatter of the selected month.
# Marker colour and size both encode the confirmed-case count.
month_plots = [
    ('pressure', 'windSpeed', 'Relation between Pressure-windSpeed for '),
    ('humidity', 'dewPoint', 'Relation between Humidity-Dew Point for '),
    ('temperature(F)', 'dewPoint', 'Relation between Temp-Dew Point for '),
]
for x_col, y_col, title_prefix in month_plots:
    fx = px.scatter(
        data6,
        x=x_col,
        y=y_col,
        title=title_prefix + month + ' month',
        hover_name='ObservationDate',
        color='Confirmed',
        size='Confirmed',
    )
    fx.show()

"""# The Plots after considering before and After March 15 and removing few countries"""

# from plotly.subplots import make_subplots

# fig = make_subplots(rows=2, cols=1, subplot_titles=("Relation between Pressure-windSpeed before 15 March", "Plot 2"))

# fig.append_trace(px.scatter(starting, x='pressure', y='windSpeed',title='Relation between Pressure-windSpeed before 15 March',hover_name='ObservationDate',color='Confirmed',size='Confirmed'), row=1, col=1)
# fig.append_trace(px.scatter(rest, x='pressure', y='windSpeed',title='Relation between Pressure-windSpeed after 15 March',hover_name='ObservationDate',color='Confirmed',size='Confirmed'),row =2,col=1)
# fig.update_layout(height=600, width=600, title_text="Stacked Subplots")
# fig.show()

# (frame, x column, y column, title) in display order. `starting` holds the
# China rows up to 15 March, `rest` the remaining countries after that date.
period_plots = [
    (starting, 'pressure', 'windSpeed',
     'Relation between Pressure-windSpeed before 15 March'),
    (rest, 'pressure', 'windSpeed',
     'Relation between Pressure-windSpeed after 15 March'),
    (starting, 'dewPoint', 'temperature(F)',
     'Relation between Dew Point-Temperature before 15 March'),
    (rest, 'humidity', 'precipIntensity',
     'Relation between Humidity-Precipitation Intensity after 15 March'),
    (starting, 'humidity', 'precipIntensity',
     'Relation between Humidity-Precipitation Intensity before 15 March'),
    (rest, 'dewPoint', 'temperature(F)',
     'Relation between Dew Point-Temperature after 15 March'),
]
for frame, x_col, y_col, heading in period_plots:
    fx = px.scatter(
        frame,
        x=x_col,
        y=y_col,
        title=heading,
        hover_name='ObservationDate',
        color='Confirmed',
        size='Confirmed',
    )
    fx.show()

"""# PREDICTION"""

# Expose the temperature column as 'temp', the name the regression formula
# for China uses further down.
#
# Rename it where it stands rather than appending a copy and popping the
# original: that keeps the six weather features at column positions 6-11,
# which the positional `starting.iloc[:, ...]` selections later in the
# script depend on. rename() also returns a new frame, so nothing is
# assigned into a filtered slice of data5.
starting = starting.rename(columns={'temperature(F)': 'temp'})
# Kept so the module-level name `col1` still refers to the temperature series.
col1 = starting['temp']

"""## Considering whole Dataset

Pressure and PrecipIntensity had highest correlation
"""

# Imported at first use, like the file's other notebook-style imports.
# Nothing earlier in the script brings `sm` or `ols` into scope, so without
# these two lines the fit below fails with a NameError.
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Ordinary least squares of confirmed cases on pressure and precipitation
# intensity, over the whole merged dataset.
model = ols('Confirmed ~ pressure + precipIntensity', data=data5).fit()
print(model.summary())

# One regression-diagnostics figure per explanatory variable.
for regressor in ('pressure', 'precipIntensity'):
    fig = plt.figure(figsize=(12,8))
    fig = sm.graphics.plot_regress_exog(model, regressor, fig=fig)

"""The R squared value is very low and residual plots are not good

### Applying Linear Regression per parameter and all parameters taken at once. XGBoost Regression also done in similar fashion
"""

from sklearn import linear_model
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from math import sqrt

# Column positions 6-11 of data5 are the six weather features; each one is
# tried on its own and then all six together.
# NOTE: every "Accuracy" line prints estimator.score() on the TRAINING split
# (for these regressors that is R^2, shown as a percentage), not a
# classification accuracy.
feature_positions = range(6, 12)
y = data5['Confirmed']

print('Linear Regression\n')
reg = linear_model.LinearRegression()
for position in feature_positions:
    print(data5.columns.values[position])
    x = data5.iloc[:, position]
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=7)
    # A single feature has to be presented as an (n, 1) array.
    X_train = X_train.to_numpy().reshape(-1, 1)
    y_train = y_train.to_numpy().reshape(-1, 1)
    reg.fit(X_train, y_train)
    acc_logg = round(reg.score(X_train, y_train) * 100, 2)
    print("Accuracy:" + str(acc_logg), "%")
    y_pred = reg.predict(X_test.to_numpy().reshape(-1, 1))
    print("Mean squared error:" + str(mean_squared_error(y_test, y_pred)))
    df = pd.DataFrame({'Actual': y_test.to_numpy().flatten(),
                       'Predicted': y_pred.flatten()})

print('\nMultiple Linear Regression\n')
x = data5.iloc[:, 6:12]
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=7)
reg.fit(X_train, y_train)
acc_logg = round(reg.score(X_train, y_train) * 100, 2)
print("Accuracy:" + str(acc_logg), "%")
y_pred = reg.predict(X_test)
print("Mean squared error:" + str(mean_squared_error(y_test, y_pred)))
df = pd.DataFrame({'Actual': y_test.to_numpy().flatten(),
                   'Predicted': y_pred.flatten()})
df

print('\nXgboost regression\n')
# Hyper-parameters shared by every XGBoost model in this cell.
xgb_settings = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 10,
}
for position in feature_positions:
    print(data5.columns.values[position])
    x = data5.iloc[:, position]
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=7)
    xg_reg = xgb.XGBRegressor(**xgb_settings)
    X_train = X_train.to_numpy().reshape(-1, 1)
    y_train = y_train.to_numpy().reshape(-1, 1)
    xg_reg.fit(X_train, y_train)
    acc_logg = round(xg_reg.score(X_train, y_train) * 100, 2)
    print("Accuracy:" + str(acc_logg), "%")

print('\nMultiple XGBoost Regression\n')
x = data5.iloc[:, 6:12]
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=7)
xg_reg = xgb.XGBRegressor(**xgb_settings)
xg_reg.fit(X_train, y_train)
acc_logg = round(xg_reg.score(X_train, y_train) * 100, 2)
print("Accuracy:" + str(acc_logg), "%")
y_pred = xg_reg.predict(X_test)
print("Mean squared error:" + str(mean_squared_error(y_test, y_pred)))

"""**Even after applying XGBoost, the maximum accuracy is less than 7%**

## For China Alone till March 15

Considering temp and dewPoint as they had maximum correlation
"""

# OLS of confirmed cases on temperature and dew point, China rows only.
china_formula = 'Confirmed ~ temp + dewPoint'
model = ols(china_formula, data=starting).fit()
print(model.summary())

x = starting[['temp', 'dewPoint']]
y = starting['Confirmed']
print('Multiple Linear Regression\n')
# The split is seeded, so the same partition serves both models below.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=7)

reg.fit(X_train, y_train)
acc_logg = round(reg.score(X_train, y_train) * 100, 2)
print("Accuracy:" + str(acc_logg), "%")
y_pred = reg.predict(X_test)
print("Mean squared error:" + str(mean_squared_error(y_test, y_pred)))

xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=10,
)
xg_reg.fit(X_train, y_train)
acc_logg = round(xg_reg.score(X_train, y_train) * 100, 2)
print('\nXGBoost Regression\n')
print("Accuracy:" + str(acc_logg), "%")
y_pred = xg_reg.predict(X_test)
print("Mean squared error:" + str(mean_squared_error(y_test, y_pred)))

# Regression-diagnostics figure for each term of the OLS model.
for regressor in ('dewPoint', 'temp'):
    fig = plt.figure(figsize=(12, 8))
    fig = sm.graphics.plot_regress_exog(model, regressor, fig=fig)

# The weather features are addressed by NAME here. `starting` does not share
# data5's column layout (its 'temperature(F)' column was replaced by 'temp'),
# so slicing it with data5's positions 6-11 while labelling the output from
# data5.columns can pair a score with the wrong variable and pull in a
# non-weather column.
china_features = ['humidity', 'pressure', 'windSpeed', 'temp',
                  'precipIntensity', 'dewPoint']
y = starting['Confirmed']

# Hyper-parameters shared by every XGBoost model in this cell.
xgb_settings = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 10,
}

# NOTE: the "Accuracy" lines print estimator.score() on the training split,
# which for these regressors is R^2 expressed as a percentage.
print('Linear Regression\n')
reg = linear_model.LinearRegression()
for feature in china_features:
    print(feature)
    x = starting[[feature]]  # double brackets keep the feature 2-D
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)
    reg.fit(X_train, y_train)
    acc_logg = round(reg.score(X_train, y_train) * 100, 2)
    print("Accuracy:",end='')
    print(acc_logg, "%")
    y_pred = reg.predict(X_test)
    print("Mean squared error:",end='')
    print(mean_squared_error(y_test,y_pred))
    df = pd.DataFrame({'Actual': np.asarray(y_test).flatten(), 'Predicted': np.asarray(y_pred).flatten()})

print('\nMultiple Linear Regression\n')
x = starting[china_features]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)
reg.fit(X_train, y_train)
acc_logg = round(reg.score(X_train, y_train) * 100, 2)
print("Accuracy:",end='')
print(acc_logg, "%")
y_pred = reg.predict(X_test)
print("Mean squared error:",end='')
print(mean_squared_error(y_test,y_pred))
df = pd.DataFrame({'Actual': np.asarray(y_test).flatten(), 'Predicted': np.asarray(y_pred).flatten()})
df

###    Xgboost regression   ###
print('\nXgboost regression\n')

for feature in china_features:
    print(feature)
    x = starting[[feature]]
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)
    xg_reg = xgb.XGBRegressor(**xgb_settings)
    xg_reg.fit(X_train, y_train)
    acc_logg = round(xg_reg.score(X_train, y_train) * 100, 2)
    print("Accuracy:",end='')
    print(acc_logg, "%")

print('\nMultiple XGBoost Regression\n')
x = starting[china_features]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)
xg_reg = xgb.XGBRegressor(**xgb_settings)
xg_reg.fit(X_train, y_train)
acc_logg = round(xg_reg.score(X_train, y_train) * 100, 2)
print("Accuracy:",end='')
print(acc_logg, "%")
y_pred = xg_reg.predict(X_test)
print("Mean squared error:",end='')
print(mean_squared_error(y_test,y_pred))

"""## Tree based methods:"""

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn import tree

x = data5.iloc[:,6:12]
y = data5['Confirmed']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

# Two regression trees that differ only in their depth limit.
dtree1 = DecisionTreeRegressor(max_depth=2)
dtree2 = DecisionTreeRegressor(max_depth=6)
dtree1.fit(X_train, y_train)
dtree2.fit(X_train, y_train)

# Predictions on the training data
tr1 = dtree1.predict(X_train)
tr2 = dtree2.predict(X_train)

# Predictions on the testing data
y1 = dtree1.predict(X_test)
y2 = dtree2.predict(X_test)


def _print_rmse_and_r2(split_name, actual, predicted):
    """Print the RMSE and R^2 of `predicted` against `actual`.

    The first value is the square root of the mean squared error, so it is
    labelled as a root mean squared error.
    """
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    print(split_name + ' Root Mean Squared Error ' + str(round(rmse, 4)))
    print('R2 score ' + str(round(r2_score(actual, predicted), 4)))


print('With max depth as 2\n')
_print_rmse_and_r2('Train', y_train, tr1)
_print_rmse_and_r2('Test', y_test, y1)
print('')

print('\nWith max depth as 6\n')
_print_rmse_and_r2('Train', y_train, tr2)
_print_rmse_and_r2('Test', y_test, y2)

"""Decision tree visualization"""

# Decision tree visualization (text)
# Label the splits with the training column names. feature_names expects one
# name per feature; passing the data matrix (x.values) labels each node with
# a row of numbers instead.
feature_labels = list(x.columns)

text_rep1 = tree.export_text(dtree1, feature_names=feature_labels)
text_rep2 = tree.export_text(dtree2, feature_names=feature_labels)
print("Tree with depth: 2")
print(text_rep1)
print("Tree with depth: 6")
print(text_rep2)

# Decision tree visualization (figure), one figure per tree.
# class_names is omitted: these are regression trees, so there are no
# classes to name.
for fitted_tree in (dtree1, dtree2):
    fig = plt.figure(figsize=(25,20))
    _ = tree.plot_tree(fitted_tree,
                       feature_names=feature_labels,
                       filled=True)

"""## SVM"""

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

from matplotlib import pyplot as plt
# %matplotlib inline

# 'Confirmed' is a case count, i.e. a continuous regression target. A
# classifier (SVC) would treat every distinct count as its own class and
# make the confusion matrix meaningless, so the support-vector REGRESSOR is
# used and scored with R^2 and mean squared error, like the other models in
# this file.
x = data5.iloc[:,6:12]
y = data5['Confirmed']
# Seeded so both kernels are compared on the same, reproducible split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=7)

for kernel in ('linear', 'rbf'):
    model = SVR(kernel=kernel)
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    # For a regressor, score() is the coefficient of determination R^2.
    percentage = model.score(x_test, y_test)

    print(f"SVM ({kernel})")
    print(f"Test Set: {len(x_test)}")
    print(f"R2 score = {percentage*100} %")
    print(f"Mean squared error = {mean_squared_error(y_test, predictions)}")