# CNN_RandomForest_ParameterTuning.py

# -*- coding: utf-8 -*-
"""InformationRetrivalEndSem.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1tfW6hkbM8lv5id6wqACMhYxAkcP2mIAD
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

# Read the three splits (train, test, validation -- in that order) from the
# mounted Google Drive folder.
training_data, testing_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/My Drive/Information Retrival End Sem Kaggle/train.csv",
        "/content/drive/My Drive/Information Retrival End Sem Kaggle/test_x.csv",
        "/content/drive/My Drive/Information Retrival End Sem Kaggle/val.csv",
    )
)

# Notebook-style inspection cells: these expressions display a value in
# Colab and are evaluated (but not shown) when the file runs as a script.
val_data.shape
training_data.shape
testing_data.shape
training_data.columns
val_data.columns
training_data

# Print every entry of validation column '1' that is not exactly zero.
for value in val_data['1']:
    if value == 0.0:
        continue
    print(value)

val_data

"""Dividing the Data into X and Y"""

# The feature columns are named '1' through '46'; the label column is
# 'relevance'.  Build the column list once and reuse it for both splits.
feature_columns = [str(idx) for idx in range(1, 47)]

x_train = training_data[feature_columns]
y_train = training_data[['relevance']]

x_val = val_data[feature_columns]
y_val = val_data[['relevance']]

# The test frame is taken as-is, with no column selection.
# NOTE(review): presumably test_x.csv holds exactly the 46 feature columns
# in the same order -- confirm, since the scaler is fitted on 46 columns.
x_test = testing_data

"""Normalize the data"""

# Learn the per-feature mean/variance on the training split only, then
# apply that same scaling to train, validation and test.
sc = StandardScaler().fit(x_train)
x_train, x_val, x_test = (
    sc.transform(split) for split in (x_train, x_val, x_test)
)

"""### Applying CNN"""

# Import from the public Keras namespace; `tensorflow.python.*` is a private
# implementation path and is not a supported import location.
from tensorflow.keras.layers import Dense
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import accuracy_score

# Small fully connected network (Dense layers only -- no convolutions,
# despite the "CNN" heading): 46 inputs -> 16 -> 32 -> 3-way softmax.
model = Sequential([
    Dense(16, input_shape=(46,), activation='relu'),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
])

model.summary()

# Sparse categorical cross-entropy takes integer class labels directly.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Flatten the single-column label frame to a 1-D array for training.
model.fit(x_train, np.ravel(y_train), batch_size=10, epochs=15)

# `Sequential.predict_classes` no longer exists in current TensorFlow; the
# equivalent for a softmax output is the argmax over the class axis.
y_pred = np.argmax(model.predict(x_val), axis=-1)

# Report validation accuracy (a bare expression would be discarded when
# this file is run as a script rather than as notebook cells).
val_accuracy = accuracy_score(np.ravel(y_val['relevance']), y_pred)
print("Validation accuracy using the neural network:", val_accuracy)

# Class predictions for the unlabeled test split.
y_pred_test = np.argmax(model.predict(x_test), axis=-1)


"""### Applying Random Forest with preprocessing"""

# Notebook-style inspection cells (no visible effect when run as a script).
val_data.shape
training_data.shape
testing_data.shape

# Feature subset used for the random forest: columns '1', '9', '14', '24',
# '38' and '39' are left out.  Defined once and shared by all three splits.
rf_columns = [
    '2', '3', '4', '5', '6', '7', '8', '10', '11', '12', '13',
    '15', '16', '17', '18', '19', '20', '21', '22', '23', '25',
    '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37',
    '40', '41', '42', '43', '44', '45', '46',
]

x_train_rf = training_data[rf_columns]
y_train_rf = training_data[['relevance']]
x_val_rf = val_data[rf_columns]
x_test_rf = testing_data[rf_columns]

# Standardize with statistics learned from the training split only.
sc_rf = StandardScaler().fit(x_train_rf)
x_train_rf = sc_rf.transform(x_train_rf)
x_val_rf = sc_rf.transform(x_val_rf)
x_test_rf = sc_rf.transform(x_test_rf)

from sklearn.decomposition import PCA

# Project the 40 standardized features onto 35 principal components; the
# projection is fitted on the training split and reused for the others.
pca = PCA(n_components=35)
pca.fit(x_train_rf)
x_train_rf, x_val_rf, x_test_rf = (
    pca.transform(split) for split in (x_train_rf, x_val_rf, x_test_rf)
)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyper-parameter values sampled by the randomized search.
# 5 * 5 * 3 * 3 * 2 * 2 = 900 combinations, of which `n_iter` are tried.
random_grid = {
    'n_estimators': [50, 90, 150, 200, 300],
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
}

# F1 is a classification metric, so the search tunes a classifier.
rf = RandomForestClassifier()

# `n_iter` and `random_state` are RandomizedSearchCV parameters (GridSearchCV
# accepts neither), and the macro-averaged F1 scorer is named 'f1_macro'.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='f1_macro',
)

# Flatten the single-column label frame to the 1-D target sklearn expects.
rf_random.fit(x_train_rf, np.ravel(y_train))

# Show the winning configuration (a bare expression prints nothing when
# the file is run as a script).
print("Best random-forest parameters:", rf_random.best_params_)

# Validation predictions from the best estimator found by the search.
pred_val = rf_random.predict(x_val_rf)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Train the classifier that is actually imported; the regressor used
# previously was never imported and would emit continuous values that
# f1_score cannot score.
rand_forest = RandomForestClassifier(n_estimators=90, random_state=42, max_depth=15)

# Flatten the single-column label frame to the 1-D target sklearn expects.
rand_forest.fit(x_train_rf, np.ravel(y_train))

# Predicted class labels for the validation split.
y_pred_rf = rand_forest.predict(x_val_rf)

# The classifier already returns labels, so no argmax step is needed;
# `y_pred` is kept bound to the validation predictions.
y_pred = y_pred_rf

# Macro-averaged F1 on the validation split (the labels come from y_val).
print("F-Score on validation data using random-forest classifier::",
      f1_score(np.ravel(y_val), y_pred_rf, average="macro"))

# Predicted class labels for the unlabeled test split.
y_pred_rf_test = rand_forest.predict(x_test_rf)

"""Loading results into files"""

# Build the submission as a header plus one "row_id,relevance" line per
# random-forest test prediction.  Iterating over the predictions being
# written (rather than the size of another model's output) keeps the row
# count and the values in sync.
submission_lines = ["row_id,relevance"]
submission_lines.extend(
    str(row_id) + "," + str(label)
    for row_id, label in enumerate(y_pred_rf_test)
)

# Joining with "\n" leaves no trailing newline, matching the original
# output format.  The `with` block closes the file, so no explicit close.
with open('/content/drive/My Drive/Information Retrival End Sem Kaggle/newfileone.csv', 'w') as f:
    f.write("\n".join(submission_lines))