diff --git a/machine learning basics/Random Forest Classifier - Ensemble Example for Bagging.ipynb b/machine learning basics/Random Forest Classifier - Ensemble Example for Bagging.ipynb
new file mode 100644
index 0000000..185a36d
--- /dev/null
+++ b/machine learning basics/Random Forest Classifier - Ensemble Example for Bagging.ipynb
@@ -0,0 +1,128 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Bagging Algorithms\n",
+    "\n",
+    "Bootstrap Aggregation or bagging involves taking multiple samples from your training dataset (with replacement) and training a model for each sample.\n",
+    "\n",
+    "The final output prediction is averaged across the predictions of all of the sub-models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Dependencies: pandas loads the data; scikit-learn provides cross-validation and the model.\n",
+    "import pandas\n",
+    "from sklearn import model_selection\n",
+    "from sklearn.ensemble import RandomForestClassifier"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The Pima Indians Diabetes CSV has no header row, so the column names are supplied explicitly.\n",
+    "url = \"https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv\"\n",
+    "names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']\n",
+    "dataframe = pandas.read_csv(url, header=None, names=names)  # download the CSV straight into a DataFrame"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "X--> [ 6. 148. 72. 35. 0. 33.6 0.627 50. 
]\n",
+      "Y--> 1.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Split the raw values into features (first 8 columns) and the target (column 8, 'class').\n",
+    "array = dataframe.values\n",
+    "X, Y = array[:, :8], array[:, 8]\n",
+    "\n",
+    "# Show the first sample as a quick sanity check of the split.\n",
+    "print(\"X-->\", X[0])\n",
+    "print(\"Y-->\", Y[0])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "seed = 7           # seed for the pseudo-random number generator (passed as random_state)\n",
+    "num_trees = 100    # number of trees in the forest (n_estimators)\n",
+    "max_features = 3   # number of features considered at each split"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Random forest is an extension of bagged decision trees.\n",
+    "\n",
+    "Samples of the training dataset are taken with replacement, but the trees are constructed in a way that reduces the correlation between individual classifiers. Specifically, rather than greedily choosing the best split point in the construction of the tree, only a random subset of features is considered for each split.\n",
+    "\n",
+    "You can construct a Random Forest model for classification using the RandomForestClassifier class.\n",
+    "\n",
+    "The example below builds a Random Forest classifier with 100 trees, with split points chosen from a random selection of 3 features."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Estimate accuracy with k-fold cross-validation.\n",
+    "# shuffle=True is required whenever random_state is set: scikit-learn 0.24+ raises a\n",
+    "# ValueError for KFold(random_state=...) without shuffling, and older releases\n",
+    "# silently ignored the seed.\n",
+    "# NOTE(review): 100 folds is unusually high (10 is the common choice) - confirm this is intended.\n",
+    "kfold = model_selection.KFold(n_splits=100, shuffle=True, random_state=seed)\n",
+    "\n",
+    "# Seed the forest as well, so bootstrap samples and feature subsets are reproducible.\n",
+    "model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features, random_state=seed)\n",
+    "\n",
+    "results = model_selection.cross_val_score(model, X, Y, cv=kfold)  # one accuracy score per fold\n",
+    "print(results.mean())  # mean accuracy across all folds"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}