This is my maiden voyage, at least when it comes to Kaggle contests! In this work we build a deep neural network in its original form, writing the forward and backward propagation ourselves rather than relying on packages such as TensorFlow, Keras, or MXNet. We then compare the network's results with other statistical methods, using the popular scikit-learn library to build those models.
We follow these steps:
1. Load the packages and the data.
2. Explore the data graphically.
3. Engineer features (titles, family size, family survival, categorical encodings).
4. Split the data and train a from-scratch neural network with forward and backward propagation.
5. Compare its accuracy with a range of scikit-learn classifiers.
#load packages
import sys #access to system parameters https://docs.python.org/3/library/sys.html
print("Python version: {}". format(sys.version))
import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
print("pandas version: {}". format(pd.__version__))
import matplotlib #collection of functions for scientific and publication-ready visualization
print("matplotlib version: {}". format(matplotlib.__version__))
import numpy as np #foundational package for scientific computing
print("NumPy version: {}". format(np.__version__))
import scipy as sp #collection of functions for scientific computing and advance mathematics
print("SciPy version: {}". format(sp.__version__))
import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
print("IPython version: {}". format(IPython.__version__))
import sklearn #collection of machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))
#misc libraries
import random
import time
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)
Python version: 3.8.5 (default, Sep 3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]
pandas version: 1.1.3
matplotlib version: 3.3.2
NumPy version: 1.19.2
SciPy version: 1.5.2
IPython version: 7.19.0
scikit-learn version: 0.23.2
-------------------------
We will use the popular scikit-learn library to develop our machine learning algorithms. In sklearn, algorithms are called Estimators and are implemented in their own classes. For data visualization, we will use the matplotlib and seaborn libraries. Below are the common classes to load.
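Every Estimator exposes the same construct / fit / predict interface, which is why the model comparison at the end only changes the class being instantiated. A minimal sketch of that pattern on toy data (LogisticRegression chosen arbitrarily as the example, not as a recommendation):
# Minimal sketch of the sklearn Estimator pattern: construct, fit, predict, score.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_demo = np.random.rand(100, 3)                 # toy features
y_demo = (X_demo[:, 0] > 0.5).astype(int)       # toy labels
Xtr, Xte, ytr, yte = train_test_split(X_demo, y_demo, random_state=0)
clf = LogisticRegression()                      # 1. construct the estimator
clf.fit(Xtr, ytr)                               # 2. fit on training data
print(accuracy_score(yte, clf.predict(Xte)))    # 3. predict and evaluate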
# #Common Model Algorithms
# from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
# from xgboost import XGBClassifier
# #Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import accuracy_score
#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
# from pandas.tools.plotting import scatter_matrix
#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8
df_train = pd.read_csv("train1.csv")
df_test = pd.read_csv("test1.csv")
NUMERIC_COLUMNS=['Alone','Family Size','Sex','Pclass','Fare','FareBand','Age','TitleCat','Embarked'] #72
ORIGINAL_NUMERIC_COLUMNS=['Pclass','Age','SibSp','Parch','Sex','Title_Master', 'Title_Miss','Title_Mr', 'Title_Mrs', 'Title_Millitary','Embarked'] #83
REVISED_NUMERIC_COLUMNS=['Sex', 'Em_Cl_Survival','Pclass_1','Pclass_2','Pclass_3','Age','SibSp','Parch','Family_Survival','IsAlone','Title_Master', 'Title_Miss','Title_Mr', 'Title_Mrs', 'Title_Millitary','Embarked_C','Embarked_Q','Embarked_S'] #84
# create test and training data
data_to_train = df_train[REVISED_NUMERIC_COLUMNS].fillna(-1000)
X=data_to_train.values
Y=df_train['Survived'].values
Y = Y.reshape((Y.size,1))
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.25,random_state=21, stratify=Y)
print(X_train[:2,:],Y_train.shape)
[[ 0. 1. 1. 0. 0. 38. 1. 0. 0.5 0. 0. 0. 0. 1. 0. 1. 0. 0. ] [ 1. 0.05882353 0. 0. 1. 21. 0. 0. 0.5 1. 0. 0. 1. 0. 0. 0. 0. 1. ]] (668, 1)
# Explore Data Graphically
grid = sns.FacetGrid(df_train, col = "Pclass", row = "Sex", hue = "Survived", palette = 'seismic')
grid = grid.map(plt.scatter, "PassengerId", "Age")
grid.add_legend()
grid
grid = sns.FacetGrid(df_train, col = "Parch", row = "Sex", hue = "Survived", palette = 'seismic')
grid = grid.map(plt.scatter, "PassengerId", "Age")
grid.add_legend()
grid
g = sns.pairplot(df_train[[u'Survived', u'Pclass', u'Sex', u'Age', u'Parch', u'Fare', u'Embarked']], hue='Survived', palette = 'seismic', height=4, diag_kind = 'kde', diag_kws=dict(shade=True), plot_kws=dict(s=50))
g.set(xticklabels=[])
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True,figsize=(12,6))
sns.boxplot(data = df_train, x = "Pclass", y = "Fare",ax=ax1);
plt.figure(1)
sns.boxplot(data = df_train, x = "Embarked", y = "Fare",ax=ax2);
plt.show()
# Visualise Age Data
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
axis1.set_title('Training Age values - Titanic')
axis2.set_title('Test Age values - Titanic')
# plot original Age values
df_train['Age'].dropna().astype(int).hist(bins=70, ax=axis1)
#df_test['Age'].dropna().astype(int).hist(bins=70, ax=axis1)
# plot new Age Values
#df_train['Age'].hist(bins=70, ax=axis2)
df_test['Age'].hist(bins=70, ax=axis2)
# peaks for survived/not survived passengers by their age
facet = sns.FacetGrid(df_train, hue="Survived",palette = 'seismic',aspect=4)
facet.map(sns.kdeplot,'Age',shade= True)
facet.set(xlim=(0, df_train['Age'].max()))
facet.add_legend()
#quick-and-dirty code to split the title from the name: http://www.pythonforbeginners.com/dictionary/python-split
data = pd.concat([df_train, df_test],ignore_index=True)
# Title handling
data.loc[1305,"Name"] = "Oliva y Ocana, Mrs. Dona. Fermina" # Missing title for Mrs Oliva
data['Title'] = data['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
# Fill age based on title
for title in data.groupby(['Title']).groups.keys():
    age_to_impute = data.groupby('Title')['Age'].median()[title]
    data.loc[(data['Age'].isnull()) & (data['Title'] == title), 'Age'] = age_to_impute
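The same imputation can be done without an explicit loop (a sketch of an equivalent, vectorized form, shown here commented out so it does not run twice):
# Equivalent vectorized form of the loop above (a sketch): fill each missing Age with
# the median Age of passengers sharing the same Title.
# data['Age'] = data['Age'].fillna(data.groupby('Title')['Age'].transform('median'))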
# #Unify common titles.
# data["Title"] = data["Title"].replace('Mlle', 'Miss')
# data["Title"] = data["Title"].replace('Master', 'Master')
# data["Title"] = data["Title"].replace(['Mme', 'Dona', 'Ms'], 'Mrs')
# data["Title"] = data["Title"].replace(['Jonkheer','Don'],'Mr')
# data["Title"] = data["Title"].replace(['Capt','Major', 'Col','Rev'], 'Millitary')
# data["Title"] = data["Title"].replace(['Lady', 'the Countess', 'Countess','Sir'], 'Honor')
# # convert Title categories to Columns
# titledummies=pd.get_dummies(data[['Title']], prefix_sep='_') #Title
# data = pd.concat([data, titledummies], axis=1)
# print('Title categories added')
# Replacing titles with the mean survival rate
order_list = data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
title = order_list.Title.values
surv_mean = order_list.Survived.values
title_mapping = dict(zip(title,surv_mean))
data.loc[:,"Title"] = data["Title"].map(title_mapping)
print('Title categories added')
# Replacing Embarked with the mean survival rate
order_list = data[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
title = order_list.Embarked.values
surv_mean = order_list.Survived.values
title_mapping = dict(zip(title,surv_mean))
data.loc[:,"Embarked"] = data["Embarked"].map(title_mapping)
data.loc[data['Embarked'].isnull()==True,"Embarked"] = data["Embarked"].median()
print('Embarked categories Digitized')
# Digitize Sex
data.loc[:,"Sex"] = data["Sex"].replace(['male'], 1)
data.loc[:,"Sex"] = data["Sex"].replace(['female'], 0)
#Discrete variables
data['Family_Size'] = data ['SibSp'] + data['Parch'] + 1
data['IsAlone'] = 1 #initialize to yes/1 is alone
data.loc[data['Family_Size'] > 1,"IsAlone"] = 0 # now update to no/0 if family size is greater than 1
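The IsAlone flag can equivalently be written as a one-liner (a sketch of the same logic, commented out):
# Equivalent one-line form of the IsAlone flag above (a sketch):
# data['IsAlone'] = (data['Family_Size'] == 1).astype(int)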
# Replacing Pclass with the mean survival rate
order_list = data[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()
title = order_list.Pclass.values
surv_mean = order_list.Survived.values
title_mapping = dict(zip(title,surv_mean))
data.loc[:,"Pclass"] = data["Pclass"].map(title_mapping)
print('Pclass categories digitized')
data.loc[data['Fare'].isnull()==True,"Fare"] = data["Fare"].median()
# # convert Pclass categories to Columns
# titledummies =pd.get_dummies(data['Pclass'])
# titledummies.rename(columns={1:'Pclass_1',2:'Pclass_2',3:'Pclass_3'}, inplace=True)
# data = pd.concat([data, titledummies], axis=1)
# # data = data.drop(["Cabin","Name","Ticket","Title"],axis =1)
Title categories added
Embarked categories Digitized
Pclass categories digitized
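The same mean-survival (target) encoding was applied by hand to Title, Embarked, and Pclass above; a small helper, sketched here with a hypothetical name, captures the repeated pattern:
# Hypothetical helper consolidating the repeated target-encoding pattern above:
# map each category of `col` to the mean Survived rate observed for that category.
def encode_by_survival_mean(df, col):
    mapping = df.groupby(col)['Survived'].mean()   # rows with NaN Survived (test set) are ignored
    return df[col].map(mapping)
# e.g. data['Title'] = encode_by_survival_mean(data, 'Title')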
df_train["Family Size"] = (df_train['SibSp'] + df_train['Parch'] + 1)
df_test["Family Size"] = df_test['SibSp'] + df_test['Parch'] + 1
print('Family size feature created')
Family size feature created
# Age vs PassengerId for the two outcome groups
# (x1 and x2 are assumed here to be the survived / not-survived subsets of the training data)
x1 = df_train[df_train['Survived'] == 1]
x2 = df_train[df_train['Survived'] == 0]
plt.figure(3)
sns.jointplot(data=x1, x='PassengerId', y='Age', kind='scatter',color='b')
plt.figure(4)
sns.jointplot(data=x2, x='PassengerId', y='Age', kind='scatter',color='r')
Credit for this family-survival feature goes to https://www.kaggle.com/shunjiangxu/blood-is-thicker-than-water-friendship-forever
# get last name
data["Last_Name"] = data['Name'].apply(lambda x: str.split(x, ",")[0])
# Set survival value
DEFAULT_SURVIVAL_VALUE = 0.5
data["Family_Survival"] = DEFAULT_SURVIVAL_VALUE
# Find Family groups by Fare
for grp, grp_df in data[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                         'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
print("Number of passengers with family survival information:",
      data.loc[data['Family_Survival']!=0.5].shape[0])
# Find Family groups by Ticket
for _, grp_df in data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival'] == 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
print("Number of passengers with family/group survival information: "
      +str(data[data['Family_Survival']!=0.5].shape[0]))
# # Family_Survival in df_train and df_test:
# df_train["Family_Survival"] = data['Family_Survival'][:891]
# df_test["Family_Survival"] = data['Family_Survival'][891:]
Number of passengers with family survival information: 420
Number of passengers with family/group survival information: 546
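A quick way to see how much signal the new feature carries is to look at its distribution and at survival within each value (a small check, not part of the original pipeline):
# Optional sanity check: distribution of the engineered Family_Survival feature and
# how survival varies across its values on the training rows.
print(data['Family_Survival'].value_counts())
print(data[:891].groupby('Family_Survival')['Survived'].mean())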
# Survival expectation value by sex and pclass
DEFAULT_SURVIVAL_VALUE = 0.5
data["Sx_Pa_Survival"] = DEFAULT_SURVIVAL_VALUE
df_train = data[:891]
list_groups = df_train.groupby(["Parch","Sex"])["Sx_Pa_Survival"].count().values
surv_list_groups = df_train.groupby(["Parch","Sex","Survived"])["Sx_Pa_Survival"].count().values
surv = surv_list_groups[1:14:2]
surv_rate1 = surv/list_groups[:7]
for grp in data.groupby(["Parch","Sex"]):
    # print(grp[0])
    if grp[0] == (0,0): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[0]
    if grp[0] == (0,1): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[1]
    if grp[0] == (1,0): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[2]
    if grp[0] == (1,1): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[3]
    if grp[0] == (2,0): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[4]
    if grp[0] == (2,1): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[5]
    if grp[0] == (3,0): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = surv_rate1[6]
    if grp[0] == (3,1): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = 0
    if grp[0] == (4,0): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = 0
    if grp[0] == (4,1): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = 0
    if grp[0] == (5,0): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = .25
    if grp[0] == (5,1): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = 0
    if grp[0] == (6,0): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = 0
    if grp[0] == (6,1): data.loc[grp[1].index.values,["Sx_Pa_Survival"]] = .5
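A more compact way to build this kind of (Parch, Sex) survival-rate feature is sketched below; it computes every group's rate from the training rows instead of hand-assigning the sparse groups, and writes to a hypothetical alternative column so it does not overwrite the feature above.
# Compact alternative (a sketch): compute a (Parch, Sex) -> mean-survival map from the
# training rows and apply it to everyone, falling back to the default for unseen groups.
rate_map = data[:891].groupby(['Parch', 'Sex'])['Survived'].mean().to_dict()
data['Sx_Pa_Survival_alt'] = [rate_map.get((p, s), DEFAULT_SURVIVAL_VALUE)
                              for p, s in zip(data['Parch'], data['Sex'])]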
To carry out the machine learning algorithms, we split the data into training and test sets. The goal is to achieve good accuracy on both sets.
NUMERIC_COLUMNS=['Alone','Family Size','Sex','Pclass','Fare','FareBand','Age','TitleCat','Embarked'] #72
ORIGINAL_NUMERIC_COLUMNS=['Pclass','Age','SibSp','Parch','Sex','Title_Master', 'Title_Miss','Title_Mr', 'Title_Mrs', 'Title_Millitary','Embarked'] #83
REVISED_NUMERIC_COLUMNS=[ 'Family_Survival','Sex', 'Fare','Pclass','Age','SibSp','Parch','IsAlone','Title','Embarked'] #84
#,'Sx_Cl_Survival','Sx_Em_Survival',"Sx_Si_Survival","Sx_Pa_Survival"
# create test and training data
data_to_train = df_train[REVISED_NUMERIC_COLUMNS].fillna(-1000)
X=data_to_train.values
Y=df_train['Survived'].values
Y = Y.reshape((Y.size,1))
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=0.25,random_state=21, stratify=Y)
print(X_train[:2,:],Y_train.shape)
[[0.5 0. 0.13913574 1. 0.475 1. 0. 0. 0.792 1. ] [0.5 1. 0.01522459 0.38492872 0.2625 0. 0. 1. 0.15667311 0.60869565]] (668, 1)
def layer_sizes(X, Y):
    n_x = X.shape[1]
    n_h = 50 # size of hidden layer 1
    n2 = 15  # size of hidden layer 2
    n3 = 15  # size of hidden layer 3
    n_y = 1  # size of output layer
    return (n_x, n_h, n2, n3, n_y)
n_x, n_h, n2, n3, n_y = layer_sizes(X, Y)
print("Input size: {}, hidden layer 1 size: {}, hidden layer 2 size: {}". format(n_x,n_h, n2))
print(" hidden layer 3: size {}, output layer size: {}".format( n3, n_y))
parameters = initialize_parameters(n_x, n_h, n2,n3, n_y)
def initialize_parameters(n_x, n_h, n2, n3, n_y):
    np.random.seed(2) # fix the seed so the (otherwise random) initialization is reproducible
    W1 = np.random.randn(n_x,n_h)*0.005
    b1 = np.zeros((1,n_h))
    W2 = np.random.randn(n_h,n2)*0.005
    b2 = np.zeros((1,n2))
    W3 = np.random.randn(n2,n3)*0.005
    b3 = np.zeros((1,n3))
    W4 = np.random.randn(n3,n_y)*0.0075
    b4 = np.zeros((1,n_y))
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2,
                  "W3": W3,
                  "b3": b3,
                  "W4": W4,
                  "b4": b4}
    return parameters
# Data Standardization
from sklearn import preprocessing
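The per-row scaling that is commented out inside forward_propagation below scales each sample across its own features; a more conventional choice (a sketch, not what the model below actually uses) is to fit a scaler on the training split only and reuse it for the held-out split:
# Conventional alternative to the per-row scaling commented out below (a sketch):
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler().fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)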
def forward_propagation(X, parameters):
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]
    W4 = parameters["W4"]
    b4 = parameters["b4"]
    # X = preprocessing.scale(X,axis=1)
    Z1 = np.dot(X,W1)+b1
    A1 = np.tanh(Z1)
    Z2 = np.dot(A1,W2)+b2
    A2 = np.tanh(Z2)
    Z3 = np.dot(A2,W3)+b3
    A3 = np.tanh(Z3)
    Z4 = np.dot(A3,W4)+b4
    A4 = sigmoid(Z4)
    # print("DB W1 = %s, b1 = %s, W2 = %s, b2 = %s, W3 = %s, b3 = %s"%(W1.shape,b1.shape,W2.shape,b2.shape,W3.shape,b3.shape))
    # assert(A2.shape == (1, X.shape[1]))
    cache = {"Z1": Z1,
             "A1": A1,
             "Z2": Z2,
             "A2": A2,
             "Z3": Z3,
             "A3": A3,
             "Z4": Z4,
             "A4": A4}
    return A4, cache
def sigmoid(x):
    """
    Compute the sigmoid of x
    Arguments:
    x -- A scalar or numpy array of any size.
    Return:
    s -- sigmoid(x)
    """
    s = 1/(1+np.exp(-x))
    return s
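For very negative inputs, np.exp(-x) can overflow and emit warnings (which we silenced at the top). If that ever becomes a problem, scipy's expit is a numerically safe drop-in (a sketch, commented out):
# Numerically safe drop-in for sigmoid (a sketch):
# from scipy.special import expit
# def sigmoid(x):
#     return expit(x)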
def compute_cost(A4, Y, parameters):
    m = Y.size # number of examples
    cost = np.sum((A4-Y)**2)/m
    # cost = -np.sum(Y*np.log(A2))/m
    # print(A2)
    # print(m)
    # logprobs = np.multiply(np.log(A2),Y) + np.multiply(np.log(1-A2),1-Y)
    # cost = - np.sum(logprobs) * (1./m)
    # cost = np.squeeze(cost) # makes sure cost is the dimension we expect.
    # E.g., turns [[17]] into 17
    # assert(isinstance(cost, float))
    return cost
def StestpestDescent(parameters, cache, X, Y):
    m = X.shape[0]
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    W3 = parameters["W3"]
    W4 = parameters["W4"]
    # Z1 = cache["Z1"]
    Z2 = cache["Z2"]
    A1 = cache["A1"]
    A2 = cache["A2"]
    A3 = cache["A3"]
    A4 = cache["A4"]
    d4 = (A4-Y) #* Z2*(1-Z2)
    # print("debug d3 = ",d3.shape)
    d3 = np.dot(d4,W4.T)*(1- np.power(A3,2))
    d2 = np.dot(d3,W3.T)*(1- np.power(A2,2))
    d1 = np.dot(d2,W2.T)*(1- np.power(A1,2))
    # print("DB A1 = %s W2 =%s d1 = %s" %(A1.shape,W2.shape,d1.shape))
    dW4 = np.dot(A3.T,d4)/m
    db4 = np.sum(d4,axis=0,keepdims=True)/m
    dW3 = np.dot(A2.T,d3)/m
    # print("BD1 dW3 =%s, W3 = %s"%(dW3.shape,W3.shape))
    db3 = np.sum(d3,axis=0,keepdims=True)/m
    dW2 = np.dot(A1.T,d2)/m
    # print("BD2 dW2 =%s, W2 = %s"%(dW2.shape,W2.shape))
    db2 = np.sum(d2,axis=0,keepdims=True)/m
    # print("DB3 dW1",db2.T)
    # print("DB3 d1=",d1.T)
    dW1 = np.dot(X.T,d1)/m
    # print("debug dW1 =%s, W1 = %s"%(dW1.shape,W1.shape))
    db1 = np.sum(d1,axis=0,keepdims=True)/m
    # print("debug db1 =%s, b1 = %s"%(db1.shape,parameters["b1"].shape))
    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2,
             "dW3": dW3,
             "db3": db3,
             "dW4": dW4,
             "db4": db4}
    return grads
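The factors (1 - np.power(A, 2)) used above are the derivative of the tanh activations: d/dz tanh(z) = 1 - tanh(z)^2, which is why the cached activations can be reused directly. A one-line numerical check of that identity (a small aside, not part of the training loop):
# Check of the identity used in the backward pass: d/dz tanh(z) = 1 - tanh(z)**2 = sech(z)**2
z = np.linspace(-3, 3, 7)
assert np.allclose(1 - np.tanh(z)**2, 1/np.cosh(z)**2)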
def update_parameters(parameters, grads, learning_rate = .75):
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    W3 = parameters["W3"]
    b3 = parameters["b3"]
    W4 = parameters["W4"]
    b4 = parameters["b4"]
    dW1 = grads["dW1"]
    db1 = grads["db1"]
    dW2 = grads["dW2"]
    db2 = grads["db2"]
    dW3 = grads["dW3"]
    db3 = grads["db3"]
    dW4 = grads["dW4"]
    db4 = grads["db4"]
    # rate = .01 #np.linspace(0,learning_rate = 5,1000)
    W1 = W1 - learning_rate*dW1
    b1 = b1 - learning_rate*db1
    W2 = W2 - learning_rate*dW2
    b2 = b2 - learning_rate*db2
    W3 = W3 - learning_rate*dW3
    b3 = b3 - learning_rate*db3
    W4 = W4 - learning_rate*dW4
    b4 = b4 - learning_rate*db4
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2,
                  "W3": W3,
                  "b3": b3,
                  "W4": W4,
                  "b4": b4}
    return parameters
def nn_model(X, Y, n_h, n2, n3, num_iterations = 10000, print_cost=False):
    np.random.seed(3)
    n_x = layer_sizes(X, Y)[0]
    n_y = layer_sizes(X, Y)[-1]
    parameters = initialize_parameters(n_x, n_h, n2, n3, n_y)
    # W1 = parameters["W1"]
    # b1 = parameters["b1"]
    # W2 = parameters["W2"]
    # b2 = parameters["b2"]
    # W3 = parameters["W3"]
    # b3 = parameters["b3"]
    for i in range(0, num_iterations):
        A4, cache = forward_propagation(X, parameters)
        cost = compute_cost(A4, Y, parameters)
        if cost < .11: break
        grads = StestpestDescent(parameters, cache, X, Y)
        parameters = update_parameters(parameters, grads, learning_rate = .2 - 10**(-7)*i)
        if print_cost and i % 5000 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
    print ("Cost after iteration %i: %f" %(i, cost))
    return parameters
Input size: 10, hidden layer 1 size: 50, hidden layer 2 size: 15
 hidden layer 3: size 15, output layer size: 1
n_h, n2, n3 = 50, 20, 20
parameters = nn_model(X_train, Y_train, n_h, n2, n3, num_iterations=100000, print_cost=True)
Cost after iteration 0: 0.250000
Cost after iteration 5000: 0.236366
Cost after iteration 10000: 0.236365
Cost after iteration 15000: 0.236364
Cost after iteration 19902: 0.109999
def predict(parameters, X):
    A4, cache = forward_propagation(X, parameters)
    predictions = (A4 > 0.5)
    return np.array(predictions)
train_predictions = predict(parameters, X_train)
test_predictions = predict(parameters, X_test)
train_accuracy = round(accuracy_score(Y_train[:,0],train_predictions) * 100, 2) #float((np.dot(Y_train[:,0],train_predictions) + np.dot(1-Y_train[:,0],1-train_predictions))/float(Y_train.size)*100)
test_accuracy = round(accuracy_score(Y_test[:,0],test_predictions) * 100, 2) #float((np.dot(Y_test[:,0],test_predictions) + np.dot(1-Y_test[:,0],1-test_predictions))/float(Y_test.size)*100)
print ("Accuracy for {}, {} hidden units: Train set {:.2f}%, Test set {:.2f}%".format(n_h, n2, train_accuracy, test_accuracy))
Accuracy for 50, 20 hidden units: Train set 85.48%, Test set 85.20%
# Logistic Regression
from sklearn.linear_model import LogisticRegression
x_train, y_train, x_val, y_val = X_train, Y_train[:,0], X_test,Y_test[:,0]
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_val)
acc_logreg = round(accuracy_score(y_pred, y_val) * 100, 2)
print("Logistic Regression accuracy=",acc_logreg)
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
decisiontree = DecisionTreeClassifier()
decisiontree.fit(x_train, y_train)
y_pred = decisiontree.predict(x_val)
acc_decisiontree = round(accuracy_score(y_pred, y_val) * 100, 2)
print("Decision Tree accuracy=",acc_decisiontree)
# Random Forest
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(random_state = 0)
randomforest.fit(x_train, y_train)
y_pred = randomforest.predict(x_val)
acc_randomforest = round(accuracy_score(y_pred, y_val) * 100, 2)
print("Random Forest accuracy=",acc_randomforest)
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gbk = GradientBoostingClassifier()
gbk.fit(x_train, y_train)
y_pred = gbk.predict(x_val)
acc_gbk = round(accuracy_score(y_pred, y_val) * 100, 2)
print("Gradient Boosting Classifier=",acc_gbk)
# Perceptron
from sklearn.linear_model import Perceptron
perceptron = Perceptron()
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(x_val)
acc_perceptron = round(accuracy_score(y_pred, y_val) * 100, 2)
print("Perceptron accuracy=", acc_perceptron)
# Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()
sgd.fit(x_train, y_train)
y_pred = sgd.predict(x_val)
acc_sgd = round(accuracy_score(y_pred, y_val) * 100, 2)
print("Stochastic Gradient Descent accuracy =", acc_sgd)
# Extra Trees Classifier
from sklearn.ensemble import ExtraTreesClassifier
et = ExtraTreesClassifier()
et.fit(x_train, y_train)
y_pred = et.predict(x_val)
acc_et = round(accuracy_score(y_pred, y_val) * 100, 2)
print("Extra Trees Classifier accuracy =", acc_et)
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
y_pred = gaussian.predict(x_val)
acc_gaussian = round(accuracy_score(y_pred, y_val) * 100, 2)
print("Gaussian Naive Bayes accuracy =", acc_gaussian)
# Linear SVC
from sklearn.svm import LinearSVC
linear_svc = LinearSVC()
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(x_val)
acc_linear_svc = round(accuracy_score(y_pred, y_val) * 100, 2)
print("Linear SVC accuracy =", acc_linear_svc)
# KNN or k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred = knn.predict(x_val)
acc_knn = round(accuracy_score(y_pred, y_val) * 100, 2)
print("KNN accuracy =", acc_knn)
Logistic Regression accuracy= 84.75
Decision Tree accuracy= 81.17
Random Forest accuracy= 83.41
Gradient Boosting Classifier accuracy = 84.3
Perceptron accuracy= 83.86
Stochastic Gradient Descent accuracy = 79.82
Extra Trees Classifier accuracy = 81.17
Gaussian Naive Bayes accuracy = 78.03
Linear SVC accuracy = 84.3
KNN accuracy = 83.86
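Accuracies from a single 25% hold-out split can move by a couple of points between random splits; a quick cross-validated estimate (a sketch, shown for two of the models above) gives a steadier comparison:
# Optional: 5-fold cross-validated accuracy for a steadier comparison than one hold-out split.
for name, clf in [("Logistic Regression", LogisticRegression()),
                  ("Random Forest", RandomForestClassifier(random_state=0))]:
    scores = model_selection.cross_val_score(clf, X, Y[:, 0], cv=5, scoring='accuracy')
    print("{}: {:.2f}% (+/- {:.2f})".format(name, scores.mean() * 100, scores.std() * 100))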
models = pd.DataFrame({
'Model': ['Neural Network', 'KNN', 'Logistic Regression',
'Random Forest', 'Naive Bayes', 'Linear SVC',
'Decision Tree', 'Gradient Boosting Classifier','Extra Trees','Stochastic Gradient Descent','Perceptron'],#,'xgboost'],
'Score': [ test_accuracy, acc_knn, acc_logreg,
acc_randomforest, acc_gaussian,acc_linear_svc, acc_decisiontree,
acc_gbk,acc_et,acc_sgd,acc_perceptron]})#,acc_xgb]})
models.sort_values(by='Score', ascending=False)
|    | Model                        | Score |
|----|------------------------------|-------|
| 0  | Neural Network               | 85.20 |
| 2  | Logistic Regression          | 84.75 |
| 5  | Linear SVC                   | 84.30 |
| 7  | Gradient Boosting Classifier | 84.30 |
| 1  | KNN                          | 83.86 |
| 10 | Perceptron                   | 83.86 |
| 3  | Random Forest                | 83.41 |
| 9  | Stochastic Gradient Descent  | 82.96 |
| 6  | Decision Tree                | 82.51 |
| 8  | Extra Trees                  | 80.27 |
| 4  | Naive Bayes                  | 78.03 |