K-Fold Cross Validation

Evaluating accuracy on a single train/test split doesn’t account for the variance in the data and can give misleading results. K-Fold cross validation splits the data set into \(k\) parts, trains on \(k-1\) of them, and tests on the remaining part; after \(k\) iterations every part has served as the test set exactly once, and the accuracies are averaged.
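Under the hood this is just a loop over the folds. A minimal sketch of the procedure using sklearn's KFold splitter (the linear SVC and the fold count mirror the example below; the kfold_accuracy helper is only illustrative and assumes X and y are NumPy arrays):

import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import SVC

def kfold_accuracy(X, y, k=10):
    # split the data into k parts; each part serves as the test set exactly once
    kf = KFold(n_splits=k, shuffle=True, random_state=0)
    scores = []
    for train_idx, test_idx in kf.split(X):
        clf = SVC(kernel='linear').fit(X[train_idx], y[train_idx])
        scores.append(clf.score(X[test_idx], y[test_idx]))
    # report the average accuracy and its spread across folds
    return np.mean(scores), np.std(scores)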

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv('Social_Network_Ads.csv')
X = df.iloc[:, 2:4]   # Age and EstimatedSalary columns
y = df.iloc[:, 4]

df.head()
    User ID  Gender  Age  EstimatedSalary  Purchased
0  15624510    Male   19            19000          0
1  15810944    Male   35            20000          0
2  15668575  Female   26            43000          0
3  15603246  Female   27            57000          0
4  15804002    Male   19            76000          0
# Split in training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# Scale
from sklearn.preprocessing import StandardScaler
X_sca = StandardScaler()
X_train = X_sca.fit_transform(X_train)
X_test = X_sca.transform(X_test)
from sklearn.svm import SVC  # support vector classifier
clf = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
# applying k-fold cross validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(clf, X_train, y_train, cv=10)
print(accuracies)
print(accuracies.mean())
print(accuracies.std())
[ 0.90322581  0.90322581  0.77419355  0.87096774  0.77419355  0.86206897
  0.82758621  0.68965517  0.79310345  0.89655172]
0.829477196885
0.0671935884472
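An average accuracy of about 0.83 with a standard deviation of about 0.07 across the ten folds is a more reliable performance estimate than any single train/test split.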

Advantages of XGBoost

  1. High performance

  2. No feature scaling is needed, so the features stay in their original units and remain interpretable (see the sketch after this list)
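As a quick illustration of the second point, standardising the inputs does not change what a tree-based model like XGBoost learns, because splits depend only on the ordering of feature values, not their scale. A minimal sketch on synthetic data (X_toy, y_toy and the hyperparameters here are made up for illustration):

import numpy as np
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# toy data with features on very different scales
rng = np.random.RandomState(0)
X_toy = rng.rand(200, 2) * np.array([100.0, 1.0])
y_toy = (X_toy[:, 0] / 100 + X_toy[:, 1] > 1).astype(int)

pred_raw = XGBClassifier(n_estimators=50, random_state=0).fit(X_toy, y_toy).predict(X_toy)
X_std = StandardScaler().fit_transform(X_toy)
pred_std = XGBClassifier(n_estimators=50, random_state=0).fit(X_std, y_toy).predict(X_std)
print((pred_raw == pred_std).all())  # typically True: scaling leaves the tree partitions unchanged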

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('Churn_Modelling.csv')
X = df.iloc[:, 3:13].values
y = df.iloc[:, 13].values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# encode the binary Gender column as 0/1
le_gender = LabelEncoder()
X[:, 2] = le_gender.fit_transform(X[:, 2])
# one-hot encode the Geography column (index 1); remaining columns pass through unchanged
ct = ColumnTransformer([('geography', OneHotEncoder(), [1])], remainder='passthrough')
X = ct.fit_transform(X).astype(float)
# avoid the dummy variable trap by dropping one of the Geography dummy columns
X = X[:, 1:]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
from xgboost import XGBClassifier as XGB
clf = XGB()
clf.fit(X_train, y_train)
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
[[1521   74]
 [ 197  208]]
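For reference, the test-set accuracy can be read directly off the confusion matrix, since the diagonal holds the correctly classified samples:

print((1521 + 208) / (1521 + 74 + 197 + 208))  # 0.8645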
from sklearn.model_selection import cross_val_score
results = cross_val_score(clf, X_train, y_train, cv=10)
print(results.mean())
print(results.std())
0.862999445116
0.0106778721717