K-Fold Cross Validation#
Measuring accuracy on a single train/test split doesn’t account for the variance in the data and can give misleading results. K-fold cross validation splits the training data into \(k\) equally sized parts (folds); in each of \(k\) rounds the model is trained on \(k-1\) folds and scored on the remaining fold, and the \(k\) accuracies are then averaged.
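As a quick illustration of how the folds are produced, a minimal sketch using scikit-learn’s KFold on a dummy array of ten samples (every sample ends up in exactly one test fold):
import numpy as np
from sklearn.model_selection import KFold

data = np.arange(10)                      # dummy dataset with 10 samples
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_idx, test_idx) in enumerate(kf.split(data)):
    print(fold, train_idx, test_idx)      # indices used for training / testing in this round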
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_csv('Social_Network_Ads.csv')
X = df.iloc[:, 2:4] # Age and EstimatedSalary
y = df.iloc[:, 4]   # Purchased
df.head()
| | User ID | Gender | Age | EstimatedSalary | Purchased |
|---|---|---|---|---|---|
| 0 | 15624510 | Male | 19 | 19000 | 0 |
| 1 | 15810944 | Male | 35 | 20000 | 0 |
| 2 | 15668575 | Female | 26 | 43000 | 0 |
| 3 | 15603246 | Female | 27 | 57000 | 0 |
| 4 | 15804002 | Male | 19 | 76000 | 0 |
# Split in training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# Scale
from sklearn.preprocessing import StandardScaler
X_sca = StandardScaler()
X_train = X_sca.fit_transform(X_train)
X_test = X_sca.transform(X_test)
from sklearn.svm import SVC #support vector classifier
clf = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
# applying k-fold cross validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(clf, X_train, y_train, cv=10)
print(accuracies)
print(accuracies.mean())
print(accuracies.std())
[ 0.90322581 0.90322581 0.77419355 0.87096774 0.77419355 0.86206897
0.82758621 0.68965517 0.79310345 0.89655172]
0.829477196885
0.0671935884472
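For comparison, the single held-out split above gives only one accuracy number, whereas the cross-validated estimate comes with a spread; a small follow-up sketch reusing clf, X_test and y_test from above:
# accuracy from the single train/test split
print(clf.score(X_test, y_test))
# cross-validated accuracy as mean +/- standard deviation
print('%.3f +/- %.3f' % (accuracies.mean(), accuracies.std()))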
Grid Search#
Grid Search is used for hyperparameter optimization: it trains and cross-validates the model for every combination of parameter values in a user-supplied grid and reports the combination that scores best.
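The grid is simply the cartesian product of the listed values; a minimal sketch with scikit-learn’s ParameterGrid that prints the combinations a search over a tiny example grid would evaluate:
from sklearn.model_selection import ParameterGrid

example_grid = {'C': [1, 10], 'kernel': ['linear', 'rbf']}
for params in ParameterGrid(example_grid):
    print(params)   # each combination is fit and cross-validated by GridSearchCV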
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_csv('Social_Network_Ads.csv')
X = df.iloc[:, 2:4] # Age and EstimatedSalary
y = df.iloc[:, 4]   # Purchased
df.head()
| | User ID | Gender | Age | EstimatedSalary | Purchased |
|---|---|---|---|---|---|
| 0 | 15624510 | Male | 19 | 19000 | 0 |
| 1 | 15810944 | Male | 35 | 20000 | 0 |
| 2 | 15668575 | Female | 26 | 43000 | 0 |
| 3 | 15603246 | Female | 27 | 57000 | 0 |
| 4 | 15804002 | Male | 19 | 76000 | 0 |
# Split in training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# Scale
from sklearn.preprocessing import StandardScaler
X_sca = StandardScaler()
X_train = X_sca.fit_transform(X_train)
X_test = X_sca.transform(X_test)
from sklearn.svm import SVC #support vector classifier
clf = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
# Grid search
from sklearn.model_selection import GridSearchCV
# insert parameters that you want to optimize
parameters = [
{
'C': [1, 10, 100, 1000],
'kernel': ['linear']
},
{
'C': [1, 10, 100, 1000],
'kernel': ['rbf'],
'gamma': [0.5, 0.1, 0.001, 0.0001],
}
]
grid_search = GridSearchCV(estimator=clf, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
max_iter=-1, probability=False, random_state=0, shrinking=True,
tol=0.001, verbose=False)
0.893333333333
{'kernel': 'rbf', 'C': 1, 'gamma': 0.5}
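Since refit=True by default, best_estimator_ has already been retrained on all of X_train, so it can be scored directly on the held-out test set; a short follow-up sketch:
best_clf = grid_search.best_estimator_
print(best_clf.score(X_test, y_test))   # accuracy of the tuned model on unseen data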
Advantages of XGBoost#
- High performance
- No feature scaling is needed, so the features keep their original, interpretable units (see the sketch below)
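Because gradient-boosted trees split on raw feature values, a model trained on unscaled data can be interpreted directly; a minimal, self-contained sketch (with made-up data) that reads feature importances from an XGBClassifier:
import numpy as np
from xgboost import XGBClassifier

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 3) * [1, 1000, 100000]   # three features on wildly different, unscaled ranges
y_demo = (X_demo[:, 0] > 0.5).astype(int)       # label depends only on the first feature

model = XGBClassifier(n_estimators=20).fit(X_demo, y_demo)
print(model.feature_importances_)               # importances refer to the original, unscaled features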
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('Churn_Modelling.csv')
X = df.iloc[:, 3:13].values
y = df.iloc[:, 13].values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# encode the Geography and Gender string columns as integers
le_geography = LabelEncoder()
X[:, 1] = le_geography.fit_transform(X[:, 1])
le_gender = LabelEncoder()
X[:, 2] = le_gender.fit_transform(X[:, 2])
# one-hot encode Geography (note: categorical_features was removed in newer scikit-learn; use ColumnTransformer there)
ohe = OneHotEncoder(categorical_features=[1])
X = ohe.fit_transform(X).toarray()
# drop one dummy column to avoid the dummy variable trap
X = X[:, 1:]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
from xgboost import XGBClassifier as XGB
clf = XGB()
clf.fit(X_train, y_train)
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
objective='binary:logistic', reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, seed=0, silent=True, subsample=1)
from sklearn.metrics import confusion_matrix
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
[[1521 74]
[ 197 208]]
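The test accuracy can also be read directly off the confusion matrix: correct predictions on the diagonal divided by the total, here (1521 + 208) / 2000 ≈ 0.86. A one-line check with accuracy_score on the same predictions:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))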
from sklearn.model_selection import cross_val_score
results = cross_val_score(clf, X_train, y_train, cv=10)
print(results.mean())
print(results.std())
0.862999445116
0.0106778721717