K-Fold Cross Validation#
Measuring accuracy on a single train/test split doesn’t account for the variance in the data and can give misleading results. K-fold cross validation splits the training data into \(k\) equally sized parts (folds); in each of \(k\) rounds the model is trained on \(k-1\) folds and scored on the remaining fold, and the \(k\) accuracies are then averaged.
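As a quick illustration of how the folds are produced, a minimal sketch using scikit-learn’s KFold on a dummy array of ten samples (every sample ends up in exactly one test fold):
import numpy as np
from sklearn.model_selection import KFold

data = np.arange(10)                      # dummy dataset with 10 samples
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_idx, test_idx) in enumerate(kf.split(data)):
    print(fold, train_idx, test_idx)      # indices used for training / testing in this round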
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_csv('Social_Network_Ads.csv')
X = df.iloc[:, 2:4] # Age and EstimatedSalary
y = df.iloc[:, 4]   # Purchased
df.head()
| | User ID | Gender | Age | EstimatedSalary | Purchased |
|---|---|---|---|---|---|
| 0 | 15624510 | Male | 19 | 19000 | 0 |
| 1 | 15810944 | Male | 35 | 20000 | 0 |
| 2 | 15668575 | Female | 26 | 43000 | 0 |
| 3 | 15603246 | Female | 27 | 57000 | 0 |
| 4 | 15804002 | Male | 19 | 76000 | 0 |
# Split in training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# Scale
from sklearn.preprocessing import StandardScaler
X_sca = StandardScaler()
X_train = X_sca.fit_transform(X_train)
X_test = X_sca.transform(X_test)
from sklearn.svm import SVC #support vector classifier
clf = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
# applying k-fold cross validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(clf, X_train, y_train, cv=10)
print(accuracies)
print(accuracies.mean())
print(accuracies.std())
[ 0.90322581 0.90322581 0.77419355 0.87096774 0.77419355 0.86206897
0.82758621 0.68965517 0.79310345 0.89655172]
0.829477196885
0.0671935884472
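For comparison, the single held-out split above gives only one accuracy number, whereas the cross-validated estimate comes with a spread; a small follow-up sketch reusing clf, X_test and y_test from above:
# accuracy from the single train/test split
print(clf.score(X_test, y_test))
# cross-validated accuracy as mean +/- standard deviation
print('%.3f +/- %.3f' % (accuracies.mean(), accuracies.std()))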
Grid Search#
Grid Search is used for hyperparameter optimization: it trains and cross-validates the model for every combination of parameter values in a user-supplied grid and reports the combination that scores best.
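The grid is simply the cartesian product of the listed values; a minimal sketch with scikit-learn’s ParameterGrid that prints the combinations a search over a tiny example grid would evaluate:
from sklearn.model_selection import ParameterGrid

example_grid = {'C': [1, 10], 'kernel': ['linear', 'rbf']}
for params in ParameterGrid(example_grid):
    print(params)   # each combination is fit and cross-validated by GridSearchCV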
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
df = pd.read_csv('Social_Network_Ads.csv')
X = df.iloc[:, 2:4] # Age and EstimatedSalary
y = df.iloc[:, 4]   # Purchased
df.head()
| | User ID | Gender | Age | EstimatedSalary | Purchased |
|---|---|---|---|---|---|
| 0 | 15624510 | Male | 19 | 19000 | 0 |
| 1 | 15810944 | Male | 35 | 20000 | 0 |
| 2 | 15668575 | Female | 26 | 43000 | 0 |
| 3 | 15603246 | Female | 27 | 57000 | 0 |
| 4 | 15804002 | Male | 19 | 76000 | 0 |
# Split in training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
# Scale
from sklearn.preprocessing import StandardScaler
X_sca = StandardScaler()
X_train = X_sca.fit_transform(X_train)
X_test = X_sca.transform(X_test)
from sklearn.svm import SVC #support vector classifier
clf = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
# Grid search
from sklearn.model_selection import GridSearchCV
# insert parameters that you want to optimize
parameters = [
{
'C': [1, 10, 100, 1000],
'kernel': ['linear']
},
{
'C': [1, 10, 100, 1000],
'kernel': ['rbf'],
'gamma': [0.5, 0.1, 0.001, 0.0001],
}
]
grid_search = GridSearchCV(estimator=clf, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)
print(grid_search.best_score_)
print(grid_search.best_params_)
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
max_iter=-1, probability=False, random_state=0, shrinking=True,
tol=0.001, verbose=False)
0.893333333333
{'kernel': 'rbf', 'C': 1, 'gamma': 0.5}
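Since refit=True by default, best_estimator_ has already been retrained on all of X_train, so it can be scored directly on the held-out test set; a short follow-up sketch:
best_clf = grid_search.best_estimator_
print(best_clf.score(X_test, y_test))   # accuracy of the tuned model on unseen data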
Advantages of XGBoost#
- High performance
- No feature scaling is needed, so the features keep their original, interpretable units (see the sketch below)
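Because gradient-boosted trees split on raw feature values, a model trained on unscaled data can be interpreted directly; a minimal, self-contained sketch (with made-up data) that reads feature importances from an XGBClassifier:
import numpy as np
from xgboost import XGBClassifier

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 3) * [1, 1000, 100000]   # three features on wildly different, unscaled ranges
y_demo = (X_demo[:, 0] > 0.5).astype(int)       # label depends only on the first feature

model = XGBClassifier(n_estimators=20).fit(X_demo, y_demo)
print(model.feature_importances_)               # importances refer to the original, unscaled features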
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
df = pd.read_csv('Churn_Modelling.csv')
X = df.iloc[:, 3:13].values
y = df.iloc[:, 13].values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# encode the Geography and Gender string columns as integers
le_geography = LabelEncoder()
X[:, 1] = le_geography.fit_transform(X[:, 1])
le_gender = LabelEncoder()
X[:, 2] = le_gender.fit_transform(X[:, 2])
# one-hot encode Geography (note: categorical_features was removed in newer scikit-learn; use ColumnTransformer there)
ohe = OneHotEncoder(categorical_features=[1])
X = ohe.fit_transform(X).toarray()
# drop one dummy column to avoid the dummy variable trap
X = X[:, 1:]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
from xgboost import XGBClassifier as XGB
clf = XGB()
clf.fit(X_train, y_train)
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
objective='binary:logistic', reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, seed=0, silent=True, subsample=1)
from sklearn.metrics import confusion_matrix
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
[[1521 74]
[ 197 208]]
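The test accuracy can also be read directly off the confusion matrix: correct predictions on the diagonal divided by the total, here (1521 + 208) / 2000 ≈ 0.86. A one-line check with accuracy_score on the same predictions:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))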
from sklearn.model_selection import cross_val_score
results = cross_val_score(clf, X_train, y_train, cv=10)
print(results.mean())
print(results.std())
0.862999445116
0.0106778721717