Cross-Validation
Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample. This section covers various cross-validation techniques and their implementation.
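For a quick baseline before writing the loops by hand, scikit-learn's built-in cross_val_score runs the whole procedure in one call. A minimal sketch, assuming a synthetic classification dataset and a logistic regression model (both illustrative choices, not requirements):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Illustrative data and model; any estimator with fit/predict works
X, y = make_classification(n_samples=200, n_features=10, random_state=42)
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5)
print(f"Per-fold accuracy: {scores}")
print(f"Mean accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")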
Basic Cross-Validation
1. K-Fold Cross-Validation
import numpy as np
from sklearn.base import is_classifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import KFold

def k_fold_cv(model, X, y, n_splits=5, shuffle=True, random_state=42):
    """Perform k-fold cross-validation."""
    # Initialize k-fold (random_state is only valid when shuffling)
    kf = KFold(n_splits=n_splits, shuffle=shuffle,
               random_state=random_state if shuffle else None)

    # Store per-fold scores
    scores = []

    # Perform cross-validation
    for train_idx, val_idx in kf.split(X):
        # Split data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Train model on the training fold
        model.fit(X_train, y_train)

        # Predict on the validation fold
        y_pred = model.predict(X_val)

        # Score: accuracy for classifiers, MSE for regressors
        if is_classifier(model):
            score = accuracy_score(y_val, y_pred)
        else:
            score = mean_squared_error(y_val, y_pred)
        scores.append(score)

    return np.mean(scores), np.std(scores)
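A usage sketch for k_fold_cv, assuming a synthetic regression dataset and a ridge model (both illustrative choices):

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

# Illustrative data and estimator
X, y = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=42)
mean_mse, std_mse = k_fold_cv(Ridge(), X, y, n_splits=5)
print(f"5-fold MSE: {mean_mse:.2f} +/- {std_mse:.2f}")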
2. Stratified K-Fold
from sklearn.model_selection import StratifiedKFold

def stratified_k_fold_cv(model, X, y, n_splits=5, shuffle=True, random_state=42):
    """Perform stratified k-fold cross-validation."""
    # Initialize stratified k-fold (random_state is only valid when shuffling)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle,
                          random_state=random_state if shuffle else None)

    # Store per-fold scores
    scores = []

    # Perform cross-validation; splitting on y preserves class proportions per fold
    for train_idx, val_idx in skf.split(X, y):
        # Split data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Train model
        model.fit(X_train, y_train)

        # Make predictions and compute score
        y_pred = model.predict(X_val)
        score = accuracy_score(y_val, y_pred)
        scores.append(score)

    return np.mean(scores), np.std(scores)
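Stratification matters most when classes are imbalanced. A usage sketch, assuming an imbalanced synthetic dataset and a logistic regression classifier (illustrative choices):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Roughly 90/10 class imbalance; stratification keeps that ratio in every fold
X, y = make_classification(n_samples=500, n_features=10, weights=[0.9, 0.1], random_state=42)
mean_acc, std_acc = stratified_k_fold_cv(LogisticRegression(max_iter=1000), X, y)
print(f"Stratified 5-fold accuracy: {mean_acc:.3f} +/- {std_acc:.3f}")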
Advanced Cross-Validation
1. Time Series Cross-Validation
from sklearn.model_selection import TimeSeriesSplit

def time_series_cv(model, X, y, n_splits=5):
    """Perform time series cross-validation."""
    # Initialize time series split (training folds always precede validation folds)
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Store per-fold scores
    scores = []

    # Perform cross-validation
    for train_idx, val_idx in tscv.split(X):
        # Split data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Train model
        model.fit(X_train, y_train)

        # Make predictions and compute score
        y_pred = model.predict(X_val)
        score = mean_squared_error(y_val, y_pred)
        scores.append(score)

    return np.mean(scores), np.std(scores)
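TimeSeriesSplit always trains on the past and validates on the future, so row order matters. A usage sketch on a synthetic trend-plus-noise series with a single lag feature (the series construction is purely illustrative):

import numpy as np
from sklearn.linear_model import LinearRegression

# Illustrative series: linear trend plus noise, predicted from its previous value
rng = np.random.default_rng(42)
series = np.arange(300, dtype=float) + rng.normal(scale=5.0, size=300)
X = series[:-1].reshape(-1, 1)  # lag-1 feature
y = series[1:]                  # next value
mean_mse, std_mse = time_series_cv(LinearRegression(), X, y, n_splits=5)
print(f"Time-series CV MSE: {mean_mse:.2f} +/- {std_mse:.2f}")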
2. Nested Cross-Validation
from sklearn.model_selection import GridSearchCV

def nested_cv(model, param_grid, X, y, outer_splits=5, inner_splits=3):
    """Perform nested cross-validation."""
    # Outer loop estimates the generalization error of the whole tuning procedure
    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=42)

    # Store outer-fold scores
    outer_scores = []

    # Outer loop
    for train_idx, test_idx in outer_cv.split(X):
        # Split data
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Inner cross-validation for hyperparameter selection
        inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=42)
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=inner_cv,
            scoring='neg_mean_squared_error'
        )

        # Fit grid search on the outer training fold only
        grid_search.fit(X_train, y_train)

        # Evaluate the best model on the held-out outer test fold
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        score = mean_squared_error(y_test, y_pred)
        outer_scores.append(score)

    return np.mean(outer_scores), np.std(outer_scores)
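A usage sketch for nested_cv, assuming a ridge regressor and a small alpha grid (the estimator and grid are illustrative choices):

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

# Illustrative data, estimator, and hyperparameter grid
X, y = make_regression(n_samples=300, n_features=8, noise=15.0, random_state=0)
param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0]}
mean_mse, std_mse = nested_cv(Ridge(), param_grid, X, y, outer_splits=5, inner_splits=3)
print(f"Nested CV MSE: {mean_mse:.2f} +/- {std_mse:.2f}")

The inner folds select the hyperparameters; the outer folds score that whole selection procedure, so the reported error is not biased by the search.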
3. Leave-One-Out Cross-Validation
from sklearn.model_selection import LeaveOneOut

def leave_one_out_cv(model, X, y):
    """Perform leave-one-out cross-validation."""
    # Initialize LOOCV: each sample is the validation set exactly once
    loo = LeaveOneOut()

    # Store per-sample scores
    scores = []

    # Perform cross-validation
    for train_idx, val_idx in loo.split(X):
        # Split data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Train model
        model.fit(X_train, y_train)

        # Make predictions and compute score
        y_pred = model.predict(X_val)
        score = mean_squared_error(y_val, y_pred)
        scores.append(score)

    return np.mean(scores), np.std(scores)
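Because LOOCV fits one model per sample, it is only practical for small datasets. A usage sketch, assuming a small synthetic regression problem and a linear model (illustrative choices):

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# Keep the dataset small: LOOCV trains len(X) separate models
X, y = make_regression(n_samples=50, n_features=3, noise=5.0, random_state=42)
mean_mse, std_mse = leave_one_out_cv(LinearRegression(), X, y)
print(f"LOOCV MSE: {mean_mse:.2f} +/- {std_mse:.2f}")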
Visualization
1. Cross-Validation Results
import matplotlib.pyplot as plt

def plot_cv_results(cv_scores, method_name):
    """Plot cross-validation results."""
    plt.figure(figsize=(10, 6))

    # Box plot of the per-fold scores
    plt.boxplot(cv_scores)

    # Overlay the individual fold scores as scatter points
    plt.scatter(np.ones_like(cv_scores), cv_scores, alpha=0.5)

    plt.title(f'Cross-Validation Scores: {method_name}')
    plt.ylabel('Score')
    plt.xticks([1], [method_name])
    return plt.gcf()
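plot_cv_results expects the list of per-fold scores rather than the (mean, std) summary returned by the helpers above. A usage sketch that collects fold scores with scikit-learn's cross_val_score (the dataset and classifier are illustrative):

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Collect one accuracy score per fold, then plot their spread
X, y = make_classification(n_samples=300, n_features=10, random_state=42)
fold_scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5)
fig = plot_cv_results(fold_scores, 'Stratified 5-Fold')
plt.show()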
2. Learning Curves
from sklearn.model_selection import learning_curve

def plot_learning_curves(model, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10)):
    """Plot learning curves from cross-validation."""
    # Compute training and validation scores at increasing training-set sizes
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y,
        train_sizes=train_sizes,
        cv=cv,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    )

    # Convert negated MSE back to MSE and aggregate across folds
    train_mean = -np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = -np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)

    # Plot curves
    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, label='Training error')
    plt.plot(train_sizes, val_mean, label='Validation error')

    # Shade one standard deviation around each curve
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
    plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1)

    plt.xlabel('Training Examples')
    plt.ylabel('Mean Squared Error')
    plt.title('Learning Curves')
    plt.legend(loc='best')
    plt.grid(True)
    return plt.gcf()
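A usage sketch for plot_learning_curves, assuming a synthetic regression dataset and a ridge model (illustrative choices; the scoring is hard-coded to mean squared error, so the function expects a regressor):

import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

# Illustrative data and estimator
X, y = make_regression(n_samples=500, n_features=10, noise=20.0, random_state=42)
fig = plot_learning_curves(Ridge(), X, y, cv=5)
plt.show()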
Best Practices
1. Method Selection
- Use stratified k-fold for classification so every fold preserves the class proportions
- Use time series CV for temporal data so training folds never see the future
- Consider LOOCV for very small datasets, where every sample matters
- Use nested CV when the same data must drive both hyperparameter tuning and the final performance estimate
2. Parameter Settings
- Choose an appropriate number of folds (5 or 10 is a common default)
- Consider data size and class distribution when setting the number of splits
- Balance statistical reliability against computational cost
- Account for data dependencies such as groups, repeated measures, or time order
3. Validation Strategy
- Ensure splits are representative of the underlying data distribution
- Maintain independence between training and validation folds
- Consider problem constraints, such as what data will actually be available at prediction time
- Validate assumptions (e.g. independent, identically distributed samples) before trusting the CV estimate
4. Common Pitfalls
- Data leakage in preprocessing: fit scalers, encoders, and feature selectors inside each fold, not on the full dataset (see the sketch after this list)
- Inappropriate fold selection, e.g. plain k-fold on imbalanced or time-ordered data
- Overfitting to the validation set by tuning repeatedly against the same folds
- Ignoring data dependencies such as grouped or temporally correlated samples
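The first pitfall, data leakage in preprocessing, usually comes from fitting a scaler or feature selector on the full dataset before splitting. Wrapping preprocessing in a scikit-learn Pipeline keeps every transform fit only on each fold's training portion. A minimal sketch, assuming StandardScaler and LogisticRegression (both illustrative):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Illustrative data
X, y = make_classification(n_samples=300, n_features=20, random_state=42)

# Leaky: the scaler would see the validation folds before splitting
# X_scaled = StandardScaler().fit_transform(X)

# Safe: the scaler is refit on the training portion of every fold
pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
scores = cross_val_score(pipeline, X, y, cv=5)
print(f"Leak-free accuracy: {scores.mean():.3f} +/- {scores.std():.3f}")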