Cross-Validation

Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample. This section covers various cross-validation techniques and their implementation.
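
For a quick end-to-end picture before the hand-rolled versions below, scikit-learn's built-in cross_val_score covers the common case in a few lines. A minimal sketch, assuming a Ridge model and synthetic data chosen purely for illustration:

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Synthetic regression data, for illustration only
X, y = make_regression(n_samples=200, n_features=10, noise=0.5, random_state=42)

# 5-fold CV in one call; sklearn reports negated MSE so that higher is better
scores = cross_val_score(Ridge(), X, y, cv=5, scoring='neg_mean_squared_error')
print(f"MSE: {-scores.mean():.3f} +/- {scores.std():.3f}")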

Basic Cross-Validation

1. K-Fold Cross-Validation

import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, accuracy_score

def k_fold_cv(model, X, y, n_splits=5, shuffle=True, random_state=42, task='regression'):
    """Perform k-fold cross-validation.

    task: 'classification' scores folds with accuracy;
    'regression' (default) scores them with mean squared error.
    """
    # Initialize K-fold
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    
    # Store scores
    scores = []
    
    # Perform cross-validation
    for train_idx, val_idx in kf.split(X):
        # Split data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Train model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_val)
        
        # Score the requested task; an explicit flag is more reliable
        # than inferring the task from the number of unique labels
        if task == 'classification':
            score = accuracy_score(y_val, y_pred)
        else:  # regression
            score = mean_squared_error(y_val, y_pred)
        
        scores.append(score)
    
    return np.mean(scores), np.std(scores)
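
A usage sketch for k_fold_cv (the LinearRegression model and synthetic data are illustrative assumptions, not fixed choices):

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X, y = make_regression(n_samples=150, n_features=5, noise=1.0, random_state=0)
mean_mse, std_mse = k_fold_cv(LinearRegression(), X, y, task='regression')
print(f"5-fold MSE: {mean_mse:.3f} +/- {std_mse:.3f}")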

2. Stratified K-Fold

from sklearn.model_selection import StratifiedKFold

def stratified_k_fold_cv(model, X, y, n_splits=5, shuffle=True, random_state=42):
    """Perform stratified k-fold cross-validation.

    Each fold preserves the class proportions of the full dataset,
    which matters most for imbalanced classification problems.
    """
    # Initialize Stratified K-fold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    
    # Store scores
    scores = []
    
    # Perform cross-validation
    for train_idx, val_idx in skf.split(X, y):
        # Split data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Train model
        model.fit(X_train, y_train)
        
        # Make predictions and compute score
        y_pred = model.predict(X_val)
        score = accuracy_score(y_val, y_pred)
        scores.append(score)
    
    return np.mean(scores), np.std(scores)
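
A usage sketch on an imbalanced problem, where stratification pays off (LogisticRegression and the 90/10 class split are illustrative choices):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Imbalanced binary problem: roughly 90% negatives, 10% positives
X, y = make_classification(n_samples=300, n_features=10, weights=[0.9, 0.1], random_state=0)
mean_acc, std_acc = stratified_k_fold_cv(LogisticRegression(max_iter=1000), X, y)
print(f"Stratified 5-fold accuracy: {mean_acc:.3f} +/- {std_acc:.3f}")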

Advanced Cross-Validation

1. Time Series Cross-Validation

from sklearn.model_selection import TimeSeriesSplit

def time_series_cv(model, X, y, n_splits=5):
    """Perform time series cross-validation.

    TimeSeriesSplit uses an expanding training window, so each
    validation fold always comes after its training data in time.
    """
    # Initialize Time Series Split
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    # Store scores
    scores = []
    
    # Perform cross-validation
    for train_idx, val_idx in tscv.split(X):
        # Split data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Train model
        model.fit(X_train, y_train)
        
        # Make predictions and compute score
        y_pred = model.predict(X_val)
        score = mean_squared_error(y_val, y_pred)
        scores.append(score)
    
    return np.mean(scores), np.std(scores)
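
A usage sketch with a toy autoregressive series (the sine-plus-noise data and the 3-lag feature construction are assumptions for illustration):

import numpy as np
from sklearn.linear_model import LinearRegression

# Predict the next value of a noisy sine wave from its previous 3 values
rng = np.random.default_rng(0)
series = np.sin(np.linspace(0, 20, 200)) + rng.normal(0, 0.1, 200)
X = np.column_stack([series[i:i + 197] for i in range(3)])  # 3 lag features
y = series[3:]
mean_mse, std_mse = time_series_cv(LinearRegression(), X, y)
print(f"Time-series CV MSE: {mean_mse:.4f} +/- {std_mse:.4f}")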

2. Nested Cross-Validation

from sklearn.model_selection import GridSearchCV

def nested_cv(model, param_grid, X, y, outer_splits=5, inner_splits=3):
    """Perform nested cross-validation.

    The inner loop tunes hyperparameters with GridSearchCV; the outer
    loop estimates the generalization error of the tuned model.
    """
    # Initialize outer and inner cross-validation
    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=42)
    
    # Store scores
    outer_scores = []
    
    # Outer loop
    for train_idx, test_idx in outer_cv.split(X):
        # Split data
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        # Inner cross-validation for model selection
        inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=42)
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=param_grid,
            cv=inner_cv,
            scoring='neg_mean_squared_error'
        )
        
        # Fit grid search
        grid_search.fit(X_train, y_train)
        
        # Get best model and evaluate on test set
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        score = mean_squared_error(y_test, y_pred)
        outer_scores.append(score)
    
    return np.mean(outer_scores), np.std(outer_scores)
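
A usage sketch (Ridge and its alpha grid are illustrative assumptions; any estimator and param_grid pair compatible with GridSearchCV works):

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=200, n_features=10, noise=0.5, random_state=0)
param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0]}
mean_mse, std_mse = nested_cv(Ridge(), param_grid, X, y)
print(f"Nested CV MSE: {mean_mse:.3f} +/- {std_mse:.3f}")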

3. Leave-One-Out Cross-Validation

from sklearn.model_selection import LeaveOneOut

def leave_one_out_cv(model, X, y):
    """Perform leave-one-out cross-validation.

    Fits the model once per sample, so this is only practical
    for small datasets.
    """
    # Initialize LOOCV
    loo = LeaveOneOut()
    
    # Store scores
    scores = []
    
    # Perform cross-validation
    for train_idx, val_idx in loo.split(X):
        # Split data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Train model
        model.fit(X_train, y_train)
        
        # Make predictions and compute score
        y_pred = model.predict(X_val)
        score = mean_squared_error(y_val, y_pred)
        scores.append(score)
    
    return np.mean(scores), np.std(scores)
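
A usage sketch on a deliberately small dataset (30 samples means 30 model fits; the model and data are illustrative):

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X, y = make_regression(n_samples=30, n_features=3, noise=1.0, random_state=0)
mean_mse, std_mse = leave_one_out_cv(LinearRegression(), X, y)
print(f"LOOCV MSE: {mean_mse:.3f} +/- {std_mse:.3f}")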

Visualization

1. Cross-Validation Results

import matplotlib.pyplot as plt

def plot_cv_results(cv_scores, method_name):
    """Plot cross-validation results"""
    plt.figure(figsize=(10, 6))
    
    # Create box plot
    plt.boxplot(cv_scores)
    
    # Add scatter points for individual scores
    plt.scatter(np.ones_like(cv_scores), cv_scores, alpha=0.5)
    
    plt.title(f'Cross-Validation Scores: {method_name}')
    plt.ylabel('Score')
    plt.xticks([1], [method_name])
    
    return plt.gcf()
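
Note that the CV helpers above return only (mean, std); to feed this plot, collect the per-fold scores instead. A sketch with illustrative numbers:

# Hypothetical per-fold accuracies from a 5-fold run
scores = [0.81, 0.84, 0.79, 0.86, 0.82]
fig = plot_cv_results(scores, '5-Fold CV')
plt.show()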

2. Learning Curves

from sklearn.model_selection import learning_curve

def plot_learning_curves(model, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10)):
    """Plot learning curves from cross-validation"""
    # Calculate learning curves
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y,
        train_sizes=train_sizes,
        cv=cv,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    )
    
    # Negate the neg-MSE scores back to MSE, then aggregate across folds
    train_mean = -np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = -np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    
    # Plot curves
    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, color='C0', label='Training error')
    plt.plot(train_sizes, val_mean, color='C1', label='Validation error')
    
    # Plot standard deviation bands in colors matching the curves
    plt.fill_between(
        train_sizes,
        train_mean - train_std,
        train_mean + train_std,
        color='C0',
        alpha=0.1
    )
    plt.fill_between(
        train_sizes,
        val_mean - val_std,
        val_mean + val_std,
        color='C1',
        alpha=0.1
    )
    
    plt.xlabel('Training Examples')
    plt.ylabel('Mean Squared Error')
    plt.title('Learning Curves')
    plt.legend(loc='best')
    plt.grid(True)
    
    return plt.gcf()
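
A usage sketch (Ridge and the synthetic dataset are illustrative assumptions):

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=500, n_features=10, noise=2.0, random_state=0)
fig = plot_learning_curves(Ridge(), X, y, cv=5)
plt.show()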

Best Practices

1. Method Selection

  • Use stratified k-fold for classification
  • Use time series CV for temporal data
  • Consider LOOCV for very small datasets
  • Use nested CV for model selection

2. Parameter Settings

  • Choose the number of folds to match the data size (5 or 10 is a common default)
  • Consider class balance and feature distribution across folds
  • Balance statistical reliability against computational cost
  • Account for dependencies such as groups or temporal order

3. Validation Strategy

  • Ensure each split is representative of the full dataset
  • Keep training and validation folds independent
  • Respect problem constraints (e.g., never train on future data)
  • Validate the assumptions behind the chosen splitter

4. Common Pitfalls

  • Data leakage in preprocessing (fit scalers and encoders inside each fold; see the pipeline sketch after this list)
  • Inappropriate fold selection for the data at hand
  • Overfitting to the validation set through repeated tuning
  • Ignoring dependencies between samples
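
To avoid the data-leakage pitfall, fit preprocessing inside each fold rather than on the full dataset. A minimal sketch using sklearn's Pipeline (StandardScaler and Ridge are illustrative choices):

from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=200, n_features=10, noise=0.5, random_state=0)

# Wrong: scaling all of X up front leaks validation statistics into training
# X_scaled = StandardScaler().fit_transform(X)

# Right: the pipeline refits the scaler on each fold's training split only
pipe = Pipeline([('scale', StandardScaler()), ('model', Ridge())])
scores = cross_val_score(pipe, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Leak-free MSE: {-scores.mean():.3f} +/- {scores.std():.3f}")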