Model Selection

Model selection is the process of choosing the best model from a set of candidates, judged by estimated generalization performance rather than training error. This section covers techniques for model selection and hyperparameter tuning, from exhaustive grid search to randomized and Bayesian search, along with model comparison and learning-curve analysis.

import numpy as np
from sklearn.base import is_classifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, accuracy_score

def grid_search_cv(model, param_grid, X, y, cv=5, scoring=None):
    """Perform grid search with cross-validation"""
    # Choose a default metric from the estimator type (counting unique target
    # values would mislabel multiclass problems as regression)
    if scoring is None:
        if is_classifier(model):  # Classification
            scoring = 'accuracy'
        else:  # Regression
            scoring = 'neg_mean_squared_error'
    
    # Initialize grid search
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=1
    )
    
    # Fit grid search
    grid_search.fit(X, y)
    
    # Get results
    results = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'cv_results': grid_search.cv_results_
    }
    
    return results
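
For example, the helper above can be called as follows; the classifier, synthetic dataset, and parameter grid are illustrative choices, not prescribed by the text:

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Illustrative data and estimator; any estimator/grid pair works the same way
X, y = make_classification(n_samples=500, n_features=20, random_state=42)
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
}

results = grid_search_cv(rf, param_grid, X, y, cv=5)
print(results['best_params'], results['best_score'])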

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

def random_search_cv(model, param_distributions, X, y, n_iter=100, cv=5, scoring=None):
    """Perform randomized search with cross-validation"""
    # Choose a default metric from the estimator type, as above
    if scoring is None:
        if is_classifier(model):
            scoring = 'accuracy'
        else:
            scoring = 'neg_mean_squared_error'
    
    # Initialize random search
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=1
    )
    
    # Fit random search
    random_search.fit(X, y)
    
    # Get results
    results = {
        'best_params': random_search.best_params_,
        'best_score': random_search.best_score_,
        'cv_results': random_search.cv_results_
    }
    
    return results
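
The scipy distributions imported above (uniform, randint) define the search space. A minimal sketch, assuming a gradient boosting regressor and illustrative ranges:

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=42)
gbr = GradientBoostingRegressor(random_state=42)
param_distributions = {
    'n_estimators': randint(50, 500),     # discrete uniform over [50, 500)
    'learning_rate': uniform(0.01, 0.3),  # continuous uniform over [0.01, 0.31)
    'max_depth': randint(2, 8),
}

results = random_search_cv(gbr, param_distributions, X, y, n_iter=50, cv=5)
print(results['best_params'], results['best_score'])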

Model Comparison

1. Cross-Validation Comparison

def compare_models(models, X, y, cv=5):
    """Compare multiple models using cross-validation"""
    from sklearn.model_selection import cross_val_score
    
    # Store results
    results = {}
    
    # Evaluate each model with a metric matched to its estimator type
    for name, model in models.items():
        # Perform cross-validation
        scores = cross_val_score(
            model, X, y,
            cv=cv,
            scoring='accuracy' if is_classifier(model) else 'neg_mean_squared_error'
        )
        
        # Store results
        results[name] = {
            'mean_score': np.mean(scores),
            'std_score': np.std(scores),
            'scores': scores
        }
    
    return results
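
A usage sketch with two illustrative classifiers on a synthetic dataset:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=20, random_state=42)
models = {
    'logistic_regression': LogisticRegression(max_iter=1000),
    'random_forest': RandomForestClassifier(random_state=42),
}

comparison = compare_models(models, X, y, cv=5)
for name, res in comparison.items():
    print(f"{name}: {res['mean_score']:.3f} +/- {res['std_score']:.3f}")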

2. Statistical Comparison

from scipy import stats

def statistical_comparison(model_results):
    """Perform statistical comparison of models"""
    # Store model names and scores
    models = list(model_results.keys())
    scores = [model_results[model]['scores'] for model in models]
    
    # Friedman test (requires at least three models scored on the same CV folds)
    statistic, p_value = stats.friedmanchisquare(*scores)
    
    # Pairwise paired t-tests; with many pairs, consider a multiple-comparison
    # correction such as Bonferroni
    pairwise_tests = {}
    for i in range(len(models)):
        for j in range(i + 1, len(models)):
            t_stat, p_val = stats.ttest_rel(scores[i], scores[j])
            pair = f"{models[i]} vs {models[j]}"
            pairwise_tests[pair] = {
                't_statistic': t_stat,
                'p_value': p_val
            }
    
    return {
        'friedman_test': {
            'statistic': statistic,
            'p_value': p_value
        },
        'pairwise_tests': pairwise_tests
    }
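
Continuing the sketch above, with a third (illustrative) model added because the Friedman test needs at least three groups:

from sklearn.svm import SVC

models['svm'] = SVC()
comparison = compare_models(models, X, y, cv=5)
report = statistical_comparison(comparison)

print(report['friedman_test'])
for pair, test in report['pairwise_tests'].items():
    print(pair, round(test['p_value'], 4))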

Hyperparameter Tuning

1. Bayesian Optimization

from sklearn.model_selection import cross_val_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern

class BayesianOptimizer:
    def __init__(self, model, param_space, n_iterations=50):
        self.model = model
        self.param_space = param_space
        self.n_iterations = n_iterations
        self.X_samples = []
        self.y_samples = []
        
    def objective(self, params):
        """Objective function to minimize"""
        # Set model parameters
        self.model.set_params(**params)
        
        # Cross-validate with a regression metric (this sketch assumes a regressor)
        scores = cross_val_score(
            self.model, self.X, self.y,
            cv=5,
            scoring='neg_mean_squared_error'
        )
        
        return -np.mean(scores)

    def _sample_next_point(self, gp, n_candidates=1000, kappa=2.0):
        """Propose the next parameter set by minimizing a lower-confidence-bound
        acquisition over randomly sampled candidates (an assumed, simple
        acquisition strategy, since none is specified above)"""
        # Draw candidate parameter sets from the search space
        candidates = [
            {param: space.rvs() for param, space in self.param_space.items()}
            for _ in range(n_candidates)
        ]
        X_cand = np.array([list(c.values()) for c in candidates])
        
        # Predict mean and uncertainty of the objective at each candidate and
        # pick the lowest lower confidence bound (the objective is minimized)
        mu, sigma = gp.predict(X_cand, return_std=True)
        best = int(np.argmin(mu - kappa * sigma))
        return candidates[best]
    
    def optimize(self, X, y):
        """Run Bayesian optimization"""
        self.X = X
        self.y = y
        
        # Initialize Gaussian Process
        gp = GaussianProcessRegressor(
            kernel=Matern(nu=2.5),
            n_restarts_optimizer=10,
            random_state=42
        )
        
        for i in range(self.n_iterations):
            # Fit GP to observed data
            if self.X_samples:
                gp.fit(self.X_samples, self.y_samples)
                
                # Sample next point
                next_params = self._sample_next_point(gp)
            else:
                # Random initialization
                next_params = {
                    param: space.rvs()
                    for param, space in self.param_space.items()
                }
            
            # Evaluate objective
            score = self.objective(next_params)
            
            # Update samples
            self.X_samples.append(list(next_params.values()))
            self.y_samples.append(score)
        
        # Find best parameters
        best_idx = np.argmin(self.y_samples)
        best_params = dict(zip(
            self.param_space.keys(),
            self.X_samples[best_idx]
        ))
        
        return best_params, -self.y_samples[best_idx]
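
A usage sketch for the optimizer; the random forest regressor and the scipy distributions are assumptions (the class only requires that each dimension of param_space expose an .rvs() sampler):

from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=300, n_features=10, noise=0.1, random_state=42)
param_space = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(2, 10),
}

optimizer = BayesianOptimizer(RandomForestRegressor(random_state=42), param_space, n_iterations=20)
best_params, best_score = optimizer.optimize(X, y)
print(best_params, best_score)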

2. Learning Curves Analysis

from sklearn.model_selection import learning_curve

def analyze_learning_curves(model, param_name, param_range, X, y, cv=5):
    """Analyze learning curves for different values of one hyperparameter (MSE scoring, i.e. regression)"""
    # Store results
    results = {}
    
    for param_value in param_range:
        # Set parameter
        model.set_params(**{param_name: param_value})
        
        # Calculate learning curves
        train_sizes, train_scores, val_scores = learning_curve(
            model, X, y,
            train_sizes=np.linspace(0.1, 1.0, 10),
            cv=cv,
            n_jobs=-1,
            scoring='neg_mean_squared_error'
        )
        
        # Store results
        results[param_value] = {
            'train_sizes': train_sizes,
            'train_scores': -train_scores.mean(axis=1),
            'val_scores': -val_scores.mean(axis=1)
        }
    
    return results
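
An illustrative call, varying a random forest's max_depth on synthetic regression data:

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=42)
rf = RandomForestRegressor(random_state=42)

curves = analyze_learning_curves(rf, 'max_depth', [2, 5, 10], X, y, cv=5)
for depth, res in curves.items():
    print(f"max_depth={depth}: final validation MSE {res['val_scores'][-1]:.3f}")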

Best Practices

1. Search Strategy

  • Use grid search for small, discrete parameter spaces
  • Use random search for large or continuous spaces
  • Consider Bayesian optimization when evaluations are expensive
  • Balance exploration and exploitation

2. Cross-Validation

  • Use a validation strategy appropriate to the data (see the sketch after this list)
  • Consider data dependencies such as temporal order or grouped samples
  • Account for computational cost
  • Validate final model independently
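
A minimal sketch of matching the splitter to the data structure; which splitter applies to your data is the assumption to verify:

from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit, GroupKFold

# Imbalanced classification: preserve class proportions in each fold
cv_classification = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Time series: never let training folds see the future
cv_time_series = TimeSeriesSplit(n_splits=5)

# Grouped samples (e.g. repeated measurements per subject): keep groups intact
cv_grouped = GroupKFold(n_splits=5)

# StratifiedKFold or TimeSeriesSplit can be passed directly as the cv argument,
# e.g. grid_search_cv(model, param_grid, X, y, cv=cv_time_series);
# GroupKFold additionally needs a groups array passed to fit/split.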

3. Model Selection

  • Consider model complexity
  • Account for uncertainty
  • Use statistical tests
  • Document selection process

4. Common Pitfalls

  • Overfitting to the validation set (nested cross-validation, sketched below, guards against this)
  • Insufficient search space
  • Ignoring computational cost
  • Not considering model stability
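
For the first pitfall, a minimal nested cross-validation sketch (the estimator and grid are illustrative): the inner loop tunes hyperparameters, while the outer loop estimates the performance of the entire tuning procedure on data it never saw.

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=20, random_state=42)

inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Inner loop: hyperparameter search; outer loop: unbiased performance estimate
search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid={'max_depth': [None, 5, 10]},
    cv=inner_cv,
)
nested_scores = cross_val_score(search, X, y, cv=outer_cv)
print(f"Nested CV accuracy: {nested_scores.mean():.3f} +/- {nested_scores.std():.3f}")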