Introduction to Statistical Learning

Statistical learning provides the theoretical foundation for machine learning, focusing on the mathematical frameworks and principles that enable learning from data.

Key Concepts

Statistical Learning Framework

  • Learning from data: estimating an unknown relationship between inputs X and outputs y from observed samples (formalized in the sketch below)
  • Model estimation: choosing a function from a hypothesis class that best fits the training data
  • Prediction and inference: using the fitted model to predict new outcomes and to interpret how inputs relate to the response
  • Error measurement: quantifying how far predictions fall from the truth on both training and unseen data
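
As a minimal formal sketch of this framework: given training pairs (x_i, y_i) drawn i.i.d. from an unknown distribution P and a loss L, empirical risk minimization picks the hypothesis that minimizes average loss on the sample as a stand-in for the true (population) risk:

\hat{f} = \arg\min_{f \in \mathcal{F}} \frac{1}{n} \sum_{i=1}^{n} L\big(y_i, f(x_i)\big),
\qquad
R(f) = \mathbb{E}_{(x, y) \sim P}\big[L\big(y, f(x)\big)\big]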

Implementation Examples

1. Basic Statistical Learning Setup

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

class StatisticalLearner:
    """Minimal supervised learner: ordinary least squares with an intercept."""

    def __init__(self):
        self.parameters = None  # learned coefficients, intercept first

    def fit(self, X, y):
        """Fit model to training data via least squares"""
        # Prepend a column of ones so the first parameter is the intercept
        X_design = np.column_stack([np.ones(len(X)), X])
        self.parameters, *_ = np.linalg.lstsq(X_design, y, rcond=None)
        return self

    def predict(self, X):
        """Make predictions on new data"""
        X_design = np.column_stack([np.ones(len(X)), X])
        return X_design @ self.parameters

    def score(self, X, y):
        """Evaluate model performance"""
        predictions = self.predict(X)
        return {
            'mse': mean_squared_error(y, predictions),
            'r2': r2_score(y, predictions)
        }

# Example usage
X = np.random.randn(100, 2)  # Features
y = 2*X[:, 0] + 3*X[:, 1] + np.random.randn(100)*0.1  # Target

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
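
Fitting the learner on the training split and scoring it on the held-out split ties the pieces together (exact numbers vary with the random data; with noise of 0.1, R^2 should land close to 1):

learner = StatisticalLearner()
learner.fit(X_train, y_train)
print(learner.score(X_test, y_test))  # dict with 'mse' and 'r2'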

2. Bias-Variance Analysis
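
For squared error, the expected test error at a point x decomposes into squared bias, variance, and irreducible noise; the bootstrap routine below estimates the first two terms:

\mathbb{E}\big[(y - \hat{f}(x))^2\big]
= \underbrace{\big(\mathbb{E}[\hat{f}(x)] - f(x)\big)^2}_{\text{bias}^2}
+ \underbrace{\mathbb{E}\big[(\hat{f}(x) - \mathbb{E}[\hat{f}(x)])^2\big]}_{\text{variance}}
+ \underbrace{\sigma^2}_{\text{irreducible noise}}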

def bias_variance_decomposition(model, X_train, y_train, X_test, y_test, n_bootstraps=100):
    """Estimate squared bias and variance using bootstrap resampling"""
    predictions = np.zeros((n_bootstraps, len(X_test)))
    
    for i in range(n_bootstraps):
        # Bootstrap sample: draw len(X_train) indices with replacement
        indices = np.random.randint(0, len(X_train), len(X_train))
        X_boot = X_train[indices]
        y_boot = y_train[indices]
        
        # Train model on the resample and predict on the fixed test set
        model.fit(X_boot, y_boot)
        predictions[i, :] = model.predict(X_test)
    
    # Average prediction per test point across bootstrap models
    mean_pred = np.mean(predictions, axis=0)
    # Note: this compares against the observed y_test rather than the
    # noiseless truth, so the "bias" term also absorbs the noise sigma^2
    bias_sq = np.mean((mean_pred - y_test) ** 2)
    variance = np.mean(np.var(predictions, axis=0))
    
    return {
        'bias_squared': bias_sq,
        'variance': variance,
        'total_error': bias_sq + variance
    }
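
One possible call, using scikit-learn's Ridge regressor (also used in the model-selection example later) and the train/test split from the setup example:

from sklearn.linear_model import Ridge

decomposition = bias_variance_decomposition(
    Ridge(alpha=1.0), X_train, y_train, X_test, y_test
)
print(decomposition)  # keys: 'bias_squared', 'variance', 'total_error'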

Model Evaluation Framework

1. Cross-Validation Implementation

def custom_cross_validation(model, X, y, n_folds=5):
    """Implement k-fold cross-validation from scratch.

    Assumes the rows of X are in random order (shuffle beforehand if not)
    and that model.score returns a single number, as sklearn estimators do.
    """
    fold_size = len(X) // n_folds
    scores = []
    
    for i in range(n_folds):
        # Create validation fold; the last fold absorbs any remainder rows
        start_idx = i * fold_size
        end_idx = len(X) if i == n_folds - 1 else start_idx + fold_size
        X_val = X[start_idx:end_idx]
        y_val = y[start_idx:end_idx]
        
        # Training set is everything outside the validation fold
        X_train = np.concatenate([X[:start_idx], X[end_idx:]])
        y_train = np.concatenate([y[:start_idx], y[end_idx:]])
        
        # Train and evaluate
        model.fit(X_train, y_train)
        scores.append(model.score(X_val, y_val))
    
    return np.mean(scores), np.std(scores)
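
For example, with scikit-learn's Ridge and the synthetic X, y generated in the setup example (the rows there are already in random order):

from sklearn.linear_model import Ridge

mean_r2, std_r2 = custom_cross_validation(Ridge(alpha=1.0), X, y, n_folds=5)
print(f"CV R^2: {mean_r2:.3f} +/- {std_r2:.3f}")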

2. Model Selection

def model_selection(models, X_train, y_train, X_val, y_val):
    """Compare multiple models and select the best one"""
    results = {}
    
    for name, model in models.items():
        # Train model
        model.fit(X_train, y_train)
        
        # Evaluate performance
        train_score = model.score(X_train, y_train)
        val_score = model.score(X_val, y_val)
        
        results[name] = {
            'train_score': train_score,
            'val_score': val_score,
            'model': model
        }
    
    # Select by validation score (sklearn's default score, R^2, is higher-is-better)
    best_model = max(results.items(), 
                    key=lambda x: x[1]['val_score'])
    
    return results, best_model  # best_model is a (name, results_dict) pair

# Example usage
from sklearn.linear_model import LinearRegression, Ridge, Lasso

models = {
    'linear': LinearRegression(),
    'ridge': Ridge(alpha=1.0),
    'lasso': Lasso(alpha=1.0)
}

# Here the held-out test split doubles as the validation set for brevity;
# in practice, keep a separate validation split or use cross-validation
results, best_model = model_selection(
    models, X_train, y_train, X_test, y_test
)
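
Comparing train and validation scores per model makes overfitting easy to spot (a large train/validation gap):

for name, res in results.items():
    print(f"{name}: train={res['train_score']:.3f}, val={res['val_score']:.3f}")
print("Selected:", best_model[0])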

Best Practices

1. Data Preprocessing

def preprocess_data(X, y=None, standardize=True, remove_outliers=True):
    """Preprocess data for statistical learning.

    Note: to avoid leakage, compute the standardization statistics on the
    training split only and reuse them for validation/test data.
    """
    if standardize:
        # Standardize features (guard against zero-variance columns)
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        std[std == 0] = 1.0
        X = (X - mean) / std
    
    if remove_outliers:
        # Remove outlier rows using the IQR rule, applied per feature
        Q1 = np.percentile(X, 25, axis=0)
        Q3 = np.percentile(X, 75, axis=0)
        IQR = Q3 - Q1
        mask = ~((X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))).any(axis=1)
        X = X[mask]
        # Keep the targets aligned with the surviving rows
        if y is not None:
            y = y[mask]
    
    return (X, y) if y is not None else X
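
Called on the synthetic data from the setup example, passing y so rows stay aligned after outlier removal:

X_clean, y_clean = preprocess_data(X, y)
print(X_clean.shape, y_clean.shape)  # same number of rows in both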

2. Model Diagnostics

def model_diagnostics(model, X, y):
    """Perform basic model diagnostics"""
    predictions = model.predict(X)
    residuals = y - predictions
    
    diagnostics = {
        'residuals_mean': np.mean(residuals),
        'residuals_std': np.std(residuals),
        'r2_score': r2_score(y, predictions),
        'mse': mean_squared_error(y, predictions)
    }
    
    return diagnostics
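
Run on a fitted model, e.g. the model selected above; for a well-specified linear fit the residuals should average near zero:

diag = model_diagnostics(best_model[1]['model'], X_test, y_test)
print(diag)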

Applications

  1. Regression Analysis
     • Linear regression
     • Polynomial regression
     • Regularized regression
  2. Classification
     • Logistic regression
     • Discriminant analysis
     • Support vector machines
  3. Model Selection
     • Cross-validation
     • Information criteria
     • Bootstrap methods