Introduction to Statistical Learning
Statistical learning provides the theoretical foundation for machine learning, focusing on the mathematical frameworks and principles that enable learning from data.
Key Concepts
Statistical Learning Framework
- Learning from data: estimating an unknown relationship between inputs and outputs from a finite sample
- Model estimation: choosing a function class and fitting its parameters to the training data
- Prediction and inference: using the fitted model both to predict new observations and to interpret the underlying relationship
- Error measurement: quantifying how well the model generalizes beyond the training sample (formalized in the decomposition below)
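These ideas are usually formalized as follows (a brief sketch of the standard textbook setup, not taken from the original): a response Y is modeled as an unknown function of the features plus irreducible noise, and the expected squared prediction error of any estimate splits into bias, variance, and noise terms.

```latex
% Statistical learning setup: noise has mean zero and variance sigma^2
Y = f(X) + \varepsilon, \qquad \mathbb{E}[\varepsilon] = 0,\ \mathrm{Var}(\varepsilon) = \sigma^2
% Expected squared prediction error of an estimate \hat{f} at a point x_0:
\mathbb{E}\big[(Y - \hat{f}(x_0))^2\big]
  = \underbrace{\big(\mathbb{E}[\hat{f}(x_0)] - f(x_0)\big)^2}_{\text{bias}^2}
  + \underbrace{\mathrm{Var}\big(\hat{f}(x_0)\big)}_{\text{variance}}
  + \underbrace{\sigma^2}_{\text{irreducible error}}
```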
Implementation Examples
1. Basic Statistical Learning Setup
```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

class StatisticalLearner:
    """Template interface for a statistical learning model."""

    def __init__(self):
        self.parameters = None

    def fit(self, X, y):
        """Fit model to training data."""
        raise NotImplementedError("subclasses implement fit()")

    def predict(self, X):
        """Make predictions on new data."""
        raise NotImplementedError("subclasses implement predict()")

    def score(self, X, y):
        """Evaluate model performance (MSE and R^2)."""
        predictions = self.predict(X)
        return {
            'mse': mean_squared_error(y, predictions),
            'r2': r2_score(y, predictions)
        }

# Example usage: simulate a linear signal with light Gaussian noise
X = np.random.randn(100, 2)                                   # Features
y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.randn(100) * 0.1    # Target

# Hold out 20% of the data for final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
```
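The base class leaves fit and predict abstract. As a minimal sketch of a concrete learner plugged into this interface (the LeastSquaresLearner name is ours, introduced for illustration), ordinary least squares can be fit with numpy's least-squares solver:

```python
class LeastSquaresLearner(StatisticalLearner):
    """Ordinary least squares, illustrating the interface above."""

    def fit(self, X, y):
        # Prepend an intercept column, then solve min ||A beta - y||^2
        A = np.column_stack([np.ones(len(X)), X])
        self.parameters, *_ = np.linalg.lstsq(A, y, rcond=None)
        return self

    def predict(self, X):
        A = np.column_stack([np.ones(len(X)), X])
        return A @ self.parameters

learner = LeastSquaresLearner().fit(X_train, y_train)
print(learner.score(X_test, y_test))  # r2 should be close to 1 here
```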
2. Bias-Variance Analysis
```python
def bias_variance_decomposition(model, X_train, y_train, X_test, y_test,
                                n_bootstraps=100):
    """Estimate bias and variance using bootstrap resampling."""
    predictions = np.zeros((n_bootstraps, len(X_test)))
    for i in range(n_bootstraps):
        # Draw a bootstrap sample (with replacement) of the training set
        indices = np.random.randint(0, len(X_train), len(X_train))
        X_boot = X_train[indices]
        y_boot = y_train[indices]
        # Refit the model and record its test-set predictions
        model.fit(X_boot, y_boot)
        predictions[i, :] = model.predict(X_test)
    # Average prediction across the bootstrap fits
    mean_pred = np.mean(predictions, axis=0)
    # Caveat: y_test is noisy, so this 'bias' estimate also absorbs the
    # irreducible error; the exact decomposition would need the true f(x)
    bias = np.mean((mean_pred - y_test) ** 2)
    variance = np.mean(np.var(predictions, axis=0))
    return {
        'bias': bias,
        'variance': variance,
        'total_error': bias + variance
    }
```
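A quick way to see the trade-off, assuming the synthetic data from the setup above is still in scope: a flexible model such as an unpruned decision tree will typically show much higher variance than plain linear regression on the same split.

```python
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

for m in [LinearRegression(), DecisionTreeRegressor()]:
    stats = bias_variance_decomposition(m, X_train, y_train, X_test, y_test)
    print(type(m).__name__, stats)
# Expect the tree's 'variance' to dominate its error on this linear signal
```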
Model Evaluation Framework
1. Cross-Validation Implementation
```python
def custom_cross_validation(model, X, y, n_folds=5):
    """Implement k-fold cross-validation from scratch.

    Assumes the data is already shuffled and that model.score returns a
    scalar, as sklearn estimators do. Any remainder samples beyond
    n_folds * fold_size never appear in a validation fold.
    """
    fold_size = len(X) // n_folds
    scores = []
    for i in range(n_folds):
        # The i-th contiguous block is the validation fold
        start_idx = i * fold_size
        end_idx = start_idx + fold_size
        X_val = X[start_idx:end_idx]
        y_val = y[start_idx:end_idx]
        # Everything outside that block is the training set
        X_train = np.concatenate([X[:start_idx], X[end_idx:]])
        y_train = np.concatenate([y[:start_idx], y[end_idx:]])
        # Train on the remaining folds, evaluate on the held-out fold
        model.fit(X_train, y_train)
        scores.append(model.score(X_val, y_val))
    return np.mean(scores), np.std(scores)
```
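As a sanity check (a sketch, assuming the X and y generated earlier), the hand-rolled version should roughly agree with scikit-learn's cross_val_score; small differences are expected because KFold partitions remainder samples differently.

```python
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

mean_r2, std_r2 = custom_cross_validation(Ridge(), X, y, n_folds=5)
print(f"custom:  {mean_r2:.3f} +/- {std_r2:.3f}")
print(f"sklearn: {cross_val_score(Ridge(), X, y, cv=5).mean():.3f}")
```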
2. Model Selection
```python
def model_selection(models, X_train, y_train, X_val, y_val):
    """Compare multiple models and select the best one."""
    results = {}
    for name, model in models.items():
        # Train each candidate on the same training set
        model.fit(X_train, y_train)
        # Record both scores; a large train/validation gap signals overfitting
        train_score = model.score(X_train, y_train)
        val_score = model.score(X_val, y_val)
        results[name] = {
            'train_score': train_score,
            'val_score': val_score,
            'model': model
        }
    # Pick the (name, info) pair with the highest validation score
    best_model = max(results.items(),
                     key=lambda x: x[1]['val_score'])
    return results, best_model

# Example usage. Note that the test split stands in for a validation set
# here; in practice, keep a separate validation set and reserve the test
# set for the final error estimate.
from sklearn.linear_model import LinearRegression, Ridge, Lasso

models = {
    'linear': LinearRegression(),
    'ridge': Ridge(alpha=1.0),
    'lasso': Lasso(alpha=1.0)
}
results, best_model = model_selection(
    models, X_train, y_train, X_test, y_test
)
```
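Inspecting the returned dictionary makes the train/validation gap visible; since best_model is a (name, info) tuple, the fitted estimator itself sits at best_model[1]['model']:

```python
for name, info in results.items():
    print(f"{name:>6}: train={info['train_score']:.3f}  "
          f"val={info['val_score']:.3f}")
print("selected:", best_model[0])
```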
Best Practices
1. Data Preprocessing
```python
def preprocess_data(X, y=None, standardize=True, remove_outliers=True):
    """Preprocess data for statistical learning.

    If y is given, outlier removal filters X and y together so that
    rows stay aligned with their targets.
    """
    if standardize:
        # Standardize features to zero mean and unit variance
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        X = (X - mean) / std
    if remove_outliers:
        # Drop rows with any feature beyond 1.5 * IQR of the quartiles
        Q1 = np.percentile(X, 25, axis=0)
        Q3 = np.percentile(X, 75, axis=0)
        IQR = Q3 - Q1
        mask = ~((X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))).any(axis=1)
        X = X[mask]
        if y is not None:
            y = y[mask]
    return (X, y) if y is not None else X
```
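With the data from the setup section, passing y keeps the targets aligned with whichever rows survive the outlier filter:

```python
X_clean, y_clean = preprocess_data(X_train, y_train)
print(X_train.shape, "->", X_clean.shape)  # a few rows may be dropped
```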
2. Model Diagnostics
```python
def model_diagnostics(model, X, y):
    """Perform basic diagnostics on a fitted model."""
    predictions = model.predict(X)
    residuals = y - predictions
    diagnostics = {
        # A residual mean far from zero suggests systematic bias
        'residuals_mean': np.mean(residuals),
        'residuals_std': np.std(residuals),
        'r2_score': r2_score(y, predictions),
        'mse': mean_squared_error(y, predictions)
    }
    return diagnostics
```
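For example, running the diagnostics on the winner from the model-selection step (assuming the variables from the earlier snippets are in scope):

```python
report = model_diagnostics(best_model[1]['model'], X_test, y_test)
for key, value in report.items():
    print(f"{key}: {value:.4f}")
```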
Applications
- Regression Analysis
  - Linear regression
  - Polynomial regression
  - Regularized regression
- Classification
  - Logistic regression
  - Discriminant analysis
  - Support vector machines
- Model Selection
  - Cross-validation
  - Information criteria (see the sketch after this list)
  - Bootstrap methods
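Information criteria are listed above but not demonstrated earlier. As a brief sketch using the standard Gaussian-model formulas (the information_criteria helper is ours, not from the original), AIC and BIC trade training fit against parameter count:

```python
def information_criteria(y_true, y_pred, n_params):
    """AIC/BIC for a Gaussian model: fit term plus a complexity penalty."""
    n = len(y_true)
    rss = np.sum((y_true - y_pred) ** 2)
    fit_term = n * np.log(rss / n)  # -2 log-likelihood up to a constant
    return {
        'aic': fit_term + 2 * n_params,
        'bic': fit_term + np.log(n) * n_params,
    }

# Computed on the training data; 3 params = 2 coefficients + intercept
ols = LinearRegression().fit(X_train, y_train)
print(information_criteria(y_train, ols.predict(X_train), n_params=3))
```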