Linear Models

Linear models form the foundation of statistical learning, providing interpretable and efficient methods for both regression and classification tasks.

Theoretical Foundations

1. Linear Regression

  1. Model Formulation

    • Linear relationship: y = X\beta + \epsilon
    • Where:
      • y is the vector of responses
      • X is the design matrix
      • \beta is the vector of coefficients
      • \epsilon is the error term, \epsilon \sim N(0, \sigma^2 I)
  2. Ordinary Least Squares (OLS)

    • Minimizes sum of squared residuals:
    \min_{\beta} ||y - X\beta||^2
    
    • Closed-form solution:
    \hat{\beta} = (X^TX)^{-1}X^Ty
    
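As a quick sanity check, the closed-form solution can be verified against NumPy's least-squares solver (a minimal sketch with synthetic data; the coefficient values are illustrative):

import numpy as np

rng = np.random.default_rng(0)
X = np.c_[np.ones(50), rng.normal(size=50)]           # design matrix with an intercept column
y = X @ np.array([3.0, 2.0]) + rng.normal(scale=0.5, size=50)

# Normal-equation solution
beta_hat = np.linalg.solve(X.T @ X, X.T @ y)

# Agrees with the dedicated least-squares routine
beta_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)
print(np.allclose(beta_hat, beta_lstsq))              # True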

2. Ridge Regression (L2 Regularization)

  1. Objective Function

    \min_{\beta} ||y - X\beta||^2 + \lambda||\beta||^2
    

    where \lambda is the regularization parameter

  2. Solution

    \hat{\beta}_{ridge} = (X^TX + \lambda I)^{-1}X^Ty
    
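The effect of \lambda can be read off the closed form: as it grows, the coefficients shrink toward zero. A minimal sketch (data and values are illustrative):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
y = X @ np.array([5.0, -2.0, 0.5]) + rng.normal(scale=0.5, size=50)

# The coefficient norm decreases monotonically in lambda
for lam in [0.0, 1.0, 10.0, 100.0]:
    beta = np.linalg.solve(X.T @ X + lam * np.eye(3), X.T @ y)
    print(f"lambda = {lam:6.1f}   ||beta|| = {np.linalg.norm(beta):.3f}")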

3. Lasso Regression (L1 Regularization)

  1. Objective Function

    \min_{\beta} ||y - X\beta||^2 + \lambda||\beta||_1
    
  2. Properties

    • Promotes sparsity (see the soft-thresholding sketch below)
    • Performs implicit feature selection
    • No closed-form solution; solved iteratively, e.g. by coordinate descent
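
The sparsity stems from the soft-thresholding operator that appears in the coordinate-descent updates (implemented later in this document): any coordinate whose partial correlation falls below the threshold is set exactly to zero. A minimal illustration:

import numpy as np

def soft_threshold(x, lam):
    # S(x, lam) = sign(x) * max(|x| - lam, 0)
    return np.sign(x) * np.maximum(np.abs(x) - lam, 0)

print(soft_threshold(np.array([-3.0, -0.5, 0.2, 2.0]), 1.0))
# [-2. -0.  0.  1.]  -- small entries are zeroed exactly, large ones shrink by lam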

Implementation Examples

1. Linear Regression Implementation

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

class LinearRegression:
    def __init__(self, fit_intercept=True):
        self.fit_intercept = fit_intercept
        self.coef_ = None
        self.intercept_ = None
        
    def fit(self, X, y):
        """Fit linear regression using OLS"""
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        
        # Solve the normal equations (more stable than explicitly inverting X^T X)
        beta = np.linalg.solve(X.T @ X, X.T @ y)
        
        if self.fit_intercept:
            self.intercept_ = beta[0]
            self.coef_ = beta[1:]
        else:
            self.coef_ = beta
        
        return self
    
    def predict(self, X):
        """Make predictions"""
        if self.fit_intercept:
            return self.intercept_ + X @ self.coef_
        return X @ self.coef_
    
    def score(self, X, y):
        """Compute R-squared score"""
        y_pred = self.predict(X)
        u = ((y - y_pred) ** 2).sum()
        v = ((y - y.mean()) ** 2).sum()
        return 1 - u/v
    
    def confidence_intervals(self, X, y, alpha=0.05):
        """Compute confidence intervals for coefficients"""
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        
        n = X.shape[0]
        p = X.shape[1]
        
        # Compute residuals
        y_pred = self.predict(X[:, 1:] if self.fit_intercept else X)
        residuals = y - y_pred
        
        # Compute standard error
        mse = np.sum(residuals**2) / (n - p)
        var_beta = mse * np.linalg.inv(X.T @ X)
        se = np.sqrt(np.diag(var_beta))
        
        # Critical value from the t-distribution with n - p degrees of freedom
        t_crit = stats.t.ppf(1 - alpha/2, n - p)
        
        # Compute confidence intervals
        beta = np.r_[self.intercept_, self.coef_] if self.fit_intercept else self.coef_
        ci_lower = beta - t_crit * se
        ci_upper = beta + t_crit * se
        
        return ci_lower, ci_upper

# Example usage with visualization
def visualize_linear_regression():
    # Generate sample data
    np.random.seed(42)
    X = np.linspace(0, 10, 100).reshape(-1, 1)
    y = 2 * X.ravel() + 1 + np.random.normal(0, 1, 100)
    
    # Fit model
    model = LinearRegression()
    model.fit(X, y)
    
    # Plot results
    plt.figure(figsize=(10, 6))
    plt.scatter(X, y, alpha=0.5, label='Data')
    plt.plot(X, model.predict(X), 'r-', label='Fitted line')
    
    # Report coefficient confidence intervals
    ci_lower, ci_upper = model.confidence_intervals(X, y)
    print("95% coefficient CIs:", list(zip(np.round(ci_lower, 3), np.round(ci_upper, 3))))
    
    # Shade an approximate 95% prediction band around the fitted line
    y_pred = model.predict(X)
    resid_std = np.std(y - y_pred)
    plt.fill_between(X.ravel(),
                     y_pred - 1.96 * resid_std,
                     y_pred + 1.96 * resid_std,
                     alpha=0.2, label='~95% prediction band')
    
    plt.xlabel('X')
    plt.ylabel('y')
    plt.title('Linear Regression with an Approximate Prediction Band')
    plt.legend()
    return plt.gcf()

2. Ridge Regression Implementation

class RidgeRegression:
    def __init__(self, alpha=1.0, fit_intercept=True):
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.coef_ = None
        self.intercept_ = None
    
    def fit(self, X, y):
        """Fit ridge regression"""
        if self.fit_intercept:
            X = np.c_[np.ones(X.shape[0]), X]
        
        n_samples, n_features = X.shape
        
        # Add ridge penalty
        I = np.eye(n_features)
        if self.fit_intercept:
            I[0, 0] = 0  # Don't penalize intercept
            
        # Solve the regularized normal equations
        beta = np.linalg.solve(X.T @ X + self.alpha * I, X.T @ y)
        
        if self.fit_intercept:
            self.intercept_ = beta[0]
            self.coef_ = beta[1:]
        else:
            self.coef_ = beta
            
        return self
    
    def predict(self, X):
        """Make predictions"""
        if self.fit_intercept:
            return self.intercept_ + X @ self.coef_
        return X @ self.coef_
    
    def score(self, X, y):
        """Compute R-squared score (used by the validation helpers below)"""
        y_pred = self.predict(X)
        u = ((y - y_pred) ** 2).sum()
        v = ((y - y.mean()) ** 2).sum()
        return 1 - u/v
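
Ridge is most valuable when X^TX is ill-conditioned. A quick sketch with two nearly collinear features (values are illustrative) shows the stabilizing effect:

rng = np.random.default_rng(2)
x1 = rng.normal(size=100)
x2 = x1 + rng.normal(scale=0.01, size=100)    # nearly collinear copy of x1
X = np.c_[x1, x2]
y = x1 + rng.normal(scale=0.1, size=100)

ols = LinearRegression().fit(X, y)
ridge = RidgeRegression(alpha=1.0).fit(X, y)
print("OLS coefficients:  ", ols.coef_)       # typically large and unstable
print("Ridge coefficients:", ridge.coef_)     # shrunk toward an even split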

3. Lasso Implementation using Coordinate Descent

class LassoRegression:
    def __init__(self, alpha=1.0, max_iter=1000, tol=1e-4):
        self.alpha = alpha
        self.max_iter = max_iter
        self.tol = tol
        self.coef_ = None
        
    def _soft_threshold(self, x, lambda_):
        """Soft-thresholding operator"""
        return np.sign(x) * np.maximum(np.abs(x) - lambda_, 0)
    
    def fit(self, X, y):
        """Fit lasso regression using coordinate descent"""
        n_samples, n_features = X.shape
        
        # Initialize coefficients
        self.coef_ = np.zeros(n_features)
        
        # Precompute the Gram matrix; only its diagonal is used in the updates
        XtX = X.T @ X
        
        for _ in range(self.max_iter):
            coef_old = self.coef_.copy()
            
            # Update each coordinate
            for j in range(n_features):
                # Remove current feature's contribution
                r = y - X @ self.coef_ + X[:, j] * self.coef_[j]
                
                # Coordinate update: soft-threshold the partial correlation.
                # The alpha * n_samples scaling matches the objective
                # (1 / (2n)) * ||y - X beta||^2 + alpha * ||beta||_1
                self.coef_[j] = self._soft_threshold(
                    X[:, j] @ r,
                    self.alpha * n_samples
                ) / XtX[j, j]
            
            # Check convergence
            if np.sum(np.abs(self.coef_ - coef_old)) < self.tol:
                break
                
        return self
    
    def predict(self, X):
        """Make predictions"""
        return X @ self.coef_
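
A short demonstration of the induced sparsity (a sketch with synthetic data; only two of ten features are truly active):

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10))
true_beta = np.zeros(10)
true_beta[[0, 3]] = [4.0, -2.0]
y = X @ true_beta + rng.normal(scale=0.5, size=200)

lasso = LassoRegression(alpha=0.1).fit(X, y)
print(np.round(lasso.coef_, 2))    # inactive coordinates are driven exactly to zero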

Model Selection and Validation

1. Cross-Validation

def cross_validate_linear_model(model_class, X, y, k_folds=5, **model_params):
    """Perform k-fold cross-validation"""
    n_samples = len(y)
    fold_size = n_samples // k_folds
    scores = []
    
    for i in range(k_folds):
        # Create train/test split (the last fold absorbs any remainder)
        test_mask = np.zeros(n_samples, dtype=bool)
        stop = (i + 1) * fold_size if i < k_folds - 1 else n_samples
        test_mask[i * fold_size:stop] = True
        
        X_train, X_test = X[~test_mask], X[test_mask]
        y_train, y_test = y[~test_mask], y[test_mask]
        
        # Fit model and compute score
        model = model_class(**model_params)
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))
    
    return np.mean(scores), np.std(scores)
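
For example (a sketch assuming X and y are NumPy arrays; the alpha value is illustrative):

mean_r2, std_r2 = cross_validate_linear_model(RidgeRegression, X, y, k_folds=5, alpha=1.0)
print(f"CV R^2: {mean_r2:.3f} +/- {std_r2:.3f}")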

2. Regularization Path

def compute_regularization_path(X, y, alphas):
    """Compute regularization path for ridge/lasso"""
    coefs = []
    scores = []
    
    for alpha in alphas:
        model = RidgeRegression(alpha=alpha)
        model.fit(X, y)
        coefs.append(model.coef_)
        scores.append(model.score(X, y))
    
    return np.array(coefs), np.array(scores)

def plot_regularization_path(X, y, alphas):
    """Plot regularization path"""
    coefs, scores = compute_regularization_path(X, y, alphas)
    
    plt.figure(figsize=(12, 5))
    
    # Plot coefficients
    plt.subplot(121)
    for i in range(coefs.shape[1]):
        plt.plot(np.log10(alphas), coefs[:, i], label=f'Feature {i+1}')
    plt.xlabel('log(alpha)')
    plt.ylabel('Coefficients')
    plt.title('Regularization Path')
    plt.legend()
    
    # Plot scores
    plt.subplot(122)
    plt.plot(np.log10(alphas), scores)
    plt.xlabel('log(alpha)')
    plt.ylabel('R-squared')
    plt.title('Model Performance')
    
    return plt.gcf()
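
A typical call sweeps alpha on a log scale (a sketch; the features are assumed to be standardized so the coefficients are comparable):

alphas = np.logspace(-3, 3, 50)
fig = plot_regularization_path(X, y, alphas)
plt.show()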

Advanced Topics

1. Elastic Net

class ElasticNet:
    def __init__(self, alpha=1.0, l1_ratio=0.5, max_iter=1000, tol=1e-4):
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.max_iter = max_iter
        self.tol = tol
        self.coef_ = None
        
    def _soft_threshold(self, x, lambda_):
        """Soft-thresholding operator (mirrors LassoRegression)"""
        return np.sign(x) * np.maximum(np.abs(x) - lambda_, 0)
    
    def fit(self, X, y):
        """Fit elastic net using coordinate descent"""
        n_samples, n_features = X.shape
        
        # Initialize coefficients
        self.coef_ = np.zeros(n_features)
        
        # Compute elastic net parameters
        alpha_l1 = self.alpha * self.l1_ratio
        alpha_l2 = self.alpha * (1 - self.l1_ratio)
        
        for _ in range(self.max_iter):
            coef_old = self.coef_.copy()
            
            for j in range(n_features):
                # Remove current feature's contribution
                r = y - X @ self.coef_ + X[:, j] * self.coef_[j]
                
                # Compute coordinate update
                self.coef_[j] = self._soft_threshold(
                    X[:, j] @ r,
                    alpha_l1 * n_samples
                ) / (X[:, j] @ X[:, j] + alpha_l2 * n_samples)
            
            # Check convergence
            if np.sum(np.abs(self.coef_ - coef_old)) < self.tol:
                break
                
        return self
    
    def predict(self, X):
        """Make predictions"""
        return X @ self.coef_

2. Robust Regression

class HuberRegression:
    def __init__(self, epsilon=1.35, max_iter=100, tol=1e-4):
        self.epsilon = epsilon
        self.max_iter = max_iter
        self.tol = tol
        self.coef_ = None
        
    def _huber_loss(self, residuals):
        """Compute Huber loss"""
        mask = np.abs(residuals) <= self.epsilon
        loss = np.zeros_like(residuals)
        loss[mask] = 0.5 * residuals[mask]**2
        loss[~mask] = self.epsilon * (np.abs(residuals[~mask]) - 0.5 * self.epsilon)
        return loss.sum()
    
    def fit(self, X, y):
        """Fit Huber regression using IRLS"""
        n_samples, n_features = X.shape
        
        # Initialize coefficients
        self.coef_ = np.zeros(n_features)
        
        for _ in range(self.max_iter):
            coef_old = self.coef_.copy()
            
            # Compute residuals
            residuals = y - X @ self.coef_
            
            # Compute weights
            weights = np.ones(n_samples)
            mask = np.abs(residuals) > self.epsilon
            weights[mask] = self.epsilon / np.abs(residuals[mask])
            
            # Solve the weighted least-squares problem; weights are applied by
            # broadcasting instead of forming an n x n diagonal matrix
            Xw = X * weights[:, None]
            self.coef_ = np.linalg.solve(Xw.T @ X, Xw.T @ y)
            
            # Check convergence
            if np.sum(np.abs(self.coef_ - coef_old)) < self.tol:
                break
                
        return self
    
    def predict(self, X):
        """Make predictions"""
        return X @ self.coef_
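
A brief comparison on data contaminated with a few gross outliers (a sketch; the contamination is artificial) illustrates the robustness:

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 1))
y = 3.0 * X.ravel() + rng.normal(scale=0.3, size=100)
y[np.argsort(X.ravel())[-5:]] += 25.0        # gross outliers at high-leverage points

ols = LinearRegression(fit_intercept=False).fit(X, y)
huber = HuberRegression().fit(X, y)
print("OLS slope:  ", ols.coef_)             # inflated by the outliers
print("Huber slope:", huber.coef_)           # stays near the true slope of 3.0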

Best Practices

  1. Data Preprocessing

    • Scale features
    • Handle missing values
    • Check for multicollinearity (e.g., with variance inflation factors; see the sketch after this list)
    • Remove outliers if necessary
  2. Model Selection

    • Use cross-validation
    • Consider regularization
    • Check assumptions
    • Compare different models
  3. Diagnostics

    • Check residuals
    • Test for heteroscedasticity
    • Assess influential points
    • Validate linearity assumption
  4. Interpretation

    • Examine coefficients
    • Calculate confidence intervals
    • Consider feature importance
    • Assess model fit
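
As one concrete diagnostic for the multicollinearity check above, variance inflation factors (VIFs) can be computed with plain NumPy (a minimal sketch assuming zero-mean features; values above roughly 5-10 are a common warning sign):

import numpy as np

def variance_inflation_factors(X):
    """VIF for each column: 1 / (1 - R^2) from regressing it on the others"""
    n_features = X.shape[1]
    vifs = np.empty(n_features)
    for j in range(n_features):
        others = np.delete(X, j, axis=1)
        # R^2 of column j regressed on the remaining columns
        beta, *_ = np.linalg.lstsq(others, X[:, j], rcond=None)
        resid = X[:, j] - others @ beta
        r2 = 1 - (resid @ resid) / ((X[:, j] - X[:, j].mean()) ** 2).sum()
        vifs[j] = 1.0 / (1.0 - r2)
    return vifs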