Generalized Linear Models

Generalized Linear Models (GLMs) extend linear regression to response variables with non-normal distributions, relating the mean of the response to a linear combination of the predictors through a link function.

Theoretical Foundations

1. Components of GLMs

  1. Random Component

    • Specifies the probability distribution of the response variable
    • Common distributions:
      • Gaussian (Normal): Y \sim N(\mu, \sigma^2)
      • Binomial: Y \sim B(n, p)
      • Poisson: Y \sim Pois(\lambda)
      • Gamma: Y \sim Gamma(\alpha, \beta)
  2. Systematic Component

    • Linear predictor: \eta = X\beta
    • Where:
      • X is the design matrix
      • \beta is the vector of coefficients
  3. Link Function

    • Connects the mean of the response to the linear predictor: g(\mu) = \eta
    • Common link functions (see the short sketch after this list):
      • Identity: g(\mu) = \mu
      • Logit: g(\mu) = \log(\frac{\mu}{1-\mu})
      • Log: g(\mu) = \log(\mu)
      • Inverse: g(\mu) = \frac{1}{\mu}
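
Putting the three components together, here is a minimal numeric sketch (the matrix and coefficient values below are made up for illustration) that computes the linear predictor and maps it to the response mean through the inverse logit link:

import numpy as np

# Hypothetical design matrix (intercept column included) and coefficients
X = np.array([[1.0, 0.5],
              [1.0, -1.2],
              [1.0, 2.3]])
beta = np.array([0.1, 0.8])

eta = X @ beta               # systematic component: eta = X beta
mu = 1 / (1 + np.exp(-eta))  # inverse logit link: mu = g^{-1}(eta)
print(mu)                    # mean of a binary response for each row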

2. Maximum Likelihood Estimation

The log-likelihood function:

l(\beta) = \sum_{i=1}^n \log f(y_i; \theta_i, \phi)

where:

  • f is the probability density/mass function
  • \theta_i is the canonical parameter
  • \phi is the dispersion parameter
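
For example, a Poisson response with mean \mu_i has log density in exactly this form, with canonical parameter \theta_i = \log \mu_i and dispersion \phi = 1:

\log f(y_i; \theta_i) = y_i \theta_i - e^{\theta_i} - \log(y_i!)

The canonical link is therefore the log link, which is why Poisson regression models \log \mu = X\beta.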

Implementation Examples

1. Logistic Regression

import numpy as np
from scipy.optimize import minimize
from scipy.special import gammaln, xlogy
import matplotlib.pyplot as plt

class LogisticRegression:
    def __init__(self, fit_intercept=True):
        self.fit_intercept = fit_intercept
        self.coef_ = None
        self.intercept_ = None
    
    def _sigmoid(self, z):
        """Numerically stable sigmoid function"""
        # Clip to avoid overflow in exp for large negative z
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))
    
    def _add_intercept(self, X):
        """Add intercept term to X"""
        if self.fit_intercept:
            return np.c_[np.ones(X.shape[0]), X]
        return X
    
    def _negative_log_likelihood(self, beta, X, y):
        """Compute negative log-likelihood"""
        z = X @ beta
        # logaddexp(0, z) = log(1 + exp(z)) computed without overflow
        log_likelihood = np.sum(y * z - np.logaddexp(0, z))
        return -log_likelihood
    
    def _gradient(self, beta, X, y):
        """Compute gradient of negative log-likelihood"""
        z = X @ beta
        return -X.T @ (y - self._sigmoid(z))
    
    def fit(self, X, y, max_iter=100):
        """Fit logistic regression using maximum likelihood"""
        X = self._add_intercept(X)
        
        # Initialize parameters
        beta_init = np.zeros(X.shape[1])
        
        # Optimize using BFGS
        result = minimize(
            fun=self._negative_log_likelihood,
            x0=beta_init,
            args=(X, y),
            method='BFGS',
            jac=self._gradient,
            options={'maxiter': max_iter}
        )
        
        if self.fit_intercept:
            self.intercept_ = result.x[0]
            self.coef_ = result.x[1:]
        else:
            self.coef_ = result.x
        
        return self
    
    def predict_proba(self, X):
        """Predict probabilities"""
        X = self._add_intercept(X)
        # Only prepend the intercept when one was fitted
        beta = np.r_[self.intercept_, self.coef_] if self.fit_intercept else self.coef_
        return self._sigmoid(X @ beta)
    
    def predict(self, X, threshold=0.5):
        """Predict classes"""
        return (self.predict_proba(X) >= threshold).astype(int)

# Example usage
def plot_logistic_regression():
    # Generate sample data
    np.random.seed(42)
    X = np.random.randn(200, 2)
    y = (X[:, 0] + X[:, 1] > 0).astype(int)
    
    # Fit model
    model = LogisticRegression()
    model.fit(X, y)
    
    # Create grid for visualization
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(
        np.linspace(x_min, x_max, 100),
        np.linspace(y_min, y_max, 100)
    )
    
    # Make predictions
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    # Plot
    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, Z, alpha=0.4)
    plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.8)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Logistic Regression Decision Boundary')
    return plt.gcf()

2. Poisson Regression

class PoissonRegression:
    def __init__(self, fit_intercept=True):
        self.fit_intercept = fit_intercept
        self.coef_ = None
        self.intercept_ = None
    
    def _add_intercept(self, X):
        if self.fit_intercept:
            return np.c_[np.ones(X.shape[0]), X]
        return X
    
    def _negative_log_likelihood(self, beta, X, y):
        """Compute negative log-likelihood for Poisson regression"""
        mu = np.exp(X @ beta)
        # gammaln(y + 1) = log(y!), computed stably for large counts
        return -np.sum(y * np.log(mu) - mu - gammaln(y + 1))
    
    def _gradient(self, beta, X, y):
        """Compute gradient of negative log-likelihood"""
        mu = np.exp(X @ beta)
        return -X.T @ (y - mu)
    
    def fit(self, X, y, max_iter=100):
        """Fit Poisson regression using maximum likelihood"""
        X = self._add_intercept(X)
        
        # Initialize parameters
        beta_init = np.zeros(X.shape[1])
        
        # Optimize using BFGS
        result = minimize(
            fun=self._negative_log_likelihood,
            x0=beta_init,
            args=(X, y),
            method='BFGS',
            jac=self._gradient,
            options={'maxiter': max_iter}
        )
        
        if self.fit_intercept:
            self.intercept_ = result.x[0]
            self.coef_ = result.x[1:]
        else:
            self.coef_ = result.x
        
        return self
    
    def predict(self, X):
        """Predict counts"""
        X = self._add_intercept(X)
        beta = np.r_[self.intercept_, self.coef_] if self.fit_intercept else self.coef_
        return np.exp(X @ beta)
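
A quick usage sketch on simulated count data (the coefficients 0.3, 0.5 and -0.25 below are arbitrary illustrative choices):

# Simulate counts with a known log-linear mean, then fit
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 2))
y = rng.poisson(np.exp(0.3 + 0.5 * X[:, 0] - 0.25 * X[:, 1]))

model = PoissonRegression().fit(X, y)
print(model.intercept_, model.coef_)  # estimates should land near 0.3, [0.5, -0.25]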

3. Gamma Regression

class GammaRegression:
    def __init__(self, fit_intercept=True):
        self.fit_intercept = fit_intercept
        self.coef_ = None
        self.intercept_ = None
        self.shape_ = None
    
    def _add_intercept(self, X):
        if self.fit_intercept:
            return np.c_[np.ones(X.shape[0]), X]
        return X
    
    def _negative_log_likelihood(self, params, X, y):
        """Compute negative log-likelihood for Gamma regression"""
        beta = params[:-1]
        alpha = params[-1]  # shape parameter
        
        # Gamma with shape alpha and mean mu, i.e. scale = mu / alpha
        mu = np.exp(X @ beta)
        ll = (alpha * np.log(alpha / mu) + (alpha - 1) * np.log(y)
              - alpha * y / mu - gammaln(alpha))
        return -np.sum(ll)
    
    def fit(self, X, y, max_iter=100):
        """Fit Gamma regression using maximum likelihood"""
        X = self._add_intercept(X)
        
        # Initialize parameters (including shape parameter)
        params_init = np.zeros(X.shape[1] + 1)
        params_init[-1] = 1  # initial shape parameter
        
        # Optimize; the shape parameter is bounded away from zero
        result = minimize(
            fun=self._negative_log_likelihood,
            x0=params_init,
            args=(X, y),
            method='L-BFGS-B',
            bounds=[(None, None)] * X.shape[1] + [(1e-6, None)],
            options={'maxiter': max_iter}
        )
        
        if self.fit_intercept:
            self.intercept_ = result.x[0]
            self.coef_ = result.x[1:-1]
        else:
            self.coef_ = result.x[:-1]
        
        self.shape_ = result.x[-1]
        return self
    
    def predict(self, X):
        """Predict mean response"""
        X = self._add_intercept(X)
        beta = np.r_[self.intercept_, self.coef_] if self.fit_intercept else self.coef_
        return np.exp(X @ beta)
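
As with the Poisson model, a brief usage sketch on simulated positive-valued responses (all parameter values are illustrative):

# Simulate Gamma responses with a log-linear mean and known shape
rng = np.random.default_rng(1)
X = rng.normal(size=(500, 1))
alpha_true = 2.0
mu_true = np.exp(1.0 + 0.4 * X[:, 0])
y = rng.gamma(shape=alpha_true, scale=mu_true / alpha_true)

model = GammaRegression().fit(X, y)
print(model.intercept_, model.coef_, model.shape_)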

Model Diagnostics

1. Residual Analysis

def analyze_residuals(model, X, y):
    """Analyze residuals for a GLM (formulas below assume a Poisson family)"""
    y_pred = model.predict(X)
    
    # Compute different types of residuals
    response_residuals = y - y_pred
    # Pearson residuals scale by sqrt(Var(Y)); for Poisson, Var(Y) = mu
    pearson_residuals = response_residuals / np.sqrt(y_pred)
    # Poisson deviance residuals; xlogy handles y = 0 safely, and
    # np.maximum guards against tiny negatives from floating-point error
    deviance_residuals = np.sign(response_residuals) * np.sqrt(np.maximum(
        2 * (xlogy(y, y / y_pred) - (y - y_pred)), 0
    ))
    
    # Create diagnostic plots
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    
    # Residuals vs Fitted
    axes[0, 0].scatter(y_pred, response_residuals)
    axes[0, 0].axhline(y=0, color='r', linestyle='--')
    axes[0, 0].set_xlabel('Fitted values')
    axes[0, 0].set_ylabel('Response residuals')
    
    # Scale-Location
    axes[0, 1].scatter(y_pred, np.abs(pearson_residuals))
    axes[0, 1].set_xlabel('Fitted values')
    axes[0, 1].set_ylabel('|Pearson residuals|')
    
    # QQ plot
    from scipy import stats
    stats.probplot(deviance_residuals, dist="norm", plot=axes[1, 0])
    
    # Cook's distance, using the hat matrix of the raw design matrix
    # (a linear-model approximation; a weighted hat matrix is more exact)
    leverage = np.diag(X @ np.linalg.inv(X.T @ X) @ X.T)
    p = X.shape[1]
    cooks_dist = (pearson_residuals**2 / p) * leverage / (1 - leverage)**2
    axes[1, 1].scatter(range(len(cooks_dist)), cooks_dist)
    axes[1, 1].set_xlabel('Observation')
    axes[1, 1].set_ylabel("Cook's distance")
    
    plt.tight_layout()
    return fig
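
The diagnostics can be exercised with the Poisson model from earlier; the simulated data below is purely illustrative:

# Fit a Poisson model to simulated counts and draw the residual plots
rng = np.random.default_rng(2)
X = rng.normal(size=(300, 2))
y = rng.poisson(np.exp(0.2 + 0.6 * X[:, 0]))

model = PoissonRegression().fit(X, y)
fig = analyze_residuals(model, X, y)
plt.show()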

2. Deviance Analysis

def compute_deviance(model, X, y, family='poisson'):
    """Compute deviance for a GLM"""
    y_pred = model.predict(X)
    
    # xlogy(a, b) = a * log(b) with the convention xlogy(0, .) = 0,
    # so observations with y = 0 are handled safely
    if family == 'poisson':
        deviance = 2 * np.sum(
            xlogy(y, y / y_pred) - (y - y_pred)
        )
    elif family == 'gamma':
        deviance = 2 * np.sum(
            -np.log(y / y_pred) + (y - y_pred) / y_pred
        )
    elif family == 'binomial':
        deviance = 2 * np.sum(
            xlogy(y, y / y_pred) + xlogy(1 - y, (1 - y) / (1 - y_pred))
        )
    else:
        raise ValueError(f"Unknown family: {family}")
    
    return deviance
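
The quantity computed above is the deviance: twice the gap in log-likelihood between the saturated model, which fits each observation exactly, and the fitted model:

D = 2\left[\, l(y; y) - l(\hat{\mu}; y) \,\right]

For nested models, the difference in deviances is asymptotically \chi^2 distributed with degrees of freedom equal to the difference in parameter counts, which is the basis of the analysis-of-deviance test.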

Advanced Topics

1. Elastic-Net GLM

class ElasticNetGLM:
    def __init__(self, alpha=1.0, l1_ratio=0.5, family='gaussian'):
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.family = family
        self.coef_ = None
    
    def _penalty(self, beta):
        """Compute elastic net penalty"""
        l1 = self.l1_ratio * self.alpha * np.sum(np.abs(beta))
        l2 = (1 - self.l1_ratio) * self.alpha * np.sum(beta**2)
        return l1 + l2
    
    def fit(self, X, y, max_iter=1000):
        """Fit elastic net GLM using coordinate descent"""
        # Implementation details...
        return self
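
Coordinate descent is the standard solver here, but as a self-contained illustration of the penalized objective, below is a minimal proximal-gradient (ISTA) sketch for the Gaussian family. It uses its own 1/(2n) scaling of the squared loss, and the fixed iteration count is an arbitrary choice rather than a convergence criterion:

def elastic_net_gaussian(X, y, alpha=1.0, l1_ratio=0.5, n_iter=1000):
    """Illustrative ISTA solver for the Gaussian elastic net."""
    n, p = X.shape
    beta = np.zeros(p)
    # Step size from the Lipschitz constant of the smooth part
    # (squared loss plus the ridge half of the penalty)
    step = 1.0 / (np.linalg.norm(X, 2) ** 2 / n + 2 * (1 - l1_ratio) * alpha)
    for _ in range(n_iter):
        # Gradient of the smooth part of the objective
        grad = -X.T @ (y - X @ beta) / n + 2 * (1 - l1_ratio) * alpha * beta
        z = beta - step * grad
        # Soft-thresholding: proximal operator of the L1 term
        beta = np.sign(z) * np.maximum(np.abs(z) - step * alpha * l1_ratio, 0.0)
    return beta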

2. Quasi-likelihood Methods

class QuasiPoissonRegression:
    def __init__(self):
        self.coef_ = None
        self.intercept_ = None
        self.dispersion_ = None
    
    def fit(self, X, y):
        """Fit quasi-Poisson regression"""
        # First fit standard Poisson regression
        poisson_model = PoissonRegression()
        poisson_model.fit(X, y)
        
        # Estimate dispersion parameter
        y_pred = poisson_model.predict(X)
        pearson_residuals = (y - y_pred) / np.sqrt(y_pred)
        # Degrees of freedom: n minus fitted parameters (including intercept)
        self.dispersion_ = np.sum(pearson_residuals**2) / (len(y) - X.shape[1] - 1)
        
        # Store coefficients
        self.intercept_ = poisson_model.intercept_
        self.coef_ = poisson_model.coef_
        
        return self
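
The quasi-Poisson model keeps the Poisson mean structure but relaxes the variance assumption to

Var(Y_i) = \phi\, \mu_i

so the point estimates coincide with the ordinary Poisson fit, while standard errors are inflated by a factor of \sqrt{\phi} when \phi > 1 (overdispersion).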

Best Practices

  1. Model Selection

    • Choose appropriate distribution family
    • Select suitable link function
    • Consider model complexity
    • Use diagnostic tools
  2. Data Preprocessing

    • Handle missing values
    • Check for outliers
    • Scale predictors
    • Handle categorical variables
  3. Model Validation

    • Check residuals
    • Assess goodness of fit
    • Validate assumptions
    • Use cross-validation
  4. Interpretation

    • Understand coefficient meanings
    • Consider scale of predictors
    • Account for link function
    • Report uncertainty estimates