Linear Models
Linear models form the foundation of statistical learning, providing interpretable and efficient methods for both regression and classification tasks.
Theoretical Foundations
1. Linear Regression
- Model Formulation
- Linear relationship: y = X\beta + \epsilon
- Where:
  - y is the response variable
  - X is the design matrix
  - \beta are the coefficients
  - \epsilon is the error term, with \epsilon \sim N(0, \sigma^2 I)
- Ordinary Least Squares (OLS)
- Minimizes sum of squared residuals:
\min_{\beta} ||y - X\beta||^2
- Closed-form solution:
\hat{\beta} = (X^TX)^{-1}X^Ty
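The closed form follows directly from setting the gradient of the least-squares objective to zero, which yields the normal equations:
\nabla_\beta ||y - X\beta||^2 = -2X^T(y - X\beta) = 0
\Rightarrow X^TX\hat{\beta} = X^Ty
\Rightarrow \hat{\beta} = (X^TX)^{-1}X^Ty \quad \text{(assuming } X \text{ has full column rank, so } X^TX \text{ is invertible)}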
2. Ridge Regression (L2 Regularization)
- Objective Function
\min_{\beta} ||y - X\beta||^2 + \lambda||\beta||^2
where \lambda \geq 0 is the regularization parameter
- Solution
\hat{\beta}_{ridge} = (X^TX + \lambda I)^{-1}X^Ty
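Setting the gradient of the penalized objective to zero gives the same structure as OLS with a shifted Gram matrix:
\nabla_\beta \left( ||y - X\beta||^2 + \lambda||\beta||^2 \right) = -2X^T(y - X\beta) + 2\lambda\beta = 0
\Rightarrow (X^TX + \lambda I)\hat{\beta}_{ridge} = X^Ty
For \lambda > 0, X^TX + \lambda I is positive definite and therefore always invertible, so ridge remains well-posed even when features are collinear or p > n.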
3. Lasso Regression (L1 Regularization)
- Objective Function
\min_{\beta} ||y - X\beta||^2 + \lambda||\beta||_1
- Properties
  - Promotes sparsity: some coefficients are driven exactly to zero
  - Performs implicit feature selection
  - No closed-form solution; typically solved by coordinate descent (the coordinate update is sketched below)
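Although the full lasso problem has no closed form, each one-dimensional coordinate subproblem does. For the commonly used scaled objective \frac{1}{2n}||y - X\beta||^2 + \lambda||\beta||_1 (the scaling the coordinate-descent implementation below follows), the update for coordinate j is:
S(z, \gamma) = \text{sign}(z)\,\max(|z| - \gamma, 0)
\beta_j \leftarrow \frac{S(x_j^T r_j,\ n\lambda)}{x_j^T x_j}, \qquad r_j = y - \sum_{k \neq j} x_k \beta_k
where S is the soft-thresholding operator and r_j is the partial residual with feature j's contribution removed.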
Implementation Examples
1. Linear Regression Implementation
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
class LinearRegression:
def __init__(self, fit_intercept=True):
self.fit_intercept = fit_intercept
self.coef_ = None
self.intercept_ = None
def fit(self, X, y):
"""Fit linear regression using OLS"""
if self.fit_intercept:
X = np.c_[np.ones(X.shape[0]), X]
# Compute coefficients using normal equation
beta = np.linalg.inv(X.T @ X) @ X.T @ y
if self.fit_intercept:
self.intercept_ = beta[0]
self.coef_ = beta[1:]
else:
self.coef_ = beta
return self
def predict(self, X):
"""Make predictions"""
if self.fit_intercept:
return self.intercept_ + X @ self.coef_
return X @ self.coef_
def score(self, X, y):
"""Compute R-squared score"""
y_pred = self.predict(X)
u = ((y - y_pred) ** 2).sum()
v = ((y - y.mean()) ** 2).sum()
return 1 - u/v
def confidence_intervals(self, X, y, alpha=0.05):
"""Compute confidence intervals for coefficients"""
if self.fit_intercept:
X = np.c_[np.ones(X.shape[0]), X]
n = X.shape[0]
p = X.shape[1]
# Compute residuals
y_pred = self.predict(X[:, 1:] if self.fit_intercept else X)
residuals = y - y_pred
# Compute standard error
mse = np.sum(residuals**2) / (n - p)
var_beta = mse * np.linalg.inv(X.T @ X)
se = np.sqrt(np.diag(var_beta))
        # Two-sided critical value from the t distribution
t_stat = stats.t.ppf(1 - alpha/2, n-p)
# Compute confidence intervals
beta = np.r_[self.intercept_, self.coef_] if self.fit_intercept else self.coef_
ci_lower = beta - t_stat * se
ci_upper = beta + t_stat * se
return ci_lower, ci_upper
# Example usage with visualization
def visualize_linear_regression():
# Generate sample data
np.random.seed(42)
X = np.linspace(0, 10, 100).reshape(-1, 1)
y = 2 * X.ravel() + 1 + np.random.normal(0, 1, 100)
# Fit model
model = LinearRegression()
model.fit(X, y)
# Plot results
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.5, label='Data')
plt.plot(X, model.predict(X), 'r-', label='Fitted line')
    # Coefficient confidence intervals (intercept and slope)
    ci_lower, ci_upper = model.confidence_intervals(X, y)
    print('95% CIs for [intercept, slope]:', list(zip(ci_lower, ci_upper)))
    # Shade an approximate 95% prediction band around the fitted line
    resid_std = np.std(y - model.predict(X))
    plt.fill_between(X.ravel(),
                     model.predict(X) - 1.96 * resid_std,
                     model.predict(X) + 1.96 * resid_std,
                     alpha=0.2, label='95% prediction band')
plt.xlabel('X')
plt.ylabel('y')
plt.title('Linear Regression with Confidence Intervals')
plt.legend()
return plt.gcf()
2. Ridge Regression Implementation
class RidgeRegression:
def __init__(self, alpha=1.0, fit_intercept=True):
self.alpha = alpha
self.fit_intercept = fit_intercept
self.coef_ = None
self.intercept_ = None
def fit(self, X, y):
"""Fit ridge regression"""
if self.fit_intercept:
X = np.c_[np.ones(X.shape[0]), X]
n_samples, n_features = X.shape
# Add ridge penalty
I = np.eye(n_features)
if self.fit_intercept:
I[0, 0] = 0 # Don't penalize intercept
# Compute coefficients
beta = np.linalg.inv(X.T @ X + self.alpha * I) @ X.T @ y
if self.fit_intercept:
self.intercept_ = beta[0]
self.coef_ = beta[1:]
else:
self.coef_ = beta
return self
def predict(self, X):
"""Make predictions"""
if self.fit_intercept:
return self.intercept_ + X @ self.coef_
return X @ self.coef_
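A minimal usage sketch, assuming synthetic nearly-collinear features (names and values here are illustrative only), showing how increasing alpha shrinks the coefficients:
# Synthetic data: x2 is nearly collinear with x1
np.random.seed(0)
n = 200
x1 = np.random.normal(size=n)
x2 = x1 + np.random.normal(scale=0.1, size=n)
X_demo = np.c_[x1, x2]
y_demo = 3 * x1 - 2 * x2 + np.random.normal(size=n)
for alpha in [0.0, 1.0, 100.0]:
    model = RidgeRegression(alpha=alpha).fit(X_demo, y_demo)
    print(f'alpha={alpha:>6}: coef={np.round(model.coef_, 3)}, '
          f'norm={np.linalg.norm(model.coef_):.3f}')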
3. Lasso Implementation using Coordinate Descent
class LassoRegression:
def __init__(self, alpha=1.0, max_iter=1000, tol=1e-4):
self.alpha = alpha
self.max_iter = max_iter
self.tol = tol
self.coef_ = None
def _soft_threshold(self, x, lambda_):
"""Soft-thresholding operator"""
return np.sign(x) * np.maximum(np.abs(x) - lambda_, 0)
def fit(self, X, y):
"""Fit lasso regression using coordinate descent"""
n_samples, n_features = X.shape
# Initialize coefficients
self.coef_ = np.zeros(n_features)
        # Precompute X^T X (only its diagonal, the squared column norms, is used)
        XtX = X.T @ X
for _ in range(self.max_iter):
coef_old = self.coef_.copy()
# Update each coordinate
for j in range(n_features):
# Remove current feature's contribution
r = y - X @ self.coef_ + X[:, j] * self.coef_[j]
# Compute coordinate update
self.coef_[j] = self._soft_threshold(
X[:, j] @ r,
self.alpha * n_samples
) / (XtX[j, j])
# Check convergence
if np.sum(np.abs(self.coef_ - coef_old)) < self.tol:
break
return self
def predict(self, X):
"""Make predictions"""
return X @ self.coef_
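A quick sparsity check on synthetic standardized data where only the first two of ten features are informative (the sizes and the alpha value are arbitrary); most fitted coefficients should come out exactly zero:
np.random.seed(0)
n, p = 200, 10
X_sparse = np.random.normal(size=(n, p))
X_sparse = (X_sparse - X_sparse.mean(axis=0)) / X_sparse.std(axis=0)  # standardize
y_sparse = 4 * X_sparse[:, 0] - 2 * X_sparse[:, 1] + np.random.normal(size=n)
y_sparse = y_sparse - y_sparse.mean()  # center y: this implementation fits no intercept
lasso = LassoRegression(alpha=0.1).fit(X_sparse, y_sparse)
print(np.round(lasso.coef_, 3))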
Model Selection and Validation
1. Cross-Validation
def cross_validate_linear_model(model_class, X, y, k_folds=5, **model_params):
"""Perform k-fold cross-validation"""
n_samples = len(y)
fold_size = n_samples // k_folds
scores = []
for i in range(k_folds):
# Create train/test split
test_idx = slice(i * fold_size, (i + 1) * fold_size)
train_idx = list(set(range(n_samples)) - set(range(*test_idx.indices(n_samples))))
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
# Fit model and compute score
model = model_class(**model_params)
model.fit(X_train, y_train)
scores.append(model.score(X_test, y_test))
return np.mean(scores), np.std(scores)
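For example, with a small synthetic problem (data and fold count are illustrative; model_class just needs fit and score methods, as the LinearRegression class above provides):
np.random.seed(1)
X_cv = np.random.normal(size=(100, 3))
y_cv = X_cv @ np.array([1.5, -2.0, 0.5]) + np.random.normal(size=100)
mean_r2, std_r2 = cross_validate_linear_model(LinearRegression, X_cv, y_cv, k_folds=5)
print(f'CV R^2: {mean_r2:.3f} +/- {std_r2:.3f}')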
2. Regularization Path
def compute_regularization_path(X, y, alphas):
    """Compute the ridge regularization path: coefficients and R^2 for each alpha"""
    coefs = []
    scores = []
    for alpha in alphas:
        model = RidgeRegression(alpha=alpha)
        model.fit(X, y)
        coefs.append(model.coef_)
        # RidgeRegression above does not define score(), so compute R^2 directly
        y_pred = model.predict(X)
        scores.append(1 - np.sum((y - y_pred) ** 2) / np.sum((y - y.mean()) ** 2))
    return np.array(coefs), np.array(scores)
def plot_regularization_path(X, y, alphas):
"""Plot regularization path"""
coefs, scores = compute_regularization_path(X, y, alphas)
plt.figure(figsize=(12, 5))
# Plot coefficients
plt.subplot(121)
for i in range(coefs.shape[1]):
plt.plot(np.log10(alphas), coefs[:, i], label=f'Feature {i+1}')
plt.xlabel('log(alpha)')
plt.ylabel('Coefficients')
plt.title('Regularization Path')
plt.legend()
# Plot scores
plt.subplot(122)
plt.plot(np.log10(alphas), scores)
plt.xlabel('log(alpha)')
plt.ylabel('R-squared')
plt.title('Model Performance')
return plt.gcf()
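A typical call sweeps alpha over a log-spaced grid (the range below is only a starting point), reusing X_cv, y_cv from the cross-validation example above:
alphas = np.logspace(-3, 3, 50)
fig = plot_regularization_path(X_cv, y_cv, alphas)
plt.show()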
Advanced Topics
1. Elastic Net
class ElasticNet(LassoRegression):
    # Reuses _soft_threshold and predict from LassoRegression above
    def __init__(self, alpha=1.0, l1_ratio=0.5, max_iter=1000, tol=1e-4):
self.alpha = alpha
self.l1_ratio = l1_ratio
self.max_iter = max_iter
self.tol = tol
self.coef_ = None
def fit(self, X, y):
"""Fit elastic net using coordinate descent"""
n_samples, n_features = X.shape
# Initialize coefficients
self.coef_ = np.zeros(n_features)
# Compute elastic net parameters
alpha_l1 = self.alpha * self.l1_ratio
alpha_l2 = self.alpha * (1 - self.l1_ratio)
for _ in range(self.max_iter):
coef_old = self.coef_.copy()
for j in range(n_features):
# Remove current feature's contribution
r = y - X @ self.coef_ + X[:, j] * self.coef_[j]
# Compute coordinate update
self.coef_[j] = self._soft_threshold(
X[:, j] @ r,
alpha_l1 * n_samples
) / (X[:, j] @ X[:, j] + alpha_l2 * n_samples)
# Check convergence
if np.sum(np.abs(self.coef_ - coef_old)) < self.tol:
break
return self
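Usage mirrors the lasso, with l1_ratio interpolating between ridge-like (0) and lasso-like (1) behavior; a quick sketch on the sparse synthetic data from the lasso example above:
enet = ElasticNet(alpha=0.1, l1_ratio=0.7).fit(X_sparse, y_sparse)
print(np.round(enet.coef_, 3))  # sparser than ridge, less sparse than the pure lasso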
2. Robust Regression
class HuberRegression:
def __init__(self, epsilon=1.35, max_iter=100, tol=1e-4):
self.epsilon = epsilon
self.max_iter = max_iter
self.tol = tol
self.coef_ = None
def _huber_loss(self, residuals):
"""Compute Huber loss"""
mask = np.abs(residuals) <= self.epsilon
loss = np.zeros_like(residuals)
loss[mask] = 0.5 * residuals[mask]**2
loss[~mask] = self.epsilon * (np.abs(residuals[~mask]) - 0.5 * self.epsilon)
return loss.sum()
def fit(self, X, y):
"""Fit Huber regression using IRLS"""
n_samples, n_features = X.shape
# Initialize coefficients
self.coef_ = np.zeros(n_features)
for _ in range(self.max_iter):
coef_old = self.coef_.copy()
# Compute residuals
residuals = y - X @ self.coef_
# Compute weights
weights = np.ones(n_samples)
mask = np.abs(residuals) > self.epsilon
weights[mask] = self.epsilon / np.abs(residuals[mask])
# Update coefficients
W = np.diag(weights)
self.coef_ = np.linalg.inv(X.T @ W @ X) @ X.T @ W @ y
# Check convergence
if np.sum(np.abs(self.coef_ - coef_old)) < self.tol:
break
return self
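A small comparison on synthetic data with a few gross outliers (all values illustrative), showing how the Huber weights temper their influence relative to OLS; since this HuberRegression fits no intercept, the OLS baseline uses fit_intercept=False as well:
np.random.seed(0)
n = 100
X_out = np.random.normal(size=(n, 1))
y_out = 2.0 * X_out.ravel() + np.random.normal(scale=0.5, size=n)
y_out[:5] += 15  # inject a few gross outliers
ols = LinearRegression(fit_intercept=False).fit(X_out, y_out)
huber = HuberRegression().fit(X_out, y_out)
print('OLS slope:', np.round(ols.coef_, 3), '| Huber slope:', np.round(huber.coef_, 3))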
Best Practices
- Data Preprocessing (see the preprocessing sketch after this list)
- Scale features
- Handle missing values
- Check for multicollinearity
- Remove outliers if necessary
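A minimal preprocessing sketch, assuming a raw feature matrix X_raw (the helper name and the 0.95 threshold are arbitrary choices, not a fixed recipe): standardize the columns and flag strongly correlated pairs as a quick multicollinearity screen; a fuller check would use variance inflation factors.
def preprocess(X_raw, corr_threshold=0.95):
    """Standardize features and report highly correlated column pairs (hypothetical helper)."""
    X_std = (X_raw - X_raw.mean(axis=0)) / X_raw.std(axis=0)
    corr = np.corrcoef(X_std, rowvar=False)
    flagged = [(i, j)
               for i in range(corr.shape[0])
               for j in range(i + 1, corr.shape[0])
               if abs(corr[i, j]) > corr_threshold]
    return X_std, flagged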
- Model Selection
- Use cross-validation
- Consider regularization
- Check assumptions
- Compare different models
- Diagnostics (see the residual-check sketch after this list)
- Check residuals
- Test for heteroscedasticity
- Assess influential points
- Validate linearity assumption
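A quick visual diagnostic sketch, assuming a model fitted with any of the classes above (the helper name is ours): plotting residuals against fitted values helps spot non-linearity and heteroscedasticity.
def residual_plot(model, X, y):
    """Scatter residuals against fitted values as a rough visual diagnostic."""
    y_pred = model.predict(X)
    residuals = y - y_pred
    plt.figure(figsize=(6, 4))
    plt.scatter(y_pred, residuals, alpha=0.5)
    plt.axhline(0, color='r', linestyle='--')
    plt.xlabel('Fitted values')
    plt.ylabel('Residuals')
    plt.title('Residuals vs. Fitted')
    return plt.gcf()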
- Interpretation
- Examine coefficients
- Calculate confidence intervals
- Consider feature importance
- Assess model fit