Introduction to Statistical Learning
Statistical learning provides the theoretical foundation for machine learning, focusing on the mathematical frameworks and principles that enable learning from data.
Key Concepts
Statistical Learning Framework
- Learning from data: estimating an unknown relationship between inputs and outputs from a finite sample
- Model estimation: choosing a function class and fitting its parameters to the training data
- Prediction and inference: using the fitted model both to predict new observations and to interpret the underlying relationship
- Error measurement: quantifying how well the model generalizes beyond the training sample (formalized in the decomposition below)
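These ideas are usually formalized as follows (a brief sketch of the standard textbook setup, not taken from the original): a response Y is modeled as an unknown function of the features plus irreducible noise, and the expected squared prediction error of any estimate splits into bias, variance, and noise terms.

```latex
% Statistical learning setup: noise has mean zero and variance sigma^2
Y = f(X) + \varepsilon, \qquad \mathbb{E}[\varepsilon] = 0,\ \mathrm{Var}(\varepsilon) = \sigma^2
% Expected squared prediction error of an estimate \hat{f} at a point x_0:
\mathbb{E}\big[(Y - \hat{f}(x_0))^2\big]
  = \underbrace{\big(\mathbb{E}[\hat{f}(x_0)] - f(x_0)\big)^2}_{\text{bias}^2}
  + \underbrace{\mathrm{Var}\big(\hat{f}(x_0)\big)}_{\text{variance}}
  + \underbrace{\sigma^2}_{\text{irreducible error}}
```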
Implementation Examples
1. Basic Statistical Learning Setup
```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

class StatisticalLearner:
    """Template interface for a statistical learning model."""

    def __init__(self):
        self.parameters = None

    def fit(self, X, y):
        """Fit model to training data."""
        raise NotImplementedError("subclasses implement fit()")

    def predict(self, X):
        """Make predictions on new data."""
        raise NotImplementedError("subclasses implement predict()")

    def score(self, X, y):
        """Evaluate model performance (MSE and R^2)."""
        predictions = self.predict(X)
        return {
            'mse': mean_squared_error(y, predictions),
            'r2': r2_score(y, predictions)
        }

# Example usage: simulate a linear signal with light Gaussian noise
X = np.random.randn(100, 2)                                   # Features
y = 2 * X[:, 0] + 3 * X[:, 1] + np.random.randn(100) * 0.1    # Target

# Hold out 20% of the data for final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
```
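The base class leaves fit and predict abstract. As a minimal sketch of a concrete learner plugged into this interface (the LeastSquaresLearner name is ours, introduced for illustration), ordinary least squares can be fit with numpy's least-squares solver:

```python
class LeastSquaresLearner(StatisticalLearner):
    """Ordinary least squares, illustrating the interface above."""

    def fit(self, X, y):
        # Prepend an intercept column, then solve min ||A beta - y||^2
        A = np.column_stack([np.ones(len(X)), X])
        self.parameters, *_ = np.linalg.lstsq(A, y, rcond=None)
        return self

    def predict(self, X):
        A = np.column_stack([np.ones(len(X)), X])
        return A @ self.parameters

learner = LeastSquaresLearner().fit(X_train, y_train)
print(learner.score(X_test, y_test))  # r2 should be close to 1 here
```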
2. Bias-Variance Analysis
```python
def bias_variance_decomposition(model, X_train, y_train, X_test, y_test,
                                n_bootstraps=100):
    """Estimate bias and variance using bootstrap resampling."""
    predictions = np.zeros((n_bootstraps, len(X_test)))
    for i in range(n_bootstraps):
        # Draw a bootstrap sample (with replacement) of the training set
        indices = np.random.randint(0, len(X_train), len(X_train))
        X_boot = X_train[indices]
        y_boot = y_train[indices]
        # Refit the model and record its test-set predictions
        model.fit(X_boot, y_boot)
        predictions[i, :] = model.predict(X_test)
    # Average prediction across the bootstrap fits
    mean_pred = np.mean(predictions, axis=0)
    # Caveat: y_test is noisy, so this 'bias' estimate also absorbs the
    # irreducible error; the exact decomposition would need the true f(x)
    bias = np.mean((mean_pred - y_test) ** 2)
    variance = np.mean(np.var(predictions, axis=0))
    return {
        'bias': bias,
        'variance': variance,
        'total_error': bias + variance
    }
```
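A quick way to see the trade-off, assuming the synthetic data from the setup above is still in scope: a flexible model such as an unpruned decision tree will typically show much higher variance than plain linear regression on the same split.

```python
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

for m in [LinearRegression(), DecisionTreeRegressor()]:
    stats = bias_variance_decomposition(m, X_train, y_train, X_test, y_test)
    print(type(m).__name__, stats)
# Expect the tree's 'variance' to dominate its error on this linear signal
```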
Model Evaluation Framework
1. Cross-Validation Implementation
```python
def custom_cross_validation(model, X, y, n_folds=5):
    """Implement k-fold cross-validation from scratch.

    Assumes the data is already shuffled and that model.score returns a
    scalar, as sklearn estimators do. Any remainder samples beyond
    n_folds * fold_size never appear in a validation fold.
    """
    fold_size = len(X) // n_folds
    scores = []
    for i in range(n_folds):
        # The i-th contiguous block is the validation fold
        start_idx = i * fold_size
        end_idx = start_idx + fold_size
        X_val = X[start_idx:end_idx]
        y_val = y[start_idx:end_idx]
        # Everything outside that block is the training set
        X_train = np.concatenate([X[:start_idx], X[end_idx:]])
        y_train = np.concatenate([y[:start_idx], y[end_idx:]])
        # Train on the remaining folds, evaluate on the held-out fold
        model.fit(X_train, y_train)
        scores.append(model.score(X_val, y_val))
    return np.mean(scores), np.std(scores)
```
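As a sanity check (a sketch, assuming the X and y generated earlier), the hand-rolled version should roughly agree with scikit-learn's cross_val_score; small differences are expected because KFold partitions remainder samples differently.

```python
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

mean_r2, std_r2 = custom_cross_validation(Ridge(), X, y, n_folds=5)
print(f"custom:  {mean_r2:.3f} +/- {std_r2:.3f}")
print(f"sklearn: {cross_val_score(Ridge(), X, y, cv=5).mean():.3f}")
```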
2. Model Selection
```python
def model_selection(models, X_train, y_train, X_val, y_val):
    """Compare multiple models and select the best one."""
    results = {}
    for name, model in models.items():
        # Train each candidate on the same training set
        model.fit(X_train, y_train)
        # Record both scores; a large train/validation gap signals overfitting
        train_score = model.score(X_train, y_train)
        val_score = model.score(X_val, y_val)
        results[name] = {
            'train_score': train_score,
            'val_score': val_score,
            'model': model
        }
    # Pick the (name, info) pair with the highest validation score
    best_model = max(results.items(),
                     key=lambda x: x[1]['val_score'])
    return results, best_model

# Example usage. Note that the test split stands in for a validation set
# here; in practice, keep a separate validation set and reserve the test
# set for the final error estimate.
from sklearn.linear_model import LinearRegression, Ridge, Lasso

models = {
    'linear': LinearRegression(),
    'ridge': Ridge(alpha=1.0),
    'lasso': Lasso(alpha=1.0)
}
results, best_model = model_selection(
    models, X_train, y_train, X_test, y_test
)
```
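Inspecting the returned dictionary makes the train/validation gap visible; since best_model is a (name, info) tuple, the fitted estimator itself sits at best_model[1]['model']:

```python
for name, info in results.items():
    print(f"{name:>6}: train={info['train_score']:.3f}  "
          f"val={info['val_score']:.3f}")
print("selected:", best_model[0])
```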
Best Practices
1. Data Preprocessing
```python
def preprocess_data(X, y=None, standardize=True, remove_outliers=True):
    """Preprocess data for statistical learning.

    If y is given, outlier removal filters X and y together so that
    rows stay aligned with their targets.
    """
    if standardize:
        # Standardize features to zero mean and unit variance
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        X = (X - mean) / std
    if remove_outliers:
        # Drop rows with any feature beyond 1.5 * IQR of the quartiles
        Q1 = np.percentile(X, 25, axis=0)
        Q3 = np.percentile(X, 75, axis=0)
        IQR = Q3 - Q1
        mask = ~((X < (Q1 - 1.5 * IQR)) | (X > (Q3 + 1.5 * IQR))).any(axis=1)
        X = X[mask]
        if y is not None:
            y = y[mask]
    return (X, y) if y is not None else X
```
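With the data from the setup section, passing y keeps the targets aligned with whichever rows survive the outlier filter:

```python
X_clean, y_clean = preprocess_data(X_train, y_train)
print(X_train.shape, "->", X_clean.shape)  # a few rows may be dropped
```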
2. Model Diagnostics
```python
def model_diagnostics(model, X, y):
    """Perform basic diagnostics on a fitted model."""
    predictions = model.predict(X)
    residuals = y - predictions
    diagnostics = {
        # A residual mean far from zero suggests systematic bias
        'residuals_mean': np.mean(residuals),
        'residuals_std': np.std(residuals),
        'r2_score': r2_score(y, predictions),
        'mse': mean_squared_error(y, predictions)
    }
    return diagnostics
```
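For example, running the diagnostics on the winner from the model-selection step (assuming the variables from the earlier snippets are in scope):

```python
report = model_diagnostics(best_model[1]['model'], X_test, y_test)
for key, value in report.items():
    print(f"{key}: {value:.4f}")
```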
Applications
- Regression Analysis
  - Linear regression
  - Polynomial regression
  - Regularized regression
- Classification
  - Logistic regression
  - Discriminant analysis
  - Support vector machines
- Model Selection
  - Cross-validation
  - Information criteria (see the sketch after this list)
  - Bootstrap methods
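Information criteria are listed above but not demonstrated earlier. As a brief sketch using the standard Gaussian-model formulas (the information_criteria helper is ours, not from the original), AIC and BIC trade training fit against parameter count:

```python
def information_criteria(y_true, y_pred, n_params):
    """AIC/BIC for a Gaussian model: fit term plus a complexity penalty."""
    n = len(y_true)
    rss = np.sum((y_true - y_pred) ** 2)
    fit_term = n * np.log(rss / n)  # -2 log-likelihood up to a constant
    return {
        'aic': fit_term + 2 * n_params,
        'bic': fit_term + np.log(n) * n_params,
    }

# Computed on the training data; 3 params = 2 coefficients + intercept
ols = LinearRegression().fit(X_train, y_train)
print(information_criteria(y_train, ols.predict(X_train), n_params=3))
```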