Stacking and Blending
Stacking and blending are ensemble methods that combine the predictions of multiple base models: stacking trains a meta-model on out-of-fold base predictions, while blending tunes a weighted combination on a hold-out set. This section covers their implementation and best practices.
Theoretical Foundation
1. Stacking Concepts
- Multiple base models
- Meta-model (level-2 model)
- Cross-validation predictions
- Model diversity
2. Blending Concepts
- Hold-out set predictions
- Weighted combinations (see the sketch after this list)
- Model selection
- Overfitting prevention
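As a minimal illustration of a weighted combination, consider two models' positive-class probabilities; the numbers here are made up, and in practice the weights would be tuned on a hold-out set as shown later in this section:

import numpy as np

# Hypothetical positive-class probabilities from two models
p1 = np.array([0.90, 0.20, 0.60])
p2 = np.array([0.70, 0.40, 0.80])

# Blend with weights that sum to 1
weights = np.array([0.6, 0.4])
blend = weights[0] * p1 + weights[1] * p2
print(blend)  # [0.82 0.28 0.68]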
Implementation
1. Basic Stacking
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

def create_stacking_classifier(cv=5, final_estimator=None):
    """Create a stacking classifier with diverse base estimators."""
    # Define base estimators
    estimators = [
        ('lr', LogisticRegression()),
        ('dt', DecisionTreeClassifier()),
        ('svm', SVC(probability=True)),
        ('knn', KNeighborsClassifier())
    ]
    # Default meta-learner: logistic regression
    if final_estimator is None:
        final_estimator = LogisticRegression()
    # Base predictions are generated out-of-fold via `cv`, so the
    # meta-learner never sees predictions made on training rows
    stacking = StackingClassifier(
        estimators=estimators,
        final_estimator=final_estimator,
        cv=cv,
        stack_method='predict_proba'
    )
    return stacking
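A minimal usage sketch, assuming a synthetic dataset from make_classification (the data and split below are illustrative, not part of the original):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Toy data purely for illustration
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf = create_stacking_classifier(cv=5)
clf.fit(X_train, y_train)
print("Test accuracy:", clf.score(X_test, y_test))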
2. Custom Stacking Implementation
import numpy as np
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, ClassifierMixin

class CustomStackingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        """Initialize custom stacking classifier (binary 0/1 targets)."""
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        """Fit base models out-of-fold, then fit the meta-model."""
        X, y = np.asarray(X), np.asarray(y)
        # One meta-feature per base model: out-of-fold P(class=1)
        self.meta_features_ = np.zeros((X.shape[0], len(self.base_models)))
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        for i, model in enumerate(self.base_models):
            # Each sample's meta-feature comes from a model that never saw it
            for train_idx, val_idx in kf.split(X):
                X_train, X_val = X[train_idx], X[val_idx]
                y_train = y[train_idx]
                model.fit(X_train, y_train)
                self.meta_features_[val_idx, i] = model.predict_proba(X_val)[:, 1]
            # Refit on the full dataset for use at prediction time
            model.fit(X, y)
        # Train the meta-model on the out-of-fold predictions
        self.meta_model.fit(self.meta_features_, y)
        return self

    def predict_proba(self, X):
        """Predict probability estimates."""
        # Generate meta-features from the fully-trained base models
        meta_features = np.column_stack([
            model.predict_proba(X)[:, 1]
            for model in self.base_models
        ])
        return self.meta_model.predict_proba(meta_features)

    def predict(self, X):
        """Predict class labels (assumes 0/1 labels)."""
        return np.argmax(self.predict_proba(X), axis=1)
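A quick sketch of how the custom class might be used, reusing the toy split from the earlier example; the base and meta models here are arbitrary choices:

stack = CustomStackingClassifier(
    base_models=[DecisionTreeClassifier(), KNeighborsClassifier()],
    meta_model=LogisticRegression(),
    n_folds=5
)
stack.fit(X_train, y_train)
print("Test accuracy:", (stack.predict(X_test) == y_test).mean())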
3. Blending Implementation
from scipy.optimize import minimize
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

class Blender:
    def __init__(self, models, weights=None):
        """Initialize blender; weights=None means they are learned in fit()."""
        self.models = models
        self.weights = None if weights is None else np.asarray(weights, dtype=float)

    def fit(self, X, y, val_size=0.3):
        """Fit models on a train split; tune weights on a hold-out split."""
        X, y = np.asarray(X), np.asarray(y)
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=val_size, random_state=42
        )
        # Hold-out predictions used for weight optimization
        self.val_predictions = np.zeros((X_val.shape[0], len(self.models)))
        for i, model in enumerate(self.models):
            model.fit(X_train, y_train)
            self.val_predictions[:, i] = model.predict_proba(X_val)[:, 1]
        # Optimize weights only if none were provided
        if self.weights is None:
            self.optimize_weights(y_val)
        # Retrain on the full dataset for final use
        for model in self.models:
            model.fit(X, y)
        return self

    def optimize_weights(self, y_val):
        """Find convex weights minimizing hold-out log loss."""
        def loss_function(weights):
            # Weighted average of the hold-out probability columns
            weighted_preds = self.val_predictions @ weights
            return log_loss(y_val, np.clip(weighted_preds, 1e-15, 1 - 1e-15))
        n = len(self.models)
        constraints = (
            {'type': 'eq', 'fun': lambda w: np.sum(w) - 1},  # sum to 1
            {'type': 'ineq', 'fun': lambda w: w}             # non-negative
        )
        result = minimize(
            loss_function,
            np.ones(n) / n,  # start from uniform weights
            constraints=constraints,
            method='SLSQP'
        )
        self.weights = result.x

    def predict_proba(self, X):
        """Blend the models' positive-class probabilities."""
        predictions = np.column_stack([
            model.predict_proba(X)[:, 1] for model in self.models
        ])
        return predictions @ self.weights

    def predict(self, X):
        """Threshold blended probabilities at 0.5."""
        return (self.predict_proba(X) > 0.5).astype(int)
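A usage sketch for the blender, again on the toy split from the basic-stacking example (model choices are arbitrary):

blender = Blender(models=[
    LogisticRegression(), DecisionTreeClassifier(), KNeighborsClassifier()
])
blender.fit(X_train, y_train)
print("Learned weights:", blender.weights)
print("Test accuracy:", (blender.predict(X_test) == y_test).mean())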
Advanced Techniques
1. Multi-level Stacking
class MultiLevelStacking:
    def __init__(self, base_models, meta_models, final_model):
        """Initialize multi-level stacking (binary 0/1 targets)."""
        self.base_models = base_models
        self.meta_models = meta_models
        self.final_model = final_model

    def fit(self, X, y):
        """Fit multi-level stacking model."""
        X, y = np.asarray(X), np.asarray(y)
        # First level: out-of-fold predictions from the base models
        first_level_features = np.zeros((X.shape[0], len(self.base_models)))
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        for i, model in enumerate(self.base_models):
            for train_idx, val_idx in kf.split(X):
                X_train, X_val = X[train_idx], X[val_idx]
                y_train = y[train_idx]
                model.fit(X_train, y_train)
                first_level_features[val_idx, i] = model.predict_proba(X_val)[:, 1]
            model.fit(X, y)
        # Second level: meta-models fit and predict on the same rows,
        # which leaks; a second out-of-fold pass would be safer
        second_level_features = np.zeros((X.shape[0], len(self.meta_models)))
        for i, model in enumerate(self.meta_models):
            model.fit(first_level_features, y)
            second_level_features[:, i] = model.predict_proba(first_level_features)[:, 1]
        # Final level
        self.final_model.fit(second_level_features, y)
        return self

    def predict_proba(self, X):
        """Propagate predictions through all three levels."""
        first = np.column_stack([
            m.predict_proba(X)[:, 1] for m in self.base_models
        ])
        second = np.column_stack([
            m.predict_proba(first)[:, 1] for m in self.meta_models
        ])
        return self.final_model.predict_proba(second)

    def predict(self, X):
        """Predict class labels (assumes 0/1 labels)."""
        return np.argmax(self.predict_proba(X), axis=1)
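A hypothetical configuration, reusing the toy split; note that the second-level leakage flagged in the comments above makes this variant easy to overfit:

ml_stack = MultiLevelStacking(
    base_models=[DecisionTreeClassifier(), KNeighborsClassifier(),
                 SVC(probability=True)],
    meta_models=[LogisticRegression(), DecisionTreeClassifier()],
    final_model=LogisticRegression()
)
ml_stack.fit(X_train, y_train)
print("Test accuracy:", (ml_stack.predict(X_test) == y_test).mean())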
2. Feature-Weighted Linear Stacking
In feature-weighted linear stacking (Sill et al., 2009), the blending weights are not constants but linear functions of meta-features: model i receives weight w_i(x) = v_i . f(x), so the blended prediction is the sum over models of (v_i . f(x)) * p_i(x). Since this expression is linear in the products f_j(x) * p_i(x), all weight vectors can be learned jointly with a single linear regression over those products.

from sklearn.linear_model import LinearRegression

class FeatureWeightedLinearStacking:
    def __init__(self, models, meta_feature_fn=None):
        """meta_feature_fn maps X to meta-features; defaults to X itself."""
        self.models = models
        self.meta_feature_fn = meta_feature_fn
        self.regressor = None

    def _meta(self, X):
        """Meta-features plus a constant column (so constant weights,
        i.e. plain linear stacking, remain a special case)."""
        f = X if self.meta_feature_fn is None else self.meta_feature_fn(X)
        return np.hstack([np.ones((f.shape[0], 1)), f])

    def _design(self, X, base_predictions):
        """Every product f_j(x) * p_i(x); regressing y on these
        products learns all weight vectors v_i jointly."""
        f = self._meta(X)
        return np.hstack([
            f * base_predictions[:, i:i + 1]
            for i in range(len(self.models))
        ])

    def fit(self, X, y):
        """Fit base models, then regress y on the product features."""
        X, y = np.asarray(X, dtype=float), np.asarray(y)
        # In-sample base predictions for simplicity; out-of-fold
        # predictions (as in the stacking examples above) are safer
        base_predictions = np.column_stack([
            model.fit(X, y).predict_proba(X)[:, 1]
            for model in self.models
        ])
        self.regressor = LinearRegression().fit(
            self._design(X, base_predictions), y
        )
        return self

    def predict_proba(self, X):
        """Blend with feature-dependent weights; clip to [0, 1]."""
        X = np.asarray(X, dtype=float)
        base_predictions = np.column_stack([
            model.predict_proba(X)[:, 1] for model in self.models
        ])
        return np.clip(
            self.regressor.predict(self._design(X, base_predictions)),
            0.0, 1.0
        )
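A usage sketch, assuming the same toy split as before:

fwls = FeatureWeightedLinearStacking(
    models=[LogisticRegression(), DecisionTreeClassifier()]
)
fwls.fit(X_train, y_train)
preds = (fwls.predict_proba(X_test) > 0.5).astype(int)
print("Test accuracy:", (preds == y_test).mean())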
Best Practices
1. Model Selection
- Choose diverse base models
- Consider computational cost
- Balance model complexity
- Evaluate individual performance
2. Architecture Design
- Determine number of levels
- Choose appropriate meta-learner
- Consider feature engineering
- Validate architecture choices
3. Implementation Tips
- Use cross-validation
- Handle data leakage
- Monitor memory usage
- Implement early stopping
4. Common Pitfalls
- Overfitting at the meta-level (prefer a simple, regularized meta-learner)
- Insufficient diversity among base models
- Data leakage from training the meta-model on in-sample base predictions (see the sketch below)
- Needlessly deep or complex architectures for marginal gains
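To make the leakage pitfall concrete, the sketch below contrasts leaky in-sample meta-features with out-of-fold ones using sklearn's cross_val_predict; the toy split from the earlier examples is assumed. A decision tree's in-sample "accuracy" is near-perfect and wildly optimistic, which is exactly the signal a meta-learner would overfit to:

from sklearn.model_selection import cross_val_predict

model = DecisionTreeClassifier(random_state=42)

# Leaky: the model predicts samples it was trained on, so this
# meta-feature looks far stronger than it will be on new data
leaky = model.fit(X_train, y_train).predict_proba(X_train)[:, 1]

# Safe: each prediction comes from a fold that excluded that sample
safe = cross_val_predict(
    model, X_train, y_train, cv=5, method='predict_proba'
)[:, 1]

print("In-sample 'accuracy':", ((leaky > 0.5) == y_train).mean())
print("Out-of-fold accuracy:", ((safe > 0.5) == y_train).mean())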