Ensemble Methods
This section covers ensemble learning methods, including bagging, boosting, and stacking, with wrapper classes built on scikit-learn, XGBoost, and LightGBM.
Bagging Methods
1. Random Forest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt

class RandomForestWrapper:
    def __init__(
        self,
        task='classification',
        n_estimators=100,
        **kwargs
    ):
        """Initialize random forest"""
        self.task = task
        if task == 'classification':
            self.model = RandomForestClassifier(
                n_estimators=n_estimators,
                **kwargs
            )
        elif task == 'regression':
            self.model = RandomForestRegressor(
                n_estimators=n_estimators,
                **kwargs
            )
        else:
            raise ValueError("Task must be 'classification' or 'regression'")

    def fit(self, X, y):
        """Fit random forest"""
        return self.model.fit(X, y)

    def predict(self, X):
        """Make predictions"""
        return self.model.predict(X)

    def feature_importance(self, feature_names=None):
        """Plot feature importance"""
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1]
        plt.figure(figsize=(10, 6))
        plt.title('Feature Importances')
        plt.bar(
            range(len(importances)),
            importances[indices],
            align='center'
        )
        if feature_names is not None:
            plt.xticks(
                range(len(importances)),
                [feature_names[i] for i in indices],
                rotation=45
            )
        plt.tight_layout()
        return plt.gcf()

    def plot_tree_predictions(self, X):
        """Plot predictions of individual trees"""
        predictions = np.array([
            tree.predict(X)
            for tree in self.model.estimators_
        ])
        plt.figure(figsize=(10, 6))
        plt.boxplot(predictions.T)
        plt.title('Individual Tree Predictions')
        plt.xlabel('Sample')
        plt.ylabel('Prediction')
        plt.tight_layout()
        return plt.gcf()
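A minimal usage sketch for the wrapper above (the synthetic dataset and feature names here are illustrative assumptions, not part of the original class):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic data purely for illustration.
X, y = make_classification(n_samples=500, n_features=8, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestWrapper(task='classification', n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
print('Test accuracy:', rf.model.score(X_test, y_test))

# Hypothetical feature names for the importance plot.
rf.feature_importance(feature_names=[f'f{i}' for i in range(8)])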
2. Extra Trees
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor

class ExtraTreesWrapper:
    def __init__(
        self,
        task='classification',
        n_estimators=100,
        **kwargs
    ):
        """Initialize extra trees"""
        self.task = task
        if task == 'classification':
            self.model = ExtraTreesClassifier(
                n_estimators=n_estimators,
                **kwargs
            )
        elif task == 'regression':
            self.model = ExtraTreesRegressor(
                n_estimators=n_estimators,
                **kwargs
            )
        else:
            raise ValueError("Task must be 'classification' or 'regression'")

    def fit(self, X, y):
        """Fit extra trees"""
        return self.model.fit(X, y)

    def predict(self, X):
        """Make predictions"""
        return self.model.predict(X)
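Extra Trees ("extremely randomized trees") differ from random forests in that split thresholds are drawn at random rather than optimized per feature, which typically trades a little bias for lower variance and faster training. A usage sketch, again with synthetic data purely for illustration:

from sklearn.datasets import make_regression

# Synthetic regression data purely for illustration.
X, y = make_regression(n_samples=500, n_features=10, noise=10.0, random_state=0)

et = ExtraTreesWrapper(task='regression', n_estimators=300, random_state=0)
et.fit(X, y)
print(et.predict(X[:5]))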
Boosting Methods
1. Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb

class GradientBoostingWrapper:
    def __init__(
        self,
        task='classification',
        method='sklearn',
        **kwargs
    ):
        """Initialize gradient boosting model"""
        self.task = task
        self.method = method
        if method == 'sklearn':
            if task == 'classification':
                self.model = GradientBoostingClassifier(**kwargs)
            else:
                self.model = GradientBoostingRegressor(**kwargs)
        elif method == 'xgboost':
            if task == 'classification':
                self.model = xgb.XGBClassifier(**kwargs)
            else:
                self.model = xgb.XGBRegressor(**kwargs)
        elif method == 'lightgbm':
            if task == 'classification':
                self.model = lgb.LGBMClassifier(**kwargs)
            else:
                self.model = lgb.LGBMRegressor(**kwargs)
        else:
            raise ValueError(
                "Method must be 'sklearn', 'xgboost', or 'lightgbm'"
            )

    def fit(self, X, y, eval_set=None):
        """Fit the model, with early stopping when a validation set is given."""
        if eval_set is not None and self.method == 'xgboost':
            # Recent XGBoost versions configure early stopping on the
            # estimator instead of accepting it as a fit() argument.
            self.model.set_params(early_stopping_rounds=10)
            return self.model.fit(X, y, eval_set=eval_set, verbose=False)
        if eval_set is not None and self.method == 'lightgbm':
            # LightGBM 4.x configures early stopping via callbacks, not
            # an early_stopping_rounds fit() argument.
            return self.model.fit(
                X, y,
                eval_set=eval_set,
                callbacks=[lgb.early_stopping(10), lgb.log_evaluation(0)]
            )
        return self.model.fit(X, y)

    def predict(self, X):
        """Make predictions"""
        return self.model.predict(X)

    def plot_learning_curve(self, X, y, cv=5):
        """Plot learning curve"""
        from sklearn.model_selection import learning_curve
        train_sizes, train_scores, test_scores = learning_curve(
            self.model, X, y,
            cv=cv,
            n_jobs=-1,
            train_sizes=np.linspace(0.1, 1.0, 10)
        )
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        plt.figure(figsize=(10, 6))
        plt.plot(train_sizes, train_mean, label='Training score')
        plt.plot(train_sizes, test_mean, label='Cross-validation score')
        plt.fill_between(
            train_sizes,
            train_mean - train_std,
            train_mean + train_std,
            alpha=0.1
        )
        plt.fill_between(
            train_sizes,
            test_mean - test_std,
            test_mean + test_std,
            alpha=0.1
        )
        plt.xlabel('Training Examples')
        plt.ylabel('Score')
        plt.title('Learning Curve')
        plt.legend(loc='best')
        plt.grid(True)
        return plt.gcf()
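A usage sketch with early stopping (assumes xgboost is installed; the dataset and hyperparameter values are illustrative):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

gb = GradientBoostingWrapper(task='classification', method='xgboost',
                             n_estimators=500, learning_rate=0.1)
# Training halts once validation performance stops improving for 10 rounds.
gb.fit(X_train, y_train, eval_set=[(X_val, y_val)])
print('Best iteration:', gb.model.best_iteration)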
2. AdaBoost
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor

class AdaBoostWrapper:
    def __init__(
        self,
        task='classification',
        n_estimators=50,
        estimator=None,
        **kwargs
    ):
        """Initialize AdaBoost (scikit-learn >= 1.2 names the base learner
        'estimator'; the old 'base_estimator' argument was removed in 1.4)."""
        self.task = task
        if task == 'classification':
            self.model = AdaBoostClassifier(
                estimator=estimator,
                n_estimators=n_estimators,
                **kwargs
            )
        elif task == 'regression':
            self.model = AdaBoostRegressor(
                estimator=estimator,
                n_estimators=n_estimators,
                **kwargs
            )
        else:
            raise ValueError("Task must be 'classification' or 'regression'")

    def fit(self, X, y):
        """Fit AdaBoost"""
        return self.model.fit(X, y)

    def predict(self, X):
        """Make predictions"""
        return self.model.predict(X)

    def plot_feature_importance(self, feature_names=None):
        """Plot feature importance"""
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1]
        plt.figure(figsize=(10, 6))
        plt.title('Feature Importances')
        plt.bar(
            range(len(importances)),
            importances[indices],
            align='center'
        )
        if feature_names is not None:
            plt.xticks(
                range(len(importances)),
                [feature_names[i] for i in indices],
                rotation=45
            )
        plt.tight_layout()
        return plt.gcf()
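A usage sketch replacing the default depth-1 stumps with slightly deeper trees (the dataset and settings are illustrative):

from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=1)

# Depth-2 trees as the weak learner instead of the default stumps.
ada = AdaBoostWrapper(
    task='classification',
    n_estimators=100,
    estimator=DecisionTreeClassifier(max_depth=2),
    learning_rate=0.5
)
ada.fit(X, y)
ada.plot_feature_importance()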
Stacking Methods
1. Stacking Ensemble
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.model_selection import cross_val_predict

class StackingEnsemble:
    def __init__(
        self,
        base_models,
        meta_model,
        task='classification',
        cv=5
    ):
        """Initialize stacking ensemble"""
        self.task = task
        self.cv = cv
        if task == 'classification':
            self.model = StackingClassifier(
                estimators=base_models,
                final_estimator=meta_model,
                cv=cv
            )
        elif task == 'regression':
            self.model = StackingRegressor(
                estimators=base_models,
                final_estimator=meta_model,
                cv=cv
            )
        else:
            raise ValueError("Task must be 'classification' or 'regression'")

    def fit(self, X, y):
        """Fit stacking ensemble"""
        return self.model.fit(X, y)

    def predict(self, X):
        """Make predictions"""
        return self.model.predict(X)

    def get_base_predictions(self, X, y):
        """Get out-of-fold predictions from the base models."""
        predictions = {}
        # self.model.estimators holds the (name, estimator) tuples passed in;
        # estimators_ (with the trailing underscore) is a plain list of fitted
        # models without names, so it cannot be unpacked into (name, model).
        for name, model in self.model.estimators:
            pred = cross_val_predict(
                model,
                X, y,
                cv=self.cv,
                method='predict_proba' if self.task == 'classification' else 'predict'
            )
            predictions[name] = pred
        return predictions
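A usage sketch wiring two base models to a logistic-regression meta-model (the model choices are illustrative; any estimators with the scikit-learn API work):

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=2)

base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=2)),
    ('svc', SVC(probability=True, random_state=2)),  # probability=True enables predict_proba
]
stack = StackingEnsemble(base_models, LogisticRegression(), task='classification')
stack.fit(X, y)
oof = stack.get_base_predictions(X, y)  # out-of-fold probabilities per base model
print({name: p.shape for name, p in oof.items()})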
Model Selection and Tuning
1. Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def tune_ensemble(
    model,
    param_grid,
    X,
    y,
    cv=5,
    method='grid',
    n_iter=100
):
    """Tune ensemble model hyperparameters"""
    if method == 'grid':
        search = GridSearchCV(
            model,
            param_grid,
            cv=cv,
            n_jobs=-1,
            verbose=1
        )
    elif method == 'random':
        search = RandomizedSearchCV(
            model,
            param_grid,
            n_iter=n_iter,
            cv=cv,
            n_jobs=-1,
            verbose=1
        )
    else:
        raise ValueError("Method must be 'grid' or 'random'")
    search.fit(X, y)
    return search.best_params_, search.best_score_
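A usage sketch tuning a random forest (the grid below is a small illustrative example, not a recommended search space):

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=3)

param_grid = {
    'n_estimators': [100, 200, 400],
    'max_depth': [None, 5, 10],
    'max_features': ['sqrt', 'log2'],
}
best_params, best_score = tune_ensemble(
    RandomForestClassifier(random_state=3),
    param_grid, X, y,
    method='grid'
)
print(best_params, best_score)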
Best Practices
1. Ensemble Design
- Choose diverse base models; averaging helps most when their errors are uncorrelated
- Balance model complexity against the size of the training data
- Consider computational cost at both training and inference time
- Monitor memory usage, which grows with the number of estimators
2. Model Selection
- Use base models suited to the data (e.g., tree ensembles for tabular features)
- Consider problem characteristics such as noise level and class balance
- Validate ensemble performance on held-out data, not the training set
- Monitor overfitting as the number of estimators or boosting rounds grows
3. Hyperparameter Tuning
- Tune individual models before tuning the ensemble as a whole
- Optimize ensemble-level parameters (number of estimators, learning rate, stacking cv)
- Use an appropriate search strategy: grid search for small spaces, randomized search for large ones
- Validate results with cross-validation rather than a single split
4. Performance Evaluation
- Use proper validation (cross-validation or a separate test set)
- Monitor diversity metrics such as pairwise disagreement (see the sketch below)
- Compare the ensemble against its strongest single base model
- Consider interpretability; feature importances and per-model predictions help explain results
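The diversity point above can be made concrete. A minimal sketch of one common measure, pairwise disagreement (the function name and interface here are assumptions for illustration):

import numpy as np
from itertools import combinations

def pairwise_disagreement(predictions):
    """Mean fraction of samples on which each pair of models disagrees.

    predictions: dict mapping model name -> 1-D array of predicted labels.
    Returns a value in [0, 1]; higher means a more diverse ensemble.
    """
    names = list(predictions)
    rates = [
        np.mean(predictions[a] != predictions[b])
        for a, b in combinations(names, 2)
    ]
    return float(np.mean(rates))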