Ensemble Methods
This section covers ensemble learning methods, including bagging, boosting, and stacking, with wrapper classes built on scikit-learn, XGBoost, and LightGBM.
Bagging Methods
1. Random Forest
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt

class RandomForestWrapper:
    def __init__(
        self,
        task='classification',
        n_estimators=100,
        **kwargs
    ):
        """Initialize random forest"""
        self.task = task
        if task == 'classification':
            self.model = RandomForestClassifier(
                n_estimators=n_estimators,
                **kwargs
            )
        elif task == 'regression':
            self.model = RandomForestRegressor(
                n_estimators=n_estimators,
                **kwargs
            )
        else:
            raise ValueError("Task must be 'classification' or 'regression'")

    def fit(self, X, y):
        """Fit random forest"""
        return self.model.fit(X, y)

    def predict(self, X):
        """Make predictions"""
        return self.model.predict(X)

    def feature_importance(self, feature_names=None):
        """Plot feature importance"""
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1]
        plt.figure(figsize=(10, 6))
        plt.title('Feature Importances')
        plt.bar(
            range(len(importances)),
            importances[indices],
            align='center'
        )
        if feature_names is not None:
            plt.xticks(
                range(len(importances)),
                [feature_names[i] for i in indices],
                rotation=45
            )
        plt.tight_layout()
        return plt.gcf()

    def plot_tree_predictions(self, X):
        """Plot predictions of individual trees"""
        predictions = np.array([
            tree.predict(X)
            for tree in self.model.estimators_
        ])
        plt.figure(figsize=(10, 6))
        plt.boxplot(predictions.T)
        plt.title('Individual Tree Predictions')
        plt.xlabel('Sample')
        plt.ylabel('Prediction')
        plt.tight_layout()
        return plt.gcf()
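A minimal usage sketch for the wrapper above (the synthetic dataset and feature names here are illustrative assumptions, not part of the original class):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic data purely for illustration.
X, y = make_classification(n_samples=500, n_features=8, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestWrapper(task='classification', n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
print('Test accuracy:', rf.model.score(X_test, y_test))

# Hypothetical feature names for the importance plot.
rf.feature_importance(feature_names=[f'f{i}' for i in range(8)])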
2. Extra Trees
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor

class ExtraTreesWrapper:
    def __init__(
        self,
        task='classification',
        n_estimators=100,
        **kwargs
    ):
        """Initialize extra trees"""
        self.task = task
        if task == 'classification':
            self.model = ExtraTreesClassifier(
                n_estimators=n_estimators,
                **kwargs
            )
        elif task == 'regression':
            self.model = ExtraTreesRegressor(
                n_estimators=n_estimators,
                **kwargs
            )
        else:
            raise ValueError("Task must be 'classification' or 'regression'")

    def fit(self, X, y):
        """Fit extra trees"""
        return self.model.fit(X, y)

    def predict(self, X):
        """Make predictions"""
        return self.model.predict(X)
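Extra Trees ("extremely randomized trees") differ from random forests in that split thresholds are drawn at random rather than optimized per feature, which typically trades a little bias for lower variance and faster training. A usage sketch, again with synthetic data purely for illustration:

from sklearn.datasets import make_regression

# Synthetic regression data purely for illustration.
X, y = make_regression(n_samples=500, n_features=10, noise=10.0, random_state=0)

et = ExtraTreesWrapper(task='regression', n_estimators=300, random_state=0)
et.fit(X, y)
print(et.predict(X[:5]))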
Boosting Methods
1. Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb

class GradientBoostingWrapper:
    def __init__(
        self,
        task='classification',
        method='sklearn',
        **kwargs
    ):
        """Initialize gradient boosting model"""
        self.task = task
        self.method = method
        if method == 'sklearn':
            if task == 'classification':
                self.model = GradientBoostingClassifier(**kwargs)
            else:
                self.model = GradientBoostingRegressor(**kwargs)
        elif method == 'xgboost':
            if task == 'classification':
                self.model = xgb.XGBClassifier(**kwargs)
            else:
                self.model = xgb.XGBRegressor(**kwargs)
        elif method == 'lightgbm':
            if task == 'classification':
                self.model = lgb.LGBMClassifier(**kwargs)
            else:
                self.model = lgb.LGBMRegressor(**kwargs)
        else:
            raise ValueError(
                "Method must be 'sklearn', 'xgboost', or 'lightgbm'"
            )

    def fit(self, X, y, eval_set=None):
        """Fit the model, with early stopping when a validation set is given."""
        if eval_set is not None and self.method == 'xgboost':
            # Recent XGBoost versions configure early stopping on the
            # estimator instead of accepting it as a fit() argument.
            self.model.set_params(early_stopping_rounds=10)
            return self.model.fit(X, y, eval_set=eval_set, verbose=False)
        if eval_set is not None and self.method == 'lightgbm':
            # LightGBM 4.x configures early stopping via callbacks, not
            # an early_stopping_rounds fit() argument.
            return self.model.fit(
                X, y,
                eval_set=eval_set,
                callbacks=[lgb.early_stopping(10), lgb.log_evaluation(0)]
            )
        return self.model.fit(X, y)

    def predict(self, X):
        """Make predictions"""
        return self.model.predict(X)

    def plot_learning_curve(self, X, y, cv=5):
        """Plot learning curve"""
        from sklearn.model_selection import learning_curve
        train_sizes, train_scores, test_scores = learning_curve(
            self.model, X, y,
            cv=cv,
            n_jobs=-1,
            train_sizes=np.linspace(0.1, 1.0, 10)
        )
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        plt.figure(figsize=(10, 6))
        plt.plot(train_sizes, train_mean, label='Training score')
        plt.plot(train_sizes, test_mean, label='Cross-validation score')
        plt.fill_between(
            train_sizes,
            train_mean - train_std,
            train_mean + train_std,
            alpha=0.1
        )
        plt.fill_between(
            train_sizes,
            test_mean - test_std,
            test_mean + test_std,
            alpha=0.1
        )
        plt.xlabel('Training Examples')
        plt.ylabel('Score')
        plt.title('Learning Curve')
        plt.legend(loc='best')
        plt.grid(True)
        return plt.gcf()
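A usage sketch with early stopping (assumes xgboost is installed; the dataset and hyperparameter values are illustrative):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

gb = GradientBoostingWrapper(task='classification', method='xgboost',
                             n_estimators=500, learning_rate=0.1)
# Training halts once validation performance stops improving for 10 rounds.
gb.fit(X_train, y_train, eval_set=[(X_val, y_val)])
print('Best iteration:', gb.model.best_iteration)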
2. AdaBoost
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor

class AdaBoostWrapper:
    def __init__(
        self,
        task='classification',
        n_estimators=50,
        estimator=None,
        **kwargs
    ):
        """Initialize AdaBoost (scikit-learn >= 1.2 names the base learner
        'estimator'; the old 'base_estimator' argument was removed in 1.4)."""
        self.task = task
        if task == 'classification':
            self.model = AdaBoostClassifier(
                estimator=estimator,
                n_estimators=n_estimators,
                **kwargs
            )
        elif task == 'regression':
            self.model = AdaBoostRegressor(
                estimator=estimator,
                n_estimators=n_estimators,
                **kwargs
            )
        else:
            raise ValueError("Task must be 'classification' or 'regression'")

    def fit(self, X, y):
        """Fit AdaBoost"""
        return self.model.fit(X, y)

    def predict(self, X):
        """Make predictions"""
        return self.model.predict(X)

    def plot_feature_importance(self, feature_names=None):
        """Plot feature importance"""
        importances = self.model.feature_importances_
        indices = np.argsort(importances)[::-1]
        plt.figure(figsize=(10, 6))
        plt.title('Feature Importances')
        plt.bar(
            range(len(importances)),
            importances[indices],
            align='center'
        )
        if feature_names is not None:
            plt.xticks(
                range(len(importances)),
                [feature_names[i] for i in indices],
                rotation=45
            )
        plt.tight_layout()
        return plt.gcf()
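A usage sketch replacing the default depth-1 stumps with slightly deeper trees (the dataset and settings are illustrative):

from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=1)

# Depth-2 trees as the weak learner instead of the default stumps.
ada = AdaBoostWrapper(
    task='classification',
    n_estimators=100,
    estimator=DecisionTreeClassifier(max_depth=2),
    learning_rate=0.5
)
ada.fit(X, y)
ada.plot_feature_importance()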
Stacking Methods
1. Stacking Ensemble
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.model_selection import cross_val_predict

class StackingEnsemble:
    def __init__(
        self,
        base_models,
        meta_model,
        task='classification',
        cv=5
    ):
        """Initialize stacking ensemble"""
        self.task = task
        self.cv = cv
        if task == 'classification':
            self.model = StackingClassifier(
                estimators=base_models,
                final_estimator=meta_model,
                cv=cv
            )
        elif task == 'regression':
            self.model = StackingRegressor(
                estimators=base_models,
                final_estimator=meta_model,
                cv=cv
            )
        else:
            raise ValueError("Task must be 'classification' or 'regression'")

    def fit(self, X, y):
        """Fit stacking ensemble"""
        return self.model.fit(X, y)

    def predict(self, X):
        """Make predictions"""
        return self.model.predict(X)

    def get_base_predictions(self, X, y):
        """Get out-of-fold predictions from the base models."""
        predictions = {}
        # self.model.estimators holds the (name, estimator) tuples passed in;
        # estimators_ (with the trailing underscore) is a plain list of fitted
        # models without names, so it cannot be unpacked into (name, model).
        for name, model in self.model.estimators:
            pred = cross_val_predict(
                model,
                X, y,
                cv=self.cv,
                method='predict_proba' if self.task == 'classification' else 'predict'
            )
            predictions[name] = pred
        return predictions
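A usage sketch wiring two base models to a logistic-regression meta-model (the model choices are illustrative; any estimators with the scikit-learn API work):

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=2)

base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=2)),
    ('svc', SVC(probability=True, random_state=2)),  # probability=True enables predict_proba
]
stack = StackingEnsemble(base_models, LogisticRegression(), task='classification')
stack.fit(X, y)
oof = stack.get_base_predictions(X, y)  # out-of-fold probabilities per base model
print({name: p.shape for name, p in oof.items()})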
Model Selection and Tuning
1. Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def tune_ensemble(
    model,
    param_grid,
    X,
    y,
    cv=5,
    method='grid',
    n_iter=100
):
    """Tune ensemble model hyperparameters"""
    if method == 'grid':
        search = GridSearchCV(
            model,
            param_grid,
            cv=cv,
            n_jobs=-1,
            verbose=1
        )
    elif method == 'random':
        search = RandomizedSearchCV(
            model,
            param_grid,
            n_iter=n_iter,
            cv=cv,
            n_jobs=-1,
            verbose=1
        )
    else:
        raise ValueError("Method must be 'grid' or 'random'")
    search.fit(X, y)
    return search.best_params_, search.best_score_
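A usage sketch tuning a random forest (the grid below is a small illustrative example, not a recommended search space):

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=3)

param_grid = {
    'n_estimators': [100, 200, 400],
    'max_depth': [None, 5, 10],
    'max_features': ['sqrt', 'log2'],
}
best_params, best_score = tune_ensemble(
    RandomForestClassifier(random_state=3),
    param_grid, X, y,
    method='grid'
)
print(best_params, best_score)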
Best Practices
1. Ensemble Design
- Choose diverse base models; averaging helps most when their errors are uncorrelated
- Balance model complexity against the size of the training data
- Consider computational cost at both training and inference time
- Monitor memory usage, which grows with the number of estimators
2. Model Selection
- Use base models suited to the data (e.g., tree ensembles for tabular features)
- Consider problem characteristics such as noise level and class balance
- Validate ensemble performance on held-out data, not the training set
- Monitor overfitting as the number of estimators or boosting rounds grows
3. Hyperparameter Tuning
- Tune individual models before tuning the ensemble as a whole
- Optimize ensemble-level parameters (number of estimators, learning rate, stacking cv)
- Use an appropriate search strategy: grid search for small spaces, randomized search for large ones
- Validate results with cross-validation rather than a single split
4. Performance Evaluation
- Use proper validation (cross-validation or a separate test set)
- Monitor diversity metrics such as pairwise disagreement (see the sketch below)
- Compare the ensemble against its strongest single base model
- Consider interpretability; feature importances and per-model predictions help explain results
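The diversity point above can be made concrete. A minimal sketch of one common measure, pairwise disagreement (the function name and interface here are assumptions for illustration):

import numpy as np
from itertools import combinations

def pairwise_disagreement(predictions):
    """Mean fraction of samples on which each pair of models disagrees.

    predictions: dict mapping model name -> 1-D array of predicted labels.
    Returns a value in [0, 1]; higher means a more diverse ensemble.
    """
    names = list(predictions)
    rates = [
        np.mean(predictions[a] != predictions[b])
        for a, b in combinations(names, 2)
    ]
    return float(np.mean(rates))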