Bagging (Bootstrap Aggregating)

Bagging is an ensemble learning method that trains multiple versions of a predictor on bootstrap samples of the training data and combines them by averaging (for regression) or voting (for classification). This section covers the theory and implementation of bagging methods.

Theoretical Foundation

1. Bootstrap Sampling

  • Random sampling with replacement
  • Sample size equal to the original dataset
  • Approximately 63.2% unique samples per bootstrap: a given point is missed by a single draw with probability 1 - 1/n, so it is absent from all n draws with probability (1 - 1/n)^n ≈ 1/e ≈ 0.368 (verified empirically in the sketch below)
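
The 63.2% figure is easy to verify empirically. A minimal sketch, assuming only NumPy (the sample size n is an arbitrary illustrative choice):

import numpy as np

rng = np.random.default_rng(0)
n = 10_000

# Draw one bootstrap sample: n indices with replacement from range(n)
indices = rng.choice(n, size=n, replace=True)

# Fraction of original points that appear at least once (~0.632, i.e. 1 - 1/e)
print(f"unique fraction: {len(np.unique(indices)) / n:.3f}")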

Implementation

1. Basic Bagging Classifier

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

def create_bagging_classifier(
    base_estimator=None,
    n_estimators=10,
    max_samples=1.0,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False
):
    """Create a bagging classifier with specified parameters"""
    if base_estimator is None:
        base_estimator = DecisionTreeClassifier()
    
    bagging = BaggingClassifier(
        estimator=base_estimator,  # named `base_estimator` in scikit-learn < 1.2
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        bootstrap=bootstrap,
        bootstrap_features=bootstrap_features,
        n_jobs=-1  # train ensemble members in parallel
    )
    
    return bagging
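
A usage sketch on synthetic data; make_classification and the parameter values below are illustrative choices, not part of the function above:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

bagging = create_bagging_classifier(n_estimators=50)
bagging.fit(X_train, y_train)
print(f"test accuracy: {bagging.score(X_test, y_test):.3f}")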

2. Custom Bagging Implementation

from sklearn.base import clone

class CustomBagging:
    def __init__(self, base_estimator, n_estimators=10, max_samples=1.0):
        """Initialize custom bagging ensemble"""
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.estimators_ = []
    
    def _bootstrap_sample(self, X, y):
        """Create a bootstrap sample"""
        n_samples = int(len(X) * self.max_samples)
        indices = np.random.choice(len(X), size=n_samples, replace=True)
        return X[indices], y[indices]
    
    def fit(self, X, y):
        """Fit the bagging ensemble"""
        self.estimators_ = []
        
        for _ in range(self.n_estimators):
            # Create bootstrap sample
            X_boot, y_boot = self._bootstrap_sample(X, y)
            
            # Create and train new estimator
            estimator = clone(self.base_estimator)
            estimator.fit(X_boot, y_boot)
            self.estimators_.append(estimator)
        
        return self
    
    def predict(self, X):
        """Make predictions using majority voting"""
        # np.bincount below assumes non-negative integer class labels
        predictions = np.array(
            [est.predict(X) for est in self.estimators_]
        ).astype(int)
        return np.apply_along_axis(
            lambda x: np.bincount(x).argmax(),
            axis=0,
            arr=predictions
        )
    
    def predict_proba(self, X):
        """Make probability predictions"""
        probas = np.array([est.predict_proba(X) for est in self.estimators_])
        return np.mean(probas, axis=0)
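
A quick sanity check of the custom ensemble on synthetic data (the voting step above requires non-negative integer class labels, which make_classification provides):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ensemble = CustomBagging(DecisionTreeClassifier(), n_estimators=25)
ensemble.fit(X_train, y_train)
accuracy = np.mean(ensemble.predict(X_test) == y_test)
print(f"custom bagging accuracy: {accuracy:.3f}")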

Advanced Techniques

1. Random Subspace Method

def random_subspace_ensemble(
    X,
    y,
    base_estimator,
    n_estimators=10,
    max_features=0.5,
    random_state=None
):
    """Create a random subspace ensemble"""
    rng = np.random.RandomState(random_state)
    n_features = X.shape[1]
    n_subspace_features = int(max_features * n_features)
    
    estimators = []
    feature_indices = []
    
    for _ in range(n_estimators):
        # Select random feature subset
        features = rng.choice(
            n_features,
            size=n_subspace_features,
            replace=False
        )
        feature_indices.append(features)
        
        # Train estimator on feature subset
        estimator = clone(base_estimator)
        estimator.fit(X[:, features], y)
        estimators.append(estimator)
    
    return estimators, feature_indices
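
Prediction has to route each estimator to its own feature subset before voting. A minimal sketch of the matching predict step (majority voting, assuming integer class labels as in CustomBagging above):

def random_subspace_predict(estimators, feature_indices, X):
    """Majority vote, each estimator seeing only its feature subset"""
    predictions = np.array([
        est.predict(X[:, features])
        for est, features in zip(estimators, feature_indices)
    ]).astype(int)
    return np.apply_along_axis(
        lambda x: np.bincount(x).argmax(), axis=0, arr=predictions
    )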

2. Out-of-Bag Estimation

from sklearn.metrics import accuracy_score

def compute_oob_score(bagging_classifier, X, y):
    """Compute out-of-bag score for a bagging classifier"""
    n_samples = len(X)
    n_classes = len(np.unique(y))
    
    # Initialize arrays for accumulated OOB probabilities and counts
    oob_predictions = np.zeros((n_samples, n_classes))
    n_predictions = np.zeros(n_samples)
    
    # Accumulate OOB predictions; in recent scikit-learn versions,
    # estimators_samples_ holds the in-bag sample *indices* per estimator
    for estimator, samples in zip(
        bagging_classifier.estimators_,
        bagging_classifier.estimators_samples_
    ):
        # Build a boolean mask of the OOB samples for this estimator
        in_bag = np.zeros(n_samples, dtype=bool)
        in_bag[samples] = True
        oob_mask = ~in_bag
        
        if not np.any(oob_mask):
            continue
        
        # Predict for OOB samples (assumes every bootstrap sample
        # contains all classes, so predict_proba columns line up)
        pred = estimator.predict_proba(X[oob_mask])
        oob_predictions[oob_mask] += pred
        n_predictions[oob_mask] += 1
    
    # Average, keeping only samples that were OOB for at least one estimator
    valid = n_predictions > 0
    oob_predictions[valid] /= n_predictions[valid][:, np.newaxis]
    
    # Compute OOB score (assumes class labels are encoded as 0..n_classes-1)
    oob_score = accuracy_score(
        y[valid],
        np.argmax(oob_predictions[valid], axis=1)
    )
    
    return oob_score
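
A sanity check against scikit-learn's built-in OOB estimate (oob_score=True requires bootstrap=True; the synthetic data is illustrative, and the two scores should agree closely but not necessarily exactly, since samples never left out of bag are handled differently):

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

bagging = BaggingClassifier(n_estimators=50, oob_score=True, random_state=42)
bagging.fit(X, y)

print(f"built-in OOB score: {bagging.oob_score_:.3f}")
print(f"manual OOB score:   {compute_oob_score(bagging, X, y):.3f}")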

Performance Analysis

1. Diversity Analysis

def analyze_bagging_diversity(bagging_classifier, X):
    """Analyze diversity among bagging ensemble members"""
    predictions = []
    for estimator in bagging_classifier.estimators_:
        pred = estimator.predict(X)
        predictions.append(pred)
    
    # Compute pairwise disagreement
    n_estimators = len(predictions)
    diversity_matrix = np.zeros((n_estimators, n_estimators))
    
    for i in range(n_estimators):
        for j in range(i + 1, n_estimators):
            disagreement = np.mean(predictions[i] != predictions[j])
            diversity_matrix[i, j] = disagreement
            diversity_matrix[j, i] = disagreement
    
    return diversity_matrix
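
For a single summary number, average the upper triangle of the matrix; a sketch, reusing the fitted ensemble from the OOB check above:

diversity = analyze_bagging_diversity(bagging, X)

# Mean pairwise disagreement across all estimator pairs
iu = np.triu_indices_from(diversity, k=1)
print(f"mean disagreement: {diversity[iu].mean():.3f}")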

2. Learning Curves

import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate

def plot_bagging_learning_curves(
    base_estimator,
    X,
    y,
    n_estimators_range=range(1, 51, 5)
):
    """Plot learning curves for bagging ensemble"""
    train_scores = []
    val_scores = []
    
    for n_estimators in n_estimators_range:
        bagging = BaggingClassifier(
            estimator=base_estimator,  # `base_estimator` in scikit-learn < 1.2
            n_estimators=n_estimators
        )
        
        scores = cross_validate(
            bagging,
            X,
            y,
            cv=5,
            scoring='accuracy',
            return_train_score=True
        )
        
        train_scores.append(scores['train_score'].mean())
        val_scores.append(scores['test_score'].mean())
    
    plt.figure(figsize=(10, 6))
    plt.plot(n_estimators_range, train_scores, label='Training Score')
    plt.plot(n_estimators_range, val_scores, label='Validation Score')
    plt.xlabel('Number of Estimators')
    plt.ylabel('Score')
    plt.title('Bagging Learning Curves')
    plt.legend()
    plt.grid(True)
    
    return plt.gcf()
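
Called as below; the shallow tree is an illustrative choice of base estimator:

fig = plot_bagging_learning_curves(DecisionTreeClassifier(max_depth=3), X, y)
fig.savefig("bagging_learning_curves.png", dpi=150)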

Best Practices

1. Model Selection

  • Choose appropriate base estimator
  • Consider problem characteristics
  • Balance model complexity
  • Evaluate computational cost

2. Parameter Tuning

  • Optimize number of estimators
  • Adjust sample size
  • Consider feature sampling
  • Use cross-validation to tune these jointly (see the sketch below)
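
A sketch of tuning the main knobs jointly with GridSearchCV; the grid values are illustrative:

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.8, 1.0],
    'max_features': [0.5, 1.0],
}

search = GridSearchCV(BaggingClassifier(), param_grid, cv=5, n_jobs=-1)
search.fit(X, y)
print(search.best_params_)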

3. Implementation Tips

  • Use parallel processing
  • Monitor memory usage
  • Implement early stopping (see the warm-start sketch after this list)
  • Consider OOB estimates
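
A sketch of early stopping with warm_start, growing the ensemble until a held-out validation score stops improving. The step size and patience are illustrative; scikit-learn's BaggingClassifier does not allow oob_score together with warm_start, so a validation split stands in for the OOB estimate:

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

bagging = BaggingClassifier(warm_start=True, random_state=42)
best_score, stalled, patience = 0.0, 0, 3

for n in range(10, 210, 10):
    bagging.n_estimators = n  # warm_start reuses previously fitted members
    bagging.fit(X_train, y_train)
    score = bagging.score(X_val, y_val)
    if score > best_score:
        best_score, stalled = score, 0
    else:
        stalled += 1
        if stalled >= patience:  # no improvement for `patience` steps
            break

print(f"stopped at {bagging.n_estimators} estimators, "
      f"validation accuracy {best_score:.3f}")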

4. Common Pitfalls

  • Overfitting with complex base models
  • Insufficient number of estimators
  • Ignoring feature importance
  • Not validating OOB estimates