Bagging (Bootstrap Aggregating)
Bagging is an ensemble learning method that trains multiple versions of a predictor on bootstrap samples of the training data and combines them by averaging (for regression) or voting (for classification). This section covers the theory and implementation of bagging methods.
Theoretical Foundation
1. Bootstrap Sampling
- Random sampling with replacement
- Sample size equal to original dataset
- Approximately 63.2% unique samples per bootstrap: the chance that a given sample is never drawn in n draws is (1 - 1/n)^n ≈ 1/e ≈ 0.368, so about 63.2% of the original samples appear at least once (verified numerically below)
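The 63.2% figure can be checked empirically with a few lines of NumPy (the array size below is arbitrary):

import numpy as np

rng = np.random.default_rng(0)
n = 100_000
bootstrap_indices = rng.integers(0, n, size=n)   # n draws with replacement
print(len(np.unique(bootstrap_indices)) / n)     # ~0.632, i.e. 1 - 1/e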
Implementation
1. Basic Bagging Classifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np

def create_bagging_classifier(
    base_estimator=None,
    n_estimators=10,
    max_samples=1.0,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False
):
    """Create a bagging classifier with specified parameters"""
    if base_estimator is None:
        base_estimator = DecisionTreeClassifier()
    bagging = BaggingClassifier(
        base_estimator=base_estimator,  # renamed to `estimator` in scikit-learn >= 1.2
        n_estimators=n_estimators,
        max_samples=max_samples,
        max_features=max_features,
        bootstrap=bootstrap,
        bootstrap_features=bootstrap_features,
        n_jobs=-1  # train base estimators in parallel
    )
    return bagging
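A quick usage sketch on synthetic data (the make_classification settings below are arbitrary, chosen only for illustration):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf = create_bagging_classifier(n_estimators=50)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # held-out accuracy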
2. Custom Bagging Implementation
from sklearn.base import clone

class CustomBagging:
    def __init__(self, base_estimator, n_estimators=10, max_samples=1.0):
        """Initialize custom bagging ensemble"""
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.estimators_ = []

    def _bootstrap_sample(self, X, y):
        """Create a bootstrap sample (X and y are expected to be numpy arrays)"""
        n_samples = int(len(X) * self.max_samples)
        indices = np.random.choice(len(X), size=n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        """Fit the bagging ensemble"""
        self.estimators_ = []
        for _ in range(self.n_estimators):
            # Create bootstrap sample
            X_boot, y_boot = self._bootstrap_sample(X, y)
            # Create and train a fresh copy of the base estimator
            estimator = clone(self.base_estimator)
            estimator.fit(X_boot, y_boot)
            self.estimators_.append(estimator)
        return self

    def predict(self, X):
        """Make predictions using majority voting (assumes integer class labels)"""
        predictions = np.array([est.predict(X) for est in self.estimators_])
        return np.apply_along_axis(
            lambda votes: np.bincount(votes).argmax(),
            axis=0,
            arr=predictions
        )

    def predict_proba(self, X):
        """Average the per-estimator class probabilities"""
        probas = np.array([est.predict_proba(X) for est in self.estimators_])
        return np.mean(probas, axis=0)
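The custom ensemble can be exercised the same way, reusing the X_train/X_test split from the earlier sketch (make_classification labels are integers, which the majority-voting predict requires):

custom = CustomBagging(DecisionTreeClassifier(), n_estimators=25)
custom.fit(X_train, y_train)
y_pred = custom.predict(X_test)
print(np.mean(y_pred == y_test))  # test accuracy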
Advanced Techniques
1. Random Subspace Method
def random_subspace_ensemble(
    base_estimator,
    X,
    y,
    n_estimators=10,
    max_features=0.5,
    random_state=None
):
    """Create a random subspace ensemble: each estimator is trained on a random feature subset"""
    rng = np.random.default_rng(random_state)
    n_features = X.shape[1]
    n_subspace_features = int(max_features * n_features)
    estimators = []
    feature_indices = []
    for _ in range(n_estimators):
        # Select a random feature subset (without replacement)
        features = rng.choice(
            n_features,
            size=n_subspace_features,
            replace=False
        )
        feature_indices.append(features)
        # Train a fresh estimator on the feature subset
        estimator = clone(base_estimator)
        estimator.fit(X[:, features], y)
        estimators.append(estimator)
    return estimators, feature_indices
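The function returns the fitted estimators and their feature subsets but no way to combine their votes; a minimal prediction helper might look like the following (predict_subspace_ensemble is an illustrative addition, assuming integer class labels):

def predict_subspace_ensemble(estimators, feature_indices, X):
    """Majority vote across subspace estimators."""
    predictions = np.array([
        est.predict(X[:, features])
        for est, features in zip(estimators, feature_indices)
    ])
    return np.apply_along_axis(
        lambda votes: np.bincount(votes).argmax(),
        axis=0,
        arr=predictions
    )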
2. Out-of-Bag Estimation
from sklearn.metrics import accuracy_score

def compute_oob_score(bagging_classifier, X, y):
    """Compute the out-of-bag score for a fitted bagging classifier (X, y as numpy arrays)"""
    n_samples = len(X)
    n_classes = len(np.unique(y))
    # Accumulators for OOB probability votes
    oob_predictions = np.zeros((n_samples, n_classes))
    n_predictions = np.zeros(n_samples)
    for estimator, samples in zip(
        bagging_classifier.estimators_,
        bagging_classifier.estimators_samples_
    ):
        # estimators_samples_ holds the in-bag indices; invert to get the OOB samples
        in_bag = np.zeros(n_samples, dtype=bool)
        in_bag[samples] = True
        oob_mask = ~in_bag
        if not np.any(oob_mask):
            continue
        # Accumulate probability predictions for the OOB samples
        # (assumes every bootstrap sample contains all classes)
        oob_predictions[oob_mask] += estimator.predict_proba(X[oob_mask])
        n_predictions[oob_mask] += 1
    # Average the votes, scoring only samples that were OOB at least once
    scored = n_predictions > 0
    oob_predictions[scored] /= n_predictions[scored][:, np.newaxis]
    oob_score = accuracy_score(
        y[scored],
        np.argmax(oob_predictions[scored], axis=1)
    )
    return oob_score
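scikit-learn exposes the same estimate directly: fit with oob_score=True and read the oob_score_ attribute. A quick cross-check against the manual computation, reusing the synthetic X, y from the first usage sketch:

bag = BaggingClassifier(n_estimators=50, oob_score=True, random_state=0).fit(X, y)
print(bag.oob_score_)                # built-in OOB estimate
print(compute_oob_score(bag, X, y))  # manual estimate; the two should be close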
Performance Analysis
1. Diversity Analysis
def analyze_bagging_diversity(bagging_classifier, X):
    """Analyze diversity among bagging ensemble members"""
    predictions = []
    for estimator in bagging_classifier.estimators_:
        pred = estimator.predict(X)
        predictions.append(pred)
    # Compute pairwise disagreement
    n_estimators = len(predictions)
    diversity_matrix = np.zeros((n_estimators, n_estimators))
    for i in range(n_estimators):
        for j in range(i + 1, n_estimators):
            disagreement = np.mean(predictions[i] != predictions[j])
            diversity_matrix[i, j] = disagreement
            diversity_matrix[j, i] = disagreement
    return diversity_matrix
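A single summary number is often easier to track than the full matrix; averaging the upper triangle (a small convenience helper, not part of the original) gives the mean pairwise disagreement:

def mean_pairwise_disagreement(diversity_matrix):
    """Average pairwise disagreement across all estimator pairs."""
    iu = np.triu_indices(diversity_matrix.shape[0], k=1)
    return diversity_matrix[iu].mean()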
2. Learning Curves
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate

def plot_bagging_learning_curves(
    base_estimator,
    X,
    y,
    n_estimators_range=range(1, 51, 5)
):
    """Plot train/validation accuracy against ensemble size"""
    train_scores = []
    val_scores = []
    for n_estimators in n_estimators_range:
        bagging = BaggingClassifier(
            base_estimator=base_estimator,  # renamed to `estimator` in scikit-learn >= 1.2
            n_estimators=n_estimators
        )
        scores = cross_validate(
            bagging,
            X,
            y,
            cv=5,
            scoring='accuracy',
            return_train_score=True
        )
        train_scores.append(scores['train_score'].mean())
        val_scores.append(scores['test_score'].mean())
    plt.figure(figsize=(10, 6))
    plt.plot(n_estimators_range, train_scores, label='Training Score')
    plt.plot(n_estimators_range, val_scores, label='Validation Score')
    plt.xlabel('Number of Estimators')
    plt.ylabel('Score')
    plt.title('Bagging Learning Curves')
    plt.legend()
    plt.grid(True)
    return plt.gcf()
Best Practices
1. Model Selection
- Choose appropriate base estimator
- Consider problem characteristics
- Balance model complexity
- Evaluate computational cost
2. Parameter Tuning
- Optimize number of estimators
- Adjust sample size
- Consider feature sampling
- Use cross-validation (a grid-search sketch follows this list)
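As a concrete starting point for the tuning advice above, a grid search over the main bagging parameters might look like this (the grid values are illustrative, not recommendations; X_train and y_train come from the earlier usage sketch):

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.8, 1.0],
    'max_features': [0.5, 0.8, 1.0],
}
search = GridSearchCV(
    BaggingClassifier(DecisionTreeClassifier()),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)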
3. Implementation Tips
- Use parallel processing
- Monitor memory usage
- Implement early stopping
- Consider OOB estimates
4. Common Pitfalls
- Overfitting with complex base models
- Insufficient number of estimators
- Ignoring feature importance
- Not validating OOB estimates