Boosting
Boosting is an ensemble learning method that combines many weak learners into a single strong learner by training them sequentially, so that each new learner focuses on the examples the current ensemble misclassifies.
Theoretical Foundation
1. Core Concepts
- Sequential learning: learners are trained one after another, each correcting the errors of the current ensemble
- Weighted samples: misclassified examples receive larger weights so later learners concentrate on them
- Weak learners: simple models (typically shallow decision trees) that perform only slightly better than chance
- Additive modeling: the final prediction is a weighted sum of all the weak learners' outputs (see the sketch after this list)
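These four ideas can be shown together in a minimal, illustrative AdaBoost-style sketch. It assumes binary labels encoded as -1/+1 and is meant to expose the mechanics, not to replace a library implementation.

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def toy_adaboost(X, y, n_rounds=10):
    """Minimal AdaBoost sketch (labels in {-1, +1}) illustrating
    sequential learning, sample reweighting, weak learners, and
    additive modeling."""
    n = len(y)
    w = np.full(n, 1.0 / n)                        # uniform sample weights to start
    stumps, alphas = [], []
    for _ in range(n_rounds):
        stump = DecisionTreeClassifier(max_depth=1)
        stump.fit(X, y, sample_weight=w)           # weak learner on weighted data
        pred = stump.predict(X)
        err = np.sum(w * (pred != y)) / np.sum(w)  # weighted training error
        err = np.clip(err, 1e-10, 1 - 1e-10)       # guard against division by zero
        alpha = 0.5 * np.log((1 - err) / err)      # weight of this weak learner
        w *= np.exp(-alpha * y * pred)             # upweight misclassified examples
        w /= w.sum()
        stumps.append(stump)
        alphas.append(alpha)
    # Additive model: sign of the weighted sum of weak learners
    def predict(X_new):
        scores = sum(a * s.predict(X_new) for a, s in zip(alphas, stumps))
        return np.sign(scores)
    return predict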
2. Common Algorithms
- AdaBoost
- Gradient Boosting
- XGBoost
- LightGBM
Implementation
1. AdaBoost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

def create_adaboost_classifier(
    estimator=None,
    n_estimators=50,
    learning_rate=1.0
):
    """Create an AdaBoost classifier.

    Note: scikit-learn renamed `base_estimator` to `estimator` in 1.2,
    and the 'SAMME.R' algorithm has been deprecated and removed in
    recent releases, so neither is used here.
    """
    if estimator is None:
        # A depth-1 tree (decision stump) is the classic weak learner
        estimator = DecisionTreeClassifier(max_depth=1)
    ada = AdaBoostClassifier(
        estimator=estimator,
        n_estimators=n_estimators,
        learning_rate=learning_rate
    )
    return ada
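A quick usage sketch on synthetic data (make_classification is used purely for illustration):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ada = create_adaboost_classifier(n_estimators=100)
ada.fit(X_train, y_train)
print(f"Test accuracy: {ada.score(X_test, y_test):.3f}")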
2. Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
def create_gradient_boosting_classifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=2,
    subsample=1.0
):
    """Create a Gradient Boosting classifier"""
    gb = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,          # shrinks each tree's contribution
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        subsample=subsample,                  # < 1.0 gives stochastic gradient boosting
        random_state=42
    )
    return gb
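sklearn's implementation exposes staged_predict, which yields predictions after each boosting stage; this makes it easy to watch test accuracy evolve as trees are added and to pick n_estimators. A sketch, reusing the synthetic split from the AdaBoost example:

import numpy as np
from sklearn.metrics import accuracy_score

gb = create_gradient_boosting_classifier(n_estimators=200, subsample=0.8)
gb.fit(X_train, y_train)

# Accuracy after each boosting stage
staged_acc = [accuracy_score(y_test, pred) for pred in gb.staged_predict(X_test)]
best_stage = int(np.argmax(staged_acc)) + 1
print(f"Best test accuracy {max(staged_acc):.3f} at {best_stage} trees")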
3. XGBoost
import xgboost as xgb
def create_xgboost_classifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    min_child_weight=1,
    subsample=1.0,
    colsample_bytree=1.0
):
    """Create an XGBoost classifier"""
    xgb_clf = xgb.XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_child_weight=min_child_weight,    # minimum sum of instance weight in a child
        subsample=subsample,                  # row sampling per tree
        colsample_bytree=colsample_bytree,    # column sampling per tree
        objective='binary:logistic',
        random_state=42
    )
    return xgb_clf
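XGBClassifier follows the scikit-learn estimator API, so usage is identical to the examples above (reusing the earlier synthetic split):

xgb_clf = create_xgboost_classifier(n_estimators=200, max_depth=4)
xgb_clf.fit(X_train, y_train)
print(f"Test accuracy: {xgb_clf.score(X_test, y_test):.3f}")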
Advanced Techniques
1. Early Stopping
def train_with_early_stopping(model, X_train, y_train, X_val, y_val):
    """Train an XGBoost model with early stopping.

    Note: in recent XGBoost releases, `early_stopping_rounds` and
    `eval_metric` are set on the estimator itself rather than passed
    to fit() (the fit-time arguments were deprecated in 1.6 and
    removed in 2.0).
    """
    # Evaluate on the validation set after each boosting round;
    # training stops when the metric stops improving
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    return model
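A usage sketch, assuming XGBoost >= 2.0 where early stopping is configured on the estimator:

from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, random_state=42)
model = xgb.XGBClassifier(
    n_estimators=1000,            # upper bound; early stopping picks the best round
    learning_rate=0.1,
    early_stopping_rounds=10,
    eval_metric='logloss'
)
model = train_with_early_stopping(model, X_tr, y_tr, X_val, y_val)
print(f"Best iteration: {model.best_iteration}")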
2. Feature Importance Analysis
import matplotlib.pyplot as plt
import numpy as np

def plot_feature_importance(model, feature_names):
    """Plot feature importance for boosting models"""
    # sklearn-style estimators expose feature_importances_; for a raw
    # XGBoost booster, fall back to get_score() (its keys match
    # feature_names only if the model was trained with named features,
    # e.g. on a DataFrame)
    if hasattr(model, 'feature_importances_'):
        importance = np.asarray(model.feature_importances_)
    else:
        scores = model.get_booster().get_score(importance_type='gain')
        importance = np.array([scores.get(f, 0.0) for f in feature_names])
    # Sort features by importance, descending (the array conversion
    # above is required for this fancy indexing)
    indices = np.argsort(importance)[::-1]
    # Plot
    plt.figure(figsize=(12, 6))
    plt.title('Feature Importance')
    plt.bar(range(len(importance)), importance[indices])
    plt.xticks(range(len(importance)), [feature_names[i] for i in indices], rotation=45)
    plt.tight_layout()
    return plt.gcf()
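Usage sketch with named features (training on a DataFrame makes XGBoost's get_score keys line up with the column names):

import pandas as pd

feature_names = [f'feat_{i}' for i in range(X.shape[1])]
X_df = pd.DataFrame(X, columns=feature_names)

gb = create_gradient_boosting_classifier().fit(X_df, y)
fig = plot_feature_importance(gb, feature_names)
fig.savefig('feature_importance.png')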
3. Learning Rate Scheduling
class LearningRateScheduler:
    def __init__(self, initial_lr=0.1, decay_factor=0.1, decay_epochs=10):
        """Step-decay learning rate scheduler"""
        self.initial_lr = initial_lr
        self.decay_factor = decay_factor
        self.decay_epochs = decay_epochs

    def get_lr(self, epoch):
        """Get the learning rate for a given epoch (boosting round).

        Computed directly from the epoch number, so repeated or
        out-of-order calls for the same epoch return the same rate.
        """
        return self.initial_lr * self.decay_factor ** (epoch // self.decay_epochs)
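To apply a per-round learning rate in XGBoost, a function like get_lr can be passed to xgboost's built-in LearningRateScheduler callback (available in recent versions). A sketch, reusing the earlier training data:

scheduler = LearningRateScheduler(initial_lr=0.3, decay_factor=0.5, decay_epochs=50)

dtrain = xgb.DMatrix(X_train, label=y_train)
booster = xgb.train(
    {'objective': 'binary:logistic'},
    dtrain,
    num_boost_round=200,
    # the callback invokes scheduler.get_lr with the current round index
    callbacks=[xgb.callback.LearningRateScheduler(scheduler.get_lr)]
)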
Performance Analysis
1. Learning Curves
def plot_boosting_learning_curves(model, X, y, cv=5):
    """Plot learning curves for boosting models"""
    from sklearn.model_selection import learning_curve
    # Calculate learning curves
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=cv,
        n_jobs=-1,
        scoring='accuracy'
    )
    # Calculate mean and std
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    # Plot
    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_mean, label='Training Score')
    plt.plot(train_sizes, val_mean, label='Validation Score')
    # Plot standard deviation bands
    plt.fill_between(
        train_sizes,
        train_mean - train_std,
        train_mean + train_std,
        alpha=0.1
    )
    plt.fill_between(
        train_sizes,
        val_mean - val_std,
        val_mean + val_std,
        alpha=0.1
    )
    plt.xlabel('Training Examples')
    plt.ylabel('Score')
    plt.title('Learning Curves')
    plt.legend(loc='best')
    plt.grid(True)
    return plt.gcf()
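Usage sketch (note that learning_curve refits the model once per training size per fold, so this can be slow for large ensembles):

fig = plot_boosting_learning_curves(create_gradient_boosting_classifier(), X, y, cv=5)
fig.savefig('learning_curves.png')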
2. Model Diagnostics
def analyze_boosting_model(model, X, y):
    """Analyze boosting model performance.

    Pass held-out data; evaluating on the training set gives
    optimistic estimates. Assumes binary classification.
    """
    from sklearn.metrics import roc_curve, auc
    # Get predictions and positive-class probabilities
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:, 1]
    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y, y_prob)
    roc_auc = auc(fpr, tpr)
    # Plot ROC curve against the chance diagonal
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    return {
        'roc_curve': (fpr, tpr, roc_auc),
        'predictions': y_pred,
        'probabilities': y_prob
    }
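Usage sketch, reusing the fitted XGBoost classifier and the held-out test split from earlier:

results = analyze_boosting_model(xgb_clf, X_test, y_test)
print(f"AUC: {results['roc_curve'][2]:.3f}")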
Best Practices
1. Model Selection
- Choose an appropriate base learner (shallow trees are the usual default)
- Consider problem size and complexity
- Balance training speed against accuracy
- Evaluate memory requirements
2. Parameter Tuning
- Optimize the learning rate (lower rates generally need more estimators; see the grid-search sketch after this list)
- Adjust the number of estimators
- Tune tree parameters (depth, minimum samples or child weight)
- Use early stopping
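A minimal grid-search sketch for the learning-rate / number-of-trees trade-off (the parameter values here are illustrative, not recommendations):

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 300],
    'max_depth': [2, 3, 4],
}
search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
search.fit(X_train, y_train)
print(search.best_params_)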
3. Implementation Tips
- Handle missing values (XGBoost and LightGBM handle them natively)
- Scale features only if the base learner requires it; tree-based learners are insensitive to monotonic scaling
- Monitor training progress on a validation set
- Use cross-validation
4. Common Pitfalls
- Overfitting with too many trees
- Using a learning rate that is too high
- Ignoring early stopping
- Not handling imbalanced data (see the scale_pos_weight sketch below)
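For the imbalanced-data pitfall, XGBoost's scale_pos_weight parameter is a common remedy; a sketch, with the usual negative-to-positive ratio as the starting point:

import numpy as np

# Weight the positive class by the negative/positive ratio
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)
balanced_clf = xgb.XGBClassifier(
    n_estimators=200,
    scale_pos_weight=ratio,
    eval_metric='aucpr'    # precision-recall AUC suits imbalanced problems
)
balanced_clf.fit(X_train, y_train)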