Handling Imbalanced Data
Imbalanced datasets occur when class distributions are unequal, which can bias a model toward the majority class. This guide covers resampling, cost-sensitive learning, ensembles, and evaluation for imbalanced problems.
Understanding Class Imbalance
Measuring Imbalance
def analyze_class_distribution(y):
    """Summarize how labels are distributed in `y`.

    Returns a dict with per-class counts, per-class proportions, and the
    imbalance ratio (majority count / minority count; 1.0 means balanced).
    """
    counts = pd.Series(y).value_counts()
    return {
        'counts': counts,
        'proportions': counts / len(y),
        # Ratio of the largest class to the smallest — the headline
        # severity number for the imbalance.
        'imbalance_ratio': counts.max() / counts.min(),
    }
Resampling Techniques
Oversampling Methods
- Random Oversampling
from imblearn.over_sampling import RandomOverSampler
def random_oversample(X, y):
    """Balance classes by randomly duplicating minority samples.

    Returns the resampled (X, y) pair; seeded for reproducibility.
    """
    sampler = RandomOverSampler(random_state=42)
    return sampler.fit_resample(X, y)
- SMOTE (Synthetic Minority Over-sampling Technique)
from imblearn.over_sampling import SMOTE
def apply_smote(X, y, k_neighbors=5):
    """Oversample the minority class with SMOTE.

    Synthetic samples are interpolated between each minority point and
    `k_neighbors` of its nearest minority neighbors. Returns (X, y)
    resampled; seeded for reproducibility.
    """
    sampler = SMOTE(k_neighbors=k_neighbors, random_state=42)
    return sampler.fit_resample(X, y)
- ADASYN (Adaptive Synthetic Sampling)
from imblearn.over_sampling import ADASYN
def apply_adasyn(X, y):
    """Oversample the minority class with ADASYN.

    ADASYN generates more synthetic points in regions where the minority
    class is harder to learn. Returns (X, y) resampled; seeded.
    """
    return ADASYN(random_state=42).fit_resample(X, y)
Undersampling Methods
- Random Undersampling
from imblearn.under_sampling import RandomUnderSampler
def random_undersample(X, y):
    """Balance classes by randomly dropping majority samples.

    Returns the resampled (X, y) pair; seeded for reproducibility.
    """
    sampler = RandomUnderSampler(random_state=42)
    return sampler.fit_resample(X, y)
- Near Miss
from imblearn.under_sampling import NearMiss
def apply_near_miss(X, y, version=3):
    """Undersample the majority class with the NearMiss heuristic.

    `version` selects the NearMiss variant (1, 2, or 3). Returns the
    resampled (X, y) pair.
    """
    return NearMiss(version=version).fit_resample(X, y)
- Tomek Links
from imblearn.under_sampling import TomekLinks
def remove_tomek_links(X, y):
    """Clean the majority class by removing Tomek links.

    A Tomek link is a pair of opposite-class nearest neighbors; removing
    the majority member sharpens the class boundary. Returns (X, y).
    """
    return TomekLinks().fit_resample(X, y)
Combination Methods
- SMOTEENN (SMOTE + Edited Nearest Neighbors)
from imblearn.combine import SMOTEENN
def apply_smoteenn(X, y):
    """Resample with SMOTE followed by Edited Nearest Neighbors cleaning.

    Oversamples the minority class, then removes noisy samples near the
    boundary. Returns (X, y) resampled; seeded for reproducibility.
    """
    return SMOTEENN(random_state=42).fit_resample(X, y)
- SMOTETomek (SMOTE + Tomek Links)
from imblearn.combine import SMOTETomek
def apply_smotetomek(X, y):
    """Resample with SMOTE followed by Tomek-link removal.

    Oversamples the minority class, then cleans boundary pairs. Returns
    (X, y) resampled; seeded for reproducibility.
    """
    return SMOTETomek(random_state=42).fit_resample(X, y)
Cost-Sensitive Learning
Class Weights
from sklearn.utils.class_weight import compute_class_weight
def calculate_class_weights(y):
    """Map each class label to its 'balanced' weight.

    Uses sklearn's balanced heuristic (n_samples / (n_classes * count)),
    so rarer classes get proportionally larger weights. Returns a
    {label: weight} dict.
    """
    labels = np.unique(y)
    weights = compute_class_weight('balanced', classes=labels, y=y)
    return dict(zip(labels, weights))
Custom Loss Functions
import torch
import torch.nn as nn


class WeightedBCELoss(nn.Module):
    """Binary cross-entropy with extra weight on the positive class.

    `pos_weight > 1` penalizes false negatives more, which is the usual
    remedy when positives are the rare class.

    Bug fixed: the original only imported `torch.nn as nn` (which binds
    `nn`, not `torch`), so `torch.log` raised NameError at call time.
    """

    def __init__(self, pos_weight):
        super().__init__()
        self.pos_weight = pos_weight

    def forward(self, y_pred, y_true):
        # Clamp probabilities away from 0 and 1 so log() never produces
        # -inf / NaN for saturated predictions.
        eps = 1e-7
        y_pred = torch.clamp(y_pred, eps, 1.0 - eps)
        loss = -(self.pos_weight * y_true * torch.log(y_pred) +
                 (1 - y_true) * torch.log(1 - y_pred))
        return loss.mean()
Ensemble Methods for Imbalanced Data
Balanced Random Forest
from imblearn.ensemble import BalancedRandomForestClassifier
def train_balanced_random_forest(X, y):
    """Fit a random forest that undersamples each bootstrap to balance classes.

    Returns the fitted classifier; seeded for reproducibility.
    """
    clf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X, y)
    return clf
Easy Ensemble
from imblearn.ensemble import EasyEnsembleClassifier
def train_easy_ensemble(X, y):
    """Fit an EasyEnsemble: boosted learners on balanced random subsets.

    Returns the fitted classifier; seeded for reproducibility.
    """
    clf = EasyEnsembleClassifier(n_estimators=10, random_state=42)
    clf.fit(X, y)
    return clf
Evaluation Metrics
Imbalanced Classification Metrics
from sklearn.metrics import (
    average_precision_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
)


def evaluate_imbalanced_classifier(y_true, y_pred, y_prob=None):
    """Compute metrics that stay meaningful under class imbalance.

    Bug fixed: `balanced_accuracy_score` was used but missing from the
    import list, so this function raised NameError.

    Returns a dict with the confusion matrix, F1, balanced accuracy
    (mean per-class recall — the majority class cannot dominate it),
    and, when probability scores are supplied, average precision.
    """
    metrics = {
        'confusion_matrix': confusion_matrix(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
    }
    if y_prob is not None:
        # Average precision (PR-AUC) requires scores, not hard labels.
        metrics['average_precision'] = average_precision_score(y_true, y_prob)
    return metrics
Visualization Tools
def plot_precision_recall_curve(y_true, y_prob):
    """Plot the precision-recall curve for probability scores `y_prob`.

    Returns the `plt` module so the caller can show or save the figure.
    """
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    return plt
def plot_class_distribution(y):
    """Bar-plot the class counts in `y`.

    Returns the `plt` module so the caller can show or save the figure.
    """
    plt.figure(figsize=(8, 6))
    counts = pd.Series(y).value_counts()
    counts.plot(kind='bar')
    plt.title('Class Distribution')
    plt.xlabel('Class')
    plt.ylabel('Count')
    return plt
Best Practices
- Data Analysis
def analyze_imbalanced_dataset(X, y):
    """Run the standard pre-modeling checks on an imbalanced dataset.

    Returns a dict with the label distribution summary, the feature
    correlation matrix, and per-feature missing-value counts.
    """
    frame = pd.DataFrame(X)
    return {
        'class_distribution': analyze_class_distribution(y),
        'feature_correlations': frame.corr(),
        'missing_values': frame.isnull().sum(),
    }
- Cross-Validation Strategy
from sklearn.model_selection import StratifiedKFold
def stratified_cv_score(model, X, y, cv=5):
# Use stratified k-fold to maintain class distribution
skf = StratifiedKFold(n_splits=cv, shuffle=True)
scores = []
for train_idx, val_idx in skf.split(X, y):
X_train, X_val = X[train_idx], X[val_idx]
y_train, y_val = y[train_idx], y[val_idx]
model.fit(X_train, y_train)
scores.append(
evaluate_imbalanced_classifier(
y_val,
model.predict(X_val)
)
)
return scores
- Pipeline Creation
def create_imbalanced_pipeline(X, y, method='smote'):
    """Build an oversample -> scale -> classify pipeline.

    `method` selects the oversampler: 'smote', 'adasyn', or 'random'.

    Bug fixed: an unrecognized `method` previously left `sampler`
    unbound and crashed later with a confusing NameError; it now raises
    a clear ValueError up front. Samplers are also seeded, matching the
    rest of this file.

    NOTE(review): `Pipeline` must be `imblearn.pipeline.Pipeline` —
    sklearn's Pipeline does not support `fit_resample` steps. Confirm
    the import at the top of the module.
    """
    samplers = {
        'smote': SMOTE,
        'adasyn': ADASYN,
        'random': RandomOverSampler,
    }
    if method not in samplers:
        raise ValueError(
            f"unknown method {method!r}; expected one of {sorted(samplers)}"
        )
    return Pipeline([
        ('sampler', samplers[method](random_state=42)),
        ('scaler', StandardScaler()),
        ('classifier', BalancedRandomForestClassifier()),
    ])
- Threshold Tuning
def find_optimal_threshold(y_true, y_prob):
    """Return the decision threshold that maximizes F1.

    Two bugs fixed:
    - `precision_recall_curve` returns one more precision/recall point
      than thresholds (the final (1, 0) point has no threshold), so the
      original `argmax` could index past the end of `thresholds`. The
      extra point is dropped before scoring.
    - Where precision + recall == 0 the F1 formula divided by zero; the
      score is defined as 0 there instead of NaN.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_prob)
    # Drop the last PR point, which has no associated threshold.
    precisions = precisions[:-1]
    recalls = recalls[:-1]
    denom = precisions + recalls
    f1_scores = np.where(denom > 0, 2 * precisions * recalls / denom, 0.0)
    return thresholds[np.argmax(f1_scores)]