Handling Imbalanced Data

Imbalanced datasets occur when one class heavily outnumbers the others, which can bias a model toward predicting the majority class and ignoring the minority class.

Understanding Class Imbalance

Measuring Imbalance

import pandas as pd

def analyze_class_distribution(y):
    # Count samples per class and convert the counts to proportions
    class_counts = pd.Series(y).value_counts()
    class_proportions = class_counts / len(y)
    
    # Imbalance ratio: size of the largest class divided by the smallest
    imbalance_ratio = class_counts.max() / class_counts.min()
    
    return {
        'counts': class_counts,
        'proportions': class_proportions,
        'imbalance_ratio': imbalance_ratio
    }
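
As a quick sanity check, the helper can be run on a toy label array; the 9:1 split below is purely illustrative.

# Toy binary labels with a 9:1 majority/minority split
y_example = [0] * 90 + [1] * 10
stats = analyze_class_distribution(y_example)
print(stats['imbalance_ratio'])  # 9.0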

Resampling Techniques

Oversampling Methods

  1. Random Oversampling
from imblearn.over_sampling import RandomOverSampler

def random_oversample(X, y):
    oversample = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversample.fit_resample(X, y)
    return X_resampled, y_resampled
  2. SMOTE (Synthetic Minority Over-sampling Technique)
from imblearn.over_sampling import SMOTE

def apply_smote(X, y, k_neighbors=5):
    # SMOTE synthesizes new minority samples by interpolating between a
    # minority sample and its nearest minority neighbors, so the minority
    # class must contain more than k_neighbors samples
    smote = SMOTE(
        k_neighbors=k_neighbors,
        random_state=42
    )
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled
  3. ADASYN (Adaptive Synthetic Sampling)
from imblearn.over_sampling import ADASYN

def apply_adasyn(X, y):
    adasyn = ADASYN(random_state=42)
    X_resampled, y_resampled = adasyn.fit_resample(X, y)
    return X_resampled, y_resampled
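
To sanity-check any of these oversamplers, compare the class counts before and after resampling. A minimal sketch using apply_smote on a synthetic dataset (the make_classification settings here are purely illustrative):

from collections import Counter
from sklearn.datasets import make_classification

# Build a roughly 10:1 imbalanced toy dataset, then rebalance it with SMOTE
X, y = make_classification(
    n_samples=1000, n_features=10,
    weights=[0.9, 0.1], random_state=42
)
print(Counter(y))        # majority class ~900 samples, minority ~100
X_res, y_res = apply_smote(X, y)
print(Counter(y_res))    # both classes now equal in size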

Undersampling Methods

  1. Random Undersampling
from imblearn.under_sampling import RandomUnderSampler

def random_undersample(X, y):
    undersample = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = undersample.fit_resample(X, y)
    return X_resampled, y_resampled
  2. Near Miss
from imblearn.under_sampling import NearMiss

def apply_near_miss(X, y, version=3):
    # NearMiss undersamples the majority class using nearest-neighbour
    # distances to minority samples
    nearmiss = NearMiss(version=version)
    X_resampled, y_resampled = nearmiss.fit_resample(X, y)
    return X_resampled, y_resampled
  3. Tomek Links
from imblearn.under_sampling import TomekLinks

def remove_tomek_links(X, y):
    tomek = TomekLinks()
    X_resampled, y_resampled = tomek.fit_resample(X, y)
    return X_resampled, y_resampled

Combination Methods

  1. SMOTEENN (SMOTE + Edited Nearest Neighbors)
from imblearn.combine import SMOTEENN

def apply_smoteenn(X, y):
    smoteenn = SMOTEENN(random_state=42)
    X_resampled, y_resampled = smoteenn.fit_resample(X, y)
    return X_resampled, y_resampled
  2. SMOTETomek (SMOTE + Tomek Links)
from imblearn.combine import SMOTETomek

def apply_smotetomek(X, y):
    smotetomek = SMOTETomek(random_state=42)
    X_resampled, y_resampled = smotetomek.fit_resample(X, y)
    return X_resampled, y_resampled

Cost-Sensitive Learning

Class Weights

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

def calculate_class_weights(y):
    # 'balanced' weights are inversely proportional to class frequencies
    class_weights = compute_class_weight(
        'balanced',
        classes=np.unique(y),
        y=y
    )
    
    return dict(zip(np.unique(y), class_weights))
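
These weights can be passed straight to any estimator that accepts a class_weight argument. A minimal sketch with scikit-learn's LogisticRegression; X_train and y_train are assumed to exist and are only illustrative:

from sklearn.linear_model import LogisticRegression

# Pass the dict returned by calculate_class_weights; class_weight='balanced'
# would make the estimator compute the same weights internally
weights = calculate_class_weights(y_train)   # y_train: illustrative training labels
model = LogisticRegression(class_weight=weights, max_iter=1000)
model.fit(X_train, y_train)                  # X_train: illustrative training features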

Custom Loss Functions

import torch
import torch.nn as nn

class WeightedBCELoss(nn.Module):
    def __init__(self, pos_weight):
        super().__init__()
        self.pos_weight = pos_weight
        
    def forward(self, y_pred, y_true):
        # Clamp predicted probabilities away from 0 and 1 so the log terms stay finite
        y_pred = torch.clamp(y_pred, min=1e-7, max=1 - 1e-7)
        # Weighted binary cross-entropy: up-weight the positive (minority) class
        loss = -(self.pos_weight * y_true * torch.log(y_pred) +
                 (1 - y_true) * torch.log(1 - y_pred))
        return loss.mean()
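
When the model outputs raw logits rather than probabilities, PyTorch's built-in nn.BCEWithLogitsLoss accepts a pos_weight tensor and is numerically more stable than taking the log of probabilities directly. A minimal sketch; the weight of 9.0 is illustrative (for example, roughly the majority-to-minority ratio):

import torch
import torch.nn as nn

# Built-in weighted loss for logits; pos_weight scales the positive-class term
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([9.0]))

logits = torch.randn(4, 1)                            # raw model outputs, not probabilities
targets = torch.tensor([[1.0], [0.0], [0.0], [1.0]])
loss = criterion(logits, targets)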

Ensemble Methods for Imbalanced Data

Balanced Random Forest

from imblearn.ensemble import BalancedRandomForestClassifier

def train_balanced_random_forest(X, y):
    model = BalancedRandomForestClassifier(
        n_estimators=100,
        random_state=42
    )
    model.fit(X, y)
    return model

Easy Ensemble

from imblearn.ensemble import EasyEnsembleClassifier

def train_easy_ensemble(X, y):
    model = EasyEnsembleClassifier(
        n_estimators=10,
        random_state=42
    )
    model.fit(X, y)
    return model

Evaluation Metrics

Imbalanced Classification Metrics

from sklearn.metrics import (
    balanced_accuracy_score,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score,
    f1_score
)

def evaluate_imbalanced_classifier(y_true, y_pred, y_prob=None):
    metrics = {
        'confusion_matrix': confusion_matrix(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred)
    }
    
    if y_prob is not None:
        metrics['average_precision'] = average_precision_score(y_true, y_prob)
    
    return metrics

Visualization Tools

import matplotlib.pyplot as plt

def plot_precision_recall_curve(y_true, y_prob):
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    return plt

def plot_class_distribution(y):
    plt.figure(figsize=(8, 6))
    pd.Series(y).value_counts().plot(kind='bar')
    plt.title('Class Distribution')
    plt.xlabel('Class')
    plt.ylabel('Count')
    return plt

Best Practices

  1. Data Analysis
def analyze_imbalanced_dataset(X, y):
    analysis = {
        'class_distribution': analyze_class_distribution(y),
        'feature_correlations': pd.DataFrame(X).corr(),
        'missing_values': pd.DataFrame(X).isnull().sum()
    }
    return analysis
  2. Cross-Validation Strategy
from sklearn.model_selection import StratifiedKFold

def stratified_cv_score(model, X, y, cv=5):
    # Stratified k-fold keeps the class distribution consistent across folds
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    scores = []
    
    for train_idx, val_idx in skf.split(X, y):
        # Positional indexing assumes X and y are NumPy arrays
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        model.fit(X_train, y_train)
        scores.append(
            evaluate_imbalanced_classifier(
                y_val,
                model.predict(X_val)
            )
        )
    
    return scores
  3. Pipeline Creation
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def create_imbalanced_pipeline(method='smote'):
    # imblearn's Pipeline (not sklearn's) is required so the sampler is
    # applied only during fit and never at prediction time
    if method == 'smote':
        sampler = SMOTE(random_state=42)
    elif method == 'adasyn':
        sampler = ADASYN(random_state=42)
    elif method == 'random':
        sampler = RandomOverSampler(random_state=42)
    else:
        raise ValueError(f"Unknown resampling method: {method}")
    
    pipeline = Pipeline([
        ('sampler', sampler),
        ('scaler', StandardScaler()),
        ('classifier', BalancedRandomForestClassifier(random_state=42))
    ])
    
    return pipeline
  4. Threshold Tuning
def find_optimal_threshold(y_true, y_prob):
    precisions, recalls, thresholds = precision_recall_curve(
        y_true,
        y_prob
    )
    
    # precision_recall_curve returns one more precision/recall value than
    # thresholds, so drop the last point; the small constant avoids division
    # by zero when precision and recall are both zero
    f1_scores = 2 * (precisions[:-1] * recalls[:-1]) / (
        precisions[:-1] + recalls[:-1] + 1e-10
    )
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    
    return optimal_threshold
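
A short usage sketch for threshold tuning, assuming a fitted binary classifier clf and a held-out validation split X_val, y_val (the names are illustrative):

# Tune the decision threshold on validation data, then apply it
y_prob = clf.predict_proba(X_val)[:, 1]      # probability of the positive class
threshold = find_optimal_threshold(y_val, y_prob)
y_pred = (y_prob >= threshold).astype(int)
print(evaluate_imbalanced_classifier(y_val, y_pred, y_prob))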