Classification Metrics

This section covers the metrics most commonly used to evaluate classification models, including their mathematical foundations, implementation, and practical considerations.

Basic Metrics

1. Confusion Matrix

  • True Positives (TP): Positive cases correctly predicted as positive
  • True Negatives (TN): Negative cases correctly predicted as negative
  • False Positives (FP): Negative cases incorrectly predicted as positive
  • False Negatives (FN): Positive cases incorrectly predicted as negative

import numpy as np
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def plot_confusion_matrix(y_true, y_pred, labels=None):
    """Plot confusion matrix with percentages"""
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    
    # Convert to percentages
    cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
    
    # Create heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm_percent,
        annot=True,
        fmt='.1f',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels
    )
    plt.title('Confusion Matrix (%)')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    return plt.gcf()

2. Accuracy, Precision, Recall, Specificity, and F1 Score

def compute_basic_metrics(y_true, y_pred):
    """Compute basic binary classification metrics"""
    cm = confusion_matrix(y_true, y_pred)
    # For a binary problem the 2x2 matrix unravels as [tn, fp, fn, tp]
    tn, fp, fn, tp = cm.ravel()
    
    # Accuracy
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    
    # Precision
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    
    # Recall (Sensitivity)
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    
    # Specificity
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    
    # F1 Score
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'specificity': specificity,
        'f1_score': f1
    }

Advanced Metrics

1. ROC and AUC

from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_true, y_prob):
    """Plot ROC curve and compute AUC"""
    # Compute ROC curve
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)
    
    # Plot
    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2,
             label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.grid(True)
    return plt.gcf(), roc_auc

2. Precision-Recall Curve

from sklearn.metrics import precision_recall_curve, average_precision_score

def plot_precision_recall_curve(y_true, y_prob):
    """Plot Precision-Recall curve and compute average precision"""
    # Compute PR curve
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    avg_precision = average_precision_score(y_true, y_prob)
    
    # Plot
    plt.figure(figsize=(10, 8))
    plt.plot(recall, precision, color='darkorange', lw=2,
             label=f'PR curve (AP = {avg_precision:.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc="lower left")
    plt.grid(True)
    return plt.gcf(), avg_precision

3. Log Loss

from sklearn.metrics import log_loss

def compute_log_loss(y_true, y_prob):
    """Compute log loss (cross-entropy)"""
    return log_loss(y_true, y_prob)
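
In the binary case, log loss is the negative mean log-likelihood of the true labels under the predicted probabilities:

\[
\mathrm{LogLoss} = -\frac{1}{N} \sum_{i=1}^{N} \left[ y_i \log p_i + (1 - y_i) \log(1 - p_i) \right]
\]

where p_i is the predicted probability of the positive class for sample i. Lower values are better; sklearn's log_loss also accepts an (n_samples, n_classes) probability array for multiclass problems.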

Multiclass Metrics

1. Multiclass Confusion Matrix

def plot_multiclass_confusion_matrix(y_true, y_pred, labels):
    """Plot multiclass confusion matrix"""
    # Use the provided label order for both the matrix and the tick labels
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # Normalize each row (true class) to percentages
    cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100
    
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        cm_percent,
        annot=True,
        fmt='.1f',
        cmap='Blues',
        xticklabels=labels,
        yticklabels=labels
    )
    plt.title('Multiclass Confusion Matrix (%)')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    return plt.gcf()

2. Per-Class Metrics

from sklearn.metrics import classification_report

def compute_per_class_metrics(y_true, y_pred, labels=None):
    """Compute per-class precision, recall, and F1-score"""
    return classification_report(
        y_true,
        y_pred,
        labels=labels,
        output_dict=True
    )

Calibration Metrics

1. Calibration Curves

from sklearn.calibration import calibration_curve

def plot_calibration_curve(y_true, y_prob, n_bins=10):
    """Plot calibration curve"""
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins)
    
    plt.figure(figsize=(10, 8))
    plt.plot(prob_pred, prob_true, marker='o', label='Model')
    plt.plot([0, 1], [0, 1], linestyle='--', label='Perfect calibration')
    plt.xlabel('Mean predicted probability')
    plt.ylabel('Fraction of positives')
    plt.title('Calibration Curve')
    plt.legend()
    plt.grid(True)
    return plt.gcf()

Best Practices

1. Metric Selection

  • Choose metrics based on the characteristics of the problem
  • Consider class imbalance
  • Account for business requirements
  • Use multiple complementary metrics rather than a single number (see the sketch below)
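
As one way to apply the last point, the sketch below reports several complementary metrics side by side; the particular choice of balanced accuracy, F1, and Matthews correlation coefficient is an illustrative combination for an imbalanced binary problem, not a prescription.

from sklearn.metrics import balanced_accuracy_score, f1_score, matthews_corrcoef

def compute_complementary_metrics(y_true, y_pred):
    """Illustrative set of complementary metrics for an imbalanced binary problem"""
    return {
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),  # average recall per class
        'f1_score': f1_score(y_true, y_pred),                          # harmonic mean of precision and recall
        'matthews_corrcoef': matthews_corrcoef(y_true, y_pred),        # less sensitive to class imbalance
    }

Reporting these together makes it harder for a single flattering number to hide poor minority-class performance.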

2. Threshold Selection

  • Optimize the threshold for business needs rather than defaulting to 0.5
  • Consider the cost of different error types (see the cost-based sketch after this list)
  • Use ROC and precision-recall curves to guide threshold selection
  • Validate the chosen threshold on a holdout set
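
A minimal sketch of cost-based threshold selection, assuming false positives and false negatives carry known per-error costs (the cost values below are placeholders, not recommendations):

import numpy as np

def select_threshold_by_cost(y_true, y_prob, cost_fp=1.0, cost_fn=5.0):
    """Pick the probability threshold that minimizes total misclassification cost"""
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    best_threshold, best_cost = 0.5, np.inf
    for t in np.unique(y_prob):
        # Classify as positive when the predicted probability reaches the threshold
        y_pred = (y_prob >= t).astype(int)
        fp = np.sum((y_pred == 1) & (y_true == 0))
        fn = np.sum((y_pred == 0) & (y_true == 1))
        cost = cost_fp * fp + cost_fn * fn
        if cost < best_cost:
            best_threshold, best_cost = t, cost
    return best_threshold, best_cost

The selected threshold should then be validated on a holdout set, as noted above.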

3. Reporting

  • Include confidence intervals (e.g., via the bootstrap sketch below)
  • Report all relevant metrics, not only the most favorable one
  • Provide visual representations (confusion matrix, ROC, PR, and calibration plots)
  • Document metric definitions and assumptions
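
A percentile bootstrap is one simple way to attach confidence intervals to a metric; the sketch below assumes i.i.d. samples and uses F1 with 1,000 resamples purely as examples:

import numpy as np
from sklearn.metrics import f1_score

def bootstrap_metric_ci(y_true, y_pred, metric=f1_score,
                        n_resamples=1000, alpha=0.05, random_state=0):
    """Percentile bootstrap confidence interval for a classification metric"""
    rng = np.random.default_rng(random_state)
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    scores = []
    for _ in range(n_resamples):
        # Resample labels and predictions together, with replacement
        idx = rng.integers(0, len(y_true), size=len(y_true))
        scores.append(metric(y_true[idx], y_pred[idx]))
    lower = np.percentile(scores, 100 * alpha / 2)
    upper = np.percentile(scores, 100 * (1 - alpha / 2))
    return lower, upper

With the defaults, bootstrap_metric_ci(y_test, y_pred) returns an approximate 95% interval; wide intervals on small test sets are a signal to report results cautiously.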

4. Common Pitfalls

  • Using accuracy alone on imbalanced datasets (illustrated below)
  • Ignoring probability calibration
  • Not considering the business context
  • Overfitting to the evaluation metric
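
To illustrate the first pitfall, consider a synthetic 95/5 class split (the numbers are arbitrary): a model that always predicts the majority class scores 95% accuracy while completely missing the minority class.

import numpy as np
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Synthetic imbalanced labels: 95 negatives, 5 positives (illustrative only)
y_true = np.array([0] * 95 + [1] * 5)
y_pred = np.zeros(100, dtype=int)  # a "classifier" that always predicts the majority class

print(accuracy_score(y_true, y_pred))  # 0.95 -- looks strong
print(recall_score(y_true, y_pred))    # 0.0  -- every positive case is missed
print(f1_score(y_true, y_pred))        # 0.0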