Regression

Understanding and implementing regression algorithms in machine learning

Regression is a supervised learning task where the goal is to predict continuous numerical values.

Linear Regression

Simple Linear Regression

The most basic regression model with one feature:

y = \beta_0 + \beta_1x + \epsilon
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

def train_simple_linear_regression(X, y):
    # Initialize and train model
    model = LinearRegression()
    model.fit(X.reshape(-1, 1), y)
    
    # Get coefficients
    slope = model.coef_[0]
    intercept = model.intercept_
    
    return model, slope, intercept

def plot_regression_line(X, y, model):
    plt.scatter(X, y, color='blue', alpha=0.5)
    plt.plot(X, model.predict(X.reshape(-1, 1)), color='red')
    plt.xlabel('Feature')
    plt.ylabel('Target')
    plt.title('Linear Regression Fit')
    return plt
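
A quick usage sketch on synthetic data (the slope, intercept, and noise level below are arbitrary, chosen only for illustration):

# Generate noisy data around y = 2.5x + 1 and fit it with the helpers defined above
rng = np.random.default_rng(42)
X = rng.uniform(0, 10, size=100)
y = 2.5 * X + 1.0 + rng.normal(scale=2.0, size=100)

model, slope, intercept = train_simple_linear_regression(X, y)
print(f"slope={slope:.2f}, intercept={intercept:.2f}")  # should land close to 2.5 and 1.0
plot_regression_line(X, y, model).show()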

Multiple Linear Regression

Extending to multiple features:

y = \beta_0 + \beta_1x_1 + \beta_2x_2 + ... + \beta_px_p + \epsilon
import pandas as pd

def train_multiple_linear_regression(X, y):
    # Initialize and train model
    model = LinearRegression()
    model.fit(X, y)

    # Collect the fitted coefficient for each feature; coefficients only reflect
    # relative importance if the features share a common scale
    coefficients = pd.DataFrame({
        'feature': range(X.shape[1]),
        'coefficient': model.coef_
    })

    return model, coefficients

Regularized Regression

Ridge Regression (L2)

Adding L2 regularization to prevent overfitting:

\min_{\beta} \|y - X\beta\|^2 + \lambda\|\beta\|^2_2
from sklearn.linear_model import Ridge

def train_ridge_regression(X, y, alpha=1.0):
    # Initialize and train model
    model = Ridge(alpha=alpha)
    model.fit(X, y)
    
    return model

def plot_ridge_path(X, y, alphas):
    coefs = []
    for alpha in alphas:
        model = Ridge(alpha=alpha)
        model.fit(X, y)
        coefs.append(model.coef_)
    
    plt.figure(figsize=(10, 6))
    plt.plot(alphas, np.array(coefs))
    plt.xscale('log')
    plt.xlabel('Alpha')
    plt.ylabel('Coefficients')
    plt.title('Ridge Path')
    return plt

Lasso Regression (L1)

Using L1 regularization for feature selection:

\min_{\beta} \|y - X\beta\|^2 + \lambda\|\beta\|_1
from sklearn.linear_model import Lasso

def train_lasso_regression(X, y, alpha=1.0):
    # Initialize and train model
    model = Lasso(alpha=alpha)
    model.fit(X, y)
    
    # Get selected features
    selected_features = np.where(model.coef_ != 0)[0]
    
    return model, selected_features

Elastic Net

Combining L1 and L2 regularization:
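
In the same notation as the Ridge and Lasso objectives above, the penalty adds both terms (scikit-learn controls the trade-off through alpha and l1_ratio):

\min_{\beta} \|y - X\beta\|^2 + \lambda_1\|\beta\|_1 + \lambda_2\|\beta\|^2_2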

from sklearn.linear_model import ElasticNet

def train_elastic_net(X, y, alpha=1.0, l1_ratio=0.5):
    # Initialize and train model
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model.fit(X, y)
    
    return model

Polynomial Regression

Handling non-linear relationships:

from sklearn.preprocessing import PolynomialFeatures

def train_polynomial_regression(X, y, degree=2):
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X.reshape(-1, 1))
    
    # Train model
    model = LinearRegression()
    model.fit(X_poly, y)
    
    return model, poly

def plot_polynomial_fit(X, y, model, poly):
    # Generate points for smooth curve
    X_test = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
    X_test_poly = poly.transform(X_test)
    y_pred = model.predict(X_test_poly)
    
    plt.scatter(X, y, color='blue', alpha=0.5)
    plt.plot(X_test, y_pred, color='red')
    plt.title(f'Polynomial Regression (degree={poly.degree})')
    return plt
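
A usage sketch on non-linear synthetic data (the cubic relationship and the degree are illustrative):

# Fit a degree-3 polynomial to data generated from a noisy cubic
rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, size=200)
y = 0.5 * X**3 - X + rng.normal(scale=1.0, size=200)

model, poly = train_polynomial_regression(X, y, degree=3)
plot_polynomial_fit(X, y, model, poly).show()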

Support Vector Regression (SVR)

Using support vectors for regression:

from sklearn.svm import SVR

def train_svr(X, y, kernel='rbf'):
    # Initialize and train model
    model = SVR(kernel=kernel)
    model.fit(X, y)
    
    return model
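
Kernel SVR is distance-based, so it is sensitive to feature scale and is usually wrapped in a pipeline with a scaler. A minimal sketch (the helper name and the C/epsilon values are illustrative, not part of the original):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def train_scaled_svr(X, y, kernel='rbf', C=1.0, epsilon=0.1):
    # Standardize features, then fit SVR on the scaled data
    model = make_pipeline(StandardScaler(), SVR(kernel=kernel, C=C, epsilon=epsilon))
    model.fit(X, y)
    return model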

Tree-based Regression

Decision Tree Regressor

from sklearn.tree import DecisionTreeRegressor

def train_decision_tree_regressor(X, y, max_depth=5):
    # Initialize and train model
    model = DecisionTreeRegressor(max_depth=max_depth)
    model.fit(X, y)
    
    return model

Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

def train_random_forest_regressor(X, y):
    # Initialize and train model
    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=None
    )
    model.fit(X, y)
    
    return model

Evaluation Metrics

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)

def evaluate_regressor(y_true, y_pred):
    metrics = {
        'mse': mean_squared_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred)
    }
    return metrics
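
Metrics should be computed on a held-out split rather than the training data. A sketch assuming X, y and the training helpers above are in scope:

from sklearn.model_selection import train_test_split

# Hold out 20% of the data, fit on the rest, and report test-set metrics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = train_random_forest_regressor(X_train, y_train)
print(evaluate_regressor(y_test, model.predict(X_test)))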

Cross-Validation and Model Selection

from sklearn.model_selection import cross_val_score

def perform_regression_cv(model, X, y, cv=5):
    # Perform cross-validation
    cv_scores = cross_val_score(
        model, X, y,
        cv=cv,
        scoring='neg_mean_squared_error'
    )
    
    # Convert MSE to RMSE
    rmse_scores = np.sqrt(-cv_scores)
    
    return {
        'mean_rmse': rmse_scores.mean(),
        'std_rmse': rmse_scores.std()
    }
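
For example, to estimate the out-of-sample RMSE of a Ridge model (the alpha value is arbitrary):

cv_results = perform_regression_cv(Ridge(alpha=1.0), X, y, cv=5)
print(f"RMSE: {cv_results['mean_rmse']:.3f} +/- {cv_results['std_rmse']:.3f}")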

Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV

def tune_regression_model(model, param_grid, X, y):
    # Perform grid search
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    grid_search.fit(X, y)
    
    # Get best parameters and score
    best_params = grid_search.best_params_
    best_score = np.sqrt(-grid_search.best_score_)
    
    return grid_search.best_estimator_, best_params, best_score
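
For instance, tuning the regularization strength of a Ridge model (the grid values are illustrative):

param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}
best_model, best_params, best_rmse = tune_regression_model(Ridge(), param_grid, X, y)
print(best_params, best_rmse)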

Best Practices

  1. Feature Scaling
from sklearn.preprocessing import StandardScaler

def scale_features(X_train, X_test):
    # Fit the scaler on training data only, then apply the same transform to the test set
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, scaler
  2. Handling Outliers
def remove_outliers(X, y, threshold=3):
    # Drop rows whose target is more than `threshold` standard deviations from the mean
    z_scores = np.abs((y - y.mean()) / y.std())
    mask = z_scores < threshold
    return X[mask], y[mask]
  3. Feature Selection
from sklearn.feature_selection import SelectKBest, f_regression

def select_features_for_regression(X, y, k=10):
    # Univariate selection; assumes X is a pandas DataFrame so .columns is available
    selector = SelectKBest(score_func=f_regression, k=k)
    X_selected = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()].tolist()
    return X_selected, selected_features