Regression
Understanding and implementing regression algorithms in machine learning
Regression is a supervised learning task where the goal is to predict continuous numerical values.
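The snippets below assume NumPy arrays X (features) and y (continuous targets). A minimal sketch for generating such data with scikit-learn's make_regression (the sample count, feature count, and noise level are arbitrary choices for illustration):
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Synthetic regression data: 500 samples, 5 features, Gaussian noise on the target
X, y = make_regression(n_samples=500, n_features=5, noise=10.0, random_state=42)

# Hold out a test set for later evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)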
Linear Regression
Simple Linear Regression
The most basic regression model with one feature:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import numpy as np
def train_simple_linear_regression(X, y):
    # Initialize and train model
    model = LinearRegression()
    model.fit(X.reshape(-1, 1), y)
    
    # Get coefficients
    slope = model.coef_[0]
    intercept = model.intercept_
    
    return model, slope, intercept
def plot_regression_line(X, y, model):
    plt.scatter(X, y, color='blue', alpha=0.5)
    plt.plot(X, model.predict(X.reshape(-1, 1)), color='red')
    plt.xlabel('Feature')
    plt.ylabel('Target')
    plt.title('Linear Regression Fit')
    return plt
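A usage sketch, assuming the helpers above; the one-dimensional data generated here (a noisy line) is purely illustrative:
# Noisy 1-D data following y = 2x + 1
rng = np.random.default_rng(0)
x_simple = rng.uniform(0, 10, size=100)
y_simple = 2.0 * x_simple + 1.0 + rng.normal(0, 2, size=100)

model, slope, intercept = train_simple_linear_regression(x_simple, y_simple)
print(f"slope={slope:.2f}, intercept={intercept:.2f}")
plot_regression_line(x_simple, y_simple, model).show()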
Multiple Linear Regression
Extending to multiple features:
import pandas as pd

def train_multiple_linear_regression(X, y):
    # Initialize and train model
    model = LinearRegression()
    model.fit(X, y)
    
    # Collect per-feature coefficients (comparable across features only when they share a scale)
    importance = pd.DataFrame({
        'feature': range(X.shape[1]),
        'coefficient': model.coef_
    })
    
    return model, importance
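Because raw coefficients depend on feature scale, a common pattern is to standardize before comparing them; a sketch, assuming the make_regression data from above:
from sklearn.preprocessing import StandardScaler

# Standardize so that coefficient magnitudes can be compared across features
X_scaled = StandardScaler().fit_transform(X)
mlr_model, importance = train_multiple_linear_regression(X_scaled, y)
print(importance.sort_values('coefficient', key=abs, ascending=False))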
Regularized Regression
Ridge Regression (L2)
Adding an L2 penalty that shrinks coefficients toward zero, reducing overfitting:
from sklearn.linear_model import Ridge
def train_ridge_regression(X, y, alpha=1.0):
    # Initialize and train model
    model = Ridge(alpha=alpha)
    model.fit(X, y)
    
    return model
def plot_ridge_path(X, y, alphas):
    coefs = []
    for alpha in alphas:
        model = Ridge(alpha=alpha)
        model.fit(X, y)
        coefs.append(model.coef_)
    
    plt.figure(figsize=(10, 6))
    plt.plot(alphas, np.array(coefs))
    plt.xscale('log')
    plt.xlabel('Alpha')
    plt.ylabel('Coefficients')
    plt.title('Ridge Path')
    return plt
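A usage sketch, assuming the make_regression data from above; the alpha grid is arbitrary but spans several orders of magnitude so the shrinkage is visible:
ridge_model = train_ridge_regression(X, y, alpha=1.0)

# Sweep alpha and plot how each coefficient shrinks
alphas = np.logspace(-3, 3, 50)
plot_ridge_path(X, y, alphas).show()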
Lasso Regression (L1)
Using an L1 penalty that drives some coefficients to exactly zero, effectively performing feature selection:
from sklearn.linear_model import Lasso
def train_lasso_regression(X, y, alpha=1.0):
    # Initialize and train model
    model = Lasso(alpha=alpha)
    model.fit(X, y)
    
    # Get selected features
    selected_features = np.where(model.coef_ != 0)[0]
    
    return model, selected_features
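A usage sketch, assuming the make_regression data from above; alpha=0.1 is illustrative, and larger values zero out more coefficients:
lasso_model, selected = train_lasso_regression(X, y, alpha=0.1)
print(f"{len(selected)} of {X.shape[1]} features have non-zero coefficients:", selected)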
Elastic Net
Combining L1 and L2 regularization:
from sklearn.linear_model import ElasticNet
def train_elastic_net(X, y, alpha=1.0, l1_ratio=0.5):
    # Initialize and train model
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model.fit(X, y)
    
    return model
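In practice alpha and l1_ratio are usually chosen by cross-validation; a sketch using scikit-learn's ElasticNetCV (the candidate l1_ratio values are illustrative):
from sklearn.linear_model import ElasticNetCV

# Cross-validate over a few L1/L2 mixing ratios; alphas are chosen automatically along a path
enet_cv = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9], cv=5)
enet_cv.fit(X, y)
print("best alpha:", enet_cv.alpha_, "best l1_ratio:", enet_cv.l1_ratio_)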
Polynomial Regression
Capturing non-linear relationships by fitting a linear model on polynomial feature expansions:
from sklearn.preprocessing import PolynomialFeatures
def train_polynomial_regression(X, y, degree=2):
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X.reshape(-1, 1))
    
    # Train model
    model = LinearRegression()
    model.fit(X_poly, y)
    
    return model, poly
def plot_polynomial_fit(X, y, model, poly):
    # Generate points for smooth curve
    X_test = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
    X_test_poly = poly.transform(X_test)
    y_pred = model.predict(X_test_poly)
    
    plt.scatter(X, y, color='blue', alpha=0.5)
    plt.plot(X_test, y_pred, color='red')
    plt.title(f'Polynomial Regression (degree={poly.degree})')
    return plt
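A usage sketch on synthetic quadratic data (the generating curve is purely illustrative):
# Noisy quadratic relationship
rng = np.random.default_rng(1)
x_curve = rng.uniform(-3, 3, size=200)
y_curve = 0.5 * x_curve**2 - x_curve + rng.normal(0, 1, size=200)

poly_model, poly = train_polynomial_regression(x_curve, y_curve, degree=2)
plot_polynomial_fit(x_curve, y_curve, poly_model, poly).show()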
Support Vector Regression (SVR)
Using support vectors for regression:
from sklearn.svm import SVR
def train_svr(X, y, kernel='rbf'):
    # Initialize and train model
    model = SVR(kernel=kernel)
    model.fit(X, y)
    
    return model
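SVR is sensitive to feature scales, so it is usually combined with a scaler in a pipeline; a sketch with scikit-learn's default hyperparameters (C=1.0 and epsilon=0.1 are the defaults, not tuned values):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale features, then fit an RBF-kernel SVR
svr_pipeline = make_pipeline(StandardScaler(), SVR(kernel='rbf', C=1.0, epsilon=0.1))
svr_pipeline.fit(X, y)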
Tree-based Regression
Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
def train_decision_tree_regressor(X, y, max_depth=5):
    # Initialize and train model
    model = DecisionTreeRegressor(max_depth=max_depth)
    model.fit(X, y)
    
    return model
Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
def train_random_forest_regressor(X, y):
    # Initialize and train model
    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=None
    )
    model.fit(X, y)
    
    return model
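Tree ensembles expose impurity-based feature importances, a non-linear counterpart to the coefficients above; a usage sketch, assuming the make_regression data:
rf_model = train_random_forest_regressor(X, y)

# One importance value per feature; the values sum to 1
for idx, imp in enumerate(rf_model.feature_importances_):
    print(f"feature {idx}: {imp:.3f}")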
Evaluation Metrics
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)
def evaluate_regressor(y_true, y_pred):
    metrics = {
        'mse': mean_squared_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred)
    }
    return metrics
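A usage sketch evaluating on a held-out test set (the split ratio and choice of LinearRegression are illustrative):
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr = LinearRegression().fit(X_train, y_train)
print(evaluate_regressor(y_test, lr.predict(X_test)))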
Cross-Validation and Model Selection
from sklearn.model_selection import cross_val_score, KFold
def perform_regression_cv(model, X, y, cv=5):
    # Perform cross-validation
    cv_scores = cross_val_score(
        model, X, y,
        cv=cv,
        scoring='neg_mean_squared_error'
    )
    
    # Convert MSE to RMSE
    rmse_scores = np.sqrt(-cv_scores)
    
    return {
        'mean_rmse': rmse_scores.mean(),
        'std_rmse': rmse_scores.std()
    }
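A usage sketch, cross-validating a Ridge model (alpha=1.0 and cv=5 are illustrative choices):
cv_results = perform_regression_cv(Ridge(alpha=1.0), X, y, cv=5)
print(f"RMSE: {cv_results['mean_rmse']:.3f} ± {cv_results['std_rmse']:.3f}")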
Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

def tune_regression_model(model, param_grid, X, y):
    # Perform grid search
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    grid_search.fit(X, y)
    
    # Get best parameters and score
    best_params = grid_search.best_params_
    best_score = np.sqrt(-grid_search.best_score_)
    
    return grid_search.best_estimator_, best_params, best_score
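A usage sketch with a small Ridge grid (the alpha candidates are illustrative):
param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}
best_model, best_params, best_rmse = tune_regression_model(Ridge(), param_grid, X, y)
print("best params:", best_params, "| CV RMSE:", round(best_rmse, 3))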
Best Practices
- Feature Scaling
from sklearn.preprocessing import StandardScaler

def scale_features(X_train, X_test):
    # Fit the scaler on training data only to avoid leakage into the test set
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, scaler
- Handling Outliers
def remove_outliers(X, y, threshold=3):
    # Drop samples whose target is more than `threshold` standard deviations from the mean
    z_scores = np.abs((y - y.mean()) / y.std())
    mask = z_scores < threshold
    return X[mask], y[mask]
- Feature Selection
from sklearn.feature_selection import SelectKBest, f_regression

def select_features_for_regression(X, y, k=10):
    # Univariate F-test scores; assumes X is a pandas DataFrame so column names are available
    selector = SelectKBest(score_func=f_regression, k=k)
    X_selected = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()].tolist()
    return X_selected, selected_features