Regression
Understanding and implementing regression algorithms in machine learning
Regression is a supervised learning task where the goal is to predict a continuous numerical value, such as a price or a temperature, from input features.
Linear Regression
Simple Linear Regression
The most basic regression model with one feature:
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

def train_simple_linear_regression(X, y):
    # Initialize and fit the model on a column-vector view of the 1-D feature array
    model = LinearRegression()
    model.fit(X.reshape(-1, 1), y)
    # Get coefficients
    slope = model.coef_[0]
    intercept = model.intercept_
    return model, slope, intercept

def plot_regression_line(X, y, model):
    # Scatter the raw points and overlay the fitted line
    plt.scatter(X, y, color='blue', alpha=0.5)
    plt.plot(X, model.predict(X.reshape(-1, 1)), color='red')
    plt.xlabel('Feature')
    plt.ylabel('Target')
    plt.title('Linear Regression Fit')
    return plt
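A quick sanity check of the helpers above on synthetic data (the slope of 2 and intercept of 1 are arbitrary values chosen for illustration):
# Hypothetical usage: fit a line to noisy synthetic data
rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=100)
y = 2 * X + 1 + rng.normal(0, 1, size=100)

model, slope, intercept = train_simple_linear_regression(X, y)
print(f"slope={slope:.2f}, intercept={intercept:.2f}")  # should recover roughly 2 and 1
plot_regression_line(X, y, model).show()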
Multiple Linear Regression
Extending to multiple features:
import pandas as pd

def train_multiple_linear_regression(X, y):
    # Initialize and train model
    model = LinearRegression()
    model.fit(X, y)
    # Collect per-feature coefficients (comparable across features only if inputs are on the same scale)
    importance = pd.DataFrame({
        'feature': range(X.shape[1]),
        'coefficient': model.coef_
    })
    return model, importance
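A minimal usage sketch, assuming a random three-feature design matrix (the coefficients in the data-generating line are illustrative):
# Hypothetical usage with a synthetic 3-feature design matrix
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = 1.5 * X[:, 0] - 2.0 * X[:, 1] + 0.5 * X[:, 2] + rng.normal(0, 0.1, size=200)

model, importance = train_multiple_linear_regression(X, y)
print(importance.sort_values('coefficient', ascending=False))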
Regularized Regression
Ridge Regression (L2)
Ridge adds an L2 penalty on the coefficients, shrinking them toward zero to reduce overfitting:
from sklearn.linear_model import Ridge

def train_ridge_regression(X, y, alpha=1.0):
    # Initialize and train model
    model = Ridge(alpha=alpha)
    model.fit(X, y)
    return model

def plot_ridge_path(X, y, alphas):
    # Refit the model for each alpha and record the coefficients
    coefs = []
    for alpha in alphas:
        model = Ridge(alpha=alpha)
        model.fit(X, y)
        coefs.append(model.coef_)
    plt.figure(figsize=(10, 6))
    plt.plot(alphas, np.array(coefs))
    plt.xscale('log')
    plt.xlabel('Alpha')
    plt.ylabel('Coefficients')
    plt.title('Ridge Path')
    return plt
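One way to exercise these helpers, assuming X and y are the feature matrix and target from the previous example; the alpha grid is an arbitrary log-spaced range:
# Hypothetical usage: fit one Ridge model, then trace how coefficients shrink as alpha grows
ridge_model = train_ridge_regression(X, y, alpha=1.0)
alphas = np.logspace(-3, 3, 50)
plot_ridge_path(X, y, alphas).show()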
Lasso Regression (L1)
Lasso's L1 penalty drives some coefficients exactly to zero, so it doubles as a feature selector:
from sklearn.linear_model import Lasso

def train_lasso_regression(X, y, alpha=1.0):
    # Initialize and train model
    model = Lasso(alpha=alpha)
    model.fit(X, y)
    # Get selected features
    selected_features = np.where(model.coef_ != 0)[0]
    return model, selected_features
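A short usage sketch on the same X and y; the alpha value is an arbitrary starting point, and larger values zero out more coefficients:
# Hypothetical usage: inspect which features survive the L1 penalty
lasso_model, selected = train_lasso_regression(X, y, alpha=0.1)
print(f"{len(selected)} of {X.shape[1]} features kept:", selected)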
Elastic Net
Elastic Net combines the L1 and L2 penalties; the l1_ratio parameter controls the mix (1.0 is equivalent to Lasso, 0.0 to Ridge):
from sklearn.linear_model import ElasticNet

def train_elastic_net(X, y, alpha=1.0, l1_ratio=0.5):
    # Initialize and train model
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model.fit(X, y)
    return model
Polynomial Regression
Handling non-linear relationships:
from sklearn.preprocessing import PolynomialFeatures

def train_polynomial_regression(X, y, degree=2):
    # Create polynomial features
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(X.reshape(-1, 1))
    # Train model
    model = LinearRegression()
    model.fit(X_poly, y)
    return model, poly

def plot_polynomial_fit(X, y, model, poly):
    # Generate points for smooth curve
    X_test = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
    X_test_poly = poly.transform(X_test)
    y_pred = model.predict(X_test_poly)
    plt.scatter(X, y, color='blue', alpha=0.5)
    plt.plot(X_test, y_pred, color='red')
    plt.title(f'Polynomial Regression (degree={poly.degree})')
    return plt
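A usage sketch on synthetic non-linear data (a noisy parabola, chosen only to make the curvature visible):
# Hypothetical usage: fit a degree-2 polynomial to y = x^2 plus noise
rng = np.random.default_rng(0)
X_curve = rng.uniform(-3, 3, size=100)
y_curve = X_curve ** 2 + rng.normal(0, 0.5, size=100)

poly_model, poly = train_polynomial_regression(X_curve, y_curve, degree=2)
plot_polynomial_fit(X_curve, y_curve, poly_model, poly).show()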
Support Vector Regression (SVR)
SVR fits a function that stays within an epsilon-insensitive margin of the training targets, using kernels to capture non-linear structure:
from sklearn.svm import SVR

def train_svr(X, y, kernel='rbf'):
    # Initialize and train model
    model = SVR(kernel=kernel)
    model.fit(X, y)
    return model
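SVR with an RBF kernel is sensitive to feature scale, so it is usually combined with a scaler; a minimal sketch (the helper name and the C value are illustrative additions, not part of the functions above):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def train_scaled_svr(X, y, kernel='rbf', C=1.0):
    # Standardize features before fitting, since the RBF kernel is distance-based
    model = make_pipeline(StandardScaler(), SVR(kernel=kernel, C=C))
    model.fit(X, y)
    return model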
Tree-based Regression
Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

def train_decision_tree_regressor(X, y, max_depth=5):
    # Initialize and train model
    model = DecisionTreeRegressor(max_depth=max_depth)
    model.fit(X, y)
    return model
Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

def train_random_forest_regressor(X, y):
    # Initialize and train model
    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=None
    )
    model.fit(X, y)
    return model
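Tree ensembles expose impurity-based importances directly via feature_importances_; a small helper sketch (the function name is illustrative):
def get_forest_importances(model, feature_names=None):
    # Impurity-based importances, one value per feature, summing to 1
    importances = model.feature_importances_
    names = feature_names if feature_names is not None else range(len(importances))
    table = pd.DataFrame({'feature': list(names), 'importance': importances})
    return table.sort_values('importance', ascending=False)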
Evaluation Metrics
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)

def evaluate_regressor(y_true, y_pred):
    # Report the standard regression metrics in one dictionary
    metrics = {
        'mse': mean_squared_error(y_true, y_pred),
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred)
    }
    return metrics
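A typical call evaluates on a held-out split; the split ratio and model choice here are arbitrary:
from sklearn.model_selection import train_test_split

# Hypothetical end-to-end check on a held-out test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
model = train_random_forest_regressor(X_train, y_train)
print(evaluate_regressor(y_test, model.predict(X_test)))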
Cross-Validation and Model Selection
from sklearn.model_selection import cross_val_score

def perform_regression_cv(model, X, y, cv=5):
    # Perform cross-validation
    cv_scores = cross_val_score(
        model, X, y,
        cv=cv,
        scoring='neg_mean_squared_error'
    )
    # Convert MSE to RMSE
    rmse_scores = np.sqrt(-cv_scores)
    return {
        'mean_rmse': rmse_scores.mean(),
        'std_rmse': rmse_scores.std()
    }
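For example, two candidate models can be compared under the same folds (the candidates are illustrative):
# Hypothetical comparison of two models by cross-validated RMSE
for candidate in [LinearRegression(), Ridge(alpha=1.0)]:
    scores = perform_regression_cv(candidate, X, y, cv=5)
    print(type(candidate).__name__, scores)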
Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

def tune_regression_model(model, param_grid, X, y):
    # Perform grid search
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    grid_search.fit(X, y)
    # Get best parameters and best RMSE (converted from negative MSE)
    best_params = grid_search.best_params_
    best_score = np.sqrt(-grid_search.best_score_)
    return grid_search.best_estimator_, best_params, best_score
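A sketch of how this might be called for Ridge; the grid values are arbitrary starting points, not recommendations:
# Hypothetical grid search over the Ridge regularization strength
param_grid = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}
best_model, best_params, best_rmse = tune_regression_model(Ridge(), param_grid, X, y)
print(best_params, best_rmse)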
Best Practices
- Feature Scaling: standardize inputs, fitting the scaler on the training split only to avoid test-set leakage
from sklearn.preprocessing import StandardScaler

def scale_features(X_train, X_test):
    # Fit the scaler on the training data only, then apply it to both splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, scaler
- Handling Outliers: extreme target values can dominate squared-error losses; a simple z-score filter is one option
def remove_outliers(X, y, threshold=3):
    # Drop rows whose target lies more than `threshold` standard deviations from the mean
    z_scores = np.abs((y - y.mean()) / y.std())
    mask = z_scores < threshold
    return X[mask], y[mask]
- Feature Selection: keep only the features with the strongest univariate relationship to the target
from sklearn.feature_selection import SelectKBest, f_regression

def select_features_for_regression(X, y, k=10):
    # Univariate selection: keep the k features with the strongest F-statistic against the target
    selector = SelectKBest(score_func=f_regression, k=k)
    X_selected = selector.fit_transform(X, y)
    # Assumes X is a pandas DataFrame so that column names are available
    selected_features = X.columns[selector.get_support()].tolist()
    return X_selected, selected_features
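Putting the practices together, one possible order of operations, assuming X_df and y_series hold the raw features (as a DataFrame) and target; all helper functions are the ones defined above and the constants are illustrative:
# Hypothetical end-to-end workflow: clean, split, scale, fit, evaluate
X_clean, y_clean = remove_outliers(X_df, y_series, threshold=3)
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=0)
X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)
model = train_ridge_regression(X_train_scaled, y_train, alpha=1.0)
print(evaluate_regressor(y_test, model.predict(X_test_scaled)))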