AutoML and Pipelines

Understanding and implementing automated machine learning and pipeline construction for efficient model development

Introduction to AutoML

AutoML automates the process of applying machine learning to real-world problems, including feature engineering, model selection, and hyperparameter optimization.

Pipeline Construction

Basic Pipeline

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def create_basic_pipeline():
    """Build a two-step pipeline: standard scaling, then a random forest.

    Returns:
        An unfitted ``Pipeline`` whose steps are named ``'scaler'`` and
        ``'classifier'``, both constructed with their default settings.
    """
    scaling_step = ('scaler', StandardScaler())
    model_step = ('classifier', RandomForestClassifier())
    return Pipeline([scaling_step, model_step])

def create_custom_pipeline(steps):
    """Wrap caller-supplied steps in a ``Pipeline``.

    Args:
        steps: Passed unchanged as the first positional argument of
            ``Pipeline``.

    Returns:
        The newly constructed ``Pipeline``.
    """
    custom_pipeline = Pipeline(steps)
    return custom_pipeline

Feature Engineering Pipeline

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def create_feature_pipeline(numeric_features, categorical_features):
    """Assemble a ``ColumnTransformer`` for numeric and categorical columns.

    Args:
        numeric_features: Column selection handed to the ``'num'`` branch,
            which applies ``StandardScaler``.
        categorical_features: Column selection handed to the ``'cat'``
            branch, which applies ``OneHotEncoder(drop='first')``.

    Returns:
        The unfitted ``ColumnTransformer``.
    """
    scale_numeric = Pipeline(steps=[('scaler', StandardScaler())])
    encode_categorical = Pipeline(
        steps=[('onehot', OneHotEncoder(drop='first'))]
    )

    # Each entry is (branch name, transformer, columns it applies to).
    column_plan = [
        ('num', scale_numeric, numeric_features),
        ('cat', encode_categorical, categorical_features),
    ]
    return ColumnTransformer(transformers=column_plan)

AutoML Implementation

Model Search

import optuna

def automl_model_search(X, y, n_trials=100):
    """Search over model families and hyperparameters with Optuna.

    Each trial picks one of ``'rf'``, ``'xgb'`` or ``'lgb'``, samples that
    model's hyperparameters, and is scored by the mean of a 5-fold
    ``cross_val_score``. The study maximizes that score.

    Args:
        X: Feature data forwarded to ``cross_val_score``.
        y: Target data forwarded to ``cross_val_score``.
        n_trials: Number of Optuna trials to run.

    Returns:
        ``study.best_trial`` of the finished study.
    """
    # The file never brings cross_val_score into scope, so every trial used
    # to fail with NameError. Import it here so the function is usable.
    from sklearn.model_selection import cross_val_score

    def objective(trial):
        model_type = trial.suggest_categorical(
            'model_type',
            ['rf', 'xgb', 'lgb']
        )

        # Every model family samples these two with the same names and
        # ranges, so they are suggested once, in the original order.
        n_estimators = trial.suggest_int('n_estimators', 10, 100)
        max_depth = trial.suggest_int('max_depth', 2, 32)

        if model_type == 'rf':
            model = RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=trial.suggest_int('min_samples_split', 2, 10)
            )
        else:
            learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1)
            # NOTE(review): XGBClassifier and LGBMClassifier are not imported
            # anywhere in the visible file. They presumably come from the
            # xgboost and lightgbm packages; make them available before
            # running, otherwise these trials raise NameError.
            booster_cls = XGBClassifier if model_type == 'xgb' else LGBMClassifier
            model = booster_cls(
                n_estimators=n_estimators,
                max_depth=max_depth,
                learning_rate=learning_rate
            )

        # Mean score across the 5 cross-validation folds.
        return cross_val_score(model, X, y, cv=5).mean()

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    return study.best_trial

Feature Selection

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

def automated_feature_selection(X, y, method='rf'):
    """Select features with a model-based selector.

    Args:
        X: Feature table. Its ``columns`` attribute is indexed with the
            selector's support mask, so a pandas DataFrame is assumed.
        y: Target values passed to ``fit_transform``.
        method: Selection strategy. Only ``'rf'`` (a 100-tree random forest
            wrapped in ``SelectFromModel``) is implemented.

    Returns:
        A tuple ``(X_selected, selected_features, selector)``: the
        transformed data, the list of retained column names, and the fitted
        selector.

    Raises:
        ValueError: If ``method`` is not a supported strategy. Previously
            this case fell through and failed with ``UnboundLocalError``.
    """
    # Reject unknown methods up front, before any model is built.
    if method != 'rf':
        raise ValueError(
            f"Unsupported feature selection method: {method!r} (expected 'rf')"
        )

    selector = SelectFromModel(
        RandomForestClassifier(n_estimators=100),
        prefit=False
    )

    X_selected = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()].tolist()

    return X_selected, selected_features, selector

Hyperparameter Optimization

from sklearn.model_selection import RandomizedSearchCV

def automl_hyperparameter_tuning(model, param_distributions, X, y, *,
                                 n_iter=100, cv=5, random_state=None):
    """Tune a model's hyperparameters with ``RandomizedSearchCV``.

    Args:
        model: Estimator to tune.
        param_distributions: Search space forwarded to
            ``RandomizedSearchCV``.
        X: Training features.
        y: Training targets.
        n_iter: Number of parameter settings to sample. Defaults to 100,
            the value that was previously hard-coded.
        cv: Cross-validation setting. Defaults to 5, the value that was
            previously hard-coded.
        random_state: Optional seed forwarded to the search so runs can be
            reproduced. ``None`` keeps the previous behavior.

    Returns:
        A tuple ``(best_estimator_, best_params_)`` from the fitted search.
    """
    random_search = RandomizedSearchCV(
        model,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=cv,
        n_jobs=-1,
        verbose=1,
        random_state=random_state
    )

    random_search.fit(X, y)

    return random_search.best_estimator_, random_search.best_params_

Advanced Pipeline Features

Custom Transformers

class FeatureGenerator(BaseEstimator, TransformerMixin):
    """Transformer that appends derived columns for every input column.

    For each column ``c`` of the input, the enabled operations add
    ``c_log`` (``np.log1p``), ``c_squared`` (``** 2``) and ``c_sqrt``
    (``np.sqrt``). No validation of value ranges is performed.

    Args:
        operations: Collection of operation names drawn from ``'log'``,
            ``'square'`` and ``'sqrt'``. ``None`` enables all three. An
            explicit empty collection adds no columns.
    """

    # Operations applied when ``operations`` is left as None.
    DEFAULT_OPERATIONS = ('log', 'square', 'sqrt')

    def __init__(self, operations=None):
        # Store the argument untouched: sklearn's get_params/clone expect
        # __init__ to only assign parameters. The default is resolved in
        # transform(), so an explicit empty list is no longer replaced.
        self.operations = operations

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer is stateless."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the derived columns appended.

        Args:
            X: Input table. It is accessed through ``.copy()``, ``.columns``
                and column indexing, so a pandas DataFrame is assumed.

        Returns:
            A copy of ``X`` with one extra column per (column, operation)
            pair. ``X`` itself is not modified.
        """
        if self.operations is None:
            operations = self.DEFAULT_OPERATIONS
        else:
            operations = self.operations

        # Membership does not depend on the column, so test it once.
        use_log = 'log' in operations
        use_square = 'square' in operations
        use_sqrt = 'sqrt' in operations

        X_new = X.copy()

        for col in X.columns:
            if use_log:
                X_new[f'{col}_log'] = np.log1p(X[col])
            if use_square:
                X_new[f'{col}_squared'] = X[col] ** 2
            if use_sqrt:
                X_new[f'{col}_sqrt'] = np.sqrt(X[col])

        return X_new

Pipeline Persistence

import joblib

def save_pipeline(pipeline, filename):
    """Serialize ``pipeline`` to ``filename`` with ``joblib.dump``.

    Args:
        pipeline: Object to persist.
        filename: Destination handed to ``joblib.dump``.
    """
    joblib.dump(value=pipeline, filename=filename)

def load_pipeline(filename):
    """Restore an object previously written with ``joblib.dump``.

    joblib persistence is pickle-based, so only load files from a source
    you trust.

    Args:
        filename: Location handed to ``joblib.load``.

    Returns:
        Whatever object ``joblib.load`` reconstructs.
    """
    restored = joblib.load(filename)
    return restored

AutoML Libraries Integration

H2O AutoML

import h2o
from h2o.automl import H2OAutoML

def run_h2o_automl(X, y, time_limit=3600):
    """Train H2O AutoML on ``X`` and ``y`` and return the leading model.

    Args:
        X: Feature table, concatenated column-wise with ``y``.
        y: Target column. Its ``name`` attribute is used as the response
            column, so a named pandas Series is assumed.
        time_limit: Value passed as ``max_runtime_secs`` to ``H2OAutoML``.

    Returns:
        ``aml.leader`` of the trained AutoML run.
    """
    # pandas is used below but never imported in the visible file, which
    # made this function fail with NameError. Import it locally.
    import pandas as pd

    h2o.init()

    # H2O trains from a single frame holding features and target together.
    combined = pd.concat([X, y], axis=1)
    train = h2o.H2OFrame(combined)

    aml = H2OAutoML(
        max_runtime_secs=time_limit,
        seed=1
    )
    aml.train(
        y=y.name,
        training_frame=train
    )

    return aml.leader

Auto-Sklearn

import autosklearn.classification

def run_autosklearn(X, y, time_limit=3600):
    """Fit an ``AutoSklearnClassifier`` on ``X`` and ``y``.

    Args:
        X: Training features.
        y: Training targets.
        time_limit: Value passed as ``time_left_for_this_task``.

    Returns:
        The fitted ``AutoSklearnClassifier`` instance.
    """
    settings = {
        'time_left_for_this_task': time_limit,
        'per_run_time_limit': 360,
        'ensemble_size': 50,
    }
    automl = autosklearn.classification.AutoSklearnClassifier(**settings)
    automl.fit(X, y)
    return automl

Best Practices

1. Pipeline Design

- Keep pipelines modular
- Include all preprocessing steps
- Handle missing values
- Validate transformations

2. AutoML Configuration

- Set appropriate time limits
- Define metric objectives
- Consider resource constraints
- Monitor progress

3. Model Selection

- Define the model search space
- Use cross-validation
- Consider model complexity
- Evaluate performance

4. Common Pitfalls

- Overfitting in AutoML
- Insufficient validation
- Resource exhaustion
- Overly complex pipelines

Previous: Regularization

Next: Applications

Getting Started

Math

Machine Learning

Deep Learning

Natural Language Processing

Reinforcement Learning

References