Data Preprocessing
Essential techniques for preparing data for machine learning models
Data preprocessing is a crucial step that transforms raw data into a clean and suitable format for machine learning models.
Data Cleaning
Handling Duplicates
def remove_duplicates(df, subset=None):
    """
    Remove duplicate rows from dataframe
    """
    n_before = len(df)
    df = df.drop_duplicates(subset=subset)
    n_after = len(df)
    print(f"Removed {n_before - n_after} duplicate rows")
    return df
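A quick usage sketch (the toy frame below is illustrative, not from any real dataset):

import pandas as pd

# Hypothetical frame with one exact duplicate row
df = pd.DataFrame({'id': [1, 1, 2], 'value': ['a', 'a', 'b']})
df = remove_duplicates(df)  # prints "Removed 1 duplicate rows"
print(len(df))              # 2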
Outlier Detection
Statistical Methods
- Z-Score Method

import numpy as np

def detect_outliers_zscore(data, threshold=3):
    # Flag points more than `threshold` standard deviations from the mean
    z_scores = np.abs((data - data.mean()) / data.std())
    return z_scores > threshold
- IQR Method

def detect_outliers_iqr(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return (data < lower_bound) | (data > upper_bound)
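The two detectors above behave differently on small samples: the z-score is bounded by (n - 1) / sqrt(n), so it can miss even an extreme point when n is tiny, while the IQR rule still catches it. A toy comparison (values made up):

import pandas as pd

data = pd.Series([1.0, 1.2, 0.9, 1.1, 1.0, 15.0])  # 15.0 is an obvious outlier
print(detect_outliers_zscore(data).sum())  # 0: max z-score here is about 2.04 < 3
print(detect_outliers_iqr(data).sum())     # 1: the IQR rule flags 15.0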
Machine Learning Methods
- Isolation Forest

from sklearn.ensemble import IsolationForest

def detect_outliers_iforest(X):
    # contamination=0.1 assumes roughly 10% of points are outliers
    iso_forest = IsolationForest(contamination=0.1)
    yhat = iso_forest.fit_predict(X)
    return yhat == -1  # True for outliers
- Local Outlier Factor

from sklearn.neighbors import LocalOutlierFactor

def detect_outliers_lof(X):
    lof = LocalOutlierFactor()
    yhat = lof.fit_predict(X)
    return yhat == -1
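A sketch of both detectors on synthetic 2-D data (the cluster-plus-outlier setup is illustrative only):

import numpy as np

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, size=(100, 2)),  # dense Gaussian cluster
               np.array([[8.0, 8.0]])])          # one far-away point
print(detect_outliers_iforest(X)[-1])  # True: the distant point is flagged
print(detect_outliers_lof(X)[-1])      # True: its local density is far lower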
Data Transformation
Numerical Transformations
- Log Transform

import numpy as np

def log_transform(data, epsilon=1e-10):
    """
    Apply a log transformation; epsilon guards against log(0).
    Note that negative values are not handled and would still yield NaN,
    so shift such data to be non-negative first.
    """
    return np.log(data + epsilon)
- Box-Cox Transform

from scipy.stats import boxcox

def box_cox_transform(data):
    """
    Apply a Box-Cox transformation. boxcox requires strictly positive
    input, so the +1 shift assumes the data are non-negative.
    """
    transformed_data, lambda_param = boxcox(data + 1)
    return transformed_data, lambda_param
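For instance, on right-skewed data (synthetic values; inv_boxcox is shown only to illustrate that the transform is invertible):

import numpy as np
from scipy.special import inv_boxcox

rng = np.random.default_rng(1)
skewed = rng.exponential(scale=2.0, size=1000)  # non-negative, right-skewed
transformed, lam = box_cox_transform(skewed)
restored = inv_boxcox(transformed, lam) - 1     # undo the transform and the +1 shift
print(np.allclose(restored, skewed))            # True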
Categorical Transformations
- Frequency Encoding

def frequency_encode(data, column):
    """
    Replace categories with their relative frequencies
    """
    frequency = data[column].value_counts(normalize=True)
    return data[column].map(frequency)
- Mean Encoding with Regularization

def mean_encode_regularized(train, test, column, target, alpha=10):
    """
    Mean (target) encoding with additive smoothing
    """
    global_mean = train[target].mean()
    agg = train.groupby(column)[target].agg(['count', 'mean'])
    # Apply smoothing: rare categories shrink toward the global mean
    counts = agg['count']
    means = agg['mean']
    smooth = (counts * means + alpha * global_mean) / (counts + alpha)
    # Map values; categories unseen in train fall back to the global mean
    train_encoded = train[column].map(smooth)
    test_encoded = test[column].map(smooth).fillna(global_mean)
    return train_encoded, test_encoded
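A small sketch of both encoders on toy frames (alpha is lowered to 1 so the shrinkage is visible):

import pandas as pd

train = pd.DataFrame({'cat': ['a', 'a', 'b'], 'y': [1, 1, 0]})
test = pd.DataFrame({'cat': ['a', 'c']})  # 'c' never appears in train
tr_enc, te_enc = mean_encode_regularized(train, test, 'cat', 'y', alpha=1)
print(tr_enc.round(3).tolist())  # [0.889, 0.889, 0.333]
print(te_enc.round(3).tolist())  # [0.889, 0.667]; unseen 'c' -> global mean
print(frequency_encode(train, 'cat').round(3).tolist())  # [0.667, 0.667, 0.333]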
Feature Scaling
Advanced Scaling Techniques
- Robust Scaling

from sklearn.preprocessing import RobustScaler

def robust_scale(X_train, X_test):
    """
    Scale features using statistics that are robust to outliers
    """
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled
- Quantile Transformation

from sklearn.preprocessing import QuantileTransformer

def quantile_transform(X_train, X_test, n_quantiles=1000):
    """
    Transform features to follow a normal distribution.
    n_quantiles should not exceed the number of training samples;
    scikit-learn clips it (with a warning) if it does.
    """
    transformer = QuantileTransformer(
        n_quantiles=n_quantiles,
        output_distribution='normal'
    )
    X_train_transformed = transformer.fit_transform(X_train)
    X_test_transformed = transformer.transform(X_test)
    return X_train_transformed, X_test_transformed
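A quick check of the robust scaler on data with an extreme value (synthetic numbers; quantile_transform can be exercised the same way):

import numpy as np

X_train = np.array([[1.0], [2.0], [3.0], [100.0]])  # 100 is an outlier
X_test = np.array([[2.5]])
train_scaled, _ = robust_scale(X_train, X_test)
print(train_scaled.ravel())
# Median/IQR based: the inliers stay near zero while 100 stands out,
# instead of the outlier squashing everything as with mean/std scaling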
Feature Creation
Polynomial Features
from sklearn.preprocessing import PolynomialFeatures

def create_polynomial_features(X, degree=2):
    """
    Create polynomial and interaction features
    """
    poly = PolynomialFeatures(degree=degree)
    return poly.fit_transform(X)
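For two input columns and degree=2 this yields the bias term, the originals, and all squares and cross terms:

import numpy as np

X = np.array([[2.0, 3.0]])
print(create_polynomial_features(X))
# [[1. 2. 3. 4. 6. 9.]] -> 1, x1, x2, x1^2, x1*x2, x2^2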
Time-based Features
def create_datetime_features(df, date_column):
    """
    Extract useful features from a datetime column
    """
    df = df.copy()
    df['hour'] = df[date_column].dt.hour
    df['day'] = df[date_column].dt.day
    df['month'] = df[date_column].dt.month
    df['year'] = df[date_column].dt.year
    df['day_of_week'] = df[date_column].dt.dayofweek  # Monday=0 ... Sunday=6
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    return df
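This assumes the column already has a datetime dtype; convert first if it arrives as strings (the dates below are made up):

import pandas as pd

df = pd.DataFrame({'ts': ['2024-01-06 14:30', '2024-01-08 09:00']})
df['ts'] = pd.to_datetime(df['ts'])
df = create_datetime_features(df, 'ts')
print(df[['day_of_week', 'is_weekend']])
# 2024-01-06 is a Saturday -> is_weekend == 1; 2024-01-08 is a Monday -> 0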
Data Validation
Schema Validation
from dataclasses import dataclass
from typing import Dict, List

import numpy as np
import pandas as pd

@dataclass
class DataSchema:
    numerical_features: List[str]
    categorical_features: List[str]
    datetime_features: List[str]
    target: str

def validate_schema(df: pd.DataFrame, schema: DataSchema) -> bool:
    """
    Validate dataframe against schema
    """
    # Check that all declared features exist
    all_features = (
        schema.numerical_features +
        schema.categorical_features +
        schema.datetime_features
    )
    if not all(feat in df.columns for feat in all_features):
        return False
    # Check data types of numerical features
    for feat in schema.numerical_features:
        if not np.issubdtype(df[feat].dtype, np.number):
            return False
    return True
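A minimal usage sketch (the column names are placeholders):

import pandas as pd

schema = DataSchema(
    numerical_features=['age'],
    categorical_features=['city'],
    datetime_features=[],
    target='churn'
)
df = pd.DataFrame({'age': [25, 40], 'city': ['NY', 'LA'], 'churn': [0, 1]})
print(validate_schema(df, schema))  # True: columns exist and 'age' is numeric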
Data Quality Checks
def check_data_quality(df: pd.DataFrame) -> Dict:
    """
    Perform various data quality checks
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    checks = {
        'missing_values': df.isnull().sum().to_dict(),
        'unique_counts': df.nunique().to_dict(),
        'negative_values': {col: int((df[col] < 0).sum()) for col in numeric_cols},
        'zero_values': {col: int((df[col] == 0).sum()) for col in numeric_cols},
    }
    return checks
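Run against a small frame to see the shape of the report (toy data):

import pandas as pd

df = pd.DataFrame({'x': [1, 0, -2, None], 'label': ['a', 'b', 'b', 'a']})
report = check_data_quality(df)
print(report['missing_values'])   # {'x': 1, 'label': 0}
print(report['negative_values'])  # {'x': 1}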
Pipeline Creation
Custom Transformers
from sklearn.base import BaseEstimator, TransformerMixin

class CustomPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_features, categorical_features):
        self.numerical_features = numerical_features
        self.categorical_features = categorical_features

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Transform numerical features
        for feature in self.numerical_features:
            X[feature] = self._transform_numerical(X[feature])
        # Transform categorical features
        for feature in self.categorical_features:
            X[feature] = self._transform_categorical(X[feature])
        return X

    def _transform_numerical(self, series):
        # Add your numerical transformations
        return series

    def _transform_categorical(self, series):
        # Add your categorical transformations
        return series
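One way to fill in the hooks (a hypothetical subclass; log1p and lowercasing are arbitrary example transforms, not prescribed by the skeleton above):

import numpy as np

class LogCasePreprocessor(CustomPreprocessor):
    def _transform_numerical(self, series):
        return np.log1p(series)  # assumes non-negative values

    def _transform_categorical(self, series):
        return series.str.lower()  # normalize category casing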
Complete Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler

def create_preprocessing_pipeline(numerical_features, categorical_features):
    """
    Create a complete preprocessing pipeline: impute and scale numerical
    features, impute and one-hot encode categorical features
    """
    numerical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ])
    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    return preprocessor
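The preprocessor drops into a modelling pipeline like any other transformer (LogisticRegression is just an example estimator, and the toy frame is made up):

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

df = pd.DataFrame({
    'age': [25.0, 32.0, np.nan, 51.0],
    'city': ['NY', 'LA', 'NY', np.nan],
    'churn': [0, 1, 0, 1],
})
model = Pipeline([
    ('preprocess', create_preprocessing_pipeline(['age'], ['city'])),
    ('clf', LogisticRegression()),
])
model.fit(df[['age', 'city']], df['churn'])
print(model.predict(df[['age', 'city']]))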