Dimensionality Reduction

This section covers various dimensionality reduction techniques and their implementation in machine learning.

Linear Methods

1. Principal Component Analysis (PCA)

from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

class PCAReducer:
    """Thin wrapper around sklearn PCA with plotting helpers."""

    def __init__(
        self,
        n_components=None,
        **kwargs
    ):
        """Initialize PCA reducer.

        Args:
            n_components: Number of components to keep (None keeps all).
            **kwargs: Extra arguments forwarded to sklearn.decomposition.PCA.
        """
        self.n_components = n_components
        self.model = PCA(n_components=n_components, **kwargs)

    def fit(self, X):
        """Fit the PCA model on X; returns self for chaining."""
        self.model.fit(X)
        return self

    def transform(self, X):
        """Project X onto the fitted principal components."""
        return self.model.transform(X)

    def fit_transform(self, X):
        """Fit on X and return the projected data in one step."""
        return self.model.fit_transform(X)

    def inverse_transform(self, X):
        """Map projected data back to the original feature space."""
        return self.model.inverse_transform(X)

    def plot_explained_variance(self):
        """Plot individual and cumulative explained variance ratios.

        Returns:
            The current matplotlib figure.
        """
        plt.figure(figsize=(10, 6))

        ratios = self.model.explained_variance_ratio_
        # Fix: use 1-based component counts on the x axis — plotting against
        # the default 0-based index implied 0 components explain variance.
        component_counts = np.arange(1, len(ratios) + 1)

        plt.plot(
            component_counts,
            np.cumsum(ratios),
            'bo-',
            label='Cumulative'
        )
        plt.plot(
            component_counts,
            ratios,
            'ro-',
            label='Individual'
        )

        plt.xlabel('Number of Components')
        plt.ylabel('Explained Variance Ratio')
        plt.title('PCA Explained Variance')
        plt.legend()
        plt.grid(True)
        return plt.gcf()

    def plot_components(self, feature_names=None):
        """Plot the component loading matrix as a heatmap.

        Args:
            feature_names: Optional feature labels for the x axis.

        Returns:
            The current matplotlib figure.
        """
        n_components = self.model.components_.shape[0]

        plt.figure(figsize=(12, 8))
        plt.imshow(
            self.model.components_,
            cmap='coolwarm',
            aspect='auto'
        )
        plt.colorbar()

        # Fix: PC row labels were previously only drawn when feature_names
        # was supplied, though they do not depend on it — label rows always.
        plt.yticks(
            range(n_components),
            [f'PC{i+1}' for i in range(n_components)]
        )

        if feature_names is not None:
            plt.xticks(
                range(len(feature_names)),
                feature_names,
                rotation=45,
                ha='right'
            )

        plt.title('PCA Components')
        plt.tight_layout()
        return plt.gcf()

2. Linear Discriminant Analysis (LDA)

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

class LDAReducer:
    """Supervised dimensionality reduction via Linear Discriminant Analysis."""

    def __init__(self, n_components=None, **kwargs):
        """Create the reducer; extra kwargs are forwarded to sklearn's LDA."""
        self.n_components = n_components
        self.model = LinearDiscriminantAnalysis(
            n_components=n_components, **kwargs
        )

    def fit(self, X, y):
        """Learn the discriminant axes from labelled data; returns self."""
        self.model.fit(X, y)
        return self

    def transform(self, X):
        """Project X onto the learned discriminant axes."""
        return self.model.transform(X)

    def fit_transform(self, X, y):
        """Fit on (X, y) and return the projected data in one call."""
        return self.model.fit_transform(X, y)

Non-linear Methods

1. t-SNE

from sklearn.manifold import TSNE

class TSNEReducer:
    """t-SNE embedding with a convenience scatter-plot method."""

    def __init__(self, n_components=2, **kwargs):
        """Create the reducer; extra kwargs go straight to sklearn's TSNE."""
        self.n_components = n_components
        self.model = TSNE(n_components=n_components, **kwargs)

    def fit_transform(self, X):
        """Embed X (t-SNE has no separate transform for unseen data)."""
        return self.model.fit_transform(X)

    def plot_embedding(self, X, labels=None, title=None):
        """Scatter-plot the embedding, coloured by labels when given.

        Returns the current matplotlib figure.
        """
        emb = self.fit_transform(X)

        plt.figure(figsize=(10, 6))
        colour_map = 'viridis' if labels is not None else None
        points = plt.scatter(emb[:, 0], emb[:, 1], c=labels, cmap=colour_map)

        if labels is not None:
            plt.colorbar(points)

        plt.title(title or 't-SNE Embedding')
        return plt.gcf()

2. UMAP

import umap

class UMAPReducer:
    """UMAP embedding with fit/transform API and a scatter-plot helper."""

    def __init__(self, n_components=2, **kwargs):
        """Create the reducer; extra kwargs go straight to umap.UMAP."""
        self.n_components = n_components
        self.model = umap.UMAP(n_components=n_components, **kwargs)

    def fit(self, X):
        """Fit the UMAP model on X; returns self for chaining."""
        self.model.fit(X)
        return self

    def transform(self, X):
        """Embed new data using the fitted model."""
        return self.model.transform(X)

    def fit_transform(self, X):
        """Fit on X and return its embedding in one call."""
        return self.model.fit_transform(X)

    def plot_embedding(self, X, labels=None, title=None):
        """Scatter-plot the embedding, coloured by labels when given.

        Returns the current matplotlib figure.
        """
        emb = self.fit_transform(X)

        plt.figure(figsize=(10, 6))
        colour_map = 'viridis' if labels is not None else None
        points = plt.scatter(emb[:, 0], emb[:, 1], c=labels, cmap=colour_map)

        if labels is not None:
            plt.colorbar(points)

        plt.title(title or 'UMAP Embedding')
        return plt.gcf()

Autoencoder-based Methods

1. Simple Autoencoder

import tensorflow as tf
from tensorflow.keras import layers, models

class Autoencoder:
    """Fully-connected autoencoder built from mirrored Keras Sequential stacks."""

    def __init__(
        self,
        input_dim,
        encoding_dim,
        hidden_layers=None
    ):
        """Initialize autoencoder.

        Args:
            input_dim: Dimensionality of the input vectors.
            encoding_dim: Size of the bottleneck representation.
            hidden_layers: Optional list of hidden-layer widths used in the
                encoder; the decoder mirrors them in reverse order.
        """
        self.input_dim = input_dim
        self.encoding_dim = encoding_dim
        self.hidden_layers = hidden_layers or []

        self.encoder = self._build_encoder()
        self.decoder = self._build_decoder()
        self.model = self._build_autoencoder()

    def _build_encoder(self):
        """Build the encoder: hidden ReLU layers then the bottleneck."""
        encoder = models.Sequential()
        encoder.add(layers.Input(shape=(self.input_dim,)))

        for units in self.hidden_layers:
            encoder.add(layers.Dense(units, activation='relu'))

        # Bottleneck layer producing the compressed representation.
        encoder.add(layers.Dense(self.encoding_dim, activation='relu'))

        return encoder

    def _build_decoder(self):
        """Build the decoder: the encoder's hidden layers mirrored in reverse."""
        decoder = models.Sequential()
        decoder.add(layers.Input(shape=(self.encoding_dim,)))

        for units in reversed(self.hidden_layers):
            decoder.add(layers.Dense(units, activation='relu'))

        # Sigmoid output assumes inputs are scaled to [0, 1].
        decoder.add(layers.Dense(self.input_dim, activation='sigmoid'))

        return decoder

    def _build_autoencoder(self):
        """Chain encoder and decoder into the end-to-end trainable model."""
        inputs = layers.Input(shape=(self.input_dim,))
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)

        return models.Model(inputs, decoded)

    def compile(self, **kwargs):
        """Compile the end-to-end model (kwargs forwarded to Keras compile)."""
        self.model.compile(**kwargs)

    def fit(self, X, **kwargs):
        """Train the autoencoder to reconstruct X (target equals input)."""
        return self.model.fit(X, X, **kwargs)

    def encode(self, X):
        """Return the bottleneck representation of X."""
        return self.encoder.predict(X)

    def decode(self, X):
        """Reconstruct inputs from bottleneck representations."""
        return self.decoder.predict(X)

    def plot_reconstruction(self, X, n_samples=5):
        """Plot original vs reconstructed samples side by side.

        Args:
            X: Batch of input vectors.
            n_samples: Maximum number of samples to display.

        Returns:
            The current matplotlib figure.
        """
        # Bug fix: clamp n_samples to the batch size — indexing X[i] for
        # i >= len(X) raised an IndexError on small batches.
        n_samples = min(n_samples, len(X))

        reconstructed = self.model.predict(X)

        plt.figure(figsize=(15, 5))
        for i in range(n_samples):
            # Original on the top row.
            plt.subplot(2, n_samples, i + 1)
            plt.plot(X[i])
            plt.title('Original')

            # Reconstruction directly below it.
            plt.subplot(2, n_samples, n_samples + i + 1)
            plt.plot(reconstructed[i])
            plt.title('Reconstructed')

        plt.tight_layout()
        return plt.gcf()

Model Evaluation

1. Reconstruction Error

def compute_reconstruction_error(X, X_reconstructed):
    """Return the mean squared error between X and its reconstruction."""
    return np.square(X - X_reconstructed).mean()

def plot_reconstruction_error(errors):
    """Line-plot reconstruction error per iteration; returns the figure."""
    plt.figure(figsize=(10, 6))
    plt.plot(errors)

    # Axis labels and layout.
    plt.xlabel('Iteration')
    plt.ylabel('Reconstruction Error')
    plt.title('Reconstruction Error Over Time')
    plt.grid(True)

    return plt.gcf()

Best Practices

1. Data Preparation

  • Scale features appropriately
  • Handle missing values
  • Remove outliers if necessary
  • Consider feature selection

2. Algorithm Selection

  • Consider data characteristics
  • Evaluate computational requirements
  • Test multiple methods
  • Validate results

3. Parameter Tuning

  • Choose appropriate dimensions
  • Consider perplexity (t-SNE)
  • Adjust learning parameters
  • Monitor convergence

4. Evaluation

  • Use multiple metrics
  • Visualize results
  • Consider interpretability
  • Validate with domain experts