Optimization
Understanding optimization techniques and their applications in machine learning algorithms.
Optimization is a crucial aspect of machine learning, involving finding the best parameters or solutions that minimize or maximize specific objective functions.
Fundamentals of Optimization
Basic Concepts
- Objective function:
- Constraints: ,
- Local minimum: for all near
- Global minimum: for all feasible
- Convexity:
Optimality Conditions
-
First-order conditions
- Gradient:
- Directional derivative:
-
Second-order conditions
- Hessian positive definite:
- For local minimum: positive semidefinite
import numpy as np
from scipy.optimize import minimize
def check_optimality(f, x, eps=1e-6):
# Compute gradient
grad = np.gradient(f(x))
# Compute Hessian
hess = np.array([[
(f(x + eps*ei + eps*ej) - f(x + eps*ei) - f(x + eps*ej) + f(x))/(eps**2)
for ei in np.eye(len(x))]
for ej in np.eye(len(x))
])
# Check conditions
is_stationary = np.all(np.abs(grad) < eps)
is_min = np.all(np.linalg.eigvals(hess) > -eps)
return is_stationary, is_min
Gradient-Based Methods
Gradient Descent
- Update rule:
class GradientDescent:
def __init__(self, learning_rate=0.01, max_iter=1000, tol=1e-6):
self.lr = learning_rate
self.max_iter = max_iter
self.tol = tol
def optimize(self, f, grad_f, x0):
x = x0
history = [x0]
for _ in range(self.max_iter):
grad = grad_f(x)
if np.linalg.norm(grad) < self.tol:
break
x = x - self.lr * grad
history.append(x)
return x, history
Advanced Gradient Methods
-
Momentum
- Update:
- Position:
def momentum_update(x, v, grad, lr=0.01, beta=0.9): v = beta * v + grad x = x - lr * v return x, v
-
Adam Optimizer
class Adam: def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8): self.lr = lr self.beta1 = beta1 self.beta2 = beta2 self.eps = eps self.m = None self.v = None self.t = 0 def update(self, params, grads): if self.m is None: self.m = np.zeros_like(params) self.v = np.zeros_like(params) self.t += 1 self.m = self.beta1 * self.m + (1 - self.beta1) * grads self.v = self.beta2 * self.v + (1 - self.beta2) * grads**2 m_hat = self.m / (1 - self.beta1**self.t) v_hat = self.v / (1 - self.beta2**self.t) return params - self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
Constrained Optimization
Equality Constraints
- Lagrangian:
def lagrangian_optimization(f, h, x0, lambda0):
def lagrangian(x_lambda):
x, lambda_ = x_lambda[:len(x0)], x_lambda[len(x0):]
return f(x) + np.sum(lambda_ * h(x))
result = minimize(lagrangian, np.concatenate([x0, lambda0]))
return result.x[:len(x0)], result.x[len(x0):]
Inequality Constraints
- KKT conditions:
def barrier_method(f, g, x0, mu=1.0, beta=0.5):
def barrier(x):
return f(x) - mu * np.sum(np.log(-g(x)))
x = x0
while mu > 1e-6:
result = minimize(barrier, x)
x = result.x
mu *= beta
return x
Applications in Machine Learning
Neural Networks
class NeuralNetworkOptimizer:
def __init__(self, model, optimizer='adam'):
self.model = model
self.optimizer = Adam() if optimizer == 'adam' else GradientDescent()
def train_step(self, X, y):
# Forward pass
predictions = self.model.forward(X)
loss = self.model.loss(predictions, y)
# Backward pass
gradients = self.model.backward()
# Update parameters
for param, grad in zip(self.model.parameters(), gradients):
param.data = self.optimizer.update(param.data, grad)
return loss
Support Vector Machines
def svm_dual_optimization(X, y, C=1.0):
n_samples = len(y)
K = np.dot(X, X.T) # kernel matrix
def dual_objective(alpha):
return -0.5 * np.sum(np.outer(alpha*y, alpha*y) * K) + np.sum(alpha)
constraints = [
{'type': 'eq', 'fun': lambda alpha: np.dot(alpha, y)},
{'type': 'ineq', 'fun': lambda alpha: alpha},
{'type': 'ineq', 'fun': lambda alpha: C - alpha}
]
result = minimize(dual_objective, np.zeros(n_samples),
constraints=constraints, method='SLSQP')
return result.x
Numerical Optimization
Line Search Methods
def backtracking_line_search(f, x, p, alpha=0.5, beta=0.8):
t = 1.0
while f(x + t*p) > f(x) + alpha*t*np.dot(grad(f, x), p):
t *= beta
return t
Trust Region Methods
def trust_region_subproblem(g, H, delta):
"""Solve the trust region subproblem"""
n = len(g)
def objective(p):
return np.dot(g, p) + 0.5 * np.dot(p, np.dot(H, p))
constraints = [{'type': 'ineq', 'fun': lambda p: delta - np.linalg.norm(p)}]
result = minimize(objective, np.zeros(n), constraints=constraints)
return result.x
Advanced Topics
Stochastic Optimization
def sgd_with_momentum(f, grad_f, x0, batch_size=32, n_epochs=100):
x = x0
v = np.zeros_like(x0)
for _ in range(n_epochs):
# Shuffle data
indices = np.random.permutation(len(data))
for i in range(0, len(data), batch_size):
batch_idx = indices[i:i+batch_size]
grad = grad_f(x, data[batch_idx])
x, v = momentum_update(x, v, grad)
return x
Meta-optimization
def grid_search_optimization(model, param_grid, X, y):
best_score = float('-inf')
best_params = None
for params in itertools.product(*param_grid.values()):
param_dict = dict(zip(param_grid.keys(), params))
model.set_params(**param_dict)
score = cross_val_score(model, X, y).mean()
if score > best_score:
best_score = score
best_params = param_dict
return best_params, best_score