Gradient Descent
Understanding and implementing various gradient descent optimization algorithms
Gradient descent is a fundamental optimization algorithm that minimizes a loss function by iteratively moving the parameters in the direction of the negative gradient. It underlies the training of most machine learning models.
Basic Gradient Descent
Mathematical Foundation
The update rule for gradient descent:

θ := θ − α ∇J(θ)

where:
- θ is the parameter vector
- α is the learning rate
- J(θ) is the loss function, and ∇J(θ) is its gradient with respect to θ
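For intuition, here is a single update step worked out on the one-dimensional loss J(θ) = θ², so ∇J(θ) = 2θ; the starting point and learning rate are illustrative.

    # One gradient descent step on J(theta) = theta**2, whose gradient is 2*theta
    theta = 4.0           # illustrative starting point
    alpha = 0.1           # illustrative learning rate
    gradient = 2 * theta  # gradient at theta = 4.0 is 8.0
    theta = theta - alpha * gradient
    print(theta)          # 4.0 - 0.1 * 8.0 = 3.2, one step closer to the minimum at 0

The batch implementation below applies the same rule to linear regression with a mean squared error cost.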
import numpy as np

def gradient_descent(X, y, theta, alpha, n_iterations):
    m = len(y)
    cost_history = []
    for i in range(n_iterations):
        # Compute predictions
        h = X @ theta
        # Compute gradients of the mean squared error cost
        gradients = (1/m) * X.T @ (h - y)
        # Update parameters
        theta = theta - alpha * gradients
        # Compute cost at the parameters used for this iteration's predictions
        cost = (1/(2*m)) * np.sum((h - y)**2)
        cost_history.append(cost)
    return theta, cost_history
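A minimal usage sketch, assuming synthetic linear-regression data with a bias column and illustrative hyperparameters (α = 0.1, 1000 iterations):

    import numpy as np

    # Synthetic linear data: y = 4 + 3x + noise
    rng = np.random.default_rng(0)
    x = 2 * rng.random(100)
    y = 4 + 3 * x + rng.normal(0, 0.5, 100)

    # Add a bias column and initialize parameters
    X = np.c_[np.ones(100), x]
    theta_init = np.zeros(2)

    theta, cost_history = gradient_descent(X, y, theta_init, alpha=0.1, n_iterations=1000)
    print(theta)  # should approach [4, 3]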
Variants of Gradient Descent
Stochastic Gradient Descent (SGD)
def sgd(X, y, theta, alpha, n_epochs):
    m = len(y)
    cost_history = []
    for epoch in range(n_epochs):
        # Shuffle data at the start of each epoch
        indices = np.random.permutation(m)
        X = X[indices]
        y = y[indices]
        for i in range(m):
            # Compute gradient for a single example
            h = X[i:i+1] @ theta
            gradient = X[i:i+1].T @ (h - y[i:i+1])
            # Update parameters
            theta = theta - alpha * gradient
        # Compute cost for the epoch on the full dataset
        h = X @ theta
        cost = (1/(2*m)) * np.sum((h - y)**2)
        cost_history.append(cost)
    return theta, cost_history
Mini-batch Gradient Descent
def minibatch_gradient_descent(X, y, theta, alpha, batch_size, n_epochs):
    m = len(y)
    cost_history = []
    for epoch in range(n_epochs):
        # Shuffle data at the start of each epoch
        indices = np.random.permutation(m)
        X = X[indices]
        y = y[indices]
        # Process mini-batches
        for i in range(0, m, batch_size):
            X_batch = X[i:i+batch_size]
            y_batch = y[i:i+batch_size]
            # Compute gradients, normalizing by the actual batch length
            # (the last batch may be smaller than batch_size)
            h = X_batch @ theta
            gradients = (1/len(X_batch)) * X_batch.T @ (h - y_batch)
            # Update parameters
            theta = theta - alpha * gradients
        # Compute cost for the epoch on the full dataset
        h = X @ theta
        cost = (1/(2*m)) * np.sum((h - y)**2)
        cost_history.append(cost)
    return theta, cost_history
Advanced Optimization Algorithms
Momentum
def momentum_gradient_descent(X, y, theta, alpha, beta=0.9, n_iterations=1000):
    m = len(y)
    velocity = np.zeros_like(theta)
    cost_history = []
    for i in range(n_iterations):
        # Compute gradients
        h = X @ theta
        gradients = (1/m) * X.T @ (h - y)
        # Update velocity
        velocity = beta * velocity + alpha * gradients
        # Update parameters
        theta = theta - velocity
        # Compute cost
        cost = (1/(2*m)) * np.sum((h - y)**2)
        cost_history.append(cost)
    return theta, cost_history
RMSprop
def rmsprop(X, y, theta, alpha=0.001, beta=0.9, epsilon=1e-8, n_iterations=1000):
    m = len(y)
    v = np.zeros_like(theta)
    cost_history = []
    for i in range(n_iterations):
        # Compute gradients
        h = X @ theta
        gradients = (1/m) * X.T @ (h - y)
        # Update accumulator of squared gradients
        v = beta * v + (1 - beta) * gradients**2
        # Update parameters
        theta = theta - (alpha / (np.sqrt(v) + epsilon)) * gradients
        # Compute cost
        cost = (1/(2*m)) * np.sum((h - y)**2)
        cost_history.append(cost)
    return theta, cost_history
Adam Optimizer
def adam(X, y, theta, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, n_iterations=1000):
    m = len(y)
    v = np.zeros_like(theta)  # Second moment estimate
    s = np.zeros_like(theta)  # First moment estimate
    cost_history = []
    for t in range(1, n_iterations + 1):
        # Compute gradients
        h = X @ theta
        gradients = (1/m) * X.T @ (h - y)
        # Update moment estimates
        s = beta1 * s + (1 - beta1) * gradients
        v = beta2 * v + (1 - beta2) * gradients**2
        # Compute bias-corrected moments
        s_corrected = s / (1 - beta1**t)
        v_corrected = v / (1 - beta2**t)
        # Update parameters
        theta = theta - alpha * s_corrected / (np.sqrt(v_corrected) + epsilon)
        # Compute cost
        cost = (1/(2*m)) * np.sum((h - y)**2)
        cost_history.append(cost)
    return theta, cost_history
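Assuming the synthetic X and y from the usage example above, the optimizers can be compared side by side; the learning rates below are illustrative, not tuned.

    runs = {
        'Batch GD': gradient_descent(X, y, np.zeros(2), alpha=0.1, n_iterations=1000),
        'Momentum': momentum_gradient_descent(X, y, np.zeros(2), alpha=0.1, n_iterations=1000),
        'RMSprop':  rmsprop(X, y, np.zeros(2), alpha=0.05, n_iterations=1000),
        'Adam':     adam(X, y, np.zeros(2), alpha=0.05, n_iterations=1000),
    }
    for name, (theta_final, costs) in runs.items():
        print(f"{name}: theta = {np.round(theta_final, 2)}, final cost = {costs[-1]:.4f}")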
Learning Rate Scheduling
Step Decay
def step_decay_schedule(initial_lr, drop_factor=0.5, epochs_drop=10):
    def schedule(epoch):
        return initial_lr * (drop_factor ** np.floor(epoch / epochs_drop))
    return schedule
Exponential Decay
def exponential_decay_schedule(initial_lr, decay_rate=0.1):
    def schedule(epoch):
        return initial_lr * np.exp(-decay_rate * epoch)
    return schedule
Cosine Annealing
def cosine_annealing_schedule(initial_lr, T_max):
    def schedule(epoch):
        return initial_lr * (1 + np.cos(np.pi * epoch / T_max)) / 2
    return schedule
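Each scheduler returns a function of the epoch index, so it can be dropped into any of the training loops above by recomputing the learning rate every epoch. A minimal sketch, assuming the synthetic X and y from the earlier usage example; the helper name gradient_descent_with_schedule is hypothetical.

    def gradient_descent_with_schedule(X, y, theta, schedule, n_epochs):
        # Same batch gradient step as gradient_descent, but the learning
        # rate is looked up from the schedule at every epoch.
        m = len(y)
        cost_history = []
        for epoch in range(n_epochs):
            alpha = schedule(epoch)
            h = X @ theta
            gradients = (1/m) * X.T @ (h - y)
            theta = theta - alpha * gradients
            cost_history.append((1/(2*m)) * np.sum((h - y)**2))
        return theta, cost_history

    # Example: cosine annealing from 0.1 down toward 0 over 500 epochs
    schedule = cosine_annealing_schedule(initial_lr=0.1, T_max=500)
    theta, cost_history = gradient_descent_with_schedule(X, y, np.zeros(2), schedule, n_epochs=500)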
Visualization Tools
import matplotlib.pyplot as plt

def plot_convergence(cost_history):
    plt.figure(figsize=(10, 6))
    plt.plot(cost_history)
    plt.xlabel('Iteration')
    plt.ylabel('Cost')
    plt.title('Convergence Curve')
    plt.yscale('log')
    return plt

def plot_parameter_trajectory(parameter_history):
    # parameter_history: array of shape (n_iterations, n_parameters)
    plt.figure(figsize=(10, 6))
    for i in range(parameter_history.shape[1]):
        plt.plot(parameter_history[:, i], label=f'Parameter {i}')
    plt.xlabel('Iteration')
    plt.ylabel('Parameter Value')
    plt.title('Parameter Trajectories')
    plt.legend()
    return plt
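A usage sketch, assuming X and y prepared as in the earlier synthetic-data example:

    theta, cost_history = gradient_descent(X, y, np.zeros(2), alpha=0.1, n_iterations=1000)
    plot_convergence(cost_history).show()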
Best Practices
- Learning Rate Selection
import pandas as pd

def find_optimal_learning_rate(X, y, theta_init, lr_range=np.logspace(-4, 0, 100)):
    results = []
    for lr in lr_range:
        theta, cost_history = gradient_descent(
            X, y, theta_init.copy(),
            alpha=lr, n_iterations=100
        )
        results.append({
            'lr': lr,
            'final_cost': cost_history[-1],
            'convergence': len(cost_history) > 1 and
                           cost_history[-1] < cost_history[0]
        })
    return pd.DataFrame(results)
- Gradient Clipping
def clip_gradients(gradients, max_norm):
    norm = np.linalg.norm(gradients)
    if norm > max_norm:
        gradients = gradients * (max_norm / norm)
    return gradients
- Convergence Checking
def check_convergence(cost_history, tolerance=1e-6, window=10):
    if len(cost_history) < window:
        return False
    recent_costs = cost_history[-window:]
    cost_change = np.abs(np.diff(recent_costs)).mean()
    return cost_change < tolerance
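As a sketch of how these pieces fit together, the loop below combines gradient clipping with the convergence check as an early-stopping criterion; the helper name train_with_best_practices and the hyperparameters are illustrative.

    def train_with_best_practices(X, y, theta, alpha=0.1, max_norm=5.0, n_iterations=10000):
        m = len(y)
        cost_history = []
        for i in range(n_iterations):
            h = X @ theta
            gradients = (1/m) * X.T @ (h - y)
            # Keep the update bounded
            gradients = clip_gradients(gradients, max_norm)
            theta = theta - alpha * gradients
            cost_history.append((1/(2*m)) * np.sum((h - y)**2))
            # Stop early once the cost has stopped changing
            if check_convergence(cost_history):
                break
        return theta, cost_history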