Gradient descent is a core optimization technique in machine learning that iteratively adjusts model parameters to minimize error. This section explains how gradients guide the direction and size of updates, the concept of learning rates, and how gradient descent underpins the training of models from linear regression to deep neural networks.
Gradient descent is an iterative optimization algorithm that finds the minimum of a function by repeatedly taking steps in the direction opposite to the gradient.
Update Rule:
θ_new = θ_old - α × ∇f(θ_old)
Where α (alpha) is the learning rate—the step size.
import numpy as np
# Simple example: minimize f(x) = x²
# Minimum is at x = 0
def f(x):
    """Objective function: f(x) = x squared."""
    return x * x
def gradient(x):
    """Analytic derivative of f: f'(x) = 2x."""
    return x * 2
# Gradient descent demo: walk downhill on f(x) = x^2 from a far start point.
x = 10.0  # Starting point, well away from the optimum at 0
learning_rate = 0.1
iterations = 50

print("Gradient Descent: Minimizing f(x) = x²")
print(f"Starting at x = {x}")
for i in range(iterations):
    # Step against the gradient.
    x -= learning_rate * gradient(x)
    show_head = i < 5
    show_tail = i >= iterations - 3
    if show_head or show_tail:
        print(f"Iteration {i+1}: x = {x:.6f}, f(x) = {f(x):.6f}")
    elif i == 5:
        print("...")
print(f"\nFinal: x = {x:.6f} (optimal is 0)")
import numpy as np
def gradient_descent_with_history(f, gradient, x_start, lr, n_iter):
    """Run gradient descent from x_start and record every iterate.

    Returns (final_x, history) where history maps 'x' and 'f' to lists
    of length n_iter + 1 (the starting point is included).
    """
    history = {'x': [x_start], 'f': [f(x_start)]}
    x = x_start
    for _ in range(n_iter):
        x -= lr * gradient(x)
        history['x'].append(x)
        history['f'].append(f(x))
    return x, history
# Minimize f(x) = (x-3)² + 1
def f(x):
    """Shifted parabola with minimum value 1 at x = 3."""
    d = x - 3
    return d * d + 1
def gradient(x):
    """Derivative of (x - 3)^2 + 1."""
    return (x - 3) * 2
# Run GD on the shifted parabola and print every fifth recorded step.
x_final, history = gradient_descent_with_history(
    f, gradient, x_start=10, lr=0.1, n_iter=30
)
print("Gradient Descent Progress:")
print(f"{'Step':<6}{'x':<12}{'f(x)':<12}{'Gradient':<12}")
print("-" * 42)
for step in range(0, len(history['x']), 5):
    x_step = history['x'][step]
    f_step = history['f'][step]
    g_step = gradient(x_step)
    print(f"{step:<6}{x_step:<12.4f}{f_step:<12.4f}{g_step:<12.4f}")
The learning rate critically affects convergence.
import numpy as np
def f(x):
    """Quadratic objective f(x) = x squared."""
    return x * x
def gradient(x):
    """Slope of x^2, i.e. 2x."""
    return x + x
def run_gd(learning_rate, n_iter=20, x_start=10.0):
    """Run gradient descent using the module-level ``gradient`` function.

    Generalized: the starting point is now a parameter instead of being
    hard-coded, with a default of 10.0 so existing callers are unchanged.

    Parameters:
        learning_rate: step size for each update.
        n_iter: number of update steps (default 20).
        x_start: initial parameter value (default 10.0).

    Returns:
        List of iterates of length n_iter + 1, starting with x_start.
    """
    x = x_start
    history = [x]
    for _ in range(n_iter):
        x = x - learning_rate * gradient(x)
        history.append(x)
    return history
# Sweep a range of learning rates and classify the outcome of each run.
learning_rates = [0.01, 0.1, 0.5, 0.9, 1.1]
print("Impact of Learning Rate:")
print(f"{'LR':<8}{'Final x':<15}{'Converged?':<15}")
print("-" * 38)
for lr in learning_rates:
    history = run_gd(lr)
    final_x = history[-1]
    if abs(final_x) < 0.01:
        converged = "Yes"
    elif abs(final_x) > 10:
        converged = "No (diverged)"
    else:
        converged = "Slow"
    print(f"{lr:<8}{final_x:<15.6f}{converged:<15}")
Key Observations: a very small learning rate (0.01) makes progress slowly; moderate rates (0.1–0.5) converge quickly; at 0.9 the iterates overshoot and oscillate around the minimum but still shrink toward it; and a rate above 1.0 (here 1.1) makes each update overshoot by more than the previous error, so the iterates diverge.
import numpy as np
# Generate sample data
# Synthetic 1-D regression set: y = 4 + 3x + Gaussian noise (sigma = 0.5).
# NOTE: the seed and the exact order of the two random draws below fix the
# dataset — do not reorder these calls.
np.random.seed(42)
X = 2 * np.random.rand(100)
y = 4 + 3 * X + np.random.randn(100) * 0.5
def compute_loss(w, b, X, y):
predictions = w * X + b
return np.mean((predictions - y) ** 2)
def compute_gradients(w, b, X, y):
    """Gradients of the MSE loss with respect to w and b."""
    errors = (w * X + b) - y
    dw = 2.0 * np.mean(X * errors)
    db = 2.0 * np.mean(errors)
    return dw, db
# Batch Gradient Descent (uses all data)
def batch_gd(X, y, lr=0.1, n_iter=100):
    """Gradient descent where every update uses the full dataset."""
    w = b = 0.0
    for _ in range(n_iter):
        dw, db = compute_gradients(w, b, X, y)
        w, b = w - lr * dw, b - lr * db
    return w, b
# Stochastic Gradient Descent (uses one sample)
def stochastic_gd(X, y, lr=0.01, n_iter=1000):
    """SGD: each update uses one randomly chosen sample."""
    w = b = 0.0
    n = len(X)
    for _ in range(n_iter):
        # Slice (not index) keeps the sample as a length-1 array.
        idx = np.random.randint(n)
        dw, db = compute_gradients(w, b, X[idx:idx+1], y[idx:idx+1])
        w -= lr * dw
        b -= lr * db
    return w, b
# Mini-Batch Gradient Descent (uses subset)
def minibatch_gd(X, y, batch_size=16, lr=0.05, n_iter=200):
    """Mini-batch GD: each update uses a random subset of the data."""
    w = b = 0.0
    n = len(X)
    for _ in range(n_iter):
        batch = np.random.choice(n, batch_size, replace=False)
        dw, db = compute_gradients(w, b, X[batch], y[batch])
        w -= lr * dw
        b -= lr * db
    return w, b
# Compare methods
# Run each variant once and report the fitted parameters plus final loss.
print("Comparing Gradient Descent Variants:")
print("True values: w=3, b=4\n")
methods = [
    ("Batch GD", batch_gd(X, y)),
    ("Stochastic GD", stochastic_gd(X, y)),
    ("Mini-Batch GD", minibatch_gd(X, y)),
]
for name, (w, b) in methods:
    loss = compute_loss(w, b, X, y)
    print(f"{name:15s}: w={w:.4f}, b={b:.4f}, Loss={loss:.4f}")
Momentum: Accumulates velocity to speed through flat regions and dampen oscillations.
import numpy as np
def gradient_descent_momentum(f, gradient, x_start, lr, momentum, n_iter):
    """Gradient descent with a classical momentum (velocity) term.

    Note: ``f`` is accepted for interface symmetry with the plain-GD
    helper but is never evaluated here.
    """
    x, velocity = x_start, 0
    history = [x]
    for _ in range(n_iter):
        # Blend the previous velocity with the fresh gradient step.
        velocity = momentum * velocity - lr * gradient(x)
        x += velocity
        history.append(x)
    return x, history
# Compare standard GD vs momentum
def f(x):
    """Benchmark objective: x squared."""
    return x * x
def gradient(x):
    """Derivative of x^2."""
    return x * 2
# Run both optimizers from the same start point so the comparison is fair.
x_standard, hist_standard = gradient_descent_with_history(
    f, gradient, x_start=10, lr=0.1, n_iter=20
)
# NOTE(review): with momentum=0.9 the iterates carry velocity and can
# overshoot; after only 20 steps the momentum run may end farther from
# the optimum than plain GD — confirm from the printed output.
x_momentum, hist_momentum = gradient_descent_momentum(
    f, gradient, x_start=10, lr=0.1, momentum=0.9, n_iter=20
)
print("Comparison: Standard GD vs Momentum")
print(f"Standard GD final x: {x_standard:.6f}")
print(f"Momentum GD final x: {x_momentum:.6f}")
import numpy as np
class LinearRegressionGD:
    """Linear regression trained with batch gradient descent on MSE loss."""

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None      # learned coefficient vector, set by fit()
        self.bias = None         # learned intercept, set by fit()
        self.loss_history = []   # per-iteration MSE, populated by fit()

    def fit(self, X, y):
        """Fit weights and bias to (X, y) by gradient descent.

        X: array of shape (n_samples, n_features)
        y: array of shape (n_samples,)
        Returns self, so calls can be chained.
        """
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        # Fix: reset the history so repeated fit() calls do not keep
        # appending to losses from earlier runs.
        self.loss_history = []
        for _ in range(self.n_iter):
            # Forward pass
            y_pred = np.dot(X, self.weights) + self.bias
            # Record the current MSE before updating the parameters.
            loss = np.mean((y_pred - y) ** 2)
            self.loss_history.append(loss)
            # Gradients of the MSE with respect to weights and bias.
            dw = (2 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (2 / n_samples) * np.sum(y_pred - y)
            # Step against the gradient.
            self.weights -= self.lr * dw
            self.bias -= self.lr * db
        return self

    def predict(self, X):
        """Return predictions X @ weights + bias for fitted parameters."""
        return np.dot(X, self.weights) + self.bias
# Generate sample data
# Two-feature synthetic regression problem with known ground truth
# (weights [3, -2], bias 5, noise sigma = 0.5). The seed and the order
# of the two random draws determine the dataset; keep them unchanged.
np.random.seed(42)
X = np.random.randn(100, 2)
true_weights = np.array([3, -2])
true_bias = 5
y = np.dot(X, true_weights) + true_bias + np.random.randn(100) * 0.5
# Train model
model = LinearRegressionGD(learning_rate=0.1, n_iterations=500)
model.fit(X, y)
print("Linear Regression with Gradient Descent")
print(f"True weights: {true_weights}, bias: {true_bias}")
print(f"Learned weights: {model.weights.round(4)}, bias: {model.bias:.4f}")
print(f"Initial loss: {model.loss_history[0]:.4f}")
print(f"Final loss: {model.loss_history[-1]:.4f}")
Modern deep learning uses sophisticated optimizers:
| Optimizer | Key Feature | When to Use |
|---|---|---|
| SGD | Basic, reliable | Simple problems, fine-tuning |
| SGD + Momentum | Faster convergence | Most neural networks |
| RMSprop | Adaptive learning rates | RNNs, non-stationary |
| Adam | Combines momentum + adaptive rates | Default choice for deep learning |
# Using optimizers with scikit-learn
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
# Generate data
# 5-feature synthetic regression with known coefficients [1..5].
np.random.seed(42)
X = np.random.randn(1000, 5)
y = np.dot(X, [1, 2, 3, 4, 5]) + np.random.randn(1000) * 0.5
# Scale features (important for gradient descent)
# Standardizing puts every feature on a comparable scale so one shared
# learning rate works for all coefficients.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# SGD Regressor
# tol=1e-3 lets training stop early once loss improvement stalls;
# random_state fixes the sample shuffling for reproducibility.
sgd = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
sgd.fit(X_scaled, y)
print("scikit-learn SGDRegressor:")
print(f"Learned coefficients: {sgd.coef_.round(3)}")
# NOTE(review): this f-string has no placeholders; a plain string would do.
print(f"True coefficients: [1, 2, 3, 4, 5]")
This tutorial covered the essential mathematical foundations for Machine Learning: