Gradient descent is a core optimization technique in machine learning that iteratively adjusts model parameters to minimize error. This section explains how gradients guide the direction and size of updates, the concept of learning rates, and how gradient descent underpins the training of models from linear regression to deep neural networks.
Gradient descent is an iterative optimization algorithm that finds the minimum of a function by repeatedly taking steps in the direction opposite to the gradient.
Update Rule:
θ_new = θ_old - α × ∇f(θ_old)
Where α (alpha) is the learning rate—the step size.
import numpy as np
# Simple example: minimize f(x) = x²
# Minimum is at x = 0
def f(x):
    """Objective function: f(x) = x squared."""
    return x * x
def gradient(x):
    """Analytic derivative of f: f'(x) = 2x."""
    return x * 2
# Gradient descent demo: walk downhill on f(x) = x^2 from a far start point.
x = 10.0  # Starting point, well away from the optimum at 0
learning_rate = 0.1
iterations = 50

print("Gradient Descent: Minimizing f(x) = x²")
print(f"Starting at x = {x}")
for i in range(iterations):
    # Step against the gradient.
    x -= learning_rate * gradient(x)
    show_head = i < 5
    show_tail = i >= iterations - 3
    if show_head or show_tail:
        print(f"Iteration {i+1}: x = {x:.6f}, f(x) = {f(x):.6f}")
    elif i == 5:
        print("...")
print(f"\nFinal: x = {x:.6f} (optimal is 0)")
import numpy as np
def gradient_descent_with_history(f, gradient, x_start, lr, n_iter):
    """Run gradient descent from x_start and record every iterate.

    Returns (final_x, history) where history maps 'x' and 'f' to lists
    of length n_iter + 1 (the starting point is included).
    """
    history = {'x': [x_start], 'f': [f(x_start)]}
    x = x_start
    for _ in range(n_iter):
        x -= lr * gradient(x)
        history['x'].append(x)
        history['f'].append(f(x))
    return x, history
# Minimize f(x) = (x-3)² + 1
def f(x):
    """Shifted parabola with minimum value 1 at x = 3."""
    d = x - 3
    return d * d + 1
def gradient(x):
    """Derivative of (x - 3)^2 + 1."""
    return (x - 3) * 2
# Run GD on the shifted parabola and print every fifth recorded step.
x_final, history = gradient_descent_with_history(
    f, gradient, x_start=10, lr=0.1, n_iter=30
)
print("Gradient Descent Progress:")
print(f"{'Step':<6}{'x':<12}{'f(x)':<12}{'Gradient':<12}")
print("-" * 42)
for step in range(0, len(history['x']), 5):
    x_step = history['x'][step]
    f_step = history['f'][step]
    g_step = gradient(x_step)
    print(f"{step:<6}{x_step:<12.4f}{f_step:<12.4f}{g_step:<12.4f}")
The learning rate critically affects convergence.
import numpy as np
def f(x):
    """Quadratic objective f(x) = x squared."""
    return x * x
def gradient(x):
    """Slope of x^2, i.e. 2x."""
    return x + x
def run_gd(learning_rate, n_iter=20, x_start=10.0):
    """Run gradient descent using the module-level ``gradient`` function.

    Generalized: the starting point is now a parameter instead of being
    hard-coded, with a default of 10.0 so existing callers are unchanged.

    Parameters:
        learning_rate: step size for each update.
        n_iter: number of update steps (default 20).
        x_start: initial parameter value (default 10.0).

    Returns:
        List of iterates of length n_iter + 1, starting with x_start.
    """
    x = x_start
    history = [x]
    for _ in range(n_iter):
        x = x - learning_rate * gradient(x)
        history.append(x)
    return history
# Sweep a range of learning rates and classify the outcome of each run.
learning_rates = [0.01, 0.1, 0.5, 0.9, 1.1]
print("Impact of Learning Rate:")
print(f"{'LR':<8}{'Final x':<15}{'Converged?':<15}")
print("-" * 38)
for lr in learning_rates:
    history = run_gd(lr)
    final_x = history[-1]
    if abs(final_x) < 0.01:
        converged = "Yes"
    elif abs(final_x) > 10:
        converged = "No (diverged)"
    else:
        converged = "Slow"
    print(f"{lr:<8}{final_x:<15.6f}{converged:<15}")
Key Observations: a very small learning rate (0.01) makes progress slowly; moderate rates (0.1–0.5) converge quickly; at 0.9 the iterates overshoot and oscillate around the minimum but still shrink toward it; and a rate above 1.0 (here 1.1) makes each update overshoot by more than the previous error, so the iterates diverge.
import numpy as np
# Generate sample data
# Synthetic 1-D regression set: y = 4 + 3x + Gaussian noise (sigma = 0.5).
# NOTE: the seed and the exact order of the two random draws below fix the
# dataset — do not reorder these calls.
np.random.seed(42)
X = 2 * np.random.rand(100)
y = 4 + 3 * X + np.random.randn(100) * 0.5
def compute_loss(w, b, X, y):
predictions = w * X + b
return np.mean((predictions - y) ** 2)
def compute_gradients(w, b, X, y):
    """Gradients of the MSE loss with respect to w and b."""
    errors = (w * X + b) - y
    dw = 2.0 * np.mean(X * errors)
    db = 2.0 * np.mean(errors)
    return dw, db
# Batch Gradient Descent (uses all data)
def batch_gd(X, y, lr=0.1, n_iter=100):
    """Gradient descent where every update uses the full dataset."""
    w = b = 0.0
    for _ in range(n_iter):
        dw, db = compute_gradients(w, b, X, y)
        w, b = w - lr * dw, b - lr * db
    return w, b
# Stochastic Gradient Descent (uses one sample)
def stochastic_gd(X, y, lr=0.01, n_iter=1000):
    """SGD: each update uses one randomly chosen sample."""
    w = b = 0.0
    n = len(X)
    for _ in range(n_iter):
        # Slice (not index) keeps the sample as a length-1 array.
        idx = np.random.randint(n)
        dw, db = compute_gradients(w, b, X[idx:idx+1], y[idx:idx+1])
        w -= lr * dw
        b -= lr * db
    return w, b
# Mini-Batch Gradient Descent (uses subset)
def minibatch_gd(X, y, batch_size=16, lr=0.05, n_iter=200):
    """Mini-batch GD: each update uses a random subset of the data."""
    w = b = 0.0
    n = len(X)
    for _ in range(n_iter):
        batch = np.random.choice(n, batch_size, replace=False)
        dw, db = compute_gradients(w, b, X[batch], y[batch])
        w -= lr * dw
        b -= lr * db
    return w, b
# Compare methods
# Run each variant once and report the fitted parameters plus final loss.
print("Comparing Gradient Descent Variants:")
print("True values: w=3, b=4\n")
methods = [
    ("Batch GD", batch_gd(X, y)),
    ("Stochastic GD", stochastic_gd(X, y)),
    ("Mini-Batch GD", minibatch_gd(X, y)),
]
for name, (w, b) in methods:
    loss = compute_loss(w, b, X, y)
    print(f"{name:15s}: w={w:.4f}, b={b:.4f}, Loss={loss:.4f}")
Momentum: Accumulates velocity to speed through flat regions and dampen oscillations.
import numpy as np
def gradient_descent_momentum(f, gradient, x_start, lr, momentum, n_iter):
    """Gradient descent with a classical momentum (velocity) term.

    Note: ``f`` is accepted for interface symmetry with the plain-GD
    helper but is never evaluated here.
    """
    x, velocity = x_start, 0
    history = [x]
    for _ in range(n_iter):
        # Blend the previous velocity with the fresh gradient step.
        velocity = momentum * velocity - lr * gradient(x)
        x += velocity
        history.append(x)
    return x, history
# Compare standard GD vs momentum
def f(x):
    """Benchmark objective: x squared."""
    return x * x
def gradient(x):
    """Derivative of x^2."""
    return x * 2
# Run both optimizers from the same start point so the comparison is fair.
x_standard, hist_standard = gradient_descent_with_history(
    f, gradient, x_start=10, lr=0.1, n_iter=20
)
# NOTE(review): with momentum=0.9 the iterates carry velocity and can
# overshoot; after only 20 steps the momentum run may end farther from
# the optimum than plain GD — confirm from the printed output.
x_momentum, hist_momentum = gradient_descent_momentum(
    f, gradient, x_start=10, lr=0.1, momentum=0.9, n_iter=20
)
print("Comparison: Standard GD vs Momentum")
print(f"Standard GD final x: {x_standard:.6f}")
print(f"Momentum GD final x: {x_momentum:.6f}")
import numpy as np
class LinearRegressionGD:
    """Linear regression trained with batch gradient descent on MSE loss."""

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate
        self.n_iter = n_iterations
        self.weights = None      # learned coefficient vector, set by fit()
        self.bias = None         # learned intercept, set by fit()
        self.loss_history = []   # per-iteration MSE, populated by fit()

    def fit(self, X, y):
        """Fit weights and bias to (X, y) by gradient descent.

        X: array of shape (n_samples, n_features)
        y: array of shape (n_samples,)
        Returns self, so calls can be chained.
        """
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        # Fix: reset the history so repeated fit() calls do not keep
        # appending to losses from earlier runs.
        self.loss_history = []
        for _ in range(self.n_iter):
            # Forward pass
            y_pred = np.dot(X, self.weights) + self.bias
            # Record the current MSE before updating the parameters.
            loss = np.mean((y_pred - y) ** 2)
            self.loss_history.append(loss)
            # Gradients of the MSE with respect to weights and bias.
            dw = (2 / n_samples) * np.dot(X.T, (y_pred - y))
            db = (2 / n_samples) * np.sum(y_pred - y)
            # Step against the gradient.
            self.weights -= self.lr * dw
            self.bias -= self.lr * db
        return self

    def predict(self, X):
        """Return predictions X @ weights + bias for fitted parameters."""
        return np.dot(X, self.weights) + self.bias
# Generate sample data
# Two-feature synthetic regression problem with known ground truth
# (weights [3, -2], bias 5, noise sigma = 0.5). The seed and the order
# of the two random draws determine the dataset; keep them unchanged.
np.random.seed(42)
X = np.random.randn(100, 2)
true_weights = np.array([3, -2])
true_bias = 5
y = np.dot(X, true_weights) + true_bias + np.random.randn(100) * 0.5
# Train model
model = LinearRegressionGD(learning_rate=0.1, n_iterations=500)
model.fit(X, y)
print("Linear Regression with Gradient Descent")
print(f"True weights: {true_weights}, bias: {true_bias}")
print(f"Learned weights: {model.weights.round(4)}, bias: {model.bias:.4f}")
print(f"Initial loss: {model.loss_history[0]:.4f}")
print(f"Final loss: {model.loss_history[-1]:.4f}")
Modern deep learning uses sophisticated optimizers:
| Optimizer | Key Feature | When to Use |
|---|---|---|
| SGD | Basic, reliable | Simple problems, fine-tuning |
| SGD + Momentum | Faster convergence | Most neural networks |
| RMSprop | Adaptive learning rates | RNNs, non-stationary |
| Adam | Combines momentum + adaptive rates | Default choice for deep learning |
# Using optimizers with scikit-learn
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
# Generate data
# 5-feature synthetic regression with known coefficients [1..5].
np.random.seed(42)
X = np.random.randn(1000, 5)
y = np.dot(X, [1, 2, 3, 4, 5]) + np.random.randn(1000) * 0.5
# Scale features (important for gradient descent)
# Standardizing puts every feature on a comparable scale so one shared
# learning rate works for all coefficients.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# SGD Regressor
# tol=1e-3 lets training stop early once loss improvement stalls;
# random_state fixes the sample shuffling for reproducibility.
sgd = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
sgd.fit(X_scaled, y)
print("scikit-learn SGDRegressor:")
print(f"Learned coefficients: {sgd.coef_.round(3)}")
# NOTE(review): this f-string has no placeholders; a plain string would do.
print(f"True coefficients: [1, 2, 3, 4, 5]")
This tutorial covered the essential mathematical foundations for Machine Learning: