Derivatives and gradients are central to understanding how machine learning models learn and optimize. This section explains how derivatives measure the rate of change of functions and how gradients extend this concept to multiple variables, guiding algorithms like gradient descent to minimize loss and improve model performance.
A derivative measures how a function changes as its input changes. In ML, we use derivatives to understand how the loss function changes as we adjust model parameters.
Intuition: The derivative tells us the "slope" or "rate of change" at any point.
import numpy as np

# Simple function: f(x) = x²
def f(x):
    """Return x squared (the demo function whose slope we measure)."""
    return x ** 2

# Its derivative: f'(x) = 2x
def f_derivative(x):
    """Analytical derivative of f: f'(x) = 2x."""
    return 2 * x

# Numerical approximation of derivative
def numerical_derivative(func, x, h=1e-7):
    """Approximate func'(x) with a central difference.

    The symmetric quotient (f(x+h) - f(x-h)) / (2h) has O(h²) truncation
    error, versus O(h) for the one-sided forward difference, so it is far
    more accurate for the same step size h.
    """
    return (func(x + h) - func(x - h)) / (2 * h)

# Compare at x = 3
x = 3
analytical = f_derivative(x)
numerical = numerical_derivative(f, x)
print(f"At x = {x}:")
print(f"Analytical derivative: {analytical}")
print(f"Numerical derivative: {numerical:.6f}")
Understanding these differentiation rules—the power rule and the chain rule—helps you interpret how ML optimizers compute parameter updates:
import numpy as np

# Power rule: d/dx(x^n) = n*x^(n-1)
def power_function(x, n):
    """Evaluate x raised to the power n."""
    return x ** n

def power_derivative(x, n):
    """Power rule: derivative of x^n is n * x^(n-1)."""
    dropped_exponent = n - 1
    return n * x ** dropped_exponent

print("Power Rule: d/dx(x³) at x=2")
print(f"Result: {power_derivative(2, 3)}")  # 3 * 2² = 12

# Chain rule: d/dx(f(g(x))) = f'(g(x)) * g'(x)
# Example: d/dx((x²+1)³)
def composite(x):
    """Composite function (x² + 1)³, i.e. f(g(x)) with f(u)=u³, g(x)=x²+1."""
    inner = x ** 2 + 1
    return inner ** 3

def composite_derivative(x):
    """Chain rule applied to (x² + 1)³.

    With f(u) = u³ (so f'(u) = 3u²) and g(x) = x² + 1 (so g'(x) = 2x),
    the derivative is 3(x² + 1)² · 2x.
    """
    inner = x ** 2 + 1
    outer_slope = 3 * inner ** 2
    inner_slope = 2 * x
    return outer_slope * inner_slope

x = 2
print(f"\nChain Rule at x={x}:")
print(f"Analytical: {composite_derivative(x)}")
When a function has multiple variables, we take partial derivatives with respect to each variable separately.
import numpy as np

# Function of two variables: f(x, y) = x² + 3xy + y²
def f(x, y):
    """Bivariate quadratic used for the partial-derivative demo."""
    return x**2 + 3*x*y + y**2

# Partial derivatives
# ∂f/∂x = 2x + 3y
# ∂f/∂y = 3x + 2y
def partial_x(x, y):
    """Analytical ∂f/∂x = 2x + 3y."""
    return 2*x + 3*y

def partial_y(x, y):
    """Analytical ∂f/∂y = 3x + 2y."""
    return 3*x + 2*y

# Numerical verification via central differences: the symmetric quotient
# (f(x+h) - f(x-h)) / (2h) has O(h²) error, versus O(h) for the one-sided
# forward difference, so the match with the analytical values is tighter.
def numerical_partial_x(x, y, h=1e-7):
    """Central-difference approximation of ∂f/∂x at (x, y)."""
    return (f(x + h, y) - f(x - h, y)) / (2 * h)

def numerical_partial_y(x, y, h=1e-7):
    """Central-difference approximation of ∂f/∂y at (x, y)."""
    return (f(x, y + h) - f(x, y - h)) / (2 * h)

x, y = 2, 3
print(f"At point ({x}, {y}):")
print(f"∂f/∂x analytical: {partial_x(x, y)}, numerical: {numerical_partial_x(x, y):.4f}")
print(f"∂f/∂y analytical: {partial_y(x, y)}, numerical: {numerical_partial_y(x, y):.4f}")
The gradient is a vector of all partial derivatives. It points in the direction of steepest increase.
import numpy as np

# Function: f(x, y) = x² + y²
def f(params):
    """Evaluate f(x, y) = x² + y² for a 2-element params sequence."""
    x, y = params
    return x * x + y * y

# Gradient: ∇f = [2x, 2y]
def gradient(params):
    """Return the gradient vector ∇f = [2x, 2y] as a NumPy array."""
    x, y = params
    grad_x = 2 * x
    grad_y = 2 * y
    return np.array([grad_x, grad_y])

# Example: gradient at point (3, 4)
point = np.array([3.0, 4.0])
grad = gradient(point)
print(f"Point: {point}")
print(f"Function value: {f(point)}")
print(f"Gradient: {grad}")
print(f"Gradient magnitude: {np.linalg.norm(grad):.4f}")
Key Insight: To minimize a function, we move in the opposite direction of the gradient (steepest descent).
In ML, we compute gradients of the loss function with respect to model parameters.
import numpy as np

# Simple linear model: y = wx + b
# Loss function: MSE = (1/n) Σ(y_pred - y_true)²
def mse_loss(w, b, X, y):
    """Mean squared error of the linear model w*X + b against targets y."""
    residual = (w * X + b) - y
    return np.mean(residual ** 2)

def mse_gradients(w, b, X, y):
    """Gradients of the MSE with respect to w and b.

    ∂Loss/∂w = (2/n) Σ residual·x   and   ∂Loss/∂b = (2/n) Σ residual,
    where residual = (w*x + b) - y_true for each sample.
    """
    n = len(X)
    residual = (w * X + b) - y
    scale = 2 / n
    dw = scale * np.sum(residual * X)
    db = scale * np.sum(residual)
    return dw, db

# Example data
X = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 5, 4, 5])

# Current parameters
w, b = 0.5, 1.0
loss = mse_loss(w, b, X, y)
dw, db = mse_gradients(w, b, X, y)
print(f"Current w={w}, b={b}")
print(f"Loss: {loss:.4f}")
print(f"Gradient w.r.t. w: {dw:.4f}")
print(f"Gradient w.r.t. b: {db:.4f}")
Deep learning relies heavily on the chain rule for backpropagation.
import numpy as np

# Simple two-layer computation for the backprop demo:
#   Layer 1: z = wx + b
#   Layer 2: a = sigmoid(z)
#   Loss:    L = (a - y)²
def sigmoid(z):
    """Logistic sigmoid: 1 / (1 + e^(-z))."""
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(z):
    """Derivative of the sigmoid: σ(z) · (1 - σ(z))."""
    activated = sigmoid(z)
    return activated * (1 - activated)

# Forward pass
x, y_true = 2.0, 1.0
w, b = 0.5, 0.1
z = w * x + b              # linear layer output
a = sigmoid(z)             # activation
loss = (a - y_true) ** 2   # squared error
print("Forward pass:")
print(f"z = {z:.4f}, a = {a:.4f}, loss = {loss:.4f}")

# Backward pass: apply the chain rule one local factor at a time.
dL_da = 2 * (a - y_true)       # dL/da = 2(a - y_true)
da_dz = sigmoid_derivative(z)  # da/dz = sigmoid'(z)
dz_dw = x                      # dz/dw = x
dz_db = 1                      # dz/db = 1
# Chain rule: dL/dw = dL/da * da/dz * dz/dw (and likewise for b)
dL_dw = dL_da * da_dz * dz_dw
dL_db = dL_da * da_dz * dz_db
print("\nBackward pass (gradients):")
print(f"dL/dw = {dL_dw:.4f}")
print(f"dL/db = {dL_db:.4f}")