Introduction
Machine learning is fundamentally mathematics. Understanding the mathematical foundations is essential not just for implementing algorithms, but for understanding why they work, how to debug them, and how to choose the right approach for your problem.
This comprehensive guide covers the mathematical concepts you need to understand machine learning: linear algebra, calculus, probability, and statistics.
Linear Algebra
Vectors and Matrices
import numpy as np

# Two example 3-vectors.
v = np.array([1, 2, 3])
w = np.array([4, 5, 6])

# Elementary vector operations.
dot_product = np.dot(v, w)      # scalar product: 1*4 + 2*5 + 3*6 = 32
cross_product = np.cross(v, w)  # vector perpendicular to v and w: [-3, 6, -3]
magnitude = np.linalg.norm(v)   # Euclidean length: sqrt(1^2 + 2^2 + 3^2)

# Elementary matrix operations on two 2x2 matrices.
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
C = np.matmul(A, B)                 # matrix product A·B
determinant = np.linalg.det(A)      # -2.0
inverse = np.linalg.inv(A)          # A @ inverse == identity
Matrix Decompositions
# Eigendecomposition: find (lambda, v) pairs with A v = lambda v.
A = np.array([[4, 2], [1, 3]])
eigenvalues, eigenvectors = np.linalg.eig(A)

# Singular Value Decomposition: A = U @ diag(S) @ Vt.
U, S, Vt = np.linalg.svd(A)
# Principal Component Analysis (PCA)
def pca(X, n_components):
    """Principal Component Analysis via eigendecomposition of the covariance.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
    n_components : int, number of principal components to keep

    Returns
    -------
    X_pca : projected data, shape (n_samples, n_components)
    components : principal directions as columns, shape (n_features, n_components)
    eigenvalues : all eigenvalues (explained variances), descending
    """
    # Center the data so the covariance is taken about the mean.
    X_centered = X - X.mean(axis=0)
    # Covariance matrix of the features (symmetric PSD).
    cov = np.cov(X_centered.T)
    # Fix: use eigh, not eig — the covariance is symmetric, so eigh is
    # numerically stable and guarantees real eigenvalues/eigenvectors
    # (eig can return spurious complex parts from rounding noise).
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    # eigh returns ascending order; flip to descending variance.
    idx = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    # Keep the top-variance directions and project the data onto them.
    components = eigenvectors[:, :n_components]
    X_pca = X_centered @ components
    return X_pca, components, eigenvalues
Applications in ML
| Concept | ML Application |
|---|---|
| Dot products | Similarity measures, attention mechanisms |
| Matrix multiplication | Neural network forward pass |
| Eigenvalues | PCA, dimensionality reduction |
| SVD | Recommender systems, compression |
| Decompositions | Solving linear systems, optimization |
Calculus
Derivatives and Gradients
# Gradient descent implementation
def gradient_descent(f, grad_f, x0, learning_rate=0.01, max_iter=1000, tol=1e-6):
    """Minimize a function by following the negative gradient.

    f            -- objective (kept for API symmetry; only grad_f is used)
    grad_f       -- gradient of the objective
    x0           -- starting point
    learning_rate, max_iter, tol -- step size, iteration cap, stop threshold
    """
    x = x0
    for _ in range(max_iter):
        candidate = x - learning_rate * grad_f(x)
        # Stop once successive iterates are (almost) identical.
        if np.linalg.norm(candidate - x) < tol:
            break
        x = candidate
    return x
# Example: minimize f(x) = x^2
def f(x):
    """Example objective: f(x) = x^2."""
    return x * x
def grad_f(x):
    """Derivative of the example objective: d/dx x^2 = 2x."""
    return x * 2
# Run the optimizer from a point far from the known minimum at x = 0.
x_min = gradient_descent(f, grad_f, x0=10.0)
print(f"Minimum at x = {x_min:.6f}")  # converges to ~0
Chain Rule and Backpropagation
# Simple neural network forward and backward pass
class LinearLayer:
    """Fully-connected layer: Y = X @ W + b.

    Caches the forward input so backward() can apply the chain rule.
    """

    def __init__(self, input_dim, output_dim):
        # Small random weights, zero biases.
        self.W = np.random.randn(input_dim, output_dim) * 0.01
        self.b = np.zeros(output_dim)
        self.X = None   # input cached by the last forward pass
        self.dW = None  # gradient of the loss w.r.t. W
        self.db = None  # gradient of the loss w.r.t. b

    def forward(self, X):
        # Remember the input; it is needed to form dW in backward().
        self.X = X
        return X @ self.W + self.b

    def backward(self, dY):
        """Given the upstream gradient dY, store dW/db and return dX."""
        batch = self.X.shape[0]
        # Parameter gradients, averaged over the batch.
        self.dW = self.X.T @ dY / batch
        self.db = np.sum(dY, axis=0) / batch
        # Gradient w.r.t. the layer input, for the previous layer.
        return dY @ self.W.T
class ReLU:
    """Elementwise rectifier max(0, x), caching its input for backprop."""

    def __init__(self):
        self.X = None  # input saved by forward(), read by backward()

    def forward(self, X):
        self.X = X
        return np.maximum(0, X)

    def backward(self, dY):
        # Pass the gradient through only where the input was strictly positive.
        return dY * (self.X > 0)
Optimization Concepts
Optimization Algorithms:
├── Gradient Descent - Basic optimization
├── Stochastic Gradient Descent - Mini-batch updates
├── Momentum - Accelerate convergence
├── Adam - Adaptive learning rates
├── RMSprop - Divide by gradient magnitude
└── Newton's Method - Second-order optimization
Probability
Probability Distributions
import scipy.stats as stats

# Standard normal: mean 0, standard deviation 1.
normal = stats.norm(loc=0, scale=1)
samples = normal.rvs(1000)  # 1000 random draws
pdf = normal.pdf(0)         # density at x = 0 (the peak)
cdf = normal.cdf(0)         # P(X <= 0) = 0.5 by symmetry

# A few other common families:
bernoulli = stats.bernoulli(p=0.7)               # binary outcomes
poisson = stats.poisson(mu=5)                    # count data
exponential = stats.expon(scale=1)               # time between events
lognormal = stats.lognorm(s=1, scale=np.exp(0))  # positive, right-skewed values
Bayesian Inference
# Bayesian updating
def bayesian_update(prior, likelihood, data):
    """Bayes' rule: P(H|D) = P(D|H) * P(H) / P(D).

    prior      -- P(H), belief before seeing the data
    likelihood -- P(D|H), probability of the data under the hypothesis
    data       -- P(D), marginal probability of the data
    """
    return (likelihood * prior) / data
# Worked example: spam detection with Bayes' rule.
#   P(spam) = 0.3,  P(words | spam) = 0.9,  P(words | not spam) = 0.1
prior_spam = 0.3
likelihood_spam = 0.9
likelihood_ham = 0.1

# Law of total probability:
# P(words) = P(words|spam)*P(spam) + P(words|not spam)*P(not spam)
p_words = likelihood_spam * prior_spam + likelihood_ham * (1 - prior_spam)

# Posterior probability the message is spam given the observed words.
posterior_spam = (likelihood_spam * prior_spam) / p_words
print(f"P(spam|words) = {posterior_spam:.3f}")
Maximum Likelihood Estimation
def mle_normal(data):
    """Maximum-likelihood (mean, std) estimates for a normal sample.

    Uses the biased 1/n variance — the MLE — rather than the 1/(n-1)
    sample variance.
    """
    count = len(data)
    mean_est = sum(data) / count
    squared_devs = [(value - mean_est) ** 2 for value in data]
    var_est = sum(squared_devs) / count
    return mean_est, np.sqrt(var_est)
# Example: fit a normal distribution to a small sample by maximum likelihood.
data = np.array([2.1, 2.5, 1.8, 2.3, 2.0])
mu, sigma = mle_normal(data)
# Fix: the output string contained mojibake ("ฮผ", "ฯ") from a bad
# encoding round-trip; restored the intended Greek letters.
print(f"Estimated: μ = {mu:.2f}, σ = {sigma:.2f}")
Statistics
Hypothesis Testing
from scipy import stats
# T-test for comparing means
def t_test(sample1, sample2):
    """Independent two-sample t-test; returns (t statistic, p-value)."""
    result = stats.ttest_ind(sample1, sample2)
    return result[0], result[1]
# Compare the mean scores of two independent groups.
group_a = [85, 87, 82, 86, 88, 85, 87]
group_b = [78, 82, 79, 81, 80, 83, 78]
t, p = t_test(group_a, group_b)
print(f"t-statistic: {t:.3f}, p-value: {p:.4f}")
# Confidence intervals
def confidence_interval(data, confidence=0.95):
    """Two-sided Student-t confidence interval for the mean.

    Returns (low, high) bounds at the requested confidence level.
    """
    n = len(data)
    center = np.mean(data)
    std_err = stats.sem(data)  # standard error of the mean
    # Critical t value for two-sided coverage with n-1 degrees of freedom.
    half_width = std_err * stats.t.ppf((1 + confidence) / 2, n - 1)
    return center - half_width, center + half_width
# 95% confidence interval for group A's mean score.
ci = confidence_interval(group_a)
print(f"95% CI: [{ci[0]:.2f}, {ci[1]:.2f}]")
Regression Analysis
# Simple linear regression
def linear_regression(X, y):
    """Ordinary least squares fit.

    Returns weights as [intercept, slope(s)...].
    """
    # Prepend a column of ones so the first weight is the intercept.
    design = np.c_[np.ones((X.shape[0], 1)), X]
    # Solve the normal equations (X^T X) w = X^T y directly.
    return np.linalg.solve(design.T @ design, design.T @ y)
# Fit a line to points lying roughly along y = 2x.
X = np.array([[1], [2], [3], [4], [5]])
y = np.array([2.1, 4.2, 6.1, 7.9, 10.2])
weights = linear_regression(X, y)
print(f"Intercept: {weights[0]:.3f}, Slope: {weights[1]:.3f}")
Regularization
# Ridge regression (L2 regularization)
def ridge_regression(X, y, alpha=1.0):
    """Ridge regression (L2 penalty): w = (X^T X + alpha*I)^(-1) X^T y.

    Parameters
    ----------
    X : array (n_samples, n_features), without a bias column
    y : array (n_samples,)
    alpha : L2 regularization strength; 0 reduces to ordinary least squares

    Returns
    -------
    weights : array (n_features + 1,), as [intercept, coefficients...]
    """
    n, d = X.shape
    X_b = np.c_[np.ones((n, 1)), X]
    # Fix: do not penalize the intercept. The original added alpha to every
    # diagonal entry including the bias term, which shrinks the intercept
    # toward zero — standard ridge regularizes only the feature weights.
    penalty = alpha * np.eye(d + 1)
    penalty[0, 0] = 0.0
    XtX = X_b.T @ X_b + penalty
    Xty = X_b.T @ y
    return np.linalg.solve(XtX, Xty)
# Lasso regression (L1 regularization)
def lasso_regression(X, y, alpha=1.0, max_iter=1000):
    """Lasso regression (L1 penalty) via cyclic coordinate descent.

    Parameters
    ----------
    X : array (n_samples, n_features), without a bias column
    y : array (n_samples,)
    alpha : L1 regularization strength; 0 reduces to least squares
    max_iter : number of full coordinate-descent sweeps

    Returns
    -------
    weights : array (n_features + 1,), as [intercept, coefficients...]

    Bug fixed: the original body referenced ``X_b`` without ever building
    it, so every call raised NameError; the design matrix is now
    constructed from X here.
    """
    n, d = X.shape
    # Design matrix with a leading column of ones for the intercept.
    X_b = np.c_[np.ones((n, 1)), X]
    weights = np.zeros(d + 1)
    for _ in range(max_iter):
        for j in range(d + 1):
            # Partial residual: remove all contributions except feature j's.
            residual = y - X_b @ weights + X_b[:, j] * weights[j]
            rho_j = X_b[:, j] @ residual
            if j == 0:
                # Intercept is not penalized; plain least-squares update.
                weights[j] = rho_j / n
            else:
                # Soft-thresholding operator for the L1 penalty.
                weights[j] = np.sign(rho_j) * max(0, abs(rho_j) - alpha) / (X_b[:, j] @ X_b[:, j])
    return weights
Information Theory
Entropy and Information
def entropy(probabilities):
    """Shannon entropy in bits: H = -sum p*log2(p), skipping zero-probability terms."""
    total = 0.0
    for p in probabilities:
        if p > 0:
            total += p * np.log2(p)
    return -total
def kl_divergence(p, q):
    """Kullback-Leibler divergence D(p || q) in bits; p[i] == 0 terms contribute nothing."""
    total = 0.0
    for i, pi in enumerate(p):
        if pi > 0:
            total += pi * np.log2(pi / q[i])
    return total
def cross_entropy(p, q):
    """Cross entropy H(p, q) = -sum p*log2(q) in bits, skipping p[i] == 0 terms."""
    acc = 0.0
    for i, pi in enumerate(p):
        if pi > 0:
            acc -= pi * np.log2(q[i])
    return acc
# Entropy comparison: a fair coin is maximally uncertain (1 bit per flip),
# while a heavily biased coin carries far less uncertainty.
p_fair = [0.5, 0.5]
p_biased = [0.9, 0.1]
H_fair = entropy(p_fair)
H_biased = entropy(p_biased)
print(f"Entropy fair: {H_fair:.3f} bits")
print(f"Entropy biased: {H_biased:.3f} bits")
Applications in ML
| Concept | ML Application |
|---|---|
| Cross-entropy loss | Classification models |
| KL divergence | Variational autoencoders, distribution matching |
| Information gain | Decision trees |
| Mutual information | Feature selection |
Conclusion
The mathematical foundations of machine learning provide the framework for understanding how algorithms work and why. These concepts appear throughout modern ML:
- Linear Algebra: Neural networks, embeddings, dimensionality reduction
- Calculus: Optimization, backpropagation
- Probability: Bayesian methods, uncertainty quantification
- Statistics: Hypothesis testing, confidence intervals, regression
- Information Theory: Loss functions, feature selection
Mastering these fundamentals will make you a better ML practitioner — able to read papers, implement novel approaches, and debug existing systems.
Resources
- Mathematics for Machine Learning
- Deep Learning Book - Math Chapter
- 3Blue1Brown - Essence of Linear Algebra
Comments