Descriptive statistics summarize and describe the key features of a dataset, providing a foundation for data analysis in machine learning. This section covers measures of central tendency, dispersion, and data distribution, helping to identify patterns, detect anomalies, and inform preprocessing and modeling decisions.
Central tendency measures describe the value around which the data tends to cluster.
Mean (Average): The sum of all values divided by the count.
import numpy as np

data = np.array([23, 45, 67, 34, 89, 12, 54, 33, 78, 45])

# Mean via NumPy's built-in routine.
mean = np.mean(data)
print(f"Mean: {mean}")

# Same quantity from first principles: total divided by count.
manual_mean = data.sum() / len(data)
print(f"Manual calculation: {manual_mean}")
Median: The middle value when the data is sorted. It is more robust to outliers than the mean.
import numpy as np

# Two small samples that differ only in their largest value.
data_normal = np.array([10, 12, 14, 15, 16])
data_outlier = np.array([10, 12, 14, 15, 100])

# The single extreme value drags the mean upward, while the median —
# a positional statistic — barely moves.
print(f"Normal data - Mean: {np.mean(data_normal):.1f}, Median: {np.median(data_normal)}")
print(f"With outlier - Mean: {np.mean(data_outlier):.1f}, Median: {np.median(data_outlier)}")
Mode: The most frequently occurring value.
import numpy as np
from scipy import stats

data = np.array([1, 2, 2, 3, 3, 3, 4, 4, 5])

# keepdims=True keeps mode/count as indexable arrays across SciPy versions.
mode_result = stats.mode(data, keepdims=True)
print(f"Mode: {mode_result.mode[0]} (appears {mode_result.count[0]} times)")
Spread measures describe how dispersed the data is.
Variance and Standard Deviation:
import numpy as np

data = np.array([2, 4, 4, 4, 5, 5, 7, 9])

# Population variance: the mean of squared deviations from the mean.
variance = np.var(data)
print(f"Variance: {variance}")

# Standard deviation is the square root of the variance.
std_dev = np.std(data)
print(f"Standard Deviation: {std_dev}")

# Re-derive the variance by hand to show exactly what np.var computes.
mean = data.mean()
manual_variance = ((data - mean) ** 2).mean()
print(f"Manual variance: {manual_variance}")
Standard deviation is in the same units as the original data, making it more interpretable than variance.
Range and Interquartile Range (IQR):
import numpy as np

data = np.array([12, 15, 18, 22, 25, 28, 32, 35, 40, 45, 100])

# Full spread: distance between the extremes (sensitive to the outlier at 100).
data_range = data.max() - data.min()
print(f"Range: {data_range}")

# The three quartiles split the sorted data into four equal parts;
# np.percentile accepts them all in one vectorized call.
Q1, Q2, Q3 = np.percentile(data, [25, 50, 75])

# Interquartile range: the middle 50% of the data — robust to outliers.
IQR = Q3 - Q1
print(f"Q1: {Q1}, Q2 (Median): {Q2}, Q3: {Q3}")
print(f"IQR: {IQR}")
Skewness: Measures asymmetry of the distribution.
import numpy as np
from scipy import stats

# Three hand-built samples, one for each sign of skewness.
symmetric = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])           # skewness ≈ 0
right_skewed = np.array([1, 1, 2, 2, 2, 3, 3, 8, 15])       # long right tail → positive
left_skewed = np.array([1, 8, 13, 14, 14, 15, 15, 15, 16])  # long left tail → negative

print(f"Symmetric skewness: {stats.skew(symmetric):.3f}")
print(f"Right-skewed: {stats.skew(right_skewed):.3f}")
print(f"Left-skewed: {stats.skew(left_skewed):.3f}")
Kurtosis: Measures the "tailedness" of the distribution.
import numpy as np
from scipy import stats

# Seed the generator so the printed values are reproducible, matching the
# np.random.seed(42) convention used by the other examples in this file.
np.random.seed(42)

# Standard-normal sample: excess kurtosis (Fisher definition) ≈ 0.
normal = np.random.randn(1000)

# Mixture of 90% standard normal and 10% wide (std = 5) normal — the wide
# component produces heavy tails, so excess kurtosis is well above 0.
heavy_tailed = np.concatenate([np.random.randn(900), np.random.randn(100) * 5])

print(f"Normal kurtosis: {stats.kurtosis(normal):.3f}")
print(f"Heavy-tailed kurtosis: {stats.kurtosis(heavy_tailed):.3f}")
import numpy as np
import pandas as pd
from scipy import stats


def describe_data(data, name="Data"):
    """Generate comprehensive statistics for a dataset.

    Prints a labelled summary (count, mean, spread, quartiles, and shape
    statistics) and returns the same values as a dict.
    """
    summary = {
        'Count': len(data),
        'Mean': np.mean(data),
        'Std Dev': np.std(data),
        'Min': np.min(data),
        '25%': np.percentile(data, 25),
        'Median': np.median(data),
        '75%': np.percentile(data, 75),
        'Max': np.max(data),
        'Skewness': stats.skew(data),
        'Kurtosis': stats.kurtosis(data),
    }
    print(f"\n=== {name} Summary ===")
    for key, value in summary.items():
        # Floats are rounded to four decimals; the integer count prints as-is.
        if isinstance(value, float):
            print(f"{key}: {value:.4f}")
        else:
            print(f"{key}: {value}")
    return summary


# Example usage
np.random.seed(42)
sample_data = np.random.exponential(scale=2, size=1000)
describe_data(sample_data, "Exponential Sample")
Correlation measures the linear relationship between two variables.
import numpy as np

# Seed the generator so the sampled correlations are reproducible.
np.random.seed(42)
x = np.random.randn(100)

# Built from x plus noise → strong positive linear relationship.
y_positive = x * 0.8 + np.random.randn(100) * 0.3  # Positive correlation
# Built from -x plus noise → strong negative linear relationship.
y_negative = -x * 0.7 + np.random.randn(100) * 0.4  # Negative correlation
# Independent draws → no linear relationship with x.
y_none = np.random.randn(100)  # No correlation

# np.corrcoef returns a 2x2 correlation matrix; the off-diagonal entry
# [0, 1] is the Pearson coefficient between its two inputs.
corr_positive, corr_negative, corr_none = (
    np.corrcoef(x, y)[0, 1] for y in (y_positive, y_negative, y_none)
)

print(f"Positive correlation: {corr_positive:.3f}")
print(f"Negative correlation: {corr_negative:.3f}")
print(f"No correlation: {corr_none:.3f}")
Correlation Matrix for Multiple Features:
import numpy as np
import pandas as pd

# Feature matrix: three columns of independent standard-normal draws.
np.random.seed(42)
data = pd.DataFrame({
    'feature_1': np.random.randn(100),
    'feature_2': np.random.randn(100),
    'feature_3': np.random.randn(100),
})

# Blend feature_2 with feature_1 so the two columns become correlated.
data['feature_2'] = data['feature_1'] * 0.7 + data['feature_2'] * 0.3

# Pairwise Pearson correlations between every pair of columns.
corr_matrix = data.corr()
print("Correlation Matrix:")
print(corr_matrix.round(3))
Correlation matrices help identify redundant features and multicollinearity in ML datasets.
Probability fundamentals provide the framework for reasoning under uncertainty in machine learning. This section introduces key concepts such as random variables, probability distributions, conditional probability, and Bayes’ theorem, which are essential for modeling uncertainty, making predictions, and designing probabilistic algorithms.
Probability distributions describe how data values are spread and are essential for modeling and inference in machine learning. This section covers common distributions—such as normal, binomial, and uniform—and explains their role in understanding data, estimating probabilities, and building probabilistic models.