Understand how the KNN algorithm classifies data based on similarity. This lesson explains distance metrics, choosing the right K value, and building accurate classification models.
K-Nearest Neighbors (KNN) is an intuitive algorithm based on a simple principle: similar things are close to each other. To classify a new data point, KNN looks at the K closest training examples and assigns the most common class among them.
Imagine you move to a new neighborhood and want to predict your political preference. KNN would look at your K nearest neighbors and predict you'll have the same preference as the majority of them.
Real-world applications include recommendation systems ("users similar to you liked..."), image recognition, and medical diagnosis based on patients with similar measurements.
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

# Create sample data: two Gaussian clusters centred at (2, 2) and (5, 5).
class_a = np.random.randn(20, 2) + np.array([2, 2])
class_b = np.random.randn(20, 2) + np.array([5, 5])

# New point to classify
new_point = np.array([3.5, 3.5])

# Radius of the K=5 neighborhood: the distance to the 5th-nearest training
# point. (A hard-coded radius would only approximate the true neighborhood.)
k = 5
all_points = np.vstack([class_a, class_b])
distances = np.linalg.norm(all_points - new_point, axis=1)
radius = np.sort(distances)[k - 1]

plt.figure(figsize=(8, 6))
plt.scatter(class_a[:, 0], class_a[:, 1], c='blue', s=100, label='Class A')
plt.scatter(class_b[:, 0], class_b[:, 1], c='red', s=100, label='Class B')
plt.scatter(new_point[0], new_point[1], c='green', s=200, marker='*',
            label='New Point', edgecolors='black', linewidths=2)

# Draw circle enclosing exactly the K=5 nearest neighbors
circle = plt.Circle((new_point[0], new_point[1]), radius, fill=False,
                    linestyle='--', color='gray', label='K=5 Neighborhood')
plt.gca().add_patch(circle)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('KNN Classification: Finding Nearest Neighbors')
plt.legend()
plt.axis('equal')
plt.show()
The algorithm finds the K points closest to the green star and uses their classes to make a prediction.
KNN relies on distance to determine "nearness." The most common distance metrics are:
d = √[(x₂-x₁)² + (y₂-y₁)² + ... + (z₂-z₁)²]
This is the straight-line distance, like measuring with a ruler.
d = |x₂-x₁| + |y₂-y₁| + ... + |z₂-z₁|
This measures distance along axes only, like walking on a city grid.
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances
import numpy as np

# Two sample points, shaped (1, 2) because the sklearn pairwise helpers
# expect 2-D arrays (rows = samples, columns = features).
p = np.array([[1, 2]])
q = np.array([[4, 6]])

# Each helper returns a 1x1 pairwise-distance matrix; unpack the scalar.
euclidean = euclidean_distances(p, q)[0, 0]
manhattan = manhattan_distances(p, q)[0, 0]

print(f"Euclidean Distance: {euclidean:.2f}")
print(f"Manhattan Distance: {manhattan:.2f}")
Output:
Euclidean Distance: 5.00
Manhattan Distance: 7.00
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_iris

# Load the Iris dataset: flower measurements labelled with one of
# three species.
iris = load_iris()
X, y = iris.data, iris.target

# Show what we are working with.
print(f"Features: {iris.feature_names}")
print(f"Classes: {iris.target_names}")
print(f"Dataset shape: {X.shape}")
The Iris dataset contains flower measurements for three species, making it perfect for demonstrating multi-class classification.
# Hold out 20% of the data for testing; stratify=y preserves the class
# proportions in both splits, and random_state=42 makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Scale features - crucial for KNN
# Fit the scaler on the training set only, then reuse the same transform
# on the test set, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
Important: Always scale features before using KNN. Since KNN relies on distance, features with larger scales would dominate the distance calculation unfairly.
# Create and train KNN classifier with K=5
# Note: fit() on KNN simply stores the (scaled) training data; the
# neighbor search happens later, at prediction time.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
The n_neighbors parameter specifies how many neighbors to consider when making predictions.
# Predict on the held-out test set (scaled with the training scaler).
y_pred = knn.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
# Per-class precision/recall/F1, labelled with the species names.
print(classification_report(y_test, y_pred, target_names=iris.target_names))
# Classify one unseen flower from its four raw measurements.
sample = np.array([[5.0, 3.5, 1.5, 0.3]])

# Apply the exact scaling fitted on the training data.
sample_scaled = scaler.transform(sample)

# Hard class prediction plus the per-class neighbor-vote proportions.
pred = knn.predict(sample_scaled)
proba = knn.predict_proba(sample_scaled)

print(f"Predicted class: {iris.target_names[pred[0]]}")
print(f"Class probabilities: {proba[0].round(3)}")
The predict_proba() method returns the proportion of each class among the K neighbors.
The choice of K significantly impacts model performance:
from sklearn.model_selection import cross_val_score

# Score K = 1..30 with 5-fold cross-validation on the training data.
k_values = range(1, 31)
cv_scores = [
    cross_val_score(KNeighborsClassifier(n_neighbors=k),
                    X_train_scaled, y_train, cv=5).mean()
    for k in k_values
]

# Plot results
plt.figure(figsize=(10, 6))
plt.plot(k_values, cv_scores, 'bo-')
plt.xlabel('K (Number of Neighbors)')
plt.ylabel('Cross-Validation Accuracy')
plt.title('Finding Optimal K Value')
plt.xticks(k_values[::2])
plt.grid(True, alpha=0.3)
plt.show()

# Keep the K with the highest mean CV accuracy.
best_k = k_values[np.argmax(cv_scores)]
print(f"Best K: {best_k} with accuracy: {max(cv_scores):.4f}")
This approach tests multiple K values and selects the one with the highest cross-validation accuracy.
from sklearn.datasets import make_classification

# Two informative features so the decision boundary can be drawn in 2-D.
X_2d, y_2d = make_classification(
    n_samples=200, n_features=2, n_informative=2,
    n_redundant=0, n_clusters_per_class=1, random_state=42
)

# Build one mesh grid covering the data (with a 1-unit margin); all
# three models share the same plotting grid.
x_min, x_max = X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1
y_min, y_max = X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))
grid = np.c_[xx.ravel(), yy.ravel()]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
k_values = [1, 5, 15]
for ax, k in zip(axes, k_values):
    # Fit on the full 2-D set and colour the plane by predicted class.
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_2d, y_2d)
    Z = model.predict(grid).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
    ax.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap='coolwarm',
               edgecolors='black')
    ax.set_title(f'K = {k}')
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
plt.tight_layout()
plt.show()
Notice how K=1 creates jagged boundaries (overfitting), while K=15 creates smoother boundaries.
Standard KNN treats all K neighbors equally. Weighted KNN gives closer neighbors more influence.
# Compare uniform and distance-weighted KNN
# With weights='distance', each neighbor's vote is scaled by 1/distance,
# so closer neighbors count for more.
knn_uniform = KNeighborsClassifier(n_neighbors=5, weights='uniform')
knn_weighted = KNeighborsClassifier(n_neighbors=5, weights='distance')

for model in (knn_uniform, knn_weighted):
    model.fit(X_train_scaled, y_train)

print(f"Uniform weights accuracy: {knn_uniform.score(X_test_scaled, y_test):.4f}")
print(f"Distance weights accuracy: {knn_weighted.score(X_test_scaled, y_test):.4f}")
Weights options:
'uniform': All neighbors weighted equally. 'distance': Closer neighbors have more influence (weight = 1/distance). KNN can also perform regression by averaging the target values of K neighbors instead of voting.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score

# Synthetic one-feature regression problem with Gaussian noise.
X_reg, y_reg = make_regression(n_samples=200, n_features=1, noise=20, random_state=42)

# Same 80/20 split convention used for the classification example.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# A KNN regressor predicts the mean target value of the 5 nearest neighbors.
knn_reg = KNeighborsRegressor(n_neighbors=5)
knn_reg.fit(X_train_reg, y_train_reg)

# Evaluate on the held-out data.
y_pred_reg = knn_reg.predict(X_test_reg)
print(f"R² Score: {r2_score(y_test_reg, y_pred_reg):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_reg, y_pred_reg)):.4f}")
# Complete KNN pipeline with best practices
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Bundling the scaler with the classifier means every CV fold is scaled
# using only its own training portion - no data leakage.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# Search over K, the vote weighting, and the distance metric
# (parameters are addressed as '<step name>__<parameter>').
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan']
}

# 5-fold grid search on the raw (unscaled) training data; the pipeline
# performs the scaling inside each fold.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best accuracy: {grid_search.best_score_:.4f}")
Using a pipeline ensures scaling is applied consistently and prevents data leakage.
K-Nearest Neighbors is a simple yet effective algorithm that classifies based on similarity to nearby training examples.
Key takeaways:
Learn how Logistic Regression predicts binary outcomes using probability-based decision boundaries. This lesson covers theory, implementation, and practical use cases like spam detection and churn prediction.
Discover the Naive Bayes classifier, a fast and powerful algorithm based on probability and Bayes’ theorem. This lesson shows how it excels in text classification and other high‑dimensional tasks.
Master Support Vector Machines for high‑accuracy classification. Learn how SVMs create optimal boundaries and handle linear and nonlinear data with kernel functions.