Apply classification algorithms to a real Customer Churn Prediction project. Learn to preprocess data, evaluate models, and build insights that help businesses reduce customer loss.
Customer churn prediction is one of the most important applications of classification in business. Predicting which customers are likely to leave allows companies to take proactive retention actions.
Acquiring new customers costs significantly more than retaining existing ones. By identifying at-risk customers early, businesses can step in with targeted retention actions before those customers leave.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
f1_score, confusion_matrix, classification_report,
roc_auc_score, roc_curve)
import warnings
# Session-wide setup. The three settings are independent of one another:
# print every DataFrame column, seed NumPy's global generator so runs are
# repeatable, and silence library warnings to keep the output readable.
pd.set_option('display.max_columns', None)
np.random.seed(42)
warnings.filterwarnings('ignore')
def generate_churn_data(n_samples=2000, random_state=42):
    """Generate a synthetic customer churn dataset.

    Every random draw comes from a private ``RandomState`` seeded with
    ``random_state``, so the function is reproducible without re-seeding
    NumPy's global generator as a side effect. The draws are made in the
    same order as before, so the default arguments still produce the same
    data as seeding the global generator with 42.

    Parameters:
    - n_samples: number of customer rows to generate
    - random_state: seed for the private random generator

    Returns:
    - DataFrame with one row per customer: an id, billing/usage/service
      columns, and a binary ``churn`` label (1 = churned)

    Note: the three add-on service columns are sampled independently of
    ``internet_service``, so 'No internet' can appear for customers that
    do have DSL or fiber.
    """
    rng = np.random.RandomState(random_state)
    service_levels = ['Yes', 'No', 'No internet']

    # The dict order fixes the order of the random draws - do not reorder.
    data = {
        'customer_id': range(1, n_samples + 1),
        'tenure_months': rng.randint(1, 72, n_samples),
        'monthly_charges': rng.uniform(20, 120, n_samples),
        'total_charges': np.zeros(n_samples),  # placeholder, filled in below
        'contract_type': rng.choice(['Month-to-month', 'One year', 'Two year'],
                                    n_samples, p=[0.5, 0.3, 0.2]),
        'payment_method': rng.choice(['Electronic check', 'Mailed check',
                                      'Bank transfer', 'Credit card'], n_samples),
        'internet_service': rng.choice(['DSL', 'Fiber optic', 'No'],
                                       n_samples, p=[0.35, 0.45, 0.2]),
        'online_security': rng.choice(service_levels, n_samples),
        'tech_support': rng.choice(service_levels, n_samples),
        'streaming_tv': rng.choice(service_levels, n_samples),
        'num_support_tickets': rng.poisson(2, n_samples),
        'num_referrals': rng.poisson(1, n_samples),
        'satisfaction_score': rng.randint(1, 6, n_samples),
    }
    df = pd.DataFrame(data)

    # Total charges: tenure x monthly charge, with +/-10% multiplicative noise
    df['total_charges'] = (
        df['tenure_months'] * df['monthly_charges'] * rng.uniform(0.9, 1.1, n_samples)
    )

    # Build the churn probability additively from the individual drivers
    churn_prob = pd.Series(0.0, index=df.index)
    # Higher churn for month-to-month contracts
    churn_prob += (df['contract_type'] == 'Month-to-month') * 0.25
    # Higher churn for shorter tenure
    churn_prob += (df['tenure_months'] < 12) * 0.15
    # Higher churn for high monthly charges
    churn_prob += (df['monthly_charges'] > 80) * 0.1
    # Higher churn for fiber optic (faster but more issues)
    churn_prob += (df['internet_service'] == 'Fiber optic') * 0.1
    # Lower churn with security and support
    churn_prob -= (df['online_security'] == 'Yes') * 0.1
    churn_prob -= (df['tech_support'] == 'Yes') * 0.1
    # Higher churn with more support tickets
    churn_prob += df['num_support_tickets'] * 0.03
    # Lower churn with referrals (engaged customers)
    churn_prob -= df['num_referrals'] * 0.05
    # Lower churn with higher satisfaction (a score of 3 is neutral)
    churn_prob -= (df['satisfaction_score'] - 3) * 0.08

    # Add noise, then keep every probability inside [0.05, 0.95]
    churn_prob += rng.uniform(-0.1, 0.1, n_samples)
    churn_prob = churn_prob.clip(0.05, 0.95)

    # Draw the churn labels from the per-customer probabilities
    df['churn'] = (rng.random_sample(n_samples) < churn_prob).astype(int)
    return df
# Build the working dataset and take a first look at it.
df = generate_churn_data(2000)
print(f"Dataset shape: {df.shape}")
print(df.head())

# Structural summary: shape, dtypes, missing values, descriptive statistics.
print("Dataset Info:")
print(f"Shape: {df.shape}")
for heading, summary in (
    ("Data Types", df.dtypes),
    ("Missing Values", df.isnull().sum()),
    ("Basic Statistics", df.describe()),
):
    print(f"\n{heading}:\n{summary}")
# Class balance of the target, shown as absolute counts and as percentages.
# value_counts() orders by frequency, so pin the order to [0, 1]; otherwise
# the 'Retained'/'Churned' labels would swap if churners were the majority.
churn_counts = df['churn'].value_counts().reindex([0, 1], fill_value=0)
class_labels = ['Retained', 'Churned']
class_colors = ['green', 'red']

plt.figure(figsize=(10, 4))

# Left panel: counts, with the value written above each bar
plt.subplot(1, 2, 1)
plt.bar(class_labels, churn_counts.values, color=class_colors, alpha=0.7)
plt.ylabel('Count')
plt.title('Customer Churn Distribution')
for i, v in enumerate(churn_counts.values):
    plt.text(i, v + 20, str(v), ha='center')

# Right panel: percentages. plt.pie() has no `alpha` keyword (passing one
# raises TypeError), so transparency is applied through wedgeprops.
plt.subplot(1, 2, 2)
plt.pie(churn_counts.values, labels=class_labels, autopct='%1.1f%%',
        colors=class_colors, wedgeprops={'alpha': 0.7})
plt.title('Churn Percentage')

plt.tight_layout()
plt.show()
print(f"Churn Rate: {df['churn'].mean()*100:.2f}%")
# Churn rate per category for each categorical column, one subplot per
# column, with the overall churn rate drawn as a dashed reference line.
categorical_cols = ['contract_type', 'payment_method', 'internet_service',
                    'online_security', 'tech_support']
overall_churn_rate = df['churn'].mean()

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()
for ax, col in zip(axes, categorical_cols):
    churn_by_cat = df.groupby(col)['churn'].mean().sort_values(ascending=False)
    positions = range(len(churn_by_cat))
    ax.bar(positions, churn_by_cat.values, color='coral', alpha=0.7)
    ax.set_xticks(positions)
    ax.set_xticklabels(churn_by_cat.index, rotation=45, ha='right')
    ax.set_ylabel('Churn Rate')
    ax.set_title(f'Churn Rate by {col}')
    ax.axhline(y=overall_churn_rate, color='red', linestyle='--', label='Overall Rate')
    # Without a legend the 'Overall Rate' label is never shown
    ax.legend()

# Hide whatever subplots the column list did not fill
for ax in axes[len(categorical_cols):]:
    ax.axis('off')
plt.tight_layout()
plt.show()
# Distribution of each numerical column, split by churn status.
numerical_cols = ['tenure_months', 'monthly_charges', 'total_charges',
                  'num_support_tickets', 'satisfaction_score']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()
for ax, col in zip(axes, numerical_cols):
    # One box per churn class on this column's own axis
    df.boxplot(column=col, by='churn', ax=ax)
    ax.set_xlabel('Churn (0=No, 1=Yes)')
    ax.set_ylabel(col)
    ax.set_title(f'{col} by Churn Status')

# Remove the automatic figure title added by the grouped boxplots
plt.suptitle('')
# Five columns on a 2x3 grid leave the last subplot empty
axes[5].axis('off')
plt.tight_layout()
plt.show()
# Integer-code the categorical columns so they can take part in .corr().
# The fitted encoders are kept in label_encoders, keyed by column name.
df_encoded = df.copy()
label_encoders = {}
for col in ['contract_type', 'payment_method', 'internet_service',
            'online_security', 'tech_support', 'streaming_tv']:
    label_encoders[col] = LabelEncoder()
    df_encoded[col] = label_encoders[col].fit_transform(df_encoded[col])

# customer_id is an identifier, not a feature, so leave it out
corr_matrix = df_encoded.drop('customer_id', axis=1).corr()

# Correlation of every column with the target, strongest positive first
correlations = corr_matrix['churn'].sort_values(ascending=False)
print("Feature Correlations with Churn:")
print(correlations)

# Full pairwise correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()
# Work on a copy so the raw dataset stays untouched
df_features = df.copy()
tenure = df_features['tenure_months']
monthly = df_features['monthly_charges']

# Total charges spread over tenure (+1 month in the denominator)
df_features['avg_monthly_value'] = df_features['total_charges'] / (tenure + 1)

# Tenure bucketed into four bands; bin edges are in months
df_features['tenure_group'] = pd.cut(
    tenure,
    bins=[0, 12, 24, 48, 72],
    labels=['0-1yr', '1-2yr', '2-4yr', '4+yr'],
)

# 1 when the customer pays more per month than the median customer
df_features['high_value'] = (monthly > monthly.median()).astype(int)

# Hand-weighted risk score: each (condition, weight) pair adds its weight
# when the condition holds; having referrals lowers the score.
risk_terms = [
    (df_features['contract_type'] == 'Month-to-month', 2),
    (tenure < 12, 2),
    (df_features['num_support_tickets'] > 3, 1),
    (df_features['satisfaction_score'] < 3, 2),
    (df_features['num_referrals'] > 0, -1),
]
df_features['risk_score'] = sum(flag.astype(int) * weight for flag, weight in risk_terms)

# Number of add-on services the customer has switched on
service_cols = ['online_security', 'tech_support', 'streaming_tv']
df_features['num_services'] = df_features[service_cols].eq('Yes').sum(axis=1)

print("New Features Created:")
new_feature_names = ['avg_monthly_value', 'tenure_group', 'high_value',
                     'risk_score', 'num_services']
print(df_features[new_feature_names].head(10))
# One-hot encode the categorical columns; drop_first removes one level per
# column so the remaining dummies are not redundant.
categorical_features = ['contract_type', 'payment_method', 'internet_service',
                        'online_security', 'tech_support', 'streaming_tv', 'tenure_group']
df_final = pd.get_dummies(df_features, columns=categorical_features, drop_first=True)
print(f"Final dataset shape: {df_final.shape}")
print(f"Columns: {df_final.columns.tolist()}")

# Everything except the identifier and the target is a model input
drop_cols = ['customer_id', 'churn']
feature_cols = df_final.columns.drop(drop_cols).tolist()
X = df_final[feature_cols]
y = df_final['churn']

sample_count, feature_count = X.shape
print(f"Features: {feature_count}")
print(f"Samples: {sample_count}")
print(f"Target distribution:\n{y.value_counts()}")
# Stratified 80/20 split so both parts keep the overall churn rate
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f"Training set: {len(X_train)} samples")
print(f"Testing set: {len(X_test)} samples")
print(f"Training churn rate: {y_train.mean():.2%}")
print(f"Testing churn rate: {y_test.mean():.2%}")

# Columns to standardise; the one-hot dummies and the binary high_value
# flag are left as they are.
numerical_features = ['tenure_months', 'monthly_charges', 'total_charges',
                      'num_support_tickets', 'num_referrals', 'satisfaction_score',
                      'avg_monthly_value', 'risk_score', 'num_services']

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train[numerical_features])
X_train_scaled = X_train.copy()
X_train_scaled[numerical_features] = scaler.transform(X_train[numerical_features])
X_test_scaled = X_test.copy()
X_test_scaled[numerical_features] = scaler.transform(X_test[numerical_features])
# Candidate classifiers keyed by display name. Insertion order is the order
# in which they are trained, reported and plotted. Every estimator that
# accepts a seed is seeded with 42.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(max_depth=10, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # probability=True so the SVM exposes predict_proba
    'SVM': SVC(probability=True, random_state=42),
    'Naive Bayes': GaussianNB(),
}
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Train a binary classifier and score it on the test split.

    Parameters:
    - model: unfitted estimator; it is fitted in place
    - X_train, y_train: training features and labels
    - X_test, y_test: held-out features and labels used for scoring
    - model_name: display name stored in the returned metrics

    Returns:
    - metrics: dict with Model, Accuracy, Precision, Recall, F1-Score and
      ROC-AUC (None when the model exposes no continuous score)
    - y_pred: predicted labels for X_test
    - y_prob: positive-class score for X_test - predict_proba when the
      model has it, otherwise decision_function values, otherwise None
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # ROC-AUC only needs a ranking score, so fall back to the decision
    # function for models that do not expose probabilities.
    if hasattr(model, 'predict_proba'):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_prob = model.decision_function(X_test)
    else:
        y_prob = None

    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        # zero_division=0: a model that predicts no churners scores 0
        # precision explicitly instead of emitting a warning
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_prob) if y_prob is not None else None,
    }
    return metrics, y_pred, y_prob
# Fit every candidate and collect its metrics and test-set predictions
results = []
predictions = {}
scaled_model_names = ('K-Nearest Neighbors', 'SVM')
for name, model in models.items():
    # The distance-based models get the standardised features; the rest
    # are trained on the features as they are.
    use_scaled = name in scaled_model_names
    train_features = X_train_scaled if use_scaled else X_train
    test_features = X_test_scaled if use_scaled else X_test
    metrics, y_pred, y_prob = evaluate_model(
        model, train_features, test_features, y_train, y_test, name
    )
    results.append(metrics)
    predictions[name] = {'y_pred': y_pred, 'y_prob': y_prob}

# Leaderboard, best ROC-AUC first
results_df = pd.DataFrame(results).sort_values('ROC-AUC', ascending=False)
print("Model Comparison Results:")
print(results_df.to_string(index=False))
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_bars, ax_roc = axes

# Left: grouped bars, one group per model and one bar per metric
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
x = np.arange(len(results_df))
width = 0.15
for i, metric in enumerate(metrics_to_plot):
    ax_bars.bar(x + i*width, results_df[metric], width, label=metric)
ax_bars.set_xlabel('Model')
ax_bars.set_ylabel('Score')
ax_bars.set_title('Model Performance Comparison')
# Centre each tick under the middle (third) bar of its group
ax_bars.set_xticks(x + width * 2)
ax_bars.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax_bars.legend(loc='lower right')
ax_bars.set_ylim(0, 1)

# Right: one ROC curve per model that produced a score
for name in models:
    scores_for_model = predictions[name]['y_prob']
    if scores_for_model is None:
        continue
    fpr, tpr, _ = roc_curve(y_test, scores_for_model)
    auc = roc_auc_score(y_test, scores_for_model)
    ax_roc.plot(fpr, tpr, label=f'{name} (AUC={auc:.3f})')
# Diagonal = performance of a random classifier
ax_roc.plot([0, 1], [0, 1], 'k--', label='Random')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curves')
ax_roc.legend(loc='lower right')

plt.tight_layout()
plt.show()
# Imported here so this step stays self-contained
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

print("Cross-Validation Results (5-fold):")
print("-" * 60)
cv_results = []
for name, model in models.items():
    if name in ['K-Nearest Neighbors', 'SVM']:
        # Scale inside the pipeline so the scaler is re-fitted on the
        # training part of every fold. Cross-validating on a matrix that
        # was scaled up front would let each validation fold influence
        # the scaling statistics.
        fold_scaler = ColumnTransformer(
            [('num', StandardScaler(), numerical_features)],
            remainder='passthrough',
        )
        estimator = make_pipeline(fold_scaler, model)
    else:
        estimator = model
    # cross_val_score clones the estimator, so the fitted models in
    # `models` are left untouched
    scores = cross_val_score(estimator, X_train, y_train, cv=5, scoring='roc_auc')
    cv_results.append({
        'Model': name,
        'Mean ROC-AUC': scores.mean(),
        'Std': scores.std()
    })
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")
cv_df = pd.DataFrame(cv_results).sort_values('Mean ROC-AUC', ascending=False)
Based on the comparison, let's tune the Random Forest, which typically performs well for churn prediction.
# Define parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search: 5-fold CV over every combination, scored by ROC-AUC
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1
)
# Fit on the *scaled* matrix. predict_churn() runs scaler.transform() on a
# customer's numerical features before calling this model, so the model
# must be trained in the same feature space; training on the unscaled
# matrix would make those single-customer predictions meaningless.
grid_search.fit(X_train_scaled, y_train)
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best CV ROC-AUC: {grid_search.best_score_:.4f}")

# Model refitted on the whole training set with the best parameters
best_rf = grid_search.best_estimator_

# Evaluate on the test set, scaled with the same scaler as the training data
y_pred_final = best_rf.predict(X_test_scaled)
y_prob_final = best_rf.predict_proba(X_test_scaled)[:, 1]
print("\nFinal Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_final):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_final):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_final):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_final):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_prob_final):.4f}")
# Confusion matrix of the tuned model on the test set
cm = confusion_matrix(y_test, y_pred_final)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted: Stay', 'Predicted: Churn'],
    yticklabels=['Actual: Stay', 'Actual: Churn'],
)
plt.title('Confusion Matrix - Final Model')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Rows are the actual class and columns the predicted class, so the 2x2
# matrix unpacks row by row into (tn, fp) and (fn, tp).
(tn, fp), (fn, tp) = cm
print("\nBusiness Impact Analysis:")
print(f"True Negatives (Correct: Stay): {tn}")
print(f"True Positives (Correct: Churn): {tp}")
print(f"False Positives (Wrong: Predicted Churn): {fp}")
print(f"False Negatives (Missed: Actual Churn): {fn}")
# Share of real churners that were caught / of stayers wrongly flagged
print(f"\nChurn Detection Rate: {tp/(tp+fn)*100:.1f}%")
print(f"False Alarm Rate: {fp/(fp+tn)*100:.1f}%")
# Pair each feature name with the forest's importance score, largest first
feature_importance = (
    pd.DataFrame({
        'Feature': feature_cols,
        'Importance': best_rf.feature_importances_,
    })
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart of the 15 highest-ranked features
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))
plt.figure(figsize=(10, 8))
plt.barh(bar_positions, top_features['Importance'].values)
plt.yticks(bar_positions, top_features['Feature'].values)
plt.xlabel('Importance')
plt.title('Top 15 Most Important Features for Churn Prediction')
# Flip the axis so the most important feature sits at the top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("Top 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))
For business applications, the default 0.5 threshold may not be optimal, because missing a customer who churns and raising a false alarm rarely cost the same.
# Imported here so this step stays self-contained
from sklearn.model_selection import cross_val_predict

# Tune the threshold on out-of-fold predictions for the TRAINING rows.
# Choosing it on the test set would leak test labels into the decision
# rule and make the reported test metrics optimistic. cross_val_predict
# clones best_rf and refits it per fold, so every probability comes from a
# model that never saw that row.
y_prob_oof = cross_val_predict(
    best_rf, X_train, y_train, cv=5, method='predict_proba'
)[:, 1]
y_true_oof = y_train.to_numpy()

# Test different thresholds; rounding removes float noise such as
# 0.15000000000000002 from the arange steps
thresholds = np.round(np.arange(0.1, 0.9, 0.05), 2)
threshold_results = []
for thresh in thresholds:
    y_pred_thresh = (y_prob_oof >= thresh).astype(int)
    threshold_results.append({
        'Threshold': thresh,
        # zero_division=0: a threshold that flags nobody scores 0 precision
        'Precision': precision_score(y_true_oof, y_pred_thresh, zero_division=0),
        'Recall': recall_score(y_true_oof, y_pred_thresh),
        'F1-Score': f1_score(y_true_oof, y_pred_thresh),
        'Churners Caught': ((y_pred_thresh == 1) & (y_true_oof == 1)).sum(),
        'False Alarms': ((y_pred_thresh == 1) & (y_true_oof == 0)).sum()
    })
threshold_df = pd.DataFrame(threshold_results)

# Plot precision-recall tradeoff
plt.figure(figsize=(10, 5))
plt.plot(threshold_df['Threshold'], threshold_df['Precision'], 'b-', label='Precision')
plt.plot(threshold_df['Threshold'], threshold_df['Recall'], 'r-', label='Recall')
plt.plot(threshold_df['Threshold'], threshold_df['F1-Score'], 'g-', label='F1-Score')
plt.xlabel('Threshold')
plt.ylabel('Score')
plt.title('Precision, Recall, and F1-Score vs Threshold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Find optimal threshold for F1 (the first one, if several tie)
optimal_idx = threshold_df['F1-Score'].idxmax()
optimal_threshold = threshold_df.loc[optimal_idx, 'Threshold']
print(f"Optimal Threshold (Max F1): {optimal_threshold:.2f}")
print(threshold_df.loc[optimal_idx])
def predict_churn(customer_data, model, scaler, feature_cols, numerical_features,
                  threshold=None, high_value_cutoff=None):
    """
    Predict churn probability for a single customer.

    Parameters:
    - customer_data: dict with the raw customer fields (tenure_months,
      monthly_charges, total_charges, contract_type, payment_method,
      internet_service, online_security, tech_support, streaming_tv,
      num_support_tickets, num_referrals, satisfaction_score)
    - model: trained classifier exposing predict_proba; it receives the
      numerical features *after* scaling, so it must have been trained on
      features scaled the same way
    - scaler: fitted scaler for `numerical_features`, or None to skip scaling
    - feature_cols: list of feature column names, in training order
    - numerical_features: list of numerical feature names to scale
    - threshold: probability at or above which the customer is labelled as
      churning; defaults to the module-level `optimal_threshold`
    - high_value_cutoff: monthly charge above which `high_value` is 1;
      defaults to the median monthly charge in the module-level
      `df_features`, which is how the training flag is defined

    Returns:
    - churn_probability: float
    - churn_prediction: int (0 or 1)
    - risk_level: str

    Raises:
    - KeyError: if a required customer field is missing
    """
    from bisect import bisect_left

    required_fields = (
        'tenure_months', 'monthly_charges', 'total_charges', 'contract_type',
        'payment_method', 'internet_service', 'online_security', 'tech_support',
        'streaming_tv', 'num_support_tickets', 'num_referrals', 'satisfaction_score',
    )
    missing = [field for field in required_fields if field not in customer_data]
    if missing:
        raise KeyError(f"customer_data is missing required fields: {missing}")

    if threshold is None:
        threshold = optimal_threshold
    if high_value_cutoff is None:
        high_value_cutoff = df_features['monthly_charges'].median()

    # Feature engineering (same formulas as training)
    row = dict(customer_data)
    tenure = row['tenure_months']
    row['avg_monthly_value'] = row['total_charges'] / (tenure + 1)
    row['high_value'] = int(row['monthly_charges'] > high_value_cutoff)
    row['risk_score'] = (
        2 * int(row['contract_type'] == 'Month-to-month')
        + 2 * int(tenure < 12)
        + 1 * int(row['num_support_tickets'] > 3)
        + 2 * int(row['satisfaction_score'] < 3)
        - 1 * int(row['num_referrals'] > 0)
    )
    row['num_services'] = sum(
        int(row[col] == 'Yes')
        for col in ('online_security', 'tech_support', 'streaming_tv')
    )
    # Upper bounds are inclusive: 12 -> '0-1yr', 13 -> '1-2yr', 49+ -> '4+yr'
    tenure_labels = ['0-1yr', '1-2yr', '2-4yr', '4+yr']
    row['tenure_group'] = tenure_labels[bisect_left([12, 24, 48], tenure)]

    # Start every training column at 0, then fill in what this customer has.
    features = dict.fromkeys(feature_cols, 0)
    for key, value in row.items():
        if isinstance(value, str):
            # One-hot: switch on the "<column>_<value>" dummy if training
            # produced one. Baseline or unseen levels leave all dummies 0.
            dummy_name = f"{key}_{value}"
            if dummy_name in features:
                features[dummy_name] = 1
        elif key in features:
            # Numeric inputs and engineered columns (including high_value)
            features[key] = value

    # Single-row frame with the columns in training order
    X_customer = pd.DataFrame([features], columns=list(feature_cols)).astype(float)

    # Scale numerical features
    if scaler is not None and len(numerical_features) > 0:
        X_customer[numerical_features] = scaler.transform(X_customer[numerical_features])

    # Predict
    churn_prob = float(model.predict_proba(X_customer)[0][1])
    churn_pred = 1 if churn_prob >= threshold else 0

    # Determine risk level
    if churn_prob < 0.3:
        risk_level = "Low Risk"
    elif churn_prob < 0.6:
        risk_level = "Medium Risk"
    else:
        risk_level = "High Risk"
    return churn_prob, churn_pred, risk_level
# Try the prediction function on one hand-written, high-risk-looking profile
test_customer = dict(
    tenure_months=6,
    monthly_charges=85.50,
    total_charges=513.00,
    contract_type='Month-to-month',
    payment_method='Electronic check',
    internet_service='Fiber optic',
    online_security='No',
    tech_support='No',
    streaming_tv='Yes',
    num_support_tickets=4,
    num_referrals=0,
    satisfaction_score=2,
)
prob, pred, risk = predict_churn(
    test_customer, best_rf, scaler, feature_cols, numerical_features
)

# Report: banner, the input profile, then the model's verdict
banner = "=" * 50
print("\n" + banner)
print("CUSTOMER CHURN PREDICTION")
print(banner)
print("\nCustomer Profile:")
for key, value in test_customer.items():
    print(f" {key}: {value}")
verdict = 'Will Churn' if pred == 1 else 'Will Stay'
print("\nPrediction Results:")
print(f" Churn Probability: {prob:.2%}")
print(f" Prediction: {verdict}")
print(f" Risk Level: {risk}")
def predict_batch(customer_list, model, scaler, feature_cols, numerical_features):
    """Predict churn for multiple customers.

    Parameters:
    - customer_list: iterable of customer dicts, each in the format that
      predict_churn expects
    - model, scaler, feature_cols, numerical_features: passed through to
      predict_churn unchanged

    Returns:
    - DataFrame with one row per customer and the columns Customer_ID,
      Churn_Probability, Prediction ('Churn' or 'Stay') and Risk_Level.
      The columns are present even when customer_list is empty.
    """
    output_columns = ['Customer_ID', 'Churn_Probability', 'Prediction', 'Risk_Level']
    rows = []
    for position, customer in enumerate(customer_list, start=1):
        prob, pred, risk = predict_churn(
            customer, model, scaler, feature_cols, numerical_features
        )
        rows.append({
            # Keep the customer's own id when the record carries one;
            # otherwise fall back to its 1-based position in the list
            'Customer_ID': customer.get('customer_id', position),
            'Churn_Probability': prob,
            'Prediction': 'Churn' if pred == 1 else 'Stay',
            'Risk_Level': risk
        })
    return pd.DataFrame(rows, columns=output_columns)
# Two contrasting profiles for the batch demo: a long-tenured customer on a
# two-year contract, and a new month-to-month customer with many tickets.
loyal_customer = {
    'tenure_months': 48, 'monthly_charges': 45.00, 'total_charges': 2160.00,
    'contract_type': 'Two year', 'payment_method': 'Bank transfer',
    'internet_service': 'DSL', 'online_security': 'Yes', 'tech_support': 'Yes',
    'streaming_tv': 'No', 'num_support_tickets': 1, 'num_referrals': 3,
    'satisfaction_score': 5,
}
at_risk_customer = {
    'tenure_months': 3, 'monthly_charges': 95.00, 'total_charges': 285.00,
    'contract_type': 'Month-to-month', 'payment_method': 'Electronic check',
    'internet_service': 'Fiber optic', 'online_security': 'No', 'tech_support': 'No',
    'streaming_tv': 'Yes', 'num_support_tickets': 5, 'num_referrals': 0,
    'satisfaction_score': 1,
}
sample_customers = [loyal_customer, at_risk_customer]

batch_results = predict_batch(
    sample_customers, best_rf, scaler, feature_cols, numerical_features
)
print("\nBatch Prediction Results:")
print(batch_results.to_string(index=False))
import joblib

# Persist the trained model together with the preprocessing objects needed
# to rebuild its inputs, one file per object.
# NOTE(review): predict_churn also relies on numerical_features and
# optimal_threshold, which are not written out here.
artifacts = {
    'churn_model.pkl': best_rf,
    'churn_scaler.pkl': scaler,
    'churn_features.pkl': feature_cols,
}
for filename, obj in artifacts.items():
    joblib.dump(obj, filename)
print("Model components saved successfully!")
# Final text report. The section headers previously started with
# mis-encoded glyphs; those are removed so the headers print cleanly.
report_rule = "=" * 60
print("\n" + report_rule)
print("CUSTOMER CHURN PREDICTION MODEL - SUMMARY REPORT")
print(report_rule)

print("\nDATASET OVERVIEW:")
print(f" Total Customers: {len(df):,}")
print(f" Churn Rate: {df['churn'].mean()*100:.1f}%")
print(f" Features Used: {len(feature_cols)}")

print("\nBEST MODEL: Random Forest Classifier")
print(f" Parameters: {grid_search.best_params_}")

# Test-set metrics of the tuned model, as percentages
print("\nPERFORMANCE METRICS:")
print(f" Accuracy: {accuracy_score(y_test, y_pred_final)*100:.1f}%")
print(f" Precision: {precision_score(y_test, y_pred_final)*100:.1f}%")
print(f" Recall: {recall_score(y_test, y_pred_final)*100:.1f}%")
print(f" F1-Score: {f1_score(y_test, y_pred_final)*100:.1f}%")
print(f" ROC-AUC: {roc_auc_score(y_test, y_prob_final)*100:.1f}%")

print("\nTOP 5 CHURN PREDICTORS:")
top_five = feature_importance.head(5)
for feature_name, importance in zip(top_five['Feature'], top_five['Importance']):
    print(f" {feature_name}: {importance:.4f}")

print("\nKEY INSIGHTS:")
print(" - Month-to-month contracts have highest churn risk")
print(" - Low tenure (< 12 months) indicates higher churn probability")
print(" - Customers with tech support are less likely to churn")
print(" - High number of support tickets correlates with churn")
print(" - Satisfaction score is a strong predictor of retention")

print("\nSAVED FILES:")
print(" - churn_model.pkl (Trained model)")
print(" - churn_scaler.pkl (Feature scaler)")
print(" - churn_features.pkl (Feature list)")
print("\n" + report_rule)
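This project built a complete customer churn prediction system that generates a customer dataset, engineers features, compares six classifiers, tunes a Random Forest, and scores individual customers or whole batches. Its headline results: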
| Metric | Value |
|---|---|
| Best Model | Random Forest |
| ROC-AUC | ~0.85+ |
| Recall (Churn Detection) | ~75%+ |
| Top Predictor | Contract Type |
Based on this analysis:
To improve this project further:
Congratulations! You've completed a comprehensive classification project, applying multiple algorithms to solve a real business problem with measurable impact.
Explore Decision Trees and how they split data into meaningful decision rules. This lesson teaches tree-building, visualization, and practical classification applications.
Understand how the KNN algorithm classifies data based on similarity. This lesson explains distance metrics, choosing the right K value, and building accurate classification models.
Learn how Logistic Regression predicts binary outcomes using probability-based decision boundaries. This lesson covers theory, implementation, and practical use cases like spam detection and churn prediction.