Build intelligent systems that learn from data and make predictions
Imagine teaching a computer to recognize cats without explicitly programming "if it has whiskers and pointy ears, it's a cat." Instead, you show it thousands of cat pictures, and it learns the patterns itself! That's machine learning - computers learning from experience.
Machine Learning (ML) is teaching computers to learn patterns from data and make predictions or decisions without being explicitly programmed for every scenario. It's like teaching a child through examples rather than rules!
Traditional Programming vs Machine Learning:
Traditional: Rules + Data = Answers
ML: Data + Answers = Rules (learned patterns)
Learning with a teacher! You provide labeled examples (input + correct answer), and the model learns to predict answers for new inputs.
Examples: spam detection (email → spam or not spam), house price prediction, handwriting recognition.
Learning without a teacher! The model finds hidden patterns and structures in unlabeled data on its own.
Examples: customer segmentation, anomaly detection, grouping similar documents by topic.
Linear regression is like drawing the best-fit line through scattered points! It's the simplest ML algorithm - perfect for predicting continuous values like prices, temperatures, or sales.
Linear regression finds the equation of a line (y = mx + b) that best fits your data. The goal is to minimize the distance between the line and all data points.
# --- Linear regression walkthrough: predicting house price from size ---
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# Toy dataset: square footage (2-D feature matrix) and sale price (1-D target).
sizes_sqft = np.array([1000, 1500, 2000, 2500, 3000, 3500, 4000]).reshape(-1, 1)
sale_prices = np.array([200000, 280000, 350000, 420000, 500000, 580000, 650000])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    sizes_sqft, sale_prices, test_size=0.2, random_state=42
)

# Fit an ordinary least-squares line to the training rows.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Score the fitted line on the held-out rows.
y_pred = regressor.predict(X_test)
print(f"Predicted prices: {y_pred}")
print(f"Actual prices: {y_test}")

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:,.2f}")
print(f"R² Score: {r2:.3f}")  # 1.0 = perfect fit

# Apply the trained model to an unseen house size.
query = np.array([[2800]])  # 2800 sq ft
estimate = regressor.predict(query)
print(f"Predicted price for 2800 sq ft: ${estimate[0]:,.2f}")
Despite its name, logistic regression is for classification, not regression! It predicts categories (yes/no, spam/not spam, cat/dog) by calculating probabilities. Think of it as answering "What's the probability this email is spam?"
# Example: Predict if a customer will buy (binary classification)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Sample data: age, income, will_buy
data = {
    'age': [25, 35, 45, 20, 50, 23, 40, 60, 30, 55],
    'income': [30000, 50000, 70000, 25000, 90000, 28000, 65000, 95000, 45000, 85000],
    'will_buy': [0, 0, 1, 0, 1, 0, 1, 1, 0, 1]  # 0=No, 1=Yes
}
df = pd.DataFrame(data)

# Prepare features (X) and target (y)
X = df[['age', 'income']]
y = df['will_buy']

# Split and train. max_iter is raised because the unscaled income column
# (values up to 95000) can keep the default lbfgs solver from converging
# within its default 100 iterations.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)  # Get probabilities

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")
print("\nClassification Report:")  # fixed: "\\n" printed a literal backslash-n
print(classification_report(y_test, y_pred))

# Predict for a new customer. Use a DataFrame with the training column
# names so sklearn does not warn about missing feature names (the model
# was fitted on a DataFrame, not a raw array).
new_customer = pd.DataFrame({'age': [38], 'income': [60000]})  # 38 years old, $60k income
prediction = model.predict(new_customer)
probability = model.predict_proba(new_customer)[0][1]
print(f"\nWill buy: {prediction[0]} (Probability: {probability:.2%})")
Decision trees make decisions like a flowchart - asking yes/no questions until reaching a conclusion. Random forests are like asking 100 experts (trees) and taking a vote! More accurate and less prone to overfitting.
The tree asks questions like "Is age > 30?" and splits data based on answers. It continues splitting until it can classify data accurately. Think of it as a game of 20 questions!
# Tree-based classifiers on the classic Iris dataset.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Fetch the dataset and carve out a 30% evaluation split (fixed seed).
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# A single depth-limited tree first...
dt_model = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_model.fit(X_train, y_train)
dt_accuracy = dt_model.score(X_test, y_test)
print(f"Decision Tree Accuracy: {dt_accuracy:.2%}")

# ...then a 100-tree forest, which averages votes across randomized trees.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_accuracy = rf_model.score(X_test, y_test)
print(f"Random Forest Accuracy: {rf_accuracy:.2%}")

# Which measurements drive the forest's decisions?
importances = rf_model.feature_importances_
for feature_name, importance in zip(iris.feature_names, importances):
    print(f"{feature_name}: {importance:.3f}")
K-Means is unsupervised learning - it finds groups (clusters) in data without being told what to look for! Like organizing a messy closet by grouping similar items together automatically.
# Example: Customer segmentation
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Sample customer data: spending score and income
X = np.array([
    [20, 30], [25, 35], [22, 32],  # Low spenders
    [70, 75], [75, 80], [72, 78],  # High spenders
    [45, 50], [50, 55], [48, 52]   # Medium spenders
])

# Create and fit K-Means model (k=3 clusters).
# n_init is set explicitly: leaving it unset triggers a FutureWarning on
# sklearn 1.2-1.3 and silently changed meaning ('auto') in sklearn 1.4;
# 10 restarts is the long-standing default behavior.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

# Get cluster centers (one [spending, income] centroid per cluster)
centers = kmeans.cluster_centers_
print("Cluster Centers:")
print(centers)

# Visualize clusters
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', s=100)
plt.scatter(centers[:, 0], centers[:, 1], c='red', marker='X', s=200, label='Centers')
plt.xlabel('Spending Score')
plt.ylabel('Income')
plt.title('Customer Segmentation')
plt.legend()
plt.show()

# Predict cluster for a new customer (2-D array: one row, two features)
new_customer = np.array([[60, 65]])
cluster = kmeans.predict(new_customer)
print(f"New customer belongs to cluster: {cluster[0]}")
Use the Elbow Method: Plot inertia (sum of squared distances) for different k values. The "elbow" point where improvement slows down is your optimal k!
Accuracy alone can be misleading! If 95% of emails aren't spam, a model that always predicts "not spam" gets 95% accuracy but is useless. We need better metrics!
Accuracy
Percentage of correct predictions. Simple but can be misleading with imbalanced data.
Precision
Of all positive predictions, how many were actually positive? Important when false positives are costly.
Recall (Sensitivity)
Of all actual positives, how many did we catch? Important when false negatives are costly.
F1 Score
Harmonic mean of precision and recall. Good balance when you care about both.
ROC-AUC
Area under ROC curve. Measures model's ability to distinguish between classes (0-1, higher is better).
# Calculate all metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# Example predictions: ground truth, hard 0/1 predictions, and the
# model's probability scores (needed for ROC-AUC).
y_true = [1, 0, 1, 1, 0, 1, 0, 0, 1, 0]
y_pred = [1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
y_proba = [0.9, 0.1, 0.8, 0.4, 0.2, 0.85, 0.6, 0.15, 0.95, 0.05]

# Calculate metrics
print(f"Accuracy: {accuracy_score(y_true, y_pred):.2%}")
print(f"Precision: {precision_score(y_true, y_pred):.2%}")
print(f"Recall: {recall_score(y_true, y_pred):.2%}")
print(f"F1 Score: {f1_score(y_true, y_pred):.3f}")
print(f"ROC-AUC: {roc_auc_score(y_true, y_proba):.3f}")

# Confusion Matrix: rows = actual class, columns = predicted class.
cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:")  # fixed: "\\n" printed a literal backslash-n
print("              Predicted")
print("              Neg  Pos")
print(f"Actual Neg    {cm[0][0]:3d}  {cm[0][1]:3d}")
print(f"       Pos    {cm[1][0]:3d}  {cm[1][1]:3d}")
Testing on one split might give lucky (or unlucky) results! Cross-validation tests your model on multiple splits to get a more reliable performance estimate. It's like taking multiple exams instead of just one.
Split data into K parts (folds). Train on K-1 folds, test on the remaining fold. Repeat K times, each time using a different fold for testing. Average the results!
# K-Fold Cross-Validation
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier

# Reload the Iris data so this section stands on its own.
iris = load_iris()
X, y = iris.data, iris.target

# A 100-tree forest with a fixed seed for repeatable results.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Accuracy over 5 folds: each fold takes one turn as the test set.
scores = cross_val_score(model, X, y, cv=5)
print("Cross-validation scores:", scores)
print(f"Mean accuracy: {scores.mean():.2%} (+/- {scores.std() * 2:.2%})")

# The same 5-fold scheme, scored with several metrics at once.
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
results = cross_validate(model, X, y, cv=5, scoring=scoring)
for metric in scoring:
    scores = results[f'test_{metric}']
    print(f"{metric}: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
Let's build an end-to-end machine learning project! We'll predict whether a customer will churn (leave) based on their behavior - a real business problem.
# Complete ML Project: Customer Churn Prediction
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Step 1: Create sample data (synthetic, but shaped like a real churn table)
np.random.seed(42)
n_samples = 1000
data = {
    'age': np.random.randint(18, 70, n_samples),
    'tenure_months': np.random.randint(1, 72, n_samples),
    'monthly_charges': np.random.uniform(20, 150, n_samples),
    'total_charges': np.random.uniform(100, 8000, n_samples),
    'num_products': np.random.randint(1, 5, n_samples),
    'support_calls': np.random.randint(0, 10, n_samples)
}
df = pd.DataFrame(data)

# Create target: more likely to churn if high support calls or low tenure
df['churn'] = ((df['support_calls'] > 5) | (df['tenure_months'] < 12)).astype(int)

# Step 2: Exploratory Data Analysis
print("Dataset Shape:", df.shape)
print("\nChurn Distribution:")  # fixed: "\\n" printed a literal backslash-n
print(df['churn'].value_counts(normalize=True))
print("\nFeature Statistics:")
print(df.describe())

# Step 3: Prepare data. stratify=y keeps the churn ratio identical in the
# train and test splits.
X = df.drop('churn', axis=1)
y = df['churn']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 4: Feature scaling. Fit on the training rows only, then apply the
# same transform to the test rows — fitting on all rows would leak test
# statistics into training. (Tree models don't strictly need scaling, but
# the pipeline demonstrates the standard pattern.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Train model
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42
)
model.fit(X_train_scaled, y_train)

# Step 6: Evaluate model on the held-out test set
y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]  # P(churn) per customer
print("\n=== MODEL PERFORMANCE ===")
print(classification_report(y_test, y_pred))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_proba):.3f}")

# Step 7: Cross-validation, on the training split only (test set stays untouched)
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
print(f"\nCross-validation accuracy: {cv_scores.mean():.2%} (+/- {cv_scores.std() * 2:.2%})")

# Step 8: Feature importance (which columns the forest relied on most)
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nTop Features:")
print(feature_importance)

# Step 9: Make predictions for new customers (same columns, same scaler)
new_customers = pd.DataFrame({
    'age': [35, 50],
    'tenure_months': [6, 48],
    'monthly_charges': [75, 100],
    'total_charges': [450, 4800],
    'num_products': [2, 3],
    'support_calls': [7, 2]
})
new_customers_scaled = scaler.transform(new_customers)
predictions = model.predict(new_customers_scaled)
probabilities = model.predict_proba(new_customers_scaled)[:, 1]
print("\n=== NEW CUSTOMER PREDICTIONS ===")
for i, (pred, prob) in enumerate(zip(predictions, probabilities)):
    print(f"Customer {i+1}: {'Churn' if pred else 'Stay'} (Probability: {prob:.2%})")
You now have the fundamentals of machine learning! In the next module, we'll explore Advanced Machine Learning - gradient boosting, feature engineering, hyperparameter tuning, and handling imbalanced datasets.