Principal Component Analysis (PCA) is a powerful dimensionality reduction technique widely used in data science and machine learning. This tutorial covers both the practical implementation using scikit-learn and the mathematical derivation from first principles.
At its core, PCA is a statistical technique that reduces the dimensionality of data while preserving as much of its variance as possible. It transforms the original features into a new set of uncorrelated features called principal components.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply PCA
pca = PCA(n_components=2) # Reduce to 2 dimensions
X_pca = pca.fit_transform(X_scaled)
# Print the explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total explained variance:", sum(pca.explained_variance_ratio_))
# Plot the first two principal components
plt.figure(figsize=(10, 8))
colors = ['navy', 'turquoise', 'darkorange']
for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names):
    plt.scatter(X_pca[y == i, 0], X_pca[y == i, 1], color=color, alpha=0.8, lw=2,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of Iris dataset')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.grid(True)
plt.show()
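To interpret the components themselves, it can help to look at the loadings, i.e. the weight each original (standardized) feature receives in each principal component. A minimal sketch, reusing the pca and feature_names objects fitted above:
# Inspect the loadings: each row of pca.components_ expresses one principal
# component as a weighted combination of the original standardized features
for pc_index, component in enumerate(pca.components_, start=1):
    weights = ", ".join(f"{name}: {weight:.3f}" for name, weight in zip(feature_names, component))
    print(f"PC{pc_index}: {weights}")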
We can determine the optimal number of components by looking at the explained variance:
# Create a PCA instance without specifying n_components
pca_full = PCA()
pca_full.fit(X_scaled)
# Plot the cumulative explained variance
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(pca_full.explained_variance_ratio_) + 1),
         np.cumsum(pca_full.explained_variance_ratio_), marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.axhline(y=0.95, color='r', linestyle='-', alpha=0.5)
plt.grid(True)
plt.title('Explained Variance vs. Number of Components')
plt.show()
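Rather than reading the threshold off the plot by hand, scikit-learn's PCA also accepts a float between 0 and 1 for n_components, in which case it keeps just enough components to explain that fraction of the variance. A short sketch using the 95% threshold shown above:
# Let PCA pick the number of components needed to retain 95% of the variance
pca_95 = PCA(n_components=0.95)
X_pca_95 = pca_95.fit_transform(X_scaled)
print("Components kept for 95% variance:", pca_95.n_components_)
print("Variance actually retained:", pca_95.explained_variance_ratio_.sum())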
# Apply PCA with different numbers of components
n_components_list = [1, 2, 3, 4]
fig, axes = plt.subplots(1, len(n_components_list), figsize=(15, 3))
for i, n_components in enumerate(n_components_list):
    # Apply PCA
    pca = PCA(n_components=n_components)
    X_transformed = pca.fit_transform(X_scaled)
    # Inverse transform to reconstruct the data
    X_reconstructed = pca.inverse_transform(X_transformed)
    # Calculate reconstruction error
    reconstruction_error = np.mean((X_scaled - X_reconstructed) ** 2)
    explained_var = sum(pca.explained_variance_ratio_)
    # Plot original vs reconstructed data for the first two features
    ax = axes[i]
    ax.scatter(X_scaled[:, 0], X_scaled[:, 1], alpha=0.3, label='Original')
    ax.scatter(X_reconstructed[:, 0], X_reconstructed[:, 1], alpha=0.3, label='Reconstructed')
    ax.set_title(f'Components: {n_components}\nVar: {explained_var:.2f}, Error: {reconstruction_error:.2f}')
    ax.grid(True)
    if i == 0:
        ax.legend()
plt.tight_layout()
plt.show()
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
# Load digits dataset (8x8 images)
digits = load_digits()
X = digits.data
y = digits.target
# Select a single digit image for demonstration
digit_index = 42
digit_image = X[digit_index].reshape(8, 8)
# Apply PCA with different numbers of components
n_components_list = [1, 5, 10, 20, 30, 40, 64]
fig, axes = plt.subplots(1, len(n_components_list) + 1, figsize=(15, 3))
# Display original image
axes[0].imshow(digit_image, cmap='gray')
axes[0].set_title('Original')
axes[0].axis('off')
# Apply PCA with different number of components
for i, n_components in enumerate(n_components_list):
    # Standardize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Apply PCA
    pca = PCA(n_components=n_components)
    X_transformed = pca.fit_transform(X_scaled)
    X_reconstructed = pca.inverse_transform(X_transformed)
    # Un-standardize the reconstructed data
    X_reconstructed = scaler.inverse_transform(X_reconstructed)
    # Display the reconstructed image
    reconstructed_image = X_reconstructed[digit_index].reshape(8, 8)
    axes[i+1].imshow(reconstructed_image, cmap='gray')
    axes[i+1].set_title(f'n={n_components}')
    axes[i+1].axis('off')
plt.tight_layout()
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Generate a noisy sine wave
n_samples = 1000
t = np.linspace(0, 8 * np.pi, n_samples)
signal = np.sin(t)
# Create a five-feature dataset: the clean signal, three noisy copies of it, and pure noise
noise_level = 0.5
X = np.column_stack([
    signal,
    signal + np.random.normal(0, noise_level, n_samples),
    signal + np.random.normal(0, noise_level, n_samples),
    signal + np.random.normal(0, noise_level, n_samples),
    np.random.normal(0, noise_level, n_samples)
])
# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply PCA for noise reduction
pca = PCA(n_components=2) # Keep only 2 principal components
X_transformed = pca.fit_transform(X_scaled)
X_reconstructed = pca.inverse_transform(X_transformed)
X_reconstructed = scaler.inverse_transform(X_reconstructed)
# Plot results
plt.figure(figsize=(12, 8))
plt.subplot(2, 1, 1)
plt.title('Original Noisy Data')
for i in range(X.shape[1]):
    plt.plot(t, X[:, i], alpha=0.5, label=f'Feature {i+1}')
plt.legend(loc='upper right')
plt.grid(True)
plt.subplot(2, 1, 2)
plt.title('PCA Reconstructed Data (Noise Reduced)')
for i in range(X_reconstructed.shape[1]):
    plt.plot(t, X_reconstructed[:, i], alpha=0.5, label=f'Feature {i+1}')
plt.legend(loc='upper right')
plt.grid(True)
plt.tight_layout()
plt.show()
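The plots give a qualitative impression; a simple (purely illustrative) way to quantify the effect is to compare each of the signal-carrying features against the clean sine wave before and after the PCA reconstruction:
# Compare the first four features (the clean signal and its three noisy copies)
# against the underlying sine wave, before and after PCA reconstruction
clean = np.column_stack([signal] * 4)
mse_before = np.mean((X[:, :4] - clean) ** 2, axis=0)
mse_after = np.mean((X_reconstructed[:, :4] - clean) ** 2, axis=0)
print("MSE vs. clean signal (original):     ", np.round(mse_before, 3))
print("MSE vs. clean signal (reconstructed):", np.round(mse_after, 3))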
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import numpy as np
# Load breast cancer dataset
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Standardize
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Compare accuracy for different numbers of components
n_components_range = range(1, 31)
accuracies = []
for n_components in n_components_range:
    # Apply PCA
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    # Train SVM
    svm = SVC(kernel='linear', random_state=42)
    svm.fit(X_train_pca, y_train)
    # Predict and calculate accuracy
    y_pred = svm.predict(X_test_pca)
    accuracies.append(accuracy_score(y_test, y_pred))
# Plot results
plt.figure(figsize=(10, 6))
plt.plot(n_components_range, accuracies, marker='o')
plt.axhline(y=max(accuracies), color='r', linestyle='--', alpha=0.5)
plt.text(2, max(accuracies)-0.01, f'Max accuracy: {max(accuracies):.3f}')
plt.xlabel('Number of Principal Components')
plt.ylabel('Classification Accuracy')
plt.title('SVM Accuracy vs Number of PCA Components')
plt.grid(True)
plt.show()
# Get best number of components
best_n_components = n_components_range[np.argmax(accuracies)]
print(f"Best number of components: {best_n_components}")
# Train final model with optimal components
pca = PCA(n_components=best_n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train_pca, y_train)
y_pred = svm.predict(X_test_pca)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
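Note that choosing n_components by maximizing accuracy on the test set, as above, lets information from the test data influence the model choice. A more careful alternative (sketched here under the same train/test split) is to tune the number of components with cross-validation on the training data only, using a Pipeline so that scaling and PCA are refit inside each fold:
# Cross-validated choice of n_components, fit on the training data only
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('svm', SVC(kernel='linear', random_state=42)),
])
param_grid = {'pca__n_components': list(range(1, 31))}
search = GridSearchCV(pipe, param_grid, cv=5)
search.fit(X_train, y_train)  # raw features; scaling happens inside the pipeline
print("Best n_components (cross-validation):", search.best_params_['pca__n_components'])
print("Test accuracy with that setting:", search.score(X_test, y_test))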
From a mathematical perspective, PCA aims to find a new orthogonal basis that maximizes the variance along each basis vector. These basis vectors are the principal components.
Let's denote:
- \(X \in \mathbb{R}^{n \times p}\): data matrix with \(n\) samples and \(p\) features
- \(w \in \mathbb{R}^p\): direction vector we want to find
- \(z = Xw\): projection of \(X\) onto \(w\)
The objective is to maximize the variance of \(z\):
\[ \text{Var}(z) = \frac{1}{n} \sum_{i=1}^{n} (z_i - \bar{z})^2 = \frac{1}{n} \sum_{i=1}^{n} \left((Xw)_i - \overline{Xw}\right)^2 \]
Assuming the data is centered (mean zero), the covariance matrix is:
\[ C = \frac{1}{n} X^T X \]
The variance of the projection is:
\[ \text{Var}(z) = w^T C w \]
We want to maximize this subject to the constraint that \(w\) has unit length:
\[ \|w\|^2 = w^T w = 1 \]
Introducing a Lagrange multiplier \(\lambda\), the Lagrangian is:
\[ L(w, \lambda) = w^T C w - \lambda (w^T w - 1) \]
Taking the derivative with respect to \(w\) and setting it to zero:
\[ \frac{\partial L}{\partial w} = 2 C w - 2 \lambda w = 0 \quad \Longrightarrow \quad C w = \lambda w \]
This is an eigenvalue problem! The solution is that \(w\) is an eigenvector of \(C\) and \(\lambda\) is the corresponding eigenvalue.
Since we want to maximize \(w^T C w\), and \(w^T C w = w^T \lambda w = \lambda w^T w = \lambda\) (because \(w^T w = 1\)), we should choose the eigenvector corresponding to the largest eigenvalue of \(C\).
For the next principal component, we want a vector that is orthogonal to the first one and maximizes the remaining variance. This leads to the second eigenvector of \(C\), and so on.
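To make the eigenvalue result concrete, here is a small numerical check (an illustrative aside, using synthetic data): the variance of the data projected onto an eigenvector of \(C\) equals the corresponding eigenvalue.
# Numerical check that Var(Xw) equals the eigenvalue when w is an eigenvector of C
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(500, 3)) @ np.array([[2.0, 0.0, 0.0],
                                               [0.5, 1.0, 0.0],
                                               [0.0, 0.3, 0.2]])
X_demo -= X_demo.mean(axis=0)                  # center the data
C = X_demo.T @ X_demo / X_demo.shape[0]        # covariance matrix (1/n convention, as above)
eigenvalues, eigenvectors = np.linalg.eigh(C)  # eigenvalues in ascending order
w = eigenvectors[:, -1]                        # eigenvector with the largest eigenvalue
print(np.var(X_demo @ w))                      # equals eigenvalues[-1] up to floating point
print(eigenvalues[-1])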
PCA can also be computed using SVD:
\[ X = U \Sigma V^T \]
Where:
- \(U\) is an \(n \times n\) orthogonal matrix
- \(\Sigma\) is an \(n \times p\) diagonal matrix of non-negative real numbers (the singular values)
- \(V\) is a \(p \times p\) orthogonal matrix
The columns of \(V\) are the eigenvectors of \(X^T X\) and hence of the covariance matrix; these are the principal components. The eigenvalues of \(X^T X\) are the squared singular values \(\sigma_i^2\), so dividing them by the covariance normalization (\(n\) above, or \(n - 1\) for the sample covariance used by scikit-learn) gives the variance along each principal component.
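As a quick sanity check of this relationship (again an illustrative aside with synthetic data), the leading right singular vector of a centered matrix matches the leading eigenvector of \(X^T X\) up to sign, and the squared singular values match its eigenvalues:
# Verify that SVD and eigendecomposition give the same principal directions
rng = np.random.default_rng(1)
A = rng.normal(size=(200, 4))
A -= A.mean(axis=0)                            # center the data
U, s, Vt = np.linalg.svd(A, full_matrices=False)
evals, evecs = np.linalg.eigh(A.T @ A)         # eigenvalues in ascending order
print(np.isclose(s[0] ** 2, evals[-1]))                  # sigma_1^2 equals the largest eigenvalue
print(np.allclose(np.abs(Vt[0]), np.abs(evecs[:, -1])))  # same direction, possibly flipped in sign
With this relationship in hand, the two functions below implement PCA from scratch, first via eigendecomposition of the covariance matrix and then via SVD.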
def pca_from_scratch(X, n_components=None):
"""
Implements PCA from scratch using eigendecomposition of the covariance matrix.
Parameters:
-----------
X : array-like, shape (n_samples, n_features)
Input data
n_components : int, optional
Number of components to keep. If None, all components are kept.
Returns:
--------
X_transformed : array-like, shape (n_samples, n_components)
Transformed data
components : array-like, shape (n_components, n_features)
Principal components
explained_variance : array-like, shape (n_components,)
Variance explained by each component
explained_variance_ratio : array-like, shape (n_components,)
Ratio of variance explained by each component
"""
# Center the data
X_centered = X - np.mean(X, axis=0)
# Compute the covariance matrix
n_samples = X_centered.shape[0]
cov_matrix = np.dot(X_centered.T, X_centered) / (n_samples - 1)
# Compute eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
# Sort eigenvalues and eigenvectors in descending order
idx = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:, idx]
# Select n_components
if n_components is None:
n_components = len(eigenvalues)
# Get the top n_components
components = eigenvectors[:, :n_components]
explained_variance = eigenvalues[:n_components]
# Calculate the explained variance ratio
explained_variance_ratio = explained_variance / np.sum(eigenvalues)
# Project the data onto the principal components
X_transformed = np.dot(X_centered, components)
return X_transformed, components, explained_variance, explained_variance_ratio
def pca_svd_from_scratch(X, n_components=None):
"""
Implements PCA from scratch using Singular Value Decomposition (SVD).
Parameters:
-----------
X : array-like, shape (n_samples, n_features)
Input data
n_components : int, optional
Number of components to keep. If None, all components are kept.
Returns:
--------
X_transformed : array-like, shape (n_samples, n_components)
Transformed data
components : array-like, shape (n_components, n_features)
Principal components
explained_variance : array-like, shape (n_components,)
Variance explained by each component
explained_variance_ratio : array-like, shape (n_components,)
Ratio of variance explained by each component
"""
# Center the data
X_centered = X - np.mean(X, axis=0)
# Perform SVD
U, s, Vt = np.linalg.svd(X_centered, full_matrices=False)
# V contains the right singular vectors which are the principal components
components = Vt
# Calculate the explained variance
n_samples = X_centered.shape[0]
explained_variance = (s ** 2) / (n_samples - 1)
explained_variance_ratio = explained_variance / np.sum(explained_variance)
# Select n_components
if n_components is None:
n_components = len(explained_variance)
# Get the top n_components
components = components[:n_components, :]
explained_variance = explained_variance[:n_components]
explained_variance_ratio = explained_variance_ratio[:n_components]
# Project the data onto the principal components
X_transformed = np.dot(X_centered, components.T)
return X_transformed, components, explained_variance, explained_variance_ratio
# Load and preprocess data
iris = load_iris()
X = iris.data
y = iris.target
# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Apply our PCA implementation
X_pca_custom, components, explained_variance, explained_variance_ratio = pca_from_scratch(X_scaled, n_components=2)
# Compare with scikit-learn's PCA
pca_sklearn = PCA(n_components=2)
X_pca_sklearn = pca_sklearn.fit_transform(X_scaled)
# Plot both results
plt.figure(figsize=(15, 6))
# Our Implementation
plt.subplot(1, 2, 1)
for color, i, target_name in zip(['navy', 'turquoise', 'darkorange'], [0, 1, 2], iris.target_names):
    plt.scatter(X_pca_custom[y == i, 0], X_pca_custom[y == i, 1], color=color, alpha=0.8, lw=2,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA from Scratch')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.grid(True)
# Scikit-learn's Implementation
plt.subplot(1, 2, 2)
for color, i, target_name in zip(['navy', 'turquoise', 'darkorange'], [0, 1, 2], iris.target_names):
    plt.scatter(X_pca_sklearn[y == i, 0], X_pca_sklearn[y == i, 1], color=color, alpha=0.8, lw=2,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('Scikit-learn PCA')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.grid(True)
plt.tight_layout()
plt.show()
# Compare explained variance ratios
print("Custom PCA explained variance ratio:", explained_variance_ratio)
print("Scikit-learn PCA explained variance ratio:", pca_sklearn.explained_variance_ratio_)
import time
# Compare performance and results between SVD and Eigendecomposition methods
X = np.random.random((1000, 50)) # Larger dataset for timing comparison
X_scaled = StandardScaler().fit_transform(X)
# Time eigendecomposition
start_time = time.time()
X_pca_eigen, _, var_eigen, ratio_eigen = pca_from_scratch(X_scaled, n_components=2)
eigen_time = time.time() - start_time
# Time SVD
start_time = time.time()
X_pca_svd, _, var_svd, ratio_svd = pca_svd_from_scratch(X_scaled, n_components=2)
svd_time = time.time() - start_time
# Print results
print(f"Eigendecomposition time: {eigen_time:.4f} seconds")
print(f"SVD time: {svd_time:.4f} seconds")
print("\nExplained variance (Eigendecomposition):", var_eigen)
print("Explained variance (SVD):", var_svd)
print("\nExplained variance ratio (Eigendecomposition):", ratio_eigen)
print("Explained variance ratio (SVD):", ratio_svd)
# Check if the results are similar (Note: signs might be flipped)
correlation = np.abs(np.corrcoef(X_pca_eigen[:, 0], X_pca_svd[:, 0])[0, 1])
print(f"\nCorrelation between first principal components: {correlation:.4f}")
In this tutorial, we've covered:
1. The fundamentals of Principal Component Analysis
2. How to implement PCA using scikit-learn
3. Practical applications of PCA:
   - Visualization
   - Dimensionality reduction
   - Noise reduction
   - Feature extraction
4. Mathematical derivation of PCA from first principles
5. Implementation of PCA from scratch using both eigendecomposition and SVD
The complete code for this tutorial is available in the accompanying Jupyter notebook.