This tutorial covers the fundamentals of neural networks, including:
- Basic concepts and architecture
- Implementation using PyTorch
- Implementation from scratch with Python
- Understanding gradient descent and backpropagation
Neural networks are computational models inspired by the human brain. They consist of:
- Neurons: Basic processing units
- Connections: Links between neurons with associated weights
- Layers: Organized groups of neurons (input, hidden, output)
- Activation Functions: Non-linear transformations applied to neuron outputs
A simple neural network consists of:
1. Input Layer: Receives the input data
2. Hidden Layer(s): Perform computations and feature extraction
3. Output Layer: Produces the final result
Each neuron computes a weighted sum of its inputs, applies an activation function, and passes the result to the next layer.
PyTorch is a popular deep learning framework that provides tools for building and training neural networks efficiently.
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
class SimpleNN(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of input features.
        hidden_size: Number of units in the hidden layer.
        output_size: Number of output values.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Input -> hidden projection
        self.fc1 = nn.Linear(input_size, hidden_size)
        # Hidden -> output projection
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the raw outputs of the network for the batch ``x``."""
        hidden = F.relu(self.fc1(x))
        # The output layer is left without an activation.
        return self.fc2(hidden)
Let's create some synthetic data for demonstration:
def generate_data(samples=1000, seed=None):
    """Generate a synthetic XOR-style binary classification dataset.

    Points are drawn from a 2-D standard normal distribution. A point is
    labelled 1 when both coordinates are strictly positive (first quadrant)
    or both strictly negative (third quadrant), and 0 otherwise.

    Args:
        samples: Number of points to generate.
        seed: Optional integer seed. When given, the points are drawn from a
            private ``RandomState`` so the result is reproducible and the
            global NumPy random state is left untouched. When ``None``
            (the default), the global NumPy random state is used.

    Returns:
        Tuple ``(X_tensor, y_tensor)`` of float tensors with shapes
        ``(samples, 2)`` and ``(samples, 1)``.
    """
    # ``np.random`` and ``RandomState`` both expose ``randn``, so either can
    # serve as the sampling source.
    rng = np.random if seed is None else np.random.RandomState(seed)
    X = rng.randn(samples, 2)

    x0, x1 = X[:, 0], X[:, 1]
    # Class 1 = first or third quadrant; points on an axis stay in class 0.
    same_sign = ((x0 > 0) & (x1 > 0)) | ((x0 < 0) & (x1 < 0))
    y = same_sign.astype(np.float64)

    # Convert to PyTorch tensors; labels become a column vector.
    X_tensor = torch.FloatTensor(X)
    y_tensor = torch.FloatTensor(y).reshape(-1, 1)
    return X_tensor, y_tensor
# Build the training set used by the PyTorch example
X_train, y_train = generate_data(samples=1000)
def train_model(model, X_train, y_train, epochs=1000, lr=0.01):
    """Fit ``model`` with full-batch SGD and return the per-epoch losses.

    Args:
        model: Module whose parameters are optimized; its outputs are fed to
            ``BCEWithLogitsLoss``.
        X_train: Input tensor, passed to ``model`` as a single batch.
        y_train: Target tensor compared against the model outputs.
        epochs: Number of full-batch updates to perform.
        lr: Learning rate for the SGD optimizer.

    Returns:
        List of Python floats, one loss value per epoch.
    """
    # Binary cross entropy with the sigmoid folded into the loss
    loss_fn = nn.BCEWithLogitsLoss()
    sgd = optim.SGD(model.parameters(), lr=lr)
    history = []

    for step in range(1, epochs + 1):
        # Clear stale gradients, then forward / backward / update
        sgd.zero_grad()
        batch_loss = loss_fn(model(X_train), y_train)
        batch_loss.backward()
        sgd.step()

        loss_value = batch_loss.item()
        history.append(loss_value)

        # Report progress every 100 epochs
        if step % 100 == 0:
            print(f'Epoch [{step}/{epochs}], Loss: {loss_value:.4f}')

    return history
# Network dimensions: 2 input features, 10 hidden units, 1 output
input_size, hidden_size, output_size = 2, 10, 1
model = SimpleNN(input_size, hidden_size, output_size)

# Fit the network on the synthetic training data
losses = train_model(model, X_train, y_train)

# Loss per epoch
plt.figure(figsize=(10, 5))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.grid(True)
plt.show()
def plot_decision_boundary(model, X, y, step=0.01):
    """Plot the model's sigmoid output over the input plane with the data on top.

    Args:
        model: Callable taking a float tensor of grid points (one row per
            point, two columns) and returning one value per point, which is
            passed through a sigmoid before plotting.
        X: NumPy array whose first two columns are the point coordinates.
        y: NumPy array of labels; flattened and used to colour the points.
        step: Spacing of the evaluation grid along both axes. The default
            of 0.01 keeps the original resolution; larger values evaluate
            fewer grid points.
    """
    # Bounding box of the data, padded by 1 on every side
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

    grid = torch.FloatTensor(np.c_[xx.ravel(), yy.ravel()])
    # Inference only: no_grad avoids recording an autograd graph for every
    # grid point, so no detach() is needed before converting to NumPy.
    with torch.no_grad():
        Z = torch.sigmoid(model(grid)).numpy().reshape(xx.shape)

    # Filled contours of the sigmoid output, with the data points overlaid
    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdBu)
    plt.scatter(X[:, 0], X[:, 1], c=y.reshape(-1), edgecolors='k', cmap=plt.cm.RdBu)
    plt.title('Decision Boundary')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()
# Convert the training tensors to NumPy arrays and draw the decision boundary
X_np, y_np = X_train.numpy(), y_train.numpy()
plot_decision_boundary(model, X_np, y_np)
Now, let's implement a neural network using only basic Python functions and NumPy.
import numpy as np
class NeuralNetwork:
    """One-hidden-layer binary classifier implemented with NumPy only.

    Architecture: input -> linear -> ReLU -> linear -> sigmoid. Training uses
    full-batch gradient descent on the binary cross-entropy loss.

    Args:
        input_size: Number of input features.
        hidden_size: Number of hidden units.
        output_size: Number of output units.
    """

    def __init__(self, input_size, hidden_size, output_size):
        # Small random weights, zero biases
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

    def relu(self, x):
        """ReLU activation function."""
        return np.maximum(0, x)

    def relu_derivative(self, x):
        """Derivative of ReLU: 1 where ``x > 0``, otherwise 0."""
        return np.where(x > 0, 1, 0)

    def sigmoid(self, x):
        """Sigmoid activation function."""
        # Clip the argument so np.exp cannot overflow.
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def sigmoid_derivative(self, x):
        """Derivative of sigmoid evaluated at ``x``."""
        s = self.sigmoid(x)
        return s * (1 - s)

    def _match_labels(self, y, output):
        """Return ``y`` as an array with the same shape as ``output``.

        Without this, a 1-D label vector of length m would broadcast against
        the (m, 1) network output into an (m, m) matrix, corrupting the loss
        and the gradients. Raises ``ValueError`` if the sizes differ.
        """
        return np.asarray(y).reshape(output.shape)

    def forward(self, X):
        """Forward propagation.

        Caches the pre-activations (``z1``, ``z2``) and activations (``a1``,
        ``a2``) on the instance for use by ``backward`` and returns ``a2``.
        """
        # Hidden layer
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.relu(self.z1)
        # Output layer
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def backward(self, X, y, output, learning_rate=0.01):
        """Backward propagation followed by one gradient-descent update.

        Must be called after ``forward`` on the same ``X``, since it reads the
        cached ``z1`` and ``a1``.

        Args:
            X: Input batch that produced ``output``.
            y: Targets; any shape with the same number of elements as
                ``output``.
            output: Network output returned by ``forward``.
            learning_rate: Step size for the parameter update.
        """
        y = self._match_labels(y, output)
        m = X.shape[0]

        # With a sigmoid output and cross-entropy loss, dL/dz2 is output - y.
        dz2 = output - y
        dW2 = (1/m) * np.dot(self.a1.T, dz2)
        db2 = (1/m) * np.sum(dz2, axis=0, keepdims=True)
        dz1 = np.dot(dz2, self.W2.T) * self.relu_derivative(self.z1)
        dW1 = (1/m) * np.dot(X.T, dz1)
        db1 = (1/m) * np.sum(dz1, axis=0, keepdims=True)

        # Update parameters
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1

    def train(self, X, y, epochs=1000, learning_rate=0.01):
        """Train the network and return the list of per-epoch losses.

        Args:
            X: Input batch.
            y: Targets; 1-D and column-vector labels are both accepted.
            epochs: Number of full-batch updates.
            learning_rate: Step size passed to ``backward``.
        """
        losses = []
        for epoch in range(epochs):
            # Forward pass
            output = self.forward(X)
            targets = self._match_labels(y, output)
            # Binary cross-entropy; the 1e-15 offset keeps log() away from 0.
            loss = -np.mean(targets * np.log(output + 1e-15)
                            + (1 - targets) * np.log(1 - output + 1e-15))
            losses.append(loss)
            # Backward pass
            self.backward(X, targets, output, learning_rate)
            # Print progress
            if (epoch+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss:.4f}')
        return losses
# Generate data: label 1 for points in the first or third quadrant
np.random.seed(42)
X = np.random.randn(1000, 2)
y = np.zeros((1000, 1))
y[(X[:, 0] > 0) & (X[:, 1] > 0)] = 1
y[(X[:, 0] < 0) & (X[:, 1] < 0)] = 1
# Create and train the model.
# NOTE: the instance is deliberately not called `nn`; that name is the
# `torch.nn` alias imported earlier in this file and must not be shadowed.
scratch_net = NeuralNetwork(input_size=2, hidden_size=10, output_size=1)
losses = scratch_net.train(X, y, epochs=2000, learning_rate=0.1)
# Plot loss curve
plt.figure(figsize=(10, 5))
plt.plot(losses)
plt.title('Training Loss (Custom NN)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.show()
# Predict and evaluate: threshold the sigmoid output at 0.5
predictions = scratch_net.forward(X)
accuracy = np.mean((predictions >= 0.5) == y)
print(f'Accuracy: {accuracy:.4f}')
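Gradient descent is an optimization algorithm used to minimize the loss function in neural networks. The basic process is:
1. Run a forward pass and compute the loss for the current weights.
2. Compute the gradient of the loss with respect to each weight.
3. Move each weight a small step in the direction opposite to its gradient.
4. Repeat until the loss stops decreasing.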
The weight update rule is:
w = w - learning_rate * dL/dw
Where:
- `w` is the weight
- `learning_rate` is a hyperparameter controlling step size
- `dL/dw` is the gradient of the loss with respect to the weight
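Backpropagation is the algorithm used to efficiently calculate gradients in neural networks. It works by:
1. Running a forward pass and storing each layer's weighted inputs and activations.
2. Computing the loss at the output layer.
3. Propagating the error backward, layer by layer, using the chain rule.
4. Using the propagated error to compute the gradient of the loss for every weight and bias.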
The chain rule of calculus is the fundamental concept behind backpropagation:
dL/dw = dL/da * da/dz * dz/dw
Where:
- `L` is the loss function
- `w` is a weight
- `z` is the weighted input to a neuron
- `a` is the activation (output) of a neuron
Let's implement a simple example to illustrate backpropagation with a single hidden layer:
def manual_backprop_example():
    """Walk through one gradient-descent step on a tiny 2-2-1 sigmoid network.

    Prints the initial loss, the gradient of every parameter, the loss after
    a single update with learning rate 0.1, and whether the loss decreased.
    """

    def sigmoid(z):
        """Element-wise logistic function."""
        return 1/(1 + np.exp(-z))

    def forward(inputs, w_hidden, b_hidden, w_out, b_out):
        """Return (hidden activations, output activation) for ``inputs``."""
        hidden = sigmoid(np.dot(inputs, w_hidden) + b_hidden)
        return hidden, sigmoid(np.dot(hidden, w_out) + b_out)

    def bce(target, prob):
        """Binary cross-entropy between ``target`` and predicted ``prob``."""
        return -np.mean(target * np.log(prob) + (1 - target) * np.log(1 - prob))

    # Fixed starting parameters of the 2-2-1 network
    W1 = np.array([[0.1, 0.2], [0.3, 0.4]])  # input -> hidden
    b1 = np.array([[0.1, 0.1]])
    W2 = np.array([[0.5], [0.6]])  # hidden -> output
    b2 = np.array([[0.1]])

    # Single training example and its target
    X = np.array([[1.0, 2.0]])
    y = np.array([[1.0]])

    # Forward pass and initial loss
    A1, A2 = forward(X, W1, b1, W2, b2)
    loss = bce(y, A2)
    print(f'Loss: {loss}')

    # Backward pass, output layer first
    dZ2 = A2 - y
    dW2 = np.dot(A1.T, dZ2)
    db2 = np.sum(dZ2, axis=0, keepdims=True)
    # Hidden layer: chain through W2 and the sigmoid derivative A1 * (1 - A1)
    dZ1 = np.dot(dZ2, W2.T) * A1 * (1 - A1)
    dW1 = np.dot(X.T, dZ1)
    db1 = np.sum(dZ1, axis=0, keepdims=True)

    print(f'dW1:\n{dW1}')
    print(f'db1: {db1}')
    print(f'dW2:\n{dW2}')
    print(f'db2: {db2}')

    # One gradient-descent step; ``-=`` updates each array in place
    learning_rate = 0.1
    for param, grad in ((W1, dW1), (b1, db1), (W2, dW2), (b2, db2)):
        param -= learning_rate * grad

    # Re-evaluate the loss with the updated parameters
    _, A2_new = forward(X, W1, b1, W2, b2)
    new_loss = bce(y, A2_new)
    print(f'New Loss: {new_loss}')
    print(f'Loss decreased: {loss > new_loss}')
# Run the backpropagation example; it prints the loss before and after one
# update step, along with the gradients of every parameter
manual_backprop_example()
Let's put everything together with a practical example using the MNIST dataset:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
# Load MNIST: convert images to tensors, then normalize with mean 0.5 / std 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])
train_dataset = torchvision.datasets.MNIST(
    root='./data', train=True, download=True, transform=transform,
)
test_dataset = torchvision.datasets.MNIST(
    root='./data', train=False, download=True, transform=transform,
)
# Batches of 100; only the training set is shuffled
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=100, shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=100, shuffle=False,
)
# Define network
class MNISTNetwork(nn.Module):
    """Fully connected classifier: flatten -> Linear(784, 128) -> ReLU -> Linear(128, 10)."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(28*28, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return the 10 raw output scores for each item in the batch ``x``."""
        hidden = self.relu(self.fc1(self.flatten(x)))
        return self.fc2(hidden)
# Model, loss function, and optimizer (SGD with momentum) for the MNIST example
model = MNISTNetwork()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    params=model.parameters(),
    lr=0.01,
    momentum=0.9,
)
# Training loop
def train(epochs=5):
    """Train the module-level ``model`` on ``train_loader``.

    Uses the module-level ``criterion`` and ``optimizer``. Every 100
    mini-batches the average loss over that window is printed and recorded.

    Args:
        epochs: Number of passes over ``train_loader``.

    Returns:
        List of the recorded window-average losses, in order.
    """
    report_every = 100
    history = []

    for epoch_idx in range(epochs):
        window_loss = 0.0
        for batch_no, (inputs, labels) in enumerate(train_loader, start=1):
            # Reset gradients, then forward / backward / update
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), labels)
            batch_loss.backward()
            optimizer.step()

            window_loss += batch_loss.item()
            if batch_no % report_every != 0:
                continue

            # End of a 100-batch window: record and report its mean loss
            avg_loss = window_loss / report_every
            history.append(avg_loss)
            print(f'Epoch {epoch_idx+1}, Batch {batch_no}: Loss {avg_loss:.3f}')
            window_loss = 0.0

    print('Finished Training')
    return history
# Run three training epochs and keep the recorded losses
losses = train(epochs=3)

# Plot the training loss; each point is the average over 100 mini-batches
plt.figure(figsize=(10, 5))
plt.plot(losses)
plt.xlabel('Mini-batch (x100)')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.grid(True)
plt.show()
# Evaluate on test data
def evaluate():
    """Print the accuracy of the module-level ``model`` on ``test_loader``.

    The model is switched to evaluation mode for the duration of the pass, so
    that mode-dependent layers (if the model has any) behave as at inference
    time. Its previous training/eval mode is restored afterwards.
    """
    correct = 0
    total = 0
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for images, labels in test_loader:
                outputs = model(images)
                # Predicted class = index of the largest output score
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        model.train(was_training)
    print(f'Accuracy on test set: {100 * correct / total:.2f}%')
# Run evaluation; prints the accuracy on the test set
evaluate()
# Visualize some predictions
def visualize_predictions():
    """Show up to ten images from the first test batch with their predictions.

    Reads the module-level ``model`` and ``test_loader``. Each subplot title
    shows the predicted and the true label. The model is put in evaluation
    mode for the forward pass and its previous mode is restored afterwards.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # First batch of test images
            images, labels = next(iter(test_loader))
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
    finally:
        model.train(was_training)

    # A 2 x 5 grid holds at most ten images; show fewer if the batch is smaller.
    shown = min(10, images.size(0))
    fig = plt.figure(figsize=(12, 6))
    for i in range(shown):
        ax = fig.add_subplot(2, 5, i + 1)
        img = images[i].squeeze().numpy()
        ax.imshow(img, cmap='gray')
        ax.set_title(f'Pred: {predicted[i]}, True: {labels[i]}')
        ax.axis('off')
    plt.tight_layout()
    plt.show()
# Visualize predictions for images from the first test batch
visualize_predictions()
This tutorial covered:
1. Neural network fundamentals
2. Implementing a neural network using PyTorch
3. Building a neural network from scratch
4. Understanding gradient descent and backpropagation
Neural networks are powerful tools for machine learning, and understanding their implementation details provides valuable insights into how they learn from data.
Next steps:
- Experiment with different architectures (more layers, different sizes)
- Try different activation functions (tanh, LeakyReLU)
- Implement more advanced optimization algorithms (Adam, RMSprop)
- Explore regularization techniques to prevent overfitting