This tutorial covers the fundamentals of neural networks, including:

- Basic concepts and architecture
- Implementation using PyTorch
- Implementation from scratch with Python
- Understanding gradient descent and backpropagation

1. Neural Network Fundamentals

Neural networks are computational models inspired by the human brain. They consist of:

- Neurons: Basic processing units
- Connections: Links between neurons with associated weights
- Layers: Organized groups of neurons (input, hidden, output)
- Activation Functions: Non-linear transformations applied to neuron outputs

Basic Architecture

A simple neural network consists of:

1. Input Layer: Receives the input data
2. Hidden Layer(s): Performs computations and feature extraction
3. Output Layer: Produces the final result

Each neuron computes a weighted sum of its inputs, applies an activation function, and passes the result to the next layer.

2. Implementing a Neural Network with PyTorch

PyTorch is a popular deep learning framework that provides tools for building and training neural networks efficiently.

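Setting Up

The code in this section assumes the following imports: `torch`, `torch.nn as nn`, `torch.nn.functional as F`, `torch.optim as optim`, `numpy as np`, and `matplotlib.pyplot as plt`.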

Creating a Simple Neural Network

class SimpleNN(nn.Module):
    """Feed-forward network with one hidden layer.

    The input goes through ``fc1``, a ReLU, and then ``fc2``. The value
    returned by ``forward`` is the raw output of ``fc2`` with no activation
    applied.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the two linear layers.

        Args:
            input_size: Number of input features.
            hidden_size: Number of units in the hidden layer.
            output_size: Number of output values.
        """
        super().__init__()
        # input -> hidden
        self.fc1 = nn.Linear(input_size, hidden_size)
        # hidden -> output
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the un-activated output of the second layer for ``x``."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

Generating Synthetic Data

def generate_data(samples=1000):
    """Build a synthetic two-class dataset of 2-D points.

    Points are drawn with ``np.random.randn``. A point is labelled 1 when
    both coordinates are strictly positive (first quadrant) or both strictly
    negative (third quadrant); every other point is labelled 0.

    Args:
        samples: Number of points to generate.

    Returns:
        A pair ``(X, y)`` of float tensors with shapes ``(samples, 2)`` and
        ``(samples, 1)``.
    """
    points = np.random.randn(samples, 2)
    first, second = points[:, 0], points[:, 1]

    # Class 1 is "both coordinates share a (non-zero) sign"
    same_sign = ((first > 0) & (second > 0)) | ((first < 0) & (second < 0))
    labels = same_sign.astype(np.float64)

    # Labels become a column so they line up with a single-output network
    features_tensor = torch.FloatTensor(points)
    labels_tensor = torch.FloatTensor(labels).reshape(-1, 1)
    return features_tensor, labels_tensor

# Generate training data: 1000 labelled 2-D points as an (inputs, labels) tensor pair
X_train, y_train = generate_data(1000)

Training the Network

def train_model(model, X_train, y_train, epochs=1000, lr=0.01):
    """Fit ``model`` with plain SGD on the full training set.

    Each epoch is a single full-batch step: forward pass, binary
    cross-entropy on the logits, backward pass, parameter update. Progress
    is printed every 100 epochs.

    Args:
        model: Module whose output is fed to ``BCEWithLogitsLoss``.
        X_train: Training inputs passed to ``model``.
        y_train: Targets compared against the model output.
        epochs: Number of full-batch updates.
        lr: SGD learning rate.

    Returns:
        List with the loss value (a float) recorded at every epoch.
    """
    # The loss works on raw logits, so the model needs no final sigmoid
    loss_fn = nn.BCEWithLogitsLoss()
    sgd = optim.SGD(model.parameters(), lr=lr)
    history = []

    for step in range(1, epochs + 1):
        # Clear gradients left over from the previous step before accumulating new ones
        sgd.zero_grad()
        batch_loss = loss_fn(model(X_train), y_train)
        batch_loss.backward()
        sgd.step()

        loss_value = batch_loss.item()
        history.append(loss_value)

        if step % 100 == 0:
            print(f'Epoch [{step}/{epochs}], Loss: {loss_value:.4f}')

    return history

# Initialize the model
input_size = 2    # generate_data produces two features per point
hidden_size = 10  # width of the single hidden layer
output_size = 1   # one logit per point, matching the (samples, 1) label column
model = SimpleNN(input_size, hidden_size, output_size)

# Train the model with train_model's defaults (epochs=1000, lr=0.01);
# the return value is the per-epoch loss history
losses = train_model(model, X_train, y_train)

# Plot the loss curve (one point per epoch)
plt.figure(figsize=(10, 5))
plt.plot(losses)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.show()

Visualizing the Decision Boundary

def plot_decision_boundary(model, X, y):
    """Plot the model's predicted probability over the feature plane with the data on top.

    A mesh grid with step 0.01 is laid over the range of ``X`` (padded by 1
    on each side). The model is evaluated on every grid point, its output is
    passed through a sigmoid, and the result is drawn as filled contours
    underneath a scatter plot of the data.

    Args:
        model: Callable taking a float tensor of grid points with two
            columns and returning one logit per point.
        X: Array of points; column 0 is plotted on the x-axis and column 1
            on the y-axis.
        y: Labels used to colour the scatter plot (flattened with
            ``reshape(-1)``).
    """
    # Create a mesh grid covering the data with a margin of 1 on every side
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01),
                         np.arange(y_min, y_max, 0.01))

    # Get predictions for all grid points. This is inference only, so run it
    # under no_grad: otherwise autograd records a graph for every grid point.
    grid = torch.FloatTensor(np.c_[xx.ravel(), yy.ravel()])
    with torch.no_grad():
        Z = torch.sigmoid(model(grid))
    Z = Z.detach().numpy().reshape(xx.shape)

    # Plot decision boundary and data points
    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdBu)
    plt.scatter(X[:, 0], X[:, 1], c=y.reshape(-1), edgecolors='k', cmap=plt.cm.RdBu)
    plt.title('Decision Boundary')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()

# Visualize the decision boundary
# Convert the training tensors to NumPy arrays first: the plotting helper
# builds its mesh grid and scatter plot from array data
X_np = X_train.numpy()
y_np = y_train.numpy()
plot_decision_boundary(model, X_np, y_np)

3. Building a Neural Network from Scratch

Now, let's implement a neural network using only basic Python functions and NumPy.

Implementation

import numpy as np

class NeuralNetwork:
    """Two-layer network (ReLU hidden layer, sigmoid output) trained by batch gradient descent.

    Parameters are stored as ``W1``/``b1`` (input -> hidden) and ``W2``/``b2``
    (hidden -> output). ``forward`` caches the intermediate values
    ``z1``, ``a1``, ``z2`` and ``a2`` on the instance so that ``backward`` can
    reuse them.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Initialize weights with small random values and biases with zeros."""
        # Small random weights break the symmetry between hidden units
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

    @staticmethod
    def _align_targets(y, output):
        """Return ``y`` as an array shaped like ``output``.

        Without this, a 1-D label vector of length m would broadcast against
        the (m, 1) network output into an (m, m) matrix instead of being
        compared element by element.

        Raises:
            ValueError: If ``y`` does not hold one value per network output.
        """
        y = np.asarray(y)
        if y.size != output.size:
            raise ValueError(
                f'y has {y.size} elements but the network produced {output.size} outputs'
            )
        return y.reshape(output.shape)

    def relu(self, x):
        """ReLU activation function"""
        return np.maximum(0, x)

    def relu_derivative(self, x):
        """Derivative of ReLU"""
        return np.where(x > 0, 1, 0)

    def sigmoid(self, x):
        """Sigmoid activation function"""
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))  # Clip to avoid overflow

    def sigmoid_derivative(self, x):
        """Derivative of sigmoid"""
        s = self.sigmoid(x)
        return s * (1 - s)

    def forward(self, X):
        """Forward propagation.

        Stores the pre-activations (``z1``, ``z2``) and activations
        (``a1``, ``a2``) on the instance and returns ``a2``, the sigmoid
        output of the last layer.
        """
        # First layer
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.relu(self.z1)

        # Output layer
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.sigmoid(self.z2)

        return self.a2

    def backward(self, X, y, output, learning_rate=0.01):
        """Backward propagation followed by one gradient-descent update.

        Must be called after ``forward`` on the same ``X`` because it reads
        the cached ``a1`` and ``z1``.

        Args:
            X: Inputs that produced ``output``.
            y: Targets; reshaped to match ``output``, so a flat label vector
                is accepted.
            output: Result of ``forward(X)``.
            learning_rate: Step size of the parameter update.

        Raises:
            ValueError: If ``y`` does not hold one value per network output.
        """
        m = X.shape[0]
        y = self._align_targets(y, output)

        # Calculate gradients.
        # For a sigmoid output with binary cross-entropy, dL/dz2 is output - y.
        dz2 = output - y
        dW2 = (1/m) * np.dot(self.a1.T, dz2)
        db2 = (1/m) * np.sum(dz2, axis=0, keepdims=True)

        dz1 = np.dot(dz2, self.W2.T) * self.relu_derivative(self.z1)
        dW1 = (1/m) * np.dot(X.T, dz1)
        db1 = (1/m) * np.sum(dz1, axis=0, keepdims=True)

        # Update parameters
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1

    def train(self, X, y, epochs=1000, learning_rate=0.01):
        """Train the neural network on the full batch.

        Args:
            X: Training inputs.
            y: Training targets; reshaped to match the network output, so a
                flat label vector is accepted.
            epochs: Number of full-batch updates.
            learning_rate: Step size of each update.

        Returns:
            List with the binary cross-entropy loss of every epoch, measured
            before that epoch's update.

        Raises:
            ValueError: If ``y`` does not hold one value per network output.
        """
        losses = []

        for epoch in range(epochs):
            # Forward pass
            output = self.forward(X)
            targets = self._align_targets(y, output)

            # Calculate loss (binary cross-entropy); the 1e-15 keeps log() away from zero
            loss = -np.mean(targets * np.log(output + 1e-15)
                            + (1 - targets) * np.log(1 - output + 1e-15))
            losses.append(loss)

            # Backward pass
            self.backward(X, targets, output, learning_rate)

            # Print progress
            if (epoch+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss:.4f}')

        return losses

Using Our Custom Neural Network

# Generate data: points in the first and third quadrants are class 1
np.random.seed(42)  # fixed seed so the data and the weight initialization are reproducible
X = np.random.randn(1000, 2)
y = np.zeros((1000, 1))
y[(X[:, 0] > 0) & (X[:, 1] > 0)] = 1
y[(X[:, 0] < 0) & (X[:, 1] < 0)] = 1

# Create and train the model.
# The instance is called custom_nn rather than nn so that it does not shadow
# the `nn` alias for torch.nn that the PyTorch code in this tutorial relies on.
custom_nn = NeuralNetwork(input_size=2, hidden_size=10, output_size=1)
losses = custom_nn.train(X, y, epochs=2000, learning_rate=0.1)

# Plot loss curve
plt.figure(figsize=(10, 5))
plt.plot(losses)
plt.title('Training Loss (Custom NN)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.show()

# Predict and evaluate: a sigmoid output of at least 0.5 counts as class 1
predictions = custom_nn.forward(X)
accuracy = np.mean((predictions >= 0.5) == y)
print(f'Accuracy: {accuracy:.4f}')

4. Gradient Descent and Backpropagation

Gradient Descent

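Gradient descent is an optimization algorithm used to minimize the loss function in neural networks. The basic process is to repeatedly move each weight a small step in the direction that reduces the loss:

$$w \leftarrow w - \text{learning\_rate} \cdot \frac{\partial L}{\partial w}$$

Where:

- w is the weight
- learning_rate is a hyperparameter controlling step size
- dL/dw (written $\partial L / \partial w$ above) is the gradient of the loss with respect to the weight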

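Types of Gradient Descent:

- Batch gradient descent: computes the gradient on the entire training set for every update (this is what `train_model` and the from-scratch network above do)
- Stochastic gradient descent (SGD): updates the weights after every single training example
- Mini-batch gradient descent: updates the weights after each small batch of examples (the MNIST example below uses batches of 100)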

Backpropagation

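Backpropagation is the algorithm used to efficiently calculate gradients in neural networks. It works by:

1. Running a forward pass and storing each layer's weighted input z and activation a
2. Computing the loss L at the output
3. Propagating the gradient backward, layer by layer, using the chain rule:

$$\frac{\partial L}{\partial w} = \frac{\partial L}{\partial a} \cdot \frac{\partial a}{\partial z} \cdot \frac{\partial z}{\partial w}$$

Where:

- L is the loss function
- w is a weight
- z is the weighted input to a neuron
- a is the activation (output) of a neuron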

Let's implement a simple example to illustrate backpropagation with a single hidden layer:

def manual_backprop_example():
    """Walk through one training step of a 2-2-1 sigmoid network by hand.

    Runs a forward pass on a single sample, prints the binary cross-entropy
    loss, computes and prints every gradient, applies one gradient-descent
    update with learning rate 0.1, then prints the new loss and whether it
    decreased. Returns nothing.
    """

    def sigmoid(z):
        # Logistic function, applied element-wise
        return 1/(1 + np.exp(-z))

    def cross_entropy(target, prob):
        # Binary cross-entropy averaged over all entries
        return -np.mean(target * np.log(prob) + (1 - target) * np.log(1 - prob))

    def forward(inputs, w_hid, b_hid, w_out, b_out):
        # Returns (hidden activations, output activation)
        hidden_act = sigmoid(np.dot(inputs, w_hid) + b_hid)
        return hidden_act, sigmoid(np.dot(hidden_act, w_out) + b_out)

    # Parameters of the 2-2-1 network
    w_hid = np.array([[0.1, 0.2], [0.3, 0.4]])  # input -> hidden
    b_hid = np.array([[0.1, 0.1]])
    w_out = np.array([[0.5], [0.6]])  # hidden -> output
    b_out = np.array([[0.1]])

    # One training sample and its target
    sample = np.array([[1.0, 2.0]])
    target = np.array([[1.0]])

    # Forward pass and initial loss
    hidden, prediction = forward(sample, w_hid, b_hid, w_out, b_out)
    loss = cross_entropy(target, prediction)
    print(f'Loss: {loss}')

    # Backward pass, output layer: with sigmoid + cross-entropy the gradient
    # with respect to the output pre-activation is prediction - target
    delta_out = prediction - target
    grad_w_out = np.dot(hidden.T, delta_out)
    grad_b_out = np.sum(delta_out, axis=0, keepdims=True)

    # Backward pass, hidden layer: hidden * (1 - hidden) is the sigmoid derivative
    delta_hid = np.dot(delta_out, w_out.T) * hidden * (1 - hidden)
    grad_w_hid = np.dot(sample.T, delta_hid)
    grad_b_hid = np.sum(delta_hid, axis=0, keepdims=True)

    print(f'dW1:\n{grad_w_hid}')
    print(f'db1: {grad_b_hid}')
    print(f'dW2:\n{grad_w_out}')
    print(f'db2: {grad_b_out}')

    # One gradient-descent step; "-=" updates each array in place
    step_size = 0.1
    updates = (
        (w_hid, grad_w_hid),
        (b_hid, grad_b_hid),
        (w_out, grad_w_out),
        (b_out, grad_b_out),
    )
    for param, grad in updates:
        param -= step_size * grad

    # Re-run the forward pass with the updated parameters
    _, new_prediction = forward(sample, w_hid, b_hid, w_out, b_out)
    new_loss = cross_entropy(target, new_prediction)
    print(f'New Loss: {new_loss}')
    print(f'Loss decreased: {loss > new_loss}')

# Run the backpropagation example: prints the loss, the four gradients,
# and the loss after one update
manual_backprop_example()

5. Complete Working Example

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

# Load MNIST dataset
# Preprocessing: convert each image to a tensor, then normalize its single
# channel with mean 0.5 and std 0.5, i.e. (x - 0.5) / 0.5
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

# Training and test splits, stored under ./data (download=True requests a download)
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Batches of 100 images; only the training data is shuffled
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=False)

# Define network
class MNISTNetwork(nn.Module):
    """Fully connected classifier for 28x28 images with 10 output classes.

    The input is flattened, passed through a 128-unit linear layer and a
    ReLU, and mapped to 10 output scores by a second linear layer. No
    softmax is applied in ``forward``.
    """

    def __init__(self):
        """Create the flatten, linear and ReLU layers."""
        super().__init__()
        self.flatten = nn.Flatten()
        # 28 * 28 pixels in, 128 hidden units out
        self.fc1 = nn.Linear(28 * 28, 128)
        self.relu = nn.ReLU()
        # 128 hidden units in, one score per class out
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return the 10 raw class scores for every image in ``x``."""
        hidden = self.relu(self.fc1(self.flatten(x)))
        return self.fc2(hidden)

# Initialize model, loss function, and optimizer
model = MNISTNetwork()
# CrossEntropyLoss is applied to the model's raw class scores
# (MNISTNetwork.forward applies no softmax)
criterion = nn.CrossEntropyLoss()
# SGD with learning rate 0.01 and momentum 0.9
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Training loop
def train(epochs=5):
    """Train the module-level ``model`` on ``train_loader``.

    Uses the module-level ``criterion`` and ``optimizer``. After every 100
    mini-batches the average loss of those 100 batches is printed and
    recorded; the running total restarts at the beginning of each epoch.

    Args:
        epochs: Number of passes over ``train_loader``.

    Returns:
        List of the recorded 100-batch average losses, in order.
    """
    report_every = 100
    history = []

    for epoch in range(epochs):
        window_total = 0.0
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            # Reset gradients, then forward, backward, update
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), labels)
            batch_loss.backward()
            optimizer.step()

            window_total += batch_loss.item()
            batches_seen = batch_idx + 1
            if batches_seen % report_every != 0:
                continue

            # End of a 100-batch window: report its mean and start a new window
            avg_loss = window_total / report_every
            history.append(avg_loss)
            print(f'Epoch {epoch+1}, Batch {batches_seen}: Loss {avg_loss:.3f}')
            window_total = 0.0

    print('Finished Training')
    return history

# Train the model for 3 epochs (overriding train's default of 5)
losses = train(epochs=3)

# Plot the training loss.
# train() records one averaged value per 100 mini-batches, hence the x-axis unit
plt.figure(figsize=(10, 5))
plt.plot(losses)
plt.title('Training Loss')
plt.xlabel('Mini-batch (x100)')
plt.ylabel('Loss')
plt.grid(True)
plt.show()

# Evaluate on test data
def evaluate():
    """Print the accuracy of the module-level ``model`` on ``test_loader``.

    For every test batch the predicted class is the index of the largest
    output score. The share of predictions equal to the labels is printed
    as a percentage with two decimals. Returns nothing.
    """
    hits = 0
    seen = 0
    # Evaluation needs no gradients
    with torch.no_grad():
        for images, labels in test_loader:
            predicted = model(images).max(dim=1).indices
            seen += labels.size(0)
            hits += (predicted == labels).sum().item()

    print(f'Accuracy on test set: {100 * hits / seen:.2f}%')

# Run evaluation: prints the accuracy on the test set
evaluate()

# Visualize some predictions
def visualize_predictions():
    """Show the first 10 images of the first test batch with predicted and true labels.

    Takes one batch from ``test_loader``, runs the module-level ``model`` on
    it, and draws the first 10 images in a 2x5 grid. Each title shows the
    predicted class (index of the largest output score) and the true label.
    """
    with torch.no_grad():
        # First batch of the test loader
        images, labels = next(iter(test_loader))

        # Predicted class per image
        predicted = model(images).max(dim=1).indices

        fig = plt.figure(figsize=(12, 6))
        for idx in range(10):
            ax = fig.add_subplot(2, 5, idx + 1)
            # squeeze() drops size-1 dimensions so imshow receives a 2-D image
            ax.imshow(images[idx].squeeze().numpy(), cmap='gray')
            ax.set_title(f'Pred: {predicted[idx]}, True: {labels[idx]}')
            ax.axis('off')
        plt.tight_layout()
        plt.show()

# Visualize predictions for the first 10 test images
visualize_predictions()

Conclusion

This tutorial covered:

1. Neural Network fundamentals
2. Implementing a neural network using PyTorch
3. Building a neural network from scratch
4. Understanding gradient descent and backpropagation

Neural networks are powerful tools for machine learning, and understanding their implementation details provides valuable insights into how they learn from data.

Next steps:

- Experiment with different architectures (more layers, different sizes)
- Try different activation functions (tanh, LeakyReLU)
- Implement more advanced optimization algorithms (Adam, RMSprop)
- Explore regularization techniques to prevent overfitting

Neural Networks Tutorial