# loading the relevant libraries
import torch
import torch.nn as nn 
import torchvision
import torchvision.transforms as transforms
# Define the relevant variables (training hyperparameters)
batch_size = 64        # samples per mini-batch handed out by the data loaders
num_classes = 10       # size of the network's output layer (one score per digit class)
learning_rate = 0.001  # step size passed to the optimizer
num_epochs = 10        # number of full passes over the training loader

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Architecture
To understand LeNet-5 in detail, let's go through each component of the architecture.

The input to the LeNet-5 architecture is a greyscale image of size 32x32. Because the images are greyscale, the number of input channels is 1.
LeNet-5 has 5 layers: 3 convolutional layers combined with average pooling, followed by 2 fully connected layers. The first convolutional layer has 6 filters of size 5x5. It decreases the width and height of the image while increasing its depth (number of channels), so the output is 28x28x6.
Here is a simplified version of the architecture.
The spatial size of the output is calculated using ((W - F + 2P) / S) + 1, where:
* W is the input volume size
* F is the size of the filter
* P is the amount of padding applied
* S is the stride
Here W = 32, F = 5, P = 0 and S = 1, and the output depth is equal to the number of filters applied, i.e. 6.
Applying the formula gives ((32 - 5 + 2*0) / 1) + 1 = 28, so the output volume is 28x28x6.
After this, pooling is applied to halve the feature map, giving 14x14x6. The same 5x5 filter size, now with 16 filters, is then applied to this output, giving 10x10x16, followed by another pooling layer. This reduces the output feature map to 5x5x16.
After this, a convolutional layer with 120 filters of size 5x5 is applied, which reduces the 5x5x16 feature map to 120 values. Then comes the first fully connected layer, with 84 neurons. Finally, we have the output layer with 10 output neurons, since the MNIST data has 10 classes, one for each of the digits 0-9.
| Layer | #Filters/Neurons | Filter Size | Stride | Size of Feature Map | Activation Function |
|---|---|---|---|---|---|
| input | 1 | - | - | 32x32x1 | - |
| conv 1 | 6 | 5x5 | 1 | 28x28x6 | ReLU |
| Average Pooling | 6 | 2x2 | 2 | 14x14x6 | - |
| conv 2 | 16 | 5x5 | 1 | 10x10x16 | ReLU |
| Average Pooling | 16 | 2x2 | 2 | 5x5x16 | - |
| conv 3 | 120 | 5x5 | 1 | 120 | ReLU |
| FC | - | - | - | 84 | ReLU |
| FC | - | - | - | 10 | softmax |
In the original paper, sigmoid is used as the activation function. Here we replace it with ReLU, the most popular choice in recent years.
Implementation
Now that we have understood the architecture, let’s implement it.
- Dataset Understanding
 
- LeNet from Scratch
 
- Setting Hyperparameters
 
- Model Training
 
- Model Evaluation
Dataset Understanding
Here we will be using the famous MNIST dataset, which contains handwritten digits. The images are greyscale with a size of 28x28, and the dataset is composed of 60,000 training images and 10,000 testing images.
Importing Libraries
Loading the Dataset
Using the torchvision library we will load the dataset.
The MNIST data can’t be used as it is for the LeNet5 architecture. The LeNet5 architecture accepts the input to be 32x32 and the MNIST images are 28x28. We can fix this by resizing the images, normalizing them using the pre-calculated mean and standard deviation (available online), and finally storing them as tensors.
# Loading the dataset and preprocessing
# Normalization statistics: these are the values the original code applied to
# the training split. The same values are now applied to the test split too, so
# the network is evaluated on inputs scaled exactly like the ones it was
# trained on.
mnist_mean = (0.1307,)
mnist_std = (0.3081,)

# One shared pipeline for both splits: resize the images to the 32x32 input
# that LeNet-5 expects, convert them to tensors, then normalize.
preprocess = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mnist_mean, std=mnist_std),
])

# Both splits are downloaded into ./data on first use.
train_dataset = torchvision.datasets.MNIST(root='./data',
                                           train=True,
                                           transform=preprocess,
                                           download=True)
test_dataset = torchvision.datasets.MNIST(root='./data',
                                          train=False,
                                          transform=preprocess,
                                          download=True)

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
# Accuracy does not depend on sample order, so the test split is not shuffled;
# this also keeps evaluation runs reproducible.
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)
LeNet5 from Scratch
# Defining the convolutional neural network
class LeNet5(nn.Module):
    """LeNet-5 style convolutional network for single-channel images.

    Two convolution stages (5x5 convolution -> batch norm -> ReLU -> 2x2 max
    pooling) feed three fully connected layers (400 -> 120 -> 84 ->
    ``num_classes``). Note that this implementation uses batch normalization
    and max pooling, and replaces the third 5x5 convolution with a linear
    layer, rather than the average-pooling layout of the classic design.

    The first linear layer expects 400 = 16 * 5 * 5 features, which is what
    the two convolution stages produce for a 32x32 input.

    Args:
        num_classes: Number of scores the final layer outputs per image.
    """

    def __init__(self, num_classes: int) -> None:
        super().__init__()
        # For a 32x32 input: 1x32x32 -> conv -> 6x28x28 -> pool -> 6x14x14
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0),
            nn.BatchNorm2d(6),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # 6x14x14 -> conv -> 16x10x10 -> pool -> 16x5x5
        self.layer2 = nn.Sequential(
            nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # Classifier head; attribute names are kept unchanged so existing
        # state_dict checkpoints still load.
        self.fc = nn.Linear(400, 120)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(120, 84)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(84, num_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return one row of ``num_classes`` raw scores per input image.

        No softmax is applied here; the output of the last linear layer is
        returned as is.
        """
        out = self.layer1(x)
        out = self.layer2(out)
        # Collapse every dimension except the batch one before the linear layers.
        out = out.reshape(out.size(0), -1)
        out = self.relu(self.fc(out))
        out = self.relu1(self.fc1(out))
        out = self.fc2(out)
        return out


# Setting up hyperparameters
# Build the network and move its parameters to the selected device
model = LeNet5(num_classes).to(device)
# Setting the loss function
cost = nn.CrossEntropyLoss()
# Setting the optimizer with the model parameters and learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Number of batches per epoch; used to print how many steps are remaining when training
total_step = len(train_loader)

# Model Training
# Number of batches per epoch, shown in the progress log.
total_step = len(train_loader)
for epoch in range(num_epochs):
    # The network contains BatchNorm layers, so make sure it is in training
    # mode (batch statistics, running averages updated) at the start of every
    # epoch, even if it was switched to eval mode earlier.
    model.train()
    for step, (images, labels) in enumerate(train_loader, start=1):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = cost(outputs, labels)

        # Backward and optimize: clear old gradients, backpropagate, update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the loss of the current batch every 400 steps.
        if step % 400 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, num_epochs, step, total_step, loss.item()))

# Output of the original run (the log continues on the following lines):
# Epoch [1/10], Step [400/938], Loss: 0.1018
Epoch [1/10], Step [800/938], Loss: 0.0178
Epoch [2/10], Step [400/938], Loss: 0.0299
Epoch [2/10], Step [800/938], Loss: 0.0044
Epoch [3/10], Step [400/938], Loss: 0.0335
Epoch [3/10], Step [800/938], Loss: 0.0365
Epoch [4/10], Step [400/938], Loss: 0.0217
Epoch [4/10], Step [800/938], Loss: 0.0110
Epoch [5/10], Step [400/938], Loss: 0.0063
Epoch [5/10], Step [800/938], Loss: 0.0620
Epoch [6/10], Step [400/938], Loss: 0.0178
Epoch [6/10], Step [800/938], Loss: 0.0588
Epoch [7/10], Step [400/938], Loss: 0.0092
Epoch [7/10], Step [800/938], Loss: 0.0120
Epoch [8/10], Step [400/938], Loss: 0.0048
Epoch [8/10], Step [800/938], Loss: 0.0455
Epoch [9/10], Step [400/938], Loss: 0.0067
Epoch [9/10], Step [800/938], Loss: 0.0589
Epoch [10/10], Step [400/938], Loss: 0.0015
Epoch [10/10], Step [800/938], Loss: 0.0030
As we can see, the loss trends downward over the epochs (individual batch values fluctuate), which shows that our model is indeed learning.
Model Testing
# Test the model
# Put the network in evaluation mode first: its BatchNorm layers then use the
# running statistics collected during training instead of per-batch statistics,
# and those statistics are no longer updated by the test data.
model.eval()

# In test phase, we don't need to compute gradients (for memory efficiency)
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # The predicted class is the index of the highest score in each row.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))
     Accuracy of the network on the 10000 test images: 98.97 %
Using this model, we get around 98.97% accuracy, which is quite good.
Conclusions
- We understood the architecture of LeNet5.
- Then we built LeNet5 from scratch along with defining hyperparameters for the model.
 
- Finally, we trained and tested our model on the MNIST dataset, and the model seemed to perform well on the test dataset.