Introduction to neural nets with fastai
Understanding basics of deep learning with fastai and pytorch
- MNIST fastai walkthrough
- Imports
- Inspect data
- From images to tensors
- Customized Image Dataset
- Custom Data Loader
- Custom learner
- Custom Neural Net
- Custom optimizer
- Custom loss function
- Custom metric
- Conclusion
MNIST fastai walkthrough
The aim of this notebook is to go through and understand steps in https://github.com/fastai/fastbook/blob/master/04_mnist_basics.ipynb. In particular, how MNIST images are classified starting from getting data and ending with neural net parameter optimization. In most cases I will try to take existing fastai implementation and distil it to constituent parts using basic PyTorch operations.
import numpy as np
import pandas as pd
from pathlib import Path
from IPython.core.display import display, HTML
import math
import matplotlib.pyplot as plt
import functools
import pylab
from fastai.datasets import untar_data, URLs
from fastai.vision.image import *
from fastai.metrics import accuracy
import torch
from torch import nn
from torchvision import transforms
from torch.utils.data import DataLoader
from torch import tensor
from torch.optim import SGD
%load_ext autoreload
%autoreload 2
%matplotlib inline
# Notebook-wide display defaults: bigger figures, full-width cells, wide tables.
pylab.rcParams['figure.figsize'] = (15, 8)
pylab.rcParams['font.size'] = 10
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
# None means "never truncate column contents"; the former sentinel -1 has been
# deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
URLs.MNIST
data_path = untar_data(URLs.MNIST)
data_path
Dataset is already split into testing and training
data_path.ls()
Each has separate folders for target classes 0 - 9
Path.joinpath(data_path, 'testing').ls()
Count the images in each class folder; MNIST ships with 60,000 training and 10,000 test images, i.e. roughly an 86/14% split
# One summary string per digit: how many image files sit in the training
# folder and in the testing folder of that class.
[f"Number of images in training set {len((data_path / 'training' / str(digit)).ls())}, "
 f"test set {len((data_path / 'testing' / str(digit)).ls())}"
 for digit in range(10)]
Each image has identifier in its filename
Path.joinpath(data_path, 'training', '0').ls()[:10]
Images are of size 28 x 28 pixels, monochrome
# Build the sample path from data_path instead of a hard-coded absolute path,
# so the cell also works when the dataset is not stored under /root/.fastai.
im = PIL.Image.open(data_path / 'training' / '0' / '15559.png')
print(im.size)
im
Load the image from the previous section into a tensor and check its type and dimensions
im_tensor = transforms.ToTensor()(im)
type(im_tensor), im_tensor.shape
Load a few images from a single class into a tensor. Note that each loaded image has one extra leading (channel) dimension, so we need to reshape it to rank 2 before stacking
# First ten files of the "0" training folder, each converted to a tensor and
# reshaped to (28, 28) before being stacked along a new leading axis.
zeroes_path = (data_path / 'training' / '0').ls()[:10]
zeroes = torch.stack([
    transforms.ToTensor()(PIL.Image.open(img_file)).reshape((28, 28))
    for img_file in zeroes_path
])
Now this tensor contains first 10 images from zeroes category
print(zeroes.shape)
plt.imshow(zeroes[0])
Now concatenate tensors loaded from couple categories
# Training folders of the two classes used in this example.
class_paths = [data_path / 'training' / '0',
               data_path / 'training' / '1']
NUM_SAMPLES = 10
# Stack the first NUM_SAMPLES images of every class, then concatenate the
# per-class stacks along the first axis.
zeroes_and_ones = torch.cat([
    torch.stack([
        transforms.ToTensor()(PIL.Image.open(img_file)).reshape((28, 28))
        for img_file in class_dir.ls()[:NUM_SAMPLES]
    ])
    for class_dir in class_paths
])
Small utility to show a few images at once
import math
def show_images(ims, nrows=1, ncols=None):
    """Draw every image in `ims` on its own subplot of an nrows x ncols grid.

    ims:   sized collection of images accepted by `Axes.imshow`.
    nrows: number of grid rows.
    ncols: number of grid columns; when None it is derived from len(ims) so
           that all images fit into `nrows` rows.
    """
    if ncols is None:
        ncols = int(math.ceil(len(ims) / nrows))
    # squeeze=False always yields a 2-D array of axes. Without it a 1x1 grid
    # comes back as a bare Axes object that has no `.flat`, so showing a single
    # image would fail.
    axs = plt.subplots(nrows, ncols, squeeze=False)[1].flat
    for im, ax in zip(ims, axs):
        ax.imshow(im)
Now we have images from both classes loaded in a single tensor
print(zeroes_and_ones.shape)
show_images((zeroes_and_ones[0], zeroes_and_ones[-1]))
Inputs already look scaled, so we don't need any extra preprocessing steps
zeroes_and_ones.min(), zeroes_and_ones.max()
Reshape images into vectors, such that each image is a row / observation, that's our prepared x data
x = zeroes_and_ones.view(-1, 28 * 28)
x.shape
Extract labels from image folder path and get number of images in each
# Pair each class label (the folder name) with the number of images actually
# loaded for it. Only the first NUM_SAMPLES files per class were stacked into
# x, so counting the whole folder would give y far more entries than x has rows.
labels_sizes = [(int(path.name), len(path.ls()[:NUM_SAMPLES]))
                for path in class_paths]
labels_sizes
Repeat each label number of times specified by number of items in class
# Emit every class label once per image counted for that class.
y = tensor([label
            for label, size in labels_sizes
            for _ in range(size)])
# Compare against the number of rows in x: comparing against labels_sizes would
# always print True because y is built from those very sizes.
print(f'Target shape as expected: {y.shape[0] == x.shape[0]}')
y
class ImageDataSet:
    """Load a folder of per-class image sub-folders into flat tensors.

    `data_path` must contain one sub-folder per class whose name is the
    integer class label. Path objects are expected to provide an `.ls()`
    method that lists their entries.

    data_path:   root folder holding the class sub-folders.
    image_size:  (height, width) every image is reshaped to.
    num_samples: maximum number of images taken from each class
                 (None takes all of them).
    """

    def __init__(self, data_path, image_size=(28, 28), num_samples=None):
        self.data_path = data_path
        self.image_size = image_size
        self.num_samples = num_samples

    def _class_paths(self):
        """Return the class folders in sorted order (shared by x and y)."""
        return sorted(self.data_path.ls())

    def _class_image_paths(self, class_path):
        """Return the image files of one class, capped at num_samples."""
        return class_path.ls()[:self.num_samples]

    def load_class_images_to_tensor_list(self, class_path):
        """Return a float tensor stacking the images found in `class_path`."""
        to_tensor = transforms.ToTensor()
        images = []
        for image_path in self._class_image_paths(class_path):
            # The context manager closes the underlying file once the pixel
            # data has been converted, instead of leaving it to the GC.
            with PIL.Image.open(image_path) as image:
                images.append(to_tensor(image).reshape(self.image_size))
        return torch.stack(images).float()

    def stack_class_tensors(self):
        """Concatenate the image stacks of all classes along the first axis."""
        return torch.cat([
            self.load_class_images_to_tensor_list(class_path)
            for class_path in self._class_paths()
        ])

    @staticmethod
    def tensor_to_vector(t):
        """Flatten the last two dimensions of `t` so every image is one row."""
        return t.view(-1, t.shape[-1] * t.shape[-2])

    @property
    def x(self):
        """All images as a 2-D tensor with one flattened image per row."""
        stacked_tensor = self.stack_class_tensors()
        return self.tensor_to_vector(stacked_tensor)

    @property
    def y(self):
        """Labels matching the rows of `x`: each label repeated per image."""
        labels = []
        for class_path in self._class_paths():
            label = self._extract_label_from_path(class_path)
            labels.extend([label] * len(self._class_image_paths(class_path)))
        return tensor(labels)

    @staticmethod
    def _extract_label_from_path(path):
        """Return the integer label encoded in the last component of `path`."""
        # Path(...).name copes with trailing slashes and with path separators
        # other than '/', which a plain split('/') does not.
        return int(Path(path).name)
Use this class to create training and validation sets with 100 samples each
# Training data: up to 100 images per class from the "training" folder.
ds = ImageDataSet(data_path=data_path / 'training', num_samples=100)
train_x = ds.x
train_y = ds.y
# Validation data: up to 100 images per class from the "testing" folder.
ds_val = ImageDataSet(data_path=data_path / 'testing', num_samples=100)
valid_x = ds_val.x
valid_y = ds_val.y
Shapes and images look as expected
print(train_x.shape, train_y.shape)
print(train_y[0], train_y[-1])
show_images((train_x[0].view(28, 28), train_x[-1].view(28, 28)))
Zip x and y into single data structures
dset = list(zip(train_x, train_y))
valid_dset = list(zip(valid_x, valid_y))
Check again that everything is correct
x1, y1 = valid_dset[0]
x2, y2 = valid_dset[-1]
print(y1, y2)
show_images((x1.view(28, 28), x2.view(28, 28)))
Now let's pass these created datasets to training and validation data loaders that randomize image order and allow us to stream inputs in batches for stochastic gradient descent
import random
class DataLoader_:
    """Single-pass iterator over shuffled mini-batches of a dataset.

    dset:       iterable of (x, y) tensor pairs.
    batch_size: maximum number of samples per batch; the last batch may be
                smaller.

    Every call to `next()` returns a `(x_batch, y_batch)` pair of stacked
    tensors. The samples are shuffled once, at construction time.
    """

    def __init__(self, dset, batch_size):
        self.chunked_dset = self.chunker(dset, batch_size)

    def chunker(self, dset, batch_size):
        """Return a generator over shuffled slices of at most `batch_size` samples."""
        # Shuffle a private copy: shuffling `dset` itself would silently
        # reorder the caller's list, which is shared between several loaders.
        shuffled = list(dset)
        random.shuffle(shuffled)
        return (shuffled[idx:idx + batch_size]
                for idx in range(0, len(shuffled), batch_size))

    def __iter__(self):
        return self

    def __next__(self):
        # An exhausted generator raises StopIteration from next() on its own,
        # which is exactly what the iterator protocol requires here.
        batch = next(self.chunked_dset)
        x_b, y_b = zip(*batch)
        return torch.stack(x_b), torch.stack(y_b)
dl = DataLoader_(dset, batch_size=256)
x_b, y_b = next(dl)
x_b.shape, y_b.shape
Looks like our custom data loader is working fine so far!
plt.imshow(x_b[0].view(28, 28)), y_b[0]
Custom learner
Let's build a learner that is initialized by passing:
- dataloader with batch size
- neural net
- optimizer
- loss function
- metric
Then trains and validates the model for a specified number of epochs in the following manner:
- get batch of data from training data loader
- perform forward pass through the network
- calculate loss and its gradients
- update weights via backpropagation using optimizer
- get batch from validation data loader
- perform forward pass through the network
- calculate and print performance metric
class CustomLearner:
    """Minimal training loop tying together data, model, optimizer, loss and metric.

    dsets:      (training dataset, validation dataset) pair.
    batch_size: batch size used for both training and validation.
    model:      object exposing `forward(x_batch)`.
    optimizer:  object exposing `step()` and `zero_grad()`.
    loss_fn:    callable `(predictions, targets)` returning a tensor that
                supports `.backward()`.
    metric:     callable `(predictions, targets)` returning a scalar.
    """

    def __init__(self, dsets, batch_size, model, optimizer, loss_fn, metric):
        self.train_dset, self.valid_dset = dsets
        self.batch_size = batch_size
        self.model = model
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.metric = metric

    def train_model(self, num_epochs):
        """Run `num_epochs` training epochs, printing the validation metric after each."""
        for _ in range(num_epochs):
            self.train_epoch()
            print(self.validate_epoch())

    def train_epoch(self):
        """Do one pass over the training data with one optimizer step per batch."""
        train_dl = DataLoader_(self.train_dset, self.batch_size)
        for x_batch, y_batch in train_dl:
            self.calculate_gradient(x_batch, y_batch)
            self.optimizer.step()
            self.optimizer.zero_grad()

    def calculate_gradient(self, x_batch, y_batch):
        """Forward one batch and back-propagate its loss."""
        predictions = self.model.forward(x_batch)
        self.loss_fn(predictions, y_batch).backward()

    def validate_epoch(self):
        """Return the metric averaged over all validation samples as a float.

        Returns NaN when the validation set yields no batches.
        """
        valid_dl = DataLoader_(self.valid_dset, self.batch_size)
        weighted_sum, num_samples = 0.0, 0
        # No gradients are needed for evaluation, so skip building the graph.
        with torch.no_grad():
            for x_batch, y_batch in valid_dl:
                batch_metric = self.metric(self.model.forward(x_batch), y_batch)
                # Weight every batch by its size: the last batch is usually
                # smaller, and a plain mean of batch means would over-count it.
                weighted_sum += float(batch_metric) * len(y_batch)
                num_samples += len(y_batch)
        return weighted_sum / num_samples if num_samples else float('nan')
Setting constants that will be used for later customizations. Number of epochs, learning rate, number of activations are set to achieve fast iteration time, in real life more time should be spent finding the right values.
# Hyper-parameters shared by the training runs below.
BATCH_SIZE = 256
LEARNING_RATE = .1
NUM_CLASSES = 10
NUM_EPOCHS = 20
# Width of the layer between the two linear transformations.
NUM_ACTIVATIONS = 30
# Number of input features per image: the product of the image dimensions.
INPUT_SIZE = np.prod(ds.image_size)
# (training, validation) pair, in the order CustomLearner unpacks it.
dsets = dset, valid_dset
Set up a simple neural net with two linear layers and a ReLU non-linearity in between. The SGD optimizer is imported from torch.
simple_net = nn.Sequential(
nn.Linear(INPUT_SIZE, NUM_ACTIVATIONS),
nn.ReLU(),
nn.Linear(NUM_ACTIVATIONS, NUM_CLASSES)
)
optimizer = SGD(simple_net.parameters(), lr=LEARNING_RATE)
Could get close to 90% accuracy if run for more epochs, even with only 100 samples of each category
CustomLearner(dsets=dsets, model=simple_net, optimizer=optimizer,
loss_fn=nn.functional.cross_entropy, metric=accuracy, batch_size=BATCH_SIZE).train_model(NUM_EPOCHS)
from torch.nn.parameter import Parameter # Still need to import this to simplify weight optimization
class SimpleNet(nn.Module):
    """Two linear layers with a ReLU in between, built from raw tensor operations.

    input_size:      number of features per input row.
    num_activations: width of the intermediate layer.
    output_size:     number of outputs per row (one per class).
    """

    def __init__(self, input_size, num_activations, output_size):
        super().__init__()
        # Scale the initial values by each layer's fan-in. Unscaled randn
        # weights produce very large activations on a 784-wide input, which
        # is what makes this net behave differently from the nn.Linear one.
        self.weights_1 = self.init_params((input_size, num_activations), fan_in=input_size)
        self.bias_1 = self.init_params(num_activations, fan_in=input_size)
        self.weights_2 = self.init_params((num_activations, output_size), fan_in=num_activations)
        self.bias_2 = self.init_params(output_size, fan_in=num_activations)

    @staticmethod
    def init_params(size, fan_in=None):
        """Return a trainable Parameter of shape `size`.

        With `fan_in` given, values are drawn uniformly from
        [-1/sqrt(fan_in), 1/sqrt(fan_in)], the range nn.Linear uses by
        default; without it they are standard-normal, as before.
        """
        if fan_in is None:
            return Parameter(torch.randn(size))
        bound = 1 / math.sqrt(fan_in)
        return Parameter(torch.empty(size).uniform_(-bound, bound))

    @staticmethod
    def linear(x_batch, weight, bias):
        """Affine transformation: x_batch @ weight + bias."""
        return x_batch.matmul(weight) + bias

    @staticmethod
    def relu(x_batch):
        """Replace negative entries by zero."""
        # torch.relu avoids creating a fresh constant tensor on every call and
        # works on whatever device x_batch lives on.
        return torch.relu(x_batch)

    def forward(self, x_batch):
        """Return the raw class scores for a batch of flattened images."""
        res = self.linear(x_batch, self.weights_1, self.bias_1)
        res = self.relu(res)
        return self.linear(res, self.weights_2, self.bias_2)
The results are not the same as with the torch implementation (differences in how the weights are initialized are a likely cause), but the loss is improving, so let's leave it for now.
sn = SimpleNet(INPUT_SIZE, NUM_ACTIVATIONS, NUM_CLASSES)
optimizer = SGD(sn.parameters(), lr=LEARNING_RATE)
CustomLearner(dsets=dsets, model=sn, optimizer=optimizer,
loss_fn=nn.functional.cross_entropy, metric=accuracy, batch_size=BATCH_SIZE).train_model(NUM_EPOCHS)
class CustomSGD:
    """Plain stochastic gradient descent: p <- p - learning_rate * p.grad.

    params:        iterable of tensors to optimize (consumed once and stored).
    learning_rate: step size applied to every gradient.
    """

    def __init__(self, params, learning_rate):
        self.params = list(params)
        self.learning_rate = learning_rate

    def step(self):
        """Update every parameter that currently holds a gradient."""
        # The update itself must not be tracked by autograd.
        with torch.no_grad():
            for p in self.params:
                # A parameter that did not take part in the last backward pass
                # has no gradient; skip it instead of failing on None.
                if p.grad is None:
                    continue
                p -= p.grad * self.learning_rate

    def zero_grad(self):
        """Drop all stored gradients so the next backward pass starts fresh."""
        for p in self.params:
            p.grad = None
Results look similar
sn = SimpleNet(INPUT_SIZE, NUM_ACTIVATIONS, NUM_CLASSES)
optimizer = CustomSGD(sn.parameters(), LEARNING_RATE)
CustomLearner(dsets=dsets, model=sn, optimizer=optimizer,
loss_fn=nn.functional.cross_entropy, metric=accuracy, batch_size=BATCH_SIZE).train_model(NUM_EPOCHS)
Custom loss function
Now let's replace torch's cross entropy loss, which essentially applies log-softmax to each row of the last layer's output (recall this output has shape n_inputs × n_classes), picks the value at the target class index in each row, and takes the negative mean of those values. Also note that the softmax of a row element is $\mathrm{softmax}(x)_i = \exp(x_i) / \sum_j \exp(x_j)$, i.e. the exponentiated element divided by the sum of all exponentiated elements of that row.
Let's take one batch and pass it through our already trained network to illustrate this
x_b, y_b = next(dl)
preds = sn.forward(x_b)
print(preds.shape)
preds
def _log_softmax(input, dim):
return input.exp().div(input.exp().sum(dim).unsqueeze(1)).log()
def cross_entropy_loss(preds, target):
    """Negative mean of the log-softmax value found at each row's target index.

    preds:  2-D tensor of raw scores, one row per sample.
    target: 1-D tensor holding the target column index of every row.
    """
    log_probs = _log_softmax(preds, 1)
    # gather picks, for every row, the entry in the column named by target.
    picked = log_probs.gather(1, target[:, None])
    return -(picked.mean())
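Differences between the two log_softmax implementations are small on this batch, but in my runs using the custom softmax did not update the parameters during optimization (a naive softmax that exponentiates raw activations can overflow for large values and produce NaN losses, which would explain this)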
cross_entropy_loss(preds, y_b), nn.functional.cross_entropy(preds, y_b)
sn = SimpleNet(INPUT_SIZE, NUM_ACTIVATIONS, NUM_CLASSES)
optimizer = CustomSGD(sn.parameters(), LEARNING_RATE)
CustomLearner(dsets=dsets, model=sn, optimizer=optimizer,
loss_fn=cross_entropy_loss, metric=accuracy, batch_size=BATCH_SIZE).train_model(NUM_EPOCHS)
But using torch log_softmax works fine
def cross_entropy_loss(preds, target):
    """Cross entropy built on torch's own log_softmax.

    preds:  2-D tensor of raw scores, one row per sample.
    target: 1-D tensor holding the target column index of every row.
    Returns the negative mean of the log-softmax values at the target indices.
    """
    log_probs = preds.log_softmax(1)
    # gather picks, for every row, the entry in the column named by target.
    target_log_probs = log_probs.gather(1, target[:, None])
    return -(target_log_probs.mean())
sn = SimpleNet(INPUT_SIZE, NUM_ACTIVATIONS, NUM_CLASSES)
optimizer = CustomSGD(sn.parameters(), LEARNING_RATE)
CustomLearner(dsets=dsets, model=sn, optimizer=optimizer,
loss_fn=cross_entropy_loss, metric=accuracy, batch_size=BATCH_SIZE).train_model(NUM_EPOCHS)
def custom_accuracy(preds, target):
    """Fraction of rows whose highest-scoring column equals the target index.

    preds:  2-D tensor of scores, one row per sample.
    target: 1-D tensor of target column indices.
    """
    predicted = preds.argmax(1).unsqueeze(1)
    expected = target.unsqueeze(1)
    hits = predicted.eq(expected)
    return hits.float().mean()
sn = SimpleNet(INPUT_SIZE, NUM_ACTIVATIONS, NUM_CLASSES)
optimizer = CustomSGD(sn.parameters(), LEARNING_RATE)
CustomLearner(dsets=dsets, model=sn, optimizer=optimizer,
loss_fn=cross_entropy_loss, metric=custom_accuracy, batch_size=BATCH_SIZE).train_model(NUM_EPOCHS)
Conclusion
Although there are some bits that didn't work 100% as expected, this has helped me to better understand what's happening behind the curtains of fastai and torch and more generally, the basics of how simple neural networks work. I urge everyone to check Jeremy Howard's fantastic lectures on https://www.fast.ai/