Object detection with YOLO-v1 (You Only Look Once) explained in detail!
A complete guide to how the YOLOv1 algorithm works
Object detection
Unlike image classification, whose objective is only to identify which objects appear in an image, object detection is a computer vision technique that both identifies and localizes the objects in an image.
What is YOLO v1?
YOLOv1 is an approach to object detection introduced in 2016 in which a single neural network predicts bounding boxes and class probabilities directly from the full image in a single evaluation. This differs from two-stage detectors such as the R-CNN family, where a first stage proposes ROIs (regions of interest) and a second stage classifies these regions. This unified design makes YOLO much faster than R-CNN while still reaching high accuracy.
How does it work?
YOLO divides the input image into an SxS grid. If the center of an object falls into a grid cell, that cell is responsible for detecting it. Each grid cell predicts B bounding boxes and a confidence score for each of them. The confidence score reflects how confident the model is that the box contains an object, regardless of its category. Each bounding box therefore consists of 5 predictions: x, y, w, h and the confidence score, where (x, y) is the box center expressed relative to the grid cell and (w, h) are expressed relative to the entire image.
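To make this encoding concrete, here is a minimal sketch (not part of the original implementation; the helper name encode_box is hypothetical) of how a ground-truth box given in image-relative coordinates is assigned to its grid cell:

def encode_box(x, y, w, h, S=7):
    """Map an image-relative box (all values in [0, 1]) to its grid cell.

    Returns the responsible cell (row, col) and the cell-relative center;
    w and h stay relative to the whole image, as in YOLOv1.
    """
    col, row = int(S * x), int(S * y)  # cell containing the box center
    x_cell = S * x - col               # center offset inside that cell
    y_cell = S * y - row
    return (row, col), (x_cell, y_cell, w, h)

# Example: a box centered at (0.5, 0.5) falls into cell (3, 3) of a 7x7 grid.
print(encode_box(0.5, 0.5, 0.2, 0.3))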
The architecture of YOLO-v1
The YOLO-v1 model has 24 convolutional layers followed by 2 fully connected layers, as shown in the next figure:
The model divides a given image into an SxS grid with S = 7 and predicts two bounding boxes per cell, so the total number of bounding boxes predicted for an image is SxSx2 = 98. The output tensor has shape 7x7x30: for each cell, the 30 values are the probabilities of the 20 classes of the PASCAL VOC dataset plus the x, y, width, height and confidence of each of the two predicted bounding boxes (20 + 2x5 = 30).
import torch
import torch.nn as nn
architecture_config = [
# Tuple = (kernel_size, num_filters, stride, padding)
(7, 64, 2, 3),
"M",
(3, 192, 1, 1),
"M",
(1, 128, 1, 0),
(3, 256, 1, 1),
(1, 256, 1, 0),
(3, 512, 1, 1),
"M",
    # List = [conv1_tuple, conv2_tuple, num_repeats]
[(1, 256, 1, 0), (3, 512, 1, 1), 4],
(1, 512, 1, 0),
(3, 1024, 1, 1),
"M",
[(1, 512, 1, 0), (3, 1024, 1, 1), 2],
(3, 1024, 1, 1),
(3, 1024, 2, 1),
(3, 1024, 1, 1),
(3, 1024, 1, 1),
]
class CNNBlock(nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
super().__init__()
self.conv = nn.Conv2d(in_channels=in_channels,
out_channels=out_channels, bias=False, **kwargs)
self.bn = nn.BatchNorm2d(out_channels)
self.leaky = nn.LeakyReLU(0.1)
def forward(self, x):
return self.leaky(self.bn(self.conv(x)))
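# Quick shape check (illustrative): the first block of the architecture
# (7x7 conv, 64 filters, stride 2, padding 3) halves the spatial resolution
# of a 448x448 RGB input.
#
#   block = CNNBlock(3, 64, kernel_size=7, stride=2, padding=3)
#   block(torch.randn(1, 3, 448, 448)).shape  # torch.Size([1, 64, 224, 224])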
class YOLOv1(nn.Module):
def __init__(self, in_channels=3, **kwargs):
super().__init__()
self.in_channels = in_channels
self.architecture = architecture_config
self.darknet = self._create_darknet(self.architecture)
self.fc = self._create_fc(**kwargs)
def forward(self, x):
x = self.darknet(x)
return self.fc(torch.flatten(x, start_dim=1))
def _create_darknet(self, architecture):
layers = []
inChannels = self.in_channels
for x in architecture:
if type(x) == tuple:
outChannels = x[1]
kernel_size = x[0]
stride = x[2]
padding = x[3]
layers += [CNNBlock(in_channels=inChannels,
out_channels=outChannels, kernel_size=kernel_size,
stride = stride, padding = padding)]
inChannels = x[1]
elif type(x) == str:
layers += [nn.MaxPool2d(kernel_size=(2,2), stride=(2,2))]
elif type(x) == list:
conv1 = x[0]
conv2 = x[1]
num_repeats = x[2]
for _ in range(num_repeats):
layers += [
CNNBlock(
in_channels=inChannels,
out_channels=conv1[1],
kernel_size = conv1[0],
stride=conv1[2],
padding=conv1[3])
]
inChannels = conv1[1]
layers += [
CNNBlock(
in_channels=inChannels,
out_channels=conv2[1],
kernel_size = conv2[0],
stride=conv2[2],
padding=conv2[3])
]
inChannels = conv2[1]
return nn.Sequential(*layers)
    def _create_fc(self, split_size, num_box, num_classes):
        S, B, C = split_size, num_box, num_classes
        return nn.Sequential(
            nn.Flatten(),
            # The original paper uses 4096 hidden units and dropout of 0.5;
            # 496 units are used here to keep the model lighter.
            nn.Linear(1024 * S * S, 496),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(496, S * S * (C + B * 5)),
        )
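As a quick sanity check, instantiating the model with the paper's settings and feeding it a 448x448 image should yield an output of size 7x7x30 = 1470 per example:

model = YOLOv1(split_size=7, num_box=2, num_classes=20)
x = torch.randn(2, 3, 448, 448)   # a batch of two 448x448 RGB images
print(model(x).shape)             # torch.Size([2, 1470]) = (2, 7*7*30)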
The Loss function
To train the model, we need a loss function, which is defined as follows:
The loss function of YOLO-v1 is a sum of several terms: a loss on the bounding box center coordinates, a loss on the bounding box width and height, a confidence (objectness) loss for cells that contain an object and for cells that do not, and a classification loss on the class probabilities.
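For reference, this is the multi-part sum-squared error from the original paper, where 1_{ij}^{obj} is 1 when box j of cell i is responsible for an object, 1_{ij}^{noobj} is its complement, and the weights correspond to lambda_coord and lambda_noobj in the code below:

\begin{aligned}
\mathcal{L} ={}& \lambda_{coord} \sum_{i=0}^{S^2} \sum_{j=0}^{B} \mathbb{1}_{ij}^{obj} \left[ (x_i - \hat{x}_i)^2 + (y_i - \hat{y}_i)^2 \right] \\
&+ \lambda_{coord} \sum_{i=0}^{S^2} \sum_{j=0}^{B} \mathbb{1}_{ij}^{obj} \left[ \left(\sqrt{w_i} - \sqrt{\hat{w}_i}\right)^2 + \left(\sqrt{h_i} - \sqrt{\hat{h}_i}\right)^2 \right] \\
&+ \sum_{i=0}^{S^2} \sum_{j=0}^{B} \mathbb{1}_{ij}^{obj} \left(C_i - \hat{C}_i\right)^2
+ \lambda_{noobj} \sum_{i=0}^{S^2} \sum_{j=0}^{B} \mathbb{1}_{ij}^{noobj} \left(C_i - \hat{C}_i\right)^2 \\
&+ \sum_{i=0}^{S^2} \mathbb{1}_{i}^{obj} \sum_{c \in classes} \left(p_i(c) - \hat{p}_i(c)\right)^2
\end{aligned}

with \lambda_{coord} = 5 and \lambda_{noobj} = 0.5.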
import torch
import torch.nn as nn
from utils import intersection_over_union
class YoloLoss(nn.Module):
"""
Calculate the loss for yolo (v1) model
"""
def __init__(self, S=7, B=2, C=20):
super(YoloLoss, self).__init__()
self.mse = nn.MSELoss(reduction="sum")
"""
S is split size of image (in paper 7),
B is number of boxes (in paper 2),
C is number of classes (in paper and VOC dataset is 20),
"""
self.S = S
self.B = B
self.C = C
        # These weights are from the YOLO paper: how strongly to penalize
        # confidence in cells with no object (noobj) and errors in the box
        # coordinates (coord).
self.lambda_noobj = 0.5
self.lambda_coord = 5
def forward(self, predictions, target):
        # predictions are shaped (BATCH_SIZE, S*S*(C+B*5)) when inputted
predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B * 5)
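        # Layout of the last dimension (as assumed by the slicing below):
        # indices 0-19 are the class probabilities, 20 is the confidence of
        # box 1, 21-24 its (x, y, w, h), 25 is the confidence of box 2 and
        # 26-29 its (x, y, w, h). The target only fills the first box slots.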
# Calculate IoU for the two predicted bounding boxes with target bbox
iou_b1 = intersection_over_union(predictions[..., 21:25], target[..., 21:25])
iou_b2 = intersection_over_union(predictions[..., 26:30], target[..., 21:25])
ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)
        # Take the box with the highest IoU out of the two predictions
# Note that bestbox will be indices of 0, 1 for which bbox was best
iou_maxes, bestbox = torch.max(ious, dim=0)
exists_box = target[..., 20].unsqueeze(3) # in paper this is Iobj_i
# ======================== #
# FOR BOX COORDINATES #
# ======================== #
        # Set boxes with no object in them to 0. We only keep one of the two
        # predictions, the one with the highest IoU computed previously.
box_predictions = exists_box * (
(
bestbox * predictions[..., 26:30]
+ (1 - bestbox) * predictions[..., 21:25]
)
)
box_targets = exists_box * target[..., 21:25]
        # Take sqrt of the width and height so that errors on large boxes
        # count less than the same absolute errors on small boxes, as in the
        # paper. sign/abs handle possibly negative raw predictions, and the
        # small epsilon avoids an infinite gradient of sqrt at zero.
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4]) + 1e-6
        )
box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])
box_loss = self.mse(
torch.flatten(box_predictions, end_dim=-2),
torch.flatten(box_targets, end_dim=-2),
)
# ==================== #
# FOR OBJECT LOSS #
# ==================== #
# pred_box is the confidence score for the bbox with highest IoU
pred_box = (
bestbox * predictions[..., 25:26] + (1 - bestbox) * predictions[..., 20:21]
)
object_loss = self.mse(
torch.flatten(exists_box * pred_box),
torch.flatten(exists_box * target[..., 20:21]),
)
# ======================= #
# FOR NO OBJECT LOSS #
# ======================= #
no_object_loss = self.mse(
torch.flatten((1 - exists_box) * predictions[..., 20:21], start_dim=1),
torch.flatten((1 - exists_box) * target[..., 20:21], start_dim=1),
)
no_object_loss += self.mse(
torch.flatten((1 - exists_box) * predictions[..., 25:26], start_dim=1),
torch.flatten((1 - exists_box) * target[..., 20:21], start_dim=1)
)
# ================== #
# FOR CLASS LOSS #
# ================== #
class_loss = self.mse(
torch.flatten(exists_box * predictions[..., :20], end_dim=-2,),
torch.flatten(exists_box * target[..., :20], end_dim=-2,),
)
loss = (
self.lambda_coord * box_loss # first two rows in paper
+ object_loss # third row in paper
            + self.lambda_noobj * no_object_loss # fourth row
+ class_loss # fifth row
)
return loss
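A minimal smoke test of the loss (a sketch; it assumes a target tensor with the same 30-channel layout as the predictions, where only the first box slots are filled, and relies on the intersection_over_union helper imported above):

criterion = YoloLoss(S=7, B=2, C=20)
predictions = torch.randn(4, 7 * 7 * 30)   # raw network output for a batch of 4
target = torch.zeros(4, 7, 7, 30)          # encoded ground truth
target[0, 3, 3, 20] = 1.0                                     # an object in cell (3, 3)
target[0, 3, 3, 5] = 1.0                                      # its class (index 5)
target[0, 3, 3, 21:25] = torch.tensor([0.5, 0.5, 0.2, 0.3])   # its box (x, y, w, h)
print(criterion(predictions, target).item())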
Finally ...
YOLO v1 is suitable for applications that require real-time predictions. However, it has several limitations: each grid cell predicts only one class, so it cannot detect multiple objects of different classes whose centers fall into the same cell, and it struggles with small objects that appear in groups. We will discuss the later versions of YOLO in detail and see the main differences between them. Thank you!