YOLO is an extremely fast object detection algorithm proposed in 2015. If you want to know more about the details, check out my YOLOv1 paper review: YOLOv1 paper review
In this post, we will implement the full YOLOv1 with PyTorch.
References
The YOLOv1 video by Aladdin Persson was super helpful and I learned a lot from him. My train.py is mostly the same as his code. However, I wanted to make the code easier to understand and more intuitive, so I implemented most of it from scratch, except for train.py and model.py. You can follow along with the comments.
The model was overfit on 8examples.csv to verify that it works. Below is one example of a predicted bounding box from the model.
Module Structure
dataset.py
loss.py
utils.py
model.py
train.py
plot_image.py
dataset.py
You can download the dataset here: Link. Note that the labels are in the format (class label, x, y, w, h), with coordinates relative to the whole image. Therefore, we need to convert the labels to be relative to each grid cell.
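To make this concrete, here is a tiny worked example (a sketch with a made-up box; the numbers are only illustrative) of the cell-relative rescaling that __getitem__ performs in the listing below:

S = 7
# Hypothetical label, relative to the whole image: (class, x, y, w, h)
box_class, x, y, w, h = 11, 0.48, 0.63, 0.30, 0.42

# (i, j) = (row, column) of the grid cell that contains the box center
j, i = int(x * S), int(y * S)   # j = 3, i = 4

# Same formulas as in the dataset code
cell_x = S * x - j              # ~0.36, x offset within the cell
cell_y = S * y - i              # ~0.41, y offset within the cell
cell_w = S * w                  # 2.1,  width measured in cell units
cell_h = S * h                  # ~2.94, height measured in cell units

The width and height are simply scaled by S, while the center coordinates become offsets inside the cell that contains them.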
import torch
from torch.utils.data import Dataset
import pandas as pd
from PIL import Image
import os


class VOCDataset(Dataset):
    def __init__(self, data_csv, img_dir, label_dir, S=7, B=2, C=20, transform=None):
        '''
        Parameters:
            data_csv (str): csv file path name which has two columns.
                            1st column: img file name
                            2nd column: label file name
            img_dir (str): image directory path
            label_dir (str): label directory path
            S (int): Grid cell size (ex. 7x7)
            B (int): Number of bounding boxes per one cell
            C (int): Number of classes
            transform (Compose): A list of transforms
        '''
        self.df = pd.read_csv(data_csv)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.S = S
        self.B = B
        self.C = C
        self.transform = transform

    def __len__(self):
        '''
        Returns the length of the dataset
        '''
        return len(self.df)

    def __getitem__(self, idx):
        '''
        Returns one data item with idx

        Parameters:
            idx (int): Index number of data

        Returns:
            tuple: (image, label) where label is in format of
                   (S, S, 30). Note that last 5 elements of label
                   will NOT be used.
        '''
        # Path for image and label
        img_path = os.path.join(self.img_dir, self.df.iloc[idx, 0])
        label_path = os.path.join(self.label_dir, self.df.iloc[idx, 1])

        # Read image
        image = Image.open(img_path)

        # Read bounding boxes from label .txt file and store them to list
        bboxes = []
        with open(label_path, 'r') as f:
            while True:
                line = f.readline()
                if not line:
                    break
                # class_num, x, y, w, h
                one_box = line.replace('\n', '').split(' ')
                # convert types to int or float accordingly
                one_box = [
                    float(el) if float(el) != int(float(el)) else int(el)
                    for el in one_box
                ]
                bboxes.append(one_box)
        bboxes = torch.tensor(bboxes)

        if self.transform:
            image, bboxes = self.transform(image, bboxes)
        bboxes = bboxes.tolist()

        '''
        Loop through each box and scale relative to grid cell.
        Formula is quite straightforward.
            cell_w = S * w
            cell_h = S * h
            cell_x = S * x - floor(x * S)
            cell_y = S * y - floor(y * S)
        '''
        label_matrix = torch.zeros((self.S, self.S, 30))  # last 5 will NOT be used!
        for box in bboxes:
            # (i,j) in SxS grid -> used to assign box to label_matrix
            j, i = int(box[1] * self.S), int(box[2] * self.S)
            box_class = int(box[0])

            # Rescale relative to cell
            box[1] = self.S * box[1] - j  # x
            box[2] = self.S * box[2] - i  # y
            box[3] = self.S * box[3]      # w
            box[4] = self.S * box[4]      # h

            if label_matrix[i, j, 20] == 0:
                # one-hot vector for class
                label_matrix[i, j, box_class] = 1
                # confidence for ground-truth = 1
                label_matrix[i, j, 20] = 1
                # box coordinate
                label_matrix[i, j, 21:25] = torch.tensor(box[1:])

        return image, label_matrix
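As a quick sanity check of the dataset (a minimal sketch; the csv and directory paths follow the layout used later in train.py), you can fetch one item and confirm that the label matrix has shape (S, S, 30):

from dataset import VOCDataset

dataset = VOCDataset(
    "data/8examples.csv",
    img_dir="data/images",
    label_dir="data/labels",
)
image, label_matrix = dataset[0]
print(label_matrix.shape)                 # torch.Size([7, 7, 30])
print(label_matrix[..., 20].nonzero())    # grid cells that contain an object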
model.py
You can reference the architecture from the paper. In architecture_config below, each tuple is (kernel_size, out_channels, stride, padding), "M" is a 2x2 max-pooling layer with stride 2, and a list means the enclosed conv blocks are repeated the given number of times.
import torch
import torch.nn as nn
import utils

# Import Network Architecture
# net_architecture = utils.read_json('./config.json')
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]


class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(CNNBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.leakyrelu(self.batchnorm(self.conv(x)))


class YOLOv1(nn.Module):
    def __init__(self, in_channels=3, **kwargs):
        super(YOLOv1, self).__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.classifiers = self._create_classifiers(**kwargs)

    def forward(self, x):
        x = self.darknet(x)
        return self.classifiers(torch.flatten(x, start_dim=1))

    def _create_conv_layers(self, architecture):
        layers = []
        in_channels = self.in_channels
        for x in architecture:
            if type(x) == str:
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            elif type(x) == tuple:
                layers += [CNNBlock(
                    in_channels, x[1], kernel_size=x[0], stride=x[2], padding=x[3]
                )]
                in_channels = x[1]
            elif type(x) == list:
                conv1 = x[0]    # tuple
                conv2 = x[1]    # tuple
                repeat = x[2]   # integer
                for _ in range(repeat):
                    layers += [
                        CNNBlock(
                            in_channels, conv1[1], kernel_size=conv1[0], stride=conv1[2], padding=conv1[3]
                        )
                    ]
                    layers += [
                        CNNBlock(
                            conv1[1], conv2[1], kernel_size=conv2[0], stride=conv2[2], padding=conv2[3]
                        )
                    ]
                    in_channels = conv2[1]
        return nn.Sequential(*layers)

    def _create_classifiers(self, grid_size, num_boxes, num_classes):
        S, B, C = grid_size, num_boxes, num_classes
        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 496),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(496, S * S * (C + B * 5))  # later reshaped to (S, S, 30)
        )
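As a quick shape check (a minimal sketch with a random input), a 448x448 image should come out of the network as a flat vector of length S * S * (C + 5B) = 7 * 7 * 30 = 1470:

import torch
from model import YOLOv1

model = YOLOv1(grid_size=7, num_boxes=2, num_classes=20)
x = torch.randn(2, 3, 448, 448)   # dummy batch of two RGB images
out = model(x)
print(out.shape)                  # torch.Size([2, 1470]) = (batch, S*S*(C + 5*B))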
loss.py
import torch
import torch.nn as nn
from utils import intersection_over_union


class YoloLoss(nn.Module):
    def __init__(self, S=7, B=2, C=20, box_format="midpoint"):
        super(YoloLoss, self).__init__()
        self.S = S
        self.B = B
        self.C = C
        self.box_format = box_format
        self.lambda_coord = 5
        self.lambda_noobj = 0.5
        self.mse = nn.MSELoss(reduction="sum")  # according to the paper

    def forward(self, preds, labels):
        '''
        Returns the loss of YOLOv1

        Parameters:
            preds (tensor): predicted bounding boxes in the shape
                            (batch_size, S, S, 30)
            labels (tensor): ground truth bounding boxes in the shape
                             (batch_size, S, S, 30)

        Returns:
            loss (float): The final loss consists of
                          1. Coordinate Loss
                          2. Confidence Loss (obj/noobj)
                          3. Class Loss

            0:20  - class label one-hot vector
            20    - box1 confidence
            21:25 - box1 (x,y,w,h)
            25    - box2 confidence
            26:30 - box2 (x,y,w,h)
        '''
        '''
        First, we need to determine which box is responsible for detecting the
        obj in a specific grid cell, given that an object exists. As stated in
        the original paper, only ONE predicted bounding box should be responsible.
        This is also a limitation of YOLOv1. The way to determine the responsibility
        is to compare both predictions' IoU with the ground truth box and pick
        the one with the highest IoU, given an object exists.
        '''
        preds = preds.reshape(-1, self.S, self.S, self.C + 5 * self.B)

        # Ground truth coordinates, class one-hot vector, confidence
        gt_coord = labels[..., 21:25]
        gt_class = labels[..., 0:20]
        gt_confidence = labels[..., 20:21]
        # Same as confidence..but denotes Identity
        Iobj = labels[..., 20:21]

        # COORDINATES for box 1, 2
        box1_coord = preds[..., 21:25]
        box2_coord = preds[..., 26:30]
        # CLASS LABEL one-hot vector
        pred_class = preds[..., 0:20]
        # CONFIDENCE for box 1, 2
        box1_confidence = preds[..., 20:21]
        box2_confidence = preds[..., 25:26]

        # IoU score for box 1, 2
        box1_iou = intersection_over_union(
            box1_coord,
            gt_coord,
            box_format=self.box_format
        )
        box2_iou = intersection_over_union(
            box2_coord,
            gt_coord,
            box_format=self.box_format
        )
        iou_combined = torch.cat(
            (box1_iou, box2_iou),
            dim=-1
        )
        # select best box with higher IoU
        # (batch_size, S, S, 1) -> 0 or 1
        best_box_num = iou_combined.argmax(
            dim=-1, keepdim=True
        )

        # BEST box confidence
        best_box_confidence = (
            (1 - best_box_num) * box1_confidence
            + best_box_num * box2_confidence
        )
        # BEST box coordinates (x,y,w,h)
        # (batch_size, S, S, 4)
        best_box_coord = (
            (1 - best_box_num) * box1_coord  # if 0
            + best_box_num * box2_coord      # if 1
        )

        ##############################
        #      COORDINATE LOSS       #
        ##############################
        torch.autograd.set_detect_anomaly(True)
        best_box_coord[..., 2:4] = torch.sign(best_box_coord[..., 2:4]) * torch.sqrt(
            torch.abs(best_box_coord[..., 2:4] + 1e-6)
        )
        gt_coord[..., 2:4] = torch.sqrt(gt_coord[..., 2:4])

        coord_loss = self.lambda_coord * self.mse(
            torch.flatten(Iobj * best_box_coord, end_dim=-2),
            torch.flatten(Iobj * gt_coord, end_dim=-2)
        )

        ##############################
        #      CONFIDENCE LOSS       #
        ##############################
        # If YES object
        obj_confidence_loss = self.mse(
            torch.flatten(Iobj * best_box_confidence, end_dim=-2),
            torch.flatten(Iobj * gt_confidence, end_dim=-2)
        )
        # If NO object
        noobj_confidence_loss = self.mse(
            torch.flatten((1 - Iobj) * box1_confidence, end_dim=-2),
            torch.flatten((1 - Iobj) * gt_confidence, end_dim=-2)
        )
        noobj_confidence_loss += self.mse(
            torch.flatten((1 - Iobj) * box2_confidence, end_dim=-2),
            torch.flatten((1 - Iobj) * gt_confidence, end_dim=-2)
        )
        confidence_loss = (
            obj_confidence_loss
            + self.lambda_noobj * noobj_confidence_loss
        )

        ##############################
        #        CLASS LOSS          #
        ##############################
        class_loss = self.mse(
            torch.flatten(Iobj * pred_class, end_dim=-2),
            torch.flatten(Iobj * gt_class, end_dim=-2)
        )

        return coord_loss + confidence_loss + class_loss
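As a minimal sketch of how the loss is called (random tensors, only to confirm the expected shapes), the predictions may come in flat as (batch_size, S*S*30) and are reshaped inside forward(), while the labels are the (batch_size, S, S, 30) matrices produced by VOCDataset:

import torch
from loss import YoloLoss

loss_fn = YoloLoss(S=7, B=2, C=20)
preds = torch.rand(2, 7 * 7 * 30)   # raw model output is flat; forward() reshapes it
labels = torch.rand(2, 7, 7, 30)    # stand-in for the ground-truth label matrix
loss = loss_fn(preds, labels)
print(loss.item())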
utils.py
I personally think that utils.py was the hardest to implement. This module includes non-max suppression (NMS), IoU, mean average precision (mAP), and helpers for converting the model output into a list of bounding boxes.
import torch
from collections import Counter
from dataset import VOCDataset
from torch.utils.data import DataLoader
from torchvision import transforms
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
def intersection_over_union(pred_bboxes, target_bboxes,
                            box_format="midpoint"):
    '''
    Compute Intersection over Union

    Parameters:
        pred_bboxes (tensor): Predicted bounding boxes (BATCH_SIZE, 4)
        target_bboxes (tensor): Target bounding boxes (BATCH_SIZE, 4)
        box_format (str): corners or midpoint
                          corners: [x1,y1,x2,y2]
                          midpoint: [x,y,w,h]

    Return:
        IoU (tensor): IoU for ALL examples (BATCH_SIZE, 1)
    '''
    if not torch.is_tensor(pred_bboxes):
        pred_bboxes = torch.tensor(pred_bboxes)
    if not torch.is_tensor(target_bboxes):
        target_bboxes = torch.tensor(target_bboxes)

    if box_format == "midpoint":
        box1_x1 = pred_bboxes[..., 0:1] - pred_bboxes[..., 2:3] / 2
        box1_y1 = pred_bboxes[..., 1:2] - pred_bboxes[..., 3:4] / 2
        box1_x2 = pred_bboxes[..., 0:1] + pred_bboxes[..., 2:3] / 2
        box1_y2 = pred_bboxes[..., 1:2] + pred_bboxes[..., 3:4] / 2
        box2_x1 = target_bboxes[..., 0:1] - target_bboxes[..., 2:3] / 2
        box2_y1 = target_bboxes[..., 1:2] - target_bboxes[..., 3:4] / 2
        box2_x2 = target_bboxes[..., 0:1] + target_bboxes[..., 2:3] / 2
        box2_y2 = target_bboxes[..., 1:2] + target_bboxes[..., 3:4] / 2
    elif box_format == "corners":
        box1_x1 = pred_bboxes[..., 0:1]
        box1_y1 = pred_bboxes[..., 1:2]
        box1_x2 = pred_bboxes[..., 2:3]
        box1_y2 = pred_bboxes[..., 3:4]
        box2_x1 = target_bboxes[..., 0:1]
        box2_y1 = target_bboxes[..., 1:2]
        box2_x2 = target_bboxes[..., 2:3]
        box2_y2 = target_bboxes[..., 3:4]

    cross_x1 = torch.max(box1_x1, box2_x1)
    cross_y1 = torch.max(box1_y1, box2_y1)
    cross_x2 = torch.min(box1_x2, box2_x2)
    cross_y2 = torch.min(box1_y2, box2_y2)

    # For non-overlapping boxes, clamp to 0 so that IoU=0
    intersection = (cross_x2 - cross_x1).clamp(0) * (cross_y2 - cross_y1).clamp(0)
    union = (
        (box1_x2 - box1_x1) * (box1_y2 - box1_y1)
        + (box2_x2 - box2_x1) * (box2_y2 - box2_y1)
        - intersection
    )
    return intersection / (union + 1e-6)
def non_max_suppression(
    bboxes, iou_threshold, confidence_threshold, box_format="midpoint"
):
    '''
    Performs Non Max Suppression on the given list of bounding boxes

    Parameters:
        bboxes (list): list[(class_prediction, confidence, x, y, w, h)]
        iou_threshold (float): minimum IoU required for predicted bbox to be correct
        confidence_threshold (float): minimum confidence required for a predicted bbox.
                                      all bboxes below this confidence are removed prior to NMS
        box_format (str): "corners" or "midpoint"

    Return:
        list: A list of bboxes after NMS is performed
    '''
    nms_boxes = []
    assert type(bboxes) == list

    # [1] Remove all bboxes with confidence < confidence_threshold
    bboxes = [box for box in bboxes if box[1] > confidence_threshold]
    # [2] Sort bboxes by confidence in descending order
    bboxes.sort(key=lambda x: x[1], reverse=True)
    # [3] Perform NMS for "EACH" class
    while bboxes:
        top_box = bboxes.pop(0)
        nms_boxes.append(top_box)
        # [3-1] Don't compare if different class
        # [3-2] Only "keep" boxes with iou < iou_threshold
        bboxes = [box for box in bboxes
                  if box[0] != top_box[0]
                  or intersection_over_union(
                      torch.tensor(box[2:]),
                      torch.tensor(top_box[2:]),
                      box_format=box_format
                  ) < iou_threshold
                  ]
    return nms_boxes
def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint",
    num_classes=20
):
    '''
    Calculates mAP for given predicted boxes and true boxes

    Parameters:
        pred_boxes (list): list of bounding boxes
                           - (image idx, class, confidence, x, y, w, h)
        true_boxes (list): list of ground truth bounding boxes
        iou_threshold (float): minimum iou required for bbox to be correct
        box_format (str): "corners" or "midpoint"
        num_classes (int): number of classes

    Returns:
        mAP (float): mAP across "all" classes
    '''
    # [NOTE] The ultimate goal is to find TP & FP for each pred_boxes with class c
    # average precisions -> later will be averaged = mAP
    average_precisions = []

    for c in range(num_classes):
        # pred_boxes for current class c
        class_pred_boxes = [
            box for box in pred_boxes
            if box[1] == c
        ]
        # true_boxes for current class c
        class_gt_boxes = [
            box for box in true_boxes
            if box[1] == c
        ]
        # If there's no gt box, skip
        if len(class_gt_boxes) == 0:
            continue

        # Build a frequency dictionary for each image index
        # This tells how many true boxes per image index
        gt_visited = Counter([
            gt[0] for gt in class_gt_boxes
        ])
        # convert value: num_boxes -> [0] * num_boxes
        # to make a visited array for each box
        for key, val in gt_visited.items():
            gt_visited[key] = torch.zeros(val)

        # Time to calculate TP/FP.
        # First, sort class_pred_boxes w.r.t. confidence
        class_pred_boxes.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros(len(class_pred_boxes))
        FP = torch.zeros(len(class_pred_boxes))
        total_gt_boxes = len(class_gt_boxes)

        for detection_idx, detection in enumerate(class_pred_boxes):
            best_iou = 0
            best_iou_gt_idx = None
            # GT boxes for SAME image and SAME class
            same_image_class_gt_boxes = [
                box for box in class_gt_boxes
                if box[0] == detection[0]
            ]
            for gt_idx, gt in enumerate(same_image_class_gt_boxes):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format
                )
                if iou > best_iou:
                    best_iou = iou
                    best_iou_gt_idx = gt_idx

            if best_iou > iou_threshold:
                # If not visited, then the current predicted detection is correct!
                if gt_visited[detection[0]][best_iou_gt_idx] == 0:
                    gt_visited[detection[0]][best_iou_gt_idx] = 1
                    TP[detection_idx] = 1
                else:  # If already visited, then the current predicted detection is incorrect!
                    FP[detection_idx] = 1
            else:  # If best iou < threshold, then the current pred detection failed!
                FP[detection_idx] = 1

        # Now, we found all TP, FP for current class
        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        precisions = TP_cumsum / (TP_cumsum + FP_cumsum + 1e-6)
        recalls = TP_cumsum / (total_gt_boxes + 1e-6)
        precisions = torch.cat(
            (torch.tensor([1]), precisions)
        )
        recalls = torch.cat(
            (torch.tensor([0]), recalls)
        )
        average_precisions.append(torch.trapz(precisions, recalls))

    return sum(average_precisions) / len(average_precisions)
def get_bboxes(
    loader, model,
    iou_threshold, confidence_threshold,
    box_format="midpoint",
    device="cuda",
    S=7
):
    '''
    Returns a tuple of lists of all the bounding box information for both predicted
    boxes and ground truth boxes in shape (image idx, class, confidence, x, y, w, h)

    Parameters:
        loader (generator): DataLoader
        model (nn.Module): YOLOv1 model
        iou_threshold (float): min. iou required for a predicted box
                               to be correct
        confidence_threshold: min. confidence to be a candidate
        box_format (str): "corners" or "midpoint"
        device (str): "cpu" or "cuda"

    Returns:
        tuple: (all_pred_boxes, all_gt_boxes)
               - decoupled bounding box information across all batches and examples
                 for predictions and ground truths
    '''
    # We're not training
    model.eval()
    train_idx = 0

    # All boxes to return: (train idx, class, confidence, x, y, w, h)
    all_pred_boxes = []
    all_gt_boxes = []

    # For each BATCH
    for batch_idx, (images, labels) in enumerate(loader):
        images = images.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            predictions = model(images)
        print("prediction shape:", predictions.shape)
        # after forward(), its shape is (batch_size, S*S*(C+5*B))
        batch_size = images.shape[0]

        # predictions = (batch_size, S, S, 30)
        # each predictions[batch_size, ...] is predictions per ONE IMAGE
        # labels = (batch_size, S, S, 30)
        pred_boxes = cellboxes_to_list_boxes(predictions)
        gt_boxes = cellboxes_to_list_boxes(labels)
        # pred_boxes = [list of (class, confidence, x, y, w, h)] * batch_size
        # labels = [list of (class, confidence, x, y, w, h)] * batch_size

        # for each IMAGE
        for idx in range(batch_size):
            nms_boxes = non_max_suppression(
                pred_boxes[idx],
                iou_threshold=iou_threshold,
                confidence_threshold=confidence_threshold,
                box_format=box_format
            )
            print(nms_boxes)

            # For each PREDICTED BOX
            for nms_box in nms_boxes:
                # We need "train idx" for mAP
                all_pred_boxes.append(
                    [train_idx] + nms_box
                )
            # For each GROUND TRUTH BOX
            for gt_box in gt_boxes[idx]:
                # Many (i,j)th cells in S x S DO NOT have
                # ground truth labels!!
                if gt_box[1] > confidence_threshold:
                    all_gt_boxes.append(
                        [train_idx] + gt_box
                    )
            train_idx += 1

    model.train()
    return all_pred_boxes, all_gt_boxes
def convert_cellboxes(box_3d, S=7):
    '''
    Converts (batch_size, S, S, 30) -> (batch_size, S, S, 6)
    by selecting the best box among box 1 and 2.

    Parameters:
        box_3d (tensor): Shape of (batch_size, S, S, 30)
        S (int): Grid size

    Returns:
        tensor: Shape of (batch_size, S, S, 6) where the
                vector in dim=3 is in the format of
                (class, confidence, x, y, w, h)

    For the 30-vector,
        0:20  = class vector
        20    = box1 confidence
        21:25 = box1 (x,y,w,h)
        25    = box2 confidence
        26:30 = box2 (x,y,w,h)
    '''
    batch_size = box_3d.shape[0]
    box_3d = box_3d.to("cpu")
    box_3d = box_3d.reshape(batch_size, S, S, 30)
    converted_boxes = torch.empty(batch_size, S, S, 6)

    for i in range(S):
        for j in range(S):
            # Scale relative to the whole image rather than the cell
            box1_cell_x = box_3d[..., i:i+1, j:j+1, 21]
            box1_cell_y = box_3d[..., i:i+1, j:j+1, 22]
            box1_cell_w = box_3d[..., i:i+1, j:j+1, 23]
            box1_cell_h = box_3d[..., i:i+1, j:j+1, 24]
            box_3d[..., i:i+1, j:j+1, 21] = (j + box1_cell_x) / S
            box_3d[..., i:i+1, j:j+1, 22] = (i + box1_cell_y) / S
            box_3d[..., i:i+1, j:j+1, 23] = box1_cell_w / S
            box_3d[..., i:i+1, j:j+1, 24] = box1_cell_h / S

            box2_cell_x = box_3d[..., i:i+1, j:j+1, 26]
            box2_cell_y = box_3d[..., i:i+1, j:j+1, 27]
            box2_cell_w = box_3d[..., i:i+1, j:j+1, 28]
            box2_cell_h = box_3d[..., i:i+1, j:j+1, 29]
            box_3d[..., i:i+1, j:j+1, 26] = (j + box2_cell_x) / S
            box_3d[..., i:i+1, j:j+1, 27] = (i + box2_cell_y) / S
            box_3d[..., i:i+1, j:j+1, 28] = box2_cell_w / S
            box_3d[..., i:i+1, j:j+1, 29] = box2_cell_h / S

            # Best class
            best_class = box_3d[..., i:i+1, j:j+1, 0:20].argmax(3, keepdim=True)
            # Best confidence
            # Best confidence idx identifies which box was chosen
            best_confidence, best_confidence_idx = torch.cat(
                (
                    box_3d[..., i:i+1, j:j+1, 20:21],
                    box_3d[..., i:i+1, j:j+1, 25:26]
                ),
                dim=3
            ).max(dim=3, keepdim=True)

            boxes1 = box_3d[..., i:i+1, j:j+1, 21:25]
            boxes2 = box_3d[..., i:i+1, j:j+1, 26:30]
            # Best box coordinates
            best_box = (
                (1 - best_confidence_idx) * boxes1
                + best_confidence_idx * boxes2
            )

            converted_boxes[..., i:i+1, j:j+1, :] = torch.cat(
                (
                    best_class,
                    best_confidence,
                    best_box
                ),
                dim=-1
            )
    return converted_boxes
# input: (128, 7, 7, 30)
# return: [[[class,confidence,x,y,w,h],[],[]],[],...,[]], each element -> all
# boxes per image: 3D
def cellboxes_to_list_boxes(box_3d, S=7):
    '''
    Returns a list of "list of all bounding box information" in the format of
    (class, confidence, x, y, w, h). The length of the output will be the same
    as batch_size. Each ELEMENT of the output is also a LIST for a particular IMAGE.

    Parameters:
        box_3d (tensor): Shape of (batch_size, S, S, 30). Each element in dim=0
                         is a 3D-shaped set of bounding boxes.
        S (int): Grid size

    Returns:
        list: The length of the output will be the same as batch_size.
              Each ELEMENT of the output is also a 2D LIST for a particular IMAGE.
              Each bounding box is (class, confidence, x, y, w, h)
    '''
    converted_list_boxes = []
    batch_size = box_3d.shape[0]

    # convert (batch_size,S,S,30) -> (batch_size,S,S,6)
    box_3d = convert_cellboxes(box_3d)
    print("box_3d in cellboxes_to_list_boxes:", box_3d.shape)
    box_3d = box_3d.reshape(batch_size, S*S, -1)
    box_3d[..., 0] = box_3d[..., 0].long()

    for img_idx in range(batch_size):
        img_list_boxes = []
        for box_idx in range(S*S):
            img_list_boxes.append([x.item() for x in box_3d[img_idx, box_idx, :]])
        converted_list_boxes.append(img_list_boxes)

    return converted_list_boxes
def save_checkpoint(state, fname="my_checkpoint.pth.tar"):
    print("=> Saving Checkpoint...")
    torch.save(state, fname)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading Checkpoint...")
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
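Here is a minimal sketch (with toy boxes I made up) showing how intersection_over_union and non_max_suppression behave on two overlapping predictions of the same class plus one unrelated prediction:

import torch
from utils import intersection_over_union, non_max_suppression

# Each box is (class, confidence, x, y, w, h) in midpoint format
boxes = [
    [11, 0.9, 0.50, 0.50, 0.40, 0.40],   # "dog", high confidence
    [11, 0.6, 0.52, 0.51, 0.42, 0.38],   # near-duplicate "dog", lower confidence
    [6,  0.8, 0.20, 0.80, 0.20, 0.20],   # "car", elsewhere in the image
]

iou = intersection_over_union(
    torch.tensor(boxes[0][2:]), torch.tensor(boxes[1][2:]), box_format="midpoint"
)
print(iou)   # high IoU, so the duplicate should be suppressed

kept = non_max_suppression(
    boxes, iou_threshold=0.5, confidence_threshold=0.4, box_format="midpoint"
)
print(kept)  # expect only the 0.9 "dog" box and the 0.8 "car" box to survive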
train.py
import torch
import torchvision.transforms as transforms
import torch.optim as optim
import torchvision.transforms.functional as F
from tqdm import tqdm
from torch.utils.data import DataLoader
from model import YOLOv1
from dataset import VOCDataset
from utils import (
    intersection_over_union,
    non_max_suppression,
    mean_average_precision,
    cellboxes_to_list_boxes,
    get_bboxes,
    save_checkpoint,
    load_checkpoint,
)
from loss import YoloLoss

seed = 123
torch.manual_seed(seed)

# Hyperparameters
LEARNING_RATE = 5e-6
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Running on {DEVICE}")
BATCH_SIZE = 32
WEIGHT_DECAY = 0
EPOCHS = 3000
NUM_WORKERS = 2
PIN_MEMORY = True
LOAD_MODEL = True
LOAD_MODEL_FILE = "overfit.pth.tar"
IMG_DIR = "data/images"
LABEL_DIR = "data/labels"

max_map = 0
save_map = False


class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, bboxes):
        for t in self.transforms:
            img, bboxes = t(img), bboxes
        return img, bboxes


transform = Compose([transforms.Resize((448, 448)), transforms.ToTensor()])


def train_fn(train_loader, model, optimizer, loss_fn):
    loop = tqdm(train_loader, leave=True)
    mean_loss = []

    for batch_idx, (x, y) in enumerate(loop):
        x, y = x.to(DEVICE), y.to(DEVICE)
        out = model(x)
        loss = loss_fn(out, y)
        mean_loss.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update the progress bar
        loop.set_postfix(loss=loss.item())

    print(f"Mean loss was {sum(mean_loss)/len(mean_loss)}")


def main():
    model = YOLOv1(grid_size=7, num_boxes=2, num_classes=20).to(DEVICE)
    optimizer = optim.Adam(
        model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
    )
    loss_fn = YoloLoss()

    if LOAD_MODEL:
        load_checkpoint(torch.load(LOAD_MODEL_FILE, map_location=DEVICE), model, optimizer)

    train_dataset = VOCDataset(
        "data/8examples.csv",
        transform=transform,
        img_dir=IMG_DIR,
        label_dir=LABEL_DIR,
    )
    test_dataset = VOCDataset(
        "data/test.csv",
        transform=transform,
        img_dir=IMG_DIR,
        label_dir=LABEL_DIR
    )
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        drop_last=False
    )
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=True
    )

    for epoch in range(EPOCHS):
        pred_boxes, target_boxes = get_bboxes(
            train_loader, model, iou_threshold=0.5, confidence_threshold=0.4, device=DEVICE
        )
        mean_avg_prec = mean_average_precision(
            pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint"
        )
        print(f"Train mAP: {mean_avg_prec}")

        global max_map
        global save_map
        if mean_avg_prec > 0.99 and not save_map:
            checkpoint = {
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint, fname=LOAD_MODEL_FILE)
            max_map = mean_avg_prec
            save_map = True

        train_fn(train_loader, model, optimizer, loss_fn)


if __name__ == "__main__":
    main()
plot_image.py
Now let’s test our model to see if it works.
import torch
from dataset import VOCDataset
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
from model import YOLOv1
from utils import (
    cellboxes_to_list_boxes,
    non_max_suppression,
)
import time

LOAD_MODEL_FILE = "overfit.pth.tar"
IMG_DIR = "data/images"
LABEL_DIR = "data/labels"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

classes = {
    1: 'bicycle',
    6: 'car',
    7: 'cat',
    11: 'dog',
    12: 'horse',
    14: 'person'
}
color_list = ['red', 'blue', 'brown', 'rosybrown', 'lightyellow', 'aquamarine',
              'mediumslateblue', 'skyblue', 'darkorchid', 'purple', 'cyan', 'darkcyan',
              'lime', 'green', 'lightsteelblue', 'cornflowerblue', 'pink', 'crimson',
              'peru', 'chocolate']
class_list = list(range(0, 20))
colors = dict(zip(class_list, color_list))


def plot_bbox(image_tensor):
    '''
    Draw bounding boxes with image

    Parameters:
        image_tensor (tensor): (3, 448, 448)

    Returns:
        None
    '''
    start_time = time.time()
    model = YOLOv1(grid_size=7, num_boxes=2, num_classes=20)
    model.load_state_dict(
        torch.load(LOAD_MODEL_FILE, map_location=DEVICE)['model_state_dict']
    )
    # DON'T FORGET!!! training mode might mess up Dropout and BatchNorm
    model.eval()

    predictions = model(image_tensor.unsqueeze(0))

    # 1) Loop through images and predictions
    # 2) Plot image first then the prediction
    pred_boxes = cellboxes_to_list_boxes(predictions)
    pred_boxes_nms = []
    for idx in range(len(pred_boxes)):
        nms_boxes = non_max_suppression(
            pred_boxes[idx],
            iou_threshold=0.5,
            confidence_threshold=0.5,
            box_format="midpoint"
        )
        pred_boxes_nms.append(nms_boxes)

    image = image_tensor.permute(1, 2, 0)
    bboxes = pred_boxes_nms[0]
    fig, ax = plt.subplots(figsize=(10, 7))
    # Convert to np.array to obtain height and width
    image = np.array(image)
    img_h, img_w, _ = image.shape
    ax.imshow(image)

    for box in bboxes:
        box_class, confidence, x, y, w, h = box
        # Need to convert to upper-left x and y
        upper_left_x = x - w / 2
        upper_left_y = y - h / 2
        upper_left_x *= img_w
        upper_left_y *= img_h
        rect = patches.Rectangle(
            (upper_left_x, upper_left_y),
            width=w * img_w,
            height=h * img_h,
            linewidth=3,
            edgecolor=colors[box_class],
            facecolor='none'
        )
        ax.add_patch(rect)
        ax.text(
            x=upper_left_x,
            y=upper_left_y - 5,
            fontsize=8,
            backgroundcolor=colors[box_class],
            color='black',
            s=f"{classes[int(box_class)]}: {confidence:.2f}"
        )

    ax.set_axis_off()
    print(f"Execution Time: {time.time() - start_time} seconds.")
    # plt.show(block=False)
    plt.axis('off')
    plt.draw()
    plt.pause(5)
    plt.close()
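plot_image.py only defines the plotting routine, so below is a small driver (a sketch that assumes the same 8examples.csv layout and overfit.pth.tar checkpoint used in train.py) that pulls one image from the dataset and runs plot_bbox on it:

if __name__ == "__main__":
    import torchvision.transforms as transforms

    # Mirrors the simple Compose wrapper defined in train.py
    class Compose(object):
        def __init__(self, transforms):
            self.transforms = transforms

        def __call__(self, img, bboxes):
            for t in self.transforms:
                img, bboxes = t(img), bboxes
            return img, bboxes

    transform = Compose([transforms.Resize((448, 448)), transforms.ToTensor()])
    dataset = VOCDataset(
        "data/8examples.csv",
        transform=transform,
        img_dir=IMG_DIR,
        label_dir=LABEL_DIR,
    )
    image_tensor, _ = dataset[0]   # (3, 448, 448) after ToTensor
    plot_bbox(image_tensor)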