
YOLOv1 Full Implementation with PyTorch

YOLO is an extremely fast object detection algorithm proposed in 2015. If you want to know more about the details, check my paper review for YOLOv1: YOLOv1 paper review

In this post, we will implement the full YOLOv1 with PyTorch.

References

  1. Aladdin Persson's YouTube channel
  2. Paper

The YOLOv1 video by Aladdin Persson was super helpful and I learned a lot from it. My train.py is mostly the same as his code. However, I wanted to make the code easier to understand and more intuitive, so I implemented it mostly from scratch, except for train.py and model.py. You can follow along with the comments.

The model was overfit on 8examples.csv to check that everything works. Below is one example of a bounding box predicted by the model.

(Figure: a sample image with the bounding box predicted by the overfit model)

Module Structure

  • dataset.py
  • loss.py
  • utils.py
  • model.py
  • train.py
  • plot_image.py

dataset.py

You can download the dataset here: Link. Note that the labels are in the format (class label, x, y, w, h) and are relative to the whole image. Therefore, we need to convert the labels to be relative to each grid cell.
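
Here is a quick sketch of that conversion with made-up numbers, following the same formula used in __getitem__ below:

S = 7

# A hypothetical label, relative to the whole image: (class, x, y, w, h)
box_class, x, y, w, h = 11, 0.515, 0.45, 0.30, 0.62

# The grid cell (row i, column j) that contains the box center
j, i = int(x * S), int(y * S)   # j = 3, i = 3

# Coordinates relative to that cell
cell_x = S * x - j              # 0.605
cell_y = S * y - i              # 0.15
cell_w = S * w                  # 2.1
cell_h = S * h                  # 4.34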

import torch
from torch.utils.data import Dataset
import pandas as pd
from PIL import Image
import os

class VOCDataset(Dataset):
    def __init__(self, data_csv, img_dir, label_dir, S=7, B=2, C=20, transform=None):
        '''
        Parameters:
            data_csv (str): csv file path name which has two columns.
                           1st column: img file name
                           2nd column: label file name
            img_dir (str): image directory path
            label_dir (str): label directory path
            S (int): Grid size (e.g. S=7 gives a 7x7 grid)
            B (int): Number of bounding boxes per one cell
            C (int): Number of classes
            transform (Compose): A list of transforms
        '''

        self.df = pd.read_csv(data_csv)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.S = S
        self.B = B
        self.C = C
        self.transform = transform


    def __len__(self):
        '''
        Returns the length of the dataset
        '''
        return len(self.df)


    def __getitem__(self, idx):
        '''
        Returns one data item with idx

        Parameters:
            idx (int): Index number of data

        Returns:
            tuple: (image, label) where label is in the format of
                    (S, S, 30). Note that the last 5 elements of label
                    will NOT be used.
        '''

        # Path for image and label
        img_path = os.path.join(self.img_dir, self.df.iloc[idx, 0])
        label_path = os.path.join(self.label_dir, self.df.iloc[idx, 1])

        # Read image
        image = Image.open(img_path)

        # Read bounding boxes from label .txt file and store them to list
        bboxes = []

        with open(label_path, 'r') as f:
            while True:
                line = f.readline()
                if not line:
                    break

                # class_num, x, y, w, h
                one_box = line.replace('\n','').split(' ')

                # convert types to int or float accordingly
                one_box = [
                    float(el) if float(el) != int(float(el)) else int(float(el))
                    for el in one_box
                ]
                bboxes.append(one_box)

        bboxes = torch.tensor(bboxes)
        if self.transform:
            image, bboxes = self.transform(image, bboxes)
        bboxes = bboxes.tolist()

        '''
        Loop through each box and scale relative to grid cell.
        Formula is quite straightforward.
        cell_w = S * w
        cell_h = S * h
        cell_x = S * x - floor(x * S)
        cell_y = S * y - floor(y * S)
        '''

        label_matrix = torch.zeros((self.S, self.S, 30)) # last 5 will NOT be used!
        for box in bboxes:
            # (i,j) in SxS grid -> used to assign box to label_matrix
            j, i = int(box[1] * self.S), int(box[2] * self.S)

            box_class = int(box[0])

            # Rescale relative to cell
            box[1] = self.S * box[1] - j # x
            box[2] = self.S * box[2] - i # y
            box[3] = self.S * box[3] # w
            box[4] = self.S * box[4] # h

            if label_matrix[i,j,20] == 0:
                # one-hot vector for class
                label_matrix[i, j, box_class] = 1

                # confidence for ground-truth = 1
                label_matrix[i, j, 20] = 1

                # box coordinate
                label_matrix[i, j, 21:25] = torch.tensor(box[1:])

        return image, label_matrix
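
Just to show how the dataset is meant to be used, here is a minimal sketch (it assumes the same csv file and folder layout as train.py further below):

from dataset import VOCDataset

dataset = VOCDataset(
    "data/8examples.csv",
    img_dir="data/images",
    label_dir="data/labels",
    transform=None,  # train.py passes a Resize + ToTensor Compose instead
)
image, label_matrix = dataset[0]
print(label_matrix.shape)  # torch.Size([7, 7, 30])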

model.py

You can reference the architecture from the paper.
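
A short note on how architecture_config below is encoded (this mirrors how _create_conv_layers consumes it):

# Each entry of architecture_config is one of:
#   tuple: (kernel_size, out_channels, stride, padding) -> one CNNBlock
#   "M":   nn.MaxPool2d(kernel_size=2, stride=2)
#   list:  [conv1_tuple, conv2_tuple, num_repeats] -> the two blocks repeated num_repeats times
example_entry = (7, 64, 2, 3)  # 7x7 conv, 64 filters, stride 2, padding 3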

import torch
import torch.nn as nn
import utils

# Import Network Architecture
# net_architecture = utils.read_json('./config.json')
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(CNNBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.leakyrelu(self.batchnorm(self.conv(x)))

class YOLOv1(nn.Module):
    def __init__(self, in_channels=3, **kwargs):
        super(YOLOv1, self).__init__()

        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.classifiers = self._create_classifiers(**kwargs)

    def forward(self, x):
        x = self.darknet(x)
        return self.classifiers(torch.flatten(x, start_dim=1))

    def _create_conv_layers(self, architecture):
        layers = []
        in_channels = self.in_channels

        for x in architecture:
            if type(x) == str:
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]

            elif type(x) == tuple:
                layers += [CNNBlock(
                    in_channels, x[1], kernel_size=x[0], stride=x[2], padding=x[3]
                )]
                in_channels = x[1]

            elif type(x) == list:
                conv1 = x[0] # tuple
                conv2 = x[1] # tuple
                repeat = x[2] # integer

                for _ in range(repeat):
                    layers += [
                        CNNBlock(
                            in_channels, conv1[1], kernel_size=conv1[0], stride=conv1[2], padding=conv1[3]
                        )
                    ]

                    layers += [
                        CNNBlock(
                            conv1[1], conv2[1], kernel_size=conv2[0], stride=conv2[2], padding=conv2[3]
                        )
                    ]

                    in_channels = conv2[1]

        return nn.Sequential(*layers)

    def _create_classifiers(self, grid_size, num_boxes, num_classes):
        S, B, C = grid_size, num_boxes, num_classes
        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 496), # 496 here instead of 4096 in the paper (smaller FC layer)
            nn.Dropout(0.0), # the paper uses a dropout rate of 0.5
            nn.LeakyReLU(0.1),
            nn.Linear(496, S * S * (C + B*5)) # reshaped later to (S, S, 30)
        )
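
As a quick sanity check on the shapes (a sketch, assuming a 448x448 input as in the paper):

import torch
from model import YOLOv1

model = YOLOv1(grid_size=7, num_boxes=2, num_classes=20)
x = torch.randn(2, 3, 448, 448)  # a batch of 2 images
out = model(x)
print(out.shape)  # torch.Size([2, 1470]) = (2, 7*7*30)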

loss.py

import torch
import torch.nn as nn
from utils import intersection_over_union

class YoloLoss(nn.Module):
    def __init__(self, S=7, B=2, C=20, box_format="midpoint"):
        super(YoloLoss, self).__init__()
        self.S = S
        self.B = B
        self.C = C
        self.box_format = box_format
        self.lambda_coord = 5
        self.lambda_noobj = 0.5
        self.mse = nn.MSELoss(reduction="sum") # according to the paper

    def forward(self, preds, labels):
        '''
        Returns the loss of YOLOv1

        Parameters:
            preds (tensor): predicted bounding boxes in the shape
                            (batch_size, S, S, 30)
            labels (tensor): ground truth bounding boxes in the shape
                            (batch_size, S, S, 30)

        Returns:
            loss (float): The final loss consists of
                          1. Coordinate Loss
                          2. Confidence Loss(obj/noobj)
                          3. Class Loss
        0:20 - class label one-hot vector
        20 - box1 confidence
        21:25 - box1 (x,y,w,h)
        25 - box2 confidence
        26:30 - box2 (x,y,w,h)
        '''

        '''
        First, we need to determine which box is responsible for detecting the
        obj in a specific grid cell, given that an object exists. As stated in
        the original paper, only ONE predicted bounding box should be responsible.
        This is also a limitation of YOLOv1. The way to determine the responsibility
        is to compare both predictions' IoU with the ground truth box and pick
        the one with the highest IoU, given an object exists.
        '''
        preds = preds.reshape(-1, self.S, self.S, self.C + 5 * self.B)

        # Ground truth coordinates, class one-hot vector, confidence
        gt_coord = labels[..., 21:25]
        gt_class = labels[..., 0:20]
        gt_confidence = labels[..., 20:21]

        # Same as confidence..but denote Identity
        Iobj = labels[..., 20:21]

        # COORDINATES for box 1, 2
        box1_coord = preds[..., 21:25]
        box2_coord = preds[..., 26:30]

        # CLASS LABEL one-hot vector
        pred_class = preds[..., 0:20]

        # CONFIDENCE for box 1,2
        box1_confidence = preds[..., 20:21]
        box2_confidence = preds[..., 25:26]

        # IoU score for box 1,2
        box1_iou = intersection_over_union(
            box1_coord,
            gt_coord,
            box_format=self.box_format
        )

        box2_iou = intersection_over_union(
            box2_coord,
            gt_coord,
            box_format=self.box_format
        )

        iou_combined = torch.cat(
            (box1_iou, box2_iou),
            dim = -1
        )

        # select best box with higher IoU
        # (batch_size, S, S, 1) -> 0 or 1
        best_box_num = iou_combined.argmax(
            dim = -1, keepdim=True
        )


        # BEST box confidence
        best_box_confidence = (
            (1 - best_box_num) * box1_confidence
            + best_box_num * box2_confidence
        )

        # BEST box coordinates (x,y,w,h)
        # (batch size, S, S, 4)
        best_box_coord = (
            (1 - best_box_num) * box1_coord # if 0
            + best_box_num * box2_coord  # if 1
        )


        ##############################
        #      COORDINATE LOSS       #
        ##############################
        # Debugging aid: helps localize NaN/Inf gradients (slows training down)
        torch.autograd.set_detect_anomaly(True)
        # sqrt of w,h as in the paper; abs(...) + 1e-6 guards against negative
        # predictions and the infinite gradient of sqrt at 0
        best_box_coord[...,2:4] = torch.sign(best_box_coord[...,2:4]) * torch.sqrt(
            torch.abs(best_box_coord[...,2:4]) + 1e-6
        )

        gt_coord[...,2:4] = torch.sqrt(gt_coord[...,2:4])

        coord_loss = self.lambda_coord * self.mse(
            torch.flatten(Iobj * best_box_coord, end_dim=-2),
            torch.flatten(Iobj * gt_coord, end_dim=-2)
        )


        ##############################
        #      CONFIDENCE LOSS       #
        ##############################
        # If YES object
        obj_confidence_loss = self.mse(
            torch.flatten(Iobj * best_box_confidence, end_dim=-2),
            torch.flatten(Iobj * gt_confidence,end_dim=-2)
        )

        # If NO object
        noobj_confidence_loss = self.mse(
            torch.flatten((1 - Iobj) * box1_confidence, end_dim=-2),
            torch.flatten((1 - Iobj) * gt_confidence, end_dim=-2)
        )

        noobj_confidence_loss += self.mse(
            torch.flatten((1 - Iobj) * box2_confidence, end_dim=-2),
            torch.flatten((1 - Iobj) * gt_confidence, end_dim=-2)
        )

        confidence_loss = (
            obj_confidence_loss
            + self.lambda_noobj * noobj_confidence_loss
        )

        ##############################
        #         CLASS LOSS         #
        ##############################

        class_loss = self.mse(
            torch.flatten(Iobj * pred_class, end_dim=-2),
            torch.flatten(Iobj * gt_class, end_dim=-2)
        )

        return coord_loss + confidence_loss + class_loss
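
Here is a minimal sketch of calling the loss with random tensors, just to confirm that the shapes line up (an all-zero label matrix means only the no-object confidence term contributes):

import torch
from loss import YoloLoss

loss_fn = YoloLoss(S=7, B=2, C=20)
preds = torch.randn(4, 7 * 7 * 30)   # raw (flattened) model output
labels = torch.zeros(4, 7, 7, 30)    # ground-truth matrix as built by VOCDataset
loss = loss_fn(preds, labels)
print(loss.item())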

utils.py

I personally think utils.py was the hardest to implement. This module includes non-max suppression (NMS), IoU, mean average precision (mAP), and the conversion of model output into a list of bounding boxes.
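
As a small sanity check, here is what intersection_over_union gives for two overlapping boxes in midpoint format; the numbers are chosen so the result is easy to verify by hand:

import torch
from utils import intersection_over_union

box_a = torch.tensor([0.5, 0.5, 0.4, 0.4])  # corners (0.3, 0.3, 0.7, 0.7)
box_b = torch.tensor([0.6, 0.6, 0.4, 0.4])  # corners (0.4, 0.4, 0.8, 0.8)

# intersection = 0.3 * 0.3 = 0.09, union = 0.16 + 0.16 - 0.09 = 0.23
iou = intersection_over_union(box_a, box_b, box_format="midpoint")
print(iou)  # ~0.3913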

import torch
from collections import Counter
from dataset import VOCDataset
from torch.utils.data import DataLoader
from torchvision import transforms
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np


def intersection_over_union(pred_bboxes, target_bboxes,
                            box_format="midpoint"):
    '''
    Compute Intersection over Union

    Parameters:
        pred_bboxes (tensor): Predicted bounding boxes (BATCH_SIZE, 4)
        target_bboxes (tensor): Target bounding boxes (BATCH_SIZE, 4)
        box_format (str): corners or midpoint
            corners: [x1,y1,x2,y2]
            midpoint: [x,y,w,h]

    Return:
        IoU (tensor): IoU for ALL examples, shape (BATCH_SIZE, 1)
    '''
    if not torch.is_tensor(pred_bboxes):
        pred_bboxes = torch.tensor(pred_bboxes)

    if not torch.is_tensor(target_bboxes):
        target_bboxes = torch.tensor(target_bboxes)
    if box_format == "midpoint":
        box1_x1 = pred_bboxes[..., 0:1] - pred_bboxes[..., 2:3] / 2
        box1_y1 = pred_bboxes[..., 1:2] - pred_bboxes[..., 3:4] / 2
        box1_x2 = pred_bboxes[..., 0:1] + pred_bboxes[..., 2:3] / 2
        box1_y2 = pred_bboxes[..., 1:2] + pred_bboxes[..., 3:4] / 2
        box2_x1 = target_bboxes[..., 0:1] - target_bboxes[..., 2:3] / 2
        box2_y1 = target_bboxes[..., 1:2] - target_bboxes[..., 3:4] / 2
        box2_x2 = target_bboxes[..., 0:1] + target_bboxes[..., 2:3] / 2
        box2_y2 = target_bboxes[..., 1:2] + target_bboxes[..., 3:4] / 2

    elif box_format == "corners":
        box1_x1 = pred_bboxes[..., 0:1]
        box1_y1 = pred_bboxes[..., 1:2]
        box1_x2 = pred_bboxes[..., 2:3]
        box1_y2 = pred_bboxes[..., 3:4]
        box2_x1 = target_bboxes[..., 0:1]
        box2_y1 = target_bboxes[..., 1:2]
        box2_x2 = target_bboxes[..., 2:3]
        box2_y2 = target_bboxes[..., 3:4]

    cross_x1 = torch.max(box1_x1, box2_x1)
    cross_y1 = torch.max(box1_y1, box2_y1)
    cross_x2 = torch.min(box1_x2, box2_x2)
    cross_y2 = torch.min(box1_y2, box2_y2)

    # For non-overlapping boxes, clamp to 0 so that IoU=0
    intersection = (cross_x2 - cross_x1).clamp(0) * (cross_y2 - cross_y1).clamp(0)
    union = (
        (box1_x2 - box1_x1) * (box1_y2 - box1_y1)
        + (box2_x2 - box2_x1) * (box2_y2 - box2_y1)
        - intersection
    )

    return intersection / (union + 1e-6)



def non_max_suppression(
    bboxes, iou_threshold, confidence_threshold, box_format="midpoint"
):
    '''
    Performs Non Max Suppression on the given list of bounding boxes

    Parameters:
        bboxes (list): list[(class_prediction, confidence, x, y, w, h)]
        iou_threshold (float): minimum IoU required for predicted bbox to be correct
        confidence_threshold (float): minimum confidence required for predicted bbox.
                              all bboxes below this confidence are removed prior to NMS
        box_format (str): "corners" or "midpoint"

    Return:
        list: A list of bboxes after NMS performed
    '''
    nms_boxes = []
    assert type(bboxes) == list

    # [1] Remove all bboxes with confidence < confidence_threshold
    bboxes = [box for box in bboxes if box[1] > confidence_threshold]

    # [2] Sort bboxes for confidence in descending order
    bboxes.sort(key=lambda x: x[1], reverse=True)

    # [3] Perform nms for "EACH" class
    while(bboxes):
        top_box = bboxes.pop(0)
        nms_boxes.append(top_box)

        # [3-1] Don't compare if different class
        # [3-2] Only "leave" boxes with iou < iou_threshold
        bboxes = [box for box in bboxes
                  if box[0] != top_box[0]
                  or intersection_over_union(
                      torch.tensor(box[2:]),
                      torch.tensor(top_box[2:]),
                      box_format=box_format
                  ) < iou_threshold
                  ]

    return nms_boxes

def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint",
    num_classes=20
):
    '''
    Calculates mAP for given predicted boxes and true boxes

    Parameters:
        pred_boxes (list): list of bounding boxes
            - (image idx, class, confidence, x, y, w, h)
        true_boxes (list): list of ground truth bounding boxes
        iou_threshold (float): minimum iou required for bbox to be correct
        box_format (str): "corners" or "midpoint"
        num_classes (int): number of classes

    Returns:
        mAP (float): mAP across "all" classes
    '''

    # [NOTE] The ultimate goal is to find TP & FP for each pred_boxes with class c

    # average precisions -> later will be averaged = mAP
    average_precisions = []

    for c in range(num_classes):
        # pred_boxes for current class c
        class_pred_boxes = [
            box for box in pred_boxes
            if box[1] == c
        ]

        # true_boxes for current class c
        class_gt_boxes = [
            box for box in true_boxes
            if box[1] == c
        ]

        # If there's no gt box, skip
        if len(class_gt_boxes) == 0:
            continue

        # Build a frequency dictionary for each image index
        # This tells how many true boxes per image index
        gt_visited = Counter([
            gt[0] for gt in class_gt_boxes
        ])

        # convert value: num_boxes -> [0] * num_boxes
        # to make visited array for each box
        for key, val in gt_visited.items():
            gt_visited[key] = torch.zeros(val)

        # Time to calculate TP/FP.
        # First, sort class_pred_boxes w.r.t confidence
        class_pred_boxes.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros(len(class_pred_boxes))
        FP = torch.zeros(len(class_pred_boxes))
        total_gt_boxes = len(class_gt_boxes)

        for detection_idx, detection in enumerate(class_pred_boxes):
            best_iou = 0
            best_iou_gt_idx = None

            # GT boxes for SAME image and SAME class
            same_image_class_gt_boxes = [
                box for box in class_gt_boxes
                if box[0] == detection[0]
            ]

            for gt_idx, gt in enumerate(same_image_class_gt_boxes):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format
                )

                if iou > best_iou:
                    best_iou = iou
                    best_iou_gt_idx = gt_idx

            if best_iou > iou_threshold:
                # If not visited, then the current predicted detection is correct!
                if gt_visited[detection[0]][best_iou_gt_idx] == 0:
                    gt_visited[detection[0]][best_iou_gt_idx] = 1
                    TP[detection_idx] = 1
                else:  # If already visited, then the current predicted detection is incorrect!
                    FP[detection_idx] = 1
            else:  # If best iou < threshold, then the current pred detection failed!
                FP[detection_idx] = 1

        # Now, we found all TP, FP for current class
        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)

        precisions = TP_cumsum / (TP_cumsum + FP_cumsum + 1e-6)
        recalls = TP_cumsum / (total_gt_boxes + 1e-6)

        precisions = torch.cat(
            (torch.tensor([1]), precisions)
        )
        recalls = torch.cat(
            (torch.tensor([0]), recalls)
        )

        average_precisions.append(torch.trapz(precisions, recalls))

    return sum(average_precisions) / len(average_precisions)


def get_bboxes(
    loader, model,
    iou_threshold, confidence_threshold,
    box_format="midpoint",
    device="cuda",
    S=7
):
    '''
    Returns a tuple of list of all the bounding boxes information for both prediction
    boxes and ground truth boxes in shape (image idx, class, confidence, x, y, w, h)

    Parameters:
        loader (generator): DataLoader
        model (nn.Module): YOLOv1 model
        iou_threshold (float): min. iou required for predicted box
                                to be correct
        confidence_threshold: min. confidence to be a candidate
        box_format (str): "corners" or "midpoint",
        device (str): "cpu" or "cuda"

    Returns:
        tuple: (all_pred_boxes, all_gt_boxes)
        - decoupled bounding box information across all batches and examples
          for predictions and ground truths
    '''
    # We're not training
    model.eval()
    train_idx = 0

    # All boxes to return: (train idx, class, confidence, x, y, w, h)
    all_pred_boxes = []
    all_gt_boxes = []

    # For each BATCH
    for batch_idx, (images, labels) in enumerate(loader):
        images = images.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            predictions = model(images)

        print("prediction shape:", predictions.shape)

        # after forward(), its shape is (batch_size, S*S*(C+5*B))
        batch_size = images.shape[0]
        # predictions = (batchsize, S, S, 30)
            # each predictions[batchsize, ...] is predictions per ONE IMAGE
        # labels = (batchsize, S, S, 30)

        pred_boxes = cellboxes_to_list_boxes(predictions)
        gt_boxes = cellboxes_to_list_boxes(labels)
        # pred_boxes = [list of (class, confidence, x, y, w, h)] * batch_size
        # labels = [list of (class, confidence, x, y, w, h)] * batch_size


        # for each IMAGE
        for idx in range(batch_size):
            nms_boxes = non_max_suppression(
                pred_boxes[idx],
                iou_threshold=iou_threshold,
                confidence_threshold=confidence_threshold,
                box_format=box_format
            )
            print(nms_boxes)
            # For each PREDICTED BOX
            for nms_box in nms_boxes:
                # We need "train idx" for mAP
                all_pred_boxes.append(
                    [train_idx] + nms_box
                )

            # For each GROUND TRUTH BOX
            for gt_box in gt_boxes[idx]:
                # Many (i,j)th cells in S x S DO NOT have
                # ground truth labels!!
                if gt_box[1] > confidence_threshold:
                    all_gt_boxes.append(
                        [train_idx] + gt_box
                    )
            train_idx += 1

    model.train()
    return all_pred_boxes, all_gt_boxes


def convert_cellboxes(box_3d, S=7):
    '''
    Converts (batch_size, S, S, 30) -> (batch_size, S, S, 6)
    by selecting the best box among box 1 and 2.

    Parameters:
        box_3d (tensor): Shape of (batch_size, S, S, 30)
        S (int): Grid size

    Returns:
        tensor: Shape of (batch_size, S, S, 6) where
        tensor vector in dim=3 is in the format of
        (class, confidence, x, y, w, h)

    For 30 vector,
    0:20 = class vector
    20 = box1 confidence
    21:25 = box1 (x,y,w,h)
    25 = box2 confidence
    26:30 = box2 (x,y,w,h)
    '''
    batch_size = box_3d.shape[0]
    box_3d = box_3d.to("cpu")
    box_3d = box_3d.reshape(batch_size, S, S, 30)

    converted_boxes = torch.empty(batch_size, S, S, 6)
    for i in range(S):
        for j in range(S):
            # Scale to relative to the whole image rather than cell
            box1_cell_x = box_3d[..., i:i+1, j:j+1, 21]
            box1_cell_y = box_3d[..., i:i+1, j:j+1, 22]
            box1_cell_w = box_3d[..., i:i+1, j:j+1, 23]
            box1_cell_h = box_3d[..., i:i+1, j:j+1, 24]

            box_3d[..., i:i+1, j:j+1, 21] = (j + box1_cell_x) / S
            box_3d[..., i:i+1, j:j+1, 22] = (i + box1_cell_y) / S
            box_3d[..., i:i+1, j:j+1, 23] = box1_cell_w / S
            box_3d[..., i:i+1, j:j+1, 24] = box1_cell_h / S

            box2_cell_x = box_3d[..., i:i+1, j:j+1, 26]
            box2_cell_y = box_3d[..., i:i+1, j:j+1, 27]
            box2_cell_w = box_3d[..., i:i+1, j:j+1, 28]
            box2_cell_h = box_3d[..., i:i+1, j:j+1, 29]

            box_3d[..., i:i+1, j:j+1, 26] = (j + box2_cell_x) / S
            box_3d[..., i:i+1, j:j+1, 27] = (i + box2_cell_y) / S
            box_3d[..., i:i+1, j:j+1, 28] = box2_cell_w / S
            box_3d[..., i:i+1, j:j+1, 29] = box2_cell_h / S

            # Best Class
            best_class = box_3d[..., i:i+1, j:j+1, 0:20].argmax(3, keepdim=True)

            # Best confidence and its index (0 = box1, 1 = box2),
            # used below to pick the corresponding coordinates
            best_confidence, best_confidence_idx = torch.cat(
                (
                    box_3d[..., i:i+1, j:j+1, 20:21],
                    box_3d[..., i:i+1, j:j+1, 25:26]
                ),
                dim=3
            ).max(dim=3, keepdim=True)

            boxes1 = box_3d[..., i:i+1, j:j+1, 21:25]
            boxes2 = box_3d[..., i:i+1, j:j+1, 26:30]

            # Best Box coordinate
            best_box = (
                (1 - best_confidence_idx) * boxes1
                + best_confidence_idx * boxes2
            )

            converted_boxes[..., i:i+1, j:j+1, :] = torch.cat(
                (
                    best_class,
                    best_confidence,
                    best_box
                ),
                dim=-1
            )

    return converted_boxes

# input: (batch_size, 7, 7, 30)
# return: [[[class, confidence, x, y, w, h], ...], ...] -> one inner list of
# boxes per image
def cellboxes_to_list_boxes(box_3d, S=7):
    '''
    Returns a list of "list of all bounding box information" in the format of
    (class, confidence, x, y, w, h). The length of the output will be the same
    as batch_size. Each ELEMENT of output is also a LIST for a particular IMAGE.

    Parameters:
        box_3d (tensor): Shape of (batch_size, S, S, 30). Each element in dim=0
        is a 3D-shaped bounding boxes.

        S (int): Grid size

    Returns:
        list: The length of the output will be the same as batch_size.
            Each ELEMENT of output is also a 2D LIST for a particular IMAGE.
            Each bounding box is (class, confidence, x, y, w, h)
    '''
    converted_list_boxes = []
    batch_size = box_3d.shape[0]

    # convert (batch_size,S,S,30) -> (batch_size,S,S,6)
    box_3d = convert_cellboxes(box_3d)
    print("box_3d in cellboxes_to_list_boxes:",box_3d.shape)
    box_3d = box_3d.reshape(batch_size, S*S, -1)
    box_3d[..., 0] = box_3d[..., 0].long()

    for img_idx in range(batch_size):
        img_list_boxes = []
        for box_idx in range(S*S):
            img_list_boxes.append([x.item() for x in box_3d[img_idx, box_idx, :]])
        converted_list_boxes.append(img_list_boxes)

    return converted_list_boxes


def save_checkpoint(state, fname="my_checkpoint.pth.tar"):
    print("=> Saving Checkpoint...")
    torch.save(state, fname)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading Checkpoint...")
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
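
And a small sketch of non_max_suppression on a hand-made list of boxes in the (class, confidence, x, y, w, h) format: the second box heavily overlaps the first with the same class, so it should be suppressed, while the third survives because it belongs to a different class:

from utils import non_max_suppression

boxes = [
    [0, 0.9, 0.50, 0.50, 0.4, 0.4],  # kept (highest confidence of class 0)
    [0, 0.8, 0.52, 0.50, 0.4, 0.4],  # suppressed: IoU with the box above is ~0.9
    [1, 0.7, 0.20, 0.20, 0.2, 0.2],  # kept: different class
]

kept = non_max_suppression(
    boxes, iou_threshold=0.5, confidence_threshold=0.4, box_format="midpoint"
)
print(kept)  # expect the first and third boxes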

train.py

import torch
import torchvision.transforms as transforms
import torch.optim as optim
import torchvision.transforms.functional as F
from tqdm import tqdm
from torch.utils.data import DataLoader
from model import YOLOv1
from dataset import VOCDataset
from utils import(
    intersection_over_union,
    non_max_suppression,
    mean_average_precision,
    cellboxes_to_list_boxes,
    get_bboxes,
    save_checkpoint,
    load_checkpoint,
)

from loss import YoloLoss

seed = 123
torch.manual_seed(seed)

# Hyperparameters
LEARNING_RATE = 5e-6
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Running on {DEVICE}")
BATCH_SIZE = 32
WEIGHT_DECAY = 0
EPOCHS = 3000
NUM_WORKERS = 2
PIN_MEMORY = True
LOAD_MODEL = True
LOAD_MODEL_FILE = "overfit.pth.tar"
IMG_DIR = "data/images"
LABEL_DIR = "data/labels"

max_map = 0
save_map = False
class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, bboxes):
        for t in self.transforms:
            # transforms are applied to the image only; bboxes stay as they are
            img, bboxes = t(img), bboxes
        return img, bboxes

transform = Compose([transforms.Resize((448,448)), transforms.ToTensor()])

def train_fn(train_loader, model, optimizer, loss_fn):
    loop = tqdm(train_loader, leave=True)
    mean_loss = []

    for batch_idx, (x, y) in enumerate(loop):
        x, y = x.to(DEVICE), y.to(DEVICE)
        out = model(x)
        loss = loss_fn(out, y)
        mean_loss.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update the progress bar
        loop.set_postfix(loss = loss.item())

    print(f"Mean loss was {sum(mean_loss)/len(mean_loss)}")

def main():
    model = YOLOv1(grid_size=7, num_boxes=2, num_classes=20).to(DEVICE)
    optimizer = optim.Adam(
        model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
    )
    loss_fn = YoloLoss()

    if LOAD_MODEL:
        load_checkpoint(torch.load(LOAD_MODEL_FILE, map_location=DEVICE), model, optimizer)

    train_dataset = VOCDataset(
        "data/8examples.csv",
        transform=transform,
        img_dir=IMG_DIR,
        label_dir=LABEL_DIR,
    )

    test_dataset = VOCDataset(
        "data/test.csv",
        transform=transform,
        img_dir=IMG_DIR,
        label_dir=LABEL_DIR
    )

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        drop_last=False
    )

    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=True
    )

    for epoch in range(EPOCHS):
        pred_boxes, target_boxes = get_bboxes(
            train_loader, model, iou_threshold=0.5, confidence_threshold=0.4, device=DEVICE
        )

        mean_avg_prec = mean_average_precision(
            pred_boxes, target_boxes, iou_threshold=0.5, box_format="midpoint"
        )

        print(f"Train mAP: {mean_avg_prec}")
        global max_map
        global save_map
        if mean_avg_prec > 0.99 and not save_map:
            checkpoint = {
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    }
            save_checkpoint(checkpoint, fname=LOAD_MODEL_FILE)
            max_map = mean_avg_prec
            save_map = True
        train_fn(train_loader, model, optimizer, loss_fn)

if __name__ == "__main__":
    main()

plot_image.py

Now let’s test our model to see if it works.

import torch
from dataset import VOCDataset
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
from model import YOLOv1
from utils import (
    cellboxes_to_list_boxes,
    non_max_suppression,
)
import time

LOAD_MODEL_FILE = "overfit.pth.tar"
IMG_DIR = "data/images"
LABEL_DIR = "data/labels"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
classes={
    1: 'bicycle',
    6: 'car',
    7: 'cat',
    11: 'dog',
    12: 'horse',
    14: 'person'
}

color_list = ['red','blue','brown','rosybrown','lightyellow','aquamarine','mediumslateblue','skyblue','darkorchid','purple','cyan','darkcyan','lime','green','lightsteelblue','cornflowerblue','pink','crimson','peru','chocolate']
class_list = list(range(0,20))

colors = dict(zip(class_list,color_list))

def plot_bbox(image_tensor):
    '''
    Draw bounding boxes with image

    Parameters:
        images (tensor): (3, 448, 448)

    Returns
        None
    '''
    start_time = time.time()

    model = YOLOv1(grid_size=7, num_boxes=2, num_classes=20)
    model.load_state_dict(
        torch.load(LOAD_MODEL_FILE, map_location=DEVICE)['model_state_dict']
    )
    # DON'T FORGET!!! eval mode so Dropout and BatchNorm behave deterministically
    model.eval()
    predictions = model(image_tensor.unsqueeze(0))

    # 1) Loop through images and predictions
    # 2) Plot image first then the prediction
    pred_boxes = cellboxes_to_list_boxes(predictions)
    pred_boxes_nms = []
    for idx in range(len(pred_boxes)):
        nms_boxes = non_max_suppression(
            pred_boxes[idx],
            iou_threshold=0.5,
            confidence_threshold=0.5,
            box_format="midpoint"
        )
        pred_boxes_nms.append(nms_boxes)

    image = image_tensor.permute(1, 2, 0)
    bboxes = pred_boxes_nms[0]

    fig, ax = plt.subplots(figsize=(10, 7))

    # Convert to np.array to obtain height and width
    image = np.array(image)
    img_h, img_w, _ = image.shape

    ax.imshow(image)

    for box in bboxes:
        box_class, confidence, x, y, w, h = box

        # Need to convert midpoint (x, y) to the upper-left corner
        upper_left_x = x - w / 2
        upper_left_y = y - h / 2
        upper_left_x *= img_w
        upper_left_y *= img_h

        rect = patches.Rectangle(
            (upper_left_x, upper_left_y),
            width = w * img_w,
            height = h * img_h,
            linewidth=3,
            edgecolor=colors[box_class],
            facecolor='none'
        )

        ax.add_patch(rect)
        ax.text(
            x = upper_left_x,
            y = upper_left_y - 5,
            fontsize=8,
            backgroundcolor = colors[box_class],
            color = 'black',
            s = f"{classes[int(box_class)]}: {confidence:.2f}"
        )
        ax.set_axis_off()

    print(f"Execution Time: {time.time() - start_time} seconds.")
    #  plt.show(block=False)
    plt.axis('off')
    plt.draw()
    plt.pause(5)
    plt.close()
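
Here is a sketch of how plot_bbox can be called on one of the training images. The file name is hypothetical, the resizing mirrors the transform in train.py, and it assumes the overfit checkpoint (overfit.pth.tar) already exists:

import torchvision.transforms.functional as F
from PIL import Image
from plot_image import plot_bbox

# Load one training image and resize it exactly as in train.py
img = Image.open("data/images/000007.jpg")  # hypothetical file name
img_tensor = F.to_tensor(F.resize(img, (448, 448)))

plot_bbox(img_tensor)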