# Source code for super_gradients.training.models.yolov3

"""
Yolov3 code adapted from https://github.com/ultralytics/yolov3
"""
from typing import Union

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

from super_gradients.training.models import SgModule
from super_gradients.training.models.darknet53 import Darknet53, DarkResidualBlock, create_conv_module
from super_gradients.training.utils import HpmStruct, get_param


class SPPLayer(nn.Module):
    """
    Spatial Pyramid Pooling layer.

    Concatenates the input with three max-pooled views of itself
    (kernel sizes 5, 9 and 13, stride 1, "same" padding) along the
    channel dimension, so the spatial size is unchanged and the
    channel count is multiplied by four.
    """

    def __init__(self):
        super(SPPLayer, self).__init__()

    def forward(self, x):
        # Each kernel k uses padding k // 2 so the output keeps the input's H x W.
        pooled_views = [F.max_pool2d(x, kernel, stride=1, padding=kernel // 2) for kernel in (5, 9, 13)]
        return torch.cat([x] + pooled_views, dim=1)
class Upsample(nn.Module):
    """
    Module wrapper around F.interpolate.

    Stores a scale factor and an interpolation mode (default "nearest")
    and applies them to the input on every forward pass.
    """

    def __init__(self, scale_factor, mode="nearest"):
        super(Upsample, self).__init__()
        self.scale_factor = scale_factor
        self.mode = mode

    def forward(self, x):
        return F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)
class YOLOLayer(nn.Module):
    """
    YOLOLayer - a single YOLO v3 detection head.

    Decodes the raw feature map produced by the preceding 1x1 conv into
    per-anchor box predictions (x, y, w, h, objectness, class scores).
    Grid offset/anchor tensors are built lazily by create_grids() and
    rebuilt whenever the incoming feature-map size changes.
    """

    def __init__(self, anchors_mask: list, classes_num: int, anchors: list, image_size: int, onnx_stride: int,
                 onnx_export_mode: bool = False):
        """
        YOLOLayer
        :param anchors_mask: indices into ``anchors`` selecting the anchors this head uses
        :param classes_num: number of classes in the data set
        :param anchors: full (width, height) anchor list shared by all heads
        :param image_size: the model's input image size
        :param onnx_stride: down-sampling stride of this head, used to size the export grid
        :param onnx_export_mode: when True, forward() runs the export-friendly decoding path
        """
        super(YOLOLayer, self).__init__()
        self.anchors = anchors
        self.onnx_export_mode = onnx_export_mode
        # Keep only the anchors assigned to this head.
        masked_anchors = [self.anchors[i] for i in anchors_mask]
        anchors = np.array(masked_anchors)
        # NOTE(review): despite its name, ``anchors_mask`` stores the *selected anchors*
        # themselves (a tensor of (w, h) pairs), not the index mask.
        self.anchors_mask = torch.Tensor(anchors)
        self.anchors_num = len(anchors_mask)
        self.classes_num = classes_num
        # Grid dims start at 0 so the first (non-ONNX) forward() triggers create_grids().
        self.x_grid_points_num = 0
        self.y_grid_points_num = 0
        self.onnx_stride = onnx_stride

    def forward(self, img, img_size):
        """
        Decode the head's raw conv output.
        :param img: feature map of shape (batch, anchors_num * (classes_num + 5), ny, nx)
        :param img_size: current network input size, used to rebuild grids on size change
        :return: training: raw (bs, anchors, ny, nx, classes_num + 5) prediction;
                 ONNX export: one transposed tensor of normalized xy, wh, conf, class scores;
                 inference: (decoded detections, raw prediction) tuple
        """
        if self.onnx_export_mode:
            # ALL OF THE GRIDS WERE CALCULATED IN init
            batch_size = 1
        else:
            batch_size, _, y_grid_points_num, x_grid_points_num = img.shape
            # REBUILD THE GRID TENSORS ONLY WHEN THE FEATURE-MAP SIZE CHANGED
            if (self.x_grid_points_num, self.y_grid_points_num) != (x_grid_points_num, y_grid_points_num):
                self.create_grids(img_size, (x_grid_points_num, y_grid_points_num), img.device, img.dtype)

        # PREDICTION
        # IMG.VIEW(BATCH_SIZE, PRE_YOLO_LAYER_SIZE(DEFAULT IS 255), 13, 13) --> (BATCH_SIZE, 3, 13, 13, NUM_CLASSES + 5)
        # (BS, ANCHORS_NUM, GRID, GRID, CLASSES + XYWH + OBJECTNESS)
        prediction = img.view(batch_size, self.anchors_num, self.classes_num + 5, self.y_grid_points_num,
                              self.x_grid_points_num).permute(0, 1, 3, 4, 2).contiguous()

        if self.training:
            return prediction

        # INFERENCE - ONNX
        elif self.onnx_export_mode:
            # CONSTANTS CAN NOT BE BROADCASTED
            m = self.anchors_num * self.x_grid_points_num * self.y_grid_points_num
            ngu = self.grid_size.repeat((1, m, 1))
            grid_xy = self.grid_xy.repeat((1, self.anchors_num, 1, 1, 1)).view(1, m, 2)
            # NOTE(review): anchor_wh is expanded with x before y while grid_xy is laid out
            # (ny, nx); for square export grids this is equivalent, but confirm the ordering
            # before exporting with a non-square input.
            anchor_wh = self.anchor_wh.repeat((1, 1, self.x_grid_points_num, self.y_grid_points_num, 1)).view(1, m, 2) / ngu
            # MOVE THE TENSORS TO SAME DEVICE AS prediction TO APPLY TENSOR CALCULATION
            ngu = ngu.to(prediction.device)
            grid_xy = grid_xy.to(prediction.device)
            anchor_wh = anchor_wh.to(prediction.device)
            prediction = prediction.view(m, 5 + self.classes_num)
            xy = torch.sigmoid(prediction[..., 0:2]) + grid_xy[0]  # x, y
            wh = torch.exp(prediction[..., 2:4]) * anchor_wh[0]  # width, height
            prediction_confidence = torch.sigmoid(prediction[:, 4:5])
            # CHANGE THE RESULTS TO BE A VECTOR OF CLASS CONF * OBJECTNESS CONF FOR EACH OF THE CLASSES (like SSD)
            cls_prediction = F.softmax(prediction[:, 5:5 + self.classes_num], 1) * prediction_confidence
            return torch.cat((xy / ngu[0], wh, prediction_confidence, cls_prediction), 1).t()

        # INFERENCE
        else:
            inference_out = prediction.clone()
            # DECODE: CELL-RELATIVE xy -> GRID COORDS, LOG-SPACE wh -> GRID UNITS, THEN SCALE TO PIXELS
            inference_out[..., 0:2] = torch.sigmoid(inference_out[..., 0:2]) + self.grid_xy
            inference_out[..., 2:4] = torch.exp(inference_out[..., 2:4]) * self.anchor_wh
            inference_out[..., :4] *= self.stride
            torch.sigmoid_(inference_out[..., 4:])
            if self.classes_num == 1:
                inference_out[..., 5] = 1  # IGNORE cls FOR SINGLE CLASS DATA SETS
            # RESHAPE FROM [1, 3, 13, 13, NUM_CLASSES + 5] TO [1, 507, NUM_CLASSES + 5]
            return inference_out.view(batch_size, -1, 5 + self.classes_num), prediction

    def create_grids(self, img_size=(416, 416), grid_size=(13, 13), device='cpu', data_type=torch.float32):
        """
        create_grids - Creates the grids for image sizes that are different than the model's default image size
        :param img_size: network input size (a (h, w)-like pair; only its max is used)
        :param grid_size: (nx, ny) number of grid cells of this head's feature map
        :param device: device to place the grid tensors on
        :param data_type: dtype of the grid tensors
        """
        nx, ny = grid_size
        self.img_size = max(img_size)
        # STRIDE = PIXELS PER GRID CELL (ASSUMES A SQUARE-ISH INPUT/GRID)
        self.stride = self.img_size / max(grid_size)

        # build xy offsets - grid_xy[0, 0, j, i] == (i, j), the top-left offset of cell (i, j)
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        self.grid_xy = torch.stack((xv, yv), 2).to(device).type(data_type).view((1, 1, ny, nx, 2))

        # build wh gains - anchors re-expressed in grid-cell units
        self.anchor_vec = self.anchors_mask.to(device) / self.stride
        self.anchor_wh = self.anchor_vec.view(1, self.anchors_num, 1, 1, 2).to(device).type(data_type)
        self.grid_size = torch.Tensor(grid_size).to(device)
        self.x_grid_points_num = nx
        self.y_grid_points_num = ny
class YoloV3(SgModule):
    """
    YoloV3
    """

    def __init__(self, num_classes: int = 80, image_size: int = 416, arch_params: HpmStruct = None,
                 iou_t: float = 0.225, yolo_v3_anchors: list = None, onnx_export_mode=False):
        """
        :param num_classes: number of classes in the data set
        :param image_size: network input size (assumed square)
        :param arch_params: optional HpmStruct; any field present in it overrides the matching keyword
        :param iou_t: IoU threshold, stored for the training pipeline
        :param yolo_v3_anchors: (w, h) anchor pairs; defaults to the COCO YOLOv3 anchors
        :param onnx_export_mode: when True, pre-computes the YOLO grids for export at construction time
        """
        super(YoloV3, self).__init__()
        if arch_params:
            # EXPLICIT arch_params ENTRIES OVERRIDE THE KEYWORD-ARGUMENT DEFAULTS
            arch_params_dict = arch_params.to_dict()
            self.num_classes = arch_params.num_classes if 'num_classes' in arch_params_dict else num_classes
            self.image_size = arch_params.image_size if 'image_size' in arch_params_dict else image_size
            self.iou_t = arch_params.iou_t if 'iou_t' in arch_params_dict else iou_t
            self.onnx_export_mode = arch_params.onnx_export_mode if \
                'onnx_export_mode' in arch_params_dict else onnx_export_mode
            yolo_v3_anchors = arch_params.yolo_v3_anchors if 'yolo_v3_anchors' in arch_params_dict else yolo_v3_anchors
        else:
            self.image_size = image_size
            self.num_classes = num_classes
            self.iou_t = iou_t
            self.onnx_export_mode = onnx_export_mode

        # THIS IS THE LAYER SIZE THAT FEEDS THE YOLO LAYER (3 anchors x (classes + xywh + objectness))
        self.pre_yolo_layer_size = (self.num_classes + 5) * 3

        if yolo_v3_anchors is None:
            # USE DEFAULT COCO DATA SET ANCHORS FOR YOLO V3
            yolo_v3_anchors = [(10., 13.), (16., 30.), (33., 23.), (30., 61.), (62., 45.), (59., 119.),
                               (116., 90.), (156., 198.), (373., 326.)]
        self.yolo_v3_anchors = yolo_v3_anchors
        self.module_list = self.create_modules_list(num_classes=self.num_classes)
        self.yolo_layers_indices = self.get_yolo_layers_indices()
        if self.onnx_export_mode:
            self.prep_model_for_conversion([self.image_size, self.image_size])
    def forward(self, x, var=None):
        """
        forward - Runs the module list sequentially, collecting the outputs of the three YOLO heads.
        :param x: input image batch (N, C, H, W)
        :param var: unused, kept for interface compatibility
        :return: training: list of raw per-head predictions;
                 ONNX export: (class scores, boxes) transposed tensors;
                 inference: (concatenated decoded detections, raw per-head predictions)
        """
        img_size = x.shape[-2:]
        yolo_output = []
        route_layers = []
        for i, module in enumerate(self.module_list):
            if isinstance(module, YOLOLayer):
                # DETECTION HEAD - COLLECT ITS OUTPUT INSTEAD OF FEEDING IT FORWARD
                y = module(x, img_size=img_size)
                yolo_output.append(y)
            else:
                x = module(x)
            # CONCATENATE THE OUTPUTS OF PREVIOUS LAYERS
            x, route_layers = self.concatenate_layer_output(x, i, route_layers)

        if self.training:
            return yolo_output
        elif self.onnx_export_mode:
            # CAT 3 LAYERS (NUM_CLASSES + 5) X (507, 2028, 8112) TO (NUM_CLASSES + 5) X 10647
            output = torch.cat(yolo_output, 1)
            # ONNX SCORES, bboxes
            return output[5:5 + self.num_classes].t(), output[0:4].t()
        else:
            # INFERENCE - each head returned an (inference, training) pair; split and merge
            inference_output, training_output = list(zip(*yolo_output))
            return torch.cat(inference_output, 1), training_output
[docs] def initialize_param_groups(self, lr: float, training_params: HpmStruct) -> list: """ initialize_optimizer_for_model_param_groups - Initializes the optimizer group params, adds weight decay *Only* to the Conv2D layers :param lr: lr to set for the optimizer :param training_params: :return: A dictionary with named params and optimizer attributes """ optimizer_params = get_param(training_params, 'optimizer_params') # OPTIMIZER PARAMETER GROUPS default_param_group, weight_decay_param_group, biases_param_group = [], [], [] for k, v in dict(self.named_parameters()).items(): if '.bias' in k: biases_param_group += [[k, v]] elif 'Conv2d.weight' in k: weight_decay_param_group += [[k, v]] else: default_param_group += [[k, v]] # DEFAULT USAGE FOR YOLO TRAINING IS WITH NESTEROV nesterov = True if 'nesterov' not in optimizer_params.keys() else optimizer_params['nesterov'] default_param_group_optimizer_format = {'named_params': default_param_group, 'lr': lr, 'nesterov': nesterov, 'momentum': optimizer_params['momentum']} weight_decay_param_group_optimizer_format = {'named_params': weight_decay_param_group, 'weight_decay': optimizer_params['weight_decay']} biases_param_group_optimizer_format = {'named_params': biases_param_group} return [default_param_group_optimizer_format, weight_decay_param_group_optimizer_format, biases_param_group_optimizer_format]
[docs] @staticmethod def concatenate_layer_output(x, layer_index: int, route_layers: list) -> tuple: """ concatenate_layer_output :param x: input for the layer :param layer_index: the layer index to decide how to concatenate to :param route_layers: the route layers list with previous data :return: tuple of x, route_layers """ # CONCATENATE THE OUTPUTS OF PREVIOUS LAYERS if layer_index in [6, 8, 16, 26]: route_layers.append(x) if layer_index == 19: x = route_layers[2] if layer_index == 29: x = route_layers[3] if layer_index == 21: x = torch.cat((x, route_layers[1]), 1) if layer_index == 31: x = torch.cat((x, route_layers[0]), 1) return x, route_layers
[docs] def get_yolo_layers_indices(self): return [i for i, module in enumerate(self.module_list) if isinstance(module, YOLOLayer)]
    @staticmethod
    def add_yolo_layer_to_modules_list(modules_list: nn.ModuleList, image_size: int, yolo_v3_anchors: list,
                                       anchors_mask: list, num_classes: int, onnx_stride: int,
                                       onnx_export_mode: bool = False) -> nn.ModuleList:
        """
        add_yolo_layer_to_modules_list - Adds a YoLo Head Layer to the nn.ModuleList
        :param modules_list: The Modules List (its last entry must be the pre-YOLO 1x1 conv wrapper)
        :param image_size: The YoLo Model Image Size
        :param yolo_v3_anchors: The Anchors (K-Means) List for the YoLo Layer Initialization
        :param anchors_mask: the mask to get the relevant anchors
        :param num_classes: The number of different classes in the data
        :param onnx_stride: The stride of the layer for ONNX grid points calculation in YoLo Layer init
        :param onnx_export_mode: Alter the model YoLo Layer for ONNX Export
        :return: The nn.ModuleList with the Added Yolo layer, and a Bias Initialization
        """
        mask = [yolo_v3_anchors[i] for i in anchors_mask]
        # NOTE(review): presumably these shift the objectness/class biases toward low prior
        # probabilities (a YOLO-style bias init) — confirm against the upstream recipe.
        b = [-5.5, -5.0]
        # View the preceding conv's bias as (anchors, classes_num + 5) to address fields per anchor.
        bias = modules_list[-1][0].bias.view(len(mask), -1)  # PRE-YOLO-LAYER to 3x(NUM_CLASSES + 5)
        with torch.no_grad():
            bias[:, 4] += b[0] - bias[:, 4].mean()  # OBJECTNESS
            bias[:, 5:] += b[1] - bias[:, 5:].mean()  # CLASSIFICATION
        modules_list[-1][0].bias = torch.nn.Parameter(bias.view(-1))
        modules_list.append(YOLOLayer(anchors_mask=anchors_mask, classes_num=num_classes, anchors=yolo_v3_anchors,
                                      image_size=image_size, onnx_stride=onnx_stride,
                                      onnx_export_mode=onnx_export_mode))
        return modules_list
[docs] @staticmethod def named_sequential_module(module_name, module) -> nn.Sequential: """ create_named_nn_sequential_module :param module_name: :param module: :return: nn.Sequential() with the added relevant names """ named_sequential_module = nn.Sequential() named_sequential_module.add_module(module_name, module) return named_sequential_module
    def create_modules_list(self, num_classes: int):
        """
        create_modules_list - Builds the full YoloV3 module list: the Darknet-53 backbone,
        an SPP block, and three detection branches (strides 32 / 16 / 8). The trailing
        numeric comments are the layer indices that concatenate_layer_output() routes by.
        :param num_classes: number of classes in the data set
        :return: module list with the complete YoloV3 architecture
        """
        # DARKNET BACKBONE ARCHITECTURE
        darknet_53 = Darknet53(backbone_mode=True)
        yolo_modules_list = darknet_53.get_modules_list()

        # YOLO V3 ARCHITECTURE
        yolo_modules_list.append(DarkResidualBlock(in_channels=1024, shortcut=False))  # 11
        yolo_modules_list.append(create_conv_module(in_channels=1024, out_channels=512, kernel_size=1, stride=1))  # 12
        yolo_modules_list.append(SPPLayer())  # 13
        yolo_modules_list.append(create_conv_module(in_channels=2048, out_channels=512, kernel_size=1, stride=1))  # 14
        yolo_modules_list.append(create_conv_module(in_channels=512, out_channels=1024, kernel_size=3, stride=1))  # 15
        yolo_modules_list.append(create_conv_module(in_channels=1024, out_channels=512, kernel_size=1, stride=1))  # 16
        yolo_modules_list.append(create_conv_module(in_channels=512, out_channels=1024, kernel_size=3, stride=1))  # 17
        yolo_modules_list.append(self.named_sequential_module('Conv2d',
                                                              nn.Conv2d(in_channels=1024,
                                                                        out_channels=self.pre_yolo_layer_size,
                                                                        kernel_size=1, stride=1)))  # 18
        # FIRST DETECTION HEAD (STRIDE 32)
        yolo_modules_list = self.add_yolo_layer_to_modules_list(modules_list=yolo_modules_list,  # 19
                                                                image_size=self.image_size,
                                                                yolo_v3_anchors=self.yolo_v3_anchors,
                                                                anchors_mask=[6, 7, 8], num_classes=num_classes,
                                                                onnx_stride=32,
                                                                onnx_export_mode=self.onnx_export_mode)
        yolo_modules_list.append(create_conv_module(in_channels=512, out_channels=256, kernel_size=1, stride=1))  # 20
        yolo_modules_list.append(Upsample(scale_factor=2, mode='nearest'))  # 21
        yolo_modules_list.append(create_conv_module(in_channels=768, out_channels=256, kernel_size=1, stride=1))  # 22
        yolo_modules_list.append(create_conv_module(in_channels=256, out_channels=512, kernel_size=3, stride=1))  # 23
        yolo_modules_list.append(create_conv_module(in_channels=512, out_channels=256, kernel_size=1, stride=1))  # 24
        yolo_modules_list.append(create_conv_module(in_channels=256, out_channels=512, kernel_size=3, stride=1))  # 25
        yolo_modules_list.append(create_conv_module(in_channels=512, out_channels=256, kernel_size=1, stride=1))  # 26
        yolo_modules_list.append(create_conv_module(in_channels=256, out_channels=512, kernel_size=3, stride=1))  # 27
        yolo_modules_list.append(self.named_sequential_module('Conv2d',
                                                              nn.Conv2d(in_channels=512,
                                                                        out_channels=self.pre_yolo_layer_size,
                                                                        kernel_size=1, stride=1)))  # 28
        # SECOND DETECTION HEAD (STRIDE 16)
        yolo_modules_list = self.add_yolo_layer_to_modules_list(modules_list=yolo_modules_list,  # 29
                                                                image_size=self.image_size,
                                                                yolo_v3_anchors=self.yolo_v3_anchors,
                                                                anchors_mask=[3, 4, 5], num_classes=num_classes,
                                                                onnx_stride=16,
                                                                onnx_export_mode=self.onnx_export_mode)
        yolo_modules_list.append(create_conv_module(in_channels=256, out_channels=128, kernel_size=1, stride=1))  # 30
        yolo_modules_list.append(Upsample(scale_factor=2, mode='nearest'))  # 31
        yolo_modules_list.append(create_conv_module(in_channels=384, out_channels=128, kernel_size=1, stride=1))  # 32
        yolo_modules_list.append(create_conv_module(in_channels=128, out_channels=256, kernel_size=3, stride=1))  # 33
        yolo_modules_list.append(create_conv_module(in_channels=256, out_channels=128, kernel_size=1, stride=1))  # 34
        yolo_modules_list.append(create_conv_module(in_channels=128, out_channels=256, kernel_size=3, stride=1))  # 35
        yolo_modules_list.append(create_conv_module(in_channels=256, out_channels=128, kernel_size=1, stride=1))  # 36
        yolo_modules_list.append(create_conv_module(in_channels=128, out_channels=256, kernel_size=3, stride=1))  # 37
        yolo_modules_list.append(self.named_sequential_module('Conv2d',
                                                              nn.Conv2d(in_channels=256,
                                                                        out_channels=self.pre_yolo_layer_size,
                                                                        kernel_size=1, stride=1)))  # 38
        # THIRD DETECTION HEAD (STRIDE 8)
        yolo_modules_list = self.add_yolo_layer_to_modules_list(modules_list=yolo_modules_list,  # 39
                                                                image_size=self.image_size,
                                                                yolo_v3_anchors=self.yolo_v3_anchors,
                                                                anchors_mask=[0, 1, 2], num_classes=num_classes,
                                                                onnx_stride=8,
                                                                onnx_export_mode=self.onnx_export_mode)
        return yolo_modules_list
[docs] def prep_model_for_conversion(self, input_size: Union[tuple, list] = None, **kwargs): """ Method for preparing the Yolov3 and TinyYolov3 for conversion (ONNX, TRT, CoreML etc). :param input_size: used for calculating the grid points. """ self.onnx_export_mode = True # ONNX EXPORT REQUIRES GRIDS TO BE CALCULATED IN init of YOLOLayer SO WE RE-RUN THE CALC METHOD for module in self.module_list: if isinstance(module, YOLOLayer): module.onnx_export_mode = True x_grid_points_num = int(input_size / module.onnx_stride) y_grid_points_num = int(input_size / module.onnx_stride) module.create_grids((input_size, input_size), (x_grid_points_num, y_grid_points_num))
class TinyYoloV3(YoloV3):
    """
    TinyYoloV3 - Inherits from YoLoV3 class and overloads the relevant methods and members
    (anchor defaults, layer routing and the module list).
    """

    def __init__(self, num_classes: int = 80, image_size: int = 416, arch_params: dict = None, iou_t: float = 0.225,
                 yolo_v3_anchors: list = None):
        """
        :param num_classes: number of classes in the data set
        :param image_size: network input size (assumed square)
        :param arch_params: optional params object; may override yolo_v3_anchors
        :param iou_t: IoU threshold forwarded to the YoloV3 base
        :param yolo_v3_anchors: anchor (w, h) pairs; defaults to the tiny-YOLOv3 anchors
        """
        if arch_params:
            # NOTE(review): annotated as dict but used like HpmStruct (has .to_dict() and
            # attribute access) — confirm what callers actually pass.
            yolo_v3_anchors = arch_params.yolo_v3_anchors if 'yolo_v3_anchors' in arch_params.to_dict() \
                else yolo_v3_anchors
        if yolo_v3_anchors is None:
            # DEFAULT ANCHORS FOR TINY YOLO V3
            yolo_v3_anchors = [(10., 14.), (23., 27.), (37., 58.), (81., 82.), (135., 169.), (344., 319.)]
        super(TinyYoloV3, self).__init__(num_classes=num_classes, image_size=image_size, arch_params=arch_params,
                                         iou_t=iou_t, yolo_v3_anchors=yolo_v3_anchors)
[docs] @staticmethod def concatenate_layer_output(x, layer_index: int, route_layers: list) -> tuple: """ concatenate_layer_output :param x: input for the layer :param layer_index: the layer index to decide how to concatenate to :param route_layers: the route layers list with previous data :return: tuple of x, route_layers """ # CONCATENATE THE OUTPUTS OF PREVIOUS LAYERS if layer_index in [8, 14]: route_layers.append(x) if layer_index == 17: x = route_layers[1] if layer_index == 19: x = torch.cat((x, route_layers[0]), 1) return x, route_layers
    def create_modules_list(self, num_classes: int):
        """
        create_tiny_modules_list - Builds the Tiny-Yolo-V3 module list: a small conv/max-pool
        backbone followed by two detection heads (strides 32 and 16). The trailing numeric
        comments are the layer indices that concatenate_layer_output() routes by.
        :param num_classes: The Number of different Classes
        :return: nn.ModuleList with the Tiny-Yolo-V3 Architecture
        """
        yolo_modules_list = nn.ModuleList()
        yolo_modules_list.append(create_conv_module(3, 16))  # 0
        yolo_modules_list.append(self.named_sequential_module('MaxPool2d', nn.MaxPool2d(stride=2, kernel_size=2)))  # 1
        yolo_modules_list.append(create_conv_module(16, 32))  # 2
        yolo_modules_list.append(self.named_sequential_module('MaxPool2d', nn.MaxPool2d(stride=2, kernel_size=2)))  # 3
        yolo_modules_list.append(create_conv_module(32, 64))  # 4
        yolo_modules_list.append(self.named_sequential_module('MaxPool2d', nn.MaxPool2d(stride=2, kernel_size=2)))  # 5
        yolo_modules_list.append(create_conv_module(64, 128))  # 6
        yolo_modules_list.append(self.named_sequential_module('MaxPool2d', nn.MaxPool2d(stride=2, kernel_size=2)))  # 7
        yolo_modules_list.append(create_conv_module(128, 256))  # 8
        yolo_modules_list.append(self.named_sequential_module('MaxPool2d', nn.MaxPool2d(stride=2, kernel_size=2)))  # 9
        yolo_modules_list.append(create_conv_module(256, 512))  # 10
        # ZERO-PAD + STRIDE-1 POOL KEEPS THE SPATIAL SIZE WHILE STILL APPLYING A 2x2 MAX
        yolo_modules_list.append(self.named_sequential_module('ZeroPad2d', nn.ZeroPad2d((0, 1, 0, 1))))  # 11
        yolo_modules_list.append(self.named_sequential_module('MaxPool2d', nn.MaxPool2d(stride=1, kernel_size=2)))  # 12
        yolo_modules_list.append(create_conv_module(512, 1024))  # 13
        yolo_modules_list.append(create_conv_module(1024, 256, kernel_size=1))  # 14
        yolo_modules_list.append(create_conv_module(256, 512))  # 15
        yolo_modules_list.append(self.named_sequential_module('Conv2d',
                                                              nn.Conv2d(in_channels=512,
                                                                        out_channels=self.pre_yolo_layer_size,
                                                                        kernel_size=1, stride=1)))  # 16
        # FIRST DETECTION HEAD (STRIDE 32)
        yolo_modules_list = self.add_yolo_layer_to_modules_list(modules_list=yolo_modules_list,  # 17
                                                                image_size=self.image_size,
                                                                yolo_v3_anchors=self.yolo_v3_anchors,
                                                                anchors_mask=[3, 4, 5], num_classes=num_classes,
                                                                onnx_stride=32,
                                                                onnx_export_mode=self.onnx_export_mode)
        yolo_modules_list.append(create_conv_module(256, 128, kernel_size=1))  # 18
        yolo_modules_list.append(Upsample(scale_factor=2, mode='nearest'))  # 19
        yolo_modules_list.append(create_conv_module(384, 256))  # 20
        yolo_modules_list.append(self.named_sequential_module('Conv2d',
                                                              nn.Conv2d(in_channels=256,
                                                                        out_channels=self.pre_yolo_layer_size,
                                                                        kernel_size=1, stride=1)))  # 21
        # THE [1, 2, 3] IN THE MASK IS NOT A BUG, BUT A FEATURE :)
        # SECOND DETECTION HEAD (STRIDE 16)
        yolo_modules_list = self.add_yolo_layer_to_modules_list(modules_list=yolo_modules_list,  # 22
                                                                image_size=self.image_size,
                                                                yolo_v3_anchors=self.yolo_v3_anchors,
                                                                anchors_mask=[1, 2, 3], num_classes=num_classes,
                                                                onnx_stride=16,
                                                                onnx_export_mode=self.onnx_export_mode)
        return yolo_modules_list