Source code for gluoncv.model_zoo.ssd.ssd

"""Single-shot Multi-box Detector."""
from __future__ import absolute_import

import os
import mxnet as mx
from mxnet import autograd
from mxnet.gluon import nn
from mxnet.gluon import HybridBlock
from ..features import FeatureExpander
from .anchor import SSDAnchorGenerator
from ..predictors import ConvPredictor
from ..coders import MultiPerClassDecoder, NormalizedBoxCenterDecoder
from .target import SSDTargetGenerator
from .vgg_atrous import vgg16_atrous_300, vgg16_atrous_512
# from ...utils import set_lr_mult
from ...data import VOCDetection

# Public API of this module: the SSD block, its generic builder and the
# ready-made Pascal VOC model factories.
__all__ = [
    'SSD',
    'get_ssd',
    'ssd_300_vgg16_atrous_voc',
    'ssd_512_vgg16_atrous_voc',
    'ssd_512_resnet18_v1_voc',
    'ssd_512_resnet50_v1_voc',
    'ssd_512_resnet101_v2_voc',
    'ssd_512_resnet152_v2_voc',
    'ssd_512_mobilenet1_0_voc',
]


class SSD(HybridBlock):
    """Single-shot Object Detection Network: https://arxiv.org/abs/1512.02325.

    Parameters
    ----------
    network : string or None
        Name of the base network. If `None`, the base network is instantiated
        directly from `features` instead of being composed.
    base_size : int
        Base input size. It is specified so SSD can support dynamic input shapes.
    features : list of str or mxnet.gluon.HybridBlock
        Intermediate features to be extracted, or a network with multiple outputs.
        If `network` is `None`, `features` must be a callable accepting a
        `pretrained` keyword and returning a multi-output network.
    num_filters : list of int
        Number of channels for the appended layers, ignored if `network` is `None`.
    sizes : iterable of float
        Sizes of anchor boxes, in incremental order. Its length must be
        (number of output layers) + 1. For example, a two stage SSD model can have
        ``sizes = [30, 60, 90]``, which is converted to `(30, 60)` and `(60, 90)`
        for the two stages, respectively. For more details, please refer to the
        original paper.
    ratios : list, or list of list
        Aspect ratios of anchors in each output layer. Either one list per output
        layer, or a single flat list of numbers that is shared by every layer.
    steps : list of int
        Step size of anchor boxes in each output layer.
    classes : iterable of str
        Names of all categories.
    use_1x1_transition : bool
        Whether to use 1x1 convolution as transition layer between attached layers,
        it is effective at reducing model capacity.
    use_bn : bool
        Whether to use BatchNorm layer after each attached convolutional layer.
    reduce_ratio : float
        Channel reduce ratio (0, 1) of the transition layer.
    min_depth : int
        Minimum channels for the transition layers.
    global_pool : bool
        Whether to attach a global average pooling layer as the last output layer.
    pretrained : bool
        Forwarded as `pretrained` to the feature extractor (either `features(...)`
        when `network` is `None`, or `FeatureExpander` otherwise).
    iou_thresh : float, default is 0.5
        IOU overlap threshold of matching targets, used during training phase.
    neg_thresh : float, default is 0.5
        Negative mining threshold for un-matched anchors, this is to avoid highly
        overlapped anchors being treated as negative samples.
    negative_mining_ratio : float, default is 3
        Ratio of negative vs. positive samples.
    stds : tuple of float, default is (0.1, 0.1, 0.2, 0.2)
        Std values to be divided/multiplied to box encoded values.
    nms_thresh : float, default is 0.45.
        Non-maximum suppression threshold. NMS is applied only when
        ``0 < nms_thresh < 1``; any other value disables it.
    nms_topk : int, default is -1
        Apply NMS to top k detection results, use -1 to disable so that every
        detection result is used in NMS.
    anchor_alloc_size : int, default is 128
        For advanced users. Side length of the (square) anchor map pre-computed
        for the first output layer; it is halved for every following layer but
        never goes below 16. Define `anchor_alloc_size` to generate large enough
        anchor maps, which are later saved in parameters. During inference, we
        support arbitrary input images by cropping the corresponding area of the
        anchor map. This allows us to export to symbol so we can run it in C++,
        Scala, etc.

    """
    def __init__(self, network, base_size, features, num_filters, sizes, ratios,
                 steps, classes, use_1x1_transition=True, use_bn=True,
                 reduce_ratio=1.0, min_depth=128, global_pool=False, pretrained=False,
                 iou_thresh=0.5, neg_thresh=0.5, negative_mining_ratio=3,
                 stds=(0.1, 0.1, 0.2, 0.2), nms_thresh=0.45, nms_topk=-1,
                 anchor_alloc_size=128, **kwargs):
        super(SSD, self).__init__(**kwargs)
        assert isinstance(ratios, list), "Must provide ratios as list or list of list"
        # A flat list of numbers means "use these ratios for every layer".
        shared_ratios = not isinstance(ratios[0], (tuple, list))
        if network is None:
            # A flat ratios list carries no information about the layer count,
            # so fall back to the number of (size_i, size_i+1) pairs.
            num_layers = len(sizes) - 1 if shared_ratios else len(ratios)
        else:
            num_layers = len(features) + len(num_filters) + int(global_pool)
        assert len(sizes) == num_layers + 1
        # consecutive sizes form the (min, max) pair of each layer
        sizes = list(zip(sizes[:-1], sizes[1:]))
        if shared_ratios:
            # Wrap before repeating: `ratios * num_layers` would concatenate the
            # numbers instead of producing one ratio list per layer.
            ratios = [ratios] * num_layers
        assert num_layers == len(sizes) == len(ratios), \
            "Mismatched (number of layers) vs (sizes) vs (ratios): {}, {}, {}".format(
                num_layers, len(sizes), len(ratios))
        assert num_layers > 0, "SSD require at least one layer, suggest multiple."
        self._num_layers = num_layers
        self.classes = classes
        self.num_classes = len(classes) + 1
        self.nms_thresh = nms_thresh
        self.nms_topk = nms_topk
        # The generator is stored inside a set rather than assigned directly;
        # `target_generator` unwraps it.
        # NOTE(review): presumably this keeps Gluon from registering it as a
        # child block -- confirm against Block.__setattr__.
        self.target = set([SSDTargetGenerator(
            iou_thresh=iou_thresh, neg_thresh=neg_thresh,
            negative_mining_ratio=negative_mining_ratio, stds=stds)])

        with self.name_scope():
            if network is None:
                # use fine-grained manually designed block as features
                self.features = features(pretrained=pretrained)
            else:
                self.features = FeatureExpander(
                    network=network, outputs=features, num_filters=num_filters,
                    use_1x1_transition=use_1x1_transition,
                    use_bn=use_bn, reduce_ratio=reduce_ratio, min_depth=min_depth,
                    global_pool=global_pool, pretrained=pretrained)
            self.class_predictors = nn.HybridSequential()
            self.box_predictors = nn.HybridSequential()
            self.anchor_generators = nn.HybridSequential()
            asz = anchor_alloc_size
            im_size = (base_size, base_size)
            for i, s, r, st in zip(range(num_layers), sizes, ratios, steps):
                anchor_generator = SSDAnchorGenerator(i, im_size, s, r, st, (asz, asz))
                self.anchor_generators.add(anchor_generator)
                asz = max(asz // 2, 16)  # pre-compute larger than 16x16 anchor map
                num_anchors = anchor_generator.num_depth
                self.class_predictors.add(ConvPredictor(num_anchors * self.num_classes))
                self.box_predictors.add(ConvPredictor(num_anchors * 4))
            self.bbox_decoder = NormalizedBoxCenterDecoder(stds)
            self.cls_decoder = MultiPerClassDecoder(self.num_classes, thresh=0.01)

    def set_nms(self, nms_thresh=0, nms_topk=-1):
        """Set the non-maximum suppression parameters used at inference.

        Parameters
        ----------
        nms_thresh : float, default is 0.
            Overlap threshold. NMS is applied only when ``0 < nms_thresh < 1``,
            so the default of 0 turns NMS off.
        nms_topk : int, default is -1
            Value passed as `topk` to the NMS operator.

        """
        self.nms_thresh = nms_thresh
        self.nms_topk = nms_topk

    @property
    def target_generator(self):
        """The `SSDTargetGenerator` instance created in the constructor."""
        return list(self.target)[0]

    # pylint: disable=arguments-differ
    def hybrid_forward(self, F, x):
        """Hybrid forward.

        Parameters
        ----------
        F : module
            `mxnet.ndarray` or `mxnet.symbol`, supplied by `HybridBlock`.
        x : NDArray or Symbol
            Input passed to the feature extractor.

        Returns
        -------
        list or tuple
            While autograd is recording: ``[cls_preds, box_preds, anchors]`` with
            last dimensions `num_classes`, 4 and 4 respectively.
            Otherwise: ``(ids, scores, bboxes)`` sliced from the concatenated
            per-class results along the last axis (widths 1, 1 and 4).

        """
        features = self.features(x)
        # move channels last before flattening so that predictions of one
        # spatial location stay contiguous
        cls_preds = [F.flatten(F.transpose(cp(feat), (0, 2, 3, 1)))
                     for feat, cp in zip(features, self.class_predictors)]
        box_preds = [F.flatten(F.transpose(bp(feat), (0, 2, 3, 1)))
                     for feat, bp in zip(features, self.box_predictors)]
        anchors = [F.reshape(ag(feat), shape=(1, -1))
                   for feat, ag in zip(features, self.anchor_generators)]
        cls_preds = F.concat(*cls_preds, dim=1).reshape((0, -1, self.num_classes))
        box_preds = F.concat(*box_preds, dim=1).reshape((0, -1, 4))
        anchors = F.concat(*anchors, dim=1).reshape((1, -1, 4))
        if autograd.is_recording():
            # training: hand back raw predictions, decoding is skipped
            return [cls_preds, box_preds, anchors]
        bboxes = self.bbox_decoder(box_preds, anchors)
        cls_ids, scores = self.cls_decoder(F.softmax(cls_preds, axis=-1))
        results = []
        for i in range(self.num_classes - 1):
            cls_id = cls_ids.slice_axis(axis=-1, begin=i, end=i+1)
            score = scores.slice_axis(axis=-1, begin=i, end=i+1)
            # per class results
            per_result = F.concat(*[cls_id, score, bboxes], dim=-1)
            if 0 < self.nms_thresh < 1:
                per_result = F.contrib.box_nms(
                    per_result, overlap_thresh=self.nms_thresh, topk=self.nms_topk,
                    id_index=0, score_index=1, coord_start=2)
            results.append(per_result)
        result = F.concat(*results, dim=1)
        ids = F.slice_axis(result, axis=2, begin=0, end=1)
        scores = F.slice_axis(result, axis=2, begin=1, end=2)
        bboxes = F.slice_axis(result, axis=2, begin=2, end=6)
        return ids, scores, bboxes

def get_ssd(name, base_size, features, filters, sizes, ratios, steps, classes,
            dataset, pretrained=False, pretrained_base=True, ctx=mx.cpu(),
            root=os.path.join('~', '.mxnet', 'models'), **kwargs):
    """Build an SSD network and optionally load its pretrained weights.

    Parameters
    ----------
    name : str or None
        Base network name. It is also used to compose the pretrained model
        file name. When `features` is callable the base network name passed
        to `SSD` is `None`.
    base_size : int
        Base image size for training. A fixed base size still allows variable
        input sizes during test.
    features : iterable of str or callable
        Names of the internal outputs of the base network used for prediction,
        or a callable producing a multi-output feature network.
    filters : iterable of int or None
        Channels of the convolution layers appended to the base network.
        Ignored by `SSD` when no base network name is used.
    sizes : iterable of float
        Anchor box sizes in incremental order; its length must be the number
        of output layers + 1. For example ``sizes = [30, 60, 90]`` describes
        two stages using `[30, 60]` and `[60, 90]`.
    ratios : iterable of list
        Anchor aspect ratios of each output layer.
    steps : list of int
        Anchor step size of each output layer.
    classes : iterable of str
        Names of categories.
    dataset : str
        Dataset name, used as the last component of the pretrained model name.
    pretrained : bool, optional, default is False
        Load pretrained weights for the whole detector.
    pretrained_base : bool, optional, default is True
        Load pretrained base network, the extra layers are randomized.
        Has no effect when `pretrained` is `True`.
    ctx : mxnet.Context
        Context such as mx.cpu(), mx.gpu(0), used when loading the weights.
    root : str
        Model weights storing path.
    **kwargs
        Extra keyword arguments forwarded to `SSD`.

    Returns
    -------
    HybridBlock
        A SSD detection network.

    """
    if pretrained:
        # the complete weight file is loaded below, so the base network does
        # not need its own pretrained weights
        pretrained_base = False
    base_name = name
    if callable(features):
        base_name = None
    net = SSD(base_name, base_size, features, filters, sizes, ratios, steps,
              pretrained=pretrained_base, classes=classes, **kwargs)
    if not pretrained:
        return net
    from ..model_store import get_model_file
    name_parts = ['ssd', str(base_size), name, dataset]
    weight_file = get_model_file('_'.join(name_parts), root=root)
    net.load_params(weight_file, ctx=ctx)
    # NOTE: a 2x learning-rate multiplier on biases used to be applied here:
    # set_lr_mult(net, ".*_bias", 2.0)
    return net

def ssd_300_vgg16_atrous_voc(pretrained=False, pretrained_base=True, **kwargs):
    """SSD with a VGG16 atrous base network, 300x300 base size, Pascal VOC classes.

    Parameters
    ----------
    pretrained : bool, optional, default is False
        Load pretrained weights.
    pretrained_base : bool, optional, default is True
        Load pretrained base network, the extra layers are randomized.
    **kwargs
        Extra keyword arguments forwarded to `get_ssd`.

    Returns
    -------
    HybridBlock
        A SSD detection network.

    """
    # six output layers: 7 sizes -> 6 (min, max) pairs, 6 ratio lists, 6 steps
    anchor_sizes = [30, 60, 111, 162, 213, 264, 315]
    anchor_ratios = [[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0/3]] * 3 + [[1, 2, 0.5]] * 2
    anchor_steps = [8, 16, 32, 64, 100, 300]
    return get_ssd(
        'vgg16_atrous', 300, features=vgg16_atrous_300, filters=None,
        sizes=anchor_sizes, ratios=anchor_ratios, steps=anchor_steps,
        classes=VOCDetection.CLASSES, dataset='voc',
        pretrained=pretrained, pretrained_base=pretrained_base, **kwargs)

def ssd_512_vgg16_atrous_voc(pretrained=False, pretrained_base=True, **kwargs):
    """SSD with a VGG16 atrous base network, 512x512 base size, Pascal VOC classes.

    Parameters
    ----------
    pretrained : bool, optional, default is False
        Load pretrained weights.
    pretrained_base : bool, optional, default is True
        Load pretrained base network, the extra layers are randomized.
    **kwargs
        Extra keyword arguments forwarded to `get_ssd`.

    Returns
    -------
    HybridBlock
        A SSD detection network.

    """
    # seven output layers: 8 sizes -> 7 (min, max) pairs, 7 ratio lists, 7 steps
    anchor_sizes = [51.2, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6]
    anchor_ratios = [[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0/3]] * 4 + [[1, 2, 0.5]] * 2
    anchor_steps = [8, 16, 32, 64, 128, 256, 512]
    return get_ssd(
        'vgg16_atrous', 512, features=vgg16_atrous_512, filters=None,
        sizes=anchor_sizes, ratios=anchor_ratios, steps=anchor_steps,
        classes=VOCDetection.CLASSES, dataset='voc',
        pretrained=pretrained, pretrained_base=pretrained_base, **kwargs)

def ssd_512_resnet18_v1_voc(pretrained=False, pretrained_base=True, **kwargs):
    """SSD architecture with ResNet v1 18 layers, 512x512 base size, Pascal VOC classes.

    Parameters
    ----------
    pretrained : bool, optional, default is False
        Load pretrained weights.
    pretrained_base : bool, optional, default is True
        Load pretrained base network, the extra layers are randomized.
    **kwargs
        Extra keyword arguments forwarded to `get_ssd`.

    Returns
    -------
    HybridBlock
        A SSD detection network.

    """
    classes = VOCDetection.CLASSES
    # Two extracted features + four appended layers = six output layers, so
    # exactly six steps are needed. The list starts at 16 like the other
    # two-feature ResNet/MobileNet models in this file; a leading 8 would be
    # silently paired with the first layer and shift every step by one layer.
    return get_ssd('resnet18_v1', 512,
                   features=['stage3_activation1', 'stage4_activation1'],
                   filters=[512, 512, 256, 256],
                   sizes=[51.2, 102.4, 189.4, 276.4, 363.52, 450.6, 492],
                   ratios=[[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0/3]] * 3 + [[1, 2, 0.5]] * 2,
                   steps=[16, 32, 64, 128, 256, 512],
                   classes=classes, dataset='voc', pretrained=pretrained,
                   pretrained_base=pretrained_base, **kwargs)

def ssd_512_resnet50_v1_voc(pretrained=False, pretrained_base=True, **kwargs):
    """SSD with a ResNet v1 50-layer base network, 512x512 base size, Pascal VOC classes.

    Parameters
    ----------
    pretrained : bool, optional, default is False
        Load pretrained weights.
    pretrained_base : bool, optional, default is True
        Load pretrained base network, the extra layers are randomized.
    **kwargs
        Extra keyword arguments forwarded to `get_ssd`.

    Returns
    -------
    HybridBlock
        A SSD detection network.

    """
    # two extracted features + four appended layers = six output layers
    base_outputs = ['stage3_activation5', 'stage4_activation2']
    extra_channels = [512, 512, 256, 256]
    anchor_sizes = [51.2, 102.4, 189.4, 276.4, 363.52, 450.6, 492]
    anchor_ratios = [[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0/3]] * 3 + [[1, 2, 0.5]] * 2
    anchor_steps = [16, 32, 64, 128, 256, 512]
    return get_ssd(
        'resnet50_v1', 512, features=base_outputs, filters=extra_channels,
        sizes=anchor_sizes, ratios=anchor_ratios, steps=anchor_steps,
        classes=VOCDetection.CLASSES, dataset='voc',
        pretrained=pretrained, pretrained_base=pretrained_base, **kwargs)

def ssd_512_resnet101_v2_voc(pretrained=False, pretrained_base=True, **kwargs):
    """SSD with a ResNet v2 101-layer base network, 512x512 base size, Pascal VOC classes.

    Parameters
    ----------
    pretrained : bool, optional, default is False
        Load pretrained weights.
    pretrained_base : bool, optional, default is True
        Load pretrained base network, the extra layers are randomized.
    **kwargs
        Extra keyword arguments forwarded to `get_ssd`.

    Returns
    -------
    HybridBlock
        A SSD detection network.

    """
    # two extracted features + four appended layers = six output layers
    base_outputs = ['stage3_activation22', 'stage4_activation2']
    extra_channels = [512, 512, 256, 256]
    anchor_sizes = [51.2, 102.4, 189.4, 276.4, 363.52, 450.6, 492]
    anchor_ratios = [[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0/3]] * 3 + [[1, 2, 0.5]] * 2
    anchor_steps = [16, 32, 64, 128, 256, 512]
    return get_ssd(
        'resnet101_v2', 512, features=base_outputs, filters=extra_channels,
        sizes=anchor_sizes, ratios=anchor_ratios, steps=anchor_steps,
        classes=VOCDetection.CLASSES, dataset='voc',
        pretrained=pretrained, pretrained_base=pretrained_base, **kwargs)

def ssd_512_resnet152_v2_voc(pretrained=False, pretrained_base=True, **kwargs):
    """SSD with a ResNet v2 152-layer base network, 512x512 base size, Pascal VOC classes.

    Parameters
    ----------
    pretrained : bool, optional, default is False
        Load pretrained weights.
    pretrained_base : bool, optional, default is True
        Load pretrained base network, the extra layers are randomized.
    **kwargs
        Extra keyword arguments forwarded to `get_ssd`.

    Returns
    -------
    HybridBlock
        A SSD detection network.

    """
    # three extracted features + four appended layers = seven output layers
    base_outputs = ['stage2_activation7', 'stage3_activation35', 'stage4_activation2']
    extra_channels = [512, 512, 256, 256]
    anchor_sizes = [51.2, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6]
    anchor_ratios = [[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0/3]] * 4 + [[1, 2, 0.5]] * 2
    anchor_steps = [8, 16, 32, 64, 128, 256, 512]
    return get_ssd(
        'resnet152_v2', 512, features=base_outputs, filters=extra_channels,
        sizes=anchor_sizes, ratios=anchor_ratios, steps=anchor_steps,
        classes=VOCDetection.CLASSES, dataset='voc',
        pretrained=pretrained, pretrained_base=pretrained_base, **kwargs)

def ssd_512_mobilenet1_0_voc(pretrained=False, pretrained_base=True, **kwargs):
    """SSD with a mobilenet1.0 base network, 512x512 base size, Pascal VOC classes.

    Parameters
    ----------
    pretrained : bool, optional, default is False
        Load pretrained weights.
    pretrained_base : bool, optional, default is True
        Load pretrained base network, the extra layers are randomized.
    **kwargs
        Extra keyword arguments forwarded to `get_ssd`.

    Returns
    -------
    HybridBlock
        A SSD detection network.

    """
    # two extracted features + four appended layers = six output layers
    base_outputs = ['relu22_fwd', 'relu26_fwd']
    extra_channels = [512, 512, 256, 256]
    anchor_sizes = [51.2, 102.4, 189.4, 276.4, 363.52, 450.6, 492]
    anchor_ratios = [[1, 2, 0.5]] + [[1, 2, 0.5, 3, 1.0/3]] * 3 + [[1, 2, 0.5]] * 2
    anchor_steps = [16, 32, 64, 128, 256, 512]
    return get_ssd(
        'mobilenet1.0', 512, features=base_outputs, filters=extra_channels,
        sizes=anchor_sizes, ratios=anchor_ratios, steps=anchor_steps,
        classes=VOCDetection.CLASSES, dataset='voc',
        pretrained=pretrained, pretrained_base=pretrained_base, **kwargs)