# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
from mmcv.cnn.bricks import ConvModule
from mmcv.runner import BaseModule, auto_fp16, force_fp32
from mmdet.core import build_assigner, build_bbox_coder, build_sampler
from mmdet.core.anchor import build_prior_generator
from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh
from mmdet.models import HEADS, build_loss

from mmtrack.core.track import depthwise_correlation

@HEADS.register_module()
class CorrelationHead(BaseModule):
    """Correlation head module.

    This module is proposed in "SiamRPN++: Evolution of Siamese Visual
    Tracking with Very Deep Networks".
    `SiamRPN++ <https://arxiv.org/abs/1812.11703>`_.

    The exemplar ("kernel") and search features are each passed through
    their own conv block, combined with ``depthwise_correlation`` and
    finally projected to ``out_channels`` by two 1x1 convs.

    Args:
        in_channels (int): Input channels.
        mid_channels (int): Middle channels.
        out_channels (int): Output channels.
        kernel_size (int): Kernel size of convs. Defaults to 3.
        norm_cfg (dict): Configuration of normalization method after each
            conv. Defaults to dict(type='BN').
        act_cfg (dict): Configuration of activation method after each conv.
            Defaults to dict(type='ReLU').
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 kernel_size=3,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None,
                 **kwargs):
        super(CorrelationHead, self).__init__(init_cfg)

        def make_branch_conv():
            # Both input branches share the same architecture but own
            # separate parameters.
            return ConvModule(
                in_channels=in_channels,
                out_channels=mid_channels,
                kernel_size=kernel_size,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg)

        # Construction order (kernel, search, head) is kept so parameter
        # initialization consumes the RNG in the same sequence.
        self.kernel_convs = make_branch_conv()
        self.search_convs = make_branch_conv()

        fuse_conv = ConvModule(
            in_channels=mid_channels,
            out_channels=mid_channels,
            kernel_size=1,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)
        # The last projection has neither normalization nor activation.
        predict_conv = ConvModule(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            act_cfg=None)
        self.head_convs = nn.Sequential(fuse_conv, predict_conv)

    def forward(self, kernel, search):
        """Correlate ``search`` features with ``kernel`` features.

        Args:
            kernel (Tensor): Features fed to ``kernel_convs``.
            search (Tensor): Features fed to ``search_convs``.

        Returns:
            Tensor: Output of ``head_convs`` applied to the correlation
            maps.
        """
        kernel_feat = self.kernel_convs(kernel)
        search_feat = self.search_convs(search)
        return self.head_convs(depthwise_correlation(search_feat, kernel_feat))
@HEADS.register_module()
class SiameseRPNHead(BaseModule):
    """Siamese RPN head.

    This module is proposed in "SiamRPN++: Evolution of Siamese Visual
    Tracking with Very Deep Networks".
    `SiamRPN++ <https://arxiv.org/abs/1812.11703>`_.

    Args:
        anchor_generator (dict): Configuration to build anchor generator
            module.
        in_channels (list[int]): Input channels of each feature level. One
            classification head and one regression head are built per
            entry.
        kernel_size (int): Kernel size of convs. Defaults to 3.
        norm_cfg (dict): Configuration of normalization method after each
            conv. Defaults to dict(type='BN').
        weighted_sum (bool): If True, use learnable weights to compute a
            weighted sum of the outputs of the multiple heads in siamese
            rpn, otherwise use averaging. Defaults to False.
        bbox_coder (dict): Configuration to build bbox coder. Defaults to
            dict(type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.],
            target_stds=[1., 1., 1., 1.]).
        loss_cls (dict): Configuration to build classification loss.
            Defaults to dict(type='CrossEntropyLoss', reduction='sum',
            loss_weight=1.0).
        loss_bbox (dict): Configuration to build bbox regression loss.
            Defaults to dict(type='L1Loss', reduction='sum',
            loss_weight=1.2).
        train_cfg (Dict): Training setting. The head reads ``assigner``,
            ``sampler``, ``search_size`` and ``num_neg`` from it. Defaults
            to None, in which case no assigner/sampler is built and the
            target generation methods cannot be used.
        test_cfg (Dict): Testing setting. The head reads ``penalty_k``,
            ``window_influence`` and ``lr`` from it. Defaults to None.
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 anchor_generator,
                 in_channels,
                 kernel_size=3,
                 norm_cfg=dict(type='BN'),
                 weighted_sum=False,
                 bbox_coder=dict(
                     type='DeltaXYWHBBoxCoder',
                     target_means=[0., 0., 0., 0.],
                     target_stds=[1., 1., 1., 1.]),
                 loss_cls=dict(
                     type='CrossEntropyLoss', reduction='sum',
                     loss_weight=1.0),
                 loss_bbox=dict(
                     type='L1Loss', reduction='sum', loss_weight=1.2),
                 train_cfg=None,
                 test_cfg=None,
                 init_cfg=None,
                 *args,
                 **kwargs):
        super(SiameseRPNHead, self).__init__(init_cfg)
        self.anchor_generator = build_prior_generator(anchor_generator)
        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        # ``train_cfg`` defaults to None (inference-only construction), so
        # the assigner/sampler can only be built when it is provided.
        if self.train_cfg is not None:
            self.assigner = build_assigner(self.train_cfg.assigner)
            self.sampler = build_sampler(self.train_cfg.sampler)
        else:
            self.assigner = None
            self.sampler = None

        self.fp16_enabled = False

        num_base_anchors = self.anchor_generator.num_base_anchors[0]
        self.cls_heads = nn.ModuleList()
        self.reg_heads = nn.ModuleList()
        for channels in in_channels:
            # 2 scores (background / foreground) per anchor.
            self.cls_heads.append(
                CorrelationHead(channels, channels, 2 * num_base_anchors,
                                kernel_size, norm_cfg))
            # 4 box deltas per anchor.
            self.reg_heads.append(
                CorrelationHead(channels, channels, 4 * num_base_anchors,
                                kernel_size, norm_cfg))

        self.weighted_sum = weighted_sum
        if self.weighted_sum:
            self.cls_weight = nn.Parameter(torch.ones(len(in_channels)))
            self.reg_weight = nn.Parameter(torch.ones(len(in_channels)))

        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)

    @auto_fp16()
    def forward(self, z_feats, x_feats):
        """Forward with features `z_feats` of exemplar images and features
        `x_feats` of search images.

        Args:
            z_feats (tuple[Tensor]): Tuple of Tensor with shape (N, C, H, W)
                denoting the multi level feature maps of exemplar images.
                Typically H and W equal to 7.
            x_feats (tuple[Tensor]): Tuple of Tensor with shape (N, C, H, W)
                denoting the multi level feature maps of search images.
                Typically H and W equal to 31.

        Returns:
            tuple(cls_score, bbox_pred): cls_score is a Tensor with shape
            (N, 2 * num_base_anchors, H, W), bbox_pred is a Tensor with shape
            (N, 4 * num_base_anchors, H, W), Typically H and W equal to 25.
        """
        assert isinstance(z_feats, tuple) and isinstance(x_feats, tuple)
        assert len(z_feats) == len(x_feats) and len(z_feats) == len(
            self.cls_heads)

        num_levels = len(z_feats)
        if self.weighted_sum:
            cls_weight = nn.functional.softmax(self.cls_weight, dim=0)
            reg_weight = nn.functional.softmax(self.reg_weight, dim=0)
        else:
            # Plain average over the feature levels.
            reg_weight = cls_weight = [1.0 / num_levels] * num_levels

        cls_score = 0
        bbox_pred = 0
        for i, (z_feat, x_feat) in enumerate(zip(z_feats, x_feats)):
            cls_score_single = self.cls_heads[i](z_feat, x_feat)
            bbox_pred_single = self.reg_heads[i](z_feat, x_feat)
            cls_score += cls_weight[i] * cls_score_single
            bbox_pred += reg_weight[i] * bbox_pred_single

        return cls_score, bbox_pred

    def _get_init_targets(self, gt_bbox, score_maps_size):
        """Initialize the training targets based on flattened anchors of the
        last score map.

        All returned tensors are zero-filled and created with
        ``gt_bbox.new_zeros`` so they live on the same device as
        ``gt_bbox``.
        """
        num_base_anchors = self.anchor_generator.num_base_anchors[0]
        H, W = score_maps_size
        num_anchors = H * W * num_base_anchors
        labels = gt_bbox.new_zeros((num_anchors, ), dtype=torch.long)
        labels_weights = gt_bbox.new_zeros((num_anchors, ))
        bbox_weights = gt_bbox.new_zeros((num_anchors, 4))
        bbox_targets = gt_bbox.new_zeros((num_anchors, 4))
        return labels, labels_weights, bbox_targets, bbox_weights

    def _get_positive_pair_targets(self, gt_bbox, score_maps_size):
        """Generate the training targets for positive exemplar image and
        search image pair.

        Args:
            gt_bbox (Tensor): Ground truth bboxes of an search image with
                shape (1, 5) in [0.0, tl_x, tl_y, br_x, br_y] format.
            score_maps_size (torch.size): denoting the output size
                (height, width) of the network.

        Returns:
            tuple(labels, labels_weights, bbox_targets, bbox_weights): the
            shape is (H * W * num_base_anchors,), (H * W * num_base_anchors,),
            (H * W * num_base_anchors, 4), (H * W * num_base_anchors, 4)
            respectively. All of them are Tensor.
        """
        (labels, labels_weights, _,
         bbox_weights) = self._get_init_targets(gt_bbox, score_maps_size)

        # NOTE(review): the anchors are cached on first use and shared with
        # ``get_bbox``; the cache is not refreshed if the score map size or
        # device changes later.
        if not hasattr(self, 'anchors'):
            self.anchors = self.anchor_generator.grid_priors(
                [score_maps_size], device=gt_bbox.device)[0]
            # Transform the coordinate origin from the top left corner to the
            # center in the scaled score map.
            feat_h, feat_w = score_maps_size
            stride_w, stride_h = self.anchor_generator.strides[0]
            self.anchors[:, 0:4:2] -= (feat_w // 2) * stride_w
            self.anchors[:, 1:4:2] -= (feat_h // 2) * stride_h

        # Work on a copy so the cached, center-origin anchors stay intact.
        anchors = self.anchors.clone()

        # The scaled feature map and the searched image have the same center.
        # Transform coordinate origin from the center to the top left corner in
        # the searched image.
        anchors += self.train_cfg.search_size // 2

        assign_result = self.assigner.assign(anchors, gt_bbox[:, 1:])
        sampling_result = self.sampler.sample(assign_result, anchors,
                                              gt_bbox[:, 1:])
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        # Cap the negatives at the sampler's nominal negative budget.
        neg_upper_bound = int(self.sampler.num *
                              (1 - self.sampler.pos_fraction))
        if len(neg_inds) > neg_upper_bound:
            neg_inds = neg_inds[:neg_upper_bound]

        # Positives and negatives each contribute half of the total
        # classification weight.
        if len(pos_inds) > 0:
            labels[pos_inds] = 1
            labels_weights[pos_inds] = 1.0 / len(pos_inds) / 2
            bbox_weights[pos_inds] = 1.0 / len(pos_inds)

        if len(neg_inds) > 0:
            labels[neg_inds] = 0
            labels_weights[neg_inds] = 1.0 / len(neg_inds) / 2

        # Targets are encoded for every anchor; ``bbox_weights`` selects the
        # positive ones in the loss.
        bbox_targets = self.bbox_coder.encode(
            anchors, gt_bbox[:, 1:].repeat(anchors.shape[0], 1))
        return labels, labels_weights, bbox_targets, bbox_weights

    def _get_negative_pair_targets(self, gt_bbox, score_maps_size):
        """Generate the training targets for negative exemplar image and
        search image pair.

        Args:
            gt_bbox (Tensor): Ground truth bboxes of an search image with
                shape (1, 5) in [0.0, tl_x, tl_y, br_x, br_y] format.
            score_maps_size (torch.size): denoting the output size
                (height, width) of the network.

        Returns:
            tuple(labels, labels_weights, bbox_targets, bbox_weights): the
            shape is (H * W * num_base_anchors,), (H * W * num_base_anchors,),
            (H * W * num_base_anchors, 4), (H * W * num_base_anchors, 4)
            respectively. All of them are Tensor. ``labels``,
            ``bbox_targets`` and ``bbox_weights`` are all zeros; only
            ``labels_weights`` is non-zero, at the sampled negative anchors.
        """
        (labels, labels_weights, bbox_targets,
         bbox_weights) = self._get_init_targets(gt_bbox, score_maps_size)
        H, W = score_maps_size

        # Locate the score-map cell corresponding to the target center.
        target_cx, target_cy, _, _ = bbox_xyxy_to_cxcywh(gt_bbox[:, 1:])[0]
        anchor_stride = self.anchor_generator.strides[0]
        half_search_size = self.train_cfg.search_size // 2
        cx = W // 2
        cy = H // 2
        cx += int(
            torch.ceil((target_cx - half_search_size) / anchor_stride[0] +
                       0.5))
        cy += int(
            torch.ceil((target_cy - half_search_size) / anchor_stride[1] +
                       0.5))

        # Negative candidates are the anchors inside a (at most) 7x7 window
        # around that cell, clipped to the score map.
        left = max(0, cx - 3)
        right = min(W, cx + 4)
        top = max(0, cy - 3)
        down = min(H, cy + 4)
        candidate_mask = torch.zeros_like(
            labels, dtype=torch.bool).view(H, W, -1)
        candidate_mask[top:down, left:right, :] = True

        # Randomly keep at most ``num_neg`` of the candidates.
        neg_inds = torch.nonzero(
            candidate_mask.view(-1), as_tuple=False)[:, 0]
        index = torch.randperm(
            neg_inds.numel(), device=neg_inds.device)[:self.train_cfg.num_neg]
        neg_inds = neg_inds[index]

        # ``labels`` stays all-zero (every anchor is background for a
        # negative pair); the sampling is expressed through the weights.
        if len(neg_inds) > 0:
            labels_weights[neg_inds] = 1.0 / len(neg_inds) / 2

        return labels, labels_weights, bbox_targets, bbox_weights

    def get_targets(self, gt_bboxes, score_maps_size, is_positive_pairs):
        """Generate the training targets for exemplar image and search image
        pairs.

        Args:
            gt_bboxes (list[Tensor]): Ground truth bboxes of each
                search image with shape (1, 5) in [0.0, tl_x, tl_y, br_x,
                br_y] format.
            score_maps_size (torch.size): denoting the output size
                (height, width) of the network.
            is_positive_pairs (list[bool]): list of bool denoting whether
                each ground truth bbox in `gt_bboxes` is positive.

        Returns:
            tuple(all_labels, all_labels_weights, all_bbox_targets,
            all_bbox_weights): the shape is (N, H * W * num_base_anchors),
            (N, H * W * num_base_anchors), (N, H * W * num_base_anchors, 4),
            (N, H * W * num_base_anchors, 4), respectively. All of them are
            Tensor.
        """
        (all_labels, all_labels_weights, all_bbox_targets,
         all_bbox_weights) = [], [], [], []

        for gt_bbox, is_positive_pair in zip(gt_bboxes, is_positive_pairs):
            if is_positive_pair:
                (labels, labels_weights, bbox_targets,
                 bbox_weights) = self._get_positive_pair_targets(
                     gt_bbox, score_maps_size)
            else:
                (labels, labels_weights, bbox_targets,
                 bbox_weights) = self._get_negative_pair_targets(
                     gt_bbox, score_maps_size)

            all_labels.append(labels)
            all_labels_weights.append(labels_weights)
            all_bbox_targets.append(bbox_targets)
            all_bbox_weights.append(bbox_weights)

        # The weights are divided by the number of pairs so that the
        # weighted losses are averaged over the batch.
        all_labels = torch.stack(all_labels)
        all_labels_weights = torch.stack(all_labels_weights) / len(
            all_labels_weights)
        all_bbox_targets = torch.stack(all_bbox_targets)
        all_bbox_weights = torch.stack(all_bbox_weights) / len(
            all_bbox_weights)

        return (all_labels, all_labels_weights, all_bbox_targets,
                all_bbox_weights)

    @force_fp32(apply_to=('cls_score', 'bbox_pred'))
    def loss(self, cls_score, bbox_pred, labels, labels_weights, bbox_targets,
             bbox_weights):
        """Compute loss.

        Args:
            cls_score (Tensor): of shape (N, 2 * num_base_anchors, H, W).
            bbox_pred (Tensor): of shape (N, 4 * num_base_anchors, H, W).
            labels (Tensor): of shape (N, H * W * num_base_anchors).
            labels_weights (Tensor): of shape (N, H * W * num_base_anchors).
            bbox_targets (Tensor): of shape (N, H * W * num_base_anchors, 4).
            bbox_weights (Tensor): of shape (N, H * W * num_base_anchors, 4).

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        losses = {}
        N, _, H, W = cls_score.shape

        # (N, 2 * A, H, W) -> (N * H * W * A, 2), matching the flattened
        # (H, W, A) order of the targets.
        cls_score = cls_score.view(N, 2, -1, H, W)
        cls_score = cls_score.permute(0, 3, 4, 2, 1).contiguous().view(-1, 2)
        labels = labels.view(-1)
        labels_weights = labels_weights.view(-1)
        losses['loss_rpn_cls'] = self.loss_cls(
            cls_score, labels, weight=labels_weights)

        # (N, 4 * A, H, W) -> (N * H * W * A, 4).
        bbox_pred = bbox_pred.view(N, 4, -1, H, W)
        bbox_pred = bbox_pred.permute(0, 3, 4, 2, 1).contiguous().view(-1, 4)
        bbox_targets = bbox_targets.view(-1, 4)
        bbox_weights = bbox_weights.view(-1, 4)
        losses['loss_rpn_bbox'] = self.loss_bbox(
            bbox_pred, bbox_targets, weight=bbox_weights)

        return losses

    @force_fp32(apply_to=('cls_score', 'bbox_pred'))
    def get_bbox(self, cls_score, bbox_pred, prev_bbox, scale_factor):
        """Track `prev_bbox` to current frame based on the output of network.

        Args:
            cls_score (Tensor): of shape (1, 2 * num_base_anchors, H, W).
            bbox_pred (Tensor): of shape (1, 4 * num_base_anchors, H, W).
            prev_bbox (Tensor): of shape (4, ) in [cx, cy, w, h] format.
            scale_factor (Tensor): scale factor.

        Returns:
            tuple(best_score, best_bbox): best_score is a Tensor denoting the
            score of `best_bbox`, best_bbox is a Tensor of shape (4, )
            with [cx, cy, w, h] format, which denotes the best tracked
            bbox in current frame.
        """
        score_maps_size = [(cls_score.shape[2:])]
        # NOTE(review): anchors and windows are cached on first use and are
        # not refreshed if the score map size or device changes later.
        if not hasattr(self, 'anchors'):
            self.anchors = self.anchor_generator.grid_priors(
                score_maps_size, device=cls_score.device)[0]
            # Transform the coordinate origin from the top left corner to the
            # center in the scaled feature map.
            feat_h, feat_w = score_maps_size[0]
            stride_w, stride_h = self.anchor_generator.strides[0]
            self.anchors[:, 0:4:2] -= (feat_w // 2) * stride_w
            self.anchors[:, 1:4:2] -= (feat_h // 2) * stride_h

        if not hasattr(self, 'windows'):
            self.windows = self.anchor_generator.gen_2d_hanning_windows(
                score_maps_size, cls_score.device)[0]

        H, W = score_maps_size[0]
        # Foreground probability of every anchor, flattened in (H, W, A)
        # order.
        cls_score = cls_score.view(2, -1, H, W)
        cls_score = cls_score.permute(2, 3, 1, 0).contiguous().view(-1, 2)
        cls_score = cls_score.softmax(dim=1)[:, 1]

        bbox_pred = bbox_pred.view(4, -1, H, W)
        bbox_pred = bbox_pred.permute(2, 3, 1, 0).contiguous().view(-1, 4)
        bbox_pred = self.bbox_coder.decode(self.anchors, bbox_pred)
        bbox_pred = bbox_xyxy_to_cxcywh(bbox_pred)

        def change_ratio(ratio):
            # Symmetric ratio >= 1, regardless of grow or shrink.
            return torch.max(ratio, 1. / ratio)

        def enlarge_size(w, h):
            # Size of the box after padding it with context.
            pad = (w + h) * 0.5
            return torch.sqrt((w + pad) * (h + pad))

        # scale penalty
        scale_penalty = change_ratio(
            enlarge_size(bbox_pred[:, 2], bbox_pred[:, 3]) / enlarge_size(
                prev_bbox[2] * scale_factor, prev_bbox[3] * scale_factor))

        # aspect ratio penalty
        aspect_ratio_penalty = change_ratio(
            (prev_bbox[2] / prev_bbox[3]) /
            (bbox_pred[:, 2] / bbox_pred[:, 3]))

        # penalize cls_score
        penalty = torch.exp(-(aspect_ratio_penalty * scale_penalty - 1) *
                            self.test_cfg.penalty_k)
        penalty_score = penalty * cls_score

        # window penalty
        penalty_score = penalty_score * (1 - self.test_cfg.window_influence) \
            + self.windows * self.test_cfg.window_influence

        best_idx = torch.argmax(penalty_score)
        best_score = cls_score[best_idx]
        best_bbox = bbox_pred[best_idx, :] / scale_factor

        final_bbox = torch.zeros_like(best_bbox)

        # map the bbox center from the searched image to the original image.
        final_bbox[0] = best_bbox[0] + prev_bbox[0]
        final_bbox[1] = best_bbox[1] + prev_bbox[1]

        # smooth bbox
        lr = penalty[best_idx] * cls_score[best_idx] * self.test_cfg.lr
        final_bbox[2] = prev_bbox[2] * (1 - lr) + best_bbox[2] * lr
        final_bbox[3] = prev_bbox[3] * (1 - lr) + best_bbox[3] * lr

        return best_score, final_bbox
