mmtrack.models.track_heads.siamese_rpn_head 源代码

# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
from mmcv.cnn.bricks import ConvModule
from mmcv.runner import BaseModule, auto_fp16, force_fp32
from mmdet.core import build_assigner, build_bbox_coder, build_sampler
from mmdet.core.anchor import build_prior_generator
from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh
from mmdet.models import HEADS, build_loss

from mmtrack.core.track import depthwise_correlation


[文档]@HEADS.register_module()
class CorrelationHead(BaseModule):
    """Correlation head module.

    This module is proposed in
    "SiamRPN++: Evolution of Siamese Visual Tracking with Very Deep Networks.
    `SiamRPN++ <https://arxiv.org/abs/1812.11703>`_.

    Args:
        in_channels (int): Input channels.
        mid_channels (int): Middle channels.
        out_channels (int): Output channels.
        kernel_size (int): Kernel size of convs. Defaults to 3.
        norm_cfg (dict): Configuration of normlization method after each conv.
            Defaults to dict(type='BN').
        act_cfg (dict): Configuration of activation method after each conv.
            Defaults to dict(type='ReLU').
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 kernel_size=3,
                 norm_cfg=dict(type='BN'),
                 act_cfg=dict(type='ReLU'),
                 init_cfg=None,
                 **kwargs):
        super(CorrelationHead, self).__init__(init_cfg)
        self.kernel_convs = ConvModule(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=kernel_size,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

        self.search_convs = ConvModule(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=kernel_size,
            norm_cfg=norm_cfg,
            act_cfg=act_cfg)

        self.head_convs = nn.Sequential(
            ConvModule(
                in_channels=mid_channels,
                out_channels=mid_channels,
                kernel_size=1,
                norm_cfg=norm_cfg,
                act_cfg=act_cfg),
            ConvModule(
                in_channels=mid_channels,
                out_channels=out_channels,
                kernel_size=1,
                act_cfg=None))

[文档]    def forward(self, kernel, search):
        kernel = self.kernel_convs(kernel)
        search = self.search_convs(search)
        correlation_maps = depthwise_correlation(search, kernel)
        out = self.head_convs(correlation_maps)
        return out


[文档]@HEADS.register_module()
class SiameseRPNHead(BaseModule):
    """Siamese RPN head.

    This module is proposed in
    "SiamRPN++: Evolution of Siamese Visual Tracking with Very Deep Networks.
    `SiamRPN++ <https://arxiv.org/abs/1812.11703>`_.

    Args:
        anchor_generator (dict): Configuration to build anchor generator
            module.

        in_channels (int): Input channels.

        kernel_size (int): Kernel size of convs. Defaults to 3.

        norm_cfg (dict): Configuration of normlization method after each conv.
            Defaults to dict(type='BN').

        weighted_sum (bool): If True, use learnable weights to weightedly sum
            the output of multi heads in siamese rpn , otherwise, use
            averaging. Defaults to False.

        bbox_coder (dict): Configuration to build bbox coder. Defaults to
            dict(type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.],
            target_stds=[1., 1., 1., 1.]).

        loss_cls (dict): Configuration to build classification loss. Defaults
            to dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)

        loss_bbox (dict): Configuration to build bbox regression loss. Defaults
            to dict( type='L1Loss', reduction='sum', loss_weight=1.2).

        train_cfg (Dict): Training setting. Defaults to None.

        test_cfg (Dict): Testing setting. Defaults to None.

        init_cfg (dict or list[dict], optional): Initialization config dict.
            Defaults to None.
    """

    def __init__(self,
                 anchor_generator,
                 in_channels,
                 kernel_size=3,
                 norm_cfg=dict(type='BN'),
                 weighted_sum=False,
                 bbox_coder=dict(
                     type='DeltaXYWHBBoxCoder',
                     target_means=[0., 0., 0., 0.],
                     target_stds=[1., 1., 1., 1.]),
                 loss_cls=dict(
                     type='CrossEntropyLoss', reduction='sum',
                     loss_weight=1.0),
                 loss_bbox=dict(
                     type='L1Loss', reduction='sum', loss_weight=1.2),
                 train_cfg=None,
                 test_cfg=None,
                 init_cfg=None,
                 *args,
                 **kwargs):
        super(SiameseRPNHead, self).__init__(init_cfg)
        self.anchor_generator = build_prior_generator(anchor_generator)
        self.bbox_coder = build_bbox_coder(bbox_coder)
        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.assigner = build_assigner(self.train_cfg.assigner)
        self.sampler = build_sampler(self.train_cfg.sampler)
        self.fp16_enabled = False

        self.cls_heads = nn.ModuleList()
        self.reg_heads = nn.ModuleList()
        for i in range(len(in_channels)):
            self.cls_heads.append(
                CorrelationHead(in_channels[i], in_channels[i],
                                2 * self.anchor_generator.num_base_anchors[0],
                                kernel_size, norm_cfg))
            self.reg_heads.append(
                CorrelationHead(in_channels[i], in_channels[i],
                                4 * self.anchor_generator.num_base_anchors[0],
                                kernel_size, norm_cfg))

        self.weighted_sum = weighted_sum
        if self.weighted_sum:
            self.cls_weight = nn.Parameter(torch.ones(len(in_channels)))
            self.reg_weight = nn.Parameter(torch.ones(len(in_channels)))

        self.loss_cls = build_loss(loss_cls)
        self.loss_bbox = build_loss(loss_bbox)

[文档]    @auto_fp16()
    def forward(self, z_feats, x_feats):
        """Forward with features `z_feats` of exemplar images and features
        `x_feats` of search images.

        Args:
            z_feats (tuple[Tensor]): Tuple of Tensor with shape (N, C, H, W)
                denoting the multi level feature maps of exemplar images.
                Typically H and W equal to 7.
            x_feats (tuple[Tensor]): Tuple of Tensor with shape (N, C, H, W)
                denoting the multi level feature maps of search images.
                Typically H and W equal to 31.

        Returns:
            tuple(cls_score, bbox_pred): cls_score is a Tensor with shape
            (N, 2 * num_base_anchors, H, W), bbox_pred is a Tensor with shape
            (N, 4 * num_base_anchors, H, W), Typically H and W equal to 25.
        """
        assert isinstance(z_feats, tuple) and isinstance(x_feats, tuple)
        assert len(z_feats) == len(x_feats) and len(z_feats) == len(
            self.cls_heads)

        if self.weighted_sum:
            cls_weight = nn.functional.softmax(self.cls_weight, dim=0)
            reg_weight = nn.functional.softmax(self.reg_weight, dim=0)
        else:
            reg_weight = cls_weight = [
                1.0 / len(z_feats) for i in range(len(z_feats))
            ]

        cls_score = 0
        bbox_pred = 0
        for i in range(len(z_feats)):
            cls_score_single = self.cls_heads[i](z_feats[i], x_feats[i])
            bbox_pred_single = self.reg_heads[i](z_feats[i], x_feats[i])
            cls_score += cls_weight[i] * cls_score_single
            bbox_pred += reg_weight[i] * bbox_pred_single

        return cls_score, bbox_pred

    def _get_init_targets(self, gt_bbox, score_maps_size):
        """Initialize the training targets based on flattened anchors of the
        last score map."""
        num_base_anchors = self.anchor_generator.num_base_anchors[0]
        H, W = score_maps_size
        num_anchors = H * W * num_base_anchors
        labels = gt_bbox.new_zeros((num_anchors, ), dtype=torch.long)
        labels_weights = gt_bbox.new_zeros((num_anchors, ))
        bbox_weights = gt_bbox.new_zeros((num_anchors, 4))
        bbox_targets = gt_bbox.new_zeros((num_anchors, 4))
        return labels, labels_weights, bbox_targets, bbox_weights

    def _get_positive_pair_targets(self, gt_bbox, score_maps_size):
        """Generate the training targets for positive exemplar image and search
        image pair.

        Args:
            gt_bbox (Tensor): Ground truth bboxes of an search image with
                shape (1, 5) in [0.0, tl_x, tl_y, br_x, br_y] format.
            score_maps_size (torch.size): denoting the output size
                (height, width) of the network.

        Returns:
            tuple(labels, labels_weights, bbox_targets, bbox_weights): the
            shape is (H * W * num_base_anchors,), (H * W * num_base_anchors,),
            (H * W * num_base_anchors, 4), (H * W * num_base_anchors, 4)
            respectively. All of them are Tensor.
        """
        (labels, labels_weights, _,
         bbox_weights) = self._get_init_targets(gt_bbox, score_maps_size)

        if not hasattr(self, 'anchors'):
            self.anchors = self.anchor_generator.grid_priors(
                [score_maps_size], device=gt_bbox.device)[0]
            # Transform the coordinate origin from the top left corner to the
            # center in the scaled score map.
            feat_h, feat_w = score_maps_size
            stride_w, stride_h = self.anchor_generator.strides[0]
            self.anchors[:, 0:4:2] -= (feat_w // 2) * stride_w
            self.anchors[:, 1:4:2] -= (feat_h // 2) * stride_h

        anchors = self.anchors.clone()

        # The scaled feature map and the searched image have the same center.
        # Transform coordinate origin from the center to the top left corner in
        # the searched image.
        anchors += self.train_cfg.search_size // 2

        assign_result = self.assigner.assign(anchors, gt_bbox[:, 1:])
        sampling_result = self.sampler.sample(assign_result, anchors,
                                              gt_bbox[:, 1:])
        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds
        neg_upper_bound = int(self.sampler.num *
                              (1 - self.sampler.pos_fraction))
        if len(neg_inds) > neg_upper_bound:
            neg_inds = neg_inds[:neg_upper_bound]

        if len(pos_inds) > 0:
            labels[pos_inds] = 1
            labels_weights[pos_inds] = 1.0 / len(pos_inds) / 2
            bbox_weights[pos_inds] = 1.0 / len(pos_inds)

        if len(neg_inds) > 0:
            labels[neg_inds] = 0
            labels_weights[neg_inds] = 1.0 / len(neg_inds) / 2

        bbox_targets = self.bbox_coder.encode(
            anchors, gt_bbox[:, 1:].repeat(anchors.shape[0], 1))
        return labels, labels_weights, bbox_targets, bbox_weights

    def _get_negative_pair_targets(self, gt_bbox, score_maps_size):
        """Generate the training targets for negative exemplar image and search
        image pair.

        Args:
            gt_bbox (Tensor): Ground truth bboxes of an search image with
                shape (1, 5) in [0.0, tl_x, tl_y, br_x, br_y] format.
            score_maps_size (torch.size): denoting the output size
                (height, width) of the network.

        Returns:
            tuple(labels, labels_weights, bbox_targets, bbox_weights): the
            shape is (H * W * num_base_anchors,), (H * W * num_base_anchors,),
            (H * W * num_base_anchors, 4), (H * W * num_base_anchors, 4)
            respectively. All of them are Tensor.
        """
        (labels, labels_weights, bbox_targets,
         bbox_weights) = self._get_init_targets(gt_bbox, score_maps_size)
        H, W = score_maps_size
        target_cx, target_cy, _, _ = bbox_xyxy_to_cxcywh(gt_bbox[:, 1:])[0]
        anchor_stride = self.anchor_generator.strides[0]

        cx = W // 2
        cy = H // 2
        cx += int(
            torch.ceil((target_cx - self.train_cfg.search_size // 2) /
                       anchor_stride[0] + 0.5))
        cy += int(
            torch.ceil((target_cy - self.train_cfg.search_size // 2) /
                       anchor_stride[1] + 0.5))

        left = max(0, cx - 3)
        right = min(W, cx + 4)
        top = max(0, cy - 3)
        down = min(H, cy + 4)

        labels = labels.view(H, W, -1)
        labels[...] = -1
        labels[top:down, left:right, :] = 0

        labels = labels.view(-1)
        neg_inds = torch.nonzero(labels == 0, as_tuple=False)[:, 0]
        index = torch.randperm(
            neg_inds.numel(), device=neg_inds.device)[:self.train_cfg.num_neg]
        neg_inds = neg_inds[index]

        labels[...] = -1
        if len(neg_inds) > 0:
            labels[neg_inds] = 0
            labels_weights[neg_inds] = 1.0 / len(neg_inds) / 2
        labels[...] = 0

        return labels, labels_weights, bbox_targets, bbox_weights

[文档]    def get_targets(self, gt_bboxes, score_maps_size, is_positive_pairs):
        """Generate the training targets for exemplar image and search image
        pairs.

        Args:
            gt_bboxes (list[Tensor]): Ground truth bboxes of each
                search image with shape (1, 5) in [0.0, tl_x, tl_y, br_x, br_y]
                format.
            score_maps_size (torch.size): denoting the output size
                (height, width) of the network.
            is_positive_pairs (bool): list of bool denoting whether each ground
                truth bbox in `gt_bboxes` is positive.

        Returns:
            tuple(all_labels, all_labels_weights, all_bbox_targets,
            all_bbox_weights): the shape is (N, H * W * num_base_anchors),
            (N, H * W * num_base_anchors), (N, H * W * num_base_anchors, 4),
            (N, H * W * num_base_anchors, 4), respectively. All of them are
            Tensor.
        """
        (all_labels, all_labels_weights, all_bbox_targets,
         all_bbox_weights) = [], [], [], []

        for gt_bbox, is_positive_pair in zip(gt_bboxes, is_positive_pairs):
            if is_positive_pair:
                (labels, labels_weights, bbox_targets,
                 bbox_weights) = self._get_positive_pair_targets(
                     gt_bbox, score_maps_size)
            else:
                (labels, labels_weights, bbox_targets,
                 bbox_weights) = self._get_negative_pair_targets(
                     gt_bbox, score_maps_size)

            all_labels.append(labels)
            all_labels_weights.append(labels_weights)
            all_bbox_targets.append(bbox_targets)
            all_bbox_weights.append(bbox_weights)

        all_labels = torch.stack(all_labels)
        all_labels_weights = torch.stack(all_labels_weights) / len(
            all_labels_weights)
        all_bbox_targets = torch.stack(all_bbox_targets)
        all_bbox_weights = torch.stack(all_bbox_weights) / len(
            all_bbox_weights)

        return (all_labels, all_labels_weights, all_bbox_targets,
                all_bbox_weights)

[文档]    @force_fp32(apply_to=('cls_score', 'bbox_pred'))
    def loss(self, cls_score, bbox_pred, labels, labels_weights, bbox_targets,
             bbox_weights):
        """Compute loss.

        Args:
            cls_score (Tensor): of shape (N, 2 * num_base_anchors, H, W).
            bbox_pred (Tensor): of shape (N, 4 * num_base_anchors, H, W).
            labels (Tensor): of shape (N, H * W * num_base_anchors).
            labels_weights (Tensor): of shape (N, H * W * num_base_anchors).
            bbox_targets (Tensor): of shape (N, H * W * num_base_anchors, 4).
            bbox_weights (Tensor): of shape (N, H * W * num_base_anchors, 4).

        Returns:
            dict[str, Tensor]: a dictionary of loss components
        """
        losses = {}
        N, _, H, W = cls_score.shape

        cls_score = cls_score.view(N, 2, -1, H, W)
        cls_score = cls_score.permute(0, 3, 4, 2, 1).contiguous().view(-1, 2)
        labels = labels.view(-1)
        labels_weights = labels_weights.view(-1)
        losses['loss_rpn_cls'] = self.loss_cls(
            cls_score, labels, weight=labels_weights)

        bbox_pred = bbox_pred.view(N, 4, -1, H, W)
        bbox_pred = bbox_pred.permute(0, 3, 4, 2, 1).contiguous().view(-1, 4)
        bbox_targets = bbox_targets.view(-1, 4)
        bbox_weights = bbox_weights.view(-1, 4)
        losses['loss_rpn_bbox'] = self.loss_bbox(
            bbox_pred, bbox_targets, weight=bbox_weights)

        return losses

[文档]    @force_fp32(apply_to=('cls_score', 'bbox_pred'))
    def get_bbox(self, cls_score, bbox_pred, prev_bbox, scale_factor):
        """Track `prev_bbox` to current frame based on the output of network.

        Args:
            cls_score (Tensor): of shape (1, 2 * num_base_anchors, H, W).
            bbox_pred (Tensor): of shape (1, 4 * num_base_anchors, H, W).
            prev_bbox (Tensor): of shape (4, ) in [cx, cy, w, h] format.
            scale_factor (Tensr): scale factor.

        Returns:
            tuple(best_score, best_bbox): best_score is a Tensor denoting the
            score of `best_bbox`, best_bbox is a Tensor of shape (4, )
            with [cx, cy, w, h] format, which denotes the best tracked
            bbox in current frame.
        """
        score_maps_size = [(cls_score.shape[2:])]
        if not hasattr(self, 'anchors'):
            self.anchors = self.anchor_generator.grid_priors(
                score_maps_size, device=cls_score.device)[0]
            # Transform the coordinate origin from the top left corner to the
            # center in the scaled feature map.
            feat_h, feat_w = score_maps_size[0]
            stride_w, stride_h = self.anchor_generator.strides[0]
            self.anchors[:, 0:4:2] -= (feat_w // 2) * stride_w
            self.anchors[:, 1:4:2] -= (feat_h // 2) * stride_h

        if not hasattr(self, 'windows'):
            self.windows = self.anchor_generator.gen_2d_hanning_windows(
                score_maps_size, cls_score.device)[0]

        H, W = score_maps_size[0]
        cls_score = cls_score.view(2, -1, H, W)
        cls_score = cls_score.permute(2, 3, 1, 0).contiguous().view(-1, 2)
        cls_score = cls_score.softmax(dim=1)[:, 1]

        bbox_pred = bbox_pred.view(4, -1, H, W)
        bbox_pred = bbox_pred.permute(2, 3, 1, 0).contiguous().view(-1, 4)
        bbox_pred = self.bbox_coder.decode(self.anchors, bbox_pred)
        bbox_pred = bbox_xyxy_to_cxcywh(bbox_pred)

        def change_ratio(ratio):
            return torch.max(ratio, 1. / ratio)

        def enlarge_size(w, h):
            pad = (w + h) * 0.5
            return torch.sqrt((w + pad) * (h + pad))

        # scale penalty
        scale_penalty = change_ratio(
            enlarge_size(bbox_pred[:, 2], bbox_pred[:, 3]) / enlarge_size(
                prev_bbox[2] * scale_factor, prev_bbox[3] * scale_factor))

        # aspect ratio penalty
        aspect_ratio_penalty = change_ratio(
            (prev_bbox[2] / prev_bbox[3]) /
            (bbox_pred[:, 2] / bbox_pred[:, 3]))

        # penalize cls_score
        penalty = torch.exp(-(aspect_ratio_penalty * scale_penalty - 1) *
                            self.test_cfg.penalty_k)
        penalty_score = penalty * cls_score

        # window penalty
        penalty_score = penalty_score * (1 - self.test_cfg.window_influence) \
            + self.windows * self.test_cfg.window_influence

        best_idx = torch.argmax(penalty_score)
        best_score = cls_score[best_idx]
        best_bbox = bbox_pred[best_idx, :] / scale_factor

        final_bbox = torch.zeros_like(best_bbox)

        # map the bbox center from the searched image to the original image.
        final_bbox[0] = best_bbox[0] + prev_bbox[0]
        final_bbox[1] = best_bbox[1] + prev_bbox[1]

        # smooth bbox
        lr = penalty[best_idx] * cls_score[best_idx] * self.test_cfg.lr
        final_bbox[2] = prev_bbox[2] * (1 - lr) + best_bbox[2] * lr
        final_bbox[3] = prev_bbox[3] * (1 - lr) + best_bbox[3] * lr

        return best_score, final_bbox