Source code for mmtrack.models.motion.flownet_simple
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
from mmcv.cnn.bricks import ConvModule
from mmcv.runner import BaseModule
from ..builder import MOTION
@MOTION.register_module()
class FlowNetSimple(BaseModule):
"""The simple version of FlowNet.
    This module implements FlowNetSimple from `FlowNet: Learning Optical Flow
    with Convolutional Networks <https://arxiv.org/abs/1504.06852>`_.
Args:
img_scale_factor (float): Used to upsample/downsample the image.
        out_indices (list): The indices of the feature maps output after
            each group of conv layers. Defaults to [2, 3, 4, 5, 6].
flow_scale_factor (float): Used to enlarge the values of flow.
Defaults to 5.0.
        flow_img_norm_std (list): Used to scale the image values.
            Defaults to [255.0, 255.0, 255.0].
        flow_img_norm_mean (list): Used to center the image values.
            Defaults to [0.411, 0.432, 0.450].
init_cfg (dict or list[dict], optional): Initialization config dict.
Defaults to None.
"""
arch_setting = {
'conv_layers': {
'inplanes': (6, 64, 128, 256, 512, 512),
'kernel_size': (7, 5, 5, 3, 3, 3),
'num_convs': (1, 1, 2, 2, 2, 2)
},
'deconv_layers': {
'inplanes': (386, 770, 1026, 1024)
}
}
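    # Channel arithmetic of the settings above: the encoder maps the 6-channel
    # input (two stacked RGB frames) through 64/128/256/512/512 channels and
    # doubles the last stage to 1024. Each decoder input width is the sum of a
    # skip feature, a deconv output and a 2-channel flow map
    # (e.g. 512 + 512 + 2 = 1026), except 1024, which is the raw conv6 output.
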
def __init__(self,
img_scale_factor,
out_indices=[2, 3, 4, 5, 6],
flow_scale_factor=5.0,
flow_img_norm_std=[255.0, 255.0, 255.0],
flow_img_norm_mean=[0.411, 0.432, 0.450],
init_cfg=None):
super(FlowNetSimple, self).__init__(init_cfg)
self.img_scale_factor = img_scale_factor
self.out_indices = out_indices
self.flow_scale_factor = flow_scale_factor
self.flow_img_norm_mean = flow_img_norm_mean
self.flow_img_norm_std = flow_img_norm_std
self.conv_layers = []
conv_layers_setting = self.arch_setting['conv_layers']
for i in range(len(conv_layers_setting['inplanes'])):
num_convs = conv_layers_setting['num_convs'][i]
kernel_size = conv_layers_setting['kernel_size'][i]
inplanes = conv_layers_setting['inplanes'][i]
if i == len(conv_layers_setting['inplanes']) - 1:
planes = 2 * inplanes
else:
planes = conv_layers_setting['inplanes'][i + 1]
conv_layer = nn.ModuleList()
conv_layer.append(
ConvModule(
in_channels=inplanes,
out_channels=planes,
kernel_size=kernel_size,
stride=2,
padding=(kernel_size - 1) // 2,
bias=True,
conv_cfg=dict(type='Conv'),
act_cfg=dict(type='LeakyReLU', negative_slope=0.1)))
for j in range(1, num_convs):
kernel_size = 3 if i == 2 else kernel_size
conv_layer.append(
ConvModule(
in_channels=planes,
out_channels=planes,
kernel_size=kernel_size,
stride=1,
padding=(kernel_size - 1) // 2,
bias=True,
conv_cfg=dict(type='Conv'),
act_cfg=dict(type='LeakyReLU', negative_slope=0.1)))
self.add_module(f'conv{i+1}', conv_layer)
self.conv_layers.append(f'conv{i+1}')
self.deconv_layers = []
self.flow_layers = []
self.upflow_layers = []
deconv_layers_setting = self.arch_setting['deconv_layers']
planes = deconv_layers_setting['inplanes'][-1] // 2
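        # Build the decoder from the coarsest level to the finest: deconv2-5
        # upsample features, predict_flow3-6 regress a 2-channel flow at each
        # level, and upsample_flow2-5 bring that flow to the next finer level.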
for i in range(len(deconv_layers_setting['inplanes']) - 1, -1, -1):
inplanes = deconv_layers_setting['inplanes'][i]
deconv_layer = ConvModule(
in_channels=inplanes,
out_channels=planes,
kernel_size=4,
stride=2,
padding=1,
bias=False,
conv_cfg=dict(type='deconv'),
act_cfg=dict(type='LeakyReLU', negative_slope=0.1))
self.add_module(f'deconv{i+2}', deconv_layer)
self.deconv_layers.insert(0, f'deconv{i+2}')
flow_layer = ConvModule(
in_channels=inplanes,
out_channels=2,
kernel_size=3,
stride=1,
padding=1,
bias=False,
conv_cfg=dict(type='Conv'),
act_cfg=None)
self.add_module(f'predict_flow{i+3}', flow_layer)
self.flow_layers.insert(0, f'predict_flow{i+3}')
upflow_layer = ConvModule(
in_channels=2,
out_channels=2,
kernel_size=4,
stride=2,
padding=1,
bias=False,
conv_cfg=dict(type='deconv'),
act_cfg=None)
self.add_module(f'upsample_flow{i+2}', upflow_layer)
self.upflow_layers.insert(0, f'upsample_flow{i+2}')
planes = planes // 2
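        # Final flow head at the conv2 level: its input concatenates the conv2
        # skip feature (4 * planes), the deconv2 output (2 * planes) and the
        # 2-channel up-sampled flow, i.e. planes * (2 + 4) + 2 channels.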
self.predict_flow = ConvModule(
in_channels=planes * (2 + 4) + 2,
out_channels=2,
kernel_size=3,
stride=1,
padding=1,
bias=False,
conv_cfg=dict(type='Conv'),
act_cfg=None)
    def prepare_imgs(self, imgs, img_metas):
"""Preprocess images pairs for computing flow.
Args:
            imgs (Tensor): of shape (N, 6, H, W) encoding input image pairs.
Typically these should be mean centered and std scaled.
img_metas (list[dict]): list of image information dict where each
dict has: 'img_shape', 'scale_factor', 'flip', and may also
contain 'filename', 'ori_shape', 'pad_shape', and
'img_norm_cfg'. For details on the values of these keys see
`mmtrack/datasets/pipelines/formatting.py:VideoCollect`.
Returns:
            Tensor: of shape (N, 6, H, W) encoding the input image pairs for
FlowNetSimple.
"""
if not hasattr(self, 'img_norm_mean'):
mean = img_metas[0]['img_norm_cfg']['mean']
mean = torch.tensor(mean, dtype=imgs.dtype, device=imgs.device)
self.img_norm_mean = mean.repeat(2)[None, :, None, None]
mean = self.flow_img_norm_mean
mean = torch.tensor(mean, dtype=imgs.dtype, device=imgs.device)
self.flow_img_norm_mean = mean.repeat(2)[None, :, None, None]
if not hasattr(self, 'img_norm_std'):
std = img_metas[0]['img_norm_cfg']['std']
std = torch.tensor(std, dtype=imgs.dtype, device=imgs.device)
self.img_norm_std = std.repeat(2)[None, :, None, None]
std = self.flow_img_norm_std
std = torch.tensor(std, dtype=imgs.dtype, device=imgs.device)
self.flow_img_norm_std = std.repeat(2)[None, :, None, None]
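        # Undo the dataset normalization to recover raw pixel values, then
        # normalize with the flow-specific statistics (divide by
        # flow_img_norm_std and subtract flow_img_norm_mean).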
flow_img = imgs * self.img_norm_std + self.img_norm_mean
flow_img = flow_img / self.flow_img_norm_std - self.flow_img_norm_mean
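        # Zero out the padded border that lies outside the valid image region.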
flow_img[:, :, img_metas[0]['img_shape'][0]:, :] = 0.0
flow_img[:, :, :, img_metas[0]['img_shape'][1]:] = 0.0
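        # Resize the image pair by img_scale_factor before it is fed to the
        # network.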
flow_img = torch.nn.functional.interpolate(
flow_img,
scale_factor=self.img_scale_factor,
mode='bilinear',
align_corners=False)
return flow_img
    def forward(self, imgs, img_metas):
"""Compute the flow of images pairs.
Args:
            imgs (Tensor): of shape (N, 6, H, W) encoding input image pairs.
Typically these should be mean centered and std scaled.
img_metas (list[dict]): list of image information dict where each
dict has: 'img_shape', 'scale_factor', 'flip', and may also
contain 'filename', 'ori_shape', 'pad_shape', and
'img_norm_cfg'. For details on the values of these keys see
`mmtrack/datasets/pipelines/formatting.py:VideoCollect`.
Returns:
            Tensor: of shape (N, 2, H, W) encoding the flow of image pairs.
"""
x = self.prepare_imgs(imgs, img_metas)
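        # Encoder: run conv1-conv6 and keep the feature maps whose indices are
        # listed in out_indices as skip connections for the decoder.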
conv_outs = []
for i, conv_name in enumerate(self.conv_layers, 1):
conv_layer = getattr(self, conv_name)
for module in conv_layer:
x = module(x)
if i in self.out_indices:
conv_outs.append(x)
num_outs = len(conv_outs)
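        # Decoder: refine the flow from the coarsest level to the finest. At
        # each level the skip feature, the deconv feature and the up-sampled
        # coarse flow are concatenated for the next prediction.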
for i, deconv_name, flow_name, upflow_name in zip(
range(1, num_outs)[::-1], self.deconv_layers[::-1],
self.flow_layers[::-1], self.upflow_layers[::-1]):
deconv_layer = getattr(self, deconv_name)
flow_layer = getattr(self, flow_name)
upflow_layer = getattr(self, upflow_name)
if i == num_outs - 1:
concat_out = conv_outs[i]
flow = flow_layer(concat_out)
upflow = self.crop_like(upflow_layer(flow), conv_outs[i - 1])
deconv_out = self.crop_like(
deconv_layer(concat_out), conv_outs[i - 1])
concat_out = torch.cat((conv_outs[i - 1], deconv_out, upflow),
dim=1)
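        # Predict the final flow at the finest decoder level, resize it back
        # to the original input resolution and scale the flow values by the
        # same factor; flow_scale_factor then enlarges the flow values as
        # described in the class docstring.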
flow = self.predict_flow(concat_out)
flow = torch.nn.functional.interpolate(
flow,
scale_factor=4 / self.img_scale_factor,
mode='bilinear',
align_corners=False)
flow *= 4 / self.img_scale_factor
flow *= self.flow_scale_factor
return flow
    def crop_like(self, input, target):
"""Crop `input` as the size of `target`."""
if input.size()[2:] == target.size()[2:]:
return input
else:
return input[:, :, :target.size(2), :target.size(3)]
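

if __name__ == '__main__':
    # Usage sketch (illustrative only, not part of the original module): run
    # FlowNetSimple on a dummy image pair. The scale factor, image size and
    # normalization statistics below are assumptions chosen for this example.
    flownet = FlowNetSimple(img_scale_factor=0.5)
    flownet.eval()

    # Two RGB frames stacked along the channel dimension -> (N, 6, H, W).
    dummy_imgs = torch.randn(1, 6, 384, 512)
    dummy_metas = [
        dict(
            img_shape=(384, 512, 3),
            img_norm_cfg=dict(
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375]))
    ]
    with torch.no_grad():
        flow = flownet(dummy_imgs, dummy_metas)
    print(flow.shape)  # torch.Size([1, 2, 384, 512])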