Source code for mmtrack.utils.misc
# Copyright (c) OpenMMLab. All rights reserved.
from copy import deepcopy
from typing import List, Optional, Sequence, Tuple, Union
import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from ..structures import TrackDataSample
def stack_batch(tensors: List[torch.Tensor],
                pad_size_divisor: int = 0,
                pad_value: Union[int, float] = 0) -> torch.Tensor:
"""Stack multiple tensors to form a batch and pad the images to the max
shape use the right bottom padding mode in these images. If
``pad_size_divisor > 0``, add padding to ensure the common height and width
is divisible by ``pad_size_divisor``.
Args:
tensors (List[Tensor]): The input multiple tensors. each is a
TCHW 4D-tensor. T denotes the number of key/reference frames.
pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding
to ensure the common height and width is divisible by
``pad_size_divisor``. This depends on the model, and many
models need a divisibility of 32. Defaults to 0
pad_value (int, float): The padding value. Defaults to 0
Returns:
Tensor: The NTCHW 5D-tensor. N denotes the batch size.
"""
    assert isinstance(tensors, list), \
        f'Expected input type to be list, but got {type(tensors)}'
    assert len(set([tensor.ndim for tensor in tensors])) == 1, \
        f'Expected the dimensions of all tensors to be the same, ' \
        f'but got {[tensor.ndim for tensor in tensors]}'
    assert tensors[0].ndim == 4, f'Expected tensor dimension to be 4, ' \
        f'but got {tensors[0].ndim}'
    assert len(set([tensor.shape[0] for tensor in tensors])) == 1, \
        f'Expected the T dimensions of all tensors to be the same, ' \
        f'but got {[tensor.shape[0] for tensor in tensors]}'
    tensor_sizes = [(tensor.shape[-2], tensor.shape[-1]) for tensor in tensors]
    max_size = np.stack(tensor_sizes).max(0)

    if pad_size_divisor > 1:
        # the last two dims are H, W, both subject to the divisibility
        # requirement
        max_size = (
            max_size +
            (pad_size_divisor - 1)) // pad_size_divisor * pad_size_divisor

    padded_samples = []
    for tensor in tensors:
        padding_size = [
            0, max_size[-1] - tensor.shape[-1], 0,
            max_size[-2] - tensor.shape[-2]
        ]
        if sum(padding_size) == 0:
            padded_samples.append(tensor)
        else:
            padded_samples.append(F.pad(tensor, padding_size, value=pad_value))

    return torch.stack(padded_samples, dim=0)
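

# Usage sketch (illustrative; not part of the original module): stack two
# TCHW tensors with different spatial sizes into one NTCHW batch whose
# height and width are padded to a multiple of 32.
#
# >>> imgs = [torch.rand(1, 3, 224, 224), torch.rand(1, 3, 200, 180)]
# >>> batch = stack_batch(imgs, pad_size_divisor=32)
# >>> batch.shape
# torch.Size([2, 1, 3, 224, 224])
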
def convert_data_sample_type(
        data_sample: TrackDataSample,
        num_ref_imgs: int = 1) -> Tuple[List[TrackDataSample], List[dict]]:
"""Convert the type of ``data_sample`` from dict[list] to list[dict].
Note: This function is mainly used to be compatible with the
interface of MMDetection. It make sure that the information of
each reference image can be independently packed into
``data_sample`` in which all the keys are without prefix "ref_".
Args:
data_sample (TrackDataSample): Data sample input.
num_ref_imgs (int, optional): The numbe of reference images in the
``data_sample``. Defaults to 1.
Returns:
Tuple[List[TrackDataSample], List[dict]]: The first element is the
list of object of TrackDataSample. The second element is the
list of meta information of reference images.
"""
    ref_data_samples, ref_metainfos = [], []
    for _ in range(num_ref_imgs):
        ref_data_samples.append(deepcopy(data_sample))
        ref_metainfos.append(deepcopy(data_sample.metainfo))

    for key, value in data_sample.metainfo.items():
        if key.startswith('ref_'):
            new_key = key[4:]
            if num_ref_imgs == 1:
                value = [value]
            assert len(value) == num_ref_imgs
            for i, v in enumerate(value):
                ref_metainfos[i][new_key] = v
                # use the variable ``new_key`` as the key, not the literal
                # string 'new_key'
                ref_data_samples[i].set_metainfo({new_key: v})
                # pop the redundant original reference key.
                ref_metainfos[i].pop(key)
                ref_data_samples[i].pop(key)

    return ref_data_samples, ref_metainfos
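

# Usage sketch (illustrative; not part of the original module): unpack a
# TrackDataSample carrying "ref_"-prefixed metainfo for two reference
# images into two per-image samples without the prefix.
#
# >>> data_sample = TrackDataSample()
# >>> data_sample.set_metainfo(
# ...     dict(img_shape=(800, 1333),
# ...          ref_img_shape=[(800, 1333), (808, 1344)]))
# >>> samples, metainfos = convert_data_sample_type(data_sample, num_ref_imgs=2)
# >>> metainfos[1]['img_shape']
# (808, 1344)
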
def max_last2d(input: Tensor) -> Tuple[Tensor, Tensor]:
    """Compute the value and position of the maximum in the last two
    dimensions.

    Args:
        input (Tensor): of shape (..., H, W).

    Returns:
        max_val (Tensor): The maximum value.
        argmax (Tensor): The position of the maximum in [row, col] format.
    """
    # per-column maxima over rows, then the best column overall
    max_val_row, argmax_row = torch.max(input, dim=-2)
    max_val, argmax_col = torch.max(max_val_row, dim=-1)
    # gather the row index of the maximum at the winning column
    argmax_row = argmax_row.view(argmax_col.numel(),
                                 -1)[torch.arange(argmax_col.numel()),
                                     argmax_col.view(-1)]
    argmax_row = argmax_row.reshape(argmax_col.shape)
    argmax = torch.cat((argmax_row.unsqueeze(-1), argmax_col.unsqueeze(-1)),
                       -1)
    return max_val, argmax
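

# Usage sketch (illustrative; not part of the original module): locate the
# peak of a batch of 2-D response maps.
#
# >>> score_map = torch.zeros(2, 16, 16)
# >>> score_map[0, 3, 7] = 1.0
# >>> score_map[1, 9, 2] = 2.0
# >>> max_val, argmax = max_last2d(score_map)
# >>> max_val
# tensor([1., 2.])
# >>> argmax
# tensor([[3, 7],
#         [9, 2]])
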
def format_video_level_show(
        video_names: List,
        eval_results: List[np.ndarray],
        sort_by_first_metric: bool = True,
        show_indices: Optional[Union[int, Sequence]] = None) -> List[List]:
"""Format video-level performance show.
Args:
video_names (List): The names of the videos.
eval_results (List[np.ndarray]): The evaluation results.
sort_by_first_metric (bool, optional): Whether to sort the results by
the first metric. Defaults to True.
show_indices (Optional[Tuple[int, List]], optional): The video indices
to be shown. Defaults to None, i.e., all videos.
Returns:
List[List]: The formatted video-level evaluation results. For example:
[[`video-2`, 48.2, 49.2, 51.9],
[`video-1`, 46.2, 48.2, 50.2]]
"""
    all_video_names_str = np.array(video_names, dtype=str)
    # one row per video, one column per metric (also covers the unsorted
    # path, where a bare List[np.ndarray] would break the indexing and
    # ``.tolist()`` below)
    eval_show_results = np.stack(eval_results).T
    if sort_by_first_metric:
        # sort from largest to smallest
        sorted_index = np.argsort(-eval_results[0])
        all_video_names_str = all_video_names_str[sorted_index]
        sorted_eval_results = []
        for eval_res in eval_results:
            sorted_eval_results.append(eval_res[sorted_index])
        eval_show_results = np.stack(sorted_eval_results).T

    if show_indices is not None:
        if isinstance(show_indices, int):
            if show_indices < 0:
                # the last ``-show_indices`` videos
                show_indices = np.arange(show_indices, 0)
            else:
                # the first ``show_indices`` videos
                show_indices = np.arange(show_indices)
        elif isinstance(show_indices, Sequence):
            show_indices = np.array(show_indices, dtype=np.int64)
        else:
            raise NotImplementedError(
                f'{type(show_indices)} is not supported. '
                'Please use type of int or list')
        eval_show_results = eval_show_results[show_indices, :]

    eval_show_results = eval_show_results.tolist()
    for res_line, video_name in zip(eval_show_results, all_video_names_str):
        res_line.insert(0, video_name)

    return eval_show_results
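

# Usage sketch (illustrative; not part of the original module): two metrics
# over three videos, sorted by the first metric and truncated to the top-2
# rows via ``show_indices=2``.
#
# >>> names = ['video-1', 'video-2', 'video-3']
# >>> metric_a = np.array([46.2, 48.2, 40.0])
# >>> metric_b = np.array([50.2, 51.9, 39.5])
# >>> format_video_level_show(names, [metric_a, metric_b], show_indices=2)
# [['video-2', 48.2, 51.9], ['video-1', 46.2, 50.2]]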