# Source code for mmtrack.datasets.pipelines.transforms
# Copyright (c) OpenMMLab. All rights reserved.
import math
import cv2
import mmcv
import numpy as np
from mmcv.utils import print_log
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import Normalize, Pad, RandomFlip, Resize
from mmtrack.core import crop_image
@PIPELINES.register_module()
class SeqCropLikeSiamFC(object):
    """Crop every image of a sequence the way SiamFC does.

    The cropping scheme comes from
    "Fully-Convolutional Siamese Networks for Object Tracking."
    `SiamFC <https://arxiv.org/abs/1606.09549>`_.

    Args:
        context_amount (float): Amount of context kept around the bounding
            box. Defaults to 0.5.
        exemplar_size (int): Side length of the exemplar. Defaults to 127.
        crop_size (int): Side length of the produced crop. Defaults to 511.
    """

    def __init__(self, context_amount=0.5, exemplar_size=127, crop_size=511):
        self.context_amount = context_amount
        self.exemplar_size = exemplar_size
        self.crop_size = crop_size

    def crop_like_SiamFC(self,
                         image,
                         bbox,
                         context_amount=0.5,
                         exemplar_size=127,
                         crop_size=511):
        """Cut a SiamFC-style square search region out of ``image``.

        Args:
            image (ndarray): Image of shape (H, W, 3).
            bbox (ndarray): Box of shape (4, ) in [x1, y1, x2, y2] format.
            context_amount (float): Amount of context kept around the
                bounding box. Defaults to 0.5.
            exemplar_size (int): Exemplar size. Defaults to 127.
            crop_size (int): Crop size. Defaults to 511.

        Returns:
            ndarray: Whatever ``crop_image`` returns for the computed
            region; expected to be of shape (crop_size, crop_size, 3).
        """
        # The per-channel mean of the image is handed to ``crop_image`` as
        # the padding value.
        mean_color = np.mean(image, axis=(0, 1)).tolist()

        # Re-express the box as [cx, cy, w, h].
        cxcywh = np.array([
            0.5 * (bbox[2] + bbox[0]), 0.5 * (bbox[3] + bbox[1]),
            bbox[2] - bbox[0], bbox[3] - bbox[1]
        ])
        center_x, center_y, box_w, box_h = cxcywh

        # Exemplar region: the box enlarged by the context on both sides.
        z_width = box_w + context_amount * (box_w + box_h)
        z_height = box_h + context_amount * (box_w + box_h)
        z_size = np.sqrt(z_width * z_height)
        z_scale = exemplar_size / z_size

        # Search region: exemplar region plus a margin measured in source
        # image pixels.
        d_search = (crop_size - exemplar_size) / 2.
        margin = d_search / z_scale
        x_size = z_size + 2 * margin

        search_region = np.array([
            center_x - 0.5 * x_size, center_y - 0.5 * x_size,
            center_x + 0.5 * x_size, center_y + 0.5 * x_size
        ])
        return crop_image(image, search_region, crop_size, mean_color)

    def generate_box(self, image, gt_bbox, context_amount, exemplar_size):
        """Build the ground-truth box expressed in the cropped image.

        The box is centred on the crop and keeps the aspect ratio of
        ``gt_bbox``, rescaled by the exemplar scale factor.

        Args:
            image (ndarray): The cropped image; only its height and width
                are read.
            gt_bbox (ndarray): Box of shape (4, ) in [x1, y1, x2, y2]
                format, in original image coordinates.
            context_amount (float): Amount of context kept around the
                bounding box.
            exemplar_size (int): Exemplar size.

        Returns:
            ndarray: float32 box of shape (4, ) in [x1, y1, x2, y2] format.
        """
        crop_h, crop_w = image.shape[:2]
        box_w = gt_bbox[2] - gt_bbox[0]
        box_h = gt_bbox[3] - gt_bbox[1]

        z_width = box_w + context_amount * (box_w + box_h)
        z_height = box_h + context_amount * (box_w + box_h)
        z_scale_factor = exemplar_size / np.sqrt(z_width * z_height)

        scaled_w = box_w * z_scale_factor
        scaled_h = box_h * z_scale_factor
        center_x, center_y = crop_w // 2, crop_h // 2
        return np.array([
            center_x - 0.5 * scaled_w, center_y - 0.5 * scaled_h,
            center_x + 0.5 * scaled_w, center_y + 0.5 * scaled_h
        ],
                        dtype=np.float32)

    def __call__(self, results):
        """Crop each image in ``results`` and regenerate its box.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: The same dicts with 'img' and 'gt_bboxes' replaced
            (and 'img_shape' refreshed when the key already exists).
        """
        outs = []
        for sample in results:
            # Only the first ground-truth box is used.
            first_box = sample['gt_bboxes'][0]
            cropped = self.crop_like_SiamFC(sample['img'], first_box,
                                            self.context_amount,
                                            self.exemplar_size,
                                            self.crop_size)
            new_box = self.generate_box(cropped, first_box,
                                        self.context_amount,
                                        self.exemplar_size)

            sample['img'] = cropped
            if 'img_shape' in sample:
                sample['img_shape'] = cropped.shape
            # Restore the leading "number of boxes" axis.
            sample['gt_bboxes'] = new_box[None]
            outs.append(sample)
        return outs
@PIPELINES.register_module()
class SeqCropLikeStark(object):
    """Crop images as Stark did.

    The way of cropping an image is proposed in
    "Learning Spatio-Temporal Transformer for Visual Tracking."
    `Stark <https://arxiv.org/abs/2103.17154>`_.

    Args:
        crop_size_factor (list[int | float]): contains the ratio of crop size
            to bbox size, one entry per element of the input sequence.
        output_size (list[int | float]): contains the size of resized image
            (always square), one entry per element of the input sequence.
    """

    def __init__(self, crop_size_factor, output_size):
        self.crop_size_factor = crop_size_factor
        self.output_size = output_size

    def crop_like_stark(self, img, bbox, crop_size_factor, output_size):
        """Crop an image as Stark did.

        Args:
            img (ndarray): of shape (H, W, 3).
            bbox (ndarray): four values in [x1, y1, x2, y2] format.
            crop_size_factor (float): the ratio of crop size to bbox size.
            output_size (int): the size of resized image (always square).

        Returns:
            tuple: ``(img_crop_padded, resize_factor, padding_mask)`` where

            - img_crop_padded (ndarray): the cropped and padded image,
              resized to (output_size, output_size).
            - resize_factor (float): ``output_size / crop_size``.
            - padding_mask (ndarray): boolean mask of shape
              (output_size, output_size), True on padded pixels.
        """
        # Unpack into numpy scalars instead of shape-(1,) arrays: passing
        # size-1 arrays to ``math.sqrt`` / ``int`` is deprecated since
        # NumPy 1.25.
        x1, y1, x2, y2 = np.asarray(bbox).reshape(-1)
        bbox_w, bbox_h = x2 - x1, y2 - y1
        cx, cy = x1 + bbox_w / 2., y1 + bbox_h / 2.
        img_h, img_w, _ = img.shape

        # 1. Crop image
        # 1.1 calculate crop size and pad size
        crop_size = math.ceil(math.sqrt(bbox_w * bbox_h) * crop_size_factor)
        crop_size = max(crop_size, 1)

        x1 = int(np.round(cx - crop_size * 0.5))
        x2 = x1 + crop_size
        y1 = int(np.round(cy - crop_size * 0.5))
        y2 = y1 + crop_size

        # NOTE(review): the ``+ 1`` pads one extra pixel on the right/bottom
        # when the crop touches the border; kept unchanged so the output
        # matches the existing behaviour.
        x1_pad = max(0, -x1)
        x2_pad = max(x2 - img_w + 1, 0)
        y1_pad = max(0, -y1)
        y2_pad = max(y2 - img_h + 1, 0)

        # 1.2 crop image (only the part that lies inside the image)
        img_crop = img[y1 + y1_pad:y2 - y2_pad, x1 + x1_pad:x2 - x2_pad, :]

        # 1.3 pad image back to (crop_size, crop_size) with a constant border
        img_crop_padded = cv2.copyMakeBorder(img_crop, y1_pad, y2_pad, x1_pad,
                                             x2_pad, cv2.BORDER_CONSTANT)

        # 1.4 generate padding mask: 1 on padded pixels, 0 on real pixels
        padded_h, padded_w, _ = img_crop_padded.shape
        padding_mask = np.ones((padded_h, padded_w))
        # A zero pad must map to ``None``; ``-0`` would give an empty slice.
        end_x = -x2_pad if x2_pad != 0 else None
        end_y = -y2_pad if y2_pad != 0 else None
        padding_mask[y1_pad:end_y, x1_pad:end_x] = 0

        # 2. Resize image and padding mask
        resize_factor = output_size / crop_size
        img_crop_padded = cv2.resize(img_crop_padded,
                                     (output_size, output_size))
        padding_mask = cv2.resize(padding_mask,
                                  (output_size, output_size)).astype(np.bool_)

        return img_crop_padded, resize_factor, padding_mask

    def generate_box(self,
                     bbox_gt,
                     bbox_cropped,
                     resize_factor,
                     output_size,
                     normalize=False):
        """Transform the box coordinates from the original image coordinates
        to the coordinates of the cropped image.

        Args:
            bbox_gt (ndarray): of shape (4, ) in [x1, y1, x2, y2] format.
            bbox_cropped (ndarray): of shape (4, ) in [x1, y1, x2, y2]
                format; its centre is taken as the centre of the crop.
            resize_factor (float): the scale applied when the crop was
                resized to ``output_size``.
            output_size (float): the size of output image.
            normalize (bool): whether to divide the output box by
                ``output_size``. Defaults to False.

        Returns:
            ndarray: generated box of shape (4, ) in [x1, y1, x2, y2] format.
        """
        assert output_size > 0
        bbox_gt_center = (bbox_gt[0:2] + bbox_gt[2:4]) * 0.5
        bbox_cropped_center = (bbox_cropped[0:2] + bbox_cropped[2:4]) * 0.5

        # The crop centre lands on the centre of the output image.
        bbox_out_center = (output_size - 1) / 2. + (
            bbox_gt_center - bbox_cropped_center) * resize_factor
        bbox_out_wh = (bbox_gt[2:4] - bbox_gt[0:2]) * resize_factor
        bbox_out = np.concatenate((bbox_out_center - 0.5 * bbox_out_wh,
                                   bbox_out_center + 0.5 * bbox_out_wh),
                                  axis=-1)

        return bbox_out / output_size if normalize else bbox_out

    def __call__(self, results):
        """Call function. For each dict in results, crop image like Stark did.

        Args:
            results (list[dict]): list of dict from
                :obj:`mmtrack.base_sot_dataset`. Each dict must contain
                'img', 'gt_bboxes' and 'jittered_bboxes'.

        Returns:
            list[dict]: list of dict that contains cropped image, the
            corresponding groundtruth bbox and the 'padding_mask' (also
            registered in 'seg_fields').
        """
        outs = []
        for i, _results in enumerate(results):
            image = _results['img']
            gt_bbox = _results['gt_bboxes'][0]
            jittered_bboxes = _results['jittered_bboxes'][0]

            crop_img, resize_factor, padding_mask = self.crop_like_stark(
                image, jittered_bboxes, self.crop_size_factor[i],
                self.output_size[i])

            generated_bbox = self.generate_box(
                gt_bbox,
                jittered_bboxes,
                resize_factor,
                self.output_size[i],
                normalize=False)
            generated_bbox = generated_bbox[None]

            _results['img'] = crop_img
            if 'img_shape' in _results:
                _results['img_shape'] = crop_img.shape
            _results['gt_bboxes'] = generated_bbox
            _results['seg_fields'] = ['padding_mask']
            _results['padding_mask'] = padding_mask
            outs.append(_results)
        return outs
@PIPELINES.register_module()
class SeqBboxJitter(object):
    """Randomly perturb the ground-truth box of every sequence element.

    The jittered boxes are stored under 'jittered_bboxes' and are meant for
    subsequent image cropping, like `SeqCropLikeStark`.

    Args:
        scale_jitter_factor (list[int | float]): contains the factor of scale
            jitter.
        center_jitter_factor (list[int | float]): contains the factor of
            center jitter.
        crop_size_factor (list[int | float]): contains the ratio of crop size
            to bbox size.
    """

    def __init__(self, scale_jitter_factor, center_jitter_factor,
                 crop_size_factor):
        self.scale_jitter_factor = scale_jitter_factor
        self.center_jitter_factor = center_jitter_factor
        self.crop_size_factor = crop_size_factor

    def __call__(self, results):
        """Add a 'jittered_bboxes' entry to each dict.

        Args:
            results (list[dict]): list of dict from
                :obj:`mmtrack.base_sot_dataset`.

        Returns:
            list[dict] | None: the input dicts with 'jittered_bboxes' of
            shape (1, 4) added, or None when no jitter with a crop size of
            at least 1 is found within 100 attempts.
        """
        outs = []
        for idx, sample in enumerate(results):
            gt_bbox = sample['gt_bboxes'][0]
            x1, y1, x2, y2 = np.split(gt_bbox, 4, axis=-1)
            bbox_w = x2 - x1
            bbox_h = y2 - y1
            gt_bbox_cxcywh = np.concatenate(
                [x1 + bbox_w / 2., y1 + bbox_h / 2., bbox_w, bbox_h], axis=-1)

            # Resample the scale jitter until the implied crop is at least
            # one pixel wide; give up after 100 attempts.
            attempts = 0
            while True:
                attempts += 1
                if attempts > 100:
                    print_log(
                        f'-------- bbox {gt_bbox_cxcywh} is invalid -------')
                    return None
                log_scale = np.random.randn(2) * self.scale_jitter_factor[idx]
                jittered_wh = gt_bbox_cxcywh[2:4] * np.exp(log_scale)
                crop_img_size = np.ceil(
                    np.sqrt(jittered_wh.prod()) * self.crop_size_factor[idx])
                if not crop_img_size < 1:
                    break

            # Shift the centre by a uniform offset proportional to the
            # jittered box size.
            max_offset = np.sqrt(
                jittered_wh.prod()) * self.center_jitter_factor[idx]
            jittered_center = gt_bbox_cxcywh[0:2] + max_offset * (
                np.random.rand(2) - 0.5)

            half_wh = 0.5 * jittered_wh
            corners = np.concatenate(
                (jittered_center - half_wh, jittered_center + half_wh),
                axis=-1)

            sample['jittered_bboxes'] = corners[None]
            outs.append(sample)
        return outs
@PIPELINES.register_module()
class SeqBrightnessAug(object):
    """Scale the brightness of all images by one shared random factor.

    Args:
        jitter_range (float): The range of brightness jitter.
            Defaults to 0.
    """

    def __init__(self, jitter_range=0):
        self.jitter_range = jitter_range

    def __call__(self, results):
        """Multiply every image by a single random factor and clip to
        [0, 255].

        The factor is drawn once per call, uniformly from
        ``[max(0, 1 - jitter_range), 1 + jitter_range]``.

        Args:
            results (list[dict]): list of dict that from
                :obj:`mmtrack.base_sot_dataset`.

        Returns:
            list[dict]: list of dict that contains augmented image.
        """
        lower = max(0, 1 - self.jitter_range)
        upper = 1 + self.jitter_range
        brightness_factor = np.random.uniform(lower, upper)

        outs = []
        for sample in results:
            scaled = np.dot(sample['img'], brightness_factor)
            sample['img'] = scaled.clip(0, 255.0)
            outs.append(sample)
        return outs
@PIPELINES.register_module()
class SeqGrayAug(object):
    """Randomly convert all images of a sequence to gray.

    Args:
        prob (float): The probability to perform gray augmention.
            Defaults to 0.
    """

    def __init__(self, prob=0.):
        self.prob = prob

    def __call__(self, results):
        """Gray out every image, or none, based on one random draw.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict whose 'img' is replaced by a
            three-channel gray image when the augmentation is triggered.
        """
        # A single draw decides for the whole sequence.
        gray_prob = np.random.random()
        apply_gray = self.prob > gray_prob

        outs = []
        for sample in results:
            if apply_gray:
                single_channel = cv2.cvtColor(sample['img'],
                                              cv2.COLOR_BGR2GRAY)
                # Back to three channels so the image shape is unchanged.
                sample['img'] = cv2.cvtColor(single_channel,
                                             cv2.COLOR_GRAY2BGR)
            outs.append(sample)
        return outs
@PIPELINES.register_module()
class SeqShiftScaleAug(object):
    """Randomly shift and rescale images together with their boxes.

    Args:
        target_size (list[int]): list of int denoting exemplar size and search
            size, respectively. Defaults to [127, 255].
        shift (list[int]): list of int denoting the max shift offset. Defaults
            to [4, 64].
        scale (list[float]): list of float denoting the max rescale factor.
            Defaults to [0.05, 0.18].
    """

    def __init__(self,
                 target_size=[127, 255],
                 shift=[4, 64],
                 scale=[0.05, 0.18]):
        self.target_size = target_size
        self.shift = shift
        self.scale = scale

    def _shift_scale_aug(self, image, bbox, target_size, shift, scale):
        """Shift and rescale one image and its bounding box.

        Note that ``bbox`` is modified in place.

        Args:
            image (ndarray): of shape (H, W, 3). Typically H and W equal to
                511.
            bbox (ndarray): of shape (4, ) in [x1, y1, x2, y2] format.
            target_size (int): Exemplar size or search size.
            shift (int): The max shift offset.
            scale (float): The max rescale factor.

        Returns:
            tuple(crop_img, bbox): crop_img is what ``crop_image`` returns
            for the sampled region and ``target_size``; bbox is the
            corresponding ground truth box in [x1, y1, x2, y2] format.
        """
        img_h, img_w = image.shape[:2]

        # Per-axis scale in [1 - scale, 1 + scale], capped so that the crop
        # region is never larger than the image.
        scale_x = (2 * np.random.random() - 1) * scale + 1
        scale_y = (2 * np.random.random() - 1) * scale + 1
        scale_x = min(scale_x, float(img_w) / target_size)
        scale_y = min(scale_y, float(img_h) / target_size)

        # Region centred on the image centre.
        center_x, center_y = img_w // 2, img_h // 2
        crop_region = np.array([
            center_x - 0.5 * scale_x * target_size,
            center_y - 0.5 * scale_y * target_size,
            center_x + 0.5 * scale_x * target_size,
            center_y + 0.5 * scale_y * target_size
        ])

        # Random offset in [-shift, shift], limited so the region stays
        # inside the image.
        shift_x = (2 * np.random.random() - 1) * shift
        shift_y = (2 * np.random.random() - 1) * shift
        shift_x = max(-crop_region[0], min(img_w - crop_region[2], shift_x))
        shift_y = max(-crop_region[1], min(img_h - crop_region[3], shift_y))
        crop_region = crop_region + np.array(
            [shift_x, shift_y, shift_x, shift_y])

        crop_img = crop_image(image, crop_region, target_size)

        # Move the box into crop coordinates, then undo the scaling.
        top_left = np.array(
            [crop_region[0], crop_region[1], crop_region[0], crop_region[1]])
        bbox -= top_left
        bbox /= np.array([scale_x, scale_y, scale_x, scale_y],
                         dtype=np.float32)
        return crop_img, bbox

    def __call__(self, results):
        """Augment each dict with its own target size, shift and scale.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict that contains cropped image and
            corresponding ground truth box.
        """
        outs = []
        for idx, sample in enumerate(results):
            cropped, box = self._shift_scale_aug(sample['img'],
                                                 sample['gt_bboxes'][0],
                                                 self.target_size[idx],
                                                 self.shift[idx],
                                                 self.scale[idx])
            sample['img'] = cropped
            if 'img_shape' in sample:
                sample['img_shape'] = cropped.shape
            sample['gt_bboxes'] = box[None]
            outs.append(sample)
        return outs
@PIPELINES.register_module()
class SeqColorAug(object):
    """Randomly offset the colour channels of each image.

    Args:
        prob (list[float]): The probability to perform color augmention for
            each image. Defaults to [1.0, 1.0].
        rgb_var (list[list]): The values of color augmentaion. Defaults to
            [[-0.55919361, 0.98062831, -0.41940627],
            [1.72091413, 0.19879334, -1.82968581],
            [4.64467907, 4.73710203, 4.88324118]].
    """

    def __init__(self,
                 prob=[1.0, 1.0],
                 rgb_var=[[-0.55919361, 0.98062831, -0.41940627],
                          [1.72091413, 0.19879334, -1.82968581],
                          [4.64467907, 4.73710203, 4.88324118]]):
        self.prob = prob
        self.rgb_var = np.array(rgb_var, dtype=np.float32)

    def __call__(self, results):
        """Subtract a random per-channel offset from selected images.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict that contains augmented color image
            (float32 when the augmentation is applied).
        """
        outs = []
        for idx, sample in enumerate(results):
            image = sample['img']
            if self.prob[idx] > np.random.random():
                noise = np.random.randn(3, 1)
                # Reverse the channel order of the offset before applying it.
                offset = np.dot(self.rgb_var, noise)[::-1].reshape(3)
                image = (image - offset).astype(np.float32)
            sample['img'] = image
            outs.append(sample)
        return outs
@PIPELINES.register_module()
class SeqBlurAug(object):
    """Randomly blur each image with a cross-shaped kernel.

    Args:
        prob (list[float]): The probability to perform blur augmention for
            each image. Defaults to [0.0, 0.2].
    """

    def __init__(self, prob=[0.0, 0.2]):
        self.prob = prob

    def __call__(self, results):
        """Blur selected images.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict that contains augmented blur image.
        """
        outs = []
        for idx, sample in enumerate(results):
            image = sample['img']
            if self.prob[idx] > np.random.random():
                # Odd kernel size drawn from 5, 7, ..., 45.
                size = np.random.choice(np.arange(5, 46, 2))
                kernel = np.zeros((size, size))
                middle = int(size / 2)
                # ``wx`` splits the weight between the central column and
                # the central row of the kernel.
                wx = np.random.random()
                kernel[:, middle] += 1. / size * wx
                kernel[middle, :] += 1. / size * (1 - wx)
                image = cv2.filter2D(image, -1, kernel)
            sample['img'] = image
            outs.append(sample)
        return outs
@PIPELINES.register_module()
class SeqResize(Resize):
    """Resize every image of a sequence.

    Please refer to `mmdet.datasets.pipelines.transforms.py:Resize` for
    detailed docstring.

    Args:
        share_params (bool): If True, share the resize parameters for all
            images. Defaults to True.
    """

    def __init__(self, share_params=True, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.share_params = share_params

    def __call__(self, results):
        """Apply the parent ``Resize`` to each dict in turn.

        When ``share_params`` is set, the 'scale' produced for the first
        dict is written into every following dict before it is resized.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict that contains resized results,
            'img_shape', 'pad_shape', 'scale_factor', 'keep_ratio' keys
            are added into result dict.
        """
        resize_one = super().__call__
        outs = []
        shared_scale = None
        for idx, sample in enumerate(results):
            is_first = idx == 0
            if self.share_params and not is_first:
                sample['scale'] = shared_scale
            sample = resize_one(sample)
            if self.share_params and is_first:
                shared_scale = sample['scale']
            outs.append(sample)
        return outs
@PIPELINES.register_module()
class SeqNormalize(Normalize):
    """Normalize every image of a sequence.

    Please refer to `mmdet.datasets.pipelines.transforms.py:Normalize` for
    detailed docstring.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Apply the parent ``Normalize`` to each dict in turn.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict that contains normalized results,
            'img_norm_cfg' key is added into result dict.
        """
        # Bind the parent call once; zero-argument ``super()`` is not usable
        # inside a comprehension on older Python versions.
        normalize_one = super().__call__
        return [normalize_one(sample) for sample in results]
@PIPELINES.register_module()
class SeqRandomFlip(RandomFlip):
    """Randomly flip for images.

    Please refer to `mmdet.datasets.pipelines.transforms.py:RandomFlip` for
    detailed docstring.

    Args:
        share_params (bool): If True, share the flip parameters for all images.
            Defaults to True.
    """

    # ``share_params`` now really defaults to True, as documented and as in
    # ``SeqResize``; passing it explicitly keeps working unchanged.
    def __init__(self, share_params=True, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.share_params = share_params

    def __call__(self, results):
        """Call function.

        For each dict in results, call `RandomFlip` to randomly flip image.
        When ``share_params`` is True a single flip decision is sampled here
        and written into every dict as 'flip' / 'flip_direction' before the
        parent transform runs.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict that contains flipped results, 'flip',
            'flip_direction' keys are added into the dict.
        """
        if self.share_params:
            # Candidate directions; the trailing ``None`` means "no flip".
            if isinstance(self.direction, list):
                direction_list = self.direction + [None]
            else:
                direction_list = [self.direction, None]

            # Matching probabilities; whatever is left goes to "no flip".
            if isinstance(self.flip_ratio, list):
                non_flip_ratio = 1 - sum(self.flip_ratio)
                flip_ratio_list = self.flip_ratio + [non_flip_ratio]
            else:
                non_flip_ratio = 1 - self.flip_ratio
                # A scalar ratio is split evenly over the real directions
                # (excluding non-flip).
                num_directions = len(direction_list) - 1
                single_ratio = self.flip_ratio / num_directions
                flip_ratio_list = [single_ratio] * num_directions + [
                    non_flip_ratio
                ]

            cur_dir = np.random.choice(direction_list, p=flip_ratio_list)
            flip = cur_dir is not None
            flip_direction = cur_dir

            for _results in results:
                _results['flip'] = flip
                _results['flip_direction'] = flip_direction

        outs = []
        for _results in results:
            _results = super().__call__(_results)
            outs.append(_results)
        return outs
@PIPELINES.register_module()
class SeqPad(Pad):
    """Pad every image of a sequence.

    Please refer to `mmdet.datasets.pipelines.transforms.py:Pad` for detailed
    docstring.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, results):
        """Apply the parent ``Pad`` to each dict in turn.

        Args:
            results (list[dict]): List of dict that from
                :obj:`mmtrack.CocoVideoDataset`.

        Returns:
            list[dict]: List of dict that contains padding results,
            'pad_shape', 'pad_fixed_size' and 'pad_size_divisor' keys are
            added into the dict.
        """
        # Bind the parent call once; zero-argument ``super()`` is not usable
        # inside a comprehension on older Python versions.
        pad_one = super().__call__
        return [pad_one(sample) for sample in results]
@PIPELINES.register_module()
class SeqRandomCrop(object):
    """Sequentially random crop the images & bboxes & masks.

    A window of ``crop_size`` is placed at a random offset inside each image
    and the images, boxes, labels, masks and segmentation maps are cropped
    accordingly.

    Args:
        crop_size (tuple): The absolute (height, width) of the crop in
            pixels. Both values must be positive.
        allow_negative_crop (bool, optional): Whether to allow a crop that does
            not contain any bbox area. Defaults to False.
        share_params (bool, optional): Whether share the cropping parameters
            for the images. Defaults to False.
        bbox_clip_border (bool, optional): Whether clip the objects outside
            the border of the image. Defaults to False.

    Note:
        - If the image is smaller than the crop size along an axis, the full
          extent of that axis is kept.
        - The keys for bboxes, labels and masks must be aligned. That is,
          `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and
          `gt_bboxes_ignore` corresponds to `gt_labels_ignore` and
          `gt_masks_ignore`.
        - If the crop does not contain any gt-bbox region and
          `allow_negative_crop` is set to False, skip this image.
    """

    def __init__(self,
                 crop_size,
                 allow_negative_crop=False,
                 share_params=False,
                 bbox_clip_border=False):
        assert crop_size[0] > 0 and crop_size[1] > 0
        self.crop_size = crop_size
        self.allow_negative_crop = allow_negative_crop
        self.share_params = share_params
        self.bbox_clip_border = bbox_clip_border
        # The key correspondence from bboxes to labels and masks.
        self.bbox2label = {
            'gt_bboxes': ['gt_labels', 'gt_instance_ids'],
            'gt_bboxes_ignore': ['gt_labels_ignore', 'gt_instance_ids_ignore']
        }
        self.bbox2mask = {
            'gt_bboxes': 'gt_masks',
            'gt_bboxes_ignore': 'gt_masks_ignore'
        }

    def get_offsets(self, img):
        """Random generate the offsets for cropping.

        Args:
            img (ndarray): Image whose first two dimensions are height and
                width.

        Returns:
            tuple[int, int]: ``(offset_h, offset_w)``; each is 0 when the
            image is not larger than the crop along that axis.
        """
        margin_h = max(img.shape[0] - self.crop_size[0], 0)
        margin_w = max(img.shape[1] - self.crop_size[1], 0)
        offset_h = np.random.randint(0, margin_h + 1)
        offset_w = np.random.randint(0, margin_w + 1)
        return offset_h, offset_w

    def random_crop(self, results, offsets=None):
        """Call function to randomly crop images, bounding boxes, masks,
        semantic segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.
            offsets (tuple, optional): Pre-defined ``(offset_h, offset_w)``
                for cropping. When None, offsets are sampled here and
                recorded in ``results['img_info']['crop_offsets']``.
                Defaults to None.

        Returns:
            dict | None: Randomly cropped results, 'img_shape' key in result
            dict is updated according to crop size. None when no valid
            'gt_bboxes' remain and ``allow_negative_crop`` is False.
        """
        for key in results.get('img_fields', ['img']):
            img = results[key]
            if offsets is not None:
                offset_h, offset_w = offsets
            else:
                offset_h, offset_w = self.get_offsets(img)
                results['img_info']['crop_offsets'] = (offset_h, offset_w)
            crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0]
            crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1]

            # crop the image
            img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
            img_shape = img.shape
            results[key] = img
        results['img_shape'] = img_shape

        # The offset is the same for every bbox field, so build it once.
        bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h],
                               dtype=np.float32)

        # crop bboxes accordingly and clip to the image boundary
        for key in results.get('bbox_fields', []):
            # e.g. gt_bboxes and gt_bboxes_ignore
            bboxes = results[key] - bbox_offset
            if self.bbox_clip_border:
                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
            valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & (
                bboxes[:, 3] > bboxes[:, 1])
            # If the crop does not contain any gt-bbox area and
            # self.allow_negative_crop is False, skip this image.
            if (key == 'gt_bboxes' and not valid_inds.any()
                    and not self.allow_negative_crop):
                return None
            results[key] = bboxes[valid_inds, :]

            # label fields. e.g. gt_labels and gt_labels_ignore
            # A bbox field without registered label keys has no labels to
            # filter; fall back to an empty list instead of iterating None.
            for label_key in self.bbox2label.get(key, []):
                if label_key in results:
                    results[label_key] = results[label_key][valid_inds]

            # mask fields, e.g. gt_masks and gt_masks_ignore
            mask_key = self.bbox2mask.get(key)
            if mask_key in results:
                results[mask_key] = results[mask_key][
                    valid_inds.nonzero()[0]].crop(
                        np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))

        # crop semantic seg
        for key in results.get('seg_fields', []):
            results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2]
        return results

    def __call__(self, results):
        """Call function to sequentially randomly crop images, bounding boxes,
        masks, semantic segmentation maps.

        Args:
            results (list[dict]): List of result dicts from loading pipeline.

        Returns:
            list[dict] | None: Randomly cropped results, 'img_shape' key in
            each dict is updated according to crop size. None as soon as one
            element is rejected by :meth:`random_crop`.
        """
        if self.share_params:
            # One offset pair, sampled from the first image, for everyone.
            offsets = self.get_offsets(results[0]['img'])
        else:
            offsets = None

        outs = []
        for _results in results:
            _results = self.random_crop(_results, offsets)
            if _results is None:
                return None
            outs.append(_results)
        return outs
@PIPELINES.register_module()
class SeqPhotoMetricDistortion(object):
    """Apply photometric distortion to image sequentially, every transformation
    is applied with a probability of 0.5. The position of random contrast is in
    second or second to last.

    1. random brightness
    2. random contrast (when ``contrast_first`` is sampled True)
    3. convert color from BGR to HSV
    4. random saturation
    5. random hue
    6. convert color from HSV to BGR
    7. random contrast (when ``contrast_first`` is sampled False)
    8. randomly swap channels

    Args:
        share_params (bool): If True, sample one set of distortion
            parameters per call and use it for every image of the sequence.
            Defaults to True.
        brightness_delta (int): delta of brightness.
        contrast_range (tuple): range of contrast.
        saturation_range (tuple): range of saturation.
        hue_delta (int): delta of hue.
    """

    def __init__(self,
                 share_params=True,
                 brightness_delta=32,
                 contrast_range=(0.5, 1.5),
                 saturation_range=(0.5, 1.5),
                 hue_delta=18):
        self.share_params = share_params
        self.brightness_delta = brightness_delta
        self.contrast_lower, self.contrast_upper = contrast_range
        self.saturation_lower, self.saturation_upper = saturation_range
        self.hue_delta = hue_delta

    def get_params(self):
        """Generate parameters.

        Returns:
            dict: with keys 'delta', 'contrast_first', 'alpha', 'saturation',
            'hue' and 'permutation'. 'contrast_first' is always a bool; every
            other entry is None when that distortion is skipped.
        """
        params = dict()
        # delta
        if np.random.randint(2):
            params['delta'] = np.random.uniform(-self.brightness_delta,
                                                self.brightness_delta)
        else:
            params['delta'] = None
        # mode: stored as a real bool (it used to be ``True`` or int ``0``)
        mode = np.random.randint(2)
        params['contrast_first'] = bool(mode == 1)
        # alpha
        if np.random.randint(2):
            params['alpha'] = np.random.uniform(self.contrast_lower,
                                                self.contrast_upper)
        else:
            params['alpha'] = None
        # saturation
        if np.random.randint(2):
            params['saturation'] = np.random.uniform(self.saturation_lower,
                                                     self.saturation_upper)
        else:
            params['saturation'] = None
        # hue
        if np.random.randint(2):
            params['hue'] = np.random.uniform(-self.hue_delta, self.hue_delta)
        else:
            params['hue'] = None
        # swap
        if np.random.randint(2):
            params['permutation'] = np.random.permutation(3)
        else:
            params['permutation'] = None
        return params

    def photo_metric_distortion(self, results, params=None):
        """Call function to perform photometric distortion on images.

        Args:
            results (dict): Result dict from loading pipeline. 'img' must be
                of dtype np.float32.
            params (dict, optional): Pre-defined parameters as produced by
                :meth:`get_params`. When None, parameters are sampled here
                and recorded in ``results['img_info']['color_jitter']``.
                Defaults to None.

        Returns:
            dict: Result dict with images distorted.
        """
        if params is None:
            params = self.get_params()
            results['img_info']['color_jitter'] = params

        if 'img_fields' in results:
            assert results['img_fields'] == ['img'], \
                'Only single img_fields is allowed'
        img = results['img']
        assert img.dtype == np.float32, \
            'PhotoMetricDistortion needs the input image of dtype np.float32,'\
            ' please set "to_float32=True" in "LoadImageFromFile" pipeline'

        # random brightness (applied in place on the input array)
        if params['delta'] is not None:
            img += params['delta']

        # contrast_first is True  --> random contrast before the HSV steps
        # contrast_first is False --> random contrast after the HSV steps
        if params['contrast_first']:
            if params['alpha'] is not None:
                img *= params['alpha']

        # convert color from BGR to HSV
        img = mmcv.bgr2hsv(img)

        # random saturation
        if params['saturation'] is not None:
            img[..., 1] *= params['saturation']

        # random hue, wrapped back into [0, 360]
        if params['hue'] is not None:
            img[..., 0] += params['hue']
            img[..., 0][img[..., 0] > 360] -= 360
            img[..., 0][img[..., 0] < 0] += 360

        # convert color from HSV to BGR
        img = mmcv.hsv2bgr(img)

        # random contrast
        if not params['contrast_first']:
            if params['alpha'] is not None:
                img *= params['alpha']

        # randomly swap channels
        if params['permutation'] is not None:
            img = img[..., params['permutation']]

        results['img'] = img
        return results

    def __call__(self, results):
        """Call function to perform photometric distortion on images.

        Args:
            results (list[dict]): List of result dicts from loading pipeline.

        Returns:
            list[dict]: Result dicts with images distorted.
        """
        if self.share_params:
            params = self.get_params()
        else:
            # Each image samples its own parameters.
            params = None

        outs = []
        for _results in results:
            _results = self.photo_metric_distortion(_results, params)
            outs.append(_results)
        return outs

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
        repr_str += 'contrast_range='
        repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
        repr_str += 'saturation_range='
        repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
        repr_str += f'hue_delta={self.hue_delta})'
        return repr_str