1.collate
用來拼接batch中的數據。與標準的pytorch中的default_collate不同,這裏的collate返回的是一個列表,每個列表中的元素是一個minibatch,應該是爲了用於多個gpu,每個gpu上運行一個minibatch。
collate支持對於DataContainer數據類型的操作,
對於meta data,直接拼成minibatch,返回
對於圖像,先pad成同樣大小,再拼成minibatch,返回
對於bbox,直接拼接,返回。
一般來說,distributed爲FALSE,進而使用**_non_dist_train**,進而build_dataloader的dist參數爲FALSE,
所以build_dataloader中sampler和batch_size等是這麼定義的:
else:
sampler = GroupSampler(dataset, imgs_per_gpu) if shuffle else None
batch_size = num_gpus * imgs_per_gpu
num_workers = num_gpus * workers_per_gpu
def collate(batch, samples_per_gpu=1):
    """Puts each data field into a tensor/DataContainer with outer dimension
    batch size.

    Extend default_collate to add support for
    :type:`~mmcv.parallel.DataContainer`. There are 3 cases.

    1. cpu_only = True, e.g., meta data
    2. cpu_only = False, stack = True, e.g., images tensors
    3. cpu_only = False, stack = False, e.g., gt bboxes

    Unlike the stock ``default_collate``, a batch of DataContainers is split
    into consecutive chunks of ``samples_per_gpu`` items and the result holds
    one entry per chunk.

    Args:
        batch (Sequence): Samples to merge. Items may be DataContainers,
            nested sequences/mappings of them, or anything that
            ``default_collate`` accepts.
        samples_per_gpu (int): Number of samples in each chunk. For
            DataContainer batches ``len(batch)`` must be a multiple of it.

    Returns:
        A DataContainer wrapping the list of chunks, a list/dict of
        recursively collated fields, or the ``default_collate`` result.

    Raises:
        TypeError: If ``batch`` is not a sequence.
    """
    # Imported locally: the ABC aliases on the top-level ``collections``
    # module (``collections.Sequence`` etc.) were removed in Python 3.10.
    from collections.abc import Mapping, Sequence

    if not isinstance(batch, Sequence):
        # Keep reporting ``dtype`` for array-likes, but do not crash with an
        # AttributeError on objects that have none.
        raise TypeError("{} is not supported.".format(
            getattr(batch, 'dtype', type(batch))))

    if isinstance(batch[0], DataContainer):
        assert len(batch) % samples_per_gpu == 0
        stacked = []
        if batch[0].cpu_only:
            # Case 1: payloads are kept as plain Python objects and only
            # grouped into chunks: [[d1, d2, ...], [d3, d4, ...], ...].
            for i in range(0, len(batch), samples_per_gpu):
                stacked.append(
                    [sample.data for sample in batch[i:i + samples_per_gpu]])
            return DataContainer(
                stacked, batch[0].stack, batch[0].padding_value, cpu_only=True)
        elif batch[0].stack:
            # Case 2: tensors of one chunk are padded to a common shape and
            # then stacked with default_collate.
            for i in range(0, len(batch), samples_per_gpu):
                assert isinstance(batch[i].data, torch.Tensor)
                chunk = batch[i:i + samples_per_gpu]
                pad_dims = batch[i].pad_dims

                if pad_dims is None:
                    # No padding requested: shapes must already agree.
                    stacked.append(
                        default_collate([sample.data for sample in chunk]))
                    continue

                ndim = batch[i].dim()
                assert ndim > pad_dims
                # max_shape[k] is the largest size of dimension -(k + 1)
                # within the chunk, i.e. the trailing sizes in reverse order.
                max_shape = [
                    batch[i].size(-dim) for dim in range(1, pad_dims + 1)
                ]
                for sample in chunk:
                    # The leading (non-padded) dimensions must match exactly.
                    for dim in range(0, ndim - pad_dims):
                        assert batch[i].size(dim) == sample.size(dim)
                    for dim in range(1, pad_dims + 1):
                        max_shape[dim - 1] = max(max_shape[dim - 1],
                                                 sample.size(-dim))

                padded_samples = []
                for sample in chunk:
                    # F.pad takes (before, after) pairs starting from the
                    # last dimension; only the "after" slots are filled, so
                    # every sample grows at its end.
                    pad = [0] * (pad_dims * 2)
                    for dim in range(1, pad_dims + 1):
                        pad[2 * dim - 1] = (
                            max_shape[dim - 1] - sample.size(-dim))
                    padded_samples.append(
                        F.pad(sample.data, pad, value=sample.padding_value))
                stacked.append(default_collate(padded_samples))
        else:
            # Case 3: payloads (e.g. gt bboxes) are only grouped into chunks,
            # never stacked.
            for i in range(0, len(batch), samples_per_gpu):
                stacked.append(
                    [sample.data for sample in batch[i:i + samples_per_gpu]])
        return DataContainer(stacked, batch[0].stack, batch[0].padding_value)

    # Recurse into containers, mirroring default_collate. Strings are
    # excluded: a str is a Sequence of str, so recursing would never end.
    if (isinstance(batch[0], Sequence)
            and not isinstance(batch[0], (str, bytes))):
        transposed = zip(*batch)
        return [collate(samples, samples_per_gpu) for samples in transposed]
    if isinstance(batch[0], Mapping):
        return {
            key: collate([d[key] for d in batch], samples_per_gpu)
            for key in batch[0]
        }
    # Everything else is handled by the stock implementation.
    return default_collate(batch)
pytorch自帶的default_collate解釋如下:
def default_collate(batch):
    """Merge a list of samples into a single batch.

    The type of the first sample selects the strategy; containers are
    handled recursively so that matching leaves of every sample end up
    collated together.

    Args:
        batch: Non-empty sequence of samples of the same kind.

    Returns:
        A tensor for tensors, numpy arrays/scalars and Python numbers; the
        unchanged ``batch`` for strings; a dict, namedtuple or list of
        recursively collated fields for containers.

    Raises:
        TypeError: If the sample type is not supported.
    """
    first = batch[0]
    first_type = type(first)
    type_name = first_type.__name__

    if isinstance(first, torch.Tensor):
        target = None
        if torch.utils.data.get_worker_info() is not None:
            # Inside a background worker: stack straight into a
            # shared-memory tensor so that no extra copy is needed.
            total_elems = sum(t.numel() for t in batch)
            target = first.new(first.storage()._new_shared(total_elems))
        return torch.stack(batch, 0, out=target)

    if first_type.__module__ == 'numpy' and type_name not in ('str_',
                                                              'string_'):
        if type_name == 'ndarray':
            # Arrays of strings or objects cannot become tensors.
            if np_str_obj_array_pattern.search(first.dtype.str) is not None:
                raise TypeError(
                    default_collate_err_msg_format.format(first.dtype))
            # Convert to tensors, then take the tensor path above.
            return default_collate([torch.as_tensor(arr) for arr in batch])
        if first.shape == ():
            # numpy scalars
            return torch.as_tensor(batch)
        # Any other numpy object falls through to the TypeError below.
    elif isinstance(first, float):
        return torch.tensor(batch, dtype=torch.float64)
    elif isinstance(first, int_classes):
        return torch.tensor(batch)
    elif isinstance(first, string_classes):
        # Strings are returned untouched.
        return batch
    elif isinstance(first, container_abcs.Mapping):
        # Collate per key, e.g. [{'a': 1.0}, {'a': 2.0}] becomes
        # {'a': tensor([1., 2.])}.
        return {
            key: default_collate([sample[key] for sample in batch])
            for key in first
        }
    elif isinstance(first, tuple) and hasattr(first, '_fields'):
        # namedtuple: collate field-wise and rebuild the same type.
        return first_type(
            *(default_collate(field) for field in zip(*batch)))
    elif isinstance(first, container_abcs.Sequence):
        # Generic sequence: zip groups the items at the same position of
        # every sample, and each group is collated recursively.
        return [default_collate(group) for group in zip(*batch)]

    raise TypeError(default_collate_err_msg_format.format(first_type))
舉一個例子:
batch = [{'name': 'a', 'bbox': [[[1, 2], [3, 4]]]},
{'name': 'b', 'bbox': [[[5, 6], [7, 8]]]}]
則首先batch的元素是dict,因此先按key集合成{'name': ['a', 'b'], 'bbox': [[[[1, 2], [3, 4]]], [[[5, 6], [7, 8]]]]}
再對每個value進行collate:字符串序列保持不變;bbox若是Tensor則直接stack到一起,若是嵌套的list(如本例)則逐層zip,把對應位置的數字拼成Tensor。
更一般的例子:
如果每個elem是一個樹形結構,則default_collate會遞歸的調用,將每棵樹對應的部分(string,Tensor等)使用collate拼接起來。
2. sampler
就如collate一開始所說,一般dist=False, 因此採用GroupSampler。
其有一些屬性。
dataset:數據集
samples_per_gpu:每張gpu的樣本個數
flag:長寬比是否大於1的0, 1序列,將data分爲了兩組。在customdataset中定義
group_sizes:上面劃分的兩組的組內個數
num_samples:對group_sizes中的每個數先除以samples_per_gpu,向上取整後再乘以samples_per_gpu,最後把各組的結果相加。得到的是總體數目(可被samples_per_gpu整除)。
GroupSampler的作用是給出一個index,這個index是總體dataset成員的index,能夠隨機取出dataset中的元素。在dataloader的初始化過程中,sampler就被轉化爲了iter,固定了下來。
class GroupSampler(Sampler):
    """Sampler that yields indices in same-group chunks of fixed size.

    Samples are grouped by ``dataset.flag`` (per the surrounding notes, a
    0/1 marker of whether the image aspect ratio exceeds 1). Every group is
    padded with repeated indices until its size is a multiple of
    ``samples_per_gpu``, so each consecutive run of ``samples_per_gpu``
    indices comes from a single group.

    Args:
        dataset: Object exposing a ``flag`` numpy array of non-negative
            integer group ids, one per sample.
        samples_per_gpu (int): Chunk size.
    """

    def __init__(self, dataset, samples_per_gpu=1):
        assert hasattr(dataset, 'flag')
        self.dataset = dataset
        self.samples_per_gpu = samples_per_gpu
        self.flag = dataset.flag.astype(np.int64)
        self.group_sizes = np.bincount(self.flag)
        # Total length after rounding every group up to a whole number of
        # chunks.
        self.num_samples = 0
        for size in self.group_sizes:
            self.num_samples += int(np.ceil(
                size / self.samples_per_gpu)) * self.samples_per_gpu

    def __iter__(self):
        """Return an iterator over one freshly shuffled pass of indices."""
        indices = []
        for i, size in enumerate(self.group_sizes):
            if size == 0:
                continue
            # Dataset indices that belong to group ``i``.
            indice = np.where(self.flag == i)[0]
            assert len(indice) == size
            np.random.shuffle(indice)
            num_extra = int(np.ceil(size / self.samples_per_gpu)
                            ) * self.samples_per_gpu - len(indice)
            # Pad by repeating the first ``num_extra`` shuffled indices so
            # that the group fills a whole number of chunks.
            indice = np.concatenate([indice, indice[:num_extra]])
            indices.append(indice)

        if not indices:
            # Empty dataset: np.concatenate([]) would raise, while __len__
            # already reports 0.
            return iter([])

        indices = np.concatenate(indices)
        # Shuffle whole chunks so the groups are interleaved while every
        # chunk keeps indices from one group only.
        chunk_order = np.random.permutation(
            range(len(indices) // self.samples_per_gpu))
        indices = indices.reshape(-1, self.samples_per_gpu)[chunk_order]
        indices = indices.ravel().astype(np.int64).tolist()
        assert len(indices) == self.num_samples
        return iter(indices)

    def __len__(self):
        """Number of indices produced by one pass (including padding)."""
        return self.num_samples
3. DataSets
基礎類CustomDataset,繼承自pytorch的Dataset類。
data的數據結構爲:
Annotation format:
[
{
'filename': 'a.jpg',
'width': 1280,
'height': 720,
'ann': {
'bboxes': <np.ndarray> (n, 4),
'labels': <np.ndarray> (n, ),
'bboxes_ignore': <np.ndarray> (k, 4),
'labels_ignore': <np.ndarray> (k, ) (optional field)
}
},
...
]
__init__部分出現的屬性:
img_infos: 圖片的信息,通過cocoapi讀入
proposals:None
ImageTransform:用來對圖像、mask進行尺度變化、翻轉、正規化。
BboxTransform:根據圖像尺寸rescale bbox,翻轉。
img_ids:出現在CocoDataset類中,爲self.coco.getImgIds()
4. 數據流
4.1. runner.train從data_loader中讀出data_batch
data_batch是一個dict,
其中
img_meta的data爲包含兩個元素的list,元素是包含兩個元素的list,每個元素是一個dict,包含了如下信息:
img的data爲包含兩個元素的list,元素是2 * 3 * 1216 * 800的Tensor
gt_bboxes的data爲包含兩個元素的list,每個元素是一個list,其中有兩個Tensor,大小爲n * 4
gt_labels的data爲包含兩個元素的list,每個元素是一個list,其中有兩個Tensor,大小爲n
4.2.data_batch經由batch_processor進入model
def batch_processor(model, data, train_mode):
    """Run one forward pass and package the loss for the runner.

    Args:
        model: Callable invoked as ``model(**data)``; returns raw losses.
        data (dict): Batch fields; ``data['img'].data`` supplies the
            sample count.
        train_mode: Not used by this function.

    Returns:
        dict: ``loss``, ``log_vars`` and ``num_samples``.
    """
    total_loss, logged = parse_losses(model(**data))
    return {
        'loss': total_loss,
        'log_vars': logged,
        'num_samples': len(data['img'].data),
    }
pytorch並行的參考資料:
https://blog.csdn.net/zzlyw/article/details/78769012
https://blog.csdn.net/weixin_40087578/article/details/87186613,以下過程參考這張圖:
4.3. data進入DataParallel(Pytorch的)
由於model外層套着DataParallel類,因此先使用DataParallel類的forward。
4.3. data進入MMDataParallel(繼承自DataParallel(Pytorch的))
由於model外層套着MMDataParallel類,因此先使用MMDataParallel類的forward。以下爲MMDataParallel的forward,與DataParallel的forward相同。
def forward(self, *inputs, **kwargs):
    """Run the wrapped module, spreading the call over ``device_ids``.

    With no devices configured the module is called directly. With one
    device the scattered arguments of that device are passed to the module.
    Otherwise the module is replicated, applied in parallel, and the outputs
    are gathered on ``output_device``.

    Raises:
        RuntimeError: If a parameter or buffer of the module is not on
            ``src_device_obj``.
    """
    if not self.device_ids:
        return self.module(*inputs, **kwargs)

    # Every parameter and buffer has to live on the source device.
    for tensor in chain(self.module.parameters(), self.module.buffers()):
        if tensor.device == self.src_device_obj:
            continue
        raise RuntimeError(
            "module must have its parameters and buffers on device {} "
            "(device_ids[0]) but found one of them on device: {}".format(
                self.src_device_obj, tensor.device))

    inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
    if len(self.device_ids) == 1:
        return self.module(*inputs[0], **kwargs[0])

    used_devices = self.device_ids[:len(inputs)]
    replicas = self.replicate(self.module, used_devices)
    per_device_outputs = self.parallel_apply(replicas, inputs, kwargs)
    return self.gather(per_device_outputs, self.output_device)
data轉換爲inputs,kwargs,交由scatter來分散到不同的GPU上。
4.4. inputs, kwargs傳入scatter(mmcv重寫了該函數)
scatter(MMDataParallel的成員函數)->scatter_kwargs(依然是mmcv重寫)->scatter函數(mmcv中,在scatter_gather.py中)-> Scatter靜態類的forward函數(mmcv中)->scatter函數(mmcv中,在_functions.py中)
第一個scatter函數,其中定義了一個函數scatter_map,對於dict,它通過map將scatter_map遞歸的作用於其中的元素,並進行了一些操作,最終得到了一個列表,列表中有len(target_gpus)個數個元素,每個元素對應於一個GPU。
總的來說,作用是獲得不同設備上的data。
def scatter(inputs, target_gpus, dim=0):
    """Scatter inputs to target gpus.

    Works like the original :func:`scatter` but additionally understands
    :type:`~mmcv.parallel.DataContainer`.

    Args:
        inputs: Tensor, DataContainer, or an arbitrarily nested
            tuple/list/dict of them.
        target_gpus: Target devices; the result has one entry per device.
        dim (int): Dimension passed on when scattering plain tensors.

    Returns:
        A sequence with one item per target device, each mirroring the
        structure of ``inputs``.
    """

    def scatter_map(obj):
        if isinstance(obj, torch.Tensor):
            return OrigScatter.apply(target_gpus, None, dim, obj)
        if isinstance(obj, DataContainer):
            # cpu_only payloads are handed back as they are; everything
            # else goes through the DataContainer-aware Scatter.
            if obj.cpu_only:
                return obj.data
            return Scatter.forward(target_gpus, obj.data)
        if isinstance(obj, tuple) and len(obj) > 0:
            # Scatter every member, then regroup the pieces per device.
            return list(zip(*(scatter_map(item) for item in obj)))
        if isinstance(obj, list) and len(obj) > 0:
            per_device = zip(*(scatter_map(item) for item in obj))
            return [list(pieces) for pieces in per_device]
        if isinstance(obj, dict) and len(obj) > 0:
            per_device = zip(*(scatter_map(pair) for pair in obj.items()))
            return [type(obj)(pairs) for pairs in per_device]
        # Anything else (including empty containers) is handed to every
        # device unchanged.
        return [obj for _ in target_gpus]

    # scatter_map is recursive, so its closure cell refers to the function,
    # which in turn refers to the cell. Resetting the name to None afterwards
    # breaks that reference cycle.
    try:
        scattered = scatter_map(inputs)
    finally:
        scatter_map = None
    return scattered
4.5. 上面流程中最後的scatter函數(分配gpu,返回整體)
上面流程中最後的scatter函數如下:
def scatter(input, devices, streams=None):
    """Scatters tensor across multiple GPUs.

    Args:
        input: A tensor, or an arbitrarily nested list of tensors.
        devices: Target devices. A list input is cut into contiguous runs
            of equal length, one run per device; a tensor goes to
            ``devices[0]``.
        streams: Optional streams aligned with ``devices``; defaults to
            ``None`` for every device.

    Returns:
        The input structure with every tensor moved to its device.

    Raises:
        Exception: If ``input`` is neither a list nor a tensor.
    """
    if streams is None:
        streams = [None] * len(devices)

    if isinstance(input, list):
        # Ceil division: how many items each device receives.
        chunk_size = (len(input) - 1) // len(devices) + 1
        moved = []
        for position, item in enumerate(input):
            slot = position // chunk_size
            moved.append(scatter(item, [devices[slot]], [streams[slot]]))
        return moved

    if isinstance(input, torch.Tensor):
        output = input.contiguous()
        # TODO: copy to a pinned buffer first (if copying from CPU)
        if output.numel() > 0:
            stream = streams[0]
        else:
            stream = None
        with torch.cuda.device(devices[0]), torch.cuda.stream(stream):
            output = output.cuda(devices[0], non_blocking=True)
        return output

    raise Exception('Unknown type {}.'.format(type(input)))
其作用是通過遞歸的方式來將數據分散到各個設備上。
例子:
input = [tensor([[1, 2], [3, 4]]), tensor([[5, 6], [7, 8]])]
devices = [0, 1]
scatter檢測到是列表,遞歸調用:
第一次input[0] = tensor([[1, 2], [3, 4]]), devices=[0],檢測到是Tensor,通過 output = output.cuda(devices[0], non_blocking=True)來分配到cuda:0上,並返回對象
第二次同理,分配到cuda:1上,返回對象
scatter返回列表,元素與之前相同,但是設備已經不一樣了,一個在0上,一個在1上。
4.6前. self.module傳入replicate(pytorch中)中,獲得各個gpu上的模型
對應於DataParallel類的forward中的
replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
得到的replicas 是一個列表,包含多個gpu上的模型
4.6. inputs, kwargs(列表)傳入parallel_apply函數
大致思路就是上面那張圖,將kwargs_tup的各個元素分配到modules的各個元素上,運行。最後還存在一個彙總的步驟,就不寫了。
def parallel_apply(modules, inputs, kwargs_tup=None, devices=None):
    r"""Applies each `module` in :attr:`modules` in parallel on arguments
    contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword)
    on each of :attr:`devices`.

    Args:
        modules (Module): modules to be parallelized
        inputs (tensor): inputs to the modules
        kwargs_tup (sequence of dict, optional): keyword arguments, one dict
            per module; defaults to an empty dict for every module
        devices (list of int or torch.device): CUDA devices

    :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and
    :attr:`devices` (if given) should all have same length. Moreover, each
    element of :attr:`inputs` can either be a single object as the only argument
    to a module, or a collection of positional arguments.

    Returns:
        list: the output of every module, in the order of :attr:`modules`.
        If a replica failed, its captured exception is re-raised here
        instead.
    """
    assert len(modules) == len(inputs)
    if kwargs_tup is not None:
        assert len(modules) == len(kwargs_tup)
    else:
        # The same empty dict object is repeated for every module.
        kwargs_tup = ({},) * len(modules)
    if devices is not None:
        assert len(modules) == len(devices)
    else:
        devices = [None] * len(modules)
    # NOTE(review): the second argument presumably marks the device as
    # optional so that None is accepted - confirm against _get_device_index.
    devices = list(map(lambda x: _get_device_index(x, True), devices))
    # `results` is written from the worker threads, keyed by replica index;
    # `lock` guards those writes.
    lock = threading.Lock()
    results = {}
    # Captured in the calling thread and re-applied inside each worker
    # (presumably because grad mode is per-thread - TODO confirm).
    grad_enabled = torch.is_grad_enabled()

    def _worker(i, module, input, kwargs, device=None):
        # Runs replica `i` and stores its output, or the wrapped exception,
        # in results[i].
        torch.set_grad_enabled(grad_enabled)
        if device is None:
            # Fall back to the device of a variable found in the input.
            device = get_a_var(input).get_device()
        try:
            with torch.cuda.device(device):
                # this also avoids accidental slicing of `input` if it is a Tensor
                if not isinstance(input, (list, tuple)):
                    input = (input,)
                output = module(*input, **kwargs)
            with lock:
                results[i] = output
        except Exception:
            # Do not let the thread die silently: record the failure so the
            # caller can re-raise it after joining.
            with lock:
                results[i] = ExceptionWrapper(
                    where="in replica {} on device {}".format(i, device))

    if len(modules) > 1:
        # One thread per replica; start them all, then wait for all.
        threads = [threading.Thread(target=_worker,
                                    args=(i, module, input, kwargs, device))
                   for i, (module, input, kwargs, device) in
                   enumerate(zip(modules, inputs, kwargs_tup, devices))]

        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
    else:
        # A single replica is run inline, without spawning a thread.
        _worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0])

    # Collect outputs in replica order, surfacing the first recorded failure.
    outputs = []
    for i in range(len(inputs)):
        output = results[i]
        if isinstance(output, ExceptionWrapper):
            output.reraise()
        outputs.append(output)
    return outputs