tools/test.py
def main():
    """Run a detector over the test dataset, then optionally dump/evaluate.

    Reads the command-line arguments and the config file, builds the test
    dataset, runs inference on a single GPU (``single_test``) or on several
    GPUs (``parallel_test``), and, when an output file was requested, dumps
    the raw outputs and runs the requested COCO evaluations.

    Raises:
        ValueError: If ``args.out`` is given but is not a ``.pkl``/``.pickle``
            path.
    """
    args = parse_args()

    out_path = args.out
    if out_path is not None and not out_path.endswith(('.pkl', '.pickle')):
        raise ValueError('The output file must be a pkl file.')

    cfg = mmcv.Config.fromfile(args.config)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    # Clear the ``pretrained`` entry; weights are loaded from
    # ``args.checkpoint`` below.
    cfg.model.pretrained = None
    # Test mode.
    cfg.data.test.test_mode = True

    dataset = obj_from_dict(cfg.data.test, datasets, dict(test_mode=True))

    if args.gpus == 1:
        # Single-GPU path: build the detector here and iterate a dataloader.
        detector = build_detector(
            cfg.model, train_cfg=None, test_cfg=cfg.test_cfg)
        load_checkpoint(detector, args.checkpoint)
        model = MMDataParallel(detector, device_ids=[0])

        data_loader = build_dataloader(
            dataset,
            imgs_per_gpu=1,
            workers_per_gpu=cfg.data.workers_per_gpu,
            num_gpus=1,
            dist=False,
            shuffle=False)
        outputs = single_test(model, data_loader, args.show)
    else:
        # Multi-GPU path: hand the detector class and its constructor
        # arguments to parallel_test, which builds one model per worker.
        model_args = cfg.model.copy()
        model_args.update(train_cfg=None, test_cfg=cfg.test_cfg)
        detector_name = model_args.pop('type')
        model_type = getattr(detectors, detector_name)
        outputs = parallel_test(
            model_type,
            model_args,
            args.checkpoint,
            dataset,
            _data_func,
            range(args.gpus),
            workers_per_gpu=args.proc_per_gpu)

    # Dumping and evaluation both need the output path, so nothing more
    # happens when no output file was requested.
    if not out_path:
        return

    print('writing results to {}'.format(out_path))
    mmcv.dump(outputs, out_path)

    eval_types = args.eval
    if not eval_types:
        return

    print('Starting evaluate {}'.format(' and '.join(eval_types)))
    if eval_types == ['proposal_fast']:
        # This evaluation is fed the dumped pickle file directly.
        coco_eval(out_path, eval_types, dataset.coco)
        return

    if not isinstance(outputs[0], dict):
        result_file = out_path + '.json'
        results2json(dataset, outputs, result_file)
        coco_eval(result_file, eval_types, dataset.coco)
        return

    # Each output is a dict: evaluate every named entry separately, using
    # the keys of the first output.
    for name in outputs[0]:
        print('\nEvaluating {}'.format(name))
        named_outputs = [out[name] for out in outputs]
        result_file = out_path + '.{}.json'.format(name)
        results2json(dataset, named_outputs, result_file)
        coco_eval(result_file, eval_types, dataset.coco)
解釋一下obj_from_dict方法
def obj_from_dict(info, parrent=None, default_args=None):
    """Initialize an object from dict.

    The dict must contain the key "type", which indicates the object type. It
    can be either a string or a type, such as "list" or ``list``. Remaining
    fields are treated as the arguments for constructing the object.

    Args:
        info (dict): Object type and constructor arguments. It is copied, not
            modified.
        parrent (:class:`module`, optional): Module (or any object) on which
            a string "type" is looked up with ``getattr``. When omitted, a
            string "type" is resolved as a builtin name (``"list"``) or as a
            dotted import path (``"collections.OrderedDict"``).
        default_args (dict, optional): Default arguments for initializing the
            object. They never override keys already present in ``info``.

    Returns:
        any type: Object built from the dict.

    Raises:
        TypeError: If "type" is neither a string nor a type.
        AttributeError: If a string "type" is not an attribute of
            ``parrent`` (or of ``builtins`` / the imported module when
            ``parrent`` is omitted).
        ImportError: If the module part of a dotted "type" cannot be
            imported.
    """
    assert isinstance(info, dict) and 'type' in info
    assert isinstance(default_args, dict) or default_args is None
    args = info.copy()
    obj_type = args.pop('type')
    if isinstance(obj_type, str):
        if parrent is not None:
            obj_type = getattr(parrent, obj_type)
        else:
            # The previous fallback, ``sys.modules[obj_type]``, produced a
            # module object (not callable) or a KeyError, so a string type
            # without ``parrent`` could never be constructed.
            import builtins
            import importlib

            module_name, _, attr_name = obj_type.rpartition('.')
            if module_name:
                owner = importlib.import_module(module_name)
            else:
                owner = builtins
            obj_type = getattr(owner, attr_name)
    elif not isinstance(obj_type, type):
        raise TypeError('type must be a str or valid type, but got {}'.format(
            type(obj_type)))
    if default_args is not None:
        # Fill in defaults only for arguments the config did not set.
        for name, value in default_args.items():
            args.setdefault(name, value)
    return obj_type(**args)
輸入參數info,是一個cfg裏的字典,比如這裏的是test配置。
第二個參數是module,包含了期望的對象類別。
第三個是默認參數,用來初始化對象的
首先,判斷info是不是字典,而且裏面必須包含type關鍵字
默認參數也要檢查是字典或者爲None
然後,從字典中pop出type對應的值obj_type,
如果它是字符串類型,再判斷module(即代碼中的parrent)參數是否爲空
- 不爲空,執行就從
obj_type = getattr(parrent, obj_type)
。測試時,parrent是datasets文件夾,obj_type是cfg中的test字典的type
參數,所以相當於是從datasets文件夾裏,加載相應名稱的數據集讀取程序dataset
。
如果obj_type不是任何一種python類型,就報錯。
然後,如果默認參數不爲空,迭代讀取每個默認參數的key:value
,把這些字典添加到cfg中test字典的參數中(也就是args)。
最後返回一個數據集讀取的類 return obj_type(**args)
obj_type是dataset類,args是配置參數,是一個字典。
同理,這個函數也可以給定cfg,module,其他參數
三個輸入來初始化任何一個對象,以上是以加載測試dataset爲例
接着往下,如果gpu參數爲1,進入單卡程序。
調用build_detector
新建一個模型。
def build_detector(cfg, train_cfg=None, test_cfg=None):
    """Build a detector from ``cfg``.

    Thin wrapper around ``build`` that looks types up in the ``detectors``
    subpackage and supplies ``train_cfg``/``test_cfg`` as default arguments.
    """
    from . import detectors

    default_args = {'train_cfg': train_cfg, 'test_cfg': test_cfg}
    return build(cfg, detectors, default_args)
可以看到是針對建立detector的一個封裝
def build(cfg, parrent=None, default_args=None):
    """Build one module, or an ``nn.Sequential`` from a list of configs.

    Args:
        cfg: A single module config, or a list of them.
        parrent: Passed through to ``_build_module``.
        default_args: Passed through to ``_build_module``.

    Returns:
        The built module; for a list ``cfg``, an ``nn.Sequential`` wrapping
        the modules built from each entry, in order.
    """
    if not isinstance(cfg, list):
        return _build_module(cfg, parrent, default_args)

    modules = []
    for item in cfg:
        modules.append(_build_module(item, parrent, default_args))
    return nn.Sequential(*modules)
再找到build
函數,build函數可以處理放在list中的多個module的建立,再調用_build_module
def _build_module(cfg, parrent=None, default_args=None):
return cfg if isinstance(cfg, nn.Module) else obj_from_dict(
cfg, parrent, default_args)
這裏其實就是又用到了之前使用的obj_from_dict
函數,按照cfg配置返回一個detector的模塊對象。
只不過obj_from_dict函數
傳入的參數變成了 cfg=cfg.model , parrent=detectors , default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)
,
下一步
,加載參數ckp
接着調用MMDataParallel
將model放到GPU上
然後調用build_dataloader
,建立dataloader對象。
我們看一下這個函數
def build_dataloader(dataset,
imgs_per_gpu,
workers_per_gpu,
num_gpus=1,
dist=True,
**kwargs):
輸入是dataset對象、每個gpu的圖片數、每個gpu的worker(數據加載子進程)數、gpu的數量,以及是否分佈式。
- 如果是分佈式讀取數據
if dist:
rank, world_size = get_dist_info()
sampler = DistributedGroupSampler(dataset, imgs_per_gpu, world_size,
rank)
batch_size = imgs_per_gpu
num_workers = workers_per_gpu
獲取分佈式的信息,然後調用sampler中的DistributedGroupSampler
劃分數據集,得到sampler。
- 如果是單卡
else:
if not kwargs.get('shuffle', True):
sampler = None
else:
sampler = GroupSampler(dataset, imgs_per_gpu)
batch_size = num_gpus * imgs_per_gpu
num_workers = num_gpus * workers_per_gpu
調用GroupSampler
得到sampler。
解釋一下kwargs.get():
get(key[, default])
Return the value for key if key is in the dictionary, else default. If default
is not given, it defaults to None, so that this method never raises a KeyError.
然後,調用pytorch.utils.data中的DataLoader生成一個dataloader對象。
data_loader = DataLoader(
dataset,
batch_size=batch_size,
sampler=sampler,
num_workers=num_workers,
collate_fn=partial(collate, samples_per_gpu=imgs_per_gpu),
pin_memory=False,
**kwargs)
return data_loader
doc
:sampler (Sampler, optional) – defines the strategy to draw samples from the dataset. If specified, shuffle must be False.
文檔中說,sampler參數如果指定了,就必須設置shuffle=False。注意前面代碼的邏輯是:當傳入shuffle=False時(例如測試時),sampler被設爲None;只有未傳入shuffle(kwargs.get默認按True處理)或shuffle爲True時,才使用GroupSampler。
doc
:collate_fn (callable, optional) – merges a list of samples to form a mini-batch.
我們來看一下collate function
,
def collate(batch, samples_per_gpu=1):
    """Puts each data field into a tensor/DataContainer with outer dimension
    batch size.

    Extend default_collate to add support for
    :type:`~mmcv.parallel.DataContainer`. There are 3 cases.

    1. cpu_only = True, e.g., meta data
    2. cpu_only = False, stack = True, e.g., images tensors
    3. cpu_only = False, stack = False, e.g., gt bboxes

    Args:
        batch (Sequence): Samples to merge. Elements may be DataContainers,
            sequences or mappings (both handled recursively), or anything
            ``default_collate`` accepts.
        samples_per_gpu (int): Size of each group that DataContainer samples
            are split into; ``len(batch)`` must be a multiple of it.

    Returns:
        A DataContainer holding one entry per group, a list or dict of
        recursively collated fields, or the result of ``default_collate``.

    Raises:
        TypeError: If ``batch`` is not a sequence.
    """
    # The ``collections.Sequence`` / ``collections.Mapping`` aliases were
    # removed in Python 3.10; the ABCs live in ``collections.abc``.
    from collections.abc import Mapping, Sequence

    if not isinstance(batch, Sequence):
        # Report ``dtype`` when the object has one, otherwise its type, so
        # the error path itself cannot fail with AttributeError.
        raise TypeError("{} is not supported.".format(
            getattr(batch, 'dtype', type(batch))))

    if isinstance(batch[0], DataContainer):
        assert len(batch) % samples_per_gpu == 0
        stacked = []
        # Case 1: cpu_only data is grouped into plain lists.
        if batch[0].cpu_only:
            for i in range(0, len(batch), samples_per_gpu):
                stacked.append(
                    [sample.data for sample in batch[i:i + samples_per_gpu]])
            return DataContainer(
                stacked, batch[0].stack, batch[0].padding_value, cpu_only=True)
        # Case 2: stackable tensors are padded to a common size per group and
        # stacked with default_collate.
        elif batch[0].stack:
            for i in range(0, len(batch), samples_per_gpu):
                assert isinstance(batch[i].data, torch.Tensor)
                # TODO: handle tensors other than 3d
                assert batch[i].dim() == 3
                # Seed the target size from this group's first sample;
                # seeding from batch[0] would pad every later group up to
                # the first group's size as well.
                c, h, w = batch[i].size()
                for sample in batch[i:i + samples_per_gpu]:
                    assert c == sample.size(0)
                    h = max(h, sample.size(1))
                    w = max(w, sample.size(2))
                # Pad the last two dims at their end up to (h, w).
                padded_samples = [
                    F.pad(
                        sample.data,
                        (0, w - sample.size(2), 0, h - sample.size(1)),
                        value=sample.padding_value)
                    for sample in batch[i:i + samples_per_gpu]
                ]
                stacked.append(default_collate(padded_samples))
        # Case 3: non-stackable data is grouped into plain lists.
        else:
            for i in range(0, len(batch), samples_per_gpu):
                stacked.append(
                    [sample.data for sample in batch[i:i + samples_per_gpu]])
        return DataContainer(stacked, batch[0].stack, batch[0].padding_value)
    elif isinstance(batch[0], Sequence):
        # Transpose so that each field is collated across samples.
        transposed = zip(*batch)
        return [collate(samples, samples_per_gpu) for samples in transposed]
    elif isinstance(batch[0], Mapping):
        return {
            key: collate([d[key] for d in batch], samples_per_gpu)
            for key in batch[0]
        }
    else:
        return default_collate(batch)
#from torch.utils.data.dataloader import default_collate
collate
函數定義了四種情況處理。分別是DataContainer collections.Sequence collections.Mapping 其他
。
我們只看一下最關鍵的DataContainer的情況,因爲這個數據類型是我們自定義的一個類型。(會在dataloading部分進行專門講解)
三種情況
- cpu_only = True, e.g., meta data
將一個batch的數據按照samples_per_gpu的大小劃分成n個,每個都存在一個列表中,然後n個list放進一個大的list中(即stacked)
返回一個DataContainer對象return DataContainer( stacked, batch[0].stack, batch[0].padding_value, cpu_only=True)
- cpu_only = False, stack = True, e.g., images tensors
同樣以samples_per_gpu爲間隔循環,把batch劃分成n個minibatch。先取一個數據的大小c, h, w作爲初始值,
然後遍歷[i:i+samples_per_gpu]中的每個數據,把h、w更新爲這一組裏最大的高和寬;再用各自的padding_value把每個數據padding到(h, w)這個大小。最後調用torch中默認的default_collate
將一個minibatch大小的list數據變成image tensors。
最後得到n個這樣的image tensors,都加入到大的list stacked中,返回DataContainer對象
- cpu_only = False, stack = False, e.g., gt bboxes
操作與第一種情況一樣,不同在於返回的return DataContainer(stacked, batch[0].stack, batch[0].padding_value)
,cpu_only參數==False
所以這種情況返回的是GT bboxes
doc
:pin_memory (bool, optional) – If True, the data loader will copy tensors into CUDA pinned memory
before returning them.
對CUDA架構而言,主機端的內存被分爲兩種:一種是可分頁內存(pageable memory),另一種是
頁鎖定內存(page-locked或pinned memory)
。可分頁內存是由操作系統API malloc()在主機上分配的,頁鎖定內存是由CUDA函數cudaHostAlloc()在主機內存上分配的,頁鎖定內存的重要屬性
是主機的操作系統將不會對這塊內存進行分頁和交換操作,確保該內存始終駐留在物理內存中。
GPU知道頁鎖定內存的物理地址,可以通過“直接內存訪問(Direct Memory Access,DMA)”技術直接在主機和GPU之間複製數據,速率更快
。由於每個頁鎖定內存都需要分配物理內存,並且這些內存不能交換到磁盤上,所以頁鎖定內存比使用標準malloc()分配的可分頁內存更消耗內存空間
。
以上所有內容解決了gpu只有一個的情況,得到了model和data_loader,然後調用single_test
函數得到outputs。
ok.現在回到test.py
文件,下面是gpu num大於1的情況
先將cfg.model的參數複製到model_args,再用train_cfg=None和test_cfg=cfg.test_cfg更新它。
model_type = getattr(detectors, model_args.pop('type'))
這一步獲取對應的detector
類。
在model/detector文件init中,共定義了以下幾種detector:
__all__ = [ 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN', 'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'RetinaNet' ]
然後調用
outputs = parallel_test( model_type, model_args, args.checkpoint, dataset, _data_func, range(args.gpus), workers_per_gpu=args.proc_per_gpu)
我們看一下parallel_test函數
多進程基礎教程
直接在代碼中解釋
def parallel_test(model_cls,
                  model_kwargs,
                  checkpoint,
                  dataset,
                  data_func,
                  gpus,
                  workers_per_gpu=1):
    """Parallel testing on multiple GPUs.

    Args:
        model_cls (type): Model class type.
        model_kwargs (dict): Arguments to init the model.
        checkpoint (str): Checkpoint filepath.
        dataset (:obj:`Dataset`): The dataset to be tested.
        data_func (callable): The function that generates model inputs.
        gpus (list[int]): GPU ids to be used.
        workers_per_gpu (int): Number of processes on each GPU. It is possible
            to run multiple workers on each GPU.

    Returns:
        list: Test results, ordered by dataset index.
    """
    # Worker processes are created with the 'spawn' start method.
    ctx = multiprocessing.get_context('spawn')
    # Two inter-process queues: sample indices go out to the workers,
    # (index, result) pairs come back.
    idx_queue = ctx.Queue()
    result_queue = ctx.Queue()

    # Total number of worker processes across all GPUs.
    num_workers = len(gpus) * workers_per_gpu

    # Create every worker process first; workers are assigned to the GPUs
    # round-robin.
    workers = []
    for worker_idx in range(num_workers):
        gpu_id = gpus[worker_idx % len(gpus)]
        process = ctx.Process(
            target=worker_func,
            args=(model_cls, model_kwargs, checkpoint, dataset, data_func,
                  gpu_id, idx_queue, result_queue))
        workers.append(process)

    # Start the workers as daemonic processes, so they are terminated
    # together with the parent process.
    for process in workers:
        process.daemon = True
        process.start()

    # Enqueue every sample index for the workers to consume.
    num_samples = len(dataset)
    for sample_idx in range(num_samples):
        idx_queue.put(sample_idx)

    # One result slot per sample; results arrive in arbitrary order.
    results = [None] * num_samples
    prog_bar = mmcv.ProgressBar(task_num=num_samples)
    received = 0
    while received < num_samples:
        # Block until some worker delivers an (index, result) pair.
        idx, res = result_queue.get()
        results[idx] = res
        prog_bar.update()
        received += 1
    print('\n')

    # The workers loop forever, so stop them explicitly once all results
    # have been collected.
    for process in workers:
        process.terminate()
    return results
ok,那麼上面函數中最關鍵的,是創建進程時作爲target傳入的worker_func函數。
def worker_func(model_cls, model_kwargs, checkpoint, dataset, data_func,
                gpu_id, idx_queue, result_queue):
    """Worker loop: build the model on one GPU and serve inference requests.

    Args:
        model_cls (type): Model class to instantiate.
        model_kwargs (dict): Keyword arguments for ``model_cls``.
        checkpoint (str): Checkpoint filepath, loaded onto the CPU first.
        dataset: Indexable dataset; ``dataset[idx]`` yields one sample.
        data_func (callable): Called as ``data_func(sample, gpu_id)``; its
            result is unpacked as keyword arguments to the model.
        gpu_id (int): GPU this worker runs on.
        idx_queue: Queue of sample indices to process.
        result_queue: Queue receiving ``(idx, result)`` tuples.
    """
    # Instantiate the model and load the checkpoint weights on the CPU.
    detector = model_cls(**model_kwargs)
    load_checkpoint(detector, checkpoint, map_location='cpu')
    # Select this worker's GPU, move the model there, switch to eval mode.
    torch.cuda.set_device(gpu_id)
    detector.cuda()
    detector.eval()
    # Inference only: disable gradient tracking.
    with torch.no_grad():
        # This loop has no exit of its own: ``get()`` blocks while the index
        # queue is empty, and the process runs until it is terminated from
        # outside.
        while True:
            idx = idx_queue.get()
            model_inputs = data_func(dataset[idx], gpu_id)
            output = detector(**model_inputs)
            # Send the index back with the result so the receiver can put
            # it in the right slot.
            result_queue.put((idx, output))
data_func函數是怎麼樣的?
def _data_func(data, device_id):
    """Turn one dataset sample into keyword arguments for the model.

    The sample is collated as a batch of one, scattered to ``device_id``,
    and merged with ``return_loss=False`` and ``rescale=True``.
    """
    batch = collate([data], samples_per_gpu=1)
    on_device = scatter(batch, [device_id])[0]
    return dict(return_loss=False, rescale=True, **on_device)
datafunc函數是將data分配到多個gpu–>gpu_id。
看一下scatter
函數
#如果輸入時list類型,那麼對list中每一項都遞歸地做scatter。
def scatter(input, devices, streams=None):
    """Scatters tensor across multiple GPUs.

    Args:
        input (list | torch.Tensor): A tensor, or a (possibly nested) list
            of tensors.
        devices (list[int]): Target device ids. A list is split into
            contiguous chunks, one chunk per device; a tensor goes to
            ``devices[0]``.
        streams (list, optional): One CUDA stream (or None) per device.

    Returns:
        The input with the same nesting, each tensor copied to its device.

    Raises:
        Exception: If ``input`` is neither a list nor a tensor.
    """
    if streams is None:
        streams = [None for _ in devices]

    if isinstance(input, list):
        # Ceil-divide so that the items are spread over at most
        # len(devices) chunks.
        chunk_size = (len(input) - 1) // len(devices) + 1
        # Recursively scatter each item to the device owning its chunk.
        outputs = []
        for position, item in enumerate(input):
            slot = position // chunk_size
            outputs.append(scatter(item, [devices[slot]], [streams[slot]]))
        return outputs

    if isinstance(input, torch.Tensor):
        tensor = input.contiguous()
        # TODO: copy to a pinned buffer first (if copying from CPU)
        stream = None if tensor.numel() == 0 else streams[0]
        with torch.cuda.device(devices[0]), torch.cuda.stream(stream):
            tensor = tensor.cuda(devices[0], non_blocking=True)
        return tensor

    raise Exception('Unknown type {}.'.format(type(input)))
最後一部分
結果寫入out文件
if args.out:
print('writing results to {}'.format(args.out))
mmcv.dump(outputs, args.out)
eval_types = args.eval
if eval_types:
print('Starting evaluate {}'.format(' and '.join(eval_types)))
if eval_types == ['proposal_fast']:
result_file = args.out
coco_eval(result_file, eval_types, dataset.coco)
else:
if not isinstance(outputs[0], dict):
result_file = args.out + '.json'
results2json(dataset, outputs, result_file)
coco_eval(result_file, eval_types, dataset.coco)
else:
for name in outputs[0]:
print('\nEvaluating {}'.format(name))
outputs_ = [out[name] for out in outputs]
result_file = args.out + '.{}.json'.format(name)
results2json(dataset, outputs_, result_file)
coco_eval(result_file, eval_types, dataset.coco)
將之前得到的outputs通過mmcv/io.py中的dump函數寫入輸出文件。
接着,下面的大部分是evaluation,調用了coco API做eval,暫時不詳細解釋。