0.背景
2018年coco挑戰賽亞軍,pipeline非常簡潔,VGG16+額外幾層反捲積層,輸出關節點熱力圖。主要了解pipeline構建過程,學習一下代碼怎麼寫。https://github.com/microsoft/human-pose-estimation.pytorch
1.相關知識
1.1 關鍵點數量
17個關鍵點,具體如下。所以網絡層最終輸出 B* 17* Heatmap_H* Heatmap_W
'''
"keypoints": {
0: "nose",
1: "left_eye",
2: "right_eye",
3: "left_ear",
4: "right_ear",
5: "left_shoulder",
6: "right_shoulder",
7: "left_elbow",
8: "right_elbow",
9: "left_wrist",
10: "right_wrist",
11: "left_hip",
12: "right_hip",
13: "left_knee",
14: "right_knee",
15: "left_ankle",
16: "right_ankle"
},
"skeleton": [
[16,14],[14,12],[17,15],[15,13],[12,13],[6,12],[7,13], [6,7],[6,8],
[7,9],[8,10],[9,11],[2,3],[1,2],[1,3],[2,4],[3,5],[4,6],[5,7]]
'''
1.2 網絡結構
self.features = nn.Sequential(*features)
#原先是VGG16,224-->7.1/32,更換爲MobilenetV2, 7*7*1280
self.deconv_layers = self._make_deconv_layer(
extra.NUM_DECONV_LAYERS,
extra.NUM_DECONV_FILTERS,
extra.NUM_DECONV_KERNELS,
) #1/32-->1/4 channal=256
self.final_layer = nn.Conv2d(
in_channels=extra.NUM_DECONV_FILTERS[-1],
out_channels=cfg.MODEL.NUM_JOINTS,
kernel_size=extra.FINAL_CONV_KERNEL,
stride=1,
padding=1 if extra.FINAL_CONV_KERNEL == 3 else 0
) #1/4,channal=256-->17. 默認輸入3*256*192,輸出17*64*48
部分初始化參數
def init_weights(self, pretrained=''):
    """Initialize deconv/final-layer weights, then load a pretrained checkpoint.

    Args:
        pretrained: path to a checkpoint file.

    Raises:
        ValueError: if ``pretrained`` is not an existing file.
        RuntimeError: if the checkpoint contains no usable state dict.

    The checkpoint may be a plain ``OrderedDict`` state dict, or a dict
    holding one under the ``'state_dict'`` key; in the latter case the
    ``'module.'`` prefix added by ``nn.DataParallel`` is stripped from
    every key before loading.
    """
    if not os.path.isfile(pretrained):
        logger.error('=> imagenet pretrained model does not exist')
        logger.error('=> please download it first')
        raise ValueError('imagenet pretrained model does not exist')

    logger.info('=> init deconv weights from normal distribution')
    for name, m in self.deconv_layers.named_modules():
        if isinstance(m, nn.ConvTranspose2d):
            logger.info('=> init {}.weight as normal(0, 0.001)'.format(name))
            logger.info('=> init {}.bias as 0'.format(name))
            nn.init.normal_(m.weight, std=0.001)
            if self.deconv_with_bias:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.BatchNorm2d):
            logger.info('=> init {}.weight as 1'.format(name))
            logger.info('=> init {}.bias as 0'.format(name))
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)

    logger.info('=> init final conv weights from normal distribution')
    # BUG FIX: the original iterated final_layer.modules() but logged the
    # stale ``name`` left over from the deconv loop above; iterate
    # named_modules() so the logged name refers to the current module.
    for name, m in self.final_layer.named_modules():
        if isinstance(m, nn.Conv2d):
            logger.info('=> init {}.weight as normal(0, 0.001)'.format(name))
            logger.info('=> init {}.bias as 0'.format(name))
            nn.init.normal_(m.weight, std=0.001)
            nn.init.constant_(m.bias, 0)

    logger.info('=> loading pretrained model {}'.format(pretrained))
    checkpoint = torch.load(pretrained)
    if isinstance(checkpoint, OrderedDict):
        state_dict = checkpoint
    elif isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
        state_dict_old = checkpoint['state_dict']
        state_dict = OrderedDict()
        # Strip the 'module.' prefix: the checkpoint was saved from a model
        # wrapped in nn.DataParallel, whose keys are prefixed with 'module.'.
        # BUG FIX: a stray ``model.load_state_dict(checkpoint)`` call inside
        # this loop referenced an undefined ``model`` and has been removed.
        for key in state_dict_old.keys():
            if key.startswith('module.'):
                state_dict[key[7:]] = state_dict_old[key]
            else:
                state_dict[key] = state_dict_old[key]
    else:
        raise RuntimeError(
            'No state_dict found in checkpoint file {}'.format(pretrained))
    # strict=False: only keys present in this model are loaded, so a
    # checkpoint from a slightly different architecture still works.
    self.load_state_dict(state_dict, strict=False)
1.3 文件夾佈局
重要的幾個
--core
--__init__.py
--config.py #用easydict儲存配置信息
--evaluate.py #計算PCK0.5,得出各關節點在一個Batch的ACC
--function.py ##定義 train()和 val()
--inference.py ##get_max_preds(),輸入 b*64*32, 輸出b*17*2(峯值座標,有的爲0),b*17*1(score)
--loss.py ##定義loss類,forward中實現 L2loss,輸出(1,)
--dataset
--__init__.py
--coco.py ##自定義數據類coco,繼承joint類,加載GT在自身中。還包含evaluation函數計算AP,最重要
--Joint.py ##這裏定義 __getitem__
--model
--pose.py ##定義網絡
--utils.py
--transform.py ##定義 flip、仿射變換等數據增強
--vis.py ##展示圖片
--utils.py ##定義logger,optimizer等操作
2.數據加載
2.1 coco類介紹
info、image、license 爲共享字段;annotation 的數量最多。
images數組和annotations數組的元素數量是不相等的,annotation數量多於圖片,每個圖片裏的每一個對象有一個自己的id,且有對應image的image_id。categories裏只有一個人類。
annotion:
2.2 自定義coco數據類
self.image_set_index = self._load_image_set_index() #[1122,1212,12121,...] int array,存有train or val所有的圖片id
self.db = self._get_db() #[{},{}],keypoint x,y,vision, center,一個{}就是一個人的信息,很全面
def __getitem__(self, idx):
    """Load one training/validation sample.

    Returns (input, target, target_weight, meta): the normalized input
    image tensor, per-joint target heatmaps, per-joint weights (zeroed
    for missing/out-of-bounds joints), and a metadata dict.
    """
    db_rec = copy.deepcopy(self.db[idx])  # one {} = one person's full record
    image_file = db_rec['image']  # path like 'xx/.jpg'
    filename = db_rec['filename'] if 'filename' in db_rec else ''
    imgnum = db_rec['imgnum'] if 'imgnum' in db_rec else ''  # e.g. 0
    if self.data_format == 'zip':
        from utils import zipreader
        data_numpy = zipreader.imread(
            image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    else:
        data_numpy = cv2.imread(
            image_file, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
    if data_numpy is None:
        logger.error('=> fail to read {}'.format(image_file))
        raise ValueError('Fail to read {}'.format(image_file))
    joints = db_rec['joints_3d']  # per joint [x, y, 0]
    joints_vis = db_rec['joints_3d_vis']  # per joint [0,0,0] or [1,1,0]
    c = db_rec['center']  # e.g. [111, 222]
    s = db_rec['scale']  # e.g. [1.1, 2.3]; person h,w divided by 200
    score = db_rec['score'] if 'score' in db_rec else 1
    r = 0
    if self.is_train:
        sf = self.scale_factor  # e.g. 0.3
        rf = self.rotation_factor  # e.g. 40
        # Random scale jitter in [1-sf, 1+sf]; random rotation applied
        # 60% of the time, clipped to [-2*rf, 2*rf] degrees.
        s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
        r = np.clip(np.random.randn()*rf, -rf*2, rf*2) \
            if random.random() <= 0.6 else 0
        if self.flip and random.random() <= 0.5:
            data_numpy = data_numpy[:, ::-1, :]  # flip width axis (HWC layout)
            joints, joints_vis = fliplr_joints(
                joints, joints_vis, data_numpy.shape[1], self.flip_pairs)  # 17*3, 17*3
            c[0] = data_numpy.shape[1] - c[0] - 1  # mirror the center x
    trans = get_affine_transform(c, s, r, self.image_size)  # 2*3 affine matrix
    input = cv2.warpAffine(
        data_numpy,
        trans,
        (int(self.image_size[0]), int(self.image_size[1])),
        flags=cv2.INTER_LINEAR)  # crop/scale/rotate to network input size
    if self.transform:
        input = self.transform(input)  # ToTensor + mean/std normalize
    # Map visible joints through the same affine transform as the image.
    for i in range(self.num_joints):
        if joints_vis[i, 0] > 0.0:
            joints[i, 0:2] = affine_transform(joints[i, 0:2], trans)
    target, target_weight = self.generate_target(joints, joints_vis)
    # target = num_joints heatmaps; target_weight = num_joints*1,
    # zeroed for joints that are missing or fall on the image border
    target = torch.from_numpy(target)
    target_weight = torch.from_numpy(target_weight)
    meta = {
        'image': image_file,
        'filename': filename,
        'imgnum': imgnum,
        'joints': joints,  # (x, y, 0) or (0, 0, 0)
        'joints_vis': joints_vis,  # (1, 1, 0) or (0, 0, 0)
        'center': c,
        'scale': s,
        'rotation': r,
        'score': score
    }
    return input, target, target_weight, meta
3.train流程
# Build the pose network (MobileNetV2-backbone variant of get_pose_net).
# Output is B * num_joints * heatmap_H * heatmap_W — presumably
# 17*64*48 for a 256x192 input (TODO confirm against the config).
model = eval('models.'+'pose_mobilenetv2'+'.get_pose_net2')(
    config, is_train=False
)
checkpoint = torch.load('0.633-model_best.pth.tar')
#model.load_state_dict(checkpoint['state_dict'])  # nn.Module method, strict=False
#writer = SummaryWriter(log_dir='tensorboard_logs')
gpus = [int(i) for i in config.GPUS.split(',')]  # e.g. [0]
# Wrap in DataParallel BEFORE loading, so that 'module.'-prefixed keys in
# a checkpoint saved from a DataParallel model match this model's keys.
model = torch.nn.DataParallel(model, device_ids=gpus).cuda()
model.load_state_dict(checkpoint)  # nn.Module method; strict=False also possible

# define loss function (criterion) and optimizer
criterion = JointsMSELoss(
    use_target_weight=config.LOSS.USE_TARGET_WEIGHT
).cuda()  # called as criterion(output, target, target_weight)
optimizer = get_optimizer(config, model)  # LR=0.001
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 15, 0.000001, -1)

# Data loading code
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
train_dataset = eval('dataset.'+config.DATASET.DATASET)(
    config,
    config.DATASET.ROOT,
    config.DATASET.TRAIN_SET,
    True,
    transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])
)  # COCO dataset; holds the image-id list and the ground-truth db
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=config.TRAIN.BATCH_SIZE*len(gpus),
    shuffle=config.TRAIN.SHUFFLE,
    num_workers=config.WORKERS,
    pin_memory=True
)

best_perf = 0.0
best_model = False
# Hard-coded epoch range; originally config.TRAIN.BEGIN_EPOCH..END_EPOCH.
for epoch in range(150, 220):
    # NOTE(review): step() before train() advances the LR schedule at the
    # start of every epoch; recent PyTorch recommends stepping afterwards.
    lr_scheduler.step()
    # train for one epoch
    train(config, train_loader, model, criterion, optimizer, epoch,
          final_output_dir, tb_log_dir)
    # evaluate on validation set, returns AP
    # (valid_loader / valid_dataset are defined outside this excerpt)
    perf_indicator = validate(config, valid_loader, valid_dataset, model,
                              criterion, final_output_dir, tb_log_dir)
    if perf_indicator > best_perf:
        best_perf = perf_indicator
        best_model = True
    else:
        best_model = False
    logger.info('=> saving checkpoint to {}'.format(final_output_dir))
    save_checkpoint({
        'epoch': epoch + 1,
        'model': get_model_name(config),
        'state_dict': model.state_dict(),
        'perf': perf_indicator,
        'optimizer': optimizer.state_dict(),
    }, best_model, final_output_dir)

final_model_state_file = os.path.join(final_output_dir,
                                      'final_state.pkl')
logger.info('saving final model state to {}'.format(
    final_model_state_file))
# Save the unwrapped module's weights (keys without the 'module.' prefix).
torch.save(model.module.state_dict(), final_model_state_file)
3.1 train函數
def train(config, train_loader, model, criterion, optimizer, epoch,
          output_dir, tb_log_dir, writer_dict=None):
    """Run one training epoch.

    Logs timing / loss / PCK accuracy every config.PRINT_FREQ batches and
    dumps debug images for those batches. Loss is computed on GPU; the
    accuracy metric is computed on CPU numpy arrays.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target, target_weight, meta) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # compute output: heatmaps of shape B * num_joints * h * w
        output = model(input)
        target = target.cuda(non_blocking=True)  # heatmap values in [0, 1]
        target_weight = target_weight.cuda(non_blocking=True)  # num_joints*1, 0 or 1
        loss = criterion(output, target, target_weight)  # scalar

        # compute gradient and do update step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        losses.update(loss.item(), input.size(0))
        # accuracy() decodes peak coordinates from predicted and GT
        # heatmaps and computes PCK@0.5; some pred coords are 0 where the
        # peak response is too low.
        _, avg_acc, cnt, pred = accuracy(output.detach().cpu().numpy(),
                                         target.detach().cpu().numpy())
        acc.update(avg_acc, cnt)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % config.PRINT_FREQ == 0:
            msg = 'Epoch: [{0}][{1}/{2}]\t' \
                  'Time {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \
                  'Speed {speed:.1f} samples/s\t' \
                  'Data {data_time.val:.3f}s ({data_time.avg:.3f}s)\t' \
                  'Loss {loss.val:.5f} ({loss.avg:.5f})\t' \
                  'Accuracy {acc.val:.3f} ({acc.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      speed=input.size(0)/batch_time.val,
                      data_time=data_time, loss=losses, acc=acc)
            logger.info(msg)
            '''
            writer = writer_dict['writer']
            global_steps = writer_dict['train_global_steps']
            writer.add_scalar('train_loss', losses.val, global_steps)
            writer.add_scalar('train_acc', acc.val, global_steps)
            writer_dict['train_global_steps'] = global_steps + 1
            '''
            prefix = '{}_{}'.format(os.path.join(output_dir, 'train'), i)
            # pred is in heatmap coordinates; *4 maps back to input
            # resolution (presumably a stride-4 heatmap — TODO confirm).
            save_debug_images(config, input, meta, target, pred*4, output,
                              prefix)
3.2 計算 pck0.5(按關節點種類來的)
def calc_dists(preds, target, normalize):
    """Per-joint normalized L2 distances between predictions and targets.

    Args:
        preds: (batch, num_joints, 2) predicted peak coordinates.
        target: (batch, num_joints, 2) ground-truth peak coordinates.
        normalize: (batch, 2) per-sample normalization factors.

    Returns:
        (num_joints, batch) array of distances; entries are -1 where the
        target keypoint lies on the border (coordinate <= 1) and should
        be ignored downstream.
    """
    preds = preds.astype(np.float32)
    target = target.astype(np.float32)
    batch, num_joints = preds.shape[0], preds.shape[1]
    dists = np.zeros((num_joints, batch))
    for sample in range(batch):
        for joint in range(num_joints):
            tx, ty = target[sample, joint, 0], target[sample, joint, 1]
            if tx > 1 and ty > 1:
                scaled_pred = preds[sample, joint, :] / normalize[sample]
                scaled_tgt = target[sample, joint, :] / normalize[sample]
                # L2 distance in normalized coordinates; 0 is a perfect hit
                dists[joint, sample] = np.linalg.norm(scaled_pred - scaled_tgt)
            else:
                # border / missing keypoint: mark as not measurable
                dists[joint, sample] = -1
    return dists
def dist_acc(dists, thr=0.5):
    """Fraction of distances below ``thr``, ignoring entries equal to -1.

    Args:
        dists: 1-D array of normalized distances for one joint (-1 marks
            samples that must be excluded).
        thr: PCK threshold.

    Returns:
        The accuracy in [0, 1], or -1 when no entry is measurable.
    """
    valid = np.not_equal(dists, -1)
    total = valid.sum()
    if total == 0:
        return -1
    hits = np.less(dists[valid], thr).sum()
    return hits * 1.0 / total
def accuracy(output, target, hm_type='gaussian', thr=0.5):
    """PCK-style accuracy computed from heatmaps.

    Calculate accuracy according to PCK, but using the peak locations
    decoded from the ground-truth heatmap rather than raw x,y labels.

    Returns:
        acc: array of length num_joints+1; acc[0] is the mean accuracy,
            acc[i+1] the accuracy of joint i (-1 when never measurable).
        avg_acc: mean over joints with at least one valid sample.
        cnt: number of joints contributing to the mean.
        pred: (batch, num_joints, 2) decoded peak coordinates (some are 0
            where the peak response is too low).
    """
    idx = list(range(output.shape[1]))  # one entry per joint, e.g. 17
    norm = 1.0
    if hm_type == 'gaussian':
        pred, _ = get_max_preds(output)
        target, _ = get_max_preds(target)
        h, w = output.shape[2], output.shape[3]
        # Normalize distances by one tenth of the heatmap size, e.g. 6.4/4.8.
        # NOTE(review): [h, w] order vs (x, y) coords kept as-is to match
        # the reference implementation.
        norm = np.ones((pred.shape[0], 2)) * np.array([h, w]) / 10
    dists = calc_dists(pred, target, norm)  # (num_joints, batch)

    acc = np.zeros(len(idx) + 1)
    avg_acc, cnt = 0, 0
    for i, joint in enumerate(idx):
        acc[i + 1] = dist_acc(dists[joint])
        if acc[i + 1] >= 0:
            avg_acc += acc[i + 1]
            cnt += 1
    avg_acc = avg_acc / cnt if cnt != 0 else 0
    if cnt != 0:
        acc[0] = avg_acc
    return acc, avg_acc, cnt, pred
4 總結
可視化代碼:
def save_batch_image_with_joints(batch_image, batch_joints, batch_joints_vis,
                                 file_name, nrow=8, padding=2):
    '''
    batch_image: [batch_size, channel, height, width]
    batch_joints: [batch_size, num_joints, 3],
    batch_joints_vis: [batch_size, num_joints, 1],

    Tiles the batch into a grid (nrow images per row), draws a small dot
    at every visible joint, and writes the result to file_name.
    NOTE: joint coordinates are shifted in place to grid coordinates, so
    batch_joints is mutated by this call.
    '''
    # make_grid(..., True) min-max rescales each image to [0, 1], which
    # serves as the de-normalization for display (mean/std is not undone).
    grid = torchvision.utils.make_grid(batch_image, nrow, padding, True)
    ndarr = grid.mul(255).clamp(0, 255).byte().permute(1, 2, 0).cpu().numpy()
    ndarr = ndarr.copy()

    nmaps = batch_image.size(0)
    xmaps = min(nrow, nmaps)  # grid columns
    ymaps = int(math.ceil(float(nmaps) / xmaps))  # grid rows
    # cell size includes the inter-image padding
    height = int(batch_image.size(2) + padding)
    width = int(batch_image.size(3) + padding)
    k = 0
    for y in range(ymaps):
        for x in range(xmaps):
            if k >= nmaps:
                break
            joints = batch_joints[k]
            joints_vis = batch_joints_vis[k]
            for joint, joint_vis in zip(joints, joints_vis):
                # shift the joint into this cell's position within the grid
                joint[0] = x * width + padding + joint[0]
                joint[1] = y * height + padding + joint[1]
                if joint_vis[0]:
                    cv2.circle(ndarr, (int(joint[0]), int(joint[1])), 2, [255, 0, 0], 2)
            k = k + 1
    cv2.imwrite(file_name, ndarr)
收穫
1.使用 logger打印信息
2.部分加載模型權重
3.自定義一個複雜數據類
4.使用tensorboardX查看訓練過程
5.model得到輸出後loss在GPU上計算,計算ACC轉化到 cpu()numpy()上進行後續操作