siamese-fc pytorch代碼解讀
demo_siamfc.py
1.os.path.join(video_dir, “img/*.jpg”)連接兩個或更多的路徑名組件
# Demo: joining a directory that already ends with '/' and a glob pattern.
video_dir = '../Car1/'
b = os.path.join(video_dir, "img/*.jpg")
print(b)
打印結果爲
../Car1/img/*.jpg
如果video_dir = '../Car1'
打印結果爲../Car1\img/*.jpg,os.path.join會自動補上路徑分隔符(Windows下爲反斜槓‘\’)
2.glob.glob()
參數爲字符串
# Demo: glob.glob takes a pattern string and returns the list of matching paths.
test = glob.glob(os.path.join(video_dir, "img/*.jpg"))
print('test: ', test)
打印結果爲
test: ['../Car1/img\\0001.jpg', '../Car1/img\\0002.jpg', '../Car1/img\\0003.jpg',.....]
該文件夾下所有的jpg文件
3.os.path.basename(x).split('.')[0]取出文件名中第一個‘.’之前的部分(即不含擴展名的文件名)
# Demo: basename strips the directory part; split('.') separates stem and extension.
path = 'D:/honey/0001.jpg'
print('basepath: ', os.path.basename(path))
print('split: ', os.path.basename(path).split('.'))
打印結果爲
basepath: 0001.jpg
split: ['0001', 'jpg']
4.demo_siamfc.py
import glob
import os
import pandas as pd
import argparse
import numpy as np
import cv2
import time
import sys
sys.path.append(os.getcwd())
from fire import Fire
from tqdm import tqdm
from siamfc import SiamFCTracker
def main(video_dir, gpu_id, model_path):
    """Run the SiamFC tracker over an image sequence and display the result.

    The tracker is initialised with the first ground-truth box and then
    updated on every following frame. Each frame is shown in an OpenCV window
    with the tracker box and the ground-truth box drawn in different colours
    and the frame index printed in the corner.

    Args:
        video_dir: sequence directory containing ``img/*.jpg`` frames whose
            file stems are integers, and ``groundtruth_rect.txt``.
        gpu_id: GPU index forwarded to ``SiamFCTracker``.
        model_path: path of the pretrained model file forwarded to
            ``SiamFCTracker``.
    """
    # Collect every frame path and sort numerically by file stem
    # (os.path.basename gives the file name, split('.')[0] its stem).
    filenames = sorted(glob.glob(os.path.join(video_dir, "img/*.jpg")),
                       key=lambda x: int(os.path.basename(x).split('.')[0]))
    # cv2.imread yields BGR; the tracker expects RGB frames.
    frames = [cv2.cvtColor(cv2.imread(filename), cv2.COLOR_BGR2RGB) for filename in filenames]
    # Per-frame ground-truth boxes; the file may be tab-, comma- or
    # space-separated, hence the regex separator and the python engine.
    gt_bboxes = pd.read_csv(os.path.join(video_dir, "groundtruth_rect.txt"), sep='\t|,| ',
                            header=None, names=['xmin', 'ymin', 'width', 'height'],
                            engine='python')
    # Window title = sequence folder name. normpath drops a trailing separator
    # first, so '../Car1/' gives 'Car1' instead of an empty string.
    title = os.path.basename(os.path.normpath(video_dir))

    # Start tracking: build the tracker from the model file and GPU index.
    tracker = SiamFCTracker(model_path, gpu_id)
    for idx, frame in enumerate(frames):
        if idx == 0:
            # First row of the ground truth is the initial target box.
            bbox = gt_bboxes.iloc[0].values
            tracker.init(frame, bbox)
            # Convert the one-based (x, y, w, h) box to zero-based corners
            # for drawing.
            bbox = (bbox[0]-1, bbox[1]-1,
                    bbox[0]+bbox[2]-1, bbox[1]+bbox[3]-1)
        else:
            bbox = tracker.update(frame)
        # Draw the tracker's box.
        frame = cv2.rectangle(frame,
                              (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])),
                              (0, 255, 0),
                              2)
        # Draw the ground-truth box for this frame.
        gt_bbox = gt_bboxes.iloc[idx].values
        gt_bbox = (gt_bbox[0], gt_bbox[1],
                   gt_bbox[0]+gt_bbox[2], gt_bbox[1]+gt_bbox[3])
        frame = cv2.rectangle(frame,
                              (int(gt_bbox[0]-1), int(gt_bbox[1]-1)),  # 0-index
                              (int(gt_bbox[2]-1), int(gt_bbox[3]-1)),
                              (255, 0, 0),
                              1)
        # Back to BGR, which is what cv2.imshow expects.
        if len(frame.shape) == 3:
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        frame = cv2.putText(frame, str(idx), (5, 20), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 255, 0), 1)
        cv2.imshow(title, frame)
        cv2.waitKey(30)
if __name__ == "__main__":
    # Demo configuration: sequence folder, GPU index, pretrained model file.
    video_dir, gpu_id, model_path = '../Car1/', 0, '../models/siamfc_pretrained.pth'
    main(video_dir=video_dir, gpu_id=gpu_id, model_path=model_path)
tracker.py
1.torch.nn.Module.eval()切換到測試(評估)模式:BatchNorm改用訓練時累積的均值和方差(而不是當前batch的統計量),Dropout不再隨機丟棄
2.torchvision.transforms.Compose([ToTensor()])將PILImage或者ndarray轉換爲tensor,並且歸一化至[0-1]
3.np.newaxis創建新軸
import numpy as np
# Demo: np.newaxis inserts a length-1 axis at the position where it appears.
x = np.array([1, 2, 3, 4])
print('x: ', x)
print('x.shape: ', x.shape)
# New leading axis: (4,) -> (1, 4), a row vector.
x1 = x[np.newaxis, :]
print('x1: ', x1)
# NOTE: the label below says 'x.shape' but the value printed is x1.shape.
print('x.shape: ', x1.shape)
# New trailing axis: (4,) -> (4, 1), a column vector.
x2 = x[:, np.newaxis]
print('x2: ', x2)
print('x2.shape: ', x2.shape)
輸出:
x: [1 2 3 4]
x.shape: (4,)
x1: [[1 2 3 4]]
x.shape: (1, 4)
x2: [[1]
[2]
[3]
[4]]
x2.shape: (4, 1)
4.numpy.dot()矩陣乘法
5.tracker.py
import numpy as np
import cv2
import torch
import torch.nn.functional as F
import time
import warnings
import torchvision.transforms as transforms
from torch.autograd import Variable
from .alexnet import SiameseAlexNet
from .config import config
from .custom_transforms import ToTensor
from .utils import get_exemplar_image, get_pyramid_instance_image, get_instance_image
torch.set_num_threads(1) # otherwise pytorch will take all cpus
class SiamFCTracker:
    """SiamFC single-object tracker built around a ``SiameseAlexNet`` model.

    Usage: construct with a model file and GPU index, call ``init`` once with
    the first frame and its box, then call ``update`` for every later frame.
    """

    def __init__(self, model_path, gpu_id):
        """Load the network onto the given GPU and put it in eval mode.

        Args:
            model_path: file passed to ``torch.load``; its content is loaded
                as the model's state dict.
            gpu_id: CUDA device index used for the model and all inputs.
        """
        self.gpu_id = gpu_id
        with torch.cuda.device(gpu_id):
            self.model = SiameseAlexNet(gpu_id, train=False)
            # torch.load deserialises the saved object; load_state_dict copies
            # it into the model.
            self.model.load_state_dict(torch.load(model_path))
            self.model = self.model.cuda()
            # Evaluation mode (the counterpart of nn.Module.train()).
            self.model.eval()
        # Image -> tensor conversion; ToTensor is the project's own transform
        # (see custom_transforms).
        self.transforms = transforms.Compose([
            ToTensor()
        ])

    def _cosine_window(self, size):
        """Return a 2-D Hanning (cosine) window normalised to sum to 1.

        Args:
            size: pair ``(rows, cols)``; each entry is truncated to int.

        Returns:
            float32 array of shape ``(int(size[0]), int(size[1]))``.
        """
        # Outer product of two 1-D Hanning windows: column vector . row vector.
        cos_window = np.hanning(int(size[0]))[:, np.newaxis].dot(np.hanning(int(size[1]))[np.newaxis, :])
        cos_window = cos_window.astype(np.float32)
        cos_window /= np.sum(cos_window)
        return cos_window

    def init(self, frame, bbox):
        """Initialise the tracker state from the first frame.

        Args:
            frame: an RGB image
            bbox: one-based bounding box [x, y, width, height]
        """
        # Zero-based corners (xmin, ymin, xmax, ymax).
        self.bbox = (bbox[0]-1, bbox[1]-1, bbox[0]-1+bbox[2], bbox[1]-1+bbox[3])
        # Zero-based target center (x, y).
        self.pos = np.array([bbox[0]-1+(bbox[2]-1)/2, bbox[1]-1+(bbox[3]-1)/2])
        # Target size (width, height).
        self.target_sz = np.array([bbox[2], bbox[3]])
        # Per-channel mean of the frame, truncated to int; passed to the
        # cropping helpers as the padding colour (e.g. (140, 140, 140)).
        self.img_mean = tuple(map(int, frame.mean(axis=(0, 1))))

        # Crop the template patch around the target. s_z is the side of the
        # context-expanded square and scale_z = exemplar_size / s_z.
        exemplar_img, scale_z, s_z = get_exemplar_image(frame, self.bbox,
                                                        config.exemplar_size, config.context_amount, self.img_mean)

        # Convert the patch to a tensor and add a batch dimension.
        exemplar_img = self.transforms(exemplar_img)[None, :, :, :]
        with torch.cuda.device(self.gpu_id):
            exemplar_img_var = Variable(exemplar_img.cuda())
            # Forward pass on the template only; the return value is unused, so
            # the model presumably caches the template features internally
            # (confirm in SiameseAlexNet).
            self.model((exemplar_img_var, None))

        # Scale penalty: every scale except the middle (unchanged) one is
        # multiplied by config.scale_penalty.
        self.penalty = np.ones((config.num_scale)) * config.scale_penalty
        self.penalty[config.num_scale//2] = 1

        # Side of the upsampled response map and the matching cosine window.
        self.interp_response_sz = config.response_up_stride * config.response_sz
        self.cosine_window = self._cosine_window((self.interp_response_sz, self.interp_response_sz))

        # Scale pyramid: scale_step ** k for integer k centred on 0
        # (e.g. k = -1, 0, 1 when num_scale is 3).
        self.scales = config.scale_step ** np.arange(np.ceil(config.num_scale/2)-config.num_scale,
                                                     np.floor(config.num_scale/2)+1)

        # Side of the search region in frame pixels: the template region plus
        # the instance/exemplar size difference mapped back through scale_z.
        self.s_x = s_z + (config.instance_size-config.exemplar_size) / scale_z

        # Arbitrary saturation limits for the search-region size.
        self.min_s_x = 0.2 * self.s_x
        self.max_s_x = 5 * self.s_x

    def update(self, frame):
        """Track the object in a new frame, starting from the previous state.

        Args:
            frame: an RGB image

        Returns:
            bbox: tuple of 1-based bounding box(xmin, ymin, xmax, ymax)
        """
        # Search-region side for every scale of the pyramid.
        size_x_scales = self.s_x * self.scales
        # One search patch per scale, all centred on the last position.
        pyramid = get_pyramid_instance_image(frame, self.pos, config.instance_size, size_x_scales, self.img_mean)
        # Convert each patch to a tensor and stack them along the batch axis.
        instance_imgs = torch.cat([self.transforms(x)[None, :, :, :] for x in pyramid], dim=0)
        with torch.cuda.device(self.gpu_id):
            instance_imgs_var = Variable(instance_imgs.cuda())
            # Forward pass on the search patches only.
            response_maps = self.model((None, instance_imgs_var))
            response_maps = response_maps.data.cpu().numpy().squeeze()
            # Upsample every response map. The flag must be passed by keyword:
            # the third positional parameter of cv2.resize is `dst`, so a
            # positional flag never selects the interpolation method.
            response_maps_up = [cv2.resize(x, (self.interp_response_sz, self.interp_response_sz),
                                           interpolation=cv2.INTER_CUBIC)
                                for x in response_maps]
        # Peak score per scale, damped by the scale-change penalty.
        max_score = np.array([x.max() for x in response_maps_up]) * self.penalty

        # Pick the scale with the best penalised peak.
        scale_idx = max_score.argmax()
        response_map = response_maps_up[scale_idx]
        # Normalise the response to be non-negative and sum to 1. A perfectly
        # flat response has zero sum; skip the division then to avoid NaNs.
        response_map -= response_map.min()
        response_sum = response_map.sum()
        if response_sum > 0:
            response_map /= response_sum
        # Blend in the cosine window, weighted by config.window_influence.
        response_map = (1 - config.window_influence) * response_map + \
            config.window_influence * self.cosine_window
        # Row/column of the strongest response.
        max_r, max_c = np.unravel_index(response_map.argmax(), response_map.shape)
        # Displacement (x, y) from the center of the upsampled response.
        disp_response_interp = np.array([max_c, max_r]) - (self.interp_response_sz-1) / 2.
        # Displacement in the network input (instance image).
        disp_response_input = disp_response_interp * config.total_stride / config.response_up_stride
        # Displacement in the frame: rescale from instance_size pixels to the
        # actual size of the winning search region.
        scale = self.scales[scale_idx]
        disp_response_frame = disp_response_input * (self.s_x * scale) / config.instance_size
        # New absolute position.
        self.pos += disp_response_frame
        # Update the search-region and target sizes with learning rate
        # config.scale_lr, then clamp the search-region size.
        self.s_x *= ((1 - config.scale_lr) + config.scale_lr * scale)
        self.s_x = max(self.min_s_x, min(self.max_s_x, self.s_x))
        self.target_sz = ((1 - config.scale_lr) + config.scale_lr * scale) * self.target_sz
        bbox = (self.pos[0] - self.target_sz[0]/2 + 1,  # xmin convert to 1-based
                self.pos[1] - self.target_sz[1]/2 + 1,  # ymin
                self.pos[0] + self.target_sz[0]/2 + 1,  # xmax
                self.pos[1] + self.target_sz[1]/2 + 1)  # ymax
        return bbox
utils.py
import numpy as np
import cv2
def get_center(x):
    """Return ``(x - 1) / 2``: the zero-based center index of a span of length ``x``."""
    span = x - 1.
    return span / 2.
def xyxy2cxcywh(bbox):
    """Convert corners ``(x1, y1, x2, y2)`` to ``(cx, cy, w, h)``.

    The center of each axis is ``get_center`` of the corner sum; width and
    height are the plain corner differences.
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    center_x = get_center(left + right)
    center_y = get_center(top + bottom)
    return center_x, center_y, (right - left), (bottom - top)
def crop_and_pad(img, cx, cy, model_sz, original_sz, img_mean=None):
    """Crop a square window centred on ``(cx, cy)``, padding where it leaves the image.

    Args:
        img: image array with three dimensions (height, width, channels).
        cx, cy: center of the window in image coordinates.
        model_sz: side of the returned patch; the crop is resized to
            ``(model_sz, model_sz)`` when it differs from ``original_sz``.
        original_sz: side of the window to cut from the image.
        img_mean: constant border colour; when ``None`` it is computed from
            ``img`` only if padding is actually needed.

    Returns:
        The cropped (and possibly padded / resized) patch.
    """
    half = original_sz // 2
    # Window corners: top-left and bottom-right.
    x_lo, x_hi = cx - half, cx + half
    y_lo, y_hi = cy - half, cy + half
    im_h, im_w, _ = img.shape

    # Whole pixels by which the window sticks out on each side.
    pad_left = int(abs(x_lo)) if x_lo < 0 else 0
    pad_right = int(x_hi - im_w) if x_hi > im_w else 0
    pad_top = int(abs(y_lo)) if y_lo < 0 else 0
    pad_bottom = int(y_hi - im_h) if y_hi > im_h else 0

    # Cut out the part of the window that lies inside the image.
    row_from, row_to = int(max(0, y_lo)), int(min(im_h, y_hi))
    col_from, col_to = int(max(0, x_lo)), int(min(im_w, x_hi))
    im_patch = img[row_from:row_to, col_from:col_to]

    if pad_left or pad_right or pad_top or pad_bottom:
        # Compute the fill colour here if the caller did not supply one.
        if img_mean is None:
            img_mean = tuple(map(int, img.mean(axis=(0, 1))))
        # Pad the patch back out towards original_sz.
        im_patch = cv2.copyMakeBorder(im_patch, pad_top, pad_bottom, pad_left, pad_right,
                                      cv2.BORDER_CONSTANT, value=img_mean)
    if model_sz != original_sz:
        # Resize to the network input size.
        im_patch = cv2.resize(im_patch, (model_sz, model_sz))
    return im_patch
def get_exemplar_image(img, bbox, size_z, context_amount, img_mean=None):
    """Crop the template (exemplar) patch around a target box.

    Example call: ``get_exemplar_image(frame, box, 127, 0.5, img_mean)``.

    Args:
        img: source image.
        bbox: target box as corners ``(x1, y1, x2, y2)``.
        size_z: side of the returned patch.
        context_amount: fraction of ``w + h`` added to each side as context.
        img_mean: padding colour forwarded to ``crop_and_pad``.

    Returns:
        ``(exemplar_img, scale_z, s_z)`` where ``s_z`` is the side of the
        context-expanded square and ``scale_z = size_z / s_z``.
    """
    # Box as center and size.
    cx, cy, w, h = xyxy2cxcywh(bbox)
    # Width and height enlarged by the context margin.
    padded_w = w + context_amount * (w+h)
    padded_h = h + context_amount * (w+h)
    # Side of the square with the same area as the padded box.
    s_z = np.sqrt(padded_w * padded_h)
    # Crop that square around the target center and scale it to size_z.
    patch = crop_and_pad(img, cx, cy, size_z, s_z, img_mean)
    return patch, size_z / s_z, s_z
def get_instance_image(img, bbox, size_z, size_x, context_amount, img_mean=None):
    """Crop the search (instance) patch around a target box.

    Args:
        img: source image.
        bbox: target box as corners ``(x1, y1, x2, y2)``.
        size_z: template side used to derive the scale.
        size_x: side of the returned search patch.
        context_amount: fraction of ``w + h`` added to each side as context.
        img_mean: padding colour forwarded to ``crop_and_pad``.

    Returns:
        ``(instance_img, scale_x, s_x)`` where ``s_x`` is the side of the
        search window in image pixels and ``scale_x = size_x / s_x``.
    """
    cx, cy, w, h = xyxy2cxcywh(bbox)
    # Side of the context-expanded template square.
    padded_w = w + context_amount * (w+h)
    padded_h = h + context_amount * (w+h)
    s_z = np.sqrt(padded_w * padded_h)
    scale_z = size_z / s_z
    # Extra margin per side, converted from network pixels to image pixels.
    margin = ((size_x - size_z) / 2) / scale_z
    s_x = s_z + 2 * margin
    patch = crop_and_pad(img, cx, cy, size_x, s_x, img_mean)
    return patch, size_x / s_x, s_x
def get_pyramid_instance_image(img, center, size_x, size_x_scales, img_mean=None):
    """Crop one search patch per scale, all resized to ``size_x``.

    Args:
        img: source image.
        center: ``(cx, cy)`` center shared by every crop.
        size_x: side of each returned patch.
        size_x_scales: iterable of window sides, one per pyramid level.
        img_mean: padding colour; computed from ``img`` when ``None``.

    Returns:
        List of patches, in the order of ``size_x_scales``.
    """
    fill = img_mean
    if fill is None:
        fill = tuple(map(int, img.mean(axis=(0, 1))))
    patches = []
    for window_side in size_x_scales:
        patches.append(crop_and_pad(img, center[0], center[1], size_x, window_side, fill))
    return patches