siamese-fc PyTorch code walkthrough

demo_siamfc.py

1. os.path.join(video_dir, "img/*.jpg") joins two or more path components

video_dir = '../Car1/'
b = os.path.join(video_dir, "img/*.jpg")
print(b)
The printed result is
../Car1/img/*.jpg
If video_dir = '../Car1' (no trailing slash), the printed result is
../Car1\img/*.jpg, because os.path.join inserts the OS path separator automatically (a backslash '\' on Windows).

2. glob.glob()

The argument is a string (a wildcard pattern)
test = glob.glob(os.path.join(video_dir, "img/*.jpg"))
print('test: ', test)
The printed result is
test:  ['../Car1/img\\0001.jpg', '../Car1/img\\0002.jpg', '../Car1/img\\0003.jpg',.....]
i.e. all the .jpg files in that folder

3. os.path.basename(x).split('.')[0]

path = 'D:/honey/0001.jpg'
print('basepath: ', os.path.basename(path))
print('split: ', os.path.basename(path).split('.'))
The printed result is
basepath:  0001.jpg
split:  ['0001', 'jpg']
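
Indexing with [0] then gives the file stem, which the demo converts to int so frames sort numerically rather than lexicographically (a minimal sketch):

print('stem: ', os.path.basename(path).split('.')[0])
print('key: ', int(os.path.basename(path).split('.')[0]))
The printed result is
stem:  0001
key:  1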

4. demo_siamfc.py

import glob
import os
import pandas as pd
import argparse
import numpy as np
import cv2
import time
import sys
sys.path.append(os.getcwd())

from fire import Fire
from tqdm import tqdm

from siamfc import SiamFCTracker

def main(video_dir, gpu_id,  model_path):
    #savepath = cv2.VideoWriter('test_track.avi', cv2.VideoWriter_fourcc('M', 'P', '4', '2'), 25, (320,240),True)
    # glob returns a list of all matching file paths
    filenames = sorted(glob.glob(os.path.join(video_dir, "img/*.jpg")),
           key=lambda x: int(os.path.basename(x).split('.')[0]))  # os.path.basename() returns the last path component (the file name)
    # read every frame and convert it from BGR (OpenCV's default) to RGB
    frames = [cv2.cvtColor(cv2.imread(filename), cv2.COLOR_BGR2RGB) for filename in filenames]
    # read the ground-truth target box for every frame
    gt_bboxes = pd.read_csv(os.path.join(video_dir, "groundtruth_rect.txt"), sep='\t|,| ',
            header=None, names=['xmin', 'ymin', 'width', 'height'],
            engine='python')
    #print('gt_bboxes: ', gt_bboxes)
    title = video_dir.split('/')[-1]
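    # note: with the trailing '/', split('/')[-1] is '' (an empty title); os.path.basename(os.path.normpath(video_dir)) would give 'Car1'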
    #print('title: ', title)
    # starting tracking
    # create a tracker from the model path and gpu id
    tracker = SiamFCTracker(model_path, gpu_id)
    for idx, frame in enumerate(frames):
        if idx == 0:
            # take the first row: the target box in the first frame
            bbox = gt_bboxes.iloc[0].values
            # initialize the tracker
            tracker.init(frame, bbox)
            # convert the 1-based (x, y, w, h) box to 0-based corner coordinates (xmin, ymin, xmax, ymax)
            bbox = (bbox[0]-1, bbox[1]-1,
                    bbox[0]+bbox[2]-1, bbox[1]+bbox[3]-1)
        else:
            # update: track the target in the current frame
            bbox = tracker.update(frame)
        # draw the tracker's predicted box
        frame = cv2.rectangle(frame,
                              (int(bbox[0]), int(bbox[1])),
                              (int(bbox[2]), int(bbox[3])),
                              (0, 255, 0),
                              2)
        # draw the box from groundtruth_rect
        gt_bbox = gt_bboxes.iloc[idx].values
        gt_bbox = (gt_bbox[0], gt_bbox[1],
                   gt_bbox[0]+gt_bbox[2], gt_bbox[1]+gt_bbox[3])
        frame = cv2.rectangle(frame,
                              (int(gt_bbox[0]-1), int(gt_bbox[1]-1)), # 0-index
                              (int(gt_bbox[2]-1), int(gt_bbox[3]-1)),
                              (255, 0, 0),
                              1)
        if len(frame.shape) == 3:
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        frame = cv2.putText(frame, str(idx), (5, 20), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 255, 0), 1)
        cv2.imshow(title, frame)
        #savepath.write(frame)
        cv2.waitKey(30)

if __name__ == "__main__":
    video_dir = '../Car1/'  # video path
    gpu_id = 0  # gpu id
    model_path = '../models/siamfc_pretrained.pth'  # model path
    main(video_dir, gpu_id, model_path)

tracker.py

1. torch.nn.Module.eval(): switches to evaluation mode, so BatchNorm uses its running statistics and Dropout is disabled
2. torchvision.transforms.Compose([ToTensor()]): converts a PIL Image or ndarray to a tensor normalized to [0, 1] (a small sketch follows the np.newaxis example below)
3. np.newaxis creates a new axis

import numpy as np
x = np.array([1, 2, 3, 4])
print('x: ', x)
print('x.shape: ', x.shape)
x1 = x[np.newaxis, :]
print('x1: ', x1)
print('x1.shape: ', x1.shape)
x2 = x[:, np.newaxis]
print('x2: ', x2)
print('x2.shape: ', x2.shape)
Output:
x:  [1 2 3 4]
x.shape:  (4,)
x1:  [[1 2 3 4]]
x1.shape:  (1, 4)
x2:  [[1]
 [2]
 [3]
 [4]]
x2.shape:  (4, 1)
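
A small sketch for note 2 above: torchvision's ToTensor turns an HxWxC uint8 ndarray (or a PIL Image) into a CxHxW float tensor scaled to [0, 1]. Note that the tracker below imports its own ToTensor from custom_transforms, which may behave differently.

import numpy as np
import torchvision.transforms as transforms
img = np.full((2, 2, 3), 255, dtype=np.uint8)  # dummy HxWxC image
tensor = transforms.Compose([transforms.ToTensor()])(img)
print(tensor.shape)  # torch.Size([3, 2, 2]), channels first
print(tensor.max())  # tensor(1.), 255 is scaled to 1.0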

4. numpy.dot(): matrix multiplication, for example:
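
A minimal sketch; this is the same outer-product pattern that _cosine_window uses below:

import numpy as np
a = np.hanning(3)[:, np.newaxis]  # shape (3, 1)
b = np.hanning(3)[np.newaxis, :]  # shape (1, 3)
print(a.dot(b))
Output:
[[0. 0. 0.]
 [0. 1. 0.]
 [0. 0. 0.]]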
5. tracker.py

import numpy as np
import cv2
import torch
import torch.nn.functional as F
import time
import warnings
import torchvision.transforms as transforms

from torch.autograd import Variable

from .alexnet import SiameseAlexNet
from .config import config
from .custom_transforms import ToTensor
from .utils import get_exemplar_image, get_pyramid_instance_image, get_instance_image

torch.set_num_threads(1) # otherwise pytorch will take all cpus

class SiamFCTracker:
    def __init__(self, model_path, gpu_id):
        self.gpu_id = gpu_id
        with torch.cuda.device(gpu_id):
            self.model = SiameseAlexNet(gpu_id, train=False)
            #torch.load() deserializes a pickled object and loads it into memory
            #torch.nn.Module.load_state_dict() loads a deserialized state_dict
            self.model.load_state_dict(torch.load(model_path))
            self.model = self.model.cuda()
            # evaluation mode: BatchNorm and Dropout are disabled; the counterpart of nn.Module.train()
            self.model.eval()
        # converts a PIL Image or ndarray to a tensor normalized to [0, 1]
        self.transforms = transforms.Compose([
            ToTensor()
        ])
    # create the cosine window
    def _cosine_window(self, size):
        """
            get the cosine window
        """
        #np.newaxis expands the dimensions
        # the result has shape (size[0], size[1])
        cos_window = np.hanning(int(size[0]))[:, np.newaxis].dot(np.hanning(int(size[1]))[np.newaxis, :])
        cos_window = cos_window.astype(np.float32)
        cos_window /= np.sum(cos_window)
        return cos_window
    # initialization (RGB frame, target box in the first frame)
    def init(self, frame, bbox):
        """ initialize siamfc tracker
        Args:
            frame: an RGB image
            bbox: one-based bounding box [x, y, width, height]
        """
        self.bbox = (bbox[0]-1, bbox[1]-1, bbox[0]-1+bbox[2], bbox[1]-1+bbox[3]) # zero based
        # target center coordinates
        self.pos = np.array([bbox[0]-1+(bbox[2]-1)/2, bbox[1]-1+(bbox[3]-1)/2])  # center x, center y, zero based
        # target size
        self.target_sz = np.array([bbox[2], bbox[3]])                            # width, height
        # corresponds to the MATLAB code avgChans = gather([mean(mean(im(:,:,1))) mean(mean(im(:,:,2))) mean(mean(im(:,:,3)))]);
        # the mean is used for padding
        self.img_mean = tuple(map(int, frame.mean(axis=(0, 1))))
        '''
        print('img_mean: ', self.img_mean)
        img_mean:  (140, 140, 140)
        '''
        #get_exemplar_image(frame, box, 127, 0.5, self.img_mean)
        # crop the exemplar (template) patch around the initial target center
        # scale_z is the ratio that resizes s_z to 127; s_z is the side of the context-padded square around the target
        exemplar_img, scale_z, s_z = get_exemplar_image(frame, self.bbox,
                config.exemplar_size, config.context_amount, self.img_mean)

        # get exemplar feature
        # normalize the patch (and add a batch dimension with [None])
        exemplar_img = self.transforms(exemplar_img)[None,:,:,:]
        # move the exemplar to the gpu
        with torch.cuda.device(self.gpu_id):
            exemplar_img_var = Variable(exemplar_img.cuda())
            # forward pass: compute and cache the exemplar (template) features
            self.model((exemplar_img_var, None))
        #config.num_scale = 3
        self.penalty = np.ones((config.num_scale)) * config.scale_penalty  # scale_penalty = 0.9745
        # after the next line, self.penalty = (0.9745, 1, 0.9745)
        self.penalty[config.num_scale//2] = 1

        # create cosine window
        #config.response_up_stride = 16 (upsampling stride), config.response_sz = 17
        self.interp_response_sz = config.response_up_stride * config.response_sz  # 272
        # create the cosine window over the upsampled response
        self.cosine_window = self._cosine_window((self.interp_response_sz, self.interp_response_sz))

        # create scales
        # three scales with scale_step = 1.0375: (1/1.0375, 1, 1.0375)
        self.scales = config.scale_step ** np.arange(np.ceil(config.num_scale/2)-config.num_scale,
                np.floor(config.num_scale/2)+1)
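        # e.g. with num_scale = 3 and scale_step = 1.0375 (the values assumed above):
        # np.arange(np.ceil(3/2) - 3, np.floor(3/2) + 1) = np.arange(-1.0, 2.0) = [-1, 0, 1]
        # so self.scales = [1.0375**-1, 1, 1.0375] (approximately [0.964, 1.0, 1.038])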

        # instance_size = 255, exemplar_size = 127
        # search-region size before scaling: s_z + (255-127)/scale_z, derived from the ratio used to resize the exemplar
        self.s_x = s_z + (config.instance_size-config.exemplar_size) / scale_z

        # arbitrary scale saturation
        # bounds on the search-region size
        self.min_s_x = 0.2 * self.s_x
        self.max_s_x = 5 * self.s_x
    # tracking update
    def update(self, frame):
        """track object based on the previous frame
        Args:
            frame: an RGB image

        Returns:
            bbox: tuple of 1-based bounding box (xmin, ymin, xmax, ymax)
        """
        # search-region sizes at the three scales
        size_x_scales = self.s_x * self.scales
        # crop the three scaled search-region patches
        pyramid = get_pyramid_instance_image(frame, self.pos, config.instance_size, size_x_scales, self.img_mean)
        # transform the three patches and concatenate them along the batch dimension
        instance_imgs = torch.cat([self.transforms(x)[None,:,:,:] for x in pyramid], dim=0)
        #print('instance_imgs: ', instance_imgs.size())
        with torch.cuda.device(self.gpu_id):
            # move the search images to the gpu
            instance_imgs_var = Variable(instance_imgs.cuda())
            # forward pass
            response_maps = self.model((None, instance_imgs_var))
            response_maps = response_maps.data.cpu().numpy().squeeze()
            # upsample the response maps
            response_maps_up = [cv2.resize(x, (self.interp_response_sz, self.interp_response_sz), cv2.INTER_CUBIC)
             for x in response_maps]
        # maximum score at each scale, multiplied by the scale penalty
        max_score = np.array([x.max() for x in response_maps_up]) * self.penalty
        # penalty scale change
        scale_idx = max_score.argmax()  # index of the highest-scoring scale
        response_map = response_maps_up[scale_idx]
        # normalize the response map
        response_map -= response_map.min()
        response_map /= response_map.sum()
        #config.window_influence = 0.176
        response_map = (1 - config.window_influence) * response_map + \
                config.window_influence * self.cosine_window
        # locate the peak response
        max_r, max_c = np.unravel_index(response_map.argmax(), response_map.shape)
        # displacement in interpolation response
        # displacement of the peak from the center of the upsampled response map
        disp_response_interp = np.array([max_c, max_r]) - (self.interp_response_sz-1) / 2.
        # displacement in input
        disp_response_input = disp_response_interp * config.total_stride / config.response_up_stride
        # displacement in frame
        # best-scoring scale; disp_response_input (x, y) lives on the 255x255 crop, so map it back to the frame with x / 255 * s_x * scale
        scale = self.scales[scale_idx]
        disp_response_frame = disp_response_input * (self.s_x * scale) / config.instance_size
        # absolute coordinates
        self.pos += disp_response_frame
        # scale_lr = 0.59, the scale learning rate
        self.s_x *= ((1 - config.scale_lr) + config.scale_lr * scale)
        self.s_x = max(self.min_s_x, min(self.max_s_x, self.s_x))
        self.target_sz = ((1 - config.scale_lr) + config.scale_lr * scale) * self.target_sz
        bbox = (self.pos[0] - self.target_sz[0]/2 + 1, # xmin   convert to 1-based
                self.pos[1] - self.target_sz[1]/2 + 1, # ymin
                self.pos[0] + self.target_sz[0]/2 + 1, # xmax
                self.pos[1] + self.target_sz[1]/2 + 1) # ymax
        return bbox
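
To make the displacement chain above concrete, here is a hedged worked example; it assumes config.total_stride = 8, config.response_up_stride = 16 and config.instance_size = 255 (the usual SiamFC settings) plus a hypothetical s_x * scale = 510 and response peak:

import numpy as np
interp_response_sz = 16 * 17  # 272, as in init()
max_r, max_c = 143, 135       # hypothetical peak of the upsampled response
disp_interp = np.array([max_c, max_r]) - (interp_response_sz - 1) / 2.  # [-0.5, 7.5]
disp_input = disp_interp * 8 / 16    # [-0.25, 3.75] on the 255x255 search crop
disp_frame = disp_input * 510 / 255  # [-0.5, 7.5] pixels in the original frame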

utils.py

import numpy as np
import cv2

def get_center(x):
    return (x - 1.) / 2.

# convert (x1, y1, x2, y2) to (cx, cy, w, h)
def xyxy2cxcywh(bbox):
    return get_center(bbox[0]+bbox[2]), \
           get_center(bbox[1]+bbox[3]), \
           (bbox[2]-bbox[0]), \
           (bbox[3]-bbox[1])
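# e.g. xyxy2cxcywh((10, 20, 50, 60)) = (29.5, 39.5, 40, 40); the center is (x1 + x2 - 1) / 2 under 0-based coordinates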

def crop_and_pad(img, cx, cy, model_sz, original_sz, img_mean=None):
    # x, y of the top-left and bottom-right corners
    xmin = cx - original_sz // 2
    xmax = cx + original_sz // 2
    ymin = cy - original_sz // 2
    ymax = cy + original_sz // 2
    im_h, im_w, _ = img.shape

    left = right = top = bottom = 0
    if xmin < 0:
        left = int(abs(xmin))
    if xmax > im_w:
        right = int(xmax - im_w)
    if ymin < 0:
        top = int(abs(ymin))
    if ymax > im_h:
        bottom = int(ymax - im_h)

    xmin = int(max(0, xmin))
    xmax = int(min(im_w, xmax))
    ymin = int(max(0, ymin))
    ymax = int(min(im_h, ymax))
    # crop the target patch
    im_patch = img[ymin:ymax, xmin:xmax]
    if left != 0 or right !=0 or top!=0 or bottom!=0:
        # compute the image mean here if it was not computed earlier
        if img_mean is None:
            img_mean = tuple(map(int, img.mean(axis=(0, 1))))
        # pad im_patch up to original_sz
        im_patch = cv2.copyMakeBorder(im_patch, top, bottom, left, right,
                cv2.BORDER_CONSTANT, value=img_mean)
    if model_sz != original_sz:
        im_patch = cv2.resize(im_patch, (model_sz, model_sz))  # resize to model_sz x model_sz (e.g. 127x127 for the exemplar)
    return im_patch

#get_exemplar_image(frame, box, 127, 0.5, self.img_mean): crop the patch used to initialize the model
def get_exemplar_image(img, bbox, size_z, context_amount, img_mean=None):
    # convert to center coordinates
    cx, cy, w, h = xyxy2cxcywh(bbox)
    # context-expanded w and h
    wc_z = w + context_amount * (w+h)
    hc_z = h + context_amount * (w+h)
    # square root of the area: side of the square context crop
    s_z = np.sqrt(wc_z * hc_z)
    # scale factor = 127 / s_z, the resize ratio mentioned earlier
    scale_z = size_z / s_z
    # patch cropped around the target center
    exemplar_img = crop_and_pad(img, cx, cy, size_z, s_z, img_mean)
    return exemplar_img, scale_z, s_z
# crop the instance (search-region) image around the target
def get_instance_image(img, bbox, size_z, size_x, context_amount, img_mean=None):
    cx, cy, w, h = xyxy2cxcywh(bbox)
    wc_z = w + context_amount * (w+h)
    hc_z = h + context_amount * (w+h)
    s_z = np.sqrt(wc_z * hc_z)
    scale_z = size_z / s_z
    d_search = (size_x - size_z) / 2
    pad = d_search / scale_z
    s_x = s_z + 2 * pad
    scale_x = size_x / s_x
    instance_img = crop_and_pad(img, cx, cy, size_x, s_x, img_mean)
    return instance_img, scale_x, s_x

# build the search pyramid (image, center, 255, three scaled sizes, image mean); every level is resized to 255x255
def get_pyramid_instance_image(img, center, size_x, size_x_scales, img_mean=None):
    if img_mean is None:
        img_mean = tuple(map(int, img.mean(axis=(0, 1))))
    pyramid = [crop_and_pad(img, center[0], center[1], size_x, size_x_scale, img_mean)
            for size_x_scale in size_x_scales]
    return pyramid
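
As a sanity check on the context math in get_exemplar_image, a hedged numeric example with context_amount = 0.5 and size_z = 127 (the values used above), for a 40x80 target:

import numpy as np
w, h = 40, 80
wc_z = w + 0.5 * (w + h)    # 100.0
hc_z = h + 0.5 * (w + h)    # 140.0
s_z = np.sqrt(wc_z * hc_z)  # ~118.32, side of the square context crop
scale_z = 127 / s_z         # ~1.073, resize ratio onto the 127x127 exemplar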
