最近開始學習目標檢測faster rcnn，首先看了很多博客講解原理，然後從github上下載tensorflow版本的代碼，代碼太長看了好幾天沒明白，後來看到了chenyuntc的 simple-faster-rcnn-pytorch，還有作者寫這份代碼的心得，讓我感覺很佩服，自認爲目前階段不能手寫如此複雜的代碼。作者是從tf版本的改爲pytorch版的，我在學習的過程中也查閱了很多其他人寫的講解代碼的博客，得到了很大的幫助，所以也打算把自己一些粗淺的理解記錄下來，一是記錄下自己的菜鳥學習之路，方便自己過後查閱，二來可以回饋網絡。目前編程能力有限，且是第一次寫博客，中間可能會有一些錯誤。

作者的README.MD部分講解的很清楚了，一步一步安裝PyTorch，cupy，然後運行pip install -r requirements.txt安裝各種包。我用的是自己的臺式機，1070GPU，python3.6，windows環境。安裝PyTorch>=0.4時上官網根據自己的環境去找對應的pip指令安裝時可能下載速度巨慢，我是直接下載了torch-0.4.1-cp36-cp36m-win_amd64.whl，運行pip install torch-0.4.1-cp36-cp36m-win_amd64.whl安裝的，這裏我將文件放到雲盤上，有需要自取：https://pan.baidu.com/s/1pqSpeTFH3ooh7q1M3vrtfQ ，提取碼：hda0。cupy安裝時出錯了，提示我需要安裝VS，注意最好安裝vs2015以上版本。接下來是build cython code nms_gpu_post，這裏我一直報錯，後來發現是可選擇運行的，我就直接跳過了，不過作者說最好運行這一步。然後是下載預訓練模型，數據集，解壓數據集操作。需要將utils/config.py中voc_data_dir改爲自己的路徑，通過執行python misc/convert_caffe_pretrain.py，下載caffe_pretrain預訓練模型，我是直接將caffe_pretrain改爲True了，如果設置false則直接會下載pytorch版本的預訓練模型。然後在終端下運行python -m visdom.server，彈出一個網址打開。visdom是類似tensorflow中的tensorboard的工具，可以可視化訓練過程中的各種曲線或者圖，然後運行python train.py train --env='fasterrcnn-caffe' --plot-every=100 --caffe-pretrain，就開始了訓練過程，可以看到網頁中出現了5個loss曲線圖，還有一個標籤圖和預測圖，類似下面這樣的：

第二步數據預處理

1.data/dataset.py文件（主代碼中調用的其他重要函數會在後面按順序講解）

def inverse_normalize(img):
    """Undo the input normalization so the image can be displayed.

    Args:
        img: array indexed as (channel, H, W) that was produced by
            ``caffe_normalize`` or ``pytorch_normalze``.

    Returns:
        When ``opt.caffe_pretrain`` is set: the image with the per-channel
        mean added back and the channel axis reversed (BGR -> RGB).
        Otherwise: an approximate de-normalization (a single std of 0.225
        and mean of 0.45 for all channels), clipped to [0, 1] and scaled
        to 0-255.
    """
    if not opt.caffe_pretrain:
        # Approximate inverse of the per-channel mean/std normalization.
        return (img * 0.225 + 0.45).clip(min=0, max=1) * 255

    # Same mean that caffe_normalize subtracts; reshaped to broadcast
    # over the (H, W) axes.
    bgr_mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
    restored = img + bgr_mean
    # Reverse the channel axis: BGR back to RGB.
    return restored[::-1, :, :]

def pytorch_normalze(img):
    """Normalize a 0-1 image for use with the PyTorch pretrained weights.

    Each channel is transformed as ``(channel - mean) / std`` using
    mean ``[0.485, 0.456, 0.406]`` and std ``[0.229, 0.224, 0.225]``.

    Args:
        img: numpy array with values in 0-1 (per the calling code).
            NOTE(review): ``tvtsf`` and ``t`` are aliases imported outside
            this view, presumably torchvision.transforms and torch.

    Returns:
        The normalized image converted back to a numpy array.
    """
    as_tensor = t.from_numpy(img)  # ndarray -> Tensor
    channel_norm = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
                                   std=[0.229, 0.224, 0.225])
    return channel_norm(as_tensor).numpy()
    
 def caffe_normalize(img):
    """Normalize a 0-1 RGB image for use with the Caffe pretrained weights.

    The channels are reordered RGB -> BGR, values are scaled to 0-255 and
    the per-channel mean ``[122.7717, 115.9465, 102.9801]`` is subtracted.

    Args:
        img: array indexed as (channel, H, W) with values in 0-1.

    Returns:
        A new ``float32`` array of the same shape.
    """
    # Reshaped to (3, 1, 1) so it broadcasts over the (H, W) axes.
    bgr_mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
    bgr_255 = img[[2, 1, 0], :, :] * 255  # RGB -> BGR, then 0-1 -> 0-255
    return (bgr_255 - bgr_mean).astype(np.float32, copy=True)
    
 #函數輸入的img爲0-255
def preprocess(img, min_size=600, max_size=1000):
    """Rescale an image to the network's size limits and normalize it.

    The scale factor is the smaller of ``min_size / shorter_side`` and
    ``max_size / longer_side``, so after resizing the shorter side is at
    most ``min_size`` and the longer side at most ``max_size`` while the
    aspect ratio is kept.

    Args:
        img: array whose shape unpacks as (C, H, W), with values in 0-255.
        min_size: upper bound for the shorter side after scaling.
        max_size: upper bound for the longer side after scaling.

    Returns:
        The resized image passed through ``caffe_normalize`` when
        ``opt.caffe_pretrain`` is set, otherwise through
        ``pytorch_normalze``.
    """
    channels, height, width = img.shape
    # Take the smaller ratio so that both size limits are respected.
    scale = min(min_size / min(height, width),
                max_size / max(height, width))
    unit_img = img / 255  # 0-255 -> 0-1
    resized = sktsf.resize(unit_img,
                           (channels, height * scale, width * scale),
                           mode='reflect', anti_aliasing=False)
    # Pick the normalization that matches the pretrained weights in use.
    normalize = caffe_normalize if opt.caffe_pretrain else pytorch_normalze
    return normalize(resized)
 
class Transform(object):
    """Callable that rescales an image with its boxes, then randomly mirrors both."""

    def __init__(self, min_size=600, max_size=1000):
        """Store the size limits that are forwarded to ``preprocess``."""
        self.min_size, self.max_size = min_size, max_size

    def __call__(self, in_data):
        """Transform one ``(img, bbox, label)`` sample.

        Args:
            in_data: tuple ``(img, bbox, label)``; ``img.shape`` unpacks
                as (C, H, W).

        Returns:
            ``(img, bbox, label, scale)`` where ``scale`` is the ratio of
            the resized height to the original height. ``label`` is
            passed through unchanged.
        """
        img, bbox, label = in_data
        _, in_h, in_w = img.shape

        # Resize + normalize the image, then measure the actual new size.
        resized = preprocess(img, self.min_size, self.max_size)
        _, out_h, out_w = resized.shape
        scale = out_h / in_h

        # Scale the boxes by the same ratio as the image.
        scaled_bbox = util.resize_bbox(bbox, (in_h, in_w), (out_h, out_w))

        # Random horizontal flip only; vertical flipping is not requested.
        flipped_img, flip_params = util.random_flip(
            resized, x_random=True, return_param=True)
        # Mirror the boxes exactly as the image was mirrored.
        flipped_bbox = util.flip_bbox(
            scaled_bbox, (out_h, out_w), x_flip=flip_params['x_flip'])

        return flipped_img, flipped_bbox, label, scale
        
class Dataset:
    """Training-set wrapper: VOC samples passed through ``Transform``."""

    def __init__(self, opt):
        """Build the underlying VOC dataset and the transform from ``opt``.

        Reads ``opt.voc_data_dir``, ``opt.min_size`` and ``opt.max_size``.
        """
        self.opt = opt
        self.db = VOCBboxDataset(opt.voc_data_dir)
        self.tsf = Transform(opt.min_size, opt.max_size)

    def __getitem__(self, idx):
        """Return ``(img, bbox, label, scale)`` for sample ``idx``.

        The ``difficult`` flags returned by the underlying dataset are
        discarded here.
        """
        raw_img, raw_bbox, raw_label, _ = self.db.get_example(idx)
        sample = self.tsf((raw_img, raw_bbox, raw_label))
        img, bbox, label, scale = sample
        # Return fresh copies of the arrays.
        # NOTE(review): presumably needed because the random flip yields a
        # reversed view rather than a contiguous array - confirm.
        return img.copy(), bbox.copy(), label.copy(), scale

    def __len__(self):
        """Number of samples in the underlying dataset."""
        return len(self.db)

class TestDataset:
    """Test-set wrapper: VOC samples that are preprocessed but not augmented."""

    def __init__(self, opt, split='test', use_difficult=True):
        """Build the underlying VOC dataset.

        Args:
            opt: config object; ``opt.voc_data_dir`` is read.
            split: name of the image-id list to load.
            use_difficult: forwarded to ``VOCBboxDataset``; objects marked
                difficult are kept by default here.
        """
        self.opt = opt
        self.db = VOCBboxDataset(
            opt.voc_data_dir,
            split=split,
            use_difficult=use_difficult,
        )

    def __getitem__(self, idx):
        """Return ``(img, original_size, bbox, label, difficult)``.

        ``img`` is the preprocessed image, while ``original_size`` is
        ``ori_img.shape[1:]`` of the untouched image. Boxes are returned
        as loaded, i.e. they are not rescaled here.
        """
        ori_img, bbox, label, difficult = self.db.get_example(idx)
        original_size = ori_img.shape[1:]
        return preprocess(ori_img), original_size, bbox, label, difficult

    def __len__(self):
        """Number of samples in the underlying dataset."""
        return len(self.db)

下面是data/util.py文件

def resize_bbox(bbox, in_size, out_size):
    """Scale bounding boxes to follow an image resize.

    Args:
        bbox: 2-D array whose columns 0 and 2 are y coordinates and
            columns 1 and 3 are x coordinates.
        in_size: ``(height, width)`` of the image before resizing.
        out_size: ``(height, width)`` of the image after resizing.

    Returns:
        A scaled copy of ``bbox``; the input array is left untouched.
    """
    scaled = bbox.copy()
    in_h, in_w = in_size[0], in_size[1]
    y_scale = float(out_size[0]) / in_h
    x_scale = float(out_size[1]) / in_w
    # Columns 0 and 2 follow the height ratio, columns 1 and 3 the width ratio.
    scaled[:, [0, 2]] = y_scale * scaled[:, [0, 2]]
    scaled[:, [1, 3]] = x_scale * scaled[:, [1, 3]]
    return scaled
    
 def random_flip(img, y_random=False, x_random=False,
                 return_param=False, copy=False):
    """Randomly flip an image along its second and/or third axis.

    Args:
        img: array indexed with three axes; axis 1 is reversed for a
            vertical flip and axis 2 for a horizontal flip.
        y_random: if true, flip vertically with probability 1/2.
        x_random: if true, flip horizontally with probability 1/2.
        return_param: if true, also return which flips were applied.
        copy: if true, return a copy instead of a (possibly reversed)
            view of ``img``.

    Returns:
        The image, or ``(image, {'y_flip': bool, 'x_flip': bool})`` when
        ``return_param`` is true so boxes can be flipped the same way.
    """
    # Draw the vertical decision first, then the horizontal one.
    y_flip = random.choice([True, False]) if y_random else False
    x_flip = random.choice([True, False]) if x_random else False

    flipped = img
    if y_flip:
        flipped = flipped[:, ::-1, :]
    if x_flip:
        flipped = flipped[:, :, ::-1]  # reverse the last axis: horizontal flip
    if copy:
        flipped = flipped.copy()

    if not return_param:
        return flipped
    return flipped, {'y_flip': y_flip, 'x_flip': x_flip}

 def flip_bbox(bbox, size, y_flip=False, x_flip=False):
    """Mirror bounding boxes to match a flipped image.

    Args:
        bbox: 2-D array with columns ``(y_min, x_min, y_max, x_max)``.
        size: ``(height, width)`` of the image the boxes live in.
        y_flip: mirror the boxes vertically.
        x_flip: mirror the boxes horizontally.

    Returns:
        A flipped copy of ``bbox``; the input array is left untouched.
    """
    height, width = size
    flipped = bbox.copy()
    if y_flip:
        # New y_min is H - old y_max and new y_max is H - old y_min.
        flipped[:, [0, 2]] = height - bbox[:, [2, 0]]
    if x_flip:
        # New x_min is W - old x_max and new x_max is W - old x_min.
        flipped[:, [1, 3]] = width - bbox[:, [3, 1]]
    return flipped

下面是data/voc_dataset.py文件

class VOCBboxDataset:
    """Bounding-box dataset read from a PASCAL VOC style directory.

    The directory is expected to contain ``ImageSets/Main/<split>.txt``
    (one image id per line), ``Annotations/<id>.xml`` and
    ``JPEGImages/<id>.jpg``.
    """

    def __init__(self, data_dir, split='trainval',
                 use_difficult=False, return_difficult=False,
                 ):
        """Load the list of image ids for ``split``.

        Args:
            data_dir: root of the VOC directory (e.g. the ``VOC2007`` folder).
            split: name of the id list under ``ImageSets/Main``.
            use_difficult: keep objects whose ``difficult`` flag is 1.
            return_difficult: stored on the instance; ``get_example``
                returns the difficult flags regardless of this value.
        """
        id_list_file = os.path.join(
            data_dir, 'ImageSets/Main/{0}.txt'.format(split))
        # Use a context manager so the file handle is closed promptly.
        with open(id_list_file) as id_file:
            self.ids = [id_.strip() for id_ in id_file]
        self.data_dir = data_dir
        self.use_difficult = use_difficult
        self.return_difficult = return_difficult
        self.label_names = VOC_BBOX_LABEL_NAMES

    def __len__(self):
        """Number of image ids in the split."""
        return len(self.ids)

    def get_example(self, i):
        """Load the ``i``-th image together with its annotations.

        Args:
            i: index into the id list.

        Returns:
            ``(img, bbox, label, difficult)`` where ``bbox`` is a float32
            array with columns ``(ymin, xmin, ymax, xmax)`` shifted to be
            0-based, ``label`` is an int32 array of indices into
            ``VOC_BBOX_LABEL_NAMES`` and ``difficult`` is a uint8 array of
            0/1 flags.

        Raises:
            ValueError: from ``np.stack`` when no object is kept for the
                image (e.g. all objects are difficult and
                ``use_difficult`` is false).
        """
        id_ = self.ids[i]
        anno = ET.parse(
            os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
        bbox = []
        label = []
        difficult = []
        for obj in anno.findall('object'):
            # Parse the flag once; it drives both the filter and the output.
            is_difficult = int(obj.find('difficult').text)
            if not self.use_difficult and is_difficult == 1:
                continue
            difficult.append(is_difficult)

            bndbox_anno = obj.find('bndbox')
            # Subtract 1 so pixel coordinates start at 0.
            bbox.append([
                int(bndbox_anno.find(tag).text) - 1
                for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
            name = obj.find('name').text.lower().strip()
            label.append(VOC_BBOX_LABEL_NAMES.index(name))
        bbox = np.stack(bbox).astype(np.float32)
        label = np.stack(label).astype(np.int32)
        # When `use_difficult==False`, all elements in `difficult` are False.
        # Builtin `bool` replaces the `np.bool` alias, which was removed in
        # NumPy 1.24; the result is converted to uint8 as before.
        difficult = np.array(difficult, dtype=bool).astype(np.uint8)

        img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')
        # NOTE(review): read_image is defined outside this view; color=True
        # presumably yields an RGB image - confirm.
        img = read_image(img_file, color=True)
        return img, bbox, label, difficult

    # Allow ``dataset[i]`` as a shorthand for ``dataset.get_example(i)``.
    __getitem__ = get_example

以上部分就是全部的數據預處理內容，這部分不太難，還是挺好理解的。下面一部分是模型準備部分。

逐字理解目標檢測simple-faster-rcnn-pytorch-master代碼（一）

目錄

第一步跑通代碼

第二步數據預處理

圖像中的Attention代碼（Tensorflow）

Pytorch model.train 與 model.eval的區別（我是搬運工）

YOLACT：Real-time Instance Segmentation總結

牛客網OJ系統Python輸入輸出處理

經典CNN網絡結構

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結

逐字理解目標檢測simple-faster-rcnn-pytorch-master代碼（一）

目錄

第一步 跑通代碼

第二步 數據預處理

第一步跑通代碼

第二步數據預處理