下面介紹最核心的部分:網絡結構和損失函數。尤其是損失函數部分,YOLO的損失函數本身並不難理解,但是代碼中有很多張量運算及相關函數的使用,使得代碼稍顯複雜。
其他相關的部分請見:
YOLO代碼解析(1) 代碼總覽與使用
YOLO代碼解析(2) 數據處理
YOLO代碼解析(3) 模型和損失函數
YOLO代碼解析(4) 訓練和測試代碼
YOLO論文中的網絡結構示意圖如下:
網絡結構相關代碼:yolo_tiny_net.py
這裏的網絡(yolo_tiny)與YOLO論文中的網絡結構稍有不同,不過整體上是一致的。
def inference(self, images):
    """Build the yolo_tiny network graph.

    Args:
        images: 4-D tensor [batch_size, image_height, image_width, channels].

    Returns:
        predicts: 4-D tensor
            [batch_size, cell_size, cell_size, num_classes + 5 * boxes_per_cell].
            Along the last axis the channels are ordered as class
            probabilities, then per-box objectness, then box coordinates.
    """
    # One entry per 3x3, stride-1 convolution, in execution order:
    # (input channels, output channels, followed by a 2x2 stride-2 max-pool?).
    conv_plan = [
        (3, 16, True),
        (16, 32, True),
        (32, 64, True),
        (64, 128, True),
        (128, 256, True),
        (256, 512, True),
        (512, 1024, False),
        (1024, 1024, False),
        (1024, 1024, False),
    ]

    net = images
    for conv_num, (in_channels, out_channels, pooled) in enumerate(conv_plan, 1):
        # Layers are named 'conv1' ... 'conv9'.
        net = self.conv2d('conv' + str(conv_num), net,
                          [3, 3, in_channels, out_channels], stride=1)
        if pooled:
            net = self.max_pool(net, [2, 2], 2)

    net = tf.transpose(net, (0, 3, 1, 2))  # (N, H, W, C) => (N, C, H, W)

    # Three `local` layers (described as fully connected layers in the
    # original notes); the last one is built with leaky=False.
    grid_cells = self.cell_size * self.cell_size
    output_width = grid_cells * (self.num_classes + self.boxes_per_cell * 5)
    local1 = self.local('local1', net, grid_cells * 1024, 256)
    local2 = self.local('local2', local1, 256, 4096)
    local3 = self.local('local3', local2, 4096, output_width,
                        leaky=False, pretrain=False, train=True)

    # local3 is 2-D: one row per image, each row of length
    # cell_size * cell_size * (num_classes + boxes_per_cell * 5),
    # i.e. 7 * 7 * (20 + 5 * 2) in the YOLO paper.
    # The row is cut into three consecutive segments (class probabilities,
    # objectness, coordinates); each segment is reshaped onto the grid on its
    # own, and the three results are then joined along the channel axis.
    class_end = grid_cells * self.num_classes
    scale_end = class_end + grid_cells * self.boxes_per_cell
    grid = (-1, self.cell_size, self.cell_size)

    class_probs = tf.reshape(local3[:, 0:class_end],
                             grid + (self.num_classes,))          # class_prob
    scales = tf.reshape(local3[:, class_end:scale_end],
                        grid + (self.boxes_per_cell,))            # objectness_prob
    boxes = tf.reshape(local3[:, scale_end:],
                       grid + (self.boxes_per_cell * 4,))         # coordinate

    # Result: [N, cell_size, cell_size, num_classes + boxes_per_cell * 5].
    predicts = tf.concat([class_probs, scales, boxes], axis=3)
    return predicts
損失函數相關代碼:yolo_tiny_net.py
def iou(self, boxes1, boxes2):
    """Compute the IoU of every predicted box against one ground-truth box.

    Args:
        boxes1: 4-D tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4]; the last
            axis holds (x_center, y_center, w, h).
        boxes2: 1-D tensor [4] holding (x_center, y_center, w, h).

    Returns:
        iou: 3-D tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL], e.g. 7 x 7 x 2.
    """
    # Convert both inputs from (x_center, y_center, w, h) to corner form
    # (x_min, y_min, x_max, y_max).
    center_x = boxes1[:, :, :, 0]
    center_y = boxes1[:, :, :, 1]
    half_w = boxes1[:, :, :, 2] / 2
    half_h = boxes1[:, :, :, 3] / 2
    # Stacking on axis 3 puts the four corner values on the last axis directly.
    boxes1 = tf.stack([center_x - half_w, center_y - half_h,
                       center_x + half_w, center_y + half_h], axis=3)

    truth_half_w = boxes2[2] / 2
    truth_half_h = boxes2[3] / 2
    boxes2 = tf.stack([boxes2[0] - truth_half_w, boxes2[1] - truth_half_h,
                       boxes2[0] + truth_half_w, boxes2[1] + truth_half_h])

    # Upper-left and lower-right corners of the candidate overlap rectangle.
    upper_left = tf.maximum(boxes1[:, :, :, 0:2], boxes2[0:2])
    lower_right = tf.minimum(boxes1[:, :, :, 2:], boxes2[2:])

    # Width/height of the overlap; non-positive values mean the boxes do not
    # overlap, so those entries are zeroed out by the mask.
    overlap = lower_right - upper_left
    overlap_w = overlap[:, :, :, 0]
    overlap_h = overlap[:, :, :, 1]
    overlaps = tf.cast(overlap_w > 0, tf.float32) * tf.cast(overlap_h > 0, tf.float32)
    inter_square = overlaps * (overlap_w * overlap_h)

    # Individual areas of the predicted boxes and of the ground-truth box.
    square1 = (boxes1[:, :, :, 2] - boxes1[:, :, :, 0]) * (boxes1[:, :, :, 3] - boxes1[:, :, :, 1])
    square2 = (boxes2[2] - boxes2[0]) * (boxes2[3] - boxes2[1])

    # The 1e-6 term keeps the division defined when the union area is zero.
    return inter_square / (square1 + square2 - inter_square + 1e-6)
def cond1(self, num, object_num, loss, predict, label):
    """Loop condition paired with ``body1`` in ``tf.while_loop``.

    ``num`` starts at 0 and objects are processed one at a time; the loop
    keeps running while ``num`` is still below ``object_num``. The remaining
    arguments are loop variables that this condition does not inspect.
    """
    objects_remaining = num < object_num
    return objects_remaining
def body1(self, num, object_num, loss, predict, labels):
    """Add the loss contributed by one ground-truth object of one image.

    This is the loop body paired with ``cond1``; each call handles the object
    stored in row ``num`` of ``labels``.

    Args:
        num: index of the object to process.
        object_num: number of objects in the image (returned unchanged).
        loss: list of four running totals, in the order
            [class_loss, object_loss, noobject_loss, coord_loss].
        predict: 3-D tensor
            [cell_size, cell_size, num_classes + 5 * boxes_per_cell].
        labels: 2-D tensor [max_objects, 5]; each row is
            (x_center, y_center, w, h, class) with coordinates in pixels.

    Returns:
        Tuple (num + 1, object_num, updated loss list, predict, labels).
    """
    cell_pixels = self.image_size / self.cell_size      # side of one grid cell, in pixels
    grid_column = (self.cell_size, self.cell_size, 1)   # shape used to broadcast grid masks

    # Row `num` of labels, flattened to (x_center, y_center, w, h, class).
    label = tf.reshape(labels[num:num + 1, :], [-1])

    # ==1== Mark every grid cell that the ground-truth box overlaps
    #       (used as the mask of the class loss).
    # Box extent expressed in grid-cell units, rounded outwards.
    half_w = label[2] / 2
    half_h = label[3] / 2
    min_x = tf.floor((label[0] - half_w) / cell_pixels)
    max_x = tf.ceil((label[0] + half_w) / cell_pixels)
    min_y = tf.floor((label[1] - half_h) / cell_pixels)
    max_y = tf.ceil((label[1] + half_h) / cell_pixels)
    # A block of ones the size of the covered region ...
    covered_shape = tf.cast(tf.stack([max_y - min_y, max_x - min_x]), dtype=tf.int32)
    # ... zero-padded (top, bottom, left, right) out to the full grid.
    # tf.pad expects paddings of shape [rank, 2]; "CONSTANT" pads with 0.
    covered_pad = tf.cast(
        tf.stack([min_y, self.cell_size - max_y, min_x, self.cell_size - max_x]), tf.int32)
    objects = tf.pad(tf.ones(covered_shape, tf.float32),
                     tf.reshape(covered_pad, (2, 2)), "CONSTANT")

    # ==2== Mark the single cell that contains the box centre, i.e. the
    #       "responsible" cell (used for the confidence and coordinate losses).
    center_x = tf.floor(label[0] / cell_pixels)
    center_y = tf.floor(label[1] / cell_pixels)
    center_pad = tf.cast(
        tf.stack([center_y, self.cell_size - center_y - 1,
                  center_x, self.cell_size - center_x - 1]), tf.int32)
    response = tf.pad(tf.ones([1, 1], tf.float32),
                      tf.reshape(center_pad, (2, 2)), "CONSTANT")
    response_mask = tf.reshape(response, grid_column)

    # ==3== IoU between every predicted box and the ground-truth box.
    # The network emits box centres as offsets inside their cell, scaled by
    # the cell size, and widths/heights scaled by the image size. They are
    # converted to absolute pixel values here so the IoU can be computed; the
    # coordinate loss further down scales the differences back again.
    box_offset = self.num_classes + self.boxes_per_cell
    predict_boxes = tf.reshape(
        predict[:, :, box_offset:],
        [self.cell_size, self.cell_size, self.boxes_per_cell, 4])
    # 1) Undo the normalisation.
    predict_boxes = predict_boxes * [cell_pixels, cell_pixels, self.image_size, self.image_size]
    # 2) Undo the offset: base_boxes holds the pixel position of each cell's
    #    origin as (x, y, 0, 0); x varies along columns and y along rows.
    cell_origins = cell_pixels * np.arange(self.cell_size)
    base_boxes = np.zeros([self.cell_size, self.cell_size, 1, 4])
    base_boxes[:, :, 0, 0] = cell_origins[np.newaxis, :]
    base_boxes[:, :, 0, 1] = cell_origins[:, np.newaxis]
    # Repeat the grid once per predicted box in a cell.
    base_boxes = np.tile(base_boxes, [1, 1, self.boxes_per_cell, 1])
    predict_boxes = base_boxes + predict_boxes

    # Shape (cell_size, cell_size, boxes_per_cell).
    iou_predict_truth = self.iou(predict_boxes, label[0:4])

    # IoU values kept only inside the responsible cell; this serves both as
    # the confidence target and as the basis for choosing the responsible box.
    responsible_iou = iou_predict_truth * response_mask
    iou_target = responsible_iou
    # Largest IoU per cell, shape (cell_size, cell_size, 1).
    best_iou = tf.reduce_max(responsible_iou, 2, keep_dims=True)
    # 1 for the box with the highest IoU inside the responsible cell, 0
    # elsewhere: the paper's "the jth bounding box predictor in cell i is
    # 'responsible' for that prediction".
    best_box = tf.cast((responsible_iou >= best_iou), tf.float32) * response_mask
    # Complement of best_box: the paper's "noobj" indicator.
    other_boxes = tf.ones_like(best_box, dtype=tf.float32) - best_box

    # Predicted objectness (confidence) of every box.
    pred_conf = predict[:, :, self.num_classes:box_offset]

    # ==4== Loss terms.
    # (1) Coordinate targets and predictions. As in the paper, width and
    #     height are compared through their square roots.
    x = label[0]
    y = label[1]
    sqrt_w = tf.sqrt(tf.abs(label[2]))
    sqrt_h = tf.sqrt(tf.abs(label[3]))
    # Each of these is [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]; predicted sizes
    # are clamped to [0, image_size] before the square root.
    p_x = predict_boxes[:, :, :, 0]
    p_y = predict_boxes[:, :, :, 1]
    p_sqrt_w = tf.sqrt(tf.minimum(self.image_size * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 2])))
    p_sqrt_h = tf.sqrt(tf.minimum(self.image_size * 1.0, tf.maximum(0.0, predict_boxes[:, :, :, 3])))

    # (2) Class target (one-hot of the label's class id) and predicted class
    #     probabilities [CELL_SIZE, CELL_SIZE, NUM_CLASSES].
    class_target = tf.one_hot(tf.cast(label[4], tf.int32), self.num_classes, dtype=tf.float32)
    pred_class = predict[:, :, 0:self.num_classes]

    # (3) The individual losses.
    # Class loss: each cell predicts a single set of class probabilities no
    # matter how many boxes it has. Only cells covered by the object count,
    # hence the `objects` mask (masking with `response` instead would restrict
    # it to the centre cell). class_scale weights this term.
    class_loss = tf.nn.l2_loss(
        tf.reshape(objects, grid_column) * (pred_class - class_target)) * self.class_scale

    # Object loss: for the responsible box, the predicted confidence is pushed
    # towards its IoU with the ground truth (the IoU stands in for a 0/1
    # "object present" target).
    object_loss = tf.nn.l2_loss(best_box * (pred_conf - iou_target)) * self.object_scale

    # No-object loss: every other box should predict "no object", so the
    # predicted confidence itself is the error.
    noobject_loss = tf.nn.l2_loss(other_boxes * (pred_conf)) * self.noobject_scale

    # Coordinate loss: centre differences are divided by the cell size in
    # pixels; the two square-root terms are divided by image_size after the
    # l2 loss has been taken.
    coord_loss = (tf.nn.l2_loss(best_box * (p_x - x) / cell_pixels) +
                  tf.nn.l2_loss(best_box * (p_y - y) / cell_pixels) +
                  tf.nn.l2_loss(best_box * (p_sqrt_w - sqrt_w)) / self.image_size +
                  tf.nn.l2_loss(best_box * (p_sqrt_h - sqrt_h)) / self.image_size) * self.coord_scale

    updated_loss = [loss[0] + class_loss,
                    loss[1] + object_loss,
                    loss[2] + noobject_loss,
                    loss[3] + coord_loss]
    return num + 1, object_num, updated_loss, predict, labels
def loss(self, predicts, labels, objects_num):
    """Build the training loss for one batch.

    Args:
        predicts: 4-D tensor
            [batch_size, cell_size, cell_size, num_classes + 5 * boxes_per_cell];
            the last axis is laid out as
            (num_classes, boxes_per_cell, 4 * boxes_per_cell).
        labels: 3-D tensor [batch_size, max_objects, 5].
        objects_num: 1-D tensor [batch_size], number of objects per image.

    Returns:
        Scalar tensor named 'total_loss': the sum of every entry in the
        'losses' collection, which includes the batch-averaged loss added here.
    """
    # The loss has four terms: class, object, no-object and box coordinates.
    # These zero constants seed the per-image accumulation loop.
    class_loss = tf.constant(0, tf.float32)
    object_loss = tf.constant(0, tf.float32)
    noobject_loss = tf.constant(0, tf.float32)
    coord_loss = tf.constant(0, tf.float32)

    # Batch-wide totals, same order as above.
    totals = [0, 0, 0, 0]
    for i in range(self.batch_size):
        predict = predicts[i, :, :, :]   # prediction tensor of image i
        label = labels[i, :, :]
        object_num = objects_num[i]      # number of objects in image i
        # tf.while_loop(cond, body, loop_vars) keeps calling body while cond
        # holds and returns the final loop variables. Here it runs body1 once
        # per object of the image.
        loop_vars = [tf.constant(0), object_num,
                     [class_loss, object_loss, noobject_loss, coord_loss],
                     predict, label]
        tuple_results = tf.while_loop(self.cond1, self.body1, loop_vars)
        # Element 2 of the result is the list of four per-image loss terms.
        totals = [running + term for running, term in zip(totals, tuple_results[2])]

    detection_loss = (totals[0] + totals[1] + totals[2] + totals[3]) / self.batch_size
    tf.add_to_collection('losses', detection_loss)

    # Summaries of the batch-averaged individual terms.
    summary_tags = ('class_loss', 'object_loss', 'noobject_loss', 'coord_loss')
    for tag, term in zip(summary_tags, totals):
        tf.summary.scalar(tag, term / self.batch_size)
    # Whatever else is in the 'losses' collection besides the term added above
    # (presumably weight regularisation registered elsewhere; confirm).
    tf.summary.scalar('weight_loss', tf.add_n(tf.get_collection('losses')) - detection_loss)

    return tf.add_n(tf.get_collection('losses'), name='total_loss')