起初Faster R-CNN只採用最後一層特徵圖作爲rpn以及head部分的特徵圖,後來不斷改進,有了FPN,再後來有了Panet。一般來說網絡的層數越深它的語義信息越豐富,但是隨着網絡層數的加深,檢測所需的位置信息就會越差。CNN分類網絡只需要知道一張圖像的種類即可,所以很多時候網絡越深效果越好,但並不是分類效果越好的網絡就越適合檢測。FPN如下圖所示,它用了不同大小的特徵圖進行預測,圖中:下方的特徵圖較大,對應的感受野較小,可以用來檢測小目標;上部分的特徵圖尺寸較小,但是感受野較大,適合檢測大目標。
Panet是對FPN的改進,如下圖紅線所示:在FPN中頂層與底層信息距離太遠——不要看紅線中間只有三四個框,這只是一個示意,其中有很多卷積操作,所以頂層與底層距離很遠——因此在右側開闢了一條新的路徑(綠線),只需幾個卷積層,頂層信息就能快速與底層信息匯合。特徵提取對結果的影響特別大,融合不同尺度的信息十分必要,M2det在SSD的基礎上增加了部分網絡來優化特徵提取,得到的效果就比SSD好得多。
代碼是基於resnet50的特徵提取部分實現的,上一張resnet50的網絡結構
具體實現如下,其中FPN參考mmdetection中fpn的結構,Panet部分是自己寫的,如有不對請告知。
其中圖像經過下採樣再進行上採樣的時候有可能大小不同(某一層圖像下採樣前的尺寸爲單數)。可以提前計算圖像大小,對輸入圖像直接padding;我用的是在上採樣時直接指定目標尺寸。
import torch
from torch import nn
import torch.nn.functional as F
class Bottleneck(nn.Module):
    """ResNet bottleneck residual block: 1x1 reduce -> 3x3 -> 1x1 expand (x4).

    When ``is_down`` is True the identity branch is replaced by a strided
    1x1 conv + BN projection so its shape matches the main path; otherwise
    the input is added directly (the caller must ensure matching shapes).
    """

    # Channel expansion factor of the final 1x1 conv.
    expansion = 4

    def __init__(self, in_size, size_u, stride=1, is_down=False):
        """
        Args:
            in_size: number of input channels.
            size_u: bottleneck width; the block outputs ``size_u * expansion``
                channels.
            stride: spatial stride, applied at the first 1x1 conv
                (original ResNet v1 layout, as in the source).
            is_down: whether to project the identity with a 1x1 conv shortcut.
        """
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_size, size_u, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(size_u)
        self.conv2 = nn.Conv2d(size_u, size_u, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(size_u)
        self.conv3 = nn.Conv2d(size_u, size_u * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(size_u * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        # Fix: only build the projection shortcut when it is actually used.
        # The original constructed (and kaiming-initialized) these parameters
        # for every block, even identity blocks that never touch them, which
        # wastes memory and pollutes the state_dict with dead weights.
        self.downsample = None
        if is_down:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_size, size_u * self.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(size_u * self.expansion))
        self.stride = stride
        self.is_down = is_down

    def forward(self, x):
        identity = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        if self.is_down:
            identity = self.downsample(x)
        out += identity
        return self.relu(out)
class Resnt50(nn.Module):
    """ResNet-50 backbone (feature-extraction part only).

    Returns the outputs of layer1..layer4 (the C2..C5 feature maps) for
    consumption by an FPN-style neck; the classifier head is omitted.
    Stage layout follows ResNet-50: [3, 4, 6, 3] bottleneck blocks with
    widths 64/128/256/512 (x4 expansion -> 256/512/1024/2048 channels).
    """

    def __init__(self):
        super(Resnt50, self).__init__()
        # Stem: 7x7 stride-2 conv + BN + ReLU + 3x3 stride-2 max-pool
        # (overall stride 4 before layer1).
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Build each stage from one projecting first block followed by
        # identity blocks, instead of spelling every block out by hand
        # (same modules as the original, just generated).
        self.layer1 = self._make_layer(64, 64, blocks=3, stride=1)
        self.layer2 = self._make_layer(256, 128, blocks=4, stride=2)
        self.layer3 = self._make_layer(512, 256, blocks=6, stride=2)
        self.layer4 = self._make_layer(1024, 512, blocks=3, stride=2)
        # Standard init: kaiming for convs, unit-gamma/zero-beta for BN.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    @staticmethod
    def _make_layer(in_size, size_u, blocks, stride):
        """One ResNet stage: a projecting first block, then identity blocks."""
        layers = [Bottleneck(in_size, size_u, stride, True)]
        out_size = size_u * Bottleneck.expansion
        layers.extend(Bottleneck(out_size, size_u, 1, False) for _ in range(blocks - 1))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the (C2, C3, C4, C5) feature maps at strides 4/8/16/32."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        c2 = self.layer1(x)
        c3 = self.layer2(c2)
        c4 = self.layer3(c3)
        c5 = self.layer4(c4)
        return c2, c3, c4, c5
class FPN(nn.Module):
    """Feature Pyramid Network neck on top of the ResNet-50 backbone.

    1x1 lateral convs project C2..C5 to 256 channels, the top-down pathway
    upsamples (nearest) and sums, and each merged map is smoothed by a 3x3
    conv.  Per the FPN design (and mmdetection, which this follows), the
    smoothing conv is independent PER LEVEL — the original code shared a
    single conv across all four levels, which unintentionally tied their
    weights; fixed here with a ModuleList.
    """

    def __init__(self):
        super(FPN, self).__init__()
        self.resnet_feature = Resnt50()
        # Lateral 1x1 convs: 2048/1024/512/256 -> 256 channels.
        self.conv1 = nn.Conv2d(in_channels=2048, out_channels=256, kernel_size=1, stride=1, padding=0)
        self.conv2 = nn.Conv2d(1024, 256, 1, 1, 0)
        self.conv3 = nn.Conv2d(512, 256, 1, 1, 0)
        self.conv4 = nn.Conv2d(256, 256, 1, 1, 0)
        # Fix: one 3x3 smoothing conv per pyramid level (P2..P5), not shared.
        self.fpn_convs = nn.ModuleList(nn.Conv2d(256, 256, 3, 1, 1) for _ in range(4))

    def forward(self, x):
        """Return the pyramid (P2, P3, P4, P5), all with 256 channels."""
        layer1, layer2, layer3, layer4 = self.resnet_feature(x)  # channels 256/512/1024/2048
        P5 = self.conv1(layer4)
        P4_ = self.conv2(layer3)
        P3_ = self.conv3(layer2)
        P2_ = self.conv4(layer1)
        # Upsample to the exact size of the lateral map: robust to odd input
        # sizes where a plain scale_factor=2 upsample would mismatch.
        P4 = P4_ + F.interpolate(P5, size=P4_.shape[2:], mode='nearest')
        P3 = P3_ + F.interpolate(P4, size=P3_.shape[2:], mode='nearest')
        P2 = P2_ + F.interpolate(P3, size=P2_.shape[2:], mode='nearest')
        P2 = self.fpn_convs[0](P2)
        P3 = self.fpn_convs[1](P3)
        P4 = self.fpn_convs[2](P4)
        P5 = self.fpn_convs[3](P5)
        return P2, P3, P4, P5
class Panet(nn.Module):
    """PANet-style bottom-up path augmentation on top of the FPN outputs.

    N2 = P2, then each higher level is N_{i+1} = P_{i+1} + ReLU(conv3x3_s2(N_i)),
    so low-level localization information reaches the top in a few convs.

    Args:
        class_number: unused in this module; kept only for backward
            compatibility with existing callers.
    """

    def __init__(self, class_number=512):
        super(Panet, self).__init__()
        self.fpn = FPN()
        # Fix: the original reused ONE stride-2 conv for every stage of the
        # bottom-up path, tying their weights; PANet uses a separate
        # downsampling conv per stage (N2->N3, N3->N4, N4->N5).
        self.convN = nn.ModuleList(nn.Conv2d(256, 256, 3, 2, 1) for _ in range(3))
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return the augmented pyramid (N2, N3, N4, N5)."""
        P2, P3, P4, P5 = self.fpn(x)
        outs = [P2]  # N2 is P2 unchanged
        for conv, lateral in zip(self.convN, (P3, P4, P5)):
            outs.append(lateral + self.relu(conv(outs[-1])))
        return tuple(outs)
if __name__ == '__main__':
    # Smoke test: print a layer-by-layer summary of the FPN on a 512x512 input.
    from torchsummary import summary

    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = FPN().to(dev)
    summary(net, (3, 512, 512))
----------------------------------------------------------------------------------end---------------------------------------------------------------------------------