如何使用COCO數據集
COCO數據集可以說是語義分割等計算機視覺任務中應用較爲廣泛的一個數據集,具體可以應用到物體識別、語義分割及目標檢測等方面。我是在做語義分割方面任務時用到了COCO數據集,但本文主要講解的是數據載入方面,因此可以通用。
一、下載COCO數據集
首先,我們要下載COCO數據集,本文主要使用的是COCO2014和COCO2017。因爲是國外數據集,直接從官網下載可能需要翻牆。
MSCOCO數據集的官網爲:http://cocodataset.org/(舊域名爲 http://mscoco.org/)
具體來說,如果想只下載COCO2017/COCO2014的話,可以不需要翻牆下載,複製以下鏈接打開迅雷等下載軟件下載即可,網速還可以。
COCO2017 訓練集圖片:http://images.cocodataset.org/zips/train2017.zip
COCO2017 訓練/驗證集標註:http://images.cocodataset.org/annotations/annotations_trainval2017.zip
COCO2017 驗證集圖片:http://images.cocodataset.org/zips/val2017.zip
COCO2017 訓練/驗證集 stuff 標註:http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip
COCO2017 測試集圖片:http://images.cocodataset.org/zips/test2017.zip
COCO2017 測試集圖片信息:http://images.cocodataset.org/annotations/image_info_test2017.zip
COCO2014的相關數據只需要將以上鏈接中的2017改成2014即可。
二、COCO數據集介紹
網上關於COCO數據集的介紹多如牛毛,本文就不過多地加以介紹了,只簡要地介紹一下。
以COCO2014爲例:
下載並解壓COCO2014後,目錄如下:
- images
  - train2014
  - val2014
  - test2014
- annotations
其中,images中的文件夾各自放置了訓練、驗證和測試的數據集圖片。annotations文件夾中放置了標籤文件,可以理解爲Label,簡要的來說,就是包含了某一類在圖片中的具體位置的信息,詳細可見以下鏈接:https://blog.csdn.net/happyhorizion/article/details/77894205#semantic-scene-labeling圖像分割
三、COCO數據集使用(數據載入)
所需環境爲:
- numpy
- torch
- tqdm(可視化數據載入)
- os
- pycocotools(coco數據集的應用API)
- torchvision
- PIL
如何安裝pycocotools
相信能用到COCO數據集做語義分割等任務的大佬們應該都能安裝以上絕大多數庫,這裏主要講一下如何安裝pycocotools庫。作者在安裝這個庫的時候遇到了一些問題,不過及時的解決了。
步驟如下:
- 首先下載cocoapi,在終端輸入
git clone git@github.com:lucky-ing/cocoapi.git
- 此時可以看到一個叫cocoapi的文件夾(git clone 會以倉庫名創建目錄),進入cocoapi/PythonAPI中,懶人操作如下:
cd cocoapi/PythonAPI
- 開始安裝,在終端輸入以下命令
如果使用的是python2:
python setup.py build_ext install
如果使用的是python3
python3 setup.py build_ext install
- 如果一切順利,安裝完成,即可進入下一章節具體使用,作者在安裝時遇到了以下問題。
error: command 'C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64\cl.exe' failed with exit status 2
解決方法很簡單,在終端安裝cython即可,在終端輸入:
conda install cython
若是沒有使用conda,在終端輸入
pip install cython
COCO數據集的載入
- dataloader
import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import trange
import os
from pycocotools.coco import COCO
from pycocotools import mask
from torchvision import transforms
import custom_transforms as tr
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
class COCOSegmentation(Dataset):
    """COCO instance annotations exposed as a semantic-segmentation dataset.

    Only the COCO categories listed in ``CAT_LIST`` are kept. Every pixel of
    the generated label mask holds the *position* of its category inside
    ``CAT_LIST`` (0 for pixels no kept annotation covers).

    Expected layout under ``base_dir``::

        annotations/instances_<split><year>.json
        annotations/<split>_ids_<year>.pth      (cache, created on first use)
        images/<split><year>/<file_name>

    Args:
        args: object providing ``base_size`` and ``crop_size`` attributes,
            which are read by the train/val transforms.
        base_dir: root directory of the dataset.
        split: ``'train'`` or ``'val'``; selects the annotation file, the
            image folder and the transform pipeline.
        year: dataset year used in the file and folder names.
    """

    NUM_CLASSES = 21
    # COCO category ids that are kept. The index of an id in this list is the
    # class value written into the label mask.
    CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4,
                1, 64, 20, 63, 7, 72]

    def __init__(self,
                 args,
                 base_dir='./Path/COCO/',
                 split='train',
                 year='2014'):
        super().__init__()
        ann_file = os.path.join(base_dir, 'annotations/instances_{}{}.json'.format(split, year))
        ids_file = os.path.join(base_dir, 'annotations/{}_ids_{}.pth'.format(split, year))
        self.img_dir = os.path.join(base_dir, 'images/{}{}'.format(split, year))
        self.split = split
        self.coco = COCO(ann_file)
        self.coco_mask = mask
        if os.path.exists(ids_file):
            # Reuse the filtered id list produced by an earlier run.
            self.ids = torch.load(ids_file)
        else:
            ids = list(self.coco.imgs.keys())
            self.ids = self._preprocess(ids, ids_file)
        self.args = args

    def __getitem__(self, index):
        """Return the transformed ``{'image', 'label'}`` sample at ``index``.

        Raises:
            ValueError: if ``self.split`` is neither ``'train'`` nor ``'val'``
                (previously this silently returned ``None``).
        """
        if self.split == 'train':
            transform = self.transform_tr
        elif self.split == 'val':
            transform = self.transform_val
        else:
            raise ValueError(
                "unsupported split {!r}; expected 'train' or 'val'".format(self.split))
        _img, _target = self._make_img_gt_point_pair(index)
        sample = {'image': _img, 'label': _target}
        return transform(sample)

    def _make_img_gt_point_pair(self, index):
        """Load the RGB image and its label mask (both PIL images) for ``index``."""
        coco = self.coco
        img_id = self.ids[index]
        img_metadata = coco.loadImgs(img_id)[0]
        path = img_metadata['file_name']
        _img = Image.open(os.path.join(self.img_dir, path)).convert('RGB')
        cocotarget = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
        _target = Image.fromarray(self._gen_seg_mask(
            cocotarget, img_metadata['height'], img_metadata['width']))
        return _img, _target

    def _preprocess(self, ids, ids_file):
        """Keep the image ids whose mask has more than 1000 labelled pixels.

        The filtered list is saved to ``ids_file`` with ``torch.save`` so the
        scan is skipped on later runs, and is also returned.
        """
        print("Preprocessing mask, this will take a while. "
              "But don't worry, it only run once for each split.")
        tbar = trange(len(ids))
        new_ids = []
        for i in tbar:
            img_id = ids[i]
            cocotarget = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
            img_metadata = self.coco.loadImgs(img_id)[0]
            seg = self._gen_seg_mask(cocotarget, img_metadata['height'],
                                     img_metadata['width'])
            # more than 1k pixels
            if (seg > 0).sum() > 1000:
                new_ids.append(img_id)
            tbar.set_description('Doing: {}/{}, got {} qualified images'.format(
                i, len(ids), len(new_ids)))
        print('Found number of qualified images: ', len(new_ids))
        torch.save(new_ids, ids_file)
        return new_ids

    def _gen_seg_mask(self, target, h, w):
        """Rasterise annotations into an ``(h, w)`` uint8 class-index mask.

        Args:
            target: iterable of annotation dicts with ``'segmentation'`` and
                ``'category_id'`` keys.
            h: mask height in pixels.
            w: mask width in pixels.

        Returns:
            ``np.uint8`` array of shape ``(h, w)``. Annotations are applied in
            order and only write to pixels that are still 0, so earlier
            annotations win where instances overlap.
        """
        seg = np.zeros((h, w), dtype=np.uint8)
        coco_mask = self.coco_mask
        for instance in target:
            cat = instance['category_id']
            # Skip unwanted categories before paying for the decode.
            if cat not in self.CAT_LIST:
                continue
            c = self.CAT_LIST.index(cat)
            rle = coco_mask.frPyObjects(instance['segmentation'], h, w)
            m = coco_mask.decode(rle)
            if len(m.shape) < 3:
                seg[:, :] += (seg == 0) * (m * c)
            else:
                # Decoded mask has a third axis: collapse it with a logical OR.
                seg[:, :] += (seg == 0) * (((np.sum(m, axis=2)) > 0) * c).astype(np.uint8)
        return seg

    def transform_tr(self, sample):
        """Apply the training pipeline: flip, scale-crop, blur, normalise, to tensor."""
        composed_transforms = transforms.Compose([
            tr.RandomHorizontalFlip(),
            tr.RandomScaleCrop(base_size=self.args.base_size, crop_size=self.args.crop_size),
            tr.RandomGaussianBlur(),
            tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
            tr.ToTensor()])
        return composed_transforms(sample)

    def transform_val(self, sample):
        """Apply the validation pipeline: fixed scale + centre crop, normalise, to tensor."""
        composed_transforms = transforms.Compose([
            tr.FixScaleCrop(crop_size=self.args.crop_size),
            tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
            tr.ToTensor()])
        return composed_transforms(sample)

    def __len__(self):
        """Number of images that passed the preprocessing filter."""
        return len(self.ids)
if __name__ == "__main__":
    # Manual visual check: load two shuffled validation batches and plot each
    # image above its decoded segmentation map.
    #
    # `tr` and `transforms` are already imported at module level, so they are
    # not re-imported here.
    # NOTE: decode_segmap is not defined in this file or in
    # custom_transforms.py; it has to be importable from dataloaders.utils.
    from dataloaders.utils import decode_segmap
    from torch.utils.data import DataLoader
    import matplotlib.pyplot as plt
    import argparse

    parser = argparse.ArgumentParser()
    args = parser.parse_args()
    args.base_size = 513
    args.crop_size = 513

    coco_val = COCOSegmentation(args, split='val', year='2017')
    dataloader = DataLoader(coco_val, batch_size=4, shuffle=True, num_workers=0)

    for ii, sample in enumerate(dataloader):
        # Convert the whole batch once instead of once per image.
        img = sample['image'].numpy()
        gt = sample['label'].numpy()
        for jj in range(img.shape[0]):
            tmp = np.array(gt[jj]).astype(np.uint8)
            segmap = decode_segmap(tmp, dataset='coco')
            # CHW -> HWC, then undo the normalisation applied by the
            # validation transform (multiply by std, add mean, scale to 0-255).
            img_tmp = np.transpose(img[jj], axes=[1, 2, 0])
            img_tmp *= (0.229, 0.224, 0.225)
            img_tmp += (0.485, 0.456, 0.406)
            img_tmp *= 255.0
            img_tmp = img_tmp.astype(np.uint8)
            plt.figure()
            plt.title('display')
            plt.subplot(211)
            plt.imshow(img_tmp)
            plt.subplot(212)
            plt.imshow(segmap)

        # Stop after the second batch.
        if ii == 1:
            break

    plt.show(block=True)
上面的 `if __name__ == "__main__":` 部分(main函數)僅爲測試使用。
- custom_transforms.py 是數據增廣的代碼
import torch
import random
import numpy as np
from PIL import Image, ImageOps, ImageFilter
class Normalize(object):
    """Divide an image by 255, then standardise it with ``mean`` and ``std``.

    Args:
        mean (tuple): value subtracted from each channel.
        std (tuple): value each channel is divided by.

    Calling the instance on a sample dict returns a new dict whose
    ``'image'`` and ``'label'`` entries are float32 numpy arrays; the label
    is only converted, not rescaled.
    """

    def __init__(self, mean=(0., 0., 0.), std=(1., 1., 1.)):
        self.mean, self.std = mean, std

    def __call__(self, sample):
        image = np.array(sample['image']).astype(np.float32)
        label = np.array(sample['label']).astype(np.float32)
        # In-place operators keep the array float32 even though mean/std are
        # plain Python tuples.
        image /= 255.0
        image -= self.mean
        image /= self.std
        return dict(image=image, label=label)
class Normalize_test(object):
    """Label-free variant of the normalisation transform.

    Args:
        mean (tuple): value subtracted from each channel.
        std (tuple): value each channel is divided by.

    Only ``sample['image']`` is read; the returned dict contains just the
    normalised float32 ``'image'`` array.
    """

    def __init__(self, mean=(0., 0., 0.), std=(1., 1., 1.)):
        self.mean, self.std = mean, std

    def __call__(self, sample):
        image = np.array(sample['image']).astype(np.float32)
        # In-place operators keep the array float32.
        image /= 255.0
        image -= self.mean
        image /= self.std
        return dict(image=image)
class ToTensor(object):
    """Turn the image and label of a sample into float torch tensors."""

    def __call__(self, sample):
        # numpy images are H x W x C, torch expects C x H x W, so the channel
        # axis is moved to the front before wrapping the array.
        hwc = np.array(sample['image']).astype(np.float32)
        label = np.array(sample['label']).astype(np.float32)
        chw = hwc.transpose((2, 0, 1))
        return {
            'image': torch.from_numpy(chw).float(),
            'label': torch.from_numpy(label).float(),
        }
class ToTensor_test(object):
    """Label-free variant: turn only the image of a sample into a float tensor."""

    def __call__(self, sample):
        # numpy images are H x W x C, torch expects C x H x W.
        hwc = np.array(sample['image']).astype(np.float32)
        chw = hwc.transpose((2, 0, 1))
        return {'image': torch.from_numpy(chw).float()}
class RandomHorizontalFlip(object):
    """Mirror image and label left-to-right together when a random draw is below 0.5."""

    def __call__(self, sample):
        img, mask = sample['image'], sample['label']
        # A single draw decides for both, so image and label stay aligned.
        should_flip = random.random() < 0.5
        if should_flip:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
            mask = mask.transpose(Image.FLIP_LEFT_RIGHT)
        return dict(image=img, label=mask)
class RandomRotate(object):
    """Rotate image and label by one shared random angle.

    Args:
        degree: the angle is drawn uniformly from ``[-degree, degree]``.
    """

    def __init__(self, degree):
        self.degree = degree

    def __call__(self, sample):
        angle = random.uniform(-1 * self.degree, self.degree)
        # Bilinear resampling for the image, nearest for the label so that no
        # new label values are interpolated.
        rotated_img = sample['image'].rotate(angle, Image.BILINEAR)
        rotated_mask = sample['label'].rotate(angle, Image.NEAREST)
        return dict(image=rotated_img, label=rotated_mask)
class RandomGaussianBlur(object):
    """Blur the image when a random draw is below 0.5; the label is passed through."""

    def __call__(self, sample):
        img, mask = sample['image'], sample['label']
        should_blur = random.random() < 0.5
        if should_blur:
            # The blur radius is a second random draw from [0, 1).
            blur = ImageFilter.GaussianBlur(radius=random.random())
            img = img.filter(blur)
        return dict(image=img, label=mask)
class RandomScaleCrop(object):
    """Randomly rescale image and label, pad if needed, then take a random square crop.

    Args:
        base_size: the short edge is resized to a random integer between
            ``0.5 * base_size`` and ``2.0 * base_size``.
        crop_size: edge length of the square crop that is returned.
        fill: value used to pad the label; the image is always padded with 0.
    """

    def __init__(self, base_size, crop_size, fill=0):
        self.base_size, self.crop_size, self.fill = base_size, crop_size, fill

    def __call__(self, sample):
        img, mask = sample['image'], sample['label']
        crop = self.crop_size

        # Random target length for the short edge; the long edge follows the
        # aspect ratio.
        short_size = random.randint(int(self.base_size * 0.5), int(self.base_size * 2.0))
        w, h = img.size
        if h > w:
            ow = short_size
            oh = int(1.0 * h * ow / w)
        else:
            oh = short_size
            ow = int(1.0 * w * oh / h)
        new_size = (ow, oh)
        img = img.resize(new_size, Image.BILINEAR)
        mask = mask.resize(new_size, Image.NEAREST)

        # Pad on the right/bottom when the short edge is below the crop size.
        if short_size < crop:
            padh = max(crop - oh, 0)
            padw = max(crop - ow, 0)
            border = (0, 0, padw, padh)
            img = ImageOps.expand(img, border=border, fill=0)
            mask = ImageOps.expand(mask, border=border, fill=self.fill)

        # Random crop window, identical for image and label.
        w, h = img.size
        x1 = random.randint(0, w - crop)
        y1 = random.randint(0, h - crop)
        box = (x1, y1, x1 + crop, y1 + crop)
        return dict(image=img.crop(box), label=mask.crop(box))
class FixScaleCrop(object):
    """Resize the short edge to ``crop_size``, then centre-crop a square.

    Args:
        crop_size: target short-edge length and edge length of the crop.
    """

    def __init__(self, crop_size):
        self.crop_size = crop_size

    def __call__(self, sample):
        img, mask = sample['image'], sample['label']
        crop = self.crop_size

        # Scale so the shorter side equals crop_size, keeping the aspect ratio.
        w, h = img.size
        if w > h:
            oh = crop
            ow = int(1.0 * w * oh / h)
        else:
            ow = crop
            oh = int(1.0 * h * ow / w)
        new_size = (ow, oh)
        img = img.resize(new_size, Image.BILINEAR)
        mask = mask.resize(new_size, Image.NEAREST)

        # Centre crop, identical window for image and label.
        w, h = img.size
        x1 = int(round((w - crop) / 2.))
        y1 = int(round((h - crop) / 2.))
        box = (x1, y1, x1 + crop, y1 + crop)
        return dict(image=img.crop(box), label=mask.crop(box))
class FixedResize(object):
    """Resize image and label to a fixed square size.

    Args:
        size: edge length of the output; both image and label are resized to
            ``size`` x ``size``. (The original constructor took no argument
            but read an undefined ``size``, so it always raised ``NameError``.)
    """

    def __init__(self, size):
        # Square target, so the (width, height) order does not matter.
        self.size = (size, size)

    def __call__(self, sample):
        img = sample['image']
        mask = sample['label']
        assert img.size == mask.size
        # Bilinear for the image, nearest for the label so that label values
        # are never interpolated.
        img = img.resize(self.size, Image.BILINEAR)
        mask = mask.resize(self.size, Image.NEAREST)
        return {'image': img,
                'label': mask}
class FixedResize_test(object):
    """Label-free resize step that resamples the image at its own size.

    No target size is stored; the image is resized to its current
    ``(width, height)`` with bilinear resampling and returned on its own.
    """

    def __init__(self):
        super().__init__()

    def __call__(self, sample):
        img = sample['image']
        width, height = img.size
        resized = img.resize((width, height), Image.BILINEAR)
        return dict(image=resized)
將以上兩個文件加入到你的代碼中,就完成了COCO數據集的載入啦~