COCO數據集的下載、介紹及如何使用(數據載入及數據增廣,含代碼)

如何使用COCO數據集
COCO數據集可以說是語義分割等計算機視覺任務中應用較爲廣泛的一個數據集,具體可以應用到物體識別、語義分割及目標檢測等方面。我是在做語義分割方面任務時用到了COCO數據集,但本文主要講解的是數據載入方面,因此可以通用。

一、下載COCO數據集

首先,我們要下載COCO數據集,本文主要使用的是COCO2014和COCO2017,因爲是國外數據集,因此下載需要翻牆下載。
MSCOCO數據集的官網爲:https://cocodataset.org/(原文給出的舊地址爲 http://mscoco.org/,下方的下載鏈接均使用 cocodataset.org 域名)
具體來說,如果想只下載COCO2017/COCO2014的話,可以不需要翻牆下載,複製以下鏈接打開迅雷等下載軟件下載即可,網速還可以。
COCO2017 訓練數據:http://images.cocodataset.org/zips/train2017.zip
http://images.cocodataset.org/annotations/annotations_trainval2017.zip
COCO2017驗證數據:http://images.cocodataset.org/zips/val2017.zip
http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip
COCO2017測試數據集:http://images.cocodataset.org/zips/test2017.zip
http://images.cocodataset.org/annotations/image_info_test2017.zip

COCO2014的相關數據只需要將以上鏈接中的2017改成2014即可(也就是把年份末位的7改成4)。

二、COCO數據集介紹

網上關於COCO數據集的介紹多如牛毛,本文就不過多地加以介紹了,只簡要地介紹一下。
以COCO2014爲例:
下載完COCO2014後進行解壓後,目錄如下:

三、COCO數據集使用(數據載入)

所需環境爲:

  1. numpy
  2. torch
  3. tqdm(可視化數據載入)
  4. os
  5. pycocotools(coco數據集的應用API)
  6. torchvision
  7. PIL

如何安裝pycocotools

相信能用到COCO數據集做語義分割等任務的大佬們應該都能安裝以上絕大多數庫,這裏主要講一下如何安裝pycocotools庫。作者在安裝這個庫的時候遇到了一些問題,不過及時的解決了。
步驟如下:

  1. 首先下載cocoapi,在終端輸入
git clone git@github.com:lucky-ing/cocoapi.git
  1. 此時可以看到一個叫cocoapi的文件夾(git clone 默認以倉庫名作爲目錄名),進入cocoapi/PythonAPI中,懶人操作如下:
cd cocoapi/PythonAPI
  1. 開始安裝,在終端輸入以下命令
    如果使用的是python2:
python setup.py build_ext install

如果使用的是python3

python3 setup.py build_ext install
  1. 如果一切順利,安裝完成,即可進入下一章節具體使用,作者在安裝時遇到了以下問題。
error: command 'C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\VC\Tools\MSVC\14.16.27023\bin\HostX86\x64\cl.exe' failed with exit status 2

解決方法很簡單,在終端安裝cython即可,在終端輸入:

conda install cython

若是沒有使用conda,在終端輸入

pip install cython

COCO數據集的載入

  1. dataloader
import numpy as np
import torch
from torch.utils.data import Dataset
from tqdm import trange
import os
from pycocotools.coco import COCO
from pycocotools import mask
from torchvision import transforms
import custom_transforms as tr
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True


class COCOSegmentation(Dataset):
    """COCO segmentation dataset restricted to the categories in ``CAT_LIST``.

    Each item is the dict ``{'image': ..., 'label': ...}`` obtained by passing
    the RGB image and its per-pixel class mask through the transform pipeline
    of the current split (``transform_tr`` for ``'train'``, ``transform_val``
    for ``'val'``).

    Args:
        args: object exposing ``base_size`` and ``crop_size``; both are read by
            the transform pipelines.
        base_dir: dataset root; ``annotations/`` and ``images/`` are looked up
            beneath it.
        split: ``'train'`` or ``'val'``.
        year: year suffix used in annotation and image directory names.
    """
    NUM_CLASSES = 21
    # COCO category ids that are kept. The position of an id in this list is
    # the class index written into the mask; pixels never written stay 0.
    CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4,
                1, 64, 20, 63, 7, 72]

    def __init__(self,
                 args,
                 base_dir='./Path/COCO/',
                 split='train',
                 year='2014'):
        super().__init__()
        ann_file = os.path.join(base_dir, 'annotations/instances_{}{}.json'.format(split, year))
        ids_file = os.path.join(base_dir, 'annotations/{}_ids_{}.pth'.format(split, year))
        self.img_dir = os.path.join(base_dir, 'images/{}{}'.format(split, year))
        self.split = split
        self.coco = COCO(ann_file)
        self.coco_mask = mask
        # The filtered id list is cached next to the annotations so that the
        # slow filtering pass in _preprocess only happens once per split.
        if os.path.exists(ids_file):
            self.ids = torch.load(ids_file)
        else:
            ids = list(self.coco.imgs.keys())
            self.ids = self._preprocess(ids, ids_file)
        self.args = args

    def __getitem__(self, index):
        """Return the transformed ``{'image', 'label'}`` sample at ``index``.

        Raises:
            ValueError: if ``self.split`` is neither ``'train'`` nor ``'val'``
                (previously this case silently returned ``None``).
        """
        _img, _target = self._make_img_gt_point_pair(index)
        sample = {'image': _img, 'label': _target}

        if self.split == "train":
            return self.transform_tr(sample)
        if self.split == 'val':
            return self.transform_val(sample)
        raise ValueError(
            "unsupported split {!r}: expected 'train' or 'val'".format(self.split))

    def _make_img_gt_point_pair(self, index):
        """Load the RGB image and build its class mask as a pair of PIL images."""
        coco = self.coco
        img_id = self.ids[index]
        img_metadata = coco.loadImgs(img_id)[0]
        path = img_metadata['file_name']
        _img = Image.open(os.path.join(self.img_dir, path)).convert('RGB')
        cocotarget = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
        _target = Image.fromarray(self._gen_seg_mask(
            cocotarget, img_metadata['height'], img_metadata['width']))

        return _img, _target

    def _preprocess(self, ids, ids_file):
        """Keep image ids whose mask has more than 1000 labelled pixels.

        The surviving ids are saved to ``ids_file`` with ``torch.save`` and
        returned.
        """
        print("Preprocessing mask, this will take a while. "
              "But don't worry, it only run once for each split.")
        tbar = trange(len(ids))
        new_ids = []
        for i in tbar:
            img_id = ids[i]
            cocotarget = self.coco.loadAnns(self.coco.getAnnIds(imgIds=img_id))
            img_metadata = self.coco.loadImgs(img_id)[0]
            # Named seg_mask so it does not shadow the pycocotools ``mask`` module.
            seg_mask = self._gen_seg_mask(cocotarget, img_metadata['height'],
                                          img_metadata['width'])
            # more than 1k pixels
            if (seg_mask > 0).sum() > 1000:
                new_ids.append(img_id)
            tbar.set_description('Doing: {}/{}, got {} qualified images'.format(
                i, len(ids), len(new_ids)))
        print('Found number of qualified images: ', len(new_ids))
        torch.save(new_ids, ids_file)
        return new_ids

    def _gen_seg_mask(self, target, h, w):
        """Rasterise annotations into an ``(h, w)`` uint8 class-index mask.

        Args:
            target: iterable of annotation dicts with ``'category_id'`` and
                ``'segmentation'`` entries.
            h: mask height in pixels.
            w: mask width in pixels.

        Returns:
            ``numpy.ndarray`` of shape ``(h, w)`` and dtype ``uint8``. A pixel
            is only written while it is still 0, so where instances overlap the
            first one processed wins.
        """
        mask = np.zeros((h, w), dtype=np.uint8)
        coco_mask = self.coco_mask
        for instance in target:
            cat = instance['category_id']
            # Check the category first so unused instances are never decoded.
            if cat not in self.CAT_LIST:
                continue
            c = self.CAT_LIST.index(cat)
            rle = coco_mask.frPyObjects(instance['segmentation'], h, w)
            m = coco_mask.decode(rle)
            if len(m.shape) < 3:
                mask[:, :] += (mask == 0) * (m * c)
            else:
                # 3-D result (presumably one channel per part): merge the
                # channels into a single binary footprint before labelling.
                mask[:, :] += (mask == 0) * (((np.sum(m, axis=2)) > 0) * c).astype(np.uint8)
        return mask

    def transform_tr(self, sample):
        """Apply the training pipeline: flip, scale-crop, blur, normalise, to tensor."""
        composed_transforms = transforms.Compose([
            tr.RandomHorizontalFlip(),
            tr.RandomScaleCrop(base_size=self.args.base_size, crop_size=self.args.crop_size),
            tr.RandomGaussianBlur(),
            tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
            tr.ToTensor()])

        return composed_transforms(sample)

    def transform_val(self, sample):
        """Apply the validation pipeline: fixed scale-crop, normalise, to tensor."""
        composed_transforms = transforms.Compose([
            tr.FixScaleCrop(crop_size=self.args.crop_size),
            tr.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
            tr.ToTensor()])

        return composed_transforms(sample)

    def __len__(self):
        """Number of images that survived the preprocessing filter."""
        return len(self.ids)



if __name__ == "__main__":
    # Visual smoke test: draw a couple of validation batches and show each
    # image above its colour-coded segmentation map.
    from dataloaders import custom_transforms as tr
    from dataloaders.utils import decode_segmap
    from torch.utils.data import DataLoader
    from torchvision import transforms
    import matplotlib.pyplot as plt
    import argparse

    parser = argparse.ArgumentParser()
    args = parser.parse_args()
    args.base_size = 513
    args.crop_size = 513

    coco_val = COCOSegmentation(args, split='val', year='2017')
    dataloader = DataLoader(coco_val, batch_size=4, shuffle=True, num_workers=0)

    for batch_idx, sample in enumerate(dataloader):
        # Convert the batch once; the per-sample views below index into these.
        images = sample['image'].numpy()
        labels = sample['label'].numpy()
        for k in range(sample["image"].size(0)):
            label_map = np.array(labels[k]).astype(np.uint8)
            segmap = decode_segmap(label_map, dataset='coco')

            # C x H x W -> H x W x C, then undo the normalisation applied by
            # the validation pipeline and rescale to 0..255.
            rgb = np.transpose(images[k], axes=[1, 2, 0])
            rgb *= (0.229, 0.224, 0.225)
            rgb += (0.485, 0.456, 0.406)
            rgb *= 255.0
            rgb = rgb.astype(np.uint8)

            plt.figure()
            plt.title('display')
            plt.subplot(211)
            plt.imshow(rgb)
            plt.subplot(212)
            plt.imshow(segmap)

        # Two batches are enough for a visual check.
        if batch_idx == 1:
            break

    plt.show(block=True)

上面代碼末尾的 main 部分(`if __name__ == "__main__":` 之後的代碼)僅供測試使用。

  1. custom_transforms.py 是數據增廣的代碼
import torch
import random
import numpy as np

from PIL import Image, ImageOps, ImageFilter

class Normalize(object):
    """Divide the image by 255, then standardise it per channel.

    Both ``'image'`` and ``'label'`` are converted to float32 numpy arrays;
    only the image is rescaled.

    Args:
        mean (tuple): means for each channel.
        std (tuple): standard deviations for each channel.
    """

    def __init__(self, mean=(0., 0., 0.), std=(1., 1., 1.)):
        self.mean, self.std = mean, std

    def __call__(self, sample):
        raw_image, raw_label = sample['image'], sample['label']
        image = np.array(raw_image).astype(np.float32)
        label = np.array(raw_label).astype(np.float32)

        # In-place ufuncs keep the result in the float32 buffer.
        np.divide(image, 255.0, out=image)
        np.subtract(image, self.mean, out=image)
        np.divide(image, self.std, out=image)

        return {'image': image, 'label': label}

class Normalize_test(object):
    """Label-free variant of the normaliser: only ``'image'`` is read and returned.

    The image is converted to float32, divided by 255 and standardised per channel.

    Args:
        mean (tuple): means for each channel.
        std (tuple): standard deviations for each channel.
    """

    def __init__(self, mean=(0., 0., 0.), std=(1., 1., 1.)):
        self.mean, self.std = mean, std

    def __call__(self, sample):
        image = np.array(sample['image']).astype(np.float32)

        # In-place ufuncs keep the result in the float32 buffer.
        np.divide(image, 255.0, out=image)
        np.subtract(image, self.mean, out=image)
        np.divide(image, self.std, out=image)

        return {'image': image}


class ToTensor(object):
    """Turn the ``'image'`` and ``'label'`` entries of a sample into float tensors."""

    def __call__(self, sample):
        raw_image, raw_label = sample['image'], sample['label']

        # numpy stores images as H x W x C, torch expects C x H x W.
        hwc = np.array(raw_image).astype(np.float32)
        chw = hwc.transpose((2, 0, 1))
        label = np.array(raw_label).astype(np.float32)

        return {'image': torch.from_numpy(chw).float(),
                'label': torch.from_numpy(label).float()}

class ToTensor_test(object):
    """Label-free variant of the tensor conversion: only ``'image'`` is read and returned."""

    def __call__(self, sample):
        # numpy stores images as H x W x C, torch expects C x H x W.
        hwc = np.array(sample['image']).astype(np.float32)
        chw = hwc.transpose((2, 0, 1))

        return {'image': torch.from_numpy(chw).float()}


class RandomHorizontalFlip(object):
    """Mirror image and label left-to-right together, with probability 0.5."""

    def __call__(self, sample):
        image, label = sample['image'], sample['label']

        if random.random() >= 0.5:
            # No flip this time: hand both entries back untouched.
            return {'image': image, 'label': label}

        return {'image': image.transpose(Image.FLIP_LEFT_RIGHT),
                'label': label.transpose(Image.FLIP_LEFT_RIGHT)}


class RandomRotate(object):
    """Rotate image and label by one shared random angle in ``[-degree, degree]``.

    Args:
        degree: maximum absolute rotation angle passed to ``random.uniform``.
    """

    def __init__(self, degree):
        self.degree = degree

    def __call__(self, sample):
        image, label = sample['image'], sample['label']
        angle = random.uniform(-1 * self.degree, self.degree)

        # Bilinear for the picture, nearest for the label map.
        rotated_image = image.rotate(angle, Image.BILINEAR)
        rotated_label = label.rotate(angle, Image.NEAREST)

        return {'image': rotated_image, 'label': rotated_label}


class RandomGaussianBlur(object):
    """Blur the image with probability 0.5; the label is always passed through."""

    def __call__(self, sample):
        image, label = sample['image'], sample['label']

        if random.random() >= 0.5:
            # No blur this time.
            return {'image': image, 'label': label}

        # The blur radius is a second random draw from [0, 1).
        blur = ImageFilter.GaussianBlur(radius=random.random())
        return {'image': image.filter(blur), 'label': label}


class RandomScaleCrop(object):
    """Randomly rescale, pad if needed, then take a random square crop.

    Args:
        base_size: the short edge is resized to a random integer between
            ``int(0.5 * base_size)`` and ``int(2.0 * base_size)``.
        crop_size: side length of the square crop that is returned.
        fill: value used when the label has to be padded (the image is padded with 0).
    """

    def __init__(self, base_size, crop_size, fill=0):
        self.base_size = base_size
        self.crop_size = crop_size
        self.fill = fill

    def __call__(self, sample):
        image, label = sample['image'], sample['label']
        crop = self.crop_size

        # Pick the new short-edge length, keep the aspect ratio for the long edge.
        short_size = random.randint(int(self.base_size * 0.5), int(self.base_size * 2.0))
        width, height = image.size
        if height > width:
            new_w = short_size
            new_h = int(1.0 * height * new_w / width)
        else:
            new_h = short_size
            new_w = int(1.0 * width * new_h / height)
        scaled = (new_w, new_h)
        image = image.resize(scaled, Image.BILINEAR)
        label = label.resize(scaled, Image.NEAREST)

        # Pad on the right/bottom so that both edges reach the crop size.
        if short_size < crop:
            pad_h = 0 if new_h >= crop else crop - new_h
            pad_w = 0 if new_w >= crop else crop - new_w
            border = (0, 0, pad_w, pad_h)
            image = ImageOps.expand(image, border=border, fill=0)
            label = ImageOps.expand(label, border=border, fill=self.fill)

        # Random crop window, shared by image and label.
        width, height = image.size
        left = random.randint(0, width - crop)
        top = random.randint(0, height - crop)
        box = (left, top, left + crop, top + crop)

        return {'image': image.crop(box), 'label': label.crop(box)}


class FixScaleCrop(object):
    """Resize the short edge to ``crop_size``, then take a centred square crop.

    Args:
        crop_size: target short-edge length and side length of the crop.
    """

    def __init__(self, crop_size):
        self.crop_size = crop_size

    def __call__(self, sample):
        image, label = sample['image'], sample['label']
        crop = self.crop_size

        # Scale so the shorter edge equals the crop size, keeping aspect ratio.
        width, height = image.size
        if width > height:
            new_h = crop
            new_w = int(1.0 * width * new_h / height)
        else:
            new_w = crop
            new_h = int(1.0 * height * new_w / width)
        scaled = (new_w, new_h)
        image = image.resize(scaled, Image.BILINEAR)
        label = label.resize(scaled, Image.NEAREST)

        # Centre crop, shared by image and label.
        width, height = image.size
        left = int(round((width - crop) / 2.))
        top = int(round((height - crop) / 2.))
        box = (left, top, left + crop, top + crop)

        return {'image': image.crop(box), 'label': label.crop(box)}

class FixedResize(object):
    """Resize image and label to a fixed square of ``size`` x ``size``.

    Args:
        size: side length of the square output.
    """

    def __init__(self, size):
        # The original signature took no argument yet read ``size``, so every
        # construction raised NameError; ``size`` is now an explicit parameter.
        # The target is square, so the (w, h) versus (h, w) order is irrelevant.
        self.size = (size, size)

    def __call__(self, sample):
        img = sample['image']
        mask = sample['label']

        # Image and label must describe the same pixels before resizing.
        assert img.size == mask.size

        # Bilinear for the picture, nearest for the label map.
        img = img.resize(self.size, Image.BILINEAR)
        mask = mask.resize(self.size, Image.NEAREST)

        return {'image': img,
                'label': mask}

class FixedResize_test(object):
    """Label-free resize step that resamples the image at its own current size.

    Only ``'image'`` is read and returned.
    """

    def __init__(self):
        super().__init__()

    def __call__(self, sample):
        image = sample['image']
        width, height = image.size

        # The target size equals the current size, so the dimensions are unchanged.
        resized = image.resize((width, height), Image.BILINEAR)

        return {'image': resized}

將以上兩個文件加入到你的代碼中,就完成了COCO數據集的載入啦~

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章