PASCAL Visual Object Classes Challenge 2007(VOC 2007)數據集預處理

VOC 2007[1] 是一個多標籤數據集,有 20 類。這裏爲 multi-label classification 任務做預處理,包括:

  • 將圖片移到同一個目錄(方便讀取);
  • 數據劃分(本身就已經分好 train/val 和 test 兩部分);
  • 處理標籤。

Prepare

[1] 有下載鏈接,train/val 450M,test 430M。下載下來就是 VOCtrainval_06-Nov-2007.tar 和 VOCtest_06-Nov-2007.tar 兩個文件。以 test set 的文件爲例,解壓之後在 VOCtest_06-Nov-2007/VOCdevkit/VOC2007/ 下可以見到:

  • Annotations/:各樣本對應的 .xml 標註文件,可以從中提取 label 信息,解析可參考 [5]。其中 <object> 標籤下的 <difficult> 子標籤與下一條的 0 tag 有對應關係,見 [2];
  • ImageSets/:只用到其中 Main/ 目錄,裏面是按類組織的 .txt 文件,標註每幅 image 樣本是否包含此類物體,有 1/0/-1 三種標記(解釋見 [2]):1 是含有,-1 是不含,0 表示 difficult。
  • JPEGImages/:圖片;
  • SegmentationClass/:其它任務的,用不到;
  • SegmentationObject/:其它任務的,用不到;

ID, Label

JPEGImages/ 下的圖片是用 ID 命名的,可以從此獲取樣本 ID;而在 ImageSets/Main/ 中,又有 test.txt、train.txt、val.txt、trainval.txt 這 4 個 ID 劃分文件。經驗證,以兩種方式獲得的 ID 劃分是一致的,且 train/val 與 test 無重合。
處理 label 時,參照 [4],將 0 當成 -1,即只有 1 表示正例,0/-1 都表示負例,結果與 [3] 裏每類正例數統計是對得上的。獲取 label 又有兩種方式:通過 Annotations/ 中的 .xml 文件,或通過 ImageSets/Main/(除了剛纔的 ID 劃分文件之外的).txt 文件。經驗證,將 .txt 中的 0 當成 -1 處理與忽略 .xml 中 <difficult> 爲 1 的效果相同。

Code

import os
from os.path import join
from xml.dom import minidom
import numpy as np


# http://host.robots.ox.ac.uk/pascal/VOC/voc2007/index.html
# http://host.robots.ox.ac.uk/pascal/VOC/voc2007/htmldoc/voc.html#SECTION00090000000000000000


# Download directory that holds the two extracted VOC 2007 archives.
# The VOC2007_ROOT environment variable, when set, overrides the original
# machine-specific default so the script can run elsewhere unchanged.
P = os.environ.get("VOC2007_ROOT", "E:/iTom/dataset/VOC2007")
ALL_IMAGE_P = join(P, "images")  # every image is copied into this directory

# Extracted train/val archive.
TRAIN_P = join(P, "VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007")
TRAIN_IMAGE_P = join(TRAIN_P, "JPEGImages")
TRAIN_LABEL_P = join(TRAIN_P, "ImageSets/Main")
TRAIN_ANNO_P = join(TRAIN_P, "Annotations")

# Extracted test archive.
TEST_P = join(P, "VOCtest_06-Nov-2007/VOCdevkit/VOC2007")
TEST_IMAGE_P = join(TEST_P, "JPEGImages")
TEST_LABEL_P = join(TEST_P, "ImageSets/Main")
TEST_ANNO_P = join(TEST_P, "Annotations")

# ID split files.
SPLIT_TRAIN = join(TRAIN_LABEL_P, "train.txt")
SPLIT_VAL = join(TRAIN_LABEL_P, "val.txt")
SPLIT_TRAIN_VAL = join(TRAIN_LABEL_P, "trainval.txt")
SPLIT_TEST = join(TEST_LABEL_P, "test.txt")


"""處理 ID 劃分"""

# print("--- 第一種方式:從 JPEGImages/ 目錄提取 ID ---")
def file_key(s):
    """Return the integer sample ID encoded in a file name such as ``000012.jpg``.

    Everything before the first ``.`` is parsed with ``int``.
    """
    stem, _, _ = s.partition('.')
    return int(stem)


# def get_id_list(path):
#     id_list = os.listdir(path)
#     id_list = list(map(file_key, id_list))
#     print("#files:", len(id_list))
#     id_set = set(id_list)
#     print("#unique:", len(id_set))
#     return id_list


# print("- train -")
# train_img_id = get_id_list(TRAIN_IMAGE_P)  # 5011
# print("- test -")
# test_img_id = get_id_list(TEST_IMAGE_P)  # 4952

# print("- 驗證 train/val 與 test 無重複 ID -")
# train_img_id_set = set(train_img_id)
# test_img_id_set = set(test_img_id)
# # no intersection in id of train/val & test
# print("#common in train & test:", len(train_img_id_set.intersection(test_img_id_set)))  # 0


print("--- 第二種方式:從 ID 劃分文件提取 ID ---")


def get_id_list_from_file(_file):
    """Read the integer sample IDs listed in a split file, one ID per line.

    Prints the number of IDs read and the number of distinct IDs.

    Args:
        _file: path of a text file such as ``trainval.txt``.

    Returns:
        The IDs as a list of ints, in file order (duplicates are kept).

    Raises:
        ValueError: if a non-blank line is not an integer.
    """
    with open(_file, "r") as f:
        # Blank lines (e.g. a trailing empty line) are skipped; int() would
        # otherwise raise ValueError on them.
        id_list = [int(line) for line in f if line.strip()]
    print("#id:", len(id_list))
    print("#unique id:", len(set(id_list)))
    return id_list


# Load every official split. The trailing numbers are the list sizes
# recorded by the original author.
_loaded = []
for _banner, _split_file in (
    ("- train -", SPLIT_TRAIN),  # 2501
    ("- val -", SPLIT_VAL),  # 2510
    ("- train-val -", SPLIT_TRAIN_VAL),  # 5011
    ("- test -", SPLIT_TEST),  # 4952
):
    print(_banner)
    _loaded.append(get_id_list_from_file(_split_file))
id_train, id_val, id_train_val, id_test = _loaded

# Disabled sanity checks kept from the original script:
# print("- 驗證 train/val 與 test 無重複 ID -")
# train_val_id_set = set(id_train_val)
# test_id_set = set(id_test)
# # train/val and test share no ID
# print("#common in train & test:", len(train_val_id_set.intersection(test_id_set)))  # 0
# print("- 驗證兩種方法獲取的 ID 劃分一致 -")
# print("#common in train:", len(train_img_id_set.intersection(train_val_id_set)))  # 5011
# print("#common in test:", len(test_img_id_set.intersection(test_id_set)))  # 4952

# print("- check id complete -")
id_all = id_train_val + id_test
n_id = max(id_all)  # largest sample ID
print("#id:", len(id_all), ", max id:", n_id)
# for i in range(1, n_id + 1):
#     if i not in id_all:
#         print("id absent:", i)
# print("complete check done")


print("- save indices -")
# Convert the ID lists to arrays before reporting their ranges and saving.
id_train, id_val, id_train_val, id_test = (
    np.array(_ids) for _ids in (id_train, id_val, id_train_val, id_test)
)
print("id train-val:", id_train_val.max(), id_train_val.min())
print("id test:", id_test.max(), id_test.min())

# NOTE(review): the saved arrays hold the sample IDs exactly as read from the
# split files (not shifted to 0-based) - confirm how consumers index with them.
for _out_name, _ids in (
    ("idx_train.npy", id_train),
    ("idx_val.npy", id_val),
    ("idx_train_val.npy", id_train_val),
    ("idx_test.npy", id_test),
):
    np.save(join(P, _out_name), _ids)


"""將全部 image 移到同一個目錄"""
# All sample IDs are distinct, so every image can be gathered in one directory.
# exist_ok=True replaces the separate exists()/makedirs() pair, which could
# fail if the directory appeared between the two calls.
os.makedirs(ALL_IMAGE_P, exist_ok=True)


def copy_image(path, dst=None):
    """Copy every file found directly in ``path`` into a destination directory.

    The running file index is printed every 100 files as a progress indicator.

    Args:
        path: source directory (e.g. a ``JPEGImages`` directory).
        dst: destination directory; defaults to ``ALL_IMAGE_P``.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    if dst is None:
        dst = ALL_IMAGE_P
    for i, f in enumerate(os.listdir(path)):
        # shutil.copy works on every platform and with paths that contain
        # spaces, unlike a shell "copy"/"cp" command built by string formatting.
        shutil.copy(join(path, f), dst)
        if i % 100 == 0:
            print(i)


# Gather the train/val images first, then the test images.
for _image_dir in (TRAIN_IMAGE_P, TEST_IMAGE_P):
    copy_image(_image_dir)


"""處理 label"""
# Two ways of building the label matrix follow; both treat the 0 tag as -1.
# http://host.robots.ox.ac.uk/pascal/VOC/voc2007/htmldoc/voc.html#SECTION00031000000000000000

# The classes are discovered from the "<class>_test" files. os.listdir returns
# entries in arbitrary order, so the names are sorted to make the
# class-name -> index mapping reproducible across runs and platforms.
test_ls = sorted(f for f in os.listdir(TEST_LABEL_P) if "_test" in f)
N_CLASS = len(test_ls)
print("#class:", N_CLASS)
# map id: name -> num
test_ls = [f.split("_test")[0] for f in test_ls]  # keep only the class name
id_map = {name: num for num, name in enumerate(test_ls)}  # class name -> class index
print(id_map)


print("--- 第一種方式:從 ImageSets/Main/ 提取 label ---")
# One row per possible sample ID (n_id rows), one column per class.
L_label = np.zeros((n_id, N_CLASS))


def proc_label(path, suffix):
    """Mark positive samples in ``L_label`` from the per-class image-set files.

    Files are processed class by class. Each line has the form
    ``<ID> <tag>`` with tag 1, 0 or -1; only tag 1 counts as a positive and
    sets ``L_label[ID - 1][class index]`` to 1. The number of positives found
    for each class is printed.

    Args:
        path: directory holding the per-class files, one of
            {TRAIN_LABEL_P, TEST_LABEL_P}.
        suffix: split suffix of the files to read, one of
            {"_trainval", "_test"}.
    """
    for _f in os.listdir(path):
        stem = os.path.splitext(_f)[0]
        # Match the suffix at the end of the name only, so that e.g.
        # "_train" does not also pick up the "*_trainval" files.
        if not stem.endswith(suffix):
            continue
        class_name = stem[:len(stem) - len(suffix)]
        assert class_name in id_map, "unknown class: {}".format(class_name)
        c = id_map[class_name]
        pos_cnt = 0
        with open(join(path, _f), "r") as f:
            for line in f:  # format: ID  1/0/-1
                fields = line.split()
                if len(fields) < 2:  # blank or truncated line
                    continue
                if int(fields[1]) > 0:  # only tag 1 is a positive
                    pos_cnt += 1
                    sid = int(fields[0]) - 1  # 0-based row
                    L_label[sid][c] = 1
        print("#{}: {}".format(class_name, pos_cnt))


# Fill L_label from the train/val files, then from the test files.
for _banner, _label_dir, _split_suffix in (
    ("- train-val label -", TRAIN_LABEL_P, "_trainval"),
    ("- test label -", TEST_LABEL_P, "_test"),
):
    print(_banner)
    proc_label(_label_dir, _split_suffix)
sum_label = L_label.sum(axis=0)  # column sums: positives per class
print("label statistics:", sum_label)
np.save(join(P, "labels.l.npy"), L_label)


print("--- 第二種方式:從 Annotations/ 提取 label ---")
# https://github.com/HCPLab-SYSU/SSGRL/blob/master/datasets/voc07dataset.py
# Same shape as the label matrix built from ImageSets/Main.
L_anno = np.zeros((n_id, N_CLASS))


def proc_annotation(path):
    """Mark positive samples in ``L_anno`` from the XML annotation files.

    Files are processed sample by sample. For every ``<object>`` whose
    ``<difficult>`` text is not '1', the (sample, class) entry of ``L_anno``
    is set to 1. The number of newly set entries per class is printed.

    path: {TRAIN_ANNO_P, TEST_ANNO_P}
    """
    positives = dict.fromkeys(id_map, 0)
    for xml_name in os.listdir(path):
        row = file_key(xml_name) - 1  # 0-based row for this sample
        doc_root = minidom.parse(join(path, xml_name)).documentElement
        for node in doc_root.getElementsByTagName('object'):
            difficult = node.getElementsByTagName('difficult')[0].firstChild.data
            if difficult == '1':  # skip objects flagged as difficult
                continue
            class_name = node.getElementsByTagName('name')[0].firstChild.data.lower()
            assert class_name in id_map
            col = id_map[class_name]
            if L_anno[row][col] != 0:
                continue  # this class was already recorded for the sample
            L_anno[row][col] = 1
            positives[class_name] += 1
    print("pos count:", positives)


# Fill L_anno from the train/val annotations, then from the test annotations.
for _banner, _anno_dir in (
    ("- train-val annotation -", TRAIN_ANNO_P),
    ("- test annotation -", TEST_ANNO_P),
):
    print(_banner)
    proc_annotation(_anno_dir)
sum_label = L_anno.sum(axis=0)  # column sums: positives per class
print("label statistics:", sum_label)
np.save(join(P, "labels.a.npy"), L_anno)

# Count the entries where the two label matrices disagree.
_mismatch = L_label != L_anno
print("#diff:", _mismatch.astype(np.int8).sum())  # 0

Cloud Drive

鏈接:https://pan.baidu.com/s/1Mh_nX-y-ijvZEmy3lzTaNw,提取碼:oq10
VOC 2007 baidu pan

References

  1. The PASCAL Visual Object Classes Challenge 2007
  2. 8.1.2 Classification Task Image Sets
  3. 2.1 Classification/Detection Image Sets
  4. SSGRL/datasets/voc07dataset.py
  5. 數據集:Pascal VOC 2007數據集分析
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章