一、VOC2012数据集的概述
VOC2012数据集是PASCAL Visual Object Classes(VOC)2012挑战赛所使用的数据集,由英国牛津大学负责托管和维护,共包含20个物体类别。VOC2012数据集中的图像是从互联网上收集的,图片的分辨率、内容、光照条件、背景、遮挡情况等都比较多样,可以用来进行目标检测、图像分割、语义分割等任务。VOC2012数据集为广大计算机视觉研究者和工程师提供了一个很好的实验平台,已经成为目标检测、图像分割等任务的评价基准之一。
二、VOC2012数据集的使用方式
使用VOC2012数据集进行目标检测和图像分割的流程如下:
1、下载VOC2012数据集:
# Fetch the VOC2012 train/val archive from the PASCAL VOC host.
wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
2、解压数据集:
# Unpack the archive (-x extract, -v list files as they are extracted, -f read the named file).
# The later steps expect the data under ./VOCdevkit/VOC2012.
tar -xvf VOCtrainval_11-May-2012.tar
3、解析Annotations目录下的XML标注文件(数据集中还包含ImageSets、JPEGImages等目录):
import xml.etree.ElementTree as ET


def parse_annotation(annotation_path):
    """Parse one PASCAL VOC XML annotation file.

    Args:
        annotation_path: Path to the XML file, or an open file object
            (anything accepted by ``xml.etree.ElementTree.parse``).

    Returns:
        A tuple ``(image_path, objects, width, height, depth)``:

        * ``image_path`` -- text of the ``<filename>`` element.
        * ``objects`` -- one dict per ``<object>`` element with the keys
          ``name``, ``xmin``, ``ymin``, ``xmax``, ``ymax`` and ``difficult``
          (``difficult`` is 0 when the element is absent).
        * ``width``, ``height``, ``depth`` -- integers read from ``<size>``.

    Raises:
        AttributeError: If a required element is missing from the file.
        ValueError: If a numeric field cannot be converted to a number.
    """
    root = ET.parse(annotation_path).getroot()

    # Element.find() takes the tag/path as its first positional argument;
    # it has no ``text=`` keyword, so find(text='filename') raises TypeError.
    image_path = root.find('filename').text

    size = root.find('size')
    width = int(size.find('width').text)
    height = int(size.find('height').text)
    depth = int(size.find('depth').text)

    objects = []
    for obj in root.findall('object'):
        obj_struct = {'name': obj.find('name').text}
        bndbox = obj.find('bndbox')
        for key in ('xmin', 'ymin', 'xmax', 'ymax'):
            # Go through float() so coordinates written as "48.0" are
            # accepted; int("48.0") would raise ValueError.
            obj_struct[key] = int(float(bndbox.find(key).text))
        # Keep the per-object "difficult" flag so callers do not have to
        # assume it is always 0.
        obj_struct['difficult'] = int(obj.findtext('difficult', default='0'))
        objects.append(obj_struct)

    return image_path, objects, width, height, depth
4、根据ImageSets/Main目录中的划分列表文件,得到训练集和测试集在JPEGImages文件夹中对应的图像路径(注意:test.txt随VOC2012测试集单独发布,不包含在trainval压缩包中):
from os import listdir
from os.path import isfile, join

# Directory holding the split lists and directory holding the JPEG images.
MAIN_SPLIT_DIR = './VOCdevkit/VOC2012/ImageSets/Main/'
JPEG_IMAGE_DIR = './VOCdevkit/VOC2012/JPEGImages/'


def read_image_ids(split_file):
    """Return the image IDs listed in a split file, one ID per line.

    Each line of ``trainval.txt`` / ``test.txt`` is itself an image ID
    (for example ``2008_000008``), not the name of another file to open.
    Blank lines are skipped and the file is closed before returning.
    """
    with open(split_file) as f:
        return [line.strip() for line in f if line.strip()]


trainval_split_file = MAIN_SPLIT_DIR + 'trainval.txt'
test_split_file = MAIN_SPLIT_DIR + 'test.txt'

if not isfile(test_split_file):
    # Fail early with an actionable message instead of a bare open() error.
    raise FileNotFoundError(
        test_split_file + ' not found. The test image list is distributed with '
        'the separate VOC2012 test archive, not with the trainval archive.'
    )

trainval_image_names = read_image_ids(trainval_split_file)
test_image_names = read_image_ids(test_split_file)

trainval_image_paths = [JPEG_IMAGE_DIR + n + '.jpg' for n in trainval_image_names]
test_image_paths = [JPEG_IMAGE_DIR + n + '.jpg' for n in test_image_names]
三、VOC2012数据集中物体类别的介绍
以下是VOC2012数据集中包含的20个物体类别:
- Person:人类图像
- Bird:鸟类图像
- Car:汽车图像
- Bus:公交车图像
- Cow:奶牛图像
- Sheep:绵羊图像
- Aeroplane:飞机图像
- Bicycle:自行车图像
- Horse:马图像
- Motorbike:摩托车图像
- Potted Plant:盆栽图像
- Diningtable:餐桌图像
- Cat:猫图像
- Dog:狗图像
- Boat:船舶图像
- Train:火车图像
- Chair:椅子图像
- Sofa:沙发图像
- Bottle:瓶子图像
- Tv/Monitor:电视/显示器图像
四、VOC2012数据集应用实例
以下为使用VOC2012数据集进行目标检测和语义分割的代码示例,目录结构如下:
VOCdevkit
└── VOC2012
    ├── Annotations
    ├── ImageSets
    ├── JPEGImages
    └── SegmentationClass
目标检测代码:
import os

import torch
import torchvision
from PIL import Image


def get_transform(train):
    """Build the image preprocessing pipeline.

    Args:
        train: Accepted so callers can request a train or eval pipeline;
            currently unused, both modes only convert the PIL image to a
            tensor.

    Returns:
        A ``torchvision.transforms.Compose`` containing ``ToTensor``.
    """
    transforms = [torchvision.transforms.ToTensor()]
    return torchvision.transforms.Compose(transforms)


def collate_detection_batch(batch):
    """Collate detection samples without stacking them.

    Images differ in size and every image has a different number of boxes,
    so the default collate function (which stacks tensors) cannot be used.

    Args:
        batch: List of ``(image, boxes, labels, difficulties)`` samples.

    Returns:
        Four lists (images, boxes, labels, difficulties), one entry per sample.
    """
    images, boxes, labels, difficulties = zip(*batch)
    return list(images), list(boxes), list(labels), list(difficulties)


class VOCDataset(torch.utils.data.Dataset):
    """Object-detection dataset over a VOC2012 directory.

    Args:
        data_folder: Path to the ``VOC2012`` directory (the one containing
            ``Annotations``, ``ImageSets`` and ``JPEGImages``).
        split: ``'TRAIN'`` (reads ``ImageSets/Main/trainval.txt``) or
            ``'TEST'`` (reads ``ImageSets/Main/test.txt``); case-insensitive.
        transform: Optional callable applied to the PIL image.
    """

    def __init__(self, data_folder, split, transform=None):
        self.split = split.upper()
        assert self.split in {'TRAIN', 'TEST'}
        self.year = "2012"
        self.data_folder = data_folder
        self.transform = transform

        # VOC2012 has exactly 20 classes, mapped to label ids 1-20
        # (0 is left free for background).
        self.label_map = {
            "aeroplane": 1,
            "bicycle": 2,
            "bird": 3,
            "boat": 4,
            "bottle": 5,
            "bus": 6,
            "car": 7,
            "cat": 8,
            "chair": 9,
            "cow": 10,
            "diningtable": 11,
            "dog": 12,
            "horse": 13,
            "motorbike": 14,
            "person": 15,
            "pottedplant": 16,
            "sheep": 17,
            "sofa": 18,
            "train": 19,
            "tvmonitor": 20,
        }

        if self.split == 'TRAIN':
            data_file = os.path.join(data_folder, 'ImageSets', 'Main', 'trainval.txt')
        else:
            data_file = os.path.join(data_folder, 'ImageSets', 'Main', 'test.txt')

        # One image id per line; blank lines are skipped.
        with open(data_file, 'r') as f:
            self.image_names = [line.strip() + '.jpg' for line in f if line.strip()]

    def __getitem__(self, index):
        """Return ``(image, boxes, labels, difficulties)`` for one image.

        ``boxes`` is a float32 tensor of shape (num_objects, 4) holding
        xmin, ymin, xmax, ymax; ``labels`` is int64 and ``difficulties`` is
        uint8, both of length num_objects.
        """
        image_file = self.image_names[index]
        image_path = os.path.join(self.data_folder, 'JPEGImages', image_file)
        annotation_path = os.path.join(
            self.data_folder, 'Annotations', image_file.replace('.jpg', '.xml')
        )

        # Parse the XML annotation; parse_annotation is the helper defined
        # earlier in this file.
        image_name, objects, image_width, image_height, image_depth = parse_annotation(annotation_path)

        # Convert everything into torch tensors.
        boxes = torch.zeros((len(objects), 4), dtype=torch.float32)
        labels = torch.zeros((len(objects)), dtype=torch.int64)
        difficulties = torch.zeros((len(objects)), dtype=torch.uint8)
        for i, object_dict in enumerate(objects):
            boxes[i, 0] = object_dict['xmin']
            boxes[i, 1] = object_dict['ymin']
            boxes[i, 2] = object_dict['xmax']
            boxes[i, 3] = object_dict['ymax']
            labels[i] = self.label_map[object_dict['name']]
            # Use the parsed flag when the parser provides one; otherwise
            # fall back to "not difficult".
            difficulties[i] = object_dict.get('difficult', 0)

        image = Image.open(image_path).convert("RGB")

        # Apply the transformations.
        if self.transform is not None:
            image = self.transform(image)

        return image, boxes, labels, difficulties

    def __len__(self):
        return len(self.image_names)


test_dataset = VOCDataset(data_folder='./VOCdevkit/VOC2012',
                          split='TEST',
                          transform=get_transform(train=False))
# Evaluation does not need shuffling; the custom collate function keeps
# variable-sized samples as lists instead of trying to stack them.
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=8,
                                          shuffle=False,
                                          collate_fn=collate_detection_batch)

# Load a pre-trained Faster R-CNN model.
# NOTE(review): these pre-trained weights predict the label ids the model was
# trained with, which are presumably not the VOC ids in label_map -- confirm
# before comparing predictions with VOC ground truth.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()


def predict_my_image(image_path, confidence_threshold=0.5):
    """Run the detector on one image and print every confident detection.

    Args:
        image_path: Path of the image file to analyse.
        confidence_threshold: Detections whose score is not strictly above
            this value are dropped.

    Prints one line per kept detection (label, score, box). Prints nothing
    when no detection passes the threshold.
    """
    # Load the image and turn it into a batch of size 1.
    image = Image.open(image_path).convert("RGB")
    image = get_transform(train=False)(image).unsqueeze(0)

    # Pass the image through the model.
    with torch.no_grad():
        predictions = model(image)

    # Move the prediction tensors to the CPU.
    predictions = [{k: v.to(torch.device('cpu')) for k, v in t.items()} for t in predictions]

    # A single image was passed in, so there is exactly one result dict.
    prediction = predictions[0]

    # Threshold per detection. Indexing scores[0] directly would raise on an
    # image without detections and would ignore all but the first box.
    keep = prediction['scores'] > confidence_threshold
    kept = zip(prediction['labels'][keep], prediction['scores'][keep], prediction['boxes'][keep])
    for i, (label, score, box) in enumerate(kept):
        print(f"Prediction {i}: {label}, {score}, {box}")
语义分割代码:
import os

import numpy as np
import torch
import torchvision
from PIL import Image


class VOCSemanticSegmentation(torch.utils.data.Dataset):
    """Dataset for semantic segmentation on the Pascal VOC dataset.

    Args:
        data_folder: Path to the ``VOC2012`` directory (the one containing
            ``JPEGImages`` and ``SegmentationClass``).
        split: ``"TRAIN"``, ``"VAL"`` or ``"TEST"``; case-insensitive.

    Each item is ``(image, labels)``: ``image`` is the tensor produced by
    ``to_tensor`` and ``labels`` is an int64 tensor with one class index per
    pixel.
    """

    def __init__(self, data_folder, split="TRAIN"):
        self.split = split.upper()
        assert self.split in {"TRAIN", "VAL", "TEST"}
        self.data_folder = data_folder

        # The original location (<data_folder>/TRAIN.txt) is kept as the
        # first choice so existing custom layouts continue to work; otherwise
        # use the standard VOC location ImageSets/Segmentation/train.txt.
        legacy_list_file = os.path.join(data_folder, self.split + ".txt")
        standard_list_file = os.path.join(
            data_folder, "ImageSets", "Segmentation", self.split.lower() + ".txt"
        )
        if os.path.isfile(legacy_list_file):
            self.image_name_list_file = legacy_list_file
        else:
            self.image_name_list_file = standard_list_file

        # One image id per line; blank lines are skipped.
        with open(self.image_name_list_file, "r") as f:
            self.image_names = [x.strip() for x in f if x.strip()]

    def __len__(self):
        return len(self.image_names)

    def __getitem__(self, index):
        # Load image and target.
        image_name = self.image_names[index]
        image_path = os.path.join(self.data_folder, "JPEGImages", image_name + ".jpg")
        target_path = os.path.join(self.data_folder, "SegmentationClass", image_name + ".png")
        image = Image.open(image_path).convert("RGB")
        target = Image.open(target_path)

        # VOC segmentation masks are palette PNGs whose pixel values are
        # already class indices: 0 = background, 1-20 = object classes and
        # 255 = void/boundary. Use them as they are; remapping grey levels
        # (128, 192, ...) would collapse every object class to background.
        # 255 is left untouched so a loss can skip it (e.g. ignore_index=255).
        # NOTE(review): assumes the PNG is opened in palette mode ("P");
        # an RGB-converted mask would need a colour-to-index lookup instead.
        labels = np.array(target).astype(np.int64)

        # Convert everything into a torch.Tensor and return.
        image = torchvision.transforms.functional.to_tensor(image)
        labels = torch.from_numpy(labels).long()
        return image, labels
五、总结
以上就是对VOC2012数据集的介绍和使用方法的阐述,该数据集已经成为计算机视觉领域的标杆之一,在目标检测和图像分割等方面都有着广泛的应用。希望本文对大家的学习和研究有所帮助。