由于数据集本身较大,论文中使用的都是其中很小的一部分,进入下载页面,选择《WebVision Dataset 1.0》《Resized Images (small version)》:

  • 一般需要数据集的训练集《Google Images Resized (16 GB) 》
  • 验证集《Validation Images Resized (834 MB)》
  • 这两个集合的标签《Metadata》下的《Training & Validation Labels (183 MB)》。


整理下下载的数据集,放到目录 ~/data/webvision1.0 下,目录结构如下:


webvision1.0
├─google
│ ├─q0001
│ ├─q0002
│ ├─...
│ ├─q1631
│ └─q1632
├─info
└─val_images_256



info 目录下重要的几个txt:

  1. queries_google.txt:1632行,与google目录下的文件夹相对应,每行是一个查询词。
  2. synsets.txt:1000行表示1000个类,标签 \(i(0\le i\le 999)\) 的具体含义在第 \(i+1\)行。
  3. train_filelist_google.txt:每行表示一个图片的路径和标签,路径以 google/ 开头(形如 google/q0001/xxx.jpg),是相对于数据集根目录的,代码中直接与 root 拼接。
  4. val_filelist.txt:同上,但是是相对于val_images_256目录的。



数据集 & DataLoader

# webvision.py

from torch.utils.data import Dataset, DataLoader

import torchvision.transforms as transforms

from PIL import Image

import os

class Webvision(Dataset):
    """WebVision 1.0 subset restricted to the first ``num_classes`` labels.

    Reads ``info/train_filelist_google.txt`` (training split) or
    ``info/val_filelist.txt`` (validation split) under ``root``.  Every
    non-empty line is ``<relative image path> <integer label>``; only samples
    whose label is strictly below ``num_classes`` are kept.

    Args:
        root: Dataset root directory; a leading ``~`` is expanded.
        train: Use the training file list when true, otherwise the
            validation file list.
        transform: Optional callable applied to the loaded RGB image.
        num_classes: Keep only labels in ``[0, num_classes)``.

    Attributes set by ``__init__``: ``root``, ``transform``, ``train``,
    ``data`` (list of relative paths) and ``targets`` (list of int labels).
    """

    def __init__(self, root, train=True, transform=None, num_classes=50):
        root = os.path.expanduser(root)
        self.root = root
        self.transform = transform
        self.train = train

        filelist = 'info/train_filelist_google.txt' if train else 'info/val_filelist.txt'
        self.data, self.targets = self._read_filelist(
            os.path.join(root, filelist), num_classes)

    @staticmethod
    def _read_filelist(path, num_classes):
        """Return ``(paths, labels)`` for entries with label < ``num_classes``."""
        data, targets = [], []
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank / trailing empty lines in the list file.
                    continue
                # Split from the right so a path containing spaces stays intact.
                img, target = line.rsplit(maxsplit=1)
                target = int(target)
                if target < num_classes:
                    data.append(img)
                    targets.append(target)
        return data, targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the sample at ``index``."""
        img_path = self.data[index]
        target = self.targets[index]
        if self.train:
            # Training paths are joined directly onto the dataset root.
            full_path = os.path.join(self.root, img_path)
        else:
            full_path = os.path.join(self.root, 'val_images_256', img_path)
        # The context manager closes the file handle once the pixels are
        # converted, instead of leaving it to the garbage collector.
        with Image.open(full_path) as raw:
            image = raw.convert('RGB')
        # ``transform`` defaults to None; only call it when one was supplied.
        if self.transform is not None:
            image = self.transform(image)
        return image, target

class WebvisionDataloader:
    """Factory for the WebVision training and validation ``DataLoader`` objects.

    Args:
        batch_size: Mini-batch size used by both loaders.
        num_classes: Forwarded to ``Webvision`` to select the label subset.
        num_workers: Number of loader worker processes.
        root: Dataset root directory forwarded to ``Webvision``.
    """

    def __init__(self, batch_size=128, num_classes=50, num_workers=8, root='~/data/webvision1.0'):
        self.batch_size = batch_size
        self.num_classes = num_classes
        self.num_workers = num_workers
        self.root = root

        # NOTE(review): the steps in front of Normalize were lost from the
        # published listing.  The ones below are a reconstruction; the 299
        # crop is chosen to match InceptionResNetV2.input_size (299, 299, 3).
        # Confirm against the original script.
        self.transform_train = transforms.Compose([
            transforms.Resize(320),
            transforms.RandomResizedCrop(299),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])
        self.transform_test = transforms.Compose([
            transforms.Resize(320),
            transforms.CenterCrop(299),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])

    def train(self):
        """Return a shuffled ``DataLoader`` over the training split."""
        dataset = Webvision(root=self.root, train=True, transform=self.transform_train,
                            num_classes=self.num_classes)
        dataloader = DataLoader(
            dataset=dataset, batch_size=self.batch_size,
            shuffle=True, num_workers=self.num_workers, pin_memory=True)
        return dataloader

    def test(self):
        """Return a non-shuffled ``DataLoader`` over the validation split."""
        dataset = Webvision(root=self.root, train=False, transform=self.transform_test,
                            num_classes=self.num_classes)
        test_loader = DataLoader(
            dataset=dataset, batch_size=self.batch_size,
            shuffle=False, num_workers=self.num_workers, pin_memory=True)
        return test_loader



# InceptionResNetV2.py

import torch

from torch import nn

class BasicConv2d(nn.Module):

def __init__(self, in_planes, out_planes, kernel_size, stride, padding: int | tuple[int, int] = 0):

super(BasicConv2d, self).__init__()

self.conv = nn.Conv2d(in_planes, out_planes,

kernel_size=kernel_size, stride=stride,

padding=padding, bias=False) # verify bias false

self.bn = nn.BatchNorm2d(out_planes,

eps=0.001, # value found in tensorflow

momentum=0.1, # default pytorch value


self.relu = nn.ReLU(inplace=False)

def forward(self, x):

x = self.conv(x)

x = self.bn(x)

x = self.relu(x)

return x

class Mixed_5b(nn.Module):
    """Four-branch block mapping 192 input channels to 320 output channels.

    The branch outputs (96 + 64 + 96 + 64 channels) are concatenated along
    the channel dimension; every branch uses stride 1 with matching padding.
    """

    def __init__(self):
        super().__init__()
        self.branch0 = BasicConv2d(192, 96, kernel_size=1, stride=1)
        self.branch1 = nn.Sequential(
            BasicConv2d(192, 48, kernel_size=1, stride=1),
            BasicConv2d(48, 64, kernel_size=5, stride=1, padding=2),
        )
        self.branch2 = nn.Sequential(
            BasicConv2d(192, 64, kernel_size=1, stride=1),
            BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1),
            BasicConv2d(96, 96, kernel_size=3, stride=1, padding=1),
        )
        self.branch3 = nn.Sequential(
            nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False),
            BasicConv2d(192, 64, kernel_size=1, stride=1),
        )

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate along dim 1."""
        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
        return torch.cat([branch(x) for branch in branches], 1)

class Block35(nn.Module):
    """Residual block operating on 320 channels.

    Three branches (32 + 32 + 64 channels) are concatenated, projected back
    to 320 channels by a 1x1 convolution, multiplied by ``scale`` and added
    to the input before a final ReLU.

    Args:
        scale: Multiplier applied to the residual branch before the addition.
    """

    def __init__(self, scale=1.0):
        super().__init__()
        self.scale = scale
        self.branch0 = BasicConv2d(320, 32, kernel_size=1, stride=1)
        self.branch1 = nn.Sequential(
            BasicConv2d(320, 32, kernel_size=1, stride=1),
            BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1),
        )
        self.branch2 = nn.Sequential(
            BasicConv2d(320, 32, kernel_size=1, stride=1),
            BasicConv2d(32, 48, kernel_size=3, stride=1, padding=1),
            BasicConv2d(48, 64, kernel_size=3, stride=1, padding=1),
        )
        self.conv2d = nn.Conv2d(128, 320, kernel_size=1, stride=1)
        self.relu = nn.ReLU(inplace=False)

    def forward(self, x):
        """Return ``relu(conv2d(cat(branches)) * scale + x)``."""
        x0 = self.branch0(x)
        x1 = self.branch1(x)
        x2 = self.branch2(x)
        out = torch.cat((x0, x1, x2), 1)
        out = self.conv2d(out)
        out = out * self.scale + x
        return self.relu(out)

class Mixed_6a(nn.Module):
    """Stride-2 reduction block mapping 320 channels to 1088 channels.

    Two convolutional branches (384 channels each) and a max-pool branch
    (which keeps the 320 input channels) are concatenated along dim 1.
    """

    def __init__(self):
        super().__init__()
        self.branch0 = BasicConv2d(320, 384, kernel_size=3, stride=2)
        self.branch1 = nn.Sequential(
            BasicConv2d(320, 256, kernel_size=1, stride=1),
            BasicConv2d(256, 256, kernel_size=3, stride=1, padding=1),
            BasicConv2d(256, 384, kernel_size=3, stride=2),
        )
        self.branch2 = nn.MaxPool2d(3, stride=2)

    def forward(self, x):
        """Run the three branches on ``x`` and concatenate along dim 1."""
        branches = (self.branch0, self.branch1, self.branch2)
        return torch.cat([branch(x) for branch in branches], 1)

class Block17(nn.Module):
    """Residual block operating on 1088 channels.

    Two branches (192 + 192 channels, the second using factorised 1x7 and
    7x1 convolutions) are concatenated, projected back to 1088 channels by a
    1x1 convolution, multiplied by ``scale`` and added to the input before a
    final ReLU.

    Args:
        scale: Multiplier applied to the residual branch before the addition.
    """

    def __init__(self, scale=1.0):
        super().__init__()
        self.scale = scale
        self.branch0 = BasicConv2d(1088, 192, kernel_size=1, stride=1)
        self.branch1 = nn.Sequential(
            BasicConv2d(1088, 128, kernel_size=1, stride=1),
            BasicConv2d(128, 160, kernel_size=(1, 7), stride=1, padding=(0, 3)),
            BasicConv2d(160, 192, kernel_size=(7, 1), stride=1, padding=(3, 0)),
        )
        self.conv2d = nn.Conv2d(384, 1088, kernel_size=1, stride=1)
        self.relu = nn.ReLU(inplace=False)

    def forward(self, x):
        """Return ``relu(conv2d(cat(branches)) * scale + x)``."""
        x0 = self.branch0(x)
        x1 = self.branch1(x)
        out = torch.cat((x0, x1), 1)
        out = self.conv2d(out)
        out = out * self.scale + x
        return self.relu(out)

class Mixed_7a(nn.Module):
    """Stride-2 reduction block mapping 1088 channels to 2080 channels.

    Three convolutional branches (384 + 288 + 320 channels) and a max-pool
    branch (which keeps the 1088 input channels) are concatenated along
    dim 1.
    """

    def __init__(self):
        super().__init__()
        self.branch0 = nn.Sequential(
            BasicConv2d(1088, 256, kernel_size=1, stride=1),
            BasicConv2d(256, 384, kernel_size=3, stride=2),
        )
        self.branch1 = nn.Sequential(
            BasicConv2d(1088, 256, kernel_size=1, stride=1),
            BasicConv2d(256, 288, kernel_size=3, stride=2),
        )
        self.branch2 = nn.Sequential(
            BasicConv2d(1088, 256, kernel_size=1, stride=1),
            BasicConv2d(256, 288, kernel_size=3, stride=1, padding=1),
            BasicConv2d(288, 320, kernel_size=3, stride=2),
        )
        self.branch3 = nn.MaxPool2d(3, stride=2)

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate along dim 1."""
        branches = (self.branch0, self.branch1, self.branch2, self.branch3)
        return torch.cat([branch(x) for branch in branches], 1)

class Block8(nn.Module):
    """Residual block operating on 2080 channels.

    Two branches (192 + 256 channels, the second using factorised 1x3 and
    3x1 convolutions) are concatenated, projected back to 2080 channels by a
    1x1 convolution, multiplied by ``scale`` and added to the input.

    Args:
        scale: Multiplier applied to the residual branch before the addition.
        noReLU: When true, no ReLU module is created and the sum is returned
            without an activation.
    """

    def __init__(self, scale=1.0, noReLU=False):
        super().__init__()
        self.scale = scale
        self.noReLU = noReLU
        self.branch0 = BasicConv2d(2080, 192, kernel_size=1, stride=1)
        self.branch1 = nn.Sequential(
            BasicConv2d(2080, 192, kernel_size=1, stride=1),
            BasicConv2d(192, 224, kernel_size=(1, 3), stride=1, padding=(0, 1)),
            BasicConv2d(224, 256, kernel_size=(3, 1), stride=1, padding=(1, 0)),
        )
        self.conv2d = nn.Conv2d(448, 2080, kernel_size=1, stride=1)
        if not self.noReLU:
            self.relu = nn.ReLU(inplace=False)

    def forward(self, x):
        """Return ``conv2d(cat(branches)) * scale + x``, with ReLU unless ``noReLU``."""
        x0 = self.branch0(x)
        x1 = self.branch1(x)
        out = torch.cat((x0, x1), 1)
        out = self.conv2d(out)
        out = out * self.scale + x
        if not self.noReLU:
            out = self.relu(out)
        return out

def _make_layer(block, num_blocks, **kwargs):

layers = []

for _ in range(num_blocks):


return nn.Sequential(*layers)

class InceptionResNetV2(nn.Module):
    """Inception-ResNet-v2 classifier assembled from the blocks in this file.

    ``features`` runs the convolutional trunk and returns a 1536-channel
    feature map; ``logits`` pools it spatially and applies the final linear
    layer; ``forward`` chains the two.

    Args:
        num_classes: Output size of the final linear layer.
    """

    def __init__(self, num_classes=1001):
        super().__init__()
        # Special attributes
        self.input_space = None
        self.input_size = (299, 299, 3)
        self.mean = None
        self.std = None
        # Modules
        self.conv2d_1a = BasicConv2d(3, 32, kernel_size=3, stride=2)
        self.conv2d_2a = BasicConv2d(32, 32, kernel_size=3, stride=1)
        self.conv2d_2b = BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.maxpool_3a = nn.MaxPool2d(3, stride=2)
        self.conv2d_3b = BasicConv2d(64, 80, kernel_size=1, stride=1)
        self.conv2d_4a = BasicConv2d(80, 192, kernel_size=3, stride=1)
        self.maxpool_5a = nn.MaxPool2d(3, stride=2)
        self.mixed_5b = Mixed_5b()
        self.repeat = _make_layer(Block35, 10, scale=0.17)
        self.mixed_6a = Mixed_6a()
        self.repeat_1 = _make_layer(Block17, 20, scale=0.10)
        self.mixed_7a = Mixed_7a()
        self.repeat_2 = _make_layer(Block8, 9, scale=0.20)
        self.block8 = Block8(noReLU=True)
        self.conv2d_7b = BasicConv2d(2080, 1536, kernel_size=1, stride=1)
        # Global average pooling.  For a 299x299 input the trunk yields an
        # 8x8 map, so this equals the former fixed AvgPool2d(8); the adaptive
        # form also accepts other input resolutions.
        self.avgpool_1a = nn.AdaptiveAvgPool2d(1)
        self.last_linear = nn.Linear(1536, num_classes)

    def features(self, input_):
        """Run the convolutional trunk and return the 1536-channel feature map."""
        trunk = (
            self.conv2d_1a, self.conv2d_2a, self.conv2d_2b, self.maxpool_3a,
            self.conv2d_3b, self.conv2d_4a, self.maxpool_5a,
            self.mixed_5b, self.repeat,
            self.mixed_6a, self.repeat_1,
            self.mixed_7a, self.repeat_2,
            self.block8, self.conv2d_7b,
        )
        x = input_
        for stage in trunk:
            x = stage(x)
        return x

    def logits(self, features):
        """Pool ``features`` spatially, flatten, and apply the linear classifier."""
        x = self.avgpool_1a(features)
        x = x.view(x.size(0), -1)
        return self.last_linear(x)

    def forward(self, input_):
        """Return class logits for a batch of images."""
        return self.logits(self.features(input_))


import torch

from torch import optim

from torch import nn

from torch.nn import functional as F

import time

from torch import Tensor

from InceptionResNetV2 import InceptionResNetV2

from webvision import WebvisionDataloader

def accuracy(output: Tensor, target: Tensor, topk=(1,)) -> list:
    """Compute top-k accuracy (in percent) for each ``k`` in ``topk``.

    Args:
        output: Scores indexed as ``[sample, class]`` (``topk`` is taken
            along dim 1).
        target: Ground-truth class index per sample.
        topk: Iterable of ``k`` values to evaluate.

    Returns:
        A list with one single-element tensor per entry of ``topk``, holding
        the percentage of samples whose target is among the ``k`` highest
        scoring classes.
    """
    # Clamp to the number of classes so that e.g. top-5 on a problem with
    # fewer than five classes does not make ``Tensor.topk`` raise.
    maxk = min(max(topk), output.size(1))
    batch_size = target.size(0)

    # Indices of the maxk highest scores, best first.
    _, pred = output.topk(maxk, dim=1)  # Shape: [batch_size, maxk]
    pred = pred.t()  # Shape: [maxk, batch_size]
    correct = pred.eq(target.view(1, -1).expand_as(pred))  # Shape: [maxk, batch_size]

    res = []
    for k in topk:
        # Rows 0..k-1 are the k best guesses; at most one of them is correct.
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res

# Best validation accuracies seen so far; updated by evaluate().
best_acc1, best_acc5 = 0, 0


@torch.no_grad()
def evaluate(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and print loss / top-1 / top-5.

    Runs in eval mode with gradient tracking disabled.  Batches are moved to
    the device of the model's first parameter.  The module-level
    ``best_acc1`` / ``best_acc5`` are updated with the running maxima.

    Args:
        model: Network returning class logits.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        ``(avg_loss, avg_acc1, avg_acc5)``, each weighted by batch size.
    """
    global best_acc1, best_acc5

    # Eval mode: BatchNorm uses its running statistics instead of updating
    # them from validation batches.
    model.eval()
    device = next(model.parameters()).device

    total_loss = 0.0
    total_acc1 = 0.0
    total_acc5 = 0.0
    total_samples = 0
    for x, labels in dataloader:
        x = x.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        logits = model(x)
        loss = F.cross_entropy(logits, labels)
        acc1, acc5 = accuracy(logits, labels, topk=(1, 5))

        # Weight by batch size so a smaller final batch is averaged correctly.
        batch_size = x.size(0)
        total_loss += loss.item() * batch_size
        total_acc1 += acc1.item() * batch_size
        total_acc5 += acc5.item() * batch_size
        total_samples += batch_size

    avg_loss = total_loss / total_samples
    avg_acc1 = total_acc1 / total_samples
    avg_acc5 = total_acc5 / total_samples
    best_acc1 = max(best_acc1, avg_acc1)
    best_acc5 = max(best_acc5, avg_acc5)
    print(f'Average Loss: {avg_loss:.4f}\t'
          f'Acc@1 {avg_acc1:.2f}(Best {best_acc1:.2f})\t'
          f'Acc@5 {avg_acc5:.2f}({best_acc5:.2f})')
    return avg_loss, avg_acc1, avg_acc5

def train(dataloader, model, optimizer, criterion):
    """Train ``model`` for one pass over ``dataloader``.

    Batches are moved to the device of the model's first parameter.  The
    loss of iteration 100 is printed once as a progress sample, followed by
    the total elapsed time.

    Args:
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        model: Network to optimise; switched to training mode.
        optimizer: Optimiser bound to ``model``'s parameters.
        criterion: Callable ``criterion(logits, labels)`` returning the loss.
    """
    end = time.time()
    model.train()
    device = next(model.parameters()).device

    for i, (x, labels) in enumerate(dataloader):
        x = x.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()
        logits = model(x)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        if i == 100:
            print(f'Loss: {loss.item():.4f}')
    print(f'Training done in {time.time() - end:.2f}s')

def main_work():
    """Build the loaders and model, then run the train / evaluate loop.

    Trains InceptionResNetV2 with SGD (momentum 0.9, weight decay 5e-4) for
    80 epochs, dividing the learning rate by 10 at epochs 30 and 60.
    """
    num_classes = 50
    epochs = 80
    lr = 0.1

    end = time.time()
    # Use the same num_classes for data and model so they cannot drift apart.
    dataloaders = WebvisionDataloader(num_classes=num_classes)
    train_loader = dataloaders.train()
    test_loader = dataloaders.test()
    print(f'Dataloader made in {time.time() - end:.2f}s')

    model = InceptionResNetV2(num_classes).cuda()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60], gamma=0.1)

    for i in range(epochs):
        print(f'Epoch[{i}] starting')
        end = time.time()
        train(train_loader, model, optimizer, criterion)
        evaluate(model, test_loader)
        # Advance the schedule once per epoch; without this the milestones
        # are never reached and the learning rate stays at its initial value.
        scheduler.step()
        print(f'Epoch[{i}] done in {time.time() - end:.2f}s')

def main():
    """Run ``main_work`` and print the total wall-clock time."""
    end = time.time()
    main_work()
    print(f'Total time: {time.time() - end:.2f}s')

if __name__ == '__main__':
    # Start training only when the file is executed as a script.
    main()

简单运行了2个epoch(输出如下),设备为NVIDIA GeForce RTX 3090,每个epoch耗时超过70分钟,还是非常耗时的。

Dataloader made in 0.40s

Epoch[0] starting

Loss: 3.4720

Training done in 4325.29s

Average Loss: 3.0656	Acc@1 19.28(Best 19.28)	Acc@5 51.40(51.40)

Epoch[0] done in 4374.54s

Epoch[1] starting

Loss: 3.0587

Training done in 4264.95s

Average Loss: 2.8655	Acc@1 25.56(Best 25.56)	Acc@5 54.72(54.72)

Epoch[1] done in 4310.30s


