Paper: https://arxiv.org/abs/1409.1556
1. Main Contributions of VGG
- VGG replaces a 5x5 convolution kernel with two stacked 3x3 kernels, and a 7x7 kernel with three stacked 3x3 kernels. With the receptive field kept the same, stacking several small convolutional layers increases network depth and strengthens feature extraction (a parameter-count comparison is sketched after this list);
- Compared with AlexNet, VGG builds a noticeably deeper network.
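For intuition, here is a minimal sketch comparing the parameter counts of a single 5x5 convolution and two stacked 3x3 convolutions with the same 5x5 receptive field (the channel count 64 is an arbitrary choice for illustration):
import torch
from torch import nn

in_ch = out_ch = 64  # arbitrary channel count, for illustration only

# one 5x5 convolution: receptive field 5x5
conv5 = nn.Sequential(nn.Conv2d(in_ch, out_ch, kernel_size=5, padding=2))

# two stacked 3x3 convolutions: also a 5x5 receptive field, but deeper and with fewer parameters
conv3x2 = nn.Sequential(
    nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
    nn.ReLU(inplace=True),
    nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1),
)

count = lambda m: sum(p.numel() for p in m.parameters())
print(count(conv5))    # 64*64*5*5 + 64       = 102464
print(count(conv3x2))  # 2 * (64*64*3*3 + 64) = 73856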
2. Main Structure of VGG
Taking the most commonly used VGG16 (13 convolutional layers plus 3 fully connected layers) as an example:
Adaptive pooling (AdaptiveAvgPool2d): applies a 2D adaptive average pooling operation to the input. For an input of any spatial size, the output size can be specified as H*W, while the number of feature channels is left unchanged. Reference: https://blog.csdn.net/qq_41997920/article/details/98963215
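A minimal sketch of this behavior (the input sizes below are arbitrary):
import torch
from torch import nn

pool = nn.AdaptiveAvgPool2d(output_size=(7, 7))
# whatever the spatial size of the input, the output is always 7x7,
# and the channel count is unchanged
print(pool(torch.randn(1, 512, 14, 14)).shape)  # torch.Size([1, 512, 7, 7])
print(pool(torch.randn(1, 512, 9, 11)).shape)   # torch.Size([1, 512, 7, 7])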
3. Model Code and Training Process
Model weights download: https://pan.baidu.com/s/1h8r3_2HVqUKA_nIlXgOY5g (extraction code: kov6)
3.1 Model Code and Weight Loading
import torch
from torch import nn
vgg_types = {
'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}
class VGG(nn.Module):
def __init__(self, in_channels=3, num_classes=1000, vgg_type="VGG16"):
super(VGG, self).__init__()
self.in_channels = in_channels
self.features = self.create_conv_layers(vgg_types[vgg_type])
self.avgpool = nn.AdaptiveAvgPool2d(output_size=(7, 7))
self.classifier = nn.Sequential(
nn.Linear(512 * 7 * 7, 4096, bias=True),
nn.ReLU(inplace=True),
nn.Dropout(p=0.5),
nn.Linear(4096, 4096, bias=True),
nn.ReLU(inplace=True),
nn.Dropout(p=0.5),
nn.Linear(4096, num_classes, bias=True)
)
self.initialize_weights()
def forward(self, x):
x = self.features(x)
x = self.avgpool(x)
x = x.view(x.shape[0], -1)
x = self.classifier(x)
return x
def create_conv_layers(self, architecture):
layers = []
in_channels = self.in_channels
for x in architecture:
if type(x) == int:
out_channels = x
layers += [
nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.BatchNorm2d(x),
nn.ReLU(inplace=True)
]
in_channels = x
elif x == 'M':
layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))]
return nn.Sequential(*layers)
def initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_uniform_(m.weight)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
nn.init.kaiming_uniform_(m.weight)
nn.init.constant_(m.bias, 0)
def get_vgg_model(in_channels=3, num_classes=1000, vgg_type="VGG16", is_load_weights=False, model_path=None):
vgg_type = vgg_type.upper()
if is_load_weights:
assert model_path is not None, "model path cannot be none"
device = "cuda" if torch.cuda.is_available() else "cpu"
        model = VGG(in_channels=in_channels, num_classes=10, vgg_type=vgg_type)  # 10 classes here matches the provided CIFAR10 checkpoint; see the note below
missing_keys, unexpected_keys = model.load_state_dict(torch.load(model_path, map_location=device), strict=False)
print(f"missing keys: {missing_keys}")
print(f"unexpected key: {unexpected_keys}")
model.classifier[6] = nn.Linear(model.classifier[6].in_features, num_classes)
else:
model = VGG(in_channels=in_channels, num_classes=num_classes, vgg_type=vgg_type)
return model
if __name__ == '__main__':
model_path = "./VGG16_0.898.pth"
model = get_vgg_model(num_classes=10, is_load_weights=True, model_path=model_path)
input_x = torch.randn((3, 3, 224, 224))
preds = model(input_x)
print(preds.shape)
Because the original VGG model has no BN layers, loading PyTorch's official pretrained weights with the code above will fail. To use the official pretrained model, comment out the BN layers and change num_classes=10 to num_classes=1000 where the model is constructed inside get_vgg_model before the weights are loaded. The VGG16_0.898.pth file contains weights I trained on the CIFAR10 dataset with the network above; it can be used for practice.
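Alternatively, since the network above already uses BN, ImageNet-pretrained weights can also be taken directly from torchvision's VGG16-BN model and the last classification layer replaced. A minimal sketch (pretrained=True is the older torchvision API; newer versions use the weights= argument instead):
import torchvision
from torch import nn

# torchvision's VGG16 with BatchNorm, pretrained on ImageNet (1000 classes)
model = torchvision.models.vgg16_bn(pretrained=True)
# replace the final fully connected layer to match the target number of classes, e.g. 10
model.classifier[6] = nn.Linear(model.classifier[6].in_features, 10)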
3.2 Training the Model on the CIFAR10 Dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch import optim, nn, cuda
import torchvision
from torchvision import transforms
from model import get_vgg_model
import platform
from tqdm import tqdm
import numpy as np
import os
sysstr = platform.system()
device = "cuda" if torch.cuda.is_available() else "cpu"
num_workers = 0 if sysstr == "Windows" else 8
transform = transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor(),
transforms.Normalize(mean=[0.491, 0.482, 0.447], std=[0.247, 0.243, 0.262])
])
classes = ("airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck")
train_dataset = torchvision.datasets.CIFAR10("./data/", train=True, download=True, transform=transform)
val_dataset = torchvision.datasets.CIFAR10("./data/", train=False, download=True, transform=transform)
batch_size = 64
train_dl = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True)
val_dl = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers)
logs_dir = "/root/tf-logs/"
writer = SummaryWriter(logs_dir + "vgg")
writer_dict = {}
for i in range(10):
writer_dict[f"writer_{i}"] = SummaryWriter(logs_dir + classes[i])
vgg_type = "VGG16"
model = get_vgg_model(vgg_type=vgg_type, num_classes=10)
num_epochs = 100
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scaler = cuda.amp.GradScaler()
torch.backends.cudnn.benchmark = True
best_model_acc = 0
model.to(device)
for epoch in range(num_epochs):
train_loop = tqdm(train_dl, leave=True, desc=f"Train Epoch: {epoch}/{num_epochs}")
model.train()
epoch_loss = 0.0
for step, (imgs, labels) in enumerate(train_loop):
imgs = imgs.to(device)
labels = labels.to(device)
with cuda.amp.autocast():
preds = model(imgs)
loss = loss_fn(preds, labels)
optimizer.zero_grad()
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
epoch_loss += loss.item()
train_loop.set_postfix(loss=f"{np.round(loss.item(), 3)}")
epoch_loss /= len(train_loop)
model.eval()
acc_num = 0
class_correct = list(0. for i in range(10))
class_total = list(0. for i in range(10))
with torch.no_grad():
val_loop = tqdm(val_dl, leave=True, desc=f"Val: ")
for step, (imgs, labels) in enumerate(val_loop):
mini_batch_size = len(imgs)
imgs = imgs.to(device)
labels = labels.to(device)
output = model(imgs)
_, preds = torch.max(output, 1)
c = (preds == labels)
c = c.squeeze()
for i in range(mini_batch_size):
label = labels[i]
                class_correct[label] += c[i].item()
class_total[label] += 1
acc = c.sum().item()
acc_num += acc
accuracy = acc_num / len(val_dataset)
for i in range(10):
        writer_dict[f"writer_{i}"].add_scalar(f"classes acc", class_correct[i] / class_total[i], epoch)
if accuracy > best_model_acc:
torch.save(model.state_dict(), f"./vgg_{vgg_type}_{np.round(accuracy, 3)}.pth")
best_model_acc = accuracy
tqdm.write(f"Val Accuracy: {np.round(accuracy * 100, 2)}%")
writer.add_scalar("classes acc", accuracy, epoch)
writer.add_scalar("loss", epoch_loss, epoch)
os.system("shutdown")  # automatically shut down the machine once training finishes
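Once training has finished, the best checkpoint saved above can be loaded back for inference. A minimal sketch (the checkpoint file name vgg_VGG16_0.898.pth and the image path test.png are placeholder assumptions):
import torch
from PIL import Image
from torchvision import transforms
from model import get_vgg_model

device = "cuda" if torch.cuda.is_available() else "cpu"

# rebuild the same architecture used during training and load the saved checkpoint
model = get_vgg_model(num_classes=10)
model.load_state_dict(torch.load("./vgg_VGG16_0.898.pth", map_location=device))
model.to(device)
model.eval()

# the same preprocessing as used for training/validation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.491, 0.482, 0.447], std=[0.247, 0.243, 0.262])
])
classes = ("airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck")

img = transform(Image.open("test.png").convert("RGB")).unsqueeze(0).to(device)
with torch.no_grad():
    pred = model(img).argmax(dim=1).item()
print(classes[pred])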
3.3 Training Results