# 计算机视觉模型:从 ResNet 到 Vision Transformer

## 核心概念与发展历程

计算机视觉模型经历了从传统手工特征提取到深度学习的重大转变,其中 ResNet 和 Vision Transformer (ViT) 是两个重要的里程碑。

### 发展时间线

| 年份 | 模型 | 关键创新 |
|------|------|----------|
| 2012 | AlexNet | 深度卷积神经网络 |
| 2014 | VGGNet | 更深的网络结构 |
| 2015 | ResNet | 残差连接解决梯度消失 |
| 2017 | DenseNet | 密集连接 |
| 2020 | Vision Transformer | 引入自注意力机制 |
| 2021 | Swin Transformer | 窗口注意力机制 |

## ResNet(残差网络)

### 核心原理

ResNet 通过引入残差块(Residual Block)解决了深层网络的梯度消失问题,使得网络可以达到数百层甚至更深。

### 残差块结构

```python
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        # 主路径
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # shortcut路径
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out += self.shortcut(residual)  # 残差连接
        out = self.relu(out)
        return out
```

### ResNet 整体结构

```python
class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        super(ResNet, self).__init__()
        self.in_channels = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # 残差层
        self.layer1 = self._make_layer(block, 64, layers[0], stride=1)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride):
        strides = [stride] + [1] * (blocks - 1)
        layers = []
        for stride in strides:
            layers.append(block(self.in_channels, out_channels, stride))
            self.in_channels = out_channels
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.maxpool(out)
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.avgpool(out)
        out = torch.flatten(out, 1)
        out = self.fc(out)
        return out

# 创建ResNet-34
resnet34 = ResNet(ResidualBlock, [3, 4, 6, 3])
```

## Vision Transformer(视觉 Transformer)

### 核心原理

Vision Transformer 将 NLP 中的 Transformer 架构应用到计算机视觉领域,通过自注意力机制捕获图像中的长距离依赖关系。

### ViT 结构

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class PatchEmbedding(nn.Module):
    def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=768):
        super(PatchEmbedding, self).__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2

        self.proj = nn.Conv2d(
            in_channels=in_channels,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size
        )

    def forward(self, x):
        x = self.proj(x)        # (B, embed_dim, H//patch_size, W//patch_size)
        x = x.flatten(2)        # (B, embed_dim, n_patches)
        x = x.transpose(1, 2)   # (B, n_patches, embed_dim)
        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5)
        attn = attn.softmax(dim=-1)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        return x

class VisionTransformer(nn.Module):
    def __init__(self, img_size=224, patch_size=16, in_channels=3,
                 embed_dim=768, num_heads=8, num_layers=12,
                 mlp_ratio=4., num_classes=1000):
        super(VisionTransformer, self).__init__()
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, 1 + self.patch_embed.n_patches, embed_dim))

        self.blocks = nn.ModuleList([
            nn.Sequential(
                nn.LayerNorm(embed_dim),
                MultiHeadAttention(embed_dim, num_heads),
                nn.LayerNorm(embed_dim),
                nn.Sequential(
                    nn.Linear(embed_dim, int(embed_dim * mlp_ratio)),
                    nn.GELU(),
                    nn.Linear(int(embed_dim * mlp_ratio), embed_dim)
                )
            )
            for _ in range(num_layers)
        ])

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        B = x.shape[0]
        x = self.patch_embed(x)

        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed

        for block in self.blocks:
            x = x + block[1](block[0](x))  # 残差连接
            x = x + block[3](block[2](x))  # 残差连接

        x = self.norm(x)
        x = x[:, 0]  # 取CLS token
        x = self.head(x)
        return x

# 创建ViT模型
vit = VisionTransformer()
```

## 性能分析

### 模型性能对比

| 模型 | 参数量 (M) | Top-1 准确率 (%) | 推理速度 (ms) | 内存使用 (MB) |
|------|-----------|------------------|---------------|---------------|
| ResNet-18 | 11.7 | 70.3 | 1.2 | 310 |
| ResNet-34 | 21.8 | 73.3 | 2.1 | 440 |
| ResNet-50 | 25.6 | 76.2 | 3.2 | 630 |
| ResNet-101 | 44.5 | 77.3 | 5.8 | 1020 |
| ViT-B/16 | 86.8 | 78.5 | 6.5 | 1250 |
| ViT-L/16 | 304.4 | 81.8 | 18.2 | 4100 |

### 训练性能分析

```python
import time
import torch
import torchvision.models as models

# 测试不同模型的训练速度
def test_training_speed():
    models_to_test = {
        "ResNet-18": models.resnet18(),
        "ResNet-50": models.resnet50(),
        "ViT-B/16": models.vit_b_16()
    }

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    batch_size = 32
    input_tensor = torch.randn(batch_size, 3, 224, 224).to(device)
    labels = torch.randint(0, 1000, (batch_size,)).to(device)

    for name, model in models_to_test.items():
        model.to(device)
        model.train()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        criterion = torch.nn.CrossEntropyLoss()

        start_time = time.time()
        for i in range(10):
            optimizer.zero_grad()
            outputs = model(input_tensor)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        end_time = time.time()

        print(f"{name}: {end_time - start_time:.2f}秒")

if __name__ == "__main__":
    test_training_speed()
```

## 高级应用场景

### 1. 图像分类

```python
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# 加载预训练模型
model = models.resnet50(pretrained=True)
model.eval()

# 图像预处理
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# 加载图像
image = Image.open("cat.jpg")
image = transform(image).unsqueeze(0)

# 推理
with torch.no_grad():
    outputs = model(image)
    _, predicted = torch.max(outputs, 1)

# 加载标签
with open("imagenet_classes.txt", "r") as f:
    classes = [line.strip() for line in f.readlines()]

print(f"预测结果: {classes[predicted.item()]}")
```

### 2. 目标检测

```python
import torch
import torchvision.models as models
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

# 使用ResNet作为 backbone
backbone = models.resnet50(pretrained=True)
backbone.out_channels = 2048

# 创建RPN锚点生成器
anchor_generator = AnchorGenerator(
    sizes=((32, 64, 128, 256, 512),),
    aspect_ratios=((0.5, 1.0, 2.0),)
)

# 创建Faster R-CNN模型
model = FasterRCNN(
    backbone=backbone,
    num_classes=91,  # COCO数据集类别数
    rpn_anchor_generator=anchor_generator
)

# 模型推理
model.eval()
input_tensor = torch.randn(1, 3, 640, 640)
with torch.no_grad():
    predictions = model(input_tensor)

print(predictions)
```

### 3. 图像分割

```python
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# 加载预训练的分割模型
model = models.segmentation.deeplabv3_resnet101(pretrained=True)
model.eval()

# 图像预处理
transform = transforms.Compose([
    transforms.Resize(520),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# 加载图像
image = Image.open("street.jpg")
image = transform(image).unsqueeze(0)

# 推理
with torch.no_grad():
    output = model(image)["out"][0]
output_predictions = output.argmax(0)

# 可视化分割结果
import numpy as np
import matplotlib.pyplot as plt

# 颜色映射
palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
colors = torch.as_tensor([i for i in range(21)])[:, None] * palette
colors = (colors % 255).numpy().astype("uint8")

# 应用颜色映射
seg_map = colors[output_predictions.numpy()]
seg_map = Image.fromarray(seg_map)

plt.figure(figsize=(10, 10))
plt.subplot(121)
plt.imshow(Image.open("street.jpg"))
plt.title("原始图像")
plt.axis("off")
plt.subplot(122)
plt.imshow(seg_map)
plt.title("分割结果")
plt.axis("off")
plt.show()
```

## 最佳实践

### 1. 模型选择

- 小数据集/边缘设备:ResNet-18、ResNet-34
- 平衡性能与精度:ResNet-50、ViT-B/16
- 高精度需求:ResNet-101、ViT-L/16

### 2. 训练技巧

- 数据增强:随机裁剪、翻转、色彩抖动
- 学习率调度:余弦退火、学习率衰减
- 正则化:Dropout、权重衰减
- 混合精度训练:使用FP16加速训练

### 3. 模型部署

- 模型量化:INT8量化减少模型大小和推理时间
- 模型剪枝:移除不重要的神经元
- 知识蒸馏:将大模型知识迁移到小模型

## 代码优化建议

### 1. 模型优化

```python
# 原始代码
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    for batch in dataloader:
        images, labels = batch
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

# 优化后代码
from torch.cuda.amp import autocast, GradScaler

def train_optimized(model, dataloader, optimizer, criterion, device):
    model.train()
    scaler = GradScaler()  # 混合精度训练
    for batch in dataloader:
        images, labels = batch
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        with autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
```

### 2. 推理优化

```python
# 原始代码
def inference(model, image):
    model.eval()
    with torch.no_grad():
        output = model(image)
    return output

# 优化后代码
def inference_optimized(model, image):
    model.eval()
    model = torch.jit.trace(model, image)
    model = torch.jit.freeze(model)
    with torch.no_grad():
        output = model(image)
    return output
```

## 输入输出示例

### 示例1:ResNet图像分类

输入:

```python
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# 加载模型
model = models.resnet50(pretrained=True)
model.eval()

# 预处理
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# 加载图像
image = Image.open("cat.jpg")
image_tensor = transform(image).unsqueeze(0)

# 推理
with torch.no_grad():
    outputs = model(image_tensor)
    _, predicted = torch.max(outputs, 1)

# 加载标签
with open("imagenet_classes.txt", "r") as f:
    classes = [line.strip() for line in f.readlines()]

print(f"预测结果: {classes[predicted.item()]}")
```

输出:

```
预测结果: 虎斑猫
```

### 示例2:Vision Transformer图像分类

输入:

```python
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# 加载模型
model = models.vit_b_16(pretrained=True)
model.eval()

# 预处理
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# 加载图像
image = Image.open("dog.jpg")
image_tensor = transform(image).unsqueeze(0)

# 推理
with torch.no_grad():
    outputs = model(image_tensor)
    _, predicted = torch.max(outputs, 1)

# 加载标签
with open("imagenet_classes.txt", "r") as f:
    classes = [line.strip() for line in f.readlines()]

print(f"预测结果: {classes[predicted.item()]}")
```

输出:

```
预测结果: 金毛寻回犬
```

## 总结

计算机视觉模型从 ResNet 到 Vision Transformer 的发展,代表了深度学习在视觉领域的重大突破。ResNet 通过残差连接解决了深层网络的梯度消失问题,而 Vision Transformer 则引入了自注意力机制,能够更好地捕获图像中的长距离依赖关系。

### 核心优势对比

| 模型类型 | 优势 | 劣势 |
|----------|------|------|
| ResNet | 计算效率高、适合小数据集、易于训练 | 感受野有限、长距离依赖建模能力弱 |
| Vision Transformer | 长距离依赖建模能力强、精度高、可扩展性好 | 计算成本高、需要大量训练数据 |

### 未来发展趋势

- 混合架构:结合CNN和Transformer的优势,如ViT与ResNet的混合模型
- 轻量级模型:如MobileViT等针对移动设备的高效模型
- 自监督学习:利用未标记数据提升模型性能
- 多模态融合:结合视觉和语言等多模态信息

通过选择合适的模型架构并应用最佳实践,我们可以在各种计算机视觉任务中取得更好的性能。无论是 ResNet 还是 Vision Transformer,它们都为计算机视觉的发展做出了重要贡献,并且将继续推动该领域的创新。