🚀 PyTorch Cheat Sheet

| Module / Topic | Key Classes / Functions / Ops | Purpose / Notes |
|---|---|---|
| Tensor operations | torch.tensor(), torch.randn(), view(), reshape(), to(device) | Create tensors, reshape, move between devices |
| Autograd | requires_grad=True, backward(), grad, torch.no_grad() | Enable gradient tracking, compute gradients, disable gradient computation |
| Data loading | Dataset, DataLoader | Custom datasets, batched loading and shuffling |
| Building networks | torch.nn.Module, nn.Linear, nn.Conv2d, nn.ReLU, nn.Sequential | Model base class, layers, activations, sequential container |
| Loss functions | nn.MSELoss, nn.CrossEntropyLoss, nn.NLLLoss | Regression tasks, classification tasks |
| Optimizers | torch.optim.SGD, torch.optim.Adam, zero_grad(), step() | Choose an optimizer, zero gradients, apply parameter updates |
| Training loop | model.train(), iterate over batches, forward pass, compute loss, loss.backward(), optimizer.step() | Training mode, compute loss, backpropagate, update parameters |
| Validation / testing | model.eval(), with torch.no_grad(): | Evaluation mode, disable gradient computation |
| Saving / loading | torch.save(), torch.load(), model.state_dict() | Save/load models, get/load state dicts |
| Device management | torch.cuda.is_available(), torch.device() | Check GPU availability, select a device |
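
Most rows of this table are expanded in the sections below, but device selection and the basic save/load pattern appear only in passing, so here is a minimal standalone sketch (the nn.Linear layer and the file name my_model.pth are arbitrary examples):

import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.Linear(10, 2).to(device)            # move the model to the selected device

# Saving/loading the state dict is the recommended way to persist a model
torch.save(model.state_dict(), "my_model.pth")
model.load_state_dict(torch.load("my_model.pth", map_location=device))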

🛠️ Core Operations and Concepts

1. 🎯 Tensor Operations

1.1 Tensor Creation and Initialization

import torch
import numpy as np

# Basic creation
x = torch.tensor([1, 2, 3])                    # from a Python list
y = torch.tensor(np.array([1, 2, 3]))          # from a NumPy array

# Special tensors
zeros = torch.zeros(2, 3)                      # all zeros
ones = torch.ones(2, 3)                        # all ones
eye = torch.eye(3)                             # identity matrix
arange = torch.arange(0, 10, 2)                # range tensor [0, 2, 4, 6, 8]
linspace = torch.linspace(0, 1, 5)             # evenly spaced [0.0000, 0.2500, 0.5000, 0.7500, 1.0000]

# Random tensors
rand_uniform = torch.rand(2, 3)                # uniform distribution on [0, 1)
rand_normal = torch.randn(2, 3)                # standard normal distribution
rand_int = torch.randint(0, 10, (2, 3))        # random integers

# Tensors shaped like an existing tensor
x_like = torch.zeros_like(rand_normal)         # zeros with the same shape
same_shape = torch.randn_like(rand_normal)     # random values with the same shape

1.2 Tensor Indexing and Slicing

# Example tensor
tensor = torch.randn(4, 5, 6)

# Basic indexing
print(tensor[0])           # first slice along dim 0, shape (5, 6)
print(tensor[:, 0])        # first slice along dim 1, shape (4, 6)
print(tensor[..., 0])      # first element along the last dim, shape (4, 5)

# Advanced indexing
mask = tensor > 0.5
filtered = tensor[mask]    # boolean indexing (returns a 1-D tensor of matching elements)

indices = torch.tensor([0, 2, 3])
selected = tensor[indices] # integer-array indexing along dim 0

# Slicing
sliced = tensor[1:3, 2:4]           # rows 1-2, columns 2-3
strided = tensor[::2, ::3]          # every 2nd row, every 3rd column
flipped = torch.flip(tensor, dims=[0, 1])  # reverse dims 0 and 1 (PyTorch does not support negative slice steps)

# Assignment via indexing
tensor[0, :] = 1.0                  # set the first row to 1
tensor[mask] = 0.0                  # zero out elements where mask is True

1.3 Tensor Shape Operations

x = torch.randn(2, 3, 4)

# Reshaping
reshaped = x.reshape(6, 4)          # reshape to (6, 4)
viewed = x.view(3, 8)               # view-based reshape (requires contiguous memory)
squeezed = x.squeeze()              # remove all size-1 dimensions
unsqueezed = x.unsqueeze(0)         # insert a size-1 dimension at position 0

# Transpose and permute
transposed = x.transpose(0, 1)      # swap dims 0 and 1 (x.T is deprecated for tensors with more than 2 dims)
permuted = x.permute(2, 0, 1)       # reorder all dimensions

# Concatenation and stacking
a, b = torch.randn(2, 3), torch.randn(2, 3)
cat = torch.cat([a, b], dim=0)      # concatenate along dim 0 → (4, 3)
stack = torch.stack([a, b], dim=0)  # stack along a new dim → (2, 2, 3)

# Splitting
chunks = torch.chunk(x, 2, dim=0)   # split into 2 chunks along dim 0
split = torch.split(x, 2, dim=1)    # split into pieces of size 2 along dim 1

1.4 Tensor Math Operations

a, b = torch.tensor([1.0, 2.0]), torch.tensor([3.0, 4.0])

# Elementwise arithmetic
add = a + b                         # addition
sub = a - b                         # subtraction
mul = a * b                         # elementwise multiplication
div = a / b                         # division
power = a ** 2                      # exponentiation (naming this `pow` would shadow the builtin)

# Matrix operations
mat_a, mat_b = torch.randn(2, 3), torch.randn(3, 2)
matmul = torch.mm(mat_a, mat_b)     # matrix multiplication
bmm = torch.bmm(mat_a.unsqueeze(0), mat_b.unsqueeze(0)) # batched matrix multiplication

# Reductions
x = torch.randn(2, 3)
sum_all = x.sum()                   # sum of all elements
sum_dim = x.sum(dim=0)              # sum along dim 0
mean = x.mean()                     # mean
std = x.std()                       # standard deviation
max_val, max_idx = x.max(dim=1)     # max values and their indices along dim 1

# Comparisons
eq = torch.eq(a, b)                 # elementwise equality
gt = torch.gt(a, b)                 # greater than
lt = torch.lt(a, b)                 # less than

2. 🧠 Advanced Autograd Features

2.1 Gradient Computation and Control

# Basic gradient computation
x = torch.tensor(2.0, requires_grad=True)
y = x ** 3 + 2 * x + 1
y.backward()
print(x.grad)  # dy/dx = 3*x² + 2 = 14.0

# Gradient of a vector-valued input
x = torch.tensor([1.0, 2.0], requires_grad=True)
y = x.sum() ** 2
y.backward()
print(x.grad)  # [2*sum, 2*sum] = [6.0, 6.0]

# Higher-order gradients
x = torch.tensor(3.0, requires_grad=True)
y = x ** 3
grad1 = torch.autograd.grad(y, x, create_graph=True)[0]  # first derivative: 3*x²
grad2 = torch.autograd.grad(grad1, x)[0]                 # second derivative
print(grad2)  # 6*x = 18.0

2.2 Gradient Control Contexts

# Disabling gradient tracking
x = torch.tensor(1.0, requires_grad=True)

with torch.no_grad():
    y = x * 2  # not tracked by autograd
    # y.requires_grad is False here

# Gradient checkpointing (memory optimization)
def gradient_checkpointing():
    from torch.utils.checkpoint import checkpoint
    
    def custom_forward(x):
        return x ** 2
    
    x = torch.tensor(2.0, requires_grad=True)
    y = checkpoint(custom_forward, x)  # recomputes activations during backward to save memory
    y.backward()

# Gradient accumulation (assumes model, criterion, optimizer, and dataloader are defined)
accumulation_steps = 4
optimizer.zero_grad()
for i, (data, target) in enumerate(dataloader):
    output = model(data)
    loss = criterion(output, target) / accumulation_steps  # average over the accumulated batches
    loss.backward()
    
    if (i + 1) % accumulation_steps == 0:  # update once every 4 batches
        optimizer.step()
        optimizer.zero_grad()

3. 🏗️ Building Neural Networks

3.1 Network Layers in Detail

import torch.nn as nn

# Convolution layers
conv1d = nn.Conv1d(1, 32, kernel_size=3)      # 1-D convolution
conv2d = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1)
conv3d = nn.Conv3d(1, 32, kernel_size=3)

# Pooling layers
maxpool2d = nn.MaxPool2d(2, stride=2)         # max pooling
avgpool2d = nn.AvgPool2d(2)                   # average pooling
adaptive_pool = nn.AdaptiveAvgPool2d((1, 1))  # adaptive pooling to a fixed output size

# Normalization layers
batch_norm = nn.BatchNorm2d(64)               # batch normalization
layer_norm = nn.LayerNorm(128)                # layer normalization
instance_norm = nn.InstanceNorm2d(32)         # instance normalization

# Recurrent layers
lstm = nn.LSTM(128, 64, num_layers=2, bidirectional=True)
gru = nn.GRU(128, 64, batch_first=True)
rnn = nn.RNN(128, 64, nonlinearity='tanh')

# Attention
multihead_attn = nn.MultiheadAttention(512, 8)  # multi-head attention
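
A quick shape check for the layers defined above (a minimal sketch; the input sizes are arbitrary):

x_img = torch.randn(8, 3, 32, 32)              # (batch, channels, height, width)
print(conv2d(x_img).shape)                     # torch.Size([8, 64, 16, 16]): stride 2 halves H and W
print(adaptive_pool(conv2d(x_img)).shape)      # torch.Size([8, 64, 1, 1])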

3.2 Complex Network Architectures

class ResidualBlock(nn.Module):
    """Residual block."""
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)
    
    def forward(self, x):
        residual = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += residual  # residual (skip) connection
        return self.relu(out)

class ComplexNetwork(nn.Module):
    """A more complex example network."""
    def __init__(self, num_classes=10):
        super().__init__()
        
        # Feature extractor
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            ResidualBlock(64),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        
        # Classifier
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(256, num_classes)
        )
        
        # Attention
        self.attention = nn.MultiheadAttention(128, 8)
    
    def forward(self, x):
        x = self.features(x)
        
        # Apply attention over the spatial positions
        b, c, h, w = x.shape
        x_attn = x.view(b, c, -1).permute(2, 0, 1)  # (h*w, b, c)
        attn_out, _ = self.attention(x_attn, x_attn, x_attn)
        x = attn_out.permute(1, 2, 0).reshape(b, c, h, w)  # reshape, not view: the permuted tensor is not contiguous
        
        return self.classifier(x)

4. 🔄 Training Loop Optimization

4.1 Advanced Training Techniques

def advanced_training_loop(model, train_loader, val_loader, criterion, optimizer, num_epochs):
    """Advanced training loop."""
    
    # Learning rate scheduler
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, 
        max_lr=0.1,
        epochs=num_epochs,
        steps_per_epoch=len(train_loader)
    )
    
    # Gradient scaler (mixed-precision training)
    scaler = torch.cuda.amp.GradScaler()
    
    # Early stopping
    best_loss = float('inf')
    patience = 5
    patience_counter = 0
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.cuda(), target.cuda()
            
            optimizer.zero_grad()
            
            # Mixed-precision forward pass
            with torch.cuda.amp.autocast():
                output = model(data)
                loss = criterion(output, target)
            
            scaler.scale(loss).backward()
            
            # Gradient clipping: unscale first so the clip sees true gradient magnitudes,
            # and clip before the optimizer step (clipping afterwards has no effect)
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            scaler.step(optimizer)
            scaler.update()
            
            scheduler.step()
            
            train_loss += loss.item()
        
        # Validation phase
        model.eval()
        val_loss = 0.0
        correct = 0
        
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.cuda(), target.cuda()
                output = model(data)
                val_loss += criterion(output, target).item()
                pred = output.argmax(dim=1)
                correct += pred.eq(target).sum().item()
        
        # Early-stopping check
        if val_loss < best_loss:
            best_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience_counter += 1
            
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break
        
        print(f'Epoch {epoch}: Train Loss: {train_loss/len(train_loader):.4f}, '
              f'Val Loss: {val_loss/len(val_loader):.4f}, '
              f'Accuracy: {100.*correct/len(val_loader.dataset):.2f}%')

5. 📊 Visualization and Debugging

5.1 TensorBoard Integration

from torch.utils.tensorboard import SummaryWriter

def setup_tensorboard_logging(model, train_loader):
    """Set up TensorBoard logging."""
    writer = SummaryWriter('runs/experiment1')
    
    # Log the model graph
    data_iter = iter(train_loader)
    sample_data, _ = next(data_iter)
    writer.add_graph(model, sample_data)
    
    # Log metrics during training (num_epochs, train_loss, etc. come from your loop)
    for epoch in range(num_epochs):
        # ... training code ...
        
        # Log loss and accuracy
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Accuracy/train', train_accuracy, epoch)
        writer.add_scalar('Learning_rate', optimizer.param_groups[0]['lr'], epoch)
        
        # Log weight and gradient distributions
        for name, param in model.named_parameters():
            writer.add_histogram(name, param, epoch)
            if param.grad is not None:  # grad is None before the first backward pass
                writer.add_histogram(f'{name}.grad', param.grad, epoch)
    
    writer.close()

5.2 Model Debugging Tools

def model_debugging_tools(model, dataloader):
    """Model debugging utilities."""
    
    # Forward hooks to capture intermediate activations
    activations = {}
    def get_activation(name):
        def hook(model, input, output):
            activations[name] = output.detach()
        return hook
    
    # Register a hook on every Conv2d layer
    hooks = []
    for name, layer in model.named_modules():
        if isinstance(layer, nn.Conv2d):
            hook = layer.register_forward_hook(get_activation(name))
            hooks.append(hook)
    
    # Run a forward pass to capture activations
    data, target = next(iter(dataloader))
    output = model(data)
    
    # Remove the hooks
    for hook in hooks:
        hook.remove()
    
    # Model statistics
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print(f"总参数: {total_params:,}")
    print(f"可训练参数: {trainable_params:,}")
    
    # Memory usage
    if torch.cuda.is_available():
        print(f"GPU memory allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")

6. 🚀 Distributed Training

6.1 Multi-GPU Training

import os
import torch.distributed as dist
import torch.multiprocessing as mp

def setup_ddp(rank, world_size):
    """Initialize distributed data parallel (one process per GPU)."""
    os.environ.setdefault("MASTER_ADDR", "localhost")  # rendezvous address for env:// init
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def cleanup_ddp():
    dist.destroy_process_group()

def train_ddp(rank, world_size, model, dataset, num_epochs=10):
    """Distributed training worker."""
    setup_ddp(rank, world_size)
    
    # Distributed sampler: each process sees a distinct shard of the data
    sampler = torch.utils.data.DistributedSampler(
        dataset, num_replicas=world_size, rank=rank
    )
    
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=64, sampler=sampler
    )
    
    # Move the model to this process's GPU and wrap it
    model = model.to(rank)
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
    
    # Training loop
    for epoch in range(num_epochs):
        sampler.set_epoch(epoch)  # reshuffle with a different seed each epoch
        for batch in dataloader:
            # ... training code ...
            pass
    
    cleanup_ddp()

# Launch distributed training (model and dataset are assumed to be defined)
def main():
    world_size = torch.cuda.device_count()
    mp.spawn(train_ddp, args=(world_size, model, dataset), 
             nprocs=world_size, join=True)

if __name__ == "__main__":
    main()

7. 🎯 Model Deployment and Optimization

7.1 Model Quantization and Optimization

def model_quantization(model, calibration_loader):
    """Model quantization."""
    model.eval()
    
    # Dynamic quantization (applies to nn.Linear and recurrent layers; Conv2d is not supported dynamically)
    quantized_model = torch.quantization.quantize_dynamic(
        model, {nn.Linear}, dtype=torch.qint8
    )
    
    # Prepare for static quantization
    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
    model_prepared = torch.quantization.prepare(model, inplace=False)
    
    # Calibrate with representative data
    with torch.no_grad():
        for data, _ in calibration_loader:
            model_prepared(data)
    
    # Convert to a quantized model
    model_quantized = torch.quantization.convert(model_prepared)
    
    return model_quantized

def model_pruning(model, pruning_amount=0.3):
    """Model pruning."""
    from torch.nn.utils import prune
    
    parameters_to_prune = []
    
    for name, module in model.named_modules():
        if isinstance(module, (nn.Conv2d, nn.Linear)):
            parameters_to_prune.append((module, 'weight'))
    
    # Global unstructured pruning by L1 magnitude
    prune.global_unstructured(
        parameters_to_prune,
        pruning_method=prune.L1Unstructured,
        amount=pruning_amount,
    )
    
    return model

7.2 ONNX Export and Inference

def export_to_onnx(model, sample_input, onnx_path="model.onnx"):
    """Export a model to ONNX format."""
    model.eval()
    
    torch.onnx.export(
        model,
        sample_input,
        onnx_path,
        export_params=True,
        opset_version=13,
        do_constant_folding=True,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={
            'input': {0: 'batch_size'},
            'output': {0: 'batch_size'}
        }
    )
    print(f"Model exported to {onnx_path}")

def onnx_inference(onnx_path, input_data):
    """Run inference with an ONNX model."""
    import onnxruntime as ort
    
    ort_session = ort.InferenceSession(onnx_path)
    
    # Prepare the input feed
    ort_inputs = {ort_session.get_inputs()[0].name: input_data.numpy()}
    
    # Run inference
    ort_outs = ort_session.run(None, ort_inputs)
    
    return torch.tensor(ort_outs[0])
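
As a sanity check after export, the PyTorch and ONNX Runtime outputs can be compared directly. A minimal sketch using the two helpers above (model and sample_input are assumed to be defined, and the tolerance is illustrative):

export_to_onnx(model, sample_input)

with torch.no_grad():
    torch_out = model(sample_input)

onnx_out = onnx_inference("model.onnx", sample_input)
print(torch.allclose(torch_out, onnx_out, atol=1e-5))  # should print True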

8. 🔧 Utility Functions

8.1 Training Helpers

class ModelCheckpoint:
    """Model checkpoint manager."""
    def __init__(self, save_dir, mode='min', patience=5):
        self.save_dir = save_dir
        self.mode = mode
        self.patience = patience
        self.best_score = None
        self.counter = 0
        
    def __call__(self, score, model, optimizer, epoch):
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(model, optimizer, epoch, True)
        elif (self.mode == 'min' and score < self.best_score) or \
             (self.mode == 'max' and score > self.best_score):
            self.best_score = score
            self.save_checkpoint(model, optimizer, epoch, True)
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                return True  # signal early stopping
        return False
    
    def save_checkpoint(self, model, optimizer, epoch, is_best):
        import os
        os.makedirs(self.save_dir, exist_ok=True)  # make sure the directory exists
        
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_score': self.best_score
        }
        
        torch.save(checkpoint, f'{self.save_dir}/checkpoint.pth')
        if is_best:
            torch.save(model.state_dict(), f'{self.save_dir}/best_model.pth')
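
A minimal usage sketch for ModelCheckpoint (model, optimizer, num_epochs, and the per-epoch val_loss are assumed to come from your own training loop):

checkpointer = ModelCheckpoint(save_dir='checkpoints', mode='min', patience=5)

for epoch in range(num_epochs):
    # ... train, then compute val_loss on the validation set ...
    if checkpointer(val_loss, model, optimizer, epoch):
        print("Early stopping triggered!")
        break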

Summary:

  1. Dynamic computation graphs: PyTorch builds the graph at runtime, so ordinary Python control flow can shape each forward pass (see the sketch below).
  2. Gradient accumulation: for large models, run several forward/backward passes before one optimizer step to simulate a larger batch size.
  3. Learning rate scheduling: use torch.optim.lr_scheduler to adjust the learning rate over training.
  4. TensorBoard visualization: integrate TensorBoard to track experiment metrics.
  5. Mixed-precision training: use torch.cuda.amp for automatic mixed precision to save GPU memory and speed up training.
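
A minimal sketch of point 1 (the layer size and branch condition are arbitrary): because the graph is rebuilt on every forward pass, the executed path can depend on the data, and autograd records whichever branch actually ran.

import torch
import torch.nn as nn

class DynamicNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 4)

    def forward(self, x):
        # Data-dependent control flow: which branch runs depends on the input
        if x.sum() > 0:
            return torch.relu(self.fc(x))
        return torch.tanh(self.fc(x))

net = DynamicNet()
out = net(torch.randn(1, 4))
out.sum().backward()  # gradients flow through the branch that was taken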
