Basic Concepts and Tensor Operations
1. What are the main differences between a PyTorch Tensor and a NumPy array?
# Main differences:
# 1. Tensors can run on the GPU; NumPy arrays are CPU-only
# 2. Tensors support automatic differentiation (autograd)
# 3. Tensors come with a richer API geared toward deep learning
import torch
import numpy as np
# Converting between the two
numpy_array = np.array([1, 2, 3])
tensor = torch.from_numpy(numpy_array)
tensor_to_numpy = tensor.numpy()
2. How do you move a Tensor between the CPU and GPU?
# Check whether a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Create a tensor and move it to the device
tensor = torch.tensor([1, 2, 3])
tensor = tensor.to(device)
# Or create it on the device directly
tensor_on_gpu = torch.tensor([1, 2, 3], device=device)
3. Explain the difference between view() and reshape()
# transpose() makes the tensor non-contiguous, which is what exposes the difference
tensor = torch.randn(2, 3, 4).transpose(0, 1)
# view() requires contiguous memory and raises an error otherwise
try:
    viewed = tensor.view(4, 6)
except RuntimeError as e:
    print(f"view error: {e}")
# reshape() is more flexible: it copies the data when the memory layout requires it
reshaped = tensor.reshape(4, 6)
# Make the tensor contiguous first, then view() works
contiguous_tensor = tensor.contiguous()
viewed = contiguous_tensor.view(4, 6)
Autograd and Gradient Computation
4. Explain what the requires_grad flag does
# requires_grad=True tells autograd to track operations on this tensor
x = torch.tensor([1.0, 2.0], requires_grad=True)
y = torch.tensor([3.0, 4.0], requires_grad=False)
w = torch.tensor([5.0], requires_grad=True)
b = torch.tensor([6.0], requires_grad=True)
# The computation below is recorded in the autograd graph
z = w * x + b
loss = z.sum()
# Backpropagate to compute the gradients
loss.backward()
print(f"grad of w: {w.grad}")  # tensor([3.])
print(f"grad of b: {b.grad}")  # tensor([2.])
5. Explain the difference between detach() and with torch.no_grad()
x = torch.tensor([1.0], requires_grad=True)
y = x * 2
# detach() creates a new tensor that is cut off from the graph and needs no gradient
y_detached = y.detach()
print(f"y_detached requires_grad: {y_detached.requires_grad}")
# Operations inside the torch.no_grad() context manager are not tracked at all
with torch.no_grad():
    z = x * 3
print(f"z requires_grad: {z.requires_grad}")
# Difference: detach() acts on a single tensor, no_grad() acts on a whole block of code
6. How do you compute gradients and update parameters by hand?
# Define the parameters
w = torch.tensor([1.0], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)
# Forward pass
x = torch.tensor([2.0])
y_pred = w * x + b
y_true = torch.tensor([3.0])
loss = (y_pred - y_true) ** 2
# Method 1: autograd
loss.backward()
print(f"autograd - w_grad: {w.grad}, b_grad: {b.grad}")
# Zero the gradients
w.grad.zero_()
b.grad.zero_()
# Method 2: compute the gradients manually
manual_w_grad = 2 * (y_pred - y_true) * x
manual_b_grad = 2 * (y_pred - y_true)
print(f"manual - w_grad: {manual_w_grad}, b_grad: {manual_b_grad}")
Building Neural Networks
7. How do you define a simple custom neural network?
import torch.nn as nn
import torch.nn.functional as F

class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Using the network
model = SimpleNN(10, 50, 2)
input_data = torch.randn(32, 10)  # batch_size=32, input_size=10
output = model(input_data)
8. Explain how nn.Sequential is used
# Option 1: pass the layers directly
model = nn.Sequential(
    nn.Linear(10, 50),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(50, 20),
    nn.ReLU(),
    nn.Linear(20, 2)
)
# Option 2: use an OrderedDict to name the layers
from collections import OrderedDict
model = nn.Sequential(OrderedDict([
    ('fc1', nn.Linear(10, 50)),
    ('relu1', nn.ReLU()),
    ('dropout1', nn.Dropout(0.2)),
    ('fc2', nn.Linear(50, 2))
]))
# Forward pass
output = model(input_data)
9. How do you implement a residual connection?
class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels,
                               kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels,
                               kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # shortcut connection
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels,
                          kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)  # residual connection
        out = F.relu(out)
        return out
Loss Functions and Optimizers
10. What are the commonly used loss functions?
# Classification
criterion_ce = nn.CrossEntropyLoss()          # multi-class (takes raw logits)
criterion_bce = nn.BCELoss()                  # binary (expects probabilities, i.e. after sigmoid)
criterion_bce_logits = nn.BCEWithLogitsLoss() # binary (takes raw logits, sigmoid built in, numerically more stable)
# Regression
criterion_mse = nn.MSELoss()        # mean squared error
criterion_l1 = nn.L1Loss()          # mean absolute error
criterion_huber = nn.SmoothL1Loss() # Huber-style loss
# Usage example
outputs = torch.randn(10, 5)  # 10 samples, 5 classes
labels = torch.randint(0, 5, (10,))
loss = criterion_ce(outputs, labels)
11. Choosing and using an optimizer
model = SimpleNN(10, 50, 2)
# Different optimizers
optimizer_sgd = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer_adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
optimizer_rmsprop = torch.optim.RMSprop(model.parameters(), lr=0.01)
# Usage in the training loop (pick one optimizer; Adam here, input_data from question 7)
optimizer = optimizer_adam
criterion = nn.CrossEntropyLoss()
labels = torch.randint(0, 2, (32,))  # dummy targets matching the batch of 32
for epoch in range(100):
    # Forward pass
    outputs = model(input_data)
    loss = criterion(outputs, labels)
    # Backward pass
    optimizer.zero_grad()  # zero the gradients
    loss.backward()        # compute the gradients
    optimizer.step()       # update the parameters
12. Using learning-rate schedulers
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
# Various schedulers
scheduler_step = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
scheduler_exp = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
scheduler_plateau = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5)
for epoch in range(100):
    # training steps...
    train_loss = ...
    # Update the learning rate
    scheduler_step.step()  # step by epoch count
    # or
    scheduler_plateau.step(train_loss)  # step based on a monitored loss
    current_lr = optimizer.param_groups[0]['lr']
    print(f'Epoch {epoch}, LR: {current_lr}')
Data Loading and Processing
13. How do you create a custom dataset?
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.labels[idx]
        if self.transform:
            sample = self.transform(sample)
        return sample, label

# Usage example
data = torch.randn(1000, 10)
labels = torch.randint(0, 2, (1000,))
dataset = CustomDataset(data, labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
14. The important DataLoader parameters
dataloader = DataLoader(
    dataset,
    batch_size=32,     # batch size
    shuffle=True,      # shuffle the data each epoch
    num_workers=4,     # number of worker processes for loading
    pin_memory=True,   # pin (page-lock) host memory; recommended when training on GPU
    drop_last=True,    # drop the final incomplete batch
    collate_fn=None    # custom function for assembling a batch
)
# Usage example
for batch_data, batch_labels in dataloader:
    # train the model
    outputs = model(batch_data)
    loss = criterion(outputs, batch_labels)
    # ...
Model Training and Validation
15. A basic training-loop template
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs):
    train_losses = []
    val_losses = []
    for epoch in range(epochs):
        # training mode
        model.train()
        train_loss = 0.0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # validation mode
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                val_loss += criterion(output, target).item()
        # record the losses
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')
    return train_losses, val_losses
16. Computing evaluation metrics
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    all_predictions = []
    all_targets = []
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
            all_predictions.extend(predicted.cpu().numpy())
            all_targets.extend(target.cpu().numpy())
    accuracy = 100 * correct / total
    print(f'Test Accuracy: {accuracy:.2f}%')
    # Precision, recall, etc. can be computed on top of this
    from sklearn.metrics import classification_report
    print(classification_report(all_targets, all_predictions))
    return accuracy
Advanced Features
17. Training on the GPU
# Select the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# If multiple GPUs are available
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)
# Move the model (and later the data) to the device
model = model.to(device)
# Inside the training loop
for data, target in dataloader:
    data, target = data.to(device), target.to(device)
    # ... training steps
18. Implementing gradient accumulation
def train_with_gradient_accumulation(model, dataloader, optimizer, accumulation_steps=4):
    model.train()
    optimizer.zero_grad()
    for i, (data, target) in enumerate(dataloader):
        data, target = data.to(device), target.to(device)
        output = model(data)
        loss = criterion(output, target)
        loss = loss / accumulation_steps  # scale so the accumulated gradient matches one full batch
        loss.backward()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
    # Handle a trailing group smaller than accumulation_steps
    if len(dataloader) % accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()
Saving and Loading Models
19. The different ways to save and load a model
# Option 1: save the entire model object
torch.save(model, 'model.pth')
loaded_model = torch.load('model.pth')
# Option 2: save only the state dict (recommended)
torch.save(model.state_dict(), 'model_state_dict.pth')
model = SimpleNN(10, 50, 2)
model.load_state_dict(torch.load('model_state_dict.pth'))
# Option 3: save a checkpoint (with extra bookkeeping)
checkpoint = {
    'epoch': 100,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
    'accuracy': accuracy
}
torch.save(checkpoint, 'checkpoint.pth')
# Load the checkpoint
checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
More Interview Questions (Brief Answers)
20. Explain the difference between torch.nn and torch.nn.functional
- torch.nn: class interface, stateful (modules own their parameters)
- torch.nn.functional: function interface, stateless (everything is passed in explicitly); the sketch below contrasts the two
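As a minimal sketch of the contrast (the tensor shapes here are arbitrary): nn.Linear and nn.Dropout carry state, while their functional counterparts take everything as arguments.
import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.randn(4, 8)

# Class interface: the module owns its state (weight/bias as nn.Parameter,
# dropout probability and train/eval handling) and registers it on the parent module
layer = nn.Linear(8, 8)
drop_module = nn.Dropout(p=0.5)

# Function interface: stateless, parameters and flags are passed explicitly
out1 = F.linear(x, layer.weight, layer.bias)  # same math as layer(x)
out2 = F.dropout(x, p=0.5, training=True)     # the training flag must be handled by hand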
21. What does Batch Normalization do?
- Speeds up training convergence
- Reduces sensitivity to weight initialization
- Has a mild regularization effect (see the small example below)
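A small illustration of the normalization itself (feature sizes are arbitrary): in train mode the layer normalizes with batch statistics, in eval mode it uses its running estimates.
import torch
import torch.nn as nn

bn = nn.BatchNorm1d(num_features=16)
x = torch.randn(32, 16) * 5 + 3              # 32 samples, 16 features, shifted and scaled

bn.train()                                   # training mode: use batch statistics
y = bn(x)
print(y.mean(dim=0)[:3], y.std(dim=0)[:3])   # per-feature mean ~0, std ~1

bn.eval()                                    # eval mode: use the running mean/var instead
y_eval = bn(x)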
22. What does Dropout do, and how does it work?
- Prevents overfitting
- During training, neurons are dropped at random
- At test time all neurons are used; PyTorch applies inverted dropout, scaling the surviving activations by 1/(1-p) during training so no rescaling is needed at inference (illustrated below)
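A quick sketch of the train/eval behaviour and the inverted-dropout scaling on a toy tensor:
import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(8)

drop.train()
print(drop(x))   # roughly half the entries are zeroed, survivors scaled to 1/(1-p) = 2.0

drop.eval()
print(drop(x))   # identity: all ones, no scaling at test time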
23. What is a 1x1 convolution used for?
- Reducing or increasing the number of channels
- Mixing information across channels
- Cutting down computation (see the example below)
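For example, a 1x1 convolution that compresses 256 channels down to 64, as in ResNet/Inception bottlenecks; the shapes below are only illustrative.
import torch
import torch.nn as nn

x = torch.randn(8, 256, 14, 14)             # N, C, H, W
reduce = nn.Conv2d(256, 64, kernel_size=1)  # mixes channels, leaves H and W untouched
y = reduce(x)
print(y.shape)                              # torch.Size([8, 64, 14, 14])

# Parameter count: 256 * 64 weights + 64 biases, far cheaper than a 3x3 conv
print(sum(p.numel() for p in reduce.parameters()))  # 16448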
24. Causes of vanishing/exploding gradients and how to address them
Cause: the chained multiplication of gradients through many layers
Remedies (gradient clipping is sketched below):
- Proper weight initialization
- Batch Normalization
- Gradient clipping
- Residual connections
- A suitable activation function
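Of these, gradient clipping is the one that appears directly in the training loop. A minimal self-contained sketch (the tiny model and data here are placeholders):
import torch
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
data, target = torch.randn(4, 10), torch.randint(0, 2, (4,))

optimizer.zero_grad()
loss = criterion(model(data), target)
loss.backward()

# Clip the global L2 norm of all gradients to at most 1.0 before the update
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
# Alternative: clamp each gradient element to [-0.5, 0.5]
# torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)
optimizer.step()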
25. Common weight-initialization methods
# An example layer to initialize
layer = nn.Linear(10, 50)
# Xavier initialization
nn.init.xavier_uniform_(layer.weight)
# He (Kaiming) initialization, suited to ReLU
nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')
# Normal-distribution initialization
nn.init.normal_(layer.weight, mean=0.0, std=0.02)
26. Implementing early stopping
best_loss = float('inf')
patience = 10
trigger_times = 0
for epoch in range(epochs):
    # train and validate...
    current_val_loss = ...
    if current_val_loss < best_loss:
        best_loss = current_val_loss
        trigger_times = 0
        # save the best model so far
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print("Early stopping!")
            break
27. Implementing model ensembling
class ModelEnsemble:
    def __init__(self, models):
        self.models = models

    def predict(self, x):
        predictions = []
        for model in self.models:
            model.eval()
            with torch.no_grad():
                pred = model(x)
            predictions.append(pred)
        # average the predictions
        avg_prediction = torch.stack(predictions).mean(0)
        return avg_prediction
28. Implementing a learning-rate finder
import math

def find_learning_rate(model, train_loader, criterion, min_lr=1e-7, max_lr=1, steps=100):
    optimizer = torch.optim.Adam(model.parameters(), lr=min_lr)
    # exponentially sweep the learning rate from min_lr up to max_lr over `steps` batches
    lr_lambda = lambda x: math.exp(x * math.log(max_lr / min_lr) / steps)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    lrs = []
    losses = []
    for i, (data, target) in enumerate(train_loader):
        if i >= steps:
            break
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        lrs.append(optimizer.param_groups[0]['lr'])
        losses.append(loss.item())
        scheduler.step()
    return lrs, losses
29. Mixed-precision training
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()
for data, target in dataloader:
    optimizer.zero_grad()
    with autocast():
        output = model(data)
        loss = criterion(output, target)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
30-50. Other Important Topics
31. Explain the Self-Attention mechanism in the Transformer
- The roles of Query, Key and Value
- Scaled dot-product attention (sketched below)
- Multi-head attention
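A minimal single-head scaled dot-product attention sketch; shapes and names are illustrative, and real implementations add masking of padded positions, dropout and the multi-head split.
import math
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: (batch, seq_len, d_k)
    d_k = q.size(-1)
    scores = q @ k.transpose(-2, -1) / math.sqrt(d_k)   # (batch, seq, seq)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))
    attn = F.softmax(scores, dim=-1)                     # attention weights
    return attn @ v, attn

x = torch.randn(2, 5, 64)            # self-attention: Q, K, V all come from x
out, weights = scaled_dot_product_attention(x, x, x)
print(out.shape, weights.shape)      # (2, 5, 64), (2, 5, 5)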
32. Computing receptive fields in a CNN
- Compute the receptive field layer by layer (see the helper below)
- Account for the effect of dilated convolutions
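A small helper applying the standard recursion RF_l = RF_(l-1) + (k_eff - 1) * jump_(l-1), where the jump is the product of strides so far; the layer lists are made-up examples.
def receptive_field(layers):
    """layers: list of (kernel_size, stride, dilation) tuples, input to output."""
    rf, jump = 1, 1
    for k, s, d in layers:
        effective_k = d * (k - 1) + 1   # dilation enlarges the effective kernel
        rf += (effective_k - 1) * jump  # grow the RF by what this layer "sees"
        jump *= s                       # distance between adjacent output positions in input pixels
    return rf

# Three stacked 3x3 / stride-1 convs reach a 7x7 receptive field
print(receptive_field([(3, 1, 1), (3, 1, 1), (3, 1, 1)]))  # 7
# A dilated conv (dilation=2) grows it faster without extra parameters
print(receptive_field([(3, 1, 1), (3, 1, 2)]))             # 7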
33. Computing IoU for object detection
def calculate_iou(box1, box2):
    # box format: [x1, y1, x2, y2]
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])
    intersection = max(0, x2 - x1) * max(0, y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - intersection
    return intersection / union if union > 0 else 0
34. What Focal Loss does, and how to implement it
- Addresses class imbalance
- Down-weights easy, well-classified examples (a sketch follows)
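A sketch of binary focal loss built on BCE-with-logits inputs; alpha and gamma follow the commonly quoted defaults and this is an illustration rather than a drop-in library class.
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, alpha=0.25, gamma=2.0):
    """Binary focal loss. logits: raw scores; targets: 0/1 floats of the same shape."""
    bce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p = torch.sigmoid(logits)
    p_t = p * targets + (1 - p) * (1 - targets)            # probability of the true class
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    loss = alpha_t * (1 - p_t) ** gamma * bce              # down-weight easy examples
    return loss.mean()

logits = torch.randn(16)
targets = torch.randint(0, 2, (16,)).float()
print(focal_loss(logits, targets))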
35. Basic approaches to model pruning
- Importance-based pruning
- Structured vs. unstructured pruning (both shown in the sketch below)
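torch.nn.utils.prune covers both cases; a quick sketch on a single linear layer (the sparsity amounts are arbitrary):
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune

layer = nn.Linear(20, 10)

# Unstructured: zero out the 30% of weights with the smallest absolute value
prune.l1_unstructured(layer, name='weight', amount=0.3)
print((layer.weight == 0).float().mean())  # roughly 0.3 sparsity

# Structured: remove 2 whole output rows (dim=0) ranked by L2 norm
prune.ln_structured(layer, name='weight', amount=2, n=2, dim=0)

# Make the pruning permanent (folds the mask into the weight tensor)
prune.remove(layer, 'weight')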
36. The idea behind knowledge distillation
- A teacher model supervises a smaller student model
- Soft labels (softened teacher outputs) carry the extra signal; see the loss sketch below
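A sketch of the standard distillation loss: KL divergence between temperature-softened teacher and student distributions, mixed with ordinary cross-entropy on the hard labels. T and alpha here are typical but arbitrary choices.
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.7):
    # Soft targets: the student matches the teacher's softened distribution
    soft = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        F.softmax(teacher_logits / T, dim=1),
        reduction='batchmean',
    ) * (T * T)                                 # rescale so gradient magnitude stays comparable
    # Hard targets: normal cross-entropy against the ground-truth labels
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1 - alpha) * hard

student_logits = torch.randn(8, 10)
teacher_logits = torch.randn(8, 10)
labels = torch.randint(0, 10, (8,))
print(distillation_loss(student_logits, teacher_logits, labels))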
37. The benefits of model quantization
- Smaller model size
- Faster inference
- Lower power consumption (a dynamic-quantization sketch follows)
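The lowest-effort entry point is post-training dynamic quantization of Linear layers; a sketch using the SimpleNN class defined earlier (newer releases also expose the same function under torch.ao.quantization).
import torch

model_fp32 = SimpleNN(10, 50, 2)
# Dynamic quantization: weights stored as int8, activations quantized on the fly at runtime
model_int8 = torch.quantization.quantize_dynamic(
    model_fp32,
    {torch.nn.Linear},        # which layer types to quantize
    dtype=torch.qint8,
)

x = torch.randn(1, 10)
print(model_int8(x).shape)    # inference works as before; the model is smaller and faster on CPU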
38. What the ONNX format is for
- A standardized model format
- Cross-framework deployment (export sketch below)
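A minimal export sketch; the file name and dynamic axes are illustrative, and SimpleNN is the class defined earlier in this document.
import torch

model = SimpleNN(10, 50, 2)
model.eval()
dummy_input = torch.randn(1, 10)     # an example input fixes the traced graph's shapes

torch.onnx.export(
    model,
    dummy_input,
    "simple_nn.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}},  # allow a variable batch size
)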
39. What TorchScript is for
- Converts a PyTorch model into a serializable graph representation
- Can speed up inference and removes the dependency on the Python class definition (sketch below)
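Both tracing and scripting produce a serializable module; a short sketch, again reusing SimpleNN from earlier.
import torch

model = SimpleNN(10, 50, 2)
model.eval()
example = torch.randn(1, 10)

# Tracing: records the ops executed for this example input (no data-dependent control flow)
traced = torch.jit.trace(model, example)

# Scripting: compiles the Python source, so if/for statements are preserved
scripted = torch.jit.script(model)

traced.save("model_traced.pt")
loaded = torch.jit.load("model_traced.pt")   # loadable without the original Python class
print(loaded(example).shape)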
40. Explain the relationship between CUDA and cuDNN
- CUDA: the general-purpose parallel computing platform
- cuDNN: a library of GPU-accelerated deep-neural-network primitives built on top of CUDA
41. Implementing data parallelism
model = nn.DataParallel(model, device_ids=[0, 1, 2, 3])
42. Basic concepts of distributed training
- Data parallelism vs. model parallelism
- The All-Reduce operation
43. Gradient checkpointing
- Trades compute time for memory
- Reduces activation memory during training (see the sketch below)
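torch.utils.checkpoint recomputes the wrapped segment's activations during the backward pass instead of storing them. A sketch on a small sequential stack; the use_reentrant flag assumes a reasonably recent PyTorch version.
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

block1 = nn.Sequential(nn.Linear(512, 512), nn.ReLU())
block2 = nn.Sequential(nn.Linear(512, 512), nn.ReLU())
head = nn.Linear(512, 10)

x = torch.randn(32, 512, requires_grad=True)

# Activations inside block1/block2 are not kept; they are recomputed in backward
h = checkpoint(block1, x, use_reentrant=False)
h = checkpoint(block2, h, use_reentrant=False)
loss = head(h).sum()
loss.backward()   # slower backward, noticeably less memory for deep models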
44. Advantages of automatic mixed precision
- Lower memory usage
- Faster training
- Accuracy is largely preserved
45. Common ways to deploy a model
- TorchServe
- ONNX Runtime
- TensorRT
- Mobile deployment
46. Explain the common variants of the Attention mechanism
- Self-Attention
- Multi-Head Attention
- Cross-Attention
- Sparse Attention
47. Problems in GAN training and countermeasures
- Mode collapse
- Training instability
- Gradient penalty
- Wasserstein GAN
48. Key points when implementing DQN in reinforcement learning
- Experience replay
- A separate target network
- The ϵ-greedy policy (sketched below)
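Of these, the ϵ-greedy policy fits in a few lines; q_network, state and num_actions are placeholders, with a toy linear "Q-network" for the usage line.
import random
import torch

def select_action(q_network, state, epsilon, num_actions):
    """ϵ-greedy: explore with probability epsilon, otherwise act greedily on Q-values."""
    if random.random() < epsilon:
        return random.randrange(num_actions)        # random exploration
    with torch.no_grad():
        q_values = q_network(state.unsqueeze(0))    # shape (1, num_actions)
        return int(q_values.argmax(dim=1).item())   # greedy exploitation

q_net = torch.nn.Linear(4, 2)
action = select_action(q_net, torch.randn(4), epsilon=0.1, num_actions=2)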
49. Basic concepts of meta-learning
- Learning how to learn
- The MAML algorithm
- Prototypical networks
50. Explain the idea behind contrastive learning
- Positive and negative pairs
- The InfoNCE loss (sketched below)
- Algorithms such as SimCLR and MoCo
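A sketch of an InfoNCE-style loss for a batch of positive pairs with in-batch negatives (SimCLR-flavoured and simplified to one direction; the temperature is a typical value and the random "views" below are only placeholders).
import torch
import torch.nn.functional as F

def info_nce(z1, z2, temperature=0.1):
    """z1[i] and z2[i] are embeddings of two augmented views of the same sample."""
    z1 = F.normalize(z1, dim=1)
    z2 = F.normalize(z2, dim=1)
    logits = z1 @ z2.t() / temperature      # (N, N) cosine similarities
    labels = torch.arange(z1.size(0))       # the positive for row i sits in column i
    return F.cross_entropy(logits, labels)  # all other columns act as negatives

z1 = torch.randn(16, 128)
z2 = z1 + 0.1 * torch.randn(16, 128)        # stand-in for two views of the same images
print(info_nce(z1, z2))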