0. Environment
pip install -U numpy scipy matplotlib cvxpy pulp ortools
1. Common Patterns and Stopping Criteria
import numpy as np

def grad_check(g, x, tol=1e-6):
    """Use the gradient norm as the stopping criterion."""
    return np.linalg.norm(g) < tol

def backtracking(f, g, x, p, alpha=1.0, beta=0.5, c=1e-4):
    """
    Backtracking line search (Armijo): given a search direction p, find a step size alpha.
    f: objective function, returns a scalar
    g: gradient at the current point x
    """
    fx = f(x)
    t = alpha
    while f(x + t*p) > fx + c*t*np.dot(g, p):
        t *= beta
    return t
2. Unconstrained Optimization
2.1 Gradient Descent (with Backtracking)
def gradient_descent(f, grad, x0, maxit=1000, tol=1e-6):
    x = x0.copy().astype(float)
    for k in range(maxit):
        g = grad(x)
        if grad_check(g, x, tol): break
        p = -g
        t = backtracking(f, g, x, p)
        x = x + t*p
    return x
Example: a quadratic function
Q = np.array([[3,1],[1,2]], dtype=float); b = np.array([-1,-2], dtype=float)
f = lambda x: 0.5*x@Q@x + b@x
grad = lambda x: Q@x + b
x_star = gradient_descent(f, grad, x0=np.zeros(2))
print("GD ->", x_star)
2.2 Newton's Method (with Damping)
def newton(f, grad, hess, x0, maxit=100, tol=1e-8):
    x = x0.copy().astype(float)
    for k in range(maxit):
        g = grad(x)
        if grad_check(g, x, tol): break
        H = hess(x)
        # solve H p = -g (small diagonal perturbation for numerical safety)
        p = np.linalg.solve(H + 1e-8*np.eye(H.shape[0]), -g)
        t = backtracking(f, g, x, p)
        x = x + t*p
    return x
hess = lambda x: Q
print("Newton ->", newton(f, grad, hess, x0=np.zeros(2)))
2.3 L-BFGS (Built into SciPy), Example: Rosenbrock
from scipy.optimize import minimize
rosen = lambda x: (1-x[0])**2 + 100*(x[1]-x[0]**2)**2
x0 = np.array([-1.2, 1.0])
res = minimize(rosen, x0, method="L-BFGS-B")
print("L-BFGS-B:", res.x, res.nit, res.fun)
3. Sparsity and Nonsmoothness: ISTA for L1 Regularization (LASSO)
Objective: $\min_x \frac{1}{2}\|Ax - b\|_2^2 + \lambda\|x\|_1$
def soft_threshold(z, lmbd):
    return np.sign(z)*np.maximum(np.abs(z)-lmbd, 0.0)

def lasso_ista(A, b, lam=1e-2, step=None, iters=1000):
    # Lipschitz constant L = ||A^T A||_2, step size = 1/L
    if step is None:
        L = np.linalg.norm(A.T@A, 2)
        step = 1.0/(L+1e-12)
    x = np.zeros(A.shape[1])
    for _ in range(iters):
        g = A.T@(A@x - b)
        x = soft_threshold(x - step*g, lam*step)
    return x
# demo
m,n = 200, 400
A = np.random.randn(m,n)
x_true = np.zeros(n); x_true[:10] = np.random.randn(10)
b = A@x_true + 0.05*np.random.randn(m)
x_hat = lasso_ista(A,b,lam=0.05)
print("非零个数(真/估):", (np.abs(x_true)>1e-8).sum(), (np.abs(x_hat)>1e-3).sum())
4. Constrained Optimization
4.1 Projected Gradient (Box Constraints / Simplex)
Box constraints $l \le x \le u$:
def proj_box(x, l, u):
    return np.minimum(np.maximum(x, l), u)

def pgd_box(f, grad, x0, l, u, iters=500):
    x = proj_box(x0, l, u)
    for _ in range(iters):
        g = grad(x)
        p = -g
        t = backtracking(f, g, x, p)
        x = proj_box(x + t*p, l, u)
    return x
Simplex constraint $\{x \ge 0,\ \sum_i x_i = 1\}$:
def proj_simplex(v):
    # Euclidean projection onto the simplex: sort-and-threshold method
    u = np.sort(v)[::-1]
    css = np.cumsum(u)
    rho = np.where(u > (css - 1)/np.arange(1, len(u)+1))[0][-1]
    theta = (css[rho] - 1)/(rho + 1)
    return np.maximum(v - theta, 0)
# usage: x = proj_simplex(x)
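A minimal usage sketch (the small quadratic Q_s, c_s, the fixed step 0.1, and the 100 iterations are illustrative assumptions, not from the original): take a gradient step, then project back onto the probability simplex.
# illustrative quadratic: min 0.5*x^T Q_s x + c_s^T x over the probability simplex
Q_s = np.array([[3.0, 1.0], [1.0, 2.0]]); c_s = np.array([-1.0, -2.0])
x = proj_simplex(np.ones(2)/2)          # feasible starting point
for _ in range(100):
    g = Q_s@x + c_s                     # gradient of the quadratic
    x = proj_simplex(x - 0.1*g)         # gradient step, then project back onto the simplex
print("simplex PGD ->", x, "sum:", x.sum())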
4.2 CVXPY: One Constraint per Line (QP/LP/SOCP)
import cvxpy as cp
# Quadratic program: min 1/2 x^T Q x + c^T x  s.t.  Ax <= b, l <= x <= u
Q = np.array([[4,1],[1,2]], float)
c = np.array([1.0, 1.0])
A = np.array([[1.0,2.0]])
b = np.array([1.0])
l = np.zeros(2); u = np.ones(2)*10
x = cp.Variable(2)
prob = cp.Problem(cp.Minimize(0.5*cp.quad_form(x, Q) + c@x),
                  [A@x <= b, x >= l, x <= u])
prob.solve()
print("CVXPY:", x.value, "opt:", prob.value)
5. Optimization in Machine Learning
5.1 Logistic Regression (Binary Classification) from Scratch
$$\min_w \frac{1}{N}\sum_{i=1}^N \log\left(1+\exp(-y_i w^\top x_i)\right) + \frac{\lambda}{2}\|w\|_2^2$$
This is the L2-regularized form of logistic regression, where:
- $w$ is the weight vector
- $x_i$ is the feature vector of the $i$-th sample
- $y_i \in \{-1, +1\}$ is the label of the $i$-th sample
- $\lambda$ is the regularization coefficient
- $\|w\|_2^2$ is the squared L2 norm of the weight vector
def logistic_loss_grad(w, X, y, lam=1e-4):
    z = X@w
    yz = y*z
    # sigmoid(-yz) = 1 / (1+exp(yz))
    s = 1.0/(1.0 + np.exp(yz))
    loss = np.mean(np.log1p(np.exp(-yz))) + lam*0.5*np.dot(w, w)
    grad = -(X.T@(y*s))/len(y) + lam*w
    return loss, grad
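For large |yz| the bare np.exp calls above can overflow (the checklist in section 7 recommends log1p/expit for exactly this reason). Below is a sketch of a more robust variant using scipy.special.expit and np.logaddexp; logistic_loss_grad_stable is a hypothetical name, not from the original.
from scipy.special import expit

def logistic_loss_grad_stable(w, X, y, lam=1e-4):
    # hypothetical stable variant of logistic_loss_grad: same objective,
    # but log(1+exp(-yz)) and sigmoid(-yz) are computed without overflow
    yz = y*(X@w)
    loss = np.mean(np.logaddexp(0.0, -yz)) + lam*0.5*np.dot(w, w)
    s = expit(-yz)                      # = 1/(1+exp(yz))
    grad = -(X.T@(y*s))/len(y) + lam*w
    return loss, grad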
def fit_logreg_gd(X, y, lam=1e-4, iters=500):
    w = np.zeros(X.shape[1])
    for _ in range(iters):
        loss, g = logistic_loss_grad(w, X, y, lam)
        t = 1.0/(1.0 + np.linalg.norm(g))  # simple adaptive step size
        w -= t*g
    return w
# demo (linearly separable synthetic data)
np.random.seed(0)
N, d = 400, 5
X = np.random.randn(N,d)
w_true = np.random.randn(d)
y = np.sign(X@w_true + 0.5*np.random.randn(N))
w_hat = fit_logreg_gd(X, y)
acc = (np.sign(X@w_hat)==y).mean()
print("train acc:", round(acc,3))
In production you can switch to scipy.optimize.minimize (Newton / conjugate gradient) or sklearn.linear_model.LogisticRegression for more stable convergence.
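A minimal sketch of the scipy.optimize.minimize route, reusing logistic_loss_grad and the demo data X, y above (jac=True means the objective returns both loss and gradient; the choice of Newton-CG and lam=1e-4 here are illustrative assumptions):
from scipy.optimize import minimize

res = minimize(lambda w: logistic_loss_grad(w, X, y, 1e-4),
               np.zeros(X.shape[1]), jac=True, method="Newton-CG")
print("scipy acc:", round((np.sign(X@res.x) == y).mean(), 3))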
5.2 Mixing Constraints and Regularization: Logistic Regression with L1
Use proximal gradient (ISTA): plug the soft_threshold above into the logistic-loss gradient step, as sketched below.
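A minimal sketch of that recipe, assuming soft_threshold from section 3 and logistic_loss_grad with the data X, y above are in scope (fit_logreg_l1 is a hypothetical name; the L1 weight, fixed step, and iteration count are illustrative assumptions):
def fit_logreg_l1(X, y, lam_l1=1e-2, step=0.1, iters=1000):
    # proximal gradient (ISTA): gradient step on the smooth logistic loss,
    # then soft-thresholding as the proximal operator of the L1 term
    w = np.zeros(X.shape[1])
    for _ in range(iters):
        _, g = logistic_loss_grad(w, X, y, lam=0.0)   # smooth part only (drop the L2 term)
        w = soft_threshold(w - step*g, lam_l1*step)
    return w

w_l1 = fit_logreg_l1(X, y)
print("nonzero weights:", int((np.abs(w_l1) > 1e-8).sum()))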
6. Integer / Combinatorial Optimization (Operations Research)
6.1 Linear / Integer Programming (PuLP)
import pulp as pl
# 0-1 knapsack: max v^T x  s.t.  w^T x <= W, x ∈ {0,1}
v = [8,4,0,5,3]; w = [3,2,1,4,2]; W = 6
n = len(v)
model = pl.LpProblem("knapsack", pl.LpMaximize)
x = [pl.LpVariable(f"x{i}", lowBound=0, upBound=1, cat="Binary") for i in range(n)]
model += pl.lpDot(v, x)
model += pl.lpDot(w, x) <= W
model.solve(pl.PULP_CBC_CMD(msg=False))
print("obj:", pl.value(model.objective))
print("x:", [int(xi.value()) for xi in x])
6.2 Assignment / Scheduling (OR-Tools)
from ortools.linear_solver import pywraplp
cost = [[9,2,7],[6,4,3],[5,8,1]]  # cost of assigning worker i to task j
n = len(cost)
solver = pywraplp.Solver.CreateSolver('SCIP')
x = [[solver.BoolVar(f"x{i}_{j}") for j in range(n)] for i in range(n)]
# each worker gets exactly 1 task, each task gets exactly 1 worker
for i in range(n): solver.Add(sum(x[i][j] for j in range(n)) == 1)
for j in range(n): solver.Add(sum(x[i][j] for i in range(n)) == 1)
solver.Minimize(sum(cost[i][j]*x[i][j] for i in range(n) for j in range(n)))
status = solver.Solve()
if status == pywraplp.Solver.OPTIMAL:
    print("cost:", solver.Objective().Value())
    print("assignment:", [(i, j) for i in range(n) for j in range(n) if x[i][j].solution_value() > 0.5])
7. Tuning and Diagnostics Checklist (Frequent in Practice)
- Scaling and standardization: bring features to comparable magnitudes; the Hessian is better conditioned and both GD and Newton's method become more stable.
- Step-size strategy: start with backtracking; once things are stable, try Barzilai–Borwein (see the sketch after this list) or Adam (non-convex / deep learning).
- Stopping criteria: $\|\nabla f(x)\| \le \varepsilon$, relative improvement $|f_{k+1}-f_k|/|f_k|$, feasibility $\|Ax-b\|$.
- Numerical robustness: add L2 smoothing or a diagonal perturbation $H+\delta I$; use np.log1p / expit instead of bare log/exp.
- KKT check (with constraints): verify complementary slackness, feasibility, and that the gradient of the Lagrangian is close to 0.
- Reproducibility: fix random seeds; log hyperparameters and loss curves.
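For the Barzilai–Borwein step mentioned in the checklist above, here is a minimal sketch on the quadratic from section 2.1 (gd_bb, Q2, b2, the initial step t0, and the iteration cap are illustrative assumptions):
def gd_bb(grad, x0, iters=200, t0=1e-2):
    # gradient descent with the Barzilai-Borwein step t = s^T s / (s^T y),
    # where s = x_k - x_{k-1} and y = g_k - g_{k-1}
    x = x0.astype(float)
    g = grad(x)
    x_prev = g_prev = None
    for _ in range(iters):
        if x_prev is None:
            t = t0                                  # illustrative first step
        else:
            s, dg = x - x_prev, g - g_prev
            t = (s @ s) / (s @ dg + 1e-12)
        x_prev, g_prev = x, g
        x = x - t*g
        g = grad(x)
    return x

Q2 = np.array([[3.0, 1.0], [1.0, 2.0]]); b2 = np.array([-1.0, -2.0])
print("BB ->", gd_bb(lambda z: Q2@z + b2, np.zeros(2)))   # same quadratic as section 2.1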
8. Exercises (Learn by Doing)
- Replace gradient descent with Newton's method in the logistic regression example and plot the objective value over iterations.
- Use CVXPY to write a portfolio optimization: minimum variance with a lower bound on return and upper bounds on positions.
- Upgrade the LASSO ISTA to FISTA (add momentum) and compare the convergence speed.
Feel free to share real-world examples in the comments.