Getting Started with Optimization in Python, Part 1

0. Environment

pip install -U numpy scipy matplotlib cvxpy pulp ortools

1. General Workflow and Stopping Criteria

import numpy as np

def grad_check(g, x, tol=1e-6):
    """梯度范数作为停止条件"""
    return np.linalg.norm(g) < tol

def backtracking(f, g, x, p, alpha=1.0, beta=0.5, c=1e-4):
    """
    Backtracking line search (Armijo): given a search direction p, find a step size alpha
    f: objective function, returns a scalar
    g: gradient at the current point x
    p: search direction (should be a descent direction, i.e. g·p < 0)
    """
    fx = f(x)
    t = alpha
    while f(x + t*p) > fx + c*t*np.dot(g, p):
        t *= beta
    return t

2. Unconstrained Optimization

2.1 Gradient Descent (with backtracking line search)

def gradient_descent(f, grad, x0, maxit=1000, tol=1e-6):
    x = x0.copy().astype(float)
    for k in range(maxit):
        g = grad(x)
        if grad_check(g, x, tol): break
        p = -g
        t = backtracking(f, g, x, p)
        x = x + t*p
    return x

Example: a quadratic function

Q = np.array([[3,1],[1,2]], dtype=float); b = np.array([-1,-2], dtype=float)
f  = lambda x: 0.5*x@Q@x + b@x
grad = lambda x: Q@x + b

x_star = gradient_descent(f, grad, x0=np.zeros(2))
print("GD ->", x_star)

2.2 Newton's Method (damped)

def newton(f, grad, hess, x0, maxit=100, tol=1e-8):
    x = x0.copy().astype(float)
    for k in range(maxit):
        g = grad(x)
        if grad_check(g, x, tol): break
        H = hess(x)
        # Solve (H + δI) p = -g; the small diagonal shift guards against a singular Hessian
        p = np.linalg.solve(H + 1e-8*np.eye(H.shape[0]), -g)
        t = backtracking(f, g, x, p)
        x = x + t*p
    return x

hess = lambda x: Q
print("Newton ->", newton(f, grad, hess, x0=np.zeros(2)))

2.3 L-BFGS (built into SciPy), Example: Rosenbrock

from scipy.optimize import minimize

rosen = lambda x: (1-x[0])**2 + 100*(x[1]-x[0]**2)**2
x0 = np.array([-1.2, 1.0])
res = minimize(rosen, x0, method="L-BFGS-B")
print("L-BFGS-B:", res.x, res.nit, res.fun)

3. Sparse and Nonsmooth: ISTA for L1 Regularization (LASSO)

Objective: $\min_x \frac{1}{2}\|Ax-b\|_2^2 + \lambda\|x\|_1$
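
ISTA alternates a gradient step on the smooth least-squares term with the proximal operator of $t\lambda\|x\|_1$, which has the closed-form soft-thresholding solution implemented below:

$$x^{k+1} = \operatorname{prox}_{t\lambda\|\cdot\|_1}\!\left(x^k - t\,A^\top(Ax^k - b)\right),\qquad \left[\operatorname{prox}_{t\lambda\|\cdot\|_1}(z)\right]_i = \operatorname{sign}(z_i)\,\max(|z_i| - t\lambda,\ 0)$$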

def soft_threshold(z, lmbd):
    return np.sign(z)*np.maximum(np.abs(z)-lmbd, 0.0)

def lasso_ista(A, b, lam=1e-2, step=None, iters=1000):
    # Lipschitz constant of the gradient: L = ||A^T A||_2; step size = 1/L
    if step is None:
        L = np.linalg.norm(A.T@A, 2)
        step = 1.0/(L+1e-12)
    x = np.zeros(A.shape[1])
    for _ in range(iters):
        g = A.T@(A@x - b)
        x = soft_threshold(x - step*g, lam*step)
    return x

# demo
m,n = 200, 400
A = np.random.randn(m,n)
x_true = np.zeros(n); x_true[:10] = np.random.randn(10)
b = A@x_true + 0.05*np.random.randn(m)
x_hat = lasso_ista(A,b,lam=0.05)
print("非零个数(真/估):", (np.abs(x_true)>1e-8).sum(), (np.abs(x_hat)>1e-3).sum())

4. Constrained Optimization

4.1 Projected Gradient (box constraints / simplex)

Box constraint $l \le x \le u$

def proj_box(x, l, u):
    return np.minimum(np.maximum(x, l), u)

def pgd_box(f, grad, x0, l, u, iters=500):
    x = proj_box(x0, l, u)
    for _ in range(iters):
        g = grad(x)
        p = -g
        t = backtracking(f, g, x, p)
        x = proj_box(x + t*p, l, u)
    return x

Simplex constraint $\{x \ge 0,\ \sum_i x_i = 1\}$

def proj_simplex(v):
    # Euclidean projection onto the simplex: sort-then-threshold method
    u = np.sort(v)[::-1]
    css = np.cumsum(u)
    rho = np.where(u > (css - 1)/np.arange(1,len(u)+1))[0][-1]
    theta = (css[rho] - 1)/(rho+1)
    return np.maximum(v - theta, 0)

# Usage: x = proj_simplex(x)
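
A minimal sketch of projected gradient descent on the simplex; the toy objective, fixed step size, and the names pgd_simplex / target below are illustrative, not from the original:

# sketch: projected gradient descent over the probability simplex with a fixed step size
def pgd_simplex(grad, x0, step=0.1, iters=500):
    x = proj_simplex(x0)
    for _ in range(iters):
        x = proj_simplex(x - step*grad(x))
    return x

target = np.array([0.7, 0.2, 0.1, -0.3])
g_s = lambda x: x - target            # gradient of 0.5*||x - target||^2
print("simplex PGD ->", pgd_simplex(g_s, np.ones(4)/4))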

4.2 CVXPY: One Constraint per Line (QP/LP/SOCP)

import cvxpy as cp

# Quadratic program: min 1/2 x^T Q x + c^T x  s.t.  Ax <= b,  l <= x <= u
Q = np.array([[4,1],[1,2]], float)
c = np.array([1.0, 1.0])
A = np.array([[1.0,2.0]])
b = np.array([1.0])
l = np.zeros(2); u = np.ones(2)*10

x = cp.Variable(2)
prob = cp.Problem(cp.Minimize(0.5*cp.quad_form(x,Q)+ c@x),
                  [A@x <= b, x >= l, x <= u])
prob.solve()
print("CVXPY:", x.value, "opt:", prob.value)

5. Optimization in Machine Learning

5.1 Logistic Regression (binary classification) from Scratch

$$\min_w \frac{1}{N}\sum_{i=1}^N \log\left(1+\exp(-y_i w^\top x_i)\right) + \frac{\lambda}{2}\|w\|_2^2$$

This is the L2-regularized form of logistic regression, where:

  • $w$ is the weight vector
  • $x_i$ is the feature vector of the i-th sample
  • $y_i \in \{-1, +1\}$ is the label of the i-th sample
  • $\lambda$ is the regularization coefficient
  • $\|w\|_2^2$ is the squared L2 norm of the weight vector

def logistic_loss_grad(w, X, y, lam=1e-4):
    z = X@w
    yz = y*z
    # sigmoid(-yz) = 1 / (1+exp(yz))
    s = 1.0/(1.0 + np.exp(yz))
    loss = np.mean(np.log1p(np.exp(-yz))) + lam*0.5*np.dot(w,w)
    grad = -(X.T@(y*s))/len(y) + lam*w
    return loss, grad

def fit_logreg_gd(X, y, lam=1e-4, iters=500):
    w = np.zeros(X.shape[1])
    for _ in range(iters):
        _, g = logistic_loss_grad(w, X, y, lam)
        t = 1.0/(1.0 + np.linalg.norm(g))  # simple adaptive step size
        w -= t*g
    return w

# demo (roughly linearly separable synthetic data)
np.random.seed(0)
N, d = 400, 5
X = np.random.randn(N,d)
w_true = np.random.randn(d)
y = np.sign(X@w_true + 0.5*np.random.randn(N))
w_hat = fit_logreg_gd(X, y)
acc = (np.sign(X@w_hat)==y).mean()
print("train acc:", round(acc,3))

In production, you can switch to scipy.optimize.minimize (Newton / conjugate gradient) or sklearn.linear_model.LogisticRegression for more robust convergence.
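
For example, a minimal sketch that hands the same loss/gradient pair to scipy.optimize.minimize (minimize was already imported in Section 2.3; the name res_lr is illustrative):

# with jac=True, minimize expects fun to return the pair (loss, grad)
res_lr = minimize(lambda w: logistic_loss_grad(w, X, y), np.zeros(d), jac=True, method="L-BFGS-B")
print("scipy acc:", round((np.sign(X@res_lr.x)==y).mean(), 3))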

5.2 Mixing constraints and regularization: logistic regression with L1 → use proximal gradient (ISTA), plugging in the soft_threshold above (a minimal sketch follows).
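
A sketch along those lines (the function name, step size, and lam value are illustrative): a gradient step on the smooth logistic loss, then soft_threshold as the prox of lam*||w||_1.

def fit_logreg_l1(X, y, lam=1e-2, step=0.1, iters=1000):
    # proximal gradient (ISTA): smooth gradient step, then soft-thresholding
    w = np.zeros(X.shape[1])
    for _ in range(iters):
        _, g = logistic_loss_grad(w, X, y, lam=0.0)  # smooth part only (no L2 term)
        w = soft_threshold(w - step*g, lam*step)
    return w

w_l1 = fit_logreg_l1(X, y, lam=0.05)
print("L1-logreg nonzeros:", int((np.abs(w_l1) > 1e-6).sum()))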


6. Integer / Combinatorial Optimization (Operations Research)

6.1 Linear / Integer Programming (PuLP)

import pulp as pl

# 0-1 knapsack: max v^T x  s.t.  w^T x <= W,  x ∈ {0,1}
v = [8,4,0,5,3]; w = [3,2,1,4,2]; W = 6
n = len(v)

model = pl.LpProblem("knapsack", pl.LpMaximize)
x = [pl.LpVariable(f"x{i}", lowBound=0, upBound=1, cat="Binary") for i in range(n)]
model += pl.lpDot(v, x)
model += pl.lpDot(w, x) <= W
model.solve(pl.PULP_CBC_CMD(msg=False))

print("obj:", pl.value(model.objective))
print("x:", [int(xi.value()) for xi in x])

6.2 Assignment / Scheduling (OR-Tools)

from ortools.linear_solver import pywraplp
cost = [[9,2,7],[6,4,3],[5,8,1]]  # cost of assigning worker i to task j
n = len(cost)
solver = pywraplp.Solver.CreateSolver('SCIP')
x = [[solver.BoolVar(f"x{i}_{j}") for j in range(n)] for i in range(n)]
# each worker gets exactly one task; each task gets exactly one worker
for i in range(n): solver.Add(sum(x[i][j] for j in range(n)) == 1)
for j in range(n): solver.Add(sum(x[i][j] for i in range(n)) == 1)
solver.Minimize(sum(cost[i][j]*x[i][j] for i in range(n) for j in range(n)))
status = solver.Solve()
if status == pywraplp.Solver.OPTIMAL:
    print("cost:", solver.Objective().Value())

7. Tuning and Diagnostics Checklist (frequently needed in practice)

  • Scaling and normalization: bring features to similar magnitudes; the Hessian is better conditioned and both GD and Newton's method become more stable.
  • Step-size strategy: start with backtracking; once things are stable, try Barzilai–Borwein (a sketch follows after this list) or Adam (nonconvex / deep learning).
  • Stopping criteria: $\|\nabla f(x)\| \le \varepsilon$, relative improvement $|f_{k+1}-f_k|/|f_k|$, feasibility $\|Ax-b\|$.
  • Numerical robustness: add L2 smoothing, use a diagonal perturbation $H+\delta I$, and prefer np.log1p / expit over bare log/exp.
  • KKT check (with constraints): verify complementary slackness, feasibility, and that the Lagrangian gradient is close to 0.
  • Reproducibility: fix random seeds; log hyperparameters and loss curves.
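
A minimal sketch of the Barzilai–Borwein (BB1) step size mentioned in the list above (the helper name bb_step and the fallback value are illustrative):

def bb_step(x, x_prev, g, g_prev):
    # BB1 step: alpha = s^T s / s^T y, with s = x - x_prev and y = g - g_prev
    s, yv = x - x_prev, g - g_prev
    sy = np.dot(s, yv)
    return np.dot(s, s)/sy if sy > 1e-12 else 1.0  # fall back when curvature is tiny or negative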

8. Exercises (learn by doing)

  1. Replace gradient descent with Newton's method in the logistic regression example and plot the objective value over the iterations.
  2. Use CVXPY to write a portfolio optimization: minimum variance subject to a return lower bound and position upper-bound constraints.
  3. Upgrade the LASSO ISTA to FISTA (add momentum) and compare the convergence speed.

Feel free to share real-world cases in the comments.
