kyrie · Published 2025-12-17

Q-learning

Q-learning + ε-greedy

How Q-learning works
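
For reference, tabular Q-learning maintains a table Q(s, a) and, after each transition (s, a, r, s'), applies the standard temporal-difference update below (α is the learning rate, γ the discount factor); when s' is terminal the bootstrap term is dropped, which is exactly what the td_target line in the code computes.

Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]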

Why ε-greedy?

In reinforcement learning, the agent faces two competing choices:

  1. Exploitation: use the knowledge learned so far (the Q-table) and pick the highest-valued action, to collect the largest known reward.

  2. Exploration: try unknown or lower-valued actions, in the hope of discovering a better strategy (even at a temporary cost).

With pure exploitation (pure greedy), the agent can get stuck in a local optimum and never find the best path; with pure exploration, it never gets to use what it has learned.

ε-greedy provides a simple way to balance the two (a small numerical example follows this list):

  • With probability 1 − ε: pick the action with the highest current Q-value (exploitation).

  • With probability ε: pick an action uniformly at random (exploration).
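
As a quick check of what these probabilities mean in practice, the snippet below is a standalone sketch (mirroring the epsilon_greedy function in the code section further down): with ε = 0.1 and 4 actions, the greedy action is actually chosen with probability 1 − ε + ε/4 = 0.925, because the random branch can also land on it.

import numpy as np

rng = np.random.default_rng(0)
eps, nA = 0.1, 4                         # exploration rate, number of actions
q_row = np.array([0.1, 0.5, 0.2, 0.3])   # Q-values of one state; greedy action is index 1

hits, trials = 0, 100_000
for _ in range(trials):
    if rng.random() < eps:
        a = int(rng.integers(nA))        # explore: uniform random action
    else:
        a = int(np.argmax(q_row))        # exploit: best-known action
    hits += (a == 1)

print(hits / trials)                     # ≈ 1 - eps + eps/nA = 0.925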

Task

  • Use gymnasium

  • Implement Q-learning + ε-greedy action selection

  • Show the learning curve and how the agent's path improves over training

Output requirements

  • Learning curve: Episode vs Return

  • Trajectory plots (at least 3 different points in training)

  • ~100-word discussion: the effect of the exploration rate ε on the results

Code

import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

# Epsilon-greedy action selection.
# Args: Q-table Q, current state s, exploration rate eps, number of actions nA, random generator rng.
# With probability eps pick a random action; otherwise pick the action with the largest Q-value.
def epsilon_greedy(Q, s, eps, nA, rng):
    if rng.random() < eps:
        return rng.integers(nA)
    return int(np.argmax(Q[s]))

# Roll out one greedy (argmax-Q) episode with the current Q-table; return the visited states and the total reward.
def rollout_greedy(env, Q, max_steps=200, seed=0):
    rng = np.random.default_rng(seed)
    s, _ = env.reset(seed=int(rng.integers(1_000_000)))
    path = [s]
    total_r = 0.0
    for _ in range(max_steps):
        a = int(np.argmax(Q[s]))
        s2, r, terminated, truncated, _ = env.step(a)
        total_r += r
        path.append(s2)
        s = s2
        if terminated or truncated:
            break
    return path, total_r

# Plot one trajectory on the CliffWalking grid and save the figure.
def draw_trajectory_cliffwalking(env, path, title, save_path):
    H, W = env.unwrapped.shape  # (4, 12)
    # state -> (row, col)
    def rc(state):
        return divmod(int(state), W)

    # Background: mark the cliff cells
    grid = np.zeros((H, W), dtype=float)
    grid[H-1, 1:W-1] = -1  # cliff

    # Mark the visited states on the grid
    for st in path:
        r, c = rc(st)
        grid[r, c] = 0.5
    sr, sc = rc(path[0])
    gr, gc = rc(path[-1])

    plt.figure()
    plt.imshow(grid, interpolation="nearest")
    # Draw the path as a line through cell centers
    xs, ys = [], []
    for st in path:
        r, c = rc(st)
        xs.append(c)
        ys.append(r)
    plt.plot(xs, ys)

    plt.scatter([sc], [sr], marker="o")
    plt.scatter([gc], [gr], marker="x")
    plt.title(title)
    plt.gca().invert_yaxis()
    plt.xticks(range(W))
    plt.yticks(range(H))
    plt.grid(True, linewidth=0.5)
    plt.tight_layout()
    plt.savefig(save_path, dpi=200)
    plt.close()

# Q-learning training loop.
# alpha: learning rate; gamma: discount factor; eps_start/eps_end: initial/final exploration rate; eps_decay: per-episode decay factor.
def train_q_learning(
    env_name="CliffWalking-v1",
    episodes=5000,
    alpha=0.1,
    gamma=0.99,
    eps_start=1.0,
    eps_end=0.05,
    eps_decay=0.999,
    seed=42,
    checkpoints=(50, 500, 3000),
):
    env = gym.make(env_name)
    nS = env.observation_space.n
    nA = env.action_space.n
    Q = np.zeros((nS, nA), dtype=np.float32)

    rng = np.random.default_rng(seed)
    returns = []

    saved_Q = {}  # episode -> Q snapshot

    eps = eps_start
    for ep in range(1, episodes + 1):
        s, _ = env.reset(seed=int(rng.integers(1_000_000)))
        done = False
        G = 0.0

        while not done:
            a = epsilon_greedy(Q, s, eps, nA, rng)
            s2, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated

            # Q-learning update
            td_target = r + (0.0 if done else gamma * np.max(Q[s2]))
            Q[s, a] += alpha * (td_target - Q[s, a])

            s = s2
            G += r

        returns.append(G)

        # epsilon decay
        eps = max(eps_end, eps * eps_decay)

        if ep in checkpoints:
            saved_Q[ep] = Q.copy()

    env.close()
    return Q, np.array(returns, dtype=np.float32), saved_Q

def plot_learning_curve(returns, save_path="learning_curve.png", window=50):
    plt.figure()
    plt.plot(np.arange(1, len(returns) + 1), returns, linewidth=1)
    if len(returns) >= window:
        ma = np.convolve(returns, np.ones(window)/window, mode="valid")
        plt.plot(np.arange(window, len(returns) + 1), ma, linewidth=2)
    plt.xlabel("Episode")
    plt.ylabel("Return")
    plt.title("Episode vs Return (Q-learning + epsilon-greedy)")
    plt.tight_layout()
    plt.savefig(save_path, dpi=200)
    plt.close()

if __name__ == "__main__":
    env_name = "CliffWalking-v1"
    episodes = 5000
    checkpoints = (50, 500, 3000)  # at least 3 different points in training

    Q, returns, saved_Q = train_q_learning(
        env_name=env_name,
        episodes=episodes,
        alpha=0.1,
        gamma=0.99,
        eps_start=1.0,
        eps_end=0.05,
        eps_decay=0.999,
        seed=42,
        checkpoints=checkpoints,
    )

    # 1) Learning curve
    plot_learning_curve(returns, save_path="learning_curve.png", window=50)
    print("Saved: learning_curve.png")

    # 2) Trajectory plots (3 checkpoints)
    env = gym.make(env_name)
    for ep in checkpoints:
        path, G = rollout_greedy(env, saved_Q[ep], seed=ep)
        out = f"traj_ep{ep}.png"
        draw_trajectory_cliffwalking(env, path, f"Greedy Trajectory @ Episode {ep} (Return={G:.1f})", out)
        print("Saved:", out)

    # 3) Final-policy trajectory (optional)
    path, G = rollout_greedy(env, Q, seed=2025)
    draw_trajectory_cliffwalking(env, path, f"Final Greedy Trajectory (Return={G:.1f})", "traj_final.png")
    print("Saved: traj_final.png")

    env.close()

Visualization

(Figures produced by the script: learning_curve.png, traj_ep50.png, traj_ep500.png, traj_ep3000.png, traj_final.png.)

Discussion

The larger the exploration rate ε, the more often the agent tries random actions: early in training it escapes bad policies more easily, but the return curve rises more slowly and is noisier. If ε is too small, learning converges faster but can lock into a suboptimal path too early. Decaying ε over training explores thoroughly at the start and exploits steadily later on, which yields shorter, safer paths and higher returns.
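
To make this effect visible directly, one option is the sketch below: it reuses train_q_learning from the script above (so it must run in the same file or session), obtains fixed-ε runs by setting eps_start = eps_end with eps_decay = 1.0, and overlays the smoothed learning curves. The output filename epsilon_comparison.png is just an example.

import numpy as np
import matplotlib.pyplot as plt

def smooth(x, window=50):
    # Moving average over `window` episodes to reduce noise.
    return np.convolve(x, np.ones(window) / window, mode="valid")

# Three exploration schedules: two fixed values and the decaying schedule used above.
settings = {
    "eps = 0.3 (fixed)": dict(eps_start=0.3, eps_end=0.3, eps_decay=1.0),
    "eps = 0.05 (fixed)": dict(eps_start=0.05, eps_end=0.05, eps_decay=1.0),
    "eps: 1.0 -> 0.05 (decaying)": dict(eps_start=1.0, eps_end=0.05, eps_decay=0.999),
}

plt.figure()
for label, kw in settings.items():
    _, returns, _ = train_q_learning(episodes=2000, seed=0, checkpoints=(), **kw)
    plt.plot(smooth(returns), label=label)
plt.xlabel("Episode")
plt.ylabel("Return (moving average)")
plt.title("Effect of the exploration rate")
plt.legend()
plt.tight_layout()
plt.savefig("epsilon_comparison.png", dpi=200)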

