Michael Pilosov
3 years ago
8 changed files with 376 additions and 0 deletions
@@ -0,0 +1,86 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "71383e8d-63f1-462c-bd77-688d8d34a60a",
   "metadata": {},
   "source": [
    "# Demonstration of `gym`: Visualize Interactive Results in Jupyter Notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eae51654-4ccf-44ed-aaac-f1d993d7e4a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from pyvirtualdisplay import Display\n",
    "display = Display(visible=0, size=(1400, 900))\n",
    "display.start()\n",
    "\n",
    "is_ipython = 'inline' in plt.get_backend()\n",
    "if is_ipython:\n",
    "    from IPython import display  # rebinds 'display' to IPython's display module\n",
    "\n",
    "plt.ion()\n",
    "\n",
    "# Load the gym environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be872e01-e4fd-4940-874e-d46e97fb3519",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gym\n",
    "import random\n",
    "%matplotlib inline\n",
    "\n",
    "env = gym.make('LunarLander-v2')\n",
    "env.seed(23)\n",
    "\n",
    "# Let's watch how an untrained agent moves around\n",
    "\n",
    "state = env.reset()\n",
    "img = plt.imshow(env.render(mode='rgb_array'))\n",
    "for j in range(200):\n",
    "    # action = agent.act(state)\n",
    "    action = random.choice(range(4))  # LunarLander-v2 has 4 discrete actions\n",
    "    img.set_data(env.render(mode='rgb_array'))\n",
    "    plt.axis('off')\n",
    "    display.display(plt.gcf())\n",
    "    display.clear_output(wait=True)\n",
    "    state, reward, done, _ = env.step(action)\n",
    "    if done:\n",
    "        break\n",
    "\n",
    "env.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -0,0 +1,74 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2e848ca9-c915-4aa2-a7cc-a5654ed06863",
   "metadata": {},
   "source": [
    "# Demonstration of Training and Testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9506e99-a947-4f69-8355-a3ce696793fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "from main import train, test\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de500d9e-40d1-4b6b-900f-96c2ec69e464",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pickle.load(open(\"data.pkl\", \"rb\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39ca7791-c844-4231-9f3b-e8ae80fe8103",
   "metadata": {},
   "outputs": [],
   "source": [
    "mud_point = train(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d8c70ab-d055-418c-b67e-ba5109d989f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "test(mud_point)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -0,0 +1,72 @@
# PREFACE

This is a direct migration (stripping the `git` history) of [mud-games](https://git.mlden.com/mm/mud-games) (as of commit `1a2259827f`), which shows an actual research-oriented experiment involving a novel method of "training" (the `mud` approach) and of "testing" (visually).

The intent was to explore a utility library named [`gym`](https://github.com/openai/gym), which provides a consistent interface for training reinforcement-learning algorithms, and to try to "learn to win" one of its most basic games (`CartPole-v1`).

Takeaways from this example:

- much friendlier for reproducibility
- runs on the desktop AND in a notebook (handling visual output is tricky; leverage the patterns here if you need to move interactive outputs into the cloud)
- the functions defined in `main.py` are "clean" but still not "clear"
- notice the lack of documentation: where would it be helpful to have it?
- the data is not only supplied (though perhaps it is not good practice to commit it), but a method to generate it is also provided (it takes some time to run)
- notice the comprehensive `README` below
# mud-games

control systems with MUD points

# installation

```bash
pip install -r requirements.txt
```

# usage

A `data.pkl` file is provided for your convenience with input / output samples.

```bash
python main.py
```

You can also use the included [jupyter notebook](./DemoMUD.ipynb) instead.
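For orientation, running `python main.py` simply chains the two steps that the included notebook walks through cell by cell; a minimal sketch, mirroring the `__main__` block of `main.py`:

```python
import pickle

from main import train, test

# load the committed input/output samples, estimate the MUD point, replay it
data = pickle.load(open("data.pkl", "rb"))
mud_point = train(data)
test(mud_point)
```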
# info

The inputs are the parameters of a `1x4` matrix which is multiplied against the observations of the state in order to make a decision for the next action (push left or right). The output of the vector inner product is binarized by comparing it to zero as a threshold value.
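Concretely, the controller is just a sign test on a weighted sum of the state; this is the rule as it appears in `main.py` and `sample.py` (the numeric values below are the default `decision` from `main.py`, and the observation values are illustrative only):

```python
import numpy as np

# lam: a sampled 1x4 parameter vector; observation: the 4-dimensional
# CartPole state (cart position, cart velocity, pole angle, pole angular velocity)
lam = np.array([-0.09, -0.71, -0.43, -0.74])
observation = np.array([0.01, -0.02, 0.03, 0.01])  # illustrative values

# inner product binarized at zero: 1 pushes right, 0 pushes left
action = 1 if lam.T @ observation < 0 else 0
```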
The parameter space is standard normal.
There is no assumed error in observations; the "data variance" is designed to reflect the acceptable [ranges for the observations](https://www.gymlibrary.ml/pages/environments/classic_control/cart_pole):

- The cart x-position (index 0) can take values between (-4.8, 4.8), but the episode terminates if the cart leaves the (-2.4, 2.4) range.
- The pole angle can be observed between (-.418, .418) radians (or ±24°), but the episode terminates if the pole angle is not in the range (-.2095, .2095) (or ±12°).

Therefore, since our objective is to stabilize the cart, the target "time series signal" is zero for all four dimensions of the observation space. The presumed "data variance" should actually correspond to the acceptable bands of the signal (WIP).
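In `main.py`, these bands show up as a fixed per-dimension scale vector applied to a time-aggregated signal; a sketch of that quantity-of-interest map, with the `sd` values taken from `train`:

```python
import numpy as np

# per-dimension scales standing in for the acceptable signal bands (from main.py)
sd = np.array([1.0, 0.5, 0.2, 0.5])

def qoi(obs):
    """Aggregate a trajectory (list of 4-vectors) into one standardized value per dimension."""
    o = np.asarray(obs)
    return np.sum(o, axis=0) / sd / np.sqrt(len(o))
```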
# generate data

You can generate your own data with:

```bash
python sample.py
```

Note: if you change the presumed sample space in `sample.py`, you should make the corresponding changes to the initial distribution in `main.py`.
# improvements

Using the following presumption, we can establish better values for the "data variance":

> The angular momentum of the pole is the most important thing to stabilize.

# headless mode / notebook demos

Run `./headless.sh` (requires `sudo`) to install virtual displays so you can use the included Jupyter notebooks.
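The virtual-display pattern the notebooks (and `main.py`) rely on looks like the following sketch; it degrades gracefully when `pyvirtualdisplay` is absent:

```python
import matplotlib.pyplot as plt

try:
    from pyvirtualdisplay import Display  # installed by headless.sh
    virtual_display = Display(visible=0, size=(1400, 900))
    virtual_display.start()
except ImportError:
    pass  # on a desktop with a real display, rendering works without it

# inside Jupyter, frames are pushed through IPython's display machinery
is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display
```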
Binary file not shown.
@@ -0,0 +1,3 @@
#!/bin/sh
sudo apt update && sudo apt install build-essential xvfb swig
pip install box2d-py pyvirtualdisplay
@@ -0,0 +1,71 @@
#!/usr/bin/env python
import pickle

import gym
import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde as gkde
from scipy.stats import norm

import matplotlib.pyplot as plt

try:
    # headless environments (see headless.sh) render to a virtual display
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(1400, 900))
    display.start()
except ImportError:
    pass

is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display  # rebinds 'display' to IPython's display module

plt.ion()


def train(data):
    # estimate the MUD (maximal updated density) point from sampled trajectories
    D = pd.DataFrame(data)
    # aggregate each trajectory into one standardized QoI value per state dimension
    sd = np.array([1.0, 0.5, 0.2, 0.5])
    D["qoi"] = D["obs"].apply(lambda o: np.sum(o, axis=0) / sd / np.sqrt(len(o)))
    D["i"] = D["lam"].apply(lambda l: norm.pdf(l).prod())  # initial density at each sample
    D["o"] = D["qoi"].apply(lambda q: norm.pdf(q).prod())  # observed density of the QoI
    # predicted density: per-dimension KDE of the pushforward of the initial samples
    Q = np.array(D["qoi"].to_list()).reshape(-1, 4)
    K = [gkde(Q[:, i]) for i in range(4)]
    D["p"] = D["qoi"].apply(lambda q: np.prod([K[i].pdf(q[i]) for i in range(4)]))
    # updated density ratio; the MUD point is the sample that maximizes it
    D["u"] = D["i"] * D["o"] / D["p"]
    mud_point_idx = D["u"].argmax()
    mud_point = D["lam"].iloc[mud_point_idx]
    print(f"MUD Point {mud_point_idx}: {mud_point}")
    return mud_point


def test(decision=np.array([-0.09, -0.71, -0.43, -0.74]), seed=1992):
    env = gym.make("CartPole-v1")
    observation, info = env.reset(seed=seed, return_info=True)
    score = 0
    if is_ipython:
        img = plt.imshow(env.render(mode='rgb_array'))
    for i in range(10000):
        # binarized inner product decides the action: push left (0) or right (1)
        action = 1 if decision.T @ observation < 0 else 0
        observation, reward, done, info = env.step(action)
        score += reward
        if not is_ipython:
            env.render()
        else:
            img.set_data(env.render(mode='rgb_array'))
            plt.axis('off')
            display.display(plt.gcf())
            display.clear_output(wait=True)
        if done:
            if score == 500:  # CartPole-v1 caps episodes at 500 steps
                print("WIN")
            else:
                print(f"LOSE: {int(score)}")
            score = 0  # reset score
            observation, info = env.reset(return_info=True)
    env.close()


if __name__ == "__main__":
    data = pickle.load(open("data.pkl", "rb"))
    mud_point = train(data)
    test(mud_point)
@@ -0,0 +1,5 @@
scipy
numpy
gym[classic_control]
matplotlib
pandas
@@ -0,0 +1,65 @@
import pickle

import gym
import numpy as np
from matplotlib import pyplot as plt

# numpy precision for printing
np.set_printoptions(precision=3, suppress=True)

plt.ion()  # interactive plotting
fig, ax = plt.subplots()
colors = ["xkcd:orange", "xkcd:forest green", "xkcd:gray", "xkcd:light blue"]
plots = [None] * 4

env = gym.make("CartPole-v1")
observation, info = env.reset(seed=42, return_info=True)

max_steps = 100
num_samples = 500
samples = np.random.randn(num_samples, 4)  # standard-normal parameter samples

data = []
for lam in samples:
    breakpoints = []
    score = 0
    obs = []
    for n in range(max_steps):
        ax.cla()
        # action = env.action_space.sample()
        action = 1 if lam.T @ observation < 0 else 0
        # action = 1 if observation[0] - observation[3] < 0 else 0
        observation, reward, done, info = env.step(action)
        score += reward
        obs.append(observation.tolist())
        o = np.array(obs)
        var = np.var(o[-int(score):, :], axis=0)  # variance over the current episode
        for q in range(4):
            # pad each observed dimension with zeros out to the step budget
            lines = np.hstack([o[:, q], np.zeros(max_steps - n)])
            ax.plot(range(max_steps + 1), lines, c=colors[q])

        ax.set_title(f"Reward: {int(score)}, Variance: {var}")
        ax.set_ylim([-3, 3])

        if done or n == max_steps - 1:  # episode over or step budget reached
            breakpoints.append(n)
            observation, info = env.reset(return_info=True)
            # print(score, observation)
            score = 0  # reset score

        # draw break-point lines when game is lost
        for b in breakpoints:
            ax.vlines(
                b, np.min(o, axis=0).min(), np.max(o, axis=0).max(), color="black", lw=2
            )

        fig.canvas.draw()
        fig.show()
        fig.canvas.flush_events()
        env.render()

    data.append({"lam": lam, "obs": obs, "break": breakpoints})
    pickle.dump(data, open("data.pkl", "wb"))  # dump data frequently

stop = input("Press any key to close.")
plt.close()
env.close()