# 10:18 now i'm at
# https://stable-baselines3.readthedocs.io/en/master/guide/examples.html#id5
# here is my vim buffer

import gym
import numpy as np
import time


class Env(gym.Env):
    """Toy real-time environment driven by the wall clock.

    The state is a 2-vector: ``state[0]`` is a position and ``state[1]`` is
    the ``time.time()`` stamp of the last update.  The goal is a 2-vector of
    the same layout: ``goal[0]`` is the target position and ``goal[1]`` is
    the deadline, set to one second after ``reset()``.

    Each ``step`` moves the position by ``action[0] * elapsed_seconds`` and
    rewards closeness to the target with ``-log(|position - target|)``.
    """

    # spaces: gym.spaces.Box(low, high, shape=None, dtype=np.float32)
    #         gym.spaces.Discrete(n)

    def __init__(self):
        super().__init__()
        self.action_space = gym.spaces.Box(-np.inf, np.inf, (2,))
        # float64 is required here: state[1] holds time.time(), and float32
        # cannot resolve epoch seconds to anywhere near one second, which
        # would make both the elapsed-time delta and the "+ 1" deadline in
        # reset() collapse to rounding noise.
        self.observation_space = gym.spaces.Box(
            -np.inf, np.inf, (2,), dtype=np.float64
        )
        #reward_range = None

    def seed(self, seed=None):
        """Seed both spaces' samplers and return ``seed`` unchanged.

        The observation space gets a distinct integer seed derived from
        ``seed`` so the two samplers stay deterministic without sharing one
        stream.  With ``seed=None`` both spaces are seeded with ``None``.
        """
        self.action_space.seed(seed)
        # A space seed must be an int (or None); a sampled float action
        # vector is not a valid seed, so derive an integer one instead.
        obs_seed = None if seed is None else seed + 1
        self.observation_space.seed(obs_seed)
        return seed

    def reset(self):
        """Sample a fresh state and goal, start the clock, return the state."""
        self.state = self.observation_space.sample()
        self.goal = self.observation_space.sample()
        self.state[1] = time.time()
        # The episode deadline is one second of wall-clock time from now.
        self.goal[1] = self.state[1] + 1
        return self.state

    def step(self, action):
        """Advance the position by ``action[0]`` times the elapsed time.

        Args:
            action: vector from ``action_space`` (a bare scalar is also
                accepted).  Only the first component is used, as the rate of
                change of the position per second; any further components
                are currently ignored.

        Returns:
            ``(state, reward, done, info)`` where ``done`` is true once the
            position equals the target exactly or the deadline has passed.
        """
        now = time.time()
        elapsed = now - self.state[1]
        # Index the action vector: multiplying the whole (2,) vector into the
        # scalar slot state[0] raises "setting an array element with a
        # sequence".
        rate = np.asarray(action, dtype=np.float64).ravel()[0]
        self.state[0] += rate * elapsed
        self.state[1] = now
        distance = self.state[0] - self.goal[0]
        # NOTE(review): this is +inf when distance is exactly 0 -- confirm
        # whether the reward should be clamped before training on it.
        reward = -np.log(np.abs(distance))
        done = bool(distance == 0 or self.state[1] >= self.goal[1])
        return (self.state, reward, done, {})

    def render(self, mode='human'):
        """Print the goal vector followed by the current state vector."""
        print(self.goal, self.state)


import stable_baselines3 as sb3
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed

# there's an issue where the action space is a vector including frownedness,
# but is treated as a scalar in the step function. this would be resolved by
# indexing the action to get a time coefficient out of the vector i guess.
# [review: step() now does this by using action[0].]
# PPO is right there. the docs say to use a vector of parallel environments,
# which means just passing the environment class to a function.
# it's 10:21 and i'm thinking of doing something else.