RL

Core

class catalyst.rl.core.agent.ActorSpec[source]

Bases: abc.ABC, torch.nn.modules.module.Module

abstract forward(state, logprob=None, deterministic=False)[source]
abstract property policy_type
class catalyst.rl.core.agent.CriticSpec[source]

Bases: abc.ABC, torch.nn.modules.module.Module

abstract property distribution
abstract forward(*args, **kwargs)[source]
abstract property num_atoms
abstract property num_outputs
abstract property values_range
class catalyst.rl.core.algorithm.AlgorithmSpec[source]

Bases: abc.ABC

abstract property gamma
abstract property n_step
abstract pack_checkpoint(with_optimizer: bool = True, **kwargs)[source]
abstract classmethod prepare_for_sampler(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict) → Union[catalyst.rl.core.agent.ActorSpec, catalyst.rl.core.agent.CriticSpec][source]
abstract classmethod prepare_for_trainer(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict)[source]
abstract train(batch: Dict, **kwargs)[source]
abstract unpack_checkpoint(checkpoint, with_optimizer: bool = True, **kwargs)[source]
class catalyst.rl.core.db.DBSpec[source]

Bases: abc.ABC

class Message[source]

Bases: enum.Enum

An enumeration.

DISABLE_SAMPLING = 3
DISABLE_TRAINING = 1
ENABLE_SAMPLING = 2
ENABLE_TRAINING = 0
abstract del_checkpoint()[source]
abstract del_trajectory()[source]
abstract property epoch
abstract get_checkpoint()[source]
abstract get_trajectory(index=None)[source]
abstract property num_trajectories
abstract push_message(message: catalyst.rl.core.db.DBSpec.Message)[source]
abstract put_checkpoint(checkpoint, epoch)[source]
abstract put_trajectory(trajectory, raw: bool)[source]
abstract property sampling_enabled
abstract property training_enabled
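
The Message flags above act as a small control protocol between trainers and samplers: a process pushes a flag through push_message, and other processes poll the sampling_enabled / training_enabled properties. A minimal sketch, assuming a concrete DBSpec backend (e.g. the MongoDB or RedisDB classes documented below) has already been constructed; the helper names are hypothetical.

    from catalyst.rl.core.db import DBSpec

    def pause_sampling(db: DBSpec):
        # hypothetical helper: ask all samplers to stop collecting trajectories;
        # backends are expected to reflect this via the sampling_enabled property
        db.push_message(DBSpec.Message.DISABLE_SAMPLING)

    def resume_all(db: DBSpec):
        # hypothetical helper: re-enable trajectory collection and training
        db.push_message(DBSpec.Message.ENABLE_SAMPLING)
        db.push_message(DBSpec.Message.ENABLE_TRAINING)
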
class catalyst.rl.core.environment.EnvironmentSpec(visualize=False, mode='train', sampler_id=None)[source]

Bases: abc.ABC

abstract property action_space
property discrete_actions
property history_len
abstract property observation_space
abstract reset()[source]
property reward_space
abstract property state_space
abstract step(action)[source]
class catalyst.rl.core.exploration.ExplorationStrategy(power=1.0)[source]

Bases: object

Base class for working with various exploration strategies. In the discrete case, a strategy must implement get_action(q_values); in the continuous case, get_action(action). A minimal custom-strategy sketch follows this entry.

set_power(value)[source]
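
A minimal sketch of a custom continuous-case strategy built on this base class; the UniformNoise name and its scale parameter are illustrative, not part of catalyst.

    import numpy as np

    from catalyst.rl.core.exploration import ExplorationStrategy


    class UniformNoise(ExplorationStrategy):
        """Hypothetical continuous-case strategy: perturbs the actor's action."""

        def __init__(self, scale: float = 0.1, power: float = 1.0):
            super().__init__(power=power)
            self.scale = scale

        def get_action(self, action):
            # continuous case: receive the actor's action and return a noisy copy
            noise = np.random.uniform(-self.scale, self.scale, size=np.shape(action))
            return np.asarray(action) + noise


    strategy = UniformNoise(scale=0.05)
    noisy_action = strategy.get_action(np.array([0.2, -0.7]))
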
class catalyst.rl.core.exploration.ExplorationHandler(*exploration_params, env: catalyst.rl.core.environment.EnvironmentSpec)[source]

Bases: object

get_exploration_strategy()[source]
set_power(value)[source]
class catalyst.rl.core.policy_handler.PolicyHandler(env: catalyst.rl.core.environment.EnvironmentSpec, agent: Union[catalyst.rl.core.agent.ActorSpec, catalyst.rl.core.agent.CriticSpec], device)[source]

Bases: object

class catalyst.rl.core.sampler.Sampler(agent: Union[catalyst.rl.core.agent.ActorSpec, catalyst.rl.core.agent.CriticSpec], env: catalyst.rl.core.environment.EnvironmentSpec, db_server: catalyst.rl.core.db.DBSpec = None, exploration_handler: catalyst.rl.core.exploration.ExplorationHandler = None, logdir: str = None, id: int = 0, mode: str = 'infer', deterministic: bool = None, weights_sync_period: int = 1, weights_sync_mode: str = None, sampler_seed: int = 42, trajectory_seeds: List = None, trajectory_limit: int = None, force_store: bool = False, gc_period: int = 10, monitoring_params: Dict = None, **kwargs)[source]

Bases: object

load_checkpoint(*, filepath: str = None, db_server: catalyst.rl.core.db.DBSpec = None)[source]
run()[source]
class catalyst.rl.core.sampler.ValidSampler(agent: Union[catalyst.rl.core.agent.ActorSpec, catalyst.rl.core.agent.CriticSpec], env: catalyst.rl.core.environment.EnvironmentSpec, db_server: catalyst.rl.core.db.DBSpec = None, exploration_handler: catalyst.rl.core.exploration.ExplorationHandler = None, logdir: str = None, id: int = 0, mode: str = 'infer', deterministic: bool = None, weights_sync_period: int = 1, weights_sync_mode: str = None, sampler_seed: int = 42, trajectory_seeds: List = None, trajectory_limit: int = None, force_store: bool = False, gc_period: int = 10, monitoring_params: Dict = None, **kwargs)[source]

Bases: catalyst.rl.core.sampler.Sampler

load_checkpoint(*, filepath: str = None, db_server: catalyst.rl.core.db.DBSpec = None) → bool[source]
static rewards2metric(rewards)[source]
run()[source]
save_checkpoint(logdir: str, checkpoint: Dict, save_n_best: int = 3, main_metric: str = 'raw_reward', minimize_metric: bool = False)[source]
class catalyst.rl.core.trainer.TrainerSpec(algorithm: catalyst.rl.core.algorithm.AlgorithmSpec, env_spec: catalyst.rl.core.environment.EnvironmentSpec, db_server: catalyst.rl.core.db.DBSpec, logdir: str, num_workers: int = 1, batch_size: int = 64, min_num_transitions: int = 10000, online_update_period: int = 1, weights_sync_period: int = 1, save_period: int = 10, gc_period: int = 10, seed: int = 42, epoch_limit: int = None, monitoring_params: Dict = None, **kwargs)[source]

Bases: object

run()[source]
class catalyst.rl.core.trajectory_sampler.TrajectorySampler(env: catalyst.rl.core.environment.EnvironmentSpec, agent: Union[catalyst.rl.core.agent.ActorSpec, catalyst.rl.core.agent.CriticSpec], device, deterministic: bool = False, initial_capacity: int = 1000, sampling_flag: multiprocessing.context.BaseContext.Value = None)[source]

Bases: object

get_state(index=None, history_len=None)[source]
get_trajectory()[source]
reset(exploration_strategy=None)[source]
sample(exploration_strategy=None)[source]

Agent

class catalyst.rl.agent.actor.ActorSpec[source]

Bases: abc.ABC, torch.nn.modules.module.Module

abstract forward(state, logprob=None, deterministic=False)[source]
abstract property policy_type
class catalyst.rl.agent.actor.Actor(state_net: catalyst.rl.agent.network.StateNet, head_net: catalyst.rl.agent.head.PolicyHead)[source]

Bases: catalyst.rl.core.agent.ActorSpec

Actor that learns the agent's policy.

forward(state: torch.Tensor, logprob=False, deterministic=False)[source]
classmethod get_from_params(state_net_params: Dict, policy_head_params: Dict, env_spec: catalyst.rl.core.environment.EnvironmentSpec)[source]
property policy_type
class catalyst.rl.agent.critic.CriticSpec[source]

Bases: abc.ABC, torch.nn.modules.module.Module

abstract property distribution
abstract forward(*args, **kwargs)[source]
abstract property num_atoms
abstract property num_outputs
abstract property values_range
class catalyst.rl.agent.critic.StateCritic(state_net: catalyst.rl.agent.network.StateNet, head_net: catalyst.rl.agent.head.ValueHead)[source]

Bases: catalyst.rl.core.agent.CriticSpec

Critic that learns state value functions, like V(s).

property distribution
forward(state)[source]
classmethod get_from_params(state_net_params: Dict, value_head_params: Dict, env_spec: catalyst.rl.core.environment.EnvironmentSpec)[source]
property hyperbolic_constant
property num_atoms
property num_heads
property num_outputs
property values_range
class catalyst.rl.agent.critic.ActionCritic(state_net: catalyst.rl.agent.network.StateNet, head_net: catalyst.rl.agent.head.ValueHead)[source]

Bases: catalyst.rl.agent.critic.StateCritic

Critic that learns state-action value functions, like Q(s, a), producing values for every action given only the state.

classmethod get_from_params(state_net_params: Dict, value_head_params: Dict, env_spec: catalyst.rl.core.environment.EnvironmentSpec)[source]
class catalyst.rl.agent.critic.StateActionCritic(state_action_net: catalyst.rl.agent.network.StateActionNet, head_net: catalyst.rl.agent.head.ValueHead)[source]

Bases: catalyst.rl.core.agent.CriticSpec

Critic that learns state-action value functions, like Q(s, a).

property distribution
forward(state, action)[source]
classmethod get_from_params(state_action_net_params: Dict, value_head_params: Dict, env_spec: catalyst.rl.core.environment.EnvironmentSpec)[source]
property hyperbolic_constant
property num_atoms
property num_heads
property num_outputs
property values_range
class catalyst.rl.agent.head.ValueHead(in_features: int, out_features: int, bias: bool = True, num_atoms: int = 1, use_state_value_head: bool = False, distribution: str = None, values_range: tuple = None, num_heads: int = 1, hyperbolic_constant: float = 1.0)[source]

Bases: torch.nn.modules.module.Module

forward(state: torch.Tensor)[source]
class catalyst.rl.agent.head.PolicyHead(in_features: int, out_features: int, policy_type: str = None, out_activation: torch.nn.modules.module.Module = None)[source]

Bases: torch.nn.modules.module.Module

forward(state: torch.Tensor, logprob=None, deterministic=False)[source]
class catalyst.rl.agent.network.StateNet(main_net: torch.nn.modules.module.Module, observation_net: torch.nn.modules.module.Module = None, aggregation_net: torch.nn.modules.module.Module = None)[source]

Bases: torch.nn.modules.module.Module

__init__(main_net: torch.nn.modules.module.Module, observation_net: torch.nn.modules.module.Module = None, aggregation_net: torch.nn.modules.module.Module = None)[source]

Abstract network that takes a tensor T of shape [bs; history_len; …] and outputs a representation tensor R of shape [bs; representation_size]; the data flow (sketched in code after this entry) is:

input_T [bs; history_len; in_features]

-> observation_net (aka observation encoder) ->

observations_representations [bs; history_len; obs_features]

-> aggregation_net (flatten in the simplified case) ->

aggregated_representation [bs; hid_features]

-> main_net ->

output_T [bs; representation_size]

Parameters
  • main_net – maps the aggregated representation to the output representation

  • observation_net – observation encoder applied to each step of the history

  • aggregation_net – aggregates the per-step observation representations (flatten in the simplified case)

forward(state)[source]
classmethod get_from_params(state_shape, observation_net_params=None, aggregation_net_params=None, main_net_params=None) → catalyst.rl.agent.network.StateNet[source]
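
The data flow above can be reproduced with plain torch modules. A minimal sketch, assuming a per-step linear encoder and flatten-style aggregation; the shapes and sub-networks are illustrative, not StateNet's actual implementation.

    import torch
    import torch.nn as nn

    bs, history_len, in_features = 4, 3, 8
    obs_features, representation_size = 16, 32
    hid_features = history_len * obs_features

    observation_net = nn.Linear(in_features, obs_features)  # per-step encoder
    aggregation_net = nn.Flatten(start_dim=1)                # simplified case: flatten the history
    main_net = nn.Linear(hid_features, representation_size)

    input_T = torch.randn(bs, history_len, in_features)
    obs_repr = observation_net(input_T)       # [bs; history_len; obs_features]
    aggregated = aggregation_net(obs_repr)    # [bs; hid_features]
    output_T = main_net(aggregated)           # [bs; representation_size]
    assert output_T.shape == (bs, representation_size)
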
class catalyst.rl.agent.network.StateActionNet(main_net: torch.nn.modules.module.Module, observation_net: torch.nn.modules.module.Module = None, action_net: torch.nn.modules.module.Module = None, aggregation_net: torch.nn.modules.module.Module = None)[source]

Bases: torch.nn.modules.module.Module

forward(state, action)[source]
classmethod get_from_params(state_shape, action_shape, observation_net_params=None, action_net_params=None, aggregation_net_params=None, main_net_params=None) → catalyst.rl.agent.network.StateNet[source]
class catalyst.rl.agent.policy.CategoricalPolicy[source]

Bases: torch.nn.modules.module.Module

forward(logits, logprob=None, deterministic=False)[source]
class catalyst.rl.agent.policy.BernoulliPolicy[source]

Bases: torch.nn.modules.module.Module

forward(logits, logprob=None, deterministic=False)[source]
class catalyst.rl.agent.policy.DiagonalGaussPolicy[source]

Bases: torch.nn.modules.module.Module

forward(logits, logprob=None, deterministic=False)[source]
class catalyst.rl.agent.policy.SquashingGaussPolicy(squashing_fn=<class 'torch.nn.modules.activation.Tanh'>)[source]

Bases: torch.nn.modules.module.Module

forward(logits, logprob=None, deterministic=False)[source]
class catalyst.rl.agent.policy.RealNVPPolicy(action_size, layer_fn, activation_fn=<class 'torch.nn.modules.activation.ReLU'>, squashing_fn=<class 'torch.nn.modules.activation.Tanh'>, bias=False)[source]

Bases: torch.nn.modules.module.Module

forward(logits, logprob=None, deterministic=False)[source]

DB

class catalyst.rl.db.mongo.MongoDB(host: str = '127.0.0.1', port: int = 12000, prefix: str = None, sync_epoch: bool = False, reconnect_timeout: int = 3)[source]

Bases: catalyst.rl.core.db.DBSpec

del_checkpoint()[source]
del_trajectory()[source]
property epoch
get_checkpoint()[source]
get_trajectory(index=None)[source]
property num_trajectories
push_message(message: catalyst.rl.core.db.DBSpec.Message)[source]
put_checkpoint(checkpoint, epoch)[source]
put_trajectory(trajectory, raw=False)[source]
property sampling_enabled
property training_enabled
class catalyst.rl.db.redis.RedisDB(host='127.0.0.1', port=12000, prefix=None, sync_epoch=False)[source]

Bases: catalyst.rl.core.db.DBSpec

del_checkpoint()[source]
del_trajectory()[source]
property epoch
get_checkpoint()[source]
get_trajectory(index=None)[source]
property num_trajectories
push_message(message: catalyst.rl.core.db.DBSpec.Message)[source]
put_checkpoint(checkpoint, epoch)[source]
put_trajectory(trajectory, raw=False)[source]
property sampling_enabled
property training_enabled

Environments

class catalyst.rl.environment.gym.GymEnvWrapper(env_name, **params)[source]

Bases: catalyst.rl.environment.environment.EnvironmentWrapper

Exploration

class catalyst.rl.exploration.boltzman.Boltzmann(temp_init, temp_final, annealing_steps, temp_min=0.01)[source]

Bases: catalyst.rl.core.exploration.ExplorationStrategy

For discrete environments only. Selects a soft-maximum action (softmax_a [Q(s, a)/t]). The temperature parameter t usually decreases during the course of training. Importantly, the effective range of t depends on the magnitudes of the environment rewards. A sketch of the rule follows this entry.

get_action(q_values)[source]
set_power(value)[source]
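
A minimal sketch of the soft-maximum rule described above: sample an action from the softmax over Q(s, a)/t. The temperature annealing controlled by temp_init/temp_final/annealing_steps is omitted.

    import numpy as np


    def boltzmann_action(q_values: np.ndarray, t: float) -> int:
        logits = q_values / t
        logits -= logits.max()            # numerical stability
        probs = np.exp(logits)
        probs /= probs.sum()
        return int(np.random.choice(len(q_values), p=probs))


    q = np.array([1.0, 2.0, 0.5])
    print(boltzmann_action(q, t=1.0))     # higher t -> closer to uniform
    print(boltzmann_action(q, t=0.05))    # lower t -> closer to argmax
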
class catalyst.rl.exploration.gauss.NoExploration(power=1.0)[source]

Bases: catalyst.rl.core.exploration.ExplorationStrategy

For continuous environments only. Returns the action produced by the actor network without changes.

get_action(action)[source]
class catalyst.rl.exploration.gauss.GaussNoise(sigma)[source]

Bases: catalyst.rl.core.exploration.ExplorationStrategy

For continuous environments only. Adds spherical Gaussian noise to the action produced by the actor.

get_action(action)[source]
set_power(value)[source]
class catalyst.rl.exploration.gauss.OrnsteinUhlenbeckProcess(sigma, theta, dt=0.01)[source]

Bases: catalyst.rl.core.exploration.ExplorationStrategy

For continuous environments only. Adds temporally correlated Gaussian noise generated with an Ornstein-Uhlenbeck process (a sketch follows this entry). Paper: https://arxiv.org/abs/1509.02971

get_action(action)[source]
reset_state(action_size)[source]
set_power(value)[source]
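
A minimal sketch of temporally correlated noise via a discretized (Euler-Maruyama) Ornstein-Uhlenbeck process with zero mean; the class above keeps this state internally and resets it with reset_state.

    import numpy as np


    class OUNoise:
        def __init__(self, action_size, sigma=0.2, theta=0.15, dt=0.01):
            self.sigma, self.theta, self.dt = sigma, theta, dt
            self.state = np.zeros(action_size)

        def sample(self):
            # x_{t+1} = x_t + theta * (0 - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
            self.state += (
                -self.theta * self.state * self.dt
                + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.state.shape)
            )
            return self.state.copy()


    noise = OUNoise(action_size=2)
    noisy_action = np.array([0.1, -0.3]) + noise.sample()
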
class catalyst.rl.exploration.greedy.Greedy(power=1.0)[source]

Bases: catalyst.rl.core.exploration.ExplorationStrategy

For discrete environments only. Selects greedy action (argmax_a Q(s,a)).

get_action(q_values)[source]
class catalyst.rl.exploration.greedy.EpsilonGreedy(eps_init, eps_final, annealing_steps, eps_min=0.01)[source]

Bases: catalyst.rl.core.exploration.ExplorationStrategy

For discrete environments only. Selects a random action with probability eps and the greedy action (argmax_a Q(s, a)) with probability 1 - eps. The exploration probability eps usually decreases from 1 to 0.01-0.05 during the course of training; a sketch follows this entry.

get_action(q_values)[source]
set_power(value)[source]
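
A minimal sketch of the rule described above with a linear annealing schedule; the exact schedule implemented by the class (eps_init, eps_final, annealing_steps, eps_min) may differ.

    import numpy as np


    def epsilon_greedy(q_values: np.ndarray, eps: float) -> int:
        if np.random.rand() < eps:
            return int(np.random.randint(len(q_values)))  # random with prob. eps
        return int(np.argmax(q_values))                   # greedy with prob. 1 - eps


    eps_init, eps_final, annealing_steps = 1.0, 0.05, 10000
    q = np.array([0.1, 0.7, 0.3])
    for step in range(3):
        frac = min(step / annealing_steps, 1.0)
        eps = eps_init + frac * (eps_final - eps_init)    # linear decay
        action = epsilon_greedy(q, eps)
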
class catalyst.rl.exploration.param_noise.ParameterSpaceNoise(target_sigma, tolerance=0.001, max_steps=1000)[source]

Bases: catalyst.rl.core.exploration.ExplorationStrategy

For continuous environments only. At the beginning of the episode, perturbs the weights of the actor network, forcing it to produce more diverse actions (a sketch of the idea follows this entry). Paper: https://arxiv.org/abs/1706.01905

get_action(action)[source]
set_power(value)[source]
update_actor(actor, states)[source]
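
A minimal sketch of the underlying idea: perturb a copy of the actor's weights with Gaussian noise at the start of an episode. The adaptive tuning of the noise scale toward target_sigma (via tolerance and max_steps) is not reproduced here.

    import copy
    import torch
    import torch.nn as nn

    actor = nn.Linear(4, 2)            # stand-in for an actor network
    sigma = 0.1

    perturbed_actor = copy.deepcopy(actor)
    with torch.no_grad():
        for param in perturbed_actor.parameters():
            param.add_(torch.randn_like(param) * sigma)

    state = torch.randn(1, 4)
    diverse_action = perturbed_actor(state)  # differs from the unperturbed actor's output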

Off-policy

class catalyst.rl.offpolicy.trainer.OffpolicyTrainer(algorithm: catalyst.rl.core.algorithm.AlgorithmSpec, env_spec: catalyst.rl.core.environment.EnvironmentSpec, db_server: catalyst.rl.core.db.DBSpec, logdir: str, num_workers: int = 1, batch_size: int = 64, min_num_transitions: int = 10000, online_update_period: int = 1, weights_sync_period: int = 1, save_period: int = 10, gc_period: int = 10, seed: int = 42, epoch_limit: int = None, monitoring_params: Dict = None, **kwargs)[source]

Bases: catalyst.rl.core.trainer.TrainerSpec

Discrete

class catalyst.rl.offpolicy.algorithms.critic.OffpolicyCritic(critic: catalyst.rl.core.agent.CriticSpec, gamma: float, n_step: int, critic_loss_params: Dict = None, critic_optimizer_params: Dict = None, critic_scheduler_params: Dict = None, critic_grad_clip_params: Dict = None, critic_tau: float = 1.0, **kwargs)[source]

Bases: catalyst.rl.core.algorithm.AlgorithmSpec

critic_update(loss)[source]
property gamma
property n_step
pack_checkpoint(with_optimizer: bool = True)[source]
classmethod prepare_for_sampler(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict) → Union[catalyst.rl.core.agent.ActorSpec, catalyst.rl.core.agent.CriticSpec][source]
classmethod prepare_for_trainer(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict) → catalyst.rl.core.algorithm.AlgorithmSpec[source]
target_critic_update()[source]
train(batch, actor_update=False, critic_update=True)[source]
unpack_checkpoint(checkpoint, with_optimizer: bool = True)[source]
update_step(value_loss, critic_update=True)[source]

Updates parameters of neural networks and returns learning metrics

Parameters
  • value_loss

  • critic_update

Returns:

class catalyst.rl.offpolicy.algorithms.dqn.DQN(critic: catalyst.rl.core.agent.CriticSpec, gamma: float, n_step: int, critic_loss_params: Dict = None, critic_optimizer_params: Dict = None, critic_scheduler_params: Dict = None, critic_grad_clip_params: Dict = None, critic_tau: float = 1.0, **kwargs)[source]

Bases: catalyst.rl.offpolicy.algorithms.critic.OffpolicyCritic

Swiss Army knife DQN algorithm.

update_step(value_loss, critic_update=True)[source]
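
As a rough illustration of the value target this family of critics trains against, here is a hedged sketch of the standard n-step DQN target (gamma and n_step match the constructor arguments above); it is not the class's exact implementation and ignores distributional, atom-based critics.

    import torch


    def dqn_target(rewards, next_q_values, done, gamma: float, n_step: int):
        # rewards: n-step discounted reward sums [bs];
        # next_q_values: target-network outputs [bs, num_actions]
        max_next_q = next_q_values.max(dim=1).values
        return rewards + (gamma ** n_step) * (1.0 - done) * max_next_q


    rewards = torch.tensor([1.0, 0.0])
    next_q = torch.tensor([[0.2, 0.5], [0.1, -0.3]])
    done = torch.tensor([0.0, 1.0])
    target = dqn_target(rewards, next_q, done, gamma=0.99, n_step=3)
    # value_loss would then compare Q(s, a_taken) against target.detach()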

Continuous

class catalyst.rl.offpolicy.algorithms.actor_critic.OffpolicyActorCritic(actor: catalyst.rl.core.agent.ActorSpec, critic: catalyst.rl.core.agent.CriticSpec, gamma: float, n_step: int, actor_loss_params: Dict = None, critic_loss_params: Dict = None, actor_optimizer_params: Dict = None, critic_optimizer_params: Dict = None, actor_scheduler_params: Dict = None, critic_scheduler_params: Dict = None, actor_grad_clip_params: Dict = None, critic_grad_clip_params: Dict = None, actor_tau: float = 1.0, critic_tau: float = 1.0, action_boundaries: tuple = None, **kwargs)[source]

Bases: catalyst.rl.core.algorithm.AlgorithmSpec

actor_update(loss)[source]
critic_update(loss)[source]
property gamma
property n_step
pack_checkpoint(with_optimizer: bool = True)[source]
classmethod prepare_for_sampler(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict) → Union[catalyst.rl.core.agent.ActorSpec, catalyst.rl.core.agent.CriticSpec][source]
classmethod prepare_for_trainer(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict) → catalyst.rl.core.algorithm.AlgorithmSpec[source]
target_actor_update()[source]
target_critic_update()[source]
train(batch, actor_update=True, critic_update=True)[source]
unpack_checkpoint(checkpoint, with_optimizer: bool = True)[source]
update_step(policy_loss, value_loss, actor_update=True, critic_update=True)[source]

Updates parameters of neural networks and returns learning metrics

Parameters
  • policy_loss

  • value_loss

  • actor_update

  • critic_update

Returns:
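
A hedged sketch of how the policy_loss and value_loss fed into update_step are typically formed in a DDPG-style off-policy actor-critic; the networks here are bare linear stand-ins and this is not the class's exact implementation.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    state_dim, action_dim, gamma = 4, 2, 0.99
    actor, critic = nn.Linear(state_dim, action_dim), nn.Linear(state_dim + action_dim, 1)
    target_actor, target_critic = nn.Linear(state_dim, action_dim), nn.Linear(state_dim + action_dim, 1)

    s = torch.randn(8, state_dim)
    a = torch.randn(8, action_dim)
    r = torch.randn(8, 1)
    s_next = torch.randn(8, state_dim)
    done = torch.zeros(8, 1)

    with torch.no_grad():
        q_next = target_critic(torch.cat([s_next, target_actor(s_next)], dim=1))
        q_target = r + gamma * (1.0 - done) * q_next

    value_loss = F.mse_loss(critic(torch.cat([s, a], dim=1)), q_target)
    policy_loss = -critic(torch.cat([s, actor(s)], dim=1)).mean()
    # update_step(policy_loss, value_loss) would then backprop each loss, apply the
    # optimizers, and soft-update the target networks using actor_tau / critic_tau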

class catalyst.rl.offpolicy.algorithms.ddpg.DDPG(actor: catalyst.rl.core.agent.ActorSpec, critic: catalyst.rl.core.agent.CriticSpec, gamma: float, n_step: int, actor_loss_params: Dict = None, critic_loss_params: Dict = None, actor_optimizer_params: Dict = None, critic_optimizer_params: Dict = None, actor_scheduler_params: Dict = None, critic_scheduler_params: Dict = None, actor_grad_clip_params: Dict = None, critic_grad_clip_params: Dict = None, actor_tau: float = 1.0, critic_tau: float = 1.0, action_boundaries: tuple = None, **kwargs)[source]

Bases: catalyst.rl.offpolicy.algorithms.actor_critic.OffpolicyActorCritic

Swiss Army knife DDPG algorithm.

update_step(policy_loss, value_loss, actor_update=True, critic_update=True)[source]
class catalyst.rl.offpolicy.algorithms.sac.SAC(actor: catalyst.rl.core.agent.ActorSpec, critic: catalyst.rl.core.agent.CriticSpec, gamma: float, n_step: int, actor_loss_params: Dict = None, critic_loss_params: Dict = None, actor_optimizer_params: Dict = None, critic_optimizer_params: Dict = None, actor_scheduler_params: Dict = None, critic_scheduler_params: Dict = None, actor_grad_clip_params: Dict = None, critic_grad_clip_params: Dict = None, actor_tau: float = 1.0, critic_tau: float = 1.0, action_boundaries: tuple = None, **kwargs)[source]

Bases: catalyst.rl.offpolicy.algorithms.actor_critic.OffpolicyActorCritic

critic_update(loss)[source]
pack_checkpoint(with_optimizer: bool = True)[source]
classmethod prepare_for_trainer(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict) → catalyst.rl.core.algorithm.AlgorithmSpec[source]
target_actor_update()[source]
target_critic_update()[source]
unpack_checkpoint(checkpoint, with_optimizer: bool = True)[source]
update_step(policy_loss, value_loss, actor_update=True, critic_update=True)[source]
class catalyst.rl.offpolicy.algorithms.td3.TD3(actor: catalyst.rl.core.agent.ActorSpec, critic: catalyst.rl.core.agent.CriticSpec, gamma: float, n_step: int, actor_loss_params: Dict = None, critic_loss_params: Dict = None, actor_optimizer_params: Dict = None, critic_optimizer_params: Dict = None, actor_scheduler_params: Dict = None, critic_scheduler_params: Dict = None, actor_grad_clip_params: Dict = None, critic_grad_clip_params: Dict = None, actor_tau: float = 1.0, critic_tau: float = 1.0, action_boundaries: tuple = None, **kwargs)[source]

Bases: catalyst.rl.offpolicy.algorithms.actor_critic.OffpolicyActorCritic

Swiss Army knife TD3 algorithm.

critic_update(loss)[source]
pack_checkpoint(with_optimizer: bool = True)[source]
classmethod prepare_for_trainer(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict) → catalyst.rl.core.algorithm.AlgorithmSpec[source]
target_critic_update()[source]
unpack_checkpoint(checkpoint, with_optimizer: bool = True)[source]
update_step(policy_loss, value_loss, actor_update=True, critic_update=True)[source]

On-policy

class catalyst.rl.onpolicy.trainer.OnpolicyTrainer(algorithm: catalyst.rl.core.algorithm.AlgorithmSpec, env_spec: catalyst.rl.core.environment.EnvironmentSpec, db_server: catalyst.rl.core.db.DBSpec, logdir: str, num_workers: int = 1, batch_size: int = 64, min_num_transitions: int = 10000, online_update_period: int = 1, weights_sync_period: int = 1, save_period: int = 10, gc_period: int = 10, seed: int = 42, epoch_limit: int = None, monitoring_params: Dict = None, **kwargs)[source]

Bases: catalyst.rl.core.trainer.TrainerSpec

class catalyst.rl.onpolicy.algorithms.actor.OnpolicyActor(actor: catalyst.rl.core.agent.ActorSpec, gamma: float, n_step: int, actor_loss_params: Dict = None, actor_optimizer_params: Dict = None, actor_scheduler_params: Dict = None, actor_grad_clip_params: Dict = None, **kwargs)[source]

Bases: catalyst.rl.core.algorithm.AlgorithmSpec

actor_update(loss)[source]
property gamma
get_rollout(states, actions, rewards, dones)[source]
get_rollout_spec() → Dict[source]
property n_step
pack_checkpoint(with_optimizer: bool = True)[source]
postprocess_buffer(buffers, len)[source]
classmethod prepare_for_sampler(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict) → Union[catalyst.rl.core.agent.ActorSpec, catalyst.rl.core.agent.CriticSpec][source]
classmethod prepare_for_trainer(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict) → catalyst.rl.core.algorithm.AlgorithmSpec[source]
unpack_checkpoint(checkpoint, with_optimizer: bool = True)[source]
class catalyst.rl.onpolicy.algorithms.actor_critic.OnpolicyActorCritic(actor: catalyst.rl.core.agent.ActorSpec, critic: catalyst.rl.core.agent.CriticSpec, gamma: float, n_step: int, actor_loss_params: Dict = None, critic_loss_params: Dict = None, actor_optimizer_params: Dict = None, critic_optimizer_params: Dict = None, actor_scheduler_params: Dict = None, critic_scheduler_params: Dict = None, actor_grad_clip_params: Dict = None, critic_grad_clip_params: Dict = None, **kwargs)[source]

Bases: catalyst.rl.core.algorithm.AlgorithmSpec

actor_update(loss)[source]
critic_update(loss)[source]
property gamma
get_rollout(states, actions, rewards, dones)[source]
get_rollout_spec() → Dict[source]
property n_step
pack_checkpoint(with_optimizer: bool = True)[source]
postprocess_buffer(buffers, len)[source]
classmethod prepare_for_sampler(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict) → Union[catalyst.rl.core.agent.ActorSpec, catalyst.rl.core.agent.CriticSpec][source]
classmethod prepare_for_trainer(env_spec: catalyst.rl.core.environment.EnvironmentSpec, config: Dict) → catalyst.rl.core.algorithm.AlgorithmSpec[source]
unpack_checkpoint(checkpoint, with_optimizer: bool = True)[source]
class catalyst.rl.onpolicy.algorithms.ppo.PPO(actor: catalyst.rl.core.agent.ActorSpec, critic: catalyst.rl.core.agent.CriticSpec, gamma: float, n_step: int, actor_loss_params: Dict = None, critic_loss_params: Dict = None, actor_optimizer_params: Dict = None, critic_optimizer_params: Dict = None, actor_scheduler_params: Dict = None, critic_scheduler_params: Dict = None, actor_grad_clip_params: Dict = None, critic_grad_clip_params: Dict = None, **kwargs)[source]

Bases: catalyst.rl.onpolicy.algorithms.actor_critic.OnpolicyActorCritic

get_rollout(states, actions, rewards, dones)[source]
get_rollout_spec()[source]
postprocess_buffer(buffers, len)[source]
train(batch, **kwargs)[source]
class catalyst.rl.onpolicy.algorithms.reinforce.REINFORCE(actor: catalyst.rl.core.agent.ActorSpec, gamma: float, n_step: int, actor_loss_params: Dict = None, actor_optimizer_params: Dict = None, actor_scheduler_params: Dict = None, actor_grad_clip_params: Dict = None, **kwargs)[source]

Bases: catalyst.rl.onpolicy.algorithms.actor.OnpolicyActor

get_rollout(states, actions, rewards, dones)[source]
get_rollout_spec()[source]
postprocess_buffer(buffers, len)[source]
train(batch, **kwargs)[source]
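
A minimal sketch of the discounted returns that a REINFORCE-style get_rollout computes from per-step rewards and done flags; the actual buffer layout returned by get_rollout_spec may differ.

    import numpy as np


    def discounted_returns(rewards, dones, gamma: float = 0.99):
        returns = np.zeros(len(rewards), dtype=np.float32)
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running * (1.0 - dones[t])
            returns[t] = running
        return returns


    returns = discounted_returns([1.0, 0.0, 1.0], [0, 0, 1])
    # the policy-gradient loss is then -(log pi(a_t | s_t) * returns_t).mean()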