Getting Started
===============

In this document, we provide some toy examples for getting started. All
the examples in this document, and many more, are available in the
``examples/`` folder of the repository.

Playing with Random Agents
--------------------------

We have set up a random agent that can play randomly in each
environment. An example of applying a random agent to Blackjack is as
follows:

.. code:: python

    import rlcard
    from rlcard.agents import RandomAgent
    from rlcard.utils import set_global_seed

    # Make environment
    env = rlcard.make('blackjack', config={'seed': 0})
    episode_num = 2

    # Set a global seed
    set_global_seed(0)

    # Set up agents
    agent_0 = RandomAgent(action_num=env.action_num)
    env.set_agents([agent_0])

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=False)

        # Print out the trajectories
        print('\nEpisode {}'.format(episode))
        for ts in trajectories[0]:
            print('State: {}, Action: {}, Reward: {}, Next State: {}, Done: {}'.format(ts[0], ts[1], ts[2], ts[3], ts[4]))

The expected output should look like the following:

::

    Episode 0
    State: {'obs': array([20, 3]), 'legal_actions': [0, 1]}, Action: 0, Reward: 0, Next State: {'obs': array([15, 3]), 'legal_actions': [0, 1]}, Done: False
    State: {'obs': array([15, 3]), 'legal_actions': [0, 1]}, Action: 1, Reward: -1, Next State: {'obs': array([15, 20]), 'legal_actions': [0, 1]}, Done: True

    Episode 1
    State: {'obs': array([15, 5]), 'legal_actions': [0, 1]}, Action: 1, Reward: 1, Next State: {'obs': array([15, 23]), 'legal_actions': [0, 1]}, Done: True

Note that the states and actions are wrapped by ``env`` in Blackjack.
In this example, ``[20, 3]`` means that the current player has a score
of 20 while the dealer's face-up card has a score of 3. Action 0 means
"hit" while action 1 means "stand". Reward 1 means the player wins,
reward -1 means the dealer wins, and reward 0 means a tie. The above
data can be directly fed into an RL algorithm for training.
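For instance, the integer actions can be decoded back into the moves
they stand for. Below is a minimal sketch that walks over the generated
transitions and prints them in human-readable form; the
``action_names`` dictionary is our own illustration based on the
encoding above, not part of the toolkit:

.. code:: python

    # Hypothetical helper: map Blackjack action ids to names
    action_names = {0: 'hit', 1: 'stand'}

    trajectories, _ = env.run(is_training=False)
    for state, action, reward, next_state, done in trajectories[0]:
        # obs[0] is the player's score, obs[1] is the dealer's face-up score
        print('Player {} vs dealer {} -> {}'.format(
            state['obs'][0], state['obs'][1], action_names[action]))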
Deep-Q Learning on Blackjack
----------------------------

The second example uses Deep-Q learning to train an agent on Blackjack.
We use this example to show how reinforcement learning algorithms can
be developed and applied in our toolkit. We design a ``run`` function
that plays one complete game and provides the data for training RL
agents. The example is shown below:

.. code:: python

    import tensorflow as tf
    import os

    import rlcard
    from rlcard.agents import DQNAgent
    from rlcard.utils import set_global_seed, tournament
    from rlcard.utils import Logger

    # Make environment
    env = rlcard.make('blackjack', config={'seed': 0})
    eval_env = rlcard.make('blackjack', config={'seed': 0})

    # Set the number of episodes and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 10000
    episode_num = 100000

    # The initial memory size
    memory_init_size = 100

    # Train the agent every X steps
    train_every = 1

    # The paths for saving the logs and learning curves
    log_dir = './experiments/blackjack_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[10, 10])
        env.set_agents([agent])
        eval_env.set_agents([agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in range(episode_num):

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('DQN')

        # Save model
        save_dir = 'models/blackjack_dqn'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))

The expected output is something like the following:

::

    ----------------------------------------
      timestep     |  1
      reward       |  -0.7342
    ----------------------------------------
    INFO - Agent dqn, step 100, rl-loss: 1.0042707920074463
    INFO - Copied model parameters to target network.
    INFO - Agent dqn, step 136, rl-loss: 0.7888197302818298
    ----------------------------------------
      timestep     |  136
      reward       |  -0.1406
    ----------------------------------------
    INFO - Agent dqn, step 278, rl-loss: 0.6946825981140137
    ----------------------------------------
      timestep     |  278
      reward       |  -0.1523
    ----------------------------------------
    INFO - Agent dqn, step 412, rl-loss: 0.62268990278244025
    ----------------------------------------
      timestep     |  412
      reward       |  -0.088
    ----------------------------------------
    INFO - Agent dqn, step 544, rl-loss: 0.69050502777099616
    ----------------------------------------
      timestep     |  544
      reward       |  -0.08
    ----------------------------------------
    INFO - Agent dqn, step 681, rl-loss: 0.61789089441299444
    ----------------------------------------
      timestep     |  681
      reward       |  -0.0793
    ----------------------------------------

In Blackjack, the player receives a payoff at the end of the game: 1 if
the player wins, -1 if the player loses, and 0 if it is a tie. The
performance is measured by the average payoff the player obtains by
playing 10000 episodes. The above example shows that the agent achieves
better and better performance during training. The logs and learning
curves are saved in ``./experiments/blackjack_dqn_result/``.
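The saved model can be restored later for evaluation with the standard
TensorFlow ``Saver``. Below is a minimal sketch for a fresh evaluation
script; it assumes the ``DQNAgent`` is constructed with exactly the
same arguments as in the training script so that the saved variables
match:

.. code:: python

    with tf.Session() as sess:
        # Rebuild the agent with the same architecture as at training time
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=eval_env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=eval_env.state_shape,
                         mlp_layers=[10, 10])
        eval_env.set_agents([agent])
        sess.run(tf.global_variables_initializer())

        # Restore the weights saved by the training script
        saver = tf.train.Saver()
        saver.restore(sess, os.path.join(save_dir, 'model'))

        # Evaluate the restored agent against a random agent
        print(tournament(eval_env, evaluate_num)[0])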
Running Multiple Processes
--------------------------

The environments can be run with multiple processes to accelerate the
training. Below is an example of training DQN on Blackjack with
multiple processes.

.. code:: python

    ''' An example of learning a Deep-Q Agent on Blackjack with multiple processes
        Note that we must use the if __name__ == '__main__' guard for multiprocessing
    '''

    import tensorflow as tf
    import os

    import rlcard
    from rlcard.agents import DQNAgent
    from rlcard.utils import set_global_seed, tournament
    from rlcard.utils import Logger

    def main():
        # Make environment
        env = rlcard.make('blackjack', config={'seed': 0, 'env_num': 4})
        eval_env = rlcard.make('blackjack', config={'seed': 0, 'env_num': 4})

        # Set the number of iterations and how frequently we evaluate the performance
        evaluate_every = 100
        evaluate_num = 10000
        iteration_num = 100000

        # The initial memory size
        memory_init_size = 100

        # Train the agent every X steps
        train_every = 1

        # The paths for saving the logs and learning curves
        log_dir = './experiments/blackjack_dqn_result/'

        # Set a global seed
        set_global_seed(0)

        with tf.Session() as sess:

            # Initialize a global step
            global_step = tf.Variable(0, name='global_step', trainable=False)

            # Set up the agents
            agent = DQNAgent(sess,
                             scope='dqn',
                             action_num=env.action_num,
                             replay_memory_init_size=memory_init_size,
                             train_every=train_every,
                             state_shape=env.state_shape,
                             mlp_layers=[10, 10])
            env.set_agents([agent])
            eval_env.set_agents([agent])

            # Initialize global variables
            sess.run(tf.global_variables_initializer())

            # Initialize a Logger to plot the learning curve
            logger = Logger(log_dir)

            for iteration in range(iteration_num):

                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)

                # Feed transitions into agent memory, and train the agent
                for ts in trajectories[0]:
                    agent.feed(ts)

                # Evaluate the performance. Play with random agents.
                if iteration % evaluate_every == 0:
                    logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

            # Close files in the logger
            logger.close_files()

            # Plot the learning curve
            logger.plot('DQN')

            # Save model
            save_dir = 'models/blackjack_dqn'
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(save_dir, 'model'))

    if __name__ == '__main__':
        main()

Example output is as follows:

::

    ----------------------------------------
      timestep     |  17
      reward       |  -0.7378
    ----------------------------------------
    INFO - Copied model parameters to target network.
    INFO - Agent dqn, step 1100, rl-loss: 0.40940183401107797
    INFO - Copied model parameters to target network.
    INFO - Agent dqn, step 2100, rl-loss: 0.44971221685409546
    INFO - Copied model parameters to target network.
    INFO - Agent dqn, step 2225, rl-loss: 0.65466868877410897
    ----------------------------------------
      timestep     |  2225
      reward       |  -0.0658
    ----------------------------------------
    INFO - Agent dqn, step 3100, rl-loss: 0.48663979768753053
    INFO - Copied model parameters to target network.
    INFO - Agent dqn, step 4100, rl-loss: 0.71293979883193974
    INFO - Copied model parameters to target network.
    INFO - Agent dqn, step 4440, rl-loss: 0.55871248245239263
    ----------------------------------------
      timestep     |  4440
      reward       |  -0.0736
    ----------------------------------------
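The speed-up from the extra processes depends on the machine. Below is
a minimal sketch for timing rollouts with different values of
``env_num``; it assumes a ``RandomAgent`` can drive the parallel
environments through ``env.run`` just as the DQN agent does above. Note
that a single ``run`` call may play several games at once when multiple
processes are used, so this measures wall-clock time per call:

.. code:: python

    import time

    import rlcard
    from rlcard.agents import RandomAgent

    def time_rollouts(env_num, num_calls=1000):
        # Build an environment backed by `env_num` processes
        env = rlcard.make('blackjack', config={'seed': 0, 'env_num': env_num})
        env.set_agents([RandomAgent(action_num=env.action_num)])
        start = time.time()
        for _ in range(num_calls):
            env.run(is_training=False)
        return time.time() - start

    if __name__ == '__main__':
        # The __main__ guard is required for multiprocessing
        for env_num in [1, 4]:
            print('env_num={}: {:.2f} seconds'.format(env_num, time_rollouts(env_num)))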
Training CFR on Leduc Hold'em
-----------------------------

To show how we can use ``step`` and ``step_back`` to traverse the game
tree, we provide an example of solving Leduc Hold'em with CFR:

.. code:: python

    import numpy as np

    import rlcard
    from rlcard.agents import CFRAgent
    from rlcard import models
    from rlcard.utils import set_global_seed, tournament
    from rlcard.utils import Logger

    # Make environment and enable step_back
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'allow_step_back': True})
    eval_env = rlcard.make('leduc-holdem', config={'seed': 0})

    # Set the number of episodes and how frequently we evaluate/save the plot
    evaluate_every = 100
    save_plot_every = 1000
    evaluate_num = 10000
    episode_num = 10000

    # The paths for saving the logs and learning curves
    log_dir = './experiments/leduc_holdem_cfr_result/'

    # Set a global seed
    set_global_seed(0)

    # Initialize CFR Agent
    agent = CFRAgent(env)
    agent.load()  # If we have a saved model, we first load the model

    # Evaluate CFR against the pre-trained NFSP model
    eval_env.set_agents([agent, models.load('leduc-holdem-nfsp').agents[0]])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
        agent.train()
        print('\rIteration {}'.format(episode), end='')

        # Evaluate the performance. Play with NFSP agents.
        if episode % evaluate_every == 0:
            agent.save()  # Save model
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('CFR')

In the above example, the performance is measured by playing against a
pre-trained NFSP model. The expected output is as below:

::

    Iteration 0
    ----------------------------------------
      timestep     |  192
      reward       |  -1.3662
    ----------------------------------------
    Iteration 100
    ----------------------------------------
      timestep     |  19392
      reward       |  0.9462
    ----------------------------------------
    Iteration 200
    ----------------------------------------
      timestep     |  38592
      reward       |  0.8591
    ----------------------------------------
    Iteration 300
    ----------------------------------------
      timestep     |  57792
      reward       |  0.7861
    ----------------------------------------
    Iteration 400
    ----------------------------------------
      timestep     |  76992
      reward       |  0.7752
    ----------------------------------------
    Iteration 500
    ----------------------------------------
      timestep     |  96192
      reward       |  0.7215
    ----------------------------------------

We observe that CFR achieves better performance than NFSP. However, CFR
requires traversal of the game tree, which is infeasible in large
environments.
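The same ``tournament`` utility can measure performance against other
opponents as well. Below is a minimal sketch that evaluates the trained
CFR agent against a random agent, reusing ``agent`` and
``evaluate_num`` from the example above:

.. code:: python

    from rlcard.agents import RandomAgent

    # Evaluate the trained CFR agent against a random agent instead of NFSP
    random_eval_env = rlcard.make('leduc-holdem', config={'seed': 0})
    random_eval_env.set_agents([agent,
                                RandomAgent(action_num=random_eval_env.action_num)])
    print(tournament(random_eval_env, evaluate_num)[0])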
Having Fun with Pretrained Leduc Model
--------------------------------------

We have designed simple human interfaces for playing against the
pretrained model. Leduc Hold'em is a simplified version of Texas
Hold'em; its rules can be found in the games documentation. An example
of playing against the Leduc Hold'em CFR model is as follows:

.. code:: python

    import rlcard
    from rlcard import models
    from rlcard.agents import LeducholdemHumanAgent as HumanAgent
    from rlcard.utils import print_card

    # Make environment
    # Set 'record_action' to True because we need it to print the results
    env = rlcard.make('leduc-holdem', config={'record_action': True})
    human_agent = HumanAgent(env.action_num)
    cfr_agent = models.load('leduc-holdem-cfr').agents[0]
    env.set_agents([human_agent, cfr_agent])

    print(">> Leduc Hold'em pre-trained model")

    while (True):
        print(">> Start a new game")

        trajectories, payoffs = env.run(is_training=False)
        # If the human does not take the final action, we need to
        # print the other players' actions
        final_state = trajectories[0][-1][-2]
        action_record = final_state['action_record']
        state = final_state['raw_obs']
        _action_list = []
        for i in range(1, len(action_record)+1):
            if action_record[-i][0] == state['current_player']:
                break
            _action_list.insert(0, action_record[-i])
        for pair in _action_list:
            print('>> Player', pair[0], 'chooses', pair[1])

        # Let's take a look at what the agent's card is
        print('===============     CFR Agent    ===============')
        print_card(env.get_perfect_information()['hand_cards'][1])

        print('===============     Result     ===============')
        if payoffs[0] > 0:
            print('You win {} chips!'.format(payoffs[0]))
        elif payoffs[0] == 0:
            print('It is a tie.')
        else:
            print('You lose {} chips!'.format(-payoffs[0]))
        print('')
        input("Press any key to continue...")

Example output is as follows:

::

    >> Leduc Hold'em pre-trained model
    >> Start a new game
    >> Agent 1 chooses raise

    =============== Community Card ===============
    ┌─────────┐
    │░░░░░░░░░│
    │░░░░░░░░░│
    │░░░░░░░░░│
    │░░░░░░░░░│
    │░░░░░░░░░│
    │░░░░░░░░░│
    │░░░░░░░░░│
    └─────────┘
    ===============   Your Hand    ===============
    ┌─────────┐
    │J        │
    │         │
    │         │
    │    ♥    │
    │         │
    │         │
    │        J│
    └─────────┘
    ===============     Chips      ===============
    Yours:   +
    Agent 1: +++
    =========== Actions You Can Choose ===========
    0: call, 1: raise, 2: fold

    >> You choose action (integer):

We also provide a running demo of a rule-based agent for UNO. Try it by
running ``examples/uno_human.py``.
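The pretrained opponent is easy to swap out. For instance, the
pre-trained NFSP model used for evaluation in the CFR section can be
loaded in the same way; a minimal sketch, reusing ``env`` and
``human_agent`` from the example above:

.. code:: python

    # Play against the pre-trained NFSP model instead of the CFR model
    nfsp_agent = models.load('leduc-holdem-nfsp').agents[0]
    env.set_agents([human_agent, nfsp_agent])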
Leduc Hold'em as Single-Agent Environment
-----------------------------------------

We have wrapped the environment as a single-agent environment by
assuming that the other players play with pre-trained models. The
interfaces are exactly the same as OpenAI Gym's. Thus, any single-agent
algorithm can be connected to the environment. An example on Leduc
Hold'em is as below:

.. code:: python

    import tensorflow as tf
    import os
    import numpy as np

    import rlcard
    from rlcard.agents import DQNAgent
    from rlcard.agents import RandomAgent
    from rlcard.utils import set_global_seed, tournament
    from rlcard.utils import Logger

    # Make environment
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'single_agent_mode': True})
    eval_env = rlcard.make('leduc-holdem', config={'seed': 0, 'single_agent_mode': True})

    # Set the number of timesteps and how frequently we evaluate the performance
    evaluate_every = 1000
    evaluate_num = 10000
    timesteps = 100000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    # The paths for saving the logs and learning curves
    log_dir = './experiments/leduc_holdem_single_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[128, 128])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        state = env.reset()

        for timestep in range(timesteps):
            action = agent.step(state)
            next_state, reward, done = env.step(action)
            ts = (state, action, reward, next_state, done)
            agent.feed(ts)
            # Move on to the next state; in single-agent mode the
            # environment starts a new game when the current one ends
            state = next_state

            if timestep % evaluate_every == 0:
                rewards = []
                eval_state = eval_env.reset()
                for _ in range(evaluate_num):
                    action, _ = agent.eval_step(eval_state)
                    eval_state, reward, done = eval_env.step(action)
                    if done:
                        rewards.append(reward)
                logger.log_performance(env.timestep, np.mean(rewards))

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('DQN')

        # Save model
        save_dir = 'models/leduc_holdem_single_dqn'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
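Since the wrapper follows the Gym-style ``reset``/``step`` loop, any
single-agent method can be plugged in. Below is a minimal sketch that
drives the environment with uniformly random legal actions; it assumes
only the ``state``/``reward``/``done`` interface shown above, including
that a new game starts automatically when one ends (the training loop
above steps through 100000 timesteps without manual resets):

.. code:: python

    import numpy as np

    import rlcard

    # Single-agent Leduc Hold'em: the other seat is played by a pre-trained model
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'single_agent_mode': True})

    state = env.reset()
    total_reward = 0.0
    for _ in range(100):
        # Pick a random legal action as a stand-in for a learning algorithm
        action = np.random.choice(state['legal_actions'])
        state, reward, done = env.step(action)
        if done:
            total_reward += reward
    print('Total reward over the sampled games: {}'.format(total_reward))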