#! /usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np
import tensorflow as tf
from six.moves import xrange

__all__ = [
    'discount_episode_rewards',
    'cross_entropy_reward_loss',
    'log_weight',
    'choice_action_by_probs',
]


def discount_episode_rewards(rewards=None, gamma=0.99, mode=0):
    """Take a 1D float array of rewards and compute the discounted rewards for an
    episode. A non-zero reward is treated as the end of an episode.

    Parameters
    ----------
    rewards : numpy array
        1D array of rewards (convert a plain Python list first, e.g. with ``np.asarray``).
    gamma : float
        Discount factor.
    mode : int
        Mode for computing the discounted rewards.
            - If mode == 0, reset the discount process when a non-zero reward is encountered (Ping-pong game).
            - If mode == 1, do not reset the discount process.

    Returns
    --------
    numpy array of float
        The discounted rewards.

    Examples
    ----------
    >>> rewards = np.asarray([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1])
    >>> gamma = 0.9
    >>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma)
    >>> print(discount_rewards)
    [ 0.72899997  0.81        0.89999998  1.          0.72899997  0.81
      0.89999998  1.          0.72899997  0.81        0.89999998  1.        ]
    >>> discount_rewards = tl.rein.discount_episode_rewards(rewards, gamma, mode=1)
    >>> print(discount_rewards)
    [ 1.52110755  1.69011939  1.87791049  2.08656716  1.20729685  1.34144104
      1.49048996  1.65610003  0.72899997  0.81        0.89999998  1.        ]

    """
    if rewards is None:
        raise Exception("rewards should be a list")
    discounted_r = np.zeros_like(rewards, dtype=np.float32)
    running_add = 0
    for t in reversed(xrange(0, rewards.size)):
        if mode == 0:
            # Reset the running return at episode boundaries (non-zero reward).
            if rewards[t] != 0:
                running_add = 0
        # Standard discounted-return recurrence: R[t] = rewards[t] + gamma * R[t+1].
        running_add = running_add * gamma + rewards[t]
        discounted_r[t] = running_add
    return discounted_r


def cross_entropy_reward_loss(logits, actions, rewards, name=None):
    """Calculate the loss for a Policy Gradient network.

    Parameters
    ----------
    logits : tensor
        The network outputs without softmax. This function implements softmax inside.
    actions : tensor or placeholder
        The agent actions.
    rewards : tensor or placeholder
        The rewards.

    Returns
    --------
    Tensor
        The TensorFlow loss function.

    Examples
    ----------
    >>> states_batch_pl = tf.placeholder(tf.float32, shape=[None, D])
    >>> network = InputLayer(states_batch_pl, name='input')
    >>> network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='relu1')
    >>> network = DenseLayer(network, n_units=3, name='out')
    >>> probs = network.outputs
    >>> sampling_prob = tf.nn.softmax(probs)
    >>> actions_batch_pl = tf.placeholder(tf.int32, shape=[None])
    >>> discount_rewards_batch_pl = tf.placeholder(tf.float32, shape=[None])
    >>> loss = tl.rein.cross_entropy_reward_loss(probs, actions_batch_pl, discount_rewards_batch_pl)
    >>> train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)

    """
    # Softmax cross-entropy per step, weighted by the rewards and summed over the batch.
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=actions, logits=logits, name=name)
    return tf.reduce_sum(tf.multiply(cross_entropy, rewards))


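# Note: ``cross_entropy_reward_loss`` above is the REINFORCE-style surrogate
#     loss = sum_t rewards[t] * (-log softmax(logits[t])[actions[t]])
# so, for positive (discounted) rewards, minimising it increases the log-probability
# of the sampled actions in proportion to the reward they received.

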
def log_weight(probs, weights, name='log_weight'):
    """Log weight.

    Parameters
    -----------
    probs : tensor
        The probabilities. If it is a network output, it should usually be scaled to [0, 1] via softmax first.
    weights : tensor
        The weights.

    Returns
    --------
    Tensor
        The tensor after applying the log-weighted expression.

    """
    with tf.variable_scope(name):
        # Reward-weighted expected log-probability: mean(log(probs) * weights).
        exp_v = tf.reduce_mean(tf.log(probs) * weights)
        return exp_v


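# A minimal usage sketch for ``log_weight`` (assumes the TF1-style graph mode implied by
# ``tf.variable_scope`` / ``tf.log`` above; the placeholder names below are illustrative only):
# >>> action_probs = tf.placeholder(tf.float32, shape=[None])   # pi(a_t | s_t) of the taken actions
# >>> rewards_pl = tf.placeholder(tf.float32, shape=[None])     # e.g. discounted rewards
# >>> exp_v = tl.rein.log_weight(action_probs, rewards_pl)
# >>> loss = -exp_v  # minimising the negative maximises the reward-weighted log-probability

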
def choice_action_by_probs(probs=(0.5, 0.5), action_list=None):
    """Choose and return an action given the action probability distribution.

    Parameters
    ------------
    probs : list of float
        The probability distribution of all actions.
    action_list : None or a list of int or others
        A list of actions given as integers, strings or other types. If None, an integer in the range 0 to len(probs)-1 is returned.

    Returns
    --------
    float, int or str
        The chosen action.

    Examples
    ----------
    >>> for _ in range(5):
    >>>     a = choice_action_by_probs([0.2, 0.4, 0.4])
    >>>     print(a)
    0
    1
    1
    2
    1
    >>> for _ in range(3):
    >>>     a = choice_action_by_probs([0.5, 0.5], ['a', 'b'])
    >>>     print(a)
    a
    b
    b

    """
    if action_list is None:
        n_action = len(probs)
        action_list = np.arange(n_action)
    else:
        if len(action_list) != len(probs):
            raise Exception("the number of actions should equal the number of probabilities.")
    return np.random.choice(action_list, p=probs)
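

if __name__ == '__main__':
    # A minimal self-contained sketch (illustrative values, not part of the original module)
    # showing how ``discount_episode_rewards`` and ``choice_action_by_probs`` fit together.
    episode_rewards = np.asarray([0, 0, 0, 1, 0, 0, -1], dtype=np.float32)

    # Mode 0 resets the running return at each non-zero reward (episode boundary).
    returns = discount_episode_rewards(episode_rewards, gamma=0.99, mode=0)
    print("discounted returns:", returns)

    # Sample a few actions from an assumed 3-action policy distribution.
    for _ in range(3):
        print("sampled action:", choice_action_by_probs([0.2, 0.3, 0.5]))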