WEIGHTS_ABSOLUTE_PATH = '/content/gdrive/MyDrive/snake.reinforce.json'
from google.colab import drive
drive.mount('/content/gdrive')
# Fail fast if snake_game.py is not the expected revision.
md5 = !md5sum 'snake_game.py'
assert 'fe6f1b08c885095e7425b25c5a7ad9e5' == md5[0].split()[0]
from snake_game import SnakeGame
from datetime import datetime, timedelta
from tensorflow import keras
import tensorflow as tf
import matplotlib.pyplot as plt
import copy
import json
print('tensorflow', tf.__version__)
!python -V
tensorflow 2.8.2
Python 3.7.13
env = SnakeGame()
def extract_features(game=env):
    # Probe each action on a copy of the game: does it end the episode?
    # (step() is assumed to return a namedtuple with a .done field.)
    obstacle = lambda k: copy.deepcopy(game).step(k).done
    snake, food = game.get_state()
    # Flatten the (h, w) coordinates into single board indices.
    head, food = [h + w * game.board_height for h, w in (snake.head, food)]
    # Pack the four "this move kills us" flags into a 4-bit mask.
    obs = sum(v << i for i, v in enumerate(obstacle(k) for k in game.ACTION))
    # Normalize everything to [0, 1].
    last = game.board_height * game.board_width - 1
    return obs / 0b1111, head / last, food / last
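To see how the obstacle mask works, here is a stand-alone sketch of the same bit-packing, with hypothetical done flags in place of real game probes:

flags = [True, False, False, True]  # hypothetical: first and last action kill, the rest are safe
mask = sum(v << i for i, v in enumerate(flags))
print(bin(mask), mask / 0b1111)     # 0b1001 0.6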
def build_model():
    x_in = x = keras.Input(shape=(3,))
    x = keras.layers.Dense(32, activation='relu')(x)
    x = keras.layers.Dense(32, activation='relu')(x)
    # One raw logit per action; both sampling and the loss consume logits.
    x = keras.layers.Dense(len(env.ACTION))(x)
    model = keras.Model(x_in, x)
    # Cross-entropy on the sampled action, weighted later by the discounted
    # return, is exactly the REINFORCE policy-gradient loss.
    model.compile(loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  optimizer='Nadam')
    model.summary()
    return model
model = build_model()
Model: "model" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_1 (InputLayer) [(None, 3)] 0 dense (Dense) (None, 32) 128 dense_1 (Dense) (None, 32) 1056 dense_2 (Dense) (None, 4) 132 ================================================================= Total params: 1,316 Trainable params: 1,316 Non-trainable params: 0 _________________________________________________________________
def play_game():
    env.reset()
    game_data = []
    done = False
    while not done:
        features = extract_features()
        logits = model(tf.expand_dims(features, 0), training=False)
        # Sample an action from the policy rather than taking the argmax.
        action = tf.squeeze(tf.random.categorical(logits, 1))
        _, reward, done, info = env.step(action)
        # Clip the death penalty so one bad ending cannot drown the food rewards.
        game_data.append((features, action, -1e-2 if reward < 0 else reward))
    return game_data, info.score
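A quick smoke test: one episode under the current (still untrained) policy. The numbers vary from run to run.

data, score = play_game()
print('steps:', len(data), '| score:', score)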
# Discounted return, computed right to left: G_t = r_t + 0.9 * G_{t+1}.
disc = lambda r: tf.scan(lambda a, x: x + .9 * a, r, reverse=True)
disc(tf.stack([0, 0, 1, 0, 1, 0, -1e-2])).numpy()
array([ 1.4607855 ,  1.623095  ,  1.803439  ,  0.89271003,  0.9919    ,
       -0.009     , -0.01      ], dtype=float32)
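The same recurrence in plain Python, as a sanity check against the tf.scan version above:

def disc_py(rewards, gamma=0.9):
    # Accumulate from the last step backwards: G_t = r_t + gamma * G_{t+1}.
    out, acc = [], 0.0
    for r in reversed(rewards):
        acc = r + gamma * acc
        out.append(acc)
    return out[::-1]

print(disc_py([0, 0, 1, 0, 1, 0, -1e-2]))  # matches the float32 array above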
def reinforce(time_limit=timedelta(hours=11, minutes=50)):
    history = []
    best_score = -float('inf')
    stop_time = datetime.now() + time_limit
    while datetime.now() < stop_time:
        data, score = play_game()
        if best_score < score:
            best_score = score
            model.save_weights('best_weights.h5')
        X, Y, R = [tf.stack(i) for i in zip(*data)]
        # Weighting each step's cross-entropy by its discounted return
        # makes train_on_batch one REINFORCE policy-gradient update.
        history.append((
            model.train_on_batch(X, Y, sample_weight=disc(R)),
            float(tf.math.count_nonzero(R > 0) - 1), len(Y), score))
        if len(history) % 10 == 0:
            # Rolling averages over the last 100 games.
            log = zip(*history[-100:])
            print('\r%5d' % len(history),
                  '| Loss: %7.4f' % tf.reduce_mean(next(log)),
                  '| Reward: %5.2f' % tf.reduce_mean(next(log)),
                  '| Age: %4d' % tf.reduce_mean(next(log)),
                  '| Score: %4d' % tf.reduce_mean(next(log)),
                  '| Best: %4d' % best_score,
                  '| %d %%' % (100 * (1 - (stop_time - datetime.now()) / time_limit)),
                  end='')
    return history
# zip(*...) turns the per-game tuples into four metric series
# (loss, reward, age, score), consumed one at a time by the plots below.
history = zip(*reinforce())
plt.rc('figure', figsize=(13, 4))
22170 | Loss: 0.0016 | Reward: 21.14 | Age: 762 | Score: 1474 | Best: 4789 | 99 %
plt.plot(next(history))
plt.ylabel('Loss')
plt.xlabel('Games') and None
plt.plot(next(history))
plt.ylabel('Reward')
plt.xlabel('Games') and None
plt.plot(next(history))
plt.ylabel('Age')
plt.xlabel('Games') and None
plt.plot(next(history))
plt.ylabel('Score')
plt.xlabel('Games') and None
model.load_weights('best_weights.h5')
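Besides the logit sanity check below, the restored policy can be rolled out greedily (argmax instead of sampling). A minimal sketch, with a step cap as a safeguard since a deterministic policy can cycle forever:

env.reset()
done, steps = False, 0
while not done and steps < 10_000:  # cap: a deterministic policy may loop
    logits = model(tf.expand_dims(extract_features(), 0), training=False)
    _, _, done, info = env.step(int(tf.argmax(logits, -1)[0]))
    steps += 1
print('greedy score:', info.score)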
model.predict(tf.stack([(0,)*model.input_shape[-1],
                        (1,)*model.input_shape[-1]]))
array([[  5.8412104,  -4.8576665,  -5.276733 ,  -1.5355242],
       [  6.1069336, -12.892834 , -12.935051 ,   5.11665  ]], dtype=float32)
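The rows above are raw logits, not probabilities; a softmax turns them into the policy's action distribution (a quick check, reusing the same two feature vectors):

probs = tf.nn.softmax(model.predict(tf.stack([(0,)*model.input_shape[-1],
                                              (1,)*model.input_shape[-1]])))
print(probs.numpy())  # each row sums to 1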
with open(WEIGHTS_ABSOLUTE_PATH, 'w') as f:
    json.dump([w.tolist() for w in model.get_weights()], f)
!md5sum "$WEIGHTS_ABSOLUTE_PATH"
e415c8070ee60176e739dffa95de5b4b /content/gdrive/MyDrive/snake.reinforce.json
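To restore the policy later (for example in a fresh Colab runtime), the JSON dump can be loaded back the same way in reverse; a minimal sketch, assuming the same build_model() architecture has already been constructed:

import numpy as np

with open(WEIGHTS_ABSOLUTE_PATH) as f:
    model.set_weights([np.array(w) for w in json.load(f)])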