$$

*Xuehao(David) Hu*

In [209]:

```
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import neuralnetworksbylayer as nn
import random
from copy import copy
```

In [46]:

```
#a function used to draw the three graphs at the bottom of this notebook
# first: wins,draws,losses per batch
# second: Epsilon per batch
# third: TD error per SCG iteration
def plotStatus(batch, nnetQ):
    """Draw three stacked diagnostic plots for the training run.

    Panel 1: wins/draws/losses per bin of games, panel 2: the epsilon
    schedule so far, panel 3: the TD error per SCG training iteration.
    Relies on the module-level ``finalR`` and ``epsilonTrace`` lists that
    the training loop maintains.
    """
    gamesPerBin = 50
    outcomes = np.array(finalR)
    nFull = int(len(outcomes) / gamesPerBin) * gamesPerBin  # drop partial bin

    # Panel 1: outcome counts, binned in groups of gamesPerBin games,
    # plotted in the order wins, draws, losses.
    plt.subplot(3, 1, 1)
    for target in (1, 0, -1):
        counts = (outcomes == target)[:nFull].reshape((-1, gamesPerBin)).sum(1)
        plt.plot(counts)
    plt.ylim(0, gamesPerBin)
    plt.xlabel('Avg over batches of ' + str(gamesPerBin) + ' games')

    # Panel 2: exploration rate up to the current batch.
    plt.subplot(3, 1, 2)
    plt.plot(epsilonTrace[:batch])
    plt.ylabel("epsilon")
    plt.xlabel('Batches')
    plt.ylim(0, 1)

    # Panel 3: TD error per SCG iteration, straight from the network.
    plt.subplot(3, 1, 3)
    plt.plot(nnetQ.getErrorTrace())
    plt.ylabel('TD Error')
    plt.xlabel('SCG Iterations')
    plt.tight_layout()
```

In [47]:

```
#helper function for make image
#drawCircle(boardArray,xStart,yStart)
#Params: boardArray(21by21matrix),
# xStart and yStart are the x and y coordinate in the boardingArray
# for the top left point of the 7 by 7 symbol(Circle)
#Return: modified boardArray( after inserting and circle)
def drawCircle(boardArray, xStart, yStart):
    """Stamp a 7x7 circle ('O') of 1s into ``boardArray``.

    ``xStart``/``yStart`` give the top-left corner of the 7x7 tile.  The
    array is mutated in place and also returned.
    """
    # (row, col) offsets of the 16 pixels that form the ring.
    ring = [
        (2, 0), (3, 0), (4, 0),          # left edge
        (1, 1), (0, 2), (0, 3), (0, 4),  # top-left arc and top edge
        (1, 5), (2, 6), (3, 6), (4, 6),  # top-right arc and right edge
        (5, 5), (6, 4), (6, 3), (6, 2),  # bottom-right arc and bottom edge
        (5, 1),                          # bottom-left arc
    ]
    for dx, dy in ring:
        boardArray[xStart + dx][yStart + dy] = 1
    return boardArray
#helper function for make image
#drawCross(boardArray,xStart,yStart)
#Params: boardArray(21by21matrix),
# xStart and yStart are the x and y coordinate in the boardingArray
# for the top left point of the 7 by 7 symbol(Cross)
#Return: modified boardArray( after inserting and cross)
def drawCross(boardArray, xStart, yStart):
    """Stamp a 7x7 cross ('X') of 1s into ``boardArray``.

    Draws both diagonals of the 7x7 tile whose top-left corner is
    (``xStart``, ``yStart``); the diagonals share the center pixel, so 13
    distinct cells are set.  The original code listed all 14 assignments by
    hand and wrote the center cell twice (a redundant duplicate, removed
    here).  The array is mutated in place and also returned.
    """
    for d in range(7):
        boardArray[xStart + d][yStart + d] = 1          # main diagonal
        boardArray[xStart + 6 - d][yStart + d] = 1      # anti-diagonal
    return boardArray
```

In [71]:

```
# Return if current board is full or not (check if there is any slot is still in state 'draw')
# Param: an array of state value in it
# return: full status
def full(state):
    """Return True when the board has no empty slot (no cell equals 0)."""
    return bool((state != 0).all())
# Draw a 21 by 21 board with X and O in it per state
# Param: an array of state values (1 by 9 row array; 1 = X, -1 = O, 0 = empty)
# Return: a row array (1 by 441) containing the flattened board image
def makeImage(state):
    """Render a 1x9 board state as a 21x21 binary image, returned flattened to 1x441.

    Side effect: also displays the image via ``plt.imshow``.
    NOTE(review): epsilonGreedy calls makeImage once per candidate action, so
    this imshow call runs inside the training hot loop — confirm whether that
    display side effect is intentional there.
    """
    # add creation of image here, and return as row vector
    boardArray = np.zeros((21,21))
    state = np.asarray(state)
    state = state.reshape((3,3));
    for i in range(3):
        for j in range(3):
            # each board cell occupies its own 7x7 tile of the image
            xStart=i*7
            yStart=j*7
            if state[i][j]==1:
                boardArray=drawCross(boardArray,xStart,yStart)
            if state[i][j]==-1:
                boardArray=drawCircle(boardArray,xStart,yStart)
    plt.imshow(boardArray,cmap='Greys', interpolation='nearest');
    boardArray = boardArray.reshape((1,441))
    return boardArray
# Build the empty-board starting state and its image features.
def initialState():
    """Return (state, features): a 1x9 all-zero board and its 1x441 image."""
    board = np.zeros((1, 9))   # 0 marks an empty cell
    features = makeImage(board)
    return board, features
# Apply a move (action) for X to the current state and return the next state
# and its image features; O then replies with a uniformly random legal move.
def nextState(state, action):
    """Apply X's move ``action`` to ``state``; O replies randomly if board not full.

    Returns (newstate, features) where features is the 1x441 image of newstate.
    """
    newstate = copy(state)  # copy.copy of an ndarray duplicates its data
    if newstate[0,action] != 0:
        # diagnostic only: the caller passed an already-occupied cell
        print("BAD MOVE",newstate,action)
    newstate[0,action] = 1 # 1 for X
    # Now O moves, randomly, if board not full
    if not full(newstate):
        moveO = np.random.choice(np.where(newstate[0,:] == 0)[0])
        # NOTE(review): moveO is drawn from the empty cells, so this check
        # can never fire — dead defensive code.
        if newstate[0,moveO] != 0:
            print("BAD MOVE",newstate,moveO)
        newstate[0,moveO] = -1
    features = makeImage(newstate)
    return newstate, features
# Reinforcement for the transition into `state`: +1 X wins, -1 O wins,
# 0 draw, None while the game continues.
def reinf(oldstate, state):
    """Return the terminal reinforcement for ``state``, or None if not done.

    Checks the eight three-in-a-row lines for X first, then O, then a
    full-board draw.  Side effect: appends the terminal outcome to the
    module-level ``finalR`` list.  ``oldstate`` is accepted for interface
    compatibility but is not used.
    """
    lines = np.array((0,1,2, 3,4,5, 6,7,8,   # rows
                      0,3,6, 1,4,7, 2,5,8,   # columns
                      0,4,8, 2,4,6))         # diagonals
    triples = state[:, lines].reshape((-1, 3))
    for player in (1, -1):                   # X first, then O
        if np.any(np.all(triples == player, axis=1)):
            finalR.append(player)
            return player
    if full(state):
        finalR.append(0)
        return 0
    return None  # game not over yet
```

In [72]:

```
# Demo: a game the cross (X) side loses.
########################################
# Top panel: the board one move before the circle (O) side wins.
plt.subplot(2,1,1)
features1=makeImage([[1,0,0,
0,-1,0,
-1,1,1]])
# Bottom panel: after the winning circle is placed at the top-right corner,
# completing O's 2-4-6 diagonal.
plt.subplot(2,1,2)
features1=makeImage([[1,0,-1,
0,-1,0,
-1,1,1]])
```

In [73]:

```
# Demo: a game the cross (X) side wins.
########################################
# Top panel: the board one move before the cross (X) side wins.
plt.subplot(2,1,1)
features1=makeImage([[1,-1,0,
1,0,0,
-1,-1,1]])
# Bottom panel: after the winning cross is placed at the center,
# completing X's 0-4-8 diagonal.
plt.subplot(2,1,2)
features1=makeImage([[1,-1,0,
1,1,0,
-1,-1,1]])
```

In [74]:

```
def makeSamples(nnet, initialStateF, nextStateF, reinforcementF, nGames, epsilon):
    """Play ``nGames`` epsilon-greedy games and collect (X, R, Qn) samples.

    Parameters
    ----------
    nnet : Q network; ``nnet.layers[0].nInputs`` gives the sample width
        (441 image features + 1 action).
    initialStateF, nextStateF, reinforcementF : the environment functions
        (``initialState``, ``nextState``, ``reinf`` above).
    nGames : number of games to play.
    epsilon : probability of a random (exploratory) move.

    Returns (X, R, Qn, epsilon) with exactly one row per recorded move.

    Fixes over the original version: ``step`` was incremented on every move
    including each game's first, so row 0 was never filled and trailing rows
    stayed all-zero, yet the full arrays were returned and trained on.
    Samples are now packed contiguously and the arrays trimmed to the rows
    actually written.
    """
    maxSamples = nGames * 5  # X moves at most 5 times per 3x3 game
    X = np.zeros((maxSamples, nnet.layers[0].nInputs))
    R = np.zeros((maxSamples, 1))
    Qn = np.zeros((maxSamples, 1))
    step = 0
    for game in range(nGames):
        state, features = initialStateF()
        done = False
        action = None  # marks the first move, for which no sample is stored
        while not done:
            validActions = np.where(state[0, :] == 0)[0]
            nextAction, nextQ = epsilonGreedy(nnet, state, validActions, epsilon)
            # Environment step: X plays nextAction, O replies randomly.
            stateNext, featuresNext = nextStateF(state, nextAction)
            rn = reinforcementF(state, stateNext)  # None while game continues
            if action is not None:
                # Record the previous (features, action) pair with its
                # reward and the Q of the next (state, action).
                X[step, :] = np.hstack((features, action))
                if rn is None:
                    R[step, 0] = 0        # game continues: no reward yet
                    Qn[step, 0] = nextQ
                else:
                    R[step, 0] = rn       # terminal reward
                    Qn[step, 0] = 0       # no next Q at a terminal state
                    done = True
                step += 1
            elif rn is not None:
                done = True  # safety: terminal on first move (impossible on 3x3)
            state = stateNext
            action = nextAction
            features = featuresNext
    # Trim to the rows actually written; all-zero padding rows are dropped.
    return (X[:step, :], R[:step, :], Qn[:step, :], epsilon)
```

In [75]:

```
def epsilonGreedy(nnet, state, validActions, epsilon):
    """Choose an action epsilon-greedily; return (action, Q(state, action)).

    With probability ``epsilon`` a random valid action is taken; otherwise
    the action with the highest predicted Q is chosen, with ties between
    equal Q values broken by shuffling the candidate order.

    Improvements over the original: the 1x441 board image is rendered once
    instead of once per candidate action (makeImage also draws via imshow,
    so this removes redundant work from the training hot loop), and the
    greedy branch reuses the Q value it already computed instead of calling
    the network again.
    """
    features = makeImage(state)  # render the board image exactly once
    if np.random.uniform() < epsilon:
        # Exploratory move: uniform over the empty cells.
        action = np.array([[np.random.choice(validActions)]])
        Q = nnet.use(np.hstack((features, action)))
    else:
        # Greedy move: evaluate every valid action, in a random order so
        # that ties between equal Q values are resolved randomly.
        shuffled = random.sample(validActions.tolist(), len(validActions))
        Qs = [nnet.use(np.hstack((features, np.array([[a]]))))
              for a in shuffled]
        ai = np.argmax(Qs)
        action = np.array([[shuffled[ai]]])
        Q = Qs[ai]  # reuse the value already computed for the chosen action
    return action, Q
```

In [174]:

```
from IPython.display import display, clear_output
fig = plt.figure(figsize=(5,8))
# Parameters
nGames = 5000 #5000
nGamesPerBatch = 100
nBatches = int(nGames/nGamesPerBatch)
print('Running for',nBatches,'batches of',nGamesPerBatch,'games per batch.')
nReplay = 0          # experience-replay passes per batch (0 = disabled)
nSCGIterations = 10  # SCG training iterations per batch
nHidden = [5]        # hidden-layer sizes for the Q network
gamma = 0.8          # discount factor on the next-state Q value
finalEpsilon = 0.00001# value of epsilon at end of simulation. Decay rate is calculated
# Per-batch multiplicative decay so that epsilon reaches finalEpsilon
# after nBatches batches.
epsilonDecay = np.exp(np.log(finalEpsilon)/nBatches)
print('epsilonDecay is',epsilonDecay)
# Number of inputs is the 441-pixel board image plus 1 for the action
nnetQ = nn.NeuralNetwork([441+1] + nHidden + [1])
nnetQ.setInputRanges([[-1,1]]*441 + [[0,8]])
finalR = []  # per-game outcomes (1/0/-1), appended by reinf()
epsilonTrace = np.zeros((nBatches,1))
epsilon = 1
for batch in range(nBatches):
    # Collect epsilon-greedy samples, then one SCG pass on the
    # TD targets R + gamma * Qn.
    X,R,Qn,epsilonjunk = makeSamples(nnetQ, initialState, nextState, reinf,
                                     nGamesPerBatch, epsilon)
    nnetQ.train(X, R + gamma*Qn, nIterations=nSCGIterations)
    # Experience replay: refresh next-state Q estimates and retrain
    # (inactive here since nReplay == 0).
    for nq in range(nReplay):
        Qn[:-1,:] = nnetQ.use(X[1:,:])
        nnetQ.train(X, R + gamma*Qn, nIterations=nSCGIterations)
    epsilonTrace[batch] = epsilon
    # NOTE(review): max(epsilonDecay, 0.01) floors the decay rate at 0.01;
    # with these settings epsilonDecay is about 0.79, so the max is a no-op.
    epsilon *= max(epsilonDecay,0.01)
    if True:
        # Live progress plot, redrawn in place each batch.
        plt.clf()
        plotStatus(batch,nnetQ)
        clear_output(wait=True)
        display(fig)
        plt.draw()
        plt.pause(0.01)
# NOTE(review): indentation was lost in export; this final clear_output is
# placed after the loop (clearing the last display) — confirm the original
# nesting.
clear_output(wait=True)
```

In [122]:

```
import matplotlib.image as mpimg
# Compare result plots for three training lengths (all other parameters
# fixed: nGamesPerBatch=100, nSCGIterations=10, nHidden=[5], gamma=0.8,
# finalEpsilon=0.00001).
# Left: 100000 games.
#plt.figure(2)
plt.subplot(131)
img100000=mpimg.imread('http://www.cs.colostate.edu/~xuehaohu/img-100000-games.png')
imgplot = plt.imshow(img100000)
# Middle: 50000 games.
plt.subplot(132)
img50000=mpimg.imread('http://www.cs.colostate.edu/~xuehaohu/img-50000-games.png');
imgplot = plt.imshow(img50000)
# Right: 5000 games.
plt.subplot(133)
img5000=mpimg.imread('http://www.cs.colostate.edu/~xuehaohu/img-5000-games.png');
imgplot = plt.imshow(img5000)
```

In [126]:

```
# Compare batch sizes at 50000 games (other parameters fixed:
# nSCGIterations=10, nHidden=[5], gamma=0.8, finalEpsilon=0.00001).
# Left: nGamesPerBatch=100.
plt.subplot(121)
img100batch=mpimg.imread('http://www.cs.colostate.edu/~xuehaohu/img-50000-games.png');
imgplot = plt.imshow(img100batch)
# Right: nGamesPerBatch=300.
plt.subplot(122)
img300batch=mpimg.imread('http://www.cs.colostate.edu/~xuehaohu/img-300-batch.png');
imgplot = plt.imshow(img300batch)
```

[1 Number of hidden layers] More layers took a long time to run on my laptop, but, as expected, they increased the wins with the other parameters unchanged.

[2 Gamma] Gamma represents how strongly we reinforce future rewards during learning. From the dynamic output graph, a bigger gamma sometimes causes large fluctuations at some points. This makes sense because a bigger gamma pushes the learner toward the "best next move", which may turn out to be a bad move in the long run. A smaller gamma helps the learner improve steadily: it may not find the "best next move" immediately, but by trying more possible moves during training it tends to find good moves in the long run.

[3 Epsilon] A bigger epsilon essentially makes the learner take more random moves as it goes through more games. From the output, the smaller epsilon helps the wins reach their plateau earlier, but they fluctuate more than with the bigger epsilon. This makes sense because a smaller epsilon makes greedier moves, which, much like a large gamma, tries to make the best moves as early as possible; the downside is that it probably misses explorations that could increase the wins. That is, a bigger epsilon makes the learner take more random moves, letting it try more possible moves — especially as the game board becomes bigger — so it learns more steadily and achieves better wins in the long run.

[4 SCGIterations] Scaled conjugate gradient fits the model for each batch. I did not find a perfect iteration count for 300 games per batch, but for a given number of games per batch we want an appropriate iteration count: too many iterations cause overfitting and hence larger error, while too few iterations do not fit the games well, which also causes error.

In [208]:

```
# Visualize the first hidden layer's input weights (for its first unit) as a
# 21x21 board-shaped image.  Only layer 0 has the 441 image inputs; the
# original built a ragged list over *all* layers (later layers have far fewer
# weight rows) and passed it to np.asarray, which produces a deprecated /
# erroring ragged array in modern NumPy.  Indexing layer 0 directly yields
# the same image without the ragged intermediate.
weights = np.asarray(nnetQ.layers[0].W[:441, 0]).reshape((21, 21))
plt.imshow(weights, cmap='Greys', interpolation='nearest');
```

Come up with some ways to visualize what the units in the hidden layers are doing. For example, you can visualize the weights in the first layer's hidden units as a square image of corresponding to the board. You can also find samples for which a unit's output is the highest.

Above is the image of the weights in the only hidden layer (as I have nHidden = [5], there are 5 units).

From this final state, we can clearly see three Xs on the left, which makes the cross side win — as expected, since we are training the neural network to learn as much as possible.

In [212]:

```
# Display the final status plot saved from the run above, with parameters:
# nGames = 5000; nGamesPerBatch = 100; nBatches = int(nGames/nGamesPerBatch);
# nSCGIterations = 10; nHidden = [5]; gamma = 0.8; finalEpsilon = 0.00001
plt.subplot(111)
img300batch=mpimg.imread('http://www.cs.colostate.edu/~xuehaohu/final-state.png');
imgplot = plt.imshow(img300batch)
```

As we can also see from the final state above, the TD error rises sharply at the end, probably because the network is overfitting the games in each batch. Meanwhile, the wins and losses also fluctuate heavily as training reaches the end.

Therefore, the final game can randomly be either a loss or a win (in this case, it was a win). So, just as in the first graph, training does not reach a stable final result; the parameters I have right now do not lead to a good result as defined at the beginning.