-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: Unga Bunga.py
More file actions
172 lines (124 loc) · 7.54 KB
/
Unga Bunga.py
File metadata and controls
172 lines (124 loc) · 7.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import numpy as np
from PIL import Image
# Python imports modules by name, never by filename: "from activations.py
# import ..." would look for a submodule named "py" inside a package called
# "activations" and fail with ModuleNotFoundError.
from activations import ReLU, Tanh
from losses import mean_squared_error
# --- Hyperparameters: tune these for your own experiment ---------------------
# (The original file assigned each name to itself, e.g. `EPOCHS = EPOCHS`,
# which raises NameError; concrete defaults are provided instead.)
PATH = "image.jpg"  # Path of the training image
EPOCHS = 1000       # Number of Iterations
BATCH_SIZE = 1      # Number of samples that are gonna be passed through the Neural Network on each epoch
LR = 0.001          # Constant that will multiply the weights derivative. Preferably a number below 1 in order to avoid exploding gradients...be careful with vanishing ones
LAYER1 = 256        # Size of first layer, AKA number of neurons
LAYER2 = 128        # idem for second layer
LAYER3 = 64         # and so on...
# This one can vary. Binary classifier? Use 1. Multi-class? Number of labels.
# Image reconstruction (as here)? It must equal the flattened input size.
# NOTE(review): with BATCH_SIZE rows of a (100, 100, 3) image per batch, the
# flattened input is BATCH_SIZE*100*3 = 300 — confirm against INPUT below.
OUTPUT = 300
# Load the training image, force a fixed 100x100 RGB shape, and normalize.
# Neural nets behave badly on raw [0, 255] pixels; dividing by 127.5 and
# subtracting 1 maps every value into the interval [-1, 1].
image = Image.open(PATH).resize((100, 100)).convert("RGB")
image = np.array(image) / 127.5 - 1.0
# Splits the data into batches. If you don't want batches, use batch_size=1
# (or len(data)) and this generator degenerates accordingly.
def DataLoader(data, batch_size):
    """Yield consecutive slices of *data*, each at most *batch_size* items long.

    The final slice may be shorter when len(data) is not a multiple of
    batch_size. Works on anything sliceable (lists, strings, numpy arrays).
    """
    total = len(data)
    start = 0
    while start < total:
        stop = min(start + batch_size, total)
        yield data[start:stop]
        start = stop
# With Linear(Dense) layers we have to flatten the data so matrix
# multiplication lines up, so the input width is BATCH_SIZE times the size of
# one sample. np.prod over the trailing axes equals the original
# shape[1]*shape[2]*shape[3] for a 4-D (N, H, W, C) dataset, but — unlike the
# original hard-coded shape[3] lookup — it does not raise IndexError when the
# data is a single 3-D (H, W, C) image, as produced by the loading code above.
INPUT = BATCH_SIZE * int(np.prod(image.shape[1:]))
# Parameter initialisation. Each dense layer multiplies its input by a weight
# matrix and adds a bias (the bias can be dropped in some setups). Weights are
# drawn from a normal distribution centred on 0 — a common choice for GANs —
# and biases start at zero. Bigger layer sizes mean more compute per step.
def _dense_params(fan_in, fan_out):
    # Returns (weights, bias) for one Linear layer of shape (fan_in, fan_out).
    w = np.random.normal(loc=0, scale=0.01, size=(fan_in, fan_out))
    b = np.zeros(fan_out)
    return w, b

w1, b1 = _dense_params(INPUT, LAYER1)
w2, b2 = _dense_params(LAYER1, LAYER2)
w3, b3 = _dense_params(LAYER2, LAYER3)
w_out, b_out = _dense_params(LAYER3, OUTPUT)
# The training loop. You could wrap this in a function, but inline reads more
# didactically.
for epoch in range(EPOCHS):
    # NOTE(review): a fresh DataLoader generator is built every epoch, so
    # next() always yields the FIRST batch — only that slice of the data is
    # ever trained on. Kept as-is (fixing it needs per-batch shape handling);
    # confirm whether training on one fixed batch is intended.
    x = next(DataLoader(image, BATCH_SIZE))  # renamed from `input` (shadowed the builtin)
    x = x.flatten()  # Linear layers need a flat 1-D vector.

    # ---- Forward pass: each Linear layer computes out = (input @ W) + b ----
    # np.matmul is used over np.dot; for these shapes they are equivalent.
    l1 = np.matmul(x, w1) + b1
    l1_out, dact1 = ReLU(l1)   # activation returns (output, derivative)
    l2 = np.matmul(l1_out, w2) + b2  # each layer feeds the next
    l2_out, dact2 = ReLU(l2)
    l3 = np.matmul(l2_out, w3) + b3
    l3_out, dact3 = ReLU(l3)
    out = np.matmul(l3_out, w_out) + b_out
    output, dactout = Tanh(out)  # Tanh lands in [-1, 1] — a normalized image.

    # Autoencoder-style objective: the label IS the input, so the network
    # learns to decompose and recompose the image. `loss` is for reporting;
    # backprop only consumes `dloss`.
    loss, dloss = mean_squared_error(output, x)

    # ---- Backpropagation (chain rule) ----
    # dloss/dw_out = dloss/doutput * doutput/dout * dout/dw_out
    #             = dloss * dtanh(out) * l3_out
    A = dloss * dactout  # elementwise; shape (OUTPUT,)
    # BUGFIX: with 1-D activations (we flattened above), np.matmul(A.T, l3_out)
    # is a dot product of two different-length vectors and raises a ValueError.
    # The weight gradient is the outer product, which reproduces exactly the
    # (OUTPUT, LAYER3) matrix the 2-D batched formula would give.
    dw_out = np.outer(A, l3_out)
    # dout/db_out = 1, so the bias gradient is A itself.
    db_out = A * 1

    # dloss/dw3 = [dloss * dactout] * w_out * dReLU(l3) * l2_out
    B = np.matmul(A, w_out.T) * dact3  # (OUTPUT,) @ (OUTPUT, LAYER3) -> (LAYER3,)
    dw3 = np.outer(B, l2_out)          # (LAYER3, LAYER2)
    db3 = B * 1                        # dl3/db3 = 1

    # dloss/dw2 = B * w3 * dReLU(l2) * l1_out
    C = np.matmul(B, w3.T) * dact2
    dw2 = np.outer(C, l1_out)
    db2 = C

    # dloss/dw1 = C * w2 * dReLU(l1) * input
    D = np.matmul(C, w2.T) * dact1
    dw1 = np.outer(D, x)
    db1 = D

    # ---- SGD update ----
    # If dw > 0 we are on the "right side" of the optimum (weights-vs-loss
    # curve), so we subtract: the new weight ends up smaller than the old one.
    # Weight gradients are transposed back to the (fan_in, fan_out) layout of
    # the weight matrices; bias gradients already match and must NOT be
    # transposed.
    w_out = w_out - LR * dw_out.T
    b_out = b_out - LR * db_out
    w3 = w3 - LR * dw3.T
    b3 = b3 - LR * db3
    w2 = w2 - LR * dw2.T
    b2 = b2 - LR * db2
    w1 = w1 - LR * dw1.T
    b1 = b1 - LR * db1

    # Progress report every 100 epochs.
    # BUGFIX: the original `epoch+1 % 100 == 0` parsed as `epoch + (1 % 100)`,
    # i.e. `epoch + 1`, which is truthy on every epoch — parentheses required.
    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch}\nLoss: {loss}")
# Since we're working with an image, render the final reconstruction.
# NOTE(review): this reshape assumes OUTPUT == BATCH_SIZE*100*100*3 — verify
# the constants at the top actually satisfy that before running.
output = np.reshape(output, (BATCH_SIZE, 100, 100, 3))
print(output[0])  # raw normalized values, for inspection
output[0] = (output[0] + 1.0) * 127.5  # undo the [-1, 1] normalization back to [0, 255]
output = Image.fromarray(output[0].astype(np.uint8))
# BUGFIX: Image.show() returns None, so the original print(output.show())
# displayed the image and then printed a useless "None".
output.show()
# Simply sticking to the linear layer is too meh. Any mediocre tutorial does this. How about some Conv2D? Not here, though!