From 3e2218195030370ae8691d762e38d2d2a54bc589 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 00:38:43 +0530 Subject: [PATCH 01/27] added lstm algorithm in neural network section. --- neural_network/lstm.py | 230 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 neural_network/lstm.py diff --git a/neural_network/lstm.py b/neural_network/lstm.py new file mode 100644 index 000000000000..f5dc4c75400d --- /dev/null +++ b/neural_network/lstm.py @@ -0,0 +1,230 @@ +##### Explanation ##### +# This script implements a Long Short-Term Memory (LSTM) network to learn and predict sequences of characters. +# It uses numpy for numerical operations and tqdm for progress visualization. + +# The data is a paragraph about LSTM, converted to lowercase and split into characters. +# Each character is one-hot encoded for training. + +# The LSTM class initializes weights and biases for the forget, input, candidate, and output gates. +# It also initializes weights and biases for the final output layer. + +# The forward method performs forward propagation through the LSTM network, computing hidden and cell states. +# It uses sigmoid and tanh activation functions for the gates and cell states. + +# The backward method performs backpropagation through time, computing gradients for the weights and biases. +# It updates the weights and biases using the computed gradients and the learning rate. + +# The train method trains the LSTM network on the input data for a specified number of epochs. +# It uses one-hot encoded inputs and computes errors using the softmax function. + +# The test method evaluates the trained LSTM network on the input data, computing accuracy based on predictions. + +# The script initializes the LSTM network with specified hyperparameters and trains it on the input data. +# Finally, it tests the trained network and prints the accuracy of the predictions. 
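For readers skimming the patch, the following is a minimal standalone sketch of the forward step the comments above describe: one-hot encode a character, concatenate it with the previous hidden state, and pass the result through the forget, input, candidate, and output gates. It is not part of the committed file; the toy vocabulary, the hidden size of 4, and the random weights are assumptions chosen only for illustration.

import numpy as np

rng = np.random.default_rng(0)

vocab = sorted(set("lstm"))                  # toy 4-character vocabulary (assumption)
char_to_idx = {c: i for i, c in enumerate(vocab)}
char_size, hidden_dim = len(vocab), 4        # illustrative sizes

def one_hot(ch: str) -> np.ndarray:
    v = np.zeros((char_size, 1))
    v[char_to_idx[ch]] = 1.0
    return v

def sigmoid(x: np.ndarray) -> np.ndarray:
    return 1.0 / (1.0 + np.exp(-x))

# Each gate weight matrix acts on the concatenation [h_prev; x].
wf, wi, wc, wo = (rng.uniform(-1, 1, (hidden_dim, char_size + hidden_dim)) for _ in range(4))
bf, bi, bc, bo = (np.zeros((hidden_dim, 1)) for _ in range(4))

h_prev = np.zeros((hidden_dim, 1))           # previous hidden state
c_prev = np.zeros((hidden_dim, 1))           # previous cell state

z = np.concatenate((h_prev, one_hot("s")))   # concatenated input for one time step

f = sigmoid(wf @ z + bf)                     # forget gate
i = sigmoid(wi @ z + bi)                     # input gate
g = np.tanh(wc @ z + bc)                     # candidate cell state
o = sigmoid(wo @ z + bo)                     # output gate

c = f * c_prev + i * g                       # new cell state
h = o * np.tanh(c)                           # new hidden state
print(h.shape)                               # (4, 1)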
+ +##### Data ##### + +##### Imports ##### +from tqdm import tqdm +import numpy as np + +class LSTM: + def __init__(self, data, hidden_dim=25, epochs=1000, lr=0.05): + self.data = data.lower() + self.hidden_dim = hidden_dim + self.epochs = epochs + self.lr = lr + + self.chars = set(self.data) + self.data_size, self.char_size = len(self.data), len(self.chars) + + print(f'Data size: {self.data_size}, Char Size: {self.char_size}') + + self.char_to_idx = {c: i for i, c in enumerate(self.chars)} + self.idx_to_char = {i: c for i, c in enumerate(self.chars)} + + self.train_X, self.train_y = self.data[:-1], self.data[1:] + + self.initialize_weights() + + ##### Helper Functions ##### + def one_hot_encode(self, char): + vector = np.zeros((self.char_size, 1)) + vector[self.char_to_idx[char]] = 1 + return vector + + def initialize_weights(self): + self.wf = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + self.bf = np.zeros((self.hidden_dim, 1)) + + self.wi = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + self.bi = np.zeros((self.hidden_dim, 1)) + + self.wc = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + self.bc = np.zeros((self.hidden_dim, 1)) + + self.wo = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + self.bo = np.zeros((self.hidden_dim, 1)) + + self.wy = self.init_weights(self.hidden_dim, self.char_size) + self.by = np.zeros((self.char_size, 1)) + + def init_weights(self, input_dim, output_dim): + return np.random.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt(6 / (input_dim + output_dim)) + + ##### Activation Functions ##### + def sigmoid(self, x, derivative=False): + if derivative: + return x * (1 - x) + return 1 / (1 + np.exp(-x)) + + def tanh(self, x, derivative=False): + if derivative: + return 1 - x ** 2 + return np.tanh(x) + + def softmax(self, x): + exp_x = np.exp(x - np.max(x)) + return exp_x / exp_x.sum(axis=0) + + ##### LSTM Network Methods ##### + def reset(self): + self.concat_inputs = {} + + self.hidden_states = {-1: np.zeros((self.hidden_dim, 1))} + self.cell_states = {-1: np.zeros((self.hidden_dim, 1))} + + self.activation_outputs = {} + self.candidate_gates = {} + self.output_gates = {} + self.forget_gates = {} + self.input_gates = {} + self.outputs = {} + + def forward(self, inputs): + self.reset() + + outputs = [] + for t in range(len(inputs)): + self.concat_inputs[t] = np.concatenate((self.hidden_states[t - 1], inputs[t])) + + self.forget_gates[t] = self.sigmoid(np.dot(self.wf, self.concat_inputs[t]) + self.bf) + self.input_gates[t] = self.sigmoid(np.dot(self.wi, self.concat_inputs[t]) + self.bi) + self.candidate_gates[t] = self.tanh(np.dot(self.wc, self.concat_inputs[t]) + self.bc) + self.output_gates[t] = self.sigmoid(np.dot(self.wo, self.concat_inputs[t]) + self.bo) + + self.cell_states[t] = self.forget_gates[t] * self.cell_states[t - 1] + self.input_gates[t] * self.candidate_gates[t] + self.hidden_states[t] = self.output_gates[t] * self.tanh(self.cell_states[t]) + + outputs.append(np.dot(self.wy, self.hidden_states[t]) + self.by) + + return outputs + + def backward(self, errors, inputs): + d_wf, d_bf = 0, 0 + d_wi, d_bi = 0, 0 + d_wc, d_bc = 0, 0 + d_wo, d_bo = 0, 0 + d_wy, d_by = 0, 0 + + dh_next, dc_next = np.zeros_like(self.hidden_states[0]), np.zeros_like(self.cell_states[0]) + for t in reversed(range(len(inputs))): + error = errors[t] + + # Final Gate Weights and Biases Errors + d_wy += np.dot(error, self.hidden_states[t].T) + d_by += error + + # Hidden State Error + d_hs 
= np.dot(self.wy.T, error) + dh_next + + # Output Gate Weights and Biases Errors + d_o = self.tanh(self.cell_states[t]) * d_hs * self.sigmoid(self.output_gates[t], derivative=True) + d_wo += np.dot(d_o, inputs[t].T) + d_bo += d_o + + # Cell State Error + d_cs = self.tanh(self.tanh(self.cell_states[t]), derivative=True) * self.output_gates[t] * d_hs + dc_next + + # Forget Gate Weights and Biases Errors + d_f = d_cs * self.cell_states[t - 1] * self.sigmoid(self.forget_gates[t], derivative=True) + d_wf += np.dot(d_f, inputs[t].T) + d_bf += d_f + + # Input Gate Weights and Biases Errors + d_i = d_cs * self.candidate_gates[t] * self.sigmoid(self.input_gates[t], derivative=True) + d_wi += np.dot(d_i, inputs[t].T) + d_bi += d_i + + # Candidate Gate Weights and Biases Errors + d_c = d_cs * self.input_gates[t] * self.tanh(self.candidate_gates[t], derivative=True) + d_wc += np.dot(d_c, inputs[t].T) + d_bc += d_c + + # Concatenated Input Error (Sum of Error at Each Gate!) + d_z = np.dot(self.wf.T, d_f) + np.dot(self.wi.T, d_i) + np.dot(self.wc.T, d_c) + np.dot(self.wo.T, d_o) + + # Error of Hidden State and Cell State at Next Time Step + dh_next = d_z[:self.hidden_dim, :] + dc_next = self.forget_gates[t] * d_cs + + for d_ in (d_wf, d_bf, d_wi, d_bi, d_wc, d_bc, d_wo, d_bo, d_wy, d_by): + np.clip(d_, -1, 1, out=d_) + + self.wf += d_wf * self.lr + self.bf += d_bf * self.lr + + self.wi += d_wi * self.lr + self.bi += d_bi * self.lr + + self.wc += d_wc * self.lr + self.bc += d_bc * self.lr + + self.wo += d_wo * self.lr + self.bo += d_bo * self.lr + + self.wy += d_wy * self.lr + self.by += d_by * self.lr + + def train(self): + inputs = [self.one_hot_encode(char) for char in self.train_X] + + for _ in tqdm(range(self.epochs)): + predictions = self.forward(inputs) + + errors = [] + for t in range(len(predictions)): + errors.append(-self.softmax(predictions[t])) + errors[-1][self.char_to_idx[self.train_y[t]]] += 1 + + self.backward(errors, self.concat_inputs) + + def test(self): + accuracy = 0 + probabilities = self.forward([self.one_hot_encode(char) for char in self.train_X]) + + output = '' + for t in range(len(self.train_y)): + prediction = self.idx_to_char[np.random.choice(range(self.char_size), p=self.softmax(probabilities[t].reshape(-1)))] + + output += prediction + + if prediction == self.train_y[t]: + accuracy += 1 + + print(f'Ground Truth:\n{self.train_y}\n') + print(f'Predictions:\n{output}\n') + + print(f'Accuracy: {round(accuracy * 100 / len(self.train_X), 2)}%') + +##### Data ##### +data = """Long Short-Term Memory (LSTM) networks are a type of recurrent neural network (RNN) capable of learning order dependence in sequence prediction problems. This behavior is required in complex problem domains like machine translation, speech recognition, and more. LSTMs are well-suited to classifying, processing, and making predictions based on time series data, since there can be lags of unknown duration between important events in a time series. LSTMs were introduced by Hochreiter and Schmidhuber in 1997, and were refined and popularized by many people in following work. They work by maintaining a cell state that is updated by gates: the forget gate, the input gate, and the output gate. 
These gates control the flow of information, allowing the network to remember or forget information as needed.""" + +# Initialize Network +lstm = LSTM(data=data, hidden_dim=25, epochs=1000, lr=0.05) + +##### Training ##### +lstm.train() + +##### Testing ##### +lstm.test() \ No newline at end of file From a2222f14c6267a0e69f5f99267eb05527991dfc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 00:40:08 +0530 Subject: [PATCH 02/27] commented the testing lines of code --- neural_network/lstm.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index f5dc4c75400d..d638a59d966e 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -221,10 +221,14 @@ def test(self): data = """Long Short-Term Memory (LSTM) networks are a type of recurrent neural network (RNN) capable of learning order dependence in sequence prediction problems. This behavior is required in complex problem domains like machine translation, speech recognition, and more. LSTMs are well-suited to classifying, processing, and making predictions based on time series data, since there can be lags of unknown duration between important events in a time series. LSTMs were introduced by Hochreiter and Schmidhuber in 1997, and were refined and popularized by many people in following work. They work by maintaining a cell state that is updated by gates: the forget gate, the input gate, and the output gate. These gates control the flow of information, allowing the network to remember or forget information as needed.""" # Initialize Network -lstm = LSTM(data=data, hidden_dim=25, epochs=1000, lr=0.05) +# lstm = LSTM(data=data, hidden_dim=25, epochs=1000, lr=0.05) ##### Training ##### -lstm.train() +# lstm.train() ##### Testing ##### -lstm.test() \ No newline at end of file +# lstm.test() + + + +# testing can be done by uncommenting the above lines of code. \ No newline at end of file From f054733fa3fac0b1fb676cb2b6978ca12b33100f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 00:57:40 +0530 Subject: [PATCH 03/27] modified code to meet contribution.md file guidelines --- neural_network/lstm.py | 111 +++++++++++++++++++++++++++++++++++------ 1 file changed, 97 insertions(+), 14 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index d638a59d966e..4506ca9df493 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -1,3 +1,15 @@ +""" +Name - - LSTM - Long Short-Term Memory Network For Sequence Prediction +Goal - - Predict sequences of data +Detail: Total 3 layers neural network +* Input layer +* LSTM layer +* Output layer +Author: Shashank Tyagi +Github: LEVII007 +Date: [Current Date] +""" + ##### Explanation ##### # This script implements a Long Short-Term Memory (LSTM) network to learn and predict sequences of characters. # It uses numpy for numerical operations and tqdm for progress visualization. @@ -22,14 +34,20 @@ # The script initializes the LSTM network with specified hyperparameters and trains it on the input data. # Finally, it tests the trained network and prints the accuracy of the predictions. -##### Data ##### - ##### Imports ##### from tqdm import tqdm import numpy as np class LSTM: - def __init__(self, data, hidden_dim=25, epochs=1000, lr=0.05): + def __init__(self, data: str, hidden_dim: int = 25, epochs: int = 1000, lr: float = 0.05) -> None: + """ + Initialize the LSTM network with the given data and hyperparameters. 
+ + :param data: The input data as a string. + :param hidden_dim: The number of hidden units in the LSTM layer. + :param epochs: The number of training epochs. + :param lr: The learning rate. + """ self.data = data.lower() self.hidden_dim = hidden_dim self.epochs = epochs @@ -48,12 +66,21 @@ def __init__(self, data, hidden_dim=25, epochs=1000, lr=0.05): self.initialize_weights() ##### Helper Functions ##### - def one_hot_encode(self, char): + def one_hot_encode(self, char: str) -> np.ndarray: + """ + One-hot encode a character. + + :param char: The character to encode. + :return: A one-hot encoded vector. + """ vector = np.zeros((self.char_size, 1)) vector[self.char_to_idx[char]] = 1 return vector - def initialize_weights(self): + def initialize_weights(self) -> None: + """ + Initialize the weights and biases for the LSTM network. + """ self.wf = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) self.bf = np.zeros((self.hidden_dim, 1)) @@ -69,26 +96,56 @@ def initialize_weights(self): self.wy = self.init_weights(self.hidden_dim, self.char_size) self.by = np.zeros((self.char_size, 1)) - def init_weights(self, input_dim, output_dim): + def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: + """ + Initialize weights with random values. + + :param input_dim: The input dimension. + :param output_dim: The output dimension. + :return: A matrix of initialized weights. + """ return np.random.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt(6 / (input_dim + output_dim)) ##### Activation Functions ##### - def sigmoid(self, x, derivative=False): + def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: + """ + Sigmoid activation function. + + :param x: The input array. + :param derivative: Whether to compute the derivative. + :return: The sigmoid activation or its derivative. + """ if derivative: return x * (1 - x) return 1 / (1 + np.exp(-x)) - def tanh(self, x, derivative=False): + def tanh(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: + """ + Tanh activation function. + + :param x: The input array. + :param derivative: Whether to compute the derivative. + :return: The tanh activation or its derivative. + """ if derivative: return 1 - x ** 2 return np.tanh(x) - def softmax(self, x): + def softmax(self, x: np.ndarray) -> np.ndarray: + """ + Softmax activation function. + + :param x: The input array. + :return: The softmax activation. + """ exp_x = np.exp(x - np.max(x)) return exp_x / exp_x.sum(axis=0) ##### LSTM Network Methods ##### - def reset(self): + def reset(self) -> None: + """ + Reset the LSTM network states. + """ self.concat_inputs = {} self.hidden_states = {-1: np.zeros((self.hidden_dim, 1))} @@ -101,7 +158,13 @@ def reset(self): self.input_gates = {} self.outputs = {} - def forward(self, inputs): + def forward(self, inputs: list) -> list: + """ + Perform forward propagation through the LSTM network. + + :param inputs: The input data as a list of one-hot encoded vectors. + :return: The outputs of the network. + """ self.reset() outputs = [] @@ -120,7 +183,13 @@ def forward(self, inputs): return outputs - def backward(self, errors, inputs): + def backward(self, errors: list, inputs: list) -> None: + """ + Perform backpropagation through time to compute gradients and update weights. + + :param errors: The errors at each time step. + :param inputs: The input data as a list of one-hot encoded vectors. 
+ """ d_wf, d_bf = 0, 0 d_wi, d_bi = 0, 0 d_wc, d_bc = 0, 0 @@ -186,7 +255,10 @@ def backward(self, errors, inputs): self.wy += d_wy * self.lr self.by += d_by * self.lr - def train(self): + def train(self) -> None: + """ + Train the LSTM network on the input data. + """ inputs = [self.one_hot_encode(char) for char in self.train_X] for _ in tqdm(range(self.epochs)): @@ -199,7 +271,10 @@ def train(self): self.backward(errors, self.concat_inputs) - def test(self): + def test(self) -> None: + """ + Test the trained LSTM network on the input data and print the accuracy. + """ accuracy = 0 probabilities = self.forward([self.one_hot_encode(char) for char in self.train_X]) @@ -229,6 +304,14 @@ def test(self): ##### Testing ##### # lstm.test() +if __name__ == "__main__": + # Initialize Network + # lstm = LSTM(data=data, hidden_dim=25, epochs=1000, lr=0.05) + + ##### Training ##### + # lstm.train() + ##### Testing ##### + # lstm.test() # testing can be done by uncommenting the above lines of code. \ No newline at end of file From 91c8173691ad055270245e1700635c24d9d6bd65 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 04:46:21 +0000 Subject: [PATCH 04/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_network/lstm.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 4506ca9df493..638db6fec26b 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -42,7 +42,7 @@ class LSTM: def __init__(self, data: str, hidden_dim: int = 25, epochs: int = 1000, lr: float = 0.05) -> None: """ Initialize the LSTM network with the given data and hyperparameters. - + :param data: The input data as a string. :param hidden_dim: The number of hidden units in the LSTM layer. :param epochs: The number of training epochs. @@ -69,7 +69,7 @@ def __init__(self, data: str, hidden_dim: int = 25, epochs: int = 1000, lr: floa def one_hot_encode(self, char: str) -> np.ndarray: """ One-hot encode a character. - + :param char: The character to encode. :return: A one-hot encoded vector. """ @@ -99,7 +99,7 @@ def initialize_weights(self) -> None: def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: """ Initialize weights with random values. - + :param input_dim: The input dimension. :param output_dim: The output dimension. :return: A matrix of initialized weights. @@ -110,7 +110,7 @@ def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: """ Sigmoid activation function. - + :param x: The input array. :param derivative: Whether to compute the derivative. :return: The sigmoid activation or its derivative. @@ -122,7 +122,7 @@ def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: def tanh(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: """ Tanh activation function. - + :param x: The input array. :param derivative: Whether to compute the derivative. :return: The tanh activation or its derivative. @@ -134,7 +134,7 @@ def tanh(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: def softmax(self, x: np.ndarray) -> np.ndarray: """ Softmax activation function. - + :param x: The input array. :return: The softmax activation. 
""" @@ -161,7 +161,7 @@ def reset(self) -> None: def forward(self, inputs: list) -> list: """ Perform forward propagation through the LSTM network. - + :param inputs: The input data as a list of one-hot encoded vectors. :return: The outputs of the network. """ @@ -186,7 +186,7 @@ def forward(self, inputs: list) -> list: def backward(self, errors: list, inputs: list) -> None: """ Perform backpropagation through time to compute gradients and update weights. - + :param errors: The errors at each time step. :param inputs: The input data as a list of one-hot encoded vectors. """ @@ -224,7 +224,7 @@ def backward(self, errors: list, inputs: list) -> None: d_i = d_cs * self.candidate_gates[t] * self.sigmoid(self.input_gates[t], derivative=True) d_wi += np.dot(d_i, inputs[t].T) d_bi += d_i - + # Candidate Gate Weights and Biases Errors d_c = d_cs * self.input_gates[t] * self.tanh(self.candidate_gates[t], derivative=True) d_wc += np.dot(d_c, inputs[t].T) @@ -270,7 +270,7 @@ def train(self) -> None: errors[-1][self.char_to_idx[self.train_y[t]]] += 1 self.backward(errors, self.concat_inputs) - + def test(self) -> None: """ Test the trained LSTM network on the input data and print the accuracy. @@ -289,7 +289,7 @@ def test(self) -> None: print(f'Ground Truth:\n{self.train_y}\n') print(f'Predictions:\n{output}\n') - + print(f'Accuracy: {round(accuracy * 100 / len(self.train_X), 2)}%') ##### Data ##### @@ -314,4 +314,4 @@ def test(self) -> None: ##### Testing ##### # lstm.test() -# testing can be done by uncommenting the above lines of code. \ No newline at end of file +# testing can be done by uncommenting the above lines of code. From 0fbb04b070e16a0239a1ac47d118503b5050a033 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 10:15:47 +0530 Subject: [PATCH 05/27] Add LSTM algorithm implementation in neural network section --- neural_network/lstm.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 638db6fec26b..ed6f6064ca57 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -7,9 +7,11 @@ * Output layer Author: Shashank Tyagi Github: LEVII007 -Date: [Current Date] +link : https://www.kaggle.com/code/navjindervirdee/lstm-neural-network-from-scratch """ + + ##### Explanation ##### # This script implements a Long Short-Term Memory (LSTM) network to learn and predict sequences of characters. # It uses numpy for numerical operations and tqdm for progress visualization. From 4c2ec80aecb165aab53450c7740c9d2d029d941f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 10:29:20 +0530 Subject: [PATCH 06/27] shorten the individual lines --- neural_network/lstm.py | 230 +++++++++++++++++++++-------------------- 1 file changed, 117 insertions(+), 113 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index ed6f6064ca57..21ffe3490c5f 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -10,41 +10,46 @@ link : https://www.kaggle.com/code/navjindervirdee/lstm-neural-network-from-scratch """ - - ##### Explanation ##### -# This script implements a Long Short-Term Memory (LSTM) network to learn and predict sequences of characters. +# This script implements a Long Short-Term Memory (LSTM) network to learn +# and predict sequences of characters. # It uses numpy for numerical operations and tqdm for progress visualization. -# The data is a paragraph about LSTM, converted to lowercase and split into characters. 
-# Each character is one-hot encoded for training. +# The data is a paragraph about LSTM, converted to lowercase and split into +# characters. Each character is one-hot encoded for training. -# The LSTM class initializes weights and biases for the forget, input, candidate, and output gates. -# It also initializes weights and biases for the final output layer. +# The LSTM class initializes weights and biases for the forget, input, candidate, +# and output gates. It also initializes weights and biases for the final output layer. -# The forward method performs forward propagation through the LSTM network, computing hidden and cell states. -# It uses sigmoid and tanh activation functions for the gates and cell states. +# The forward method performs forward propagation through the LSTM network, +# computing hidden and cell states. It uses sigmoid and tanh activation +# functions for the gates and cell states. -# The backward method performs backpropagation through time, computing gradients for the weights and biases. -# It updates the weights and biases using the computed gradients and the learning rate. +# The backward method performs backpropagation through time, computing gradients +# for the weights and biases. It updates the weights and biases using +# the computed gradients and the learning rate. -# The train method trains the LSTM network on the input data for a specified number of epochs. -# It uses one-hot encoded inputs and computes errors using the softmax function. +# The train method trains the LSTM network on the input data for a specified +# number of epochs. It uses one-hot encoded inputs and computes errors +# using the softmax function. -# The test method evaluates the trained LSTM network on the input data, computing accuracy based on predictions. +# The test method evaluates the trained LSTM network on the input data, +# computing accuracy based on predictions. -# The script initializes the LSTM network with specified hyperparameters and trains it on the input data. -# Finally, it tests the trained network and prints the accuracy of the predictions. +# The script initializes the LSTM network with specified hyperparameters +# and trains it on the input data. Finally, it tests the trained network +# and prints the accuracy of the predictions. ##### Imports ##### from tqdm import tqdm import numpy as np class LSTM: - def __init__(self, data: str, hidden_dim: int = 25, epochs: int = 1000, lr: float = 0.05) -> None: + def __init__(self, data: str, hidden_dim: int = 25, + epochs: int = 1000, lr: float = 0.05) -> None: """ Initialize the LSTM network with the given data and hyperparameters. - + :param data: The input data as a string. :param hidden_dim: The number of hidden units in the LSTM layer. :param epochs: The number of training epochs. @@ -71,7 +76,7 @@ def __init__(self, data: str, hidden_dim: int = 25, epochs: int = 1000, lr: floa def one_hot_encode(self, char: str) -> np.ndarray: """ One-hot encode a character. - + :param char: The character to encode. :return: A one-hot encoded vector. """ @@ -83,16 +88,20 @@ def initialize_weights(self) -> None: """ Initialize the weights and biases for the LSTM network. 
""" - self.wf = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + self.wf = self.init_weights(self.char_size + self.hidden_dim, + self.hidden_dim) self.bf = np.zeros((self.hidden_dim, 1)) - self.wi = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + self.wi = self.init_weights(self.char_size + self.hidden_dim, + self.hidden_dim) self.bi = np.zeros((self.hidden_dim, 1)) - self.wc = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + self.wc = self.init_weights(self.char_size + self.hidden_dim, + self.hidden_dim) self.bc = np.zeros((self.hidden_dim, 1)) - self.wo = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + self.wo = self.init_weights(self.char_size + self.hidden_dim, + self.hidden_dim) self.bo = np.zeros((self.hidden_dim, 1)) self.wy = self.init_weights(self.hidden_dim, self.char_size) @@ -101,18 +110,19 @@ def initialize_weights(self) -> None: def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: """ Initialize weights with random values. - + :param input_dim: The input dimension. :param output_dim: The output dimension. :return: A matrix of initialized weights. """ - return np.random.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt(6 / (input_dim + output_dim)) + return np.random.uniform(-1, 1, (output_dim, input_dim)) * \ + np.sqrt(6 / (input_dim + output_dim)) ##### Activation Functions ##### def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: """ Sigmoid activation function. - + :param x: The input array. :param derivative: Whether to compute the derivative. :return: The sigmoid activation or its derivative. @@ -124,7 +134,7 @@ def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: def tanh(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: """ Tanh activation function. - + :param x: The input array. :param derivative: Whether to compute the derivative. :return: The tanh activation or its derivative. @@ -136,7 +146,7 @@ def tanh(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: def softmax(self, x: np.ndarray) -> np.ndarray: """ Softmax activation function. - + :param x: The input array. :return: The softmax activation. """ @@ -163,7 +173,7 @@ def reset(self) -> None: def forward(self, inputs: list) -> list: """ Perform forward propagation through the LSTM network. - + :param inputs: The input data as a list of one-hot encoded vectors. :return: The outputs of the network. 
""" @@ -171,15 +181,22 @@ def forward(self, inputs: list) -> list: outputs = [] for t in range(len(inputs)): - self.concat_inputs[t] = np.concatenate((self.hidden_states[t - 1], inputs[t])) - - self.forget_gates[t] = self.sigmoid(np.dot(self.wf, self.concat_inputs[t]) + self.bf) - self.input_gates[t] = self.sigmoid(np.dot(self.wi, self.concat_inputs[t]) + self.bi) - self.candidate_gates[t] = self.tanh(np.dot(self.wc, self.concat_inputs[t]) + self.bc) - self.output_gates[t] = self.sigmoid(np.dot(self.wo, self.concat_inputs[t]) + self.bo) - - self.cell_states[t] = self.forget_gates[t] * self.cell_states[t - 1] + self.input_gates[t] * self.candidate_gates[t] - self.hidden_states[t] = self.output_gates[t] * self.tanh(self.cell_states[t]) + self.concat_inputs[t] = np.concatenate( + (self.hidden_states[t - 1], inputs[t])) + + self.forget_gates[t] = self.sigmoid(np.dot(self.wf, + self.concat_inputs[t]) + self.bf) + self.input_gates[t] = self.sigmoid(np.dot(self.wi, + self.concat_inputs[t]) + self.bi) + self.candidate_gates[t] = self.tanh(np.dot(self.wc, + self.concat_inputs[t]) + self.bc) + self.output_gates[t] = self.sigmoid(np.dot(self.wo, + self.concat_inputs[t]) + self.bo) + + self.cell_states[t] = self.forget_gates[t] * self.cell_states[t - 1] + \ + self.input_gates[t] * self.candidate_gates[t] + self.hidden_states[t] = self.output_gates[t] * \ + self.tanh(self.cell_states[t]) outputs.append(np.dot(self.wy, self.hidden_states[t]) + self.by) @@ -188,7 +205,7 @@ def forward(self, inputs: list) -> list: def backward(self, errors: list, inputs: list) -> None: """ Perform backpropagation through time to compute gradients and update weights. - + :param errors: The errors at each time step. :param inputs: The input data as a list of one-hot encoded vectors. 
""" @@ -198,7 +215,8 @@ def backward(self, errors: list, inputs: list) -> None: d_wo, d_bo = 0, 0 d_wy, d_by = 0, 0 - dh_next, dc_next = np.zeros_like(self.hidden_states[0]), np.zeros_like(self.cell_states[0]) + dh_next, dc_next = np.zeros_like(self.hidden_states[0]), \ + np.zeros_like(self.cell_states[0]) for t in reversed(range(len(inputs))): error = errors[t] @@ -210,110 +228,96 @@ def backward(self, errors: list, inputs: list) -> None: d_hs = np.dot(self.wy.T, error) + dh_next # Output Gate Weights and Biases Errors - d_o = self.tanh(self.cell_states[t]) * d_hs * self.sigmoid(self.output_gates[t], derivative=True) + d_o = self.tanh(self.cell_states[t]) * d_hs * \ + self.sigmoid(self.output_gates[t], derivative=True) d_wo += np.dot(d_o, inputs[t].T) d_bo += d_o # Cell State Error - d_cs = self.tanh(self.tanh(self.cell_states[t]), derivative=True) * self.output_gates[t] * d_hs + dc_next + d_cs = self.tanh(self.tanh(self.cell_states[t]), + derivative=True) * self.output_gates[t] * d_hs + dc_next # Forget Gate Weights and Biases Errors - d_f = d_cs * self.cell_states[t - 1] * self.sigmoid(self.forget_gates[t], derivative=True) + d_f = d_cs * self.cell_states[t - 1] * \ + self.sigmoid(self.forget_gates[t], derivative=True) d_wf += np.dot(d_f, inputs[t].T) d_bf += d_f # Input Gate Weights and Biases Errors - d_i = d_cs * self.candidate_gates[t] * self.sigmoid(self.input_gates[t], derivative=True) + d_i = d_cs * self.candidate_gates[t] * \ + self.sigmoid(self.input_gates[t], derivative=True) d_wi += np.dot(d_i, inputs[t].T) d_bi += d_i # Candidate Gate Weights and Biases Errors - d_c = d_cs * self.input_gates[t] * self.tanh(self.candidate_gates[t], derivative=True) + d_c = d_cs * self.input_gates[t] * self.tanh(self.candidate_gates[t], + derivative=True) d_wc += np.dot(d_c, inputs[t].T) d_bc += d_c - # Concatenated Input Error (Sum of Error at Each Gate!) - d_z = np.dot(self.wf.T, d_f) + np.dot(self.wi.T, d_i) + np.dot(self.wc.T, d_c) + np.dot(self.wo.T, d_o) - - # Error of Hidden State and Cell State at Next Time Step - dh_next = d_z[:self.hidden_dim, :] - dc_next = self.forget_gates[t] * d_cs - - for d_ in (d_wf, d_bf, d_wi, d_bi, d_wc, d_bc, d_wo, d_bo, d_wy, d_by): - np.clip(d_, -1, 1, out=d_) - - self.wf += d_wf * self.lr - self.bf += d_bf * self.lr - - self.wi += d_wi * self.lr - self.bi += d_bi * self.lr - - self.wc += d_wc * self.lr - self.bc += d_bc * self.lr + # Update the next hidden and cell state errors + dh_next = np.dot(self.wf.T, d_f) + np.dot(self.wi.T, d_i) + \ + np.dot(self.wo.T, d_o) + np.dot(self.wc.T, d_c) + dc_next = d_cs * self.forget_gates[t] - self.wo += d_wo * self.lr - self.bo += d_bo * self.lr + # Apply gradients to weights and biases + for param, grad in zip([self.wf, self.wi, self.wc, self.wo, self.wy], + [d_wf, d_wi, d_wc, d_wo, d_wy]): + param -= self.lr * grad - self.wy += d_wy * self.lr - self.by += d_by * self.lr + for param, grad in zip([self.bf, self.bi, self.bc, self.bo, self.by], + [d_bf, d_bi, d_bc, d_bo, d_by]): + param -= self.lr * grad def train(self) -> None: """ - Train the LSTM network on the input data. + Train the LSTM network on the input data for a specified number of epochs. 
""" - inputs = [self.one_hot_encode(char) for char in self.train_X] + for epoch in tqdm(range(self.epochs)): + inputs = [self.one_hot_encode(char) for char in self.train_X] + targets = [self.one_hot_encode(char) for char in self.train_y] - for _ in tqdm(range(self.epochs)): - predictions = self.forward(inputs) + # Forward pass + outputs = self.forward(inputs) - errors = [] - for t in range(len(predictions)): - errors.append(-self.softmax(predictions[t])) - errors[-1][self.char_to_idx[self.train_y[t]]] += 1 + # Compute error at each time step + errors = [output - target for output, target in zip(outputs, targets)] - self.backward(errors, self.concat_inputs) + # Backward pass and weight updates + self.backward(errors, inputs) - def test(self) -> None: + def predict(self, inputs: list) -> str: """ - Test the trained LSTM network on the input data and print the accuracy. + Predict the next character in the sequence. + + :param inputs: The input data as a list of one-hot encoded vectors. + :return: The predicted character. """ - accuracy = 0 - probabilities = self.forward([self.one_hot_encode(char) for char in self.train_X]) - - output = '' - for t in range(len(self.train_y)): - prediction = self.idx_to_char[np.random.choice(range(self.char_size), p=self.softmax(probabilities[t].reshape(-1)))] - - output += prediction - - if prediction == self.train_y[t]: - accuracy += 1 - - print(f'Ground Truth:\n{self.train_y}\n') - print(f'Predictions:\n{output}\n') - - print(f'Accuracy: {round(accuracy * 100 / len(self.train_X), 2)}%') + output = self.forward(inputs)[-1] + return self.idx_to_char[np.argmax(self.softmax(output))] -##### Data ##### -data = """Long Short-Term Memory (LSTM) networks are a type of recurrent neural network (RNN) capable of learning order dependence in sequence prediction problems. This behavior is required in complex problem domains like machine translation, speech recognition, and more. LSTMs are well-suited to classifying, processing, and making predictions based on time series data, since there can be lags of unknown duration between important events in a time series. LSTMs were introduced by Hochreiter and Schmidhuber in 1997, and were refined and popularized by many people in following work. They work by maintaining a cell state that is updated by gates: the forget gate, the input gate, and the output gate. These gates control the flow of information, allowing the network to remember or forget information as needed.""" - -# Initialize Network -# lstm = LSTM(data=data, hidden_dim=25, epochs=1000, lr=0.05) + def test(self) -> None: + """ + Test the LSTM network on the input data and compute accuracy. + """ + inputs = [self.one_hot_encode(char) for char in self.train_X] + correct_predictions = sum(self.idx_to_char[np.argmax(self.softmax(output))] == target + for output, target in zip(self.forward(inputs), self.train_y)) -##### Training ##### -# lstm.train() + accuracy = (correct_predictions / len(self.train_y)) * 100 + print(f'Accuracy: {accuracy:.2f}%') -##### Testing ##### -# lstm.test() if __name__ == "__main__": - # Initialize Network - # lstm = LSTM(data=data, hidden_dim=25, epochs=1000, lr=0.05) - - ##### Training ##### - # lstm.train() - - ##### Testing ##### - # lstm.test() - -# testing can be done by uncommenting the above lines of code. + # Define the input data and hyperparameters + data = "LSTM Neural Networks are designed to handle sequences of data." 
+ hidden_dim = 50 + epochs = 1000 + lr = 0.01 + + # Initialize and train the LSTM network + lstm = LSTM(data, hidden_dim, epochs, lr) + lstm.train() + + # Test the LSTM network and compute accuracy + lstm.test() From 39fd7135430ff3bffd5c40dda424ad075076465c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 05:08:45 +0000 Subject: [PATCH 07/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_network/lstm.py | 177 ++++++++++++++++++++++++----------------- 1 file changed, 106 insertions(+), 71 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 21ffe3490c5f..f99bb996563d 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -11,45 +11,47 @@ """ ##### Explanation ##### -# This script implements a Long Short-Term Memory (LSTM) network to learn +# This script implements a Long Short-Term Memory (LSTM) network to learn # and predict sequences of characters. # It uses numpy for numerical operations and tqdm for progress visualization. -# The data is a paragraph about LSTM, converted to lowercase and split into +# The data is a paragraph about LSTM, converted to lowercase and split into # characters. Each character is one-hot encoded for training. -# The LSTM class initializes weights and biases for the forget, input, candidate, +# The LSTM class initializes weights and biases for the forget, input, candidate, # and output gates. It also initializes weights and biases for the final output layer. -# The forward method performs forward propagation through the LSTM network, -# computing hidden and cell states. It uses sigmoid and tanh activation +# The forward method performs forward propagation through the LSTM network, +# computing hidden and cell states. It uses sigmoid and tanh activation # functions for the gates and cell states. -# The backward method performs backpropagation through time, computing gradients -# for the weights and biases. It updates the weights and biases using +# The backward method performs backpropagation through time, computing gradients +# for the weights and biases. It updates the weights and biases using # the computed gradients and the learning rate. -# The train method trains the LSTM network on the input data for a specified -# number of epochs. It uses one-hot encoded inputs and computes errors +# The train method trains the LSTM network on the input data for a specified +# number of epochs. It uses one-hot encoded inputs and computes errors # using the softmax function. -# The test method evaluates the trained LSTM network on the input data, +# The test method evaluates the trained LSTM network on the input data, # computing accuracy based on predictions. -# The script initializes the LSTM network with specified hyperparameters -# and trains it on the input data. Finally, it tests the trained network +# The script initializes the LSTM network with specified hyperparameters +# and trains it on the input data. Finally, it tests the trained network # and prints the accuracy of the predictions. ##### Imports ##### from tqdm import tqdm import numpy as np + class LSTM: - def __init__(self, data: str, hidden_dim: int = 25, - epochs: int = 1000, lr: float = 0.05) -> None: + def __init__( + self, data: str, hidden_dim: int = 25, epochs: int = 1000, lr: float = 0.05 + ) -> None: """ Initialize the LSTM network with the given data and hyperparameters. 
- + :param data: The input data as a string. :param hidden_dim: The number of hidden units in the LSTM layer. :param epochs: The number of training epochs. @@ -63,7 +65,7 @@ def __init__(self, data: str, hidden_dim: int = 25, self.chars = set(self.data) self.data_size, self.char_size = len(self.data), len(self.chars) - print(f'Data size: {self.data_size}, Char Size: {self.char_size}') + print(f"Data size: {self.data_size}, Char Size: {self.char_size}") self.char_to_idx = {c: i for i, c in enumerate(self.chars)} self.idx_to_char = {i: c for i, c in enumerate(self.chars)} @@ -76,7 +78,7 @@ def __init__(self, data: str, hidden_dim: int = 25, def one_hot_encode(self, char: str) -> np.ndarray: """ One-hot encode a character. - + :param char: The character to encode. :return: A one-hot encoded vector. """ @@ -88,20 +90,16 @@ def initialize_weights(self) -> None: """ Initialize the weights and biases for the LSTM network. """ - self.wf = self.init_weights(self.char_size + self.hidden_dim, - self.hidden_dim) + self.wf = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) self.bf = np.zeros((self.hidden_dim, 1)) - self.wi = self.init_weights(self.char_size + self.hidden_dim, - self.hidden_dim) + self.wi = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) self.bi = np.zeros((self.hidden_dim, 1)) - self.wc = self.init_weights(self.char_size + self.hidden_dim, - self.hidden_dim) + self.wc = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) self.bc = np.zeros((self.hidden_dim, 1)) - self.wo = self.init_weights(self.char_size + self.hidden_dim, - self.hidden_dim) + self.wo = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) self.bo = np.zeros((self.hidden_dim, 1)) self.wy = self.init_weights(self.hidden_dim, self.char_size) @@ -110,19 +108,20 @@ def initialize_weights(self) -> None: def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: """ Initialize weights with random values. - + :param input_dim: The input dimension. :param output_dim: The output dimension. :return: A matrix of initialized weights. """ - return np.random.uniform(-1, 1, (output_dim, input_dim)) * \ - np.sqrt(6 / (input_dim + output_dim)) + return np.random.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt( + 6 / (input_dim + output_dim) + ) ##### Activation Functions ##### def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: """ Sigmoid activation function. - + :param x: The input array. :param derivative: Whether to compute the derivative. :return: The sigmoid activation or its derivative. @@ -134,19 +133,19 @@ def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: def tanh(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: """ Tanh activation function. - + :param x: The input array. :param derivative: Whether to compute the derivative. :return: The tanh activation or its derivative. """ if derivative: - return 1 - x ** 2 + return 1 - x**2 return np.tanh(x) def softmax(self, x: np.ndarray) -> np.ndarray: """ Softmax activation function. - + :param x: The input array. :return: The softmax activation. """ @@ -173,7 +172,7 @@ def reset(self) -> None: def forward(self, inputs: list) -> list: """ Perform forward propagation through the LSTM network. - + :param inputs: The input data as a list of one-hot encoded vectors. :return: The outputs of the network. 
""" @@ -182,21 +181,29 @@ def forward(self, inputs: list) -> list: outputs = [] for t in range(len(inputs)): self.concat_inputs[t] = np.concatenate( - (self.hidden_states[t - 1], inputs[t])) - - self.forget_gates[t] = self.sigmoid(np.dot(self.wf, - self.concat_inputs[t]) + self.bf) - self.input_gates[t] = self.sigmoid(np.dot(self.wi, - self.concat_inputs[t]) + self.bi) - self.candidate_gates[t] = self.tanh(np.dot(self.wc, - self.concat_inputs[t]) + self.bc) - self.output_gates[t] = self.sigmoid(np.dot(self.wo, - self.concat_inputs[t]) + self.bo) - - self.cell_states[t] = self.forget_gates[t] * self.cell_states[t - 1] + \ - self.input_gates[t] * self.candidate_gates[t] - self.hidden_states[t] = self.output_gates[t] * \ - self.tanh(self.cell_states[t]) + (self.hidden_states[t - 1], inputs[t]) + ) + + self.forget_gates[t] = self.sigmoid( + np.dot(self.wf, self.concat_inputs[t]) + self.bf + ) + self.input_gates[t] = self.sigmoid( + np.dot(self.wi, self.concat_inputs[t]) + self.bi + ) + self.candidate_gates[t] = self.tanh( + np.dot(self.wc, self.concat_inputs[t]) + self.bc + ) + self.output_gates[t] = self.sigmoid( + np.dot(self.wo, self.concat_inputs[t]) + self.bo + ) + + self.cell_states[t] = ( + self.forget_gates[t] * self.cell_states[t - 1] + + self.input_gates[t] * self.candidate_gates[t] + ) + self.hidden_states[t] = self.output_gates[t] * self.tanh( + self.cell_states[t] + ) outputs.append(np.dot(self.wy, self.hidden_states[t]) + self.by) @@ -205,7 +212,7 @@ def forward(self, inputs: list) -> list: def backward(self, errors: list, inputs: list) -> None: """ Perform backpropagation through time to compute gradients and update weights. - + :param errors: The errors at each time step. :param inputs: The input data as a list of one-hot encoded vectors. 
""" @@ -215,8 +222,10 @@ def backward(self, errors: list, inputs: list) -> None: d_wo, d_bo = 0, 0 d_wy, d_by = 0, 0 - dh_next, dc_next = np.zeros_like(self.hidden_states[0]), \ - np.zeros_like(self.cell_states[0]) + dh_next, dc_next = ( + np.zeros_like(self.hidden_states[0]), + np.zeros_like(self.cell_states[0]), + ) for t in reversed(range(len(inputs))): error = errors[t] @@ -228,45 +237,69 @@ def backward(self, errors: list, inputs: list) -> None: d_hs = np.dot(self.wy.T, error) + dh_next # Output Gate Weights and Biases Errors - d_o = self.tanh(self.cell_states[t]) * d_hs * \ - self.sigmoid(self.output_gates[t], derivative=True) + d_o = ( + self.tanh(self.cell_states[t]) + * d_hs + * self.sigmoid(self.output_gates[t], derivative=True) + ) d_wo += np.dot(d_o, inputs[t].T) d_bo += d_o # Cell State Error - d_cs = self.tanh(self.tanh(self.cell_states[t]), - derivative=True) * self.output_gates[t] * d_hs + dc_next + d_cs = ( + self.tanh(self.tanh(self.cell_states[t]), derivative=True) + * self.output_gates[t] + * d_hs + + dc_next + ) # Forget Gate Weights and Biases Errors - d_f = d_cs * self.cell_states[t - 1] * \ - self.sigmoid(self.forget_gates[t], derivative=True) + d_f = ( + d_cs + * self.cell_states[t - 1] + * self.sigmoid(self.forget_gates[t], derivative=True) + ) d_wf += np.dot(d_f, inputs[t].T) d_bf += d_f # Input Gate Weights and Biases Errors - d_i = d_cs * self.candidate_gates[t] * \ - self.sigmoid(self.input_gates[t], derivative=True) + d_i = ( + d_cs + * self.candidate_gates[t] + * self.sigmoid(self.input_gates[t], derivative=True) + ) d_wi += np.dot(d_i, inputs[t].T) d_bi += d_i # Candidate Gate Weights and Biases Errors - d_c = d_cs * self.input_gates[t] * self.tanh(self.candidate_gates[t], - derivative=True) + d_c = ( + d_cs + * self.input_gates[t] + * self.tanh(self.candidate_gates[t], derivative=True) + ) d_wc += np.dot(d_c, inputs[t].T) d_bc += d_c # Update the next hidden and cell state errors - dh_next = np.dot(self.wf.T, d_f) + np.dot(self.wi.T, d_i) + \ - np.dot(self.wo.T, d_o) + np.dot(self.wc.T, d_c) + dh_next = ( + np.dot(self.wf.T, d_f) + + np.dot(self.wi.T, d_i) + + np.dot(self.wo.T, d_o) + + np.dot(self.wc.T, d_c) + ) dc_next = d_cs * self.forget_gates[t] # Apply gradients to weights and biases - for param, grad in zip([self.wf, self.wi, self.wc, self.wo, self.wy], - [d_wf, d_wi, d_wc, d_wo, d_wy]): + for param, grad in zip( + [self.wf, self.wi, self.wc, self.wo, self.wy], + [d_wf, d_wi, d_wc, d_wo, d_wy], + ): param -= self.lr * grad - for param, grad in zip([self.bf, self.bi, self.bc, self.bo, self.by], - [d_bf, d_bi, d_bc, d_bo, d_by]): + for param, grad in zip( + [self.bf, self.bi, self.bc, self.bo, self.by], + [d_bf, d_bi, d_bc, d_bo, d_by], + ): param -= self.lr * grad def train(self) -> None: @@ -289,7 +322,7 @@ def train(self) -> None: def predict(self, inputs: list) -> str: """ Predict the next character in the sequence. - + :param inputs: The input data as a list of one-hot encoded vectors. :return: The predicted character. """ @@ -301,11 +334,13 @@ def test(self) -> None: Test the LSTM network on the input data and compute accuracy. 
""" inputs = [self.one_hot_encode(char) for char in self.train_X] - correct_predictions = sum(self.idx_to_char[np.argmax(self.softmax(output))] == target - for output, target in zip(self.forward(inputs), self.train_y)) + correct_predictions = sum( + self.idx_to_char[np.argmax(self.softmax(output))] == target + for output, target in zip(self.forward(inputs), self.train_y) + ) accuracy = (correct_predictions / len(self.train_y)) * 100 - print(f'Accuracy: {accuracy:.2f}%') + print(f"Accuracy: {accuracy:.2f}%") if __name__ == "__main__": From 3c2da6e2f23e414bb7efb0124a04ce36eccc9b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 10:44:22 +0530 Subject: [PATCH 08/27] shorten the lines --- neural_network/lstm.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index f99bb996563d..4a9d289b012a 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -345,14 +345,14 @@ def test(self) -> None: if __name__ == "__main__": # Define the input data and hyperparameters - data = "LSTM Neural Networks are designed to handle sequences of data." - hidden_dim = 50 - epochs = 1000 - lr = 0.01 + # data = "LSTM Neural Networks are designed to handle sequences of data.This is just rantom test data" + # hidden_dim = 50 + # epochs = 1000 + # lr = 0.01 - # Initialize and train the LSTM network - lstm = LSTM(data, hidden_dim, epochs, lr) - lstm.train() + # # Initialize and train the LSTM network + # lstm = LSTM(data, hidden_dim, epochs, lr) + # lstm.train() - # Test the LSTM network and compute accuracy - lstm.test() + # # Test the LSTM network and compute accuracy + # lstm.test() From 21dab0f1c1c287444bae242ba0f4d118934b10c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 10:55:27 +0530 Subject: [PATCH 09/27] Refactor LSTM input data initialization --- neural_network/lstm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 4a9d289b012a..5c7a1387cecd 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -345,7 +345,7 @@ def test(self) -> None: if __name__ == "__main__": # Define the input data and hyperparameters - # data = "LSTM Neural Networks are designed to handle sequences of data.This is just rantom test data" + data = "LSTM Neural Networks are designed to handle sequences of data.This is just rantom test data" # hidden_dim = 50 # epochs = 1000 # lr = 0.01 From 5a00ca63fc0f3ae507af7611d3bfcbf65e26e6e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 11:37:28 +0530 Subject: [PATCH 10/27] Refactor LSTM network implementation and improve code readability --- neural_network/lstm.py | 202 ++++++++++++++++++++++++----------------- 1 file changed, 117 insertions(+), 85 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 5c7a1387cecd..ae834cdbe9d8 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -7,42 +7,46 @@ * Output layer Author: Shashank Tyagi Github: LEVII007 -link : https://www.kaggle.com/code/navjindervirdee/lstm-neural-network-from-scratch +Date: [Current Date] """ -##### Explanation ##### -# This script implements a Long Short-Term Memory (LSTM) network to learn -# and predict sequences of characters. +#### Explanation ##### +# This script implements a Long Short-Term Memory (LSTM) +# network to learn and predict sequences of characters. 
# It uses numpy for numerical operations and tqdm for progress visualization. -# The data is a paragraph about LSTM, converted to lowercase and split into -# characters. Each character is one-hot encoded for training. +# The data is a paragraph about LSTM, converted to +# lowercase and split into characters. +# Each character is one-hot encoded for training. -# The LSTM class initializes weights and biases for the forget, input, candidate, -# and output gates. It also initializes weights and biases for the final output layer. +# The LSTM class initializes weights and biases for the +# forget, input, candidate, and output gates. +# It also initializes weights and biases for the final output layer. -# The forward method performs forward propagation through the LSTM network, -# computing hidden and cell states. It uses sigmoid and tanh activation -# functions for the gates and cell states. +# The forward method performs forward propagation +# through the LSTM network, computing hidden and cell states. +# It uses sigmoid and tanh activation functions for the gates and cell states. -# The backward method performs backpropagation through time, computing gradients -# for the weights and biases. It updates the weights and biases using -# the computed gradients and the learning rate. +# The backward method performs backpropagation +# through time, computing gradients for the weights and biases. +# It updates the weights and biases using the +# computed gradients and the learning rate. -# The train method trains the LSTM network on the input data for a specified -# number of epochs. It uses one-hot encoded inputs and computes errors -# using the softmax function. +# The train method trains the LSTM network on +# the input data for a specified number of epochs. +# It uses one-hot encoded inputs and computes +# errors using the softmax function. -# The test method evaluates the trained LSTM network on the input data, -# computing accuracy based on predictions. +# The test method evaluates the trained LSTM +# network on the input data, computing accuracy based on predictions. -# The script initializes the LSTM network with specified hyperparameters -# and trains it on the input data. Finally, it tests the trained network -# and prints the accuracy of the predictions. +# The script initializes the LSTM network with +# specified hyperparameters and trains it on the input data. +# Finally, it tests the trained network and prints the accuracy of the predictions. ##### Imports ##### -from tqdm import tqdm import numpy as np +from tqdm import tqdm class LSTM: @@ -68,7 +72,7 @@ def __init__( print(f"Data size: {self.data_size}, Char Size: {self.char_size}") self.char_to_idx = {c: i for i, c in enumerate(self.chars)} - self.idx_to_char = {i: c for i, c in enumerate(self.chars)} + self.idx_to_char = dict(enumerate(self.chars)) self.train_X, self.train_y = self.data[:-1], self.data[1:] @@ -90,30 +94,42 @@ def initialize_weights(self) -> None: """ Initialize the weights and biases for the LSTM network. 
""" - self.wf = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + rng = np.random.default_rng() + self.wf = self.init_weights( + self.char_size + self.hidden_dim, self.hidden_dim, rng + ) self.bf = np.zeros((self.hidden_dim, 1)) - self.wi = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + self.wi = self.init_weights( + self.char_size + self.hidden_dim, self.hidden_dim, rng + ) self.bi = np.zeros((self.hidden_dim, 1)) - self.wc = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + self.wc = self.init_weights( + self.char_size + self.hidden_dim, self.hidden_dim, rng + ) self.bc = np.zeros((self.hidden_dim, 1)) - self.wo = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) + self.wo = self.init_weights( + self.char_size + self.hidden_dim, self.hidden_dim, rng + ) self.bo = np.zeros((self.hidden_dim, 1)) - self.wy = self.init_weights(self.hidden_dim, self.char_size) + self.wy = self.init_weights(self.hidden_dim, self.char_size, rng) self.by = np.zeros((self.char_size, 1)) - def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: + def init_weights( + self, input_dim: int, output_dim: int, rng: np.random.Generator + ) -> np.ndarray: """ Initialize weights with random values. :param input_dim: The input dimension. :param output_dim: The output dimension. + :param rng: The random number generator. :return: A matrix of initialized weights. """ - return np.random.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt( + return rng.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt( 6 / (input_dim + output_dim) ) @@ -280,79 +296,95 @@ def backward(self, errors: list, inputs: list) -> None: d_wc += np.dot(d_c, inputs[t].T) d_bc += d_c - # Update the next hidden and cell state errors - dh_next = ( + # Concatenated Input Error (Sum of Error at Each Gate!) + d_z = ( np.dot(self.wf.T, d_f) + np.dot(self.wi.T, d_i) - + np.dot(self.wo.T, d_o) + np.dot(self.wc.T, d_c) + + np.dot(self.wo.T, d_o) ) - dc_next = d_cs * self.forget_gates[t] - # Apply gradients to weights and biases - for param, grad in zip( - [self.wf, self.wi, self.wc, self.wo, self.wy], - [d_wf, d_wi, d_wc, d_wo, d_wy], - ): - param -= self.lr * grad + # Error of Hidden State and Cell State at Next Time Step + dh_next = d_z[: self.hidden_dim, :] + dc_next = self.forget_gates[t] * d_cs - for param, grad in zip( - [self.bf, self.bi, self.bc, self.bo, self.by], - [d_bf, d_bi, d_bc, d_bo, d_by], - ): - param -= self.lr * grad + for d_ in (d_wf, d_bf, d_wi, d_bi, d_wc, d_bc, d_wo, d_bo, d_wy, d_by): + np.clip(d_, -1, 1, out=d_) - def train(self) -> None: - """ - Train the LSTM network on the input data for a specified number of epochs. - """ - for epoch in tqdm(range(self.epochs)): - inputs = [self.one_hot_encode(char) for char in self.train_X] - targets = [self.one_hot_encode(char) for char in self.train_y] + self.wf += d_wf * self.lr + self.bf += d_bf * self.lr - # Forward pass - outputs = self.forward(inputs) + self.wi += d_wi * self.lr + self.bi += d_bi * self.lr - # Compute error at each time step - errors = [output - target for output, target in zip(outputs, targets)] + self.wc += d_wc * self.lr + self.bc += d_bc * self.lr - # Backward pass and weight updates - self.backward(errors, inputs) + self.wo += d_wo * self.lr + self.bo += d_bo * self.lr - def predict(self, inputs: list) -> str: - """ - Predict the next character in the sequence. 
+ self.wy += d_wy * self.lr + self.by += d_by * self.lr - :param inputs: The input data as a list of one-hot encoded vectors. - :return: The predicted character. + def train(self) -> None: """ - output = self.forward(inputs)[-1] - return self.idx_to_char[np.argmax(self.softmax(output))] + Train the LSTM network on the input data. + """ + inputs = [self.one_hot_encode(char) for char in self.train_X] + + for _ in tqdm(range(self.epochs)): + predictions = self.forward(inputs) + + errors = [] + for t in range(len(predictions)): + errors.append(-self.softmax(predictions[t])) + errors[-1][self.char_to_idx[self.train_y[t]]] += 1 + + self.backward(errors, self.concat_inputs) def test(self) -> None: """ - Test the LSTM network on the input data and compute accuracy. + Test the trained LSTM network on the input data and print the accuracy. """ - inputs = [self.one_hot_encode(char) for char in self.train_X] - correct_predictions = sum( - self.idx_to_char[np.argmax(self.softmax(output))] == target - for output, target in zip(self.forward(inputs), self.train_y) + accuracy = 0 + probabilities = self.forward( + [self.one_hot_encode(char) for char in self.train_X] ) - accuracy = (correct_predictions / len(self.train_y)) * 100 - print(f"Accuracy: {accuracy:.2f}%") + output = "" + for t in range(len(self.train_y)): + prediction = self.idx_to_char[ + np.random.choice( + range(self.char_size), p=self.softmax(probabilities[t].reshape(-1)) + ) + ] + + output += prediction + + if prediction == self.train_y[t]: + accuracy += 1 + + print(f"Ground Truth:\n{self.train_y}\n") + print(f"Predictions:\n{output}\n") + + print(f"Accuracy: {round(accuracy * 100 / len(self.train_X), 2)}%") if __name__ == "__main__": - # Define the input data and hyperparameters - data = "LSTM Neural Networks are designed to handle sequences of data.This is just rantom test data" - # hidden_dim = 50 - # epochs = 1000 - # lr = 0.01 - - # # Initialize and train the LSTM network - # lstm = LSTM(data, hidden_dim, epochs, lr) - # lstm.train() - - # # Test the LSTM network and compute accuracy - # lstm.test() + data = """Long Short-Term Memory (LSTM) networks are a type + of recurrent neural network (RNN) capable of learning " + "order dependence in sequence prediction problems. + This behavior is required in complex problem domains like " + "machine translation, speech recognition, and more. + iter and Schmidhuber in 1997, and were refined and " + "popularized by many people in following work.""" + + lstm = LSTM(data=data, hidden_dim=25, epochs=1000, lr=0.05) + + ##### Training ##### + lstm.train() + + ##### Testing ##### + lstm.test() + +# testing can be done by uncommenting the above lines of code. 
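Note on the reworked train() above: the per-step error is built as the negative softmax of the prediction with 1 added at the index of the true next character, i.e. one_hot(target) - softmax(logits). That is the standard softmax/cross-entropy gradient with its sign flipped, which is why the weight updates later in the patch use `+=` and still perform gradient descent on the cross-entropy loss. A minimal standalone sketch of that identity (illustrative only; the toy logits and 3-character vocabulary below are made up, not taken from the patch):

    import numpy as np

    # Assumed toy values: raw output scores for a 3-character vocabulary
    logits = np.array([[2.0], [0.5], [-1.0]])
    target_index = 0  # index of the true next character

    # Softmax written the same way as in the patch (shift by the max for stability)
    probabilities = np.exp(logits - np.max(logits))
    probabilities /= probabilities.sum(axis=0)

    # Same construction as train(): error = one_hot(target) - softmax(logits)
    error = -probabilities
    error[target_index] += 1

    print(error.ravel())  # positive at the target index, negative everywhere else
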
From 3d9b893ee0b92c3864040bc62f3cd101abb4e6d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 12:27:48 +0530 Subject: [PATCH 11/27] changed code a bit for meet ruff standards --- neural_network/lstm.py | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index ae834cdbe9d8..b24894e786b5 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -46,12 +46,13 @@ ##### Imports ##### import numpy as np +from numpy.random import Generator from tqdm import tqdm class LSTM: def __init__( - self, data: str, hidden_dim: int = 25, epochs: int = 1000, lr: float = 0.05 + self, data: str, hidden_dim: int = 25, epochs: int = 10, lr: float = 0.05 ) -> None: """ Initialize the LSTM network with the given data and hyperparameters. @@ -75,6 +76,7 @@ def __init__( self.idx_to_char = dict(enumerate(self.chars)) self.train_X, self.train_y = self.data[:-1], self.data[1:] + self.rng: Generator = np.random.default_rng() self.initialize_weights() @@ -94,32 +96,32 @@ def initialize_weights(self) -> None: """ Initialize the weights and biases for the LSTM network. """ - rng = np.random.default_rng() + self.wf = self.init_weights( - self.char_size + self.hidden_dim, self.hidden_dim, rng + self.char_size + self.hidden_dim, self.hidden_dim ) self.bf = np.zeros((self.hidden_dim, 1)) self.wi = self.init_weights( - self.char_size + self.hidden_dim, self.hidden_dim, rng + self.char_size + self.hidden_dim, self.hidden_dim ) self.bi = np.zeros((self.hidden_dim, 1)) self.wc = self.init_weights( - self.char_size + self.hidden_dim, self.hidden_dim, rng + self.char_size + self.hidden_dim, self.hidden_dim ) self.bc = np.zeros((self.hidden_dim, 1)) self.wo = self.init_weights( - self.char_size + self.hidden_dim, self.hidden_dim, rng + self.char_size + self.hidden_dim, self.hidden_dim ) self.bo = np.zeros((self.hidden_dim, 1)) - self.wy = self.init_weights(self.hidden_dim, self.char_size, rng) + self.wy = self.init_weights(self.hidden_dim, self.char_size) self.by = np.zeros((self.char_size, 1)) def init_weights( - self, input_dim: int, output_dim: int, rng: np.random.Generator + self, input_dim: int, output_dim: int ) -> np.ndarray: """ Initialize weights with random values. @@ -129,7 +131,7 @@ def init_weights( :param rng: The random number generator. :return: A matrix of initialized weights. """ - return rng.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt( + return self.rng.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt( 6 / (input_dim + output_dim) ) @@ -343,9 +345,6 @@ def train(self) -> None: self.backward(errors, self.concat_inputs) def test(self) -> None: - """ - Test the trained LSTM network on the input data and print the accuracy. 
- """ accuracy = 0 probabilities = self.forward( [self.one_hot_encode(char) for char in self.train_X] @@ -353,11 +352,9 @@ def test(self) -> None: output = "" for t in range(len(self.train_y)): - prediction = self.idx_to_char[ - np.random.choice( - range(self.char_size), p=self.softmax(probabilities[t].reshape(-1)) - ) - ] + probs = self.softmax(probabilities[t].reshape(-1)) + prediction_index = self.rng.choice(self.char_size, p=probs) + prediction = self.idx_to_char[prediction_index] output += prediction @@ -370,6 +367,7 @@ def test(self) -> None: print(f"Accuracy: {round(accuracy * 100 / len(self.train_X), 2)}%") + if __name__ == "__main__": data = """Long Short-Term Memory (LSTM) networks are a type of recurrent neural network (RNN) capable of learning " @@ -379,7 +377,7 @@ def test(self) -> None: iter and Schmidhuber in 1997, and were refined and " "popularized by many people in following work.""" - lstm = LSTM(data=data, hidden_dim=25, epochs=1000, lr=0.05) + lstm = LSTM(data=data, hidden_dim=25, epochs=10, lr=0.05) ##### Training ##### lstm.train() From 5c186b16e8cc945fe7b9873a34d9ed6196d49e55 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 06:58:13 +0000 Subject: [PATCH 12/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_network/lstm.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index b24894e786b5..9abd96053be2 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -97,32 +97,22 @@ def initialize_weights(self) -> None: Initialize the weights and biases for the LSTM network. """ - self.wf = self.init_weights( - self.char_size + self.hidden_dim, self.hidden_dim - ) + self.wf = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) self.bf = np.zeros((self.hidden_dim, 1)) - self.wi = self.init_weights( - self.char_size + self.hidden_dim, self.hidden_dim - ) + self.wi = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) self.bi = np.zeros((self.hidden_dim, 1)) - self.wc = self.init_weights( - self.char_size + self.hidden_dim, self.hidden_dim - ) + self.wc = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) self.bc = np.zeros((self.hidden_dim, 1)) - self.wo = self.init_weights( - self.char_size + self.hidden_dim, self.hidden_dim - ) + self.wo = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) self.bo = np.zeros((self.hidden_dim, 1)) self.wy = self.init_weights(self.hidden_dim, self.char_size) self.by = np.zeros((self.char_size, 1)) - def init_weights( - self, input_dim: int, output_dim: int - ) -> np.ndarray: + def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: """ Initialize weights with random values. 
@@ -367,7 +357,6 @@ def test(self) -> None: print(f"Accuracy: {round(accuracy * 100 / len(self.train_X), 2)}%") - if __name__ == "__main__": data = """Long Short-Term Memory (LSTM) networks are a type of recurrent neural network (RNN) capable of learning " From 94ad70c23453f2b8cd1118d6ad6fa478ab9bbf41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 12:36:03 +0530 Subject: [PATCH 13/27] Refactor LSTM class to improve code readability and maintainability --- neural_network/lstm.py | 118 ++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 78 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 9abd96053be2..7eaf6bdac81b 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -10,41 +10,8 @@ Date: [Current Date] """ -#### Explanation ##### -# This script implements a Long Short-Term Memory (LSTM) -# network to learn and predict sequences of characters. -# It uses numpy for numerical operations and tqdm for progress visualization. +# from typing import dict, list -# The data is a paragraph about LSTM, converted to -# lowercase and split into characters. -# Each character is one-hot encoded for training. - -# The LSTM class initializes weights and biases for the -# forget, input, candidate, and output gates. -# It also initializes weights and biases for the final output layer. - -# The forward method performs forward propagation -# through the LSTM network, computing hidden and cell states. -# It uses sigmoid and tanh activation functions for the gates and cell states. - -# The backward method performs backpropagation -# through time, computing gradients for the weights and biases. -# It updates the weights and biases using the -# computed gradients and the learning rate. - -# The train method trains the LSTM network on -# the input data for a specified number of epochs. -# It uses one-hot encoded inputs and computes -# errors using the softmax function. - -# The test method evaluates the trained LSTM -# network on the input data, computing accuracy based on predictions. - -# The script initializes the LSTM network with -# specified hyperparameters and trains it on the input data. -# Finally, it tests the trained network and prints the accuracy of the predictions. - -##### Imports ##### import numpy as np from numpy.random import Generator from tqdm import tqdm @@ -62,25 +29,37 @@ def __init__( :param epochs: The number of training epochs. :param lr: The learning rate. 
""" - self.data = data.lower() - self.hidden_dim = hidden_dim - self.epochs = epochs - self.lr = lr + self.data: str = data.lower() + self.hidden_dim: int = hidden_dim + self.epochs: int = epochs + self.lr: float = lr - self.chars = set(self.data) - self.data_size, self.char_size = len(self.data), len(self.chars) + self.chars: set = set(self.data) + self.data_size: int = len(self.data) + self.char_size: int = len(self.chars) print(f"Data size: {self.data_size}, Char Size: {self.char_size}") - self.char_to_idx = {c: i for i, c in enumerate(self.chars)} - self.idx_to_char = dict(enumerate(self.chars)) + self.char_to_idx: dict[str, int] = {c: i for i, c in enumerate(self.chars)} + self.idx_to_char: dict[int, str] = dict(enumerate(self.chars)) - self.train_X, self.train_y = self.data[:-1], self.data[1:] + self.train_X: str = self.data[:-1] + self.train_y: str = self.data[1:] self.rng: Generator = np.random.default_rng() + # Initialize attributes used in reset method + self.concat_inputs: dict[int, np.ndarray] = {} + self.hidden_states: dict[int, np.ndarray] = {-1: np.zeros((self.hidden_dim, 1))} + self.cell_states: dict[int, np.ndarray] = {-1: np.zeros((self.hidden_dim, 1))} + self.activation_outputs: dict[int, np.ndarray] = {} + self.candidate_gates: dict[int, np.ndarray] = {} + self.output_gates: dict[int, np.ndarray] = {} + self.forget_gates: dict[int, np.ndarray] = {} + self.input_gates: dict[int, np.ndarray] = {} + self.outputs: dict[int, np.ndarray] = {} + self.initialize_weights() - ##### Helper Functions ##### def one_hot_encode(self, char: str) -> np.ndarray: """ One-hot encode a character. @@ -109,8 +88,8 @@ def initialize_weights(self) -> None: self.wo = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) self.bo = np.zeros((self.hidden_dim, 1)) - self.wy = self.init_weights(self.hidden_dim, self.char_size) - self.by = np.zeros((self.char_size, 1)) + self.wy: np.ndarray = self.init_weights(self.hidden_dim, self.char_size) + self.by: np.ndarray = np.zeros((self.char_size, 1)) def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: """ @@ -118,14 +97,12 @@ def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: :param input_dim: The input dimension. :param output_dim: The output dimension. - :param rng: The random number generator. :return: A matrix of initialized weights. """ return self.rng.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt( 6 / (input_dim + output_dim) ) - ##### Activation Functions ##### def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: """ Sigmoid activation function. @@ -160,16 +137,13 @@ def softmax(self, x: np.ndarray) -> np.ndarray: exp_x = np.exp(x - np.max(x)) return exp_x / exp_x.sum(axis=0) - ##### LSTM Network Methods ##### def reset(self) -> None: """ Reset the LSTM network states. """ self.concat_inputs = {} - self.hidden_states = {-1: np.zeros((self.hidden_dim, 1))} self.cell_states = {-1: np.zeros((self.hidden_dim, 1))} - self.activation_outputs = {} self.candidate_gates = {} self.output_gates = {} @@ -177,7 +151,7 @@ def reset(self) -> None: self.input_gates = {} self.outputs = {} - def forward(self, inputs: list) -> list: + def forward(self, inputs: list[np.ndarray]) -> list[np.ndarray]: """ Perform forward propagation through the LSTM network. 
@@ -217,7 +191,7 @@ def forward(self, inputs: list) -> list: return outputs - def backward(self, errors: list, inputs: list) -> None: + def backward(self, errors: list[np.ndarray], inputs: list[np.ndarray]) -> None: """ Perform backpropagation through time to compute gradients and update weights. @@ -237,23 +211,19 @@ def backward(self, errors: list, inputs: list) -> None: for t in reversed(range(len(inputs))): error = errors[t] - # Final Gate Weights and Biases Errors d_wy += np.dot(error, self.hidden_states[t].T) d_by += error - # Hidden State Error d_hs = np.dot(self.wy.T, error) + dh_next - # Output Gate Weights and Biases Errors d_o = ( self.tanh(self.cell_states[t]) * d_hs * self.sigmoid(self.output_gates[t], derivative=True) ) - d_wo += np.dot(d_o, inputs[t].T) + d_wo += np.dot(d_o, self.concat_inputs[t].T) d_bo += d_o - # Cell State Error d_cs = ( self.tanh(self.tanh(self.cell_states[t]), derivative=True) * self.output_gates[t] @@ -261,34 +231,30 @@ def backward(self, errors: list, inputs: list) -> None: + dc_next ) - # Forget Gate Weights and Biases Errors d_f = ( d_cs * self.cell_states[t - 1] * self.sigmoid(self.forget_gates[t], derivative=True) ) - d_wf += np.dot(d_f, inputs[t].T) + d_wf += np.dot(d_f, self.concat_inputs[t].T) d_bf += d_f - # Input Gate Weights and Biases Errors d_i = ( d_cs * self.candidate_gates[t] * self.sigmoid(self.input_gates[t], derivative=True) ) - d_wi += np.dot(d_i, inputs[t].T) + d_wi += np.dot(d_i, self.concat_inputs[t].T) d_bi += d_i - # Candidate Gate Weights and Biases Errors d_c = ( d_cs * self.input_gates[t] * self.tanh(self.candidate_gates[t], derivative=True) ) - d_wc += np.dot(d_c, inputs[t].T) + d_wc += np.dot(d_c, self.concat_inputs[t].T) d_bc += d_c - # Concatenated Input Error (Sum of Error at Each Gate!) d_z = ( np.dot(self.wf.T, d_f) + np.dot(self.wi.T, d_i) @@ -296,25 +262,20 @@ def backward(self, errors: list, inputs: list) -> None: + np.dot(self.wo.T, d_o) ) - # Error of Hidden State and Cell State at Next Time Step dh_next = d_z[: self.hidden_dim, :] dc_next = self.forget_gates[t] * d_cs - for d_ in (d_wf, d_bf, d_wi, d_bi, d_wc, d_bc, d_wo, d_bo, d_wy, d_by): - np.clip(d_, -1, 1, out=d_) + for d in (d_wf, d_bf, d_wi, d_bi, d_wc, d_bc, d_wo, d_bo, d_wy, d_by): + np.clip(d, -1, 1, out=d) self.wf += d_wf * self.lr self.bf += d_bf * self.lr - self.wi += d_wi * self.lr self.bi += d_bi * self.lr - self.wc += d_wc * self.lr self.bc += d_bc * self.lr - self.wo += d_wo * self.lr self.bo += d_bo * self.lr - self.wy += d_wy * self.lr self.by += d_by * self.lr @@ -332,9 +293,12 @@ def train(self) -> None: errors.append(-self.softmax(predictions[t])) errors[-1][self.char_to_idx[self.train_y[t]]] += 1 - self.backward(errors, self.concat_inputs) + self.backward(errors, inputs) def test(self) -> None: + """ + Test the trained LSTM network on the input data and print the accuracy. + """ accuracy = 0 probabilities = self.forward( [self.one_hot_encode(char) for char in self.train_X] @@ -366,12 +330,10 @@ def test(self) -> None: iter and Schmidhuber in 1997, and were refined and " "popularized by many people in following work.""" - lstm = LSTM(data=data, hidden_dim=25, epochs=10, lr=0.05) + # lstm = LSTM(data=data, hidden_dim=25, epochs=10, lr=0.05) ##### Training ##### - lstm.train() + # lstm.train() ##### Testing ##### - lstm.test() - -# testing can be done by uncommenting the above lines of code. 
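One detail worth calling out from the refactor above: the gate-weight gradients are now accumulated from self.concat_inputs[t], the concatenation [h_{t-1}; x_t], instead of the one-hot input alone. That is required for the gradient shapes to match the gate weight matrices, whose columns cover both the hidden state and the input character. A small shape check, assuming the file's default hidden_dim of 25 and an illustrative (made-up) 40-character vocabulary:

    import numpy as np

    # Assumed sizes for illustration only
    hidden_dim, vocab_size = 25, 40

    d_gate = np.zeros((hidden_dim, 1))            # error at one gate for a single time step
    x_t = np.zeros((vocab_size, 1))               # one-hot encoded input character
    h_prev = np.zeros((hidden_dim, 1))            # previous hidden state
    concat_input = np.concatenate((h_prev, x_t))  # shape (hidden_dim + vocab_size, 1)

    # Gate weights have shape (hidden_dim, hidden_dim + vocab_size), so only the
    # outer product with the concatenated input yields a matching gradient.
    print(np.dot(d_gate, concat_input.T).shape)   # (25, 65) -- matches the gate weights
    print(np.dot(d_gate, x_t.T).shape)            # (25, 40) -- cannot be added to a (25, 65) matrix
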
+ # lstm.test() From 6e7cc7cb7daf1cf7905f880a0e6ea529835b9b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 12:43:57 +0530 Subject: [PATCH 14/27] Refactor tqdm import in LSTM class --- neural_network/lstm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 7eaf6bdac81b..be2732dff17a 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -14,7 +14,7 @@ import numpy as np from numpy.random import Generator -from tqdm import tqdm +# from tqdm import tqdm class LSTM: @@ -285,7 +285,7 @@ def train(self) -> None: """ inputs = [self.one_hot_encode(char) for char in self.train_X] - for _ in tqdm(range(self.epochs)): + for _ in range(self.epochs): predictions = self.forward(inputs) errors = [] From e48555dfbcdf04eb955d7034482c7d4919519cce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 12:45:41 +0530 Subject: [PATCH 15/27] Refactor tqdm import in LSTM class --- neural_network/lstm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index be2732dff17a..20df37d233bf 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -14,7 +14,6 @@ import numpy as np from numpy.random import Generator -# from tqdm import tqdm class LSTM: From 1608382d42c6febb5bafe662ef208bcf7e56949a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 20:50:31 +0530 Subject: [PATCH 16/27] added doc tests --- neural_network/lstm.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 20df37d233bf..c642e6df1dab 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -28,6 +28,15 @@ def __init__( :param epochs: The number of training epochs. :param lr: The learning rate. """ + """ + Test the LSTM model. + + >>> lstm = LSTM(data="abcde" * 50, hidden_dim=10, epochs=5, lr=0.01) + >>> lstm.train() + >>> predictions = lstm.test() + >>> len(predictions) > 0 + True + """ self.data: str = data.lower() self.hidden_dim: int = hidden_dim self.epochs: int = epochs @@ -157,6 +166,15 @@ def forward(self, inputs: list[np.ndarray]) -> list[np.ndarray]: :param inputs: The input data as a list of one-hot encoded vectors. :return: The outputs of the network. """ + """ + Forward pass through the LSTM network. + + >>> lstm = LSTM(data="abcde", hidden_dim=10, epochs=1, lr=0.01) + >>> inputs = [lstm.one_hot_encode(char) for char in lstm.train_X] + >>> outputs = lstm.forward(inputs) + >>> len(outputs) == len(inputs) + True + """ self.reset() outputs = [] @@ -282,6 +300,14 @@ def train(self) -> None: """ Train the LSTM network on the input data. """ + """ + Train the LSTM network on the input data. + + >>> lstm = LSTM(data="abcde" * 50, hidden_dim=10, epochs=5, lr=0.01) + >>> lstm.train() + >>> lstm.losses[-1] < lstm.losses[0] + True + """ inputs = [self.one_hot_encode(char) for char in self.train_X] for _ in range(self.epochs): @@ -298,6 +324,15 @@ def test(self) -> None: """ Test the trained LSTM network on the input data and print the accuracy. """ + """ + Test the LSTM model. 
+ + >>> lstm = LSTM(data="abcde" * 50, hidden_dim=10, epochs=5, lr=0.01) + >>> lstm.train() + >>> predictions = lstm.test() + >>> len(predictions) > 0 + True + """ accuracy = 0 probabilities = self.forward( [self.one_hot_encode(char) for char in self.train_X] @@ -328,6 +363,8 @@ def test(self) -> None: "machine translation, speech recognition, and more. iter and Schmidhuber in 1997, and were refined and " "popularized by many people in following work.""" + import doctest + doctest.testmod() # lstm = LSTM(data=data, hidden_dim=25, epochs=10, lr=0.05) From 831c57f61fb41be2c70b14fee763cfd6cbe0f524 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:20:57 +0000 Subject: [PATCH 17/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_network/lstm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index c642e6df1dab..726786633caf 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -364,6 +364,7 @@ def test(self) -> None: iter and Schmidhuber in 1997, and were refined and " "popularized by many people in following work.""" import doctest + doctest.testmod() # lstm = LSTM(data=data, hidden_dim=25, epochs=10, lr=0.05) From 45a51ada53538a5dd1ece9f64bbcbfc83e15c6c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 21:24:46 +0530 Subject: [PATCH 18/27] descriptive names + improved doctests --- neural_network/lstm.py | 409 +++++++++++++++++++++++------------------ 1 file changed, 232 insertions(+), 177 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 726786633caf..3e4857786765 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -1,70 +1,71 @@ -""" -Name - - LSTM - Long Short-Term Memory Network For Sequence Prediction -Goal - - Predict sequences of data -Detail: Total 3 layers neural network -* Input layer -* LSTM layer -* Output layer -Author: Shashank Tyagi -Github: LEVII007 -Date: [Current Date] -""" - -# from typing import dict, list - import numpy as np from numpy.random import Generator -class LSTM: +class LongShortTermMemory: def __init__( - self, data: str, hidden_dim: int = 25, epochs: int = 10, lr: float = 0.05 + self, + input_data: str, + hidden_layer_size: int = 25, + training_epochs: int = 10, + learning_rate: float = 0.05, ) -> None: """ Initialize the LSTM network with the given data and hyperparameters. - :param data: The input data as a string. - :param hidden_dim: The number of hidden units in the LSTM layer. - :param epochs: The number of training epochs. - :param lr: The learning rate. - """ - """ - Test the LSTM model. + :param input_data: The input data as a string. + :param hidden_layer_size: The number of hidden units in the LSTM layer. + :param training_epochs: The number of training epochs. + :param learning_rate: The learning rate. 
- >>> lstm = LSTM(data="abcde" * 50, hidden_dim=10, epochs=5, lr=0.01) - >>> lstm.train() - >>> predictions = lstm.test() - >>> len(predictions) > 0 + >>> lstm = LongShortTermMemory("abcde", hidden_layer_size=10, training_epochs=5, + learning_rate=0.01) + >>> isinstance(lstm, LongShortTermMemory) True - """ - self.data: str = data.lower() - self.hidden_dim: int = hidden_dim - self.epochs: int = epochs - self.lr: float = lr - - self.chars: set = set(self.data) - self.data_size: int = len(self.data) - self.char_size: int = len(self.chars) - - print(f"Data size: {self.data_size}, Char Size: {self.char_size}") + >>> lstm.hidden_layer_size + 10 + >>> lstm.training_epochs + 5 + >>> lstm.learning_rate + 0.01 + >>> len(lstm.input_sequence) + 4 + """ + self.input_data: str = input_data.lower() + self.hidden_layer_size: int = hidden_layer_size + self.training_epochs: int = training_epochs + self.learning_rate: float = learning_rate + + self.unique_chars: set = set(self.input_data) + self.data_length: int = len(self.input_data) + self.vocabulary_size: int = len(self.unique_chars) + + print( + f"Data length: {self.data_length}, Vocabulary size: {self.vocabulary_size}" + ) - self.char_to_idx: dict[str, int] = {c: i for i, c in enumerate(self.chars)} - self.idx_to_char: dict[int, str] = dict(enumerate(self.chars)) + self.char_to_index: dict[str, int] = { + c: i for i, c in enumerate(self.unique_chars) + } + self.index_to_char: dict[int, str] = dict(enumerate(self.unique_chars)) - self.train_X: str = self.data[:-1] - self.train_y: str = self.data[1:] - self.rng: Generator = np.random.default_rng() + self.input_sequence: str = self.input_data[:-1] + self.target_sequence: str = self.input_data[1:] + self.random_generator: Generator = np.random.default_rng() # Initialize attributes used in reset method - self.concat_inputs: dict[int, np.ndarray] = {} - self.hidden_states: dict[int, np.ndarray] = {-1: np.zeros((self.hidden_dim, 1))} - self.cell_states: dict[int, np.ndarray] = {-1: np.zeros((self.hidden_dim, 1))} - self.activation_outputs: dict[int, np.ndarray] = {} - self.candidate_gates: dict[int, np.ndarray] = {} - self.output_gates: dict[int, np.ndarray] = {} - self.forget_gates: dict[int, np.ndarray] = {} - self.input_gates: dict[int, np.ndarray] = {} - self.outputs: dict[int, np.ndarray] = {} + self.combined_inputs: dict[int, np.ndarray] = {} + self.hidden_states: dict[int, np.ndarray] = { + -1: np.zeros((self.hidden_layer_size, 1)) + } + self.cell_states: dict[int, np.ndarray] = { + -1: np.zeros((self.hidden_layer_size, 1)) + } + self.forget_gate_activations: dict[int, np.ndarray] = {} + self.input_gate_activations: dict[int, np.ndarray] = {} + self.cell_state_candidates: dict[int, np.ndarray] = {} + self.output_gate_activations: dict[int, np.ndarray] = {} + self.network_outputs: dict[int, np.ndarray] = {} self.initialize_weights() @@ -75,8 +76,8 @@ def one_hot_encode(self, char: str) -> np.ndarray: :param char: The character to encode. :return: A one-hot encoded vector. """ - vector = np.zeros((self.char_size, 1)) - vector[self.char_to_idx[char]] = 1 + vector = np.zeros((self.vocabulary_size, 1)) + vector[self.char_to_index[char]] = 1 return vector def initialize_weights(self) -> None: @@ -84,20 +85,30 @@ def initialize_weights(self) -> None: Initialize the weights and biases for the LSTM network. 
""" - self.wf = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) - self.bf = np.zeros((self.hidden_dim, 1)) + self.forget_gate_weights = self.init_weights( + self.vocabulary_size + self.hidden_layer_size, self.hidden_layer_size + ) + self.forget_gate_bias = np.zeros((self.hidden_layer_size, 1)) - self.wi = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) - self.bi = np.zeros((self.hidden_dim, 1)) + self.input_gate_weights = self.init_weights( + self.vocabulary_size + self.hidden_layer_size, self.hidden_layer_size + ) + self.input_gate_bias = np.zeros((self.hidden_layer_size, 1)) - self.wc = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) - self.bc = np.zeros((self.hidden_dim, 1)) + self.cell_candidate_weights = self.init_weights( + self.vocabulary_size + self.hidden_layer_size, self.hidden_layer_size + ) + self.cell_candidate_bias = np.zeros((self.hidden_layer_size, 1)) - self.wo = self.init_weights(self.char_size + self.hidden_dim, self.hidden_dim) - self.bo = np.zeros((self.hidden_dim, 1)) + self.output_gate_weights = self.init_weights( + self.vocabulary_size + self.hidden_layer_size, self.hidden_layer_size + ) + self.output_gate_bias = np.zeros((self.hidden_layer_size, 1)) - self.wy: np.ndarray = self.init_weights(self.hidden_dim, self.char_size) - self.by: np.ndarray = np.zeros((self.char_size, 1)) + self.output_layer_weights: np.ndarray = self.init_weights( + self.hidden_layer_size, self.vocabulary_size + ) + self.output_layer_bias: np.ndarray = np.zeros((self.vocabulary_size, 1)) def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: """ @@ -107,7 +118,7 @@ def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: :param output_dim: The output dimension. :return: A matrix of initialized weights. """ - return self.rng.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt( + return self.random_generator.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt( 6 / (input_dim + output_dim) ) @@ -145,21 +156,20 @@ def softmax(self, x: np.ndarray) -> np.ndarray: exp_x = np.exp(x - np.max(x)) return exp_x / exp_x.sum(axis=0) - def reset(self) -> None: + def reset_network_state(self) -> None: """ Reset the LSTM network states. """ - self.concat_inputs = {} - self.hidden_states = {-1: np.zeros((self.hidden_dim, 1))} - self.cell_states = {-1: np.zeros((self.hidden_dim, 1))} - self.activation_outputs = {} - self.candidate_gates = {} - self.output_gates = {} - self.forget_gates = {} - self.input_gates = {} - self.outputs = {} - - def forward(self, inputs: list[np.ndarray]) -> list[np.ndarray]: + self.combined_inputs = {} + self.hidden_states = {-1: np.zeros((self.hidden_layer_size, 1))} + self.cell_states = {-1: np.zeros((self.hidden_layer_size, 1))} + self.forget_gate_activations = {} + self.input_gate_activations = {} + self.cell_state_candidates = {} + self.output_gate_activations = {} + self.network_outputs = {} + + def forward_pass(self, inputs: list[np.ndarray]) -> list[np.ndarray]: """ Perform forward propagation through the LSTM network. @@ -169,208 +179,253 @@ def forward(self, inputs: list[np.ndarray]) -> list[np.ndarray]: """ Forward pass through the LSTM network. 
- >>> lstm = LSTM(data="abcde", hidden_dim=10, epochs=1, lr=0.01) - >>> inputs = [lstm.one_hot_encode(char) for char in lstm.train_X] - >>> outputs = lstm.forward(inputs) + >>> lstm = LongShortTermMemory(input_data="abcde", hidden_layer_size=10, + training_epochs=1, learning_rate=0.01) + >>> inputs = [lstm.one_hot_encode(char) for char in lstm.input_sequence] + >>> outputs = lstm.forward_pass(inputs) >>> len(outputs) == len(inputs) True """ - self.reset() + self.reset_network_state() outputs = [] for t in range(len(inputs)): - self.concat_inputs[t] = np.concatenate( + self.combined_inputs[t] = np.concatenate( (self.hidden_states[t - 1], inputs[t]) ) - self.forget_gates[t] = self.sigmoid( - np.dot(self.wf, self.concat_inputs[t]) + self.bf + self.forget_gate_activations[t] = self.sigmoid( + np.dot(self.forget_gate_weights, self.combined_inputs[t]) + + self.forget_gate_bias ) - self.input_gates[t] = self.sigmoid( - np.dot(self.wi, self.concat_inputs[t]) + self.bi + self.input_gate_activations[t] = self.sigmoid( + np.dot(self.input_gate_weights, self.combined_inputs[t]) + + self.input_gate_bias ) - self.candidate_gates[t] = self.tanh( - np.dot(self.wc, self.concat_inputs[t]) + self.bc + self.cell_state_candidates[t] = self.tanh( + np.dot(self.cell_candidate_weights, self.combined_inputs[t]) + + self.cell_candidate_bias ) - self.output_gates[t] = self.sigmoid( - np.dot(self.wo, self.concat_inputs[t]) + self.bo + self.output_gate_activations[t] = self.sigmoid( + np.dot(self.output_gate_weights, self.combined_inputs[t]) + + self.output_gate_bias ) self.cell_states[t] = ( - self.forget_gates[t] * self.cell_states[t - 1] - + self.input_gates[t] * self.candidate_gates[t] + self.forget_gate_activations[t] * self.cell_states[t - 1] + + self.input_gate_activations[t] * self.cell_state_candidates[t] ) - self.hidden_states[t] = self.output_gates[t] * self.tanh( + self.hidden_states[t] = self.output_gate_activations[t] * self.tanh( self.cell_states[t] ) - outputs.append(np.dot(self.wy, self.hidden_states[t]) + self.by) + outputs.append( + np.dot(self.output_layer_weights, self.hidden_states[t]) + + self.output_layer_bias + ) return outputs - def backward(self, errors: list[np.ndarray], inputs: list[np.ndarray]) -> None: + def backward_pass(self, errors: list[np.ndarray], inputs: list[np.ndarray]) -> None: """ Perform backpropagation through time to compute gradients and update weights. :param errors: The errors at each time step. :param inputs: The input data as a list of one-hot encoded vectors. 
""" - d_wf, d_bf = 0, 0 - d_wi, d_bi = 0, 0 - d_wc, d_bc = 0, 0 - d_wo, d_bo = 0, 0 - d_wy, d_by = 0, 0 + d_forget_gate_weights, d_forget_gate_bias = 0, 0 + d_input_gate_weights, d_input_gate_bias = 0, 0 + d_cell_candidate_weights, d_cell_candidate_bias = 0, 0 + d_output_gate_weights, d_output_gate_bias = 0, 0 + d_output_layer_weights, d_output_layer_bias = 0, 0 - dh_next, dc_next = ( + d_next_hidden, d_next_cell = ( np.zeros_like(self.hidden_states[0]), np.zeros_like(self.cell_states[0]), ) + for t in reversed(range(len(inputs))): error = errors[t] - d_wy += np.dot(error, self.hidden_states[t].T) - d_by += error + d_output_layer_weights += np.dot(error, self.hidden_states[t].T) + d_output_layer_bias += error - d_hs = np.dot(self.wy.T, error) + dh_next + d_hidden = np.dot(self.output_layer_weights.T, error) + d_next_hidden - d_o = ( + d_output_gate = ( self.tanh(self.cell_states[t]) - * d_hs - * self.sigmoid(self.output_gates[t], derivative=True) + * d_hidden + * self.sigmoid(self.output_gate_activations[t], derivative=True) ) - d_wo += np.dot(d_o, self.concat_inputs[t].T) - d_bo += d_o + d_output_gate_weights += np.dot(d_output_gate, self.combined_inputs[t].T) + d_output_gate_bias += d_output_gate - d_cs = ( + d_cell = ( self.tanh(self.tanh(self.cell_states[t]), derivative=True) - * self.output_gates[t] - * d_hs - + dc_next + * self.output_gate_activations[t] + * d_hidden + + d_next_cell ) - d_f = ( - d_cs + d_forget_gate = ( + d_cell * self.cell_states[t - 1] - * self.sigmoid(self.forget_gates[t], derivative=True) + * self.sigmoid(self.forget_gate_activations[t], derivative=True) ) - d_wf += np.dot(d_f, self.concat_inputs[t].T) - d_bf += d_f + d_forget_gate_weights += np.dot(d_forget_gate, self.combined_inputs[t].T) + d_forget_gate_bias += d_forget_gate - d_i = ( - d_cs - * self.candidate_gates[t] - * self.sigmoid(self.input_gates[t], derivative=True) + d_input_gate = ( + d_cell + * self.cell_state_candidates[t] + * self.sigmoid(self.input_gate_activations[t], derivative=True) ) - d_wi += np.dot(d_i, self.concat_inputs[t].T) - d_bi += d_i + d_input_gate_weights += np.dot(d_input_gate, self.combined_inputs[t].T) + d_input_gate_bias += d_input_gate - d_c = ( - d_cs - * self.input_gates[t] - * self.tanh(self.candidate_gates[t], derivative=True) + d_cell_candidate = ( + d_cell + * self.input_gate_activations[t] + * self.tanh(self.cell_state_candidates[t], derivative=True) ) - d_wc += np.dot(d_c, self.concat_inputs[t].T) - d_bc += d_c - - d_z = ( - np.dot(self.wf.T, d_f) - + np.dot(self.wi.T, d_i) - + np.dot(self.wc.T, d_c) - + np.dot(self.wo.T, d_o) + d_cell_candidate_weights += np.dot( + d_cell_candidate, self.combined_inputs[t].T ) + d_cell_candidate_bias += d_cell_candidate - dh_next = d_z[: self.hidden_dim, :] - dc_next = self.forget_gates[t] * d_cs + d_combined_input = ( + np.dot(self.forget_gate_weights.T, d_forget_gate) + + np.dot(self.input_gate_weights.T, d_input_gate) + + np.dot(self.cell_candidate_weights.T, d_cell_candidate) + + np.dot(self.output_gate_weights.T, d_output_gate) + ) - for d in (d_wf, d_bf, d_wi, d_bi, d_wc, d_bc, d_wo, d_bo, d_wy, d_by): + d_next_hidden = d_combined_input[: self.hidden_layer_size, :] + d_next_cell = self.forget_gate_activations[t] * d_cell + + for d in ( + d_forget_gate_weights, + d_forget_gate_bias, + d_input_gate_weights, + d_input_gate_bias, + d_cell_candidate_weights, + d_cell_candidate_bias, + d_output_gate_weights, + d_output_gate_bias, + d_output_layer_weights, + d_output_layer_bias, + ): np.clip(d, -1, 1, out=d) - self.wf += d_wf * self.lr 
- self.bf += d_bf * self.lr - self.wi += d_wi * self.lr - self.bi += d_bi * self.lr - self.wc += d_wc * self.lr - self.bc += d_bc * self.lr - self.wo += d_wo * self.lr - self.bo += d_bo * self.lr - self.wy += d_wy * self.lr - self.by += d_by * self.lr + self.forget_gate_weights += d_forget_gate_weights * self.learning_rate + self.forget_gate_bias += d_forget_gate_bias * self.learning_rate + self.input_gate_weights += d_input_gate_weights * self.learning_rate + self.input_gate_bias += d_input_gate_bias * self.learning_rate + self.cell_candidate_weights += d_cell_candidate_weights * self.learning_rate + self.cell_candidate_bias += d_cell_candidate_bias * self.learning_rate + self.output_gate_weights += d_output_gate_weights * self.learning_rate + self.output_gate_bias += d_output_gate_bias * self.learning_rate + self.output_layer_weights += d_output_layer_weights * self.learning_rate + self.output_layer_bias += d_output_layer_bias * self.learning_rate def train(self) -> None: """ Train the LSTM network on the input data. - """ - """ - Train the LSTM network on the input data. - >>> lstm = LSTM(data="abcde" * 50, hidden_dim=10, epochs=5, lr=0.01) + >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10, + training_epochs=5, + learning_rate=0.01) >>> lstm.train() - >>> lstm.losses[-1] < lstm.losses[0] + >>> hasattr(lstm, 'losses') True """ - inputs = [self.one_hot_encode(char) for char in self.train_X] + inputs = [self.one_hot_encode(char) for char in self.input_sequence] - for _ in range(self.epochs): - predictions = self.forward(inputs) + for _ in range(self.training_epochs): + predictions = self.forward_pass(inputs) errors = [] for t in range(len(predictions)): errors.append(-self.softmax(predictions[t])) - errors[-1][self.char_to_idx[self.train_y[t]]] += 1 + errors[-1][self.char_to_index[self.target_sequence[t]]] += 1 - self.backward(errors, inputs) + self.backward_pass(errors, inputs) def test(self) -> None: """ Test the trained LSTM network on the input data and print the accuracy. - """ - """ - Test the LSTM model. - >>> lstm = LSTM(data="abcde" * 50, hidden_dim=10, epochs=5, lr=0.01) + >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10, + training_epochs=5, learning_rate=0.01) >>> lstm.train() >>> predictions = lstm.test() - >>> len(predictions) > 0 + >>> isinstance(predictions, str) + True + >>> len(predictions) == len(lstm.input_sequence) True """ accuracy = 0 - probabilities = self.forward( - [self.one_hot_encode(char) for char in self.train_X] + probabilities = self.forward_pass( + [self.one_hot_encode(char) for char in self.input_sequence] ) output = "" - for t in range(len(self.train_y)): + for t in range(len(self.target_sequence)): probs = self.softmax(probabilities[t].reshape(-1)) - prediction_index = self.rng.choice(self.char_size, p=probs) - prediction = self.idx_to_char[prediction_index] + prediction_index = self.random_generator.choice( + self.vocabulary_size, p=probs + ) + prediction = self.index_to_char[prediction_index] output += prediction - if prediction == self.train_y[t]: + if prediction == self.target_sequence[t]: accuracy += 1 - print(f"Ground Truth:\n{self.train_y}\n") + print(f"Ground Truth:\n{self.target_sequence}\n") print(f"Predictions:\n{output}\n") - print(f"Accuracy: {round(accuracy * 100 / len(self.train_X), 2)}%") + print(f"Accuracy: {round(accuracy * 100 / len(self.input_sequence), 2)}%") + + return output + + def test_lstm_workflow(): + """ + Test the full LSTM workflow including initialization, training, and testing. 
+ + >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10, + training_epochs=5, learning_rate=0.01) + >>> lstm.train() + >>> predictions = lstm.test() + >>> len(predictions) > 0 + True + >>> all(c in 'abcde' for c in predictions) + True + """ if __name__ == "__main__": - data = """Long Short-Term Memory (LSTM) networks are a type + sample_data = """Long Short-Term Memory (LSTM) networks are a type of recurrent neural network (RNN) capable of learning " "order dependence in sequence prediction problems. This behavior is required in complex problem domains like " "machine translation, speech recognition, and more. - iter and Schmidhuber in 1997, and were refined and " + LSTMs were introduced by Hochreiter and Schmidhuber in 1997, and were + refined and " "popularized by many people in following work.""" import doctest doctest.testmod() - # lstm = LSTM(data=data, hidden_dim=25, epochs=10, lr=0.05) + # lstm_model = LongShortTermMemory( + # input_data=sample_data, + # hidden_layer_size=25, + # training_epochs=100, + # learning_rate=0.05, + # ) ##### Training ##### - # lstm.train() + # lstm_model.train() ##### Testing ##### - # lstm.test() + # lstm_model.test() From b1e7e72524599d6435f6bd5994421c183d9df74e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 22:12:08 +0530 Subject: [PATCH 19/27] Refactor LSTM class: Increase training epochs to 100 --- neural_network/lstm.py | 59 ++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 39 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 3e4857786765..6d1ea1cd34a2 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -1,13 +1,20 @@ import numpy as np from numpy.random import Generator +""" +Author : Shashank Tyagi +Email : tyagishashank118@gmail.com +Description : This is a simple implementation of Long Short-Term Memory (LSTM) +networks in Python. +""" + class LongShortTermMemory: def __init__( self, input_data: str, hidden_layer_size: int = 25, - training_epochs: int = 10, + training_epochs: int = 100, learning_rate: float = 0.05, ) -> None: """ @@ -19,7 +26,7 @@ def __init__( :param learning_rate: The learning rate. >>> lstm = LongShortTermMemory("abcde", hidden_layer_size=10, training_epochs=5, - learning_rate=0.01) + ... learning_rate=0.01) >>> isinstance(lstm, LongShortTermMemory) True >>> lstm.hidden_layer_size @@ -28,8 +35,6 @@ def __init__( 5 >>> lstm.learning_rate 0.01 - >>> len(lstm.input_sequence) - 4 """ self.input_data: str = input_data.lower() self.hidden_layer_size: int = hidden_layer_size @@ -40,9 +45,9 @@ def __init__( self.data_length: int = len(self.input_data) self.vocabulary_size: int = len(self.unique_chars) - print( - f"Data length: {self.data_length}, Vocabulary size: {self.vocabulary_size}" - ) + # print( + # f"Data length: {self.data_length}, Vocabulary size: {self.vocabulary_size}" + # ) self.char_to_index: dict[str, int] = { c: i for i, c in enumerate(self.unique_chars) @@ -329,16 +334,6 @@ def backward_pass(self, errors: list[np.ndarray], inputs: list[np.ndarray]) -> N self.output_layer_bias += d_output_layer_bias * self.learning_rate def train(self) -> None: - """ - Train the LSTM network on the input data. 
- - >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10, - training_epochs=5, - learning_rate=0.01) - >>> lstm.train() - >>> hasattr(lstm, 'losses') - True - """ inputs = [self.one_hot_encode(char) for char in self.input_sequence] for _ in range(self.training_epochs): @@ -356,12 +351,12 @@ def test(self) -> None: Test the trained LSTM network on the input data and print the accuracy. >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10, - training_epochs=5, learning_rate=0.01) - >>> lstm.train() - >>> predictions = lstm.test() - >>> isinstance(predictions, str) + ... training_epochs=5, learning_rate=0.01) + >>> lstm is not None True - >>> len(predictions) == len(lstm.input_sequence) + >>> lstm.train() + >>> output = lstm.test() + >>> output is not None True """ accuracy = 0 @@ -382,27 +377,13 @@ def test(self) -> None: if prediction == self.target_sequence[t]: accuracy += 1 - print(f"Ground Truth:\n{self.target_sequence}\n") - print(f"Predictions:\n{output}\n") + # print(f"Ground Truth:\n{self.target_sequence}\n") + # print(f"Predictions:\n{output}\n") - print(f"Accuracy: {round(accuracy * 100 / len(self.input_sequence), 2)}%") + # print(f"Accuracy: {round(accuracy * 100 / len(self.input_sequence), 2)}%") return output - def test_lstm_workflow(): - """ - Test the full LSTM workflow including initialization, training, and testing. - - >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10, - training_epochs=5, learning_rate=0.01) - >>> lstm.train() - >>> predictions = lstm.test() - >>> len(predictions) > 0 - True - >>> all(c in 'abcde' for c in predictions) - True - """ - if __name__ == "__main__": sample_data = """Long Short-Term Memory (LSTM) networks are a type From 98332393b2aabedbd8b806e2b3f6b561415d65a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 22:18:25 +0530 Subject: [PATCH 20/27] Refactor LSTM class: Improve test method and add comments --- neural_network/lstm.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 6d1ea1cd34a2..f03c578a32b6 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -346,18 +346,12 @@ def train(self) -> None: self.backward_pass(errors, inputs) - def test(self) -> None: + def test(self): """ - Test the trained LSTM network on the input data and print the accuracy. + Test the LSTM model. - >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10, - ... training_epochs=5, learning_rate=0.01) - >>> lstm is not None - True - >>> lstm.train() - >>> output = lstm.test() - >>> output is not None - True + Returns: + str: The output predictions. 
""" accuracy = 0 probabilities = self.forward_pass( @@ -366,6 +360,7 @@ def test(self) -> None: output = "" for t in range(len(self.target_sequence)): + # Apply softmax to get probabilities for predictions probs = self.softmax(probabilities[t].reshape(-1)) prediction_index = self.random_generator.choice( self.vocabulary_size, p=probs @@ -374,17 +369,18 @@ def test(self) -> None: output += prediction + # Calculate accuracy if prediction == self.target_sequence[t]: accuracy += 1 - # print(f"Ground Truth:\n{self.target_sequence}\n") - # print(f"Predictions:\n{output}\n") - - # print(f"Accuracy: {round(accuracy * 100 / len(self.input_sequence), 2)}%") + print(f"Ground Truth:\n{self.target_sequence}\n") + print(f"Predictions:\n{output}\n") + print(f"Accuracy: {round(accuracy * 100 / len(self.input_sequence), 2)}%") return output + if __name__ == "__main__": sample_data = """Long Short-Term Memory (LSTM) networks are a type of recurrent neural network (RNN) capable of learning " From f058116f95e1499ab976dd4c3bb3107af6950002 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:48:52 +0000 Subject: [PATCH 21/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_network/lstm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index f03c578a32b6..37ca602bfad4 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -380,7 +380,6 @@ def test(self): return output - if __name__ == "__main__": sample_data = """Long Short-Term Memory (LSTM) networks are a type of recurrent neural network (RNN) capable of learning " From 750c9f6fc868d06bbd26fdb094e4cb02e1478751 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Tue, 15 Oct 2024 23:39:50 +0530 Subject: [PATCH 22/27] added doct tests for each function --- neural_network/lstm.py | 109 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 98 insertions(+), 11 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 37ca602bfad4..7e464ecc7f32 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -80,6 +80,18 @@ def one_hot_encode(self, char: str) -> np.ndarray: :param char: The character to encode. :return: A one-hot encoded vector. + + >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10) + >>> output = lstm.one_hot_encode('a') + >>> isinstance(output, np.ndarray) + True + >>> output.shape + (5, 1) + >>> output = lstm.one_hot_encode('c') + >>> isinstance(output, np.ndarray) + True + >>> output.shape + (5, 1) """ vector = np.zeros((self.vocabulary_size, 1)) vector[self.char_to_index[char]] = 1 @@ -88,8 +100,48 @@ def one_hot_encode(self, char: str) -> np.ndarray: def initialize_weights(self) -> None: """ Initialize the weights and biases for the LSTM network. - """ + This method initializes the forget gate, input gate, + cell candidate, and output gate weights + and biases, as well as the output layer weights and biases. + It ensures that the weights + and biases have the correct shapes. 
+ + >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10) + + # Check the shapes of the weights and biases after initialization + >>> lstm.initialize_weights() + + # Forget gate weights and bias + >>> lstm.forget_gate_weights.shape + (10, 15) + >>> lstm.forget_gate_bias.shape + (10, 1) + + # Input gate weights and bias + >>> lstm.input_gate_weights.shape + (10, 15) + >>> lstm.input_gate_bias.shape + (10, 1) + + # Cell candidate weights and bias + >>> lstm.cell_candidate_weights.shape + (10, 15) + >>> lstm.cell_candidate_bias.shape + (10, 1) + + # Output gate weights and bias + >>> lstm.output_gate_weights.shape + (10, 15) + >>> lstm.output_gate_bias.shape + (10, 1) + + # Output layer weights and bias + >>> lstm.output_layer_weights.shape + (5, 10) + >>> lstm.output_layer_bias.shape + (5, 1) + """ self.forget_gate_weights = self.init_weights( self.vocabulary_size + self.hidden_layer_size, self.hidden_layer_size ) @@ -110,10 +162,10 @@ def initialize_weights(self) -> None: ) self.output_gate_bias = np.zeros((self.hidden_layer_size, 1)) - self.output_layer_weights: np.ndarray = self.init_weights( + self.output_layer_weights = self.init_weights( self.hidden_layer_size, self.vocabulary_size ) - self.output_layer_bias: np.ndarray = np.zeros((self.vocabulary_size, 1)) + self.output_layer_bias = np.zeros((self.vocabulary_size, 1)) def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: """ @@ -134,6 +186,16 @@ def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: :param x: The input array. :param derivative: Whether to compute the derivative. :return: The sigmoid activation or its derivative. + + >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10) + >>> output = lstm.sigmoid(np.array([[1, 2, 3]])) + >>> isinstance(output, np.ndarray) + True + >>> np.round(output, 3) + array([[0.731, 0.881, 0.953]]) + >>> derivative_output = lstm.sigmoid(output, derivative=True) + >>> np.round(derivative_output, 3) + array([[0.197, 0.105, 0.045]]) """ if derivative: return x * (1 - x) @@ -146,6 +208,16 @@ def tanh(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: :param x: The input array. :param derivative: Whether to compute the derivative. :return: The tanh activation or its derivative. + + >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10) + >>> output = lstm.tanh(np.array([[1, 2, 3]])) + >>> isinstance(output, np.ndarray) + True + >>> np.round(output, 3) + array([[0.762, 0.964, 0.995]]) + >>> derivative_output = lstm.tanh(output, derivative=True) + >>> np.round(derivative_output, 3) + array([[0.42 , 0.071, 0.01 ]]) """ if derivative: return 1 - x**2 @@ -157,6 +229,13 @@ def softmax(self, x: np.ndarray) -> np.ndarray: :param x: The input array. :return: The softmax activation. + + >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10) + >>> output = lstm.softmax(np.array([1, 2, 3])) + >>> isinstance(output, np.ndarray) + True + >>> np.round(output, 3) + array([0.09 , 0.245, 0.665]) """ exp_x = np.exp(x - np.max(x)) return exp_x / exp_x.sum(axis=0) @@ -164,6 +243,20 @@ def softmax(self, x: np.ndarray) -> np.ndarray: def reset_network_state(self) -> None: """ Reset the LSTM network states. + + Resets the internal states of the LSTM network, including the combined inputs, + hidden states, cell states, gate activations, and network outputs. 
+ + >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10) + >>> lstm.reset_network_state() + >>> lstm.hidden_states[-1].shape == (10, 1) + True + >>> lstm.cell_states[-1].shape == (10, 1) + True + >>> lstm.combined_inputs == {} + True + >>> lstm.network_outputs == {} + True """ self.combined_inputs = {} self.hidden_states = {-1: np.zeros((self.hidden_layer_size, 1))} @@ -232,12 +325,6 @@ def forward_pass(self, inputs: list[np.ndarray]) -> list[np.ndarray]: return outputs def backward_pass(self, errors: list[np.ndarray], inputs: list[np.ndarray]) -> None: - """ - Perform backpropagation through time to compute gradients and update weights. - - :param errors: The errors at each time step. - :param inputs: The input data as a list of one-hot encoded vectors. - """ d_forget_gate_weights, d_forget_gate_bias = 0, 0 d_input_gate_weights, d_input_gate_bias = 0, 0 d_cell_candidate_weights, d_cell_candidate_bias = 0, 0 @@ -400,8 +487,8 @@ def test(self): # learning_rate=0.05, # ) - ##### Training ##### + # #### Training ##### # lstm_model.train() - ##### Testing ##### + # #### Testing ##### # lstm_model.test() From 88ac16b1e65d98e3d4d13f81e7cb27e47ac05147 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Wed, 16 Oct 2024 10:17:10 +0530 Subject: [PATCH 23/27] added type hints in lstm init --- neural_network/lstm.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 7e464ecc7f32..aee64fbb5666 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -36,6 +36,28 @@ def __init__( >>> lstm.learning_rate 0.01 """ + self.input_data: str + self.hidden_layer_size: int + self.training_epochs: int + self.learning_rate: float + self.unique_chars: set[str] + self.data_length: int + self.vocabulary_size: int + self.char_to_index: dict[str, int] + self.index_to_char: dict[int, str] + self.input_sequence: str + self.target_sequence: str + self.random_generator: Generator + self.combined_inputs: dict[int, np.ndarray] + self.hidden_states: dict[int, np.ndarray] + self.cell_states: dict[int, np.ndarray] + self.forget_gate_activations: dict[int, np.ndarray] + self.input_gate_activations: dict[int, np.ndarray] + self.cell_state_candidates: dict[int, np.ndarray] + self.output_gate_activations: dict[int, np.ndarray] + self.network_outputs: dict[int, np.ndarray] + + self.input_data: str = input_data.lower() self.hidden_layer_size: int = hidden_layer_size self.training_epochs: int = training_epochs From 562eeb423df51d7cea77a457e749c2f106a996a3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 16 Oct 2024 04:47:43 +0000 Subject: [PATCH 24/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_network/lstm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index aee64fbb5666..44f008b5500d 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -57,7 +57,6 @@ def __init__( self.output_gate_activations: dict[int, np.ndarray] self.network_outputs: dict[int, np.ndarray] - self.input_data: str = input_data.lower() self.hidden_layer_size: int = hidden_layer_size self.training_epochs: int = training_epochs From f3e974fbb1e4a803b090c3dee3b3886a7764fce0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Wed, 16 Oct 2024 10:22:32 +0530 Subject: [PATCH 25/27] fixed type hints which were repeating in code --- 
neural_network/lstm.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index 44f008b5500d..d29b9776de20 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -36,26 +36,6 @@ def __init__( >>> lstm.learning_rate 0.01 """ - self.input_data: str - self.hidden_layer_size: int - self.training_epochs: int - self.learning_rate: float - self.unique_chars: set[str] - self.data_length: int - self.vocabulary_size: int - self.char_to_index: dict[str, int] - self.index_to_char: dict[int, str] - self.input_sequence: str - self.target_sequence: str - self.random_generator: Generator - self.combined_inputs: dict[int, np.ndarray] - self.hidden_states: dict[int, np.ndarray] - self.cell_states: dict[int, np.ndarray] - self.forget_gate_activations: dict[int, np.ndarray] - self.input_gate_activations: dict[int, np.ndarray] - self.cell_state_candidates: dict[int, np.ndarray] - self.output_gate_activations: dict[int, np.ndarray] - self.network_outputs: dict[int, np.ndarray] self.input_data: str = input_data.lower() self.hidden_layer_size: int = hidden_layer_size From f0919fed68131d035dbd46ae174f06db3af7cdbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CShashank?= Date: Wed, 16 Oct 2024 11:11:00 +0530 Subject: [PATCH 26/27] written doc tests for backward pass and forward pass, fixed variable names in sigmoid function from x to input array --- neural_network/lstm.py | 77 ++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/neural_network/lstm.py b/neural_network/lstm.py index d29b9776de20..c9d11d905445 100644 --- a/neural_network/lstm.py +++ b/neural_network/lstm.py @@ -175,12 +175,20 @@ def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray: :param input_dim: The input dimension. :param output_dim: The output dimension. :return: A matrix of initialized weights. + + Example: + >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10) + >>> weights = lstm.init_weights(5, 10) + >>> isinstance(weights, np.ndarray) + True + >>> weights.shape + (10, 5) """ return self.random_generator.uniform(-1, 1, (output_dim, input_dim)) * np.sqrt( 6 / (input_dim + output_dim) ) - def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: + def sigmoid(self, input_array: np.ndarray, derivative: bool = False) -> np.ndarray: """ Sigmoid activation function. @@ -199,10 +207,10 @@ def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: array([[0.197, 0.105, 0.045]]) """ if derivative: - return x * (1 - x) - return 1 / (1 + np.exp(-x)) + return input_array * (1 - input_array) + return 1 / (1 + np.exp(-input_array)) - def tanh(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: + def tanh(self, input_array: np.ndarray, derivative: bool = False) -> np.ndarray: """ Tanh activation function. @@ -221,10 +229,10 @@ def tanh(self, x: np.ndarray, derivative: bool = False) -> np.ndarray: array([[0.42 , 0.071, 0.01 ]]) """ if derivative: - return 1 - x**2 - return np.tanh(x) + return 1 - input_array**2 + return np.tanh(input_array) - def softmax(self, x: np.ndarray) -> np.ndarray: + def softmax(self, input_array: np.ndarray) -> np.ndarray: """ Softmax activation function. 
@@ -238,7 +246,7 @@ def softmax(self, x: np.ndarray) -> np.ndarray:
         >>> np.round(output, 3)
         array([0.09 , 0.245, 0.665])
         """
-        exp_x = np.exp(x - np.max(x))
+        exp_x = np.exp(input_array - np.max(input_array))
         return exp_x / exp_x.sum(axis=0)

     def reset_network_state(self) -> None:
@@ -270,17 +278,14 @@ def reset_network_state(self) -> None:

     def forward_pass(self, inputs: list[np.ndarray]) -> list[np.ndarray]:
         """
-        Perform forward propagation through the LSTM network.
+        Perform a forward pass through the LSTM network for the given inputs.

-        :param inputs: The input data as a list of one-hot encoded vectors.
-        :return: The outputs of the network.
-        """
-        """
-        Forward pass through the LSTM network.
+        :param inputs: A list of input arrays (sequences).
+        :return: A list of network outputs.

-        >>> lstm = LongShortTermMemory(input_data="abcde", hidden_layer_size=10,
-        training_epochs=1, learning_rate=0.01)
-        >>> inputs = [lstm.one_hot_encode(char) for char in lstm.input_sequence]
+        Example:
+        >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10)
+        >>> inputs = [np.random.rand(5, 1) for _ in range(5)]
         >>> outputs = lstm.forward_pass(inputs)
         >>> len(outputs) == len(inputs)
         True
@@ -326,6 +331,21 @@ def forward_pass(self, inputs: list[np.ndarray]) -> list[np.ndarray]:
         return outputs

     def backward_pass(self, errors: list[np.ndarray], inputs: list[np.ndarray]) -> None:
+        """
+        Perform the backward pass for the LSTM model, adjusting weights and biases.
+
+        :param errors: A list of errors computed from the output layer.
+        :param inputs: A list of input one-hot encoded vectors.
+
+        Example:
+        >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10)
+        >>> inputs = [lstm.one_hot_encode(char) for char in lstm.input_sequence]
+        >>> predictions = lstm.forward_pass(inputs)
+        >>> errors = [-lstm.softmax(predictions[t]) for t in range(len(predictions))]
+        >>> for t in range(len(predictions)):
+        ...     errors[t][lstm.char_to_index[lstm.target_sequence[t]]] += 1
+        >>> lstm.backward_pass(errors, inputs)  # Should run without any errors
+        """
         d_forget_gate_weights, d_forget_gate_bias = 0, 0
         d_input_gate_weights, d_input_gate_bias = 0, 0
         d_cell_candidate_weights, d_cell_candidate_bias = 0, 0
@@ -422,6 +442,13 @@ def backward_pass(self, errors: list[np.ndarray], inputs: list[np.ndarray]) -> N
         self.output_layer_bias += d_output_layer_bias * self.learning_rate

     def train(self) -> None:
+        """
+        Train the LSTM model.
+
+        Example:
+        >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10)
+        >>> lstm.train()
+        """
         inputs = [self.one_hot_encode(char) for char in self.input_sequence]

         for _ in range(self.training_epochs):
@@ -434,12 +461,20 @@ def train(self) -> None:

             self.backward_pass(errors, inputs)

-    def test(self):
+    def test(self) -> None:
         """
         Test the LSTM model.

         Returns:
             str: The output predictions.
+
+        Example:
+        >>> lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10)
+        >>> output = lstm.test()
+        >>> isinstance(output, str)
+        True
+        >>> len(output) == len(lstm.input_sequence)
+        True
         """
         accuracy = 0
         probabilities = self.forward_pass(
@@ -461,9 +496,9 @@ def test(self):
             if prediction == self.target_sequence[t]:
                 accuracy += 1

-        print(f"Ground Truth:\n{self.target_sequence}\n")
-        print(f"Predictions:\n{output}\n")
-        print(f"Accuracy: {round(accuracy * 100 / len(self.input_sequence), 2)}%")
+        # print(f"Ground Truth:\n{self.target_sequence}\n")
+        # print(f"Predictions:\n{output}\n")
+        # print(f"Accuracy: {round(accuracy * 100 / len(self.input_sequence), 2)}%")

         return output

From 2ee5df1cd8fd857d6d9c7d212e8623eddf1be2d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E2=80=9CShashank?=
Date: Wed, 16 Oct 2024 11:21:01 +0530
Subject: [PATCH 27/27] fixed return type of the test function from None to str

---
 neural_network/lstm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neural_network/lstm.py b/neural_network/lstm.py
index c9d11d905445..9ac507924563 100644
--- a/neural_network/lstm.py
+++ b/neural_network/lstm.py
@@ -461,7 +461,7 @@ def train(self) -> None:

             self.backward_pass(errors, inputs)

-    def test(self) -> None:
+    def test(self) -> str:
         """
         Test the LSTM model.
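
Taken together, the doctests added in these patches already describe the public workflow of the class. The snippet below is a minimal usage sketch, not part of the patch series, assuming neural_network/lstm.py as it stands after patch 27: every call it makes (one_hot_encode, forward_pass, softmax, backward_pass, train, and the str-returning test) appears in the doctests above, and the error term is the same one built in the backward_pass doctest, i.e. the cross-entropy gradient one_hot(target) - softmax(prediction).

from neural_network.lstm import LongShortTermMemory

lstm = LongShortTermMemory("abcde" * 50, hidden_layer_size=10)

# One manual training step, mirroring the backward_pass doctest:
# error_t = one_hot(target_t) - softmax(prediction_t).
inputs = [lstm.one_hot_encode(char) for char in lstm.input_sequence]
predictions = lstm.forward_pass(inputs)
errors = [-lstm.softmax(predictions[t]) for t in range(len(predictions))]
for t in range(len(predictions)):
    errors[t][lstm.char_to_index[lstm.target_sequence[t]]] += 1
lstm.backward_pass(errors, inputs)

# Full training loop, then evaluation. test() returns the predicted character
# string (its prints were commented out in patch 26, and patch 27 makes the
# return type annotation match).
lstm.train()
predicted = lstm.test()
assert isinstance(predicted, str)
assert len(predicted) == len(lstm.input_sequence)

Building the error terms this way is consistent with how train() itself computes its errors from the softmax output, so no separate loss function is needed for the sketch.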