neural_network/lstm.py

"""
Name - - LSTM - Long Short-Term Memory Network For Sequence Prediction
Goal - - Predict sequences of data
Detail: Total 3 layers neural network
* Input layer
* LSTM layer
* Output layer
Author: Shashank Tyagi
Github: LEVII007
link : https://www.kaggle.com/code/navjindervirdee/lstm-neural-network-from-scratch
"""

##### Explanation #####
# This script implements a Long Short-Term Memory (LSTM) network to learn 
# and predict sequences of characters.
# It uses numpy for numerical operations and tqdm for progress visualization.

# The data is a paragraph about LSTM, converted to lowercase and split into 
# characters. Each character is one-hot encoded for training.

# The LSTM class initializes weights and biases for the forget, input, candidate, 
# and output gates. It also initializes weights and biases for the final output layer.

# The forward method performs forward propagation through the LSTM network, 
# computing hidden and cell states. It uses sigmoid and tanh activation 
# functions for the gates and cell states.

# The backward method performs backpropagation through time, computing gradients 
# for the weights and biases. It updates the weights and biases using 
# the computed gradients and the learning rate.

# The train method trains the LSTM network on the input data for a specified 
# number of epochs. It uses one-hot encoded inputs and computes errors 
# using the softmax function.

# The test method evaluates the trained LSTM network on the input data, 
# computing accuracy based on predictions.

# The script initializes the LSTM network with specified hyperparameters 
# and trains it on the input data. Finally, it tests the trained network 
# and prints the accuracy of the predictions.

##### Imports #####
from tqdm import tqdm
import numpy as np

class LSTM:
    def __init__(self, data: str, hidden_dim: int = 25, 
                 epochs: int = 1000, lr: float = 0.05) -> None:
        """
        Initialize the LSTM network with the given data and hyperparameters.
        
        :param data: The input data as a string.
        :param hidden_dim: The number of hidden units in the LSTM layer.
        :param epochs: The number of training epochs.
        :param lr: The learning rate.
        """
        self.data = data.lower()
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.lr = lr

        self.chars = set(self.data)
        self.data_size, self.char_size = len(self.data), len(self.chars)

        print(f'Data size: {self.data_size}, Char Size: {self.char_size}')

        self.char_to_idx = {c: i for i, c in enumerate(self.chars)}
        self.idx_to_char = {i: c for i, c in enumerate(self.chars)}

        self.train_X, self.train_y = self.data[:-1], self.data[1:]

        self.initialize_weights()

    ##### Helper Functions #####
    def one_hot_encode(self, char: str) -> np.ndarray:
        """
        One-hot encode a character.
        
        :param char: The character to encode.
        :return: A one-hot encoded vector.
        """
        vector = np.zeros((self.char_size, 1))
        vector[self.char_to_idx[char]] = 1
        return vector

    def initialize_weights(self) -> None:
        """
        Initialize the weights and biases for the LSTM network.
        """
        self.wf = self.init_weights(self.char_size + self.hidden_dim, 
                                    self.hidden_dim)
        self.bf = np.zeros((self.hidden_dim, 1))

        self.wi = self.init_weights(self.char_size + self.hidden_dim, 
                                    self.hidden_dim)
        self.bi = np.zeros((self.hidden_dim, 1))

        self.wc = self.init_weights(self.char_size + self.hidden_dim, 
                                    self.hidden_dim)
        self.bc = np.zeros((self.hidden_dim, 1))

        self.wo = self.init_weights(self.char_size + self.hidden_dim, 
                                    self.hidden_dim)
        self.bo = np.zeros((self.hidden_dim, 1))

        self.wy = self.init_weights(self.hidden_dim, self.char_size)
        self.by = np.zeros((self.char_size, 1))

    def init_weights(self, input_dim: int, output_dim: int) -> np.ndarray:
        """
        Initialize weights with random values.
        
        :param input_dim: The input dimension.
        :param output_dim: The output dimension.
        :return: A matrix of initialized weights.
        """
        return np.random.uniform(-1, 1, (output_dim, input_dim)) * \
               np.sqrt(6 / (input_dim + output_dim))

    ##### Activation Functions #####
    def sigmoid(self, x: np.ndarray, derivative: bool = False) -> np.ndarray:
        """
        Sigmoid activation function.
        
        :param x: The input array.
        :param derivative: Whether to compute the derivative.
        :return: The sigmoid activation or its derivative.
        """
        if derivative:
            return x * (1 - x)
        return 1 / (1 + np.exp(-x))

    def tanh(self, x: np.ndarray, derivative: bool = False) -> np.ndarray:
        """
        Tanh activation function.
        
        :param x: The input array.
        :param derivative: Whether to compute the derivative.
        :return: The tanh activation or its derivative.
        """
        if derivative:
            return 1 - x ** 2
        return np.tanh(x)

    def softmax(self, x: np.ndarray) -> np.ndarray:
        """
        Softmax activation function.
        
        :param x: The input array.
        :return: The softmax activation.
        """
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum(axis=0)

    ##### LSTM Network Methods #####
    def reset(self) -> None:
        """
        Reset the LSTM network states.
        """
        self.concat_inputs = {}

        self.hidden_states = {-1: np.zeros((self.hidden_dim, 1))}
        self.cell_states = {-1: np.zeros((self.hidden_dim, 1))}

        self.activation_outputs = {}
        self.candidate_gates = {}
        self.output_gates = {}
        self.forget_gates = {}
        self.input_gates = {}
        self.outputs = {}

    def forward(self, inputs: list) -> list:
        """
        Perform forward propagation through the LSTM network.
        
        :param inputs: The input data as a list of one-hot encoded vectors.
        :return: The outputs of the network.
        """
        self.reset()

        outputs = []
        for t in range(len(inputs)):
            self.concat_inputs[t] = np.concatenate(
                (self.hidden_states[t - 1], inputs[t]))

            self.forget_gates[t] = self.sigmoid(np.dot(self.wf, 
                                           self.concat_inputs[t]) + self.bf)
            self.input_gates[t] = self.sigmoid(np.dot(self.wi, 
                                          self.concat_inputs[t]) + self.bi)
            self.candidate_gates[t] = self.tanh(np.dot(self.wc, 
                                          self.concat_inputs[t]) + self.bc)
            self.output_gates[t] = self.sigmoid(np.dot(self.wo, 
                                          self.concat_inputs[t]) + self.bo)

            self.cell_states[t] = self.forget_gates[t] * self.cell_states[t - 1] + \
                                  self.input_gates[t] * self.candidate_gates[t]
            self.hidden_states[t] = self.output_gates[t] * \
                                    self.tanh(self.cell_states[t])

            outputs.append(np.dot(self.wy, self.hidden_states[t]) + self.by)

        return outputs

    def backward(self, errors: list, inputs: list) -> None:
        """
        Perform backpropagation through time to compute gradients and update weights.
        
        :param errors: The errors at each time step.
        :param inputs: The input data as a list of one-hot encoded vectors.
        """
        d_wf, d_bf = 0, 0
        d_wi, d_bi = 0, 0
        d_wc, d_bc = 0, 0
        d_wo, d_bo = 0, 0
        d_wy, d_by = 0, 0

        dh_next, dc_next = np.zeros_like(self.hidden_states[0]), \
                           np.zeros_like(self.cell_states[0])
        for t in reversed(range(len(inputs))):
            error = errors[t]

            # Final Gate Weights and Biases Errors
            d_wy += np.dot(error, self.hidden_states[t].T)
            d_by += error

            # Hidden State Error
            d_hs = np.dot(self.wy.T, error) + dh_next

            # Output Gate Weights and Biases Errors
            d_o = self.tanh(self.cell_states[t]) * d_hs * \
                  self.sigmoid(self.output_gates[t], derivative=True)
            d_wo += np.dot(d_o, inputs[t].T)
            d_bo += d_o

            # Cell State Error
            d_cs = self.tanh(self.tanh(self.cell_states[t]), 
                             derivative=True) * self.output_gates[t] * d_hs + dc_next

            # Forget Gate Weights and Biases Errors
            d_f = d_cs * self.cell_states[t - 1] * \
                  self.sigmoid(self.forget_gates[t], derivative=True)
            d_wf += np.dot(d_f, inputs[t].T)
            d_bf += d_f

            # Input Gate Weights and Biases Errors
            d_i = d_cs * self.candidate_gates[t] * \
                  self.sigmoid(self.input_gates[t], derivative=True)
            d_wi += np.dot(d_i, inputs[t].T)
            d_bi += d_i

            # Candidate Gate Weights and Biases Errors
            d_c = d_cs * self.input_gates[t] * self.tanh(self.candidate_gates[t], 
                              derivative=True)
            d_wc += np.dot(d_c, inputs[t].T)
            d_bc += d_c

            # Update the next hidden and cell state errors
            dh_next = np.dot(self.wf.T, d_f) + np.dot(self.wi.T, d_i) + \
                      np.dot(self.wo.T, d_o) + np.dot(self.wc.T, d_c)
            dc_next = d_cs * self.forget_gates[t]

        # Apply gradients to weights and biases
        for param, grad in zip([self.wf, self.wi, self.wc, self.wo, self.wy],
                               [d_wf, d_wi, d_wc, d_wo, d_wy]):
            param -= self.lr * grad

        for param, grad in zip([self.bf, self.bi, self.bc, self.bo, self.by],
                               [d_bf, d_bi, d_bc, d_bo, d_by]):
            param -= self.lr * grad

    def train(self) -> None:
        """
        Train the LSTM network on the input data for a specified number of epochs.
        """
        for epoch in tqdm(range(self.epochs)):
            inputs = [self.one_hot_encode(char) for char in self.train_X]
            targets = [self.one_hot_encode(char) for char in self.train_y]

            # Forward pass
            outputs = self.forward(inputs)

            # Compute error at each time step
            errors = [output - target for output, target in zip(outputs, targets)]

            # Backward pass and weight updates
            self.backward(errors, inputs)

    def predict(self, inputs: list) -> str:
        """
        Predict the next character in the sequence.
        
        :param inputs: The input data as a list of one-hot encoded vectors.
        :return: The predicted character.
        """
        output = self.forward(inputs)[-1]
        return self.idx_to_char[np.argmax(self.softmax(output))]

    def test(self) -> None:
        """
        Test the LSTM network on the input data and compute accuracy.
        """
        inputs = [self.one_hot_encode(char) for char in self.train_X]
        correct_predictions = sum(self.idx_to_char[np.argmax(self.softmax(output))] == target
                                  for output, target in zip(self.forward(inputs), self.train_y))

        accuracy = (correct_predictions / len(self.train_y)) * 100
        print(f'Accuracy: {accuracy:.2f}%')


if __name__ == "__main__":
    # Define the input data and hyperparameters
    data = "LSTM Neural Networks are designed to handle sequences of data."
    hidden_dim = 50
    epochs = 1000
    lr = 0.01

    # Initialize and train the LSTM network
    lstm = LSTM(data, hidden_dim, epochs, lr)
    lstm.train()

    # Test the LSTM network and compute accuracy
    lstm.test()