Simple RNN Example in NumPy (From Scratch)
Implementing an RNN from scratch in NumPy is a great way to understand the mechanics: the recurrent hidden state, forward propagation, and backpropagation through time (BPTT). Below is a basic example for binary classification on random sequences (similar to the previous PyTorch version). It uses univariate time series data (input_size=1), hidden_size=32, and sequence_length=10.
This code includes:
- Data generation.
- RNN forward pass (with tanh activation for hidden states).
- Backward pass (BPTT with simple gradient descent).
- Training loop.
Note: This is a vanilla RNN, so gradients can vanish or explode on long sequences. For production use, consider gradient clipping (a minimal sketch appears after the listing) or LSTM-style gating.
import numpy as np
# Hyperparameters
input_size = 1 # Feature size (univariate)
hidden_size = 32 # Hidden state size
output_size = 1 # Binary output
sequence_length = 10 # Length of each sequence
num_samples = 1000
num_epochs = 10
learning_rate = 0.01
# Generate dummy data: sequences of length 10, labels based on sum (binary)
def generate_data(num_samples):
    X = np.random.randn(num_samples, sequence_length, input_size)
    y = (np.sum(X, axis=1) > 0).astype(float).reshape(num_samples, 1)
    return X, y
# Simple RNN Class (from scratch)
class SimpleRNN:
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        # Initialize weights (Xavier-style init)
        self.Wxh = np.random.randn(hidden_size, input_size) * np.sqrt(1. / input_size)
        self.Whh = np.random.randn(hidden_size, hidden_size) * np.sqrt(1. / hidden_size)
        self.Why = np.random.randn(output_size, hidden_size) * np.sqrt(1. / hidden_size)
        # Biases
        self.bh = np.zeros((hidden_size, 1))
        self.by = np.zeros((output_size, 1))
    def forward(self, x):
        # x shape: (sequence_length, input_size, 1) for a single sample
        self.x = x  # Store inputs for backprop
        self.h = np.zeros((self.hidden_size, 1))  # Initial hidden state
        # Hidden states for every timestep (including the initial one)
        self.hs = np.zeros((self.hidden_size, sequence_length + 1))
        self.hs[:, 0] = self.h.flatten()
        # Forward through time
        for t in range(sequence_length):
            self.h = np.tanh(np.dot(self.Wxh, x[t]) + np.dot(self.Whh, self.h) + self.bh)
            self.hs[:, t + 1] = self.h.flatten()
        # Output logit from the last hidden state
        self.y_pred = np.dot(self.Why, self.h) + self.by
        self.probs = self.sigmoid(self.y_pred)  # Store probability for backprop
        return self.probs

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-np.clip(z, -250, 250)))  # Clip for numerical stability
    def backward(self, y_true):
        # Backprop through time (BPTT)
        # BCE loss + sigmoid output: the gradient w.r.t. the logit is probs - y_true
        dy = self.probs - y_true
        dWhy = np.dot(dy, self.hs[:, -1:].T)
        dby = dy
        # Gradients for input and recurrent weights
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dbh = np.zeros_like(self.bh)
        # Gradient flowing into the final hidden state from the output layer
        dh_next = np.dot(self.Why.T, dy)
        for t in reversed(range(sequence_length)):
            dh = dh_next
            h_t = self.hs[:, t + 1:t + 2]   # Hidden state at timestep t
            dh_raw = (1 - h_t ** 2) * dh    # Tanh derivative
            dWxh += np.dot(dh_raw, self.x[t].T)
            dWhh += np.dot(dh_raw, self.hs[:, t:t + 1].T)  # Uses h_{t-1}
            dbh += dh_raw
            dh_next = np.dot(self.Whh.T, dh_raw)  # Pass gradient to the previous timestep
        # Update weights (plain gradient descent)
        self.Wxh -= learning_rate * dWxh
        self.Whh -= learning_rate * dWhh
        self.Why -= learning_rate * dWhy
        self.bh -= learning_rate * dbh
        self.by -= learning_rate * dby

    def sigmoid_deriv(self, z):
        # Kept for reference; backward() folds this into probs - y_true via the BCE/sigmoid cancellation
        s = self.sigmoid(z)
        return s * (1 - s)
# Training
X, y = generate_data(num_samples)
# Reshape for single sample processing (batch_size=1 for simplicity)
X = X.reshape(num_samples, sequence_length, input_size, 1)  # Each timestep becomes a column vector of shape (input_size, 1)
y = y.reshape(num_samples, 1, 1)
model = SimpleRNN(input_size, hidden_size, output_size)
for epoch in range(num_epochs):
    total_loss = 0
    for i in range(num_samples):
        pred = model.forward(X[i])
        loss = -np.mean(y[i] * np.log(pred + 1e-8) + (1 - y[i]) * np.log(1 - pred + 1e-8))  # BCE
        total_loss += loss
        model.backward(y[i])
    avg_loss = total_loss / num_samples
    print(f'Epoch [{epoch+1}/{num_epochs}], Avg Loss: {avg_loss:.4f}')
# Inference example
test_seq = np.random.randn(sequence_length, input_size, 1)
prediction = model.forward(test_seq)
print(f'Prediction: {prediction.flatten()[0]:.4f}')
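As noted above, vanilla RNN gradients can explode on longer sequences. One common mitigation is to clip the gradients before the weight updates; below is a minimal sketch that could be dropped into backward() just before the update lines (the 5.0 threshold is an arbitrary choice for illustration, not something tuned for this task):

# Hypothetical addition inside backward(), right before the weight updates:
for grad in (dWxh, dWhh, dWhy, dbh, dby):
    np.clip(grad, -5.0, 5.0, out=grad)  # Element-wise clip to [-5, 5]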
How it Works:
- Data Generation: Random sequences with binary labels based on sum.
- Forward Pass: Processes the sequence timestep by timestep, updating the hidden state as h_t = tanh(Wxh * x_t + Whh * h_{t-1} + bh). The output uses the final h with a sigmoid.
- Backward Pass: BPTT propagates the output gradient back through every timestep, accumulating gradients for Wxh, Whh, and bh, and updates the weights via simple gradient descent.
- Training: Loops over samples, computes BCE loss, and updates.
- Output: Sigmoid gives a probability between 0 and 1; a quick accuracy check is sketched below.
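To see what that probability means in practice, here is a small evaluation sketch. It is not part of the original listing; it assumes the trained model and generate_data from above and simply thresholds the output at 0.5:

# Hypothetical accuracy check on freshly generated sequences
X_test, y_test = generate_data(200)
X_test = X_test.reshape(200, sequence_length, input_size, 1)
correct = sum(
    int((model.forward(X_test[i])[0, 0] > 0.5) == bool(y_test[i, 0]))
    for i in range(200)
)
print(f'Test accuracy: {correct / 200:.2%}')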
This runs in roughly 10-20 seconds on a standard machine with these parameters. Scale up num_samples or num_epochs to see clearer learning. For mini-batch training, vectorize the per-sample loops, as sketched below.
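As a rough illustration of what "vectorize the loops" means, here is one possible batched forward pass written as a standalone helper (forward_batch is not part of the class above; it assumes inputs shaped (batch, sequence_length, input_size)):

def forward_batch(model, X_batch):
    # X_batch: (batch, sequence_length, input_size); all samples advance one timestep together
    batch = X_batch.shape[0]
    h = np.zeros((model.hidden_size, batch))
    for t in range(X_batch.shape[1]):
        x_t = X_batch[:, t, :].T                      # (input_size, batch)
        h = np.tanh(model.Wxh @ x_t + model.Whh @ h + model.bh)
    logits = model.Why @ h + model.by                 # (output_size, batch)
    return model.sigmoid(logits).T                    # (batch, output_size) probabilities

The per-timestep loop remains, but every sample in the batch is processed with a single matrix multiply per step, which is usually much faster than the per-sample Python loop.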
For deeper dives:
- Understanding LSTM Networks (Colah's Blog)
- NumPy RNN Tutorial