GPT From Scratch - A Simple Primer¶

Hi! I am Srihari Unnikrishnan (@pythongiant), and this is a small primer on transformers, encoder–decoder transformers, and generating text with a decoder-only transformer.

Send me an email at srihari[dot]unnikrishnan[at]gmail[dot]com if you have any questions or any suggestions/recommendations!
I would be more than glad to help you out.

Encoder - Decoder Transformer¶

An Encoder–Decoder Transformer is the original Transformer architecture introduced in Attention Is All You Need (Vaswani et al., 2017). It is designed to model conditional sequence generation.

Why encoder–decoder lost dominance¶

Encoder–decoder models were very strong around 2018–2020. Decoder-only models won out because they:

  • Scale better (simpler architecture, fewer attention paths)

  • Have perfect training–inference alignment

  • Can absorb conditional tasks via prompting

  • Are easier to use as general-purpose models

  • Enable efficient KV caching during generation

In practice, decoder-only models learned to simulate encoder–decoder behavior by treating the “input” as part of the prefix.

Architecture¶

(Architecture diagram of the original encoder–decoder Transformer.)

Positional Embeddings¶

Positional embeddings add explicit numerical information about the order (position) of elements in a sequence to their vector representations. This is necessary because Transformer models process all tokens in parallel using self-attention and therefore have no inherent notion of sequence order. Without positional information, a Transformer would treat a sentence as a set of tokens rather than an ordered sequence.

To solve this, each position in a sequence (e.g., 1st word, 2nd word, 3rd word, etc.) is assigned a position-specific vector. This positional vector is added to the token’s embedding before being passed into the model. As a result, the final input representation contains both:

  • What the token is (semantic content from the word embedding), and

  • Where the token appears (its position in the sequence).

This allows the model to distinguish between sequences with the same words but different orderings, such as “dog bites man” versus “man bites dog”, where meaning critically depends on position.

The example below implements fixed, absolute sinusoidal positional embeddings as introduced in Attention Is All You Need.

# Position Encoding
import torch
import torch.nn as nn
import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()  # Initialize nn.Module internals

        # Create a (max_len x d_model) matrix to store positional encodings
        pe = torch.zeros(max_len, d_model)

        # Create position indices: [0, 1, 2, ..., max_len-1]
        # Shape: (max_len, 1)
        position = torch.arange(0, max_len).unsqueeze(1)

        # Compute the frequency scaling term for each even dimension
        # Shape: (d_model / 2,)
        #
        # Each pair of embedding dimensions (2i, 2i+1) shares a unique frequency.
        # These frequencies are exponentially spaced so that:
        #   - Early dimensions oscillate quickly (high-frequency signals),
        #     capturing fine-grained, local positional differences.
        #   - Later dimensions oscillate slowly (low-frequency signals),
        #     capturing long-range, global positional structure.
        #
        # This multi-scale design allows the model to represent both short- and
        # long-distance relationships between tokens.
        #
        # The exponential scaling (10000^(-2i / d_model)) ensures:
        #   - Positional encodings remain unique across long sequences
        #   - Relative position information can be recovered using linear operations,
        #     which is critical for attention mechanisms.
        #
        # Using fixed (non-learned) frequencies avoids overfitting to training
        # sequence lengths and enables generalization to longer sequences.

        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )

        # Apply sine to even embedding dimensions (0, 2, 4, ...)
        # Broadcasting: (max_len, 1) * (d_model/2,) -> (max_len, d_model/2)
        pe[:, 0::2] = torch.sin(position * div_term)

        # Apply cosine to odd embedding dimensions (1, 3, 5, ...)
        pe[:, 1::2] = torch.cos(position * div_term)

        # Note: the encoding depends only on a token's position, never on its
        # batch index. If positional encodings differed per batch element, the
        # model could pick up spurious signals such as "this sentence is in
        # batch slot 3" or "batch index correlates with label", which would
        # break generalization. Sharing one encoding per position guarantees
        # that only token order is encoded.

        # Add a batch dimension so it can be broadcast across batches
        # Final shape: (1, max_len, d_model)
        pe = pe.unsqueeze(0)

        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        Args:
            x: Input embeddings of shape (batch_size, seq_len, d_model)

        Returns:
            Embeddings with positional information added
        """
        # Slice positional encodings to match sequence length
        # Broadcasting adds the same positional encoding to each batch element
        return x + self.pe[:, :x.size(1)]

Importantly, positional embeddings depend only on the token’s position within a sequence, not on which sequence in the batch it belongs to. Therefore, the same positional embedding for position t is shared across all sequences in a batch and is broadcast during computation. This ensures that positional information encodes relative and absolute order, rather than introducing spurious batch-specific signals.

There are two types of positional embeddings:

  • Fixed (sinusoidal), where positions are encoded using sine and cosine functions of different frequencies, enabling the model to generalize to longer sequences and infer relative distances between tokens.

  • Learned, where positional vectors are trained parameters similar to word embeddings.

In both cases, positional embeddings are combined with token embeddings via element-wise addition, preserving dimensionality while enriching each token representation with sequence order information. This simple design enables Transformers to model syntax, dependencies, and context over sequences effectively, despite their parallel processing nature.
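A quick sanity check of the PositionalEncoding module above (random embeddings and arbitrary sizes, for illustration only): the positional signal is added without changing the tensor's shape, and the same offset is applied to every sequence in the batch.

# Sanity check: arbitrary sizes, random stand-ins for token embeddings
d_model, seq_len, batch_size = 16, 10, 2

pos_enc = PositionalEncoding(d_model, max_len=50)
tok_emb = torch.randn(batch_size, seq_len, d_model)   # stand-in for token embeddings

out = pos_enc(tok_emb)
print(out.shape)  # torch.Size([2, 10, 16]) -- dimensionality is preserved

# The offset added at each position is identical for every batch element
print(torch.allclose(out[0] - tok_emb[0], out[1] - tok_emb[1]))  # True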

Scaled Dot-Product Attention¶

Scaled dot-product attention is the core mechanism that allows Transformer models to selectively focus on the most relevant parts of a sequence when processing each token. Instead of treating all tokens equally, attention computes how strongly one token should “attend” to every other token based on learned representations.

Each token is represented using three vectors:

  • Query (Q): what this token is looking for
  • Key (K): what each token offers
  • Value (V): the information each token contains

Attention works by comparing each query with all keys using a dot product, producing a set of similarity scores that measure relevance. A higher score means the key token is more relevant to the query token. These scores are then normalized into a probability distribution and used to compute a weighted sum of the value vectors, producing a context-aware representation of each token.

class ScaledDotProductAttention(nn.Module):
    def __init__(self, d_k):
        super().__init__()

        # Scaling factor = sqrt(d_k)
        #
        # The dot product Q · K grows in magnitude with the dimensionality d_k.
        # Without scaling, large dot products would push the softmax into
        # extremely peaked distributions, causing:
        #   - Vanishing gradients
        #   - Overconfident, brittle attention weights
        #
        # Dividing by sqrt(d_k) keeps the variance of the scores roughly constant,
        # stabilizing training and ensuring smoother attention distributions.
        self.scale = math.sqrt(d_k)

    def forward(self, Q, K, V, mask=None):
        """
        Args:
            Q: Queries of shape (..., seq_len_q, d_k)
            K: Keys of shape (..., seq_len_k, d_k)
            V: Values of shape (..., seq_len_k, d_v)
            mask: Optional attention mask (broadcastable to scores shape)
                  Used to block padding tokens or future tokens (causal masking)

        Returns:
            output: Attention-weighted values
            attn: Attention weight matrix
        """

        # Compute raw attention scores via dot product:
        #   score_{i,j} = Q_i · K_j
        #
        # K is transposed so that:
        #   (..., seq_len_q, d_k) @ (..., d_k, seq_len_k)
        # → (..., seq_len_q, seq_len_k)
        #
        # Each score measures how much token i should attend to token j.
        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        # Apply mask (if provided) to prevent attention to certain positions:
        #   - Padding tokens (in encoder attention)
        #   - Future tokens (in decoder self-attention)
        #
        # Masked positions are set to a large negative value so that
        # softmax assigns them near-zero probability.
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        # Normalize scores into a probability distribution using softmax.
        # For each query token, attention weights over all key positions sum to 1.
        attn = torch.softmax(scores, dim=-1)

        # Compute the final attention output as a weighted sum of values:
        #   output_i = Σ_j attn_{i,j} · V_j
        #
        # This mixes information from all tokens, weighted by relevance.
        return torch.matmul(attn, V), attn

A critical design choice is the scaling factor. As the dimensionality of the key vectors increases, the magnitude of dot products grows, which would cause the softmax function to become overly peaked. This would make the model excessively confident in a small number of tokens and lead to vanishing gradients during training. Dividing by sqrt(d_k) stabilizes the distribution of attention scores, ensuring smooth gradients and more balanced attention across tokens.

Another key design element is masking, which allows the model to control where attention is permitted. Masks prevent attention to padding tokens (which carry no information) and, in autoregressive settings, prevent tokens from attending to future positions. This enforces the correct information flow while keeping the attention mechanism fully parallelizable.

By combining similarity-based scoring, variance-stabilizing scaling, and flexible masking, scaled dot-product attention enables Transformers to model long-range dependencies, contextual meaning, and structured information flow efficiently. Importantly, this mechanism is fully differentiable, parallelizable, and independent of sequence length assumptions, making it a foundational building block of modern sequence models.
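As a quick shape check of the module above (random tensors, arbitrary sizes chosen only for illustration):

# Toy shape check for ScaledDotProductAttention
attn_layer = ScaledDotProductAttention(d_k=8)

Q = torch.randn(2, 5, 8)    # (batch, seq_len_q, d_k)
K = torch.randn(2, 7, 8)    # (batch, seq_len_k, d_k)
V = torch.randn(2, 7, 16)   # (batch, seq_len_k, d_v)

out, weights = attn_layer(Q, K, V)
print(out.shape)            # torch.Size([2, 5, 16])
print(weights.shape)        # torch.Size([2, 5, 7])
print(weights.sum(dim=-1))  # every row sums to 1 (softmax over keys)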

MultiHeadAttention¶

Multi-head attention extends scaled dot-product attention by allowing the model to attend to different types of relationships simultaneously. Instead of computing a single attention distribution over the entire embedding space, the model splits the representation into multiple lower-dimensional subspaces called heads and performs attention independently in each one.

Each head learns to focus on a different aspect of the sequence. For example, one head may specialize in short-range syntactic relationships (such as subject–verb agreement), while another may focus on long-range semantic dependencies or positional structure. This decomposition allows the model to capture richer and more diverse patterns than a single attention mechanism could.

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()

        # d_model must be divisible by the number of heads
        # because we split the embedding space evenly across heads.
        # Each head operates on a lower-dimensional subspace.
        assert d_model % n_heads == 0

        # Dimensionality per head (d_k)
        # Total dimension is preserved when heads are concatenated back.
        self.d_k = d_model // n_heads
        self.n_heads = n_heads

        # Linear projections for Queries, Keys, and Values
        #
        # These allow the model to learn different representations
        # of the same input depending on how it is being used:
        #   - As a query (what am I looking for?)
        #   - As a key   (what do I offer?)
        #   - As a value (what information do I carry?)
        #
        # Each projects from d_model → d_model, after which we split
        # into multiple heads.
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        # Output projection
        #
        # After attention is computed independently in each head,
        # their outputs are concatenated and mixed using this layer.
        # This allows information from different heads to interact.
        self.W_o = nn.Linear(d_model, d_model)

        # Scaled dot-product attention applied independently per head
        self.attn = ScaledDotProductAttention(self.d_k)

    def forward(self, Q, K, V, mask=None):
        """
        Args:
            Q: Query tensor of shape (batch_size, seq_len_q, d_model)
            K, V: Key/value tensors of shape (batch_size, seq_len_k, d_model)
                  (seq_len_k may differ from seq_len_q, e.g. in cross-attention)
            mask: Optional attention mask

        Returns:
            Output tensor of shape (batch_size, seq_len, d_model)
        """

        B, T_q, _ = Q.shape
        T_k = K.size(1)  # key/value length may differ from T_q (cross-attention)

        # Project inputs into Q, K, V spaces
        # Shape after projection: (B, T, d_model)
        #
        # Then reshape to split into multiple heads:
        #   (B, T, n_heads, d_k)
        #
        # Finally transpose so that heads become a separate dimension:
        #   (B, n_heads, T, d_k)
        #
        # This allows attention to be computed independently per head.
        Q = self.W_q(Q).view(B, T_q, self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(B, T_k, self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(B, T_k, self.n_heads, self.d_k).transpose(1, 2)

        # Apply scaled dot-product attention to each head in parallel
        # Output shape: (B, n_heads, T_q, d_k)
        x, _ = self.attn(Q, K, V, mask)

        # Recombine heads:
        #   (B, n_heads, T_q, d_k) → (B, T_q, n_heads * d_k)
        #
        # .contiguous() ensures memory layout is correct before reshaping
        x = x.transpose(1, 2).contiguous().view(B, T_q, -1)

        # Final linear projection mixes information across heads
        return self.W_o(x)

To enable this, the input embeddings are first projected into separate query, key, and value spaces using learned linear transformations. These projections are then reshaped so that each head operates on a reduced dimensionality. Attention is computed independently within each head using scaled dot-product attention, producing multiple parallel context representations.

After attention is applied, the outputs of all heads are concatenated and passed through a final linear projection. This step is crucial: it allows information from different heads to be combined and re-integrated into a single representation, enabling interaction between the different relational perspectives learned by each head.

A key design choice is that splitting into multiple heads does not increase computational cost relative to a single large attention operation. The total dimensionality remains constant; it is simply redistributed across heads. This makes multi-head attention both expressive and efficient.

Overall, multi-head attention gives the Transformer the ability to look at the same sequence through multiple lenses at once, making it far more powerful at modeling complex structure, long-range dependencies, and contextual meaning.
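For self-attention, the same tensor is passed as Q, K, and V. A minimal usage sketch with arbitrary sizes:

# Self-attention usage sketch (arbitrary sizes)
mha = MultiHeadAttention(d_model=64, n_heads=8)

x = torch.randn(2, 10, 64)  # (batch, seq_len, d_model)
out = mha(x, x, x)          # Q = K = V = x  ->  self-attention
print(out.shape)            # torch.Size([2, 10, 64])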

FeedForward Network (Position-wise FFN)¶

1. Why do we need a FeedForward network at all?¶

Self-attention mixes information across tokens, but it is largely a linear operation with respect to feature dimensions. Without an additional non-linear component, stacking attention layers would still result in a model that is limited in its ability to learn complex transformations.

This motivation is stated explicitly in Vaswani et al., 2017 (Attention Is All You Need), where each attention sub-layer is followed by a position-wise fully connected feed-forward network to introduce non-linearity and increase representational power.

The FeedForward layer:

  • Operates independently on each token (position-wise)
  • Mixes information across features, not across time
  • Acts like a learned feature transformer at each position

You can think of attention as answering “where should I look?” and the feedforward network as answering “how should I process what I found?”

This division of labor—global interaction via attention, local transformation via FFN—is a core architectural principle of Transformers.

Reference: Vaswani et al., Attention Is All You Need, NeurIPS 2017

2. Why expand to d_ff and then project back?¶

This is a deliberate capacity expansion strategy, introduced in the original Transformer.

In Vaswani et al. (2017), the feedforward network is defined as:

$$\text{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2$$

where $W_1 \in \mathbb{R}^{d_{\text{model}} \times d_{\text{ff}}}$ and $W_2 \in \mathbb{R}^{d_{\text{ff}} \times d_{\text{model}}}$.
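With the original hyperparameters (d_model = 512, d_ff = 2048), each 512-dimensional token vector is expanded to 2048 dimensions, passed through a ReLU, and projected back down to 512.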

The reasoning:

  • Expanding to a higher-dimensional space (d_ff ≫ d_model) allows the model to represent richer intermediate features.
  • Projecting back to d_model keeps the interface between layers consistent, enabling deep stacking.

This mirrors a well-established pattern in deep learning:

temporary expansion → nonlinear processing → compression

Similar ideas appear in:

  • Bottleneck layers in CNNs (He et al., ResNet)
  • Inverted residuals in MobileNet
  • MLP blocks in Vision Transformers (Dosovitskiy et al., 2020)

References: Vaswani et al., 2017; Dosovitskiy et al., An Image is Worth 16×16 Words, ICLR 2021

3. Why residual connections everywhere?¶

Residual connections were introduced to address the optimization difficulty of deep networks (He et al., 2016) and were adopted wholesale in Transformers.

In the Transformer, every sub-layer (attention and feedforward) is wrapped with a residual connection:

$$x \rightarrow x + \text{sublayer}(x)$$

This ensures that:

  • The model can easily learn identity mappings
  • Gradients flow effectively through many stacked layers
  • Deeper models remain trainable and stable

Vaswani et al. explicitly note that residual connections are critical for training deep attention-based architectures.

Formally, residuals allow layers to learn incremental refinements rather than complete transformations, which significantly improves optimization.

References: He et al., Deep Residual Learning for Image Recognition, CVPR 2016; Vaswani et al., 2017

4. Why LayerNorm after each sub-layer?¶

Layer Normalization was chosen because:

  • It normalizes per token, not across the batch
  • It works well with variable sequence lengths
  • It is stable under parallel computation

In the original Transformer, LayerNorm is applied after the residual connection (post-norm):

x = LayerNorm(x + sublayer(x))

LayerNorm helps:

  • Reduce internal covariate shift
  • Prevent exploding or vanishing activations
  • Stabilize training across layers

Later work (e.g., Pre-LN Transformers) showed that moving LayerNorm before the sub-layer improves gradient flow in very deep models, but the core motivation remains unchanged.

References: Ba et al., Layer Normalization, arXiv 2016; Vaswani et al., 2017; Xiong et al., On Layer Normalization in Transformers, ICML 2020
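To make the post-norm vs. pre-norm distinction concrete, here is a minimal sketch of the two sub-layer wrappers (sublayer is a placeholder for either attention or the FFN; this is illustration only, not part of the model code below):

# Illustration only: `sublayer` stands in for attention or the feedforward network
def post_norm_sublayer(x, sublayer, norm, dropout):
    # Original Transformer (post-LN): normalize AFTER the residual addition
    return norm(x + dropout(sublayer(x)))

def pre_norm_sublayer(x, sublayer, norm, dropout):
    # Pre-LN variant (used in the GPT-style blocks later in this primer):
    # normalize BEFORE the sublayer, keep the residual path as a clean identity
    return x + dropout(sublayer(norm(x)))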

5. Why split attention and feedforward into two sub-layers?¶

This separation is a deliberate inductive bias introduced in the original Transformer architecture.

Each Encoder block alternates between:

  1. Context aggregation (self-attention)
  2. Feature transformation (position-wise feedforward)

This enforces a clean conceptual separation:

  • Attention decides which tokens influence each other
  • Feedforward decides how each token representation should be transformed

Stacking these blocks allows:

  • Global information to propagate through attention
  • Local, non-linear refinement at each layer

This alternating structure is one of the key reasons Transformers scale so well with depth and data.

Reference: Vaswani et al., 2017

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()

        # Position-wise FeedForward Network
        #
        # This is applied independently to each token position.
        # It does NOT mix information across time (sequence length),
        # only across feature dimensions.
        #
        # The structure expands the representation into a higher-
        # dimensional space (d_ff), applies a non-linearity,
        # and then projects it back to d_model.
        #
        # This allows the model to:
        #   - Introduce non-linear transformations
        #   - Increase representational capacity
        #   - Learn complex feature interactions per token
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff),  # Expansion (feature mixing)
            nn.ReLU(),                 # Non-linearity
            nn.Linear(d_ff, d_model)   # Compression back to model dimension
        )

    def forward(self, x):
        # x shape: (batch_size, seq_len, d_model)
        # FeedForward is applied identically to every token position
        return self.net(x)


class EncoderBlock(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()

        # Multi-head self-attention:
        # Allows each token to attend to all other tokens in the sequence
        # and build context-aware representations.
        self.attn = MultiHeadAttention(d_model, n_heads)

        # Position-wise feedforward network:
        # Adds non-linear transformation capacity after attention.
        self.ff = FeedForward(d_model, d_ff)

        # Layer normalization layers
        #
        # These stabilize training by normalizing feature distributions
        # and help gradient flow in deep transformer stacks.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout for regularization
        # Prevents overfitting and co-adaptation of neurons.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Args:
            x: Input tensor of shape (batch_size, seq_len, d_model)
            mask: Optional attention mask

        Returns:
            Output tensor of shape (batch_size, seq_len, d_model)
        """

        # --- Self-Attention Sub-layer ---
        #
        # Residual connection:
        #   Preserves the original representation and improves gradient flow.
        #
        # Dropout:
        #   Regularizes attention outputs.
        #
        # LayerNorm (post-norm formulation here):
        #   Stabilizes activations and training dynamics.
        x = self.norm1(
            x + self.dropout(self.attn(x, x, x, mask))
        )

        # --- FeedForward Sub-layer ---
        #
        # Again, we apply:
        #   - Position-wise feedforward transformation
        #   - Dropout for regularization
        #   - Residual connection to preserve information
        #   - LayerNorm for stability
        x = self.norm2(
            x + self.dropout(self.ff(x))
        )

        return x

The Transformer encoder block combines ideas from residual learning, normalization, and MLP expansion into a modular design where attention handles interaction and feedforward layers handle transformation. This structure, first formalized in Attention Is All You Need, remains largely unchanged across modern large language models.
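A quick shape check of the encoder block (arbitrary sizes, random inputs):

# Shape check for EncoderBlock (arbitrary sizes)
enc_block = EncoderBlock(d_model=64, n_heads=8, d_ff=256)

x = torch.randn(2, 10, 64)  # (batch, src_len, d_model)
out = enc_block(x)          # no mask: every token may attend to every other token
print(out.shape)            # torch.Size([2, 10, 64])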

Decoder Block — Why it exists and how it works¶

1. Why does the decoder have masked self-attention?¶

Unlike the encoder, the decoder is used for autoregressive generation. When predicting token t, the model must not access tokens t+1, t+2, ….

This constraint is enforced using a causal (look-ahead) mask in self-attention, as introduced in Vaswani et al., 2017. The mask ensures that each position can only attend to earlier positions, preserving the correct probabilistic factorization.

Without masked self-attention, the decoder would leak future information and collapse training.

2. Why do we need cross-attention at all?¶

Cross-attention is what allows the decoder to condition on the source sequence.

  • Queries come from the decoder (current generation state)
  • Keys and values come from the encoder (encoded source information)

This mechanism allows the decoder to dynamically decide:

“Which parts of the input sequence are relevant right now?”

This design replaces fixed alignment mechanisms used in earlier sequence-to-sequence models (e.g., RNN encoder–decoders) with a fully differentiable, parallelizable alternative.

References: Bahdanau et al., Neural Machine Translation by Jointly Learning to Align and Translate, ICLR 2015; Vaswani et al., 2017

3. Why is self-attention applied before cross-attention?¶

This ordering is intentional and appears in the original Transformer.

The decoder first:

  1. Builds a representation of what has been generated so far (self-attention)
  2. Then aligns that representation with the source sequence (cross-attention)

Conceptually:

  • Self-attention answers: “What have I already said?”
  • Cross-attention answers: “What does the input tell me to say next?”

Reversing this order degrades performance, as the decoder would try to attend to the source without a coherent internal state.

4. Why does the decoder also need a FeedForward network?¶

Just like in the encoder, attention alone is insufficient.

  • Attention mixes information across tokens
  • FeedForward networks introduce non-linearity and feature transformation

In the decoder, the FFN refines the token representation after both:

  • past context (self-attention), and
  • source context (cross-attention)

This ensures expressive power at each decoding step.

5. Why three residual + normalization blocks?¶

Each sub-layer (self-attn, cross-attn, FFN) performs a fundamentally different operation and has different activation statistics. Wrapping each with its own residual connection and LayerNorm:

  • Stabilizes training
  • Preserves information flow
  • Allows deep stacking of decoder blocks

This mirrors the encoder design but extends it to handle conditional generation.

class DecoderBlock(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()

        # Masked multi-head self-attention
        #
        # This allows each target token to attend only to:
        #   - itself
        #   - previous target tokens
        #
        # The causal (look-ahead) mask ensures autoregressive behavior:
        # the model cannot "see the future" during generation.
        self.self_attn = MultiHeadAttention(d_model, n_heads)

        # Cross-attention (encoder–decoder attention)
        #
        # Queries come from the decoder (what do I need?),
        # Keys and Values come from the encoder (what information is available?).
        #
        # This is how the decoder conditions its predictions on the source sequence.
        self.cross_attn = MultiHeadAttention(d_model, n_heads)

        # Position-wise feedforward network
        #
        # Applies non-linear transformation independently to each target token
        # after contextual information has been integrated.
        self.ff = FeedForward(d_model, d_ff)

        # LayerNorm layers for each sub-layer
        #
        # Separate norms are used because each sub-layer has
        # different statistical properties.
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, tgt_mask=None, src_mask=None):
        """
        Args:
            x: Decoder input embeddings (batch_size, tgt_len, d_model)
            enc_out: Encoder outputs (batch_size, src_len, d_model)
            tgt_mask: Causal mask to block future target tokens
            src_mask: Mask to block padding in the source sequence

        Returns:
            Decoder output representations
        """

        # Allows each target token to attend only to earlier target tokens.
        # This enforces the autoregressive property required for generation.
        #
        # Residual connection + dropout + LayerNorm stabilize training
        # and preserve the original token representation.
        x = self.norm1(
            x + self.dropout(self.self_attn(x, x, x, tgt_mask))
        )

        # Decoder queries attend over encoder keys and values.
        # This is the information bottleneck where the decoder
        # decides which parts of the source sequence are relevant
        # for predicting the next token.
        x = self.norm2(
            x + self.dropout(self.cross_attn(x, enc_out, enc_out, src_mask))
        )

        # Applies a non-linear, position-wise transformation
        # to refine the decoder representations after all
        # contextual information has been integrated.
        x = self.norm3(
            x + self.dropout(self.ff(x))
        )

        return x

The decoder first reasons about what it has already generated, then selectively consults the source sequence, and finally refines the result through non-linear transformation — all while strictly preventing future information leakage.
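A quick shape check of the decoder block against dummy encoder outputs (arbitrary sizes; the causal mask is built inline here with torch.tril, the same construction used by the causal_mask helper defined later):

# Shape check for DecoderBlock (arbitrary sizes)
dec_block = DecoderBlock(d_model=64, n_heads=8, d_ff=256)

tgt = torch.randn(2, 6, 64)      # decoder input   (batch, tgt_len, d_model)
enc_out = torch.randn(2, 9, 64)  # encoder output  (batch, src_len, d_model)

# Causal mask over the target: position i may only attend to positions <= i
tgt_mask = torch.tril(torch.ones(6, 6)).unsqueeze(0).unsqueeze(1)  # (1, 1, 6, 6)

out = dec_block(tgt, enc_out, tgt_mask=tgt_mask)
print(out.shape)                 # torch.Size([2, 6, 64])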

Transformer — Why this architecture works¶

1. Why separate encoder and decoder?¶

The Transformer is designed for sequence-to-sequence tasks (e.g., translation).

  • The encoder reads and understands the entire input sequence.

  • The decoder generates the output sequence one token at a time, conditioned on:

    • what it has already generated, and
    • the encoder’s representation of the input.

This separation cleanly mirrors the probabilistic goal:

Generate an output sequence conditioned on an input sequence.


2. Why stack multiple layers?¶

A single attention layer can only perform one round of information exchange.

By stacking layers:

  • Early layers capture local or shallow relationships
  • Later layers build more abstract, global representations

This hierarchical refinement is analogous to depth in CNNs and deep MLPs.


3. Why positional encoding at the input?¶

Attention alone has no notion of order. Without positional encodings:

["dog", "bites", "man"]
["man", "bites", "dog"]

would look identical to the model.

Adding positional encodings at the input ensures that order information flows through every layer of the network.


4. Why does the encoder run fully in parallel?¶

Unlike RNNs:

  • There is no recurrence
  • No dependency between time steps inside the encoder

This allows:

  • Massive parallelism on GPUs/TPUs
  • Faster training
  • Better scaling to long sequences

This was one of the key motivations behind Attention Is All You Need.

5. Why project back to vocabulary with fc_out?¶

The model internally operates in a continuous vector space (d_model), but the final task is classification over discrete tokens.

The final linear layer:

  • Converts hidden states into logits
  • One logit per vocabulary token
  • Enables training with cross-entropy loss

class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab,
        tgt_vocab,
        d_model=512,
        n_heads=8,
        d_ff=2048,
        n_layers=6,
        max_len=512
    ):
        super().__init__()

        # Source and target token embeddings
        #
        # These map discrete token IDs to continuous vectors of size d_model.
        # Separate embeddings are used because source and target vocabularies
        # may differ and play different roles in the model.
        self.src_embed = nn.Embedding(src_vocab, d_model)
        self.tgt_embed = nn.Embedding(tgt_vocab, d_model)

        # Positional encoding
        #
        # Adds explicit sequence order information to token embeddings.
        # This is required because attention itself is permutation-invariant.
        self.pos = PositionalEncoding(d_model, max_len)

        # Encoder stack
        #
        # A stack of identical EncoderBlocks.
        # Each block refines representations by:
        #   1) allowing tokens to attend to each other (self-attention)
        #   2) applying non-linear, position-wise transformations (FFN)
        self.encoder = nn.ModuleList([
            EncoderBlock(d_model, n_heads, d_ff)
            for _ in range(n_layers)
        ])

        # Decoder stack
        #
        # Each DecoderBlock:
        #   1) builds an autoregressive representation of the target prefix
        #   2) attends to the encoder outputs (cross-attention)
        #   3) refines representations with a feedforward network
        self.decoder = nn.ModuleList([
            DecoderBlock(d_model, n_heads, d_ff)
            for _ in range(n_layers)
        ])

        # Final linear projection
        #
        # Maps decoder hidden states back to the target vocabulary space,
        # producing logits for next-token prediction.
        self.fc_out = nn.Linear(d_model, tgt_vocab)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """
        Args:
            src: Source token indices (batch_size, src_len)
            tgt: Target token indices (batch_size, tgt_len)
            src_mask: Mask for source padding tokens
            tgt_mask: Causal mask for target tokens

        Returns:
            Logits over target vocabulary for each target position
        """

        # --- Embedding + Positional Encoding ---
        #
        # Token embeddings provide semantic meaning.
        # Positional encodings inject order information.
        #
        # Resulting shape: (batch_size, seq_len, d_model)
        src = self.pos(self.src_embed(src))
        tgt = self.pos(self.tgt_embed(tgt))

        # --- Encoder ---
        #
        # The encoder processes the entire source sequence in parallel.
        # Its output is a contextual representation of the source,
        # which will be attended to by the decoder.
        for layer in self.encoder:
            src = layer(src, src_mask)

        # --- Decoder ---
        #
        # The decoder processes the target sequence autoregressively.
        # At each layer, it:
        #   - attends to past target tokens (masked self-attention)
        #   - attends to encoder outputs (cross-attention)
        for layer in self.decoder:
            tgt = layer(tgt, src, tgt_mask, src_mask)

        # --- Output Projection ---
        #
        # Convert decoder representations into vocabulary logits.
        # These logits are typically passed to a softmax during training
        # or used for greedy/beam search during inference.
        return self.fc_out(tgt)

The Transformer first builds a deep, contextual understanding of the input sequence, then generates the output sequence step by step—each time consulting both its past outputs and the encoded input—using only attention and feedforward layers.

Causal Mask¶

1. What problem does the causal mask solve?¶

In autoregressive decoding, the model must obey:

When predicting token t, it can only use tokens 0 … t−1.

Without a causal mask, self-attention would allow every token to see the entire sequence, including future tokens, which would:

  • Break the probabilistic factorization of sequence generation
  • Cause information leakage during training
  • Make inference behavior inconsistent with training

2. Why a lower-triangular matrix?¶

Consider a sequence of length 4:

tokens: y0 y1 y2 y3

The causal mask looks like:

1 0 0 0
1 1 0 0
1 1 1 0
1 1 1 1

Row i corresponds to query position i. Column j corresponds to key position j.

A 1 means “attention allowed”, a 0 means “blocked”.

This structure enforces:

  • Past → allowed
  • Present → allowed
  • Future → blocked

3. Why not just use zeros and ones directly?¶

The mask itself is not applied directly to the output. Instead, it is used inside attention as:

scores = scores.masked_fill(mask == 0, -∞)

This ensures that:

  • Softmax assigns zero probability to future positions
  • Gradients do not flow through masked connections

4. Why add two extra dimensions?¶

Attention scores have shape:

(batch_size, n_heads, query_len, key_len)

By returning a mask of shape:

(1, 1, size, size)

we allow PyTorch to broadcast the same causal structure across:

  • all batches
  • all attention heads

This ensures that causality is enforced globally and consistently.

5. Why is this essential even during training?¶

Even when the full target sequence is known (teacher forcing):

  • The model must behave as if the future is unknown
  • Otherwise, it learns shortcuts that do not exist at inference time

This alignment between training and inference is critical for stable generation.

def causal_mask(size):
    # Create a square matrix of ones with shape (size, size)
    # This represents all possible query–key positions.
    mask = torch.ones(size, size)

    # Keep only the lower-triangular part of the matrix.
    #
    # Positions above the diagonal correspond to "future" tokens
    # (i.e., positions j > i when predicting token i).
    #
    # torch.tril enforces the causal constraint:
    #   token i can attend to tokens {0, ..., i}
    #   but NOT to tokens {i+1, ..., size-1}
    mask = torch.tril(mask)

    # Add two singleton dimensions so the mask can be broadcast
    # across:
    #   - batch dimension
    #   - attention heads
    #
    # Final shape: (1, 1, size, size)
    # This matches attention score tensors of shape:
    #   (batch_size, n_heads, seq_len, seq_len)
    return mask.unsqueeze(0).unsqueeze(1)

The causal mask enforces the rule “no peeking into the future” by blocking attention to tokens that haven’t been generated yet.

model = Transformer(
    src_vocab=10000,
    tgt_vocab=10000,
    d_model=512,
    n_heads=8,
    n_layers=6
)

src = torch.randint(0, 10000, (2, 20))
tgt = torch.randint(0, 10000, (2, 20))

mask = causal_mask(tgt.size(1))
out = model(src, tgt, tgt_mask=mask)

print(out.shape)  # (batch, seq_len, vocab)
out

So... How Do We Go About Generating Text?¶

Transformers do not generate text; they generate next-token probability distributions. You decide how to sample from them.
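As a concrete (and deliberately simple) example, here is a minimal sampling loop. It is a sketch only: it assumes a decoder-only model whose forward(ids, mask) returns logits of shape (batch, seq_len, vocab_size), a tokenizer with encode/decode methods (one is built later in this primer), and the causal_mask helper defined above.

@torch.no_grad()
def sample(model, tokenizer, prompt, max_new_tokens=50, temperature=1.0):
    # Sketch: assumes model(ids, mask) -> (1, T, vocab_size) logits
    model.eval()
    ids = torch.tensor([tokenizer.encode(prompt)], dtype=torch.long)

    for _ in range(max_new_tokens):
        mask = causal_mask(ids.size(1))
        logits = model(ids, mask)                 # (1, T, vocab)
        logits = logits[:, -1, :] / temperature   # only the last position matters
        probs = torch.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)  # sample one token
        ids = torch.cat([ids, next_id], dim=1)

    return tokenizer.decode(ids[0].tolist())

Greedy decoding would replace the multinomial sampling with probs.argmax(dim=-1, keepdim=True); temperature, top-k, and nucleus sampling are all just different ways of turning the same next-token distribution into a choice.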

Decoder-Only vs Encoder-Decoder¶

Some LLMs are encoder–decoder, but most modern general-purpose LLMs are decoder-only. The split is very clean once you classify them by what probability they model.

You should always start by asking: what probability distribution am I trying to learn?

For TinyStories, the goal is to model P(x₁, x₂, …, x_T), which factorizes as P(x₁) · P(x₂ | x₁) · P(x₃ | x₁, x₂) · …

This is exactly an autoregressive process: each token depends only on the tokens before it.

A decoder-only transformer is built precisely to model P(x_t | x₁ … x_{t−1}) using masked self-attention, which prevents the model from seeing future tokens. The architecture directly enforces the correct causal structure.


An encoder–decoder transformer models a different distribution: P(y | x)

That only makes sense when there is a separate input sequence x, such as:

  • translation (French given English)
  • summarization (summary given document)
  • QA (answer given question)

TinyStories has no separate input. There is nothing to encode. Adding an encoder would give the model access to the entire story during training, which breaks the causal assumption and creates a mismatch between training and generation.

Decoder-only models also guarantee training–inference alignment:

  • during training, the model only sees past tokens
  • during generation, the model only sees past tokens

Encoder–decoder models cannot guarantee this alignment for language modeling without artificial constraints.

So the reason is not “because GPT does it”.

The real reason is:

Architecture follows probability factorization.

If you want to model P(x), use a decoder-only transformer. If you want to model P(y|x), use an encoder–decoder transformer.

Configuration¶

! curl "https://cas-bridge.xethub.hf.co/xet-bridge-us/645e8da96320b0efe40ade7a/02e40cc51c59a4bc6c51bd7bc9acda4316e208745be060558eaf500cd14e9f96?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20260112%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20260112T124634Z&X-Amz-Expires=3600&X-Amz-Signature=a948d01680e37e90762ec67ccadf0d597a9a94e89dd28da762376ed10ed60b41&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=6507eb42423b46492edf979c&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27TinyStoriesV2-GPT4-train.txt%3B+filename%3D%22TinyStoriesV2-GPT4-train.txt%22%3B&response-content-type=text%2Fplain&x-id=GetObject&Expires=1768225594&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc2ODIyNTU5NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82NDVlOGRhOTYzMjBiMGVmZTQwYWRlN2EvMDJlNDBjYzUxYzU5YTRiYzZjNTFiZDdiYzlhY2RhNDMxNmUyMDg3NDViZTA2MDU1OGVhZjUwMGNkMTRlOWY5NioifV19&Signature=BPYTyJ2CTFy3Jb5MCHF4i%7EEAndHrAJP8bK6GP7tv6REvFarHH3eAur3dyE6w-7eo7PJGKzzeQDodxSYhwHQE95b2RuywZ5DlxTS%7EelkvlI52suIS6vgxa2bkGq5sW7zD1LAzuP3UEoJ1mniA7vq8WbQ2OPWKy%7ET87Zc8ieGiMZ7KoEOy4OpiUhY3SiU3e%7EI43nHwlcEQEGQ4VpRG5OlQNEnOSwSUhm4UlHIvz6gt3smSOCvgCV7l3MTY0CpPU00YwTp7w0NbIaHsTMuuk8N0XBVeAl%7EdE0o1qs3RSZeUeY2grJbVaxlevb657i0R%7E1uHDZ1%7E-ctEGcXRXpMcLqvlkw__&Key-Pair-Id=K2L8F4GPSG1IFC" -o TinyStoriesV2-GPT4-train.txt
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2124M  100 2124M    0     0   128M      0  0:00:16  0:00:16 --:--:-- 66.8M
! head -c 100 TinyStoriesV2-GPT4-train.txt
Once upon a time there was a little boy named Ben. Ben loved to explore the world around him. He sa
import torch

DATA_PATH = "TinyStoriesV2-GPT4-train.txt"  # change if needed
MAX_CHARS = 500_000      # limit for quick runs
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BLOCK_SIZE = 512
BATCH_SIZE = 128   # or 256 if it fits
GRAD_ACCUM = 1
D_MODEL    = 256
N_HEADS   = 4
N_LAYERS  = 4
D_FF      = 1024
LR        = 3e-4
EPOCHS    = 80
AMP       = True

Dataset¶

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

LanguageModelDataset (TinyStories)¶

Historical context¶

Early language models (n-gram models, HMMs) modeled text as a probability distribution over sequences using the factorization:

The probability of a sentence is the product of the probability of each word given all previous words.

Neural language models (Bengio et al., 2003) adopted the same formulation, and this objective was later carried over to RNNs, LSTMs, and eventually Transformers.

When Transformer-based language models (Radford et al., 2018) removed recurrence entirely, they still preserved this exact probabilistic structure. The only requirement was a dataset that could generate many examples of:

“Given a sequence of previous tokens, predict the next token.”

This dataset class exists to construct that learning signal from raw tokenized text such as TinyStories.


Why sliding windows?¶

TinyStories is provided as a long stream of tokens, not pre-segmented into independent training samples.

The design choice here is to use a sliding window over the token stream:

  • Each window provides a fixed-length context (block_size)
  • The target is the same window shifted by one token

This idea predates Transformers and was used in:

  • Feedforward neural language models
  • RNN-based language models
  • GPT-style models

It ensures:

  • Efficient use of all tokens
  • Strong local coherence learning
  • Compatibility with causal self-attention

Why predict a sequence instead of a single token?¶

Although the objective is “predict the next token,” the model predicts the next token at every position simultaneously.

This aligns with:

  • Parallel training (a core advantage of Transformers)
  • Causal masking in self-attention
  • Efficient GPU utilization

Each training example teaches the model multiple prediction tasks in one forward pass.

class LanguageModelDataset(Dataset):
    def __init__(self, data, block_size):
        # `data` is a long sequence of token IDs representing TinyStories.
        # Example:
        #   [12, 45, 891, 34, 78, ...]
        #
        # `block_size` defines the length of the context window the model
        # is allowed to condition on when predicting the next token.
        self.data = data
        self.block_size = block_size

    def __len__(self):
        # Each training example requires `block_size + 1` tokens:
        #   - `block_size` tokens for the input
        #   - 1 additional token for the shifted target
        #
        # We subtract `block_size` to avoid indexing past the end
        # of the token sequence.
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # Input sequence (context window)
        #
        # This represents what the model is allowed to see.
        # Shape: (block_size,)
        x = self.data[idx : idx + self.block_size]

        # Target sequence (next-token labels)
        #
        # This is the same sequence shifted one position to the right.
        # For each position t, the model learns to predict y[t]
        # given x[0:t].
        y = self.data[idx + 1 : idx + self.block_size + 1]

        return x, y

This dataset turns TinyStories into overlapping context–next-token prediction tasks, enforcing the same left-to-right learning objective that underpins all modern autoregressive language models.
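A tiny sketch of the one-token shift, using a made-up token stream:

# Illustration with a made-up token stream
toy_data = torch.arange(10)                     # tensor([0, 1, 2, ..., 9])
toy_ds = LanguageModelDataset(toy_data, block_size=4)

x, y = toy_ds[0]
print(x)            # tensor([0, 1, 2, 3])  -> context the model is allowed to see
print(y)            # tensor([1, 2, 3, 4])  -> next-token targets, shifted by one
print(len(toy_ds))  # 6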

Tokenizer¶

BPE (Byte Pair Encoding) Tokenization & Data Pipeline (TinyStories)¶

Why BPE tokenization?¶

This code uses Byte Pair Encoding (BPE) tokenization, a subword tokenization approach that balances vocabulary size and sequence length.

Historically:

  • BPE was introduced for text compression (Gage, 1994) and later adapted for NLP (Sennrich et al., 2016)

  • Modern language models (GPT, BERT, etc.) use subword tokenization like BPE to:

    • handle rare and unknown words gracefully
    • reduce vocabulary size compared to word-level tokenization
    • maintain reasonable sequence lengths
  • BPE became popular in NLP through works like:

    • Sennrich et al., 2016 – "Neural Machine Translation of Rare Words with Subword Units"

For TinyStories, BPE tokenization is a good choice because:

  • It handles the full vocabulary efficiently
  • Reduces sequence length compared to character-level tokenization
  • Provides a more realistic setup similar to production language models

Why encode the entire corpus into one long tensor?¶

data = torch.tensor(tokenizer.encode(text), dtype=torch.long)

Instead of treating each story independently, the dataset is represented as a single continuous stream of tokens.

This aligns with:

  • Autoregressive language modeling theory
  • Sliding-window dataset construction
  • GPT-style training pipelines

The model learns:

“Text is a continuous stream where any position can follow any other position.”

This assumption simplifies training and works well in practice.


Why use LanguageModelDataset + DataLoader?¶

This combination:

  • Converts the token stream into overlapping (x, y) training pairs
  • Enables batching, shuffling, and parallel loading
  • Decouples data logic from model logic

Historically, this design emerged as PyTorch standardized:

  • Dataset for data definition
  • DataLoader for efficient iteration

Why shuffle and drop last?¶

shuffle=True
drop_last=True

  • Shuffling prevents the model from seeing data in a fixed order, improving generalization

  • Dropping the last batch ensures consistent batch sizes, which simplifies:

    • tensor shapes
    • attention masking
    • GPU efficiency

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True

class BPETokenizer:
    def __init__(self, text, vocab_size=1000):
        self.vocab_size = vocab_size

        # Start with character-level vocabulary
        chars = sorted(list(set(text)))
        self.vocab = chars.copy()

        # Create initial merges (character pairs)
        self.merges = {}

        # Convert text to list of tokens (initially characters)
        tokens = list(text)

        # Learn BPE merges
        for _ in range(vocab_size - len(chars)):
            # Count frequency of each adjacent pair
            pair_freqs = {}
            for i in range(len(tokens) - 1):
                pair = (tokens[i], tokens[i + 1])
                pair_freqs[pair] = pair_freqs.get(pair, 0) + 1

            if not pair_freqs:
                break

            # Find most frequent pair
            best_pair = max(pair_freqs, key=pair_freqs.get)

            # Create new token for this pair
            new_token = ''.join(best_pair)
            self.merges[best_pair] = new_token
            self.vocab.append(new_token)

            # Merge all occurrences of this pair in the token list
            i = 0
            while i < len(tokens) - 1:
                if (tokens[i], tokens[i + 1]) == best_pair:
                    tokens[i] = new_token
                    del tokens[i + 1]
                else:
                    i += 1

        # Create mappings
        self.stoi = {token: i for i, token in enumerate(self.vocab)}
        self.itos = {i: token for token, i in self.stoi.items()}

    def encode(self, s):
        # Start with character-level encoding
        tokens = list(s)

        # Apply BPE merges greedily
        changed = True
        while changed:
            changed = False
            i = 0
            while i < len(tokens) - 1:
                pair = (tokens[i], tokens[i + 1])
                if pair in self.merges:
                    tokens[i] = self.merges[pair]
                    del tokens[i + 1]
                    changed = True
                else:
                    i += 1

        return [self.stoi[token] for token in tokens]

    def decode(self, ids):
        # Convert IDs back to tokens
        tokens = [self.itos[i] for i in ids]
        return ''.join(tokens)
print("Loading data...")
with open(DATA_PATH, "r", encoding="utf-8") as f:
    text = f.read()[:MAX_CHARS]

tokenizer = BPETokenizer(text, vocab_size=1000)
data = torch.tensor(tokenizer.encode(text), dtype=torch.long)

# ---------------- SPLIT ----------------
split_ratio = 0.9
split_idx = int(len(data) * split_ratio)

train_data = data[:split_idx]
val_data   = data[split_idx:]

# ---------------- DATASETS ----------------
train_dataset = LanguageModelDataset(train_data, BLOCK_SIZE)
val_dataset   = LanguageModelDataset(val_data, BLOCK_SIZE)

# ---------------- LOADERS ----------------
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=3,
    shuffle=True,
    drop_last=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    num_workers=2,
    shuffle=False,      # IMPORTANT
    drop_last=True
)
Loading data...
/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py:627: UserWarning: This DataLoader will create 3 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
  warnings.warn(

This pipeline converts raw TinyStories text into a stream of BPE subword tokens and then into overlapping context–next-token prediction tasks, exactly matching the autoregressive objective used by GPT-style language models.
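A quick round-trip check on the tokenizer that was just built (the exact IDs depend on the learned merges, so the printed values are illustrative only):

# Round-trip check on the BPE tokenizer built above
sample_text = "Once upon a time"
ids = tokenizer.encode(sample_text)

print(ids)                    # token IDs (values depend on the learned merges)
print(tokenizer.decode(ids))  # "Once upon a time" -- lossless round trip
print(len(ids), "tokens for", len(sample_text), "characters")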

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

GPT¶

MultiAttentionHead¶

Differences vs earlier attention (only real changes)¶

1. Self-attention only (no Q, K, V inputs)¶

Earlier

forward(self, Q, K, V, mask=None)

Now

forward(self, x, mask=None)

What changed

  • Q, K, V are all derived from the same tensor x
  • This is pure self-attention only

Why

  • GPT-style decoder-only models never use cross-attention
  • Simplifies the API and reduces surface area for bugs
  • Matches how GPT actually operates

2. Attention logic is inlined (no ScaledDotProductAttention module)¶

Earlier

self.attn = ScaledDotProductAttention(self.d_k)
x, _ = self.attn(Q, K, V, mask)

Now

scores = (q @ k.transpose(-2, -1)) / sqrt(d_k)
attn = softmax(scores)
out = attn @ v

What changed

  • Attention computation is written inline
  • No separate abstraction

Why

  • GPT-style minimalism
  • Fewer function calls
  • Easier to fuse / optimize later
  • Matches most real GPT codebases

3. Causal masking assumed externally¶

Earlier

  • Masking semantics varied (encoder mask, decoder mask, src/tgt)

Now

scores = scores.masked_fill(mask == 0, -1e9)

What changed

  • This module assumes the mask is already causal and broadcastable
  • No logic to construct or interpret masks internally

Why

  • Cleaner separation of concerns
  • GPT always uses causal masking
  • Mask construction belongs to the model, not attention

4. No attention weights returned¶

Earlier

return output, attn

Now

return self.W_o(out)

What changed

  • Attention weights are discarded

Why

  • GPT training never uses attention maps
  • Saves memory
  • Reduces bandwidth and overhead

5. Single-path projection (GPT style)¶

This version:

  • Projects Q/K/V from the same input
  • Concatenates heads
  • Applies one output projection

This matches GPT-1 → GPT-4 style attention exactly.


What did not change (important)¶

  • Scaled dot-product attention
  • Multi-head splitting
  • Softmax over keys
  • Residual compatibility
  • Parallel attention computation

Only the scope and structure changed, not the math.

import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class MultiHeadAttention(nn.Module):
    """
    GPT-style multi-head self-attention
    """

    def __init__(self, d_model: int, n_heads: int):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        # Fused projection for Q, K, V (faster + cleaner)
        self.qkv_proj = nn.Linear(d_model, 3 * d_model)

        # Output projection
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
        """
        Args:
            x: (B, T, d_model)
            mask: (1 or B, 1 or n_heads, T, T) causal mask (0 = block)

        Returns:
            (B, T, d_model)
        """
        B, T, C = x.shape

        # ------------------------------------------------
        # Project & split into Q, K, V
        # ------------------------------------------------
        # (B, T, 3 * C) → (B, T, 3, n_heads, d_k)
        qkv = self.qkv_proj(x).view(B, T, 3, self.n_heads, self.d_k)

        # (B, n_heads, T, d_k)
        q, k, v = qkv.unbind(dim=2)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # ------------------------------------------------
        # Scaled dot-product attention
        # ------------------------------------------------
        # (B, n_heads, T, T)
        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-inf"))

        attn = F.softmax(scores, dim=-1)

        # (B, n_heads, T, d_k)
        out = attn @ v

        # ------------------------------------------------
        # Recombine heads
        # ------------------------------------------------
        # (B, T, C)
        out = out.transpose(1, 2).contiguous().view(B, T, C)

        return self.out_proj(out)

This version removes generality (cross-attention, modular abstractions) in favor of a minimal, self-attention–only implementation that matches how GPT actually works in practice.
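
As a quick sanity check of the module above (with made-up sizes, not the notebook's hyperparameters):

import torch

mha = MultiHeadAttention(d_model=64, n_heads=4)
x = torch.randn(2, 10, 64)                                 # (B, T, d_model)
mask = torch.tril(torch.ones(10, 10)).view(1, 1, 10, 10)   # 0 above the diagonal = blocked

out = mha(x, mask)
print(out.shape)  # torch.Size([2, 10, 64])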

1. Decoder-only architecture: no encoder, no cross-attention¶

Earlier Transformer (Vaswani et al., 2017)¶

  • Designed for sequence-to-sequence tasks (e.g., translation)

  • Architecture:

    • Encoder stack
    • Decoder stack
    • Cross-attention between them

GPT (Radford et al., 2018 → GPT-4)¶

  • Designed for language modeling and generation

  • Architecture:

    • Decoder-only Transformer
    • No encoder
    • No cross-attention

Why the change?

Language modeling only requires:

“Predict the next token given all previous tokens.”

There is no source sequence to condition on, so:

  • Encoder becomes unnecessary
  • Cross-attention is removed
  • Model becomes simpler and more scalable

This simplification was one of the key insights behind GPT.


2. Architectural evolution: Pre-LN instead of Post-LN¶

Earlier code (Encoder / Decoder blocks)¶

We used post-norm:

x = LayerNorm(x + sublayer(x))

GPT-style block¶

This implementation uses pre-norm:

x = x + sublayer(LayerNorm(x))

Why this change?

Research showed that:

  • Post-LN Transformers become unstable at depth
  • Gradients struggle to flow through many layers

Pre-LN:

  • Improves gradient flow
  • Allows very deep models (100+ layers)
  • Is now standard in GPT, LLaMA, PaLM, etc.

Key reference: Xiong et al., On Layer Normalization in the Transformer Architecture, ICML 2020
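
In code, the two variants differ only in where LayerNorm sits relative to the residual connection. A minimal sketch, with a generic sublayer standing in for attention or the feedforward network:

import torch.nn as nn


class PostLNBlock(nn.Module):
    """Original Transformer: normalize after adding the residual."""
    def __init__(self, d_model, sublayer):
        super().__init__()
        self.sublayer = sublayer
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        return self.norm(x + self.sublayer(x))


class PreLNBlock(nn.Module):
    """GPT-style: normalize the sublayer input, keep the residual path untouched."""
    def __init__(self, d_model, sublayer):
        super().__init__()
        self.sublayer = sublayer
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        return x + self.sublayer(self.norm(x))

Because the pre-LN residual path is never normalized, gradients can flow straight through the identity connection, which is what makes very deep stacks trainable.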


3. Single attention type: masked self-attention only¶

Earlier:

  • Encoder self-attention
  • Decoder masked self-attention
  • Decoder cross-attention

GPT:

  • Only masked self-attention

self.attn = MultiHeadAttention(d_model, n_heads)

with:

mask = causal_mask(T)

Why?

  • GPT models a single sequence
  • Causality enforces correct generation
  • Cross-attention is unnecessary

4. Weight tying (embedding ↔ output projection)¶

self.head.weight = self.embed.weight

This technique predates GPT. It was introduced and popularized in:

  • Press & Wolf (2017), Using the Output Embedding to Improve Language Models
  • The original Transformer (which shares its embedding and pre-softmax projection weights)
  • GPT-1 onward

Why weight tying?

  • Reduces parameter count

  • Improves generalization

  • Enforces symmetry between:

    • “reading” a token (embedding)
    • “writing” a token (logits)

This is now standard practice.
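
A rough parameter count shows why tying matters, using hypothetical GPT-2-Small-like sizes (this notebook's vocabulary and d_model are much smaller):

# Hypothetical sizes, for illustration only
vocab_size, d_model = 50_257, 768

untied = 2 * vocab_size * d_model   # separate embedding matrix + output projection
tied = vocab_size * d_model         # one shared matrix

print(f"untied: {untied / 1e6:.1f}M params, tied: {tied / 1e6:.1f}M params")
# untied: 77.2M params, tied: 38.6M params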


5. Final LayerNorm before output¶

self.ln_f = nn.LayerNorm(D_MODEL)

This final normalization layer:

  • Stabilizes logits
  • Improves training dynamics
  • Became standard in GPT-style models

Earlier Transformers normalized inside blocks only.


6. Positional encoding remains (but later evolved)¶

This code still uses sinusoidal positional encoding, but historically:

  • GPT-1 / GPT-2 / GPT-3 → learned positional embeddings
  • GPT-J / GPT-NeoX / LLaMA → RoPE (rotary position embeddings)
  • ALiBi → attention bias instead of embeddings

But the conceptual role remains unchanged:

Inject order into an otherwise permutation-invariant attention mechanism.
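
For comparison, the learned-position variant used by GPT-1 and GPT-2 is just an embedding table indexed by position. A minimal sketch (not the notebook's PositionalEncoding; swapping it in would change nothing else in the model):

import torch
import torch.nn as nn


class LearnedPositionalEmbedding(nn.Module):
    """GPT-1/GPT-2-style learned absolute positions (illustrative sketch)."""
    def __init__(self, max_len: int, d_model: int):
        super().__init__()
        self.pos_emb = nn.Embedding(max_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B, T, d_model)
        T = x.size(1)
        positions = torch.arange(T, device=x.device)   # (T,)
        return x + self.pos_emb(positions)             # broadcasts over the batch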

class TransformerBlock(nn.Module):
    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()

        # Masked multi-head self-attention
        # Uses causal masking to prevent access to future tokens
        self.attn = MultiHeadAttention(d_model, n_heads)

        # Position-wise feedforward network
        self.ff = FeedForward(d_model, d_ff)

        # Pre-layer normalization
        # Improves gradient flow in deep networks
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        # Pre-LN + residual connection for attention
        # x ← x + Attention(LayerNorm(x))
        x = x + self.attn(self.ln1(x), mask)

        # Pre-LN + residual connection for feedforward
        # x ← x + FFN(LayerNorm(x))
        x = x + self.ff(self.ln2(x))

        return x


class GPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()

        # Token embedding layer
        # Maps token IDs to continuous vectors
        self.embed = nn.Embedding(vocab_size, D_MODEL)

        # Positional encoding to inject order information
        self.pos = PositionalEncoding(D_MODEL, BLOCK_SIZE)

        # Stack of Transformer blocks
        # Each block refines representations autoregressively
        self.blocks = nn.ModuleList([
            TransformerBlock(D_MODEL, N_HEADS, D_FF)
            for _ in range(N_LAYERS)
        ])

        # Final LayerNorm before output projection
        self.ln_f = nn.LayerNorm(D_MODEL)

        # Output projection to vocabulary space
        self.head = nn.Linear(D_MODEL, vocab_size)

        # Weight tying between input embeddings and output logits
        # Reduces parameters and improves generalization
        self.head.weight = self.embed.weight

    def forward(self, x):
        """
        Args:
            x: Input token IDs (batch_size, seq_len)

        Returns:
            Logits over vocabulary for each position
        """

        B, T = x.shape

        # Create causal mask to enforce autoregressive constraint
        mask = causal_mask(T).to(x.device)

        # Token embeddings + positional encodings
        x = self.embed(x)
        x = self.pos(x)

        # Apply stacked Transformer blocks
        for block in self.blocks:
            x = block(x, mask)

        # Final normalization and projection to logits
        x = self.ln_f(x)
        return self.head(x)

GPT removes the encoder, enforces causality everywhere, stabilizes training with pre-norm, and scales depth—turning the Transformer into a pure next-token prediction machine.

Training¶

model = GPT(tokenizer.vocab_size).to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()
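
Before training, it is worth checking the model size; the roughly 3.1M-parameter figure quoted at the end depends on D_MODEL, N_LAYERS, N_HEADS, and the vocabulary size defined earlier:

# Count trainable parameters (shared/tied weights are counted once)
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model size: {n_params / 1e6:.1f}M parameters")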
import torch, gc

# Release cached GPU memory left over from previous runs before training
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

print(torch.cuda.memory_summary())
|===========================================================================|
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|===========================================================================|
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  13858 KiB |  13858 KiB |  13858 KiB |      0 B   |
|       from large pool |      0 KiB |      0 KiB |      0 KiB |      0 B   |
|       from small pool |  13858 KiB |  13858 KiB |  13858 KiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |  13858 KiB |  13858 KiB |  13858 KiB |      0 B   |
|       from large pool |      0 KiB |      0 KiB |      0 KiB |      0 B   |
|       from small pool |  13858 KiB |  13858 KiB |  13858 KiB |      0 B   |
|---------------------------------------------------------------------------|
| Requested memory      |  13857 KiB |  13857 KiB |  13857 KiB |      0 B   |
|       from large pool |      0 KiB |      0 KiB |      0 KiB |      0 B   |
|       from small pool |  13857 KiB |  13857 KiB |  13857 KiB |      0 B   |
|---------------------------------------------------------------------------|
| GPU reserved memory   |  14336 KiB |  14336 KiB |  14336 KiB |      0 B   |
|       from large pool |      0 KiB |      0 KiB |      0 KiB |      0 B   |
|       from small pool |  14336 KiB |  14336 KiB |  14336 KiB |      0 B   |
|---------------------------------------------------------------------------|
| Non-releasable memory | 489472 B   |   1816 KiB |   7704 KiB |   7226 KiB |
|       from large pool |      0 B   |      0 KiB |      0 KiB |      0 KiB |
|       from small pool | 489472 B   |   1816 KiB |   7704 KiB |   7226 KiB |
|---------------------------------------------------------------------------|
| Allocations           |      53    |      53    |      53    |       0    |
|       from large pool |       0    |       0    |       0    |       0    |
|       from small pool |      53    |      53    |      53    |       0    |
|---------------------------------------------------------------------------|
| Active allocs         |      53    |      53    |      53    |       0    |
|       from large pool |       0    |       0    |       0    |       0    |
|       from small pool |      53    |      53    |      53    |       0    |
|---------------------------------------------------------------------------|
| GPU reserved segments |       7    |       7    |       7    |       0    |
|       from large pool |       0    |       0    |       0    |       0    |
|       from small pool |       7    |       7    |       7    |       0    |
|---------------------------------------------------------------------------|
| Non-releasable allocs |       2    |       4    |       7    |       5    |
|       from large pool |       0    |       0    |       0    |       0    |
|       from small pool |       2    |       4    |       7    |       5    |
|---------------------------------------------------------------------------|
| Oversize allocations  |       0    |       0    |       0    |       0    |
|---------------------------------------------------------------------------|
| Oversize GPU segments |       0    |       0    |       0    |       0    |
|===========================================================================|

import torch
from torch.amp import autocast, GradScaler

# Compile model (PyTorch 2.x)
model = torch.compile(model)

scaler = GradScaler("cuda")

ACC_STEPS = 4          # 🔧 tune this
CLIP_NORM = 1.0
@torch.no_grad()
def generate(prompt, max_new_tokens=300, temperature=0.8):
    model.eval()
    ids = torch.tensor([tokenizer.encode(prompt)], device=DEVICE)

    for _ in range(max_new_tokens):
        # Truncate input to BLOCK_SIZE if it exceeds it.
        # The model was trained with BLOCK_SIZE context.
        input_ids = ids[:, -BLOCK_SIZE:]
        logits = model(input_ids)
        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, 1)
        ids = torch.cat([ids, next_id], dim=1)

    return tokenizer.decode(ids[0].tolist())
from tqdm import tqdm
import torch
import math
import os
from torch.amp import autocast, GradScaler

# ============================================================
# RESUME FROM CHECKPOINT (FULL STATE)
# ============================================================

best_val_loss = math.inf
start_epoch = 0
scaler = GradScaler("cuda")



if os.path.exists("best_model.pt"):
    print("Resuming from best_model.pt")
    ckpt = torch.load("best_model.pt", map_location=DEVICE)

    if isinstance(ckpt, dict) and "model" in ckpt:
        # ✅ NEW-style checkpoint
        model.load_state_dict(ckpt["model"])
        optimizer.load_state_dict(ckpt["optimizer"])
        scaler.load_state_dict(ckpt["scaler"])
        best_val_loss = ckpt["best_val_loss"]
        start_epoch = ckpt["epoch"] + 1
    else:
        # ✅ OLD-style checkpoint (weights only)
        model.load_state_dict(ckpt)
        print("Loaded weights-only checkpoint (optimizer/scaler reset)")

model.to(DEVICE)

# ============================================================
# TRAINING LOOP
# ============================================================

for epoch in range(start_epoch, EPOCHS):

    # ===================== TRAIN =====================
    model.train()
    optimizer.zero_grad(set_to_none=True)
    train_loss = 0.0

    for step, (x, y) in enumerate(tqdm(train_loader, desc=f"Train {epoch+1}")):
        x = x.to(DEVICE, non_blocking=True)
        y = y.to(DEVICE, non_blocking=True)

        with autocast("cuda"):
            logits = model(x)
            loss = criterion(
                logits.view(-1, logits.size(-1)),
                y.view(-1)
            )
            loss = loss / ACC_STEPS

        scaler.scale(loss).backward()
        train_loss += loss.item() * ACC_STEPS

        should_step = (
            (step + 1) % ACC_STEPS == 0
            or (step + 1) == len(train_loader)  # flush last batch
        )

        if should_step:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)

            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

    train_loss /= len(train_loader)

    # ===================== VALIDATE =====================
    model.eval()
    val_loss = 0.0
    MAX_VAL_BATCHES = 200

    with torch.no_grad(), autocast("cuda"):
        for i, (x, y) in enumerate(val_loader):
            if i >= MAX_VAL_BATCHES:
                break

            x = x.to(DEVICE, non_blocking=True)
            y = y.to(DEVICE, non_blocking=True)

            logits = model(x)
            loss = criterion(
                logits.view(-1, logits.size(-1)),
                y.view(-1)
            )
            val_loss += loss.item()

    val_loss /= min(len(val_loader), MAX_VAL_BATCHES)

    # ===================== LOGGING + CHECKPOINT =====================
    improved = val_loss < best_val_loss
    if improved:
        best_val_loss = val_loss
        torch.save(
            {
                "model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scaler": scaler.state_dict(),
                "epoch": epoch,
                "best_val_loss": best_val_loss,
            },
            "best_model.pt",
        )

    print(
        f"Epoch {epoch+1:3d} | "
        f"Train {train_loss:.4f} | "
        f"Val {val_loss:.4f}"
        + ("  ✓" if improved else "")
    )

    # ===================== SAMPLE =====================
    if (epoch + 1) % 10 == 0:
        model.eval()
        # Use the standalone generate() helper defined above
        # (the GPT module itself does not define a .generate method)
        sample = generate("Once upon a time", max_new_tokens=100)
        print(f"\nSample:\n{sample}\n")

# ============================================================
# DONE
# ============================================================

print("\n" + "=" * 50)
print(f"Training complete! Best val loss: {best_val_loss:.4f}")
print("=" * 50)
Resuming from best_model.pt
Loaded weights-only checkpoint (optimizer/scaler reset)
Train 1:   0%|          | 0/1539 [00:00<?, ?it/s]/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py:627: UserWarning: This DataLoader will create 3 worker processes in total. Our suggested max number of worker in current system is 2, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
  warnings.warn(
Train 1: 100%|██████████| 1539/1539 [04:39<00:00,  5.51it/s]
Epoch   1 | Train 2.9840 | Val 2.8299  ✓
Train 2: 100%|██████████| 1539/1539 [04:47<00:00,  5.36it/s]
Epoch   2 | Train 2.0371 | Val 2.9852
Train 3: 100%|██████████| 1539/1539 [04:46<00:00,  5.37it/s]
Epoch   3 | Train 1.2778 | Val 4.0615
Train 4: 100%|██████████| 1539/1539 [04:46<00:00,  5.38it/s]
Epoch   4 | Train 0.7066 | Val 5.6822
Train 5: 100%|██████████| 1539/1539 [04:45<00:00,  5.40it/s]
Epoch   5 | Train 0.3048 | Val 7.1991
Train 6:   5%|▍         | 74/1539 [00:13<04:36,  5.31it/s]

Generation¶

Drumroll ....¶

print("\n--- GENERATED STORY ---\n")
print(generate("Once upon a time  "))
--- GENERATED STORY ---

Once upon a time  was tirered. It made go to the park, but he also lion named sed sed with a new friend, a little girl named Lily him. She saw her frog and opped her head. Lily was very munice. Then, She camemmembered Mom comineace braceful was not too too happy ve fun.
<|endoftext|>
Once upon a time, there was a little girl named Mia. Mia had a big, red in a little house with her  always smiled. Mia truckly little cat the ad. Lily lived in a small house with her momy. Mia loved to stay very happy.
Mia went outside to play in the stawn and saw a lotched the slide. She was very happy and s. They cyedguit. The slizzlep. Mia could not find it was very happy. The kids girl 

A 3.1M-parameter model is very small by modern language-model standards. For context, classic “toy” GPT implementations used for learning typically range from 1–10M parameters, which is enough to capture basic token statistics and short-range patterns but not sustained coherence. GPT-2 Small already has 117M parameters (≈40× larger), GPT-2 Medium 345M, GPT-2 Large 774M, and GPT-2 XL 1.5B. Modern production LLMs operate in the tens to hundreds of billions of parameters. Practically, this means a 3.1M model is expected to produce locally sensible text but struggle with long-term consistency, reasoning, and abstraction.

This is ideal for understanding how transformers work, not for demonstrating strong language capability.