Module 06: Transformer

Introduction

The transformer decoder block is the building block of GPT-style language models. Stack 12-96 of these blocks, and you get models like GPT-2, GPT-3, or LLaMA.

In this module, we’ll combine everything we’ve built so far:

  • Multi-head attention from Module 05
  • Feed-forward networks (mini neural networks for each token)
  • Layer normalization (stabilizes training)
  • Residual connections (enables deep networks)

Each block performs two main operations:

  1. Multi-head attention: Tokens communicate with each other
  2. Feed-forward network: Each token is processed independently
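In pseudocode, a block is just two residual updates around these operations. A minimal sketch (the layer norms ln1/ln2, the x + ... residual pattern, and the components themselves are all built from scratch later in this module; every name here is a placeholder):

# Minimal sketch of one pre-norm transformer block; all names are placeholders
# for components built from scratch later in this module.
def transformer_block(x, ln1, attention, ln2, ffn):
    x = x + attention(ln1(x))  # sublayer 1: tokens communicate with each other
    x = x + ffn(ln2(x))        # sublayer 2: each token processed independently
    return x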

Decoder-Only vs Encoder-Decoder

This module implements decoder-only transformers (GPT-style). There are two main transformer architectures:

Architecture    | Examples                       | Use Case                   | Attention
Decoder-only    | GPT, LLaMA, Claude             | Text generation            | Causal (can’t see future)
Encoder-Decoder | T5, BART, original Transformer | Translation, summarization | Bidirectional encoder + causal decoder

We focus on decoder-only because it’s simpler and powers most modern LLMs.

What You’ll Learn

By the end of this module, you will be able to:

  • Understand the complete GPT-style transformer architecture
  • Implement LayerNorm, GELU, and feed-forward networks from scratch
  • Build a full transformer block with residual connections
  • Assemble a complete language model from components
  • Calculate parameter counts for different model sizes

Complete Model Architecture

Tip: Interactive Architecture Walkthrough

Use the slider above to step through the forward pass. Each stage shows how tensor shapes transform as data flows through the model:

  • Input: Raw token IDs (integers)
  • Embeddings: Dense vectors capturing meaning and position
  • Blocks: Iterative refinement through attention and FFN
  • Output: Probability distribution over vocabulary

Single Transformer Block (Pre-Norm)

The key innovation is the residual connections (the + nodes). Instead of y = f(x), we compute y = x + f(x). This:

  • Helps gradients flow through deep networks
  • Makes it easy to learn identity (just set f(x) = 0)
  • Enables training of 100+ layer networks

The Components

In this section, we build each component from scratch before showing the PyTorch equivalents. The pattern is: understand the math, implement it simply, then see how PyTorch optimizes it.

LayerNorm from Scratch

The Idea: Neural network activations can drift to extreme values during training, causing gradients to explode or vanish. Layer normalization fixes this by normalizing each token’s embedding to have mean 0 and variance 1, then applying learnable scale and shift parameters.

The Formula:

\[\text{LayerNorm}(x) = \gamma \times \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta\]

where:

  • \(\mu\) = mean across the embedding dimension
  • \(\sigma^2\) = variance across the embedding dimension
  • \(\gamma\) (gamma) = learnable scale parameter (initialized to 1)
  • \(\beta\) (beta) = learnable shift parameter (initialized to 0)
  • \(\epsilon\) = small constant for numerical stability (typically 1e-5)

From Scratch Implementation:

import numpy as np
import torch
import torch.nn as nn

class LayerNormScratch:
    """Layer normalization from scratch using NumPy-style operations."""

    def __init__(self, dim, eps=1e-5):
        # Learnable parameters
        self.gamma = np.ones((dim,))   # scale (initialized to 1)
        self.beta = np.zeros((dim,))   # shift (initialized to 0)
        self.eps = eps

    def __call__(self, x):
        """
        Args:
            x: input array of shape (..., dim)
        Returns:
            normalized array of same shape
        """
        # Step 1: Compute mean across last dimension
        mean = x.mean(axis=-1, keepdims=True)

        # Step 2: Compute variance across last dimension
        var = ((x - mean) ** 2).mean(axis=-1, keepdims=True)

        # Step 3: Normalize (the "norm" in LayerNorm)
        x_norm = (x - mean) / np.sqrt(var + self.eps)

        # Step 4: Scale and shift with learnable parameters
        return self.gamma * x_norm + self.beta


# Test our from-scratch implementation
x = np.array([[2.0, 4.0, 6.0, 8.0],
              [1.0, 2.0, 3.0, 4.0]])

ln_scratch = LayerNormScratch(dim=4)
out_scratch = ln_scratch(x)

print("LayerNorm from Scratch:")
print(f"  Input:\n{x}")
print(f"  Output:\n{np.round(out_scratch, 4)}")
print(f"  Output mean per row: {out_scratch.mean(axis=-1).round(6)}")
print(f"  Output std per row: {out_scratch.std(axis=-1).round(4)}")

PyTorch’s nn.LayerNorm:

# PyTorch's optimized implementation
ln_pytorch = nn.LayerNorm(4, elementwise_affine=True)

# Initialize to match our scratch version (gamma=1, beta=0)
nn.init.ones_(ln_pytorch.weight)
nn.init.zeros_(ln_pytorch.bias)

x_torch = torch.tensor(x, dtype=torch.float32)
out_pytorch = ln_pytorch(x_torch)

print("PyTorch LayerNorm:")
print(f"  Output:\n{out_pytorch.detach().numpy().round(4)}")
print(f"  Matches scratch: {np.allclose(out_scratch, out_pytorch.detach().numpy(), atol=1e-5)}")

Key Insight: LayerNorm is just normalize-scale-shift. The learnable \(\gamma\) and \(\beta\) let the network undo the normalization if needed, but start from a stable baseline. Unlike BatchNorm, LayerNorm normalizes across features (embedding dimension) rather than across batch, making it suitable for variable-length sequences.
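To make the axis difference concrete, here is a small sketch contrasting the statistics LayerNorm uses (one mean per token, across features) with the BatchNorm-style statistics (one mean per feature, across the batch):

import numpy as np

# Two "tokens", each with a 4-dimensional embedding (same example as above)
x = np.array([[2.0, 4.0, 6.0, 8.0],
              [1.0, 2.0, 3.0, 4.0]])

# LayerNorm statistics: per token, across the embedding dimension (last axis)
ln_mean = x.mean(axis=-1, keepdims=True)   # shape (2, 1): one mean per token
# BatchNorm-style statistics: per feature, across the batch dimension
bn_mean = x.mean(axis=0, keepdims=True)    # shape (1, 4): one mean per feature

print(f"LayerNorm means (one per token):   {ln_mean.ravel()}")   # [5.  2.5]
print(f"BatchNorm means (one per feature): {bn_mean.ravel()}")   # [1.5 3.  4.5 6. ]
# LayerNorm's statistics are independent of the rest of the batch, which is why
# it works with variable-length sequences and even batch size 1.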

Dropout: Regularization by Noise

The Idea: During training, randomly “drop” (zero out) some activations. This prevents the network from relying too heavily on any single feature and encourages redundancy. The key trick: scale remaining values by \(\frac{1}{1-p}\) so the expected value stays the same.

Why it works:

  • Forces the network to learn redundant representations
  • Acts like training an ensemble of sub-networks
  • At inference time, use all neurons (no dropout)

From Scratch Implementation:

class DropoutScratch:
    """Dropout from scratch."""

    def __init__(self, p=0.1):
        """
        Args:
            p: probability of dropping each element (not keeping!)
        """
        self.p = p

    def __call__(self, x, training=True):
        """
        Args:
            x: input array
            training: if False, return x unchanged
        Returns:
            x with dropout applied (if training)
        """
        if not training or self.p == 0:
            return x

        # Create random mask: True where we KEEP the value
        keep_prob = 1 - self.p
        mask = np.random.random(x.shape) < keep_prob

        # Apply mask and scale by 1/(1-p)
        # This keeps the expected value the same:
        # E[x * mask / keep_prob] = x * keep_prob / keep_prob = x
        return x * mask / keep_prob


# Demonstrate dropout
np.random.seed(42)
x = np.ones((2, 8))

dropout = DropoutScratch(p=0.5)

print("Dropout from Scratch (p=0.5):")
print(f"  Input (all 1s): {x[0]}")

# Apply dropout multiple times to see the randomness
for i in range(3):
    np.random.seed(i)
    out = dropout(x, training=True)
    print(f"  Trial {i+1}: {out[0].round(2)}")
    print(f"    Mean: {out[0].mean():.2f} (should be ~1.0 on average)")

The Scaling Trick Explained:

# Why divide by (1-p)?
# Without scaling, dropout reduces expected output
# With scaling, expected output stays the same

p = 0.5
np.random.seed(0)
x = np.array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

# Without scaling
mask = np.random.random(x.shape) < (1-p)
out_no_scale = x * mask
print(f"Without scaling: {out_no_scale} -> mean = {out_no_scale.mean():.2f}")

# With scaling (divide by keep probability)
np.random.seed(0)
mask = np.random.random(x.shape) < (1-p)
out_scaled = x * mask / (1-p)
print(f"With scaling:    {out_scaled} -> mean = {out_scaled.mean():.2f}")
print(f"\nThe scaling keeps expected value at 1.0 despite dropping 50% of values")

PyTorch’s nn.Dropout:

# PyTorch handles training/mode automatically
dropout_pytorch = nn.Dropout(p=0.5)

x_torch = torch.ones(2, 8)

# Training mode (dropout active)
dropout_pytorch.train()
torch.manual_seed(42)
out_train = dropout_pytorch(x_torch)
print(f"PyTorch Dropout (training): {out_train[0].numpy()}")

# Inference mode (dropout disabled)
dropout_pytorch.eval()
out_inference = dropout_pytorch(x_torch)
print(f"PyTorch Dropout (inference): {out_inference[0].numpy()}")

Key Insight: Dropout is just random masking with scaling. The \(\frac{1}{1-p}\) factor during training means we do not need to modify anything at inference time - the expected value is already correct.

Residual Connections: The Highway for Gradients

The Idea: Instead of computing \(y = f(x)\), compute \(y = x + f(x)\). This “skip connection” lets gradients flow directly through the network, solving the vanishing gradient problem in deep networks.

Why it helps:

import matplotlib.pyplot as plt

# Visualize gradient flow with and without residuals
def gradient_flow_demo():
    """Show how residuals help gradients in deep networks."""

    # Simulate a function that shrinks gradients (common in deep nets)
    def layer_gradient(g, shrink=0.8):
        return g * shrink

    # Without residual: gradients multiply
    # d(f(f(f(x))))/dx = f'(x) * f'(f(x)) * f'(f(f(x)))
    gradients_no_residual = [1.0]
    for _ in range(20):
        gradients_no_residual.append(layer_gradient(gradients_no_residual[-1]))

    # With residual: gradients add
    # d(x + f(x))/dx = 1 + f'(x)  (the 1 always flows through!)
    gradients_with_residual = [1.0]
    for _ in range(20):
        # Gradient through residual = 1 (skip) + shrink (through f)
        g = 1.0 + layer_gradient(gradients_with_residual[-1]) * 0.1
        gradients_with_residual.append(min(g, gradients_with_residual[-1] * 1.05))

    plt.figure(figsize=(10, 5))
    plt.semilogy(gradients_no_residual, 'b-o', label='Without residual', markersize=4)
    plt.semilogy(gradients_with_residual, 'r-o', label='With residual', markersize=4)
    plt.xlabel('Layer depth')
    plt.ylabel('Gradient magnitude (log scale)')
    plt.title('Residual Connections Prevent Vanishing Gradients')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    print("Without residuals: gradients vanish exponentially")
    print(f"  After 20 layers: {gradients_no_residual[-1]:.6f}")
    print("With residuals: gradients stay healthy")
    print(f"  After 20 layers: {gradients_with_residual[-1]:.2f}")

gradient_flow_demo()

Pre-Norm vs Post-Norm:

The original Transformer used “post-norm”: normalize after the residual addition.

# Post-Norm (original Transformer)
x = LayerNorm(x + Attention(x))
x = LayerNorm(x + FFN(x))

Modern LLMs use “pre-norm”: normalize before each sublayer.

# Pre-Norm (GPT-2, LLaMA, modern LLMs)
x = x + Attention(LayerNorm(x))
x = x + FFN(LayerNorm(x))

Why Pre-Norm is Better:

# Demonstrate the stability difference

def simulate_forward_pass(num_layers, prenorm=True):
    """Simulate activation magnitudes through layers."""
    x = 1.0  # Starting activation magnitude

    for _ in range(num_layers):
        if prenorm:
            # Pre-norm: the sublayer sees a normalized input, so its output is
            # bounded and the residual stream grows only linearly with depth
            normed = 1.0  # After LayerNorm, magnitude is ~1
            sublayer_out = normed * 0.5  # Sublayer output
            x = x + sublayer_out  # Residual addition
        else:
            # Post-norm: residual can grow, then we normalize
            sublayer_out = x * 0.5
            x = x + sublayer_out  # Can grow unboundedly before norm
            x = 1.0  # LayerNorm resets to ~1

    return x

print("Activation stability comparison:")
print(f"  Pre-norm after 24 layers:  ~{simulate_forward_pass(24, prenorm=True):.1f}")
print(f"  Post-norm after 24 layers: ~{simulate_forward_pass(24, prenorm=False):.1f}")
print("\nPre-norm has a cleaner gradient path because the skip connection")
print("bypasses normalization - gradients flow directly from output to input.")

Key Insight: Residual connections transform y = f(x) into y = x + f(x). The gradient of this is dy/dx = 1 + df/dx. That + 1 is crucial - it means gradients always have a direct path through the network, even if df/dx is tiny.
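You can check the 1 + df/dx claim directly with autograd. A small sketch, using a function whose derivative is 0.1 everywhere:

import torch

def f(x):
    return 0.1 * x   # df/dx = 0.1 everywhere

x = torch.tensor(2.0, requires_grad=True)

# Without a residual: dy/dx = df/dx = 0.1
f(x).backward()
print(f"Gradient without residual: {x.grad.item():.1f}")   # 0.1

# With a residual: dy/dx = 1 + df/dx = 1.1
x.grad = None
(x + f(x)).backward()
print(f"Gradient with residual:    {x.grad.item():.1f}")   # 1.1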

The Full Transformer Block from Scratch

The Idea: Now we assemble all the pieces into a complete transformer block:

  1. LayerNorm + Multi-Head Attention + Residual
  2. LayerNorm + Feed-Forward Network + Residual

We first build the feed-forward network from scratch, then the block that uses it:

class FeedForwardScratch:
    """Simple feed-forward network from scratch."""

    def __init__(self, embed_dim, ff_dim):
        # Initialize weights with small random values
        scale = 0.02
        self.w1 = np.random.randn(embed_dim, ff_dim) * scale
        self.b1 = np.zeros(ff_dim)
        self.w2 = np.random.randn(ff_dim, embed_dim) * scale
        self.b2 = np.zeros(embed_dim)

    def gelu(self, x):
        """GELU activation: x * Phi(x) where Phi is standard normal CDF."""
        return 0.5 * x * (1 + np.tanh(np.sqrt(2/np.pi) * (x + 0.044715 * x**3)))

    def __call__(self, x):
        # Up projection: embed_dim -> ff_dim
        h = x @ self.w1 + self.b1
        # Activation
        h = self.gelu(h)
        # Down projection: ff_dim -> embed_dim
        return h @ self.w2 + self.b2


# NOTE: This uses a SIMPLIFIED attention (just linear projection) to focus on
# the overall block structure. Real attention with Q, K, V is in attention.py
class TransformerBlockScratch:
    """
    A complete transformer block from scratch (with simplified attention).

    Architecture (Pre-Norm):
        x = x + Attention(LayerNorm(x))
        x = x + FeedForward(LayerNorm(x))

    WARNING: The attention here is simplified to a linear projection for
    demonstration purposes. See m05_attention for full attention implementation.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout_p=0.1):
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Layer norms
        self.ln1 = LayerNormScratch(embed_dim)
        self.ln2 = LayerNormScratch(embed_dim)

        # Attention projections (simplified: no actual attention computation)
        # In a full implementation, this would include Q, K, V projections
        scale = 0.02
        self.attn_proj = np.random.randn(embed_dim, embed_dim) * scale

        # Feed-forward network
        self.ff = FeedForwardScratch(embed_dim, ff_dim)

        # Dropout
        self.dropout = DropoutScratch(dropout_p)

    def __call__(self, x, training=True):
        """
        Args:
            x: input of shape (batch, seq, embed_dim)
            training: whether to apply dropout
        Returns:
            output of shape (batch, seq, embed_dim)
        """
        # === Attention sub-block ===
        # 1. Layer norm (pre-norm)
        normed = self.ln1(x)

        # 2. Attention (simplified: just a linear projection for demo)
        # Real implementation would compute Q, K, V and attention weights
        attn_out = normed @ self.attn_proj

        # 3. Dropout
        attn_out = self.dropout(attn_out, training=training)

        # 4. Residual connection
        x = x + attn_out

        # === Feed-forward sub-block ===
        # 1. Layer norm (pre-norm)
        normed = self.ln2(x)

        # 2. Feed-forward network
        ff_out = self.ff(normed)

        # 3. Dropout
        ff_out = self.dropout(ff_out, training=training)

        # 4. Residual connection
        x = x + ff_out

        return x


# Test the from-scratch transformer block
np.random.seed(42)
block_scratch = TransformerBlockScratch(
    embed_dim=64,
    num_heads=4,
    ff_dim=256,
    dropout_p=0.0  # Disable dropout for reproducibility
)

x = np.random.randn(2, 8, 64)  # batch=2, seq=8, embed=64
out = block_scratch(x, training=False)

print("Transformer Block from Scratch:")
print(f"  Input shape:  {x.shape}")
print(f"  Output shape: {out.shape}")
print(f"  Input mean:   {x.mean():.4f}")
print(f"  Output mean:  {out.mean():.4f}")
print(f"\nThe block transforms each token while preserving shape.")
print("Residual connections keep the output close to input initially.")

The Complete Picture:

# Visualize the transformer block structure
print("""
Transformer Block (Pre-Norm Architecture):
==========================================

    Input x
        |
        +------------------+
        |                  |
        v                  |
    LayerNorm              |
        |                  |
        v                  |
    Multi-Head Attention   |
        |                  |
        v                  |
    Dropout                |
        |                  |
        +--------(+)-------+  <- Residual connection
                  |
        +------------------+
        |                  |
        v                  |
    LayerNorm              |
        |                  |
        v                  |
    Feed-Forward           |
        |                  |
        v                  |
    Dropout                |
        |                  |
        +--------(+)-------+  <- Residual connection
                  |
                  v
              Output
""")

Key Insight: A Transformer block is just attention + MLP + residuals + norms. That is it. The magic comes from stacking many of these simple blocks and training on lots of data.

PyTorch Transformer Modules

PyTorch provides optimized versions of everything we built from scratch.

Comparison Table:

Component  | From Scratch             | PyTorch
LayerNorm  | Manual mean/var          | nn.LayerNorm
Dropout    | Random mask + scale      | nn.Dropout
FFN        | Two linear layers + GELU | Custom or nn.Sequential
Full Block | Manual assembly          | nn.TransformerDecoderLayer

# PyTorch's TransformerDecoderLayer
# Note: This is for encoder-decoder models; for decoder-only like GPT,
# we typically build our own (as in transformer.py)

from torch.nn import TransformerDecoderLayer

# Create a decoder layer similar to our scratch implementation
pytorch_block = TransformerDecoderLayer(
    d_model=64,
    nhead=4,
    dim_feedforward=256,
    dropout=0.1,
    activation='gelu',
    batch_first=True,
    norm_first=True  # Pre-norm architecture
)

x_torch = torch.randn(2, 8, 64)

# For decoder-only, we use self-attention (memory = x)
pytorch_block.eval()
out_pytorch = pytorch_block(x_torch, x_torch)

print("PyTorch TransformerDecoderLayer:")
print(f"  Input shape:  {tuple(x_torch.shape)}")
print(f"  Output shape: {tuple(out_pytorch.shape)}")
print(f"  Parameters:   {sum(p.numel() for p in pytorch_block.parameters()):,}")

When to Use What:

  • Learning: Build from scratch to understand every step
  • Production: Use PyTorch’s optimized modules
  • Custom architectures: Mix both - understand the components, then optimize

# Our module's TransformerBlock (production quality)
from transformer import TransformerBlock

our_block = TransformerBlock(
    embed_dim=64,
    num_heads=4,
    ff_dim=256,
    dropout=0.1
)

x_torch = torch.randn(2, 8, 64)
our_block.eval()
out_ours = our_block(x_torch)

print("Our TransformerBlock (from transformer.py):")
print(f"  Input shape:  {tuple(x_torch.shape)}")
print(f"  Output shape: {tuple(out_ours.shape)}")
print(f"  Parameters:   {sum(p.numel() for p in our_block.parameters()):,}")
print("\nThis is what we use for training - it includes proper")
print("causal attention, not the simplified version in scratch code.")

More Component Details

Layer Normalization (PyTorch Details)

Normalizes activations across the embedding dimension:

\[\text{LayerNorm}(x) = \gamma \times \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta\]

where \(\mu\) and \(\sigma^2\) are the mean and variance across the embedding dimension, and \(\gamma\), \(\beta\) are learnable parameters.

Why it helps:

  • Stabilizes activations: Prevents values from exploding or vanishing
  • Faster training: More stable gradients
  • Independent per token: Each token normalized separately

Feed-Forward Network (FFN)

The FFN is a mini neural network applied to each token independently:

  • 4x expansion: More capacity to learn complex transformations
  • GELU activation: Smoother than ReLU, better gradients
  • Same for all tokens: Unlike attention, no mixing between positions
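In PyTorch this is just two linear layers around a GELU. A minimal sketch of a GPT-2-style FFN (our module's FeedForward class in transformer.py may differ in details such as dropout placement):

import torch
import torch.nn as nn

class FeedForwardSketch(nn.Module):
    """GPT-2-style position-wise FFN: expand 4x, apply GELU, project back."""

    def __init__(self, embed_dim: int, dropout: float = 0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),   # up projection (d -> 4d)
            nn.GELU(),                             # smooth nonlinearity
            nn.Linear(4 * embed_dim, embed_dim),   # down projection (4d -> d)
            nn.Dropout(dropout),
        )

    def forward(self, x):
        # Applied to each (batch, position) independently - no mixing across tokens
        return self.net(x)

ffn = FeedForwardSketch(embed_dim=64)
print(ffn(torch.randn(2, 8, 64)).shape)   # torch.Size([2, 8, 64])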

Pre-Norm vs Post-Norm

We use Pre-Norm (LayerNorm before attention/FFN) rather than Post-Norm:

# Pre-Norm (GPT-2, LLaMA, modern LLMs)
x = x + Attention(LayerNorm(x))

# Post-Norm (original Transformer paper)
x = LayerNorm(x + Attention(x))

Why Pre-Norm is preferred:

  1. Cleaner gradient path: The residual connection bypasses normalization, so gradients flow directly
  2. More stable training: Especially important for deep networks (24+ layers)
  3. One consequence - a final LayerNorm is required: Since the last block’s output isn’t normalized, we add a final LayerNorm before the output projection

Post-Norm can achieve slightly better final performance with careful hyperparameter tuning, but Pre-Norm is more robust and easier to train.

Code Walkthrough

Let’s build and explore transformer blocks:

import sys
import importlib.util
from pathlib import Path

import torch
import torch.nn as nn
import matplotlib.pyplot as plt

print(f"PyTorch version: {torch.__version__}")
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Device: {device}")

GELU Activation

GPT uses GELU instead of ReLU. Let’s see why:

import torch.nn.functional as F

x = torch.linspace(-3, 3, 100)
gelu_out = F.gelu(x)
relu_out = torch.relu(x)

plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.plot(x.numpy(), relu_out.numpy(), 'b-', label='ReLU', linewidth=2)
plt.plot(x.numpy(), gelu_out.numpy(), 'r-', label='GELU', linewidth=2)
plt.axhline(y=0, color='k', linestyle='--', alpha=0.3)
plt.axvline(x=0, color='k', linestyle='--', alpha=0.3)
plt.xlabel('x')
plt.ylabel('Activation')
plt.legend()
plt.title('GELU vs ReLU')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
x_zoom = torch.linspace(-1, 1, 100)
plt.plot(x_zoom.numpy(), torch.relu(x_zoom).numpy(), 'b-', label='ReLU', linewidth=2)
plt.plot(x_zoom.numpy(), F.gelu(x_zoom).numpy(), 'r-', label='GELU', linewidth=2)
plt.xlabel('x')
plt.title('Zoomed: GELU is smooth at 0')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Key difference: GELU is smooth everywhere!")
print("ReLU has a sharp corner at x=0, which can cause gradient issues.")

GELU formula: \(\text{GELU}(x) = x \cdot \Phi(x)\) where \(\Phi\) is the standard normal CDF.

Approximation used in practice: \(0.5x(1 + \tanh(\sqrt{2/\pi}(x + 0.044715x^3)))\)
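The exact form and the tanh approximation agree closely. A quick numerical check, computing both by hand (the exact CDF via the error function) rather than relying on any particular library flag:

import math
import torch

x = torch.linspace(-3, 3, 101)

# Exact GELU: x * Phi(x), with Phi written via the error function
gelu_exact = 0.5 * x * (1 + torch.erf(x / math.sqrt(2)))

# Tanh approximation (the formula above)
gelu_tanh = 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * x**3)))

print(f"max |exact - approx| on [-3, 3]: {(gelu_exact - gelu_tanh).abs().max().item():.6f}")
# The curves are nearly indistinguishable, which is why GPT-2 uses the cheap approximation.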

Activation function choices in modern LLMs:

Model          | Activation | Notes
GPT-2, BERT    | GELU       | Smooth, good gradients
LLaMA, Mistral | SwiGLU     | Gated variant, better performance
GPT-3          | GELU       | Same as GPT-2

SwiGLU (used in LLaMA) is a gated linear unit: \(\text{SwiGLU}(x) = \text{Swish}(xW_1) \odot xW_2\), an elementwise product of a Swish-gated projection and a second linear projection. It requires an extra linear layer but often improves performance. Our implementation uses standard GELU to match GPT-2.
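For reference, a minimal SwiGLU feed-forward sketch (LLaMA-style naming; purely illustrative, not our module's code):

import torch
import torch.nn as nn
import torch.nn.functional as F

class SwiGLUFFN(nn.Module):
    """Illustrative SwiGLU FFN: gate SiLU(x W1) elementwise with x W3, project with W2."""

    def __init__(self, embed_dim: int, hidden_dim: int):
        super().__init__()
        self.w1 = nn.Linear(embed_dim, hidden_dim, bias=False)  # gate projection
        self.w3 = nn.Linear(embed_dim, hidden_dim, bias=False)  # value projection
        self.w2 = nn.Linear(hidden_dim, embed_dim, bias=False)  # down projection

    def forward(self, x):
        return self.w2(F.silu(self.w1(x)) * self.w3(x))

# hidden_dim of roughly (8/3)*embed_dim keeps the parameter count near a 4x GELU FFN
ffn = SwiGLUFFN(embed_dim=64, hidden_dim=int(64 * 8 / 3))
print(ffn(torch.randn(2, 8, 64)).shape)   # torch.Size([2, 8, 64])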

Layer Normalization Demo

# Manual LayerNorm demonstration
x = torch.tensor([[2.0, 4.0, 6.0, 8.0]])

mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)
normalized = (x - mean) / torch.sqrt(var + 1e-5)

print("Manual LayerNorm:")
print(f"  Input: {x.numpy().tolist()}")
print(f"  Mean: {mean.item():.2f}")
print(f"  Variance: {var.item():.2f}")
print(f"  Normalized: {normalized.numpy().round(2).tolist()}")
print(f"  New mean: {normalized.mean().item():.4f}")
print(f"  New std: {normalized.std().item():.4f}")
# PyTorch LayerNorm (with learnable gamma and beta)
ln = nn.LayerNorm(4)
pytorch_normalized = ln(x)

print(f"PyTorch LayerNorm output: {pytorch_normalized.detach().numpy().round(2).tolist()}")
print("(gamma and beta are learnable parameters)")

Residual Connections Demo

# Demonstrate gradient flow with and without residuals
def simple_layer(x):
    """A simple transformation that shrinks values."""
    return x * 0.5 + 0.1

# Stack 10 layers WITHOUT residual
x = torch.tensor([1.0])
outputs_no_residual = [x.item()]
for _ in range(10):
    x = simple_layer(x)
    outputs_no_residual.append(x.item())

# Stack 10 layers WITH residual
x = torch.tensor([1.0])
outputs_with_residual = [x.item()]
for _ in range(10):
    x = x + simple_layer(x) * 0.1  # x + f(x)
    outputs_with_residual.append(x.item())

plt.figure(figsize=(10, 4))
plt.plot(outputs_no_residual, 'b-o', label='Without residual')
plt.plot(outputs_with_residual, 'r-o', label='With residual')
plt.xlabel('Layer')
plt.ylabel('Value')
plt.title('Effect of Residual Connections')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"Without residual: value shrinks to {outputs_no_residual[-1]:.4f}")
print(f"With residual: value stays near {outputs_with_residual[-1]:.4f}")

Weight Initialization

Proper initialization is critical for training deep networks. Our implementation uses:

  • Embeddings: Normal distribution with std=0.02
  • Linear layers in FFN: Normal distribution with std=0.02, biases initialized to 0
  • Attention projections: Xavier uniform initialization
# Demonstrate the importance of initialization
import torch.nn as nn

# Bad initialization - too large
bad_linear = nn.Linear(768, 768)
nn.init.normal_(bad_linear.weight, std=1.0)  # Too large!

# Good initialization - small weights
good_linear = nn.Linear(768, 768)
nn.init.normal_(good_linear.weight, std=0.02)  # GPT-2 style

x = torch.randn(1, 10, 768)
bad_out = bad_linear(x)
good_out = good_linear(x)

print("Effect of initialization on output magnitude:")
print(f"  Bad init (std=1.0):  output std = {bad_out.std().item():.2f}")
print(f"  Good init (std=0.02): output std = {good_out.std().item():.2f}")
print("\nLarge outputs can cause exploding gradients and NaN losses!")

GPT-2’s initialization trick: Scale the final projection in each residual block by \(1/\sqrt{2N}\) where N is the number of layers. This keeps the variance stable as depth increases.
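As a sketch, the trick just shrinks the init std of the projections that feed the residual stream (illustrative; our transformer.py may organize initialization differently):

import math
import torch.nn as nn

num_layers = 12
embed_dim = 768

# The last projection of each residual branch (attention output projection and
# the FFN down-projection) gets a smaller std than the usual 0.02.
residual_proj = nn.Linear(embed_dim, embed_dim)
nn.init.normal_(residual_proj.weight, std=0.02 / math.sqrt(2 * num_layers))
nn.init.zeros_(residual_proj.bias)

print(f"Residual-projection init std: {0.02 / math.sqrt(2 * num_layers):.4f}")  # ~0.0041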

Building a Transformer Block

from transformer import (
    FeedForward,
    TransformerBlock,
    GPTModel,
    create_gpt_tiny,
    create_gpt_small,
)

# Create a transformer block
embed_dim = 64
num_heads = 4
ff_dim = 256

block = TransformerBlock(
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
    dropout=0.0
)

print(f"Transformer Block:")
print(f"  Embed dim: {embed_dim}")
print(f"  Num heads: {num_heads}")
print(f"  Head dim: {embed_dim // num_heads}")
print(f"  FF dim: {ff_dim}")
print(f"\nTotal parameters: {sum(p.numel() for p in block.parameters()):,}")
# Forward pass
x = torch.randn(1, 8, embed_dim)  # batch=1, seq=8
output, attention = block(x, return_attention=True)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"Attention shape: {attention.shape}")
# Visualize attention patterns from the block
fig, axes = plt.subplots(1, 4, figsize=(14, 3))

for head in range(4):
    ax = axes[head]
    w = attention[0, head].detach().numpy()
    ax.imshow(w, cmap='Blues', vmin=0, vmax=w.max())
    ax.set_title(f'Head {head}')
    ax.set_xlabel('Key')
    ax.set_ylabel('Query')

plt.suptitle('Attention Patterns in Transformer Block (Causal Masked)', fontsize=12)
plt.tight_layout()
plt.show()

Complete GPT Model

# Create a tiny GPT model
model = create_gpt_tiny(vocab_size=1000)

print("GPT Tiny Model:")
print(f"  Vocab size: {model.vocab_size}")
print(f"  Embed dim: {model.embed_dim}")
print(f"  Num layers: {len(model.blocks)}")
print(f"  Max seq len: {model.max_seq_len}")
print(f"\nTotal parameters: {model.num_params:,}")
# Parameter breakdown
counts = model.count_parameters()

print("Parameter breakdown:")
for name, count in counts.items():
    if count > 0:
        pct = 100 * count / counts['total']
        print(f"  {name}: {count:,} ({pct:.1f}%)")

# Visualize
labels = [k for k, v in counts.items() if v > 0 and k != 'total']
sizes = [counts[k] for k in labels]

plt.figure(figsize=(8, 6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
plt.title(f'Parameter Distribution ({model.num_params:,} total)')
plt.show()
# Forward pass
token_ids = torch.randint(0, 1000, (2, 32))  # batch=2, seq=32
logits = model(token_ids)

print(f"Input token IDs: {token_ids.shape}")
print(f"Output logits: {logits.shape}")
print(f"  (batch=2, seq=32, vocab=1000)")

# Get predictions
probs = torch.softmax(logits[0, -1], dim=-1)
top_5 = torch.topk(probs, 5)

print("\nTop 5 predicted next tokens (untrained, so random):")
for i, (idx, prob) in enumerate(zip(top_5.indices, top_5.values)):
    print(f"  {i+1}. Token {idx.item()}: {prob.item()*100:.2f}%")

Hidden States Through Layers

# Get hidden states from all layers
logits, hidden_states = model(token_ids, return_hidden_states=True)

print(f"Number of hidden states: {len(hidden_states)}")
print(f"  (1 after embedding + {len(model.blocks)} after each block)")

# Show how representations change through layers
norms = [h.norm(dim=-1).mean().item() for h in hidden_states]

plt.figure(figsize=(10, 4))
plt.plot(range(len(norms)), norms, 'b-o')
plt.xlabel('Layer')
plt.ylabel('Average Embedding Norm')
plt.title('Embedding Norms Through the Network')
plt.xticks(range(len(norms)), ['Embed'] + [f'Block {i}' for i in range(len(model.blocks))])
plt.grid(True, alpha=0.3)
plt.show()

Weight Tying

GPT shares weights between token embedding and output projection:

# Check weight tying
print("Weight Tying:")
print(f"  Token embedding weight id: {id(model.token_embedding.weight)}")
print(f"  LM head weight id: {id(model.lm_head.weight)}")
print(f"  Are they the same object? {model.token_embedding.weight is model.lm_head.weight}")

# This saves parameters!
vocab_size = 1000
embed_dim = 128
saved_params = vocab_size * embed_dim
print(f"\nParameters saved by weight tying: {saved_params:,}")

Model Sizes Comparison

Model        | Layers | Heads | Embed Dim | Params
Tiny (ours)  | 4      | 4     | 128       | ~1M
Small (ours) | 6      | 6     | 384       | ~10M
GPT-2 Small  | 12     | 12    | 768       | 117M
GPT-2 Medium | 24     | 16    | 1024      | 345M
GPT-2 Large  | 36     | 20    | 1280      | 774M
GPT-2 XL     | 48     | 25    | 1600      | 1.5B

Parameter Counting Formulas

Understanding where parameters come from helps with model sizing:

Per Transformer Block:

  • Attention Q, K, V projections: \(3 \times d \times d\) (where \(d\) = embed_dim)
  • Attention output projection: \(d \times d\)
  • FFN first linear: \(d \times 4d\)
  • FFN second linear: \(4d \times d\)
  • LayerNorm (x2): \(2 \times 2d\) (gamma and beta for each)

Total per block: \(\approx 12d^2\) parameters

Full Model:

  • Token embedding: \(V \times d\) (V = vocab size)
  • Position embedding: \(L \times d\) (L = max sequence length)
  • N transformer blocks: \(N \times 12d^2\)
  • Final LayerNorm: \(2d\)
  • LM head: 0 (weight-tied with token embedding)

Approximate formula: \(\text{Params} \approx V \times d + 12Nd^2\)

For GPT-2 Small (V=50257, d=768, N=12): \(50257 \times 768 + 12 \times 12 \times 768^2 \approx 124M\), close to the commonly quoted 117M figure (which slightly undercounts the actual checkpoint size).
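These formulas are easy to turn into a quick sanity check. A small sketch of the approximate count (ignoring biases and LayerNorm parameters):

def approx_params(vocab_size: int, embed_dim: int, num_layers: int, max_seq_len: int = 1024) -> int:
    """Approximate parameter count for a GPT-style, weight-tied decoder."""
    embeddings = vocab_size * embed_dim + max_seq_len * embed_dim   # token + position
    per_block = 12 * embed_dim ** 2                                 # attention 4d^2 + FFN 8d^2
    return embeddings + num_layers * per_block

# GPT-2 Small: V=50257, d=768, N=12
print(f"{approx_params(50257, 768, 12):,}")   # ~124 million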

Scaling laws: more layers, heads, and dimensions lead to better performance (but diminishing returns and higher compute cost).

Architectural Variations

Modern LLMs have evolved beyond the original GPT-2 architecture. Here are key variations:

Normalization

Variant   | Used By          | Description
LayerNorm | GPT-2, GPT-3     | Normalize across embedding dimension
RMSNorm   | LLaMA, Mistral   | Simpler: just divide by RMS, no mean subtraction
Pre-Norm  | Most modern LLMs | Normalize before sublayer (more stable)
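RMSNorm drops both the mean subtraction and the bias term. A minimal sketch (illustrative, LLaMA-style; our module uses standard LayerNorm):

import torch
import torch.nn as nn

class RMSNorm(nn.Module):
    """Illustrative RMSNorm: divide by the root-mean-square, learnable scale only."""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))   # scale (no shift parameter)
        self.eps = eps

    def forward(self, x):
        rms = torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return self.weight * (x / rms)

x = torch.randn(2, 8, 64)
print(RMSNorm(64)(x).shape)   # torch.Size([2, 8, 64])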

Position Embeddings

Variant          | Used By        | Description
Learned absolute | GPT-2          | Separate embedding for each position
Rotary (RoPE)    | LLaMA, Mistral | Encode position in attention via rotation
ALiBi            | BLOOM          | Add position bias to attention scores

Our implementation uses learned absolute position embeddings (GPT-2 style), which are simple but limit the model to the maximum trained sequence length.

Feed-Forward Networks

Variant  | Used By | Expansion           | Activation
Standard | GPT-2   | 4x                  | GELU
SwiGLU   | LLaMA   | 8/3x (after gating) | SiLU (Swish)

Common Pitfalls

When implementing or training transformers, watch out for:

  1. Forgetting the causal mask: Without it, the model can “cheat” by looking at future tokens during training, leading to poor generation at inference time (see the sketch after this list).

  2. Wrong normalization axis: LayerNorm should normalize across the embedding dimension (last axis), not the sequence or batch dimensions.

  3. Residual connection placement: Make sure to add the residual after dropout but before the next LayerNorm in Pre-Norm architecture.

  4. Large learning rates: Transformers are sensitive to learning rate. Start with 1e-4 to 3e-4 for Adam, use warmup.

  5. Numerical instability: Use float32 for training initially. Half precision (fp16/bf16) requires careful scaling.

  6. Forgetting final LayerNorm: In Pre-Norm, the output of the last block isn’t normalized. The final LayerNorm before the LM head is essential.
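The first two pitfalls have short fixes. A sketch of building a causal mask and of normalizing over the correct axis:

import torch
import torch.nn as nn

seq_len, embed_dim = 6, 64

# Pitfall 1: the causal mask. True above the diagonal marks positions to block,
# so position i can only attend to positions 0..i.
causal_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
print(causal_mask.int())

# Pitfall 2: LayerNorm over the embedding dimension only (the last axis),
# i.e. normalized_shape=embed_dim, never the sequence or batch dimension.
ln = nn.LayerNorm(embed_dim)
x = torch.randn(2, seq_len, embed_dim)
print(ln(x).shape)   # torch.Size([2, 6, 64])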

Interactive Exploration

Experiment with transformer architecture choices to understand where parameters come from and how they scale.

Tip: Try This
  1. FFN dominates: Set embed_dim=768, layers=12. Notice the Feed-Forward bars are ~2x the Attention bars (because FFN has 8d² params vs Attention’s 4d²).

  2. Embedding cost at small scale: With vocab=50257 and embed_dim=768, token embeddings are ~38M params - a large fraction for small models.

  3. Scaling law: Double embed_dim from 512 to 1024. Total params roughly quadruple (because most params scale with d²).

  4. Load GPT-2 presets and see how the 117M, 345M, 774M models break down.

  5. Head dimension check: Try numHeads that doesn’t divide embedDim evenly - you’ll see a warning.

Exercises

Exercise 1: Build a Custom Block

# Create a transformer block with different configurations
custom_block = TransformerBlock(
    embed_dim=128,
    num_heads=8,
    ff_dim=512,  # 4x expansion
    dropout=0.1
)

# Test it
x = torch.randn(4, 16, 128)  # batch=4, seq=16
output = custom_block(x)
print(f"Custom block: {x.shape} -> {output.shape}")
print(f"Parameters: {sum(p.numel() for p in custom_block.parameters()):,}")

Exercise 2: Compare Model Scales

# Compare tiny vs small model
tiny = create_gpt_tiny(vocab_size=10000)
small = create_gpt_small(vocab_size=10000)

print(f"{'Model':<10} {'Embed':<8} {'Layers':<8} {'Heads':<8} {'Params':<15}")
print("-" * 50)
print(f"{'Tiny':<10} {tiny.embed_dim:<8} {len(tiny.blocks):<8} {tiny.blocks[0].attention.mha.num_heads:<8} {tiny.num_params:,}")
print(f"{'Small':<10} {small.embed_dim:<8} {len(small.blocks):<8} {small.blocks[0].attention.mha.num_heads:<8} {small.num_params:,}")

Exercise 3: Information Flow

# See how a single token's representation changes through layers
model = create_gpt_tiny(vocab_size=100)
token_ids = torch.randint(0, 100, (1, 8))

_, hidden = model(token_ids, return_hidden_states=True)

# Track first token through layers
first_token_norms = [h[0, 0].norm().item() for h in hidden]

plt.figure(figsize=(8, 4))
plt.bar(range(len(first_token_norms)), first_token_norms)
plt.xlabel('Layer')
plt.ylabel('Embedding Norm (first token)')
plt.title('First Token Representation Through Layers')
plt.xticks(range(len(first_token_norms)), ['Embed'] + [f'Block {i}' for i in range(len(model.blocks))])
plt.show()

Summary

Key takeaways:

  1. Transformer architecture: Input embeddings -> N transformer blocks -> Final LayerNorm -> Output projection

  2. Each block has two sublayers:

    • Multi-head attention (tokens communicate)
    • Feed-forward network (tokens processed independently)
  3. Pre-Norm architecture: LayerNorm before each sublayer, with a “clean” residual path for stable gradients

  4. Layer normalization: Normalizes across the embedding dimension, keeping activations in a stable range

  5. Residual connections: x + f(x) enables gradient flow through very deep networks (100+ layers)

  6. Feed-forward networks: 4x expansion with GELU activation provides computational capacity

  7. Weight tying: Sharing token embedding and output projection reduces parameters and improves performance

  8. Initialization matters: Small initial weights (std=0.02) prevent exploding activations

  9. Parameter scaling: Total params \(\approx V \times d + 12Nd^2\) (dominated by FFN for large models)

  10. Architectural variations: Modern LLMs (LLaMA, Mistral) use RMSNorm, RoPE, and SwiGLU for better efficiency

What’s Next

In Module 07: Training, we’ll train our transformer on actual data using cross-entropy loss, learning rate scheduling, and gradient accumulation.