Module 07: Training

d3 = require("d3@7")

// =============================================================================
// THEME DETECTION
// =============================================================================

// Live dark-mode flag for Quarto pages. Generators.observe turns the callback
// below into a reactive OJS value: a cell that references isDarkMode receives a
// new value every time notify() is called, i.e. once at start-up and once per
// class-attribute mutation on <body> or <html>.
isDarkMode = Generators.observe(notify => {
  const themedElements = [document.body, document.documentElement];
  const watchOptions = { attributes: true, attributeFilter: ['class'] };

  // Dark mode is on when either element carries the quarto-dark class.
  const readTheme = () =>
    themedElements.some(el => el.classList.contains('quarto-dark'));

  // Publish the starting state before any mutation arrives.
  notify(readTheme());

  const classWatcher = new MutationObserver(() => notify(readTheme()));
  for (const el of themedElements) {
    classWatcher.observe(el, watchOptions);
  }

  // Disposal hook handed back to Generators.observe.
  return () => classWatcher.disconnect();
})


// =============================================================================
// CSS VARIABLE UTILITIES
// =============================================================================

// Looks up a CSS custom property and returns its trimmed value.
// The lookup targets <body> because Quarto puts the .quarto-dark class there,
// so dark-mode overrides resolve on that element rather than on <html>.
// Yields `fallback` (null unless given) when there is no `document` global or
// when the property resolves to an empty string.
getCSSVar = function(name, fallback = null) {
  const hasDocument = typeof document !== 'undefined';
  if (!hasDocument) {
    return fallback;
  }

  const bodyStyle = getComputedStyle(document.body);
  const resolved = bodyStyle.getPropertyValue(name).trim();
  return resolved ? resolved : fallback;
}

// =============================================================================
// THEME OBJECT
// =============================================================================

// Object containing all diagram colors read from CSS variables
// Falls back to hardcoded values if CSS vars not available
diagramTheme = {
  // Light-mode fallback values (used if CSS vars are unavailable)
  const lightFallbacks = {
    nodeFill: '#f5f5f4',
    nodeFillHover: '#e7e5e4',
    nodeStroke: '#d6d3d1',
    nodeText: '#1c1917',
    edgeStroke: '#78716c',
    highlight: '#f97316',
    highlightGlow: 'rgba(249, 115, 22, 0.3)',
    accent: '#0ea5e9',
    accentGlow: 'rgba(14, 165, 233, 0.3)',
    textOnHighlight: '#1c1917',
    textOnAccent: '#1c1917',
    bg: '#fafaf9',
    bgSecondary: '#f5f5f4',
    // Semantic colors for status/feedback
    error: '#dc2626',
    errorBg: 'rgba(220, 38, 38, 0.1)',
    success: '#16a34a',
    successBg: 'rgba(22, 163, 74, 0.1)',
    info: '#2563eb',
    infoBg: 'rgba(37, 99, 235, 0.1)'
  };

  // Dark-mode fallbacks + brighter semantic colors for readability on dark.
  const darkFallbacks = {
    nodeFill: '#292524',
    nodeFillHover: '#3f3a36',
    nodeStroke: '#57534e',
    nodeText: '#fafaf9',
    edgeStroke: '#a8a29e',
    highlight: '#fb923c',
    highlightGlow: 'rgba(251, 146, 60, 0.4)',
    accent: '#38bdf8',
    accentGlow: 'rgba(56, 189, 248, 0.4)',
    textOnHighlight: '#1c1917',
    textOnAccent: '#1c1917',
    bg: 'transparent',
    bgSecondary: '#1c1917',
    error: '#f87171',
    errorBg: 'rgba(248, 113, 113, 0.18)',
    success: '#4ade80',
    successBg: 'rgba(74, 222, 128, 0.18)',
    info: '#60a5fa',
    infoBg: 'rgba(96, 165, 250, 0.18)'
  };

  // Referencing isDarkMode here makes this cell reactive: it recomputes (and all
  // diagrams that read it re-render) whenever the theme is toggled.
  const fallbacks = isDarkMode ? darkFallbacks : lightFallbacks;

  return {
    nodeFill: getCSSVar('--diagram-node-fill', fallbacks.nodeFill),
    nodeFillHover: getCSSVar('--diagram-hover-fill', fallbacks.nodeFillHover),
    nodeStroke: getCSSVar('--diagram-node-stroke', fallbacks.nodeStroke),
    nodeText: getCSSVar('--diagram-node-text', fallbacks.nodeText),
    edgeStroke: getCSSVar('--diagram-edge-stroke', fallbacks.edgeStroke),
    highlight: getCSSVar('--diagram-highlight', fallbacks.highlight),
    highlightGlow: getCSSVar('--diagram-highlight-glow', fallbacks.highlightGlow),
    accent: getCSSVar('--diagram-accent', fallbacks.accent),
    accentGlow: getCSSVar('--diagram-accent-glow', fallbacks.accentGlow),
    textOnHighlight: fallbacks.textOnHighlight,
    textOnAccent: fallbacks.textOnAccent,
    bg: getCSSVar('--diagram-bg', fallbacks.bg),
    bgSecondary: getCSSVar('--diagram-bg-secondary', fallbacks.bgSecondary),
    // Semantic colors (use fallbacks directly since no CSS vars defined)
    error: fallbacks.error,
    errorBg: fallbacks.errorBg,
    success: fallbacks.success,
    successBg: fallbacks.successBg,
    info: fallbacks.info,
    infoBg: fallbacks.infoBg,
    isDark: isDarkMode
  };
}

// =============================================================================
// SVG PRIMITIVES
// =============================================================================

// Draws one diagram node: a <g> translated to (x, y) holding a rounded
// rectangle centered on the group origin, plus an optional label and sublabel.
// Returns the <g> selection.
// Options: {x, y, width, height, label, sublabel, id, theme, rx, ry, className}
createNode = function(svg, options) {
  const {
    x = 0,
    y = 0,
    width = 100,
    height = 50,
    label = '',
    sublabel = '',
    id = null,
    theme = diagramTheme,
    rx = 6,
    ry = 6,
    className = 'diagram-node'
  } = options;

  const group = svg.append('g');
  group.attr('class', className);
  group.attr('transform', `translate(${x}, ${y})`);
  if (id) {
    group.attr('id', id);
  }

  // Rectangle centered on the group origin
  const box = group.append('rect');
  const boxAttributes = [
    ['x', -width / 2],
    ['y', -height / 2],
    ['width', width],
    ['height', height],
    ['rx', rx],
    ['ry', ry],
    ['fill', theme.nodeFill],
    ['stroke', theme.nodeStroke],
    ['stroke-width', 1.5]
  ];
  boxAttributes.forEach(([attrName, attrValue]) => box.attr(attrName, attrValue));

  // Both text lines are centered and ignore pointer events; they differ only in
  // vertical position, font size and one extra attribute.
  const addCenteredText = (yPos, fontSize, [extraName, extraValue], content) => {
    group.append('text')
      .attr('x', 0)
      .attr('y', yPos)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', theme.nodeText)
      .attr('font-size', fontSize)
      .attr(extraName, extraValue)
      .attr('pointer-events', 'none')
      .text(content);
  };

  // The main label moves up when a sublabel shares the box.
  if (label) {
    addCenteredText(sublabel ? -6 : 0, '12px', ['font-weight', '500'], label);
  }
  if (sublabel) {
    addCenteredText(10, '10px', ['opacity', 0.7], sublabel);
  }

  return group;
}

// Draws an arrow from (x1, y1) to (x2, y2): a <g> holding the path (with its
// own arrowhead marker) and an optional text label. Returns the <g> selection.
// Options: {x1, y1, x2, y2, label, theme, curved, curvature, id, className, dashed}
//   curved:    draw a quadratic Bezier instead of a straight line.
//   curvature: control-point offset perpendicular to the chord, as a fraction
//              of the chord length (only used when curved is true).
//   dashed:    draw the path with a 5,3 dash pattern.
// A <defs> element is reused if `svg` already has one, otherwise it is created.
createArrow = function(svg, options) {
  const {
    x1 = 0,
    y1 = 0,
    x2 = 100,
    y2 = 0,
    label = '',
    theme = diagramTheme,
    curved = false,
    curvature = 0.3,
    id = null,
    className = 'diagram-edge',
    dashed = false
  } = options;

  // Every arrow gets its own marker so the head can take its color from `theme`.
  // The random base-36 suffix (up to 9 characters) keeps ids apart.
  const markerId = `arrow-${Math.random().toString(36).slice(2, 11)}`;

  // Ensure defs exists
  let defs = svg.select('defs');
  if (defs.empty()) {
    defs = svg.append('defs');
  }

  // Add arrowhead marker
  defs.append('marker')
    .attr('id', markerId)
    .attr('viewBox', '0 -5 10 10')
    .attr('refX', 8)
    .attr('refY', 0)
    .attr('markerWidth', 6)
    .attr('markerHeight', 6)
    .attr('orient', 'auto')
    .append('path')
    .attr('d', 'M0,-5L10,0L0,5')
    .attr('fill', theme.edgeStroke);

  // Create group for arrow
  const g = svg.append('g')
    .attr('class', className);

  if (id) g.attr('id', id);

  // Chord geometry shared by the path and the label
  const dx = x2 - x1;
  const dy = y2 - y1;
  const midX = (x1 + x2) / 2;
  const midY = (y1 + y2) / 2;

  let pathD;
  if (curved) {
    // Quadratic Bezier whose control point is pushed off the chord midpoint,
    // perpendicular to the chord
    const cx = midX - dy * curvature;
    const cy = midY + dx * curvature;
    pathD = `M${x1},${y1} Q${cx},${cy} ${x2},${y2}`;
  } else {
    pathD = `M${x1},${y1} L${x2},${y2}`;
  }

  const path = g.append('path')
    .attr('d', pathD)
    .attr('fill', 'none')
    .attr('stroke', theme.edgeStroke)
    .attr('stroke-width', 1.5)
    .attr('marker-end', `url(#${markerId})`);

  if (dashed) {
    path.attr('stroke-dasharray', '5,3');
  }

  if (label) {
    // Anchor the label on the drawn path. For a straight arrow that is the
    // chord midpoint; for a curved one it is the curve's t = 0.5 point, which
    // lies halfway between the chord midpoint and the control point. Anchoring
    // a curved arrow's label on the chord would leave it on the side opposite
    // the bow, away from its own curve.
    const bend = curved ? curvature / 2 : 0;
    const anchorX = midX - dy * bend;
    const anchorY = midY + dx * bend;

    // Nudge the label 12px off the path, perpendicular to the chord
    const angle = Math.atan2(dy, dx);
    const offsetX = Math.sin(angle) * 12;
    const offsetY = -Math.cos(angle) * 12;

    g.append('text')
      .attr('x', anchorX + offsetX)
      .attr('y', anchorY + offsetY)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', theme.nodeText)
      .attr('font-size', '10px')
      .text(label);
  }

  return g;
}

// =============================================================================
// STEP ANIMATION CONTROLLER
// =============================================================================

// Factory returning a controller for step-through animations.
// Options: {total, initialStep, speed, loop, onStepChange}
//   total:        number of steps; valid indices are 0 .. total - 1.
//   initialStep:  starting step, also the step restored by reset().
//   speed:        milliseconds between automatic steps while playing.
//   loop:         when true, next()/prev() wrap around at the ends.
//   onStepChange: optional callback invoked with the current step after
//                 setStep/next/prev/reset.
// The controller exposes read-only `current`, `isPlaying`, `total` and `speed`
// plus setStep, next, prev, play, stop, toggle, reset and setSpeed.
createStepController = function(options = {}) {
  const {
    total = 1,
    initialStep = 0,
    speed = 1000,
    loop = true,
    onStepChange = null
  } = options;

  let current = initialStep;
  let isPlaying = false;
  let intervalId = null;
  let currentSpeed = speed;

  const notifyChange = () => {
    if (typeof onStepChange === 'function') {
      onStepChange(current);
    }
  };

  const controller = {
    get current() { return current; },
    get isPlaying() { return isPlaying; },
    get total() { return total; },
    get speed() { return currentSpeed; },

    // Jump to `step`, clamped into 0 .. total - 1.
    setStep(step) {
      current = Math.max(0, Math.min(total - 1, step));
      notifyChange();
      return current;
    },

    // Advance one step; wraps to 0 when looping, otherwise stays on the last step.
    next() {
      const lastStep = total - 1;
      if (current < lastStep) {
        current++;
      } else if (loop) {
        current = 0;
      }
      // Without looping there is nothing left to play once the last step is
      // reached, so end playback instead of leaving a timer that would keep
      // firing (and re-notifying the same step) forever.
      if (!loop && isPlaying && current >= lastStep) {
        controller.stop();
      }
      notifyChange();
      return current;
    },

    // Go back one step; wraps to the last step when looping.
    prev() {
      if (current > 0) {
        current--;
      } else if (loop) {
        current = total - 1;
      }
      notifyChange();
      return current;
    },

    // Start automatic stepping every `speed` ms; no-op if already playing.
    play() {
      if (isPlaying) return;
      isPlaying = true;
      intervalId = setInterval(() => {
        controller.next();
      }, currentSpeed);
    },

    // Halt automatic stepping and release the timer.
    stop() {
      isPlaying = false;
      if (intervalId !== null) {
        clearInterval(intervalId);
        intervalId = null;
      }
    },

    toggle() {
      if (isPlaying) {
        controller.stop();
      } else {
        controller.play();
      }
    },

    // Stop playback and return to initialStep.
    reset() {
      controller.stop();
      current = initialStep;
      notifyChange();
    },

    // Change the interval; a running timer is restarted so the new speed
    // takes effect immediately.
    setSpeed(newSpeed) {
      currentSpeed = newSpeed;
      if (isPlaying) {
        controller.stop();
        controller.play();
      }
    }
  };

  return controller;
}

// =============================================================================
// FLOW DIAGRAM COMPONENT
// =============================================================================

// Higher-level component that renders a node/edge diagram and returns the
// detached <svg> DOM node.
// Options: {nodes, edges, width, height, activeNodes, activeEdges, theme,
//           nodeWidth, nodeHeight, padding}
//   nodes: [{id, x, y, label, sublabel, width, height}] - x/y is the node
//          center; width/height override nodeWidth/nodeHeight for that node.
//   edges: [{id, source, target, label, curved, curvature, dashed}] - source
//          and target are node ids. Edges that name an unknown node, or whose
//          two nodes share the same center, are skipped.
//   activeNodes / activeEdges: ids or array indices to draw highlighted.
//   padding: still accepted for backward compatibility; the layout does not
//            use it.
FlowDiagram = function(options) {
  const {
    nodes = [],
    edges = [],
    width = 600,
    height = 400,
    activeNodes = [],
    activeEdges = [],
    theme = diagramTheme,
    nodeWidth = 100,
    nodeHeight = 50
  } = options;

  // Create SVG element
  const svg = d3.create('svg')
    .attr('width', width)
    .attr('height', height)
    .attr('viewBox', `0 0 ${width} ${height}`)
    .attr('class', 'flow-diagram');

  // Add background
  svg.append('rect')
    .attr('width', width)
    .attr('height', height)
    .attr('fill', theme.bg)
    .attr('rx', 8);

  // Arrowhead markers: one in the normal edge color, one highlighted
  const defs = svg.append('defs');
  const addArrowMarker = (markerId, color) => {
    defs.append('marker')
      .attr('id', markerId)
      .attr('viewBox', '0 -5 10 10')
      .attr('refX', 8)
      .attr('refY', 0)
      .attr('markerWidth', 6)
      .attr('markerHeight', 6)
      .attr('orient', 'auto')
      .append('path')
      .attr('d', 'M0,-5L10,0L0,5')
      .attr('fill', color);
  };
  addArrowMarker('flow-arrow', theme.edgeStroke);
  addArrowMarker('flow-arrow-highlight', theme.highlight);

  // Edges layer (draw first so nodes appear on top)
  const edgesLayer = svg.append('g').attr('class', 'edges-layer');

  // Nodes layer
  const nodesLayer = svg.append('g').attr('class', 'nodes-layer');

  // Index nodes by id once instead of scanning the array for every edge.
  // The first node wins on duplicate ids, as a linear search would.
  const nodeById = new Map();
  nodes.forEach((node) => {
    if (!nodeById.has(node.id)) nodeById.set(node.id, node);
  });

  // Draw edges
  edges.forEach((edge, i) => {
    const sourceNode = nodeById.get(edge.source);
    const targetNode = nodeById.get(edge.target);

    if (!sourceNode || !targetNode) return;

    const dx = targetNode.x - sourceNode.x;
    const dy = targetNode.y - sourceNode.y;
    const len = Math.sqrt(dx * dx + dy * dy);

    // Coincident centers (including self-loops) have no direction; dividing by
    // a zero length would write NaN coordinates into the path.
    if (!(len > 0)) return;

    const isActive = activeEdges.includes(edge.id) || activeEdges.includes(i);
    const edgeColor = isActive ? theme.highlight : theme.edgeStroke;
    const markerId = isActive ? 'flow-arrow-highlight' : 'flow-arrow';

    // Shorten the path so it clears each node's box, honoring per-node widths.
    // The end gets extra room for the arrowhead.
    const offsetStart = (sourceNode.width || nodeWidth) / 2 + 5;
    const offsetEnd = (targetNode.width || nodeWidth) / 2 + 10;

    const startX = sourceNode.x + (dx / len) * offsetStart;
    const startY = sourceNode.y + (dy / len) * offsetStart;
    const endX = targetNode.x - (dx / len) * offsetEnd;
    const endY = targetNode.y - (dy / len) * offsetEnd;

    const edgeGroup = edgesLayer.append('g')
      .attr('class', `edge ${isActive ? 'highlighted' : ''}`);

    if (edge.id) edgeGroup.attr('id', edge.id);

    const midX = (startX + endX) / 2;
    const midY = (startY + endY) / 2;
    // `??` rather than `||` so an explicit curvature of 0 is respected.
    const curvature = edge.curved ? (edge.curvature ?? 0.2) : 0;

    // Draw path
    let pathD;
    if (edge.curved) {
      const cx = midX - dy * curvature;
      const cy = midY + dx * curvature;
      pathD = `M${startX},${startY} Q${cx},${cy} ${endX},${endY}`;
    } else {
      pathD = `M${startX},${startY} L${endX},${endY}`;
    }

    const path = edgeGroup.append('path')
      .attr('d', pathD)
      .attr('fill', 'none')
      .attr('stroke', edgeColor)
      .attr('stroke-width', isActive ? 2.5 : 1.5)
      .attr('marker-end', `url(#${markerId})`);

    if (edge.dashed) {
      path.attr('stroke-dasharray', '5,3');
    }

    if (isActive) {
      path.attr('filter', `drop-shadow(0 0 4px ${theme.highlightGlow})`);
    }

    // Add label if present
    if (edge.label) {
      // Anchor on the drawn path: the t = 0.5 point of a quadratic curve lies
      // halfway between the chord midpoint and the control point. For straight
      // edges curvature is 0 and this is the chord midpoint.
      const anchorX = midX - dy * (curvature / 2);
      const anchorY = midY + dx * (curvature / 2);
      const angle = Math.atan2(endY - startY, endX - startX);
      const offsetX = Math.sin(angle) * 14;
      const offsetY = -Math.cos(angle) * 14;

      edgeGroup.append('text')
        .attr('x', anchorX + offsetX)
        .attr('y', anchorY + offsetY)
        .attr('text-anchor', 'middle')
        .attr('dominant-baseline', 'central')
        .attr('fill', isActive ? theme.highlight : theme.nodeText)
        .attr('font-size', '10px')
        .text(edge.label);
    }
  });

  // Draw nodes
  nodes.forEach((node, i) => {
    const isActive = activeNodes.includes(node.id) || activeNodes.includes(i);
    const nodeFill = isActive ? theme.highlight : theme.nodeFill;
    const nodeStroke = isActive ? theme.highlight : theme.nodeStroke;
    const textFill = isActive ? theme.textOnHighlight : theme.nodeText;

    // Resolve the box size first so the rectangle is centered on (x, y) even
    // when the node overrides the default width or height.
    const boxWidth = node.width || nodeWidth;
    const boxHeight = node.height || nodeHeight;

    const nodeGroup = nodesLayer.append('g')
      .attr('class', `node ${isActive ? 'highlighted' : ''}`)
      .attr('transform', `translate(${node.x}, ${node.y})`);

    if (node.id) nodeGroup.attr('id', node.id);

    // Node rectangle
    const rect = nodeGroup.append('rect')
      .attr('x', -boxWidth / 2)
      .attr('y', -boxHeight / 2)
      .attr('width', boxWidth)
      .attr('height', boxHeight)
      .attr('rx', 6)
      .attr('ry', 6)
      .attr('fill', nodeFill)
      .attr('stroke', nodeStroke)
      .attr('stroke-width', isActive ? 2 : 1.5);

    if (isActive) {
      rect.attr('filter', `drop-shadow(0 0 6px ${theme.highlightGlow})`);
    }

    // Main label (moves up when a sublabel shares the box)
    const labelY = node.sublabel ? -6 : 0;
    nodeGroup.append('text')
      .attr('x', 0)
      .attr('y', labelY)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', textFill)
      .attr('font-size', '12px')
      .attr('font-weight', '500')
      .attr('pointer-events', 'none')
      .text(node.label || '');

    // Sublabel
    if (node.sublabel) {
      nodeGroup.append('text')
        .attr('x', 0)
        .attr('y', 10)
        .attr('text-anchor', 'middle')
        .attr('dominant-baseline', 'central')
        .attr('fill', textFill)
        .attr('font-size', '10px')
        .attr('opacity', isActive ? 0.9 : 0.7)
        .attr('pointer-events', 'none')
        .text(node.sublabel);
    }
  });

  return svg.node();
}

// =============================================================================
// EXPORTS
// =============================================================================

// Single export object bundling the diagram helpers defined in this file, so a
// lesson can reach all of them through one name.
// Referencing isDarkMode and diagramTheme here makes this cell depend on them.
diagramLib = {
  // Core dependencies
  d3,

  // Theme utilities: live dark-mode flag, CSS variable reader, resolved palette
  isDarkMode,
  getCSSVar,
  diagramTheme,

  // SVG primitives: each draws into a caller-supplied d3 selection
  createNode,
  createArrow,

  // Animation controller
  createStepController,

  // Components: build and return a complete <svg> node
  FlowDiagram
}

/**
 * Segmented step control for visualization stepping.
 *
 * Renders one button per integer step from `min` to `max`. Clicking a button,
 * or pressing the arrow keys / Home / End while the control has focus, selects
 * a step, updates `element.value` and dispatches a bubbling `input` event, so
 * the element can be used with `viewof`.
 *
 * @param {Object} options
 * @param {number} options.min - Minimum step value (default 0)
 * @param {number} options.max - Maximum step value
 * @param {number} options.value - Initial value (default min); clamped into
 *   [min, max]
 * @param {string} options.label - Optional label text
 * @returns {HTMLElement} Container element whose `value` property holds the
 *   current step
 */
stepControl = function({min = 0, max, value, label = null} = {}) {
  const steps = Array.from({length: max - min + 1}, (_, i) => min + i);
  // A missing or too-small `max` yields no segments at all.
  const hasSteps = steps.length > 0;

  // Clamp so an out-of-range `value` still marks a real segment as active and
  // keyboard navigation starts from a valid position.
  const requested = value ?? min;
  const initialValue = hasSteps ? Math.min(Math.max(requested, min), max) : requested;

  // type="button" keeps the segments from submitting an enclosing <form>.
  const container = htl.html`<div class="step-control">
    ${label ? htl.html`<span class="step-control-label">${label}</span>` : ''}
    <div class="step-control-segments" role="group" aria-label="${label || 'Step control'}">
      ${steps.map(step => htl.html`<button
        type="button"
        class="step-control-segment ${step === initialValue ? 'active' : ''}"
        data-step="${step}"
        aria-pressed="${step === initialValue}"
        tabindex="${step === initialValue ? 0 : -1}"
      >${step}</button>`)}
    </div>
  </div>`;

  const segments = container.querySelectorAll('.step-control-segment');
  let currentValue = initialValue;

  // Marks the segment for `newValue` as active (roving tabindex: only the
  // active segment is tabbable) and announces the change.
  function updateActive(newValue) {
    currentValue = newValue;
    segments.forEach(seg => {
      const isActive = Number.parseInt(seg.dataset.step, 10) === newValue;
      seg.classList.toggle('active', isActive);
      seg.setAttribute('aria-pressed', isActive);
      seg.tabIndex = isActive ? 0 : -1;
    });
    container.value = newValue;
    container.dispatchEvent(new Event('input', {bubbles: true}));
  }

  // Click handler
  segments.forEach(seg => {
    seg.addEventListener('click', () => {
      updateActive(Number.parseInt(seg.dataset.step, 10));
    });
  });

  // Keyboard navigation
  container.addEventListener('keydown', (e) => {
    if (!hasSteps) return;

    let target;
    switch (e.key) {
      case 'ArrowRight':
      case 'ArrowDown':
        target = Math.min(currentValue + 1, max);
        break;
      case 'ArrowLeft':
      case 'ArrowUp':
        target = Math.max(currentValue - 1, min);
        break;
      case 'Home':
        target = min;
        break;
      case 'End':
        target = max;
        break;
      default:
        return;
    }

    e.preventDefault();
    updateActive(target);
    // Optional call: a fractional current value has no matching segment.
    segments[target - min]?.focus();
  });

  container.value = initialValue;
  return container;
}

Introduction

Training teaches a language model to predict the next token. The process iterates:

Computing loss: How wrong are our predictions?
Computing gradients: Which direction should we adjust weights?
Updating weights: Take a small step in that direction
Repeat: Until the model gets good at prediction

This module covers cross-entropy loss, the AdamW optimizer, learning rate scheduling, gradient accumulation, and checkpointing.

What You’ll Learn

After this module, you can:

Understand cross-entropy loss and perplexity for language models
Implement learning rate schedules (warmup + cosine decay)
Use gradient accumulation for effective larger batch sizes
Apply gradient clipping for training stability
Read the floating-point number line from scratch — why fp16 overflows and underflows where bf16 doesn’t, and how loss scaling rescues small gradients
Save and load model checkpoints
Plan a compute-optimal run with scaling laws (the C≈6ND rule, Chinchilla’s ~20 tokens/param)

Prerequisites

This module requires familiarity with:

Module 02: Autograd — Gradient computation and backpropagation
Module 06: Transformer — The Transformer architecture that this module trains

Note: This lesson demonstrates concepts interactively. The training.py file provides production-ready implementations of the same algorithms.

The Training Objective

Language models learn through next-token prediction:

Input:    [The, cat, sat, on, the]
Target:   [cat, sat, on, the, mat]

For each position, predict the next token.

The loss function measures how well the model predicts. We use the cross-entropy between the predicted probabilities and the actual next tokens:

\[\text{loss} = -\sum \log(P(\text{correct\_token}))\]

Lower loss means the model assigns correct tokens higher probability.

The Training Loop

Neural networks learn through the training loop:

// Training loop steps data.
// Each row is [name, code, description, detail]; the row's position becomes the
// step's zero-based `id`.
trainingSteps = [
  [
    "Zero Gradients",
    "optimizer.zero_grad()",
    "Clear accumulated gradients from the previous iteration to start fresh.",
    "Gradients accumulate by default in PyTorch. Without zeroing, they add up across iterations."
  ],
  [
    "Forward Pass",
    "logits = model(input_ids)",
    "Pass input tokens through the model to get predicted logits.",
    "The model computes attention, embeddings, and projections to produce next-token predictions."
  ],
  [
    "Compute Loss",
    "loss = F.cross_entropy(logits, targets)",
    "Measure how wrong the predictions are compared to actual next tokens.",
    "Cross-entropy loss: lower means higher probability assigned to correct tokens."
  ],
  [
    "Backward Pass",
    "loss.backward()",
    "Compute gradients for all parameters via backpropagation.",
    "Automatic differentiation traces computation graph backward, computing dLoss/dParam."
  ],
  [
    "Gradient Clipping",
    "clip_grad_norm_(params, 1.0)",
    "Scale gradients if their norm exceeds threshold to prevent instability.",
    "Prevents exploding gradients that can cause NaN loss or divergent training."
  ],
  [
    "Optimizer Step",
    "optimizer.step()",
    "Update model weights using the computed (and clipped) gradients.",
    "AdamW applies momentum, adaptive learning rates, and weight decay to the update."
  ],
  [
    "Update LR",
    "scheduler.step()",
    "Adjust learning rate according to schedule (warmup + cosine decay).",
    "High LR early for exploration, lower LR later for fine-tuning convergence."
  ]
].map(([name, code, description, detail], id) => ({id, name, code, description, detail}))

// Step control for training loop
viewof trainingStep = stepControl({min: 0, max: 6, value: 0, label: "Training Step"})

// Current step info
currentTrainingStep = trainingSteps[trainingStep]

// Draw the cyclic training loop diagram
trainingLoopDiagram = {
  const width = 650;
  const height = 480;
  const centerX = width / 2;
  const centerY = height / 2 - 20;
  const radius = 160;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`);

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", diagramTheme.bg)
    .attr("rx", 8);

  // Defs for arrows
  const defs = svg.append("defs");

  // Standard arrow
  defs.append("marker")
    .attr("id", "training-arrow")
    .attr("viewBox", "0 -5 10 10")
    .attr("refX", 8)
    .attr("refY", 0)
    .attr("markerWidth", 5)
    .attr("markerHeight", 5)
    .attr("orient", "auto")
    .append("path")
    .attr("d", "M0,-5L10,0L0,5")
    .attr("fill", diagramTheme.edgeStroke);

  // Highlighted arrow
  defs.append("marker")
    .attr("id", "training-arrow-active")
    .attr("viewBox", "0 -5 10 10")
    .attr("refX", 8)
    .attr("refY", 0)
    .attr("markerWidth", 5)
    .attr("markerHeight", 5)
    .attr("orient", "auto")
    .append("path")
    .attr("d", "M0,-5L10,0L0,5")
    .attr("fill", diagramTheme.highlight);

  // Calculate node positions in a circle
  const nodeCount = 7;
  const startAngle = -Math.PI / 2; // Start at top

  const nodePositions = trainingSteps.map((step, i) => {
    const angle = startAngle + (i * 2 * Math.PI / nodeCount);
    return {
      ...step,
      x: centerX + radius * Math.cos(angle),
      y: centerY + radius * Math.sin(angle),
      angle: angle
    };
  });

  // Draw connecting arrows between nodes
  const edgesGroup = svg.append("g").attr("class", "edges");

  for (let i = 0; i < nodeCount; i++) {
    const from = nodePositions[i];
    const to = nodePositions[(i + 1) % nodeCount];

    // Calculate edge start/end to not overlap nodes
    const nodeRadius = 42;
    const dx = to.x - from.x;
    const dy = to.y - from.y;
    const dist = Math.sqrt(dx * dx + dy * dy);

    const startX = from.x + (dx / dist) * (nodeRadius + 2);
    const startY = from.y + (dy / dist) * (nodeRadius + 2);
    const endX = to.x - (dx / dist) * (nodeRadius + 8);
    const endY = to.y - (dy / dist) * (nodeRadius + 8);

    // This edge is highlighted when we're on the "from" step
    const isActive = trainingStep === i;

    edgesGroup.append("path")
      .attr("d", `M${startX},${startY} L${endX},${endY}`)
      .attr("fill", "none")
      .attr("stroke", isActive ? diagramTheme.highlight : diagramTheme.edgeStroke)
      .attr("stroke-width", isActive ? 2.5 : 1.5)
      .attr("marker-end", isActive ? "url(#training-arrow-active)" : "url(#training-arrow)")
      .attr("opacity", isActive ? 1 : 0.6)
      .style("filter", isActive ? `drop-shadow(0 0 4px ${diagramTheme.highlightGlow})` : "none");
  }

  // Draw nodes
  const nodesGroup = svg.append("g").attr("class", "nodes");

  nodePositions.forEach((node, i) => {
    const isActive = trainingStep === i;
    const nodeSize = 42;

    const g = nodesGroup.append("g")
      .attr("transform", `translate(${node.x}, ${node.y})`);

    // Circle node
    g.append("circle")
      .attr("r", nodeSize)
      .attr("fill", isActive ? diagramTheme.highlight : diagramTheme.nodeFill)
      .attr("stroke", isActive ? diagramTheme.highlight : diagramTheme.nodeStroke)
      .attr("stroke-width", isActive ? 2.5 : 1.5)
      .style("filter", isActive ? `drop-shadow(0 0 8px ${diagramTheme.highlightGlow})` : "none");

    // Step number
    g.append("text")
      .attr("y", -10)
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
      .attr("font-size", "11px")
      .attr("font-weight", "600")
      .attr("opacity", 0.7)
      .text(`Step ${i + 1}`);

    // Node label (split long names)
    const words = node.name.split(" ");
    if (words.length > 1) {
      g.append("text")
        .attr("y", 5)
        .attr("text-anchor", "middle")
        .attr("dominant-baseline", "central")
        .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
        .attr("font-size", "11px")
        .attr("font-weight", "500")
        .text(words[0]);
      g.append("text")
        .attr("y", 18)
        .attr("text-anchor", "middle")
        .attr("dominant-baseline", "central")
        .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
        .attr("font-size", "11px")
        .attr("font-weight", "500")
        .text(words.slice(1).join(" "));
    } else {
      g.append("text")
        .attr("y", 10)
        .attr("text-anchor", "middle")
        .attr("dominant-baseline", "central")
        .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
        .attr("font-size", "11px")
        .attr("font-weight", "500")
        .text(node.name);
    }
  });

  // Center label
  svg.append("text")
    .attr("x", centerX)
    .attr("y", centerY - 5)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "14px")
    .attr("font-weight", "600")
    .attr("opacity", 0.8)
    .text("Training");

  svg.append("text")
    .attr("x", centerX)
    .attr("y", centerY + 12)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "14px")
    .attr("font-weight", "600")
    .attr("opacity", 0.8)
    .text("Loop");

  // Info panel at bottom
  const infoY = height - 100;
  const infoGroup = svg.append("g")
    .attr("transform", `translate(${width / 2}, ${infoY})`);

  // Info box background
  infoGroup.append("rect")
    .attr("x", -280)
    .attr("y", -10)
    .attr("width", 560)
    .attr("height", 85)
    .attr("rx", 6)
    .attr("fill", diagramTheme.bgSecondary)
    .attr("stroke", diagramTheme.nodeStroke)
    .attr("stroke-width", 1);

  // Step name
  infoGroup.append("text")
    .attr("y", 8)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.highlight)
    .attr("font-size", "13px")
    .attr("font-weight", "600")
    .text(`${currentTrainingStep.id + 1}. ${currentTrainingStep.name}`);

  // Code
  infoGroup.append("text")
    .attr("y", 28)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.accent)
    .attr("font-size", "12px")
    .attr("font-family", "monospace")
    .text(currentTrainingStep.code);

  // Description (wrap if needed)
  const desc = currentTrainingStep.description;
  if (desc.length > 70) {
    const mid = desc.lastIndexOf(" ", 70);
    infoGroup.append("text")
      .attr("y", 50)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "11px")
      .text(desc.substring(0, mid));
    infoGroup.append("text")
      .attr("y", 64)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "11px")
      .text(desc.substring(mid + 1));
  } else {
    infoGroup.append("text")
      .attr("y", 55)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "11px")
      .text(desc);
  }

  return svg.node();
}

// Additional detail below the diagram: renders the `detail` text of the
// currently selected step (currentTrainingStep) as Markdown.
md`**Why this step matters:** ${currentTrainingStep.detail}`

Note: zero_grad() can be called either at the start or end of each iteration. Calling it at the start (shown above) is common because it ensures gradients are fresh before the backward pass.

Setup

import sys
import math
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Seed PyTorch's RNG so repeated runs produce the same numbers
torch.manual_seed(42)

# Choose the compute backend in priority order: Apple MPS, then CUDA, then CPU
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Display version and device info
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

PyTorch version: 2.12.1+cu130
Device: cpu

Cross-Entropy Loss

The loss function measures prediction error. Cross-entropy loss penalizes wrong predictions more heavily when the model is confident but incorrect.

Why cross-entropy?

Probabilistic interpretation: It measures the “surprise” when the true token appears
Gradient properties: Gradients are proportional to the error (predicted - actual)
Information theory: Minimizing cross-entropy = maximizing likelihood of data

Mathematical formulation:

\[\text{CrossEntropy}(p, q) = -\sum_{i} p_i \log(q_i)\]

For language modeling with one-hot targets (only one correct token), this simplifies to:

\[\text{Loss} = -\log(q_{\text{correct}})\]

where \(q_{\text{correct}}\) is the probability the model assigns to the correct token.

# Example: Model predicting next token
vocab_size = 10

# Raw scores (logits) for one position: a single row with one entry per token
logits = torch.tensor(
    [[-1.0, 0.5, 2.0, -0.5, 1.0, 0.0, -1.5, 0.3, -0.8, 0.2]]
)

# The true next token is index 2
target = torch.tensor([2])

# Softmax over the last axis turns the scores into probabilities
probs = F.softmax(logits, dim=-1)

target_idx = target.item()
formatted_probs = [f"{p:.3f}" for p in probs[0].tolist()]

print("Logits (raw model output):")
print(f"  {logits[0].tolist()}")
print("\nProbabilities (after softmax):")
print(f"  {formatted_probs}")
print(f"\nTarget token: {target_idx}")
print(f"Probability assigned to target: {probs[0, target_idx]:.4f}")

Logits (raw model output):
  [-1.0, 0.5, 2.0, -0.5, 1.0, 0.0, -1.5, 0.30000001192092896, -0.800000011920929, 0.20000000298023224]

Probabilities (after softmax):
  ['0.022', '0.097', '0.435', '0.036', '0.160', '0.059', '0.013', '0.080', '0.026', '0.072']

Target token: 2
Probability assigned to target: 0.4353

# Cross-entropy two ways: the library call, and by hand as -log(p_target)
target_prob = probs[0, target.item()]
loss = F.cross_entropy(logits, target)
manual_loss = -torch.log(target_prob)

print(f"Cross-entropy loss: {loss.item():.4f}")
print(f"Manual calculation: -log({target_prob:.4f}) = {manual_loss.item():.4f}")

# Perplexity is the exponential of the cross-entropy loss
perplexity = math.exp(loss.item())
print(f"\nPerplexity: {perplexity:.2f}")

Cross-entropy loss: 0.8317
Manual calculation: -log(0.4353) = 0.8317

Perplexity: 2.30

Let’s visualize how loss changes with probability:

// Loss vs Probability interactive chart
lossProbChart = {
  const width = 650;
  const height = 340;
  const margin = { top: 40, right: 30, bottom: 50, left: 60 };
  const innerWidth = width - margin.left - margin.right;
  const innerHeight = height - margin.top - margin.bottom;

  const theme = diagramTheme;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`);

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", theme.bg)
    .attr("rx", 8);

  const chart = svg.append("g")
    .attr("transform", `translate(${margin.left}, ${margin.top})`);

  // Generate data
  const data = [];
  for (let p = 0.01; p <= 0.99; p += 0.01) {
    data.push({ prob: p, loss: -Math.log(p) });
  }

  // Scales
  const xScale = d3.scaleLinear()
    .domain([0, 1])
    .range([0, innerWidth]);

  const yScale = d3.scaleLinear()
    .domain([0, 5])
    .range([innerHeight, 0]);

  // Grid lines
  [0, 1, 2, 3, 4, 5].forEach(tick => {
    chart.append("line")
      .attr("x1", 0)
      .attr("x2", innerWidth)
      .attr("y1", yScale(tick))
      .attr("y2", yScale(tick))
      .attr("stroke", theme.nodeStroke)
      .attr("stroke-opacity", 0.3)
      .attr("stroke-dasharray", "3,3");
  });

  // Zero line
  chart.append("line")
    .attr("x1", 0)
    .attr("x2", innerWidth)
    .attr("y1", yScale(0))
    .attr("y2", yScale(0))
    .attr("stroke", theme.nodeText)
    .attr("stroke-opacity", 0.4)
    .attr("stroke-dasharray", "5,3");

  // Line generator
  const lineGen = d3.line()
    .x(d => xScale(d.prob))
    .y(d => yScale(d.loss))
    .curve(d3.curveMonotoneX);

  // Area under curve
  const areaGen = d3.area()
    .x(d => xScale(d.prob))
    .y0(innerHeight)
    .y1(d => yScale(d.loss))
    .curve(d3.curveMonotoneX);

  chart.append("path")
    .datum(data)
    .attr("d", areaGen)
    .attr("fill", theme.accent)
    .attr("opacity", 0.1);

  // Main line
  chart.append("path")
    .datum(data)
    .attr("d", lineGen)
    .attr("fill", "none")
    .attr("stroke", theme.accent)
    .attr("stroke-width", 3);

  // Annotated points
  const points = [
    { prob: 0.1, label: "P=0.1", loss: -Math.log(0.1) },
    { prob: 0.5, label: "P=0.5", loss: -Math.log(0.5) },
    { prob: 0.9, label: "P=0.9", loss: -Math.log(0.9) }
  ];

  points.forEach(pt => {
    // Point circle
    chart.append("circle")
      .attr("cx", xScale(pt.prob))
      .attr("cy", yScale(pt.loss))
      .attr("r", 8)
      .attr("fill", theme.highlight)
      .attr("stroke", theme.bg === "transparent" ? theme.bgSecondary : theme.bg)
      .attr("stroke-width", 2);

    // Label
    const labelX = xScale(pt.prob) + 12;
    const labelY = yScale(pt.loss) - 8;

    chart.append("text")
      .attr("x", labelX)
      .attr("y", labelY)
      .attr("fill", theme.highlight)
      .attr("font-size", "11px")
      .attr("font-weight", "600")
      .text(pt.label);

    chart.append("text")
      .attr("x", labelX)
      .attr("y", labelY + 14)
      .attr("fill", theme.nodeText)
      .attr("font-size", "10px")
      .text(`Loss=${pt.loss.toFixed(2)}`);
  });

  // X-axis
  chart.append("g")
    .attr("transform", `translate(0, ${innerHeight})`)
    .call(d3.axisBottom(xScale).ticks(10).tickFormat(d3.format(".1f")))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "11px"));

  chart.append("text")
    .attr("x", innerWidth / 2)
    .attr("y", innerHeight + 40)
    .attr("text-anchor", "middle")
    .attr("font-size", "12px")
    .attr("fill", theme.nodeText)
    .text("Probability assigned to correct token");

  // Y-axis
  chart.append("g")
    .call(d3.axisLeft(yScale).ticks(5))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "11px"));

  chart.append("text")
    .attr("transform", "rotate(-90)")
    .attr("x", -innerHeight / 2)
    .attr("y", -45)
    .attr("text-anchor", "middle")
    .attr("font-size", "12px")
    .attr("fill", theme.nodeText)
    .text("Cross-entropy loss");

  // Title
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 24)
    .attr("text-anchor", "middle")
    .attr("font-size", "14px")
    .attr("font-weight", "600")
    .attr("fill", theme.nodeText)
    .text("Loss vs Probability");

  return svg.node();
}

The higher the probability assigned to the correct token, the lower the loss, and the better the prediction.

Cross-Entropy from Scratch

Before using F.cross_entropy, let’s understand what it does internally.

The Numerical Stability Problem

Softmax involves exp(x), which explodes for large x:

# The problem: exp() overflows easily (float64 exp(x) becomes inf once x exceeds ~709)
logits_big = np.array([1000.0, 1001.0, 1002.0])
print(f"exp(logits) = {np.exp(logits_big)}")  # [inf, inf, inf] - overflow!

exp(logits) = [inf inf inf]

/tmp/ipykernel_23663/1953512672.py:3: RuntimeWarning: overflow encountered in exp
  print(f"exp(logits) = {np.exp(logits_big)}")  # [inf, inf, inf] - overflow!

The Fix: Log-Sum-Exp Trick

The key insight is that we can compute log-softmax stably by subtracting the maximum:

\[\log \text{softmax}(x_i) = x_i - \log\sum_j e^{x_j} = x_i - \underbrace{(m + \log\sum_j e^{x_j - m})}_{\text{logsumexp}}\]

where \(m = \max(x)\). By subtracting the max, all exponents become \(\leq 0\), avoiding overflow.

def logsumexp(x: np.ndarray, axis: int = -1, keepdims: bool = True) -> np.ndarray:
    """
    Stable log(sum(exp(x))) along ``axis``.

    Trick: log(sum(exp(x))) = m + log(sum(exp(x - m)))
    where m = max(x) along ``axis``. This keeps exp() arguments <= 0, so the
    largest term is exp(0) = 1 and nothing overflows.

    Args:
        x: Input array.
        axis: Axis to reduce over.
        keepdims: If True, the reduced axis is kept with length 1; if False,
            it is removed from the result.

    Returns:
        Array of log-sum-exp values. Its shape is ``x.shape`` with ``axis``
        either set to 1 (``keepdims=True``) or dropped (``keepdims=False``).
    """
    m = x.max(axis=axis, keepdims=True)
    # Reduce with keepdims=True so the sum lines up with `m` element-for-element.
    # Adding a kept-axis max to a dropped-axis sum would broadcast instead
    # (e.g. (B, 1) + (B,) -> (B, B)).
    out = m + np.log(np.exp(x - m).sum(axis=axis, keepdims=True))
    return out if keepdims else np.squeeze(out, axis=axis)

# Now it works!
# Same logits that overflowed under a plain exp(); keepdims=False asks for the
# reduced axis to be dropped from the result.
print(f"logsumexp(logits) = {logsumexp(logits_big, keepdims=False)}")

logsumexp(logits) = [1002.40760596]

Cross-Entropy Implementation

def cross_entropy_scratch(logits: np.ndarray, targets: np.ndarray) -> float:
    """
    Mean cross-entropy loss computed directly from logits.

    Formula (per sample): loss = logsumexp(logits) - logits[correct_class]

    This is equivalent to -log(softmax(logits)[correct_class]) but is
    numerically stable because the softmax is never materialized.

    Args:
        logits: (B, C) raw scores for each class.
        targets: (B,) integer class labels used to index the class axis.

    Returns:
        The loss averaged over the B samples, as a Python float.

    Raises:
        ValueError: if ``logits`` is not 2-D (from the shape unpacking).
    """
    B, _ = logits.shape  # unpacking also enforces that logits is 2-D

    # log(sum(exp(logits))) per row. Ask for the kept axis, shape (B, 1), and
    # take column 0 so the result is always exactly (B,). This avoids any
    # broadcast against the (B,) correct-class logits, and a batch of one is
    # not collapsed to a scalar as a bare .squeeze() would do.
    lse = logsumexp(logits, axis=-1, keepdims=True)[:, 0]  # (B,)

    # Gather the logit of the correct class for every sample
    correct_logits = logits[np.arange(B), targets]  # (B,)

    # Loss per sample, then mean
    losses = lse - correct_logits  # (B,)
    return float(losses.mean())

# Test: a batch of two samples over three classes
test_logits = np.array([[2.0, 1.0, 0.1], [0.5, 2.5, 0.3]])
test_targets = np.array([0, 1])  # First sample: class 0, second: class 1
print(f"Cross-entropy loss (scratch): {cross_entropy_scratch(test_logits, test_targets):.4f}")

Cross-entropy loss (scratch): 0.3185

PyTorch Equivalent

# Compare with PyTorch: same two samples and labels as the from-scratch test
logits_pt = torch.tensor([
    [2.0, 1.0, 0.1],
    [0.5, 2.5, 0.3],
])
targets_pt = torch.tensor([0, 1])

loss_pt = F.cross_entropy(logits_pt, targets_pt)
loss_pt_value = loss_pt.item()
print(f"Cross-entropy loss (PyTorch): {loss_pt_value:.4f}")

Cross-entropy loss (PyTorch): 0.3185

Same result! PyTorch’s F.cross_entropy does exactly this internally, plus handles gradients automatically.

Key Insight

Cross-entropy is just logsumexp(logits) - logits[correct_class]. The logsumexp trick prevents numerical overflow by subtracting the max before exponentiating.

Perplexity

Perplexity is a more intuitive measure than raw loss:

\[\text{Perplexity} = e^{\text{cross\_entropy\_loss}}\]

Interpretation: “The model is as confused as if it were choosing uniformly among N options.”

Loss	Perplexity	Interpretation
0.0	1.0	Perfect predictions
2.3	10	~10 equally likely options
4.6	100	~100 equally likely options
6.9	1000	Random guessing (vocab=1000)

For reference:

- GPT-2 on WebText: ~20 perplexity
- Human baseline: ~10-20 perplexity (depends on domain)

Learning Rate Schedule

We vary the learning rate over training, using warmup followed by cosine decay:

// Peak learning rate, chosen as a base-10 exponent: slider over [-5, -2] in
// steps of 0.5, default -3. Converted to an actual rate by lrMaxLR below.
viewof lrMaxLRExp = Inputs.range([-5, -2], {
  value: -3,
  step: 0.5,
  label: "Max LR (10^x)"
})

// Floor learning rate, also as a base-10 exponent: slider over [-6, -3] in
// steps of 0.5, default -5.
// NOTE(review): this range overlaps the Max LR range, so the minimum can be
// set above the maximum; nothing here prevents that.
viewof lrMinLRExp = Inputs.range([-6, -3], {
  value: -5,
  step: 0.5,
  label: "Min LR (10^x)"
})

// Number of warmup steps: 0 to 500 in steps of 10, default 100.
// NOTE(review): the upper bound (500) exceeds the smallest allowed total
// (100), so warmup can be set longer than the whole schedule.
viewof lrWarmupSteps = Inputs.range([0, 500], {
  value: 100,
  step: 10,
  label: "Warmup Steps"
})

// Total number of training steps: 100 to 2000 in steps of 50, default 1000.
viewof lrTotalSteps = Inputs.range([100, 2000], {
  value: 1000,
  step: 50,
  label: "Total Steps"
})

// Step to inspect. Its upper bound is lrTotalSteps, so this cell references
// the total-steps slider above.
viewof lrCurrentStep = Inputs.range([0, lrTotalSteps], {
  value: 0,
  step: 1,
  label: "Current Step"
})

// Actual learning rates derived from the exponent sliders (10^x)
lrMaxLR = Math.pow(10, lrMaxLRExp)
lrMinLR = Math.pow(10, lrMinLRExp)

// LR Schedule calculation function
lrScheduleData = {
  const maxLR = lrMaxLR;
  const minLR = lrMinLR;
  const data = [];

  for (let step = 0; step <= lrTotalSteps; step++) {
    let lr;
    let phase;

    if (step < lrWarmupSteps) {
      // Linear warmup
      lr = maxLR * step / Math.max(1, lrWarmupSteps);
      phase = "warmup";
    } else if (step >= lrTotalSteps) {
      lr = minLR;
      phase = "decay";
    } else {
      // Cosine decay
      const progress = (step - lrWarmupSteps) / Math.max(1, lrTotalSteps - lrWarmupSteps);
      const cosine = 0.5 * (1 + Math.cos(Math.PI * progress));
      lr = minLR + (maxLR - minLR) * cosine;
      phase = "decay";
    }

    data.push({ step, lr, phase });
  }

  return data;
}

// Current LR value
currentLR = {
  const maxLR = lrMaxLR;
  const minLR = lrMinLR;
  const step = lrCurrentStep;

  if (step < lrWarmupSteps) {
    return maxLR * step / Math.max(1, lrWarmupSteps);
  } else if (step >= lrTotalSteps) {
    return minLR;
  } else {
    const progress = (step - lrWarmupSteps) / Math.max(1, lrTotalSteps - lrWarmupSteps);
    const cosine = 0.5 * (1 + Math.cos(Math.PI * progress));
    return minLR + (maxLR - minLR) * cosine;
  }
}

// Phase of the selected step: "warmup" before the warmup boundary, "peak"
// exactly on it, "decay" after it.
currentPhase = lrCurrentStep < lrWarmupSteps
  ? "warmup"
  : lrCurrentStep === lrWarmupSteps
    ? "peak"
    : "decay"

// Learning Rate Schedule Visualization
lrScheduleChart = {
  const width = 700;
  const height = 380;
  const margin = { top: 40, right: 30, bottom: 50, left: 60 };
  const innerWidth = width - margin.left - margin.right;
  const innerHeight = height - margin.top - margin.bottom;

  const theme = diagramTheme;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`)
    .style("font-family", "'JetBrains Mono', 'Fira Code', monospace");

  // Background with gradient
  const defs = svg.append("defs");

  const bgGradient = defs.append("linearGradient")
    .attr("id", "lr-bg-gradient")
    .attr("x1", "0%")
    .attr("y1", "0%")
    .attr("x2", "0%")
    .attr("y2", "100%");

  bgGradient.append("stop")
    .attr("offset", "0%")
    .attr("stop-color", theme.bg);

  bgGradient.append("stop")
    .attr("offset", "100%")
    .attr("stop-color", theme.bgSecondary);

  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", "url(#lr-bg-gradient)")
    .attr("rx", 12);

  // Chart area
  const chart = svg.append("g")
    .attr("transform", `translate(${margin.left}, ${margin.top})`);

  // Scales - dynamic based on max/min LR
  const xScale = d3.scaleLinear()
    .domain([0, lrTotalSteps])
    .range([0, innerWidth]);

  const yScale = d3.scaleLinear()
    .domain([0, lrMaxLR * 1.1])
    .range([innerHeight, 0]);

  // Phase background regions
  // Warmup region
  if (lrWarmupSteps > 0) {
    chart.append("rect")
      .attr("x", 0)
      .attr("y", 0)
      .attr("width", xScale(lrWarmupSteps))
      .attr("height", innerHeight)
      .attr("fill", theme.accent)
      .attr("opacity", currentPhase === "warmup" ? 0.15 : 0.05);
  }

  // Decay region
  chart.append("rect")
    .attr("x", xScale(lrWarmupSteps))
    .attr("y", 0)
    .attr("width", innerWidth - xScale(lrWarmupSteps))
    .attr("height", innerHeight)
    .attr("fill", theme.highlight)
    .attr("opacity", currentPhase === "decay" || currentPhase === "peak" ? 0.1 : 0.03);

  // Grid lines - dynamic based on max LR
  const yTicks = [0, lrMaxLR * 0.25, lrMaxLR * 0.5, lrMaxLR * 0.75, lrMaxLR];
  yTicks.forEach(tick => {
    chart.append("line")
      .attr("x1", 0)
      .attr("x2", innerWidth)
      .attr("y1", yScale(tick))
      .attr("y2", yScale(tick))
      .attr("stroke", theme.nodeStroke)
      .attr("stroke-opacity", 0.3)
      .attr("stroke-dasharray", "3,3");
  });

  // Phase labels
  if (lrWarmupSteps > 0) {
    chart.append("text")
      .attr("x", xScale(lrWarmupSteps / 2))
      .attr("y", 15)
      .attr("text-anchor", "middle")
      .attr("font-size", "11px")
      .attr("font-weight", currentPhase === "warmup" ? "600" : "400")
      .attr("fill", currentPhase === "warmup" ? theme.accent : theme.nodeText)
      .attr("opacity", currentPhase === "warmup" ? 1 : 0.5)
      .text("WARMUP");
  }

  chart.append("text")
    .attr("x", xScale(lrWarmupSteps + (lrTotalSteps - lrWarmupSteps) / 2))
    .attr("y", 15)
    .attr("text-anchor", "middle")
    .attr("font-size", "11px")
    .attr("font-weight", currentPhase === "decay" ? "600" : "400")
    .attr("fill", currentPhase === "decay" || currentPhase === "peak" ? theme.highlight : theme.nodeText)
    .attr("opacity", currentPhase === "decay" || currentPhase === "peak" ? 1 : 0.5)
    .text("COSINE DECAY");

  // Line generator
  const lineGen = d3.line()
    .x(d => xScale(d.step))
    .y(d => yScale(d.lr))
    .curve(d3.curveMonotoneX);

  // Gradient for the line
  const lineGradient = defs.append("linearGradient")
    .attr("id", "lr-line-gradient")
    .attr("gradientUnits", "userSpaceOnUse")
    .attr("x1", 0)
    .attr("x2", innerWidth)
    .attr("y1", 0)
    .attr("y2", 0);

  lineGradient.append("stop")
    .attr("offset", "0%")
    .attr("stop-color", theme.accent);

  const warmupPct = (lrWarmupSteps / lrTotalSteps * 100).toFixed(1);
  lineGradient.append("stop")
    .attr("offset", `${warmupPct}%`)
    .attr("stop-color", theme.accent);

  lineGradient.append("stop")
    .attr("offset", `${warmupPct}%`)
    .attr("stop-color", theme.highlight);

  lineGradient.append("stop")
    .attr("offset", "100%")
    .attr("stop-color", theme.highlight);

  // Area under curve
  const areaGen = d3.area()
    .x(d => xScale(d.step))
    .y0(innerHeight)
    .y1(d => yScale(d.lr))
    .curve(d3.curveMonotoneX);

  // Area gradient
  const areaGradient = defs.append("linearGradient")
    .attr("id", "lr-area-gradient")
    .attr("x1", "0%")
    .attr("y1", "0%")
    .attr("x2", "0%")
    .attr("y2", "100%");

  areaGradient.append("stop")
    .attr("offset", "0%")
    .attr("stop-color", theme.highlight)
    .attr("stop-opacity", 0.3);

  areaGradient.append("stop")
    .attr("offset", "100%")
    .attr("stop-color", theme.highlight)
    .attr("stop-opacity", 0.02);

  chart.append("path")
    .datum(lrScheduleData)
    .attr("d", areaGen)
    .attr("fill", "url(#lr-area-gradient)");

  // Main line
  chart.append("path")
    .datum(lrScheduleData)
    .attr("d", lineGen)
    .attr("fill", "none")
    .attr("stroke", "url(#lr-line-gradient)")
    .attr("stroke-width", 3)
    .attr("stroke-linecap", "round");

  // Current step marker
  const currentX = xScale(lrCurrentStep);
  const currentY = yScale(currentLR);

  // Vertical line at current step
  chart.append("line")
    .attr("x1", currentX)
    .attr("x2", currentX)
    .attr("y1", 0)
    .attr("y2", innerHeight)
    .attr("stroke", theme.nodeText)
    .attr("stroke-opacity", 0.4)
    .attr("stroke-dasharray", "4,4");

  // Horizontal line to y-axis
  chart.append("line")
    .attr("x1", 0)
    .attr("x2", currentX)
    .attr("y1", currentY)
    .attr("y2", currentY)
    .attr("stroke", theme.nodeText)
    .attr("stroke-opacity", 0.4)
    .attr("stroke-dasharray", "4,4");

  // Glow effect for marker
  const glowFilter = defs.append("filter")
    .attr("id", "lr-marker-glow")
    .attr("x", "-50%")
    .attr("y", "-50%")
    .attr("width", "200%")
    .attr("height", "200%");

  glowFilter.append("feGaussianBlur")
    .attr("stdDeviation", "4")
    .attr("result", "blur");

  glowFilter.append("feMerge")
    .selectAll("feMergeNode")
    .data(["blur", "SourceGraphic"])
    .join("feMergeNode")
    .attr("in", d => d);

  // Current step dot with glow
  chart.append("circle")
    .attr("cx", currentX)
    .attr("cy", currentY)
    .attr("r", 12)
    .attr("fill", currentPhase === "warmup" ? theme.accent : theme.highlight)
    .attr("opacity", 0.3)
    .attr("filter", "url(#lr-marker-glow)");

  chart.append("circle")
    .attr("cx", currentX)
    .attr("cy", currentY)
    .attr("r", 6)
    .attr("fill", currentPhase === "warmup" ? theme.accent : theme.highlight)
    .attr("stroke", theme.bg === "transparent" ? theme.bgSecondary : theme.bg)
    .attr("stroke-width", 2);

  // X-axis
  chart.append("g")
    .attr("transform", `translate(0, ${innerHeight})`)
    .call(d3.axisBottom(xScale).ticks(8).tickFormat(d => d))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text")
      .attr("fill", theme.nodeText)
      .attr("font-size", "11px"));

  // X-axis label
  chart.append("text")
    .attr("x", innerWidth / 2)
    .attr("y", innerHeight + 40)
    .attr("text-anchor", "middle")
    .attr("font-size", "12px")
    .attr("fill", theme.nodeText)
    .text("Training Steps");

  // Y-axis with scientific notation for small LR values
  chart.append("g")
    .call(d3.axisLeft(yScale).ticks(5).tickFormat(d => {
      if (d === 0) return "0";
      if (d < 0.01) return d.toExponential(0);
      return d.toFixed(4);
    }))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text")
      .attr("fill", theme.nodeText)
      .attr("font-size", "10px"));

  // Y-axis label
  chart.append("text")
    .attr("transform", "rotate(-90)")
    .attr("x", -innerHeight / 2)
    .attr("y", -45)
    .attr("text-anchor", "middle")
    .attr("font-size", "12px")
    .attr("fill", theme.nodeText)
    .text("Learning Rate");

  // Info box
  const infoBox = svg.append("g")
    .attr("transform", `translate(${width - 170}, 50)`);

  infoBox.append("rect")
    .attr("x", 0)
    .attr("y", 0)
    .attr("width", 150)
    .attr("height", 80)
    .attr("rx", 8)
    .attr("fill", theme.nodeFill)
    .attr("stroke", theme.nodeStroke)
    .attr("stroke-width", 1.5);

  infoBox.append("text")
    .attr("x", 75)
    .attr("y", 22)
    .attr("text-anchor", "middle")
    .attr("font-size", "10px")
    .attr("font-weight", "600")
    .attr("fill", theme.nodeText)
    .attr("opacity", 0.6)
    .text("CURRENT");

  // Format LR for display: use exponential for small values
  const lrDisplay = currentLR < 0.0001 ? currentLR.toExponential(2) : currentLR.toFixed(6);

  infoBox.append("text")
    .attr("x", 75)
    .attr("y", 45)
    .attr("text-anchor", "middle")
    .attr("font-size", "15px")
    .attr("font-weight", "700")
    .attr("fill", currentPhase === "warmup" ? theme.accent : theme.highlight)
    .text(`LR: ${lrDisplay}`);

  infoBox.append("text")
    .attr("x", 75)
    .attr("y", 65)
    .attr("text-anchor", "middle")
    .attr("font-size", "11px")
    .attr("fill", theme.nodeText)
    .attr("opacity", 0.7)
    .text(`Step ${lrCurrentStep} / ${lrTotalSteps}`);

  // Phase indicator badge
  const phaseBadge = svg.append("g")
    .attr("transform", `translate(${margin.left + 10}, 55)`);

  const phaseColor = currentPhase === "warmup" ? theme.accent : theme.highlight;
  const phaseLabel = currentPhase.toUpperCase();

  phaseBadge.append("rect")
    .attr("x", 0)
    .attr("y", 0)
    .attr("width", 75)
    .attr("height", 24)
    .attr("rx", 12)
    .attr("fill", phaseColor)
    .attr("opacity", 0.9);

  phaseBadge.append("text")
    .attr("x", 37.5)
    .attr("y", 16)
    .attr("text-anchor", "middle")
    .attr("font-size", "10px")
    .attr("font-weight", "700")
    .attr("fill", theme.textOnHighlight)
    .text(phaseLabel);

  return svg.node();
}

Why warmup?

- Early training is unstable with a large LR
- Gradients are noisy before the weights settle
- A small LR lets the model “get its bearings”

Why decay?

- A large LR is good for exploration early on
- A small LR is good for fine-tuning later
- Cosine decay is smooth (no sudden changes)

class CosineScheduler:
    """Linear-warmup / cosine-decay learning rate schedule for an optimizer.

    The peak rate is read from the optimizer's first parameter group at
    construction time. Each call to ``step()`` writes the rate for the current
    step into every parameter group and then advances the step counter.
    """

    def __init__(self, optimizer, warmup_steps, total_steps, min_lr=0.0):
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.min_lr = min_lr
        # Peak LR: whatever the first param group was configured with
        self.base_lr = optimizer.param_groups[0]['lr']
        self.current_step = 0

    def get_lr(self):
        """Return the learning rate for ``current_step`` without advancing it."""
        step = self.current_step
        warmup = self.warmup_steps

        # Linear warmup from 0 up to base_lr (denominator floored at 1)
        if step < warmup:
            return self.base_lr * step / max(1, warmup)

        # At or beyond the end of the schedule: stay on the floor
        if step >= self.total_steps:
            return self.min_lr

        # Cosine decay from base_lr down towards min_lr
        decay_span = max(1, self.total_steps - warmup)
        progress = (step - warmup) / decay_span
        cosine = 0.5 * (1 + math.cos(math.pi * progress))
        return self.min_lr + (self.base_lr - self.min_lr) * cosine

    def step(self):
        """Apply the current step's LR to all param groups, advance, return it."""
        new_lr = self.get_lr()
        for group in self.optimizer.param_groups:
            group['lr'] = new_lr
        self.current_step += 1
        return new_lr

# A tiny model and optimizer so the scheduler has something to drive
model = nn.Linear(10, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Create scheduler
scheduler = CosineScheduler(optimizer, warmup_steps=100, total_steps=1000, min_lr=1e-5)

# Collect LRs over training: step() returns the rate it applied for the
# current step before advancing, so lrs[i] is the LR used at step i
lrs = [scheduler.step() for _ in range(1000)]

initial_lr, post_warmup_lr, final_lr = lrs[0], lrs[100], lrs[-1]
print(f"Initial LR: {initial_lr:.6f}")
print(f"After warmup (step 100): {post_warmup_lr:.6f}")
print(f"Final LR: {final_lr:.6f}")

Initial LR: 0.000000
After warmup (step 100): 0.001000
Final LR: 0.000010

The interactive visualization above shows how learning rate changes over training. Try adjusting the warmup steps and total steps sliders to see how they affect the schedule.

AdamW Optimizer

AdamW decouples weight decay from Adam's gradient-based update (applying true weight decay instead of an L2 penalty folded into the gradient) and serves as the standard optimizer for language models.

Why AdamW over SGD or Adam?

SGD: Requires careful learning rate tuning per layer, slow convergence
Adam: Weight decay is added to the gradient as an L2 penalty, so it gets rescaled by the adaptive learning rate (not equivalent to true weight decay)
AdamW: Decouples weight decay from gradient updates (mathematically correct)

// AdamW step-through visualization.
// Step selector with values 0 through 4, starting at 0; the value indexes the
// five entries of adamwStepInfo and adamwComputation. stepControl is defined
// elsewhere in this file.
viewof adamwStep = stepControl({min: 0, max: 4, value: 0, label: "AdamW Step"})

// Descriptor for the AdamW step currently selected by adamwStep.
// Each entry carries:
//   title       - short step name
//   description - one-sentence explanation
//   formula     - the update rule for that step, as display text
//   highlight   - ids of the adamwDiagram nodes to mark active for this step
//                 (an edge is active when both of its endpoints are listed)
adamwStepInfo = {
  const steps = [
    {
      title: "Input Gradient",
      description: "Receive gradient g from backpropagation",
      formula: "g = dL/dθ",
      highlight: ["gradient"]
    },
    {
      title: "Momentum Update",
      description: "Update first moment (exponential moving average of gradients)",
      formula: "m = β₁·m + (1-β₁)·g",
      highlight: ["gradient", "momentum"]
    },
    {
      title: "Adaptive Learning Rate",
      description: "Update second moment (exponential moving average of squared gradients)",
      formula: "v = β₂·v + (1-β₂)·g²",
      highlight: ["gradient", "velocity"]
    },
    {
      title: "Bias Correction",
      description: "Correct for initialization bias in early timesteps",
      formula: "m̂ = m/(1-β₁ᵗ), v̂ = v/(1-β₂ᵗ)",
      highlight: ["momentum", "velocity", "bias"]
    },
    {
      // NOTE(review): this display formula has no epsilon term, while
      // adamwComputation does use a 1e-8 stabilizer in the division.
      title: "Weight Update",
      description: "Apply adaptive update with decoupled weight decay",
      formula: "θ = θ - lr·(m̂/√v̂ + λ·θ)",
      highlight: ["bias", "update"]
    }
  ];
  // adamwStep is the 0-based index chosen by the step control
  return steps[adamwStep];
}

// Numeric computation for AdamW example
adamwComputation = {
  // Initial values and hyperparameters
  const g = 0.5;           // gradient
  const beta1 = 0.9;
  const beta2 = 0.999;
  const lr = 0.001;
  const lambda = 0.01;     // weight decay
  const t = 5;             // timestep
  const m_prev = 0.1;      // previous momentum
  const v_prev = 0.01;     // previous velocity
  const theta_prev = 0.75; // previous weight

  // Step 0: Just the gradient
  const step0 = { g };

  // Step 1: Momentum update
  const m = beta1 * m_prev + (1 - beta1) * g;
  const step1 = { ...step0, m, m_prev };

  // Step 2: Velocity update
  const v = beta2 * v_prev + (1 - beta2) * (g * g);
  const step2 = { ...step1, v, v_prev };

  // Step 3: Bias correction
  const m_hat = m / (1 - Math.pow(beta1, t));
  const v_hat = v / (1 - Math.pow(beta2, t));
  const step3 = { ...step2, m_hat, v_hat };

  // Step 4: Weight update
  const adam_update = m_hat / Math.sqrt(v_hat + 1e-8);
  const weight_decay = lambda * theta_prev;
  const theta = theta_prev - lr * (adam_update + weight_decay);
  const step4 = { ...step3, adam_update, weight_decay, theta, theta_prev };

  const steps = [step0, step1, step2, step3, step4];
  return {
    ...steps[adamwStep],
    beta1, beta2, lr, lambda, t,
    step: adamwStep
  };
}

// AdamW flowchart visualization
adamwDiagram = {
  const width = 680;
  const height = 420;
  const theme = diagramTheme;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`)
    .style("font-family", "'IBM Plex Mono', 'Fira Code', monospace");

  // Background with subtle gradient
  const bgGrad = svg.append("defs").append("linearGradient")
    .attr("id", "adamw-bg-grad")
    .attr("x1", "0%").attr("y1", "0%")
    .attr("x2", "100%").attr("y2", "100%");
  bgGrad.append("stop").attr("offset", "0%").attr("stop-color", theme.bg);
  bgGrad.append("stop").attr("offset", "100%").attr("stop-color", theme.bgSecondary);

  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", "url(#adamw-bg-grad)")
    .attr("rx", 12);

  // Glow filter for highlights
  const defs = svg.select("defs");
  const glowFilter = defs.append("filter")
    .attr("id", "adamw-glow")
    .attr("x", "-50%").attr("y", "-50%")
    .attr("width", "200%").attr("height", "200%");
  glowFilter.append("feGaussianBlur")
    .attr("stdDeviation", "4")
    .attr("result", "coloredBlur");
  const feMerge = glowFilter.append("feMerge");
  feMerge.append("feMergeNode").attr("in", "coloredBlur");
  feMerge.append("feMergeNode").attr("in", "SourceGraphic");

  // Arrow marker
  defs.append("marker")
    .attr("id", "adamw-arrow")
    .attr("viewBox", "0 -5 10 10")
    .attr("refX", 8)
    .attr("refY", 0)
    .attr("markerWidth", 6)
    .attr("markerHeight", 6)
    .attr("orient", "auto")
    .append("path")
    .attr("d", "M0,-5L10,0L0,5")
    .attr("fill", theme.edgeStroke);

  defs.append("marker")
    .attr("id", "adamw-arrow-highlight")
    .attr("viewBox", "0 -5 10 10")
    .attr("refX", 8)
    .attr("refY", 0)
    .attr("markerWidth", 6)
    .attr("markerHeight", 6)
    .attr("orient", "auto")
    .append("path")
    .attr("d", "M0,-5L10,0L0,5")
    .attr("fill", theme.highlight);

  // Node definitions
  const nodes = [
    { id: "gradient", label: "Gradient", sublabel: "g = dL/dθ", x: 340, y: 60 },
    { id: "momentum", label: "Momentum", sublabel: "m = β₁m + (1-β₁)g", x: 180, y: 160 },
    { id: "velocity", label: "Adaptive LR", sublabel: "v = β₂v + (1-β₂)g²", x: 500, y: 160 },
    { id: "bias", label: "Bias Correction", sublabel: "m̂, v̂", x: 340, y: 260 },
    { id: "update", label: "Weight Update", sublabel: "θ = θ - lr·(...)", x: 340, y: 360 }
  ];

  // Edge definitions
  const edges = [
    { from: "gradient", to: "momentum" },
    { from: "gradient", to: "velocity" },
    { from: "momentum", to: "bias" },
    { from: "velocity", to: "bias" },
    { from: "bias", to: "update" }
  ];

  // Determine which nodes/edges are active based on step
  const activeNodes = adamwStepInfo.highlight;

  const isNodeActive = (id) => activeNodes.includes(id);
  const isEdgeActive = (from, to) => {
    return activeNodes.includes(from) && activeNodes.includes(to);
  };

  // Draw edges
  const edgesLayer = svg.append("g").attr("class", "edges");

  edges.forEach(edge => {
    const fromNode = nodes.find(n => n.id === edge.from);
    const toNode = nodes.find(n => n.id === edge.to);
    const active = isEdgeActive(edge.from, edge.to);

    // Calculate shortened path
    const dx = toNode.x - fromNode.x;
    const dy = toNode.y - fromNode.y;
    const len = Math.sqrt(dx*dx + dy*dy);
    const startOffset = 30;
    const endOffset = 35;

    const x1 = fromNode.x + (dx/len) * startOffset;
    const y1 = fromNode.y + (dy/len) * startOffset;
    const x2 = toNode.x - (dx/len) * endOffset;
    const y2 = toNode.y - (dy/len) * endOffset;

    edgesLayer.append("path")
      .attr("d", `M${x1},${y1} L${x2},${y2}`)
      .attr("fill", "none")
      .attr("stroke", active ? theme.highlight : theme.edgeStroke)
      .attr("stroke-width", active ? 2.5 : 1.5)
      .attr("marker-end", active ? "url(#adamw-arrow-highlight)" : "url(#adamw-arrow)")
      .attr("opacity", active ? 1 : 0.5)
      .style("filter", active ? "url(#adamw-glow)" : "none")
      .style("transition", "all 0.3s ease");
  });

  // Draw nodes
  const nodesLayer = svg.append("g").attr("class", "nodes");

  nodes.forEach(node => {
    const active = isNodeActive(node.id);
    const nodeWidth = 140;
    const nodeHeight = 54;

    const g = nodesLayer.append("g")
      .attr("transform", `translate(${node.x}, ${node.y})`);

    // Node background
    g.append("rect")
      .attr("x", -nodeWidth/2)
      .attr("y", -nodeHeight/2)
      .attr("width", nodeWidth)
      .attr("height", nodeHeight)
      .attr("rx", 8)
      .attr("ry", 8)
      .attr("fill", active ? theme.highlight : theme.nodeFill)
      .attr("stroke", active ? theme.highlight : theme.nodeStroke)
      .attr("stroke-width", active ? 2 : 1.5)
      .style("filter", active ? "url(#adamw-glow)" : "none")
      .style("transition", "all 0.3s ease");

    // Node label
    g.append("text")
      .attr("y", -8)
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", active ? theme.textOnHighlight : theme.nodeText)
      .attr("font-size", "13px")
      .attr("font-weight", "600")
      .style("transition", "fill 0.3s ease")
      .text(node.label);

    // Node sublabel
    g.append("text")
      .attr("y", 12)
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", active ? theme.textOnHighlight : theme.nodeText)
      .attr("font-size", "10px")
      .attr("opacity", active ? 0.9 : 0.7)
      .style("transition", "all 0.3s ease")
      .text(node.sublabel);
  });

  return svg.node();
}

// Info panel showing current step details and numeric values.
//
// Reactive inputs read by this cell: diagramTheme (colors), adamwComputation
// (numeric values), adamwStepInfo (title / description / formula) and
// adamwStep (the selected step index).
//
// Layout: a header row ("Step N: title" on the left, "t = ..." on the right),
// a description paragraph, then an inset box holding the formula followed by
// one step-specific worked calculation. Exactly one of the five conditional
// sub-templates renders for adamwStep 0..4; every other slot interpolates an
// empty string, so any other step value shows the formula alone.
//
// Every comp.* value formatted with .toFixed() must be a number, otherwise the
// cell throws while rendering.
// NOTE(review): step 4 prints theta_prev with 2 decimals on the "decay" line
// but 4 decimals on the update line — confirm the differing precision is intended.
adamwInfoPanel = {
  // Short aliases for the reactive dependencies used throughout the template.
  const theme = diagramTheme;
  const comp = adamwComputation;
  const info = adamwStepInfo;

  // All styling is inline and driven by theme values, so the panel is rebuilt
  // with new colors whenever diagramTheme changes.
  const container = htl.html`<div style="
    background: ${theme.bgSecondary};
    border: 1px solid ${theme.nodeStroke};
    border-radius: 8px;
    padding: 16px 20px;
    margin-top: 12px;
    font-family: 'IBM Plex Mono', 'Fira Code', monospace;
  ">
    <div style="
      display: flex;
      justify-content: space-between;
      align-items: center;
      margin-bottom: 12px;
    ">
      <span style="
        font-size: 14px;
        font-weight: 600;
        color: ${theme.highlight};
      ">Step ${adamwStep}: ${info.title}</span>
      <span style="
        font-size: 12px;
        color: ${theme.nodeText};
        opacity: 0.7;
      ">t = ${comp.t}</span>
    </div>

    <p style="
      font-size: 12px;
      color: ${theme.nodeText};
      margin: 0 0 12px 0;
      line-height: 1.5;
    ">${info.description}</p>

    <div style="
      background: ${theme.nodeFill};
      border-radius: 6px;
      padding: 12px 16px;
      font-family: 'IBM Plex Mono', 'Fira Code', monospace;
    ">
      <div style="
        font-size: 15px;
        color: ${theme.accent};
        font-weight: 500;
        margin-bottom: 10px;
      ">${info.formula}</div>

      ${adamwStep === 0 ? htl.html`
        <div style="font-size: 11px; color: ${theme.nodeText}; line-height: 1.8;">
          <div><span style="opacity: 0.6;">gradient:</span> g = <span style="color: ${theme.highlight};">${comp.g.toFixed(3)}</span></div>
          <div><span style="opacity: 0.6;">hyperparams:</span> β₁=${comp.beta1}, β₂=${comp.beta2}, lr=${comp.lr}, λ=${comp.lambda}</div>
        </div>
      ` : ''}

      ${adamwStep === 1 ? htl.html`
        <div style="font-size: 11px; color: ${theme.nodeText}; line-height: 1.8;">
          <div>m = ${comp.beta1} × ${comp.m_prev.toFixed(3)} + ${(1-comp.beta1).toFixed(1)} × ${comp.g.toFixed(3)}</div>
          <div>m = <span style="color: ${theme.highlight};">${comp.m.toFixed(4)}</span></div>
        </div>
      ` : ''}

      ${adamwStep === 2 ? htl.html`
        <div style="font-size: 11px; color: ${theme.nodeText}; line-height: 1.8;">
          <div>v = ${comp.beta2} × ${comp.v_prev.toFixed(4)} + ${(1-comp.beta2).toFixed(3)} × ${comp.g.toFixed(3)}²</div>
          <div>v = <span style="color: ${theme.highlight};">${comp.v.toFixed(6)}</span></div>
        </div>
      ` : ''}

      ${adamwStep === 3 ? htl.html`
        <div style="font-size: 11px; color: ${theme.nodeText}; line-height: 1.8;">
          <div>m̂ = ${comp.m.toFixed(4)} / (1 - ${comp.beta1}^${comp.t}) = <span style="color: ${theme.highlight};">${comp.m_hat.toFixed(4)}</span></div>
          <div>v̂ = ${comp.v.toFixed(6)} / (1 - ${comp.beta2}^${comp.t}) = <span style="color: ${theme.highlight};">${comp.v_hat.toFixed(6)}</span></div>
        </div>
      ` : ''}

      ${adamwStep === 4 ? htl.html`
        <div style="font-size: 11px; color: ${theme.nodeText}; line-height: 1.8;">
          <div>adam = m̂/√v̂ = ${comp.m_hat.toFixed(4)} / √${comp.v_hat.toFixed(6)} = ${comp.adam_update.toFixed(4)}</div>
          <div>decay = λ·θ = ${comp.lambda} × ${comp.theta_prev.toFixed(2)} = ${comp.weight_decay.toFixed(5)}</div>
          <div>θ = ${comp.theta_prev.toFixed(4)} - ${comp.lr} × (${comp.adam_update.toFixed(4)} + ${comp.weight_decay.toFixed(5)})</div>
          <div>θ = <span style="color: ${theme.highlight}; font-weight: 600;">${comp.theta.toFixed(6)}</span></div>
        </div>
      ` : ''}
    </div>
  </div>`;

  return container;
}

Hyperparameters explained:

Parameter	Default	Purpose
beta1	0.9	Momentum coefficient - smooths gradient direction
beta2	0.999	Adaptive LR coefficient - smooths gradient magnitude
epsilon	1e-8	Numerical stability (prevents division by zero)
weight_decay	0.01	Decoupled weight-decay strength (shrinks the weights directly; not an L2 penalty added to the gradient)

Practical tip: The LLM community has converged on beta1=0.9, beta2=0.95 for large models (used by LLaMA, GPT-3). The lower beta2 adapts faster to changing gradient magnitudes.

# Creating an AdamW optimizer
# A small linear layer supplies the parameters the optimizer will manage.
model = nn.Linear(100, 10)

# Hyperparameters collected in one place, then unpacked into the constructor.
adamw_settings = dict(
    lr=3e-4,             # Learning rate
    betas=(0.9, 0.999),  # Momentum and adaptive LR
    weight_decay=0.01,   # Regularization
)
optimizer = torch.optim.AdamW(model.parameters(), **adamw_settings)

# All parameters were passed as a single iterable, so they share group 0.
first_group = optimizer.param_groups[0]
print("AdamW optimizer created")
print(f"  Learning rate: {first_group['lr']}")
print(f"  Weight decay: {first_group['weight_decay']}")

AdamW optimizer created
  Learning rate: 0.0003
  Weight decay: 0.01

Optimizers from Scratch

Let’s build optimizers from first principles to understand what PyTorch does internally.

Plain SGD

The simplest optimizer: move parameters in the opposite direction of the gradient.

class SGD_Scratch:
    """
    Minimal stochastic gradient descent.

    Each call to step() moves every parameter that has a gradient by
    -lr * gradient; parameters without a gradient are left alone.
    """
    def __init__(self, params, lr=0.01):
        # Materialize the iterable: model.parameters() is a one-shot generator
        # and we need to walk it on every step.
        self.params = list(params)
        self.lr = lr

    def step(self):
        # The update itself must not be recorded by autograd.
        with torch.no_grad():
            for param in self.params:
                grad = param.grad
                if grad is None:
                    continue
                param.sub_(self.lr * grad)

    def zero_grad(self):
        # Drop (rather than zero-fill) the gradients that exist.
        for param in self.params:
            if param.grad is None:
                continue
            param.grad = None

# Test: compare with PyTorch SGD
# Goal: take one step with SGD_Scratch and one with torch.optim.SGD from the
# same starting weights and the same input, then check the weights agree.
# The statement order is deliberate: the seed is set once, then both layers are
# constructed and the input is drawn, so reordering these lines would change
# the printed numbers.
torch.manual_seed(42)
model_scratch = nn.Linear(10, 2)
model_pytorch = nn.Linear(10, 2)
# Overwrite the second layer's own initialization with a copy of the first
# layer's parameters so both models start identical.
model_pytorch.load_state_dict(model_scratch.state_dict())

# Same learning rate for the hand-written and the reference optimizer.
opt_scratch = SGD_Scratch(model_scratch.parameters(), lr=0.1)
opt_pytorch = torch.optim.SGD(model_pytorch.parameters(), lr=0.1)

# Forward + backward
# One shared batch; .sum() turns the layer output into a scalar loss.
x = torch.randn(4, 10)
loss_scratch = model_scratch(x).sum()
loss_pytorch = model_pytorch(x).sum()

loss_scratch.backward()
loss_pytorch.backward()

# Update
# Only a single step is taken, so no zero_grad() call is needed here.
opt_scratch.step()
opt_pytorch.step()

# Compare weights
# Print one representative entry from each model, then compare the full weight
# matrices with torch.allclose at its default tolerances.
print("After one SGD step:")
print(f"  Scratch weight[0,0]: {model_scratch.weight[0,0].item():.6f}")
print(f"  PyTorch weight[0,0]: {model_pytorch.weight[0,0].item():.6f}")
print(f"  Match: {torch.allclose(model_scratch.weight, model_pytorch.weight)}")

After one SGD step:
  Scratch weight[0,0]: 0.164868
  PyTorch weight[0,0]: 0.164868
  Match: True

SGD with Momentum

Momentum adds “velocity” to gradient descent. Instead of using the gradient directly, we accumulate an exponentially decaying sum of past gradients (the velocity) and step along that:

\[v_t = \mu \cdot v_{t-1} + g_t\] \[\theta_t = \theta_{t-1} - \alpha \cdot v_t\]

This helps:

- Smooth out noisy gradients
- Accelerate through flat regions
- Dampen oscillations in steep valleys

class SGD_Momentum_Scratch:
    """
    Gradient descent with a momentum (velocity) buffer.

    Per parameter, on every step:
        velocity <- momentum * velocity + gradient
        theta    <- theta - lr * velocity

    Velocities start at zero, so the first step equals a plain SGD step.
    """
    def __init__(self, params, lr=0.01, momentum=0.9):
        self.params = list(params)
        self.lr = lr
        self.momentum = momentum
        # One zero-initialized velocity tensor per parameter, index-aligned
        # with self.params.
        self.v = [torch.zeros_like(p) for p in self.params]

    def step(self):
        with torch.no_grad():
            for idx, param in enumerate(self.params):
                grad = param.grad
                if grad is None:
                    # No gradient: leave both the parameter and its velocity alone.
                    continue
                # Decay the old velocity and add the fresh gradient.
                velocity = self.momentum * self.v[idx] + grad
                self.v[idx] = velocity
                # Step along the accumulated velocity.
                param.sub_(self.lr * velocity)

    def zero_grad(self):
        # Drop (rather than zero-fill) the gradients that exist.
        for param in self.params:
            if param.grad is None:
                continue
            param.grad = None

# Test: compare with PyTorch SGD momentum
# Goal: run SGD_Momentum_Scratch and torch.optim.SGD(momentum=0.9) side by side
# from identical weights on identical inputs, then check the weights agree.
# The seed, the two layer constructions and the per-step randn calls happen in
# a fixed order; reordering them would change the printed numbers.
torch.manual_seed(42)
model_scratch = nn.Linear(10, 2)
model_pytorch = nn.Linear(10, 2)
# Copy the first layer's parameters into the second so both start identical.
model_pytorch.load_state_dict(model_scratch.state_dict())

opt_scratch = SGD_Momentum_Scratch(model_scratch.parameters(), lr=0.1, momentum=0.9)
opt_pytorch = torch.optim.SGD(model_pytorch.parameters(), lr=0.1, momentum=0.9)

# Multiple steps to see momentum accumulate
# (after a single step the velocity buffer just equals the first gradient).
for step in range(3):
    # A fresh batch each iteration, shared by both models.
    x = torch.randn(4, 10)

    loss_scratch = model_scratch(x).sum()
    loss_pytorch = model_pytorch(x).sum()

    # Clear the previous iteration's gradients before backward() adds new ones.
    opt_scratch.zero_grad()
    opt_pytorch.zero_grad()

    loss_scratch.backward()
    loss_pytorch.backward()

    opt_scratch.step()
    opt_pytorch.step()

# Print one representative entry from each model, then compare the full weight
# matrices with torch.allclose at its default tolerances.
print("After 3 momentum SGD steps:")
print(f"  Scratch weight[0,0]: {model_scratch.weight[0,0].item():.6f}")
print(f"  PyTorch weight[0,0]: {model_pytorch.weight[0,0].item():.6f}")
print(f"  Match: {torch.allclose(model_scratch.weight, model_pytorch.weight)}")

After 3 momentum SGD steps:
  Scratch weight[0,0]: -0.793227
  PyTorch weight[0,0]: -0.793227
  Match: True

Key Insight: Momentum

Momentum is like pushing a ball down a hill - it builds up speed in consistent directions and resists sudden direction changes. This makes optimization faster and more stable.

Adam from Scratch

Adam combines momentum with adaptive learning rates. It tracks two quantities:

First moment \(m\) (mean of gradients) - like momentum
Second moment \(v\) (mean of squared gradients) - adapts learning rate per-parameter

\[m_t = \beta_1 \cdot m_{t-1} + (1 - \beta_1) \cdot g_t\] \[v_t = \beta_2 \cdot v_{t-1} + (1 - \beta_2) \cdot g_t^2\]

We also need bias correction because \(m\) and \(v\) are initialized to zero:

\[\hat{m}_t = \frac{m_t}{1 - \beta_1^t}, \quad \hat{v}_t = \frac{v_t}{1 - \beta_2^t}\]

Finally, the update:

\[\theta_t = \theta_{t-1} - \alpha \cdot \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}\]

class Adam_Scratch:
    """
    Hand-written Adam with optional decoupled (AdamW-style) weight decay.

    Two running statistics are kept per parameter: an exponential average of
    the gradient (first moment) and of the squared gradient (second moment,
    i.e. the uncentered variance). Both start at zero, so each is divided by
    1 - beta**t before use to undo the resulting bias toward zero.

    With weight_decay == 0.0 the decay branch is skipped entirely.
    """
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0):
        self.params = list(params)
        self.lr = lr
        self.b1, self.b2 = betas
        self.eps = eps
        self.weight_decay = weight_decay

        # Per-parameter state, index-aligned with self.params.
        self.m = [torch.zeros_like(p) for p in self.params]  # first moment
        self.v = [torch.zeros_like(p) for p in self.params]  # second moment
        # Number of step() calls so far; drives the bias correction.
        self.t = 0

    def step(self):
        self.t += 1

        # The bias-correction denominators depend only on t, so they are the
        # same for every parameter within this step.
        m_correction = 1 - self.b1 ** self.t
        v_correction = 1 - self.b2 ** self.t

        with torch.no_grad():
            for idx, param in enumerate(self.params):
                grad = param.grad
                if grad is None:
                    continue

                # Decoupled decay: shrink the weight itself, before and
                # independently of the adaptive gradient step.
                if self.weight_decay != 0.0:
                    param.sub_(self.lr * self.weight_decay * param)

                # Exponential moving averages of g and g^2.
                self.m[idx] = self.b1 * self.m[idx] + (1 - self.b1) * grad
                self.v[idx] = self.b2 * self.v[idx] + (1 - self.b2) * (grad * grad)

                # Bias-corrected estimates (matters most in the first steps).
                m_hat = self.m[idx] / m_correction
                v_hat = self.v[idx] / v_correction

                # Adaptive step: eps keeps the division finite when v_hat is 0.
                param.sub_(self.lr * m_hat / (torch.sqrt(v_hat) + self.eps))

    def zero_grad(self):
        # Drop (rather than zero-fill) the gradients that exist.
        for param in self.params:
            if param.grad is None:
                continue
            param.grad = None

# Test: compare with PyTorch AdamW
# Goal: run Adam_Scratch (with weight decay enabled) and torch.optim.AdamW side
# by side from identical weights on identical inputs, then check they agree.
# The seed, the two layer constructions and the per-step randn calls happen in
# a fixed order; reordering them would change the printed numbers.
torch.manual_seed(42)
model_scratch = nn.Linear(10, 2)
model_pytorch = nn.Linear(10, 2)
# Copy the first layer's parameters into the second so both start identical.
model_pytorch.load_state_dict(model_scratch.state_dict())

# Same lr and weight_decay on both sides; betas and eps are left at each
# optimizer's defaults.
opt_scratch = Adam_Scratch(model_scratch.parameters(), lr=1e-3, weight_decay=0.01)
opt_pytorch = torch.optim.AdamW(model_pytorch.parameters(), lr=1e-3, weight_decay=0.01)

# Multiple steps
for step in range(5):
    # A fresh batch each iteration, shared by both models.
    x = torch.randn(4, 10)

    loss_scratch = model_scratch(x).sum()
    loss_pytorch = model_pytorch(x).sum()

    # Clear the previous iteration's gradients before backward() adds new ones.
    opt_scratch.zero_grad()
    opt_pytorch.zero_grad()

    loss_scratch.backward()
    loss_pytorch.backward()

    opt_scratch.step()
    opt_pytorch.step()

# Unlike the SGD checks, this comparison passes an explicit absolute tolerance
# (atol=1e-6) to torch.allclose instead of relying on the defaults.
print("After 5 AdamW steps:")
print(f"  Scratch weight[0,0]: {model_scratch.weight[0,0].item():.6f}")
print(f"  PyTorch weight[0,0]: {model_pytorch.weight[0,0].item():.6f}")
print(f"  Close match: {torch.allclose(model_scratch.weight, model_pytorch.weight, atol=1e-6)}")

After 5 AdamW steps:
  Scratch weight[0,0]: 0.237638
  PyTorch weight[0,0]: 0.237638
  Close match: True

Key Insight: Adam

Adam is “momentum + per-parameter learning rates.” The second moment \(v\) tracks how much each parameter’s gradient varies. Parameters with consistently large gradients get smaller effective learning rates (stabilizing training), while those with small gradients get larger rates (speeding up learning).

Why bias correction matters:

Without bias correction, the first few steps are biased toward zero because \(m\) and \(v\) are initialized to zero. Let’s see this:

# Demonstrate bias correction importance
# Feed a constant gradient of 1.0 into the first-moment update and print the
# raw running average next to its bias-corrected value for the first 5 steps.
m, v = 0.0, 0.0       # moment estimates start at zero (v is not updated in this demo)
b1, b2 = 0.9, 0.999   # decay rates (only b1 is used below)
true_grad = 1.0  # Pretend gradient is always 1

print("Step | m (biased) | m_hat (corrected)")
print("-" * 45)
for t in range(1, 6):
    # Raw exponential average: after t steps it equals 1 - b1**t, well below 1.
    m = b1 * m + (1 - b1) * true_grad
    # Dividing by 1 - b1**t cancels exactly that shortfall.
    m_hat = m / (1 - b1 ** t)
    print(f"  {t}  |   {m:.4f}    |     {m_hat:.4f}")

# Plain strings: these two lines interpolate nothing, so no f-prefix is needed.
print("\nWithout correction, m starts near 0.1 instead of 1.0!")
print("Bias correction fixes this, making m_hat ≈ 1.0 from the start.")

Step | m (biased) | m_hat (corrected)
---------------------------------------------
  1  |   0.1000    |     1.0000
  2  |   0.1900    |     1.0000
  3  |   0.2710    |     1.0000
  4  |   0.3439    |     1.0000
  5  |   0.4095    |     1.0000

Without correction, m starts near 0.1 instead of 1.0!
Bias correction fixes this, making m_hat ≈ 1.0 from the start.

From Inline Sketch to a Tested Module

The classes above are teaching sketches. The production versions — SGD, Adam, and AdamW, each with a shared Optimizer base and full type hints — live in optimizers.py. They are checked numerically against torch.optim: the test suite trains a twin nn.Linear under each hand-written optimizer and the matching PyTorch one and asserts the weights agree to within floating-point rounding. When we say “this is what PyTorch does,” it’s a tested claim, not a slogan.

# Pull the tested implementations from the project module (only AdamW is
# exercised in this cell).
from optimizers import SGD, Adam, AdamW

# Our AdamW vs torch.optim.AdamW, five steps, same init.
# The seed, the two layer constructions and the per-step randn calls happen in
# a fixed order; reordering them would change the printed numbers.
torch.manual_seed(0)
a = nn.Linear(10, 3)
b = nn.Linear(10, 3)
# Copy a's parameters into b so both layers start identical.
b.load_state_dict(a.state_dict())

ours = AdamW(a.parameters(), lr=1e-2, weight_decay=0.1)
theirs = torch.optim.AdamW(b.parameters(), lr=1e-2, weight_decay=0.1)

for _ in range(5):
    # One shared batch per iteration; the loss is the sum of squared outputs.
    x = torch.randn(5, 10)
    ours.zero_grad(); theirs.zero_grad()
    a(x).pow(2).sum().backward()
    b(x).pow(2).sum().backward()
    ours.step(); theirs.step()

# Report the largest element-wise weight difference, then an allclose check
# with an explicit absolute tolerance of 1e-6.
print(f"max |ours - torch|: {(a.weight - b.weight).abs().max().item():.2e}")
print(f"Match: {torch.allclose(a.weight, b.weight, atol=1e-6)}")

max |ours - torch|: 2.98e-08
Match: True

Adam vs AdamW: Why Decoupling Matters

Earlier we said Adam applies weight decay “to the gradient” and AdamW “to the weights.” That one-word difference in placement is the whole reason AdamW exists. Both start from the same coefficient \(\lambda\):

\[ \underbrace{g_t \leftarrow g_t + \lambda\,\theta_{t-1}}_{\textbf{Adam (coupled L2)}} \qquad\text{vs.}\qquad \underbrace{\theta_{t-1} \leftarrow \theta_{t-1} - \alpha\lambda\,\theta_{t-1}}_{\textbf{AdamW (decoupled)}} \]

In AdamW the decay is a plain shrink: every weight loses the same fraction \(\alpha\lambda\) of its magnitude each step, no matter what its gradient is doing.

In Adam the decay is folded into the gradient, so it rides through the rest of the update — including the adaptive denominator \(\frac{1}{\sqrt{\hat v_t}+\epsilon}\). A weight whose gradients are large (big \(\hat v\)) gets its decay divided down; a weight with tiny gradients gets its decay through at nearly full strength. The regularization you asked for silently becomes a different, per-parameter amount. Run the two with identical lr and weight_decay and they simply disagree:

# Train two identical layers for 25 steps, one with Adam and one with AdamW,
# both given the same lr and weight_decay, then measure how far the weights
# drift apart.
from optimizers import Adam, AdamW

# The seed, the two layer constructions and the per-step randn calls happen in
# a fixed order; reordering them would change the printed numbers.
torch.manual_seed(1)
a = nn.Linear(8, 4)
b = nn.Linear(8, 4)
# Copy a's parameters into b so both layers start identical.
b.load_state_dict(a.state_dict())

coupled = Adam(a.parameters(), lr=1e-2, weight_decay=0.3)   # classic Adam L2
decoupled = AdamW(b.parameters(), lr=1e-2, weight_decay=0.3)  # AdamW

for _ in range(25):
    # One shared batch per iteration; the loss is the sum of squared outputs.
    x = torch.randn(6, 8)
    coupled.zero_grad(); decoupled.zero_grad()
    a(x).pow(2).sum().backward()
    b(x).pow(2).sum().backward()
    coupled.step(); decoupled.step()

# Header line interpolates nothing, so it is a plain string (no f-prefix).
print("Same lr, same weight_decay, 25 steps:")
print(f"  max |Adam(L2) - AdamW| weight gap: {(a.weight - b.weight).abs().max().item():.4f}")
print(f"  identical? {torch.allclose(a.weight, b.weight, atol=1e-4)}")

Same lr, same weight_decay, 25 steps:
  max |Adam(L2) - AdamW| weight gap: 0.0150
  identical? False

Key Insight: Where the Decay Lands

Set weight_decay=0 and Adam and AdamW are bit-identical — there is nothing to decouple. Turn decay on and they part ways, because coupled L2 passes through the \(1/\sqrt{\hat v}\) denominator and decoupled decay does not. The plot below makes the damping literal: AdamW’s effective decay is flat across parameters; Adam’s falls off as gradients grow.

The next cell bridges effective_decay from optimizers.py — the steady-state decay each scheme actually applies to a weight whose gradient has a given magnitude.

// Slider for the weight-decay coefficient λ: range 0 to 0.5 in steps of 0.01,
// starting at 0.1. optDecayChart reads the resulting value reactively.
viewof optDecayLambda = Inputs.range([0.0, 0.5], {value: 0.1, step: 0.01, label: "Weight decay λ"})

optDecayChart = {
  const width = 720, height = 380, m = {top: 30, right: 130, bottom: 54, left: 70};
  const theme = diagramTheme;
  const scale = optDecayLambda / decayBaseLambda;   // effective decay is linear in λ

  const svg = d3.create("svg")
    .attr("viewBox", `0 0 ${width} ${height}`)
    .attr("width", "100%").attr("height", height)
    .style("max-width", `${width}px`)
    .style("font-family", "'JetBrains Mono', 'Fira Code', monospace");

  svg.append("rect").attr("width", width).attr("height", height)
    .attr("fill", theme.bg).attr("rx", 12);

  const data = decayGrid.map(d => ({
    std: d.std, coupled: d.coupled * scale, decoupled: d.decoupled * scale
  }));

  const x = d3.scaleLog().domain(d3.extent(data, d => d.std)).range([m.left, width - m.right]);
  const yMax = d3.max(data, d => Math.max(d.coupled, d.decoupled)) * 1.05 || 1;
  const y = d3.scaleLinear().domain([0, yMax]).range([height - m.bottom, m.top]);

  const xAxis = d3.axisBottom(x).ticks(5, "~g");
  const yAxis = d3.axisLeft(y).ticks(5, ".0e");
  svg.append("g").attr("transform", `translate(0,${height - m.bottom})`).call(xAxis)
    .call(g => g.selectAll("text").attr("fill", theme.nodeText))
    .call(g => g.selectAll("line,path").attr("stroke", theme.edgeStroke));
  svg.append("g").attr("transform", `translate(${m.left},0)`).call(yAxis)
    .call(g => g.selectAll("text").attr("fill", theme.nodeText))
    .call(g => g.selectAll("line,path").attr("stroke", theme.edgeStroke));

  svg.append("text").attr("x", (m.left + width - m.right) / 2).attr("y", height - 14)
    .attr("text-anchor", "middle").attr("fill", theme.nodeText).attr("font-size", 13)
    .text("gradient magnitude  √v̂  (log scale)");
  svg.append("text").attr("transform", "rotate(-90)").attr("x", -(height / 2)).attr("y", 18)
    .attr("text-anchor", "middle").attr("fill", theme.nodeText).attr("font-size", 13)
    .text("effective decay per step");

  const line = key => d3.line().x(d => x(d.std)).y(d => y(d[key]));

  // AdamW: flat (decoupled)
  svg.append("path").datum(data).attr("fill", "none")
    .attr("stroke", theme.success).attr("stroke-width", 3).attr("d", line("decoupled"));
  // Adam: falls off as 1/std (coupled)
  svg.append("path").datum(data).attr("fill", "none")
    .attr("stroke", theme.highlight).attr("stroke-width", 3).attr("d", line("coupled"));

  const legend = [
    {label: "AdamW (decoupled)", color: theme.success},
    {label: "Adam (coupled L2)", color: theme.highlight}
  ];
  legend.forEach((L, i) => {
    const gy = m.top + 8 + i * 24;
    svg.append("line").attr("x1", width - m.right + 8).attr("x2", width - m.right + 30)
      .attr("y1", gy).attr("y2", gy).attr("stroke", L.color).attr("stroke-width", 3);
    svg.append("text").attr("x", width - m.right + 36).attr("y", gy + 4)
      .attr("fill", theme.nodeText).attr("font-size", 11).text(L.label);
  });

  return svg.node();
}

Try This

Push λ up: both lines rise, but only the orange (coupled) line stays bent — parameters with large gradients keep getting under-decayed relative to what you set.
λ → 0: the two lines collapse onto the axis together. No decay, nothing to decouple — exactly why our Adam and AdamW are bit-identical at weight_decay=0.

The Optimizer Zoo, Side by Side

Weight decay aside, why reach for Adam at all? The clearest picture is a path on a hard surface. demonstrate_optimizers in optimizers.py runs SGD, SGD+momentum, and Adam on an ill-conditioned bowl \(f(x,y)=\tfrac12(25x^2+y^2)\) — 25× steeper along \(x\) than \(y\) — and records where each one goes.

// Step scrubber for the optimizer-trajectory chart: values 0 to 50, starting at
// 50 so the chart first shows the full paths.
// NOTE(review): stepControl is not defined in this section — presumably a shared
// control helper defined elsewhere in the document; confirm its option names.
viewof optZooStep = stepControl({min: 0, max: 50, value: 50, label: "Optimization step"})

optZooChart = {
  const width = 720, height = 460, m = {top: 24, right: 130, bottom: 48, left: 56};
  const theme = diagramTheme;
  const c = optCurvature;

  const svg = d3.create("svg")
    .attr("viewBox", `0 0 ${width} ${height}`)
    .attr("width", "100%").attr("height", height)
    .style("max-width", `${width}px`)
    .style("font-family", "'JetBrains Mono', 'Fira Code', monospace");

  svg.append("rect").attr("width", width).attr("height", height)
    .attr("fill", theme.bg).attr("rx", 12);

  const runs = [
    {name: "SGD",      color: theme.info},
    {name: "Momentum", color: theme.success},
    {name: "Adam",     color: theme.highlight}
  ];

  // Data ranges from all paths.
  const allPts = runs.flatMap(r => optTrajectories[r.name].path);
  const xExt = d3.extent(allPts, p => p[0]);
  const yExt = d3.extent(allPts, p => p[1]);
  const pad = 0.6;
  const x = d3.scaleLinear().domain([xExt[0] - pad, xExt[1] + pad]).range([m.left, width - m.right]);
  const y = d3.scaleLinear().domain([yExt[0] - pad, yExt[1] + pad]).range([height - m.bottom, m.top]);

  // Contour ellipses of the bowl: 0.5*(c*x^2 + y^2) = L.
  const levels = [2, 10, 40, 120, 300];
  levels.forEach(L => {
    const ax = Math.sqrt(2 * L / c);   // x half-width
    const ay = Math.sqrt(2 * L);       // y half-width
    svg.append("ellipse")
      .attr("cx", x(0)).attr("cy", y(0))
      .attr("rx", Math.abs(x(ax) - x(0)))
      .attr("ry", Math.abs(y(0) - y(ay)))
      .attr("fill", "none").attr("stroke", theme.edgeStroke)
      .attr("stroke-width", 1).attr("opacity", 0.5);
  });

  // Axes.
  svg.append("g").attr("transform", `translate(0,${y(0)})`)
    .call(d3.axisBottom(x).ticks(6))
    .call(g => g.selectAll("text").attr("fill", theme.nodeText).attr("font-size", 10))
    .call(g => g.selectAll("line,path").attr("stroke", theme.edgeStroke));
  svg.append("g").attr("transform", `translate(${x(0)},0)`)
    .call(d3.axisLeft(y).ticks(6))
    .call(g => g.selectAll("text").attr("fill", theme.nodeText).attr("font-size", 10))
    .call(g => g.selectAll("line,path").attr("stroke", theme.edgeStroke));

  // The minimum.
  svg.append("circle").attr("cx", x(0)).attr("cy", y(0)).attr("r", 4)
    .attr("fill", theme.nodeText).attr("opacity", 0.7);

  const k = optZooStep;
  const line = d3.line().x(d => x(d[0])).y(d => y(d[1]));

  runs.forEach((r, i) => {
    const full = optTrajectories[r.name].path;
    const seg = full.slice(0, k + 1);
    svg.append("path").datum(seg).attr("fill", "none")
      .attr("stroke", r.color).attr("stroke-width", 2.5).attr("opacity", 0.9)
      .attr("d", line);
    const head = seg[seg.length - 1];
    svg.append("circle").attr("cx", x(head[0])).attr("cy", y(head[1])).attr("r", 5)
      .attr("fill", r.color).attr("stroke", theme.bg).attr("stroke-width", 1.5);

    const gy = m.top + 8 + i * 26;
    svg.append("line").attr("x1", width - m.right + 8).attr("x2", width - m.right + 30)
      .attr("y1", gy).attr("y2", gy).attr("stroke", r.color).attr("stroke-width", 3);
    const loss = optTrajectories[r.name].loss[k];
    svg.append("text").attr("x", width - m.right + 36).attr("y", gy + 4)
      .attr("fill", theme.nodeText).attr("font-size", 11)
      .text(`${r.name}  (${loss.toFixed(2)})`);
  });

  svg.append("text").attr("x", width - m.right + 8).attr("y", height - m.bottom + 4)
    .attr("fill", theme.nodeText).attr("font-size", 10).attr("opacity", 0.7)
    .text("( ) = loss at step");

  return svg.node();
}

Scrub the step and watch the shapes, not a winner:

SGD must use a step small enough for the steep \(x\) axis (here \(\alpha=1/25\)), so it snaps to the valley floor almost immediately, then crawls along the shallow \(y\) axis — the classic ill-conditioning crawl.
Momentum builds velocity down the valley, overshoots the bottom, and curls back.
Adam normalizes each axis by its own gradient history and marches at a roughly constant per-coordinate pace — and, honestly, tends to hover near the minimum on a clean quadratic rather than settle into it.

Key Insight: Why AdamW Is the LLM Default

On this toy quadratic, momentum actually reaches the lowest loss — Adam is not magic. Adam earns its place in deep networks, where different parameters see gradients that differ by orders of magnitude (embeddings vs. layer norms vs. deep weights). Its per-parameter scaling makes one global learning rate work across all of them with little tuning, and AdamW adds the decoupled decay that keeps that scaling from corrupting your regularization. Robustness at scale, not toy-problem speed, is the win.

Gradient Accumulation

Gradient accumulation increases effective batch size without adding memory.

Problem: Want batch_size=32 but only 8 fits in memory

Solution: Accumulate gradients over 4 mini-batches

// Step slider for gradient accumulation (0 = initial, 1-4 = mini-batches, 5 = optimizer step).
// Integer steps only, starting at 0; accumStepInfo uses the value as an index
// into its six-entry step table.
viewof accumStep = Inputs.range([0, 5], {
  value: 0,
  step: 1,
  label: "Accumulation Step"
})

// Gradient accumulation diagram data
accumStepInfo = {
  const steps = [
    { name: "Ready", description: "Gradients zeroed, ready to accumulate", gradientLevel: 0 },
    { name: "Mini-batch 1", description: "loss.backward() - gradients start accumulating", gradientLevel: 0.25 },
    { name: "Mini-batch 2", description: "loss.backward() - gradients continue accumulating", gradientLevel: 0.5 },
    { name: "Mini-batch 3", description: "loss.backward() - gradients continue accumulating", gradientLevel: 0.75 },
    { name: "Mini-batch 4", description: "loss.backward() - gradients fully accumulated", gradientLevel: 1.0 },
    { name: "Optimizer Step", description: "optimizer.step() - one weight update with effective batch_size=32", gradientLevel: 0 }
  ];
  return steps[accumStep];
}

// Interactive gradient accumulation visualization
{
  const width = 700;
  const height = 340;
  const batchSize = 8;
  const accumSteps = 4;
  const effectiveBatch = batchSize * accumSteps;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`);

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", diagramTheme.bg)
    .attr("rx", 8);

  // Defs for arrows and gradients
  const defs = svg.append("defs");

  // Arrow markers
  defs.append("marker")
    .attr("id", "accum-arrow")
    .attr("viewBox", "0 -5 10 10")
    .attr("refX", 8)
    .attr("refY", 0)
    .attr("markerWidth", 5)
    .attr("markerHeight", 5)
    .attr("orient", "auto")
    .append("path")
    .attr("d", "M0,-5L10,0L0,5")
    .attr("fill", diagramTheme.edgeStroke);

  defs.append("marker")
    .attr("id", "accum-arrow-active")
    .attr("viewBox", "0 -5 10 10")
    .attr("refX", 8)
    .attr("refY", 0)
    .attr("markerWidth", 5)
    .attr("markerHeight", 5)
    .attr("orient", "auto")
    .append("path")
    .attr("d", "M0,-5L10,0L0,5")
    .attr("fill", diagramTheme.highlight);

  // Gradient fill for the accumulator bar
  const gradientFill = defs.append("linearGradient")
    .attr("id", "gradient-fill")
    .attr("x1", "0%")
    .attr("y1", "100%")
    .attr("x2", "0%")
    .attr("y2", "0%");

  gradientFill.append("stop")
    .attr("offset", "0%")
    .attr("stop-color", diagramTheme.accent);

  gradientFill.append("stop")
    .attr("offset", "100%")
    .attr("stop-color", diagramTheme.highlight);

  // Layout constants
  const batchBoxWidth = 100;
  const batchBoxHeight = 55;
  const batchStartX = 60;
  const batchSpacing = 20;
  const batchY = 80;

  const accumX = 480;
  const accumY = 80;
  const accumWidth = 70;
  const accumHeight = 140;

  const optimizerX = 620;
  const optimizerY = 150;

  // Draw mini-batch boxes
  const batches = [1, 2, 3, 4];

  batches.forEach((batch, i) => {
    const x = batchStartX + i * (batchBoxWidth + batchSpacing);
    const isActive = accumStep === batch;
    const isProcessed = accumStep > batch;

    const g = svg.append("g")
      .attr("transform", `translate(${x}, ${batchY})`);

    // Box
    g.append("rect")
      .attr("width", batchBoxWidth)
      .attr("height", batchBoxHeight)
      .attr("rx", 6)
      .attr("fill", isActive ? diagramTheme.highlight : (isProcessed ? diagramTheme.accent : diagramTheme.nodeFill))
      .attr("stroke", isActive ? diagramTheme.highlight : (isProcessed ? diagramTheme.accent : diagramTheme.nodeStroke))
      .attr("stroke-width", isActive ? 2.5 : 1.5)
      .attr("opacity", isProcessed && !isActive ? 0.7 : 1)
      .style("filter", isActive ? `drop-shadow(0 0 8px ${diagramTheme.highlightGlow})` : "none");

    // Batch label
    g.append("text")
      .attr("x", batchBoxWidth / 2)
      .attr("y", 18)
      .attr("text-anchor", "middle")
      .attr("fill", isActive || isProcessed ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
      .attr("font-size", "11px")
      .attr("font-weight", "600")
      .text(`Mini-batch ${batch}`);

    // Size info
    g.append("text")
      .attr("x", batchBoxWidth / 2)
      .attr("y", 34)
      .attr("text-anchor", "middle")
      .attr("fill", isActive || isProcessed ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
      .attr("font-size", "10px")
      .attr("opacity", 0.8)
      .text(`size=${batchSize}`);

    // backward() call
    g.append("text")
      .attr("x", batchBoxWidth / 2)
      .attr("y", 48)
      .attr("text-anchor", "middle")
      .attr("fill", isActive || isProcessed ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
      .attr("font-size", "9px")
      .attr("font-family", "monospace")
      .attr("opacity", 0.7)
      .text("loss.backward()");

    // Arrow from batch to accumulator
    if (accumStep >= batch && accumStep <= 4) {
      const arrowActive = isActive;
      const startX = x + batchBoxWidth;
      const startY = batchY + batchBoxHeight / 2;
      const endX = accumX - 5;
      const endY = accumY + 30 + i * 25;

      // Curved path
      const midX = (startX + endX) / 2 + 20;

      svg.append("path")
        .attr("d", `M${startX + 5},${startY} Q${midX},${startY} ${endX},${endY}`)
        .attr("fill", "none")
        .attr("stroke", arrowActive ? diagramTheme.highlight : diagramTheme.edgeStroke)
        .attr("stroke-width", arrowActive ? 2 : 1.5)
        .attr("marker-end", arrowActive ? "url(#accum-arrow-active)" : "url(#accum-arrow)")
        .attr("opacity", isProcessed && !arrowActive ? 0.5 : (arrowActive ? 1 : 0.7))
        .style("filter", arrowActive ? `drop-shadow(0 0 3px ${diagramTheme.highlightGlow})` : "none");
    }
  });

  // Draw accumulator container
  const accumG = svg.append("g")
    .attr("transform", `translate(${accumX}, ${accumY})`);

  // Accumulator background
  accumG.append("rect")
    .attr("width", accumWidth)
    .attr("height", accumHeight)
    .attr("rx", 8)
    .attr("fill", diagramTheme.bgSecondary)
    .attr("stroke", accumStep >= 1 && accumStep <= 4 ? diagramTheme.accent : diagramTheme.nodeStroke)
    .attr("stroke-width", 2);

  // Gradient level bar (fills from bottom)
  const gradientLevel = accumStepInfo.gradientLevel;
  const barPadding = 8;
  const barWidth = accumWidth - barPadding * 2;
  const barMaxHeight = accumHeight - barPadding * 2 - 20;
  const barHeight = barMaxHeight * gradientLevel;

  if (barHeight > 0) {
    accumG.append("rect")
      .attr("x", barPadding)
      .attr("y", accumHeight - barPadding - barHeight)
      .attr("width", barWidth)
      .attr("height", barHeight)
      .attr("rx", 4)
      .attr("fill", "url(#gradient-fill)")
      .attr("opacity", 0.9);
  }

  // Accumulator label
  accumG.append("text")
    .attr("x", accumWidth / 2)
    .attr("y", 14)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "10px")
    .attr("font-weight", "600")
    .text("Gradients");

  // Percentage label
  accumG.append("text")
    .attr("x", accumWidth / 2)
    .attr("y", accumHeight / 2 + 5)
    .attr("text-anchor", "middle")
    .attr("fill", gradientLevel > 0.3 ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
    .attr("font-size", "14px")
    .attr("font-weight", "700")
    .text(`${Math.round(gradientLevel * 100)}%`);

  // Arrow from accumulator to optimizer
  const optimizerActive = accumStep === 5;

  svg.append("path")
    .attr("d", `M${accumX + accumWidth + 5},${accumY + accumHeight / 2} L${optimizerX - 50},${optimizerY}`)
    .attr("fill", "none")
    .attr("stroke", optimizerActive ? diagramTheme.highlight : diagramTheme.edgeStroke)
    .attr("stroke-width", optimizerActive ? 2.5 : 1.5)
    .attr("marker-end", optimizerActive ? "url(#accum-arrow-active)" : "url(#accum-arrow)")
    .attr("opacity", accumStep < 5 ? 0.4 : 1)
    .attr("stroke-dasharray", accumStep < 5 ? "5,3" : "none")
    .style("filter", optimizerActive ? `drop-shadow(0 0 4px ${diagramTheme.highlightGlow})` : "none");

  // Optimizer box
  const optG = svg.append("g")
    .attr("transform", `translate(${optimizerX - 45}, ${optimizerY - 30})`);

  optG.append("rect")
    .attr("width", 90)
    .attr("height", 60)
    .attr("rx", 6)
    .attr("fill", optimizerActive ? diagramTheme.highlight : diagramTheme.nodeFill)
    .attr("stroke", optimizerActive ? diagramTheme.highlight : diagramTheme.nodeStroke)
    .attr("stroke-width", optimizerActive ? 2.5 : 1.5)
    .style("filter", optimizerActive ? `drop-shadow(0 0 8px ${diagramTheme.highlightGlow})` : "none");

  optG.append("text")
    .attr("x", 45)
    .attr("y", 22)
    .attr("text-anchor", "middle")
    .attr("fill", optimizerActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
    .attr("font-size", "11px")
    .attr("font-weight", "600")
    .text("Optimizer");

  optG.append("text")
    .attr("x", 45)
    .attr("y", 38)
    .attr("text-anchor", "middle")
    .attr("fill", optimizerActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
    .attr("font-size", "9px")
    .attr("font-family", "monospace")
    .attr("opacity", 0.8)
    .text("step()");

  optG.append("text")
    .attr("x", 45)
    .attr("y", 52)
    .attr("text-anchor", "middle")
    .attr("fill", optimizerActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
    .attr("font-size", "8px")
    .attr("opacity", 0.7)
    .text("1 update");

  // Status info panel at bottom
  const infoY = 250;

  svg.append("rect")
    .attr("x", 30)
    .attr("y", infoY)
    .attr("width", width - 60)
    .attr("height", 70)
    .attr("rx", 6)
    .attr("fill", diagramTheme.bgSecondary)
    .attr("stroke", diagramTheme.nodeStroke)
    .attr("stroke-width", 1);

  // Step name
  svg.append("text")
    .attr("x", 50)
    .attr("y", infoY + 22)
    .attr("fill", diagramTheme.highlight)
    .attr("font-size", "13px")
    .attr("font-weight", "700")
    .text(`Step ${accumStep}: ${accumStepInfo.name}`);

  // Description
  svg.append("text")
    .attr("x", 50)
    .attr("y", infoY + 42)
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .text(accumStepInfo.description);

  // Effective batch size calculation
  svg.append("text")
    .attr("x", 50)
    .attr("y", infoY + 58)
    .attr("fill", diagramTheme.accent)
    .attr("font-size", "10px")
    .attr("font-family", "monospace")
    .text(`Effective batch size: ${batchSize} x ${accumSteps} = ${effectiveBatch}`);

  return svg.node();
}

# Demonstrate gradient accumulation: several small backward passes, one update.
model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accumulation_steps = 4

# Start from clean gradients. backward() ADDS into .grad, so anything left over
# from an earlier step would leak into this update.
optimizer.zero_grad()

# Simulate accumulated gradients
total_loss = 0

for _ in range(accumulation_steps):
    x = torch.randn(8, 10)  # Mini-batch
    y = model(x)
    # Scale loss! Dividing by the number of mini-batches makes the summed
    # gradients equal the gradient of the mean loss over all of them.
    loss = y.mean() / accumulation_steps
    loss.backward()  # Gradients accumulate
    total_loss += loss.item()

print(f"Accumulated loss ({accumulation_steps} mini-batches): {total_loss:.4f}")
print(f"Gradient norm before step: {model.weight.grad.norm().item():.4f}")

# Now do one optimizer step for the whole accumulated batch, then reset
optimizer.step()
optimizer.zero_grad()

print("After optimizer.step() and zero_grad()")

Accumulated loss (4 mini-batches): 0.1356
Gradient norm before step: 0.3131
After optimizer.step() and zero_grad()

Gradient Clipping

Gradient clipping scales down gradients whose norm exceeds a threshold, preventing gradient explosion.

# Demonstrate gradient clipping: inflate the gradients, clip them, and compare
# the global gradient norm before and after.
model = nn.Linear(10, 10)


def _global_grad_norm(parameters):
    """Return the L2 norm taken over the gradients of all `parameters` together."""
    return sum(p.grad.norm().item() ** 2 for p in parameters) ** 0.5


# Create artificial large gradients
for param in model.parameters():
    param.grad = torch.randn_like(param) * 100  # Very large!

# Norm before clipping
total_norm_before = _global_grad_norm(model.parameters())
print(f"Gradient norm before clipping: {total_norm_before:.2f}")

# Clip gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# Norm after clipping
total_norm_after = _global_grad_norm(model.parameters())
print(f"Gradient norm after clipping:  {total_norm_after:.2f}")
print(f"\nGradients scaled down by {total_norm_before / total_norm_after:.1f}x")

Gradient norm before clipping: 1059.76
Gradient norm after clipping:  1.00

Gradients scaled down by 1059.8x

Gradient Clipping from Scratch

Let’s implement gradient clipping ourselves to understand the algorithm:

def clip_grad_norm_scratch(params, max_norm: float) -> float:
    """
    Rescale gradients in place so that their global L2 norm is at most max_norm.

    The global norm is sqrt(sum of grad^2) over every parameter that has a
    gradient. When it exceeds max_norm, every gradient is multiplied by the
    same factor (max_norm / total_norm), which keeps the update direction.

    Args:
        params: iterable of parameters; entries whose .grad is None are skipped.
        max_norm: largest global norm allowed after clipping.

    Returns:
        The global norm measured before any clipping.
    """
    # Materialize the gradients once; `params` may be a one-shot generator.
    grads = [p.grad for p in params if p.grad is not None]

    # Global norm: square root of the summed squared entries of every gradient.
    total_norm = sum((g.pow(2).sum().item() for g in grads), 0.0) ** 0.5

    # Shrink only when over the limit; the tiny epsilon keeps the division stable.
    if total_norm > max_norm:
        shrink = max_norm / (total_norm + 1e-12)
        for g in grads:
            g.mul_(shrink)

    return total_norm

# Test: compare the from-scratch clipper with PyTorch's clip_grad_norm_
model_scratch = nn.Linear(10, 10)
model_pytorch = nn.Linear(10, 10)

# Give both models the same large gradients: draw them for the scratch model
# and copy each one over to the PyTorch model.
torch.manual_seed(42)
for ps, pp in zip(model_scratch.parameters(), model_pytorch.parameters()):
    ps.grad = torch.randn_like(ps) * 100
    pp.grad = ps.grad.clone()

# Clip with both implementations
norm_scratch = clip_grad_norm_scratch(model_scratch.parameters(), max_norm=1.0)
norm_pytorch = torch.nn.utils.clip_grad_norm_(model_pytorch.parameters(), max_norm=1.0)

print(f"Original norm (scratch): {norm_scratch:.4f}")
print(f"Original norm (PyTorch): {norm_pytorch.item():.4f}")

# Check gradients match after clipping
grads_match = True
for ps, pp in zip(model_scratch.parameters(), model_pytorch.parameters()):
    if not torch.allclose(ps.grad, pp.grad):
        grads_match = False
        break
print(f"Gradients match after clipping: {grads_match}")

Original norm (scratch): 1037.5083
Original norm (PyTorch): 1037.5084
Gradients match after clipping: True

Key Insight: Gradient Clipping

Gradient clipping scales ALL gradients by the same factor to preserve their relative magnitudes. This is different from clipping each gradient independently - we want to maintain the direction of the overall update while limiting its magnitude.

When to use gradient clipping:

Always for transformer training (standard practice)
max_norm=1.0 is a good default
Monitor gradient norms during training - consistently high norms suggest instability

Batch Size Considerations

Batch size affects both training dynamics and memory usage:

Tradeoffs:

Aspect	Small Batch	Large Batch
Memory	Less	More
Gradient noise	More (regularization effect)	Less (stable gradients)
Convergence	May generalize better	Faster convergence
LR needed	Lower	Higher (linear scaling rule)

The Linear Scaling Rule: When you double the batch size, you can double the learning rate. This maintains similar training dynamics.

Effective batch size = batch_size x gradient_accumulation_steps

# Batch size vs memory example (conceptual)
SEQ_LEN = 512        # sequence length (tokens per sample)
MB_PER_SAMPLE = 50   # ~50MB per sample for a small model

print("Memory usage scales linearly with batch size:")
print()
for batch_size in (8, 16, 32, 64):
    # Simulated numbers: both quantities are simple multiples of the batch size.
    tokens_per_batch = batch_size * SEQ_LEN
    memory_mb = batch_size * MB_PER_SAMPLE
    print(f"  Batch size {batch_size:2d}: ~{tokens_per_batch:,} tokens/batch, ~{memory_mb}MB")

Memory usage scales linearly with batch size:

  Batch size  8: ~4,096 tokens/batch, ~400MB
  Batch size 16: ~8,192 tokens/batch, ~800MB
  Batch size 32: ~16,384 tokens/batch, ~1600MB
  Batch size 64: ~32,768 tokens/batch, ~3200MB

Mixed Precision & Numerics

Everything so far assumed a comfortable 32-bit float (fp32). But modern models train in 16-bit — and increasingly 8-bit — because the arithmetic is 2–8× faster and every tensor is half the size (or less). The catch is that a narrower float represents fewer numbers, and two failure modes follow:

Overflow — a value too large for the format becomes inf, and ordinary follow-up arithmetic on it (inf - inf, inf × 0, inf / inf) produces NaN. One NaN poisons the whole update.
Underflow — a value too small rounds all the way to 0, so a real gradient silently vanishes and that weight never learns.

To see exactly where those walls are, we build the floating-point number line from scratch in precision.py — a small IEEE-754-style encoder that rounds any Python float into a chosen (exponent, mantissa) layout, the same rounding a GPU does when it stores a number in 16 bits. It reproduces fp32, fp16, and bf16 exactly (the tests check it against struct and PyTorch).

Anatomy of a float

A binary float spends its bits on two jobs: exponent bits buy range (how big and how small), mantissa bits buy precision (how many significant digits). A 16-bit budget forces a choice, and the two 16-bit formats split it oppositely:

from precision import FORMATS, format_spec

# Print one summary row per format: how the bits are split between exponent and
# mantissa, the largest and smallest normal values, eps, and the approximate
# number of decimal digits.
for name in ("fp32", "fp16", "bf16"):
    s = format_spec(FORMATS[name])  # read below through string keys
    print(f"{s['name']:5}  exp={s['exp_bits']:>2}  mantissa={s['mantissa_bits']:>2}  "
          f"max={s['max_normal']:.3e}  min_normal={s['min_normal']:.3e}  "
          f"eps={s['eps']:.2e}  (~{s['decimal_digits']:.1f} decimal digits)")

fp32   exp= 8  mantissa=23  max=3.403e+38  min_normal=1.175e-38  eps=1.19e-07  (~6.9 decimal digits)
fp16   exp= 5  mantissa=10  max=6.550e+04  min_normal=6.104e-05  eps=9.77e-04  (~3.0 decimal digits)
bf16   exp= 8  mantissa= 7  max=3.390e+38  min_normal=1.175e-38  eps=7.81e-03  (~2.1 decimal digits)

Read the two 16-bit rows against each other: fp16 spends 10 bits on the mantissa (fine precision) but only 5 on the exponent (a tiny range — it caps at 65504). bf16 keeps fp32’s full 8 exponent bits (so it barely ever overflows or underflows) at the cost of just 7 mantissa bits (coarse precision). That one trade is the whole story of mixed-precision training.

Step through some values and watch how each format stores them — where the bits go, what value comes back, and when a value hits a wall:

from precision import FORMATS, encode, round_to_format, rounding_error, overflows, underflows

def _bit_row(x, fmt):
    """Describe how `x` is stored in `fmt`, as a plain dict for the OJS bit stepper.

    The encoded bit pattern is rendered as a zero-padded binary string of
    fmt.total_bits characters and cut into sign / exponent / mantissa pieces
    using fmt.exp_bits.
    """
    import math as _m
    pattern = format(encode(x, fmt), f"0{fmt.total_bits}b")
    mant_start = 1 + fmt.exp_bits
    rounded = round_to_format(x, fmt)
    return dict(
        sign=pattern[0],
        exp=pattern[1:mant_start],
        mant=pattern[mant_start:],
        # Non-finite results are sent as None (null in JS); the overflow flag drives display.
        stored=rounded if _m.isfinite(rounded) else None,
        error=min(rounding_error(x, fmt), 9.99),  # capped at 9.99
        overflow=overflows(x, fmt),
        underflow=underflows(x, fmt),
    )

# Values shown by the stepper, in slider order.
_demo = [0.1, 1.0, 3.1415927, 0.15625, 65504.0, 70000.0, 6.1e-5, 1e-8]
precisionData = [
    {"value": x, **{fmt_name: _bit_row(x, FORMATS[fmt_name]) for fmt_name in ("fp16", "bf16")}}
    for x in _demo
]
ojs_define(precisionData = precisionData)

viewof precStep = stepControl({min: 0, max: 7, value: 5, label: "Value"})

precBitViz = {
  const theme = diagramTheme;
  const width = 720, height = 300;
  const row = precisionData[precStep];

  const svg = d3.create("svg")
    .attr("viewBox", `0 0 ${width} ${height}`)
    .attr("width", "100%")
    .attr("height", height)
    .style("max-width", `${width}px`)
    .style("font-family", "'JetBrains Mono', 'Fira Code', monospace");

  svg.append("rect").attr("width", width).attr("height", height)
    .attr("fill", theme.bg).attr("rx", 12);

  svg.append("text").attr("x", width / 2).attr("y", 34)
    .attr("text-anchor", "middle").attr("fill", theme.nodeText)
    .attr("font-size", 20).attr("font-weight", 700)
    .text(`storing  ${row.value}`);

  const drawFormat = (label, fmt, y) => {
    const boxW = 22, boxH = 30, gap = 3;
    const bits = [
      {b: fmt.sign, kind: "sign"},
      ...fmt.exp.split("").map(b => ({b, kind: "exp"})),
      ...fmt.mant.split("").map(b => ({b, kind: "mant"}))
    ];
    const totalW = bits.length * (boxW + gap);
    const x0 = (width - totalW) / 2;
    const color = {sign: theme.edgeStroke, exp: theme.accent, mant: theme.highlight};

    svg.append("text").attr("x", x0 - 12).attr("y", y + boxH / 2 + 5)
      .attr("text-anchor", "end").attr("fill", theme.nodeText)
      .attr("font-size", 14).attr("font-weight", 600).text(label);

    bits.forEach((bit, i) => {
      const x = x0 + i * (boxW + gap);
      svg.append("rect").attr("x", x).attr("y", y).attr("width", boxW).attr("height", boxH)
        .attr("rx", 3).attr("fill", bit.b === "1" ? color[bit.kind] : theme.bgSecondary)
        .attr("stroke", theme.nodeStroke).attr("stroke-width", 1);
      svg.append("text").attr("x", x + boxW / 2).attr("y", y + boxH / 2 + 5)
        .attr("text-anchor", "middle")
        .attr("fill", bit.b === "1" ? theme.bg : theme.edgeStroke)
        .attr("font-size", 13).text(bit.b);
    });

    let note, noteColor;
    if (fmt.overflow) { note = "→ OVERFLOW (inf)"; noteColor = theme.error; }
    else if (fmt.underflow) { note = "→ UNDERFLOW (0)"; noteColor = theme.error; }
    else {
      const err = fmt.error === 0 ? "exact" : `err ${(fmt.error * 100).toFixed(3)}%`;
      note = `→ ${fmt.stored}   (${err})`;
      noteColor = fmt.error === 0 ? theme.success : theme.nodeText;
    }
    svg.append("text").attr("x", x0 + totalW + 14).attr("y", y + boxH / 2 + 5)
      .attr("fill", noteColor).attr("font-size", 12).text(note);
  };

  drawFormat("fp16", row.fp16, 90);
  drawFormat("bf16", row.bf16, 170);

  // legend
  const leg = [["sign", theme.edgeStroke], ["exponent (range)", theme.accent], ["mantissa (precision)", theme.highlight]];
  let lx = (width - 360) / 2;
  leg.forEach(([t, c]) => {
    svg.append("rect").attr("x", lx).attr("y", 244).attr("width", 14).attr("height", 14).attr("rx", 2).attr("fill", c);
    svg.append("text").attr("x", lx + 20).attr("y", 255).attr("fill", theme.nodeText).attr("font-size", 12).text(t);
    lx += t.length * 7.4 + 44;
  });

  return svg.node();
}

Key Insight

Same 16 bits, opposite bets. fp16 gives the mantissa 10 bits and the exponent 5 — precise, but it overflows at 65504 and underflows around \(6\times10^{-5}\). bf16 gives the exponent 8 bits (fp32’s whole range) and the mantissa only 7 — it almost never over/underflows, but every stored number is coarser. Step to 70000 and watch fp16 flip to inf while bf16 holds it; step to 1e-8 and watch fp16 collapse to 0.

Failure mode 1: overflow

70000 is a perfectly ordinary number, but it is past fp16’s largest value (65504). Storing it there gives inf — and inf - inf = NaN a few operations later. bf16, with fp32’s exponent range, does not blink:

from precision import round_to_format, FORMATS

# Round 70000 into each 16-bit format and print the value that comes back.
for name in ("fp16", "bf16"):
    print(f"70000 in {name}: {round_to_format(70000.0, FORMATS[name])}")

70000 in fp16: inf
70000 in bf16: 70144.0

Failure mode 2: underflow (and the loss-scaling fix)

The subtler killer is underflow. Late in training, gradients get small — and a gradient like 1e-8 is below fp16’s smallest representable value, so it rounds to 0 and that weight simply stops updating. The classic fix is loss scaling: multiply the loss by a constant \(S\) before backprop, which multiplies every gradient by \(S\), lifting it out of the underflow hole. The gradients are then divided by \(S\) again (unscaled) before the optimizer applies the update, so the math is unchanged — only the representation was rescued.

from precision import loss_scale_gradient, FORMATS

# Store a 1e-8 gradient in fp16 directly, and again with a loss scale of 1024.
# The result is read through the keys grad, naive_stored, naive_underflowed,
# scale, recovered and rescued.
out = loss_scale_gradient(grad=1e-8, scale=1024.0, fmt=FORMATS["fp16"])
print(f"true gradient:            {out['grad']:g}")
print(f"stored directly in fp16:  {out['naive_stored']}   underflowed={out['naive_underflowed']}")
print(f"x{out['scale']:g}, store, unscale:   {out['recovered']:g}   rescued={out['rescued']}")

true gradient:            1e-08
stored directly in fp16:  0.0   underflowed=True
x1024, store, unscale:   1.00117e-08   rescued=True

Multiplying by \(S = 1024\) turned an unrepresentable 1e-8 into ~1e-5 (safely inside fp16), and unscaling recovered the gradient almost exactly. This is what PyTorch’s GradScaler automates — and it is why bf16, which underflows far less, usually needs no loss scaling at all.

The whole number line at a glance

The range ladder below shows why the choice comes out the way it does: each bar runs from a format’s smallest subnormal to its largest value (log scale). fp16 is a narrow window; bf16 spans essentially all of fp32. The two markers are the problem values above — 70000 sits past fp16’s right wall, 1e-8 past its left.

from precision import FORMATS, format_spec

# Collect each format's span, from its min_subnormal to its max_normal, and
# hand the list to the OJS cells as `precisionRanges`.
_ranges = []
for name in ("fp32", "fp16", "bf16"):
    s = format_spec(FORMATS[name])
    _ranges.append({"format": name, "lo": s["min_subnormal"], "hi": s["max_normal"]})
ojs_define(precisionRanges = _ranges)

precRangeViz = Plot.plot({
  width: 680,
  height: 190,
  marginLeft: 70,
  x: {type: "log", label: "representable magnitude (log scale)", grid: true},
  y: {label: null, domain: precisionRanges.map(r => r.format)},
  color: {domain: ["fp32", "fp16", "bf16"], range: [diagramTheme.edgeStroke, diagramTheme.highlight, diagramTheme.accent]},
  marks: [
    Plot.barX(precisionRanges, {y: "format", x1: "lo", x2: "hi", fill: "format", rx: 4, fillOpacity: 0.85}),
    Plot.ruleX([70000], {stroke: diagramTheme.error, strokeDasharray: "4 3"}),
    Plot.ruleX([1e-8], {stroke: diagramTheme.error, strokeDasharray: "4 3"}),
    Plot.text([{x: 70000, t: "70000"}], {x: "x", y: () => "fp32", text: "t", dy: -34, fill: diagramTheme.error, fontSize: 11}),
    Plot.text([{x: 1e-8, t: "1e-8"}], {x: "x", y: () => "fp32", text: "t", dy: -34, fill: diagramTheme.error, fontSize: 11})
  ]
})

Try This

In the bit stepper, compare 0.1 across the two formats: neither is exact (0.1 is not a finite binary fraction), but fp16’s error is about 4× smaller than bf16’s — its 3 extra mantissa bits make the rounding grid 8× finer. Then jump to 70000 and 1e-8: fp16 hits a wall at both ends, bf16 at neither. That asymmetry — bf16 trades precision you rarely miss for range you can’t afford to lose — is why bf16 became the default 16-bit training format on A100/H100/TPU hardware.

Putting it together: the mixed-precision recipe

“Mixed” precision means using both: fast low-precision math for the heavy forward/backward pass, full-precision fp32 for the parts that must be accurate.

Keep a master copy of the weights in fp32.
Cast to fp16/bf16 for the forward and backward pass (the expensive part, now 2× faster).
If using fp16, apply loss scaling so small gradients survive.
Apply the update to the fp32 master weights (accurate accumulation), then re-cast.

In PyTorch this is a few lines of autocast + GradScaler:

# Recent PyTorch releases also expose these as torch.amp.autocast("cuda", ...)
# and torch.amp.GradScaler("cuda"); the torch.cuda.amp names are the older spelling.
from torch.cuda.amp import autocast, GradScaler

# fp16 recipe: fp16 is the format that needs loss scaling, so it is the one
# paired with GradScaler here.
scaler = GradScaler()                       # manages loss scaling automatically
for input_ids, targets in dataloader:       # assumes each batch is an (input_ids, targets) pair
    optimizer.zero_grad()
    with autocast(dtype=torch.float16):     # fp16 forward pass and loss
        loss = F.cross_entropy(model(input_ids), targets)
    scaler.scale(loss).backward()           # scale up -> gradients survive fp16
    scaler.step(optimizer)                  # unscale -> fp32 master-weight update
    scaler.update()                         # adapt the scale factor

# bf16 variant: use autocast(dtype=torch.bfloat16) and drop the scaler --
# plain loss.backward() followed by optimizer.step().

Practical advice:

Prefer bf16 where the hardware supports it (A100/H100/TPU) — its fp32-sized range means no loss scaling and far fewer NaN surprises.
Use fp16 + loss scaling on older GPUs (e.g. V100) that lack bf16.
fp8 (e4m3 / e5m2) is the frontier — H100-class hardware trains parts of the network in 8 bits, with per-tensor scaling doing the loss-scaling job the number line above makes necessary.

Distributed Training Basics

Large models require multiple GPUs. A brief overview:

Data Parallel (DP/DDP):

Same model copied to all GPUs
Each GPU processes different data
Gradients are averaged across GPUs
Memory per GPU = full model size

// Data Parallel step descriptions, in slider order.
// Each entry becomes { id, name, description }, where `id` is its position in the list.
dpSteps = [
  ["Input Data", "Large training batch ready to be distributed across GPUs"],
  ["Split Data", "Batch is divided evenly among available GPUs"],
  ["Forward Pass", "Each GPU computes forward pass on its data shard with full model copy"],
  ["Compute Gradients", "Each GPU computes gradients via backpropagation"],
  ["AllReduce", "Gradients are averaged across all GPUs via collective communication"]
].map(([name, description], id) => ({ id, name, description }))

// Step slider for Data Parallel diagram
viewof dpStep = Inputs.range([0, 4], {
  value: 0,
  step: 1,
  label: "Step"
})

// Current Data Parallel step info
currentDpStep = dpSteps[dpStep]

// Data Parallel interactive diagram
{
  const width = 700;
  const height = 380;
  const numGpus = 3;
  const batchPerGpu = 8;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`);

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", diagramTheme.bg)
    .attr("rx", 8);

  // Defs for arrows and gradients
  const defs = svg.append("defs");

  // Arrow markers
  defs.append("marker")
    .attr("id", "dp-arrow")
    .attr("viewBox", "0 -5 10 10")
    .attr("refX", 8)
    .attr("refY", 0)
    .attr("markerWidth", 5)
    .attr("markerHeight", 5)
    .attr("orient", "auto")
    .append("path")
    .attr("d", "M0,-5L10,0L0,5")
    .attr("fill", diagramTheme.edgeStroke);

  defs.append("marker")
    .attr("id", "dp-arrow-active")
    .attr("viewBox", "0 -5 10 10")
    .attr("refX", 8)
    .attr("refY", 0)
    .attr("markerWidth", 5)
    .attr("markerHeight", 5)
    .attr("orient", "auto")
    .append("path")
    .attr("d", "M0,-5L10,0L0,5")
    .attr("fill", diagramTheme.highlight);

  // Data flow gradient for animation effect
  const flowGradient = defs.append("linearGradient")
    .attr("id", "dp-flow-gradient")
    .attr("x1", "0%")
    .attr("y1", "0%")
    .attr("x2", "100%")
    .attr("y2", "0%");

  flowGradient.append("stop")
    .attr("offset", "0%")
    .attr("stop-color", diagramTheme.highlight)
    .attr("stop-opacity", 0.2);

  flowGradient.append("stop")
    .attr("offset", "50%")
    .attr("stop-color", diagramTheme.highlight)
    .attr("stop-opacity", 1);

  flowGradient.append("stop")
    .attr("offset", "100%")
    .attr("stop-color", diagramTheme.highlight)
    .attr("stop-opacity", 0.2);

  // Layout constants
  const batchX = 70;
  const splitX = 200;
  const gpuX = 400;
  const reduceX = 580;
  const centerY = height / 2;
  const gpuSpacing = 90;

  // GPU Y positions
  const gpuYs = [centerY - gpuSpacing, centerY, centerY + gpuSpacing];

  // Helper: draw data block
  const drawDataBlock = (g, x, y, w, h, label, isActive, isSmall = false) => {
    const block = g.append("g").attr("transform", `translate(${x}, ${y})`);

    block.append("rect")
      .attr("x", -w/2)
      .attr("y", -h/2)
      .attr("width", w)
      .attr("height", h)
      .attr("rx", 4)
      .attr("fill", isActive ? diagramTheme.highlight : diagramTheme.nodeFill)
      .attr("stroke", isActive ? diagramTheme.highlight : diagramTheme.nodeStroke)
      .attr("stroke-width", isActive ? 2 : 1.5)
      .style("filter", isActive ? `drop-shadow(0 0 6px ${diagramTheme.highlightGlow})` : "none");

    if (label) {
      block.append("text")
        .attr("text-anchor", "middle")
        .attr("dominant-baseline", "central")
        .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
        .attr("font-size", isSmall ? "10px" : "11px")
        .attr("font-weight", "500")
        .text(label);
    }

    return block;
  };

  // Helper: draw GPU box
  const drawGpu = (g, x, y, gpuNum, isActive, showGradients = false) => {
    const gpu = g.append("g").attr("transform", `translate(${x}, ${y})`);
    const boxW = 100;
    const boxH = 60;

    // GPU container
    gpu.append("rect")
      .attr("x", -boxW/2)
      .attr("y", -boxH/2)
      .attr("width", boxW)
      .attr("height", boxH)
      .attr("rx", 6)
      .attr("fill", isActive ? diagramTheme.highlight : diagramTheme.nodeFill)
      .attr("stroke", isActive ? diagramTheme.highlight : diagramTheme.nodeStroke)
      .attr("stroke-width", isActive ? 2.5 : 1.5)
      .style("filter", isActive ? `drop-shadow(0 0 8px ${diagramTheme.highlightGlow})` : "none");

    // GPU label
    gpu.append("text")
      .attr("y", -12)
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
      .attr("font-size", "11px")
      .attr("font-weight", "600")
      .text(`GPU ${gpuNum}`);

    // Full model indicator
    gpu.append("text")
      .attr("y", 6)
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
      .attr("font-size", "9px")
      .attr("opacity", isActive ? 0.9 : 0.7)
      .text("Full Model");

    // Gradient indicator (when computing gradients)
    if (showGradients) {
      gpu.append("text")
        .attr("y", 20)
        .attr("text-anchor", "middle")
        .attr("dominant-baseline", "central")
        .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.accent)
        .attr("font-size", "9px")
        .attr("font-weight", "500")
        .text("∇ gradients");
    }

    return gpu;
  };

  // Draw based on current step
  const mainGroup = svg.append("g");

  // Step 0: Show full batch
  if (dpStep >= 0) {
    const isActive = dpStep === 0;
    drawDataBlock(mainGroup, batchX, centerY, 60, 100, null, isActive);

    // Data visualization inside batch
    const batchGroup = mainGroup.append("g").attr("transform", `translate(${batchX}, ${centerY})`);
    for (let i = 0; i < 6; i++) {
      const row = Math.floor(i / 2);
      const col = i % 2;
      batchGroup.append("rect")
        .attr("x", -20 + col * 22)
        .attr("y", -35 + row * 25)
        .attr("width", 18)
        .attr("height", 20)
        .attr("rx", 2)
        .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.accent)
        .attr("opacity", isActive ? 0.9 : 0.6);
    }

    // Batch label
    mainGroup.append("text")
      .attr("x", batchX)
      .attr("y", centerY + 65)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "11px")
      .attr("font-weight", "500")
      .text("Data Batch");

    mainGroup.append("text")
      .attr("x", batchX)
      .attr("y", centerY + 80)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "10px")
      .attr("opacity", 0.7)
      .text(`(${batchPerGpu * numGpus} samples)`);
  }

  // Step 1+: Show split batches
  if (dpStep >= 1) {
    const isActive = dpStep === 1;

    // Draw split indicator arrows
    for (let i = 0; i < numGpus; i++) {
      const startX = batchX + 35;
      const startY = centerY;
      const endX = splitX - 25;
      const endY = gpuYs[i];

      mainGroup.append("path")
        .attr("d", `M${startX},${startY} C${startX + 40},${startY} ${endX - 40},${endY} ${endX},${endY}`)
        .attr("fill", "none")
        .attr("stroke", isActive ? diagramTheme.highlight : diagramTheme.edgeStroke)
        .attr("stroke-width", isActive ? 2 : 1.5)
        .attr("marker-end", isActive ? "url(#dp-arrow-active)" : "url(#dp-arrow)")
        .attr("opacity", isActive ? 1 : 0.6)
        .style("filter", isActive ? `drop-shadow(0 0 4px ${diagramTheme.highlightGlow})` : "none");
    }

    // Draw split batches
    for (let i = 0; i < numGpus; i++) {
      drawDataBlock(mainGroup, splitX, gpuYs[i], 45, 40, null, isActive, true);

      // Mini data visualization
      const splitGroup = mainGroup.append("g").attr("transform", `translate(${splitX}, ${gpuYs[i]})`);
      for (let j = 0; j < 2; j++) {
        splitGroup.append("rect")
          .attr("x", -15 + j * 16)
          .attr("y", -8)
          .attr("width", 12)
          .attr("height", 16)
          .attr("rx", 2)
          .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.accent)
          .attr("opacity", isActive ? 0.9 : 0.6);
      }

      // Batch shard label
      mainGroup.append("text")
        .attr("x", splitX)
        .attr("y", gpuYs[i] + 30)
        .attr("text-anchor", "middle")
        .attr("fill", diagramTheme.nodeText)
        .attr("font-size", "9px")
        .attr("opacity", 0.7)
        .text(`Batch ${i}`);
    }
  }

  // Step 2+: Show GPUs with forward pass
  if (dpStep >= 2) {
    const isForward = dpStep === 2;
    const isGradient = dpStep === 3;
    const isActive = isForward || isGradient;

    // Arrows from split to GPU
    for (let i = 0; i < numGpus; i++) {
      const startX = splitX + 28;
      const endX = gpuX - 55;
      const y = gpuYs[i];

      mainGroup.append("path")
        .attr("d", `M${startX},${y} L${endX},${y}`)
        .attr("fill", "none")
        .attr("stroke", isActive ? diagramTheme.highlight : diagramTheme.edgeStroke)
        .attr("stroke-width", isActive ? 2 : 1.5)
        .attr("marker-end", isActive ? "url(#dp-arrow-active)" : "url(#dp-arrow)")
        .attr("opacity", isActive ? 1 : 0.6)
        .style("filter", isActive ? `drop-shadow(0 0 4px ${diagramTheme.highlightGlow})` : "none");
    }

    // Draw GPUs
    for (let i = 0; i < numGpus; i++) {
      drawGpu(mainGroup, gpuX, gpuYs[i], i, isActive, isGradient);
    }
  }

  // Step 4: AllReduce
  if (dpStep >= 4) {
    const isActive = dpStep === 4;

    // Arrows from GPU to AllReduce
    for (let i = 0; i < numGpus; i++) {
      const startX = gpuX + 55;
      const startY = gpuYs[i];
      const endX = reduceX - 45;
      const endY = centerY;

      mainGroup.append("path")
        .attr("d", `M${startX},${startY} C${startX + 30},${startY} ${endX - 30},${endY} ${endX},${endY}`)
        .attr("fill", "none")
        .attr("stroke", isActive ? diagramTheme.highlight : diagramTheme.edgeStroke)
        .attr("stroke-width", isActive ? 2 : 1.5)
        .attr("marker-end", isActive ? "url(#dp-arrow-active)" : "url(#dp-arrow)")
        .attr("opacity", isActive ? 1 : 0.6)
        .style("filter", isActive ? `drop-shadow(0 0 4px ${diagramTheme.highlightGlow})` : "none");
    }

    // AllReduce node
    const reduceGroup = mainGroup.append("g").attr("transform", `translate(${reduceX}, ${centerY})`);

    reduceGroup.append("rect")
      .attr("x", -42)
      .attr("y", -35)
      .attr("width", 84)
      .attr("height", 70)
      .attr("rx", 6)
      .attr("fill", isActive ? diagramTheme.highlight : diagramTheme.nodeFill)
      .attr("stroke", isActive ? diagramTheme.highlight : diagramTheme.nodeStroke)
      .attr("stroke-width", isActive ? 2.5 : 1.5)
      .style("filter", isActive ? `drop-shadow(0 0 8px ${diagramTheme.highlightGlow})` : "none");

    reduceGroup.append("text")
      .attr("y", -10)
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
      .attr("font-size", "11px")
      .attr("font-weight", "600")
      .text("AllReduce");

    reduceGroup.append("text")
      .attr("y", 8)
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
      .attr("font-size", "9px")
      .attr("opacity", 0.8)
      .text("Average");

    reduceGroup.append("text")
      .attr("y", 22)
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", isActive ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
      .attr("font-size", "9px")
      .attr("opacity", 0.8)
      .text("Gradients");
  }

  // Step indicator bar at top
  const stepBarY = 25;
  const stepBarWidth = 500;
  const stepBarX = (width - stepBarWidth) / 2;
  const stepWidth = stepBarWidth / 5;

  const stepLabels = ["Input", "Split", "Forward", "Gradients", "AllReduce"];

  for (let i = 0; i < 5; i++) {
    const isCurrentStep = dpStep === i;
    const isPastStep = dpStep > i;
    const stepX = stepBarX + i * stepWidth + stepWidth / 2;

    // Step circle
    mainGroup.append("circle")
      .attr("cx", stepX)
      .attr("cy", stepBarY)
      .attr("r", 12)
      .attr("fill", isCurrentStep ? diagramTheme.highlight : (isPastStep ? diagramTheme.accent : diagramTheme.nodeFill))
      .attr("stroke", isCurrentStep ? diagramTheme.highlight : (isPastStep ? diagramTheme.accent : diagramTheme.nodeStroke))
      .attr("stroke-width", isCurrentStep ? 2 : 1.5)
      .style("filter", isCurrentStep ? `drop-shadow(0 0 6px ${diagramTheme.highlightGlow})` : "none");

    // Step number
    mainGroup.append("text")
      .attr("x", stepX)
      .attr("y", stepBarY)
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", isCurrentStep || isPastStep ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
      .attr("font-size", "10px")
      .attr("font-weight", "600")
      .text(i);

    // Step label
    mainGroup.append("text")
      .attr("x", stepX)
      .attr("y", stepBarY + 22)
      .attr("text-anchor", "middle")
      .attr("fill", isCurrentStep ? diagramTheme.highlight : diagramTheme.nodeText)
      .attr("font-size", "9px")
      .attr("font-weight", isCurrentStep ? "600" : "400")
      .attr("opacity", isCurrentStep ? 1 : 0.7)
      .text(stepLabels[i]);

    // Connecting line (except last)
    if (i < 4) {
      const lineStartX = stepX + 15;
      const lineEndX = stepBarX + (i + 1) * stepWidth + stepWidth / 2 - 15;

      mainGroup.append("line")
        .attr("x1", lineStartX)
        .attr("y1", stepBarY)
        .attr("x2", lineEndX)
        .attr("y2", stepBarY)
        .attr("stroke", isPastStep ? diagramTheme.accent : diagramTheme.edgeStroke)
        .attr("stroke-width", 1.5)
        .attr("opacity", 0.5);
    }
  }

  // Effective batch size display
  const statsY = height - 30;

  mainGroup.append("text")
    .attr("x", width / 2)
    .attr("y", statsY)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .attr("opacity", 0.8)
    .text(`Effective batch size: ${batchPerGpu} samples/GPU × ${numGpus} GPUs = ${batchPerGpu * numGpus} samples`);

  return svg.node();
}

// Step description panel for Data Parallel
html`<div style="
  background: ${diagramTheme.bgSecondary};
  border-radius: 6px;
  padding: 12px 16px;
  margin-top: 8px;
  border-left: 3px solid ${diagramTheme.highlight};
">
  <div style="font-weight: 600; color: ${diagramTheme.nodeText}; margin-bottom: 4px;">
    Step ${currentDpStep.id}: ${currentDpStep.name}
  </div>
  <div style="color: ${diagramTheme.nodeText}; opacity: 0.8; font-size: 13px;">
    ${currentDpStep.description}
  </div>
</div>`

Fully Sharded Data Parallel (FSDP):

Model is sharded across GPUs
Each GPU holds a fraction of parameters
Memory per GPU = model_size / num_gpus
Enables training models larger than single GPU memory

# Distributed training concepts: each strategy with its two summary bullets
parallelism_strategies = [
    ("Data Parallel (DDP)", [
        "Best for: Models that fit in one GPU",
        "Scales: Batch size (effective_batch = batch * num_gpus)",
    ]),
    ("Fully Sharded Data Parallel (FSDP)", [
        "Best for: Large models (>10B parameters)",
        "Scales: Model size and batch size",
    ]),
    ("Pipeline Parallel", [
        "Best for: Very deep models",
        "Splits model layers across GPUs",
    ]),
    ("Tensor Parallel", [
        "Best for: Models with large layers",
        "Splits individual layers across GPUs",
    ]),
]

print("Distributed Training Strategies:")
for number, (strategy_name, bullets) in enumerate(parallelism_strategies, start=1):
    print()  # blank line before every numbered entry, none after the last
    print(f"{number}. {strategy_name}:")
    for bullet in bullets:
        print(f"   - {bullet}")

Distributed Training Strategies:

1. Data Parallel (DDP):
   - Best for: Models that fit in one GPU
   - Scales: Batch size (effective_batch = batch * num_gpus)

2. Fully Sharded Data Parallel (FSDP):
   - Best for: Large models (>10B parameters)
   - Scales: Model size and batch size

3. Pipeline Parallel:
   - Best for: Very deep models
   - Splits model layers across GPUs

4. Tensor Parallel:
   - Best for: Models with large layers
   - Splits individual layers across GPUs

Training Stability and Failure Modes

Understanding common failure modes helps you debug training issues:

Loss = NaN or Inf

Causes:

- Learning rate too high
- Gradient explosion
- Numerical overflow in fp16

Solutions:

- Reduce learning rate (try 10x smaller)
- Add gradient clipping
- Use bf16 instead of fp16 or add gradient scaling

Loss stuck at high value

Causes:

- Learning rate too low
- Poor weight initialization
- Data loading bug (same batch every time)

Solutions:

- Increase learning rate
- Check data loader with small sample
- Verify model architecture

Loss oscillates or increases

Causes:

- Learning rate too high
- Batch size too small
- Bug in loss computation

Solutions:

- Add warmup period
- Reduce learning rate
- Use gradient accumulation

// Training pathologies visualization
trainingPathologiesChart = {
  const width = 750;
  const height = 280;
  const margin = { top: 35, right: 20, bottom: 45, left: 50 };

  const theme = diagramTheme;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`);

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", theme.bg)
    .attr("rx", 8);

  // Three panels: Good, Unstable (LR high), Slow (LR low)
  const panelWidth = (width - 60) / 3;
  const innerWidth = panelWidth - margin.left - margin.right + 30;
  const innerHeight = height - margin.top - margin.bottom;

  const configs = [
    {
      title: "Good Training",
      color: theme.accent,
      getData: () => {
        const data = [];
        for (let i = 0; i < 100; i++) {
          // Exponential decay with some noise
          const noise = (Math.sin(i * 0.7) * 0.1 + Math.cos(i * 1.3) * 0.08);
          data.push({ step: i, loss: 5.0 * Math.exp(-0.03 * i) + 0.5 + noise });
        }
        return data;
      },
      yDomain: [0, 6]
    },
    {
      title: "LR Too High (Unstable)",
      color: diagramTheme.error, // red - danger/unstable
      getData: () => {
        const data = [];
        for (let i = 0; i < 100; i++) {
          data.push({ step: i, loss: 4.0 + 0.5 * Math.sin(i * 0.3) + 0.02 * i });
        }
        return data;
      },
      yDomain: [0, 8]
    },
    {
      title: "LR Too Low (Slow)",
      color: theme.highlight,
      getData: () => {
        const data = [];
        for (let i = 0; i < 100; i++) {
          data.push({ step: i, loss: 5.0 * Math.exp(-0.005 * i) + 0.5 });
        }
        return data;
      },
      yDomain: [0, 6]
    }
  ];

  configs.forEach((config, panelIdx) => {
    const offsetX = 10 + panelIdx * panelWidth;
    const panel = svg.append("g")
      .attr("transform", `translate(${offsetX + margin.left}, ${margin.top})`);

    const data = config.getData();

    // Scales
    const xScale = d3.scaleLinear()
      .domain([0, 100])
      .range([0, innerWidth]);

    const yScale = d3.scaleLinear()
      .domain(config.yDomain)
      .range([innerHeight, 0]);

    // Grid lines
    [2, 4, 6].forEach(tick => {
      if (tick <= config.yDomain[1]) {
        panel.append("line")
          .attr("x1", 0)
          .attr("x2", innerWidth)
          .attr("y1", yScale(tick))
          .attr("y2", yScale(tick))
          .attr("stroke", theme.nodeStroke)
          .attr("stroke-opacity", 0.3)
          .attr("stroke-dasharray", "2,2");
      }
    });

    // Line generator
    const lineGen = d3.line()
      .x(d => xScale(d.step))
      .y(d => yScale(d.loss))
      .curve(d3.curveMonotoneX);

    // Line
    panel.append("path")
      .datum(data)
      .attr("d", lineGen)
      .attr("fill", "none")
      .attr("stroke", config.color)
      .attr("stroke-width", 2.5);

    // X-axis
    panel.append("g")
      .attr("transform", `translate(0, ${innerHeight})`)
      .call(d3.axisBottom(xScale).ticks(5))
      .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
      .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
      .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "10px"));

    panel.append("text")
      .attr("x", innerWidth / 2)
      .attr("y", innerHeight + 35)
      .attr("text-anchor", "middle")
      .attr("font-size", "10px")
      .attr("fill", theme.nodeText)
      .text("Step");

    // Y-axis
    panel.append("g")
      .call(d3.axisLeft(yScale).ticks(4))
      .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
      .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
      .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "10px"));

    if (panelIdx === 0) {
      panel.append("text")
        .attr("transform", "rotate(-90)")
        .attr("x", -innerHeight / 2)
        .attr("y", -35)
        .attr("text-anchor", "middle")
        .attr("font-size", "10px")
        .attr("fill", theme.nodeText)
        .text("Loss");
    }

    // Title
    panel.append("text")
      .attr("x", innerWidth / 2)
      .attr("y", -12)
      .attr("text-anchor", "middle")
      .attr("font-size", "12px")
      .attr("font-weight", "600")
      .attr("fill", config.color)
      .text(config.title);
  });

  return svg.node();
}

Debugging checklist:

Check initial loss - should be ~log(vocab_size) for untrained model
Verify data is being loaded correctly (print a few samples)
Monitor gradient norms - should be stable, not growing
Check learning rate schedule is working (print LR each step)
Test with a tiny dataset first to verify overfitting capability

Text Dataset

Let’s create a simple dataset for language modeling:

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Sliding-window dataset for next-token language modeling.

    Sample ``i`` pairs the window ``tokens[i : i + seq_len]`` with the same
    window shifted one position to the right, so every target is the token
    that follows the corresponding input position.

    Args:
        tokens: Sequence of token IDs supporting ``len()`` and slicing
            (a 1-D tensor in this module's examples).
        seq_len: Number of tokens in each input/target window.
    """

    def __init__(self, tokens, seq_len):
        self.tokens = tokens
        self.seq_len = seq_len

    def __len__(self):
        # Each window needs seq_len inputs plus one extra token for the last
        # target; clamp to 0 when the token stream is too short for any window.
        return max(0, len(self.tokens) - self.seq_len)

    def __getitem__(self, idx):
        """Return ``(input_ids, targets)`` for window ``idx``.

        Negative indices count back from the last window.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
                Raising here is also what lets plain ``for sample in dataset``
                iteration terminate instead of yielding empty windows forever.
        """
        num_samples = len(self)
        start = idx + num_samples if idx < 0 else idx
        if not 0 <= start < num_samples:
            raise IndexError(
                f"index {idx} is out of range for TextDataset with {num_samples} samples"
            )

        input_ids = self.tokens[start:start + self.seq_len]
        targets = self.tokens[start + 1:start + self.seq_len + 1]
        return input_ids, targets

# Build a toy corpus whose token IDs are simply 0..99 and window it
seq_len = 8
tokens = torch.arange(100)  # Token IDs 0-99

dataset = TextDataset(tokens, seq_len)

# One summary line each for the corpus, the window size and the sample count
print(
    f"Token IDs: {tokens[:20].tolist()}...",
    f"Sequence length: {seq_len}",
    f"Number of samples: {len(dataset)}",
    sep="\n",
)

Token IDs: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]...
Sequence length: 8
Number of samples: 92

# First window: the target row is the input row advanced by one token
input_ids, targets = dataset[0]

print(
    "Sample 0:",
    f"  Input:  {input_ids.tolist()}",
    f"  Target: {targets.tolist()}",
    sep="\n",
)
print("\n  Target is input shifted by 1 position!")

# A window from the middle of the corpus follows the same pattern
input_ids, targets = dataset[50]
print(
    "\nSample 50:",
    f"  Input:  {input_ids.tolist()}",
    f"  Target: {targets.tolist()}",
    sep="\n",
)

Sample 0:
  Input:  [0, 1, 2, 3, 4, 5, 6, 7]
  Target: [1, 2, 3, 4, 5, 6, 7, 8]

  Target is input shifted by 1 position!

Sample 50:
  Input:  [50, 51, 52, 53, 54, 55, 56, 57]
  Target: [51, 52, 53, 54, 55, 56, 57, 58]

Training a Model

Now let’s put it all together and train a tiny model:

import sys
sys.path.insert(0, '..')
from m06_transformer.transformer import create_gpt_tiny

# Seed first so the model initialisation and the random tokens are reproducible
torch.manual_seed(42)

# Tiny GPT over a 100-token vocabulary
vocab_size = 100
model = create_gpt_tiny(vocab_size=vocab_size)

# Random "training data": 5000 token IDs drawn from [0, vocab_size)
tokens = torch.randint(0, vocab_size, (5000,))

print(
    f"Model: {model.num_params:,} parameters",
    f"Training data: {len(tokens):,} tokens",
    sep="\n",
)

Model: 838,912 parameters
Training data: 5,000 tokens

# Sanity check before training: the untrained model's loss on a single window
# should be ~log(vocab_size), i.e. what random predictions would score
dataset = TextDataset(tokens, seq_len=32)
# Take the first window and give both tensors a leading batch dimension
input_ids, targets = (window.unsqueeze(0) for window in dataset[0])

model.eval()
with torch.no_grad():
    logits = model(input_ids)
    # Collapse batch and time so cross_entropy sees one row of logits per token
    B, T, V = logits.shape
    initial_loss = F.cross_entropy(logits.view(B * T, V), targets.view(B * T))

print(
    f"Initial loss: {initial_loss.item():.4f}",
    f"Initial perplexity: {math.exp(initial_loss.item()):.2f}",
    sep="\n",
)
print(f"\nExpected for random guessing: loss ~ {math.log(vocab_size):.2f}, ppl ~ {vocab_size}")

Initial loss: 4.6686
Initial perplexity: 106.55

Expected for random guessing: loss ~ 4.61, ppl ~ 100

def train_model(model, tokens, num_steps=100, batch_size=16, seq_len=32, learning_rate=3e-4):
    """Train ``model`` on next-token prediction for a fixed number of steps.

    Every step runs a forward pass, cross-entropy loss, backward pass with
    gradient-norm clipping at 1.0, an AdamW update (weight decay 0.01) and a
    ``CosineScheduler`` step (10 warmup steps, ``min_lr=1e-5``). The shuffled
    dataloader is re-iterated as often as needed to reach ``num_steps``, and a
    progress line is printed every 10 steps. The model is put in training mode.

    Args:
        model: Module mapping a batch of token IDs to logits of shape (B, T, V).
        tokens: Token-ID sequence the training windows are cut from.
        num_steps: Number of optimizer updates to perform.
        batch_size: Windows per batch; incomplete batches are dropped.
        seq_len: Tokens per training window.
        learning_rate: Learning rate handed to AdamW and the scheduler's optimizer.

    Returns:
        list[float]: The loss recorded at each step, in order.

    Raises:
        ValueError: If the data cannot fill a single batch. With
            ``drop_last=True`` the loader would then be empty and the
            step loop could never make progress.
    """
    dataset = TextDataset(tokens, seq_len)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    # Guard against an endless loop: an empty loader never advances `step`.
    if num_steps > 0 and len(dataloader) == 0:
        raise ValueError(
            f"not enough data for one batch: {len(dataset)} windows of length "
            f"{seq_len} available, batch_size={batch_size}"
        )

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    scheduler = CosineScheduler(optimizer, warmup_steps=10, total_steps=num_steps, min_lr=1e-5)

    model.train()
    losses = []
    step = 0

    # Outer loop restarts the (reshuffled) loader when an epoch ends early
    while step < num_steps:
        for input_ids, targets in dataloader:
            if step >= num_steps:
                break

            # Forward pass: flatten (B, T, V) logits to one row per token
            logits = model(input_ids)
            B, T, V = logits.shape
            loss = F.cross_entropy(logits.view(B*T, V), targets.view(B*T))

            # Backward pass, with the global gradient norm clipped to 1.0
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update weights, advance the schedule, clear gradients for the next step
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Read the scalar once; .item() forces a host sync on every call
            loss_value = loss.item()
            losses.append(loss_value)

            if step % 10 == 0:
                # Read after scheduler.step(), so this is the rate the scheduler just set
                lr = optimizer.param_groups[0]['lr']
                ppl = math.exp(loss_value)
                print(f"Step {step:3d} | Loss: {loss_value:.4f} | PPL: {ppl:.2f} | LR: {lr:.2e}")

            step += 1

    return losses

# Train for 100 steps with the default hyperparameters, then report where the
# loss ended up
print("Starting training...\n")
losses = train_model(model, tokens, num_steps=100)
print(
    f"\nFinal loss: {losses[-1]:.4f}",
    f"Final perplexity: {math.exp(losses[-1]):.2f}",
    sep="\n",
)

Starting training...

Step   0 | Loss: 4.6194 | PPL: 101.44 | LR: 0.00e+00
Step  10 | Loss: 4.6242 | PPL: 101.92 | LR: 3.00e-04
Step  20 | Loss: 4.6074 | PPL: 100.23 | LR: 2.91e-04
Step  30 | Loss: 4.5950 | PPL: 98.99 | LR: 2.66e-04
Step  40 | Loss: 4.5934 | PPL: 98.83 | LR: 2.27e-04
Step  50 | Loss: 4.6067 | PPL: 100.16 | LR: 1.80e-04
Step  60 | Loss: 4.5851 | PPL: 98.02 | LR: 1.30e-04
Step  70 | Loss: 4.5777 | PPL: 97.29 | LR: 8.25e-05
Step  80 | Loss: 4.6026 | PPL: 99.74 | LR: 4.39e-05
Step  90 | Loss: 4.5959 | PPL: 99.08 | LR: 1.87e-05

Final loss: 4.5968
Final perplexity: 99.17

# Hand the loss curve and vocabulary size to the OJS cells, which read them as
# training_losses and vocab_size_val
import json
ojs_define(training_losses=losses, vocab_size_val=vocab_size)

// Training curve visualization
trainingCurveChart = {
  const width = 750;
  const height = 300;
  const margin = { top: 35, right: 30, bottom: 45, left: 55 };

  const theme = diagramTheme;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`);

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", theme.bg)
    .attr("rx", 8);

  // Two panels: Loss and Perplexity
  const panelWidth = (width - 30) / 2;
  const innerWidth = panelWidth - margin.left - margin.right;
  const innerHeight = height - margin.top - margin.bottom;

  const losses = training_losses;
  const vocabSize = vocab_size_val;
  const randomBaseline = Math.log(vocabSize);

  // Panel 1: Loss
  const panel1 = svg.append("g")
    .attr("transform", `translate(${margin.left}, ${margin.top})`);

  const xScale1 = d3.scaleLinear()
    .domain([0, losses.length - 1])
    .range([0, innerWidth]);

  const yScale1 = d3.scaleLinear()
    .domain([0, Math.max(...losses) * 1.1])
    .range([innerHeight, 0]);

  // Grid
  [1, 2, 3, 4].forEach(tick => {
    if (tick <= Math.max(...losses) * 1.1) {
      panel1.append("line")
        .attr("x1", 0)
        .attr("x2", innerWidth)
        .attr("y1", yScale1(tick))
        .attr("y2", yScale1(tick))
        .attr("stroke", theme.nodeStroke)
        .attr("stroke-opacity", 0.3)
        .attr("stroke-dasharray", "2,2");
    }
  });

  // Random baseline line
  panel1.append("line")
    .attr("x1", 0)
    .attr("x2", innerWidth)
    .attr("y1", yScale1(randomBaseline))
    .attr("y2", yScale1(randomBaseline))
    .attr("stroke", theme.error)
    .attr("stroke-width", 1.5)
    .attr("stroke-dasharray", "5,3");

  // Loss line
  const lineGen1 = d3.line()
    .x((d, i) => xScale1(i))
    .y(d => yScale1(d))
    .curve(d3.curveMonotoneX);

  panel1.append("path")
    .datum(losses)
    .attr("d", lineGen1)
    .attr("fill", "none")
    .attr("stroke", theme.accent)
    .attr("stroke-width", 2.5);

  // X-axis
  panel1.append("g")
    .attr("transform", `translate(0, ${innerHeight})`)
    .call(d3.axisBottom(xScale1).ticks(5))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "10px"));

  panel1.append("text")
    .attr("x", innerWidth / 2)
    .attr("y", innerHeight + 35)
    .attr("text-anchor", "middle")
    .attr("font-size", "11px")
    .attr("fill", theme.nodeText)
    .text("Step");

  // Y-axis
  panel1.append("g")
    .call(d3.axisLeft(yScale1).ticks(5))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "10px"));

  panel1.append("text")
    .attr("transform", "rotate(-90)")
    .attr("x", -innerHeight / 2)
    .attr("y", -40)
    .attr("text-anchor", "middle")
    .attr("font-size", "11px")
    .attr("fill", theme.nodeText)
    .text("Loss");

  // Title
  panel1.append("text")
    .attr("x", innerWidth / 2)
    .attr("y", -12)
    .attr("text-anchor", "middle")
    .attr("font-size", "13px")
    .attr("font-weight", "600")
    .attr("fill", theme.nodeText)
    .text("Training Loss");

  // Legend
  panel1.append("line")
    .attr("x1", innerWidth - 100)
    .attr("x2", innerWidth - 80)
    .attr("y1", 15)
    .attr("y2", 15)
    .attr("stroke", theme.error)
    .attr("stroke-width", 1.5)
    .attr("stroke-dasharray", "5,3");

  panel1.append("text")
    .attr("x", innerWidth - 75)
    .attr("y", 19)
    .attr("font-size", "9px")
    .attr("fill", theme.nodeText)
    .text("Random baseline");

  // Panel 2: Perplexity
  const panel2 = svg.append("g")
    .attr("transform", `translate(${panelWidth + margin.left + 15}, ${margin.top})`);

  const perplexities = losses.map(l => Math.exp(l));

  const xScale2 = d3.scaleLinear()
    .domain([0, losses.length - 1])
    .range([0, innerWidth]);

  const yScale2 = d3.scaleLinear()
    .domain([0, Math.max(...perplexities) * 1.1])
    .range([innerHeight, 0]);

  // Grid
  [25, 50, 75, 100].forEach(tick => {
    if (tick <= Math.max(...perplexities) * 1.1) {
      panel2.append("line")
        .attr("x1", 0)
        .attr("x2", innerWidth)
        .attr("y1", yScale2(tick))
        .attr("y2", yScale2(tick))
        .attr("stroke", theme.nodeStroke)
        .attr("stroke-opacity", 0.3)
        .attr("stroke-dasharray", "2,2");
    }
  });

  // Random baseline line
  panel2.append("line")
    .attr("x1", 0)
    .attr("x2", innerWidth)
    .attr("y1", yScale2(vocabSize))
    .attr("y2", yScale2(vocabSize))
    .attr("stroke", theme.error)
    .attr("stroke-width", 1.5)
    .attr("stroke-dasharray", "5,3");

  // Perplexity line
  const lineGen2 = d3.line()
    .x((d, i) => xScale2(i))
    .y(d => yScale2(d))
    .curve(d3.curveMonotoneX);

  panel2.append("path")
    .datum(perplexities)
    .attr("d", lineGen2)
    .attr("fill", "none")
    .attr("stroke", theme.highlight)
    .attr("stroke-width", 2.5);

  // X-axis
  panel2.append("g")
    .attr("transform", `translate(0, ${innerHeight})`)
    .call(d3.axisBottom(xScale2).ticks(5))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "10px"));

  panel2.append("text")
    .attr("x", innerWidth / 2)
    .attr("y", innerHeight + 35)
    .attr("text-anchor", "middle")
    .attr("font-size", "11px")
    .attr("fill", theme.nodeText)
    .text("Step");

  // Y-axis
  panel2.append("g")
    .call(d3.axisLeft(yScale2).ticks(5))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "10px"));

  panel2.append("text")
    .attr("transform", "rotate(-90)")
    .attr("x", -innerHeight / 2)
    .attr("y", -40)
    .attr("text-anchor", "middle")
    .attr("font-size", "11px")
    .attr("fill", theme.nodeText)
    .text("Perplexity");

  // Title
  panel2.append("text")
    .attr("x", innerWidth / 2)
    .attr("y", -12)
    .attr("text-anchor", "middle")
    .attr("font-size", "13px")
    .attr("font-weight", "600")
    .attr("fill", theme.nodeText)
    .text("Training Perplexity");

  // Legend
  panel2.append("line")
    .attr("x1", innerWidth - 100)
    .attr("x2", innerWidth - 80)
    .attr("y1", 15)
    .attr("y2", 15)
    .attr("stroke", theme.error)
    .attr("stroke-width", 1.5)
    .attr("stroke-dasharray", "5,3");

  panel2.append("text")
    .attr("x", innerWidth - 75)
    .attr("y", 19)
    .attr("font-size", "9px")
    .attr("fill", theme.nodeText)
    .text("Random baseline");

  return svg.node();
}

Effect of Learning Rate

Learning rate is crucial - too high causes instability, too low is slow:

# Train with different learning rates
# Sweep: one short 50-step run per learning rate, keeping each run's loss curve.
learning_rates = [1e-5, 1e-4, 3e-4, 1e-3, 3e-3]
all_losses = {}  # learning rate -> list of per-step losses for that run

for lr in learning_rates:
    # Re-seed before building the model and the data so every run is
    # constructed from the same seed and only the learning rate changes.
    torch.manual_seed(42)
    model = create_gpt_tiny(vocab_size=100)
    tokens = torch.randint(0, 100, (3000,))

    # Train silently
    # (plain AdamW here: no scheduler and no gradient clipping)
    dataset = TextDataset(tokens, seq_len=32)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True, drop_last=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    model.train()
    losses = []
    step = 0

    # Outer loop restarts the loader if it is exhausted before 50 steps
    while step < 50:
        for input_ids, targets in dataloader:
            if step >= 50:
                break
            logits = model(input_ids)
            # Flatten (B, T, V) logits to one row per token for cross_entropy
            B, T, V = logits.shape
            loss = F.cross_entropy(logits.view(B*T, V), targets.view(B*T))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            losses.append(loss.item())
            step += 1

    all_losses[lr] = losses
    # Report the loss of the final step and the perplexity it corresponds to
    print(f"LR={lr:.0e}: final_loss={losses[-1]:.3f}, final_ppl={math.exp(losses[-1]):.1f}")

LR=1e-05: final_loss=4.637, final_ppl=103.2
LR=1e-04: final_loss=4.615, final_ppl=101.0
LR=3e-04: final_loss=4.613, final_ppl=100.8
LR=1e-03: final_loss=4.639, final_ppl=103.4
LR=3e-03: final_loss=4.634, final_ppl=103.0

# Pass learning rate comparison data to OJS.
# The float-keyed dict is flattened to a list of {"lr", "losses"} records so
# it can be serialized as JSON.
lr_comparison_data = [
    dict(lr=rate, losses=curve) for rate, curve in all_losses.items()
]
ojs_define(lr_comparison=lr_comparison_data)

// Learning rate comparison chart
lrComparisonChart = {
  const width = 700;
  const height = 380;
  const margin = { top: 40, right: 100, bottom: 50, left: 60 };
  const innerWidth = width - margin.left - margin.right;
  const innerHeight = height - margin.top - margin.bottom;

  const theme = diagramTheme;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`);

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", theme.bg)
    .attr("rx", 8);

  const chart = svg.append("g")
    .attr("transform", `translate(${margin.left}, ${margin.top})`);

  // Color scale for different learning rates - uses theme colors
  const colors = [
    theme.edgeStroke,  // gray - too low
    theme.accent,      // blue - low
    theme.success,     // green - optimal
    theme.highlight,   // orange - high
    theme.error        // red - too high
  ];

  // Find max values
  const allLosses = lr_comparison.flatMap(d => d.losses);
  const maxLoss = Math.max(...allLosses);
  const maxSteps = Math.max(...lr_comparison.map(d => d.losses.length));

  // Scales
  const xScale = d3.scaleLinear()
    .domain([0, maxSteps - 1])
    .range([0, innerWidth]);

  const yScale = d3.scaleLinear()
    .domain([0, Math.min(maxLoss * 1.1, 10)])
    .range([innerHeight, 0]);

  // Grid
  [2, 4, 6, 8].forEach(tick => {
    chart.append("line")
      .attr("x1", 0)
      .attr("x2", innerWidth)
      .attr("y1", yScale(tick))
      .attr("y2", yScale(tick))
      .attr("stroke", theme.nodeStroke)
      .attr("stroke-opacity", 0.3)
      .attr("stroke-dasharray", "2,2");
  });

  // Line generator
  const lineGen = d3.line()
    .x((d, i) => xScale(i))
    .y(d => yScale(Math.min(d, 10)))
    .curve(d3.curveMonotoneX);

  // Draw lines for each learning rate
  lr_comparison.forEach((lrData, idx) => {
    const color = colors[idx % colors.length];

    chart.append("path")
      .datum(lrData.losses)
      .attr("d", lineGen)
      .attr("fill", "none")
      .attr("stroke", color)
      .attr("stroke-width", 2.5)
      .attr("opacity", 0.9);
  });

  // X-axis
  chart.append("g")
    .attr("transform", `translate(0, ${innerHeight})`)
    .call(d3.axisBottom(xScale).ticks(6))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "11px"));

  chart.append("text")
    .attr("x", innerWidth / 2)
    .attr("y", innerHeight + 40)
    .attr("text-anchor", "middle")
    .attr("font-size", "12px")
    .attr("fill", theme.nodeText)
    .text("Step");

  // Y-axis
  chart.append("g")
    .call(d3.axisLeft(yScale).ticks(5))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "11px"));

  chart.append("text")
    .attr("transform", "rotate(-90)")
    .attr("x", -innerHeight / 2)
    .attr("y", -45)
    .attr("text-anchor", "middle")
    .attr("font-size", "12px")
    .attr("fill", theme.nodeText)
    .text("Loss");

  // Title
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 24)
    .attr("text-anchor", "middle")
    .attr("font-size", "14px")
    .attr("font-weight", "600")
    .attr("fill", theme.nodeText)
    .text("Training Loss for Different Learning Rates");

  // Legend
  const legend = svg.append("g")
    .attr("transform", `translate(${width - margin.right + 15}, ${margin.top + 20})`);

  lr_comparison.forEach((lrData, idx) => {
    const y = idx * 22;
    const color = colors[idx % colors.length];
    const lrStr = lrData.lr.toExponential(0);

    legend.append("line")
      .attr("x1", 0)
      .attr("x2", 20)
      .attr("y1", y)
      .attr("y2", y)
      .attr("stroke", color)
      .attr("stroke-width", 2.5);

    legend.append("text")
      .attr("x", 25)
      .attr("y", y + 4)
      .attr("font-size", "10px")
      .attr("fill", theme.nodeText)
      .text(`LR=${lrStr}`);
  });

  return svg.node();
}

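Observations (the toy tokens here are uniformly random, so every run stays close to the random-guess loss of ln(100) ≈ 4.61; these are the patterns to look for on real data):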

Too low (1e-5): Training is very slow
Just right (3e-4): Smooth, fast convergence
Too high (3e-3): Unstable, loss may spike or diverge

Checkpointing

Save regularly! Training can crash. Here’s what to save:

# Demonstrate checkpointing
import json
from pathlib import Path

def save_checkpoint(model, optimizer, step, loss, path):
    """Write model and optimizer state plus the step and loss to `path`.

    The file is a single dict with the keys 'model_state_dict',
    'optimizer_state_dict', 'step' and 'loss', written with torch.save.
    A confirmation line is printed afterwards.
    """
    torch.save(
        dict(
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
            step=step,
            loss=loss,
        ),
        path,
    )
    print(f"Checkpoint saved to {path}")

def load_checkpoint(model, optimizer, path, map_location=None):
    """Load a training checkpoint.

    Restores `model` and `optimizer` in place from a dict holding the keys
    'model_state_dict', 'optimizer_state_dict', 'step' and 'loss', and prints
    a short summary.

    Args:
        model: module whose load_state_dict receives the saved model state.
        optimizer: optimizer whose load_state_dict receives the saved state.
        path: checkpoint file to read.
        map_location: forwarded to torch.load. Pass e.g. "cpu" to restore a
            checkpoint that was saved on a device this machine does not have.
            The default (None) keeps torch.load's normal behavior.

    Returns:
        (step, loss) as stored in the checkpoint.
    """
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects, which can execute code. Only load checkpoints you created or
    # trust; for files from elsewhere prefer weights_only=True.
    checkpoint = torch.load(path, map_location=map_location, weights_only=False)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print(f"Checkpoint loaded from {path}")
    print(f"  Step: {checkpoint['step']}, Loss: {checkpoint['loss']:.4f}")
    return checkpoint['step'], checkpoint['loss']

# Save example
# The step and loss values here are placeholders: no training is run first.
model = create_gpt_tiny(vocab_size=100)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
save_checkpoint(model, optimizer, step=50, loss=2.5, path="demo_checkpoint.pt")

# Load example
# A second, freshly built model/optimizer pair receives the saved state.
model2 = create_gpt_tiny(vocab_size=100)
optimizer2 = torch.optim.AdamW(model2.parameters(), lr=3e-4)
step, loss = load_checkpoint(model2, optimizer2, "demo_checkpoint.pt")

# Clean up
# Remove the demo file so the working directory is left as it was.
Path("demo_checkpoint.pt").unlink()

Checkpoint saved to demo_checkpoint.pt
Checkpoint loaded from demo_checkpoint.pt
  Step: 50, Loss: 2.5000

Validation and Early Stopping

Monitor validation loss to detect overfitting:

// Early Stopping Controls
// Slider for the epoch currently being inspected (integer, 1 to 50, starting at 1).
viewof esCurrentEpoch = Inputs.range([1, 50], {
  value: 1,
  step: 1,
  label: "Current Epoch"
})

earlyStoppingData = {
  const epochs = 50;
  const data = [];

  // Training loss: exponential decay with noise
  // Validation loss: decreases then increases (U-shape)
  const bestEpoch = 25; // Where validation loss is lowest

  for (let epoch = 1; epoch <= epochs; epoch++) {
    // Training loss: smooth exponential decay
    const trainLoss = 2.5 * Math.exp(-0.08 * epoch) + 0.3 + 0.05 * Math.sin(epoch * 0.5);

    // Validation loss: U-shaped curve
    // Decreases initially, then increases (overfitting)
    const valBase = 2.5 * Math.exp(-0.06 * epoch) + 0.4;
    const overfitComponent = epoch > bestEpoch ? 0.02 * Math.pow(epoch - bestEpoch, 1.3) : 0;
    const valLoss = valBase + overfitComponent + 0.03 * Math.sin(epoch * 0.7 + 1);

    data.push({
      epoch,
      trainLoss,
      valLoss,
      gap: valLoss - trainLoss
    });
  }

  return data;
}

// Epoch with the lowest validation loss (the first such epoch on ties;
// falls back to 1 when no record beats Infinity).
bestModelEpoch = earlyStoppingData.reduce(
  (best, row) => (row.valLoss < best.valLoss ? row : best),
  { epoch: 1, valLoss: Infinity }
).epoch

// Current epoch data
currentEpochData = {
  const current = earlyStoppingData.find(d => d.epoch === esCurrentEpoch);
  return current || earlyStoppingData[0];
}

// Training phase detection
trainingPhase = {
  if (esCurrentEpoch < bestModelEpoch - 5) return "learning";
  if (esCurrentEpoch <= bestModelEpoch + 2) return "optimal";
  return "overfitting";
}

// Early Stopping Visualization
// Anonymous cell that builds and returns an SVG showing the train/validation
// curves up to the selected epoch, the best-checkpoint marker, the current
// epoch marker, a legend and a status panel.
// Reads: esCurrentEpoch, earlyStoppingData, bestModelEpoch, currentEpochData,
// trainingPhase and diagramTheme.
// Elements are appended in paint order (later elements draw on top), so the
// order of the statements below is significant.
{
  const theme = diagramTheme;
  const width = 700;
  const height = 400;
  const margin = { top: 40, right: 150, bottom: 60, left: 70 };
  const innerWidth = width - margin.left - margin.right;
  const innerHeight = height - margin.top - margin.bottom;

  const svg = d3.create("svg")
    .attr("viewBox", `0 0 ${width} ${height}`)
    .attr("width", "100%")
    .attr("height", height)
    .style("max-width", `${width}px`)
    .style("font-family", "'JetBrains Mono', 'Fira Code', monospace");

  const defs = svg.append("defs");

  // Background gradient
  const bgGradient = defs.append("linearGradient")
    .attr("id", "es-bg-gradient")
    .attr("x1", "0%")
    .attr("y1", "0%")
    .attr("x2", "0%")
    .attr("y2", "100%");

  bgGradient.append("stop")
    .attr("offset", "0%")
    .attr("stop-color", theme.bg);

  bgGradient.append("stop")
    .attr("offset", "100%")
    .attr("stop-color", theme.bgSecondary);

  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", "url(#es-bg-gradient)")
    .attr("rx", 12);

  // All plot content lives in this group, offset by the left/top margins.
  const chart = svg.append("g")
    .attr("transform", `translate(${margin.left}, ${margin.top})`);

  // Scales
  // The x-domain matches the slider range (1..50); the y-domain is fixed at 0..3.
  const xScale = d3.scaleLinear()
    .domain([1, 50])
    .range([0, innerWidth]);

  const yScale = d3.scaleLinear()
    .domain([0, 3])
    .range([innerHeight, 0]);

  // Overfitting region highlight
  // Spans from the best epoch to the right edge; more opaque while the
  // selected epoch is in the "overfitting" phase.
  chart.append("rect")
    .attr("x", xScale(bestModelEpoch))
    .attr("y", 0)
    .attr("width", innerWidth - xScale(bestModelEpoch))
    .attr("height", innerHeight)
    .attr("fill", theme.error)
    .attr("opacity", trainingPhase === "overfitting" ? 0.15 : 0.05);

  // Optimal zone highlight
  // Covers bestModelEpoch - 5 (clamped to epoch 1) through bestModelEpoch + 2,
  // the same window trainingPhase labels "optimal".
  chart.append("rect")
    .attr("x", xScale(Math.max(1, bestModelEpoch - 5)))
    .attr("y", 0)
    .attr("width", xScale(bestModelEpoch + 2) - xScale(Math.max(1, bestModelEpoch - 5)))
    .attr("height", innerHeight)
    .attr("fill", theme.success)
    .attr("opacity", trainingPhase === "optimal" ? 0.15 : 0.05);

  // Region labels at top
  // The label of the active phase is bold and fully opaque; the others are dimmed.
  chart.append("text")
    .attr("x", xScale(10))
    .attr("y", 15)
    .attr("text-anchor", "middle")
    .attr("font-size", "10px")
    .attr("font-weight", trainingPhase === "learning" ? "600" : "400")
    .attr("fill", theme.accent)
    .attr("opacity", trainingPhase === "learning" ? 1 : 0.5)
    .text("LEARNING");

  chart.append("text")
    .attr("x", xScale(bestModelEpoch))
    .attr("y", 15)
    .attr("text-anchor", "middle")
    .attr("font-size", "10px")
    .attr("font-weight", trainingPhase === "optimal" ? "600" : "400")
    .attr("fill", theme.success)
    .attr("opacity", trainingPhase === "optimal" ? 1 : 0.5)
    .text("OPTIMAL");

  chart.append("text")
    .attr("x", xScale(40))
    .attr("y", 15)
    .attr("text-anchor", "middle")
    .attr("font-size", "10px")
    .attr("font-weight", trainingPhase === "overfitting" ? "600" : "400")
    .attr("fill", theme.error)
    .attr("opacity", trainingPhase === "overfitting" ? 1 : 0.5)
    .text("OVERFITTING");

  // Grid lines
  const yTicks = [0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0];
  yTicks.forEach(tick => {
    chart.append("line")
      .attr("x1", 0)
      .attr("x2", innerWidth)
      .attr("y1", yScale(tick))
      .attr("y2", yScale(tick))
      .attr("stroke", theme.nodeStroke)
      .attr("stroke-opacity", 0.2)
      .attr("stroke-dasharray", "3,3");
  });

  // X-axis
  chart.append("g")
    .attr("transform", `translate(0, ${innerHeight})`)
    .call(d3.axisBottom(xScale).ticks(10))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "11px"));

  // Y-axis
  chart.append("g")
    .call(d3.axisLeft(yScale).ticks(6))
    .call(g => g.select(".domain").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", theme.nodeStroke))
    .call(g => g.selectAll(".tick text").attr("fill", theme.nodeText).attr("font-size", "11px"));

  // Axis labels
  chart.append("text")
    .attr("x", innerWidth / 2)
    .attr("y", innerHeight + 45)
    .attr("text-anchor", "middle")
    .attr("fill", theme.nodeText)
    .attr("font-size", "12px")
    .attr("font-weight", "500")
    .text("Epoch");

  chart.append("text")
    .attr("x", -innerHeight / 2)
    .attr("y", -50)
    .attr("transform", "rotate(-90)")
    .attr("text-anchor", "middle")
    .attr("fill", theme.nodeText)
    .attr("font-size", "12px")
    .attr("font-weight", "500")
    .text("Loss");

  // Filter data up to current epoch
  // Only the already "trained" epochs are drawn, so the curves grow with the slider.
  const visibleData = earlyStoppingData.filter(d => d.epoch <= esCurrentEpoch);

  // Gap fill between curves (overfitting visualization)
  const gapArea = d3.area()
    .x(d => xScale(d.epoch))
    .y0(d => yScale(d.trainLoss))
    .y1(d => yScale(d.valLoss))
    .curve(d3.curveMonotoneX);

  chart.append("path")
    .datum(visibleData)
    .attr("d", gapArea)
    .attr("fill", theme.error)
    .attr("opacity", 0.1);

  // Training loss line
  const trainLine = d3.line()
    .x(d => xScale(d.epoch))
    .y(d => yScale(d.trainLoss))
    .curve(d3.curveMonotoneX);

  chart.append("path")
    .datum(visibleData)
    .attr("d", trainLine)
    .attr("fill", "none")
    .attr("stroke", theme.accent)
    .attr("stroke-width", 3)
    .attr("stroke-linecap", "round");

  // Validation loss line
  const valLine = d3.line()
    .x(d => xScale(d.epoch))
    .y(d => yScale(d.valLoss))
    .curve(d3.curveMonotoneX);

  chart.append("path")
    .datum(visibleData)
    .attr("d", valLine)
    .attr("fill", "none")
    .attr("stroke", theme.highlight)
    .attr("stroke-width", 3)
    .attr("stroke-linecap", "round");

  // Best model marker (vertical line at best epoch)
  // Shown only once the slider has reached the best epoch.
  if (esCurrentEpoch >= bestModelEpoch) {
    const bestData = earlyStoppingData.find(d => d.epoch === bestModelEpoch);

    chart.append("line")
      .attr("x1", xScale(bestModelEpoch))
      .attr("x2", xScale(bestModelEpoch))
      .attr("y1", 0)
      .attr("y2", innerHeight)
      .attr("stroke", theme.success)
      .attr("stroke-width", 2)
      .attr("stroke-dasharray", "6,4");

    // Best model point marker
    chart.append("circle")
      .attr("cx", xScale(bestModelEpoch))
      .attr("cy", yScale(bestData.valLoss))
      .attr("r", 8)
      .attr("fill", theme.success)
      .attr("stroke", "#fff")
      .attr("stroke-width", 2);

    // Star/checkpoint icon
    chart.append("text")
      .attr("x", xScale(bestModelEpoch))
      .attr("y", yScale(bestData.valLoss) + 1)
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", "#fff")
      .attr("font-size", "10px")
      .attr("font-weight", "bold")
      .text("★");

    // Label for best model
    chart.append("text")
      .attr("x", xScale(bestModelEpoch))
      .attr("y", yScale(bestData.valLoss) - 18)
      .attr("text-anchor", "middle")
      .attr("fill", theme.success)
      .attr("font-size", "11px")
      .attr("font-weight", "600")
      .text("SAVE CHECKPOINT");
  }

  // Current epoch marker
  const currentX = xScale(esCurrentEpoch);
  const currentTrainY = yScale(currentEpochData.trainLoss);
  const currentValY = yScale(currentEpochData.valLoss);

  // Vertical line at current epoch
  chart.append("line")
    .attr("x1", currentX)
    .attr("x2", currentX)
    .attr("y1", 0)
    .attr("y2", innerHeight)
    .attr("stroke", theme.nodeText)
    .attr("stroke-width", 1)
    .attr("stroke-opacity", 0.4)
    .attr("stroke-dasharray", "4,4");

  // Gap indicator arrow
  // Drawn only when validation loss exceeds training loss by more than 0.1.
  if (currentEpochData.gap > 0.1) {
    const midY = (currentTrainY + currentValY) / 2;

    // Gap line
    chart.append("line")
      .attr("x1", currentX + 8)
      .attr("x2", currentX + 8)
      .attr("y1", currentTrainY)
      .attr("y2", currentValY)
      .attr("stroke", theme.error)
      .attr("stroke-width", 2);

    // Gap label
    chart.append("text")
      .attr("x", currentX + 18)
      .attr("y", midY)
      .attr("dominant-baseline", "central")
      .attr("fill", theme.error)
      .attr("font-size", "10px")
      .attr("font-weight", "500")
      .text(`Gap: ${currentEpochData.gap.toFixed(2)}`);
  }

  // Current points
  chart.append("circle")
    .attr("cx", currentX)
    .attr("cy", currentTrainY)
    .attr("r", 6)
    .attr("fill", theme.accent)
    .attr("stroke", "#fff")
    .attr("stroke-width", 2);

  chart.append("circle")
    .attr("cx", currentX)
    .attr("cy", currentValY)
    .attr("r", 6)
    .attr("fill", theme.highlight)
    .attr("stroke", "#fff")
    .attr("stroke-width", 2);

  // Legend
  // Positioned in the right margin, just past the plot area.
  const legendX = innerWidth + 20;
  const legendY = 40;

  // Training loss legend
  chart.append("line")
    .attr("x1", legendX)
    .attr("x2", legendX + 25)
    .attr("y1", legendY)
    .attr("y2", legendY)
    .attr("stroke", theme.accent)
    .attr("stroke-width", 3);

  chart.append("text")
    .attr("x", legendX + 32)
    .attr("y", legendY)
    .attr("dominant-baseline", "central")
    .attr("fill", theme.nodeText)
    .attr("font-size", "11px")
    .text("Train Loss");

  // Validation loss legend
  chart.append("line")
    .attr("x1", legendX)
    .attr("x2", legendX + 25)
    .attr("y1", legendY + 25)
    .attr("y2", legendY + 25)
    .attr("stroke", theme.highlight)
    .attr("stroke-width", 3);

  chart.append("text")
    .attr("x", legendX + 32)
    .attr("y", legendY + 25)
    .attr("dominant-baseline", "central")
    .attr("fill", theme.nodeText)
    .attr("font-size", "11px")
    .text("Val Loss");

  // Best model legend
  chart.append("circle")
    .attr("cx", legendX + 12)
    .attr("cy", legendY + 55)
    .attr("r", 6)
    .attr("fill", theme.success);

  chart.append("text")
    .attr("x", legendX + 32)
    .attr("y", legendY + 55)
    .attr("dominant-baseline", "central")
    .attr("fill", theme.nodeText)
    .attr("font-size", "11px")
    .text("Best Model");

  // Status panel
  // Boxed readout below the legend: epoch, both losses and the current phase.
  const statusY = legendY + 90;

  chart.append("rect")
    .attr("x", legendX - 5)
    .attr("y", statusY - 5)
    .attr("width", 115)
    .attr("height", 75)
    .attr("rx", 6)
    .attr("fill", theme.bgSecondary)
    .attr("stroke", theme.nodeStroke)
    .attr("stroke-width", 1);

  chart.append("text")
    .attr("x", legendX + 5)
    .attr("y", statusY + 12)
    .attr("fill", theme.nodeText)
    .attr("font-size", "10px")
    .attr("opacity", 0.7)
    .text(`Epoch: ${esCurrentEpoch}`);

  chart.append("text")
    .attr("x", legendX + 5)
    .attr("y", statusY + 28)
    .attr("fill", theme.accent)
    .attr("font-size", "10px")
    .text(`Train: ${currentEpochData.trainLoss.toFixed(3)}`);

  chart.append("text")
    .attr("x", legendX + 5)
    .attr("y", statusY + 44)
    .attr("fill", theme.highlight)
    .attr("font-size", "10px")
    .text(`Val: ${currentEpochData.valLoss.toFixed(3)}`);

  // The phase text uses the same color as its region label above.
  const phaseColor = trainingPhase === "learning" ? theme.accent :
                     trainingPhase === "optimal" ? (theme.success) :
                     (theme.error);

  chart.append("text")
    .attr("x", legendX + 5)
    .attr("y", statusY + 60)
    .attr("fill", phaseColor)
    .attr("font-size", "10px")
    .attr("font-weight", "600")
    .text(trainingPhase.toUpperCase());

  return svg.node();
}

Tips:

- Monitor validation loss, not just training loss
- Save the model with the best validation loss
- Consider early stopping if validation loss increases consistently

Scaling Laws: Compute-Optimal Training

Everything so far answers how to train. This section answers the question that comes before the first step: given a fixed compute budget, how big should the model be, and how many tokens should it see? The surprising answer is that this is predictable — model loss follows smooth power laws in size, data, and compute, so you can plan the run instead of guessing.

The Compute Rule: `C ≈ 6ND`

The compute (in FLOPs) of one training run is captured by a famously simple approximation:

\[ C \approx 6 \cdot N \cdot D \]

where \(N\) is the number of parameters and \(D\) is the number of training tokens. The 6 counts roughly 2 FLOPs per parameter per token in the forward pass and 4 in the backward pass (the backward pass computes gradients w.r.t. both activations and weights). This one relation is the budget line every planning decision moves along: for a fixed \(C\), buying more parameters means buying fewer tokens, and vice versa.

from scaling import compute_flops, chinchilla_loss

# A 1B-parameter model trained on 20B tokens.
# With C ≈ 6·N·D this is 6 * 1e9 * 20e9 = 1.2e20 FLOPs.
flops = compute_flops(n_params=1e9, n_tokens=20e9)
print(f"Compute: {flops:.2e} FLOPs")
# chinchilla_loss takes the same (params, tokens) pair, passed positionally here.
print(f"Predicted loss: {chinchilla_loss(1e9, 20e9):.3f} nats/token")

Compute: 1.20e+20 FLOPs
Predicted loss: 2.580 nats/token

From Kaplan to Chinchilla

Two landmark papers shaped how the field spends compute:

Kaplan et al. (2020) measured that test loss falls as a power law in \(N\), \(D\), and \(C\), and concluded that most extra compute should go into bigger models — data could lag behind.
Hoffmann et al. (2022), “Chinchilla” re-fit the curves more carefully and found Kaplan’s recipe left models badly under-trained. Compute-optimal training grows parameters and tokens together — a rule of thumb of about 20 tokens per parameter. Their 70B-parameter Chinchilla, trained on 1.4T tokens, beat the 280B-parameter Gopher trained on only 300B tokens — using the same compute.

Chinchilla summarized the loss surface with a parametric fit:

\[ L(N, D) = E + \frac{A}{N^{\alpha}} + \frac{B}{D^{\beta}} \]

\(E\) is the irreducible loss (the entropy of natural text you can never beat); the two power-law terms shrink as you add parameters or data. scaling.py implements this fit and finds the loss-minimizing split of any compute budget by a ternary search along the \(C = 6ND\) constraint.

from scaling import compute_optimal_allocation, chinchilla_rule_allocation

# Uses compute_flops imported in the earlier scaling cell.
C = compute_flops(7e10, 1.4e12)          # Chinchilla's own compute budget
# Split the budget with the fixed 20-tokens-per-parameter heuristic.
rule = chinchilla_rule_allocation(C, tokens_per_param=20)
print(f"20x rule of thumb -> {rule['n_params']/1e9:.0f}B params, "
      f"{rule['n_tokens']/1e12:.1f}T tokens")   # recovers Chinchilla exactly

# Split the same budget using the parametric loss fit instead of the fixed ratio.
opt = compute_optimal_allocation(C)
print(f"Parametric optimum -> {opt['n_params']/1e9:.0f}B params, "
      f"{opt['n_tokens']/1e12:.1f}T tokens, "
      f"{opt['tokens_per_param']:.0f} tokens/param")

20x rule of thumb -> 70B params, 1.4T tokens
Parametric optimum -> 32B params, 3.0T tokens, 93 tokens/param

Notice the parametric optimum’s ratio is above 20 — the fit implies the compute-optimal tokens-per-parameter ratio drifts upward with scale. That is one reason modern models are trained far past 20× (Llama-3’s 8B model saw ~15T tokens — about 1875 tokens per parameter): once training compute is spent, a smaller-but-longer-trained model is cheaper to serve.

The Undertraining Story, Quantified

The clearest way to see scaling laws at work is to place real models against the compute-optimal frontier — the lowest loss achievable at each compute budget. A model with too many parameters for its token budget sits above the frontier: it wasted FLOPs it could have spent on data.

from scaling import demonstrate_scaling

# The call prints the comparison table; the return value is assigned to `_`
# so the notebook does not echo it as well.
_ = demonstrate_scaling()

========================================================================
REAL MODELS vs THE COMPUTE-OPTIMAL FRONTIER (Chinchilla fit)
========================================================================
model                tok/param     loss  frontier   excess
GPT-3 (2020)               1.7    2.002     1.954    0.048
Gopher (2021)              1.1    1.993     1.936    0.058
Chinchilla (2022)         20.0    1.937     1.930    0.007
Llama-3 8B (2024)       1875.0    1.949     1.923    0.026

Kaplan (2020): spend compute on bigger models.
Chinchilla (2022): grow params and tokens together (~20 tok/param as
a rule of thumb) — the parameter-heavy models above were undertrained.

The excess column is how much loss each model gives up versus a compute-optimal model using identical FLOPs. Gopher and GPT-3 — parameter-heavy, at only 1–2 tokens per parameter — give up the most; Chinchilla sits almost on the frontier.

Interactive: Compute-Optimal Explorer

Slide the compute budget and read off the compute-optimal model (via the 20× rule). The chart plots the loss frontier against compute on a log-compute axis, with four real models placed where their own compute and loss land — the ones floating above the line are the under-trained ones.

// Slider for the compute budget, expressed as its base-10 exponent
// (10^18 to 10^26 FLOPs). The readout and chart cells both read this value.
viewof scalingBudgetExp = Inputs.range([18, 26], {
  value: 23.77, step: 0.1, label: "log₁₀ compute budget (FLOPs)"
})

scalingReadout = {
  // The 20x rule of thumb: N = sqrt(C / (6 * 20)), D = 20 * N.
  const C = 10 ** scalingBudgetExp;
  const N = Math.sqrt(C / (6 * 20));
  const D = 20 * N;
  const fmt = x => x >= 1e12 ? (x / 1e12).toFixed(2) + "T"
    : x >= 1e9 ? (x / 1e9).toFixed(2) + "B"
    : (x / 1e6).toFixed(0) + "M";
  const label = `${fmt(N)} params · ${fmt(D)} tokens · 20 tokens/param`;
  return html`<div style="font-family: monospace; font-size: 14px; margin: 8px 0 4px; color: ${diagramTheme.nodeText};">
    Compute-optimal model at <strong>10<sup>${scalingBudgetExp.toFixed(1)}</sup></strong> FLOPs:
    &nbsp;<span style="color: ${diagramTheme.highlight}; font-weight: 600;">${label}</span>
  </div>`;
}

scalingChart = {
  // Interpolate the frontier loss at the selected budget for the marker.
  const budgetLoss = (() => {
    const xs = scalingFrontier;
    const x = scalingBudgetExp;
    for (let i = 1; i < xs.length; i++) {
      if (xs[i].logC >= x) {
        const a = xs[i - 1], b = xs[i];
        const t = (x - a.logC) / (b.logC - a.logC);
        return a.loss + t * (b.loss - a.loss);
      }
    }
    return xs[xs.length - 1].loss;
  })();

  return Plot.plot({
    width: 720,
    height: 440,
    marginLeft: 56,
    marginBottom: 46,
    style: { background: "transparent", color: diagramTheme.nodeText, fontSize: "13px" },
    x: { label: "log₁₀ training compute (FLOPs) →", grid: true },
    y: { label: "↑ loss (nats/token)", grid: true },
    marks: [
      Plot.line(scalingFrontier, {
        x: "logC", y: "loss",
        stroke: diagramTheme.accent, strokeWidth: 2.5
      }),
      Plot.ruleX([scalingBudgetExp], { stroke: diagramTheme.highlight, strokeDasharray: "4,4" }),
      Plot.dot([{ logC: scalingBudgetExp, loss: budgetLoss }], {
        fill: diagramTheme.highlight, r: 6
      }),
      Plot.dot(scalingModels, {
        x: "logC", y: "loss", r: 6,
        fill: diagramTheme.nodeFill, stroke: diagramTheme.edgeStroke, strokeWidth: 1.5
      }),
      Plot.text(scalingModels, {
        x: "logC", y: "loss", text: "name",
        dy: -12, fontSize: 11, fill: diagramTheme.nodeText
      }),
      Plot.text([{ logC: scalingFrontier[2].logC, loss: scalingFrontier[2].loss }], {
        text: ["compute-optimal frontier"], dy: -10, dx: 40,
        fontSize: 11, fill: diagramTheme.accent
      })
    ]
  });
}

Key Insight

Under-training is spending compute on parameters instead of data. A model that is too large for its token budget sits above the compute-optimal frontier — it would have reached a lower loss, at the same FLOPs, as a smaller model trained on more tokens. Chinchilla’s headline result was that most large models of its era (Gopher, GPT-3, MT-NLG) were on the wrong side of that line.

“20 Tokens per Parameter” Is a Rule of Thumb, Not a Law

The 20× ratio comes from Chinchilla’s compute-optimal fit at their scale. The parametric loss law implies the optimal ratio grows with compute, and — just as important — compute-optimal training only minimizes training cost. When a model will be served to many users, it is often worth over-training a smaller model far past 20× (Llama-3 8B: ~1875 tokens/param) to cut inference cost. Use scaling laws to plan, but pick the ratio for your deployment, not a magic number.

Training Tips

Quick Reference Table

Symptom	Likely Cause	Solution
Loss = NaN	LR too high	Reduce LR by 10x
Loss stuck	LR too low	Increase LR by 2-5x
Loss oscillates	Batch too small	Use gradient accumulation
Overfitting	Not enough data	More data, more dropout
Underfitting	Model too small	More layers/heads/dims
Slow training	No GPU/MPS	Use hardware acceleration
OOM errors	Batch too large	Reduce batch size, use accumulation
Training crash	No checkpoints	Save every N steps

Hyperparameter Recommendations

Based on published research and common practices:

Hyperparameter	Small Models (<1B)	Large Models (>1B)
Learning rate	1e-4 to 6e-4	1e-4 to 3e-4
Warmup	1-2% of steps	0.1-1% of steps
Weight decay	0.01 - 0.1	0.01 - 0.1
Beta1	0.9	0.9
Beta2	0.999	0.95
Batch size	256 - 1024 tokens	1M - 4M tokens
Gradient clip	1.0	1.0

Memory Optimization Strategies

Gradient accumulation: Simulate larger batches
Mixed precision (fp16/bf16): ~50% memory reduction
Gradient checkpointing: Trade compute for memory
FSDP/DeepSpeed: Shard model across GPUs

Interactive Exploration

Experiment with learning rate schedules in real-time. Adjust the hyperparameters to see how warmup and cosine decay shape the learning rate curve.

/**
 * Sample the warmup + cosine-decay learning-rate schedule for plotting.
 *
 * At most 501 points are produced (every step when totalSteps <= 500,
 * otherwise evenly spaced steps), always including step 0 and step totalSteps.
 * Each learning rate comes from getLrAtStep, so the curve and the
 * "current step" readout can never disagree.
 *
 * @param {number} maxLr - Peak learning rate reached at the end of warmup.
 * @param {number} minLr - Floor learning rate reached at totalSteps.
 * @param {number} warmupSteps - Number of linear warmup steps.
 * @param {number} totalSteps - Total number of training steps.
 * @returns {{step: number, lr: number, phase: string}[]} Sampled schedule;
 *   phase is "warmup" while step < warmupSteps, otherwise "decay".
 */
function computeSchedule(maxLr, minLr, warmupSteps, totalSteps) {
  const numPoints = Math.min(totalSteps, 500); // Limit points for performance
  // Guard the 0 / 0 case: with totalSteps === 0 the single sample is step 0
  // instead of NaN.
  const stepSize = numPoints > 0 ? totalSteps / numPoints : 0;

  const lrs = [];
  for (let i = 0; i <= numPoints; i++) {
    const step = Math.floor(i * stepSize);
    lrs.push({
      step,
      lr: getLrAtStep(step, maxLr, minLr, warmupSteps, totalSteps),
      phase: step < warmupSteps ? "warmup" : "decay"
    });
  }

  return lrs;
}

/**
 * Learning rate at a specific step of the warmup + cosine-decay schedule.
 *
 * - step < warmupSteps: linear ramp from 0 towards maxLr.
 * - step >= totalSteps: minLr.
 * - otherwise: cosine interpolation from maxLr (at warmupSteps) down to minLr.
 *
 * The warmup test runs first, so when warmupSteps exceeds totalSteps the ramp
 * value is returned even for steps past totalSteps.
 *
 * @param {number} step - Training step to evaluate.
 * @param {number} maxLr - Peak learning rate.
 * @param {number} minLr - Floor learning rate.
 * @param {number} warmupSteps - Number of linear warmup steps.
 * @param {number} totalSteps - Total number of training steps.
 * @returns {number} Learning rate at `step`.
 */
function getLrAtStep(step, maxLr, minLr, warmupSteps, totalSteps) {
  if (step < warmupSteps) {
    // Linear warmup (Math.max avoids dividing by zero)
    return maxLr * step / Math.max(1, warmupSteps);
  }
  if (step >= totalSteps) {
    return minLr;
  }
  // Cosine decay: progress runs 0 -> 1 over the post-warmup steps.
  const progress = (step - warmupSteps) / Math.max(1, totalSteps - warmupSteps);
  const cosine = 0.5 * (1 + Math.cos(Math.PI * progress));
  return minLr + (maxLr - minLr) * cosine;
}

// Peak learning rate (1e-5 to 1e-2), shown in exponential notation.
viewof maxLr = Inputs.range([1e-5, 1e-2], {
  value: 1e-3,
  step: 1e-5,
  label: "Max Learning Rate",
  format: x => x.toExponential(1)
})

// Floor learning rate at the end of the decay (0 to 1e-4).
viewof minLr = Inputs.range([0, 1e-4], {
  value: 1e-5,
  step: 1e-6,
  label: "Min Learning Rate",
  format: x => x.toExponential(1)
})

// Number of linear warmup steps (0 to 500, in increments of 10).
viewof warmupSteps = Inputs.range([0, 500], {
  value: 100,
  step: 10,
  label: "Warmup Steps"
})

// Total length of the schedule (100 to 2000 steps, in increments of 50).
viewof totalSteps = Inputs.range([100, 2000], {
  value: 1000,
  step: 50,
  label: "Total Steps"
})

// Step to inspect. Its range and initial value depend on totalSteps, so this
// control is rebuilt (and reset to the midpoint) whenever totalSteps changes.
viewof currentStep = Inputs.range([0, totalSteps], {
  value: Math.floor(totalSteps / 2),
  step: 1,
  label: "Current Step"
})

// Widget theme - uses diagramTheme from _diagram-lib.qmd which already handles dark mode
theme = {
  const t = diagramTheme;
  return {
    warmupBg: t.isDark ? 'rgba(251, 146, 60, 0.15)' : 'rgba(249, 115, 22, 0.1)',
    curveStroke: t.accent,
    warmupMarker: t.highlight,
    currentMarker: t.error,
    annotationText: t.highlight
  };
}

// Sampled schedule points ({ step, lr, phase }) for the curve in the plot.
scheduleData = computeSchedule(maxLr, minLr, warmupSteps, totalSteps)

// Current LR
// Learning rate at the step selected on the "Current Step" slider.
currentLr = getLrAtStep(currentStep, maxLr, minLr, warmupSteps, totalSteps)

// Warmup percentage
// Share of the run spent in warmup, as a string with one decimal place.
warmupPct = ((warmupSteps / totalSteps) * 100).toFixed(1)

Plot = import("https://esm.sh/@observablehq/plot@0.6")

Plot.plot({
  title: "Learning Rate Schedule: Warmup + Cosine Decay",
  subtitle: `Warmup: ${warmupSteps} steps (${warmupPct}%) | Peak LR: ${maxLr.toExponential(1)} | Min LR: ${minLr.toExponential(1)}`,
  width: 700,
  height: 350,
  marginLeft: 70,
  marginBottom: 50,
  x: {
    label: "Training Step →",
    domain: [0, totalSteps]
  },
  y: {
    label: "↑ Learning Rate",
    domain: [0, maxLr * 1.1],
    tickFormat: ".1e"
  },
  marks: [
    // Warmup region background
    Plot.rectY([{x1: 0, x2: warmupSteps, y: maxLr * 1.1}], {
      x1: "x1",
      x2: "x2",
      y2: "y",
      y1: 0,
      fill: theme.warmupBg,
      fillOpacity: 0.5
    }),
    // Main LR curve
    Plot.line(scheduleData, {
      x: "step",
      y: "lr",
      stroke: theme.curveStroke,
      strokeWidth: 2.5
    }),
    // Warmup end marker
    Plot.ruleX([warmupSteps], {
      stroke: theme.warmupMarker,
      strokeWidth: 2,
      strokeDasharray: "5,5"
    }),
    // Current step indicator
    Plot.ruleX([currentStep], {
      stroke: theme.currentMarker,
      strokeWidth: 2
    }),
    // Current LR point
    Plot.dot([{step: currentStep, lr: currentLr}], {
      x: "step",
      y: "lr",
      fill: theme.currentMarker,
      r: 6
    }),
    // Annotations
    Plot.text([{step: warmupSteps, lr: maxLr * 1.05}], {
      x: "step",
      y: "lr",
      text: ["← Warmup ends"],
      fill: theme.annotationText,
      fontSize: 11,
      textAnchor: "start"
    }),
    Plot.ruleY([0])
  ]
})

// Display current step info
// Readout of the selected step and its learning rate (exponential notation,
// three decimals) plus a phase tag. The checks run in order: "(warming up)"
// while currentStep < warmupSteps, else "(finished)" once currentStep >=
// totalSteps, else "(decaying)".
md`**Step ${currentStep}:** LR = **${currentLr.toExponential(3)}** ${currentStep < warmupSteps ? "(warming up)" : currentStep >= totalSteps ? "(finished)" : "(decaying)"}`

// Legend
// Inline HTML legend that reuses the chart's theme colors: warmup shading as a
// background swatch, then the warmup-end and current-step marker colors.
md`<span style="background: ${theme.warmupBg}; padding: 2px 8px; color: ${theme.nodeText}">Warmup phase</span> &nbsp; <span style="color: ${theme.warmupMarker}">┆</span> Warmup ends &nbsp; <span style="color: ${theme.currentMarker}">│</span> Current step`

Try This

Effect of warmup: Set warmup to 0, then gradually increase to 200. Notice how the curve changes from immediate peak to gradual ramp-up.
Long vs short training: Compare total_steps=500 vs total_steps=2000 with the same warmup. See how the decay rate changes.
Min LR matters: Set min_lr to 0, then to 1e-5. A nonzero floor keeps the learning rate from decaying all the way to zero, so the model can still make progress in the final steps of training.
Warmup ratio: Try warmup_steps = 1-2% of total_steps (common in practice). For 1000 steps, that’s 10-20 warmup steps.
Drag the current step slider to see the exact LR at any point in training.

Exercises

Exercise 1: Learning Rate Finder

Implement a learning rate finder that trains for a few iterations at exponentially increasing learning rates and plots loss vs learning rate.

# Your implementation here
def lr_finder(model, tokens, start_lr=1e-7, end_lr=1e-1, num_steps=100):
    """Find optimal learning rate by training with exponentially increasing LR.

    Exercise stub. Per the exercise text: train for a few iterations while the
    learning rate grows exponentially, and plot loss against learning rate.

    Args:
        model: Model to train during the sweep.
        tokens: Training data for the sweep (presumably token ids; confirm
            against the training loop used elsewhere in this module).
        start_lr: Learning rate at the start of the sweep (presumably the
            smallest value tried).
        end_lr: Learning rate at the end of the sweep (presumably the largest
            value tried).
        num_steps: Number of sweep iterations (presumably one LR value each).

    Returns:
        Not specified by the exercise; return whatever supports the
        loss-vs-LR plot, e.g. the learning rates tried and their losses.
    """
    # TODO: Implement this
    pass

Exercise 2: Custom Scheduler

Implement a linear warmup + linear decay scheduler (instead of cosine decay).

# Your implementation here
class LinearScheduler:
    """Exercise stub: linear warmup followed by linear decay (not cosine decay)."""

    def __init__(self, optimizer, warmup_steps, total_steps, min_lr=0.0):
        """Store the schedule configuration.

        Args:
            optimizer: Optimizer whose learning rate this scheduler controls
                (presumably a torch.optim-style optimizer; confirm).
            warmup_steps: Number of steps in the linear warmup phase.
            total_steps: Total number of training steps in the schedule.
            min_lr: Learning rate the linear decay ends at (presumably reached
                at total_steps; confirm against the cosine scheduler's convention).
        """
        # TODO: Implement this
        pass

    def step(self):
        """Advance the schedule by one training step (left for the reader)."""
        pass

Exercise 3: Training with Validation

Modify the training loop to:

1. Compute validation loss every N steps
2. Save the best model (lowest validation loss)
3. Implement early stopping if validation loss doesn’t improve for M steps

Exercise 4: Prove the Decoupling to Yourself

Using the from-scratch Adam and AdamW from optimizers.py, show that coupled and decoupled weight decay are not the same thing. Train two identically-initialized layers with the same lr and weight_decay, then confirm (a) they diverge with decay on, and (b) they are bit-identical with weight_decay=0.

# Your implementation here
from optimizers import Adam, AdamW

def decoupling_gap(weight_decay, steps=25, seed=0):
    """Return the max weight difference between Adam(L2) and AdamW after training.

    Exercise stub. Per the exercise text: train two identically-initialized
    layers with the same lr and weight_decay, one with the from-scratch Adam
    (coupled L2) and one with the from-scratch AdamW (decoupled decay).

    Args:
        weight_decay: Weight-decay coefficient given to both optimizers.
        steps: Number of training steps to run (presumably per optimizer).
        seed: Random seed (presumably what makes the two layers start from
            identical weights; confirm when implementing).

    Returns:
        The largest absolute difference between the two layers' weights.
        Expected to be > 0 with decay on and exactly 0 when weight_decay == 0.
    """
    # TODO: build twin nn.Linear layers, run Adam vs AdamW, return the gap
    pass

# Expect: decoupling_gap(0.3) > 0  and  decoupling_gap(0.0) == 0

Summary

This module covered:

Cross-entropy loss measures prediction quality (lower = better), with mathematical foundations in information theory
Perplexity provides an intuitive metric: exp(loss), which can be read as “choosing among N equally likely options”
Learning rate scheduling with warmup + cosine decay prevents early instability and, by shrinking the step size late in training, lets the model settle into a good minimum with small, careful updates
Optimizers from scratch — built SGD, Adam, and AdamW in optimizers.py, each matching torch.optim bit-for-bit. The Adam-vs-AdamW difference is one term’s placement: coupled L2 rides the gradient through the 1/√v̂ denominator (per-parameter, unintended), while AdamW’s decoupled decay shrinks every weight uniformly — identical when weight_decay=0, divergent otherwise
Gradient accumulation increases effective batch size without adding memory
Gradient clipping (max_norm=1.0) prevents exploding gradients, essential for transformers
Batch size tradeoffs affect memory, training dynamics, and generalization
Mixed precision & numerics — built the float number line from scratch: fp16 spends bits on precision (overflows at 65504, underflows near 6e-5), bf16 keeps fp32’s range at coarser precision, and loss scaling lifts tiny gradients out of fp16’s underflow hole (2× speed, 50% memory)
Distributed training (DDP, FSDP) scales training to multiple GPUs
Common failure modes (NaN loss, stuck training, oscillation) and their solutions
Checkpointing strategies ensure you never lose training progress
Scaling laws predict loss from compute: the C≈6ND rule plus Chinchilla’s ~20 tokens/param let you size a model and its data budget instead of guessing — and reveal that parameter-heavy models like GPT-3 and Gopher were under-trained

Key Takeaways

Always use warmup (at least 1% of steps) to stabilize early training
Monitor gradient norms alongside loss - they tell you about training stability
Start with standard hyperparameters (lr=3e-4, wd=0.01, clip=1.0), then adjust
Test your training loop on a tiny dataset first - verify it can overfit

What’s Next

Module 08: Generation uses the trained model to generate text with various decoding strategies: greedy, sampling, and top-k/top-p.

Introduction

What You’ll Learn

Prerequisites

The Training Objective

The Training Loop

Setup

Cross-Entropy Loss

Cross-Entropy from Scratch

Perplexity

Learning Rate Schedule

AdamW Optimizer

Optimizers from Scratch

Plain SGD

SGD with Momentum

Adam from Scratch

From Inline Sketch to a Tested Module

Adam vs AdamW: Why Decoupling Matters

The Optimizer Zoo, Side by Side

Gradient Accumulation

Gradient Clipping

Gradient Clipping from Scratch

Batch Size Considerations

Mixed Precision & Numerics

Anatomy of a float

Failure mode 1: overflow

Failure mode 2: underflow (and the loss-scaling fix)

The whole number line at a glance

Putting it together: the mixed-precision recipe

Distributed Training Basics

Training Stability and Failure Modes

Text Dataset

Training a Model

Effect of Learning Rate

Checkpointing

Validation and Early Stopping

Scaling Laws: Compute-Optimal Training

The Compute Rule: C ≈ 6ND

From Kaplan to Chinchilla

The Undertraining Story, Quantified

Interactive: Compute-Optimal Explorer

Training Tips

Quick Reference Table

Hyperparameter Recommendations

Memory Optimization Strategies

Interactive Exploration

Exercises

Exercise 1: Learning Rate Finder

Exercise 2: Custom Scheduler

Exercise 3: Training with Validation

Exercise 4: Prove the Decoupling to Yourself

Summary

Key Takeaways

What’s Next

The Compute Rule: `C ≈ 6ND`