Module 07: Training

Introduction

Training a language model means teaching it to predict the next token. We do this through an iterative process:

  1. Computing loss: How wrong are our predictions?
  2. Computing gradients: Which direction should we adjust weights?
  3. Updating weights: Take a small step in that direction
  4. Repeat: Until the model gets good at prediction

In this module, we’ll explore cross-entropy loss, the AdamW optimizer, learning rate scheduling, gradient accumulation, and checkpointing.

What You’ll Learn

By the end of this module, you will be able to:

  • Understand cross-entropy loss and perplexity for language models
  • Implement learning rate schedules (warmup + cosine decay)
  • Use gradient accumulation for effective larger batch sizes
  • Apply gradient clipping for training stability
  • Save and load model checkpoints

Note: This lesson demonstrates concepts interactively. The training.py file provides production-ready implementations of the same algorithms.

The Training Objective

Language models are trained with next-token prediction:

Input:    [The, cat, sat, on, the]
Target:   [cat, sat, on, the, mat]

For each position, predict the next token.

The loss function measures how well the model predicts: Cross-entropy between predicted probabilities and actual next tokens.

\[\text{loss} = -\frac{1}{N}\sum_{i=1}^{N} \log P(\text{correct\_token}_i)\]

Lower loss means the model assigns higher probability to correct tokens, which means better predictions.

The Training Loop

The training loop is the core of how neural networks learn:
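
Here is a minimal, self-contained sketch of one pass through that loop, using a stand-in linear model rather than the transformer from Module 06 (the names below are illustrative only):

import torch
import torch.nn as nn
import torch.nn.functional as F

# Tiny stand-in "language model" and data, just to make the loop concrete
model = nn.Linear(16, 100)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
x = torch.randn(4, 16)                       # a batch of inputs
targets = torch.randint(0, 100, (4,))        # the "next tokens" to predict

for step in range(3):                        # 4. repeat
    optimizer.zero_grad()                    #    clear old gradients
    logits = model(x)                        #    forward pass
    loss = F.cross_entropy(logits, targets)  # 1. how wrong are we?
    loss.backward()                          # 2. which direction to adjust?
    optimizer.step()                         # 3. take a small step
    print(f"step {step}: loss {loss.item():.4f}")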

Note: zero_grad() can be called either at the start or end of each iteration. Calling it at the start (shown above) is common because it ensures gradients are fresh before the backward pass.

Setup

import sys
import math
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np

# For reproducibility
torch.manual_seed(42)

# Display device info
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

Cross-Entropy Loss

The loss function measures how wrong our predictions are. Cross-entropy loss penalizes wrong predictions more heavily when the model is confident but incorrect.

Why cross-entropy?

  1. Probabilistic interpretation: It measures the “surprise” when the true token appears
  2. Gradient properties: Gradients are proportional to the error (predicted - actual)
  3. Information theory: Minimizing cross-entropy = maximizing likelihood of data

Mathematical formulation:

\[\text{CrossEntropy}(p, q) = -\sum_{i} p_i \log(q_i)\]

For language modeling with one-hot targets (only one correct token), this simplifies to:

\[\text{Loss} = -\log(q_{\text{correct}})\]

where \(q_{\text{correct}}\) is the probability the model assigns to the correct token.

# Example: Model predicting next token
vocab_size = 10

# Model outputs logits (raw scores)
logits = torch.tensor([
    [-1.0, 0.5, 2.0, -0.5, 1.0, 0.0, -1.5, 0.3, -0.8, 0.2]  # scores for each token
])

# True next token is index 2
target = torch.tensor([2])

# Convert to probabilities
probs = F.softmax(logits, dim=-1)

print("Logits (raw model output):")
print(f"  {logits[0].tolist()}")
print(f"\nProbabilities (after softmax):")
print(f"  {[f'{p:.3f}' for p in probs[0].tolist()]}")
print(f"\nTarget token: {target.item()}")
print(f"Probability assigned to target: {probs[0, target.item()]:.4f}")
# Cross-entropy loss
loss = F.cross_entropy(logits, target)
manual_loss = -torch.log(probs[0, target.item()])

print(f"Cross-entropy loss: {loss.item():.4f}")
print(f"Manual calculation: -log({probs[0, target.item()]:.4f}) = {manual_loss.item():.4f}")

# Perplexity
perplexity = math.exp(loss.item())
print(f"\nPerplexity: {perplexity:.2f}")

Let’s visualize how loss changes with probability:

# Loss for different predictions
probs_range = np.linspace(0.01, 0.99, 100)
losses = -np.log(probs_range)

plt.figure(figsize=(10, 4))
plt.plot(probs_range, losses)
plt.xlabel('Probability assigned to correct token')
plt.ylabel('Cross-entropy loss')
plt.title('Loss vs Probability')
plt.grid(True, alpha=0.3)
plt.axhline(y=0, color='k', linestyle='--', alpha=0.3)

# Mark some points
for p in [0.1, 0.5, 0.9]:
    plt.plot(p, -np.log(p), 'ro', markersize=10)
    plt.annotate(f'P={p}\nLoss={-np.log(p):.2f}', (p, -np.log(p)+0.5))

plt.show()
print("Higher probability -> Lower loss -> Better predictions!")

Cross-Entropy from Scratch

Before using F.cross_entropy, let’s understand what it does internally.

The Numerical Stability Problem

Softmax involves exp(x), which explodes for large x:

# The problem: exp() overflows easily
logits_big = np.array([1000.0, 1001.0, 1002.0])
print(f"exp(logits) = {np.exp(logits_big)}")  # [inf, inf, inf] - overflow!

The Fix: Log-Sum-Exp Trick

The key insight is that we can compute log-softmax stably by subtracting the maximum:

\[\log \text{softmax}(x_i) = x_i - \log\sum_j e^{x_j} = x_i - \underbrace{(m + \log\sum_j e^{x_j - m})}_{\text{logsumexp}}\]

where \(m = \max(x)\). By subtracting the max, all exponents become \(\leq 0\), avoiding overflow.

def logsumexp(x: np.ndarray, axis: int = -1, keepdims: bool = True) -> np.ndarray:
    """
    Stable log(sum(exp(x))).

    Trick: log(sum(exp(x))) = m + log(sum(exp(x - m)))
    where m = max(x). This keeps exp() arguments <= 0.
    """
    m = x.max(axis=axis, keepdims=True)
    out = m + np.log(np.exp(x - m).sum(axis=axis, keepdims=True))
    # Only drop the reduced axis if the caller asked for keepdims=False
    return out if keepdims else np.squeeze(out, axis=axis)

# Now it works!
print(f"logsumexp(logits) = {logsumexp(logits_big, keepdims=False)}")

Cross-Entropy Implementation

def cross_entropy_scratch(logits: np.ndarray, targets: np.ndarray) -> float:
    """
    Cross-entropy loss from logits.

    logits: (B, C) - raw scores for each class
    targets: (B,) - integer class labels

    Formula: loss = logsumexp(logits) - logits[correct_class]

    This is equivalent to: -log(softmax(logits)[correct_class])
    but numerically stable.
    """
    B, C = logits.shape

    # log(sum(exp(logits))) for normalization
    lse = logsumexp(logits, axis=-1, keepdims=False)  # (B,)

    # Gather correct class logits
    correct_logits = logits[np.arange(B), targets]  # (B,)

    # Loss per sample, then mean
    losses = lse - correct_logits
    return float(losses.mean())

# Test
test_logits = np.array([[2.0, 1.0, 0.1], [0.5, 2.5, 0.3]])
test_targets = np.array([0, 1])  # First sample: class 0, second: class 1
print(f"Cross-entropy loss (scratch): {cross_entropy_scratch(test_logits, test_targets):.4f}")

PyTorch Equivalent

# Compare with PyTorch
logits_pt = torch.tensor([[2.0, 1.0, 0.1], [0.5, 2.5, 0.3]])
targets_pt = torch.tensor([0, 1])
loss_pt = F.cross_entropy(logits_pt, targets_pt)
print(f"Cross-entropy loss (PyTorch): {loss_pt.item():.4f}")

Same result! PyTorch’s F.cross_entropy does exactly this internally, plus handles gradients automatically.

Note: Key Insight

Cross-entropy is just logsumexp(logits) - logits[correct_class]. The logsumexp trick prevents numerical overflow by subtracting the max before exponentiating.

Perplexity

Perplexity is a more intuitive measure than raw loss:

\[\text{Perplexity} = e^{\text{cross\_entropy\_loss}}\]

Interpretation: “The model is as confused as if it were choosing uniformly among N options.”

Loss Perplexity Interpretation
0.0 1.0 Perfect predictions
2.3 10 ~10 equally likely options
4.6 100 ~100 equally likely options
6.9 1000 Random guessing (vocab=1000)

For reference:

  • GPT-2 on WebText: ~20 perplexity
  • Human baseline: ~10-20 perplexity (depends on domain)
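
As a quick sanity check of this interpretation: a model that spreads probability uniformly over N tokens has loss log(N) and perplexity exactly N. A small illustrative example, reusing the imports from Setup:

# Uniform predictions over N tokens -> loss = log(N), perplexity = N
N = 1000
uniform_logits = torch.zeros(1, N)   # equal score for every token
target = torch.tensor([7])           # any target; all tokens are equally likely
loss = F.cross_entropy(uniform_logits, target)
print(f"Loss: {loss.item():.4f} (log({N}) = {math.log(N):.4f})")
print(f"Perplexity: {math.exp(loss.item()):.1f}")  # ~1000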

Learning Rate Schedule

We don’t use a constant learning rate. Instead, we use warmup followed by cosine decay:

Why warmup?

  • Early training is unstable with a large LR
  • Gradients are noisy before the weights settle
  • A small LR lets the model “get its bearings”

Why decay?

  • A large LR is good for exploration early in training
  • A small LR is good for fine-tuning later
  • Cosine decay is smooth (no sudden changes)

class CosineScheduler:
    """Learning rate scheduler with linear warmup and cosine decay."""

    def __init__(self, optimizer, warmup_steps, total_steps, min_lr=0.0):
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.min_lr = min_lr
        self.base_lr = optimizer.param_groups[0]['lr']
        self.current_step = 0

    def get_lr(self):
        """Calculate learning rate for current step."""
        if self.current_step < self.warmup_steps:
            # Linear warmup
            return self.base_lr * self.current_step / max(1, self.warmup_steps)
        elif self.current_step >= self.total_steps:
            return self.min_lr
        else:
            # Cosine decay
            progress = (self.current_step - self.warmup_steps) / max(
                1, self.total_steps - self.warmup_steps
            )
            cosine = 0.5 * (1 + math.cos(math.pi * progress))
            return self.min_lr + (self.base_lr - self.min_lr) * cosine

    def step(self):
        """Update learning rate."""
        lr = self.get_lr()
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        self.current_step += 1
        return lr

# Create scheduler
model = nn.Linear(10, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

scheduler = CosineScheduler(
    optimizer,
    warmup_steps=100,
    total_steps=1000,
    min_lr=1e-5
)

# Collect LRs over training
lrs = []
for _ in range(1000):
    lrs.append(scheduler.get_lr())
    scheduler.step()

plt.figure(figsize=(12, 4))
plt.plot(lrs)
plt.xlabel('Step')
plt.ylabel('Learning Rate')
plt.title('Learning Rate Schedule: Warmup + Cosine Decay')
plt.axvline(x=100, color='r', linestyle='--', label='Warmup ends')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print(f"Initial LR: {lrs[0]:.6f}")
print(f"After warmup (step 100): {lrs[100]:.6f}")
print(f"Final LR: {lrs[-1]:.6f}")

Let’s compare different warmup lengths:

fig, ax = plt.subplots(figsize=(12, 4))

for warmup in [10, 50, 100, 200]:
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = CosineScheduler(optimizer, warmup_steps=warmup, total_steps=500)

    lrs = []
    for _ in range(500):
        lrs.append(scheduler.get_lr())
        scheduler.step()

    ax.plot(lrs, label=f'Warmup={warmup}')

ax.set_xlabel('Step')
ax.set_ylabel('Learning Rate')
ax.set_title('Effect of Warmup Length')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

AdamW Optimizer

AdamW is Adam with decoupled weight decay (proper L2 regularization). It’s the standard optimizer for training language models.

Why AdamW over SGD or Adam?

  • SGD: Requires careful learning rate tuning per layer, slow convergence
  • Adam: Weight decay is applied to gradients (incorrect for L2 regularization)
  • AdamW: Decouples weight decay from gradient updates (mathematically correct)
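
To make “decoupled” concrete, here is a tiny sketch with stand-in tensors (illustrative only, not real optimizer internals) contrasting the two update rules:

import torch

# Stand-in parameter, gradient, and hyperparameters (hypothetical values)
p = torch.tensor([1.0])
grad = torch.tensor([0.5])
lr, wd = 1e-3, 0.01

# Adam + classic L2: decay is folded into the gradient, so it later gets
# rescaled by the adaptive denominator sqrt(v_hat) + eps like everything else
g_adam = grad + wd * p

# AdamW: decay is applied directly to the parameter, and the moment
# estimates are built from the unmodified gradient
p_adamw = p - lr * wd * p
g_adamw = grad

print(f"Adam sees gradient {g_adam.item():.4f} (decay mixed in)")
print(f"AdamW decays weight to {p_adamw.item():.6f}; gradient stays {g_adamw.item():.4f}")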

Hyperparameters explained:

Parameter Default Purpose
beta1 0.9 Momentum coefficient - smooths gradient direction
beta2 0.999 Adaptive LR coefficient - smooths gradient magnitude
epsilon 1e-8 Numerical stability (prevents division by zero)
weight_decay 0.01 L2 regularization strength

Practical tip: The LLM community has converged on beta1=0.9, beta2=0.95 for large models (used by LLaMA, GPT-3). The lower beta2 adapts faster to changing gradient magnitudes.

# Creating an AdamW optimizer
model = nn.Linear(100, 10)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-4,          # Learning rate
    betas=(0.9, 0.999),  # Momentum and adaptive LR
    weight_decay=0.01    # Regularization
)

print("AdamW optimizer created")
print(f"  Learning rate: {optimizer.param_groups[0]['lr']}")
print(f"  Weight decay: {optimizer.param_groups[0]['weight_decay']}")

Optimizers from Scratch

Let’s build optimizers from first principles to understand what PyTorch does internally.

Plain SGD

The simplest optimizer: move parameters in the opposite direction of the gradient.

class SGD_Scratch:
    """
    Stochastic Gradient Descent.

    Update rule: theta = theta - lr * gradient
    """
    def __init__(self, params, lr=0.01):
        self.params = list(params)
        self.lr = lr

    def step(self):
        with torch.no_grad():
            for p in self.params:
                if p.grad is not None:
                    p -= self.lr * p.grad

    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad = None

# Test: compare with PyTorch SGD
torch.manual_seed(42)
model_scratch = nn.Linear(10, 2)
model_pytorch = nn.Linear(10, 2)
model_pytorch.load_state_dict(model_scratch.state_dict())

opt_scratch = SGD_Scratch(model_scratch.parameters(), lr=0.1)
opt_pytorch = torch.optim.SGD(model_pytorch.parameters(), lr=0.1)

# Forward + backward
x = torch.randn(4, 10)
loss_scratch = model_scratch(x).sum()
loss_pytorch = model_pytorch(x).sum()

loss_scratch.backward()
loss_pytorch.backward()

# Update
opt_scratch.step()
opt_pytorch.step()

# Compare weights
print("After one SGD step:")
print(f"  Scratch weight[0,0]: {model_scratch.weight[0,0].item():.6f}")
print(f"  PyTorch weight[0,0]: {model_pytorch.weight[0,0].item():.6f}")
print(f"  Match: {torch.allclose(model_scratch.weight, model_pytorch.weight)}")

SGD with Momentum

Momentum adds “velocity” to gradient descent. Instead of using the gradient directly, we accumulate a moving average of gradients:

\[v_t = \mu \cdot v_{t-1} + g_t\] \[\theta_t = \theta_{t-1} - \alpha \cdot v_t\]

This helps:

  • Smooth out noisy gradients
  • Accelerate through flat regions
  • Dampen oscillations in steep valleys

class SGD_Momentum_Scratch:
    """
    SGD with momentum.

    Update rule:
        v = momentum * v + gradient
        theta = theta - lr * v
    """
    def __init__(self, params, lr=0.01, momentum=0.9):
        self.params = list(params)
        self.lr = lr
        self.momentum = momentum
        # Velocity buffer for each parameter
        self.v = [torch.zeros_like(p) for p in self.params]

    def step(self):
        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is None:
                    continue
                # Update velocity: v = momentum * v + grad
                self.v[i] = self.momentum * self.v[i] + p.grad
                # Update parameter
                p -= self.lr * self.v[i]

    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad = None

# Test: compare with PyTorch SGD momentum
torch.manual_seed(42)
model_scratch = nn.Linear(10, 2)
model_pytorch = nn.Linear(10, 2)
model_pytorch.load_state_dict(model_scratch.state_dict())

opt_scratch = SGD_Momentum_Scratch(model_scratch.parameters(), lr=0.1, momentum=0.9)
opt_pytorch = torch.optim.SGD(model_pytorch.parameters(), lr=0.1, momentum=0.9)

# Multiple steps to see momentum accumulate
for step in range(3):
    x = torch.randn(4, 10)

    loss_scratch = model_scratch(x).sum()
    loss_pytorch = model_pytorch(x).sum()

    opt_scratch.zero_grad()
    opt_pytorch.zero_grad()

    loss_scratch.backward()
    loss_pytorch.backward()

    opt_scratch.step()
    opt_pytorch.step()

print("After 3 momentum SGD steps:")
print(f"  Scratch weight[0,0]: {model_scratch.weight[0,0].item():.6f}")
print(f"  PyTorch weight[0,0]: {model_pytorch.weight[0,0].item():.6f}")
print(f"  Match: {torch.allclose(model_scratch.weight, model_pytorch.weight)}")

Note: Key Insight: Momentum

Momentum is like pushing a ball down a hill - it builds up speed in consistent directions and resists sudden direction changes. This makes optimization faster and more stable.

Adam from Scratch

Adam combines momentum with adaptive learning rates. It tracks two quantities:

  1. First moment \(m\) (mean of gradients) - like momentum
  2. Second moment \(v\) (mean of squared gradients) - adapts learning rate per-parameter

\[m_t = \beta_1 \cdot m_{t-1} + (1 - \beta_1) \cdot g_t\] \[v_t = \beta_2 \cdot v_{t-1} + (1 - \beta_2) \cdot g_t^2\]

We also need bias correction because \(m\) and \(v\) are initialized to zero:

\[\hat{m}_t = \frac{m_t}{1 - \beta_1^t}, \quad \hat{v}_t = \frac{v_t}{1 - \beta_2^t}\]

Finally, the update:

\[\theta_t = \theta_{t-1} - \alpha \cdot \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}\]

class Adam_Scratch:
    """
    Adam optimizer with optional weight decay (AdamW style).

    Tracks first moment (mean) and second moment (variance) of gradients.
    Uses bias correction to fix initialization bias.
    """
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0):
        self.params = list(params)
        self.lr = lr
        self.b1, self.b2 = betas
        self.eps = eps
        self.weight_decay = weight_decay

        # First moment (mean of gradients)
        self.m = [torch.zeros_like(p) for p in self.params]
        # Second moment (mean of squared gradients)
        self.v = [torch.zeros_like(p) for p in self.params]
        # Timestep
        self.t = 0

    def step(self):
        self.t += 1

        with torch.no_grad():
            for i, p in enumerate(self.params):
                if p.grad is None:
                    continue

                g = p.grad

                # AdamW: Weight decay applied directly to weights (decoupled)
                if self.weight_decay != 0.0:
                    p -= self.lr * self.weight_decay * p

                # Update first moment: m = beta1 * m + (1 - beta1) * g
                self.m[i] = self.b1 * self.m[i] + (1 - self.b1) * g

                # Update second moment: v = beta2 * v + (1 - beta2) * g^2
                self.v[i] = self.b2 * self.v[i] + (1 - self.b2) * (g * g)

                # Bias correction (crucial early in training!)
                mhat = self.m[i] / (1 - self.b1 ** self.t)
                vhat = self.v[i] / (1 - self.b2 ** self.t)

                # Update parameters
                p -= self.lr * mhat / (torch.sqrt(vhat) + self.eps)

    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad = None

# Test: compare with PyTorch AdamW
torch.manual_seed(42)
model_scratch = nn.Linear(10, 2)
model_pytorch = nn.Linear(10, 2)
model_pytorch.load_state_dict(model_scratch.state_dict())

opt_scratch = Adam_Scratch(model_scratch.parameters(), lr=1e-3, weight_decay=0.01)
opt_pytorch = torch.optim.AdamW(model_pytorch.parameters(), lr=1e-3, weight_decay=0.01)

# Multiple steps
for step in range(5):
    x = torch.randn(4, 10)

    loss_scratch = model_scratch(x).sum()
    loss_pytorch = model_pytorch(x).sum()

    opt_scratch.zero_grad()
    opt_pytorch.zero_grad()

    loss_scratch.backward()
    loss_pytorch.backward()

    opt_scratch.step()
    opt_pytorch.step()

print("After 5 AdamW steps:")
print(f"  Scratch weight[0,0]: {model_scratch.weight[0,0].item():.6f}")
print(f"  PyTorch weight[0,0]: {model_pytorch.weight[0,0].item():.6f}")
print(f"  Close match: {torch.allclose(model_scratch.weight, model_pytorch.weight, atol=1e-6)}")

Note: Key Insight: Adam

Adam is “momentum + per-parameter learning rates.” The second moment \(v\) tracks how much each parameter’s gradient varies. Parameters with consistently large gradients get smaller effective learning rates (stabilizing training), while those with small gradients get larger rates (speeding up learning).

Why bias correction matters:

Without bias correction, the first few steps are biased toward zero because \(m\) and \(v\) are initialized to zero. Let’s see this:

# Demonstrate bias correction importance
m, v = 0.0, 0.0
b1, b2 = 0.9, 0.999
true_grad = 1.0  # Pretend gradient is always 1

print("Step | m (biased) | m_hat (corrected)")
print("-" * 45)
for t in range(1, 6):
    m = b1 * m + (1 - b1) * true_grad
    m_hat = m / (1 - b1 ** t)
    print(f"  {t}  |   {m:.4f}    |     {m_hat:.4f}")

print(f"\nWithout correction, m starts near 0.1 instead of 1.0!")
print(f"Bias correction fixes this, making m_hat ≈ 1.0 from the start.")

Gradient Accumulation

Want a larger effective batch size without more memory? Use gradient accumulation!

Problem: Want batch_size=32 but only 8 fits in memory

Solution: Accumulate gradients over 4 mini-batches

# Demonstrate gradient accumulation
model = nn.Linear(10, 1)
accumulation_steps = 4

# Simulate accumulated gradients
total_loss = 0

for i in range(accumulation_steps):
    x = torch.randn(8, 10)  # Mini-batch
    y = model(x)
    loss = y.mean() / accumulation_steps  # Scale loss!
    loss.backward()  # Gradients accumulate
    total_loss += loss.item()

print(f"Accumulated loss (4 mini-batches): {total_loss:.4f}")
print(f"Gradient norm before step: {model.weight.grad.norm().item():.4f}")

# Now do one optimizer step
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
optimizer.step()
optimizer.zero_grad()

print("After optimizer.step() and zero_grad()")

Gradient Clipping

Gradient clipping prevents exploding gradients by scaling down gradients when their norm exceeds a threshold.

# Demonstrate gradient clipping
model = nn.Linear(10, 10)

# Create artificial large gradients
for p in model.parameters():
    p.grad = torch.randn_like(p) * 100  # Very large!

# Compute gradient norm before clipping
total_norm_before = 0
for p in model.parameters():
    total_norm_before += p.grad.norm().item() ** 2
total_norm_before = total_norm_before ** 0.5

print(f"Gradient norm before clipping: {total_norm_before:.2f}")

# Clip gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# Compute gradient norm after
total_norm_after = 0
for p in model.parameters():
    total_norm_after += p.grad.norm().item() ** 2
total_norm_after = total_norm_after ** 0.5

print(f"Gradient norm after clipping:  {total_norm_after:.2f}")
print(f"\nGradients scaled down by {total_norm_before / total_norm_after:.1f}x")

Gradient Clipping from Scratch

Let’s implement gradient clipping ourselves to understand the algorithm:

def clip_grad_norm_scratch(params, max_norm: float) -> float:
    """
    Clip gradients by global norm.

    Algorithm:
    1. Compute total norm: sqrt(sum of all grad^2)
    2. If total_norm > max_norm, scale all grads by (max_norm / total_norm)

    Returns the original norm (before clipping).
    """
    params = list(params)

    # Step 1: Compute total gradient norm
    total_sq = 0.0
    for p in params:
        if p.grad is not None:
            total_sq += (p.grad ** 2).sum().item()
    total_norm = total_sq ** 0.5

    # Step 2: Clip if needed
    if total_norm > max_norm:
        scale = max_norm / (total_norm + 1e-12)  # Small epsilon for numerical stability
        for p in params:
            if p.grad is not None:
                p.grad *= scale

    return total_norm

# Test: compare with PyTorch
model_scratch = nn.Linear(10, 10)
model_pytorch = nn.Linear(10, 10)

# Set same large gradients
torch.manual_seed(42)
for p in model_scratch.parameters():
    p.grad = torch.randn_like(p) * 100
for ps, pp in zip(model_scratch.parameters(), model_pytorch.parameters()):
    pp.grad = ps.grad.clone()

# Clip with both
norm_scratch = clip_grad_norm_scratch(model_scratch.parameters(), max_norm=1.0)
norm_pytorch = torch.nn.utils.clip_grad_norm_(model_pytorch.parameters(), max_norm=1.0)

print(f"Original norm (scratch): {norm_scratch:.4f}")
print(f"Original norm (PyTorch): {norm_pytorch.item():.4f}")

# Check gradients match after clipping
grads_match = all(
    torch.allclose(ps.grad, pp.grad)
    for ps, pp in zip(model_scratch.parameters(), model_pytorch.parameters())
)
print(f"Gradients match after clipping: {grads_match}")

Note: Key Insight: Gradient Clipping

Gradient clipping scales ALL gradients by the same factor to preserve their relative magnitudes. This is different from clipping each gradient independently - we want to maintain the direction of the overall update while limiting its magnitude.

When to use gradient clipping:

  • Always for transformer training (standard practice)
  • max_norm=1.0 is a good default
  • Monitor gradient norms during training - consistently high norms suggest instability

Batch Size Considerations

Batch size affects both training dynamics and memory usage:

Tradeoffs:

Aspect Small Batch Large Batch
Memory Less More
Gradient noise More (regularization effect) Less (stable gradients)
Convergence May generalize better Faster convergence
LR needed Lower Higher (linear scaling rule)

The Linear Scaling Rule: When you double the batch size, you can double the learning rate. This maintains similar training dynamics.

Effective batch size = batch_size x gradient_accumulation_steps
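
To make these two rules concrete, here is a tiny arithmetic sketch (the numbers are made up for illustration):

# Hypothetical baseline configuration
base_batch, base_lr = 8, 3e-4

# Gradient accumulation: 8 sequences x 4 accumulation steps = 32 per update
accumulation_steps = 4
effective_batch = base_batch * accumulation_steps

# Linear scaling rule: LR grows in proportion to the effective batch size
scaled_lr = base_lr * (effective_batch / base_batch)

print(f"Effective batch size: {effective_batch}")   # 32
print(f"Scaled learning rate: {scaled_lr:.1e}")     # 1.2e-03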

# Batch size vs memory example (conceptual)
print("Memory usage scales linearly with batch size:")
print()
for batch_size in [8, 16, 32, 64]:
    # Simulated memory calculation
    tokens_per_batch = batch_size * 512  # sequence length
    memory_mb = batch_size * 50  # ~50MB per sample for a small model
    print(f"  Batch size {batch_size:2d}: ~{tokens_per_batch:,} tokens/batch, ~{memory_mb}MB")

Mixed Precision Training

Modern GPUs/TPUs can perform faster computation with lower precision numbers (fp16/bf16) while maintaining training quality.

Note: Conceptual Example

The code below shows the API but doesn’t execute training — mixed precision requires specific hardware (CUDA GPUs) to demonstrate speedups.

Precision types:

Type Bits Range Use Case
fp32 32 Large Default, master weights
fp16 16 Limited Faster compute, risk of overflow
bf16 16 Large (like fp32) Best of both worlds

How mixed precision works:

  1. Keep master weights in fp32 (full precision)
  2. Cast to fp16/bf16 for forward/backward pass (fast)
  3. Compute gradients in fp16/bf16
  4. Update master weights in fp32 (accurate)
# Mixed precision example (conceptual - requires GPU)
print("Mixed Precision Training:")
print()

# Simulated speedup
print("Speedups on modern hardware:")
print("  - A100 GPU with bf16: ~2x faster than fp32")
print("  - H100 GPU with fp8:  ~3x faster than fp32")
print()

# PyTorch autocast usage
print("PyTorch usage:")
print("""
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for batch in dataloader:
    optimizer.zero_grad()

    # Forward pass in mixed precision
    with autocast():
        logits = model(input_ids)
        loss = F.cross_entropy(logits, targets)

    # Backward pass with gradient scaling
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
""")

Practical advice:

  • Use bf16 if available (A100, H100) - it has the same dynamic range as fp32
  • Use fp16 with gradient scaling on older GPUs (V100)
  • Apple Silicon (MPS) does not yet fully support mixed precision

Distributed Training Basics

Training large models requires multiple GPUs. Here’s a brief overview:

Data Parallel (DP/DDP):

  • Same model copied to all GPUs
  • Each GPU processes different data
  • Gradients are averaged across GPUs
  • Memory per GPU = full model size

Fully Sharded Data Parallel (FSDP):

  • Model is sharded across GPUs
  • Each GPU holds a fraction of parameters
  • Memory per GPU = model_size / num_gpus
  • Enables training models larger than single GPU memory
# Distributed training concepts
print("Distributed Training Strategies:")
print()
print("1. Data Parallel (DDP):")
print("   - Best for: Models that fit in one GPU")
print("   - Scales: Batch size (effective_batch = batch * num_gpus)")
print()
print("2. Fully Sharded Data Parallel (FSDP):")
print("   - Best for: Large models (>10B parameters)")
print("   - Scales: Model size and batch size")
print()
print("3. Pipeline Parallel:")
print("   - Best for: Very deep models")
print("   - Splits model layers across GPUs")
print()
print("4. Tensor Parallel:")
print("   - Best for: Models with large layers")
print("   - Splits individual layers across GPUs")

Training Stability and Failure Modes

Understanding common failure modes helps you debug training issues:

Loss = NaN or Inf

Causes:

  • Learning rate too high
  • Gradient explosion
  • Numerical overflow in fp16

Solutions:

  • Reduce learning rate (try 10x smaller)
  • Add gradient clipping
  • Use bf16 instead of fp16, or add gradient scaling

Loss stuck at high value

Causes:

  • Learning rate too low
  • Poor weight initialization
  • Data loading bug (same batch every time)

Solutions:

  • Increase learning rate
  • Check the data loader with a small sample
  • Verify the model architecture

Loss oscillates or increases

Causes:

  • Learning rate too high
  • Batch size too small
  • Bug in loss computation

Solutions:

  • Add a warmup period
  • Reduce learning rate
  • Use gradient accumulation

# Visualize training pathologies
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

steps = np.arange(100)

# Good training
ax = axes[0]
good_loss = 5.0 * np.exp(-0.03 * steps) + 0.5 + 0.1 * np.random.randn(100)
ax.plot(steps, good_loss)
ax.set_title('Good Training')
ax.set_xlabel('Step')
ax.set_ylabel('Loss')
ax.set_ylim(0, 6)
ax.grid(True, alpha=0.3)

# LR too high - diverges
ax = axes[1]
unstable_loss = 4.0 + 0.5 * np.sin(steps * 0.3) + 0.02 * steps
ax.plot(steps, unstable_loss, 'r')
ax.set_title('LR Too High (Unstable)')
ax.set_xlabel('Step')
ax.set_ylabel('Loss')
ax.set_ylim(0, 8)
ax.grid(True, alpha=0.3)

# LR too low - slow convergence
ax = axes[2]
slow_loss = 5.0 * np.exp(-0.005 * steps) + 0.5
ax.plot(steps, slow_loss, 'orange')
ax.set_title('LR Too Low (Slow)')
ax.set_xlabel('Step')
ax.set_ylabel('Loss')
ax.set_ylim(0, 6)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Debugging checklist:

  1. Check initial loss - should be ~log(vocab_size) for untrained model
  2. Verify data is being loaded correctly (print a few samples)
  3. Monitor gradient norms - should be stable, not growing
  4. Check learning rate schedule is working (print LR each step)
  5. Test with a tiny dataset first to verify overfitting capability

Text Dataset

Let’s create a simple dataset for language modeling:

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Simple text dataset for language modeling."""

    def __init__(self, tokens, seq_len):
        self.tokens = tokens
        self.seq_len = seq_len

    def __len__(self):
        return max(0, len(self.tokens) - self.seq_len)

    def __getitem__(self, idx):
        input_ids = self.tokens[idx:idx + self.seq_len]
        targets = self.tokens[idx + 1:idx + self.seq_len + 1]
        return input_ids, targets

# Create a simple dataset
tokens = torch.arange(100)  # Token IDs 0-99
seq_len = 8

dataset = TextDataset(tokens, seq_len=seq_len)

print(f"Token IDs: {tokens[:20].tolist()}...")
print(f"Sequence length: {seq_len}")
print(f"Number of samples: {len(dataset)}")
# Look at a sample
input_ids, targets = dataset[0]

print("Sample 0:")
print(f"  Input:  {input_ids.tolist()}")
print(f"  Target: {targets.tolist()}")
print(f"\n  Target is input shifted by 1 position!")

# Another sample
input_ids, targets = dataset[50]
print(f"\nSample 50:")
print(f"  Input:  {input_ids.tolist()}")
print(f"  Target: {targets.tolist()}")

Training a Model

Now let’s put it all together and train a tiny model:

import sys
sys.path.insert(0, '..')
from m06_transformer.transformer import create_gpt_tiny

# Create model and data
torch.manual_seed(42)

vocab_size = 100
model = create_gpt_tiny(vocab_size=vocab_size)

# Random "training data"
tokens = torch.randint(0, vocab_size, (5000,))

print(f"Model: {model.num_params:,} parameters")
print(f"Training data: {len(tokens):,} tokens")
# Check initial loss (should be ~log(vocab_size) for random predictions)
dataset = TextDataset(tokens, seq_len=32)
input_ids, targets = dataset[0]
input_ids = input_ids.unsqueeze(0)  # Add batch dimension
targets = targets.unsqueeze(0)

model.eval()
with torch.no_grad():
    logits = model(input_ids)
    # Reshape for loss computation
    B, T, V = logits.shape
    initial_loss = F.cross_entropy(logits.view(B*T, V), targets.view(B*T))

print(f"Initial loss: {initial_loss.item():.4f}")
print(f"Initial perplexity: {math.exp(initial_loss.item()):.2f}")
print(f"\nExpected for random guessing: loss ~ {np.log(vocab_size):.2f}, ppl ~ {vocab_size}")
def train_model(model, tokens, num_steps=100, batch_size=16, seq_len=32, learning_rate=3e-4):
    """Simple training loop."""
    dataset = TextDataset(tokens, seq_len)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    scheduler = CosineScheduler(optimizer, warmup_steps=10, total_steps=num_steps, min_lr=1e-5)

    model.train()
    losses = []
    step = 0

    while step < num_steps:
        for input_ids, targets in dataloader:
            if step >= num_steps:
                break

            # Forward pass
            logits = model(input_ids)
            B, T, V = logits.shape
            loss = F.cross_entropy(logits.view(B*T, V), targets.view(B*T))

            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            losses.append(loss.item())

            if step % 10 == 0:
                lr = optimizer.param_groups[0]['lr']
                ppl = math.exp(loss.item())
                print(f"Step {step:3d} | Loss: {loss.item():.4f} | PPL: {ppl:.2f} | LR: {lr:.2e}")

            step += 1

    return losses

# Train!
print("Starting training...\n")
losses = train_model(model, tokens, num_steps=100)
print(f"\nFinal loss: {losses[-1]:.4f}")
print(f"Final perplexity: {math.exp(losses[-1]):.2f}")
# Plot training curve
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# Loss
ax = axes[0]
ax.plot(losses)
ax.set_xlabel('Step')
ax.set_ylabel('Loss')
ax.set_title('Training Loss')
ax.grid(True, alpha=0.3)
ax.axhline(y=np.log(vocab_size), color='r', linestyle='--', label='Random baseline')
ax.legend()

# Perplexity
ax = axes[1]
ppls = [math.exp(l) for l in losses]
ax.plot(ppls)
ax.set_xlabel('Step')
ax.set_ylabel('Perplexity')
ax.set_title('Training Perplexity')
ax.grid(True, alpha=0.3)
ax.axhline(y=vocab_size, color='r', linestyle='--', label='Random baseline')
ax.legend()

plt.tight_layout()
plt.show()

Effect of Learning Rate

Learning rate is crucial - too high causes instability, too low is slow:

# Train with different learning rates
learning_rates = [1e-5, 1e-4, 3e-4, 1e-3, 3e-3]
all_losses = {}

for lr in learning_rates:
    torch.manual_seed(42)
    model = create_gpt_tiny(vocab_size=100)
    tokens = torch.randint(0, 100, (3000,))

    # Train silently
    dataset = TextDataset(tokens, seq_len=32)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True, drop_last=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    model.train()
    losses = []
    step = 0

    while step < 50:
        for input_ids, targets in dataloader:
            if step >= 50:
                break
            logits = model(input_ids)
            B, T, V = logits.shape
            loss = F.cross_entropy(logits.view(B*T, V), targets.view(B*T))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            losses.append(loss.item())
            step += 1

    all_losses[lr] = losses
    print(f"LR={lr:.0e}: final_loss={losses[-1]:.3f}, final_ppl={math.exp(losses[-1]):.1f}")
# Plot comparison
plt.figure(figsize=(12, 5))

for lr, losses in all_losses.items():
    plt.plot(losses, label=f'LR={lr:.0e}')

plt.xlabel('Step')
plt.ylabel('Loss')
plt.title('Training Loss for Different Learning Rates')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

print("\nObservations:")
print("- Too low (1e-5): Training is very slow")
print("- Just right (3e-4): Smooth, fast convergence")
print("- Too high (3e-3): Unstable, loss may spike or diverge")

Checkpointing

Save regularly! Training can crash. Here’s what to save:

# Demonstrate checkpointing
import json
from pathlib import Path

def save_checkpoint(model, optimizer, step, loss, path):
    """Save a training checkpoint."""
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'step': step,
        'loss': loss,
    }
    torch.save(checkpoint, path)
    print(f"Checkpoint saved to {path}")

def load_checkpoint(model, optimizer, path):
    """Load a training checkpoint."""
    checkpoint = torch.load(path, weights_only=False)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print(f"Checkpoint loaded from {path}")
    print(f"  Step: {checkpoint['step']}, Loss: {checkpoint['loss']:.4f}")
    return checkpoint['step'], checkpoint['loss']

# Save example
model = create_gpt_tiny(vocab_size=100)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
save_checkpoint(model, optimizer, step=50, loss=2.5, path="demo_checkpoint.pt")

# Load example
model2 = create_gpt_tiny(vocab_size=100)
optimizer2 = torch.optim.AdamW(model2.parameters(), lr=3e-4)
step, loss = load_checkpoint(model2, optimizer2, "demo_checkpoint.pt")

# Clean up
Path("demo_checkpoint.pt").unlink()

Validation and Early Stopping

Monitor validation loss to detect overfitting:
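
A minimal sketch of a validation pass that you could call every N steps (val_dataloader is a hypothetical DataLoader built from held-out tokens with the TextDataset above):

@torch.no_grad()
def evaluate(model, val_dataloader):
    """Average cross-entropy loss over a validation set."""
    model.eval()
    total_loss, num_batches = 0.0, 0
    for input_ids, targets in val_dataloader:
        logits = model(input_ids)
        B, T, V = logits.shape
        total_loss += F.cross_entropy(logits.view(B * T, V), targets.view(B * T)).item()
        num_batches += 1
    model.train()
    return total_loss / max(1, num_batches)

# Inside the training loop (sketch): every eval_interval steps, compute
# val_loss = evaluate(model, val_dataloader), track the best value seen,
# and save a checkpoint whenever it improves.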

Tips:

  • Monitor validation loss, not just training loss
  • Save the model with the best validation loss
  • Consider early stopping if validation loss increases consistently

Training Tips

Quick Reference Table

Symptom Likely Cause Solution
Loss = NaN LR too high Reduce LR by 10x
Loss stuck LR too low Increase LR by 2-5x
Loss oscillates Batch too small Use gradient accumulation
Overfitting Not enough data More data, more dropout
Underfitting Model too small More layers/heads/dims
Slow training No GPU/MPS Use hardware acceleration
OOM errors Batch too large Reduce batch size, use accumulation
Training crash No checkpoints Save every N steps

Hyperparameter Recommendations

Based on published research and common practices:

Hyperparameter Small Models (<1B) Large Models (>1B)
Learning rate 1e-4 to 6e-4 1e-4 to 3e-4
Warmup 1-2% of steps 0.1-1% of steps
Weight decay 0.01 - 0.1 0.01 - 0.1
Beta1 0.9 0.9
Beta2 0.999 0.95
Batch size 256 - 1024 tokens 1M - 4M tokens
Gradient clip 1.0 1.0

Memory Optimization Strategies

  1. Gradient accumulation: Simulate larger batches
  2. Mixed precision (fp16/bf16): ~50% memory reduction
  3. Gradient checkpointing: Trade compute for memory
  4. FSDP/DeepSpeed: Shard model across GPUs
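
The third strategy deserves a small sketch: with torch.utils.checkpoint, activations inside a wrapped block are recomputed during the backward pass instead of being stored (assumes a recent PyTorch version; the linear "blocks" below are stand-ins for transformer blocks):

import torch
from torch.utils.checkpoint import checkpoint

def forward_with_checkpointing(blocks, x):
    """Run a stack of blocks, recomputing their activations during backward."""
    for block in blocks:
        # use_reentrant=False selects the recommended non-reentrant implementation
        x = checkpoint(block, x, use_reentrant=False)
    return x

# Tiny stand-in blocks, just to show the mechanics
blocks = torch.nn.ModuleList([torch.nn.Linear(16, 16) for _ in range(4)])
x = torch.randn(2, 16, requires_grad=True)
y = forward_with_checkpointing(blocks, x)
y.sum().backward()   # activations inside each block are recomputed here
print(f"Gradient computed: {x.grad is not None}")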

Interactive Exploration

Experiment with learning rate schedules in real-time. Adjust the hyperparameters to see how warmup and cosine decay shape the learning rate curve.

Tip: Try This
  1. Effect of warmup: Set warmup to 0, then gradually increase to 200. Notice how the curve changes from immediate peak to gradual ramp-up.

  2. Long vs short training: Compare total_steps=500 vs total_steps=2000 with the same warmup. See how the decay rate changes.

  3. Min LR matters: Set min_lr to 0, then to 1e-5. The floor prevents the model from completely stopping learning.

  4. Warmup ratio: Try warmup_steps = 1-2% of total_steps (common in practice). For 1000 steps, that’s 10-20 warmup steps.

  5. Drag the current step slider to see the exact LR at any point in training.

Exercises

Exercise 1: Learning Rate Finder

Implement a learning rate finder that trains for a few iterations at exponentially increasing learning rates and plots loss vs learning rate.

# Your implementation here
def lr_finder(model, tokens, start_lr=1e-7, end_lr=1e-1, num_steps=100):
    """Find optimal learning rate by training with exponentially increasing LR."""
    # TODO: Implement this
    pass

Exercise 2: Custom Scheduler

Implement a linear warmup + linear decay scheduler (instead of cosine decay).

# Your implementation here
class LinearScheduler:
    def __init__(self, optimizer, warmup_steps, total_steps, min_lr=0.0):
        # TODO: Implement this
        pass

    def step(self):
        pass

Exercise 3: Training with Validation

Modify the training loop to:

  1. Compute validation loss every N steps
  2. Save the best model (lowest validation loss)
  3. Implement early stopping if validation loss doesn’t improve for M steps

Summary

In this module, we learned:

  1. Cross-entropy loss measures prediction quality (lower = better), with mathematical foundations in information theory
  2. Perplexity provides an intuitive metric: exp(loss) - “choosing among N equally likely options”
  3. Learning rate scheduling with warmup + cosine decay prevents early instability and enables fine-tuning
  4. AdamW optimizer combines momentum, adaptive learning rates, and proper weight decay decoupling
  5. Gradient accumulation enables larger effective batch sizes without more memory
  6. Gradient clipping (max_norm=1.0) prevents exploding gradients, essential for transformers
  7. Batch size tradeoffs affect memory, training dynamics, and generalization
  8. Mixed precision training (fp16/bf16) provides 2x speedup and 50% memory reduction
  9. Distributed training (DDP, FSDP) scales training to multiple GPUs
  10. Common failure modes (NaN loss, stuck training, oscillation) and their solutions
  11. Checkpointing strategies ensure you never lose training progress

Key Takeaways

  • Always use warmup (at least 1% of steps) to stabilize early training
  • Monitor gradient norms alongside loss - they tell you about training stability
  • Start with standard hyperparameters (lr=3e-4, wd=0.01, clip=1.0) and adjust from there
  • Test your training loop on a tiny dataset first - verify it can overfit

What’s Next

In Module 08: Generation, we’ll use our trained model to generate text with various decoding strategies like greedy, sampling, and top-k/top-p.