Module 00: What Is a Language Model?

~15 minutes · No prerequisites

Note: What You’ll Learn

After this module, you will:

  • Understand what language models do: next-token prediction
  • Know why simple statistical approaches fail at this task
  • Grasp how neural networks learn from data (training loop intuition)
  • Have a mental model of transformer architecture
  • See the roadmap of what you’ll build in this course

A language model predicts the next token.

Given the text def hello(, what comes next? A language model outputs a probability distribution over all possible tokens:

Token   Probability
)       31%
name    18%
self    12%
x       8%

Notice three points:

  1. Tokens aren’t words. The model works with subword pieces - fragments smaller than words. hello might become hel + lo. This keeps the vocabulary manageable - typically 30,000 to 50,000 tokens - yet handles any text, including rare words and unfamiliar code.

  2. The output is probabilities, not a single answer. The model expresses uncertainty. Sometimes ) is clearly right; sometimes several options make sense.

  3. This simple task scales remarkably. Predicting the next token well requires understanding syntax, semantics, context, and even reasoning. A model that excels at this task can write code, answer questions, and hold conversations. This is how GitHub Copilot suggests completions, how ChatGPT generates responses, and how your IDE’s autocomplete works - all next-token prediction at scale.
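
The distribution itself comes from turning raw model scores (called logits) into probabilities with a softmax. A minimal sketch, using made-up scores for a four-token vocabulary (the numbers are illustrative, not from a real model):

```python
import math

def softmax(logits):
    """Turn raw scores into a probability distribution that sums to 1."""
    m = max(logits.values())                       # subtract max for numerical stability
    exps = {tok: math.exp(s - m) for tok, s in logits.items()}
    total = sum(exps.values())
    return {tok: e / total for tok, e in exps.items()}

# Hypothetical raw scores a model might assign after seeing "def hello("
logits = {")": 2.0, "name": 1.5, "self": 1.1, "x": 0.7}
probs = softmax(logits)

print(max(probs, key=probs.get))  # ")" - the highest-probability token
```

Higher scores get exponentially more probability mass, but every token keeps a nonzero share - that is the model expressing uncertainty.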

Try It

Type a function definition like def add( and notice how the model predicts argument names. Then try for i in - the predictions shift based on what typically follows loop constructs.

Predicting the next token sounds simple. How do you do it well?

Why Simple Approaches Fail

The obvious approach: count what tokens typically follow other tokens. After seeing for i in a million times, you learn that range follows.

This is called an n-gram model. It looks at the last N tokens to predict the next one. Simple, fast, and works surprisingly well for common patterns.

But n-grams have a fundamental limitation: they see only a fixed window of tokens.
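
A trigram version of this idea fits in a few lines. The token stream and contexts here are toy examples:

```python
from collections import Counter, defaultdict

def train_trigram(tokens):
    """Count which token follows each pair of preceding tokens."""
    counts = defaultdict(Counter)
    for i in range(len(tokens) - 2):
        counts[(tokens[i], tokens[i + 1])][tokens[i + 2]] += 1
    return counts

def predict(counts, context):
    """Most frequent follower of a 2-token context, or None if unseen."""
    followers = counts.get(tuple(context))
    return followers.most_common(1)[0][0] if followers else None

tokens = "for i in range ( 10 ) : for j in range ( 5 ) :".split()
model = train_trigram(tokens)

print(predict(model, ["i", "in"]))     # "range" - a pattern seen in training
print(predict(model, ["while", "x"]))  # None - unseen context, no guidance
```

The second call shows the failure mode: any context not seen verbatim during training yields no statistics at all.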

The Context Problem

Consider this code:

def calculate_sum(numbers):
    total = 0
    for n in numbers:
        total +=

What token comes next? A human immediately knows the answer is n - it’s the loop variable. An n-gram model looking at just total += guesses blindly - perhaps 1, x, or value, tokens that commonly follow +=.


The answer depends on context from ten or more tokens earlier. N-grams cannot reach that far. Increase the window size and another problem emerges: this exact sequence is new, so statistics offer no guidance.

Language is full of these long-range dependencies:

  • Matching brackets and parentheses
  • Variable references spanning multiple lines
  • Pronouns referring to earlier nouns
  • Comments describing code that follows

A good language model must consider all previous tokens and learn which ones matter for each prediction.

Neural Networks: Learning from Data

N-grams count patterns; neural networks learn them. The difference is fundamental.

A neural network is a function with adjustable parameters. Feed it an input, it produces an output. These adjustable parameters are called weights - numbers multiplied with inputs and summed. They determine which function the network computes, and we adjust them so the function does what we want.
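
As a sketch, the core computation - inputs multiplied by weights, then summed - looks like this (the numbers are arbitrary):

```python
def neuron(inputs, weights):
    """A weighted sum: each input is multiplied by its weight, then summed."""
    return sum(x * w for x, w in zip(inputs, weights))

# Arbitrary example values: 1.0*0.5 + 2.0*(-0.25) + 3.0*0.1 = 0.3
print(round(neuron([1.0, 2.0, 3.0], [0.5, -0.25, 0.1]), 2))
```

Change the weights and the same inputs produce a different output - that is what "adjustable parameters" means in practice.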

How Neural Networks Learn

This process matters because transformers are neural networks. The same training loop you’ll see here is exactly how we’ll train our language model in Module 07.

Training follows a simple loop:

  1. Forward pass: Feed input through the network, get a prediction
  2. Compute loss: Measure how wrong the prediction is (the loss function quantifies prediction quality)
  3. Backward pass: Calculate how each weight contributed to the error (these sensitivity values are called gradients)
  4. Update weights: Nudge weights in the direction that reduces error

Repeat with millions of examples. For language models, this means feeding billions of tokens from books, code, and web text, learning from each wrong prediction. The weights shift until the network predicts well.
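
Here is that loop in miniature, fitting a single weight to toy data with the gradient worked out by hand (a real model has billions of weights, but the loop is the same):

```python
# Fit y = w * x to data generated with w_true = 3.
data = [(1.0, 3.0), (2.0, 6.0), (3.0, 9.0)]
w = 0.0          # a single weight, initialized to zero
lr = 0.05        # learning rate: how big each nudge is

for step in range(200):
    for x, y in data:
        pred = w * x                 # 1. forward pass
        loss = (pred - y) ** 2       # 2. compute loss (squared error)
        grad = 2 * (pred - y) * x    # 3. backward pass: d(loss)/dw by hand
        w -= lr * grad               # 4. update weight to reduce error

print(round(w, 3))  # 3.0 - the weight has converged to the true value
```

Module 02 builds the machinery that computes step 3 automatically, so you never have to derive gradients by hand for a full network.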

(You’ll implement the backward pass yourself in Module 02: Autograd. PyTorch computes gradients automatically, but implementing them sharpens intuition.)

Why This Matters for Language

Neural networks do not merely memorize patterns as n-grams do. They learn representations - internal encodings where similar concepts cluster together.

Consider the word “cat”:

  • An n-gram sees only the characters c-a-t
  • A neural network learns a vector where “cat” is close to “dog”, “kitten”, and “pet” but far from “quantum” and “derivative”

In transformers, these representations become even richer: the same word gets different vectors depending on context. “Bank” near “river” differs from “bank” near “money.”

N-grams lack this capacity entirely. They treat “cat sat on the mat” and “dog sat on the rug” as completely unrelated sequences. Neural networks recognize the structural similarity and generalize patterns from one to the other.

This representation learning lets neural networks:

  • Generalize to unseen word combinations
  • Handle context by learning which parts of the input matter
  • Scale with more data and compute
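
You can see what “close” means with toy vectors and cosine similarity. These 3-dimensional vectors are invented for illustration; real learned embeddings have hundreds of dimensions:

```python
import math

def cosine(u, v):
    """Cosine similarity: 1.0 means same direction, near 0 means unrelated."""
    dot = sum(a * b for a, b in zip(u, v))
    norm = math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v))
    return dot / norm

# Made-up vectors purely for illustration - not real embeddings.
vec = {
    "cat":     [0.90, 0.80, 0.10],
    "kitten":  [0.85, 0.90, 0.15],
    "quantum": [0.10, 0.20, 0.95],
}

print(cosine(vec["cat"], vec["kitten"]) > cosine(vec["cat"], vec["quantum"]))  # True
```

In a trained model, this geometry emerges from the training loop alone - no one tells the network that cats and kittens are related.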

Transformers are neural networks distinguished by how they handle context: through attention.

The Transformer Solution

Attention examines all previous tokens and learns which ones matter.

The architecture flows like this: input tokens are converted to vectors (embeddings), passed through a stack of transformer layers - each applying attention followed by a feed-forward network - and finally projected to a probability distribution over the next token.

Three key insights:

  1. Full context visibility. Older models read left-to-right, one token at a time. Transformers see the entire input at once. Each layer processes all positions simultaneously, making them highly parallelizable and efficient.

  2. Learned relevance. The attention mechanism learns which tokens matter for each prediction. When predicting after total +=, it learns to focus on the loop variable n, not the function name.

  3. Stacked layers. Multiple transformer layers build increasingly abstract representations. Early layers recognize syntax; later layers grasp meaning.
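
The learned-relevance idea in point 2 can be sketched as scaled dot-product attention, the core operation inside a transformer layer. The vectors below are toy values; in a real model, queries, keys, and values are produced by learned weight matrices:

```python
import math

def attention(query, keys, values):
    """Scaled dot-product attention for one query: score every previous
    token, softmax the scores into weights, and mix the values."""
    d = len(query)
    scores = [sum(q * k for q, k in zip(query, key)) / math.sqrt(d)
              for key in keys]
    m = max(scores)                      # subtract max for numerical stability
    exps = [math.exp(s - m) for s in scores]
    total = sum(exps)
    weights = [e / total for e in exps]  # attention weights, sum to 1
    output = [sum(w * v[i] for w, v in zip(weights, values))
              for i in range(len(values[0]))]
    return output, weights

# Toy 2-d vectors: the query points the same way as the first key,
# so most of the attention weight lands on the first value.
query  = [1.0, 0.0]
keys   = [[1.0, 0.0], [0.0, 1.0], [0.0, 1.0]]
values = [[5.0, 5.0], [1.0, 1.0], [1.0, 1.0]]
output, weights = attention(query, keys, values)
print([round(w, 2) for w in weights])  # [0.5, 0.25, 0.25]
```

The weights are computed fresh for every query, which is exactly what lets the model focus on the loop variable n in one context and on something else entirely in another.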

The modules ahead build each component from scratch.

What You’ll Build

Each module tackles one piece of this architecture, from tensors and autograd through attention and training to text generation.

Each module builds on the previous ones. By the end, you’ll have a working language model you fully understand - something you built piece by piece.

Note: Key Takeaways
  • Language models predict the next token from a probability distribution over the vocabulary
  • Simple counting (n-grams) fails because language has long-range dependencies that demand variable context
  • Neural networks learn patterns through iterative weight updates - they don’t just memorize, they generalize
  • Transformers use attention to dynamically focus on relevant context, however distant
  • You’ll build each component from scratch in the modules ahead, from tensors to text generation

Start with Tensors →