Module 05: Attention

d3 = require("d3@7")

// =============================================================================
// THEME DETECTION
// =============================================================================

// Live boolean that is true while Quarto's dark theme class is present.
// It is produced by Generators.observe, so a cell that references isDarkMode
// is re-evaluated each time the observer below pushes a new value.
isDarkMode = Generators.observe(notify => {
  const DARK_CLASS = 'quarto-dark';
  // The class is looked for (and watched) on both <body> and <html>.
  const targets = [document.body, document.documentElement];
  const readTheme = () => targets.some(el => el.classList.contains(DARK_CLASS));

  // Publish the state at creation time.
  notify(readTheme());

  // Re-publish whenever the class attribute of either element changes.
  const observer = new MutationObserver(() => notify(readTheme()));
  for (const el of targets) {
    observer.observe(el, { attributes: true, attributeFilter: ['class'] });
  }

  // Disposal hook: stop watching.
  return () => observer.disconnect();
})


// =============================================================================
// CSS VARIABLE UTILITIES
// =============================================================================

// Returns the computed value of a CSS custom property, or `fallback` when the
// property resolves to an empty string or no document is available.
// The lookup is done on <body>: Quarto puts the .quarto-dark class there, so
// dark-mode overrides resolve on the body element rather than on <html>.
//   name     - property name including the leading dashes, e.g. '--diagram-bg'
//   fallback - value returned when nothing usable is found (default null)
getCSSVar = function(name, fallback = null) {
  const hasDocument = typeof document !== 'undefined';
  if (!hasDocument) {
    return fallback;
  }

  const raw = getComputedStyle(document.body).getPropertyValue(name);
  const trimmed = raw.trim();
  return trimmed === '' ? fallback : trimmed;
}

// =============================================================================
// THEME OBJECT
// =============================================================================

// Object containing all diagram colors read from CSS variables
// Falls back to hardcoded values if CSS vars not available
diagramTheme = {
  // Light-mode fallback values (used if CSS vars are unavailable)
  const lightFallbacks = {
    nodeFill: '#f5f5f4',
    nodeFillHover: '#e7e5e4',
    nodeStroke: '#d6d3d1',
    nodeText: '#1c1917',
    edgeStroke: '#78716c',
    highlight: '#f97316',
    highlightGlow: 'rgba(249, 115, 22, 0.3)',
    accent: '#0ea5e9',
    accentGlow: 'rgba(14, 165, 233, 0.3)',
    textOnHighlight: '#1c1917',
    textOnAccent: '#1c1917',
    bg: '#fafaf9',
    bgSecondary: '#f5f5f4',
    // Semantic colors for status/feedback
    error: '#dc2626',
    errorBg: 'rgba(220, 38, 38, 0.1)',
    success: '#16a34a',
    successBg: 'rgba(22, 163, 74, 0.1)',
    info: '#2563eb',
    infoBg: 'rgba(37, 99, 235, 0.1)'
  };

  // Dark-mode fallbacks + brighter semantic colors for readability on dark.
  const darkFallbacks = {
    nodeFill: '#292524',
    nodeFillHover: '#3f3a36',
    nodeStroke: '#57534e',
    nodeText: '#fafaf9',
    edgeStroke: '#a8a29e',
    highlight: '#fb923c',
    highlightGlow: 'rgba(251, 146, 60, 0.4)',
    accent: '#38bdf8',
    accentGlow: 'rgba(56, 189, 248, 0.4)',
    textOnHighlight: '#1c1917',
    textOnAccent: '#1c1917',
    bg: 'transparent',
    bgSecondary: '#1c1917',
    error: '#f87171',
    errorBg: 'rgba(248, 113, 113, 0.18)',
    success: '#4ade80',
    successBg: 'rgba(74, 222, 128, 0.18)',
    info: '#60a5fa',
    infoBg: 'rgba(96, 165, 250, 0.18)'
  };

  // Referencing isDarkMode here makes this cell reactive: it recomputes (and all
  // diagrams that read it re-render) whenever the theme is toggled.
  const fallbacks = isDarkMode ? darkFallbacks : lightFallbacks;

  return {
    nodeFill: getCSSVar('--diagram-node-fill', fallbacks.nodeFill),
    nodeFillHover: getCSSVar('--diagram-hover-fill', fallbacks.nodeFillHover),
    nodeStroke: getCSSVar('--diagram-node-stroke', fallbacks.nodeStroke),
    nodeText: getCSSVar('--diagram-node-text', fallbacks.nodeText),
    edgeStroke: getCSSVar('--diagram-edge-stroke', fallbacks.edgeStroke),
    highlight: getCSSVar('--diagram-highlight', fallbacks.highlight),
    highlightGlow: getCSSVar('--diagram-highlight-glow', fallbacks.highlightGlow),
    accent: getCSSVar('--diagram-accent', fallbacks.accent),
    accentGlow: getCSSVar('--diagram-accent-glow', fallbacks.accentGlow),
    textOnHighlight: fallbacks.textOnHighlight,
    textOnAccent: fallbacks.textOnAccent,
    bg: getCSSVar('--diagram-bg', fallbacks.bg),
    bgSecondary: getCSSVar('--diagram-bg-secondary', fallbacks.bgSecondary),
    // Semantic colors (use fallbacks directly since no CSS vars defined)
    error: fallbacks.error,
    errorBg: fallbacks.errorBg,
    success: fallbacks.success,
    successBg: fallbacks.successBg,
    info: fallbacks.info,
    infoBg: fallbacks.infoBg,
    isDark: isDarkMode
  };
}

// =============================================================================
// SVG PRIMITIVES
// =============================================================================

// Appends a node (rounded rect plus optional label/sublabel) to `svg` and
// returns the created <g> selection. The group is translated to (x, y) and the
// rect is centered on that point.
// Options: {x, y, width, height, label, sublabel, id, theme, rx, ry, className}
//   svg     - d3 selection to append into
//   options - optional; every field has a default, so createNode(svg) is valid
createNode = function(svg, options = {}) {
  const {
    x = 0,
    y = 0,
    width = 100,
    height = 50,
    label = '',
    sublabel = '',
    id = null,
    theme = diagramTheme,
    rx = 6,
    ry = 6,
    className = 'diagram-node'
  } = options;

  // Create group
  const g = svg.append('g')
    .attr('class', className)
    .attr('transform', `translate(${x}, ${y})`);

  if (id) g.attr('id', id);

  // Add rectangle, centered on the group origin
  g.append('rect')
    .attr('x', -width / 2)
    .attr('y', -height / 2)
    .attr('width', width)
    .attr('height', height)
    .attr('rx', rx)
    .attr('ry', ry)
    .attr('fill', theme.nodeFill)
    .attr('stroke', theme.nodeStroke)
    .attr('stroke-width', 1.5);

  // Add main label; it moves up to make room when a sublabel is present
  if (label) {
    const labelY = sublabel ? -6 : 0;
    g.append('text')
      .attr('x', 0)
      .attr('y', labelY)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', theme.nodeText)
      .attr('font-size', '12px')
      .attr('font-weight', '500')
      .attr('pointer-events', 'none')
      .text(label);
  }

  // Add sublabel (smaller, dimmed, below the main label)
  if (sublabel) {
    g.append('text')
      .attr('x', 0)
      .attr('y', 10)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', theme.nodeText)
      .attr('font-size', '10px')
      .attr('opacity', 0.7)
      .attr('pointer-events', 'none')
      .text(sublabel);
  }

  return g;
}

// Appends an arrow (path with an arrowhead marker and optional label) to `svg`
// and returns the created <g> selection.
// Options: {x1, y1, x2, y2, label, theme, curved, curvature, id, className, dashed}
//   svg     - d3 selection to append into; a <defs> child is created if missing
//   options - optional; every field has a default, so createArrow(svg) is valid
// Each call adds its own marker with a random id, so arrows with different
// theme colors do not share an arrowhead.
createArrow = function(svg, options = {}) {
  const {
    x1 = 0,
    y1 = 0,
    x2 = 100,
    y2 = 0,
    label = '',
    theme = diagramTheme,
    curved = false,
    curvature = 0.3,
    id = null,
    className = 'diagram-edge',
    dashed = false
  } = options;

  // Create unique marker ID (slice replaces the deprecated substr; same result)
  const markerId = `arrow-${Math.random().toString(36).slice(2, 11)}`;

  // Ensure defs exists
  let defs = svg.select('defs');
  if (defs.empty()) {
    defs = svg.append('defs');
  }

  // Add arrowhead marker
  defs.append('marker')
    .attr('id', markerId)
    .attr('viewBox', '0 -5 10 10')
    .attr('refX', 8)
    .attr('refY', 0)
    .attr('markerWidth', 6)
    .attr('markerHeight', 6)
    .attr('orient', 'auto')
    .append('path')
    .attr('d', 'M0,-5L10,0L0,5')
    .attr('fill', theme.edgeStroke);

  // Create group for arrow
  const g = svg.append('g')
    .attr('class', className);

  if (id) g.attr('id', id);

  // Calculate path
  let pathD;
  if (curved) {
    // Quadratic Bezier curve whose control point is the segment midpoint
    // pushed perpendicular to the segment by `curvature` times its length.
    const midX = (x1 + x2) / 2;
    const midY = (y1 + y2) / 2;
    const dx = x2 - x1;
    const dy = y2 - y1;
    const cx = midX - dy * curvature;
    const cy = midY + dx * curvature;
    pathD = `M${x1},${y1} Q${cx},${cy} ${x2},${y2}`;
  } else {
    // Straight line
    pathD = `M${x1},${y1} L${x2},${y2}`;
  }

  // Add path
  const path = g.append('path')
    .attr('d', pathD)
    .attr('fill', 'none')
    .attr('stroke', theme.edgeStroke)
    .attr('stroke-width', 1.5)
    .attr('marker-end', `url(#${markerId})`);

  if (dashed) {
    path.attr('stroke-dasharray', '5,3');
  }

  // Add label if provided
  if (label) {
    // Anchored at the midpoint of the straight segment between the endpoints
    // (also for curved arrows), then offset 12px perpendicular to it.
    const labelX = (x1 + x2) / 2;
    const labelY = (y1 + y2) / 2;

    const angle = Math.atan2(y2 - y1, x2 - x1);
    const offsetX = Math.sin(angle) * 12;
    const offsetY = -Math.cos(angle) * 12;

    g.append('text')
      .attr('x', labelX + offsetX)
      .attr('y', labelY + offsetY)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', theme.nodeText)
      .attr('font-size', '10px')
      .text(label);
  }

  return g;
}

// =============================================================================
// STEP ANIMATION CONTROLLER
// =============================================================================

// Factory returning a controller for step-through animations.
// Options: {total, initialStep, speed, loop, onStepChange}
//   total        - number of steps; setStep clamps to 0 .. total - 1
//   initialStep  - step the controller starts on and reset() returns to
//   speed        - autoplay interval in milliseconds
//   loop         - wrap around at either end instead of staying put
//   onStepChange - optional function called with the current step after every
//                  setStep/next/prev/reset (also when the step did not move)
// The returned object exposes read-only current/isPlaying/total/speed getters
// and setStep, next, prev, play, stop, toggle, reset, setSpeed.
createStepController = function(options = {}) {
  const {
    total = 1,
    initialStep = 0,
    speed = 1000,
    loop = true,
    onStepChange = null
  } = options;

  let current = initialStep;
  let isPlaying = false;
  let intervalId = null;
  let currentSpeed = speed;

  const notifyChange = () => {
    if (typeof onStepChange === 'function') {
      onStepChange(current);
    }
  };

  const controller = {
    get current() { return current; },
    get isPlaying() { return isPlaying; },
    get total() { return total; },
    get speed() { return currentSpeed; },

    // Jump to `step`, clamped to the valid range. Returns the new step.
    setStep(step) {
      current = Math.max(0, Math.min(total - 1, step));
      notifyChange();
      return current;
    },

    // Advance one step; at the last step wrap to 0 when looping, else stay.
    next() {
      if (current < total - 1) {
        current++;
      } else if (loop) {
        current = 0;
      }
      notifyChange();
      return current;
    },

    // Go back one step; at step 0 wrap to the last step when looping, else stay.
    prev() {
      if (current > 0) {
        current--;
      } else if (loop) {
        current = total - 1;
      }
      notifyChange();
      return current;
    },

    // Start autoplay. No-op when already playing.
    play() {
      if (isPlaying) return;
      isPlaying = true;
      intervalId = setInterval(() => {
        controller.next();
        // Without looping there is nothing left to play once the last step is
        // reached, so stop instead of ticking (and notifying) forever.
        if (!loop && current >= total - 1) {
          controller.stop();
        }
      }, currentSpeed);
    },

    // Stop autoplay and release the timer.
    stop() {
      isPlaying = false;
      if (intervalId !== null) {
        clearInterval(intervalId);
        intervalId = null;
      }
    },

    toggle() {
      if (isPlaying) {
        controller.stop();
      } else {
        controller.play();
      }
    },

    // Stop autoplay and return to the initial step.
    reset() {
      controller.stop();
      current = initialStep;
      notifyChange();
    },

    // Change the autoplay interval; a running autoplay is restarted so the
    // new interval takes effect immediately.
    setSpeed(newSpeed) {
      currentSpeed = newSpeed;
      if (isPlaying) {
        controller.stop();
        controller.play();
      }
    }
  };

  return controller;
}

// =============================================================================
// FLOW DIAGRAM COMPONENT
// =============================================================================

// Higher-level component that renders a node/edge diagram and returns the
// detached <svg> element.
// Options: {nodes, edges, width, height, activeNodes, activeEdges, theme, nodeWidth, nodeHeight, padding}
//   nodes       - [{id, x, y, label, sublabel, width, height}]; x/y is the node
//                 center, width/height override nodeWidth/nodeHeight
//   edges       - [{id, source, target, label, curved, curvature, dashed}];
//                 source/target are node ids, edges with an unknown end are skipped
//   activeNodes - node ids or array indices to draw highlighted
//   activeEdges - edge ids or array indices to draw highlighted
//   padding     - accepted for interface compatibility; not used by the drawing code
// Edge ends are pulled in by nodeWidth / 2 plus a small gap so that lines stop
// short of the node boxes.
FlowDiagram = function(options = {}) {
  const {
    nodes = [],
    edges = [],
    width = 600,
    height = 400,
    activeNodes = [],
    activeEdges = [],
    theme = diagramTheme,
    nodeWidth = 100,
    nodeHeight = 50
  } = options;

  // Create SVG element
  const svg = d3.create('svg')
    .attr('width', width)
    .attr('height', height)
    .attr('viewBox', `0 0 ${width} ${height}`)
    .attr('class', 'flow-diagram');

  // Add background
  svg.append('rect')
    .attr('width', width)
    .attr('height', height)
    .attr('fill', theme.bg)
    .attr('rx', 8);

  // Create defs for markers
  const defs = svg.append('defs');

  // Adds one arrowhead marker with the given id and fill color.
  const addArrowMarker = (markerId, fill) => {
    defs.append('marker')
      .attr('id', markerId)
      .attr('viewBox', '0 -5 10 10')
      .attr('refX', 8)
      .attr('refY', 0)
      .attr('markerWidth', 6)
      .attr('markerHeight', 6)
      .attr('orient', 'auto')
      .append('path')
      .attr('d', 'M0,-5L10,0L0,5')
      .attr('fill', fill);
  };

  addArrowMarker('flow-arrow', theme.edgeStroke);
  addArrowMarker('flow-arrow-highlight', theme.highlight);

  // Edges layer (draw first so nodes appear on top)
  const edgesLayer = svg.append('g').attr('class', 'edges-layer');

  // Nodes layer
  const nodesLayer = svg.append('g').attr('class', 'nodes-layer');

  // id -> node lookup built once; the first node with a given id wins, which
  // matches what a linear search from the start of `nodes` would return.
  const nodesById = new Map();
  for (const node of nodes) {
    if (!nodesById.has(node.id)) nodesById.set(node.id, node);
  }

  // Draw edges
  edges.forEach((edge, i) => {
    const sourceNode = nodesById.get(edge.source);
    const targetNode = nodesById.get(edge.target);

    if (!sourceNode || !targetNode) return;

    const x1 = sourceNode.x;
    const y1 = sourceNode.y;
    const x2 = targetNode.x;
    const y2 = targetNode.y;

    const dx = x2 - x1;
    const dy = y2 - y1;
    const len = Math.sqrt(dx * dx + dy * dy);

    // Coincident endpoints (including self-references) have no direction;
    // dividing by len would put NaN into the path, so draw nothing.
    if (len === 0) return;

    const isActive = activeEdges.includes(edge.id) || activeEdges.includes(i);
    const edgeColor = isActive ? theme.highlight : theme.edgeStroke;
    const markerId = isActive ? 'flow-arrow-highlight' : 'flow-arrow';

    // Shorten path to not overlap with node edges; the end gets a larger gap
    // to leave room for the arrowhead.
    const offsetStart = (nodeWidth / 2) + 5;
    const offsetEnd = (nodeWidth / 2) + 10;

    const startX = x1 + (dx / len) * offsetStart;
    const startY = y1 + (dy / len) * offsetStart;
    const endX = x2 - (dx / len) * offsetEnd;
    const endY = y2 - (dy / len) * offsetEnd;

    const edgeGroup = edgesLayer.append('g')
      .attr('class', `edge ${isActive ? 'highlighted' : ''}`);

    if (edge.id) edgeGroup.attr('id', edge.id);

    // Draw path
    let pathD;
    if (edge.curved) {
      const midX = (startX + endX) / 2;
      const midY = (startY + endY) / 2;
      // ?? rather than || so an explicit curvature of 0 is respected.
      const curvature = edge.curvature ?? 0.2;
      const cx = midX - dy * curvature;
      const cy = midY + dx * curvature;
      pathD = `M${startX},${startY} Q${cx},${cy} ${endX},${endY}`;
    } else {
      pathD = `M${startX},${startY} L${endX},${endY}`;
    }

    const path = edgeGroup.append('path')
      .attr('d', pathD)
      .attr('fill', 'none')
      .attr('stroke', edgeColor)
      .attr('stroke-width', isActive ? 2.5 : 1.5)
      .attr('marker-end', `url(#${markerId})`);

    if (edge.dashed) {
      path.attr('stroke-dasharray', '5,3');
    }

    if (isActive) {
      path.attr('filter', `drop-shadow(0 0 4px ${theme.highlightGlow})`);
    }

    // Add label if present, offset 14px perpendicular to the edge
    if (edge.label) {
      const labelX = (startX + endX) / 2;
      const labelY = (startY + endY) / 2;
      const angle = Math.atan2(endY - startY, endX - startX);
      const offsetX = Math.sin(angle) * 14;
      const offsetY = -Math.cos(angle) * 14;

      edgeGroup.append('text')
        .attr('x', labelX + offsetX)
        .attr('y', labelY + offsetY)
        .attr('text-anchor', 'middle')
        .attr('dominant-baseline', 'central')
        .attr('fill', isActive ? theme.highlight : theme.nodeText)
        .attr('font-size', '10px')
        .text(edge.label);
    }
  });

  // Draw nodes
  nodes.forEach((node, i) => {
    const isActive = activeNodes.includes(node.id) || activeNodes.includes(i);
    const nodeFill = isActive ? theme.highlight : theme.nodeFill;
    const nodeStroke = isActive ? theme.highlight : theme.nodeStroke;
    const textFill = isActive ? theme.textOnHighlight : theme.nodeText;

    // Per-node size override; the same values position the rect so that a
    // custom-sized node stays centered on (node.x, node.y).
    const rectWidth = node.width || nodeWidth;
    const rectHeight = node.height || nodeHeight;

    const nodeGroup = nodesLayer.append('g')
      .attr('class', `node ${isActive ? 'highlighted' : ''}`)
      .attr('transform', `translate(${node.x}, ${node.y})`);

    if (node.id) nodeGroup.attr('id', node.id);

    // Node rectangle
    const rect = nodeGroup.append('rect')
      .attr('x', -rectWidth / 2)
      .attr('y', -rectHeight / 2)
      .attr('width', rectWidth)
      .attr('height', rectHeight)
      .attr('rx', 6)
      .attr('ry', 6)
      .attr('fill', nodeFill)
      .attr('stroke', nodeStroke)
      .attr('stroke-width', isActive ? 2 : 1.5);

    if (isActive) {
      rect.attr('filter', `drop-shadow(0 0 6px ${theme.highlightGlow})`);
    }

    // Main label; it moves up to make room when a sublabel is present
    const labelY = node.sublabel ? -6 : 0;
    nodeGroup.append('text')
      .attr('x', 0)
      .attr('y', labelY)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', textFill)
      .attr('font-size', '12px')
      .attr('font-weight', '500')
      .attr('pointer-events', 'none')
      .text(node.label || '');

    // Sublabel
    if (node.sublabel) {
      nodeGroup.append('text')
        .attr('x', 0)
        .attr('y', 10)
        .attr('text-anchor', 'middle')
        .attr('dominant-baseline', 'central')
        .attr('fill', textFill)
        .attr('font-size', '10px')
        .attr('opacity', isActive ? 0.9 : 0.7)
        .attr('pointer-events', 'none')
        .text(node.sublabel);
    }
  });

  return svg.node();
}

// =============================================================================
// EXPORTS
// =============================================================================

// Export everything as a single object for lessons to use.
// The parentheses are required: in Observable JavaScript a cell written as
// `name = { ... }` is parsed as a block statement, not an object literal, so
// without them the cell has no return value and evaluates to undefined.
diagramLib = ({
  // Core dependencies
  d3,

  // Theme utilities
  isDarkMode,
  getCSSVar,
  diagramTheme,

  // SVG primitives
  createNode,
  createArrow,

  // Animation controller
  createStepController,

  // Components
  FlowDiagram
})

/**
 * Segmented step control for visualization stepping.
 *
 * Renders one button per integer step from `min` to `max`. Clicking a segment
 * or using the arrow / Home / End keys selects a step, stores it on the
 * container's `value` property and dispatches a bubbling `input` event, which
 * is the contract `viewof` relies on.
 *
 * @param {Object} options
 * @param {number} options.min - Minimum step value (default 0)
 * @param {number} options.max - Maximum step value (required)
 * @param {number} options.value - Initial value (default min)
 * @param {string} options.label - Optional label text
 * @returns {HTMLElement} Container element whose `value` is the current step;
 *   use it as `viewof name = stepControl(...)` to get a reactive number.
 */
stepControl = function({min = 0, max, value, label = null} = {}) {
  const initialValue = value ?? min;
  const steps = Array.from({length: max - min + 1}, (_, i) => min + i);

  // type="button" keeps the segments from acting as submit buttons when the
  // control is rendered inside a <form>.
  const container = htl.html`<div class="step-control">
    ${label ? htl.html`<span class="step-control-label">${label}</span>` : ''}
    <div class="step-control-segments" role="group" aria-label="${label || 'Step control'}">
      ${steps.map(step => htl.html`<button
        type="button"
        class="step-control-segment ${step === initialValue ? 'active' : ''}"
        data-step="${step}"
        aria-pressed="${step === initialValue}"
        tabindex="${step === initialValue ? 0 : -1}"
      >${step}</button>`)}
    </div>
  </div>`;

  const segments = container.querySelectorAll('.step-control-segment');
  let currentValue = initialValue;

  // data-step is always a base-10 integer written by the template above.
  const stepOf = (seg) => Number.parseInt(seg.dataset.step, 10);

  // Marks `newValue` as the active segment (class, aria-pressed, roving
  // tabindex), publishes it on the container and notifies listeners.
  function updateActive(newValue) {
    currentValue = newValue;
    segments.forEach(seg => {
      const isActive = stepOf(seg) === newValue;
      seg.classList.toggle('active', isActive);
      seg.setAttribute('aria-pressed', String(isActive));
      seg.tabIndex = isActive ? 0 : -1;
    });
    container.value = newValue;
    container.dispatchEvent(new Event('input', {bubbles: true}));
  }

  // Click handler
  segments.forEach(seg => {
    seg.addEventListener('click', () => {
      updateActive(stepOf(seg));
    });
  });

  // Keyboard navigation: arrows move by one step (clamped to min/max),
  // Home/End jump to the ends; focus follows the selection.
  container.addEventListener('keydown', (e) => {
    let target;
    switch (e.key) {
      case 'ArrowRight':
      case 'ArrowDown':
        target = Math.min(currentValue + 1, max);
        break;
      case 'ArrowLeft':
      case 'ArrowUp':
        target = Math.max(currentValue - 1, min);
        break;
      case 'Home':
        target = min;
        break;
      case 'End':
        target = max;
        break;
      default:
        return;
    }
    e.preventDefault();
    updateActive(target);
    segments[target - min].focus();
  });

  container.value = initialValue;
  return container;
}

Introduction

Attention made transformers revolutionary. Each token examines every other token and gathers relevant information.

Attention enables each token to ask: “Which tokens in this sequence matter to me?”

Why attention matters for LLMs:

Long-range dependencies: Token 100 can attend to token 1—solving the vanishing gradient problem that cripples RNNs
Parallelization: All positions compute simultaneously during training, unlike in RNNs
Interpretability: Attention weights reveal what the model examines
Dynamic context: Each token’s representation is context-dependent, not fixed

Self-attention is the key innovation: tokens attend to other tokens within the same sequence - queries, keys, and values all come from the same input. Cross-attention (used in encoder-decoder models) draws queries from one sequence and keys/values from another.

What You’ll Learn

After completing this module, you will be able to:

Understand Query, Key, Value projections and their roles
Implement scaled dot-product attention from scratch
Apply causal masking for autoregressive models
Build multi-head attention and understand why it’s beneficial
Recognize attention patterns and what they reveal
Shrink the KV cache with grouped-query and multi-query attention (GQA/MQA)

Prerequisites

This module requires familiarity with:

Module 01: Tensors — Matrix multiplication and broadcasting
Module 04: Embeddings — Token and positional embeddings

Note: Attention treats tokens as an unordered set. Positional embeddings (Module 04) supply the sense of order.

Attention as Three Questions

Every token in a sequence asks three questions. These questions unlock attention.

import numpy as np

# Toy three-token sentence
tokens = ["The", "cat", "sat"]

# One stand-in embedding per token: random values, seeded so the printed
# numbers are reproducible, rounded to two decimals for readability.
np.random.seed(42)
embed_dim = 4
embeddings = {}
for tok in tokens:
    embeddings[tok] = np.random.randn(embed_dim).round(2)

print("Each token has an embedding vector:")
for tok in tokens:
    emb = embeddings[tok]
    print(f"  '{tok}': {emb}")

Each token has an embedding vector:
  'The': [ 0.5  -0.14  0.65  1.52]
  'cat': [-0.23 -0.23  1.58  0.77]
  'sat': [-0.47  0.54 -0.46 -0.47]

The Three Questions:

Vector	Question it answers	Role
Query (Q)	“What am I looking for?”	Token seeks relevant context
Key (K)	“What do I contain?”	Token advertises its content
Value (V)	“What do I return if matched?”	Token’s actual information

# Q, K and V are each obtained by applying a learned linear transformation
# to the token's embedding.
#
# Illustration for "The cat sat":
#   - the query of "sat" might encode: "I need a subject (who sat?)"
#   - the key of "cat" might encode:   "I'm a noun, a subject candidate"
#   - the value of "cat" carries:      the actual semantic content of "cat"

print(
    "When 'sat' attends to 'cat':",
    "  Q_sat . K_cat = high score (sat is looking for a subject, cat is one)",
    "  The output for 'sat' includes V_cat weighted by this score",
    sep="\n",
)

When 'sat' attends to 'cat':
  Q_sat . K_cat = high score (sat is looking for a subject, cat is one)
  The output for 'sat' includes V_cat weighted by this score

Q, K, and V are learned projections: the model learns what to seek (Q), how to advertise content (K), and what information to transmit (V).

Intuition: Query, Key, Value

Think of attention as a “soft lookup” - like a database query, but differentiable:

Query:   "What information do I need?"     (the question)
Keys:    "What information do I have?"     (index/labels for content)
Values:  "Here's my actual information"    (the content itself)

Attention = softmax(Query . Keys) x Values

Analogy: Imagine a library where:

Your query is “books about cats”
Each book has a key (its topic/keywords)
Each book has a value (its actual content)
You get a weighted average of book contents based on how well they match your query

For the sentence “The cat sat on the mat”:

“sat” might attend strongly to “cat” (who sat?) and weakly to “mat” (where?)
“mat” might attend strongly to “the” and “on” (which mat? on what?)

attentionData = {
  const tokens = ["The", "cat", "sat", "on", "the", "mat"];

  // Realistic attention patterns for each token (rows sum to ~1)
  const weights = [
    [0.45, 0.15, 0.10, 0.05, 0.20, 0.05],  // "The" - attends to self and other "the"
    [0.15, 0.50, 0.20, 0.05, 0.05, 0.05],  // "cat" - attends strongly to self
    [0.10, 0.60, 0.15, 0.05, 0.02, 0.08],  // "sat" - attends to "cat" (who sat?)
    [0.05, 0.10, 0.25, 0.35, 0.05, 0.20],  // "on" - attends to self and context
    [0.35, 0.10, 0.05, 0.10, 0.30, 0.10],  // "the" - attends to other "The"
    [0.05, 0.15, 0.30, 0.25, 0.05, 0.20],  // "mat" - attends to "sat", "on"
  ];

  return { tokens, weights };
}

// Selected token state: a dropdown listing attentionData.tokens, initially
// set to "sat". focusToken is the chosen token string; the attention flow
// diagram looks it up in the token list to pick the row of weights to draw.
viewof focusToken = Inputs.select(attentionData.tokens, {
  label: "Select token to analyze",
  value: "sat"
})

// Interactive attention flow visualization
attentionFlowDiagram = {
  const { tokens, weights } = attentionData;
  const focusIdx = tokens.indexOf(focusToken);
  const focusWeights = weights[focusIdx];

  const width = 650;
  const height = 280;
  const tokenY = 80;
  const tokenSpacing = 95;
  const tokenStartX = 60;
  const tokenRadius = 32;
  const arrowStartY = tokenY + tokenRadius + 15;
  const arrowEndY = height - 55;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`)
    .style("font-family", "'JetBrains Mono', 'Fira Code', 'SF Mono', monospace");

  // Defs for gradients and markers
  const defs = svg.append("defs");

  // Glow filter for focused token
  const glowFilter = defs.append("filter")
    .attr("id", "attn-glow")
    .attr("x", "-50%")
    .attr("y", "-50%")
    .attr("width", "200%")
    .attr("height", "200%");
  glowFilter.append("feGaussianBlur")
    .attr("stdDeviation", "4")
    .attr("result", "coloredBlur");
  const glowMerge = glowFilter.append("feMerge");
  glowMerge.append("feMergeNode").attr("in", "coloredBlur");
  glowMerge.append("feMergeNode").attr("in", "SourceGraphic");

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", diagramTheme.bg)
    .attr("rx", 10);

  // Title
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 28)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "14px")
    .attr("font-weight", "600")
    .text(`"${focusToken}" attends to...`);

  // Subtitle
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 48)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .attr("opacity", 0.6)
    .text("Click any token to change focus");

  // Create arrow markers with varying opacity based on attention weight
  tokens.forEach((_, i) => {
    const weight = focusWeights[i];
    const opacity = 0.3 + weight * 0.7;

    defs.append("marker")
      .attr("id", `attn-arrow-${i}`)
      .attr("viewBox", "0 -4 8 8")
      .attr("refX", 6)
      .attr("refY", 0)
      .attr("markerWidth", 5)
      .attr("markerHeight", 5)
      .attr("orient", "auto")
      .append("path")
      .attr("d", "M0,-4L8,0L0,4Z")
      .attr("fill", diagramTheme.highlight)
      .attr("opacity", opacity);
  });

  // Draw attention arrows from focus token to all tokens
  const focusX = tokenStartX + focusIdx * tokenSpacing;

  tokens.forEach((token, i) => {
    const weight = focusWeights[i];
    const targetX = tokenStartX + i * tokenSpacing;

    // Calculate stroke width based on attention weight (1 to 6 pixels)
    const strokeWidth = 1 + weight * 5;
    const opacity = 0.2 + weight * 0.8;

    // Calculate curve control point
    const midX = (focusX + targetX) / 2;
    const curveOffset = Math.abs(focusX - targetX) * 0.3;
    const controlY = arrowStartY + 40 + curveOffset * 0.5;

    // Arrow group with animation
    const arrowGroup = svg.append("g")
      .attr("class", "attention-arrow")
      .style("opacity", 0);

    // Draw curved arrow
    if (Math.abs(focusIdx - i) <= 1) {
      // Straight or slight curve for adjacent tokens
      arrowGroup.append("path")
        .attr("d", `M${focusX},${arrowStartY} Q${midX},${controlY} ${targetX},${arrowEndY}`)
        .attr("fill", "none")
        .attr("stroke", diagramTheme.highlight)
        .attr("stroke-width", strokeWidth)
        .attr("stroke-opacity", opacity)
        .attr("marker-end", `url(#attn-arrow-${i})`);
    } else {
      // More pronounced curve for distant tokens
      arrowGroup.append("path")
        .attr("d", `M${focusX},${arrowStartY} Q${midX},${controlY + 20} ${targetX},${arrowEndY}`)
        .attr("fill", "none")
        .attr("stroke", diagramTheme.highlight)
        .attr("stroke-width", strokeWidth)
        .attr("stroke-opacity", opacity)
        .attr("marker-end", `url(#attn-arrow-${i})`);
    }

    // Animate arrow appearance with stagger
    arrowGroup
      .transition()
      .delay(i * 80)
      .duration(400)
      .style("opacity", 1);
  });

  // Draw tokens as clickable circles
  tokens.forEach((token, i) => {
    const x = tokenStartX + i * tokenSpacing;
    const isFocus = i === focusIdx;
    const weight = focusWeights[i];

    const tokenGroup = svg.append("g")
      .attr("transform", `translate(${x}, ${tokenY})`)
      .style("cursor", "pointer")
      .on("click", () => {
        // Update the select input programmatically
        const select = document.querySelector('select[name="Select token to analyze"]');
        if (select) {
          select.value = token;
          select.dispatchEvent(new Event('input', { bubbles: true }));
        }
      });

    // Token circle - highlight if focused
    tokenGroup.append("circle")
      .attr("r", tokenRadius)
      .attr("fill", isFocus ? diagramTheme.highlight : diagramTheme.nodeFill)
      .attr("stroke", isFocus ? diagramTheme.highlight : diagramTheme.nodeStroke)
      .attr("stroke-width", isFocus ? 3 : 2)
      .attr("filter", isFocus ? "url(#attn-glow)" : null)
      .style("transition", "all 0.3s ease");

    // Token label
    tokenGroup.append("text")
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", isFocus ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
      .attr("font-size", "13px")
      .attr("font-weight", isFocus ? "700" : "500")
      .text(token);

    // Attention weight label below each token (in the "target" area)
    const weightGroup = svg.append("g")
      .attr("transform", `translate(${x}, ${arrowEndY + 25})`)
      .style("opacity", 0);

    // Weight background pill
    const weightText = (weight * 100).toFixed(0) + "%";
    const pillWidth = 42;
    const pillHeight = 22;

    weightGroup.append("rect")
      .attr("x", -pillWidth / 2)
      .attr("y", -pillHeight / 2)
      .attr("width", pillWidth)
      .attr("height", pillHeight)
      .attr("rx", pillHeight / 2)
      .attr("fill", diagramTheme.highlight)
      .attr("opacity", 0.15 + weight * 0.6);

    weightGroup.append("text")
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", diagramTheme.highlight)
      .attr("font-size", "11px")
      .attr("font-weight", "600")
      .text(weightText);

    // Animate weight labels
    weightGroup
      .transition()
      .delay(300 + i * 60)
      .duration(300)
      .style("opacity", 1);
  });

  return svg.node();
}

Softmax normalizes each row of attention weights to sum to 1.

The Math: Scaled Dot-Product Attention

The attention formula:

Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) x V

Where:

Q (Query): What am I looking for? Shape: (seq, d_k)
K (Key): What do I have to offer? Shape: (seq, d_k)
V (Value): What information do I carry? Shape: (seq, d_v)
d_k: Dimension of keys (for scaling)

Step by Step

// Step control for attention visualization
viewof attentionStep = stepControl({min: 0, max: 4, value: 0, label: "Step"})

stepDescriptions = [
  { title: "Overview", desc: "The four steps of scaled dot-product attention" },
  { title: "Step 1: Compute Similarity", desc: "Multiply Q by K transposed to get raw attention scores" },
  { title: "Step 2: Scale", desc: "Divide by sqrt(d_k) to prevent softmax saturation" },
  { title: "Step 3: Softmax", desc: "Normalize each row to get attention weights (sum to 1)" },
  { title: "Step 4: Apply to Values", desc: "Multiply weights by V to get weighted output" }
]

// Scaled dot-product attention visualization
scaledDotProductDiagram = {
  const width = 720;
  const height = 520;
  const step = attentionStep;
  const theme = diagramTheme;

  const svg = d3.create("svg")
    .attr("viewBox", `0 0 ${width} ${height}`)
    .attr("width", "100%")
    .attr("height", height)
    .style("max-width", `${width}px`)
    .style("font-family", "'IBM Plex Mono', 'JetBrains Mono', monospace");

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", theme.bg)
    .attr("rx", 12);

  // Define gradients and filters
  const defs = svg.append("defs");

  // Glow filter for active elements
  const glowFilter = defs.append("filter")
    .attr("id", "sdpa-glow")
    .attr("x", "-50%")
    .attr("y", "-50%")
    .attr("width", "200%")
    .attr("height", "200%");
  glowFilter.append("feGaussianBlur")
    .attr("stdDeviation", "3")
    .attr("result", "coloredBlur");
  const glowMerge = glowFilter.append("feMerge");
  glowMerge.append("feMergeNode").attr("in", "coloredBlur");
  glowMerge.append("feMergeNode").attr("in", "SourceGraphic");

  // Color palette for matrices
  const colors = {
    Q: "#22d3ee",      // cyan
    K: "#a78bfa",      // purple
    V: "#4ade80",      // green
    QKT: "#f472b6",    // pink
    scaled: "#fbbf24", // amber
    weights: "#fb923c", // orange
    output: "#34d399"  // emerald
  };

  // Matrix dimensions for visualization
  const matrixH = 48;
  const matrixW = 36;
  const cellSize = 12;

  // Helper to draw a matrix block
  function drawMatrix(g, x, y, rows, cols, color, label, sublabel, isActive, opacity = 1) {
    const w = cols * cellSize;
    const h = rows * cellSize;

    const group = g.append("g")
      .attr("transform", `translate(${x}, ${y})`)
      .style("opacity", opacity);

    // Matrix background
    group.append("rect")
      .attr("x", -w/2)
      .attr("y", -h/2)
      .attr("width", w)
      .attr("height", h)
      .attr("fill", color)
      .attr("fill-opacity", isActive ? 0.3 : 0.15)
      .attr("stroke", color)
      .attr("stroke-width", isActive ? 2.5 : 1.5)
      .attr("rx", 4)
      .attr("filter", isActive ? "url(#sdpa-glow)" : null);

    // Grid lines
    for (let i = 1; i < rows; i++) {
      group.append("line")
        .attr("x1", -w/2)
        .attr("y1", -h/2 + i * cellSize)
        .attr("x2", w/2)
        .attr("y2", -h/2 + i * cellSize)
        .attr("stroke", color)
        .attr("stroke-opacity", 0.3)
        .attr("stroke-width", 0.5);
    }
    for (let j = 1; j < cols; j++) {
      group.append("line")
        .attr("x1", -w/2 + j * cellSize)
        .attr("y1", -h/2)
        .attr("x2", -w/2 + j * cellSize)
        .attr("y2", h/2)
        .attr("stroke", color)
        .attr("stroke-opacity", 0.3)
        .attr("stroke-width", 0.5);
    }

    // Label above
    group.append("text")
      .attr("x", 0)
      .attr("y", -h/2 - 10)
      .attr("text-anchor", "middle")
      .attr("fill", isActive ? color : theme.nodeText)
      .attr("font-size", "13px")
      .attr("font-weight", isActive ? "700" : "500")
      .text(label);

    // Shape label below
    if (sublabel) {
      group.append("text")
        .attr("x", 0)
        .attr("y", h/2 + 16)
        .attr("text-anchor", "middle")
        .attr("fill", theme.nodeText)
        .attr("font-size", "10px")
        .attr("opacity", 0.6)
        .text(sublabel);
    }

    return group;
  }

  // Helper to draw an arrow
  function drawArrow(g, x1, y1, x2, y2, color, isActive, label = null) {
    const markerId = `sdpa-arrow-${Math.random().toString(36).substr(2, 6)}`;

    defs.append("marker")
      .attr("id", markerId)
      .attr("viewBox", "0 -4 8 8")
      .attr("refX", 6)
      .attr("refY", 0)
      .attr("markerWidth", 5)
      .attr("markerHeight", 5)
      .attr("orient", "auto")
      .append("path")
      .attr("d", "M0,-4L8,0L0,4Z")
      .attr("fill", isActive ? color : theme.edgeStroke);

    const group = g.append("g");

    group.append("line")
      .attr("x1", x1)
      .attr("y1", y1)
      .attr("x2", x2)
      .attr("y2", y2)
      .attr("stroke", isActive ? color : theme.edgeStroke)
      .attr("stroke-width", isActive ? 2 : 1.5)
      .attr("marker-end", `url(#${markerId})`)
      .attr("filter", isActive ? "url(#sdpa-glow)" : null);

    if (label) {
      const midX = (x1 + x2) / 2;
      const midY = (y1 + y2) / 2;
      group.append("text")
        .attr("x", midX)
        .attr("y", midY - 8)
        .attr("text-anchor", "middle")
        .attr("fill", isActive ? color : theme.nodeText)
        .attr("font-size", "10px")
        .attr("font-weight", isActive ? "600" : "400")
        .text(label);
    }

    return group;
  }

  // Helper to draw operation box
  function drawOp(g, x, y, text, isActive, color = theme.highlight) {
    const group = g.append("g")
      .attr("transform", `translate(${x}, ${y})`);

    const padding = 8;
    const textEl = group.append("text")
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", isActive ? color : theme.nodeText)
      .attr("font-size", "12px")
      .attr("font-weight", isActive ? "700" : "500")
      .text(text);

    const bbox = textEl.node().getBBox();

    group.insert("rect", "text")
      .attr("x", -bbox.width/2 - padding)
      .attr("y", -bbox.height/2 - padding/2)
      .attr("width", bbox.width + padding * 2)
      .attr("height", bbox.height + padding)
      .attr("rx", 4)
      .attr("fill", isActive ? color : theme.nodeFill)
      .attr("fill-opacity", isActive ? 0.2 : 1)
      .attr("stroke", isActive ? color : theme.nodeStroke)
      .attr("stroke-width", isActive ? 2 : 1)
      .attr("filter", isActive ? "url(#sdpa-glow)" : null);

    return group;
  }

  // Title and description
  const stepInfo = stepDescriptions[step];
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 28)
    .attr("text-anchor", "middle")
    .attr("fill", theme.highlight)
    .attr("font-size", "15px")
    .attr("font-weight", "700")
    .text(stepInfo.title);

  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 48)
    .attr("text-anchor", "middle")
    .attr("fill", theme.nodeText)
    .attr("font-size", "12px")
    .attr("opacity", 0.7)
    .text(stepInfo.desc);

  // Main content group
  const content = svg.append("g").attr("transform", "translate(0, 60)");

  // Layout positions
  const row1Y = 50;
  const row2Y = 160;
  const row3Y = 270;
  const row4Y = 380;

  // --- STEP 1: Q, K -> QK^T ---
  const step1Active = step === 0 || step === 1;

  // Q matrix
  drawMatrix(content, 140, row1Y, 4, 3, colors.Q, "Q", "seq x d_k", step === 1, step1Active ? 1 : 0.4);

  // K matrix
  drawMatrix(content, 280, row1Y, 4, 3, colors.K, "K", "seq x d_k", step === 1, step1Active ? 1 : 0.4);

  // K^T indicator
  if (step1Active) {
    content.append("text")
      .attr("x", 280)
      .attr("y", row1Y + 50)
      .attr("text-anchor", "middle")
      .attr("fill", step === 1 ? colors.K : theme.nodeText)
      .attr("font-size", "10px")
      .attr("opacity", step === 1 ? 1 : 0.5)
      .text("transpose");
  }

  // V matrix (shown from start but activated in step 4)
  const step4Active = step === 0 || step === 4;
  drawMatrix(content, 580, row1Y, 4, 3, colors.V, "V", "seq x d_v", step === 4, step4Active ? 1 : 0.4);

  // Matmul operation for Q @ K^T
  drawOp(content, 210, row2Y - 20, "@ (matmul)", step === 1, colors.QKT);

  // Arrows from Q and K to matmul
  if (step1Active) {
    drawArrow(content, 140, row1Y + 35, 190, row2Y - 40, colors.Q, step === 1);
    drawArrow(content, 280, row1Y + 35, 230, row2Y - 40, colors.K, step === 1);
  }

  // QK^T result
  const step2Active = step === 0 || step === 2;
  drawMatrix(content, 210, row2Y + 40, 4, 4, colors.QKT, "QK^T", "seq x seq", step === 1 || step === 2, step1Active || step2Active ? 1 : 0.4);

  // --- STEP 2: Scale ---
  // Scale operation
  drawOp(content, 210, row3Y - 20, "/ sqrt(d_k)", step === 2, colors.scaled);

  // Arrow from QK^T to scale
  if (step2Active) {
    drawArrow(content, 210, row2Y + 75, 210, row3Y - 40, colors.QKT, step === 2);
  }

  // Scaled scores
  const step3Active = step === 0 || step === 3;
  drawMatrix(content, 210, row3Y + 40, 4, 4, colors.scaled, "Scaled", "seq x seq", step === 2 || step === 3, step2Active || step3Active ? 1 : 0.4);

  // Annotation for why we scale
  if (step === 2) {
    content.append("text")
      .attr("x", 330)
      .attr("y", row3Y - 20)
      .attr("fill", colors.scaled)
      .attr("font-size", "10px")
      .text("prevents gradient vanishing");
  }

  // --- STEP 3: Softmax ---
  drawOp(content, 210, row4Y - 20, "softmax(dim=-1)", step === 3, colors.weights);

  // Arrow from scaled to softmax
  if (step3Active) {
    drawArrow(content, 210, row3Y + 75, 210, row4Y - 40, colors.scaled, step === 3);
  }

  // Attention weights
  drawMatrix(content, 210, row4Y + 40, 4, 4, colors.weights, "Weights", "each row sums to 1", step === 3 || step === 4, step3Active || step4Active ? 1 : 0.4);

  // --- STEP 4: Apply to Values ---
  // Matmul with V
  drawOp(content, 400, row4Y + 40, "@ (matmul)", step === 4, colors.output);

  // Arrows for final matmul
  if (step4Active) {
    drawArrow(content, 250, row4Y + 40, 365, row4Y + 40, colors.weights, step === 4);
    drawArrow(content, 580, row1Y + 35, 580, row4Y - 10, colors.V, step === 4);
    drawArrow(content, 580, row4Y + 10, 435, row4Y + 40, colors.V, step === 4);
  }

  // Output
  drawMatrix(content, 560, row4Y + 40, 4, 3, colors.output, "Output", "seq x d_v", step === 4, step4Active ? 1 : 0.4);

  // Arrow from matmul to output
  if (step4Active) {
    drawArrow(content, 435, row4Y + 40, 520, row4Y + 40, colors.output, step === 4);
  }

  // Formula at bottom
  const formulaY = height - 25;
  const formulaText = "Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) x V";

  svg.append("text")
    .attr("x", width / 2)
    .attr("y", formulaY)
    .attr("text-anchor", "middle")
    .attr("fill", theme.nodeText)
    .attr("font-size", "12px")
    .attr("opacity", 0.8)
    .text(formulaText);

  // Highlight the active part of formula
  if (step > 0) {
    const highlights = [
      null,
      { text: "QK^T", color: colors.QKT },
      { text: "/ sqrt(d_k)", color: colors.scaled },
      { text: "softmax(...)", color: colors.weights },
      { text: "x V", color: colors.output }
    ];
    const h = highlights[step];
    if (h) {
      svg.append("text")
        .attr("x", width / 2)
        .attr("y", formulaY + 18)
        .attr("text-anchor", "middle")
        .attr("fill", h.color)
        .attr("font-size", "11px")
        .attr("font-weight", "600")
        .text(`Current: ${h.text}`);
    }
  }

  return svg.node();
}

Building Attention by Hand

Before using PyTorch, let’s build attention with NumPy to see what happens at each step.

import numpy as np

def attention_from_scratch(x, W_q, W_k, W_v):
    """
    Walk through single-head attention in plain NumPy, printing every stage.

    Args:
        x: Input embeddings (seq_len, embed_dim)
        W_q, W_k, W_v: Projection matrices (embed_dim, head_dim)

    Returns:
        output: Attended values (seq_len, head_dim)
        weights: Attention weights (seq_len, seq_len)
    """
    def row_softmax(logits):
        # Shifting each row by its max leaves the result unchanged but keeps exp() finite
        exps = np.exp(logits - logits.max(axis=-1, keepdims=True))
        return exps / exps.sum(axis=-1, keepdims=True)

    # Step 1: one projection per role
    #   Q - "What am I looking for?"   K - "What do I contain?"   V - "What do I return?"
    Q, K, V = (x @ W for W in (W_q, W_k, W_v))  # each (seq, head_dim)

    print(f"Input x shape: {x.shape}")
    for role, projected in (("Q", Q), ("K", K), ("V", V)):
        print(f"{role} = x @ W_{role.lower()}: {projected.shape}")

    # Step 2: similarity of every query with every key, scaled so softmax does not saturate
    d_k = K.shape[-1]
    scores = (Q @ K.T) / np.sqrt(d_k)  # (seq, seq)

    print(f"\nScores = Q @ K.T / sqrt({d_k}): {scores.shape}")
    print("Score matrix (who attends to whom):")
    print(scores.round(2))

    # Step 3: turn each row of scores into a distribution over positions
    weights = row_softmax(scores)

    print("\nAttention weights (each row sums to 1):")
    print(weights.round(3))
    print(f"Row sums: {weights.sum(axis=-1).round(3)}")

    # Step 4: mix the value vectors according to those weights
    output = weights @ V  # (seq, head_dim)

    print(f"\nOutput = weights @ V: {output.shape}")

    return output, weights

# Tiny reproducible demo: 3 tokens, 4-dim embeddings, 2-dim head
np.random.seed(42)
seq_len, embed_dim, head_dim = 3, 4, 2

x = np.random.randn(seq_len, embed_dim)
# The generator is consumed left to right, so the draws happen in W_q, W_k, W_v order
W_q, W_k, W_v = (np.random.randn(embed_dim, head_dim) * 0.5 for _ in range(3))

banner = "=" * 50
print(banner)
print("ATTENTION FROM SCRATCH")
print(banner)
output, weights = attention_from_scratch(x, W_q, W_k, W_v)

==================================================
ATTENTION FROM SCRATCH
==================================================
Input x shape: (3, 4)
Q = x @ W_q: (3, 2)
K = x @ W_k: (3, 2)
V = x @ W_v: (3, 2)

Scores = Q @ K.T / sqrt(2): (3, 3)
Score matrix (who attends to whom):
[[ 0.05  0.2   0.4 ]
 [ 0.48  0.72 -0.05]
 [ 0.18  0.22 -0.18]]

Attention weights (each row sums to 1):
[[0.278 0.324 0.397]
 [0.348 0.445 0.206]
 [0.365 0.381 0.255]]
Row sums: [1. 1. 1.]

Output = weights @ V: (3, 2)

Key Insight: Attention reduces to five matrix multiplications plus a softmax: 1. Q = x @ W_q - Project to queries 2. K = x @ W_k - Project to keys 3. V = x @ W_v - Project to values 4. scores = Q @ K.T / sqrt(d) - Compute similarities 5. output = softmax(scores) @ V - Weighted sum of values

Why Scale by sqrt(d_k)?

Scaling prevents large d_k from producing dot products that saturate softmax and shrink gradients to near-zero. Here’s the intuition:

The Problem: When Q and K have elements drawn from a distribution with mean 0 and variance 1, their dot product has variance proportional to d_k. For d_k = 64, dot products can easily reach values like 8 or -10.

Why This Matters: Softmax of large values produces near-one-hot distributions:

softmax([10, 0, 0]) = [0.9999, 0.00005, 0.00005]

This causes:

Vanishing gradients: The gradient of softmax approaches 0 at extremes
Loss of information: We want soft attention, not hard selection

The Solution: Dividing by sqrt(d_k) normalizes variance back to ~1.

import torch
import torch.nn.functional as F
import math

# Compare a raw dot product with its sqrt(d_k)-scaled version
d_k = 64
q, k = torch.randn(1, d_k), torch.randn(1, d_k)
root_d_k = math.sqrt(d_k)

dot_product = torch.matmul(q, k.T).item()
scaled = dot_product / root_d_k

print(f"d_k = {d_k}")
print(f"Raw dot product: {dot_product:.2f}")
print(f"Scaled by sqrt({d_k}) = {root_d_k:.1f}: {scaled:.2f}")
print("\nScaling keeps values in a reasonable range for softmax")

# Softmax of large vs. moderate scores: large scores collapse to a near one-hot row
scores_large = torch.tensor([[10.0, 1.0, 1.0]], requires_grad=True)
scores_normal = torch.tensor([[1.0, 0.5, 0.5]], requires_grad=True)

weights_large, weights_normal = (
    F.softmax(scores, dim=-1) for scores in (scores_large, scores_normal)
)

print(f"\nLarge scores [10, 1, 1] -> softmax: {weights_large.detach().numpy().round(4)}")
print(f"Normal scores [1, 0.5, 0.5] -> softmax: {weights_normal.detach().numpy().round(3)}")

d_k = 64
Raw dot product: 3.86
Scaled by sqrt(64) = 8.0: 0.48

Scaling keeps values in a reasonable range for softmax

Large scores [10, 1, 1] -> softmax: [[9.998e-01 1.000e-04 1.000e-04]]
Normal scores [1, 0.5, 0.5] -> softmax: [[0.452 0.274 0.274]]

Numerical Stability in Softmax

The naive softmax implementation hides a danger: overflow.

import numpy as np

# The naive softmax
def naive_softmax(x):
    """This will overflow for large values!

    Exponentiates the raw scores without subtracting the max first, so np.exp
    returns inf once a score is large enough (above roughly 709 for float64)
    and the division then produces NaN.

    Args:
        x: NumPy array of scores; normalization is over all elements.

    Returns:
        Array with the same shape as x. Entries are NaN when exp() overflowed.
    """
    exp_x = np.exp(x)
    return exp_x / exp_x.sum()

# Try it with large values
large_scores = np.array([1000.0, 1001.0, 1002.0])

print("Large scores:", large_scores)
print("exp(1000) =", np.exp(1000))  # This is inf!

try:
    result = naive_softmax(large_scores)
    print("Naive softmax:", result)  # Will be [nan, nan, nan]
except FloatingPointError:
    # Only reached when NumPy is configured to raise on overflow
    # (np.seterr / np.errstate); by default it warns and returns NaN.
    print("Overflow error!")

Large scores: [1000. 1001. 1002.]
exp(1000) = inf
Naive softmax: [nan nan nan]

/tmp/ipykernel_23516/757484777.py:13: RuntimeWarning: overflow encountered in exp
  print("exp(1000) =", np.exp(1000))  # This is inf!
/tmp/ipykernel_23516/757484777.py:6: RuntimeWarning: overflow encountered in exp
  exp_x = np.exp(x)
/tmp/ipykernel_23516/757484777.py:7: RuntimeWarning: invalid value encountered in divide
  return exp_x / exp_x.sum()

The Problem: exp(1000) is astronomically large - it overflows to infinity. Even exp(100) is about 2.7 x 10^43.

The Solution: The max-subtraction trick. Subtract the maximum value before exponentiating.

def stable_softmax(x):
    """
    Softmax that cannot overflow, thanks to the max-subtraction trick.

    softmax(x) equals softmax(x - c) for any constant c. Picking c = max(x)
    makes every exponent <= 0, so exp() stays within (0, 1].
    Prints the intermediate values for teaching purposes.
    """
    peak = x.max()
    x_shifted = x - peak
    exp_x = np.exp(x_shifted)

    print(f"Original: {x}")
    print(f"After subtracting max ({peak}): {x_shifted}")
    print(f"Now exp() won't overflow: exp({x_shifted}) = {exp_x}")

    total = exp_x.sum()
    return exp_x / total

# Same inputs that broke the naive version
print("Stable softmax with max-subtraction trick:", "=" * 50, sep="\n")
large_scores = np.array([1000.0, 1001.0, 1002.0])
result = stable_softmax(large_scores)
total = result.sum()  # Should be 1.0 (up to float rounding)
print(f"\nResult: {result}")
print(f"Sum: {total}")

Stable softmax with max-subtraction trick:
==================================================
Original: [1000. 1001. 1002.]
After subtracting max (1002.0): [-2. -1.  0.]
Now exp() won't overflow: exp([-2. -1.  0.]) = [0.13533528 0.36787944 1.        ]

Result: [0.09003057 0.24472847 0.66524096]
Sum: 0.9999999999999999

Why does this work mathematically?

softmax(x)_i = exp(x_i) / sum(exp(x_j))
            = exp(x_i - c) / sum(exp(x_j - c))    [multiply by exp(-c)/exp(-c)]

For c = max(x), all exponents are <= 0, so exp() stays bounded.

# On well-behaved inputs the naive and stable versions must agree
normal_scores = np.array([2.0, 1.0, 0.1])

naive_result = naive_softmax(normal_scores)
stable_result = stable_softmax(normal_scores)
results_match = np.allclose(naive_result, stable_result)

print(f"\nNaive:  {naive_result}")
print(f"Stable: {stable_result}")
print(f"Same? {results_match}")

Original: [2.  1.  0.1]
After subtracting max (2.0): [ 0.  -1.  -1.9]
Now exp() won't overflow: exp([ 0.  -1.  -1.9]) = [1.         0.36787944 0.14956862]

Naive:  [0.65900114 0.24243297 0.09856589]
Stable: [0.65900114 0.24243297 0.09856589]
Same? True

Always Use Stable Softmax

PyTorch’s F.softmax applies the max-subtraction trick automatically. Naive softmax fails silently, returning NaN when scores grow large.

Code: Scaled Dot-Product Attention

Let’s implement attention step by step. This follows the exact algorithm from the attention.py module:

def scaled_dot_product_attention(query, key, value, mask=None):
    """
    Scaled dot-product attention.

    Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) x V

    Args:
        query: (batch, seq, d_k) or (..., seq, d_k)
        key: (batch, seq, d_k)
        value: (batch, seq, d_v)  # d_v can differ from d_k
        mask: Optional mask where 0 = masked, 1 = attend

    Returns:
        output: (batch, seq, d_v)
        attention_weights: (batch, seq, seq)

    Note: The mask convention (0 = masked) matches the module implementation.
    Entries where the mask is 0 are set to -inf before softmax, so they end
    up with zero weight.
    """
    d_k = query.shape[-1]

    # Similarity of every query with every key, scaled by sqrt(d_k):
    # (..., seq, d_k) @ (..., d_k, seq) -> (..., seq, seq)
    scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(d_k)

    # Block masked positions before normalizing
    if mask is not None:
        blocked = mask == 0
        scores = scores.masked_fill(blocked, float('-inf'))

    # Normalize over the key dimension so each row sums to 1
    attention_weights = scores.softmax(dim=-1)

    # Weighted sum of values
    return torch.matmul(attention_weights, value), attention_weights

# Smoke test on random inputs (here d_v == d_k)
batch, seq, d_k = 1, 4, 8
# Drawn in Q, K, V order
Q, K, V = (torch.randn(batch, seq, d_k) for _ in range(3))

output, weights = scaled_dot_product_attention(Q, K, V)

for label, tensor in (
    ("Query", Q),
    ("Key", K),
    ("Value", V),
    ("Output", output),
    ("Attention weights", weights),
):
    print(f"{label} shape: {tensor.shape}")

first_item = weights[0]
print("\nAttention weights (each row sums to 1):")
print(first_item.round(decimals=2).numpy())
print(f"\nRow sums: {first_item.sum(dim=-1).numpy()}")

Query shape: torch.Size([1, 4, 8])
Key shape: torch.Size([1, 4, 8])
Value shape: torch.Size([1, 4, 8])
Output shape: torch.Size([1, 4, 8])
Attention weights shape: torch.Size([1, 4, 4])

Attention weights (each row sums to 1):
[[0.01 0.87 0.06 0.06]
 [0.61 0.16 0.08 0.15]
 [0.18 0.05 0.44 0.32]
 [0.01 0.61 0.28 0.1 ]]

Row sums: [1.0000001  0.99999994 1.         1.        ]

Visualizing Attention Patterns

# Hand the first batch item's attention matrix to OJS as a nested list of floats
attention_weights_for_viz = weights[0].detach().tolist()
ojs_define(attention_weights_viz=attention_weights_for_viz)

// Random Attention Pattern Heatmap
randomAttentionHeatmap = {
  const width = 500;
  const height = 420;
  const margin = {top: 50, right: 80, bottom: 50, left: 60};
  const n = attention_weights_viz.length;
  const cellSize = Math.min(
    (width - margin.left - margin.right) / n,
    (height - margin.top - margin.bottom) / n
  );
  const matrixSize = cellSize * n;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`)
    .style("font-family", "'IBM Plex Mono', 'JetBrains Mono', monospace");

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", diagramTheme.bg)
    .attr("rx", 10);

  // Title
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 28)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "14px")
    .attr("font-weight", "600")
    .text("Random Attention Pattern");

  const g = svg.append("g")
    .attr("transform", `translate(${margin.left}, ${margin.top})`);

  // Color scale - blue intensity
  const maxWeight = d3.max(attention_weights_viz.flat());
  const colorScale = d3.scaleSequential(d3.interpolateBlues)
    .domain([0, maxWeight]);

  // Draw cells
  for (let i = 0; i < n; i++) {
    for (let j = 0; j < n; j++) {
      const weight = attention_weights_viz[i][j];

      g.append("rect")
        .attr("x", j * cellSize)
        .attr("y", i * cellSize)
        .attr("width", cellSize - 1)
        .attr("height", cellSize - 1)
        .attr("fill", colorScale(weight))
        .attr("rx", 2);

      g.append("text")
        .attr("x", j * cellSize + cellSize / 2)
        .attr("y", i * cellSize + cellSize / 2)
        .attr("text-anchor", "middle")
        .attr("dominant-baseline", "central")
        .attr("fill", weight > maxWeight * 0.5 ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
        .attr("font-size", "10px")
        .text(weight.toFixed(2));
    }
  }

  // Axis labels
  for (let i = 0; i < n; i++) {
    // Row labels (Query)
    g.append("text")
      .attr("x", -8)
      .attr("y", i * cellSize + cellSize / 2)
      .attr("text-anchor", "end")
      .attr("dominant-baseline", "central")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "10px")
      .text(i);

    // Column labels (Key)
    g.append("text")
      .attr("x", i * cellSize + cellSize / 2)
      .attr("y", matrixSize + 15)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "10px")
      .text(i);
  }

  // Axis titles
  g.append("text")
    .attr("x", matrixSize / 2)
    .attr("y", matrixSize + 35)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .text("Key Position (what we look at)");

  g.append("text")
    .attr("transform", `translate(-40, ${matrixSize / 2}) rotate(-90)`)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .text("Query Position (who is looking)");

  // Color legend
  const legendHeight = matrixSize;
  const legendWidth = 15;
  const legendX = matrixSize + 25;

  const legendScale = d3.scaleLinear()
    .domain([0, maxWeight])
    .range([legendHeight, 0]);

  // Gradient for legend
  const defs = svg.append("defs");
  const gradient = defs.append("linearGradient")
    .attr("id", "attention-legend-gradient")
    .attr("x1", "0%")
    .attr("y1", "100%")
    .attr("x2", "0%")
    .attr("y2", "0%");

  gradient.append("stop")
    .attr("offset", "0%")
    .attr("stop-color", colorScale(0));
  gradient.append("stop")
    .attr("offset", "100%")
    .attr("stop-color", colorScale(maxWeight));

  g.append("rect")
    .attr("x", legendX)
    .attr("y", 0)
    .attr("width", legendWidth)
    .attr("height", legendHeight)
    .attr("fill", "url(#attention-legend-gradient)")
    .attr("rx", 2);

  // Legend ticks
  const legendAxis = d3.axisRight(legendScale)
    .ticks(5)
    .tickFormat(d3.format(".2f"));

  g.append("g")
    .attr("transform", `translate(${legendX + legendWidth}, 0)`)
    .call(legendAxis)
    .call(g => g.select(".domain").remove())
    .call(g => g.selectAll(".tick line").attr("stroke", diagramTheme.nodeStroke))
    .call(g => g.selectAll(".tick text")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "9px"));

  return svg.node();
}

Causal Masking for Language Models

In autoregressive models (like GPT), each token attends only to itself and earlier tokens, never to future ones. A causal mask enforces this constraint:

Mask Convention

In this lesson, we use an additive mask where:

0 = attend (score unchanged)
-inf = masked (softmax converts to 0)

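Some libraries use a boolean mask (True = attend, False = masked) which is converted internally. The scaled_dot_product_attention function above works the same way: it takes a 0/1 mask (1 = attend, 0 = masked) and uses masked_fill to turn the masked positions into -inf. The key insight is shared by both conventions: positions with -inf before softmax become 0 attention weight.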

// Toggle between attention modes
viewof attentionMode = Inputs.radio(
  ["Bidirectional (BERT)", "Causal (GPT)"],
  {
    value: "Causal (GPT)",
    label: "Attention Type"
  }
)

// Bidirectional vs Causal attention matrix visualization
bidirectionalVsCausalDiagram = {
  const tokens = ["The", "cat", "sat", "on", "the", "mat"];
  const n = tokens.length;
  const isCausal = attentionMode === "Causal (GPT)";

  const width = 580;
  const height = 480;
  const matrixSize = 300;
  const cellSize = matrixSize / n;
  const matrixX = (width - matrixSize) / 2;
  const matrixY = 100;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`)
    .style("font-family", "'IBM Plex Mono', 'JetBrains Mono', monospace");

  // Defs for filters and gradients
  const defs = svg.append("defs");

  // Glow filter for active cells
  const glowFilter = defs.append("filter")
    .attr("id", "cell-glow")
    .attr("x", "-50%")
    .attr("y", "-50%")
    .attr("width", "200%")
    .attr("height", "200%");
  glowFilter.append("feGaussianBlur")
    .attr("stdDeviation", "2")
    .attr("result", "coloredBlur");
  const glowMerge = glowFilter.append("feMerge");
  glowMerge.append("feMergeNode").attr("in", "coloredBlur");
  glowMerge.append("feMergeNode").attr("in", "SourceGraphic");

  // Masked pattern for blocked cells
  const maskedPattern = defs.append("pattern")
    .attr("id", "masked-pattern")
    .attr("patternUnits", "userSpaceOnUse")
    .attr("width", 8)
    .attr("height", 8);
  maskedPattern.append("rect")
    .attr("width", 8)
    .attr("height", 8)
    .attr("fill", diagramTheme.bgSecondary);
  maskedPattern.append("path")
    .attr("d", "M-1,1 l2,-2 M0,8 l8,-8 M7,9 l2,-2")
    .attr("stroke", diagramTheme.nodeStroke)
    .attr("stroke-width", 1)
    .attr("stroke-opacity", 0.3);

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", diagramTheme.bg)
    .attr("rx", 12);

  // Title
  const title = isCausal ? "Causal Attention (GPT)" : "Bidirectional Attention (BERT)";
  const subtitle = isCausal
    ? "Each token can only attend to itself and previous tokens"
    : "Each token can attend to all tokens in the sequence";

  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 32)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.highlight)
    .attr("font-size", "16px")
    .attr("font-weight", "700")
    .text(title);

  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 54)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "12px")
    .attr("opacity", 0.7)
    .text(subtitle);

  // Matrix group
  const matrix = svg.append("g")
    .attr("transform", `translate(${matrixX}, ${matrixY})`);

  // Row labels (Query tokens) - on the left
  svg.append("text")
    .attr("x", matrixX - 55)
    .attr("y", matrixY - 15)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .attr("font-weight", "600")
    .text("Query");

  tokens.forEach((token, i) => {
    svg.append("text")
      .attr("x", matrixX - 10)
      .attr("y", matrixY + i * cellSize + cellSize / 2)
      .attr("text-anchor", "end")
      .attr("dominant-baseline", "central")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "11px")
      .text(token);
  });

  // Column labels (Key tokens) - on top
  svg.append("text")
    .attr("x", matrixX + matrixSize / 2)
    .attr("y", matrixY - 25)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .attr("font-weight", "600")
    .text("Key");

  tokens.forEach((token, j) => {
    svg.append("text")
      .attr("x", matrixX + j * cellSize + cellSize / 2)
      .attr("y", matrixY - 8)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "11px")
      .text(token);
  });

  // Draw attention matrix cells
  for (let i = 0; i < n; i++) {
    for (let j = 0; j < n; j++) {
      const canAttend = !isCausal || j <= i;
      const delay = (i * n + j) * 15;

      const cell = matrix.append("g")
        .attr("transform", `translate(${j * cellSize}, ${i * cellSize})`);

      // Cell background
      const rect = cell.append("rect")
        .attr("width", cellSize - 2)
        .attr("height", cellSize - 2)
        .attr("x", 1)
        .attr("y", 1)
        .attr("rx", 4)
        .attr("stroke", canAttend ? diagramTheme.highlight : diagramTheme.nodeStroke)
        .attr("stroke-width", canAttend ? 1.5 : 1)
        .attr("stroke-opacity", canAttend ? 0.6 : 0.3);

      // Animate fill based on attention mode
      if (canAttend) {
        rect
          .attr("fill", diagramTheme.highlight)
          .attr("fill-opacity", 0)
          .transition()
          .delay(delay)
          .duration(300)
          .attr("fill-opacity", 0.25 + (i === j ? 0.25 : 0));
      } else {
        rect
          .attr("fill", "url(#masked-pattern)")
          .attr("fill-opacity", 0)
          .transition()
          .delay(delay)
          .duration(300)
          .attr("fill-opacity", 1);
      }

      // Checkmark or X indicator
      const symbol = cell.append("text")
        .attr("x", cellSize / 2)
        .attr("y", cellSize / 2)
        .attr("text-anchor", "middle")
        .attr("dominant-baseline", "central")
        .attr("font-size", "14px")
        .attr("font-weight", "600")
        .attr("opacity", 0);

      if (canAttend) {
        symbol
          .attr("fill", diagramTheme.highlight)
          .text("\u2713") // checkmark
          .transition()
          .delay(delay + 150)
          .duration(200)
          .attr("opacity", 0.8);
      } else {
        symbol
          .attr("fill", diagramTheme.nodeStroke)
          .text("\u2717") // X mark
          .transition()
          .delay(delay + 150)
          .duration(200)
          .attr("opacity", 0.5);
      }
    }
  }

  // Matrix border
  matrix.append("rect")
    .attr("width", matrixSize)
    .attr("height", matrixSize)
    .attr("fill", "none")
    .attr("stroke", diagramTheme.nodeStroke)
    .attr("stroke-width", 2)
    .attr("rx", 6);

  // Legend
  const legendY = matrixY + matrixSize + 30;

  // Can attend legend
  const legendAttend = svg.append("g")
    .attr("transform", `translate(${width / 2 - 100}, ${legendY})`);

  legendAttend.append("rect")
    .attr("width", 20)
    .attr("height", 20)
    .attr("rx", 3)
    .attr("fill", diagramTheme.highlight)
    .attr("fill-opacity", 0.35)
    .attr("stroke", diagramTheme.highlight)
    .attr("stroke-width", 1.5);

  legendAttend.append("text")
    .attr("x", 28)
    .attr("y", 10)
    .attr("dominant-baseline", "central")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .text("Can attend");

  // Cannot attend legend (only show in causal mode)
  if (isCausal) {
    const legendMasked = svg.append("g")
      .attr("transform", `translate(${width / 2 + 40}, ${legendY})`);

    legendMasked.append("rect")
      .attr("width", 20)
      .attr("height", 20)
      .attr("rx", 3)
      .attr("fill", "url(#masked-pattern)")
      .attr("stroke", diagramTheme.nodeStroke)
      .attr("stroke-width", 1);

    legendMasked.append("text")
      .attr("x", 28)
      .attr("y", 10)
      .attr("dominant-baseline", "central")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "11px")
      .text("Masked (future)");
  }

  // Explanation text at bottom
  const explanationY = legendY + 45;
  const explanation = isCausal
    ? 'Row i can only see columns 0...i (lower triangle + diagonal)'
    : 'Every position can attend to every other position';

  svg.append("text")
    .attr("x", width / 2)
    .attr("y", explanationY)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .attr("opacity", 0.6)
    .attr("font-style", "italic")
    .text(explanation);

  return svg.node();
}

The Causal Mask from Scratch

The causal mask is elegantly simple: we add -inf to positions we want to mask, and softmax turns those into zeros.

import numpy as np

def causal_mask_from_scratch(seq_len):
    """
    Build an additive causal mask of shape (seq_len, seq_len).

    Entries strictly above the main diagonal (future positions) hold -1e9,
    a large negative stand-in for -inf that keeps the printed matrix
    readable. Entries on and below the diagonal (current/past positions)
    are zero, so adding the mask leaves those scores unchanged.
    """
    # k=1 keeps only the strictly upper triangle: ones above the diagonal,
    # zeros everywhere else.
    future_positions = np.triu(np.ones((seq_len, seq_len)), k=1)

    # Scale the ones up to the large negative penalty.
    return future_positions * (-1e9)

# Build a small mask and print it
seq_len = 5
mask = causal_mask_from_scratch(seq_len)

print("Causal Mask (0 = attend, -inf = masked):")
print(mask.round(0))

# Spell out what a few rows of the mask mean
visibility_notes = (
    "\nHow it works:",
    "  Position 0: sees only position 0",
    "  Position 1: sees positions 0, 1",
    "  Position 4: sees positions 0, 1, 2, 3, 4",
)
print("\n".join(visibility_notes))

Causal Mask (0 = attend, -inf = masked):
[[-0.e+00 -1.e+09 -1.e+09 -1.e+09 -1.e+09]
 [-0.e+00 -0.e+00 -1.e+09 -1.e+09 -1.e+09]
 [-0.e+00 -0.e+00 -0.e+00 -1.e+09 -1.e+09]
 [-0.e+00 -0.e+00 -0.e+00 -0.e+00 -1.e+09]
 [-0.e+00 -0.e+00 -0.e+00 -0.e+00 -0.e+00]]

How it works:
  Position 0: sees only position 0
  Position 1: sees positions 0, 1
  Position 4: sees positions 0, 1, 2, 3, 4

def attention_with_causal_mask(x, W_q, W_k, W_v):
    """
    Single-head causal attention in NumPy, printing each intermediate step.

    Projects x with W_q, W_k and W_v, scales the query-key scores by
    sqrt(d_k), adds a -1e9 penalty to every position above the diagonal,
    and applies a row-wise softmax.

    Returns:
        (output, weights): the attended values and the attention matrix.
    """
    queries, keys, values = x @ W_q, x @ W_k, x @ W_v

    n_positions = x.shape[0]
    scale = np.sqrt(keys.shape[-1])

    # Scaled dot-product scores
    scores = queries @ keys.T / scale

    print("Scores before masking:")
    print(scores.round(2))

    # Additive penalty: -1e9 (stand-in for -inf) on future positions
    future_penalty = np.triu(np.ones((n_positions, n_positions)), k=1) * (-1e9)
    scores = scores + future_penalty

    print("\nScores after adding causal mask:")
    print(scores.round(2))

    # Row-wise softmax; subtracting the row max keeps exp() from overflowing,
    # and the penalised entries underflow to 0.
    shifted = scores - scores.max(axis=-1, keepdims=True)
    unnormalized = np.exp(shifted)
    weights = unnormalized / unnormalized.sum(axis=-1, keepdims=True)

    print("\nAttention weights (upper triangle is 0!):")
    print(weights.round(3))

    return weights @ values, weights

# Demo: seeded inputs so the printed numbers are reproducible
np.random.seed(42)
seq_len, embed_dim, head_dim = 4, 4, 2
x = np.random.randn(seq_len, embed_dim)
# The three projection matrices are drawn in Q, K, V order.
W_q, W_k, W_v = [np.random.randn(embed_dim, head_dim) * 0.5 for _ in range(3)]

banner = "=" * 50
print(banner)
print("CAUSAL ATTENTION FROM SCRATCH")
print(banner)
output, weights = attention_with_causal_mask(x, W_q, W_k, W_v)

==================================================
CAUSAL ATTENTION FROM SCRATCH
==================================================
Scores before masking:
[[-1.08 -0.42  0.22  0.84]
 [-1.26 -0.68  0.22  1.97]
 [ 0.11  0.11 -0.01 -0.41]
 [ 2.12  0.79 -0.44 -1.52]]

Scores after adding causal mask:
[[-1.08000000e+00 -1.00000000e+09 -1.00000000e+09 -9.99999999e+08]
 [-1.26000000e+00 -6.80000000e-01 -1.00000000e+09 -9.99999998e+08]
 [ 1.10000000e-01  1.10000000e-01 -1.00000000e-02 -1.00000000e+09]
 [ 2.12000000e+00  7.90000000e-01 -4.40000000e-01 -1.52000000e+00]]

Attention weights (upper triangle is 0!):
[[1.    0.    0.    0.   ]
 [0.359 0.641 0.    0.   ]
 [0.348 0.345 0.307 0.   ]
 [0.731 0.193 0.057 0.019]]

Key Insight

Causal masking is just adding -inf before softmax. That is the entire trick.

softmax([2.0, 1.0, -inf]) = [0.73, 0.27, 0.00]
The -inf position gets exactly 0 weight
No information flows from future to past

def create_causal_mask(seq_len):
    """Return a (seq_len, seq_len) tensor: ones on/below the diagonal, zeros above."""
    all_ones = torch.ones(seq_len, seq_len)
    return all_ones.tril()

# Render the mask as a grid, one row per token
seq_len = 6
mask = create_causal_mask(seq_len)

print("Causal Mask (1 = can attend, 0 = masked):")
print()
tokens = ["The", "cat", "sat", "on", "the", "mat"]
for i in range(seq_len):
    # '#' marks a position this row may attend to, '.' a masked one
    cells = "".join("#" if mask[i, j] == 1 else "." for j in range(seq_len))
    print(f"  {tokens[i]:4s}: {cells}")

print("\nPosition 0 can only see position 0")
print("Position 5 can see all previous positions")

Causal Mask (1 = can attend, 0 = masked):

  The : #.....
  cat : ##....
  sat : ###...
  on  : ####..
  the : #####.
  mat : ######

Position 0 can only see position 0
Position 5 can see all previous positions

# Apply causal mask
# Random Q, K, V of shape (1, 6, 8), drawn in Q, K, V order
Q, K, V = [torch.randn(1, 6, 8) for _ in range(3)]

# Bidirectional: no mask
output_bi, weights_bi = scaled_dot_product_attention(Q, K, V)

# Causal: reuse the `mask` built earlier
output_causal, weights_causal = scaled_dot_product_attention(Q, K, V, mask=mask)

# Pass to OJS for visualization (first batch item only, as nested lists)
viz_payload = {
    "weights_bi_viz": weights_bi[0].detach().numpy().tolist(),
    "weights_causal_viz": weights_causal[0].detach().numpy().tolist(),
    "mask_tokens": tokens,
}
ojs_define(**viz_payload)

print("Notice: In causal attention, the upper triangle is 0 (can't attend to future)")

// Side-by-side comparison: Bidirectional vs Causal attention
biVsCausalComparison = {
  const width = 780;
  const height = 380;
  const margin = {top: 50, right: 20, bottom: 70, left: 70};
  const n = mask_tokens.length;

  const matrixWidth = (width - margin.left - margin.right - 100) / 2;
  const cellSize = Math.min(matrixWidth / n, (height - margin.top - margin.bottom) / n);
  const matrixSize = cellSize * n;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`)
    .style("font-family", "'IBM Plex Mono', 'JetBrains Mono', monospace");

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", diagramTheme.bg)
    .attr("rx", 10);

  // Color scale
  const colorScale = d3.scaleSequential(d3.interpolateBlues)
    .domain([0, 1]);

  // Helper to draw a heatmap
  function drawHeatmap(g, weights, title, xOffset) {
    const heatmap = g.append("g")
      .attr("transform", `translate(${xOffset}, ${margin.top})`);

    // Title
    heatmap.append("text")
      .attr("x", matrixSize / 2)
      .attr("y", -25)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.highlight)
      .attr("font-size", "13px")
      .attr("font-weight", "600")
      .text(title);

    // Draw cells
    for (let i = 0; i < n; i++) {
      for (let j = 0; j < n; j++) {
        const weight = weights[i][j];

        heatmap.append("rect")
          .attr("x", j * cellSize)
          .attr("y", i * cellSize)
          .attr("width", cellSize - 1)
          .attr("height", cellSize - 1)
          .attr("fill", colorScale(weight))
          .attr("rx", 2);

        if (cellSize >= 35) {
          heatmap.append("text")
            .attr("x", j * cellSize + cellSize / 2)
            .attr("y", i * cellSize + cellSize / 2)
            .attr("text-anchor", "middle")
            .attr("dominant-baseline", "central")
            .attr("fill", weight > 0.5 ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
            .attr("font-size", "9px")
            .text(weight.toFixed(2));
        }
      }
    }

    // Row labels (tokens)
    for (let i = 0; i < n; i++) {
      heatmap.append("text")
        .attr("x", -8)
        .attr("y", i * cellSize + cellSize / 2)
        .attr("text-anchor", "end")
        .attr("dominant-baseline", "central")
        .attr("fill", diagramTheme.nodeText)
        .attr("font-size", "10px")
        .text(mask_tokens[i]);
    }

    // Column labels (tokens)
    for (let j = 0; j < n; j++) {
      heatmap.append("text")
        .attr("x", j * cellSize + cellSize / 2)
        .attr("y", matrixSize + 15)
        .attr("text-anchor", "middle")
        .attr("fill", diagramTheme.nodeText)
        .attr("font-size", "10px")
        .attr("transform", `rotate(-45, ${j * cellSize + cellSize / 2}, ${matrixSize + 15})`)
        .text(mask_tokens[j]);
    }

    // Border
    heatmap.append("rect")
      .attr("width", matrixSize)
      .attr("height", matrixSize)
      .attr("fill", "none")
      .attr("stroke", diagramTheme.nodeStroke)
      .attr("stroke-width", 1.5)
      .attr("rx", 4);
  }

  // Draw both heatmaps
  drawHeatmap(svg, weights_bi_viz, "Bidirectional Attention", margin.left);
  drawHeatmap(svg, weights_causal_viz, "Causal Attention (Lower Triangular)", margin.left + matrixSize + 80);

  // Shared color legend
  const legendX = width - 30;
  const legendHeight = matrixSize;
  const legendWidth = 12;

  const legendScale = d3.scaleLinear()
    .domain([0, 1])
    .range([legendHeight, 0]);

  // Gradient
  const defs = svg.append("defs");
  const gradient = defs.append("linearGradient")
    .attr("id", "bi-causal-legend-gradient")
    .attr("x1", "0%")
    .attr("y1", "100%")
    .attr("x2", "0%")
    .attr("y2", "0%");

  gradient.append("stop")
    .attr("offset", "0%")
    .attr("stop-color", colorScale(0));
  gradient.append("stop")
    .attr("offset", "100%")
    .attr("stop-color", colorScale(1));

  svg.append("rect")
    .attr("x", legendX)
    .attr("y", margin.top)
    .attr("width", legendWidth)
    .attr("height", legendHeight)
    .attr("fill", "url(#bi-causal-legend-gradient)")
    .attr("rx", 2);

  // Legend axis
  const legendAxis = d3.axisRight(legendScale)
    .ticks(5)
    .tickFormat(d3.format(".1f"));

  svg.append("g")
    .attr("transform", `translate(${legendX + legendWidth}, ${margin.top})`)
    .call(legendAxis)
    .call(g => g.select(".domain").remove())
    .call(g => g.selectAll(".tick line").attr("stroke", diagramTheme.nodeStroke))
    .call(g => g.selectAll(".tick text")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "9px"));

  // Caption
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", height - 15)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .attr("opacity", 0.7)
    .text("Notice: In causal attention, the upper triangle is 0 (cannot attend to future tokens)");

  return svg.node();
}

Multi-Head Attention

Instead of a single attention head, we use several heads in parallel, each free to learn a different pattern:

$$\text{MultiHead}(Q, K, V) = \text{Concat}(\text{head}_1, \ldots, \text{head}_h)\, W_O$$

where $\text{head}_i = \text{Attention}(Q\, W_{Q,i},\; K\, W_{K,i},\; V\, W_{V,i})$

Why Multiple Heads?

A single attention head computes one weighted average, which limits what relationships it can capture. Multiple heads provide:

Diverse patterns: Different heads focus on different relationships: syntax, semantics, position, coreference
Subspace attention: Each head operates in a lower-dimensional subspace (head_dim = embed_dim / num_heads), allowing specialized representations
Computational efficiency: Despite having multiple heads, the total computation is similar to single-head attention with full dimensionality (same number of parameters)

Typical configurations:

GPT-2: 12 heads, 768 embed_dim, 64 head_dim
GPT-3: 96 heads, 12288 embed_dim, 128 head_dim
Llama 2 (7B): 32 heads, 4096 embed_dim, 128 head_dim

// Step control for multi-head architecture walkthrough
viewof multiHeadStep = stepControl({min: 0, max: 5, value: 0, label: "Step"})

// One [title, description] pair per walkthrough step, expanded into
// { title, desc } records indexed by step number.
multiHeadStepDescriptions = [
  ["Input", "Token embeddings enter the multi-head attention layer"],
  ["Linear Projections", "Three learned weight matrices create Q, K, V projections"],
  ["Split into Heads", "Reshape tensors to distribute across multiple attention heads"],
  ["Parallel Attention", "Each head computes scaled dot-product attention independently"],
  ["Concatenate", "Combine all head outputs back into a single tensor"],
  ["Output Projection", "Final linear transformation produces the layer output"]
].map(([title, desc]) => ({ title, desc }))

// Multi-head attention architecture visualization
multiHeadArchDiagram = {
  const width = 780;
  const height = 640;
  const step = multiHeadStep;
  const theme = diagramTheme;

  const svg = d3.create("svg")
    .attr("viewBox", `0 0 ${width} ${height}`)
    .attr("width", "100%")
    .attr("height", height)
    .style("max-width", `${width}px`)
    .style("font-family", "'IBM Plex Mono', 'JetBrains Mono', monospace");

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", theme.bg)
    .attr("rx", 12);

  // Define gradients and filters
  const defs = svg.append("defs");

  // Glow filter for active elements
  const glowFilter = defs.append("filter")
    .attr("id", "mha-glow")
    .attr("x", "-50%")
    .attr("y", "-50%")
    .attr("width", "200%")
    .attr("height", "200%");
  glowFilter.append("feGaussianBlur")
    .attr("stdDeviation", "4")
    .attr("result", "coloredBlur");
  const glowMerge = glowFilter.append("feMerge");
  glowMerge.append("feMergeNode").attr("in", "coloredBlur");
  glowMerge.append("feMergeNode").attr("in", "SourceGraphic");

  // Color palette - consistent Q/K/V colors
  const colors = {
    Q: "#22d3ee",      // cyan for Query
    K: "#a78bfa",      // purple for Key
    V: "#4ade80",      // green for Value
    head: "#fb923c",   // orange for heads
    concat: "#f472b6", // pink for concatenation
    output: "#34d399", // emerald for output
    input: "#94a3b8"   // slate for input
  };

  // Helper to create arrow marker
  function createMarker(id, color) {
    defs.append("marker")
      .attr("id", id)
      .attr("viewBox", "0 -4 8 8")
      .attr("refX", 6)
      .attr("refY", 0)
      .attr("markerWidth", 5)
      .attr("markerHeight", 5)
      .attr("orient", "auto")
      .append("path")
      .attr("d", "M0,-4L8,0L0,4Z")
      .attr("fill", color);
  }

  createMarker("mha-arrow-default", theme.edgeStroke);
  createMarker("mha-arrow-q", colors.Q);
  createMarker("mha-arrow-k", colors.K);
  createMarker("mha-arrow-v", colors.V);
  createMarker("mha-arrow-head", colors.head);
  createMarker("mha-arrow-concat", colors.concat);
  createMarker("mha-arrow-output", colors.output);

  // Helper to draw a box with label
  function drawBox(g, x, y, w, h, color, label, sublabel, isActive, opacity = 1) {
    const group = g.append("g")
      .attr("transform", `translate(${x}, ${y})`)
      .style("opacity", opacity);

    group.append("rect")
      .attr("x", -w/2)
      .attr("y", -h/2)
      .attr("width", w)
      .attr("height", h)
      .attr("rx", 6)
      .attr("fill", isActive ? color : theme.nodeFill)
      .attr("fill-opacity", isActive ? 0.25 : 1)
      .attr("stroke", isActive ? color : theme.nodeStroke)
      .attr("stroke-width", isActive ? 2.5 : 1.5)
      .attr("filter", isActive ? "url(#mha-glow)" : null);

    group.append("text")
      .attr("y", sublabel ? -6 : 0)
      .attr("text-anchor", "middle")
      .attr("dominant-baseline", "central")
      .attr("fill", isActive ? color : theme.nodeText)
      .attr("font-size", "12px")
      .attr("font-weight", isActive ? "700" : "500")
      .text(label);

    if (sublabel) {
      group.append("text")
        .attr("y", 10)
        .attr("text-anchor", "middle")
        .attr("dominant-baseline", "central")
        .attr("fill", theme.nodeText)
        .attr("font-size", "9px")
        .attr("opacity", 0.6)
        .text(sublabel);
    }

    return group;
  }

  // Helper to draw an arrow
  function drawArrow(g, x1, y1, x2, y2, color, isActive, markerId = "mha-arrow-default") {
    const arrow = g.append("line")
      .attr("x1", x1)
      .attr("y1", y1)
      .attr("x2", x2)
      .attr("y2", y2)
      .attr("stroke", isActive ? color : theme.edgeStroke)
      .attr("stroke-width", isActive ? 2.5 : 1.5)
      .attr("marker-end", `url(#${isActive ? markerId : "mha-arrow-default"})`)
      .attr("filter", isActive ? "url(#mha-glow)" : null);
    return arrow;
  }

  // Helper to draw curved arrow (for parallel paths)
  function drawCurvedArrow(g, x1, y1, x2, y2, curve, color, isActive, markerId) {
    const midX = (x1 + x2) / 2;
    const midY = (y1 + y2) / 2;
    const path = g.append("path")
      .attr("d", `M${x1},${y1} Q${midX + curve},${midY} ${x2},${y2}`)
      .attr("fill", "none")
      .attr("stroke", isActive ? color : theme.edgeStroke)
      .attr("stroke-width", isActive ? 2.5 : 1.5)
      .attr("marker-end", `url(#${isActive ? markerId : "mha-arrow-default"})`)
      .attr("filter", isActive ? "url(#mha-glow)" : null);
    return path;
  }

  // Title and description
  const stepInfo = multiHeadStepDescriptions[step];
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 28)
    .attr("text-anchor", "middle")
    .attr("fill", theme.highlight)
    .attr("font-size", "15px")
    .attr("font-weight", "700")
    .text(stepInfo.title);

  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 48)
    .attr("text-anchor", "middle")
    .attr("fill", theme.nodeText)
    .attr("font-size", "12px")
    .attr("opacity", 0.7)
    .text(stepInfo.desc);

  // Main content group
  const content = svg.append("g").attr("transform", "translate(0, 70)");

  // Layout constants
  const centerX = width / 2;
  const row1Y = 30;   // Input
  const row2Y = 110;  // Projections
  const row3Y = 190;  // Q, K, V
  const row4Y = 270;  // Reshape
  const row5Y = 350;  // Heads
  const row6Y = 440;  // Concat
  const row7Y = 520;  // Output projection + output

  // Step visibility
  const s0 = step === 0;
  const s1 = step === 1;
  const s2 = step === 2;
  const s3 = step === 3;
  const s4 = step === 4;
  const s5 = step === 5;

  // ===== ROW 1: Input =====
  drawBox(content, centerX, row1Y, 180, 40, colors.input, "x", "(batch, seq, embed_dim)", s0, s0 ? 1 : 0.5);

  // ===== ROW 2: Linear Projections =====
  const projSpacing = 140;
  const projY = row2Y;

  // Arrows from input to projections
  if (s0 || s1) {
    drawArrow(content, centerX - 50, row1Y + 22, centerX - projSpacing, projY - 22, colors.Q, s1, "mha-arrow-q");
    drawArrow(content, centerX, row1Y + 22, centerX, projY - 22, colors.K, s1, "mha-arrow-k");
    drawArrow(content, centerX + 50, row1Y + 22, centerX + projSpacing, projY - 22, colors.V, s1, "mha-arrow-v");
  }

  drawBox(content, centerX - projSpacing, projY, 80, 36, colors.Q, "W_Q", null, s1, s1 ? 1 : 0.5);
  drawBox(content, centerX, projY, 80, 36, colors.K, "W_K", null, s1, s1 ? 1 : 0.5);
  drawBox(content, centerX + projSpacing, projY, 80, 36, colors.V, "W_V", null, s1, s1 ? 1 : 0.5);

  // ===== ROW 3: Q, K, V matrices =====
  // Arrows from projections to Q, K, V
  if (s1 || s2) {
    drawArrow(content, centerX - projSpacing, projY + 20, centerX - projSpacing, row3Y - 20, colors.Q, s1, "mha-arrow-q");
    drawArrow(content, centerX, projY + 20, centerX, row3Y - 20, colors.K, s1, "mha-arrow-k");
    drawArrow(content, centerX + projSpacing, projY + 20, centerX + projSpacing, row3Y - 20, colors.V, s1, "mha-arrow-v");
  }

  drawBox(content, centerX - projSpacing, row3Y, 90, 40, colors.Q, "Q", "(b, seq, embed)", s1 || s2, (s1 || s2) ? 1 : 0.5);
  drawBox(content, centerX, row3Y, 90, 40, colors.K, "K", "(b, seq, embed)", s1 || s2, (s1 || s2) ? 1 : 0.5);
  drawBox(content, centerX + projSpacing, row3Y, 90, 40, colors.V, "V", "(b, seq, embed)", s1 || s2, (s1 || s2) ? 1 : 0.5);

  // ===== ROW 4: Reshape =====
  // Single reshape operation box
  drawBox(content, centerX, row4Y, 280, 40, colors.head, "Reshape + Transpose", "(b, heads, seq, head_dim)", s2, s2 ? 1 : 0.5);

  // Arrows from Q, K, V to reshape
  if (s2) {
    drawArrow(content, centerX - projSpacing, row3Y + 22, centerX - 80, row4Y - 22, colors.Q, true, "mha-arrow-q");
    drawArrow(content, centerX, row3Y + 22, centerX, row4Y - 22, colors.K, true, "mha-arrow-k");
    drawArrow(content, centerX + projSpacing, row3Y + 22, centerX + 80, row4Y - 22, colors.V, true, "mha-arrow-v");
  }

  // ===== ROW 5: Parallel Heads =====
  const headSpacing = 150;
  const heads = [
    { label: "Head 0", x: centerX - headSpacing * 1.5 },
    { label: "Head 1", x: centerX - headSpacing * 0.5 },
    { label: "Head 2", x: centerX + headSpacing * 0.5 },
    { label: "Head 3", x: centerX + headSpacing * 1.5 }
  ];

  // Arrows from reshape to heads
  if (s2 || s3) {
    heads.forEach((head, i) => {
      const startX = centerX + (i - 1.5) * 60;
      drawArrow(content, startX, row4Y + 22, head.x, row5Y - 32, colors.head, s3, "mha-arrow-head");
    });
  }

  // Draw heads
  heads.forEach((head, i) => {
    const headGroup = content.append("g")
      .attr("transform", `translate(${head.x}, ${row5Y})`)
      .style("opacity", (s3) ? 1 : 0.5);

    // Head container
    headGroup.append("rect")
      .attr("x", -55)
      .attr("y", -30)
      .attr("width", 110)
      .attr("height", 60)
      .attr("rx", 8)
      .attr("fill", s3 ? colors.head : theme.nodeFill)
      .attr("fill-opacity", s3 ? 0.15 : 1)
      .attr("stroke", s3 ? colors.head : theme.nodeStroke)
      .attr("stroke-width", s3 ? 2 : 1.5)
      .attr("filter", s3 ? "url(#mha-glow)" : null);

    // Head label
    headGroup.append("text")
      .attr("y", -10)
      .attr("text-anchor", "middle")
      .attr("fill", s3 ? colors.head : theme.nodeText)
      .attr("font-size", "11px")
      .attr("font-weight", s3 ? "700" : "500")
      .text(head.label);

    // Mini attention indicator
    headGroup.append("text")
      .attr("y", 10)
      .attr("text-anchor", "middle")
      .attr("fill", theme.nodeText)
      .attr("font-size", "9px")
      .attr("opacity", 0.6)
      .text("softmax(QK^T/sqrt(d))V");
  });

  // ===== ROW 6: Concatenate =====
  // Arrows from heads to concat
  if (s3 || s4) {
    heads.forEach((head, i) => {
      drawArrow(content, head.x, row5Y + 32, centerX + (i - 1.5) * 40, row6Y - 22, colors.concat, s4, "mha-arrow-concat");
    });
  }

  drawBox(content, centerX, row6Y, 180, 40, colors.concat, "Concatenate", "(b, seq, embed_dim)", s4, s4 ? 1 : 0.5);

  // ===== ROW 7: Output Projection and Output =====
  // Arrow from concat to output projection
  if (s4 || s5) {
    drawArrow(content, centerX, row6Y + 22, centerX, row7Y - 22, colors.output, s5, "mha-arrow-output");
  }

  drawBox(content, centerX, row7Y, 140, 36, colors.output, "W_O", "output projection", s5, s5 ? 1 : 0.5);

  // Final output
  if (s5) {
    drawArrow(content, centerX, row7Y + 20, centerX, row7Y + 55, colors.output, true, "mha-arrow-output");
  }
  drawBox(content, centerX, row7Y + 75, 180, 40, colors.output, "Output", "(batch, seq, embed_dim)", s5, s5 ? 1 : 0.5);

  // Shape annotations on the right side
  const annotationX = width - 100;
  const annotations = [
    { y: row1Y + 70, text: "embed_dim", active: s0 || s1 },
    { y: row3Y + 70, text: "= heads x head_dim", active: s2 },
    { y: row5Y + 70, text: "parallel computation", active: s3 },
    { y: row6Y + 70, text: "recombine", active: s4 }
  ];

  annotations.forEach(ann => {
    if (ann.active) {
      svg.append("text")
        .attr("x", annotationX)
        .attr("y", ann.y)
        .attr("text-anchor", "middle")
        .attr("fill", theme.highlight)
        .attr("font-size", "10px")
        .attr("font-style", "italic")
        .attr("opacity", 0.8)
        .text(ann.text);
    }
  });

  return svg.node();
}

What Different Heads Learn

Trained models show head specialization:

Head 0: “Who did what?” - attends to subject-verb pairs
Head 1: “What comes before?” - attends to previous token
Head 2: “What’s similar?” - attends to semantically similar words
Head 3: “Syntax patterns” - attends to grammatical structure

import torch.nn as nn

# Simplified implementation for learning - see attention.py for production version
class MultiHeadAttention(nn.Module):
    """
    Multi-head self-attention with separate Q, K, V projections
    (simplified for illustration).

    embed_dim must be divisible by num_heads; each head works on
    head_dim = embed_dim // num_heads features.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.0):
        super().__init__()
        assert embed_dim % num_heads == 0

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Input projections for queries, keys and values
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)

        # Projection applied after the heads are merged back together
        self.out_proj = nn.Linear(embed_dim, embed_dim)

        # Applied to the attention weights
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected):
        """(batch, seq, embed) -> (batch, heads, seq, head_dim)."""
        batch_size, seq_len, _ = projected.shape
        per_head = projected.view(batch_size, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self, x, mask=None, return_attention=False):
        """
        Run attention over x of shape (batch, seq, embed).

        Positions where mask == 0 are set to -inf before the softmax.
        Returns the output, or (output, attention_weights) when
        return_attention is true; the weights are the post-dropout ones.
        """
        batch_size, seq_len, _ = x.shape

        # Project, then give every head its own slice of the features
        q, k, v = (
            self._split_heads(project(x))
            for project in (self.q_proj, self.k_proj, self.v_proj)
        )

        # Scaled dot-product scores: (batch, heads, seq, seq)
        scores = (q @ k.transpose(-2, -1)) / math.sqrt(q.size(-1))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attention_weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, then merge heads:
        # (batch, heads, seq, head_dim) -> (batch, seq, embed)
        merged = (attention_weights @ v).transpose(1, 2)
        merged = merged.reshape(batch_size, seq_len, self.embed_dim)

        output = self.out_proj(merged)

        return (output, attention_weights) if return_attention else output

# Test multi-head attention
embed_dim = 64
num_heads = 8
mha = MultiHeadAttention(embed_dim=embed_dim, num_heads=num_heads)

x = torch.randn(2, 10, embed_dim)
output, weights = mha(x, return_attention=True)

# Collect the report lines first, then print them in one go
total_params = sum(p.numel() for p in mha.parameters())
report_lines = [
    "Multi-Head Attention Configuration:",
    f"  Embedding dimension: {embed_dim}",
    f"  Number of heads: {num_heads}",
    f"  Dimension per head: {embed_dim // num_heads}",
    f"\nInput shape: {x.shape}",
    f"Output shape: {output.shape}",
    f"Attention weights shape: {weights.shape}",
    "  (batch, heads, query_pos, key_pos)",
    f"\nTotal parameters: {total_params:,}",
]
print("\n".join(report_lines))

Multi-Head Attention Configuration:
  Embedding dimension: 64
  Number of heads: 8
  Dimension per head: 8

Input shape: torch.Size([2, 10, 64])
Output shape: torch.Size([2, 10, 64])
Attention weights shape: torch.Size([2, 8, 10, 10])
  (batch, heads, query_pos, key_pos)

Total parameters: 16,640

Visualizing Multi-Head Attention

# Pass multi-head attention weights to OJS
# First batch item only: one nested list of weights per head.
multi_head_weights = weights[0, :num_heads].detach().numpy().tolist()
ojs_define(mha_weights=multi_head_weights, mha_num_heads=num_heads)

// Multi-Head Attention Patterns Visualization
multiHeadPatterns = {
  const numHeads = mha_num_heads;
  const cols = 4;
  const rows = 2;
  const cellGridSize = 90;
  const padding = 15;
  const headerHeight = 50;

  const width = cols * (cellGridSize + padding) + padding;
  const height = rows * (cellGridSize + padding + 25) + headerHeight + padding;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`)
    .style("font-family", "'IBM Plex Mono', 'JetBrains Mono', monospace");

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", diagramTheme.bg)
    .attr("rx", 10);

  // Main title
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 24)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.highlight)
    .attr("font-size", "14px")
    .attr("font-weight", "700")
    .text("Attention Patterns Across 8 Heads");

  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 42)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .attr("opacity", 0.7)
    .text("Each head learns different patterns");

  // Draw each head's attention pattern
  for (let head = 0; head < numHeads; head++) {
    const row = Math.floor(head / cols);
    const col = head % cols;
    const weights = mha_weights[head];
    const n = weights.length;
    const cellSize = cellGridSize / n;

    const x = padding + col * (cellGridSize + padding);
    const y = headerHeight + row * (cellGridSize + padding + 25);

    const g = svg.append("g")
      .attr("transform", `translate(${x}, ${y})`);

    // Head title
    g.append("text")
      .attr("x", cellGridSize / 2)
      .attr("y", -6)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "11px")
      .attr("font-weight", "600")
      .text(`Head ${head}`);

    // Find max for this head's color scale
    const maxWeight = d3.max(weights.flat());
    const colorScale = d3.scaleSequential(d3.interpolateBlues)
      .domain([0, maxWeight]);

    // Draw cells
    for (let i = 0; i < n; i++) {
      for (let j = 0; j < n; j++) {
        g.append("rect")
          .attr("x", j * cellSize)
          .attr("y", i * cellSize)
          .attr("width", cellSize - 0.5)
          .attr("height", cellSize - 0.5)
          .attr("fill", colorScale(weights[i][j]))
          .attr("rx", 1);
      }
    }

    // Border
    g.append("rect")
      .attr("width", cellGridSize)
      .attr("height", cellGridSize)
      .attr("fill", "none")
      .attr("stroke", diagramTheme.nodeStroke)
      .attr("stroke-width", 1)
      .attr("rx", 3);
  }

  return svg.node();
}

Using Our Attention Module

The attention.py module provides production-ready implementations:

from attention import (
    CausalMultiHeadAttention,
    demonstrate_attention,
    demonstrate_causal_attention
)

# Run built-in demonstrations
# Banner: rule, heading, rule - one line each
print("=" * 60, "MULTI-HEAD ATTENTION DEMONSTRATION", "=" * 60, sep="\n")
demonstrate_attention(seq_len=6, embed_dim=32, num_heads=4)

============================================================
MULTI-HEAD ATTENTION DEMONSTRATION
============================================================
============================================================
ATTENTION DEMONSTRATION
============================================================

Multi-Head Attention:
  Embed dim: 32
  Num heads: 4
  Head dim: 8

Input shape: (1, 6, 32)
  (batch=1, seq_len=6, embed_dim=32)

Output shape: (1, 6, 32)
Attention weights shape: (1, 4, 6, 6)
  (batch, heads, seq, seq)

Attention weights for head 0, position 0:
  [0.04559595137834549, 0.5718222856521606, 0.018782716244459152, 0.07987961918115616, 0.0661456361413002, 0.2177736908197403]
  Sum: 1.0000 (should be 1.0)

MultiHeadAttention(
  (q_proj): Linear(in_features=32, out_features=32, bias=True)
  (k_proj): Linear(in_features=32, out_features=32, bias=True)
  (v_proj): Linear(in_features=32, out_features=32, bias=True)
  (out_proj): Linear(in_features=32, out_features=32, bias=True)
  (attention): ScaledDotProductAttention()
  (dropout): Dropout(p=0.0, inplace=False)
)

# Banner preceded by a blank line: rule, heading, rule
print("", "=" * 60, "CAUSAL ATTENTION DEMONSTRATION", "=" * 60, sep="\n")
demonstrate_causal_attention(seq_len=6)


============================================================
CAUSAL ATTENTION DEMONSTRATION
============================================================
============================================================
CAUSAL ATTENTION DEMONSTRATION
============================================================

Causal mask for seq_len=6:
(1 = can attend, 0 = masked)
  Position 0: █·····
  Position 1: ██····
  Position 2: ███···
  Position 3: ████··
  Position 4: █████·
  Position 5: ██████

Interpretation:
  Position 0: can only see position 0
  Position 1: can see positions 0, 1
  Position 5: can see all positions

Without mask (position 0 attends to all):
  [0.26472821831703186, 0.3044092357158661, 0.07897242158651352, 0.2088378369808197, 0.10276581346988678, 0.04028651863336563]

With causal mask (position 0 only attends to itself):
  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0]

# Causal multi-head attention (what GPT uses)
causal_config = dict(embed_dim=64, num_heads=8, max_seq_len=512, dropout=0.0)
causal_mha = CausalMultiHeadAttention(**causal_config)

x = torch.randn(2, 10, 64)
output, weights = causal_mha(x, return_attention=True)

# Report the shape of each tensor involved
print("\nCausal Multi-Head Attention:")
for label, tensor in (("Input", x), ("Output", output), ("Attention weights", weights)):
    print(f"  {label} shape: {tensor.shape}")


Causal Multi-Head Attention:
  Input shape: torch.Size([2, 10, 64])
  Output shape: torch.Size([2, 10, 64])
  Attention weights shape: torch.Size([2, 8, 10, 10])

PyTorch’s Optimized Attention

Now that we understand attention from scratch, we can turn to PyTorch’s production-optimized implementations, which offer a faster path.

F.scaled_dot_product_attention

PyTorch 2.0+ provides F.scaled_dot_product_attention - a single function that replaces our manual implementation and automatically uses the best available backend.

import torch
import torch.nn.functional as F

# Our inputs, laid out as (batch, num_heads, seq_len, head_dim)
batch, num_heads, seq_len, head_dim = 2, 8, 64, 32
query = torch.randn(batch, num_heads, seq_len, head_dim)
key = torch.randn(batch, num_heads, seq_len, head_dim)
value = torch.randn(batch, num_heads, seq_len, head_dim)

# The manual way (what we implemented)
def manual_attention(q, k, v, is_causal=False):
    """Reference scaled dot-product attention.

    Args:
        q: Query tensor of shape (..., q_len, d_k).
        k: Key tensor of shape (..., k_len, d_k).
        v: Value tensor of shape (..., k_len, d_v).
        is_causal: If True, query position i may only attend to key
            positions <= i.

    Returns:
        Tensor of shape (..., q_len, d_v): the attention-weighted values.
    """
    d_k = q.size(-1)
    scores = q @ k.transpose(-2, -1) / (d_k ** 0.5)
    if is_causal:
        # Size the mask from the score matrix itself rather than from the
        # notebook-level `seq_len`, so the function works for any sequence
        # length, and build it on the same device as the scores.
        q_len, k_len = scores.shape[-2:]
        mask = torch.triu(
            torch.ones(q_len, k_len, device=scores.device), diagonal=1
        ).bool()
        scores = scores.masked_fill(mask, float('-inf'))
    weights = F.softmax(scores, dim=-1)
    return weights @ v

# PyTorch's optimized version
output_manual = manual_attention(query, key, value, is_causal=True)
output_pytorch = F.scaled_dot_product_attention(query, key, value, is_causal=True)

print(f"Manual output shape: {output_manual.shape}")
print(f"PyTorch SDPA shape: {output_pytorch.shape}")
print(f"Results match: {torch.allclose(output_manual, output_pytorch, atol=1e-5)}")

Manual output shape: torch.Size([2, 8, 64, 32])
PyTorch SDPA shape: torch.Size([2, 8, 64, 32])
Results match: True

Flash Attention: The Speed Revolution

F.scaled_dot_product_attention invokes Flash Attention when the backend supports it - a breakthrough algorithm that:

Avoids materializing the full attention matrix (O(n^2) memory -> O(n) memory)
Uses tiling to keep computation in fast GPU SRAM
Fuses operations to minimize memory bandwidth bottleneck

# Report which scaled-dot-product-attention backends are switched on.
# The CUDA backend flags are only queried when a CUDA device is present;
# otherwise a placeholder string is shown instead.
cuda_present = torch.cuda.is_available()
flash_status = torch.backends.cuda.flash_sdp_enabled() if cuda_present else 'N/A (no CUDA)'
mem_efficient_status = (
    torch.backends.cuda.mem_efficient_sdp_enabled() if cuda_present else 'N/A (no CUDA)'
)

print("PyTorch Attention Backends:")
print(f"  Flash Attention: {flash_status}")
print(f"  Memory-efficient: {mem_efficient_status}")
print("  Math (fallback): Always available")

# Same API in every case: F.scaled_dot_product_attention
# picks the fastest available backend for us.

PyTorch Attention Backends:
  Flash Attention: N/A (no CUDA)
  Memory-efficient: N/A (no CUDA)
  Math (fallback): Always available

# Benchmark: manual vs PyTorch SDPA
import time

def benchmark(fn, name, warmup=5, runs=20):
    """Time a zero-argument callable and print its mean latency.

    Args:
        fn: Callable invoked with no arguments; its result is discarded.
        name: Label printed in front of the timing.
        warmup: Number of untimed calls made before measuring.
        runs: Number of timed calls the mean is taken over.

    Returns:
        Mean wall-clock time per call, in milliseconds.
    """
    # Untimed calls first, so one-off setup cost is not measured
    for _ in range(warmup):
        fn()

    # Measured section
    t0 = time.perf_counter()
    for _ in range(runs):
        fn()
    total_seconds = time.perf_counter() - t0

    elapsed = total_seconds / runs * 1000
    print(f"{name}: {elapsed:.2f} ms per call")
    return elapsed

# CPU comparison of the hand-written attention against PyTorch SDPA
# (a GPU would show a more dramatic difference).
batch, num_heads, seq_len, head_dim = 1, 8, 256, 64
qkv_shape = (batch, num_heads, seq_len, head_dim)
q = torch.randn(qkv_shape)
k = torch.randn(qkv_shape)
v = torch.randn(qkv_shape)


def run_manual():
    """Causal attention through the from-scratch implementation."""
    return manual_attention(q, k, v, is_causal=True)


def run_sdpa():
    """Causal attention through PyTorch's fused implementation."""
    return F.scaled_dot_product_attention(q, k, v, is_causal=True)


print(f"\nBenchmark (seq_len={seq_len}, {num_heads} heads, head_dim={head_dim}):")
t_manual = benchmark(run_manual, "Manual attention")
t_sdpa = benchmark(run_sdpa, "PyTorch SDPA")
print(f"\nSpeedup: {t_manual/t_sdpa:.1f}x")


Benchmark (seq_len=256, 8 heads, head_dim=64):
Manual attention: 2.62 ms per call
PyTorch SDPA: 0.75 ms per call

Speedup: 3.5x

From Scratch to Production

What we learned	What PyTorch provides
Q @ K.T / sqrt(d)	Fused kernel, no intermediate storage
Causal mask with -inf	Built-in `is_causal=True` flag
Stable softmax	Numerically stable implementation
Manual loops	Flash Attention tiling

Use F.scaled_dot_product_attention in production. The scratch implementation aids debugging, but the optimized version runs 2-10x faster on GPU.

Exercises

Exercise 1: Verify Attention Row Sums

# Exercise 1: every row of the attention weights should sum to 1.
qkv_shape = (1, 5, 16)
Q = torch.randn(qkv_shape)
K = torch.randn(qkv_shape)
V = torch.randn(qkv_shape)

output, weights = scaled_dot_product_attention(Q, K, V)

# Sum over the last axis of the first batch element's weight matrix
row_sums = weights[0].sum(dim=-1)

print("Attention weights row sums (should all be 1.0):")
print(row_sums)

Attention weights row sums (should all be 1.0):
tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

Exercise 2: Effect of Temperature

# Temperature scaling affects attention sharpness
# Higher temperature = more uniform, Lower = more peaked

def attention_with_temperature(Q, K, V, temperature=1.0):
    """Scaled dot-product attention with an extra softmax temperature.

    The scores Q @ K^T are divided by sqrt(d_k) * temperature before the
    softmax, so temperature < 1 sharpens the distribution and
    temperature > 1 flattens it.

    Args:
        Q: Query tensor of shape (..., q_len, d_k).
        K: Key tensor of shape (..., k_len, d_k).
        V: Value tensor of shape (..., k_len, d_v).
        temperature: Strictly positive softmax temperature.

    Returns:
        Tuple (output, weights): the attended values and the attention
        weights, whose last axis sums to 1.

    Raises:
        ValueError: If temperature is not strictly positive. Zero would
            divide by zero and produce NaNs; a negative value would
            invert the attention ranking.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / (math.sqrt(d_k) * temperature)
    weights = F.softmax(scores, dim=-1)
    output = torch.matmul(weights, V)
    return output, weights

# Seeded Q, K, V so every temperature is compared on the same inputs
torch.manual_seed(42)
Q_temp = torch.randn(1, 4, 8)
K_temp = torch.randn(1, 4, 8)
V_temp = torch.randn(1, 4, 8)

# Attention weights per temperature, stored as nested lists under keys of
# the form "temp_0_5" (the decimal point replaced by an underscore).
temp_weights = {}
for temp in (0.5, 1.0, 2.0):
    _, w = attention_with_temperature(Q_temp, K_temp, V_temp, temperature=temp)
    key_name = "temp_" + str(temp).replace('.', '_')
    temp_weights[key_name] = w[0].detach().numpy().tolist()

# Hand the three weight matrices to the OJS visualization cell
ojs_define(
    temp_weights_05=temp_weights["temp_0_5"],
    temp_weights_10=temp_weights["temp_1_0"],
    temp_weights_20=temp_weights["temp_2_0"],
)

print("Lower temperature = sharper attention (more peaked)")
print("Higher temperature = softer attention (more uniform)")

// Temperature Effect Visualization
temperatureEffectViz = {
  const temperatures = [
    { temp: 0.5, weights: temp_weights_05, label: "Sharp" },
    { temp: 1.0, weights: temp_weights_10, label: "Normal" },
    { temp: 2.0, weights: temp_weights_20, label: "Uniform" }
  ];

  const n = 4;
  const cellSize = 50;
  const matrixSize = cellSize * n;
  const margin = {top: 60, right: 20, bottom: 40, left: 20};
  const spacing = 50;

  const width = temperatures.length * (matrixSize + spacing) - spacing + margin.left + margin.right;
  const height = matrixSize + margin.top + margin.bottom;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`)
    .style("font-family", "'IBM Plex Mono', 'JetBrains Mono', monospace");

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", diagramTheme.bg)
    .attr("rx", 10);

  // Title
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 28)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.highlight)
    .attr("font-size", "14px")
    .attr("font-weight", "700")
    .text("Effect of Temperature on Attention Sharpness");

  const colorScale = d3.scaleSequential(d3.interpolateBlues)
    .domain([0, 1]);

  temperatures.forEach((data, idx) => {
    const x = margin.left + idx * (matrixSize + spacing);
    const y = margin.top;

    const g = svg.append("g")
      .attr("transform", `translate(${x}, ${y})`);

    // Title for this temperature
    g.append("text")
      .attr("x", matrixSize / 2)
      .attr("y", -25)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "12px")
      .attr("font-weight", "600")
      .text(`Temperature = ${data.temp}`);

    g.append("text")
      .attr("x", matrixSize / 2)
      .attr("y", -10)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "10px")
      .attr("opacity", 0.7)
      .text(`(${data.label})`);

    // Draw cells
    for (let i = 0; i < n; i++) {
      for (let j = 0; j < n; j++) {
        const weight = data.weights[i][j];

        g.append("rect")
          .attr("x", j * cellSize)
          .attr("y", i * cellSize)
          .attr("width", cellSize - 2)
          .attr("height", cellSize - 2)
          .attr("fill", colorScale(weight))
          .attr("rx", 3);

        g.append("text")
          .attr("x", j * cellSize + cellSize / 2)
          .attr("y", i * cellSize + cellSize / 2)
          .attr("text-anchor", "middle")
          .attr("dominant-baseline", "central")
          .attr("fill", weight > 0.5 ? diagramTheme.textOnHighlight : diagramTheme.nodeText)
          .attr("font-size", "10px")
          .text(weight.toFixed(2));
      }
    }

    // Border
    g.append("rect")
      .attr("width", matrixSize)
      .attr("height", matrixSize)
      .attr("fill", "none")
      .attr("stroke", diagramTheme.nodeStroke)
      .attr("stroke-width", 1.5)
      .attr("rx", 4);
  });

  // Caption
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", height - 12)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .attr("opacity", 0.7)
    .text("Lower temperature = sharper (peaked) | Higher temperature = softer (uniform)");

  return svg.node();
}

Exercise 3: Compare Single-Head vs Multi-Head

# Exercise 3: one head over the full embedding vs. eight heads over an
# eighth of it each.
embed_dim = 64
seq_len = 8

# One attention pattern over all 64 dimensions
single_head = MultiHeadAttention(embed_dim=embed_dim, num_heads=1)

# Eight attention patterns over 8 dimensions each
multi_head = MultiHeadAttention(embed_dim=embed_dim, num_heads=8)

x = torch.randn(1, seq_len, embed_dim)

out_single, w_single = single_head(x, return_attention=True)
out_multi, w_multi = multi_head(x, return_attention=True)

print("Single-head attention:")
print(f"  Attention weights shape: {w_single.shape}")
print("  One pattern to rule them all")

print("\nMulti-head attention:")
print(f"  Attention weights shape: {w_multi.shape}")
print("  8 different patterns, each can specialize")

# Mean of the first 4 heads' weight matrices (batch element 0), accumulated
# head by head starting from a zero matrix.
heads_to_average = 4
multi_combined = torch.zeros(seq_len, seq_len)
for h in range(heads_to_average):
    multi_combined = multi_combined + w_multi[0, h].detach()
multi_combined = multi_combined / heads_to_average

# Export both matrices, plus their size, to the OJS comparison figure
ojs_define(
    single_head_weights=w_single[0, 0].detach().numpy().tolist(),
    multi_head_avg_weights=multi_combined.numpy().tolist(),
    single_multi_seq_len=seq_len,
)

// Single-Head vs Multi-Head Attention Comparison
singleVsMultiHead = {
  const n = single_multi_seq_len;
  const cellSize = 40;
  const matrixSize = cellSize * n;
  const margin = {top: 50, right: 20, bottom: 30, left: 20};
  const spacing = 80;

  const width = 2 * matrixSize + spacing + margin.left + margin.right;
  const height = matrixSize + margin.top + margin.bottom;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`)
    .style("font-family", "'IBM Plex Mono', 'JetBrains Mono', monospace");

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", diagramTheme.bg)
    .attr("rx", 10);

  // Title
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 26)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.highlight)
    .attr("font-size", "14px")
    .attr("font-weight", "700")
    .text("Single-Head vs Multi-Head Attention");

  const datasets = [
    { weights: single_head_weights, title: "Single Head (64d)", subtitle: "One attention pattern" },
    { weights: multi_head_avg_weights, title: "Multi Head (8 heads x 8d)", subtitle: "Average of first 4 heads" }
  ];

  // Find global max for consistent color scaling
  const globalMax = Math.max(
    d3.max(single_head_weights.flat()),
    d3.max(multi_head_avg_weights.flat())
  );

  const colorScale = d3.scaleSequential(d3.interpolateBlues)
    .domain([0, globalMax]);

  datasets.forEach((data, idx) => {
    const x = margin.left + idx * (matrixSize + spacing);
    const y = margin.top;

    const g = svg.append("g")
      .attr("transform", `translate(${x}, ${y})`);

    // Title
    g.append("text")
      .attr("x", matrixSize / 2)
      .attr("y", -22)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "12px")
      .attr("font-weight", "600")
      .text(data.title);

    g.append("text")
      .attr("x", matrixSize / 2)
      .attr("y", -8)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "10px")
      .attr("opacity", 0.7)
      .text(data.subtitle);

    // Draw cells
    for (let i = 0; i < n; i++) {
      for (let j = 0; j < n; j++) {
        const weight = data.weights[i][j];

        g.append("rect")
          .attr("x", j * cellSize)
          .attr("y", i * cellSize)
          .attr("width", cellSize - 1)
          .attr("height", cellSize - 1)
          .attr("fill", colorScale(weight))
          .attr("rx", 2);
      }
    }

    // Border
    g.append("rect")
      .attr("width", matrixSize)
      .attr("height", matrixSize)
      .attr("fill", "none")
      .attr("stroke", diagramTheme.nodeStroke)
      .attr("stroke-width", 1.5)
      .attr("rx", 4);

    // Axis labels
    g.append("text")
      .attr("x", matrixSize / 2)
      .attr("y", matrixSize + 18)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "10px")
      .attr("opacity", 0.6)
      .text("Key");

    g.append("text")
      .attr("transform", `translate(-12, ${matrixSize / 2}) rotate(-90)`)
      .attr("text-anchor", "middle")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "10px")
      .attr("opacity", 0.6)
      .text("Query");
  });

  return svg.node();
}

Complexity and Optimizations

Attention has:

Time complexity: O(n^2 * d) where n = sequence length, d = embedding dimension
Memory complexity: O(n^2) for storing the attention matrix

For a long sequence (n = 10,000), the O(n²) attention matrix holds 100 million entries — and that is per head, per layer!

// Memory Complexity Bar Chart
memoryComplexityChart = {
  const seqLengths = [512, 1024, 2048, 4096, 8192, 16384, 32768];
  const memoryGB = seqLengths.map(n => (n * n * 4) / (1024 ** 3)); // float32 = 4 bytes

  const width = 650;
  const height = 380;
  const margin = {top: 50, right: 30, bottom: 70, left: 70};
  const innerWidth = width - margin.left - margin.right;
  const innerHeight = height - margin.top - margin.bottom;

  const svg = d3.create("svg")
    .attr("width", width)
    .attr("height", height)
    .attr("viewBox", `0 0 ${width} ${height}`)
    .style("font-family", "'IBM Plex Mono', 'JetBrains Mono', monospace");

  // Background
  svg.append("rect")
    .attr("width", width)
    .attr("height", height)
    .attr("fill", diagramTheme.bg)
    .attr("rx", 10);

  // Title
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", 28)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.highlight)
    .attr("font-size", "14px")
    .attr("font-weight", "700")
    .text("Quadratic Memory Growth: O(n^2) Attention Matrix Size");

  const g = svg.append("g")
    .attr("transform", `translate(${margin.left}, ${margin.top})`);

  // Scales
  const xScale = d3.scaleBand()
    .domain(seqLengths.map(String))
    .range([0, innerWidth])
    .padding(0.2);

  const yScale = d3.scaleLinear()
    .domain([0, d3.max(memoryGB) * 1.15])
    .range([innerHeight, 0]);

  // X axis
  g.append("g")
    .attr("transform", `translate(0, ${innerHeight})`)
    .call(d3.axisBottom(xScale))
    .call(g => g.select(".domain").attr("stroke", diagramTheme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", diagramTheme.nodeStroke))
    .call(g => g.selectAll(".tick text")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "10px")
      .attr("transform", "rotate(-30)")
      .attr("text-anchor", "end")
      .attr("dx", "-0.5em")
      .attr("dy", "0.3em"));

  // X axis label
  g.append("text")
    .attr("x", innerWidth / 2)
    .attr("y", innerHeight + 55)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "12px")
    .text("Sequence Length");

  // Y axis
  g.append("g")
    .call(d3.axisLeft(yScale).ticks(6).tickFormat(d => d.toFixed(1)))
    .call(g => g.select(".domain").attr("stroke", diagramTheme.nodeStroke))
    .call(g => g.selectAll(".tick line").attr("stroke", diagramTheme.nodeStroke))
    .call(g => g.selectAll(".tick text")
      .attr("fill", diagramTheme.nodeText)
      .attr("font-size", "10px"));

  // Y axis label
  g.append("text")
    .attr("transform", "rotate(-90)")
    .attr("x", -innerHeight / 2)
    .attr("y", -50)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "12px")
    .text("Attention Matrix Memory (GB)");

  // Bars
  const bars = g.selectAll(".bar")
    .data(seqLengths)
    .join("rect")
    .attr("class", "bar")
    .attr("x", d => xScale(String(d)))
    .attr("y", innerHeight)
    .attr("width", xScale.bandwidth())
    .attr("height", 0)
    .attr("fill", diagramTheme.accent)
    .attr("rx", 4);

  // Animate bars
  bars.transition()
    .duration(800)
    .delay((d, i) => i * 100)
    .attr("y", (d, i) => yScale(memoryGB[i]))
    .attr("height", (d, i) => innerHeight - yScale(memoryGB[i]));

  // Value labels on bars
  g.selectAll(".label")
    .data(seqLengths)
    .join("text")
    .attr("class", "label")
    .attr("x", d => xScale(String(d)) + xScale.bandwidth() / 2)
    .attr("y", (d, i) => yScale(memoryGB[i]) - 8)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "9px")
    .attr("opacity", 0)
    .text((d, i) => `${memoryGB[i].toFixed(2)} GB`)
    .transition()
    .delay((d, i) => 800 + i * 100)
    .duration(300)
    .attr("opacity", 1);

  // Caption
  svg.append("text")
    .attr("x", width / 2)
    .attr("y", height - 10)
    .attr("text-anchor", "middle")
    .attr("fill", diagramTheme.nodeText)
    .attr("font-size", "11px")
    .attr("opacity", 0.7)
    .text("This is why 32K context models need special techniques!");

  return svg.node();
}

KV Cache for Efficient Inference

During autoregressive generation, we compute attention one token at a time. Without caching, we’d recompute K and V for all previous tokens at each step.

KV Cache: Store computed K and V values for previous tokens:

At step t, only compute K_t and V_t for the new token
Concatenate with cached K_{1:t-1} and V_{1:t-1}
Query only needs the new token’s Q_t

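This reduces the attention cost of generating each new token from O(n²) to O(n) in the current sequence length n: only the new token’s query is scored against the cached keys, instead of recomputing attention for every position.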

Modern Optimizations

Flash Attention (Dao et al., 2022):

Avoids materializing the full n x n attention matrix
Uses tiling and recomputation to be memory-efficient
2-4x faster than standard attention on modern GPUs
PyTorch 2.0+ selects it automatically inside F.scaled_dot_product_attention when the hardware and inputs support it, and most other frameworks now ship it as well

Sparse Attention patterns:

Local attention: Each token only attends to nearby tokens
Strided attention: Attend to every k-th token
Block-sparse: Combine local and strided patterns

Linear Attention approximations:

Replace softmax(QK^T)V with kernel feature maps
Achieves O(n) complexity but may sacrifice quality

Grouped-Query Attention (GQA & MQA)

The KV cache above is what makes generation fast — but it is also what makes generation expensive to remember. At each step we store a key and a value vector for every past token, in every layer, for every head. For a 70B-class model at a few thousand tokens of context, that cache runs to tens of gigabytes and quickly dominates the memory budget, capping how many requests a GPU can serve at once. The bottleneck is not compute — it is the size of the KV cache.

Grouped-Query Attention (GQA) attacks it with one idea: let several query heads share a single key/value head. The queries stay diverse — each head still asks its own question — but they consult a smaller set of keys and values, so there is less to cache.

Multi-Head Attention (MHA) — every query head has its own K/V head. Maximum quality, maximum cache.
Multi-Query Attention (MQA) — all query heads share one K/V head. The cache shrinks by a factor of num_heads, but quality can dip.
Grouped-Query Attention (GQA) — the middle ground: split the query heads into num_kv_heads groups, one shared K/V head per group. Near-MHA quality at a fraction of the cache. This is the setting LLaMA 2/3, Mistral, Qwen, and Gemma actually ship.

Intuition: Share the Keys and Values

Picture the query heads on the left and the key/value heads on the right. In MHA the two columns are the same size and wired one-to-one. GQA keeps every query head but collapses the right column, wiring each group of queries to one shared K/V head. MQA collapses it all the way to a single K/V head. Step through the three variants:

// Step control: MHA → GQA → MQA
// The selected value (0, 1 or 2) is used by gqaDiagram as the index into
// both gqaWirings and gqaStepInfo, so the range must match their lengths.
viewof gqaStep = stepControl({min: 0, max: 2, value: 0, label: "Variant"})

// Title and caption text for each step of the MHA → GQA → MQA walkthrough.
// gqaDiagram looks an entry up with gqaStepInfo[gqaStep], so the order here
// must match the order of gqaWirings.
// NOTE(review): the head counts in the captions (8 query heads, 2 K/V heads)
// are written out by hand, while the diagram reads its counts from
// gqaWirings — keep the two in sync if the wiring data changes.
gqaStepInfo = [
  {
    title: "MHA — Multi-Head Attention",
    caption: "8 query heads, 8 key/value heads, wired one-to-one. Every head caches its own K and V — the largest possible cache."
  },
  {
    title: "GQA — Grouped-Query Attention",
    caption: "8 query heads share just 2 key/value heads (groups of 4). The cache is 4× smaller while queries stay fully diverse."
  },
  {
    title: "MQA — Multi-Query Attention",
    caption: "All 8 query heads share a single key/value head. The cache is 8× smaller — cheapest to store, but quality can suffer."
  }
]

gqaDiagram = {
  const wiring = gqaWirings[gqaStep];
  const info = gqaStepInfo[gqaStep];
  const theme = diagramTheme;

  const width = 720, height = 440;
  const svg = d3.create("svg")
    .attr("viewBox", `0 0 ${width} ${height}`)
    .attr("width", "100%")
    .attr("height", height)
    .style("max-width", `${width}px`)
    .style("font-family", "'IBM Plex Mono', 'JetBrains Mono', monospace");

  svg.append("rect")
    .attr("width", width).attr("height", height)
    .attr("fill", theme.bg).attr("rx", 12);

  // Title + caption
  svg.append("text")
    .attr("x", width / 2).attr("y", 30)
    .attr("text-anchor", "middle")
    .attr("fill", theme.highlight)
    .attr("font-size", "15px").attr("font-weight", "700")
    .text(info.title);

  const nH = wiring.num_heads, nKV = wiring.num_kv_heads;
  const nRep = wiring.n_rep;   // query heads per K/V head — the group size

  // Distinct hue per group, anchored to the theme's highlight colour so it
  // adapts to light/dark mode.
  const base = d3.hsl(theme.highlight);
  const groupColor = g => d3.hsl((base.h + g * 360 / nKV) % 360, 0.62, base.l).toString();

  const topPad = 92, botPad = 96;
  const qX = 210, kvX = 510, r = 15;
  const span = height - topPad - botPad;
  const qY = i => topPad + (nH === 1 ? span / 2 : i * span / (nH - 1));
  const kvY = j => topPad + (nKV === 1 ? span / 2 : j * span / (nKV - 1));

  // Column headers
  svg.append("text").attr("x", qX).attr("y", topPad - 26)
    .attr("text-anchor", "middle").attr("fill", theme.nodeText)
    .attr("font-size", "12px").attr("font-weight", "600")
    .text(`Query heads (${nH})`);
  svg.append("text").attr("x", kvX).attr("y", topPad - 26)
    .attr("text-anchor", "middle").attr("fill", theme.nodeText)
    .attr("font-size", "12px").attr("font-weight", "600")
    .text(`Key/Value heads (${nKV})`);

  // Edges: query head i → kv head (i // nRep)
  svg.append("g").selectAll("line")
    .data(wiring.mapping)
    .join("line")
    .attr("x1", qX + r).attr("y1", d => qY(d.query_head))
    .attr("x2", kvX - r).attr("y2", d => kvY(d.kv_head))
    .attr("stroke", d => groupColor(d.kv_head))
    .attr("stroke-width", 2)
    .attr("opacity", 0.75);

  // Query nodes (coloured by the group they belong to)
  svg.append("g").selectAll("circle.q")
    .data(wiring.mapping)
    .join("circle")
    .attr("cx", qX).attr("cy", d => qY(d.query_head))
    .attr("r", r)
    .attr("fill", d => groupColor(d.kv_head))
    .attr("stroke", theme.nodeStroke).attr("stroke-width", 1.5);
  svg.append("g").selectAll("text.ql")
    .data(wiring.mapping)
    .join("text")
    .attr("x", qX).attr("y", d => qY(d.query_head) + 4)
    .attr("text-anchor", "middle").attr("fill", theme.bg)
    .attr("font-size", "11px").attr("font-weight", "700")
    .text(d => `Q${d.query_head}`);

  // KV nodes
  svg.append("g").selectAll("circle.kv")
    .data(d3.range(nKV))
    .join("circle")
    .attr("cx", kvX).attr("cy", j => kvY(j))
    .attr("r", r + 3)
    .attr("fill", j => groupColor(j))
    .attr("stroke", theme.nodeStroke).attr("stroke-width", 1.5);
  svg.append("g").selectAll("text.kvl")
    .data(d3.range(nKV))
    .join("text")
    .attr("x", kvX).attr("y", j => kvY(j) + 4)
    .attr("text-anchor", "middle").attr("fill", theme.bg)
    .attr("font-size", "11px").attr("font-weight", "700")
    .text(j => `KV${j}`);

  // Cache badge
  svg.append("text")
    .attr("x", width / 2).attr("y", height - 52)
    .attr("text-anchor", "middle")
    .attr("fill", theme.accent).attr("font-size", "13px").attr("font-weight", "700")
    .text(`KV cache ∝ ${nKV} head${nKV === 1 ? "" : "s"}  →  ${nH / nKV}× smaller than MHA`);

  // Caption (wrapped)
  const words = info.caption.split(" ");
  const lines = [];
  let line = "";
  for (const w of words) {
    if ((line + w).length > 78) { lines.push(line.trim()); line = ""; }
    line += w + " ";
  }
  lines.push(line.trim());
  svg.append("g").selectAll("text.cap")
    .data(lines)
    .join("text")
    .attr("x", width / 2)
    .attr("y", (d, i) => height - 30 + i * 16)
    .attr("text-anchor", "middle")
    .attr("fill", theme.nodeText).attr("font-size", "11px").attr("opacity", 0.85)
    .text(d => d);

  return svg.node();
}

Notice the query column never shrinks — the model keeps all 8 distinct queries. Only the key/value column collapses. That is the whole trick: diversity where it is cheap (queries are recomputed each step and never cached), sharing where it is expensive (keys and values are cached for every past token).

The Math: Counting the KV Cache

The cache stores one key and one value vector per token, per layer, per key/value head. Its size is

\[ \text{KV bytes} = 2 \cdot L \cdot n \cdot n_{kv} \cdot d_\text{head} \cdot b \]

where \(L\) is the number of layers, \(n\) the context length, \(n_{kv}\) the number of key/value heads, \(d_\text{head}\) the head dimension, \(b\) the bytes per element (2 for fp16/bf16), and the leading \(2\) counts \(K\) and \(V\).

The quantity we control is \(n_{kv}\). Full MHA sets \(n_{kv} = n_\text{heads}\); MQA sets \(n_{kv} = 1\); GQA picks something in between. So GQA shrinks the cache by exactly

\[ \frac{n_\text{heads}}{n_{kv}} \]

and it shrinks the K and V projection parameters by the same factor (the query projection and the number of query heads are untouched). Attention still runs over all \(n_\text{heads}\) query heads — GQA changes only how many distinct keys and values those heads look at.

Key Insight

MHA gives each of \(n_\text{heads}\) query heads its own K/V; GQA gives each group of query heads one shared K/V; MQA gives them all one. Because the KV cache — not the matrix multiply — is the real inference bottleneck, dividing \(n_{kv}\) by 8 divides the cache by 8 at almost no quality cost. Llama-2-70B uses 64 query heads and 8 K/V heads: an 8× smaller cache than the equivalent MHA model.

The GroupedQueryAttention you build here is a general attention module. In Module 09: Efficient Attention you’ll build its cache-aware, causal counterpart for decode-time generation — and see FlashAttention attack the compute side of the same problem.

Code: Grouped-Query Attention From Scratch

The implementation is MHA with two changes: K and V project to the smaller num_kv_heads, and we repeat_interleave them back up to num_heads so the same scaled_dot_product_attention runs unchanged. This is GroupedQueryAttention in attention.py.

import torch
from attention import GroupedQueryAttention, kv_cache_bytes, demonstrate_gqa

# A GQA layer with 8 query heads reading from 2 shared key/value heads,
# i.e. 4 query heads per group.
gqa = GroupedQueryAttention(embed_dim=256, num_heads=8, num_kv_heads=2)
x = torch.randn(4, 32, 256)                     # (batch, seq, embed)
out, attn = gqa(x, return_attention=True)

# Output row counts of the query and key projection weights
q_rows = gqa.q_proj.weight.shape[0]
k_rows = gqa.k_proj.weight.shape[0]

report = [
    f"Output shape:        {tuple(out.shape)}",
    f"Attn weights shape:  {tuple(attn.shape)}   (all 8 query heads)",
    f"Q projection:        {q_rows} rows (8 heads × 32)",
    f"K projection:        {k_rows} rows (2 heads × 32)",
    f"n_rep (group size):  {gqa.n_rep} query heads per K/V head",
]
print("\n".join(report))

Output shape:        (4, 32, 256)
Attn weights shape:  (4, 8, 32, 32)   (all 8 query heads)
Q projection:        256 rows (8 heads × 32)
K projection:        64 rows (2 heads × 32)
n_rep (group size):  4 query heads per K/V head

The two endpoints fall straight out of num_kv_heads:

# Both ends of the GQA family come from the same class, selected purely by
# num_kv_heads: equal to num_heads for MHA, 1 for MQA.
mha = GroupedQueryAttention(embed_dim=256, num_heads=8, num_kv_heads=8)  # MHA
mqa = GroupedQueryAttention(embed_dim=256, num_heads=8, num_kv_heads=1)  # MQA

variants = {"MHA": mha, "GQA": gqa, "MQA": mqa}
for name, layer in variants.items():
    # Parameters held by the key and value projection weight matrices
    kv_params = sum(proj.weight.numel() for proj in (layer.k_proj, layer.v_proj))
    summary = (
        f"{name}: {layer.num_kv_heads} K/V head(s), "
        f"{kv_params:>6,} K+V params, n_rep {layer.n_rep}"
    )
    print(summary)

MHA: 8 K/V head(s), 131,072 K+V params, n_rep 1
GQA: 2 K/V head(s), 32,768 K+V params, n_rep 4
MQA: 1 K/V head(s), 16,384 K+V params, n_rep 8

The K/V heads are expanded with repeat_interleave, so query head i reads shared head i // n_rep, where n_rep = num_heads // num_kv_heads is the number of query heads sharing each K/V head — the group size. This is the attribute name LLaMA’s repeat_kv uses, and the one Module 09 reuses on its cache-aware build, so the two modules read the same. Because the expansion is exact, MHA (num_kv_heads == num_heads) is a genuine special case — no approximation.

# The payoff, on a realistic decoder (32 layers, 32 heads, d_head=128, 8K ctx):
# head_dim follows from embed_dim / num_heads = 4096 / 32 = 128. The call
# prints the fp16 KV-cache size for the MHA, GQA and MQA settings of this
# configuration and returns the same figures as a dict.
demonstrate_gqa(embed_dim=4096, num_heads=32, seq_len=8192, num_layers=32)

============================================================
GROUPED-QUERY ATTENTION DEMONSTRATION
============================================================
embed_dim=4096, num_heads=32, head_dim=128
KV cache at seq_len=8192, num_layers=32 (fp16):

   MHA: 32 KV heads →   4.00 GB  (1x smaller than MHA)
   GQA:  8 KV heads →   1.00 GB  (4x smaller than MHA)
   MQA:  1 KV heads →   0.12 GB  (32x smaller than MHA)

{'MHA': {'num_kv_heads': 32, 'kv_cache_bytes': 4294967296},
 'GQA': {'num_kv_heads': 8, 'kv_cache_bytes': 1073741824},
 'MQA': {'num_kv_heads': 1, 'kv_cache_bytes': 134217728}}

num_kv_heads must divide num_heads

The query heads split into equal groups, so num_heads % num_kv_heads == 0 must hold — GroupedQueryAttention asserts it. 8 query heads can share 1, 2, 4, or 8 K/V heads, but not 3. Production kernels (F.scaled_dot_product_attention with enable_gqa=True, FlashAttention) skip the physical repeat_interleave for speed, but compute the identical result.

How Much Memory Does It Save?

Choose the number of key/value heads from the dropdown and watch how the KV cache grows with context length. The dashed line is full MHA (32 K/V heads); the solid line is your GQA choice. Fewer K/V heads → a flatter, cheaper curve — the same context for a fraction of the memory.

// Dropdown for the number of key/value heads plotted by kvCacheMemChart.
// Every option divides 32 evenly, matching the rule that num_kv_heads must
// divide num_heads.
// NOTE(review): the 32 in the options and label is written out by hand —
// presumably it mirrors gqaModelConfig.num_heads; confirm they stay in sync.
viewof kvHeads = Inputs.select([1, 2, 4, 8, 16, 32], {
  value: 8,
  label: "K/V heads (of 32 query heads)"
})

kvCacheMemChart = {
  const cfg = gqaModelConfig;                      // {num_heads:32, head_dim:128, num_layers:32}
  const seqLengths = d3.range(1, 33).map(k => k * 1024);  // 1K … 32K tokens
  const bytes = (n, nkv) => 2 * cfg.num_layers * n * nkv * cfg.head_dim * 2; // fp16
  const toGB = b => b / (1024 ** 3);

  const mha = seqLengths.map(n => ({seq: n, gb: toGB(bytes(n, cfg.num_heads)), kind: "MHA (32 K/V heads)"}));
  const chosen = seqLengths.map(n => ({seq: n, gb: toGB(bytes(n, kvHeads)), kind: `Your choice (${kvHeads} K/V heads)`}));

  return Plot.plot({
    width: 700,
    height: 380,
    marginLeft: 60,
    marginBottom: 50,
    style: { background: "transparent", fontSize: "12px" },
    x: { label: "Context length (tokens) →", tickFormat: d => `${d / 1024}K`, grid: true },
    y: { label: "↑ KV cache (GB, fp16)", grid: true },
    color: {
      legend: true,
      domain: ["MHA (32 K/V heads)", `Your choice (${kvHeads} K/V heads)`],
      range: [
        getComputedStyle(document.documentElement).getPropertyValue("--text-secondary").trim() || "#94a3b8",
        getComputedStyle(document.documentElement).getPropertyValue("--diagram-highlight").trim() || "#f97316"
      ]
    },
    marks: [
      Plot.line(mha, { x: "seq", y: "gb", stroke: "kind", strokeDasharray: "5,4", strokeWidth: 2 }),
      Plot.line(chosen, { x: "seq", y: "gb", stroke: "kind", strokeWidth: 3 }),
      Plot.dot(chosen, { x: "seq", y: "gb", fill: "kind", r: 2.5 }),
      Plot.ruleY([0])
    ]
  });
}

Try This

Drop to 1 K/V head (MQA) — the curve flattens to 1/32 of MHA. That is the whole cache budget of a long-context request, reclaimed.
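Set 8 K/V heads (the same K/V head count Llama-2-70B uses) — with this chart’s 32 query heads that is a 4× cut versus MHA while keeping 8 distinct K/V subspaces; Llama-2-70B itself has 64 query heads, which is why its saving is 8×. Note how the curve tracks well below the dashed MHA line.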
Set 32 K/V heads — the solid line lands exactly on the dashed MHA line: GQA with num_kv_heads == num_heads is MHA.

Interactive Exploration

Experiment with attention in real-time. Adjust the temperature to see how it affects the attention distribution:

Low temperature → Sharp, focused attention (nearly one-hot)
High temperature → Soft, diffuse attention (more uniform)

// Example sentence for the interactive explorer. Each token serves as both
// a query (heatmap row) and a key (heatmap column); the order must match
// the rows and columns of exploreSimilarityMatrix.
exploreTokens = ["The", "cat", "sat", "on", "the", "mat"]

// Pre-computed similarity scores (simulating Q·K^T).
// Row i holds the scores for query token i and column j is key token j, both
// in exploreTokens order ("The", "cat", "sat", "on", "the", "mat").
// Every diagonal entry is 1.0, so each token scores highest against itself.
// The table is hand-written and not symmetric (e.g. cat→sat is 0.6 while
// sat→cat is 0.7). Higher values mark pairs treated as semantically related.
exploreSimilarityMatrix = [
  [1.0, 0.2, 0.1, 0.1, 0.9, 0.1],  // "The" - similar to other "the"
  [0.2, 1.0, 0.6, 0.1, 0.2, 0.3],  // "cat" - relates to "sat"
  [0.1, 0.7, 1.0, 0.3, 0.1, 0.4],  // "sat" - relates to "cat", "mat"
  [0.1, 0.1, 0.2, 1.0, 0.1, 0.5],  // "on" - relates to "mat"
  [0.9, 0.2, 0.1, 0.1, 1.0, 0.2],  // "the" - similar to other "The"
  [0.1, 0.4, 0.5, 0.6, 0.2, 1.0],  // "mat" - relates to "sat", "on"
]

// Temperature slider for interactive exploration
viewof exploreTemperature = Inputs.range([0.1, 3.0], {
  value: 1.0,
  step: 0.1,
  label: "Temperature"
})

// Temperature-scaled softmax used by the exploration widgets.
// Takes an array of scores and a temperature, and returns an array of the same
// length whose entries are exp(score / temp) normalised by their sum.
exploreSoftmax = function(arr, temp) {
  // Apply the temperature first, then shift by the largest logit so the
  // biggest exponent is exp(0) = 1 and Math.exp cannot overflow.
  const logits = arr.map(score => score / temp);
  const peak = logits.reduce((best, v) => Math.max(best, v), -Infinity);

  let total = 0;
  const unnormalised = [];
  for (const logit of logits) {
    const e = Math.exp(logit - peak);
    unnormalised.push(e);
    total += e;
  }

  return unnormalised.map(e => e / total);
}

// Row-wise softmax of the similarity scores at the current slider temperature:
// exploreAttentionWeights[i][j] is the weight query token i puts on key token j.
exploreAttentionWeights = exploreSimilarityMatrix.map(queryScores =>
  exploreSoftmax(queryScores, exploreTemperature)
)

// Flatten the weight matrix into one record per (query, key) pair for the
// heatmap, in row-major order: every key for the first query, then the next.
exploreHeatmapData = exploreTokens.flatMap((query, queryIdx) =>
  exploreTokens.map((key, keyIdx) => ({
    query,
    key,
    queryIdx,
    keyIdx,
    weight: exploreAttentionWeights[queryIdx][keyIdx]
  }))
)

// Text and bar colors for the exploration plots, taken from diagramTheme.
// The previous light/dark ternary had two identical branches, so it is
// collapsed into a single object; any light/dark difference has to come from
// the diagramTheme values themselves (presumably resolved per mode — confirm
// against the diagramTheme cell).
// The parentheses are required: a bare `{` would start an OJS block cell.
exploreTheme = ({
  textHigh: diagramTheme.textOnHighlight,  // heatmap labels where weight > 0.5
  textLow: diagramTheme.nodeText,          // heatmap labels on all other cells
  barFill: diagramTheme.accent             // bar color in the per-token chart
})

Plot = import("https://esm.sh/@observablehq/plot@0.6")

// Heatmap of the attention matrix: one cell per (query, key) pair, shaded by
// weight and labelled with the weight rounded to two decimals.
Plot.plot({
  title: "Attention Weights",
  subtitle: `Temperature: ${exploreTemperature.toFixed(1)} — Each row shows where that token "looks"`,
  width: 500,
  height: 400,
  marginLeft: 60,
  marginBottom: 60,
  padding: 0,
  // Both axes list the tokens in sentence order.
  x: { domain: exploreTokens, label: "Key (what we look at) →", tickRotate: -45 },
  y: { domain: exploreTokens, label: "← Query (who is looking)" },
  color: { scheme: "blues", domain: [0, 1], legend: true, label: "Attention Weight" },
  marks: [
    Plot.cell(exploreHeatmapData, { x: "key", y: "query", fill: "weight", tip: true }),
    Plot.text(exploreHeatmapData, {
      x: "key",
      y: "query",
      fontSize: 11,
      text: ({ weight }) => weight.toFixed(2),
      // Cells above 0.5 use the "high" label color, the rest the "low" one.
      fill: ({ weight }) => (weight > 0.5 ? exploreTheme.textHigh : exploreTheme.textLow)
    })
  ]
})

viewof exploreSelectedToken = Inputs.select(exploreTokens, {
  label: "Focus on token",
  value: "sat"
})

// Position of the chosen token in exploreTokens, and that token's row of
// attention weights.
exploreSelectedIdx = exploreTokens.findIndex(token => token === exploreSelectedToken)
exploreSelectedWeights = exploreAttentionWeights[exploreSelectedIdx]

// Caption naming the token whose attention row is being inspected.
md`**"${exploreSelectedToken}"** attends to:`

// Bar chart of the selected token's attention row: one bar per token, with the
// weight (two decimals) printed just above each bar.
Plot.plot({
  width: 500,
  height: 200,
  marginLeft: 60,
  x: { domain: exploreTokens, label: "Token" },
  y: { domain: [0, 1], label: "Attention Weight" },
  marks: [
    Plot.barY(
      exploreTokens.map((token, i) => ({ token, weight: exploreSelectedWeights[i] })),
      { x: "token", y: "weight", fill: exploreTheme.barFill }
    ),
    Plot.text(
      exploreTokens.map((token, i) => ({ token, weight: exploreSelectedWeights[i] })),
      { x: "token", y: "weight", text: ({ weight }) => weight.toFixed(2), dy: -8, fontSize: 11 }
    ),
    Plot.ruleY([0])
  ]
})

Try This

Set temperature to 0.1 — notice how attention becomes nearly one-hot (picks one token)
Set temperature to 3.0 — notice how attention becomes almost uniform
Compare how “sat” attends (after itself, mostly to “cat”) vs. how “the” attends (after itself, mostly to the other “The”)

Common Pitfalls

When implementing attention, watch out for these issues:

Forgetting to scale: Without /sqrt(d_k), training becomes unstable with large head dimensions
Wrong mask dimensions: Mask should broadcast correctly over batch and head dimensions
NaN from all-masked rows: If an entire row is masked, every score is -inf, so softmax has nothing to normalize over (it ends up computing 0/0) and produces NaN. Handle with nan_to_num or ensure at least one position is unmasked
Memory leaks with attention weights: Storing attention weights for visualization can exhaust memory. Only compute when needed

Summary

Key takeaways:

Attention computes weighted sums: Each position’s output is a weighted combination of all (allowed) positions’ values
Q, K, V: Query asks “what do I need?”, Key says “what do I have?”, Value carries the information
Scaling prevents gradient issues: Dividing by sqrt(d_k) keeps softmax from saturating
Causal masking enables generation: In LLMs, we mask future tokens so the model learns to predict the next token
Multiple heads learn different patterns: Each head can specialize in different linguistic relationships
Complexity is O(n^2): Attention’s quadratic cost limits sequence length, motivating optimizations like Flash Attention and KV caching
Grouped-Query Attention shrinks the KV cache: Sharing key/value heads across query heads (GQA, or MQA at the extreme) cuts the cache by a factor of num_heads / num_kv_heads at near-MHA quality — the setting most modern open models ship

Going Deeper

Core Papers:

Fast Transformer Decoding: One Write-Head is All You Need — Shazeer (2019) introduces Multi-Query Attention: collapse to a single K/V head to make the decode-time cache tiny.
GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints — Ainslie et al. (2023) interpolate between MHA and MQA and show a small number of K/V head groups recovers MHA quality at MQA-like cost.
FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness — Dao et al. (2022), the IO-aware kernel that avoids materializing the N×N matrix.
Attention Is All You Need — Vaswani et al. (2017), the original multi-head attention.

Practical Resources:

The Illustrated Transformer — Jay Alammar’s visual walkthrough of attention.
Llama 2 paper, §2.1 — a production model using GQA (64 query heads, 8 K/V heads).

What’s Next

Module 06: Transformer combines attention with feed-forward networks, layer normalization, and residual connections to build a complete transformer decoder block.