Module 01: Tensors

Introduction

Tensors are the foundation of deep learning. Master them before building a language model.

A tensor is a multi-dimensional array of numbers. NumPy users already know tensors. PyTorch tensors match NumPy’s interface but add GPU acceleration and automatic differentiation.

Why do we need them for LLMs?

  • Text becomes numbers: The tokenizer converts text into integer token IDs, which an embedding layer maps to vectors (see the toy sketch after this list)
  • Batching: Processing multiple sequences simultaneously improves efficiency
  • Matrix operations: Attention, embeddings, and neural network layers are built from matrix multiplications
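
As a toy illustration of the first point, here is a hypothetical five-word vocabulary (real tokenizers are learned from data and usually operate on subwords, not whole words):

# Hypothetical toy vocabulary, not a real tokenizer
vocab = {"the": 0, "cat": 1, "sat": 2, "on": 3, "mat": 4}
token_ids = [vocab[word] for word in "the cat sat on the mat".split()]
print(token_ids)  # [0, 1, 2, 3, 0, 4]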

What You’ll Learn

After this module, you will be able to:

  • Understand tensor shapes and what each dimension represents
  • Perform element-wise operations, matrix multiplication, and broadcasting
  • Convert between NumPy arrays and PyTorch tensors
  • Move tensors between CPU and GPU for acceleration
  • Recognize common LLM tensor shapes and their meanings

Tensor Dimensions

Tensors are classified by dimension count: a 0D tensor is a scalar, 1D a vector, 2D a matrix; 3D and above are simply higher-dimensional tensors.

Shape reveals what a tensor represents. In an LLM:

  • (vocab_size,) - A 1D tensor: scores for each word in the vocabulary
  • (seq_len, embed_dim) - A 2D tensor: one embedding vector per token
  • (batch, seq_len, embed_dim) - A 3D tensor: multiple sequences at once

LLM Tensor Shapes
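
A quick sketch of these three shapes, with made-up sizes (a 10-word vocabulary, 4 tokens, 8-dimensional embeddings) for illustration:

import numpy as np

vocab_scores = np.zeros(10)         # (vocab_size,): one score per vocabulary word
token_embeds = np.zeros((4, 8))     # (seq_len, embed_dim): one embedding per token
batch_embeds = np.zeros((2, 4, 8))  # (batch, seq_len, embed_dim): two sequences at once

for name, t in [("1D", vocab_scores), ("2D", token_embeds), ("3D", batch_embeds)]:
    print(f"{name}: shape={t.shape}, ndim={t.ndim}")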

Tensors Are Just Arrays

Start with NumPy: understanding NumPy arrays means understanding tensors.

import numpy as np

# A tensor is just a multi-dimensional array of numbers
scalar = np.array(5.0)           # 0D: a single number
vector = np.array([1, 2, 3])     # 1D: a list of numbers
matrix = np.array([[1, 2],       # 2D: a grid of numbers
                   [3, 4]])
tensor_3d = np.zeros((2, 3, 4))  # 3D: a stack of grids

print(f"Scalar: shape={scalar.shape}, ndim={scalar.ndim}")
print(f"Vector: shape={vector.shape}, ndim={vector.ndim}")
print(f"Matrix: shape={matrix.shape}, ndim={matrix.ndim}")
print(f"3D Tensor: shape={tensor_3d.shape}, ndim={tensor_3d.ndim}")
Scalar: shape=(), ndim=0
Vector: shape=(3,), ndim=1
Matrix: shape=(2, 2), ndim=2
3D Tensor: shape=(2, 3, 4), ndim=3

Shape and Dtype

Every array has two fundamental properties:

  • Shape: The size of each dimension (rows, cols, ...)
  • Dtype: The data type of elements (float32, int64, etc.)

# Shape tells you what the data represents
embeddings = np.random.randn(4, 8)  # 4 tokens, each with 8-dim embedding
print(f"Shape: {embeddings.shape}")
print(f"Dtype: {embeddings.dtype}")
print(f"Total elements: {embeddings.size}")
print(f"Memory: {embeddings.nbytes} bytes")
Shape: (4, 8)
Dtype: float64
Total elements: 32
Memory: 256 bytes

Indexing and Slicing

NumPy’s powerful indexing works identically in PyTorch:

# Create a batch of sequences
batch = np.arange(24).reshape(2, 3, 4)  # (batch=2, seq=3, features=4)
print(f"Full shape: {batch.shape}")
print(f"Original:\n{batch}\n")

# Get first sequence in first batch
print(f"batch[0, 0]: {batch[0, 0]}")

# Get all batches, first token only
print(f"batch[:, 0, :] shape: {batch[:, 0, :].shape}")

# Negative indexing: last element
print(f"batch[0, -1, :]: {batch[0, -1, :]}")

# Boolean indexing
mask = batch > 10
print(f"Elements > 10: {batch[mask]}")
Full shape: (2, 3, 4)
Original:
[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]]

 [[12 13 14 15]
  [16 17 18 19]
  [20 21 22 23]]]

batch[0, 0]: [0 1 2 3]
batch[:, 0, :] shape: (2, 4)
batch[0, -1, :]: [ 8  9 10 11]
Elements > 10: [11 12 13 14 15 16 17 18 19 20 21 22 23]

The Core Operations

Neural networks depend on four fundamental operations: element-wise arithmetic, matrix multiplication, broadcasting, and reshaping. We examine each in NumPy, then in PyTorch.

Element-wise Operations

Apply the same operation to every element:

a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])

# NumPy: element-wise arithmetic
print(f"a + b = {a + b}")
print(f"a * b = {a * b}")
print(f"a ** 2 = {a ** 2}")
print(f"np.exp(a) = {np.exp(a)}")
a + b = [5. 7. 9.]
a * b = [ 4. 10. 18.]
a ** 2 = [1. 4. 9.]
np.exp(a) = [ 2.71828183  7.3890561  20.08553692]

PyTorch works identically:

import torch

a_pt = torch.tensor([1.0, 2.0, 3.0])
b_pt = torch.tensor([4.0, 5.0, 6.0])

# PyTorch: same operations, same syntax
print(f"a + b = {a_pt + b_pt}")
print(f"a * b = {a_pt * b_pt}")
print(f"a ** 2 = {a_pt ** 2}")
print(f"torch.exp(a) = {torch.exp(a_pt)}")
a + b = tensor([5., 7., 9.])
a * b = tensor([ 4., 10., 18.])
a ** 2 = tensor([1., 4., 9.])
torch.exp(a) = tensor([ 2.7183,  7.3891, 20.0855])

Matrix Multiplication

The workhorse of neural networks. For matrices A (m x n) and B (n x p), the result C = A @ B is (m x p); each entry is the dot product of a row of A with a column of B: C[i, j] = sum_k A[i, k] * B[k, j]. For example, below C[0, 0] = 1*5 + 2*7 = 19:

# NumPy matrix multiplication
A = np.array([[1, 2],
              [3, 4]])   # (2, 2)
B = np.array([[5, 6],
              [7, 8]])   # (2, 2)

# Three equivalent ways
result1 = np.matmul(A, B)
result2 = A @ B
result3 = np.dot(A, B)  # same for 2D arrays

print(f"A @ B =\n{result1}")
print(f"Result shape: {result1.shape}")
A @ B =
[[19 22]
 [43 50]]
Result shape: (2, 2)

The @ operator also works for batched operations:

# Batched matrix multiplication in NumPy
batch_A = np.random.randn(4, 3, 2)  # 4 matrices of shape (3, 2)
batch_B = np.random.randn(4, 2, 5)  # 4 matrices of shape (2, 5)

result = batch_A @ batch_B
print(f"Batch matmul: {batch_A.shape} @ {batch_B.shape} = {result.shape}")
Batch matmul: (4, 3, 2) @ (4, 2, 5) = (4, 3, 5)

PyTorch’s @ and torch.matmul behave the same:

# PyTorch matrix multiplication
A_pt = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
B_pt = torch.tensor([[5.0, 6.0], [7.0, 8.0]])

result_pt = A_pt @ B_pt
print(f"PyTorch A @ B =\n{result_pt}")

# Batched
batch_A_pt = torch.randn(4, 3, 2)
batch_B_pt = torch.randn(4, 2, 5)
print(f"Batched: {(batch_A_pt @ batch_B_pt).shape}")
PyTorch A @ B =
tensor([[19., 22.],
        [43., 50.]])
Batched: torch.Size([4, 3, 5])

Broadcasting

When shapes differ, broadcasting expands the smaller array. This is crucial for adding biases, scaling, and many other operations.

The rules are simple:

  1. Align shapes from the right
  2. Each pair of dimensions must be equal, or one of them must be 1
  3. Missing dimensions are treated as 1
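
To make the rules concrete, here is a minimal sketch of the shape-resolution algorithm; broadcast_result_shape is a hypothetical helper written for this module, not a NumPy or PyTorch function:

def broadcast_result_shape(shape_a, shape_b):
    # Rule 3: left-pad the shorter shape with 1s
    ndim = max(len(shape_a), len(shape_b))
    a = (1,) * (ndim - len(shape_a)) + tuple(shape_a)
    b = (1,) * (ndim - len(shape_b)) + tuple(shape_b)
    result = []
    # Rules 1-2: walk the aligned dimensions; each pair must match or contain a 1
    for da, db in zip(a, b):
        if da != db and 1 not in (da, db):
            raise ValueError(f"cannot broadcast {shape_a} with {shape_b}")
        result.append(max(da, db))
    return tuple(result)

print(broadcast_result_shape((4, 3), (3,)))      # (4, 3)
print(broadcast_result_shape((2, 4, 3), (3,)))   # (2, 4, 3)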

# NumPy broadcasting examples
x = np.ones((4, 3))       # (4, 3)
bias = np.array([1, 2, 3]) # (3,) - broadcasts to (4, 3)

result = x + bias
print(f"Shape {x.shape} + {bias.shape} = {result.shape}")
print(f"Result:\n{result}")
Shape (4, 3) + (3,) = (4, 3)
Result:
[[2. 3. 4.]
 [2. 3. 4.]
 [2. 3. 4.]
 [2. 3. 4.]]

# More broadcasting examples
batch = np.ones((2, 4, 3))  # (2, 4, 3)
scale = np.array([[[2]]])   # (1, 1, 1) - broadcasts to (2, 4, 3)
vector = np.array([1, 2, 3]) # (3,) - broadcasts to (2, 4, 3)

print(f"{batch.shape} * {scale.shape} = {(batch * scale).shape}")
print(f"{batch.shape} + {vector.shape} = {(batch + vector).shape}")
(2, 4, 3) * (1, 1, 1) = (2, 4, 3)
(2, 4, 3) + (3,) = (2, 4, 3)

PyTorch broadcasting follows the exact same rules:

# PyTorch broadcasting
embeddings = torch.randn(4, 32, 64)  # (batch, seq, embed)
bias = torch.randn(64)               # (embed,)

result = embeddings + bias  # broadcasts!
print(f"PyTorch: {embeddings.shape} + {bias.shape} = {result.shape}")
PyTorch: torch.Size([4, 32, 64]) + torch.Size([64]) = torch.Size([4, 32, 64])

Key Insight: NumPy to PyTorch

PyTorch tensors are NumPy arrays with superpowers. The API is nearly identical:

NumPy                PyTorch                Notes
np.array([1,2,3])    torch.tensor([1,2,3])  Creation
arr.shape            tensor.shape           Same attribute
arr.dtype            tensor.dtype           Same attribute
np.matmul(a, b)      torch.matmul(a, b)     Or use @
np.exp(x)            torch.exp(x)           Element-wise ops
arr.reshape(2,3)     tensor.reshape(2,3)    Reshaping
arr.T                tensor.T               Transpose

Converting between them requires one function call:

# NumPy <-> PyTorch conversion
np_array = np.array([1.0, 2.0, 3.0])
pt_tensor = torch.from_numpy(np_array)  # Shares memory!
back_to_np = pt_tensor.numpy()          # Shares memory!

print(f"NumPy: {np_array}")
print(f"PyTorch: {pt_tensor}")
print(f"Back to NumPy: {back_to_np}")
NumPy: [1. 2. 3.]
PyTorch: tensor([1., 2., 3.], dtype=torch.float64)
Back to NumPy: [1. 2. 3.]
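
Because the conversion shares memory (for CPU tensors), an in-place change on either side is visible on the other; call .clone() or np.copy first if you need independent data. A quick sketch:

import numpy as np
import torch

np_array = np.array([1.0, 2.0, 3.0])
pt_tensor = torch.from_numpy(np_array)

np_array[0] = 99.0  # modify the NumPy array in place
print(pt_tensor)    # tensor([99., 2., 3.], dtype=torch.float64): the tensor sees the change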

Why PyTorch?

What does PyTorch add beyond NumPy? Three capabilities:

1. GPU Acceleration

NumPy runs only on CPU. PyTorch runs on both CPU and GPU, delivering 10-100x speedups on large matrices:

import time

# Create large matrices
size = 2000
np_a = np.random.randn(size, size).astype(np.float32)
np_b = np.random.randn(size, size).astype(np.float32)

# NumPy (CPU)
start = time.time()
np_result = np_a @ np_b
np_time = time.time() - start
print(f"NumPy (CPU): {np_time*1000:.1f} ms")

# PyTorch (CPU for comparison)
pt_a = torch.from_numpy(np_a)
pt_b = torch.from_numpy(np_b)
start = time.time()
pt_result = pt_a @ pt_b
pt_cpu_time = time.time() - start
print(f"PyTorch (CPU): {pt_cpu_time*1000:.1f} ms")

# PyTorch (GPU if available)
if torch.backends.mps.is_available() or torch.cuda.is_available():
    device = "mps" if torch.backends.mps.is_available() else "cuda"
    pt_a_gpu = pt_a.to(device)
    pt_b_gpu = pt_b.to(device)

    # Warm up: first GPU operation incurs overhead (kernel compilation, memory allocation)
    _ = pt_a_gpu @ pt_b_gpu
    # Synchronize: GPU operations are async, so we wait for completion before timing
    if device == "mps":
        torch.mps.synchronize()
    else:
        torch.cuda.synchronize()

    start = time.time()
    pt_result_gpu = pt_a_gpu @ pt_b_gpu
    # Must synchronize again to ensure the operation completes before stopping the timer
    if device == "mps":
        torch.mps.synchronize()
    else:
        torch.cuda.synchronize()
    pt_gpu_time = time.time() - start
    print(f"PyTorch ({device.upper()}): {pt_gpu_time*1000:.1f} ms")
    print(f"GPU speedup: {np_time/pt_gpu_time:.1f}x faster")
NumPy (CPU): 164.3 ms
PyTorch (CPU): 189.7 ms

2. Automatic Differentiation

PyTorch records every operation on a tensor so it can compute gradients automatically; this automatic differentiation is what makes neural networks trainable:

# NumPy: you'd have to compute gradients by hand
x_np = np.array([2.0])
y_np = x_np ** 2 + 3 * x_np
# dy/dx = 2x + 3 = 7 at x=2... but you have to derive and code this yourself!

# PyTorch: automatic!
x_pt = torch.tensor([2.0], requires_grad=True)
y_pt = x_pt ** 2 + 3 * x_pt
y_pt.backward()  # Compute gradient automatically
print(f"x = {x_pt.item()}")
print(f"y = x^2 + 3x = {y_pt.item()}")
print(f"dy/dx (computed automatically) = {x_pt.grad.item()}")
x = 2.0
y = x^2 + 3x = 10.0
dy/dx (computed automatically) = 7.0

This automatic differentiation scales to millions of parameters. Module 02: Autograd explores this deeply.

3. Optimized Kernels

PyTorch uses optimized backends (cuBLAS, cuDNN, MPS) that outperform naive implementations, even on CPU. Operations like convolutions, attention, and batch normalization have specialized implementations.

# PyTorch's optimized softmax vs manual
x = torch.randn(1000, 1000)

# Manual softmax (correct but slower)
def manual_softmax(x):
    exp_x = torch.exp(x - x.max(dim=-1, keepdim=True).values)
    return exp_x / exp_x.sum(dim=-1, keepdim=True)

# PyTorch's optimized version
import time

start = time.time()
for _ in range(100):
    _ = manual_softmax(x)
manual_time = time.time() - start

start = time.time()
for _ in range(100):
    _ = torch.softmax(x, dim=-1)
pytorch_time = time.time() - start

print(f"Manual softmax: {manual_time*1000:.1f} ms")
print(f"PyTorch softmax: {pytorch_time*1000:.1f} ms")
Manual softmax: 246.5 ms
PyTorch softmax: 104.1 ms

The Key Insight

PyTorch tensors are NumPy arrays with GPU acceleration and automatic differentiation:

  • Same API, same intuition
  • GPU acceleration for speed
  • Automatic gradients for training
  • Optimized kernels under the hood

Use NumPy for learning concepts; use PyTorch for production models.

Code Walkthrough

Explore tensors interactively:

import torch

print(f"PyTorch version: {torch.__version__}")
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Device: {device}")
PyTorch version: 2.10.0+cu128
Device: cpu

Creating Tensors

# From a list
x = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
print(f"Shape: {x.shape}")
print(f"Dtype: {x.dtype}")
print(f"Device: {x.device}")
print(x)
Shape: torch.Size([2, 3])
Dtype: torch.float32
Device: cpu
tensor([[1., 2., 3.],
        [4., 5., 6.]])

# Random tensors (common for initialization)
random_tensor = torch.randn(2, 3, 4)  # Normal distribution (mean=0, std=1)
print(f"Random tensor shape: {random_tensor.shape}")
print(f"Mean: {random_tensor.mean():.4f}, Std: {random_tensor.std():.4f}")
Random tensor shape: torch.Size([2, 3, 4])
Mean: -0.0696, Std: 0.9891

Data Types (dtypes)

Dtype choice determines numerical precision and memory usage:

# Default is float32 (32 bits = 4 bytes per number)
t32 = torch.randn(1000, 1000)
print(f"float32: {t32.element_size()} bytes per element, total: {t32.numel() * t32.element_size() / 1e6:.1f} MB")

# float16 uses half the memory but lower precision
t16 = torch.randn(1000, 1000, dtype=torch.float16)
print(f"float16: {t16.element_size()} bytes per element, total: {t16.numel() * t16.element_size() / 1e6:.1f} MB")

# bfloat16: same exponent bits as float32 (8 bits) for better dynamic range,
# but fewer mantissa bits than float16, trading precision for stability
tbf16 = torch.randn(1000, 1000, dtype=torch.bfloat16)
print(f"bfloat16: {tbf16.element_size()} bytes per element")
float32: 4 bytes per element, total: 4.0 MB
float16: 2 bytes per element, total: 2.0 MB
bfloat16: 2 bytes per element

When to use each:

  • float32: Default, good for learning and debugging
  • float16: Inference on GPUs with Tensor Cores, half the memory
  • bfloat16: Training large models; better numerical stability than float16 (see the range sketch after this list)
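
The range difference is easy to demonstrate: float16 overflows past its maximum (about 65504), while bfloat16's float32-sized exponent keeps large magnitudes finite, just coarsely rounded. A minimal sketch:

import torch

big = 70000.0
print(torch.tensor(big, dtype=torch.float16))   # tensor(inf, dtype=torch.float16)
print(torch.tensor(big, dtype=torch.bfloat16))  # finite but coarsely rounded (about 70000)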

Reshaping

Multi-head attention requires reshaping to split the embedding dimension across heads:

# Reshape for multi-head attention
batch, seq, embed = 4, 32, 64
num_heads = 8
head_dim = embed // num_heads

x = torch.randn(batch, seq, embed)
print(f"Original: {x.shape}")

# Split into heads
x_heads = x.view(batch, seq, num_heads, head_dim)
print(f"After view: {x_heads.shape}")

# Transpose for attention computation
x_heads = x_heads.transpose(1, 2)  # (batch, heads, seq, head_dim)
print(f"After transpose: {x_heads.shape}")
Original: torch.Size([4, 32, 64])
After view: torch.Size([4, 32, 8, 8])
After transpose: torch.Size([4, 8, 32, 8])
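
After attention, the heads are merged back with the inverse operations. A sketch continuing from the variables above (the contiguous() call is explained in the next subsection):

# Merge heads: undo the transpose, then collapse (heads, head_dim) back into embed
x_merged = x_heads.transpose(1, 2).contiguous().view(batch, seq, embed)
print(f"After merging heads: {x_merged.shape}")  # torch.Size([4, 32, 64])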

Memory Layout: view vs reshape vs contiguous

Understanding memory layout prevents contiguity errors. Tensors store data in a flat 1D array; strides specify how many elements to skip when traversing each dimension. Operations like transpose change the logical order without moving data. The result: a non-contiguous tensor whose strides no longer match row-major layout:
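
You can inspect strides directly; stride() reports, for each dimension, how many elements to skip in the flat buffer. A short sketch:

import torch

x = torch.randn(3, 4)
print(x.stride())                  # (4, 1): next row skips 4 elements, next column skips 1
print(x.transpose(0, 1).stride())  # (1, 4): same buffer, swapped strides, no data moved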

# view() requires contiguous memory - it's a zero-copy operation
x = torch.randn(3, 4)
print(f"Original is contiguous: {x.is_contiguous()}")

# Transpose creates a non-contiguous view (same memory, different strides)
x_t = x.transpose(0, 1)
print(f"Transposed is contiguous: {x_t.is_contiguous()}")

# view() fails on non-contiguous tensors
try:
    x_t.view(12)  # This will fail
except RuntimeError as e:
    print(f"Error: {e}")

# contiguous() makes a copy with proper memory layout
x_t_contig = x_t.contiguous()
print(f"After contiguous(): {x_t_contig.is_contiguous()}")
x_t_contig.view(12)  # Now it works
print("view() works after contiguous()")

# reshape() handles this automatically (but may copy)
reshaped = x_t.reshape(12)  # Always works
print(f"reshape() auto-handles non-contiguous: {reshaped.shape}")
Original is contiguous: True
Transposed is contiguous: False
Error: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
After contiguous(): True
view() works after contiguous()
reshape() auto-handles non-contiguous: torch.Size([12])

Rule of thumb: Use reshape() by default; use view() when you need zero-copy behavior.

Matrix Multiplication

Matrix multiplication dominates neural network computation. The @ operator (or torch.matmul) handles batched operations automatically:

# Simulating Q @ K^T in attention
Q = torch.randn(2, 8, 32, 8)  # (batch, heads, seq, head_dim)
K = torch.randn(2, 8, 32, 8)

# Attention scores
scores = Q @ K.transpose(-2, -1)  # (batch, heads, seq, seq)
print(f"Q shape: {Q.shape}")
print(f"K^T shape: {K.transpose(-2, -1).shape}")
print(f"Scores shape: {scores.shape}")
Q shape: torch.Size([2, 8, 32, 8])
K^T shape: torch.Size([2, 8, 8, 32])
Scores shape: torch.Size([2, 8, 32, 32])

Key insight: Leading dimensions broadcast automatically; the last two dimensions follow matrix multiplication rules: (m, k) @ (k, n) -> (m, n).
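
One consequence: a single unbatched matrix can multiply an entire batch, because its missing leading dimensions broadcast. A small sketch:

import torch

x = torch.randn(2, 8, 32, 8)  # (batch, heads, seq, head_dim)
W = torch.randn(8, 16)        # one projection matrix, shared across batch and heads

out = x @ W  # W broadcasts over the leading (2, 8) dimensions
print(out.shape)  # torch.Size([2, 8, 32, 16])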

Preview: Softmax in LLMs

Softmax converts raw scores (logits) into a probability distribution. You’ll use it constantly in attention weights and next-token prediction.

\[\text{softmax}(z_i) = \frac{e^{z_i}}{\sum_{j=1}^{n} e^{z_j}}\]

import torch

logits = torch.tensor([2.0, 1.0, 0.1])
probs = torch.softmax(logits, dim=-1)

print(f"Logits: {logits.tolist()}")
print(f"Probs:  {[f'{p:.3f}' for p in probs.tolist()]}")
print(f"Sum:    {probs.sum():.3f}")
Logits: [2.0, 1.0, 0.10000000149011612]
Probs:  ['0.659', '0.242', '0.099']
Sum:    1.000

The highest logit (2.0) gets ~65% of the probability mass. The dim parameter specifies which dimension sums to 1.

Tip: Always use torch.softmax() — it handles numerical stability automatically. Module 05: Attention and Module 08: Generation cover softmax in depth.
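
To see why stability matters, here is a quick sketch: a naive softmax overflows on large logits, while torch.softmax (implemented in a numerically stable way) stays finite:

import torch

logits = torch.tensor([1000.0, 1001.0])

naive = torch.exp(logits) / torch.exp(logits).sum()  # exp(1000) overflows to inf
print(naive)                          # tensor([nan, nan])
print(torch.softmax(logits, dim=-1))  # tensor([0.2689, 0.7311])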

Common Operations in LLMs

These operations appear everywhere in transformer models:

# Layer Normalization (normalizes features, not batch)
x = torch.randn(4, 32, 64)  # (batch, seq, embed)
mean = x.mean(dim=-1, keepdim=True)
std = x.std(dim=-1, keepdim=True)
x_norm = (x - mean) / (std + 1e-5)
print(f"LayerNorm output shape: {x_norm.shape}")
print(f"Mean per token: {x_norm.mean(dim=-1)[0, :3]}")  # Should be ~0

# Linear projection (the most common operation)
W = torch.randn(64, 256)  # (in_features, out_features)
b = torch.randn(256)       # (out_features,)
x = torch.randn(4, 32, 64) # (batch, seq, in_features)
out = x @ W + b            # Broadcasting adds bias
print(f"Linear output shape: {out.shape}")
LayerNorm output shape: torch.Size([4, 32, 64])
Mean per token: tensor([-1.8626e-08, -1.4901e-08, -2.0489e-08])
Linear output shape: torch.Size([4, 32, 256])

Broadcasting in Action

# Adding bias to all tokens in a batch
embeddings = torch.randn(4, 32, 64)  # (batch, seq, embed)
bias = torch.randn(64)               # (embed,)

result = embeddings + bias  # Broadcasts!
print(f"Embeddings: {embeddings.shape}")
print(f"Bias: {bias.shape}")
print(f"Result: {result.shape}")
Embeddings: torch.Size([4, 32, 64])
Bias: torch.Size([64])
Result: torch.Size([4, 32, 64])

Device Management (CPU vs GPU)

Moving tensors between devices is essential for GPU acceleration:

# Check available devices
print(f"MPS available: {torch.backends.mps.is_available()}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Create tensor on specific device
device = "mps" if torch.backends.mps.is_available() else "cpu"
x = torch.randn(1000, 1000, device=device)
print(f"Tensor device: {x.device}")

# Move existing tensor to device
y = torch.randn(1000, 1000)  # Created on CPU by default
y = y.to(device)             # Move to GPU
print(f"After .to(): {y.device}")
MPS available: False
CUDA available: False
Tensor device: cpu
After .to(): cpu

Common pitfall: PyTorch requires tensors to share the same device for operations:

# This would fail if devices differ:
# z = x_cpu + x_gpu  # RuntimeError!

# Always ensure tensors are on the same device
a = torch.randn(100, device=device)
b = torch.randn(100, device=device)
c = a + b  # Works!
print(f"Both on {device}: operation succeeded")
Both on cpu: operation succeeded

Interactive Exploration

Pick two tensor shapes and work out how broadcasting aligns and expands each dimension; you can check each case with torch.broadcast_shapes, as sketched after the list.

Tip: Try This
  1. Simple broadcast: Try (4, 1) and (1, 3). Both have a 1, so they broadcast to (4, 3).

  2. Scalar broadcast: Try (3, 4) and (1). A scalar broadcasts to any shape.

  3. Same shapes: Try (2, 3) and (2, 3). No broadcasting needed - shapes are identical.

  4. Incompatible shapes: Try (3, 4) and (2, 4). The first dimension (3 vs 2) can’t broadcast because neither is 1.

  5. Real-world example: Try (32, 10, 64) (a batch of sequences) and (64,) (a bias vector). The bias broadcasts across the batch and sequence dimensions.
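
To check these cases programmatically, torch.broadcast_shapes applies the same rules and raises an error for incompatible shapes:

import torch

print(torch.broadcast_shapes((4, 1), (1, 3)))       # torch.Size([4, 3])
print(torch.broadcast_shapes((3, 4), (1,)))         # torch.Size([3, 4])
print(torch.broadcast_shapes((32, 10, 64), (64,)))  # torch.Size([32, 10, 64])

try:
    torch.broadcast_shapes((3, 4), (2, 4))
except RuntimeError as e:
    print(f"Incompatible: {e}")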

Exercises

Exercise 1: Create an Embedding Lookup

# Create a vocabulary embedding table
vocab_size = 100
embed_dim = 32

embedding_table = torch.randn(vocab_size, embed_dim)
print(f"Embedding table: {embedding_table.shape}")

# Look up embeddings for token IDs
token_ids = torch.tensor([5, 23, 7, 42])
embeddings = embedding_table[token_ids]
print(f"Token IDs: {token_ids}")
print(f"Embeddings shape: {embeddings.shape}")
Embedding table: torch.Size([100, 32])
Token IDs: tensor([ 5, 23,  7, 42])
Embeddings shape: torch.Size([4, 32])

Exercise 2: Simulate Simple Attention

seq_len = 6
embed_dim = 8

# Token embeddings
tokens = torch.randn(seq_len, embed_dim)

# Compute attention scores (dot product similarity)
scores = tokens @ tokens.T
print(f"Attention scores shape: {scores.shape}")

# Apply softmax to get weights
attention_weights = torch.softmax(scores, dim=-1)
print(f"Attention weights shape: {attention_weights.shape}")
Attention scores shape: torch.Size([6, 6])
Attention weights shape: torch.Size([6, 6])

Exercise 3: Apply Attention

# Weighted combination of values
output = attention_weights @ tokens
print(f"Input: {tokens.shape}")
print(f"Weights: {attention_weights.shape}")
print(f"Output: {output.shape}")
print("\nEach output token is a weighted average of ALL input tokens!")
Input: torch.Size([6, 8])
Weights: torch.Size([6, 6])
Output: torch.Size([6, 8])

Each output token is a weighted average of ALL input tokens!

Common Pitfalls

Avoid these common mistakes:

1. Shape Mismatches

Always print shapes when debugging. Most errors come from unexpected dimensions.

# BAD: Silent broadcasting can hide bugs
a = torch.randn(4, 3)
b = torch.randn(3)     # Did you mean (4, 3)?
result = a + b         # Works due to broadcasting, but may not be intended!

# GOOD: Verify shapes explicitly
print(f"a: {a.shape}, b: {b.shape}, result: {result.shape}")
assert a.shape == (4, 3), f"Expected (4, 3), got {a.shape}"
a: torch.Size([4, 3]), b: torch.Size([3]), result: torch.Size([4, 3])

2. Device Mismatches

Tensors must be on the same device for operations.

device = "mps" if torch.backends.mps.is_available() else "cpu"
x_cpu = torch.randn(3)
x_gpu = torch.randn(3, device=device)

# BAD: This fails if device != cpu
# result = x_cpu + x_gpu  # RuntimeError!

# GOOD: Ensure same device
x_cpu = x_cpu.to(device)
result = x_cpu + x_gpu
print(f"Both on {device}: operation succeeded")
Both on cpu: operation succeeded

3. In-place Operations Break Gradients

Methods ending in _ modify tensors in-place and break gradient computation.

x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = x * 2

# BAD: In-place modification of a leaf tensor that requires grad
# x.add_(1)  # RuntimeError: a leaf Variable that requires grad is being used in an in-place operation

# GOOD: Create new tensor
y = y + 1
loss = y.sum()
loss.backward()
print(f"Gradient computed: {x.grad}")
Gradient computed: tensor([2., 2., 2.])

4. Forgetting contiguous()

After transpose/permute, the tensor may not be contiguous in memory.

x = torch.randn(3, 4)
x_t = x.transpose(0, 1)  # Shape (4, 3), but not contiguous

print(f"Original contiguous: {x.is_contiguous()}")
print(f"Transposed contiguous: {x_t.is_contiguous()}")

# BAD: view() requires contiguous memory
# x_t.view(12)  # RuntimeError!

# GOOD: Make contiguous first, or use reshape
x_t_contig = x_t.contiguous().view(12)  # Works
x_t_reshaped = x_t.reshape(12)          # Also works (may copy)
Original contiguous: True
Transposed contiguous: False

5. dtype Mismatches

Operations between different dtypes may silently upcast or fail.

a = torch.tensor([1.0, 2.0], dtype=torch.float32)
b = torch.tensor([1.0, 2.0], dtype=torch.float16)

# Mixed dtypes are promoted automatically: float16 + float32 -> float32 (may waste memory)
result = a + b
print(f"Result dtype: {result.dtype}")

# GOOD: Be explicit about dtype
a_fp16 = a.to(torch.float16)
result = a_fp16 + b  # Both float16
print(f"Explicit dtype: {result.dtype}")
Result dtype: torch.float32
Explicit dtype: torch.float16

Summary

Key takeaways:

  1. Tensors are multi-dimensional arrays - their shape tells you what they represent
  2. Broadcasting automatically expands smaller tensors to match larger ones
  3. Matrix multiplication is the core operation - inner dimensions must match
  4. Reshaping reorganizes dimensions without changing total elements
  5. Memory layout matters - understand contiguous vs strided for efficient operations
  6. Device placement - use GPU (MPS/CUDA) for 10-100x speedup on large tensors
  7. Data types - float32 for learning, float16/bfloat16 for production

What’s Next

Module 02: Autograd shows how PyTorch automatically computes gradients through all these operations - the mechanism that makes neural networks trainable.