Module 03: Tokenization

Introduction

Converting text into numbers: before a language model can process text, we need to break it into tokens and map each token to a number.

Tokenization is how we convert raw text into a sequence of integers that the model can process. Modern LLMs use subword tokenization - they break text into pieces smaller than words but larger than characters.

Why subword tokenization?

  • Word-level: Can’t handle new words (OOV problem), huge vocabulary needed (millions for multilingual)
  • Character-level: Sequences become 4-5x longer, the O(n^2) attention cost explodes, and the model must learn spelling from scratch
  • Subword: Best of both worlds - handles new words via decomposition, reasonable sequence length

The most common algorithm is BPE (Byte Pair Encoding), originally invented for data compression in 1994 and adapted for NLP in 2016:

  1. Start with individual characters as the initial vocabulary
  2. Count all adjacent token pairs in the training corpus
  3. Merge the most frequent pair into a new token
  4. Add the merged token to the vocabulary
  5. Repeat until vocabulary size reached

What You’ll Learn

By the end of this module, you will be able to:

  • Build a character-level tokenizer from scratch
  • Understand why subword tokenization is necessary
  • Implement the BPE algorithm for training and encoding
  • Handle special tokens (PAD, UNK, BOS, EOS)
  • Recognize trade-offs in vocabulary size

But before diving into BPE, let’s build the simplest possible tokenizer from scratch.

The Simplest Tokenizer

The most straightforward approach: treat each character as a token.

# Build vocabulary from text
text = "hello world"
chars = sorted(set(text))
print(f"Unique characters: {chars}")
print(f"Vocabulary size: {len(chars)}")
# The core of any tokenizer: two lookup tables
stoi = {ch: i for i, ch in enumerate(chars)}  # string to integer
itos = {i: ch for i, ch in enumerate(chars)}  # integer to string

print("stoi (encode):", stoi)
print("itos (decode):", itos)
# Encode: text -> integers
def encode(text):
    return [stoi[ch] for ch in text]

# Decode: integers -> text
def decode(ids):
    return ''.join(itos[i] for i in ids)

# Try it out
encoded = encode("hello")
print(f"'hello' -> {encoded}")
print(f"{encoded} -> '{decode(encoded)}'")
# Round-trip test
original = "hello world"
reconstructed = decode(encode(original))
print(f"Original:      '{original}'")
print(f"Reconstructed: '{reconstructed}'")
print(f"Perfect round-trip: {original == reconstructed}")

That’s it! A complete tokenizer in about 10 lines of Python. Every tokenizer — no matter how sophisticated — has these same two operations:

  • encode: text to token IDs
  • decode: token IDs back to text

The Key Insight

Tokenization is really about compression and semantic grouping:

Tokenization   Vocabulary Size    Sequence Length   Semantics
Character      ~100 (ASCII)       Very long         None (individual letters)
Word           ~1,000,000+        Short             Strong (whole words)
Subword        ~30,000-100,000    Medium            Moderate (meaningful pieces)

Why Characters Aren’t Enough

Our character tokenizer works, but has serious problems at scale.

Problem 1: Long Sequences

sample_text = "The transformer architecture revolutionized natural language processing."
char_tokens = list(sample_text)
print(f"Text length: {len(sample_text)} characters")
print(f"Token count: {len(char_tokens)} tokens")
print(f"Compression ratio: {len(sample_text) / len(char_tokens):.2f}x (no compression!)")

Since attention is O(n^2) in sequence length, doubling the sequence length quadruples the compute cost. Character-level tokenization produces the longest possible sequences.

Problem 2: No Semantic Units

# The model sees this:
word = "transformer"
char_view = list(word)
print(f"Characters: {char_view}")
print(f"Token count: {len(char_view)}")

The model must learn from scratch that t-r-a-n-s-f-o-r-m-e-r is a meaningful unit. It gets no help from the tokenization. Compare this to word-level where “transformer” would be a single token with its own learned representation.

Problem 3: Vocabulary Explosion for Bytes

# If we go to byte-level (handling all Unicode)
text_with_emoji = "Hello! \U0001F60A"
byte_view = text_with_emoji.encode('utf-8')
print(f"Text: {text_with_emoji}")
print(f"Bytes: {list(byte_view)}")
print(f"Byte count: {len(byte_view)} (emoji = 4 bytes!)")

Byte-level tokenization can represent anything, but sequences become even longer. A single emoji becomes 4 tokens.

The Tradeoff

This is the fundamental tradeoff in tokenization:

Characters: Small vocab, long sequences, no semantics
Words:      Huge vocab, short sequences, good semantics, can't handle new words
Subwords:   Medium vocab, medium sequences, some semantics, handles new words

BPE finds a sweet spot by learning which character sequences appear frequently together and merging them into single tokens.

Intuition: Learning Patterns Through Merging

Think of BPE as compression that learns common patterns:

For code, BPE learns things like:

  • def (function definition with space)
  • self. (common in Python classes)
  • return (return statement)
  • '    ' (a run of four spaces - the standard Python indent)

The BPE Training Algorithm

Here’s how BPE learns to tokenize.

The Math

BPE is simple - just counting and merging:

# Count how often each adjacent pair occurs
pairs = count_pairs(tokens)  # {('h', 'e'): 50, ('e', 'l'): 30, ('l', 'l'): 80, ...}

# Find the most frequent pair
best_pair = max(pairs, key=pairs.get)  # ('l', 'l')

# Merge it everywhere it occurs
tokens = merge(tokens, best_pair, 'll')
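
The snippet above glosses over the helpers, so here is a minimal runnable sketch of a single training step. The count_pairs and merge functions below are illustrative stand-ins, not the implementations inside our BPETokenizer:

from collections import Counter

def count_pairs(tokens):
    """Count how often each adjacent pair of tokens occurs."""
    return Counter(zip(tokens, tokens[1:]))

def merge(tokens, pair, new_token):
    """Replace every occurrence of pair with new_token."""
    merged, i = [], 0
    while i < len(tokens):
        if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) == pair:
            merged.append(new_token)
            i += 2
        else:
            merged.append(tokens[i])
            i += 1
    return merged

# One training step on a toy corpus
tokens = list("hello hello hello")
pairs = count_pairs(tokens)
best_pair = max(pairs, key=pairs.get)  # ('h', 'e') here (ties broken by first occurrence)
tokens = merge(tokens, best_pair, ''.join(best_pair))
print(best_pair, '->', tokens)

Repeating this loop until the vocabulary reaches its target size is the entire training algorithm.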

Vocabulary size is a hyperparameter:

  • Too small: Sequences too long, less meaning per token
  • Too large: Many rare tokens, harder to learn
  • Typical: roughly 32K-100K tokens for modern LLMs (see the real-world sizes later in this module)

Encoding New Text

Once trained, encoding applies merges in the order they were learned:
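
Here is a minimal sketch of that process (not the BPETokenizer API - the merge list below is made up for illustration):

# Hypothetical merges, in the order they were learned during training
merges = [(('h', 'e'), 'he'), (('l', 'l'), 'll'),
          (('he', 'll'), 'hell'), (('hell', 'o'), 'hello')]

def bpe_encode(text, merges):
    """Start from characters, then apply each learned merge in order."""
    tokens = list(text)
    for pair, new_token in merges:
        merged, i = [], 0
        while i < len(tokens):
            if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) == pair:
                merged.append(new_token)
                i += 2
            else:
                merged.append(tokens[i])
                i += 1
        tokens = merged
    return tokens

print(bpe_encode("hello", merges))  # ['hello']
print(bpe_encode("help", merges))   # ['he', 'l', 'p']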

Handling Unknown Words

BPE can handle words it has never seen: an unfamiliar word simply decomposes into smaller known subwords and, in the worst case, into individual characters (or bytes, for byte-level BPE). In the sketch above, 'help' was never merged into a single token, yet it still encodes as ['he', 'l', 'p'] - nothing is out of vocabulary as long as its characters are.

Special Tokens

Before diving into code, let’s understand special tokens - reserved tokens with specific meanings in the LLM pipeline:

Token          Purpose                 When Used
<PAD> (ID 0)   Padding                 Batch processing requires same-length sequences; padding fills the shorter ones.
<UNK> (ID 1)   Unknown                 Characters not seen during training. Production tokenizers avoid this with byte-level BPE.
<BOS> (ID 2)   Beginning of Sequence   Signals the start of text; helps the model distinguish context boundaries.
<EOS> (ID 3)   End of Sequence         Signals text completion; the model generates this to stop. Critical for generation.

These tokens are reserved in the vocabulary before training begins, ensuring consistent IDs across all tokenizers.
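
As a quick illustration of why <PAD> exists, here is a minimal sketch of batching two sequences of different lengths. The special-token IDs match the table above; the other IDs are made up:

seq_a = [2, 10, 11, 12, 3]   # <BOS> ... <EOS>
seq_b = [2, 10, 3]
max_len = max(len(seq_a), len(seq_b))

# Pad the shorter sequence with <PAD> (ID 0) so both can be stacked into one batch
batch = [seq + [0] * (max_len - len(seq)) for seq in (seq_a, seq_b)]
print(batch)  # [[2, 10, 11, 12, 3], [2, 10, 3, 0, 0]]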

Code Walkthrough

Let’s explore tokenization interactively:

# Import our BPE tokenizer
from tokenizer import BPETokenizer, SPECIAL_TOKENS

print("Special tokens:", SPECIAL_TOKENS)
print("\nThese tokens are reserved at IDs 0-3 before training begins.")

Training a BPE Tokenizer

The BPETokenizer class has key parameters:

  • vocab_size: Target vocabulary size (including special tokens)
  • min_frequency: Minimum times a pair must appear to be merged (default: 2). This prevents rare pairs from being merged — if a pair only appears once, it’s likely noise rather than a useful pattern. Higher values create more conservative, generalizable vocabularies.
  • verbose: Print detailed training progress

# Simple text to train on
simple_text = "ab cd ab cd ab cd ab cd " * 20

# Create and train tokenizer
# vocab_size includes the 4 special tokens, so effective learned tokens = vocab_size - 4
tokenizer = BPETokenizer(vocab_size=30, verbose=False)
stats = tokenizer.train(simple_text, show_progress=True)

print(f"\nVocab size: {stats['vocab_size']}")
print(f"Merges learned: {stats['num_merges']}")
print(f"Special tokens: {stats['num_special_tokens']}")
# See what patterns were learned
print("Learned merges:")
for i, ((a, b), merged) in enumerate(list(tokenizer.merges.items())[:10]):
    print(f"  {i+1}. {repr(a)} + {repr(b)} -> {repr(merged)}")

Encoding and Decoding

Encoding applies the merges in the exact order they were learned during training. This is crucial - the merge order determines how text is split.

# Encode some text
test_text = "ab cd"
ids = tokenizer.encode(test_text)
tokens = [tokenizer.id_to_token(i) for i in ids]

print(f"Text: '{test_text}'")
print(f"Token IDs: {ids}")
print(f"Tokens: {tokens}")

# Decode back
decoded = tokenizer.decode(ids)
print(f"Decoded: '{decoded}'")
print(f"Round-trip successful: {test_text == decoded}")
# With special tokens (used during actual LLM training/inference)
ids_with_special = tokenizer.encode(test_text, add_special_tokens=True)
print(f"\nWith special tokens: {ids_with_special}")
print(f"Tokens: {[tokenizer.id_to_token(i) for i in ids_with_special]}")

# Decoding skips special tokens by default
decoded = tokenizer.decode(ids_with_special, skip_special_tokens=True)
print(f"Decoded (skip special): '{decoded}'")

Training on Python Code

python_code = '''
def fibonacci(n):
    """Calculate the nth Fibonacci number."""
    if n <= 1:
        return n
    return fibonacci(n - 1) + fibonacci(n - 2)

def factorial(n):
    """Calculate n factorial."""
    if n <= 1:
        return 1
    return n * factorial(n - 1)

class Calculator:
    def __init__(self):
        self.result = 0

    def add(self, x):
        self.result += x
        return self

    def subtract(self, x):
        self.result -= x
        return self

# Main execution
if __name__ == "__main__":
    print(fibonacci(10))
    print(factorial(5))
'''

print(f"Training on {len(python_code)} characters of Python code")
# Train tokenizer on code
code_tokenizer = BPETokenizer(vocab_size=200, verbose=False)
stats = code_tokenizer.train(python_code * 3, show_progress=True)

print(f"\nFinal vocab size: {stats['vocab_size']}")
print(f"Merges learned: {stats['num_merges']}")
# Look at what code patterns were learned
print("Interesting tokens learned (longest first):")
print("=" * 40)

interesting_patterns = []
for token, token_id in code_tokenizer.vocab.items():  # avoid shadowing the builtin id
    if len(token) >= 2 and not token.startswith('<'):
        interesting_patterns.append((token, token_id))

# Sort by length (longer = more merged)
interesting_patterns.sort(key=lambda x: len(x[0]), reverse=True)

for token, token_id in interesting_patterns[:15]:
    print(f"  {token_id:3d}: {repr(token)}")

Visualizing Tokenization

def visualize_tokens(tokenizer, text):
    """Show how text is split into tokens with colors."""
    ids = tokenizer.encode(text)
    tokens = [tokenizer.id_to_token(i) for i in ids]

    print(f"Original: {repr(text)}")
    print(f"Tokens ({len(tokens)}): {tokens}")
    print(f"IDs: {ids}")
    print(f"Compression: {len(text)/len(ids):.2f} chars/token")
    print()

# Try different code patterns
patterns = [
    "def fibonacci(n):",
    "self.result = 0",
    "return self",
    "    for i in range(10):",
]

for pattern in patterns:
    visualize_tokens(code_tokenizer, pattern)

Vocabulary Size Tradeoffs

Vocabulary size is one of the most important hyperparameters in tokenization:

Larger vocabulary:

  • (+) Shorter sequences = faster training, more context in a fixed window
  • (+) Common words as single tokens = better semantic units
  • (-) Larger embedding table = more parameters, more memory
  • (-) Rare tokens get few training examples = poor representations

Smaller vocabulary:

  • (+) Smaller model, faster embedding lookups
  • (+) Every token well-trained on many examples
  • (-) Longer sequences = slower training, less context
  • (-) Words split into less meaningful pieces

test_text = "def calculate_fibonacci(number):\n    return fibonacci(number)"

vocab_sizes = [50, 100, 200, 500]

print(f"Text: {repr(test_text)}")
print(f"Text length: {len(test_text)} characters")
print()

for vocab_size in vocab_sizes:
    tok = BPETokenizer(vocab_size=vocab_size, verbose=False)
    tok.train(python_code * 5, show_progress=False)

    ids = tok.encode(test_text)
    tokens = [tok.id_to_token(i) for i in ids]

    print(f"Vocab size {vocab_size}:")
    print(f"  Tokens: {len(ids)}")
    print(f"  Ratio: {len(test_text)/len(ids):.1f} chars/token")
    print(f"  Sample: {[tok.id_to_token(i) for i in ids[:5]]}...")
    print()

Real-world vocabulary sizes:

  • GPT-2: 50,257 tokens
  • GPT-4: ~100,000 tokens
  • Llama 2: 32,000 tokens
  • Claude: ~100,000 tokens

Saving and Loading

import tempfile
import os

# Save tokenizer (mkstemp avoids the deprecated, race-prone mktemp)
fd, save_path = tempfile.mkstemp(suffix='.json')
os.close(fd)
code_tokenizer.save(save_path)

# Load it back
loaded = BPETokenizer.load(save_path)

# Verify it works the same
test = "def test():"
original_ids = code_tokenizer.encode(test)
loaded_ids = loaded.encode(test)

print(f"\nOriginal encoding: {original_ids}")
print(f"Loaded encoding:   {loaded_ids}")
print(f"Match: {original_ids == loaded_ids}")

# Cleanup
os.unlink(save_path)

Interactive Exploration

Watch BPE tokenization in action. Type text and see how it gets broken into tokens through iterative pair merging.

Note: This demo uses a simplified, pre-defined set of common English merge rules (not dynamically computed). A real tokenizer would learn merges from a training corpus, but the mechanism is identical.

Tip: Try This
  1. Common words merge well: Type “the” or “and” - they become single tokens quickly due to high-frequency merges.

  2. Step through merges: Enable “Show step-by-step” and slide the merge steps from 0 to max. Watch how character pairs combine into larger tokens.

  3. Rare words stay split: Type “xyz” or uncommon words - they remain as characters because those patterns weren’t in the training data.

  4. Compression varies: Compare “the the the” (high compression) vs “qxz qxz qxz” (low compression). Common patterns compress better.

  5. Spaces are preserved: Notice that spaces remain as separate tokens (shown as ␣). This is typical BPE behavior.

Exercises

Exercise 1: Compression Efficiency

BPE achieves better compression on repetitive text. This matters because better compression = shorter sequences = more context in the model’s window.

# Train on repetitive vs varied text and compare compression

texts = {
    "repetitive": "the the the " * 100,
    "varied": " ".join([f"word{i}" for i in range(100)]),
    "code": python_code,
}

print("Compression comparison:")
print("=" * 40)

for name, text in texts.items():
    tok = BPETokenizer(vocab_size=200, verbose=False)
    tok.train(text, show_progress=False)
    ids = tok.encode(text)
    ratio = len(text) / len(ids)
    print(f"{name:12s}: {ratio:.2f} chars/token")

print("\nNote: Repetitive text compresses best because BPE learns")
print("common patterns. Code has structure but more variety.")

Exercise 2: Analyze the First Merges

The first merges reveal the most frequent patterns in your data. For English text, you’ll often see common letter pairs like ‘th’, ‘he’, ‘in’.

# What patterns are learned first?

sample_text = "hello world hello world hello world " * 10
tok = BPETokenizer(vocab_size=50, verbose=False)
tok.train(sample_text, show_progress=False)

print("First 10 merges (most frequent patterns):")
for i, ((a, b), merged) in enumerate(list(tok.merges.items())[:10]):
    print(f"  {i+1}. '{a}' + '{b}' = '{merged}'")

print("\nNotice: Common substrings merge first, eventually")
print("forming complete words like 'hello' and 'world'.")

Exercise 3: Observe Unknown Character Behavior

Our simple tokenizer can only encode characters it saw during training. Characters not in the vocabulary become <UNK> tokens. This exercise demonstrates the problem — and why production tokenizers use byte-level BPE to solve it.

# What happens with characters not in training?

tokenizer = BPETokenizer(vocab_size=50, verbose=False)
tokenizer.train("hello world", show_progress=False)

# First, encode text containing only characters seen during training
test = "hello world"  # safe: every character appeared in the training text
try:
    ids = tokenizer.encode(test)
    print(f"'{test}' -> {ids}")
    print(f"Decoded: '{tokenizer.decode(ids)}'")
except Exception as e:
    print(f"Error: {e}")

# Now try with a character not in training
test2 = "hello 123"
ids = tokenizer.encode(test2)
tokens = [tokenizer.id_to_token(i) for i in ids]
print(f"\n'{test2}' -> {ids}")
print(f"Tokens: {tokens}")
print("\nNotice: '1', '2', '3' become <UNK> (ID 1) because they")
print("weren't in the training data.")
print("\nThis is why production tokenizers use BYTE-LEVEL BPE:")
print("- Operate on UTF-8 bytes (0-255) instead of Unicode characters")
print("- Any byte sequence can be represented -> no UNK tokens")
print("- tiktoken and SentencePiece both use this approach")

Exercise 4: Whitespace Handling

Whitespace is tricky in tokenization. Our tokenizer preserves it, but notice how spaces can be part of tokens.

# Whitespace is significant in tokenization
code_tok = BPETokenizer(vocab_size=100, verbose=False)
code_tok.train("def foo():\n    return 1\ndef bar():\n    return 2", show_progress=False)

# See how indentation is tokenized
samples = [
    "def foo():",
    "    return",  # 4 spaces
    "        x",   # 8 spaces
]

for sample in samples:
    ids = code_tok.encode(sample)
    tokens = [code_tok.id_to_token(i) for i in ids]
    print(f"{repr(sample):20s} -> {tokens}")

print("\nIn production tokenizers, leading spaces often attach to")
print("the following word: ' hello' is one token, not ' ' + 'hello'")

Tokenization in the LLM Pipeline

Here’s where tokenization fits in the full pipeline:
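
In rough terms: text -> encode -> token IDs -> model -> new token IDs -> decode -> text. The sketch below shows only the tokenizer's role; model and sample are hypothetical stand-ins for components built in later modules.

prompt = "def fibonacci("
ids = code_tokenizer.encode(prompt, add_special_tokens=True)  # text -> token IDs

# Generation loop (hypothetical - model and sample are built in later modules):
#     logits = model(ids)        # token IDs -> scores for the next token
#     next_id = sample(logits)   # choose the next token ID
#     ids.append(next_id)
#     ...repeat until next_id is <EOS>...

completion = code_tokenizer.decode(ids)                       # token IDs -> text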

Summary

Key takeaways:

  1. BPE learns subword units by iteratively merging the most frequent adjacent token pairs
  2. Vocabulary size is a tradeoff: larger = shorter sequences but more parameters and sparse token usage
  3. Special tokens (BOS, EOS, PAD, UNK) serve critical roles in the LLM pipeline
  4. Code patterns emerge naturally (def, self., return, indentation) when trained on code
  5. Round-trip guarantee: encode -> decode should perfectly reconstruct the original text
  6. Production tokenizers use byte-level BPE to handle any Unicode character without UNK tokens

What We Simplified

Our implementation differs from production tokenizers in several ways:

Our Tokenizer               Production Tokenizers
Character-level BPE         Byte-level BPE (handles any UTF-8)
Python dict lookups         Optimized Rust/C++ (tiktoken is 10x+ faster)
No regex pre-tokenization   Regex pre-tokenization (contractions, numbers, punctuation handled specially)
Simple word splitting       Careful handling of whitespace and punctuation

Pre-tokenization is a critical step we simplified. Production tokenizers first split text into “words” using regex patterns before applying BPE. This prevents merges across word boundaries - for example, it keeps the end of one word from merging with the start of the next. GPT-2’s tokenizer uses a carefully crafted regex that handles contractions (“don’t” → “don” + “’t”), numbers, and punctuation specially. This pre-split ensures more linguistically meaningful merges.
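
Here is a rough sketch of the idea with a deliberately simplified pattern. This is not GPT-2’s actual regex (which relies on Unicode property classes via the third-party regex module); it only shows how contraction suffixes, numbers, and punctuation get split off before BPE runs:

import re

# Simplified pre-tokenization: contraction suffixes, words (with an optional
# leading space attached), numbers, punctuation runs, and leftover whitespace
pattern = re.compile(r"'[a-z]+| ?[A-Za-z]+| ?[0-9]+| ?[^\sA-Za-z0-9]+|\s+")

print(pattern.findall("don't split 123 tokens, please!"))
# ['don', "'t", ' split', ' 123', ' tokens', ',', ' please', '!']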

Practical Implications

  • Context length: A 4096-token context window holds varying amounts of text depending on tokenization efficiency
  • Cost: API pricing is per-token, so tokenization directly affects cost
  • Multilingual: Tokenizers trained on English use more tokens for other languages (2-3x for some)
  • Code vs prose: Code often tokenizes inefficiently (many single-character tokens for syntax)

What’s Next

In Module 04: Embeddings, we’ll learn how to convert token IDs into dense vectors that capture meaning. Each token becomes a learnable vector in high-dimensional space.