Module 00: What Is a Language Model?
~10 minutes · No prerequisites
d3 = require("d3@7")
// =============================================================================
// THEME DETECTION
// =============================================================================
// Reactive boolean: true while <body> carries the `quarto-dark` class.
// Implemented as an Observable generator so that every class change on <body>
// pushes a fresh value to dependent cells. A plain block cell that returns a
// local variable only ever yields its initial snapshot, because nothing reads
// the variable again after the observer mutates it.
isDarkMode = Generators.observe((notify) => {
  const check = () => document.body.classList.contains('quarto-dark');

  // Initial check: emit the current state immediately.
  let current = check();
  notify(current);

  // Set up observer for theme changes; only emit when the flag actually flips.
  const observer = new MutationObserver(() => {
    const newValue = check();
    if (newValue !== current) {
      current = newValue;
      notify(current);
    }
  });
  observer.observe(document.body, {
    attributes: true,
    attributeFilter: ['class']
  });

  // Dispose hook: stop observing when this cell is invalidated or re-run,
  // so observers do not pile up.
  return () => observer.disconnect();
})
// =============================================================================
// CSS VARIABLE UTILITIES
// =============================================================================
// Look up a CSS custom property on the document's root element.
//   name     - property name, e.g. '--diagram-accent'
//   fallback - value returned when no `document` global exists or when the
//              property resolves to an empty string (default: null)
getCSSVar = function(name, fallback = null) {
  const hasDocument = typeof document !== 'undefined';
  if (!hasDocument) {
    return fallback;
  }
  const rootStyles = getComputedStyle(document.documentElement);
  const resolved = rootStyles.getPropertyValue(name).trim();
  return resolved === '' ? fallback : resolved;
}
// =============================================================================
// THEME OBJECT
// =============================================================================
// Object containing all diagram colors read from CSS variables
// Falls back to hardcoded values if CSS vars not available
diagramTheme = {
// Light mode fallbacks
const lightFallbacks = {
nodeFill: '#f5f5f4',
nodeFillHover: '#e7e5e4',
nodeStroke: '#d6d3d1',
nodeText: '#1c1917',
edgeStroke: '#78716c',
highlight: '#f97316',
highlightGlow: 'rgba(249, 115, 22, 0.3)',
accent: '#0ea5e9',
accentGlow: 'rgba(14, 165, 233, 0.3)',
textOnHighlight: '#ffffff',
textOnAccent: '#ffffff',
bg: '#fafaf9',
bgSecondary: '#f5f5f4'
};
// Dark mode fallbacks
const darkFallbacks = {
nodeFill: '#292524',
nodeFillHover: '#3f3a36',
nodeStroke: '#57534e',
nodeText: '#fafaf9',
edgeStroke: '#a8a29e',
highlight: '#fb923c',
highlightGlow: 'rgba(251, 146, 60, 0.4)',
accent: '#38bdf8',
accentGlow: 'rgba(56, 189, 248, 0.4)',
textOnHighlight: '#ffffff',
textOnAccent: '#ffffff',
bg: 'transparent',
bgSecondary: '#1c1917'
};
const fallbacks = isDarkMode ? darkFallbacks : lightFallbacks;
return {
nodeFill: getCSSVar('--diagram-node-fill', fallbacks.nodeFill),
nodeFillHover: getCSSVar('--diagram-hover-fill', fallbacks.nodeFillHover),
nodeStroke: getCSSVar('--diagram-node-stroke', fallbacks.nodeStroke),
nodeText: getCSSVar('--diagram-node-text', fallbacks.nodeText),
edgeStroke: getCSSVar('--diagram-edge-stroke', fallbacks.edgeStroke),
highlight: getCSSVar('--diagram-highlight', fallbacks.highlight),
highlightGlow: getCSSVar('--diagram-highlight-glow', fallbacks.highlightGlow),
accent: getCSSVar('--diagram-accent', fallbacks.accent),
accentGlow: getCSSVar('--diagram-accent-glow', fallbacks.accentGlow),
textOnHighlight: fallbacks.textOnHighlight,
textOnAccent: fallbacks.textOnAccent,
bg: getCSSVar('--diagram-bg', fallbacks.bg),
bgSecondary: getCSSVar('--diagram-bg-secondary', fallbacks.bgSecondary),
isDark: isDarkMode
};
}
// =============================================================================
// SVG PRIMITIVES
// =============================================================================
// Appends a node to `svg`: a <g> translated to (x, y) holding a rounded
// rectangle centred on the origin plus an optional label and sublabel.
// Options: {x, y, width, height, label, sublabel, id, theme, rx, ry, className}
// Returns the <g> selection.
createNode = function(svg, options) {
  const {
    x = 0,
    y = 0,
    width = 100,
    height = 50,
    label = '',
    sublabel = '',
    id = null,
    theme = diagramTheme,
    rx = 6,
    ry = 6,
    className = 'diagram-node'
  } = options;

  const group = svg.append('g')
    .attr('class', className)
    .attr('transform', `translate(${x}, ${y})`);
  if (id) {
    group.attr('id', id);
  }

  // Box, centred on the group's origin.
  group.append('rect')
    .attr('x', -width / 2)
    .attr('y', -height / 2)
    .attr('width', width)
    .attr('height', height)
    .attr('rx', rx)
    .attr('ry', ry)
    .attr('fill', theme.nodeFill)
    .attr('stroke', theme.nodeStroke)
    .attr('stroke-width', 1.5);

  // Shared text setup; `extraName`/`extraValue` is the one attribute that
  // differs between the label (font-weight) and the sublabel (opacity).
  const addCenteredText = (content, offsetY, fontSize, extraName, extraValue) =>
    group.append('text')
      .attr('x', 0)
      .attr('y', offsetY)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', theme.nodeText)
      .attr('font-size', fontSize)
      .attr(extraName, extraValue)
      .attr('pointer-events', 'none')
      .text(content);

  if (label) {
    // Shift the main label up when a sublabel shares the box.
    addCenteredText(label, sublabel ? -6 : 0, '12px', 'font-weight', '500');
  }
  if (sublabel) {
    addCenteredText(sublabel, 10, '10px', 'opacity', 0.7);
  }

  return group;
}
// Appends an arrow from (x1, y1) to (x2, y2) to `svg`.
// A dedicated arrowhead <marker> with a random id is added to the svg's <defs>
// (created on demand), then a <g> holding the path and an optional label.
// Options: {x1, y1, x2, y2, label, theme, curved, curvature, id, className, dashed}
// Returns the <g> selection.
createArrow = function(svg, options) {
  const {
    x1 = 0,
    y1 = 0,
    x2 = 100,
    y2 = 0,
    label = '',
    theme = diagramTheme,
    curved = false,
    curvature = 0.3,
    id = null,
    className = 'diagram-edge',
    dashed = false
  } = options;

  // Random suffix keeps marker ids distinct between arrows.
  const markerId = `arrow-${Math.random().toString(36).slice(2, 11)}`;

  // Reuse the svg's <defs> when present, otherwise create it.
  const existingDefs = svg.select('defs');
  const defs = existingDefs.empty() ? svg.append('defs') : existingDefs;

  // Arrowhead marker
  defs.append('marker')
    .attr('id', markerId)
    .attr('viewBox', '0 -5 10 10')
    .attr('refX', 8)
    .attr('refY', 0)
    .attr('markerWidth', 6)
    .attr('markerHeight', 6)
    .attr('orient', 'auto')
    .append('path')
    .attr('d', 'M0,-5L10,0L0,5')
    .attr('fill', theme.edgeStroke);

  const group = svg.append('g')
    .attr('class', className);
  if (id) {
    group.attr('id', id);
  }

  const spanX = x2 - x1;
  const spanY = y2 - y1;
  const centerX = (x1 + x2) / 2;
  const centerY = (y1 + y2) / 2;

  // Straight segment by default; a quadratic Bezier whose control point is
  // pushed perpendicular to the segment when `curved` is set.
  let pathD = `M${x1},${y1} L${x2},${y2}`;
  if (curved) {
    const controlX = centerX - spanY * curvature;
    const controlY = centerY + spanX * curvature;
    pathD = `M${x1},${y1} Q${controlX},${controlY} ${x2},${y2}`;
  }

  const path = group.append('path')
    .attr('d', pathD)
    .attr('fill', 'none')
    .attr('stroke', theme.edgeStroke)
    .attr('stroke-width', 1.5)
    .attr('marker-end', `url(#${markerId})`);
  if (dashed) {
    path.attr('stroke-dasharray', '5,3');
  }

  if (label) {
    // Place the label 12 units off the segment's midpoint, perpendicular to it.
    const angle = Math.atan2(spanY, spanX);
    const offsetX = Math.sin(angle) * 12;
    const offsetY = -Math.cos(angle) * 12;
    group.append('text')
      .attr('x', centerX + offsetX)
      .attr('y', centerY + offsetY)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', theme.nodeText)
      .attr('font-size', '10px')
      .text(label);
  }

  return group;
}
// =============================================================================
// STEP ANIMATION CONTROLLER
// =============================================================================
// Factory function returning a controller for step-through animations.
// Options:
//   total        - number of steps; valid indices are 0 .. total - 1 (default 1)
//   initialStep  - starting step, clamped into the valid range (default 0)
//   speed        - auto-play interval in milliseconds (default 1000)
//   loop         - wrap around at either end (default true)
//   onStepChange - optional callback invoked with the current step after every
//                  setStep/next/prev/reset call, even when the step is unchanged
// Returned controller: read-only `current`, `isPlaying`, `total`, `speed`, plus
// setStep(step), next(), prev(), play(), stop(), toggle(), reset(), setSpeed(ms).
createStepController = function(options = {}) {
  const {
    total = 1,
    initialStep = 0,
    speed = 1000,
    loop = true,
    onStepChange = null
  } = options;

  // Keep every step inside [0, total - 1]; collapses to 0 when total <= 0.
  const clampStep = (step) => Math.max(0, Math.min(total - 1, step));

  // Clamp once so both the initial state and reset() stay in range.
  const startStep = clampStep(initialStep);

  let current = startStep;
  let isPlaying = false;
  let intervalId = null;
  let currentSpeed = speed;

  const notifyChange = () => {
    if (onStepChange && typeof onStepChange === 'function') {
      onStepChange(current);
    }
  };

  const controller = {
    get current() { return current; },
    get isPlaying() { return isPlaying; },
    get total() { return total; },
    get speed() { return currentSpeed; },

    setStep(step) {
      current = clampStep(step);
      notifyChange();
      return current;
    },

    next() {
      if (current < total - 1) {
        current++;
      } else if (loop) {
        current = 0;
      }
      notifyChange();
      return current;
    },

    prev() {
      if (current > 0) {
        current--;
      } else if (loop) {
        // Never wrap to a negative index when there are no steps.
        current = Math.max(0, total - 1);
      }
      notifyChange();
      return current;
    },

    play() {
      if (isPlaying) return;
      isPlaying = true;
      intervalId = setInterval(() => {
        controller.next();
        // Without looping there is nothing left to show after the last step,
        // so stop instead of ticking (and notifying) forever.
        if (!loop && current >= total - 1) {
          controller.stop();
        }
      }, currentSpeed);
    },

    stop() {
      isPlaying = false;
      if (intervalId) {
        clearInterval(intervalId);
        intervalId = null;
      }
    },

    toggle() {
      if (isPlaying) {
        controller.stop();
      } else {
        controller.play();
      }
    },

    reset() {
      controller.stop();
      current = startStep;
      notifyChange();
    },

    setSpeed(newSpeed) {
      currentSpeed = newSpeed;
      // Restart the timer so the new interval takes effect immediately.
      if (isPlaying) {
        controller.stop();
        controller.play();
      }
    }
  };

  return controller;
}
// =============================================================================
// FLOW DIAGRAM COMPONENT
// =============================================================================
// Higher-level component for node/edge diagrams. Builds a detached <svg> and
// returns its DOM node.
// Options:
//   nodes       - [{id, x, y, label, sublabel, width, height}]; x/y is the node centre
//   edges       - [{id, source, target, label, curved, curvature, dashed}];
//                 source/target are node ids; edges whose endpoints are missing
//                 or coincide are skipped
//   width, height         - svg size (default 600 x 400)
//   activeNodes           - node ids or indices to highlight
//   activeEdges           - edge ids or indices to highlight
//   theme                 - color palette (default diagramTheme)
//   nodeWidth, nodeHeight - default node size (default 100 x 50); a node's own
//                           width/height takes precedence
// A `padding` option may still be passed but is not used by the layout.
FlowDiagram = function(options) {
  const {
    nodes = [],
    edges = [],
    width = 600,
    height = 400,
    activeNodes = [],
    activeEdges = [],
    theme = diagramTheme,
    nodeWidth = 100,
    nodeHeight = 50
  } = options;

  // Effective size of a node: its own override, else the diagram default.
  const widthOf = (node) => node.width || nodeWidth;
  const heightOf = (node) => node.height || nodeHeight;

  // Create SVG element
  const svg = d3.create('svg')
    .attr('width', width)
    .attr('height', height)
    .attr('viewBox', `0 0 ${width} ${height}`)
    .attr('class', 'flow-diagram');

  // Add background
  svg.append('rect')
    .attr('width', width)
    .attr('height', height)
    .attr('fill', theme.bg)
    .attr('rx', 8);

  // Arrowhead markers: one in the regular edge color, one highlighted.
  const defs = svg.append('defs');
  const addArrowMarker = (markerId, fill) => {
    defs.append('marker')
      .attr('id', markerId)
      .attr('viewBox', '0 -5 10 10')
      .attr('refX', 8)
      .attr('refY', 0)
      .attr('markerWidth', 6)
      .attr('markerHeight', 6)
      .attr('orient', 'auto')
      .append('path')
      .attr('d', 'M0,-5L10,0L0,5')
      .attr('fill', fill);
  };
  addArrowMarker('flow-arrow', theme.edgeStroke);
  addArrowMarker('flow-arrow-highlight', theme.highlight);

  // Edges layer (draw first so nodes appear on top)
  const edgesLayer = svg.append('g').attr('class', 'edges-layer');
  // Nodes layer
  const nodesLayer = svg.append('g').attr('class', 'nodes-layer');

  // Draw edges
  edges.forEach((edge, i) => {
    const sourceNode = nodes.find(n => n.id === edge.source);
    const targetNode = nodes.find(n => n.id === edge.target);
    if (!sourceNode || !targetNode) return;

    const dx = targetNode.x - sourceNode.x;
    const dy = targetNode.y - sourceNode.y;
    const len = Math.sqrt(dx * dx + dy * dy);
    // Coincident (or non-numeric) endpoints have no direction; dividing by
    // `len` would put NaN into the path, so skip the edge.
    if (!(len > 0)) return;

    const isActive = activeEdges.includes(edge.id) || activeEdges.includes(i);
    const edgeColor = isActive ? theme.highlight : theme.edgeStroke;
    const markerId = isActive ? 'flow-arrow-highlight' : 'flow-arrow';

    // Shorten the segment so it does not overlap the node boxes. The trim is
    // based on each endpoint's own width; the extra 5 units at the target
    // leave room for the arrowhead.
    const offsetStart = (widthOf(sourceNode) / 2) + 5;
    const offsetEnd = (widthOf(targetNode) / 2) + 10;
    const startX = sourceNode.x + (dx / len) * offsetStart;
    const startY = sourceNode.y + (dy / len) * offsetStart;
    const endX = targetNode.x - (dx / len) * offsetEnd;
    const endY = targetNode.y - (dy / len) * offsetEnd;

    const edgeGroup = edgesLayer.append('g')
      .attr('class', `edge ${isActive ? 'highlighted' : ''}`);
    if (edge.id) edgeGroup.attr('id', edge.id);

    // Draw path: straight, or a quadratic Bezier bowed perpendicular to the edge.
    let pathD;
    if (edge.curved) {
      const midX = (startX + endX) / 2;
      const midY = (startY + endY) / 2;
      const curvature = edge.curvature || 0.2;
      const cx = midX - dy * curvature;
      const cy = midY + dx * curvature;
      pathD = `M${startX},${startY} Q${cx},${cy} ${endX},${endY}`;
    } else {
      pathD = `M${startX},${startY} L${endX},${endY}`;
    }

    const path = edgeGroup.append('path')
      .attr('d', pathD)
      .attr('fill', 'none')
      .attr('stroke', edgeColor)
      .attr('stroke-width', isActive ? 2.5 : 1.5)
      .attr('marker-end', `url(#${markerId})`);
    if (edge.dashed) {
      path.attr('stroke-dasharray', '5,3');
    }
    if (isActive) {
      path.attr('filter', `drop-shadow(0 0 4px ${theme.highlightGlow})`);
    }

    // Add label if present, offset 14 units perpendicular to the segment.
    if (edge.label) {
      const labelX = (startX + endX) / 2;
      const labelY = (startY + endY) / 2;
      const angle = Math.atan2(endY - startY, endX - startX);
      const offsetX = Math.sin(angle) * 14;
      const offsetY = -Math.cos(angle) * 14;
      edgeGroup.append('text')
        .attr('x', labelX + offsetX)
        .attr('y', labelY + offsetY)
        .attr('text-anchor', 'middle')
        .attr('dominant-baseline', 'central')
        .attr('fill', isActive ? theme.highlight : theme.nodeText)
        .attr('font-size', '10px')
        .text(edge.label);
    }
  });

  // Draw nodes
  nodes.forEach((node, i) => {
    const isActive = activeNodes.includes(node.id) || activeNodes.includes(i);
    const nodeFill = isActive ? theme.highlight : theme.nodeFill;
    const nodeStroke = isActive ? theme.highlight : theme.nodeStroke;
    const textFill = isActive ? theme.textOnHighlight : theme.nodeText;
    const boxWidth = widthOf(node);
    const boxHeight = heightOf(node);

    const nodeGroup = nodesLayer.append('g')
      .attr('class', `node ${isActive ? 'highlighted' : ''}`)
      .attr('transform', `translate(${node.x}, ${node.y})`);
    if (node.id) nodeGroup.attr('id', node.id);

    // Node rectangle, centred on (node.x, node.y) using the node's actual
    // size so per-node width/height overrides stay centred.
    const rect = nodeGroup.append('rect')
      .attr('x', -boxWidth / 2)
      .attr('y', -boxHeight / 2)
      .attr('width', boxWidth)
      .attr('height', boxHeight)
      .attr('rx', 6)
      .attr('ry', 6)
      .attr('fill', nodeFill)
      .attr('stroke', nodeStroke)
      .attr('stroke-width', isActive ? 2 : 1.5);
    if (isActive) {
      rect.attr('filter', `drop-shadow(0 0 6px ${theme.highlightGlow})`);
    }

    // Main label (shifted up when a sublabel shares the box)
    const labelY = node.sublabel ? -6 : 0;
    nodeGroup.append('text')
      .attr('x', 0)
      .attr('y', labelY)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', textFill)
      .attr('font-size', '12px')
      .attr('font-weight', '500')
      .attr('pointer-events', 'none')
      .text(node.label || '');

    // Sublabel
    if (node.sublabel) {
      nodeGroup.append('text')
        .attr('x', 0)
        .attr('y', 10)
        .attr('text-anchor', 'middle')
        .attr('dominant-baseline', 'central')
        .attr('fill', textFill)
        .attr('font-size', '10px')
        .attr('opacity', isActive ? 0.9 : 0.7)
        .attr('pointer-events', 'none')
        .text(node.sublabel);
    }
  });

  return svg.node();
}
// =============================================================================
// EXPORTS
// =============================================================================
// Export everything as a single object for lessons to use.
// The literal is wrapped in parentheses, like catColors and moduleNames in
// this file: Observable JavaScript reads a bare `name = { ... }` as a block
// cell, which here would evaluate a comma expression and yield undefined
// instead of the object.
diagramLib = ({
  // Core dependencies
  d3,
  // Theme detection
  isDarkMode,
  getCSSVar,
  diagramTheme,
  // SVG primitives
  createNode,
  createArrow,
  // Animation controller
  createStepController,
  // Components
  FlowDiagram
})
Note: What You’ll Learn
By the end of this module, you will:
- Understand what language models actually do (next-token prediction)
- Know why simple statistical approaches fail at this task
- Have a mental model of transformer architecture
- See the roadmap of what you’ll build in this course
// Load the Tokenizer and GPTModel exports from the playground modules via
// dynamic import.
// NOTE(review): these imports use a site-absolute path while the data files
// below use a relative one — confirm both resolve when the site is served
// from a sub-path.
Tokenizer = (await import('/playground/tokenizer.js')).Tokenizer
GPTModel = (await import('/playground/model.js')).GPTModel
// Load data files
tokenizerData = FileAttachment("../../playground/tokenizer.json").json()
weightsData = FileAttachment("../../playground/weights.json").json()
// Initialize tokenizer and model
tokenizer = new Tokenizer(tokenizerData)
model = new GPTModel(weightsData)

A language model predicts the next token.
Given the text `def hello(`, what comes next? A language model outputs a probability distribution over all possible tokens:
| Token | Probability |
|---|---|
| `)` | 31% |
| `name` | 18% |
| `self` | 12% |
| `x` | 8% |
| … | … |
Three things to notice:
1. **Tokens aren’t words.** The model works with subword pieces - fragments smaller than words. `hello` might become `hel` + `lo`. This keeps the vocabulary manageable (typically 30-50k tokens) while handling any text, including rare words and code the model has never seen before.
2. **The output is probabilities, not a single answer.** The model expresses uncertainty. Sometimes `)` is clearly right; sometimes several options make sense.
3. **This simple task scales remarkably.** Predicting the next token well requires understanding syntax, semantics, context, and even reasoning. A model that excels at this task can write code, answer questions, and hold conversations. This is how GitHub Copilot suggests completions, how ChatGPT generates responses, and how your IDE’s autocomplete works - all next-token prediction at scale.
Try It
// Compute predictions
predictions = {
const tok = await tokenizer;
const mod = await model;
if (!inputText || inputText.length === 0) {
return [];
}
const ids = tok.encode(inputText);
const trace = mod.forward(ids);
const logits = trace.logits;
// Get last position logits
const vocabSize = logits.shape[1];
const lastLogits = [];
const offset = (logits.shape[0] - 1) * vocabSize;
for (let i = 0; i < vocabSize; i++) {
lastLogits.push(logits.data[offset + i]);
}
// Softmax
const maxLogit = Math.max(...lastLogits);
const expLogits = lastLogits.map(l => Math.exp(l - maxLogit));
const sumExp = expLogits.reduce((a, b) => a + b, 0);
const probs = expLogits.map(e => e / sumExp);
// Get top 5
const indexed = probs.map((p, i) => ({ prob: p, id: i }));
indexed.sort((a, b) => b.prob - a.prob);
const top5 = indexed.slice(0, 5);
return top5.map(({ prob, id }) => ({
token: tok.idToToken(id).replace(/ /g, '␣'),
probability: (prob * 100).toFixed(1) + '%'
}));
}// Display predictions as a simple table
predictionDisplay = {
if (predictions.length === 0) {
return html`<p style="color: var(--text-muted);">Type something to see predictions.</p>`;
}
return html`
<div style="
display: flex;
gap: 12px;
flex-wrap: wrap;
margin-top: 12px;
">
${predictions.map((p, i) => html`
<div style="
padding: 8px 16px;
background: ${i === 0 ? 'var(--diagram-highlight, #f97316)' : 'var(--surface-tertiary, #f1f5f9)'};
color: ${i === 0 ? 'white' : 'var(--text-primary, #1e293b)'};
border-radius: 6px;
font-family: 'JetBrains Mono', monospace;
font-size: 14px;
">
<span style="font-weight: 600;">${p.token}</span>
<span style="opacity: 0.8; margin-left: 8px;">${p.probability}</span>
</div>
`)}
</div>
`;
}Try it: Type a function definition like def add( and notice how the model predicts argument names. Then try for i in - the predictions shift based on what typically follows loop constructs.
Predicting the next token sounds simple. But how do you do it well?
Why Simple Approaches Fail
The obvious approach: count what tokens typically follow other tokens. After seeing for i in thousands of times, you learn that range often comes next.
This is called an n-gram model. It looks at the last N tokens to predict the next one. Simple, fast, and works surprisingly well for common patterns.
But n-grams have a fundamental limitation that no amount of data can fix: they can only see a fixed window of tokens.
The Context Problem
Consider this code:

```python
def calculate_sum(numbers):
    total = 0
    for n in numbers:
        total += ___
```
What goes in the blank? A human immediately knows the answer is n - it’s the loop variable. But an n-gram model looking at just total += has no idea. It might guess 1 or x or value - common things that follow +=.
N-gram vs Transformer: A Comparison
// Static data for the n-gram vs. transformer comparison widget.
// Each example has:
//   id          - key used by the `selectedExample` radio input
//   name        - label shown for that radio option
//   code        - snippet to display; '█' marks the position being predicted
//   ngram       - top predictions shown in the n-gram column
//   transformer - top predictions shown in the transformer column
//   insight     - one-sentence explanation shown under the two columns
// Each prediction is {token, prob, reason}; `prob` is a display string.
// The values are hand-written literals, not computed by the model.
comparisonExamples = [
  {
    id: 'loop',
    name: 'Loop Variable',
    code: 'def calculate_sum(numbers):\n total = 0\n for n in numbers:\n total += █',
    ngram: [
      { token: '1', prob: '24%', reason: 'common increment' },
      { token: 'x', prob: '18%', reason: 'common variable' },
      { token: 'value', prob: '15%', reason: 'common name' }
    ],
    transformer: [
      { token: 'n', prob: '72%', reason: 'sees loop variable' },
      { token: 'numbers', prob: '8%', reason: 'sees parameter' },
      { token: '1', prob: '5%', reason: 'fallback' }
    ],
    insight: 'The transformer attends to the loop definition and knows `n` is the iteration variable.'
  },
  {
    id: 'return',
    name: 'Return Value',
    code: 'if x > 0:\n return x\nelse:\n return █',
    ngram: [
      { token: '0', prob: '28%', reason: 'common return' },
      { token: 'None', prob: '22%', reason: 'common return' },
      { token: 'False', prob: '14%', reason: 'common return' }
    ],
    transformer: [
      { token: '-x', prob: '45%', reason: 'sees negation pattern' },
      { token: '0', prob: '25%', reason: 'conditional default' },
      { token: '-1', prob: '12%', reason: 'error sentinel' }
    ],
    insight: 'The transformer recognizes the absolute value pattern from the `if x > 0` condition.'
  },
  {
    id: 'comment',
    name: 'Following Comments',
    code: '# reverse a string\ndef █',
    ngram: [
      { token: 'main', prob: '15%', reason: 'common function' },
      { token: 'get', prob: '12%', reason: 'common prefix' },
      { token: 'calculate', prob: '10%', reason: 'common prefix' }
    ],
    transformer: [
      { token: 'reverse', prob: '68%', reason: 'matches comment' },
      { token: 'rev', prob: '12%', reason: 'abbreviation' },
      { token: 'flip', prob: '6%', reason: 'synonym' }
    ],
    insight: 'The transformer reads the comment and predicts a function name that matches its intent.'
  }
]
viewof selectedExample = Inputs.radio(
comparisonExamples.map(e => e.id),
{
label: "Choose an example:",
value: 'loop',
format: id => comparisonExamples.find(e => e.id === id).name
}
)// Display the comparison widget
comparisonWidget = {
const example = comparisonExamples.find(e => e.id === selectedExample);
const t = diagramTheme;
return html`
<div style="margin: 20px 0;">
<pre style="
background: var(--surface-secondary, #1e293b);
color: var(--text-primary, #e2e8f0);
padding: 16px;
border-radius: 8px;
font-family: 'JetBrains Mono', monospace;
font-size: 14px;
line-height: 1.5;
overflow-x: auto;
margin-bottom: 20px;
">${example.code}</pre>
<div style="
display: grid;
grid-template-columns: 1fr 1fr;
gap: 16px;
">
<div style="
background: var(--surface-tertiary, #f1f5f9);
border-radius: 8px;
padding: 16px;
border-top: 4px solid #94a3b8;
">
<div style="
font-weight: 600;
color: var(--text-primary, #1e293b);
margin-bottom: 12px;
font-size: 14px;
">N-gram Model <span style="opacity: 0.6; font-weight: 400;">(sees last ~3 tokens)</span></div>
${example.ngram.map((p, i) => html`
<div style="
display: flex;
justify-content: space-between;
align-items: center;
padding: 8px 12px;
margin-bottom: 6px;
background: ${i === 0 ? 'rgba(148, 163, 184, 0.3)' : 'transparent'};
border-radius: 4px;
">
<span style="
font-family: 'JetBrains Mono', monospace;
font-weight: 500;
">${p.token}</span>
<span style="
font-size: 13px;
color: var(--text-secondary, #64748b);
">${p.prob} <span style="opacity: 0.7;">· ${p.reason}</span></span>
</div>
`)}
</div>
<div style="
background: var(--surface-tertiary, #f1f5f9);
border-radius: 8px;
padding: 16px;
border-top: 4px solid var(--diagram-highlight, #f97316);
">
<div style="
font-weight: 600;
color: var(--text-primary, #1e293b);
margin-bottom: 12px;
font-size: 14px;
">Transformer <span style="opacity: 0.6; font-weight: 400;">(sees all context)</span></div>
${example.transformer.map((p, i) => html`
<div style="
display: flex;
justify-content: space-between;
align-items: center;
padding: 8px 12px;
margin-bottom: 6px;
background: ${i === 0 ? 'rgba(249, 115, 22, 0.2)' : 'transparent'};
border-radius: 4px;
">
<span style="
font-family: 'JetBrains Mono', monospace;
font-weight: 500;
color: ${i === 0 ? 'var(--diagram-highlight, #f97316)' : 'inherit'};
">${p.token}</span>
<span style="
font-size: 13px;
color: var(--text-secondary, #64748b);
">${p.prob} <span style="opacity: 0.7;">· ${p.reason}</span></span>
</div>
`)}
</div>
</div>
<div style="
margin-top: 16px;
padding: 12px 16px;
background: linear-gradient(90deg, rgba(249, 115, 22, 0.1), transparent);
border-left: 3px solid var(--diagram-highlight, #f97316);
border-radius: 0 6px 6px 0;
font-size: 14px;
color: var(--text-primary, #334155);
">
<strong>Why the difference?</strong> ${example.insight}
</div>
</div>
`;
}The answer depends on context from 10+ tokens back. N-grams can’t reach that far. Increase the window size and you run into another problem: you’ve never seen this exact sequence before, so you have no statistics to rely on.
Language is full of these long-range dependencies:
- Matching brackets and parentheses
- Variable references spanning multiple lines
- Pronouns referring to earlier nouns
- Comments describing code that follows
A good language model needs to consider all previous tokens and learn which ones matter for the current prediction.
The Transformer Solution
Transformers solve this with a mechanism called attention: a way to look at all previous tokens and learn which ones matter.
The architecture flows like this:
// One entry per step of the architecture walkthrough. The entries are listed
// in the same order as the nodes of the architecture diagram, so the step
// index `animStep` selects both the highlighted node and its description.
stageDescriptions = [
  { stage: "Input", desc: "Text enters as token IDs (integers)" },
  { stage: "Embedding", desc: "Each ID becomes a dense vector (e.g., 768 numbers). These aren't random - the model learns which tokens should have similar vectors based on how they're used. Words like 'run' and 'jog' end up close together." },
  { stage: "Attention", desc: "Each token gathers information from relevant previous tokens. This is where 'it' learns what 'it' refers to, and where `total +=` finds the loop variable `n`." },
  { stage: "Feed-Forward", desc: "Process each token's combined information through a small neural network. This is where the model 'thinks' about what it learned from attention." },
  { stage: "Repeat", desc: "Stack multiple layers to build deeper understanding. Early layers might recognize syntax; later layers understand meaning." },
  { stage: "Output", desc: "Final vectors become probabilities over the vocabulary" }
]
// Description for the current step; undefined if animStep is outside 0..5.
currentStage = stageDescriptions[animStep]
// Animated architecture diagram
architectureDiagram = {
const width = 700;
const height = 200;
const t = diagramTheme;
const svg = d3.create('svg')
.attr('width', width)
.attr('height', height)
.attr('viewBox', `0 0 ${width} ${height}`);
// Background
svg.append('rect')
.attr('width', width)
.attr('height', height)
.attr('fill', t.bg)
.attr('rx', 8);
// Node positions
const nodes = [
{ id: 0, x: 60, y: 100, label: 'Input', sublabel: 'Token IDs' },
{ id: 1, x: 180, y: 100, label: 'Embedding', sublabel: 'Vectors' },
{ id: 2, x: 320, y: 100, label: 'Attention', sublabel: 'Context' },
{ id: 3, x: 460, y: 100, label: 'Feed-Forward', sublabel: 'Process' },
{ id: 4, x: 540, y: 50, label: '×N', sublabel: 'Layers' },
{ id: 5, x: 640, y: 100, label: 'Output', sublabel: 'Probabilities' }
];
// Draw edges first
const edges = [
{ from: 0, to: 1 },
{ from: 1, to: 2 },
{ from: 2, to: 3 },
{ from: 3, to: 5 }
];
// Arrow marker
const defs = svg.append('defs');
defs.append('marker')
.attr('id', 'arch-arrow')
.attr('viewBox', '0 -5 10 10')
.attr('refX', 8)
.attr('refY', 0)
.attr('markerWidth', 6)
.attr('markerHeight', 6)
.attr('orient', 'auto')
.append('path')
.attr('d', 'M0,-5L10,0L0,5')
.attr('fill', t.edgeStroke);
defs.append('marker')
.attr('id', 'arch-arrow-highlight')
.attr('viewBox', '0 -5 10 10')
.attr('refX', 8)
.attr('refY', 0)
.attr('markerWidth', 6)
.attr('markerHeight', 6)
.attr('orient', 'auto')
.append('path')
.attr('d', 'M0,-5L10,0L0,5')
.attr('fill', t.highlight);
edges.forEach(e => {
const from = nodes[e.from];
const to = nodes[e.to];
const isActive = animStep >= e.from && animStep <= e.to;
svg.append('line')
.attr('x1', from.x + 50)
.attr('y1', from.y)
.attr('x2', to.x - 50)
.attr('y2', to.y)
.attr('stroke', isActive ? t.highlight : t.edgeStroke)
.attr('stroke-width', isActive ? 2.5 : 1.5)
.attr('marker-end', isActive ? 'url(#arch-arrow-highlight)' : 'url(#arch-arrow)');
});
// Loop arrow for "Repeat" (curved)
if (animStep >= 4) {
const loopPath = svg.append('path')
.attr('d', 'M 470,70 Q 500,20 530,50')
.attr('fill', 'none')
.attr('stroke', animStep === 4 ? t.highlight : t.edgeStroke)
.attr('stroke-width', animStep === 4 ? 2.5 : 1.5)
.attr('stroke-dasharray', '4,2');
}
// Draw nodes
nodes.forEach((node, i) => {
const isActive = animStep === i;
const nodeWidth = i === 4 ? 50 : 90;
const nodeHeight = i === 4 ? 35 : 50;
const g = svg.append('g')
.attr('transform', `translate(${node.x}, ${node.y})`);
g.append('rect')
.attr('x', -nodeWidth/2)
.attr('y', -nodeHeight/2)
.attr('width', nodeWidth)
.attr('height', nodeHeight)
.attr('rx', 6)
.attr('fill', isActive ? t.highlight : t.nodeFill)
.attr('stroke', isActive ? t.highlight : t.nodeStroke)
.attr('stroke-width', isActive ? 2 : 1.5);
g.append('text')
.attr('y', node.sublabel && i !== 4 ? -6 : 0)
.attr('text-anchor', 'middle')
.attr('dominant-baseline', 'central')
.attr('fill', isActive ? t.textOnHighlight : t.nodeText)
.attr('font-size', i === 4 ? '14px' : '12px')
.attr('font-weight', '500')
.text(node.label);
if (node.sublabel && i !== 4) {
g.append('text')
.attr('y', 10)
.attr('text-anchor', 'middle')
.attr('dominant-baseline', 'central')
.attr('fill', isActive ? t.textOnHighlight : t.nodeText)
.attr('font-size', '10px')
.attr('opacity', 0.7)
.text(node.sublabel);
}
});
return svg.node();
}// Attention visualization - shows when on step 2
attentionViz = {
if (animStep !== 2) return html``;
const t = diagramTheme;
const tokens = ['The', 'cat', 'sat', 'on', 'the', 'mat', 'because', 'it', 'was', 'tired'];
const targetIdx = 7; // "it"
// Attention weights from "it" to previous tokens
const weights = [0.05, 0.62, 0.08, 0.02, 0.03, 0.12, 0.05, 0, 0, 0];
const width = 700;
const height = 120;
const svg = d3.create('svg')
.attr('width', width)
.attr('height', height)
.attr('viewBox', `0 0 ${width} ${height}`);
// Background
svg.append('rect')
.attr('width', width)
.attr('height', height)
.attr('fill', t.bg)
.attr('rx', 8);
const tokenSpacing = 65;
const startX = 30;
const tokenY = 80;
const targetY = 35;
// Draw tokens
tokens.forEach((token, i) => {
const x = startX + i * tokenSpacing;
const weight = weights[i];
const isTarget = i === targetIdx;
// Token box
svg.append('rect')
.attr('x', x)
.attr('y', isTarget ? targetY - 12 : tokenY - 12)
.attr('width', 55)
.attr('height', 24)
.attr('rx', 4)
.attr('fill', isTarget ? t.highlight : (weight > 0.3 ? 'rgba(249, 115, 22, 0.3)' : t.nodeFill))
.attr('stroke', isTarget ? t.highlight : (weight > 0.3 ? t.highlight : t.nodeStroke))
.attr('stroke-width', isTarget ? 2 : 1);
// Token text
svg.append('text')
.attr('x', x + 27.5)
.attr('y', isTarget ? targetY : tokenY)
.attr('text-anchor', 'middle')
.attr('dominant-baseline', 'central')
.attr('fill', isTarget ? t.textOnHighlight : t.nodeText)
.attr('font-size', '12px')
.attr('font-family', "'JetBrains Mono', monospace")
.text(token);
// Draw attention line from target to this token
if (i < targetIdx && weight > 0) {
const targetX = startX + targetIdx * tokenSpacing + 27.5;
const sourceX = x + 27.5;
svg.append('line')
.attr('x1', sourceX)
.attr('y1', tokenY - 14)
.attr('x2', targetX)
.attr('y2', targetY + 14)
.attr('stroke', t.highlight)
.attr('stroke-width', Math.max(0.5, weight * 5))
.attr('opacity', 0.4 + weight * 0.6);
}
});
// Label
svg.append('text')
.attr('x', startX + targetIdx * tokenSpacing + 27.5)
.attr('y', targetY - 25)
.attr('text-anchor', 'middle')
.attr('fill', t.highlight)
.attr('font-size', '10px')
.attr('font-weight', '500')
.text('predicting next token');
return svg.node();
}// Attention explanation
attentionExplanation = {
if (animStep !== 2) return html``;
return html`
<div style="
margin-top: 12px;
padding: 12px 16px;
background: rgba(249, 115, 22, 0.1);
border-radius: 6px;
font-size: 13px;
color: var(--text-primary, #334155);
">
<strong>Attention in action:</strong> When predicting what comes after "it", the model attends strongly to "cat" (62%) because "it" likely refers to the cat. The line thickness shows attention weight - the model learned this connection, it wasn't programmed.
</div>
`;
}// Stage description display
stageDisplay = {
return html`
<div style="
margin-top: 16px;
padding: 16px 20px;
background: var(--surface-secondary, #f8fafc);
border-radius: 8px;
border-left: 4px solid var(--diagram-highlight, #f97316);
">
<div style="
font-weight: 600;
font-size: 15px;
color: var(--text-primary, #1e293b);
margin-bottom: 4px;
">${currentStage.stage}</div>
<div style="
font-size: 14px;
color: var(--text-secondary, #475569);
">${currentStage.desc}</div>
</div>
`;
}Three key insights:
1. **Full context visibility.** Unlike older models that read left-to-right one token at a time, transformers can see the entire input at once. Each layer processes all positions simultaneously, making them highly parallelizable and efficient.
2. **Learned relevance.** The attention mechanism learns which tokens matter for each prediction. When predicting after `total +=`, it learns to focus on the loop variable `n`, not the function name.
3. **Stacked layers.** Multiple transformer layers build increasingly abstract representations. Early layers might recognize syntax; later layers understand meaning.
That’s the high-level overview. In the modules ahead, you’ll build each component from scratch.
What You’ll Build
Each module ahead tackles one piece of this architecture:
// Course roadmap data: one entry per module.
//   id   - two-digit module number, also the key into moduleNames
//   name - card title
//   desc - one-line summary shown on the card
//   cat  - category key into catColors
moduleData = [
  { id: '01', name: 'Tensors', desc: 'The n-dimensional arrays that hold everything - weights, activations, gradients', cat: 'foundation' },
  { id: '02', name: 'Autograd', desc: 'How models learn: automatic gradients without manual calculus', cat: 'foundation' },
  { id: '03', name: 'Tokenization', desc: 'Why "hello" becomes [15339] and how subwords handle any text', cat: 'input' },
  { id: '04', name: 'Embeddings', desc: 'Where meaning lives: turning IDs into rich vector representations', cat: 'model' },
  { id: '05', name: 'Attention', desc: 'The mechanism that lets "it" know what "it" refers to', cat: 'model' },
  { id: '06', name: 'Transformer', desc: 'Putting it all together into a complete architecture', cat: 'model' },
  { id: '07', name: 'Training', desc: 'Teaching the model to predict well through gradient descent', cat: 'training' },
  { id: '08', name: 'Generation', desc: 'Sampling strategies: from greedy to nucleus sampling', cat: 'output' }
]
// Category colors: card background, left border and badge text per category.
// Wrapped in parentheses so the cell is an object literal rather than a block.
catColors = ({
  foundation: { bg: '#dbeafe', border: '#3b82f6', text: '#1e40af' },
  input: { bg: '#fef3c7', border: '#f59e0b', text: '#92400e' },
  model: { bg: '#dcfce7', border: '#22c55e', text: '#166534' },
  training: { bg: '#fce7f3', border: '#ec4899', text: '#9d174d' },
  output: { bg: '#f3e8ff', border: '#a855f7', text: '#6b21a8' }
})
// Module directory name mapping: module id -> suffix of its lesson directory
// (used by the roadmap to build `../m<id>_<name>/lesson.html` links).
moduleNames = ({
  '01': 'tensors',
  '02': 'autograd',
  '03': 'tokenization',
  '04': 'embeddings',
  '05': 'attention',
  '06': 'transformer',
  '07': 'training',
  '08': 'generation'
})
// Render module roadmap
roadmapDisplay = {
const t = diagramTheme;
const isDark = t.isDark;
return html`
<div style="
display: grid;
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
gap: 12px;
margin: 20px 0;
">
${moduleData.map(m => {
const colors = catColors[m.cat];
const bgColor = isDark ? 'var(--surface-secondary)' : colors.bg;
const borderColor = isDark ? colors.border : colors.border;
const textColor = isDark ? 'var(--text-primary)' : colors.text;
return html`
<a href="../m${m.id}_${moduleNames[m.id]}/lesson.html" style="
display: block;
padding: 16px;
background: ${bgColor};
border-left: 4px solid ${borderColor};
border-radius: 6px;
text-decoration: none;
transition: transform 0.15s ease, box-shadow 0.15s ease;
" onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 4px 12px rgba(0,0,0,0.1)';"
onmouseout="this.style.transform='none'; this.style.boxShadow='none';">
<div style="
font-size: 13px;
font-weight: 600;
color: ${textColor};
margin-bottom: 4px;
font-family: 'JetBrains Mono', monospace;
">Module ${m.id}</div>
<div style="
font-size: 16px;
font-weight: 600;
color: var(--text-primary, #1e293b);
margin-bottom: 6px;
">${m.name}</div>
<div style="
font-size: 13px;
color: var(--text-secondary, #475569);
">${m.desc}</div>
</a>
`;
})}
</div>
`;
}The path is sequential: each module builds on the previous ones. By the end, you’ll have a working language model that you fully understand - not magic, but something you built piece by piece.
Note: Key Takeaways
- Language models predict the next token from a probability distribution over the vocabulary
- Simple counting (n-grams) fails because language has long-range dependencies that require more than a fixed window
- Transformers use attention to dynamically focus on relevant context, no matter how far back
- You’ll build each component from scratch in the modules ahead, from tensors to text generation