Module 00: What Is a Language Model?
~15 minutes · No prerequisites
d3 = require("d3@7")
// =============================================================================
// CSS VARIABLE UTILITIES
// =============================================================================
// Function to read CSS custom property values from the document
// Read a CSS custom property from the document root.
// Returns `fallback` when running outside a browser (e.g. during SSR)
// or when the property is unset/empty.
function getCSSVar(name, fallback = null) {
  // Guard for non-browser environments where `document` does not exist.
  if (typeof document === 'undefined') {
    return fallback;
  }
  const root = document.documentElement;
  const raw = getComputedStyle(root).getPropertyValue(name).trim();
  return raw === '' ? fallback : raw;
}
// =============================================================================
// THEME OBJECT
// =============================================================================
// Object containing all diagram colors read from CSS variables
// Falls back to hardcoded values if CSS vars not available
diagramTheme = {
// Fallback values (light mode)
const fallbacks = {
nodeFill: '#f5f5f4',
nodeFillHover: '#e7e5e4',
nodeStroke: '#d6d3d1',
nodeText: '#1c1917',
edgeStroke: '#78716c',
highlight: '#f97316',
highlightGlow: 'rgba(249, 115, 22, 0.3)',
accent: '#0ea5e9',
accentGlow: 'rgba(14, 165, 233, 0.3)',
textOnHighlight: '#1c1917',
textOnAccent: '#1c1917',
bg: '#fafaf9',
bgSecondary: '#f5f5f4',
// Semantic colors for status/feedback
error: '#dc2626',
errorBg: 'rgba(220, 38, 38, 0.1)',
success: '#16a34a',
successBg: 'rgba(22, 163, 74, 0.1)',
info: '#2563eb',
infoBg: 'rgba(37, 99, 235, 0.1)'
};
return {
nodeFill: getCSSVar('--diagram-node-fill', fallbacks.nodeFill),
nodeFillHover: getCSSVar('--diagram-hover-fill', fallbacks.nodeFillHover),
nodeStroke: getCSSVar('--diagram-node-stroke', fallbacks.nodeStroke),
nodeText: getCSSVar('--diagram-node-text', fallbacks.nodeText),
edgeStroke: getCSSVar('--diagram-edge-stroke', fallbacks.edgeStroke),
highlight: getCSSVar('--diagram-highlight', fallbacks.highlight),
highlightGlow: getCSSVar('--diagram-highlight-glow', fallbacks.highlightGlow),
accent: getCSSVar('--diagram-accent', fallbacks.accent),
accentGlow: getCSSVar('--diagram-accent-glow', fallbacks.accentGlow),
textOnHighlight: fallbacks.textOnHighlight,
textOnAccent: fallbacks.textOnAccent,
bg: getCSSVar('--diagram-bg', fallbacks.bg),
bgSecondary: getCSSVar('--diagram-bg-secondary', fallbacks.bgSecondary),
// Semantic colors (use fallbacks directly since no CSS vars defined)
error: fallbacks.error,
errorBg: fallbacks.errorBg,
success: fallbacks.success,
successBg: fallbacks.successBg,
info: fallbacks.info,
infoBg: fallbacks.infoBg
};
}
// =============================================================================
// SVG PRIMITIVES
// =============================================================================
// Creates a group with rounded rect and text
// Options: {x, y, width, height, label, sublabel, id, theme, rx, ry, className}
// Append a diagram node (rounded rect + centered label text) to `svg`.
// Options: {x, y, width, height, label, sublabel, id, theme, rx, ry, className}
// The group is centered on (x, y); returns the d3 selection for the <g>.
function createNode(svg, options) {
  const {
    x = 0,
    y = 0,
    width = 100,
    height = 50,
    label = '',
    sublabel = '',
    id = null,
    theme = diagramTheme,
    rx = 6,
    ry = 6,
    className = 'diagram-node'
  } = options;

  // Group translated to the node center; children use relative coordinates.
  const group = svg.append('g')
    .attr('class', className)
    .attr('transform', `translate(${x}, ${y})`);
  if (id) {
    group.attr('id', id);
  }

  // Rounded rectangle centered on the group origin.
  group.append('rect')
    .attr('x', -width / 2)
    .attr('y', -height / 2)
    .attr('width', width)
    .attr('height', height)
    .attr('rx', rx)
    .attr('ry', ry)
    .attr('fill', theme.nodeFill)
    .attr('stroke', theme.nodeStroke)
    .attr('stroke-width', 1.5);

  // Main label; shifted up slightly when a sublabel shares the node.
  if (label) {
    group.append('text')
      .attr('x', 0)
      .attr('y', sublabel ? -6 : 0)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', theme.nodeText)
      .attr('font-size', '12px')
      .attr('font-weight', '500')
      .attr('pointer-events', 'none')
      .text(label);
  }

  // Smaller, dimmed sublabel rendered below the main label.
  if (sublabel) {
    group.append('text')
      .attr('x', 0)
      .attr('y', 10)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', theme.nodeText)
      .attr('font-size', '10px')
      .attr('opacity', 0.7)
      .attr('pointer-events', 'none')
      .text(sublabel);
  }

  return group;
}
// Creates a path with arrowhead marker
// Options: {x1, y1, x2, y2, label, theme, curved, curvature, id, className, dashed}
// Append an arrow (path + arrowhead marker) to `svg`.
// Options: {x1, y1, x2, y2, label, theme, curved, curvature, id, className, dashed}
// Returns the d3 selection for the created <g>.
function createArrow(svg, options) {
  const {
    x1 = 0,
    y1 = 0,
    x2 = 100,
    y2 = 0,
    label = '',
    theme = diagramTheme,
    curved = false,
    curvature = 0.3,
    id = null,
    className = 'diagram-edge',
    dashed = false
  } = options;

  // Unique marker id so each arrow carries its own head definition.
  // (`String.prototype.substr` is deprecated; `slice(2, 11)` takes the
  // same 9 characters.)
  const markerId = `arrow-${Math.random().toString(36).slice(2, 11)}`;

  // Ensure a <defs> section exists for the marker definition.
  let defs = svg.select('defs');
  if (defs.empty()) {
    defs = svg.append('defs');
  }

  // Arrowhead marker, auto-oriented along the path direction.
  defs.append('marker')
    .attr('id', markerId)
    .attr('viewBox', '0 -5 10 10')
    .attr('refX', 8)
    .attr('refY', 0)
    .attr('markerWidth', 6)
    .attr('markerHeight', 6)
    .attr('orient', 'auto')
    .append('path')
    .attr('d', 'M0,-5L10,0L0,5')
    .attr('fill', theme.edgeStroke);

  // Group holding the path and optional label.
  const g = svg.append('g')
    .attr('class', className);
  if (id) g.attr('id', id);

  // Path data: quadratic Bezier with the control point offset
  // perpendicular to the midpoint when curved, straight line otherwise.
  let pathD;
  if (curved) {
    const midX = (x1 + x2) / 2;
    const midY = (y1 + y2) / 2;
    const dx = x2 - x1;
    const dy = y2 - y1;
    const cx = midX - dy * curvature;
    const cy = midY + dx * curvature;
    pathD = `M${x1},${y1} Q${cx},${cy} ${x2},${y2}`;
  } else {
    pathD = `M${x1},${y1} L${x2},${y2}`;
  }

  const path = g.append('path')
    .attr('d', pathD)
    .attr('fill', 'none')
    .attr('stroke', theme.edgeStroke)
    .attr('stroke-width', 1.5)
    .attr('marker-end', `url(#${markerId})`);
  if (dashed) {
    path.attr('stroke-dasharray', '5,3');
  }

  // Optional label, offset perpendicular to the segment so it doesn't
  // sit on top of the line.
  if (label) {
    const labelX = (x1 + x2) / 2;
    const labelY = (y1 + y2) / 2;
    const angle = Math.atan2(y2 - y1, x2 - x1);
    const offsetX = Math.sin(angle) * 12;
    const offsetY = -Math.cos(angle) * 12;
    g.append('text')
      .attr('x', labelX + offsetX)
      .attr('y', labelY + offsetY)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', theme.nodeText)
      .attr('font-size', '10px')
      .text(label);
  }

  return g;
}
// =============================================================================
// STEP ANIMATION CONTROLLER
// =============================================================================
// Factory function returning controller for step-through animations
// Options: {total, initialStep, speed, loop, onStepChange}
// Factory returning a controller object for step-through animations.
// Options: {total, initialStep, speed, loop, onStepChange}
// The controller exposes current/isPlaying/total/speed getters plus
// setStep/next/prev/play/stop/toggle/reset/setSpeed.
function createStepController(options = {}) {
  const {
    total = 1,
    initialStep = 0,
    speed = 1000,
    loop = true,
    onStepChange = null
  } = options;

  let stepIndex = initialStep;
  let playing = false;
  let timer = null;
  let delay = speed;

  // Notify the subscriber (if one was supplied) of the current step.
  const emit = () => {
    if (typeof onStepChange === 'function') {
      onStepChange(stepIndex);
    }
  };

  const api = {
    get current() { return stepIndex; },
    get isPlaying() { return playing; },
    get total() { return total; },
    get speed() { return delay; },

    // Jump to a specific step, clamped to [0, total - 1].
    setStep(step) {
      stepIndex = Math.min(total - 1, Math.max(0, step));
      emit();
      return stepIndex;
    },

    // Advance one step; wraps to 0 at the end when looping.
    next() {
      if (stepIndex < total - 1) {
        stepIndex += 1;
      } else if (loop) {
        stepIndex = 0;
      }
      emit();
      return stepIndex;
    },

    // Step backwards; wraps to the last step when looping.
    prev() {
      if (stepIndex > 0) {
        stepIndex -= 1;
      } else if (loop) {
        stepIndex = total - 1;
      }
      emit();
      return stepIndex;
    },

    // Start auto-advancing on a timer; no-op if already playing.
    play() {
      if (playing) return;
      playing = true;
      timer = setInterval(() => api.next(), delay);
    },

    // Stop auto-advancing and clear the timer.
    stop() {
      playing = false;
      if (timer) {
        clearInterval(timer);
        timer = null;
      }
    },

    // Flip between playing and stopped.
    toggle() {
      if (playing) {
        api.stop();
      } else {
        api.play();
      }
    },

    // Stop playback and return to the initial step.
    reset() {
      api.stop();
      stepIndex = initialStep;
      emit();
    },

    // Change the interval; restarts the timer when currently playing.
    setSpeed(newSpeed) {
      delay = newSpeed;
      if (playing) {
        api.stop();
        api.play();
      }
    }
  };

  return api;
}
// =============================================================================
// FLOW DIAGRAM COMPONENT
// =============================================================================
// Higher-level component for node/edge diagrams
// Options: {nodes, edges, width, height, activeNodes, activeEdges, theme, nodeWidth, nodeHeight, padding}
// Higher-level component rendering a node/edge flow diagram.
// Options: {nodes, edges, width, height, activeNodes, activeEdges, theme,
//           nodeWidth, nodeHeight, padding}
// nodes: [{id, x, y, label, sublabel?, width?, height?}]
// edges: [{source, target, id?, label?, curved?, curvature?, dashed?}]
// activeNodes/activeEdges accept either element ids or array indices.
// Returns a detached <svg> DOM node.
function FlowDiagram(options) {
  const {
    nodes = [],
    edges = [],
    width = 600,
    height = 400,
    activeNodes = [],
    activeEdges = [],
    theme = diagramTheme,
    nodeWidth = 100,
    nodeHeight = 50,
    padding = 20 // currently unused; kept for API compatibility
  } = options;

  // Root SVG with a themed, rounded background.
  const svg = d3.create('svg')
    .attr('width', width)
    .attr('height', height)
    .attr('viewBox', `0 0 ${width} ${height}`)
    .attr('class', 'flow-diagram');
  svg.append('rect')
    .attr('width', width)
    .attr('height', height)
    .attr('fill', theme.bg)
    .attr('rx', 8);

  // Shared arrowhead markers: one standard, one highlighted.
  const defs = svg.append('defs');
  defs.append('marker')
    .attr('id', 'flow-arrow')
    .attr('viewBox', '0 -5 10 10')
    .attr('refX', 8)
    .attr('refY', 0)
    .attr('markerWidth', 6)
    .attr('markerHeight', 6)
    .attr('orient', 'auto')
    .append('path')
    .attr('d', 'M0,-5L10,0L0,5')
    .attr('fill', theme.edgeStroke);
  defs.append('marker')
    .attr('id', 'flow-arrow-highlight')
    .attr('viewBox', '0 -5 10 10')
    .attr('refX', 8)
    .attr('refY', 0)
    .attr('markerWidth', 6)
    .attr('markerHeight', 6)
    .attr('orient', 'auto')
    .append('path')
    .attr('d', 'M0,-5L10,0L0,5')
    .attr('fill', theme.highlight);

  // Edges layer first so nodes render on top of edges.
  const edgesLayer = svg.append('g').attr('class', 'edges-layer');
  const nodesLayer = svg.append('g').attr('class', 'nodes-layer');

  // Draw edges.
  edges.forEach((edge, i) => {
    const sourceNode = nodes.find(n => n.id === edge.source);
    const targetNode = nodes.find(n => n.id === edge.target);
    if (!sourceNode || !targetNode) return; // skip edges with unknown endpoints
    const isActive = activeEdges.includes(edge.id) || activeEdges.includes(i);
    const edgeColor = isActive ? theme.highlight : theme.edgeStroke;
    const markerId = isActive ? 'flow-arrow-highlight' : 'flow-arrow';
    const x1 = sourceNode.x;
    const y1 = sourceNode.y;
    const x2 = targetNode.x;
    const y2 = targetNode.y;
    // Shorten the path at both ends so it doesn't overlap the node boxes.
    const dx = x2 - x1;
    const dy = y2 - y1;
    const len = Math.sqrt(dx * dx + dy * dy);
    // BUGFIX: co-located nodes gave len === 0, and the unit-vector
    // division below produced NaN path coordinates. Skip such edges.
    if (len === 0) return;
    const offsetStart = (nodeWidth / 2) + 5;
    const offsetEnd = (nodeWidth / 2) + 10;
    const startX = x1 + (dx / len) * offsetStart;
    const startY = y1 + (dy / len) * offsetStart;
    const endX = x2 - (dx / len) * offsetEnd;
    const endY = y2 - (dy / len) * offsetEnd;
    const edgeGroup = edgesLayer.append('g')
      .attr('class', `edge ${isActive ? 'highlighted' : ''}`);
    if (edge.id) edgeGroup.attr('id', edge.id);
    // Path: quadratic curve with a perpendicular control-point offset,
    // or a straight segment.
    let pathD;
    if (edge.curved) {
      const midX = (startX + endX) / 2;
      const midY = (startY + endY) / 2;
      const curvature = edge.curvature || 0.2;
      const cx = midX - dy * curvature;
      const cy = midY + dx * curvature;
      pathD = `M${startX},${startY} Q${cx},${cy} ${endX},${endY}`;
    } else {
      pathD = `M${startX},${startY} L${endX},${endY}`;
    }
    const path = edgeGroup.append('path')
      .attr('d', pathD)
      .attr('fill', 'none')
      .attr('stroke', edgeColor)
      .attr('stroke-width', isActive ? 2.5 : 1.5)
      .attr('marker-end', `url(#${markerId})`);
    if (edge.dashed) {
      path.attr('stroke-dasharray', '5,3');
    }
    if (isActive) {
      path.attr('filter', `drop-shadow(0 0 4px ${theme.highlightGlow})`);
    }
    // Optional label, offset perpendicular to the edge direction.
    if (edge.label) {
      const labelX = (startX + endX) / 2;
      const labelY = (startY + endY) / 2;
      const angle = Math.atan2(endY - startY, endX - startX);
      const offsetX = Math.sin(angle) * 14;
      const offsetY = -Math.cos(angle) * 14;
      edgeGroup.append('text')
        .attr('x', labelX + offsetX)
        .attr('y', labelY + offsetY)
        .attr('text-anchor', 'middle')
        .attr('dominant-baseline', 'central')
        .attr('fill', isActive ? theme.highlight : theme.nodeText)
        .attr('font-size', '10px')
        .text(edge.label);
    }
  });

  // Draw nodes.
  nodes.forEach((node, i) => {
    const isActive = activeNodes.includes(node.id) || activeNodes.includes(i);
    const nodeFill = isActive ? theme.highlight : theme.nodeFill;
    const nodeStroke = isActive ? theme.highlight : theme.nodeStroke;
    const textFill = isActive ? theme.textOnHighlight : theme.nodeText;
    // BUGFIX: the rect's x/y previously always used the default
    // nodeWidth/nodeHeight while width/height honored per-node overrides,
    // so custom-sized nodes were drawn off-center. Resolve the size once.
    const w = node.width || nodeWidth;
    const h = node.height || nodeHeight;
    const nodeGroup = nodesLayer.append('g')
      .attr('class', `node ${isActive ? 'highlighted' : ''}`)
      .attr('transform', `translate(${node.x}, ${node.y})`);
    if (node.id) nodeGroup.attr('id', node.id);
    // Node rectangle, centered on the group origin.
    const rect = nodeGroup.append('rect')
      .attr('x', -w / 2)
      .attr('y', -h / 2)
      .attr('width', w)
      .attr('height', h)
      .attr('rx', 6)
      .attr('ry', 6)
      .attr('fill', nodeFill)
      .attr('stroke', nodeStroke)
      .attr('stroke-width', isActive ? 2 : 1.5);
    if (isActive) {
      rect.attr('filter', `drop-shadow(0 0 6px ${theme.highlightGlow})`);
    }
    // Main label, shifted up when a sublabel is present.
    nodeGroup.append('text')
      .attr('x', 0)
      .attr('y', node.sublabel ? -6 : 0)
      .attr('text-anchor', 'middle')
      .attr('dominant-baseline', 'central')
      .attr('fill', textFill)
      .attr('font-size', '12px')
      .attr('font-weight', '500')
      .attr('pointer-events', 'none')
      .text(node.label || '');
    // Dimmed sublabel below the main label.
    if (node.sublabel) {
      nodeGroup.append('text')
        .attr('x', 0)
        .attr('y', 10)
        .attr('text-anchor', 'middle')
        .attr('dominant-baseline', 'central')
        .attr('fill', textFill)
        .attr('font-size', '10px')
        .attr('opacity', isActive ? 0.9 : 0.7)
        .attr('pointer-events', 'none')
        .text(node.sublabel);
    }
  });

  return svg.node();
}
// =============================================================================
// EXPORTS
// =============================================================================
// Export everything as a single object for lessons to use
// Export everything as a single object for lessons to use
// (other notebook cells import this one value to get the full toolkit).
diagramLib = {
// Core dependencies
d3,
// Theme utilities
getCSSVar,
diagramTheme,
// SVG primitives
createNode,
createArrow,
// Animation controller
createStepController,
// Components
FlowDiagram
}
Note: What You’ll Learn
After this module, you will:
- Understand what language models do: next-token prediction
- Know why simple statistical approaches fail at this task
- Grasp how neural networks learn from data (training loop intuition)
- Have a mental model of transformer architecture
- See the roadmap of what you’ll build in this course
Tokenizer = (await import('/playground/tokenizer.js')).Tokenizer
GPTModel = (await import('/playground/model.js')).GPTModel
// Load data files shipped with the playground: tokenizer vocabulary and
// pretrained model weights (both JSON FileAttachments).
tokenizerData = FileAttachment("../../playground/tokenizer.json").json()
weightsData = FileAttachment("../../playground/weights.json").json()
// Initialize tokenizer and model from the loaded data
tokenizer = new Tokenizer(tokenizerData)
model = new GPTModel(weightsData)

A language model predicts the next token.
Given the text def hello(, what comes next? A language model outputs a probability distribution over all possible tokens:
| Token | Probability |
|---|---|
| ) | 31% |
| name | 18% |
| self | 12% |
| x | 8% |
| … | … |
Notice three points:
1. Tokens aren’t words. The model works with subword pieces - fragments smaller than words. `hello` might become `hel` + `lo`. This keeps the vocabulary manageable - typically 30,000 to 50,000 tokens - yet handles any text, including rare words and unfamiliar code.
2. The output is probabilities, not a single answer. The model expresses uncertainty. Sometimes `)` is clearly right; sometimes several options make sense.
3. This simple task scales remarkably. Predicting the next token well requires understanding syntax, semantics, context, and even reasoning. A model that excels at this task can write code, answer questions, and hold conversations. This is how GitHub Copilot suggests completions, how ChatGPT generates responses, and how your IDE’s autocomplete works - all next-token prediction at scale.
Try It
// Compute predictions
predictions = {
const tok = await tokenizer;
const mod = await model;
if (!inputText || inputText.length === 0) {
return [];
}
const ids = tok.encode(inputText);
const trace = mod.forward(ids);
const logits = trace.logits;
// Get last position logits
const vocabSize = logits.shape[1];
const lastLogits = [];
const offset = (logits.shape[0] - 1) * vocabSize;
for (let i = 0; i < vocabSize; i++) {
lastLogits.push(logits.data[offset + i]);
}
// Softmax
const maxLogit = Math.max(...lastLogits);
const expLogits = lastLogits.map(l => Math.exp(l - maxLogit));
const sumExp = expLogits.reduce((a, b) => a + b, 0);
const probs = expLogits.map(e => e / sumExp);
// Get top 5
const indexed = probs.map((p, i) => ({ prob: p, id: i }));
indexed.sort((a, b) => b.prob - a.prob);
const top5 = indexed.slice(0, 5);
return top5.map(({ prob, id }) => ({
token: tok.idToToken(id).replace(/ /g, '␣'),
probability: (prob * 100).toFixed(1) + '%'
}));
}// Display predictions as a simple table
// Render `predictions` as a row of pill badges; the top prediction is
// highlighted with the diagram accent color.
predictionDisplay = {
// Nothing to show until the user has typed something.
if (predictions.length === 0) {
return html`<p style="color: var(--text-muted);">Type something to see predictions.</p>`;
}
// One pill per prediction: token on the left, probability on the right;
// index 0 (the most likely token) gets the highlight background.
return html`
<div style="
display: flex;
gap: 12px;
flex-wrap: wrap;
margin-top: 12px;
">
${predictions.map((p, i) => html`
<div style="
padding: 8px 16px;
background: ${i === 0 ? 'var(--diagram-highlight, #f97316)' : 'var(--surface-tertiary, #f1f5f9)'};
color: ${i === 0 ? 'var(--diagram-text-on-highlight, #1c1917)' : 'var(--text-primary, #1e293b)'};
border-radius: 6px;
font-family: 'JetBrains Mono', monospace;
font-size: 14px;
">
<span style="font-weight: 600;">${p.token}</span>
<span style="opacity: 0.8; margin-left: 8px;">${p.probability}</span>
</div>
`)}
</div>
`;
}

Try it: Type a function definition like def add( and notice how the model predicts argument names. Then try for i in - the predictions shift based on what typically follows loop constructs.
Predicting the next token sounds simple. How do you do it well?
Why Simple Approaches Fail
The obvious approach: count what tokens typically follow other tokens. After seeing for i in a million times, you learn that range follows.
This is called an n-gram model. It looks at the last N tokens to predict the next one. Simple, fast, and works surprisingly well for common patterns.
But n-grams have a fundamental limitation: they see only a fixed window of tokens.
The Context Problem
Consider this code:

    def calculate_sum(numbers):
        total = 0
        for n in numbers:
            total += █

What token fills the blank? A human immediately knows the answer is n - it’s the loop variable. An n-gram model looking at just total += guesses blindly - perhaps 1, x, or value, tokens that commonly follow +=.
N-gram vs Transformer: A Comparison
// Side-by-side prediction examples: for each code snippet (with █ marking
// the blank), illustrative top-3 predictions from an n-gram model vs a
// transformer, plus a one-line takeaway.
// Fields: id (radio value), name (display label), code, ngram/transformer
// ({token, prob, reason} rows), insight (shown in the callout below).
comparisonExamples = [
{
id: 'loop',
name: 'Loop Variable',
code: 'def calculate_sum(numbers):\n  total = 0\n  for n in numbers:\n    total += █',
ngram: [
{ token: '1', prob: '24%', reason: 'common increment' },
{ token: 'x', prob: '18%', reason: 'common variable' },
{ token: 'value', prob: '15%', reason: 'common name' }
],
transformer: [
{ token: 'n', prob: '72%', reason: 'sees loop variable' },
{ token: 'numbers', prob: '8%', reason: 'sees parameter' },
{ token: '1', prob: '5%', reason: 'fallback' }
],
insight: 'The transformer attends to the loop definition and knows `n` is the iteration variable.'
},
{
id: 'return',
name: 'Return Value',
code: 'if x > 0:\n  return x\nelse:\n  return █',
ngram: [
{ token: '0', prob: '28%', reason: 'common return' },
{ token: 'None', prob: '22%', reason: 'common return' },
{ token: 'False', prob: '14%', reason: 'common return' }
],
transformer: [
{ token: '-x', prob: '45%', reason: 'sees negation pattern' },
{ token: '0', prob: '25%', reason: 'conditional default' },
{ token: '-1', prob: '12%', reason: 'error sentinel' }
],
insight: 'The transformer recognizes the absolute value pattern from the `if x > 0` condition.'
},
{
id: 'comment',
name: 'Following Comments',
code: '# reverse a string\ndef █',
ngram: [
{ token: 'main', prob: '15%', reason: 'common function' },
{ token: 'get', prob: '12%', reason: 'common prefix' },
{ token: 'calculate', prob: '10%', reason: 'common prefix' }
],
transformer: [
{ token: 'reverse', prob: '68%', reason: 'matches comment' },
{ token: 'rev', prob: '12%', reason: 'abbreviation' },
{ token: 'flip', prob: '6%', reason: 'synonym' }
],
insight: 'The transformer reads the comment and predicts a function name that matches its intent.'
}
]
// Radio buttons selecting which comparison example to display.
// The radio yields the example's id but shows its human-readable name.
viewof selectedExample = Inputs.radio(
comparisonExamples.map(e => e.id),
{
label: "Choose an example:",
value: 'loop',
format: id => comparisonExamples.find(e => e.id === id).name
}
)// Display the comparison widget
// Render the comparison widget for the selected example: the code snippet,
// two prediction panels (n-gram vs transformer), and an insight callout.
comparisonWidget = {
const example = comparisonExamples.find(e => e.id === selectedExample);
const t = diagramTheme; // NOTE(review): unused here - the template styles via CSS vars; confirm before removing
// Top prediction row in each panel (index 0) gets a tinted background.
return html`
<div style="margin: 20px 0;">
<pre style="
background: var(--surface-secondary, #1e293b);
color: var(--text-primary, #e2e8f0);
padding: 16px;
border-radius: 8px;
font-family: 'JetBrains Mono', monospace;
font-size: 14px;
line-height: 1.5;
overflow-x: auto;
margin-bottom: 20px;
">${example.code}</pre>
<div style="
display: grid;
grid-template-columns: 1fr 1fr;
gap: 16px;
">
<div style="
background: var(--surface-tertiary, #f1f5f9);
border-radius: 8px;
padding: 16px;
border-top: 4px solid var(--diagram-edge-stroke, #94a3b8);
">
<div style="
font-weight: 600;
color: var(--text-primary, #1e293b);
margin-bottom: 12px;
font-size: 14px;
">N-gram Model <span style="opacity: 0.6; font-weight: 400;">(sees last ~3 tokens)</span></div>
${example.ngram.map((p, i) => html`
<div style="
display: flex;
justify-content: space-between;
align-items: center;
padding: 8px 12px;
margin-bottom: 6px;
background: ${i === 0 ? 'var(--diagram-node-fill-alt, rgba(148, 163, 184, 0.3))' : 'transparent'};
border-radius: 4px;
">
<span style="
font-family: 'JetBrains Mono', monospace;
font-weight: 500;
">${p.token}</span>
<span style="
font-size: 13px;
color: var(--text-secondary, #64748b);
">${p.prob} <span style="opacity: 0.7;">· ${p.reason}</span></span>
</div>
`)}
</div>
<div style="
background: var(--surface-tertiary, #f1f5f9);
border-radius: 8px;
padding: 16px;
border-top: 4px solid var(--diagram-highlight, #f97316);
">
<div style="
font-weight: 600;
color: var(--text-primary, #1e293b);
margin-bottom: 12px;
font-size: 14px;
">Transformer <span style="opacity: 0.6; font-weight: 400;">(sees all context)</span></div>
${example.transformer.map((p, i) => html`
<div style="
display: flex;
justify-content: space-between;
align-items: center;
padding: 8px 12px;
margin-bottom: 6px;
background: ${i === 0 ? 'rgba(249, 115, 22, 0.2)' : 'transparent'};
border-radius: 4px;
">
<span style="
font-family: 'JetBrains Mono', monospace;
font-weight: 500;
color: ${i === 0 ? 'var(--diagram-highlight, #f97316)' : 'inherit'};
">${p.token}</span>
<span style="
font-size: 13px;
color: var(--text-secondary, #64748b);
">${p.prob} <span style="opacity: 0.7;">· ${p.reason}</span></span>
</div>
`)}
</div>
</div>
<div style="
margin-top: 16px;
padding: 12px 16px;
background: linear-gradient(90deg, rgba(249, 115, 22, 0.1), transparent);
border-left: 3px solid var(--diagram-highlight, #f97316);
border-radius: 0 6px 6px 0;
font-size: 14px;
color: var(--text-primary, #334155);
">
<strong>Why the difference?</strong> ${example.insight}
</div>
</div>
`;
}

The answer depends on context from ten or more tokens earlier. N-grams cannot reach that far. Increase the window size and another problem emerges: this exact sequence is new, so statistics offer no guidance.
Language is full of these long-range dependencies:
- Matching brackets and parentheses
- Variable references spanning multiple lines
- Pronouns referring to earlier nouns
- Comments describing code that follows
A good language model must consider all previous tokens and learn which ones matter for each prediction.
Neural Networks: Learning from Data
N-grams count patterns; neural networks learn them. The difference is fundamental.
A neural network is a function with adjustable parameters. Feed it an input, it produces an output. These adjustable parameters are called weights - numbers multiplied with inputs and summed. They determine which function the network computes, and we adjust them so the function does what we want.
// Names and explanations for the three layers of the toy network diagram,
// indexed by the `nnLayer` selection (0 = input, 1 = hidden, 2 = output).
nnLayerNames = ["Input Layer", "Hidden Layer", "Output Layer"]
nnLayerDescs = [
"Raw input data enters here. For text, this might be token embeddings - vectors representing each word.",
"Where the learning happens. Neurons multiply inputs by weights, sum them up, then apply activation functions (like ReLU). Without these non-linearities, stacking layers would be pointless - multiple linear operations just collapse into one.",
"Produces the final result. For language models, this is a probability distribution over possible next tokens."
]
// Name + description for the currently selected layer.
// `nnLayer` is defined elsewhere in the notebook - presumably a viewof
// input yielding 0/1/2; confirm against the surrounding cells.
currentNNLayer = {
return {
name: nnLayerNames[nnLayer],
desc: nnLayerDescs[nnLayer]
};
}// Simple neural network visualization
// Draw a small 3-4-2 fully-connected network. The layer selected via
// `nnLayer` (defined elsewhere in the notebook) is highlighted, along
// with the connections entering and leaving it.
nnDiagram = {
const width = 600;
const height = 280;
const t = diagramTheme;
const svg = d3.create('svg')
.attr('width', width)
.attr('height', height)
.attr('viewBox', `0 0 ${width} ${height}`);
// Background
svg.append('rect')
.attr('width', width)
.attr('height', height)
.attr('fill', t.bg)
.attr('rx', 8);
// Layer configuration: x position, neuron count, and display label
const layers = [
{ x: 100, neurons: 3, label: 'Input' },
{ x: 300, neurons: 4, label: 'Hidden' },
{ x: 500, neurons: 2, label: 'Output' }
];
const neuronRadius = 18;
const verticalSpacing = 55;
// Draw connections first (behind neurons); every neuron connects to
// every neuron in the next layer
for (let l = 0; l < layers.length - 1; l++) {
const fromLayer = layers[l];
const toLayer = layers[l + 1];
// Center each column of neurons vertically
const fromYStart = (height - (fromLayer.neurons - 1) * verticalSpacing) / 2;
const toYStart = (height - (toLayer.neurons - 1) * verticalSpacing) / 2;
for (let i = 0; i < fromLayer.neurons; i++) {
for (let j = 0; j < toLayer.neurons; j++) {
const fromY = fromYStart + i * verticalSpacing;
const toY = toYStart + j * verticalSpacing;
// Highlight connections from/to highlighted layer
const isHighlighted = (nnLayer === l) || (nnLayer === l + 1);
svg.append('line')
.attr('x1', fromLayer.x + neuronRadius)
.attr('y1', fromY)
.attr('x2', toLayer.x - neuronRadius)
.attr('y2', toY)
.attr('stroke', isHighlighted ? t.highlight : t.edgeStroke)
.attr('stroke-width', isHighlighted ? 1.5 : 0.8)
.attr('opacity', isHighlighted ? 0.7 : 0.3);
}
}
}
// Draw neurons (on top of the connection lines)
layers.forEach((layer, l) => {
const yStart = (height - (layer.neurons - 1) * verticalSpacing) / 2;
const isActive = nnLayer === l;
for (let i = 0; i < layer.neurons; i++) {
const y = yStart + i * verticalSpacing;
// Neuron circle
svg.append('circle')
.attr('cx', layer.x)
.attr('cy', y)
.attr('r', neuronRadius)
.attr('fill', isActive ? t.highlight : t.nodeFill)
.attr('stroke', isActive ? t.highlight : t.nodeStroke)
.attr('stroke-width', isActive ? 2 : 1.5);
// Show weights symbol ('w') in the hidden layer when it is selected
if (l === 1 && isActive) {
svg.append('text')
.attr('x', layer.x)
.attr('y', y)
.attr('text-anchor', 'middle')
.attr('dominant-baseline', 'central')
.attr('fill', t.textOnHighlight)
.attr('font-size', '14px')
.attr('font-weight', '600')
.text('w');
}
}
// Layer label below the column
svg.append('text')
.attr('x', layer.x)
.attr('y', height - 20)
.attr('text-anchor', 'middle')
.attr('fill', isActive ? t.highlight : t.nodeText)
.attr('font-size', '12px')
.attr('font-weight', isActive ? '600' : '400')
.text(layer.label);
});
// Title
svg.append('text')
.attr('x', width / 2)
.attr('y', 25)
.attr('text-anchor', 'middle')
.attr('fill', t.nodeText)
.attr('font-size', '14px')
.attr('font-weight', '600')
.text('A Simple Neural Network');
return svg.node();
}// Layer description display
// Callout box showing the selected layer's name and description
// (from the currentNNLayer cell).
nnLayerDisplay = {
return html`
<div style="
margin-top: 16px;
padding: 16px 20px;
background: var(--surface-secondary, #f8fafc);
border-radius: 8px;
border-left: 4px solid var(--diagram-highlight, #f97316);
">
<div style="
font-weight: 600;
font-size: 15px;
color: var(--text-primary, #1e293b);
margin-bottom: 4px;
">${currentNNLayer.name}</div>
<div style="
font-size: 14px;
color: var(--text-secondary, #475569);
">${currentNNLayer.desc}</div>
</div>
`;
}

How Neural Networks Learn
This process matters because transformers are neural networks. The same training loop you’ll see here is exactly how we’ll train our language model in Module 07.
Training follows a simple loop:
- Forward pass: Feed input through the network, get a prediction
- Compute loss: Measure how wrong the prediction is (the loss function quantifies prediction quality)
- Backward pass: Calculate how each weight contributed to the error (these sensitivity values are called gradients)
- Update weights: Nudge weights in the direction that reduces error
Repeat with millions of examples. For language models, this means feeding billions of tokens from books, code, and web text, learning from each wrong prediction. The weights shift until the network predicts well.
(You’ll implement the backward pass yourself in Module 02: Autograd. PyTorch computes gradients automatically, but implementing them sharpens intuition.)
// Training loop visualization
trainingLoopDiagram = {
const width = 600;
const height = 180;
const t = diagramTheme;
const svg = d3.create('svg')
.attr('width', width)
.attr('height', height)
.attr('viewBox', `0 0 ${width} ${height}`);
// Background
svg.append('rect')
.attr('width', width)
.attr('height', height)
.attr('fill', t.bg)
.attr('rx', 8);
// Steps
const steps = [
{ x: 80, label: 'Input', sublabel: '"def hello("' },
{ x: 200, label: 'Forward', sublabel: 'predict' },
{ x: 320, label: 'Loss', sublabel: 'how wrong?' },
{ x: 440, label: 'Backward', sublabel: 'gradients' },
{ x: 560, label: 'Update', sublabel: 'fix weights' }
];
const y = 90;
const boxWidth = 85;
const boxHeight = 50;
// Arrow marker
const defs = svg.append('defs');
defs.append('marker')
.attr('id', 'train-arrow')
.attr('viewBox', '0 -5 10 10')
.attr('refX', 8)
.attr('refY', 0)
.attr('markerWidth', 6)
.attr('markerHeight', 6)
.attr('orient', 'auto')
.append('path')
.attr('d', 'M0,-5L10,0L0,5')
.attr('fill', t.edgeStroke);
// Draw arrows between steps
for (let i = 0; i < steps.length - 1; i++) {
svg.append('line')
.attr('x1', steps[i].x + boxWidth/2 + 5)
.attr('y1', y)
.attr('x2', steps[i + 1].x - boxWidth/2 - 10)
.attr('y2', y)
.attr('stroke', t.edgeStroke)
.attr('stroke-width', 1.5)
.attr('marker-end', 'url(#train-arrow)');
}
// Loop back arrow (curved)
svg.append('path')
.attr('d', `M ${steps[4].x} ${y + boxHeight/2 + 10}
Q ${steps[4].x} ${y + 60} ${steps[2].x} ${y + 60}
Q ${steps[0].x} ${y + 60} ${steps[0].x} ${y + boxHeight/2 + 10}`)
.attr('fill', 'none')
.attr('stroke', t.accent)
.attr('stroke-width', 1.5)
.attr('stroke-dasharray', '5,3')
.attr('marker-end', 'url(#train-arrow)');
// "repeat" label
svg.append('text')
.attr('x', width / 2)
.attr('y', y + 70)
.attr('text-anchor', 'middle')
.attr('fill', t.accent)
.attr('font-size', '11px')
.attr('font-style', 'italic')
.text('repeat millions of times');
// Draw step boxes
steps.forEach((step, i) => {
const g = svg.append('g')
.attr('transform', `translate(${step.x}, ${y})`);
g.append('rect')
.attr('x', -boxWidth/2)
.attr('y', -boxHeight/2)
.attr('width', boxWidth)
.attr('height', boxHeight)
.attr('rx', 6)
.attr('fill', i === 2 ? t.highlight : t.nodeFill)
.attr('stroke', i === 2 ? t.highlight : t.nodeStroke)
.attr('stroke-width', 1.5);
g.append('text')
.attr('y', -8)
.attr('text-anchor', 'middle')
.attr('fill', i === 2 ? t.textOnHighlight : t.nodeText)
.attr('font-size', '12px')
.attr('font-weight', '600')
.text(step.label);
g.append('text')
.attr('y', 10)
.attr('text-anchor', 'middle')
.attr('fill', i === 2 ? t.textOnHighlight : t.nodeText)
.attr('font-size', '10px')
.attr('opacity', i === 2 ? 0.9 : 0.7)
.text(step.sublabel);
});
// Title
svg.append('text')
.attr('x', width / 2)
.attr('y', 25)
.attr('text-anchor', 'middle')
.attr('fill', t.nodeText)
.attr('font-size', '14px')
.attr('font-weight', '600')
.text('The Training Loop');
return svg.node();
}Why This Matters for Language
Neural networks do not merely memorize patterns as n-grams do. They learn representations - internal encodings where similar concepts cluster together.
Consider the word “cat”: - An n-gram sees only the characters c-a-t - A neural network learns a vector where “cat” is close to “dog”, “kitten”, and “pet” but far from “quantum” and “derivative”
In transformers, these representations become even richer: the same word gets different vectors depending on context. “Bank” near “river” differs from “bank” near “money.”
N-grams lack this capacity entirely. They treat “cat sat on the mat” and “dog sat on the rug” as completely unrelated sequences. Neural networks recognize the structural similarity and generalize patterns from one to the other.
This representation learning lets neural networks: - Generalize to unseen word combinations - Handle context by learning which parts of the input matter - Scale with more data and compute
Transformers are neural networks distinguished by how they handle context: through attention.
The Transformer Solution
Attention examines all previous tokens and learns which ones matter.
The architecture flows like this:
// Copy for each pipeline stage shown in the architecture walkthrough below.
// Array index corresponds to the `animStep` value (0-5) that selects it.
stageDescriptions = [
{ stage: "Input", desc: "Text enters as token IDs (integers)" },
{ stage: "Embedding", desc: "Each ID becomes a dense vector (e.g., 768 numbers). These vectors are learned - the model discovers which tokens should have similar vectors based on usage patterns. Words like 'run' and 'jog' end up close together." },
{ stage: "Attention", desc: "Each token gathers information from relevant previous tokens. Here, the pronoun 'it' discovers its referent, and `total +=` finds the loop variable `n`." },
{ stage: "Feed-Forward", desc: "Process each token's combined information through a small neural network. This is where the model 'thinks' about what it learned from attention." },
{ stage: "Repeat", desc: "Stack multiple layers to build deeper understanding. Early layers recognize syntax; later layers grasp meaning." },
{ stage: "Output", desc: "Final vectors become probabilities over the vocabulary" }
]
// Description object for the walkthrough stage currently selected by `animStep`.
// NOTE(review): no bounds check — assumes animStep stays within 0-5; confirm the
// step control clamps its range.
currentStage = stageDescriptions[animStep]// Animated architecture diagram
architectureDiagram = {
const width = 700;
const height = 200;
const t = diagramTheme;
const svg = d3.create('svg')
.attr('width', width)
.attr('height', height)
.attr('viewBox', `0 0 ${width} ${height}`);
// Background
svg.append('rect')
.attr('width', width)
.attr('height', height)
.attr('fill', t.bg)
.attr('rx', 8);
// Node positions
const nodes = [
{ id: 0, x: 60, y: 100, label: 'Input', sublabel: 'Token IDs' },
{ id: 1, x: 180, y: 100, label: 'Embedding', sublabel: 'Vectors' },
{ id: 2, x: 320, y: 100, label: 'Attention', sublabel: 'Context' },
{ id: 3, x: 460, y: 100, label: 'Feed-Forward', sublabel: 'Process' },
{ id: 4, x: 540, y: 50, label: '×N', sublabel: 'Layers' },
{ id: 5, x: 640, y: 100, label: 'Output', sublabel: 'Probabilities' }
];
// Draw edges first
const edges = [
{ from: 0, to: 1 },
{ from: 1, to: 2 },
{ from: 2, to: 3 },
{ from: 3, to: 5 }
];
// Arrow marker
const defs = svg.append('defs');
defs.append('marker')
.attr('id', 'arch-arrow')
.attr('viewBox', '0 -5 10 10')
.attr('refX', 8)
.attr('refY', 0)
.attr('markerWidth', 6)
.attr('markerHeight', 6)
.attr('orient', 'auto')
.append('path')
.attr('d', 'M0,-5L10,0L0,5')
.attr('fill', t.edgeStroke);
defs.append('marker')
.attr('id', 'arch-arrow-highlight')
.attr('viewBox', '0 -5 10 10')
.attr('refX', 8)
.attr('refY', 0)
.attr('markerWidth', 6)
.attr('markerHeight', 6)
.attr('orient', 'auto')
.append('path')
.attr('d', 'M0,-5L10,0L0,5')
.attr('fill', t.highlight);
edges.forEach(e => {
const from = nodes[e.from];
const to = nodes[e.to];
const isActive = animStep >= e.from && animStep <= e.to;
svg.append('line')
.attr('x1', from.x + 50)
.attr('y1', from.y)
.attr('x2', to.x - 50)
.attr('y2', to.y)
.attr('stroke', isActive ? t.highlight : t.edgeStroke)
.attr('stroke-width', isActive ? 2.5 : 1.5)
.attr('marker-end', isActive ? 'url(#arch-arrow-highlight)' : 'url(#arch-arrow)');
});
// Loop arrow for "Repeat" (curved)
if (animStep >= 4) {
const loopPath = svg.append('path')
.attr('d', 'M 470,70 Q 500,20 530,50')
.attr('fill', 'none')
.attr('stroke', animStep === 4 ? t.highlight : t.edgeStroke)
.attr('stroke-width', animStep === 4 ? 2.5 : 1.5)
.attr('stroke-dasharray', '4,2');
}
// Draw nodes
nodes.forEach((node, i) => {
const isActive = animStep === i;
const nodeWidth = i === 4 ? 50 : 90;
const nodeHeight = i === 4 ? 35 : 50;
const g = svg.append('g')
.attr('transform', `translate(${node.x}, ${node.y})`);
g.append('rect')
.attr('x', -nodeWidth/2)
.attr('y', -nodeHeight/2)
.attr('width', nodeWidth)
.attr('height', nodeHeight)
.attr('rx', 6)
.attr('fill', isActive ? t.highlight : t.nodeFill)
.attr('stroke', isActive ? t.highlight : t.nodeStroke)
.attr('stroke-width', isActive ? 2 : 1.5);
g.append('text')
.attr('y', node.sublabel && i !== 4 ? -6 : 0)
.attr('text-anchor', 'middle')
.attr('dominant-baseline', 'central')
.attr('fill', isActive ? t.textOnHighlight : t.nodeText)
.attr('font-size', i === 4 ? '14px' : '12px')
.attr('font-weight', '500')
.text(node.label);
if (node.sublabel && i !== 4) {
g.append('text')
.attr('y', 10)
.attr('text-anchor', 'middle')
.attr('dominant-baseline', 'central')
.attr('fill', isActive ? t.textOnHighlight : t.nodeText)
.attr('font-size', '10px')
.attr('opacity', 0.7)
.text(node.sublabel);
}
});
return svg.node();
}// Attention visualization - shows when on step 2
attentionViz = {
if (animStep !== 2) return html``;
const t = diagramTheme;
const tokens = ['The', 'cat', 'sat', 'on', 'the', 'mat', 'because', 'it', 'was', 'tired'];
const targetIdx = 7; // "it"
// Attention weights from "it" to previous tokens
const weights = [0.05, 0.62, 0.08, 0.02, 0.03, 0.12, 0.05, 0, 0, 0];
const width = 700;
const height = 120;
const svg = d3.create('svg')
.attr('width', width)
.attr('height', height)
.attr('viewBox', `0 0 ${width} ${height}`);
// Background
svg.append('rect')
.attr('width', width)
.attr('height', height)
.attr('fill', t.bg)
.attr('rx', 8);
const tokenSpacing = 65;
const startX = 30;
const tokenY = 80;
const targetY = 35;
// Draw tokens
tokens.forEach((token, i) => {
const x = startX + i * tokenSpacing;
const weight = weights[i];
const isTarget = i === targetIdx;
// Token box
svg.append('rect')
.attr('x', x)
.attr('y', isTarget ? targetY - 12 : tokenY - 12)
.attr('width', 55)
.attr('height', 24)
.attr('rx', 4)
.attr('fill', isTarget ? t.highlight : (weight > 0.3 ? 'rgba(249, 115, 22, 0.3)' : t.nodeFill))
.attr('stroke', isTarget ? t.highlight : (weight > 0.3 ? t.highlight : t.nodeStroke))
.attr('stroke-width', isTarget ? 2 : 1);
// Token text
svg.append('text')
.attr('x', x + 27.5)
.attr('y', isTarget ? targetY : tokenY)
.attr('text-anchor', 'middle')
.attr('dominant-baseline', 'central')
.attr('fill', isTarget ? t.textOnHighlight : t.nodeText)
.attr('font-size', '12px')
.attr('font-family', "'JetBrains Mono', monospace")
.text(token);
// Draw attention line from target to this token
if (i < targetIdx && weight > 0) {
const targetX = startX + targetIdx * tokenSpacing + 27.5;
const sourceX = x + 27.5;
svg.append('line')
.attr('x1', sourceX)
.attr('y1', tokenY - 14)
.attr('x2', targetX)
.attr('y2', targetY + 14)
.attr('stroke', t.highlight)
.attr('stroke-width', Math.max(0.5, weight * 5))
.attr('opacity', 0.4 + weight * 0.6);
}
});
// Label
svg.append('text')
.attr('x', startX + targetIdx * tokenSpacing + 27.5)
.attr('y', targetY - 25)
.attr('text-anchor', 'middle')
.attr('fill', t.highlight)
.attr('font-size', '10px')
.attr('font-weight', '500')
.text('predicting next token');
return svg.node();
}// Attention explanation
// Caption shown beneath the attention visualization. Like the diagram above,
// it renders only while the walkthrough is on step 2 (Attention); otherwise it
// collapses to an empty html fragment.
attentionExplanation = {
if (animStep !== 2) return html``;
return html`
<div style="
margin-top: 12px;
padding: 12px 16px;
background: rgba(249, 115, 22, 0.1);
border-radius: 6px;
font-size: 13px;
color: var(--text-primary, #334155);
">
<strong>Attention in action:</strong> When predicting what comes after "it", the model attends strongly to "cat" (62%) because "it" likely refers to the cat. The line thickness shows attention weight - the model learned this connection, it wasn't programmed.
</div>
`;
}// Stage description display
// Card showing the name and description of the walkthrough stage currently
// selected via `animStep` (read through the `currentStage` cell).
stageDisplay = {
return html`
<div style="
margin-top: 16px;
padding: 16px 20px;
background: var(--surface-secondary, #f8fafc);
border-radius: 8px;
border-left: 4px solid var(--diagram-highlight, #f97316);
">
<div style="
font-weight: 600;
font-size: 15px;
color: var(--text-primary, #1e293b);
margin-bottom: 4px;
">${currentStage.stage}</div>
<div style="
font-size: 14px;
color: var(--text-secondary, #475569);
">${currentStage.desc}</div>
</div>
`;
}Three key insights:
Full context visibility. Older models read left-to-right, one token at a time. Transformers see the entire input at once. Each layer processes all positions simultaneously, making them highly parallelizable and efficient.
Learned relevance. The attention mechanism learns which tokens matter for each prediction. When predicting after
total +=, it learns to focus on the loop variable n, not the function name. Stacked layers. Multiple transformer layers build increasingly abstract representations. Early layers recognize syntax; later layers grasp meaning.
The modules ahead build each component from scratch.
What You’ll Build
Each module tackles one piece of this architecture:
// One entry per course module: zero-padded id, display name, one-line blurb,
// and a category key used to look up card colors in `catColors`.
moduleData = [
{ id: '01', name: 'Tensors', desc: 'The n-dimensional arrays that hold everything - weights, activations, gradients', cat: 'foundation' },
{ id: '02', name: 'Autograd', desc: 'How models learn: automatic gradients without manual calculus', cat: 'foundation' },
{ id: '03', name: 'Tokenization', desc: 'Why "hello" becomes [15339] and how subwords handle any text', cat: 'input' },
{ id: '04', name: 'Embeddings', desc: 'Where meaning lives: turning IDs into rich vector representations', cat: 'model' },
{ id: '05', name: 'Attention', desc: 'The mechanism that lets "it" know what "it" refers to', cat: 'model' },
{ id: '06', name: 'Transformer', desc: 'Putting it all together into a complete architecture', cat: 'model' },
{ id: '07', name: 'Training', desc: 'Teaching the model to predict well through gradient descent', cat: 'training' },
{ id: '08', name: 'Generation', desc: 'Sampling strategies: from greedy to nucleus sampling', cat: 'output' }
]
// Category colors - light and dark mode variants.
// Each entry supplies the card background (bg/darkBg), the left-border accent
// (border, shared across both modes), and the label text color (text/darkText).
// Consumed by `roadmapDisplay`, which picks the light or dark variant.
catColors = ({
foundation: {
bg: '#dbeafe', darkBg: '#1e3a5f',
border: '#3b82f6',
text: '#1e40af', darkText: '#93c5fd'
},
input: {
bg: '#fef3c7', darkBg: '#422006',
border: '#f59e0b',
text: '#92400e', darkText: '#fcd34d'
},
model: {
bg: '#dcfce7', darkBg: '#14352a',
border: '#22c55e',
text: '#166534', darkText: '#86efac'
},
training: {
bg: '#fce7f3', darkBg: '#3b1436',
border: '#ec4899',
text: '#9d174d', darkText: '#f9a8d4'
},
output: {
bg: '#f3e8ff', darkBg: '#3b1764',
border: '#a855f7',
text: '#6b21a8', darkText: '#d8b4fe'
}
})
// Module directory name mapping.
// Maps zero-padded module IDs to directory slugs, used by `roadmapDisplay` to
// build the ../mNN_<slug>/lesson.html links.
moduleNames = ({
'01': 'tensors',
'02': 'autograd',
'03': 'tokenization',
'04': 'embeddings',
'05': 'attention',
'06': 'transformer',
'07': 'training',
'08': 'generation'
})// Render module roadmap
// Renders the module roadmap: a responsive grid of cards, one per entry in
// `moduleData`, colored by category (`catColors`) and linking to each module's
// lesson page (directory slug from `moduleNames`).
roadmapDisplay = {
const t = diagramTheme;
// NOTE(review): `isDark` is read from diagramTheme, but the theme builder
// visible in this file does not obviously set an `isDark` property — if it is
// undefined, the light palette is used unconditionally. Confirm diagramTheme
// exposes `isDark`.
const isDark = t.isDark;
return html`
<div style="
display: grid;
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
gap: 12px;
margin: 20px 0;
">
${moduleData.map(m => {
const colors = catColors[m.cat];
const bgColor = isDark ? colors.darkBg : colors.bg;
const borderColor = colors.border;
const textColor = isDark ? colors.darkText : colors.text;
return html`
<a href="../m${m.id}_${moduleNames[m.id]}/lesson.html" style="
display: block;
padding: 16px;
background: ${bgColor};
border-left: 4px solid ${borderColor};
border-radius: 6px;
text-decoration: none;
transition: transform 0.15s ease, box-shadow 0.15s ease;
" onmouseover="this.style.transform='translateY(-2px)'; this.style.boxShadow='0 4px 12px rgba(0,0,0,0.1)';"
onmouseout="this.style.transform='none'; this.style.boxShadow='none';">
<div style="
font-size: 13px;
font-weight: 600;
color: ${textColor};
margin-bottom: 4px;
font-family: 'JetBrains Mono', monospace;
">Module ${m.id}</div>
<div style="
font-size: 16px;
font-weight: 600;
color: var(--text-primary, #1e293b);
margin-bottom: 6px;
">${m.name}</div>
<div style="
font-size: 13px;
color: var(--text-secondary, #475569);
">${m.desc}</div>
</a>
`;
})}
</div>
`;
}Each module builds on the previous ones. By the end, you’ll have a working language model you fully understand - something you built piece by piece.
Note: Key Takeaways
- Language models predict the next token from a probability distribution over the vocabulary
- Simple counting (n-grams) fails because language has long-range dependencies that demand variable context
- Neural networks learn patterns through iterative weight updates - they don’t just memorize, they generalize
- Transformers use attention to dynamically focus on relevant context, however distant
- You’ll build each component from scratch in the modules ahead, from tensors to text generation