microsoft/TypeAgent
Public · mirrored from https://github.com/microsoft/TypeAgent · Available
docs/slides/discrete-distillation.html
634 lines · mode: code
| 1 | <!-- Copyright (c) Microsoft Corporation. |
| 2 | Licensed under the MIT License. --> |
| 3 | <!DOCTYPE html> |
| 4 | <html lang="en"> |
| 5 | <head> |
| 6 | <meta charset="UTF-8"> |
| 7 | <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| 8 | <title>Discrete Distillation</title> |
| 9 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@4.6.1/dist/reset.css"> |
| 10 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@4.6.1/dist/reveal.css"> |
| 11 | <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@4.6.1/dist/theme/white.css"> |
| 12 | <style> |
| 13 | :root { /* palette shared by the boxes, callouts, coverage bars and comparison table */ |
| 14 | --red: #882222; |
| 15 | --green: #336633; |
| 16 | --blue: #2255bb; |
| 17 | --dblue: #1a1a2e; |
| 18 | } |
| 19 | |
| 20 | /* Anchor every top-level slide at the top edge and let it size to its content */ |
| 21 | .reveal .slides > section { |
| 22 | height: auto !important; |
| 23 | margin-top: 0 !important; |
| 24 | top: 0 !important; |
| 25 | } |
| 26 | .reveal .slides > section.past, |
| 27 | .reveal .slides > section.present, |
| 28 | .reveal .slides > section.future { |
| 29 | top: 0 !important; |
| 30 | } |
| 31 | .reveal .slides section { text-align: left; } |
| 32 | .reveal .slides section.center { text-align: center; } |
| 33 | .reveal h2 { margin-top: 0; margin-bottom: 0.45em; padding-bottom: 0.15em; border-bottom: 2px solid #dde; font-size: 1.2em; } |
| 34 | .reveal p { margin: 0.35em 0; line-height: 1.45; font-size: 0.76em; } |
| 35 | .reveal li { line-height: 1.45; font-size: 0.76em; } |
| 36 | .reveal ul { margin-top: 0.25em; margin-bottom: 0.25em; margin-left: 1.1em; } |
| 37 | .reveal ol { margin-top: 0.25em; margin-bottom: 0.25em; margin-left: 1.1em; } |
| 38 | |
| 39 | /* Bordered inline boxes: neutral, "hat" (red border) and deterministic (blue border) variants */ |
| 40 | .box { |
| 41 | display: inline-block; |
| 42 | padding: 0.35em 0.7em; |
| 43 | background: #f8f8f8; |
| 44 | border: 2px solid #555; |
| 45 | border-radius: 6px; |
| 46 | } |
| 47 | .hat-box { |
| 48 | display: inline-block; |
| 49 | padding: 0.35em 0.7em; |
| 50 | font-weight: 600; |
| 51 | background: #fff5f5; |
| 52 | border: 2px solid var(--red); |
| 53 | border-radius: 6px; |
| 54 | } |
| 55 | .det-box { |
| 56 | display: inline-block; |
| 57 | padding: 0.35em 0.7em; |
| 58 | font-weight: 600; |
| 59 | background: #f5f5ff; |
| 60 | border: 2px solid var(--blue); |
| 61 | border-radius: 6px; |
| 62 | } |
| 63 | |
| 64 | /* Callout panels: shared geometry, then one accent colour per variant */ |
| 65 | .callout { |
| 66 | font-size: 0.74em; |
| 67 | margin-top: 0.5em; |
| 68 | padding: 0.4em 0.8em; |
| 69 | border-radius: 4px; |
| 70 | } |
| 71 | .callout-blue { border-left: 4px solid var(--blue); background: #eef2ff; } |
| 72 | .callout-green { border-left: 4px solid var(--green); background: #f0fff0; } |
| 73 | .callout-dark { border-left: 4px solid #888; background: #f4f4f8; } |
| 74 | |
| 75 | /* Inline SVG diagrams are centred block elements */ |
| 76 | .diagram-wrap { display: block; margin: 0.7em auto; } |
| 77 | |
| 78 | /* Coverage bars: a flex row of fixed-width deterministic segments plus a stretching LLM segment */ |
| 79 | .cov-row { margin-bottom: 0.15em; } |
| 80 | .cov-label { margin-bottom: 0.05em; color: #666; font-size: 0.52em; } |
| 81 | .cov-bar { |
| 82 | font-size: 0.55em; font-weight: 600; |
| 83 | display: flex; overflow: hidden; height: 22px; border-radius: 4px; |
| 84 | } |
| 85 | .seg-det { |
| 86 | display: flex; justify-content: center; align-items: center; |
| 87 | color: #fff; background: var(--blue); |
| 88 | border-right: 1px solid rgba(255,255,255,0.3); flex-shrink: 0; |
| 89 | } |
| 90 | .seg-llm { |
| 91 | display: flex; justify-content: center; align-items: center; |
| 92 | flex: 1; color: #fff; background: #cc8844; |
| 93 | } |
| 94 | |
| 95 | /* Neural vs. discrete comparison table */ |
| 96 | table.cmp { width: 100%; margin-top: 0.4em; font-size: 0.61em; border-collapse: collapse; border-bottom: 1px solid #ccc; } |
| 97 | table.cmp th, table.cmp td { padding: 0.17em 0.5em; border: 1px solid #ccc; } |
| 98 | table.cmp th { text-align: left; background: #eee; } |
| 99 | table.cmp td:first-child { color: #444; font-weight: 600; } |
| 100 | table.cmp .good { font-weight: 600; color: var(--green); } |
| 101 | table.cmp .bad { color: var(--red); } |
| 102 | </style> |
| 103 | </head> |
| 104 | <body> |
| 105 | <div class="reveal"> |
| 106 | <div class="slides"> |
| 107 | |
| 108 | <!-- ═══════════════════════════════════════════ SLIDE 1: Title. Centered slide on the dark #1a1a2e background; heading, subtitle and tagline colours are set inline. --> |
| 109 | <section class="center" data-background-color="#1a1a2e"> |
| 110 | <h1 style="color:#dde; font-size:1.7em; font-weight:700; margin-bottom:0.3em;"> |
| 111 | Discrete Distillation |
| 112 | </h1> |
| 113 | <h3 style="color:#99b; font-weight:400; font-size:1em; margin-top:0;"> |
| 114 | Compiling LLM Knowledge into Deterministic Programs |
| 115 | </h3> |
| 116 | <p style="color:#667; margin-top:2.5em; font-size:0.7em;"> |
| 117 | A framework for amortizing reasoning over repeated tasks |
| 118 | </p> |
| 119 | </section> |
| 120 | |
| 121 | <!-- ═══════════════════════════════════════════ SLIDE 2: Higher-order function. States the type of F̂ and glosses its three symbols; the continuation-function paragraph and the hat-notation callout are revealed as two successive fragments. --> |
| 122 | <section> |
| 123 | <h2>The Model as a Higher-Order Function</h2> |
| 124 | <p>A language model \(\mathcal{M}\) induces a <em>probabilistic higher-order function</em>:</p> |
| 125 | |
| 126 | \[ \hat{F} \;:\; \Pi \;\longrightarrow\; \bigl(\Sigma^* \to \Delta(\Sigma^*)\bigr) \] |
| 127 | |
| 128 | <ul style="margin-top:0.4em;"> |
| 129 | <li>\(\Pi\) — <em>prompt prefixes</em>: system prompts, instruction sets, persona</li> |
| 130 | <li>\(\Sigma^*\) — token sequences (inputs and outputs)</li> |
| 131 | <li>\(\Delta(\Sigma^*)\) — probability distributions over token sequences</li> |
| 132 | </ul> |
| 133 | |
| 134 | <p class="fragment" style="margin-top:0.5em;"> |
| 135 | Applying \(\hat{F}\) to a prefix \(\pi\) yields a <em>continuation function</em>: |
| 136 | \[ \hat{g}_\pi \;=\; \hat{F}(\pi) \;:\; \Sigma^* \to \Delta(\Sigma^*) \] |
| 137 | </p> |
| 138 | |
| 139 | <div class="callout callout-dark fragment"> |
| 140 | The hat \(\hat{\cdot}\) marks functions that <strong>may produce semantically incorrect outputs</strong> |
| 141 | — they are stochastic and not guaranteed correct. |
| 142 | </div> |
| 143 | </section> |
| 144 | |
| 145 | <!-- ═══════════════════════════════════════════ SLIDE 3: Two-stage application. Text first, then the diagram and the closing remark as two fragments. --> |
| 146 | <section> |
| 147 | <h2>Two-Stage Application</h2> |
| 148 | <p> |
| 149 | Given a continuation \(c \in \Sigma^*\) (chat history + user request), |
| 150 | sampling the answer: |
| 151 | \[ \hat{y} \;\sim\; \hat{g}_\pi(c) \] |
| 152 | The hat propagates — \(\hat{y}\) inherits potential error from \(\hat{F}\). |
| 153 | </p> |
| 154 | |
| 155 | <!-- SVG diagram of two-stage application; role="img" plus aria-label give it a text alternative. Red boxes are probabilistic, the blue box is the plain input π. --> |
| 156 | <svg class="diagram-wrap fragment" width="680" height="140" viewBox="0 0 680 160" style="max-width:100%;margin:0.3em auto" role="img" aria-label="Two-stage application: the prefix π is passed to the higher-order function F̂, which yields the continuation function ĝπ; ĝπ is combined with the input c (chat plus user request) to produce the sampled answer ŷ, which may be wrong."> |
| 157 | <!-- arrowhead marker, declared first (as in the compilation diagram) and referenced by every arrow below --> |
| 158 | <defs> |
| 159 | <marker id="arr" markerWidth="8" markerHeight="8" refX="6" refY="3" orient="auto"> |
| 160 | <path d="M0,0 L0,6 L8,3 z" fill="#555"/> |
| 161 | </marker> |
| 162 | </defs> |
| 163 | <!-- π box --> |
| 164 | <rect x="10" y="60" width="60" height="36" rx="5" fill="#f5f5ff" stroke="#2255bb" stroke-width="2"/> |
| 165 | <text x="40" y="83" text-anchor="middle" font-size="14" fill="#2255bb" font-family="serif">π</text> |
| 166 | <!-- arrow to F-hat --> |
| 167 | <line x1="70" y1="78" x2="140" y2="78" stroke="#555" stroke-width="2" marker-end="url(#arr)"/> |
| 168 | <!-- F-hat box (red = probabilistic) --> |
| 169 | <rect x="140" y="55" width="68" height="46" rx="6" fill="#fff5f5" stroke="#882222" stroke-width="2"/> |
| 170 | <text x="174" y="76" text-anchor="middle" font-size="15" fill="#882222" font-family="serif" font-weight="bold">F̂</text> |
| 171 | <text x="174" y="92" text-anchor="middle" font-size="9" fill="#882222" font-family="sans-serif">higher-order</text> |
| 172 | <!-- arrow to g-hat-pi --> |
| 173 | <line x1="208" y1="78" x2="290" y2="78" stroke="#555" stroke-width="2" marker-end="url(#arr)"/> |
| 174 | <!-- g-hat-pi box --> |
| 175 | <rect x="290" y="55" width="80" height="46" rx="6" fill="#fff5f5" stroke="#882222" stroke-width="2"/> |
| 176 | <text x="330" y="76" text-anchor="middle" font-size="15" fill="#882222" font-family="serif" font-weight="bold">ĝ<tspan font-size="10" dy="3">π</tspan></text> |
| 177 | <text x="330" y="93" text-anchor="middle" font-size="9" fill="#882222" font-family="sans-serif">continuation fn</text> |
| 178 | <!-- c (from below) --> |
| 179 | <text x="408" y="135" text-anchor="middle" font-size="13" fill="#444" font-family="serif">c</text> |
| 180 | <text x="455" y="139" text-anchor="start" font-size="10" fill="#666" font-family="sans-serif">(chat + user request)</text> |
| 181 | <line x1="408" y1="125" x2="408" y2="100" stroke="#555" stroke-width="2" marker-end="url(#arr)"/> |
| 182 | <!-- elbow connector (thin, no arrowhead) from g-hat-pi across and down to the combiner --> |
| 183 | <line x1="370" y1="78" x2="390" y2="78" stroke="#555" stroke-width="1.5"/> |
| 184 | <line x1="390" y1="78" x2="390" y2="100" stroke="#555" stroke-width="1.5"/> |
| 185 | <!-- combiner dot --> |
| 186 | <circle cx="408" cy="100" r="4" fill="#555"/> |
| 187 | <line x1="390" y1="100" x2="408" y2="100" stroke="#555" stroke-width="1.5"/> |
| 188 | <!-- arrow from combiner to y-hat --> |
| 189 | <line x1="412" y1="100" x2="490" y2="78" stroke="#555" stroke-width="2" marker-end="url(#arr)"/> |
| 190 | <!-- y-hat box --> |
| 191 | <rect x="490" y="55" width="80" height="46" rx="6" fill="#fff5f5" stroke="#882222" stroke-width="2"/> |
| 192 | <text x="530" y="76" text-anchor="middle" font-size="15" fill="#882222" font-family="serif" font-weight="bold">ŷ</text> |
| 193 | <text x="530" y="93" text-anchor="middle" font-size="9" fill="#cc4444" font-family="sans-serif">may be wrong</text> |
| 194 | </svg> |
| 195 | |
| 196 | <p class="fragment" style="margin-top:0.3em;"> |
| 197 | For a fixed \(\pi\), the model is a black box consuming user turns and emitting sampled |
| 198 | answers — correct with high but <em>not certain</em> probability. |
| 199 | </p> |
| 200 | </section> |
| 201 | |
| 202 | <!-- ═══════════════════════════════════════════ SLIDE 4: Standard usage. The total-cost sentence and its formula share one fragment wrapper so they appear together; the bullet list is the second fragment. --> |
| 203 | <section> |
| 204 | <h2>Standard (Query-Time) Reasoning</h2> |
| 205 | <p>For each user query \(u_i\), invoke \(\hat{g}_\pi\):</p> |
| 206 | \[ \hat{y}_i \;\sim\; \hat{g}_\pi(u_i), \qquad \text{cost } c_{\mathcal{M}} \text{ per query} \] |
| 207 | |
| 208 | <div class="fragment"><p style="margin-top:0.7em;">Total cost for \(N\) queries:</p> |
| 209 | \[ C_{\mathrm{std}}(N) \;=\; N \cdot c_{\mathcal{M}} \]</div> |
| 210 | |
| 211 | <ul class="fragment" style="margin-top:0.7em;"> |
| 212 | <li>Appropriate for <em>novel</em> tasks — full generalization, no prior knowledge assumed</li> |
| 213 | <li>Cost scales <em>linearly</em> — no benefit from repetition or structure</li> |
| 214 | <li>Each \(\hat{y}_i\) is a fresh sample — output may vary even for identical \(u_i\)</li> |
| 215 | </ul> |
| 216 | </section> |
| 217 | |
| 218 | <!-- ═══════════════════════════════════════════ SLIDE 5: Neural distillation background. Recaps distillation from a large model into a small one, then reveals (as two fragments) what stays unchanged and the question that leads into the next slide. --> |
| 219 | <section> |
| 220 | <h2>Knowledge Distillation — Neural Target</h2> |
| 221 | <p> |
| 222 | Hinton et al. (2015): transfer knowledge from a large model \(\mathcal{M}_L\) |
| 223 | into a small model \(\mathcal{M}_S\) by training on <em>soft targets</em> |
| 224 | (output distributions of \(\mathcal{M}_L\) rather than hard labels). |
| 225 | </p> |
| 226 | |
| 227 | \[ \hat{F}_L \;\xrightarrow{\;\text{distill}\;}\; \hat{F}_S, \qquad |\mathcal{M}_S| \ll |\mathcal{M}_L| \] |
| 228 | |
| 229 | <div class="fragment callout callout-dark" style="margin-top:0.8em;"> |
| 230 | <strong>What stays the same after neural distillation:</strong> |
| 231 | <ul style="margin-top:0.4em; margin-bottom:0;"> |
| 232 | <li>Target is still a <em>neural network</em> — still \(\hat{\cdot}\), still probabilistic</li> |
| 233 | <li>Knowledge remains in weights — <em>opaque</em></li> |
| 234 | <li>Errors corrected only by <em>re-training</em></li> |
| 235 | <li>Broad generalization preserved — but so is the black-box character</li> |
| 236 | </ul> |
| 237 | </div> |
| 238 | |
| 239 | <p class="fragment" style="margin-top:0.8em;"> |
| 240 | The target representation is the same kind of thing as the source. |
| 241 | What if we chose a <em>different</em> target? |
| 242 | </p> |
| 243 | </section> |
| 244 | |
| 245 | <!-- ═══════════════════════════════════════════ SLIDE 6: Discrete distillation. A flex row sets the red hat-box (source F̂) against the blue det-box (programs, grammars); the artifact list and the tilde notation follow as two fragments. --> |
| 246 | <section> |
| 247 | <h2>Discrete Distillation — A Different Target</h2> |
| 248 | |
| 249 | <p> |
| 250 | Instead of another neural net, distill \(\hat{F}\)'s knowledge into |
| 251 | <em>discrete structures</em>: programs, grammars, schemas. |
| 252 | </p> |
| 253 | |
| 254 | <div style="margin:0.9em 0; display:flex; gap:1.5em; align-items:center;"> |
| 255 | <div class="hat-box">\(\hat{F}\)</div> |
| 256 | <span style="font-size:1.6em; color:#555;">⟹</span> |
| 257 | <div class="det-box">\(\tilde{p},\;\; G,\;\; \ldots\)</div> |
| 258 | <span style="font-size:0.75em; color:#666;">programs & grammars</span> |
| 259 | </div> |
| 260 | |
| 261 | <ul class="fragment"> |
| 262 | <li><strong>Programs</strong> \(\tilde{p}: \Theta \to \mathcal{Y}\) — deterministic computations over typed parameters</li> |
| 263 | <li><strong>Grammars</strong> \(G: \mathcal{T} \to \Theta\) — NFA recognizers that match utterances and extract parameters</li> |
| 264 | <li><strong>Schemas</strong>, API bindings, structured plans, structured memory indexes, …</li> |
| 265 | </ul> |
| 266 | |
| 267 | <p class="fragment" style="margin-top:0.8em;"> |
| 268 | \(\tilde{\cdot}\) denotes <em>deterministic but initially unverified</em> — no randomness, |
| 269 | but may be semantically wrong if compiled from a flawed \(\hat{y}\). |
| 270 | </p> |
| 271 | |
| 272 | </section> |
| 273 | |
| 274 | <!-- ═══════════════════════════════════════════ SLIDE 7: The compiler. Recipe definition first; the compiler definition, the pipeline diagram and the closing remark are three fragments. --> |
| 275 | <section> |
| 276 | <h2>The Compilation Step</h2> |
| 277 | <p> |
| 278 | Given a teaching example \(u_0\), the model produces a <em>recipe</em> |
| 279 | \(\hat{r} \in \mathcal{R}\) — a structured description of the intended computation: |
| 280 | \[ \hat{r} \;\sim\; \hat{g}_\pi(u_0), \qquad \hat{r} \in \mathcal{R} \subset \Sigma^* \] |
| 281 | </p> |
| 282 | |
| 283 | <p class="fragment"> |
| 284 | A <em>compiler</em> — a total <strong>deterministic</strong> function — maps recipe to program: |
| 285 | \[ \kappa : \mathcal{R} \;\longrightarrow\; (\Theta \to \mathcal{Y}), \qquad \tilde{p} = \kappa(\hat{r}) \] |
| 286 | </p> |
| 287 | |
| 288 | <!-- SVG: compilation pipeline; role="img" plus aria-label give it a text alternative. Red boxes are probabilistic, blue boxes deterministic. --> |
| 289 | <svg class="diagram-wrap fragment" width="680" height="95" viewBox="0 0 680 110" style="max-width:100%;margin:0.3em auto" role="img" aria-label="Compilation pipeline: the teaching example u₀ is given to the LLM ĝπ (probabilistic), which emits the recipe r̂; the deterministic compiler κ turns the recipe into the deterministic program p̃ from Θ to Y."> |
| 290 | <defs> |
| 291 | <marker id="arr2" markerWidth="8" markerHeight="8" refX="6" refY="3" orient="auto"> |
| 292 | <path d="M0,0 L0,6 L8,3 z" fill="#555"/> |
| 293 | </marker> |
| 294 | </defs> |
| 295 | <!-- u0 --> |
| 296 | <rect x="5" y="32" width="60" height="34" rx="5" fill="#f8f8f8" stroke="#555" stroke-width="1.5"/> |
| 297 | <text x="35" y="53" text-anchor="middle" font-size="13" fill="#333" font-family="serif">u₀</text> |
| 298 | <!-- arrow --> |
| 299 | <line x1="65" y1="49" x2="105" y2="49" stroke="#555" stroke-width="1.5" marker-end="url(#arr2)"/> |
| 300 | <!-- g-hat-pi --> |
| 301 | <rect x="105" y="28" width="76" height="42" rx="6" fill="#fff5f5" stroke="#882222" stroke-width="2"/> |
| 302 | <text x="143" y="48" text-anchor="middle" font-size="14" fill="#882222" font-family="serif" font-weight="bold">ĝ<tspan font-size="10" dy="3">π</tspan></text> |
| 303 | <text x="143" y="63" text-anchor="middle" font-size="9" fill="#882222" font-family="sans-serif">LLM</text> |
| 304 | <!-- arrow to r-hat --> |
| 305 | <line x1="181" y1="49" x2="225" y2="49" stroke="#555" stroke-width="1.5" marker-end="url(#arr2)"/> |
| 306 | <!-- r-hat box --> |
| 307 | <rect x="225" y="28" width="80" height="42" rx="6" fill="#fff5f5" stroke="#882222" stroke-width="2"/> |
| 308 | <text x="265" y="48" text-anchor="middle" font-size="14" fill="#882222" font-family="serif" font-weight="bold">r̂</text> |
| 309 | <text x="265" y="63" text-anchor="middle" font-size="9" fill="#882222" font-family="sans-serif">recipe</text> |
| 310 | <!-- arrow to kappa --> |
| 311 | <line x1="305" y1="49" x2="355" y2="49" stroke="#555" stroke-width="1.5" marker-end="url(#arr2)"/> |
| 312 | <!-- kappa box (deterministic = blue) --> |
| 313 | <rect x="355" y="28" width="76" height="42" rx="6" fill="#f5f5ff" stroke="#2255bb" stroke-width="2"/> |
| 314 | <text x="393" y="48" text-anchor="middle" font-size="14" fill="#2255bb" font-family="serif" font-weight="bold">κ</text> |
| 315 | <text x="393" y="63" text-anchor="middle" font-size="9" fill="#2255bb" font-family="sans-serif">compiler</text> |
| 316 | <!-- arrow to p-tilde --> |
| 317 | <line x1="431" y1="49" x2="475" y2="49" stroke="#555" stroke-width="1.5" marker-end="url(#arr2)"/> |
| 318 | <!-- p-tilde box --> |
| 319 | <rect x="475" y="28" width="110" height="42" rx="6" fill="#f5f5ff" stroke="#2255bb" stroke-width="2"/> |
| 320 | <text x="530" y="48" text-anchor="middle" font-size="14" fill="#2255bb" font-family="serif" font-weight="bold">p̃ : Θ → Y</text> |
| 321 | <text x="530" y="63" text-anchor="middle" font-size="9" fill="#2255bb" font-family="sans-serif">deterministic program</text> |
| 322 | <!-- captions centred under the LLM box (x=143) and the compiler box (x=393) --> |
| 323 | <text x="143" y="85" text-anchor="middle" font-size="9" fill="#aa2222">probabilistic</text> |
| 324 | <text x="393" y="85" text-anchor="middle" font-size="9" fill="#2255bb">deterministic</text> |
| 325 | </svg> |
| 326 | |
| 327 | <p class="fragment" style="margin-top:0.2em;"> |
| 328 | \(\kappa\) is deterministic but \(\tilde{p}\) is only as correct as \(\hat{r}\) was. |
| 329 | Also emitted: a grammar \(G\) that recognizes the task family and extracts \(\theta \in \Theta\). |
| 330 | </p> |
| 331 | </section> |
| 332 | |
| 333 | <!-- ═══════════════════════════════════════════ SLIDE 8: Task families. Defines the triple (T, Θ, φ); the grammar G that maps utterances back to parameters and the parameterize-not-specialize requirement are revealed as two fragments. --> |
| 334 | <section> |
| 335 | <h2>Task Families</h2> |
| 336 | <p>A <em>task family</em> is a triple \((\mathcal{T},\, \Theta,\, \phi)\):</p> |
| 337 | <ul> |
| 338 | <li>\(\mathcal{T} \subseteq \Sigma^*\) — all utterances expressing the same underlying task</li> |
| 339 | <li>\(\Theta\) — parameter space (e.g. genre × quantity × time period)</li> |
| 340 | <li>\(\phi : \Theta \to \mathcal{T}\) — maps parameters to representative utterances</li> |
| 341 | </ul> |
| 342 | |
| 343 | <p class="fragment" style="margin-top:0.7em;"> |
| 344 | A grammar \(G : \mathcal{T} \to \Theta\) (implemented as an NFA) <em>recognizes</em> the |
| 345 | task family and <em>extracts</em> parameters at near-zero cost: |
| 346 | \[ G(u) = \theta \in \Theta \quad \forall\, u \in \mathcal{T} \] |
| 347 | </p> |
| 348 | |
| 349 | <div class="callout callout-dark fragment"> |
| 350 | <strong>Correctness requirement on \(\tilde{p}\):</strong> |
| 351 | \(\tilde{p}\) must <em>parameterize</em>, not specialize. |
| 352 | A recipe that hardcodes one value (e.g. a URL for a single genre) produces |
| 353 | \(\tilde{p}\) correct only at the teaching point \(\theta_0\). |
| 354 | Correct distillation requires \(\tilde{p}(\theta) \approx_\varepsilon \hat{g}_\pi(\phi(\theta))\) |
| 355 | for <em>all</em> \(\theta \in \Theta\). |
| 356 | </div> |
| 357 | </section> |
| 358 | |
| 359 | <!-- ═══════════════════════════════════════════ SLIDE 9: Piecewise handler. Defines H_k by cases, then shows three coverage bars (k = 1, k = 3, k → ∞) as one fragment; deterministic segment widths are hard-coded inline percentages and the LLM segment takes the remaining width via flex. --> |
| 360 | <section> |
| 361 | <h2>The Growing Piecewise Handler \(H_k\)</h2> |
| 362 | <p> |
| 363 | Let \(\{(\mathcal{T}_i, \Theta_i, G_i, \tilde{p}_i)\}_{i=1}^k\) be compiled task families. Define: |
| 364 | \[ |
| 365 | H_k(u) \;=\; \begin{cases} |
| 366 | \tilde{p}_i\bigl(G_i(u)\bigr) & \exists\, i \leq k : u \in \mathcal{T}_i \\[4pt] |
| 367 | \hat{g}_\pi(u) & \text{otherwise} |
| 368 | \end{cases} |
| 369 | \] |
| 370 | </p> |
| 371 | |
| 372 | <p class="fragment" style="margin-top:0.5em; font-size:0.8em;"> |
| 373 | The first \(k\) branches are <strong>deterministic</strong>. The fallback is the full LLM. |
| 374 | As \(k\) grows, deterministic coverage expands: |
| 375 | </p> |
| 376 | |
| 377 | <div class="fragment" style="margin-top:0.6em;"> |
| 378 | <div class="cov-row"> |
| 379 | <div class="cov-label">k = 1</div> |
| 380 | <div class="cov-bar"> |
| 381 | <div class="seg-det" style="width:18%">T₁</div> |
| 382 | <div class="seg-llm">LLM fallback</div> |
| 383 | </div> |
| 384 | </div> |
| 385 | <div class="cov-row" style="margin-top:0.25em;"> |
| 386 | <div class="cov-label">k = 3</div> |
| 387 | <div class="cov-bar"> |
| 388 | <div class="seg-det" style="width:18%">T₁</div> |
| 389 | <div class="seg-det" style="width:15%">T₂</div> |
| 390 | <div class="seg-det" style="width:13%">T₃</div> |
| 391 | <div class="seg-llm">LLM fallback</div> |
| 392 | </div> |
| 393 | </div> |
| 394 | <div class="cov-row" style="margin-top:0.25em;"> |
| 395 | <div class="cov-label">k → ∞</div> |
| 396 | <div class="cov-bar"> |
| 397 | <div class="seg-det" style="width:18%">T₁</div> |
| 398 | <div class="seg-det" style="width:15%">T₂</div> |
| 399 | <div class="seg-det" style="width:13%">T₃</div> |
| 400 | <div class="seg-det" style="width:11%">T₄</div> |
| 401 | <div class="seg-det" style="width:9%">T₅</div> |
| 402 | <div class="seg-det" style="width:7%">…</div> |
| 403 | <div class="seg-llm" style="flex:0.3">LLM</div> <!-- NOTE(review): a flex-grow below 1 claims only that fraction of the leftover width, so this bar likely ends short of the right edge; confirm that is the intended look --> |
| 404 | </div> |
| 405 | </div> |
| 406 | </div> |
| 407 | </section> |
| 408 | |
| 409 | <!-- ═══════════════════════════════════════════ SLIDE 10: Coverage and cost. The expected-cost sentence and its formula share one fragment wrapper so they appear together; the limit (which follows from the formula: the cost tends to c_det) and the break-even callout are the next two fragments. --> |
| 410 | <section> |
| 411 | <h2>Coverage and Expected Cost</h2> |
| 412 | <p>Let \(\mathcal{D}\) be a distribution over user utterances. Define <em>coverage</em>:</p> |
| 413 | \[ \mathrm{cov}(H_k) \;=\; \Pr_{u \sim \mathcal{D}}\!\Bigl[u \in \textstyle\bigcup_{i=1}^k \mathcal{T}_i\Bigr] \] |
| 414 | |
| 415 | <div class="fragment"><p>Expected cost per query:</p> |
| 416 | \[ |
| 417 | \begin{aligned} |
| 418 | \mathbb{E}\bigl[C(H_k)\bigr] |
| 419 | &\;=\; \mathrm{cov}(H_k)\cdot c_{\mathrm{det}} \\ |
| 420 | &\quad+\; \bigl(1 - \mathrm{cov}(H_k)\bigr)\cdot c_{\mathcal{M}} |
| 421 | \end{aligned} |
| 422 | \]</div> |
| 423 | |
| 424 | <p class="fragment" style="margin-top:0.5em;"> |
| 425 | Since \(c_{\mathrm{det}} \approx 0 \ll c_{\mathcal{M}}\), |
| 426 | as \(\mathrm{cov}(H_k) \nearrow 1\): |
| 427 | \[ \mathbb{E}\bigl[C(H_k)\bigr] \;\longrightarrow\; c_{\mathrm{det}} \;\approx\; 0 \] |
| 428 | </p> |
| 429 | |
| 430 | <div class="callout callout-blue fragment"> |
| 431 | <strong>Break-even:</strong> compiling a task family costs one LLM call \(c_{\mathcal{M}}\) |
| 432 | (the teaching example). A single subsequent query recoups that cost entirely. |
| 433 | \(N^* \approx 1\). |
| 434 | </div> |
| 435 | </section> |
| 436 | |
| 437 | <!-- ═══════════════════════════════════════════ SLIDE 11a: Correctness refinement. Defines the error rate ε_i(t) and lists refinement paths 1 and 2; the third path is on the following slide, whose list continues with start="3". --> |
| 438 | <section> |
| 439 | <h2>Correctness Refinement</h2> |
| 440 | <p> |
| 441 | Each \(\tilde{p}_i\) is initially <em>unverified</em>. Let \(y^*(\theta)\) be |
| 442 | the ground-truth outcome. Define the error rate at refinement step \(t\): |
| 443 | \[ \varepsilon_i(t) \;=\; \Pr_{\theta \sim \Theta_i}\!\bigl[\tilde{p}_i^{(t)}(\theta) \neq y^*(\theta)\bigr] \] |
| 444 | </p> |
| 445 | |
| 446 | <p class="fragment" style="margin-top:0.4em;">When a user signals an error, refinement takes one of three paths:</p> |
| 447 | <ol class="fragment"> |
| 448 | <li style="margin-bottom:0.2em;"><strong>Re-recording</strong> — back through \(\hat{g}_\pi\) with corrected context</li> |
| 449 | <li><strong>AI-assisted repair</strong> of \(\tilde{p}_i\): |
| 450 | <ul style="margin-top:0.15em;"> |
| 451 | <li>(a) direct engineering — unit tests, debugger, code review</li> |
| 452 | <li>(b) ask a code-generation model to fix \(\tilde{p}_i\) given the failure case</li> |
| 453 | </ul> |
| 454 | </li> |
| 455 | </ol> |
| 456 | |
| 457 | <div class="callout callout-green fragment" style="margin-top:0.45em;"> |
| 458 | Key asymmetry: \(\hat{g}_\pi\) is a black box — you can only reprompt. |
| 459 | \(\tilde{p}_i\) is <strong>conventional software</strong> — transparent, testable, refineable. |
| 460 | \(\varepsilon_i(t) \to 0\) as \(t \to \infty\). |
| 461 | </div> |
| 462 | </section> |
| 463 | |
| 464 | <!-- ═══════════════════════════════════════════ SLIDE 11b: Data-driven splitting. Third refinement path; the ordered list uses start="3" so its numbering continues from the previous slide. The related-work callout is shrunk to 0.63em inline. --> |
| 465 | <section> |
| 466 | <h2>Correctness Refinement — Data-Driven Splitting</h2> |
| 467 | <p> |
| 468 | A third path exploits accumulated user feedback directly. |
| 469 | Partition inputs: let \(A \subset \mathcal{T}_i\) be accepted, \(R \subset \mathcal{T}_i\) rejected. |
| 470 | </p> |
| 471 | |
| 472 | <ol class="fragment" start="3"> |
| 473 | <li><strong>Data-driven splitting</strong>: |
| 474 | <ul style="margin-top:0.2em;"> |
| 475 | <li style="margin-bottom:0.15em;">(a) <em>Grammar</em>: pose \((A,\, R)\) to \(\hat{g}_\pi\) — |
| 476 | <em>"find a rule accepting \(A\) and rejecting \(R\)"</em> — |
| 477 | replacing \(G_i\) with discriminating rules \(G_i^+,\, G_i^-\)</li> |
| 478 | <li>(b) <em>Program</em>: filter \(\tilde{p}_i\)'s control flow by coverage over \(A\) vs \(R\) — |
| 479 | automatically specializing two flow programs \(\tilde{p}_i^+,\, \tilde{p}_i^-\), |
| 480 | one per partition</li> |
| 481 | </ul> |
| 482 | </li> |
| 483 | </ol> |
| 484 | |
| 485 | <div class="callout callout-dark fragment" style="margin-top:0.4em; font-size:0.63em;"> |
| 486 | <strong>Program Synthesis connection</strong> — |
| 487 | path 3 is an instance of <em>synthesis from examples</em>: |
| 488 | grammar induction from \((A, R)\) recalls Angluin's L* / RPNI (Oncina & García 1992); |
| 489 | flow specialization resembles SyGuS (Alur et al. 2013) and FlashFill / PROSE (Gulwani et al.). |
| 490 | The novel element: \(\hat{g}_\pi\) as <em>oracle</em> generalizes to informal, natural-language domains; |
| 491 | regular closure makes \(G_i^- = \overline{G_i^+}\) automatic. |
| 492 | </div> |
| 493 | </section> |
| 494 | |
| 495 | <!-- ═══════════════════════════════════════════ SLIDE 12: Comparison. One row per property; cells classed "good" / "bad" are coloured by the table.cmp rules. Column headers carry scope="col", and the literal less-than sign inside the cost formula is written as an entity so the markup parses cleanly (KaTeX still receives the decoded character). --> |
| 496 | <section> |
| 497 | <h2>Neural vs. Discrete Distillation</h2> |
| 498 | <table class="cmp"> |
| 499 | <thead> |
| 500 | <tr> |
| 501 | <th></th> |
| 502 | <th scope="col">Neural Distillation</th> |
| 503 | <th scope="col">Discrete Distillation</th> |
| 504 | </tr> |
| 505 | </thead> |
| 506 | <tbody> |
| 507 | <tr> |
| 508 | <td>Source</td> |
| 509 | <td>\(\hat{F}_L\) (large model)</td> |
| 510 | <td>\(\hat{F}\) (any model)</td> |
| 511 | </tr> |
| 512 | <tr> |
| 513 | <td>Target</td> |
| 514 | <td>Small neural net \(\hat{F}_S\)</td> |
| 515 | <td class="good">Programs, grammars, schemas</td> |
| 516 | </tr> |
| 517 | <tr> |
| 518 | <td>Output</td> |
| 519 | <td class="bad">Probabilistic (\(\hat{\cdot}\))</td> |
| 520 | <td class="good">Deterministic (\(\tilde{\cdot}\))</td> |
| 521 | </tr> |
| 522 | <tr> |
| 523 | <td>Coverage</td> |
| 524 | <td>Broad — generalizes</td> |
| 525 | <td>Targeted — task families</td> |
| 526 | </tr> |
| 527 | <tr> |
| 528 | <td>Transparency</td> |
| 529 | <td class="bad">Opaque (weights)</td> |
| 530 | <td class="good">Readable (source code)</td> |
| 531 | </tr> |
| 532 | <tr> |
| 533 | <td>Error correction</td> |
| 534 | <td class="bad">Re-training</td> |
| 535 | <td class="good">Conventional debugging</td> |
| 536 | </tr> |
| 537 | <tr> |
| 538 | <td>Cost per query</td> |
| 539 | <td>\(c_{\mathcal{M}_S} &lt; c_{\mathcal{M}_L}\)</td> |
| 540 | <td class="good">\(c_{\mathrm{det}} \approx 0\)</td> |
| 541 | </tr> |
| 542 | <tr> |
| 543 | <td>Refinement loop</td> |
| 544 | <td class="bad">Gradient descent</td> |
| 545 | <td class="good">Engineering + user feedback</td> |
| 546 | </tr> |
| 547 | <tr> |
| 548 | <td>Artifact class</td> |
| 549 | <td class="bad">Parametric weights (opaque, fixed)</td> |
| 550 | <td class="good">Weakest sufficient programs + grammars</td> |
| 551 | </tr> |
| 552 | <tr> |
| 553 | <td>Algebraic ops</td> |
| 554 | <td class="bad">None — retrain to change</td> |
| 555 | <td class="good">Union, concat, complement — automatic</td> |
| 556 | </tr> |
| 557 | </tbody> |
| 558 | </table> |
| 559 | </section> |
| 560 | |
| 561 | <!-- ═══════════════════════════════════════════ SLIDE 13: Summary. Centered slide on the dark #1a1a2e background; the inner wrapper restores left alignment, each paragraph is its own fragment, and text colours are set inline. --> |
| 562 | <section class="center" data-background-color="#1a1a2e"> |
| 563 | <h2 style="color:#dde; border-bottom-color:#445;">Summary</h2> |
| 564 | <div style="text-align:left; font-size:0.82em; margin-top:0.8em;"> |
| 565 | <p class="fragment" style="color:#bbc;"> |
| 566 | \(\hat{F}\) is a <strong style="color:#99ccff;">probabilistic higher-order function</strong>: |
| 567 | prompt prefix → continuation function → sampled (possibly incorrect) answer. |
| 568 | </p> |
| 569 | <p class="fragment" style="color:#bbc; margin-top:0.7em;"> |
| 570 | <strong style="color:#99ccff;">Discrete distillation</strong> compiles outputs of \(\hat{F}\) |
| 571 | into programs \(\tilde{p}\) and grammars \(G\) — deterministic, transparent, refineable |
| 572 | structures that approximate \(\hat{F}\) on specific task families. |
| 573 | </p> |
| 574 | <p class="fragment" style="color:#bbc; margin-top:0.7em;"> |
| 575 | The handler \(H_k\) is a <strong style="color:#99ccff;">piecewise function</strong> |
| 576 | growing over time: deterministic branches multiply, cost collapses, |
| 577 | correctness improves through ordinary software engineering. |
| 578 | </p> |
| 579 | <p class="fragment" style="color:#bbc; margin-top:0.7em;"> |
| 580 | We target the <strong style="color:#99ccff;">weakest sufficient artifact class</strong> — |
| 581 | regular grammars (closed under union and complement) and bounded dataflow programs |
| 582 | (composable, terminating, verifiable). Weakness is a feature: algebraic closure means |
| 583 | splits, merges, and compositions are <em>automatic</em>. |
| 584 | Turing-complete programs fit the framework, but weaker classes accelerate refinement |
| 585 | and are secure by construction. |
| 586 | </p> |
| 587 | <p class="fragment" style="color:#7799cc; font-style:italic; margin-top:0.9em; text-align:center;"> |
| 588 | Where neural distillation produces a smaller black box,<br> |
| 589 | discrete distillation produces conventional software. |
| 590 | </p> |
| 591 | </div> |
| 592 | </section> |
| 593 | |
| 594 | </div><!-- slides --> |
| 595 | </div><!-- reveal --> |
| 596 | |
| 597 | <script src="https://cdn.jsdelivr.net/npm/reveal.js@4.6.1/dist/reveal.js"></script> |
| 598 | <script src="https://cdn.jsdelivr.net/npm/reveal.js@4.6.1/plugin/math/math.js"></script> |
| 599 | <script src="https://cdn.jsdelivr.net/npm/reveal.js@4.6.1/plugin/notes/notes.js"></script> |
| 600 | <script> |
| 601 | Reveal.initialize({ // deck-wide reveal.js configuration |
| 602 | hash: true, |
| 603 | center: false, |
| 604 | slideNumber: 'c/t', |
| 605 | transition: 'fade', |
| 606 | transitionSpeed: 'fast', |
| 607 | width: 1050, |
| 608 | height: 700, |
| 609 | margin: 0.06, |
| 610 | minScale: 0.1, |
| 611 | maxScale: 2.0, |
| 612 | plugins: [ RevealMath.KaTeX, RevealNotes ], |
| 613 | katex: { |
| 614 | version: '0.16.9', // pinned rather than 'latest', matching the pinned reveal.js 4.6.1, so a future KaTeX release cannot change how the math renders |
| 615 | delimiters: [ // display delimiters first, then inline |
| 616 | { left: '\\[', right: '\\]', display: true }, |
| 617 | { left: '\\(', right: '\\)', display: false }, |
| 618 | ], |
| 619 | }, |
| 620 | }); |
| 621 | |
| 622 | // reveal.js writes `top` as an inline style on slides when it lays them out. |
| 623 | // Re-assert top: 0 on the inline style itself, with 'important' priority, so the pin holds alongside the stylesheet rule. |
| 624 | function pinSlidesToTop() { |
| 625 | document.querySelectorAll('.reveal .slides > section').forEach(s => { |
| 626 | s.style.setProperty('top', '0px', 'important'); |
| 627 | }); |
| 628 | } |
| 629 | Reveal.on('ready', () => requestAnimationFrame(pinSlidesToTop)); // re-pin on the next frame after each event that may trigger a layout |
| 630 | Reveal.on('resize', () => requestAnimationFrame(pinSlidesToTop)); |
| 631 | Reveal.on('slidechanged', () => requestAnimationFrame(pinSlidesToTop)); |
| 632 | </script> |
| 633 | </body> |
| 634 | </html> |
| 635 | |