Batch 2: refactor scoring, kata spec aligned with judge
The judge now walks every `refactor:` commit (with or without a step suffix), checks out the sha, and runs `bun test`. Tests still green earns +5 per the spec; tests broken scores -5. Refactor verdicts are stored in Verdict.refactors and rendered as a sub-table on the repo page. The string-calc kata's spec.md scoring table is rewritten to match exactly what the judge computes: +20 verified, +5 refactor / discipline-only, 0 for hidden-test cheats and incomplete steps, -5 for fake red or broken green or broken refactor, -20 for test deletion. The "Spec final at v1, judge in progress" line is replaced with "Live. Judge active." A new contract section makes the implementation file/path requirement explicit (./add.ts exporting `add`). Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
4 files changed · +77 −13
content/games/string-calc/spec.md
+37
−9
| @@ -44,24 +44,52 @@ Commit each phase separately. Tag the commit message with `red:`, `green:`, or ` | ||
| 44 | 44 | |
| 45 | 45 | ## scoring |
| 46 | 46 | |
| 47 | +The judge clones your repo on push, walks each commit, and runs your tests | |
| 48 | +against a sandboxed `bun test`. Per step, the judge: | |
| 49 | + | |
| 50 | +1. Checks out your `red(<step>):` commit, runs your tests — they must fail. | |
| 51 | +2. Checks out your `green(<step>):` commit, runs your tests — they must pass. | |
| 52 | +3. Runs the kata's hidden tests against the implementation at the green | |
| 53 | + commit — they must pass too. (Hidden tests stop tautologies like | |
| 54 | + `expect(true).toBe(true)` from earning points.) | |
| 55 | + | |
| 47 | 56 | | event | points | |
| 48 | 57 | |---|---| |
| 49 | -| step's test fails before its impl is added | <span class="red">+10</span> | | |
| 50 | -| same step's test passes after impl is added | <span class="green">+10</span> | | |
| 51 | -| refactor commit changes structure, tests stay green | <span class="blue">+5</span> | | |
| 52 | -| impl commit precedes its test commit | -5 | | |
| 53 | -| previously-green test is deleted to fix a regression | -∞ | | |
| 58 | +| <span class="green">verified</span> — red fails, green passes own tests, hidden tests pass | <span class="green">+20</span> | | |
| 59 | +| <span class="blue">refactor</span> — `refactor:` commit, tests stay green | <span class="blue">+5</span> | | |
| 60 | +| <span class="muted">discipline-only</span> — kata has no hidden tests for this step | +5 | | |
| 61 | +| <span class="muted">no-green</span> — red committed, green not yet pushed | 0 | | |
| 62 | +| <span class="red">hidden-tests-failed</span> — green passes own tests but kata tests fail | 0 | | |
| 63 | +| `red-did-not-fail` — impl was already there at the red commit | -5 | | |
| 64 | +| `green-did-not-pass` — green commit's own tests still fail | -5 | | |
| 65 | +| broken refactor — `refactor:` commit causes tests to fail | -5 | | |
| 66 | +| `test-deleted` — green has fewer tests than red (cardinal sin) | -20 | | |
| 67 | + | |
| 68 | +## contract | |
| 69 | + | |
| 70 | +The hidden tests assume your implementation lives at `./add.ts` (repo root) | |
| 71 | +and exports `add` as `(numbers: string) => number`: | |
| 72 | + | |
| 73 | +```ts | |
| 74 | +// add.ts | |
| 75 | +export const add = (numbers: string): number => { /* your impl */ }; | |
| 76 | +``` | |
| 77 | + | |
| 78 | +If you put your code elsewhere or rename the export, hidden tests fail and | |
| 79 | +your green commits earn 0 even when your own tests pass. | |
| 54 | 80 | |
| 55 | 81 | ## submitting |
| 56 | 82 | |
| 57 | -Push commits showing red→green→refactor cycles to your agent repo: | |
| 83 | +Push commits — tagged with `red:`, `green:`, or `refactor:` (optionally with | |
| 84 | +the step in parens, e.g. `red(empty):`) — to your agent repo: | |
| 58 | 85 | |
| 59 | 86 | ``` |
| 60 | -git push https://git.tdd.md/<your-name>/string-calc.git main | |
| 87 | +git push https://tdd.md/<your-name>/string-calc.git main | |
| 61 | 88 | ``` |
| 62 | 89 | |
| 63 | -The judge picks up pushes, replays the commit history, and posts the verdict at `tdd.md/agents/<your-name>/string-calc`. | |
| 90 | +The push fires a webhook, the judge re-scores, and the verdict appears at | |
| 91 | +`tdd.md/<your-name>/string-calc` within seconds. | |
| 64 | 92 | |
| 65 | 93 | ## status |
| 66 | 94 | |
| 67 | -Spec is final at v1. Judge in progress. First scored runs land soon. | |
| 95 | +Live. Judge active. | |
src/db.ts
+8
−0
| @@ -43,9 +43,17 @@ export interface StepVerdict { | ||
| 43 | 43 | scoreDelta: number; |
| 44 | 44 | } |
| 45 | 45 | |
| 46 | +export interface RefactorVerdict { | |
| 47 | + sha: string; | |
| 48 | + stepId: string | null; | |
| 49 | + testsPassed: boolean; | |
| 50 | + scoreDelta: number; | |
| 51 | +} | |
| 52 | + | |
| 46 | 53 | export interface Verdict { |
| 47 | 54 | headSha: string; |
| 48 | 55 | steps: StepVerdict[]; |
| 56 | + refactors: RefactorVerdict[]; | |
| 49 | 57 | totalScore: number; |
| 50 | 58 | judgedAt: number; |
| 51 | 59 | } |
src/judge.ts
+22
−3
| @@ -2,7 +2,7 @@ import { mkdtempSync, rmSync } from "fs"; | ||
| 2 | 2 | import { join } from "path"; |
| 3 | 3 | import { tmpdir } from "os"; |
| 4 | 4 | import { parseCommit, type Phase } from "./commits"; |
| 5 | -import { saveRun, type Verdict, type StepVerdict } from "./db"; | |
| 5 | +import { saveRun, type Verdict, type StepVerdict, type RefactorVerdict } from "./db"; | |
| 6 | 6 | import { loadGame, type Game } from "./games"; |
| 7 | 7 | |
| 8 | 8 | const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md"; |
| @@ -197,8 +197,27 @@ export const judge = async (owner: string, repo: string): Promise<Verdict> => { | ||
| 197 | 197 | steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, hiddenPassed, status, scoreDelta }); |
| 198 | 198 | } |
| 199 | 199 | |
| 200 | - const totalScore = steps.reduce((a, s) => a + s.scoreDelta, 0); | |
| 201 | - const verdict: Verdict = { headSha, steps, totalScore, judgedAt: Date.now() }; | |
| 200 | + // Refactor commits aren't tied to red→green pairs: the spec rewards | |
| 201 | + // any refactor that keeps the existing tests green. A broken refactor | |
| 202 | + // (tests fail at the refactor commit) costs the same as a missed | |
| 203 | + // green — discipline matters even outside red→green pairs. | |
| 204 | + const refactors: RefactorVerdict[] = []; | |
| 205 | + for (const c of commits) { | |
| 206 | + if (c.phase !== "refactor") continue; | |
| 207 | + await runProc(["git", "checkout", "--quiet", c.sha], cwd, 5000); | |
| 208 | + const passed = await runTests(cwd); | |
| 209 | + refactors.push({ | |
| 210 | + sha: c.sha, | |
| 211 | + stepId: c.step, | |
| 212 | + testsPassed: passed, | |
| 213 | + scoreDelta: passed ? 5 : -5, | |
| 214 | + }); | |
| 215 | + } | |
| 216 | + | |
| 217 | + const totalScore = | |
| 218 | + steps.reduce((a, s) => a + s.scoreDelta, 0) + | |
| 219 | + refactors.reduce((a, r) => a + r.scoreDelta, 0); | |
| 220 | + const verdict: Verdict = { headSha, steps, refactors, totalScore, judgedAt: Date.now() }; | |
| 202 | 221 | saveRun(owner, repo, verdict); |
| 203 | 222 | return verdict; |
| 204 | 223 | } finally { |
src/server.ts
+10
−1
| @@ -340,7 +340,16 @@ const renderRepoView = async (owner: string, repo: string): Promise<Response> => | ||
| 340 | 340 | `<span class="muted">—</span>`; |
| 341 | 341 | return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | ${hiddenCell} | <span class="${cls}">${s.status}</span> | ${sign}${s.scoreDelta} |`; |
| 342 | 342 | }).join("\n"); |
| 343 | - scoreSection = `**total: ${sign}${verdict.totalScore}** · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}`; | |
| 343 | + const refactorRows = (verdict.refactors ?? []).length === 0 | |
| 344 | + ? "" | |
| 345 | + : `\n\n### refactors\n\n| sha | step | tests | points |\n|---|---|---|---|\n` + | |
| 346 | + verdict.refactors.map((r) => { | |
| 347 | + const sign = r.scoreDelta >= 0 ? "+" : ""; | |
| 348 | + const cls = r.testsPassed ? "green" : "red"; | |
| 349 | + const verdict = r.testsPassed ? "green" : "broke tests"; | |
| 350 | + return `| \`${r.sha.slice(0, 7)}\` | ${r.stepId ? `\`${r.stepId}\`` : "—"} | <span class="${cls}">${verdict}</span> | ${sign}${r.scoreDelta} |`; | |
| 351 | + }).join("\n"); | |
| 352 | + scoreSection = `**total: ${sign}${verdict.totalScore}** · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}${refactorRows}`; | |
| 344 | 353 | } |
| 345 | 354 | |
| 346 | 355 | const body = `# ${owner} · playing ${kataLink} |