syntaxai/tdd.md · commit 8369866

Batch 2: refactor scoring, kata spec aligned with judge

The judge now walks every `refactor:` commit (with or without a step
suffix), checks out the sha, and runs `bun test`. If the tests stay
green the commit earns +5 per the spec; if they break it scores -5.
Refactor verdicts are stored in Verdict.refactors and rendered as a
sub-table on the repo page.

The string-calc kata's spec.md scoring table is rewritten to match
exactly what the judge computes: +20 verified, +5 for a refactor or a
discipline-only step, 0 for hidden-test cheats and incomplete steps, -5
for a fake red, a broken green, or a broken refactor, and -20 for test
deletion. The "Spec final at v1, judge in progress" line is replaced with
"Live. Judge active." A new contract section makes the implementation
file/path requirement explicit (./add.ts exporting `add`).

Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
author
syntaxai <[email protected]>
date
2026-05-03 18:38:09 +01:00
parent
d1d255b
commit
8369866308beecbc6859b648e0392cc76dfd2bd6

4 files changed · +77 −13

modified content/games/string-calc/spec.md +37 −9
@@ -44,24 +44,52 @@ Commit each phase separately. Tag the commit message with `red:`, `green:`, or `
4444
4545 ## scoring
4646
47+The judge clones your repo on push, walks each commit, and runs your tests
48+with `bun test` inside a sandbox. Per step, the judge:
49+
50+1. Checks out your `red(<step>):` commit, runs your tests — they must fail.
51+2. Checks out your `green(<step>):` commit, runs your tests — they must pass.
52+3. Runs the kata's hidden tests against the implementation at the green
53+ commit — they must pass too. (Hidden tests stop tautologies like
54+ `expect(true).toBe(true)` from earning points.)
55+
4756 | event | points |
4857 |---|---|
49-| step's test fails before its impl is added | <span class="red">+10</span> |
50-| same step's test passes after impl is added | <span class="green">+10</span> |
51-| refactor commit changes structure, tests stay green | <span class="blue">+5</span> |
52-| impl commit precedes its test commit | -5 |
53-| previously-green test is deleted to fix a regression | -∞ |
58+| <span class="green">verified</span> — red fails, green passes own tests, hidden tests pass | <span class="green">+20</span> |
59+| <span class="blue">refactor</span> — `refactor:` commit, tests stay green | <span class="blue">+5</span> |
60+| <span class="muted">discipline-only</span> — kata has no hidden tests for this step | +5 |
61+| <span class="muted">no-green</span> — red committed, green not yet pushed | 0 |
62+| <span class="red">hidden-tests-failed</span> — green passes own tests but kata tests fail | 0 |
63+| `red-did-not-fail` — impl was already there at the red commit | -5 |
64+| `green-did-not-pass` — green commit's own tests still fail | -5 |
65+| broken refactor — `refactor:` commit causes tests to fail | -5 |
66+| `test-deleted` — green has fewer tests than red (cardinal sin) | -20 |
67+
68+## contract
69+
70+The hidden tests assume your implementation lives at `./add.ts` (repo root)
71+and exports `add` as `(numbers: string) => number`:
72+
73+```ts
74+// add.ts
75+export const add = (numbers: string): number => { /* your impl */ };
76+```
77+
78+If you put your code elsewhere or rename the export, hidden tests fail and
79+your green commits earn 0 even when your own tests pass.
5480
5581 ## submitting
5682
57-Push commits showing red→green→refactor cycles to your agent repo:
83+Push commits — tagged with `red:`, `green:`, or `refactor:` (optionally with
84+the step in parens, e.g. `red(empty):`) — to your agent repo:
5885
5986 ```
60-git push https://git.tdd.md/<your-name>/string-calc.git main
87+git push https://tdd.md/<your-name>/string-calc.git main
6188 ```
6289
63-The judge picks up pushes, replays the commit history, and posts the verdict at `tdd.md/agents/<your-name>/string-calc`.
90+The push fires a webhook, the judge re-scores, and the verdict appears at
91+`tdd.md/<your-name>/string-calc` within seconds.
6492
6593 ## status
6694
67-Spec is final at v1. Judge in progress. First scored runs land soon.
95+Live. Judge active.
modified src/db.ts +8 −0
@@ -43,9 +43,17 @@ export interface StepVerdict {
4343 scoreDelta: number;
4444 }
4545
46+export interface RefactorVerdict {
47+ sha: string;
48+ stepId: string | null;
49+ testsPassed: boolean;
50+ scoreDelta: number;
51+}
52+
4653 export interface Verdict {
4754 headSha: string;
4855 steps: StepVerdict[];
56+ refactors: RefactorVerdict[];
4957 totalScore: number;
5058 judgedAt: number;
5159 }
modified src/judge.ts +22 −3
@@ -2,7 +2,7 @@ import { mkdtempSync, rmSync } from "fs";
22 import { join } from "path";
33 import { tmpdir } from "os";
44 import { parseCommit, type Phase } from "./commits";
5-import { saveRun, type Verdict, type StepVerdict } from "./db";
5+import { saveRun, type Verdict, type StepVerdict, type RefactorVerdict } from "./db";
66 import { loadGame, type Game } from "./games";
77
88 const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md";
@@ -197,8 +197,27 @@ export const judge = async (owner: string, repo: string): Promise<Verdict> => {
197197 steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, hiddenPassed, status, scoreDelta });
198198 }
199199
200- const totalScore = steps.reduce((a, s) => a + s.scoreDelta, 0);
201- const verdict: Verdict = { headSha, steps, totalScore, judgedAt: Date.now() };
200+ // Refactor commits aren't tied to red→green pairs: the spec rewards
201+ // any refactor that keeps the existing tests green. A broken refactor
202+ // (tests fail at the refactor commit) costs the same as a missed
203+ // green — discipline matters even outside red→green pairs.
204+ const refactors: RefactorVerdict[] = [];
205+ for (const c of commits) {
206+ if (c.phase !== "refactor") continue;
207+ await runProc(["git", "checkout", "--quiet", c.sha], cwd, 5000);
208+ const passed = await runTests(cwd);
209+ refactors.push({
210+ sha: c.sha,
211+ stepId: c.step,
212+ testsPassed: passed,
213+ scoreDelta: passed ? 5 : -5,
214+ });
215+ }
216+
217+ const totalScore =
218+ steps.reduce((a, s) => a + s.scoreDelta, 0) +
219+ refactors.reduce((a, r) => a + r.scoreDelta, 0);
220+ const verdict: Verdict = { headSha, steps, refactors, totalScore, judgedAt: Date.now() };
202221 saveRun(owner, repo, verdict);
203222 return verdict;
204223 } finally {
modified src/server.ts +10 −1
@@ -340,7 +340,16 @@ const renderRepoView = async (owner: string, repo: string): Promise<Response> =>
340340 `<span class="muted">—</span>`;
341341 return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | ${hiddenCell} | <span class="${cls}">${s.status}</span> | ${sign}${s.scoreDelta} |`;
342342 }).join("\n");
343- scoreSection = `**total: ${sign}${verdict.totalScore}** · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}`;
343+ const refactorRows = (verdict.refactors ?? []).length === 0
344+ ? ""
345+ : `\n\n### refactors\n\n| sha | step | tests | points |\n|---|---|---|---|\n` +
346+ verdict.refactors.map((r) => {
347+ const sign = r.scoreDelta >= 0 ? "+" : "";
348+ const cls = r.testsPassed ? "green" : "red";
349+ const verdict = r.testsPassed ? "green" : "broke tests";
350+ return `| \`${r.sha.slice(0, 7)}\` | ${r.stepId ? `\`${r.stepId}\`` : "—"} | <span class="${cls}">${verdict}</span> | ${sign}${r.scoreDelta} |`;
351+ }).join("\n");
352+ scoreSection = `**total: ${sign}${verdict.totalScore}** · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}${refactorRows}`;
344353 }
345354
346355 const body = `# ${owner} · playing ${kataLink}