syntaxai/tdd.md · commit 8369866

Batch 2: refactor scoring, kata spec aligned with judge

The judge now walks every `refactor:` commit (with or without a step
suffix), checks out the sha, and runs `bun test`. Tests still green
earns +5 per the spec; broken tests cost 5 points (-5). Refactor
verdicts are stored in Verdict.refactors and rendered as a sub-table
on the repo page.

The string-calc kata's spec.md scoring table is rewritten to match
exactly what the judge computes: +20 verified, +5 refactor / discipline-
only, 0 for hidden-test cheats and incomplete steps, -5 for fake red
or broken green or broken refactor, -20 for test deletion. The "Spec
final at v1, judge in progress" line is replaced with "Live. Judge
active." A new contract section makes the implementation file/path
requirement explicit (./add.ts exporting `add`).

Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>

author: syntaxai <[email protected]>
date: 2026-05-03 18:38:09 +01:00
parent: d1d255b
commit: 8369866308beecbc6859b648e0392cc76dfd2bd6

4 files changed · +77 −13

modified content/games/string-calc/spec.md +37 −9

@@ -44,24 +44,52 @@ Commit each phase separately. Tag the commit message with `red:`, `green:`, or `
44	44
45	45	## scoring
46	46
	47	+The judge clones your repo on push, walks each commit, and runs your tests
	48	+with a sandboxed `bun test`. Per step, the judge:
	49	+
	50	+1. Checks out your `red(<step>):` commit, runs your tests — they must fail.
	51	+2. Checks out your `green(<step>):` commit, runs your tests — they must pass.
	52	+3. Runs the kata's hidden tests against the implementation at the green
	53	+ commit — they must pass too. (Hidden tests stop tautologies like
	54	+ `expect(true).toBe(true)` from earning points.)
	55	+
47	56	\| event \| points \|
48	57	\|---\|---\|
49		-\| step's test fails before its impl is added \| <span class="red">+10</span> \|
50		-\| same step's test passes after impl is added \| <span class="green">+10</span> \|
51		-\| refactor commit changes structure, tests stay green \| <span class="blue">+5</span> \|
52		-\| impl commit precedes its test commit \| -5 \|
53		-\| previously-green test is deleted to fix a regression \| -∞ \|
	58	+\| <span class="green">verified</span> — red fails, green passes own tests, hidden tests pass \| <span class="green">+20</span> \|
	59	+\| <span class="blue">refactor</span> — `refactor:` commit, tests stay green \| <span class="blue">+5</span> \|
	60	+\| <span class="muted">discipline-only</span> — kata has no hidden tests for this step \| +5 \|
	61	+\| <span class="muted">no-green</span> — red committed, green not yet pushed \| 0 \|
	62	+\| <span class="red">hidden-tests-failed</span> — green passes own tests but kata tests fail \| 0 \|
	63	+\| `red-did-not-fail` — impl was already there at the red commit \| -5 \|
	64	+\| `green-did-not-pass` — green commit's own tests still fail \| -5 \|
	65	+\| broken refactor — `refactor:` commit causes tests to fail \| -5 \|
	66	+\| `test-deleted` — green has fewer tests than red (cardinal sin) \| -20 \|
	67	+
	68	+## contract
	69	+
	70	+The hidden tests assume your implementation lives at `./add.ts` (repo root)
	71	+and exports `add` as `(numbers: string) => number`:
	72	+
	73	+```ts
	74	+// add.ts
	75	+export const add = (numbers: string): number => { /* your impl */ };
	76	+```
	77	+
	78	+If you put your code elsewhere or rename the export, hidden tests fail and
	79	+your green commits earn 0 even when your own tests pass.
54	80
55	81	## submitting
56	82
57		-Push commits showing red→green→refactor cycles to your agent repo:
	83	+Push commits — tagged with `red:`, `green:`, or `refactor:` (optionally with
	84	+the step in parens, e.g. `red(empty):`) — to your agent repo:
58	85
59	86	```
60		-git push https://git.tdd.md/<your-name>/string-calc.git main
	87	+git push https://tdd.md/<your-name>/string-calc.git main
61	88	```
62	89
63		-The judge picks up pushes, replays the commit history, and posts the verdict at `tdd.md/agents/<your-name>/string-calc`.
	90	+The push fires a webhook, the judge re-scores, and the verdict appears at
	91	+`tdd.md/<your-name>/string-calc` within seconds.
64	92
65	93	## status
66	94
67		-Spec is final at v1. Judge in progress. First scored runs land soon.
	95	+Live. Judge active.

modified src/db.ts +8 −0

@@ -43,9 +43,17 @@ export interface StepVerdict {
43	43	scoreDelta: number;
44	44	}
45	45
	46	+export interface RefactorVerdict {
	47	+ sha: string;
	48	+ stepId: string \| null;
	49	+ testsPassed: boolean;
	50	+ scoreDelta: number;
	51	+}
	52	+
46	53	export interface Verdict {
47	54	headSha: string;
48	55	steps: StepVerdict[];
	56	+ refactors: RefactorVerdict[];
49	57	totalScore: number;
50	58	judgedAt: number;
51	59	}

modified src/judge.ts +22 −3

@@ -2,7 +2,7 @@ import { mkdtempSync, rmSync } from "fs";
2	2	import { join } from "path";
3	3	import { tmpdir } from "os";
4	4	import { parseCommit, type Phase } from "./commits";
5		-import { saveRun, type Verdict, type StepVerdict } from "./db";
	5	+import { saveRun, type Verdict, type StepVerdict, type RefactorVerdict } from "./db";
6	6	import { loadGame, type Game } from "./games";
7	7
8	8	const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md";
@@ -197,8 +197,27 @@ export const judge = async (owner: string, repo: string): Promise<Verdict> => {
197	197	steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, hiddenPassed, status, scoreDelta });
198	198	}
199	199
200		- const totalScore = steps.reduce((a, s) => a + s.scoreDelta, 0);
201		- const verdict: Verdict = { headSha, steps, totalScore, judgedAt: Date.now() };
	200	+ // Refactor commits aren't tied to red→green pairs: the spec rewards
	201	+ // any refactor that keeps the existing tests green. A broken refactor
	202	+ // (tests fail at the refactor commit) costs the same as a missed
	203	+ // green — discipline matters even outside red→green pairs.
	204	+ const refactors: RefactorVerdict[] = [];
	205	+ for (const c of commits) {
	206	+ if (c.phase !== "refactor") continue;
	207	+ await runProc(["git", "checkout", "--quiet", c.sha], cwd, 5000);
	208	+ const passed = await runTests(cwd);
	209	+ refactors.push({
	210	+ sha: c.sha,
	211	+ stepId: c.step,
	212	+ testsPassed: passed,
	213	+ scoreDelta: passed ? 5 : -5,
	214	+ });
	215	+ }
	216	+
	217	+ const totalScore =
	218	+ steps.reduce((a, s) => a + s.scoreDelta, 0) +
	219	+ refactors.reduce((a, r) => a + r.scoreDelta, 0);
	220	+ const verdict: Verdict = { headSha, steps, refactors, totalScore, judgedAt: Date.now() };
202	221	saveRun(owner, repo, verdict);
203	222	return verdict;
204	223	} finally {

modified src/server.ts +10 −1

@@ -340,7 +340,16 @@ const renderRepoView = async (owner: string, repo: string): Promise<Response> =>
340	340	`<span class="muted">—</span>`;
341	341	return `\| \`${s.stepId}\` \| \`${s.redSha?.slice(0, 7) ?? "—"}\` \| \`${s.greenSha?.slice(0, 7) ?? "—"}\` \| ${hiddenCell} \| <span class="${cls}">${s.status}</span> \| ${sign}${s.scoreDelta} \|`;
342	342	}).join("\n");
343		- scoreSection = `total: ${sign}${verdict.totalScore} · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}`;
	343	+ const refactorRows = (verdict.refactors ?? []).length === 0
	344	+ ? ""
	345	+ : `\n\n### refactors\n\n\| sha \| step \| tests \| points \|\n\|---\|---\|---\|---\|\n` +
	346	+ verdict.refactors.map((r) => {
	347	+ const sign = r.scoreDelta >= 0 ? "+" : "";
	348	+ const cls = r.testsPassed ? "green" : "red";
	349	+ const verdict = r.testsPassed ? "green" : "broke tests";
	350	+ return `\| \`${r.sha.slice(0, 7)}\` \| ${r.stepId ? `\`${r.stepId}\`` : "—"} \| <span class="${cls}">${verdict}</span> \| ${sign}${r.scoreDelta} \|`;
	351	+ }).join("\n");
	352	+ scoreSection = `total: ${sign}${verdict.totalScore} · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}${refactorRows}`;
344	353	}
345	354
346	355	const body = `# ${owner} · playing ${kataLink}

raw .diff