syntaxai/tdd.md · commit d1d255b

Batch 1: hidden tests, auth, test-deletion detection

Closes the four critical audit findings:

- Hidden tests per kata step. content/games/string-calc/hidden/<step>.ts
  files hold authoritative tests that import from "./add" (the kata's
  documented import path). The judge copies the matching hidden file
  into the agent's working tree as __hidden_<step>__.test.ts after the
  green checkout, runs it in isolation, and only awards +20 when both
  the agent's own tests and the hidden tests pass. Tautological tests
  ("expect(true).toBe(true)") now score 0 ("hidden-tests-failed")
  instead of +20.

- POST /api/judge/:owner/:repo now requires
  Authorization: Bearer <FORGEJO_ADMIN_TOKEN>. Anyone could previously
  trigger heavy clone+test cycles on any repo. Push-driven judge runs
  still arrive via /api/forgejo/webhook with HMAC verification and are
  unaffected.

- Test-deletion detection. countTests() reads the tracked *.test.ts
  files right after the red checkout and again right after the green
  checkout, and compares the test() / it() call counts. If green has
  fewer than red, the step is flagged "test-deleted" and scored -20 —
  the spec calls this -∞; we cap it at "wipes the +20 you would've
  earned, then some".

- The Game type gained signature and importPath, and the Step type
  gained requirement and hiddenTestFile, so loaders and renderers can
  do more than just pass the id through. spec.ts is the source of
  truth for the kata; the human-readable spec.md will be updated to
  match it in a follow-up.

- Repo page now has a "hidden" column showing pass/fail/—; status
  cells use blue for "discipline-only" and red for fail-states.

- Dropped the never-read SESSION_SECRET podman secret env wiring.
  OAuth state lives entirely in the HttpOnly cookie; HMAC for webhooks
  uses WEBHOOK_SECRET.

Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>

author: syntaxai <[email protected]>
date: 2026-05-03 18:35:59 +01:00
parent: 2093c3c
commit: d1d255b048a3668a37f65e3ae9188707ea3245db

13 files changed · +237 −14

added content/games/string-calc/hidden/custom-separator.ts +10 −0

@@ -0,0 +1,10 @@
	1	+import { test, expect } from "bun:test";
	2	+import { add } from "./add";
	3	+
	4	+test("HIDDEN: '//;\\n1;2' returns 3", () => {
	5	+ expect(add("//;\n1;2")).toBe(3);
	6	+});
	7	+
	8	+test("HIDDEN: '//#\\n1#2#3' returns 6", () => {
	9	+ expect(add("//#\n1#2#3")).toBe(6);
	10	+});

added content/games/string-calc/hidden/empty.ts +6 −0

@@ -0,0 +1,6 @@
	1	+import { test, expect } from "bun:test";
	2	+import { add } from "./add";
	3	+
	4	+test("HIDDEN: empty string returns 0", () => {
	5	+ expect(add("")).toBe(0);
	6	+});

added content/games/string-calc/hidden/n-numbers.ts +10 −0

@@ -0,0 +1,10 @@
	1	+import { test, expect } from "bun:test";
	2	+import { add } from "./add";
	3	+
	4	+test("HIDDEN: '1,2,3,4' returns 10", () => {
	5	+ expect(add("1,2,3,4")).toBe(10);
	6	+});
	7	+
	8	+test("HIDDEN: '1,2,3,4,5,6,7,8,9,10' returns 55", () => {
	9	+ expect(add("1,2,3,4,5,6,7,8,9,10")).toBe(55);
	10	+});

added content/games/string-calc/hidden/negatives-throw.ts +14 −0

@@ -0,0 +1,14 @@
	1	+import { test, expect } from "bun:test";
	2	+import { add } from "./add";
	3	+
	4	+test("HIDDEN: single negative throws with negative listed", () => {
	5	+ expect(() => add("1,-2")).toThrow(/negatives not allowed.*-2/);
	6	+});
	7	+
	8	+test("HIDDEN: multiple negatives all listed", () => {
	9	+ expect(() => add("1,-2,-3")).toThrow(/negatives not allowed.-2.-3/);
	10	+});
	11	+
	12	+test("HIDDEN: positives never throw", () => {
	13	+ expect(() => add("1,2,3")).not.toThrow();
	14	+});

added content/games/string-calc/hidden/newline-separator.ts +10 −0

@@ -0,0 +1,10 @@
	1	+import { test, expect } from "bun:test";
	2	+import { add } from "./add";
	3	+
	4	+test("HIDDEN: '1\\n2,3' returns 6", () => {
	5	+ expect(add("1\n2,3")).toBe(6);
	6	+});
	7	+
	8	+test("HIDDEN: '1\\n2\\n3' returns 6", () => {
	9	+ expect(add("1\n2\n3")).toBe(6);
	10	+});

added content/games/string-calc/hidden/single-number.ts +14 −0

@@ -0,0 +1,14 @@
	1	+import { test, expect } from "bun:test";
	2	+import { add } from "./add";
	3	+
	4	+test("HIDDEN: single '1' returns 1", () => {
	5	+ expect(add("1")).toBe(1);
	6	+});
	7	+
	8	+test("HIDDEN: single '42' returns 42", () => {
	9	+ expect(add("42")).toBe(42);
	10	+});
	11	+
	12	+test("HIDDEN: single '0' returns 0", () => {
	13	+ expect(add("0")).toBe(0);
	14	+});

added content/games/string-calc/hidden/two-numbers.ts +14 −0

@@ -0,0 +1,14 @@
	1	+import { test, expect } from "bun:test";
	2	+import { add } from "./add";
	3	+
	4	+test("HIDDEN: '1,2' returns 3", () => {
	5	+ expect(add("1,2")).toBe(3);
	6	+});
	7	+
	8	+test("HIDDEN: '10,20' returns 30", () => {
	9	+ expect(add("10,20")).toBe(30);
	10	+});
	11	+
	12	+test("HIDDEN: '0,0' returns 0", () => {
	13	+ expect(add("0,0")).toBe(0);
	14	+});

modified content/games/string-calc/spec.ts +37 −7

@@ -2,13 +2,43 @@ import type { Game } from "../../../src/games";
2	2
3	3	export const spec: Game = {
4	4	id: "string-calc",
	5	+ signature: "add(numbers: string): number",
	6	+ importPath: "./add",
5	7	steps: [
6		- { id: "empty" },
7		- { id: "single-number" },
8		- { id: "two-numbers" },
9		- { id: "n-numbers" },
10		- { id: "newline-separator" },
11		- { id: "custom-separator" },
12		- { id: "negatives-throw" },
	8	+ {
	9	+ id: "empty",
	10	+ requirement: "An empty string returns 0",
	11	+ hiddenTestFile: "hidden/empty.ts",
	12	+ },
	13	+ {
	14	+ id: "single-number",
	15	+ requirement: "A single number returns its value",
	16	+ hiddenTestFile: "hidden/single-number.ts",
	17	+ },
	18	+ {
	19	+ id: "two-numbers",
	20	+ requirement: "Two comma-separated numbers return their sum",
	21	+ hiddenTestFile: "hidden/two-numbers.ts",
	22	+ },
	23	+ {
	24	+ id: "n-numbers",
	25	+ requirement: "Any count of comma-separated numbers",
	26	+ hiddenTestFile: "hidden/n-numbers.ts",
	27	+ },
	28	+ {
	29	+ id: "newline-separator",
	30	+ requirement: "Newlines are valid separators alongside commas",
	31	+ hiddenTestFile: "hidden/newline-separator.ts",
	32	+ },
	33	+ {
	34	+ id: "custom-separator",
	35	+ requirement: "//<sep>\\n header defines a single-character custom separator",
	36	+ hiddenTestFile: "hidden/custom-separator.ts",
	37	+ },
	38	+ {
	39	+ id: "negatives-throw",
	40	+ requirement: "Negative inputs throw an error listing all negatives",
	41	+ hiddenTestFile: "hidden/negatives-throw.ts",
	42	+ },
13	43	],
14	44	};

modified scripts/p620/tdd-md.container +0 −1

@@ -30,7 +30,6 @@ Environment=GITHUB_CLIENT_ID=Ov23li9O1wWWJDjlm6dX
30	30
31	31	Secret=tdd_github_client_secret,type=env,target=GITHUB_CLIENT_SECRET
32	32	Secret=tdd_forgejo_admin_token,type=env,target=FORGEJO_ADMIN_TOKEN
33		-Secret=tdd_session_secret,type=env,target=SESSION_SECRET
34	33	Secret=tdd_webhook_secret,type=env,target=WEBHOOK_SECRET
35	34
36	35	# No PublishPort — the pod already publishes :44390 → :3000.

modified src/db.ts +12 −1

@@ -28,7 +28,18 @@ export interface StepVerdict {
28	28	greenSha: string | null;
29	29	redFailed: boolean | null;
30	30	greenPassed: boolean | null;
31		- status: "verified" | "no-green" | "red-did-not-fail" | "green-did-not-pass";
	31	+ // Whether the kata's authoritative hidden tests pass against the agent's
	32	+ // implementation at the green commit. null when no hidden tests exist
	33	+ // for the step (unknown kata, or step not registered with the spec), or when they were never run (no green commit, green tests failed, or tests were deleted).
	34	+ hiddenPassed: boolean | null;
	35	+ status:
	36	+ | "verified"
	37	+ | "discipline-only"
	38	+ | "no-green"
	39	+ | "red-did-not-fail"
	40	+ | "green-did-not-pass"
	41	+ | "hidden-tests-failed"
	42	+ | "test-deleted";
32	43	scoreDelta: number;
33	44	}
34	45

modified src/games.ts +12 −0

@@ -1,9 +1,21 @@
1	1	export interface Step {
2	2	id: string;
	3	+ requirement: string;
	4	+ // Path (relative to the kata's spec.ts) of the authoritative test file.
	5	+ // The judge copies this into the agent's working tree after the green
	6	+ // checkout and runs it — hidden tests are how we detect cheating where
	7	+ // an agent writes a tautological test like `expect(true).toBe(true)`.
	8	+ hiddenTestFile: string;
3	9	}
4	10
5	11	export interface Game {
6	12	id: string;
	13	+ // Human-readable function signature the agent must export. Documented
	14	+ // on the kata page so authors know what to build.
	15	+ signature: string;
	16	+ // The module path the hidden tests will import from. Agents must export
	17	+ // their solution from this exact path (relative to repo root).
	18	+ importPath: string;
7	19	steps: Step[];
8	20	}
9	21

modified src/judge.ts +78 −2

@@ -3,6 +3,7 @@ import { join } from "path";
3	3	import { tmpdir } from "os";
4	4	import { parseCommit, type Phase } from "./commits";
5	5	import { saveRun, type Verdict, type StepVerdict } from "./db";
	6	+import { loadGame, type Game } from "./games";
6	7
7	8	const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md";
8	9	const TEST_TIMEOUT_MS = 8000;
@@ -47,6 +48,51 @@ const runTests = async (cwd: string): Promise<boolean> => {
47	48	return !r.timedOut && r.exitCode === 0;
48	49	};
49	50
	51	+// Count `test(` / `it(` calls in tracked *.test.ts files. Used to detect
	52	+// when an agent deletes tests between red and green to make a regression
	53	+// "pass" — a cardinal TDD sin per the kata spec.
	54	+const countTests = async (cwd: string): Promise<number> => {
	55	+ const r = await runProc(["git", "ls-files", "*.test.ts"], cwd, 5000);
	56	+ if (r.exitCode !== 0) return 0;
	57	+ const files = r.stdout.split("\n").filter((f) => f && !f.includes("__hidden_"));
	58	+ let count = 0;
	59	+ for (const f of files) {
	60	+ const content = await Bun.file(join(cwd, f))
	61	+ .text()
	62	+ .catch(() => "");
	63	+ const matches = content.match(/\b(?:test|it)\s*\(/g);
	64	+ if (matches) count += matches.length;
	65	+ }
	66	+ return count;
	67	+};
	68	+
	69	+// Runs the kata's authoritative tests against the agent's implementation
	70	+// at whatever commit is currently checked out. Copies the hidden test
	71	+// file into the working tree as __hidden_<step>__.test.ts so it doesn't
	72	+// collide with the agent's filenames, runs only that file, then deletes
	73	+// it. Returns null if the step is not in the spec or its hidden test file does not exist.
	74	+const runHiddenTests = async (cwd: string, spec: Game, stepId: string): Promise<boolean | null> => {
	75	+ const stepDef = spec.steps.find((s) => s.id === stepId);
	76	+ if (!stepDef) return null;
	77	+ const sourcePath = `./content/games/${spec.id}/${stepDef.hiddenTestFile}`;
	78	+ const sourceFile = Bun.file(sourcePath);
	79	+ if (!(await sourceFile.exists())) return null;
	80	+ const content = await sourceFile.text();
	81	+ const targetName = `__hidden_${stepId}__.test.ts`;
	82	+ const targetPath = join(cwd, targetName);
	83	+ await Bun.write(targetPath, content);
	84	+ try {
	85	+ const r = await runProc(["bun", "test", targetName], cwd, TEST_TIMEOUT_MS);
	86	+ return !r.timedOut && r.exitCode === 0;
	87	+ } finally {
	88	+ try {
	89	+ rmSync(targetPath, { force: true });
	90	+ } catch {
	91	+ // best effort
	92	+ }
	93	+ }
	94	+};
	95	+
50	96	interface CommitInfo {
51	97	sha: string;
52	98	phase: Phase;
@@ -93,32 +139,62 @@ export const judge = async (owner: string, repo: string): Promise<Verdict> => {
93	139	}
94	140	}
95	141
	142	+ // Load the kata's authoritative spec — used to fetch hidden tests
	143	+ // per step. Repos that don't match a known kata get scored on red→green
	144	+ // discipline only (no hidden-test verification).
	145	+ let spec: Game | null = null;
	146	+ try {
	147	+ spec = await loadGame(repo);
	148	+ } catch {
	149	+ spec = null;
	150	+ }
	151	+
96	152	const steps: StepVerdict[] = [];
97	153	for (const [stepId, redSha] of stepRed) {
98	154	const greenSha = stepGreen.get(stepId) ?? null;
99	155	await runProc(["git", "checkout", "--quiet", redSha], cwd, 5000);
	156	+ const redTestCount = await countTests(cwd);
100	157	const redPassed = await runTests(cwd);
101	158	const redFailed = !redPassed;
102	159	let greenPassed: boolean | null = null;
	160	+ let hiddenPassed: boolean | null = null;
	161	+ let testsDeleted = false;
103	162	if (greenSha) {
104	163	await runProc(["git", "checkout", "--quiet", greenSha], cwd, 5000);
	164	+ const greenTestCount = await countTests(cwd);
	165	+ testsDeleted = greenTestCount < redTestCount;
105	166	greenPassed = await runTests(cwd);
	167	+ if (greenPassed && spec && !testsDeleted) {
	168	+ hiddenPassed = await runHiddenTests(cwd, spec, stepId);
	169	+ }
106	170	}
	171	+
107	172	let status: StepVerdict["status"];
108	173	let scoreDelta = 0;
109	174	if (greenSha === null) {
110	175	status = "no-green";
	176	+ } else if (testsDeleted) {
	177	+ // The kata spec calls this -∞. Stiff penalty: the entire step's
	178	+ // potential gain (+20) is wiped and then some.
	179	+ status = "test-deleted";
	180	+ scoreDelta = -20;
111	181	} else if (!redFailed) {
112	182	status = "red-did-not-fail";
113	183	scoreDelta = -5;
114	184	} else if (greenPassed === false) {
115	185	status = "green-did-not-pass";
116	186	scoreDelta = -5;
117		- } else {
	187	+ } else if (hiddenPassed === false) {
	188	+ status = "hidden-tests-failed";
	189	+ scoreDelta = 0;
	190	+ } else if (hiddenPassed === true) {
118	191	status = "verified";
119	192	scoreDelta = 20;
	193	+ } else {
	194	+ status = "discipline-only";
	195	+ scoreDelta = 5;
120	196	}
121		- steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, status, scoreDelta });
	197	+ steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, hiddenPassed, status, scoreDelta });
122	198	}
123	199
124	200	const totalScore = steps.reduce((a, s) => a + s.scoreDelta, 0);

modified src/server.ts +20 −3

@@ -322,13 +322,23 @@ const renderRepoView = async (owner: string, repo: string): Promise<Response> =>
322	322	} else {
323	323	const stale = verdictStale ? ` · <span class="muted">stale — newer commits not yet judged</span>` : "";
324	324	const sign = verdict.totalScore >= 0 ? "+" : "";
	325	+ const statusClass = (status: string): string => {
	326	+ if (status === "verified") return "green";
	327	+ if (status === "discipline-only") return "blue";
	328	+ if (status === "no-green") return "muted";
	329	+ return "red";
	330	+ };
325	331	const rows = verdict.steps.length === 0
326	332	? "_No red→green pairs found yet._"
327		- : `| step | red | green | status | points |\n|---|---|---|---|---|\n` +
	333	+ : `| step | red | green | hidden | status | points |\n|---|---|---|---|---|---|\n` +
328	334	verdict.steps.map((s) => {
329		- const cls = s.status === "verified" ? "green" : s.status === "no-green" ? "muted" : "red";
	335	+ const cls = statusClass(s.status);
330	336	const sign = s.scoreDelta >= 0 ? "+" : "";
331		- return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | <span class="${cls}">${s.status}</span> | ${sign}${s.scoreDelta} |`;
	337	+ const hiddenCell =
	338	+ s.hiddenPassed === true ? `<span class="green">pass</span>` :
	339	+ s.hiddenPassed === false ? `<span class="red">fail</span>` :
	340	+ `<span class="muted">—</span>`;
	341	+ return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | ${hiddenCell} | <span class="${cls}">${s.status}</span> | ${sign}${s.scoreDelta} |`;
332	342	}).join("\n");
333	343	scoreSection = `total: ${sign}${verdict.totalScore} · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}`;
334	344	}
@@ -486,6 +496,13 @@ ${url("https://tdd.md/leaderboard", "0.7")}
486	496	if (req.method !== "POST") {
487	497	return new Response("method not allowed; POST to trigger a judge run", { status: 405 });
488	498	}
	499	+ // Manual triggers require the admin token. Push-driven runs come
	500	+ // through /api/forgejo/webhook with HMAC signature verification.
	501	+ const adminToken = process.env.FORGEJO_ADMIN_TOKEN;
	502	+ const provided = req.headers.get("authorization")?.replace(/^[Bb]earer\s+/, "") ?? "";
	503	+ if (!adminToken || !timingSafeEqual(provided, adminToken)) {
	504	+ return new Response("unauthorized — POST with `Authorization: Bearer <admin-token>`", { status: 401 });
	505	+ }
489	506	try {
490	507	const verdict = await judge(req.params.owner, req.params.repo);
491	508	return Response.json(verdict);

raw .diff