syntaxai/tdd.md · commit d1d255b

Batch 1: hidden tests, auth, test-deletion detection

Closes the four critical audit findings, along with related cleanup:

- Hidden tests per kata step. content/games/string-calc/hidden/<step>.ts
  files hold authoritative tests that import from "./add" (the kata's
  documented import path). The judge copies the matching hidden file
  into the agent's working tree as __hidden_<step>__.test.ts after the
  green checkout, runs it in isolation, and only awards +20 when both
  the agent's own tests and the hidden tests pass. Tautological tests
  ("expect(true).toBe(true)") now score 0 ("hidden-tests-failed")
  instead of +20.

- POST /api/judge/:owner/:repo now requires
  Authorization: Bearer <FORGEJO_ADMIN_TOKEN>. Anyone could previously
  trigger heavy clone+test cycles on any repo. Push-driven judge runs
  still arrive via /api/forgejo/webhook with HMAC verification and are
  unaffected.

- Test-deletion detection. countTests() reads tracked *.test.ts files
  after the red checkout and again after the green checkout, and
  compares the test() / it() call counts. If green has fewer than red,
  the step is flagged "test-deleted" and scored -20. The spec calls
  this -∞; we cap it at "wipes the +20 you would've earned, then some".

- Game gained signature and importPath, and Step gained requirement
  and hiddenTestFile, so loaders and renderers can do more than just
  pass the id through. spec.ts is the source of truth for the kata; the
  human-readable spec.md will be brought in line in a follow-up.

- Repo page now has a "hidden" column showing pass/fail/—; status
  cells use blue for "discipline-only" and red for fail-states.

- Dropped the never-read SESSION_SECRET podman secret env wiring.
  OAuth state lives entirely in the HttpOnly cookie; HMAC for webhooks
  uses WEBHOOK_SECRET.

Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
author
syntaxai <[email protected]>
date
2026-05-03 18:35:59 +01:00
parent
2093c3c
commit
d1d255b048a3668a37f65e3ae9188707ea3245db

13 files changed · +237 −14

added content/games/string-calc/hidden/custom-separator.ts +10 −0
@@ -0,0 +1,10 @@
1+import { test, expect } from "bun:test";
2+import { add } from "./add";
3+
4+test("HIDDEN: '//;\\n1;2' returns 3", () => {
5+ expect(add("//;\n1;2")).toBe(3);
6+});
7+
8+test("HIDDEN: '//#\\n1#2#3' returns 6", () => {
9+ expect(add("//#\n1#2#3")).toBe(6);
10+});
added content/games/string-calc/hidden/empty.ts +6 −0
@@ -0,0 +1,6 @@
1+import { test, expect } from "bun:test";
2+import { add } from "./add";
3+
4+test("HIDDEN: empty string returns 0", () => {
5+ expect(add("")).toBe(0);
6+});
added content/games/string-calc/hidden/n-numbers.ts +10 −0
@@ -0,0 +1,10 @@
1+import { test, expect } from "bun:test";
2+import { add } from "./add";
3+
4+test("HIDDEN: '1,2,3,4' returns 10", () => {
5+ expect(add("1,2,3,4")).toBe(10);
6+});
7+
8+test("HIDDEN: '1,2,3,4,5,6,7,8,9,10' returns 55", () => {
9+ expect(add("1,2,3,4,5,6,7,8,9,10")).toBe(55);
10+});
added content/games/string-calc/hidden/negatives-throw.ts +14 −0
@@ -0,0 +1,14 @@
1+import { test, expect } from "bun:test";
2+import { add } from "./add";
3+
4+test("HIDDEN: single negative throws with negative listed", () => {
5+ expect(() => add("1,-2")).toThrow(/negatives not allowed.*-2/);
6+});
7+
8+test("HIDDEN: multiple negatives all listed", () => {
9+ expect(() => add("1,-2,-3")).toThrow(/negatives not allowed.*-2.*-3/);
10+});
11+
12+test("HIDDEN: positives never throw", () => {
13+ expect(() => add("1,2,3")).not.toThrow();
14+});
added content/games/string-calc/hidden/newline-separator.ts +10 −0
@@ -0,0 +1,10 @@
1+import { test, expect } from "bun:test";
2+import { add } from "./add";
3+
4+test("HIDDEN: '1\\n2,3' returns 6", () => {
5+ expect(add("1\n2,3")).toBe(6);
6+});
7+
8+test("HIDDEN: '1\\n2\\n3' returns 6", () => {
9+ expect(add("1\n2\n3")).toBe(6);
10+});
added content/games/string-calc/hidden/single-number.ts +14 −0
@@ -0,0 +1,14 @@
1+import { test, expect } from "bun:test";
2+import { add } from "./add";
3+
4+test("HIDDEN: single '1' returns 1", () => {
5+ expect(add("1")).toBe(1);
6+});
7+
8+test("HIDDEN: single '42' returns 42", () => {
9+ expect(add("42")).toBe(42);
10+});
11+
12+test("HIDDEN: single '0' returns 0", () => {
13+ expect(add("0")).toBe(0);
14+});
added content/games/string-calc/hidden/two-numbers.ts +14 −0
@@ -0,0 +1,14 @@
1+import { test, expect } from "bun:test";
2+import { add } from "./add";
3+
4+test("HIDDEN: '1,2' returns 3", () => {
5+ expect(add("1,2")).toBe(3);
6+});
7+
8+test("HIDDEN: '10,20' returns 30", () => {
9+ expect(add("10,20")).toBe(30);
10+});
11+
12+test("HIDDEN: '0,0' returns 0", () => {
13+ expect(add("0,0")).toBe(0);
14+});
modified content/games/string-calc/spec.ts +37 −7
@@ -2,13 +2,43 @@ import type { Game } from "../../../src/games";
22
33 export const spec: Game = {
44 id: "string-calc",
5+ signature: "add(numbers: string): number",
6+ importPath: "./add",
57 steps: [
6- { id: "empty" },
7- { id: "single-number" },
8- { id: "two-numbers" },
9- { id: "n-numbers" },
10- { id: "newline-separator" },
11- { id: "custom-separator" },
12- { id: "negatives-throw" },
8+ {
9+ id: "empty",
10+ requirement: "An empty string returns 0",
11+ hiddenTestFile: "hidden/empty.ts",
12+ },
13+ {
14+ id: "single-number",
15+ requirement: "A single number returns its value",
16+ hiddenTestFile: "hidden/single-number.ts",
17+ },
18+ {
19+ id: "two-numbers",
20+ requirement: "Two comma-separated numbers return their sum",
21+ hiddenTestFile: "hidden/two-numbers.ts",
22+ },
23+ {
24+ id: "n-numbers",
25+ requirement: "Any count of comma-separated numbers",
26+ hiddenTestFile: "hidden/n-numbers.ts",
27+ },
28+ {
29+ id: "newline-separator",
30+ requirement: "Newlines are valid separators alongside commas",
31+ hiddenTestFile: "hidden/newline-separator.ts",
32+ },
33+ {
34+ id: "custom-separator",
35+ requirement: "//<sep>\\n header defines a single-character custom separator",
36+ hiddenTestFile: "hidden/custom-separator.ts",
37+ },
38+ {
39+ id: "negatives-throw",
40+ requirement: "Negative inputs throw an error listing all negatives",
41+ hiddenTestFile: "hidden/negatives-throw.ts",
42+ },
1343 ],
1444 };
modified scripts/p620/tdd-md.container +0 −1
@@ -30,7 +30,6 @@ Environment=GITHUB_CLIENT_ID=Ov23li9O1wWWJDjlm6dX
3030
3131 Secret=tdd_github_client_secret,type=env,target=GITHUB_CLIENT_SECRET
3232 Secret=tdd_forgejo_admin_token,type=env,target=FORGEJO_ADMIN_TOKEN
33-Secret=tdd_session_secret,type=env,target=SESSION_SECRET
3433 Secret=tdd_webhook_secret,type=env,target=WEBHOOK_SECRET
3534
3635 # No PublishPort — the pod already publishes :44390 → :3000.
modified src/db.ts +12 −1
@@ -28,7 +28,18 @@ export interface StepVerdict {
2828 greenSha: string | null;
2929 redFailed: boolean | null;
3030 greenPassed: boolean | null;
31- status: "verified" | "no-green" | "red-did-not-fail" | "green-did-not-pass";
31+ // Whether the kata's authoritative hidden tests pass against the agent's
32+ // implementation at the green commit. null when they were not run: no
33+ // hidden tests for the step, green missing/failing, or tests deleted.
34+ hiddenPassed: boolean | null;
35+ status:
36+ | "verified"
37+ | "discipline-only"
38+ | "no-green"
39+ | "red-did-not-fail"
40+ | "green-did-not-pass"
41+ | "hidden-tests-failed"
42+ | "test-deleted";
3243 scoreDelta: number;
3344 }
3445
modified src/games.ts +12 −0
@@ -1,9 +1,21 @@
11 export interface Step {
22 id: string;
3+ requirement: string;
4+ // Path (relative to the kata's spec.ts) of the authoritative test file.
5+ // The judge copies this into the agent's working tree after the green
6+ // checkout and runs it — hidden tests are how we detect cheating where
7+ // an agent writes a tautological test like `expect(true).toBe(true)`.
8+ hiddenTestFile: string;
39 }
410
511 export interface Game {
612 id: string;
13+ // Human-readable function signature the agent must export. Documented
14+ // on the kata page so authors know what to build.
15+ signature: string;
16+ // The module path the hidden tests will import from. Agents must export
17+ // their solution from this exact path (relative to repo root).
18+ importPath: string;
719 steps: Step[];
820 }
921
modified src/judge.ts +78 −2
@@ -3,6 +3,7 @@ import { join } from "path";
33 import { tmpdir } from "os";
44 import { parseCommit, type Phase } from "./commits";
55 import { saveRun, type Verdict, type StepVerdict } from "./db";
6+import { loadGame, type Game } from "./games";
67
78 const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md";
89 const TEST_TIMEOUT_MS = 8000;
@@ -47,6 +48,51 @@ const runTests = async (cwd: string): Promise<boolean> => {
4748 return !r.timedOut && r.exitCode === 0;
4849 };
4950
51+// Count `test(` / `it(` calls in tracked *.test.ts files. Used to detect
52+// when an agent deletes tests between red and green to make a regression
53+// "pass" — a cardinal TDD sin per the kata spec.
54+const countTests = async (cwd: string): Promise<number> => {
55+ const r = await runProc(["git", "ls-files", "*.test.ts"], cwd, 5000);
56+ if (r.exitCode !== 0) return 0;
57+ const files = r.stdout.split("\n").filter((f) => f && !f.includes("__hidden_"));
58+ let count = 0;
59+ for (const f of files) {
60+ const content = await Bun.file(join(cwd, f))
61+ .text()
62+ .catch(() => "");
63+ const matches = content.match(/\b(?:test|it)\s*\(/g);
64+ if (matches) count += matches.length;
65+ }
66+ return count;
67+};
68+
69+// Runs the kata's authoritative tests against the agent's implementation
70+// at whatever commit is currently checked out. Copies the hidden test
71+// file into the working tree as __hidden_<step>__.test.ts so it doesn't
72+// collide with the agent's filenames, runs only that file, then deletes
73+// it. Returns null if the kata doesn't have hidden tests for this step.
74+const runHiddenTests = async (cwd: string, spec: Game, stepId: string): Promise<boolean | null> => {
75+ const stepDef = spec.steps.find((s) => s.id === stepId);
76+ if (!stepDef) return null;
77+ const sourcePath = `./content/games/${spec.id}/${stepDef.hiddenTestFile}`;
78+ const sourceFile = Bun.file(sourcePath);
79+ if (!(await sourceFile.exists())) return null;
80+ const content = await sourceFile.text();
81+ const targetName = `__hidden_${stepId}__.test.ts`;
82+ const targetPath = join(cwd, targetName);
83+ await Bun.write(targetPath, content);
84+ try {
85+ const r = await runProc(["bun", "test", targetName], cwd, TEST_TIMEOUT_MS);
86+ return !r.timedOut && r.exitCode === 0;
87+ } finally {
88+ try {
89+ rmSync(targetPath, { force: true });
90+ } catch {
91+ // best effort
92+ }
93+ }
94+};
95+
5096 interface CommitInfo {
5197 sha: string;
5298 phase: Phase;
@@ -93,32 +139,62 @@ export const judge = async (owner: string, repo: string): Promise<Verdict> => {
93139 }
94140 }
95141
142+ // Load the kata's authoritative spec — used to fetch hidden tests
143+ // per step. Repos that don't match a known kata get scored on red→green
144+ // discipline only (no hidden-test verification).
145+ let spec: Game | null = null;
146+ try {
147+ spec = await loadGame(repo);
148+ } catch {
149+ spec = null;
150+ }
151+
96152 const steps: StepVerdict[] = [];
97153 for (const [stepId, redSha] of stepRed) {
98154 const greenSha = stepGreen.get(stepId) ?? null;
99155 await runProc(["git", "checkout", "--quiet", redSha], cwd, 5000);
156+ const redTestCount = await countTests(cwd);
100157 const redPassed = await runTests(cwd);
101158 const redFailed = !redPassed;
102159 let greenPassed: boolean | null = null;
160+ let hiddenPassed: boolean | null = null;
161+ let testsDeleted = false;
103162 if (greenSha) {
104163 await runProc(["git", "checkout", "--quiet", greenSha], cwd, 5000);
164+ const greenTestCount = await countTests(cwd);
165+ testsDeleted = greenTestCount < redTestCount;
105166 greenPassed = await runTests(cwd);
167+ if (greenPassed && spec && !testsDeleted) {
168+ hiddenPassed = await runHiddenTests(cwd, spec, stepId);
169+ }
106170 }
171+
107172 let status: StepVerdict["status"];
108173 let scoreDelta = 0;
109174 if (greenSha === null) {
110175 status = "no-green";
176+ } else if (testsDeleted) {
177+ // The kata spec calls this -∞. Stiff penalty: the entire step's
178+ // potential gain (+20) is wiped and then some.
179+ status = "test-deleted";
180+ scoreDelta = -20;
111181 } else if (!redFailed) {
112182 status = "red-did-not-fail";
113183 scoreDelta = -5;
114184 } else if (greenPassed === false) {
115185 status = "green-did-not-pass";
116186 scoreDelta = -5;
117- } else {
187+ } else if (hiddenPassed === false) {
188+ status = "hidden-tests-failed";
189+ scoreDelta = 0;
190+ } else if (hiddenPassed === true) {
118191 status = "verified";
119192 scoreDelta = 20;
193+ } else {
194+ status = "discipline-only";
195+ scoreDelta = 5;
120196 }
121- steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, status, scoreDelta });
197+ steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, hiddenPassed, status, scoreDelta });
122198 }
123199
124200 const totalScore = steps.reduce((a, s) => a + s.scoreDelta, 0);
modified src/server.ts +20 −3
@@ -322,13 +322,23 @@ const renderRepoView = async (owner: string, repo: string): Promise<Response> =>
322322 } else {
323323 const stale = verdictStale ? ` · <span class="muted">stale — newer commits not yet judged</span>` : "";
324324 const sign = verdict.totalScore >= 0 ? "+" : "";
325+ const statusClass = (status: string): string => {
326+ if (status === "verified") return "green";
327+ if (status === "discipline-only") return "blue";
328+ if (status === "no-green") return "muted";
329+ return "red";
330+ };
325331 const rows = verdict.steps.length === 0
326332 ? "_No red→green pairs found yet._"
327- : `| step | red | green | status | points |\n|---|---|---|---|---|\n` +
333+ : `| step | red | green | hidden | status | points |\n|---|---|---|---|---|---|\n` +
328334 verdict.steps.map((s) => {
329- const cls = s.status === "verified" ? "green" : s.status === "no-green" ? "muted" : "red";
335+ const cls = statusClass(s.status);
330336 const sign = s.scoreDelta >= 0 ? "+" : "";
331- return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | <span class="${cls}">${s.status}</span> | ${sign}${s.scoreDelta} |`;
337+ const hiddenCell =
338+ s.hiddenPassed === true ? `<span class="green">pass</span>` :
339+ s.hiddenPassed === false ? `<span class="red">fail</span>` :
340+ `<span class="muted">—</span>`;
341+ return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | ${hiddenCell} | <span class="${cls}">${s.status}</span> | ${sign}${s.scoreDelta} |`;
332342 }).join("\n");
333343 scoreSection = `**total: ${sign}${verdict.totalScore}** · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}`;
334344 }
@@ -486,6 +496,13 @@ ${url("https://tdd.md/leaderboard", "0.7")}
486496 if (req.method !== "POST") {
487497 return new Response("method not allowed; POST to trigger a judge run", { status: 405 });
488498 }
499+ // Manual triggers require the admin token. Push-driven runs come
500+ // through /api/forgejo/webhook with HMAC signature verification.
501+ const adminToken = process.env.FORGEJO_ADMIN_TOKEN;
502+ const provided = req.headers.get("authorization")?.replace(/^[Bb]earer\s+/, "") ?? "";
503+ if (!adminToken || !timingSafeEqual(provided, adminToken)) {
504+ return new Response("unauthorized — POST with `Authorization: Bearer <admin-token>`", { status: 401 });
505+ }
489506 try {
490507 const verdict = await judge(req.params.owner, req.params.repo);
491508 return Response.json(verdict);