Batch 1: hidden tests, auth, test-deletion detection
Closes the four critical audit findings; the list below also includes the supporting changes that came with them:
- Hidden tests per kata step. content/games/string-calc/hidden/<step>.ts
files hold authoritative tests that import from "./add" (the kata's
documented import path). The judge copies the matching hidden file
into the agent's working tree as __hidden_<step>__.test.ts after the
green checkout, runs it in isolation, and only awards +20 when both
the agent's own tests and the hidden tests pass. Tautological tests
("expect(true).toBe(true)") now score 0 ("hidden-tests-failed")
instead of +20.
- POST /api/judge/:owner/:repo now requires
Authorization: Bearer <FORGEJO_ADMIN_TOKEN>. Anyone could previously
trigger heavy clone+test cycles on any repo. Push-driven judge runs
still arrive via /api/forgejo/webhook with HMAC verification and are
unaffected.
- Test-deletion detection. countTests() reads the tracked *.test.ts files
right after each of the red and green checkouts and compares the
test() / it() call counts. If green has fewer than red, the step is
flagged "test-deleted" and scored -20. The spec calls this -∞; we cap the
penalty at -20, i.e. it wipes the +20 you would've earned, and then some.
- Game gained signature and importPath, and Step gained requirement and
hiddenTestFile, so loaders and renderers can do more than just pass
the id through. spec.ts is the source of truth for the kata; the
human-readable spec.md will be updated to match in a follow-up.
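- Repo page now has a "hidden" column showing pass/fail/—; status
cells use green for "verified", blue for "discipline-only" (red→green
held but no hidden tests were available for the step, +5), muted for
"no-green", and red for fail-states.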
- Dropped the never-read SESSION_SECRET podman secret env wiring.
OAuth state lives entirely in the HttpOnly cookie; HMAC for webhooks
uses WEBHOOK_SECRET.
Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
13 files changed · +237 −14
content/games/string-calc/hidden/custom-separator.ts
+10
−0
| @@ -0,0 +1,10 @@ | ||
| 1 | +import { test, expect } from "bun:test"; | |
| 2 | +import { add } from "./add"; | |
| 3 | + | |
| 4 | +test("HIDDEN: '//;\\n1;2' returns 3", () => { | |
| 5 | + expect(add("//;\n1;2")).toBe(3); | |
| 6 | +}); | |
| 7 | + | |
| 8 | +test("HIDDEN: '//#\\n1#2#3' returns 6", () => { | |
| 9 | + expect(add("//#\n1#2#3")).toBe(6); | |
| 10 | +}); | |
content/games/string-calc/hidden/empty.ts
+6
−0
| @@ -0,0 +1,6 @@ | ||
| 1 | +import { test, expect } from "bun:test"; | |
| 2 | +import { add } from "./add"; | |
| 3 | + | |
| 4 | +test("HIDDEN: empty string returns 0", () => { | |
| 5 | + expect(add("")).toBe(0); | |
| 6 | +}); | |
content/games/string-calc/hidden/n-numbers.ts
+10
−0
| @@ -0,0 +1,10 @@ | ||
| 1 | +import { test, expect } from "bun:test"; | |
| 2 | +import { add } from "./add"; | |
| 3 | + | |
| 4 | +test("HIDDEN: '1,2,3,4' returns 10", () => { | |
| 5 | + expect(add("1,2,3,4")).toBe(10); | |
| 6 | +}); | |
| 7 | + | |
| 8 | +test("HIDDEN: '1,2,3,4,5,6,7,8,9,10' returns 55", () => { | |
| 9 | + expect(add("1,2,3,4,5,6,7,8,9,10")).toBe(55); | |
| 10 | +}); | |
content/games/string-calc/hidden/negatives-throw.ts
+14
−0
| @@ -0,0 +1,14 @@ | ||
| 1 | +import { test, expect } from "bun:test"; | |
| 2 | +import { add } from "./add"; | |
| 3 | + | |
| 4 | +test("HIDDEN: single negative throws with negative listed", () => { | |
| 5 | + expect(() => add("1,-2")).toThrow(/negatives not allowed.*-2/); | |
| 6 | +}); | |
| 7 | + | |
| 8 | +test("HIDDEN: multiple negatives all listed", () => { | |
| 9 | + expect(() => add("1,-2,-3")).toThrow(/negatives not allowed.*-2.*-3/); | |
| 10 | +}); | |
| 11 | + | |
| 12 | +test("HIDDEN: positives never throw", () => { | |
| 13 | + expect(() => add("1,2,3")).not.toThrow(); | |
| 14 | +}); | |
content/games/string-calc/hidden/newline-separator.ts
+10
−0
| @@ -0,0 +1,10 @@ | ||
| 1 | +import { test, expect } from "bun:test"; | |
| 2 | +import { add } from "./add"; | |
| 3 | + | |
| 4 | +test("HIDDEN: '1\\n2,3' returns 6", () => { | |
| 5 | + expect(add("1\n2,3")).toBe(6); | |
| 6 | +}); | |
| 7 | + | |
| 8 | +test("HIDDEN: '1\\n2\\n3' returns 6", () => { | |
| 9 | + expect(add("1\n2\n3")).toBe(6); | |
| 10 | +}); | |
content/games/string-calc/hidden/single-number.ts
+14
−0
| @@ -0,0 +1,14 @@ | ||
| 1 | +import { test, expect } from "bun:test"; | |
| 2 | +import { add } from "./add"; | |
| 3 | + | |
| 4 | +test("HIDDEN: single '1' returns 1", () => { | |
| 5 | + expect(add("1")).toBe(1); | |
| 6 | +}); | |
| 7 | + | |
| 8 | +test("HIDDEN: single '42' returns 42", () => { | |
| 9 | + expect(add("42")).toBe(42); | |
| 10 | +}); | |
| 11 | + | |
| 12 | +test("HIDDEN: single '0' returns 0", () => { | |
| 13 | + expect(add("0")).toBe(0); | |
| 14 | +}); | |
content/games/string-calc/hidden/two-numbers.ts
+14
−0
| @@ -0,0 +1,14 @@ | ||
| 1 | +import { test, expect } from "bun:test"; | |
| 2 | +import { add } from "./add"; | |
| 3 | + | |
| 4 | +test("HIDDEN: '1,2' returns 3", () => { | |
| 5 | + expect(add("1,2")).toBe(3); | |
| 6 | +}); | |
| 7 | + | |
| 8 | +test("HIDDEN: '10,20' returns 30", () => { | |
| 9 | + expect(add("10,20")).toBe(30); | |
| 10 | +}); | |
| 11 | + | |
| 12 | +test("HIDDEN: '0,0' returns 0", () => { | |
| 13 | + expect(add("0,0")).toBe(0); | |
| 14 | +}); | |
content/games/string-calc/spec.ts
+37
−7
| @@ -2,13 +2,43 @@ import type { Game } from "../../../src/games"; | ||
| 2 | 2 | |
| 3 | 3 | export const spec: Game = { |
| 4 | 4 | id: "string-calc", |
| 5 | + signature: "add(numbers: string): number", | |
| 6 | + importPath: "./add", | |
| 5 | 7 | steps: [ |
| 6 | - { id: "empty" }, | |
| 7 | - { id: "single-number" }, | |
| 8 | - { id: "two-numbers" }, | |
| 9 | - { id: "n-numbers" }, | |
| 10 | - { id: "newline-separator" }, | |
| 11 | - { id: "custom-separator" }, | |
| 12 | - { id: "negatives-throw" }, | |
| 8 | + { | |
| 9 | + id: "empty", | |
| 10 | + requirement: "An empty string returns 0", | |
| 11 | + hiddenTestFile: "hidden/empty.ts", | |
| 12 | + }, | |
| 13 | + { | |
| 14 | + id: "single-number", | |
| 15 | + requirement: "A single number returns its value", | |
| 16 | + hiddenTestFile: "hidden/single-number.ts", | |
| 17 | + }, | |
| 18 | + { | |
| 19 | + id: "two-numbers", | |
| 20 | + requirement: "Two comma-separated numbers return their sum", | |
| 21 | + hiddenTestFile: "hidden/two-numbers.ts", | |
| 22 | + }, | |
| 23 | + { | |
| 24 | + id: "n-numbers", | |
| 25 | + requirement: "Any count of comma-separated numbers", | |
| 26 | + hiddenTestFile: "hidden/n-numbers.ts", | |
| 27 | + }, | |
| 28 | + { | |
| 29 | + id: "newline-separator", | |
| 30 | + requirement: "Newlines are valid separators alongside commas", | |
| 31 | + hiddenTestFile: "hidden/newline-separator.ts", | |
| 32 | + }, | |
| 33 | + { | |
| 34 | + id: "custom-separator", | |
| 35 | + requirement: "//<sep>\\n header defines a single-character custom separator", | |
| 36 | + hiddenTestFile: "hidden/custom-separator.ts", | |
| 37 | + }, | |
| 38 | + { | |
| 39 | + id: "negatives-throw", | |
| 40 | + requirement: "Negative inputs throw an error listing all negatives", | |
| 41 | + hiddenTestFile: "hidden/negatives-throw.ts", | |
| 42 | + }, | |
| 13 | 43 | ], |
| 14 | 44 | }; |
scripts/p620/tdd-md.container
+0
−1
| @@ -30,7 +30,6 @@ Environment=GITHUB_CLIENT_ID=Ov23li9O1wWWJDjlm6dX | ||
| 30 | 30 | |
| 31 | 31 | Secret=tdd_github_client_secret,type=env,target=GITHUB_CLIENT_SECRET |
| 32 | 32 | Secret=tdd_forgejo_admin_token,type=env,target=FORGEJO_ADMIN_TOKEN |
| 33 | -Secret=tdd_session_secret,type=env,target=SESSION_SECRET | |
| 34 | 33 | Secret=tdd_webhook_secret,type=env,target=WEBHOOK_SECRET |
| 35 | 34 | |
| 36 | 35 | # Geen PublishPort — pod publisht al :44390 → :3000. |
src/db.ts
+12
−1
| @@ -28,7 +28,18 @@ export interface StepVerdict { | ||
| 28 | 28 | greenSha: string | null; |
| 29 | 29 | redFailed: boolean | null; |
| 30 | 30 | greenPassed: boolean | null; |
| 31 | - status: "verified" | "no-green" | "red-did-not-fail" | "green-did-not-pass"; | |
| 31 | + // Whether the kata's authoritative hidden tests pass against the agent's | |
| 32 | + // implementation at the green commit. null when no hidden tests exist | |
| 33 | + // for the step (unknown kata, or step not registered with the spec). | |
| 34 | + hiddenPassed: boolean | null; | |
| 35 | + status: | |
| 36 | + | "verified" | |
| 37 | + | "discipline-only" | |
| 38 | + | "no-green" | |
| 39 | + | "red-did-not-fail" | |
| 40 | + | "green-did-not-pass" | |
| 41 | + | "hidden-tests-failed" | |
| 42 | + | "test-deleted"; | |
| 32 | 43 | scoreDelta: number; |
| 33 | 44 | } |
| 34 | 45 | |
src/games.ts
+12
−0
| @@ -1,9 +1,21 @@ | ||
| 1 | 1 | export interface Step { |
| 2 | 2 | id: string; |
| 3 | + requirement: string; | |
| 4 | + // Path (relative to the kata's spec.ts) of the authoritative test file. | |
| 5 | + // The judge copies this into the agent's working tree after the green | |
| 6 | + // checkout and runs it — hidden tests are how we detect cheating where | |
| 7 | + // an agent writes a tautological test like `expect(true).toBe(true)`. | |
| 8 | + hiddenTestFile: string; | |
| 3 | 9 | } |
| 4 | 10 | |
| 5 | 11 | export interface Game { |
| 6 | 12 | id: string; |
| 13 | + // Human-readable function signature the agent must export. Documented | |
| 14 | + // on the kata page so authors know what to build. | |
| 15 | + signature: string; | |
| 16 | + // The module path the hidden tests will import from. Agents must export | |
| 17 | + // their solution from this exact path (relative to repo root). | |
| 18 | + importPath: string; | |
| 7 | 19 | steps: Step[]; |
| 8 | 20 | } |
| 9 | 21 | |
src/judge.ts
+78
−2
| @@ -3,6 +3,7 @@ import { join } from "path"; | ||
| 3 | 3 | import { tmpdir } from "os"; |
| 4 | 4 | import { parseCommit, type Phase } from "./commits"; |
| 5 | 5 | import { saveRun, type Verdict, type StepVerdict } from "./db"; |
| 6 | +import { loadGame, type Game } from "./games"; | |
| 6 | 7 | |
| 7 | 8 | const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md"; |
| 8 | 9 | const TEST_TIMEOUT_MS = 8000; |
| @@ -47,6 +48,51 @@ const runTests = async (cwd: string): Promise<boolean> => { | ||
| 47 | 48 | return !r.timedOut && r.exitCode === 0; |
| 48 | 49 | }; |
| 49 | 50 | |
| 51 | +// Count `test(` / `it(` calls in tracked *.test.ts files. Used to detect | |
| 52 | +// when an agent deletes tests between red and green to make a regression | |
| 53 | +// "pass" — a cardinal TDD sin per the kata spec. | |
| 54 | +const countTests = async (cwd: string): Promise<number> => { | |
| 55 | + const r = await runProc(["git", "ls-files", "*.test.ts"], cwd, 5000); | |
| 56 | + if (r.exitCode !== 0) return 0; | |
| 57 | + const files = r.stdout.split("\n").filter((f) => f && !f.includes("__hidden_")); | |
| 58 | + let count = 0; | |
| 59 | + for (const f of files) { | |
| 60 | + const content = await Bun.file(join(cwd, f)) | |
| 61 | + .text() | |
| 62 | + .catch(() => ""); | |
| 63 | + const matches = content.match(/\b(?:test|it)\s*\(/g); | |
| 64 | + if (matches) count += matches.length; | |
| 65 | + } | |
| 66 | + return count; | |
| 67 | +}; | |
| 68 | + | |
| 69 | +// Runs the kata's authoritative tests against the agent's implementation | |
| 70 | +// at whatever commit is currently checked out. Copies the hidden test | |
| 71 | +// file into the working tree under a __hidden__ prefix so it doesn't | |
| 72 | +// collide with the agent's filenames, runs only that file, then deletes | |
| 73 | +// it. Returns null if the kata doesn't have hidden tests for this step. | |
| 74 | +const runHiddenTests = async (cwd: string, spec: Game, stepId: string): Promise<boolean | null> => { | |
| 75 | + const stepDef = spec.steps.find((s) => s.id === stepId); | |
| 76 | + if (!stepDef) return null; | |
| 77 | + const sourcePath = `./content/games/${spec.id}/${stepDef.hiddenTestFile}`; | |
| 78 | + const sourceFile = Bun.file(sourcePath); | |
| 79 | + if (!(await sourceFile.exists())) return null; | |
| 80 | + const content = await sourceFile.text(); | |
| 81 | + const targetName = `__hidden_${stepId}__.test.ts`; | |
| 82 | + const targetPath = join(cwd, targetName); | |
| 83 | + await Bun.write(targetPath, content); | |
| 84 | + try { | |
| 85 | + const r = await runProc(["bun", "test", targetName], cwd, TEST_TIMEOUT_MS); | |
| 86 | + return !r.timedOut && r.exitCode === 0; | |
| 87 | + } finally { | |
| 88 | + try { | |
| 89 | + rmSync(targetPath, { force: true }); | |
| 90 | + } catch { | |
| 91 | + // best effort | |
| 92 | + } | |
| 93 | + } | |
| 94 | +}; | |
| 95 | + | |
| 50 | 96 | interface CommitInfo { |
| 51 | 97 | sha: string; |
| 52 | 98 | phase: Phase; |
| @@ -93,32 +139,62 @@ export const judge = async (owner: string, repo: string): Promise<Verdict> => { | ||
| 93 | 139 | } |
| 94 | 140 | } |
| 95 | 141 | |
| 142 | + // Load the kata's authoritative spec — used to fetch hidden tests | |
| 143 | + // per step. Repos that don't match a known kata get scored on red→green | |
| 144 | + // discipline only (no hidden-test verification). | |
| 145 | + let spec: Game | null = null; | |
| 146 | + try { | |
| 147 | + spec = await loadGame(repo); | |
| 148 | + } catch { | |
| 149 | + spec = null; | |
| 150 | + } | |
| 151 | + | |
| 96 | 152 | const steps: StepVerdict[] = []; |
| 97 | 153 | for (const [stepId, redSha] of stepRed) { |
| 98 | 154 | const greenSha = stepGreen.get(stepId) ?? null; |
| 99 | 155 | await runProc(["git", "checkout", "--quiet", redSha], cwd, 5000); |
| 156 | + const redTestCount = await countTests(cwd); | |
| 100 | 157 | const redPassed = await runTests(cwd); |
| 101 | 158 | const redFailed = !redPassed; |
| 102 | 159 | let greenPassed: boolean | null = null; |
| 160 | + let hiddenPassed: boolean | null = null; | |
| 161 | + let testsDeleted = false; | |
| 103 | 162 | if (greenSha) { |
| 104 | 163 | await runProc(["git", "checkout", "--quiet", greenSha], cwd, 5000); |
| 164 | + const greenTestCount = await countTests(cwd); | |
| 165 | + testsDeleted = greenTestCount < redTestCount; | |
| 105 | 166 | greenPassed = await runTests(cwd); |
| 167 | + if (greenPassed && spec && !testsDeleted) { | |
| 168 | + hiddenPassed = await runHiddenTests(cwd, spec, stepId); | |
| 169 | + } | |
| 106 | 170 | } |
| 171 | + | |
| 107 | 172 | let status: StepVerdict["status"]; |
| 108 | 173 | let scoreDelta = 0; |
| 109 | 174 | if (greenSha === null) { |
| 110 | 175 | status = "no-green"; |
| 176 | + } else if (testsDeleted) { | |
| 177 | + // The kata spec calls this -∞. Stiff penalty: the entire step's | |
| 178 | + // potential gain (+20) is wiped and then some. | |
| 179 | + status = "test-deleted"; | |
| 180 | + scoreDelta = -20; | |
| 111 | 181 | } else if (!redFailed) { |
| 112 | 182 | status = "red-did-not-fail"; |
| 113 | 183 | scoreDelta = -5; |
| 114 | 184 | } else if (greenPassed === false) { |
| 115 | 185 | status = "green-did-not-pass"; |
| 116 | 186 | scoreDelta = -5; |
| 117 | - } else { | |
| 187 | + } else if (hiddenPassed === false) { | |
| 188 | + status = "hidden-tests-failed"; | |
| 189 | + scoreDelta = 0; | |
| 190 | + } else if (hiddenPassed === true) { | |
| 118 | 191 | status = "verified"; |
| 119 | 192 | scoreDelta = 20; |
| 193 | + } else { | |
| 194 | + status = "discipline-only"; | |
| 195 | + scoreDelta = 5; | |
| 120 | 196 | } |
| 121 | - steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, status, scoreDelta }); | |
| 197 | + steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, hiddenPassed, status, scoreDelta }); | |
| 122 | 198 | } |
| 123 | 199 | |
| 124 | 200 | const totalScore = steps.reduce((a, s) => a + s.scoreDelta, 0); |
src/server.ts
+20
−3
| @@ -322,13 +322,23 @@ const renderRepoView = async (owner: string, repo: string): Promise<Response> => | ||
| 322 | 322 | } else { |
| 323 | 323 | const stale = verdictStale ? ` · <span class="muted">stale — newer commits not yet judged</span>` : ""; |
| 324 | 324 | const sign = verdict.totalScore >= 0 ? "+" : ""; |
| 325 | + const statusClass = (status: string): string => { | |
| 326 | + if (status === "verified") return "green"; | |
| 327 | + if (status === "discipline-only") return "blue"; | |
| 328 | + if (status === "no-green") return "muted"; | |
| 329 | + return "red"; | |
| 330 | + }; | |
| 325 | 331 | const rows = verdict.steps.length === 0 |
| 326 | 332 | ? "_No red→green pairs found yet._" |
| 327 | - : `| step | red | green | status | points |\n|---|---|---|---|---|\n` + | |
| 333 | + : `| step | red | green | hidden | status | points |\n|---|---|---|---|---|---|\n` + | |
| 328 | 334 | verdict.steps.map((s) => { |
| 329 | - const cls = s.status === "verified" ? "green" : s.status === "no-green" ? "muted" : "red"; | |
| 335 | + const cls = statusClass(s.status); | |
| 330 | 336 | const sign = s.scoreDelta >= 0 ? "+" : ""; |
| 331 | - return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | <span class="${cls}">${s.status}</span> | ${sign}${s.scoreDelta} |`; | |
| 337 | + const hiddenCell = | |
| 338 | + s.hiddenPassed === true ? `<span class="green">pass</span>` : | |
| 339 | + s.hiddenPassed === false ? `<span class="red">fail</span>` : | |
| 340 | + `<span class="muted">—</span>`; | |
| 341 | + return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | ${hiddenCell} | <span class="${cls}">${s.status}</span> | ${sign}${s.scoreDelta} |`; | |
| 332 | 342 | }).join("\n"); |
| 333 | 343 | scoreSection = `**total: ${sign}${verdict.totalScore}** · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}`; |
| 334 | 344 | } |
| @@ -486,6 +496,13 @@ ${url("https://tdd.md/leaderboard", "0.7")} | ||
| 486 | 496 | if (req.method !== "POST") { |
| 487 | 497 | return new Response("method not allowed; POST to trigger a judge run", { status: 405 }); |
| 488 | 498 | } |
| 499 | + // Manual triggers require the admin token. Push-driven runs come | |
| 500 | + // through /api/forgejo/webhook with HMAC signature verification. | |
| 501 | + const adminToken = process.env.FORGEJO_ADMIN_TOKEN; | |
| 502 | + const provided = req.headers.get("authorization")?.replace(/^[Bb]earer\s+/, "") ?? ""; | |
| 503 | + if (!adminToken || !timingSafeEqual(provided, adminToken)) { | |
| 504 | + return new Response("unauthorized — POST with `Authorization: Bearer <admin-token>`", { status: 401 }); | |
| 505 | + } | |
| 489 | 506 | try { |
| 490 | 507 | const verdict = await judge(req.params.owner, req.params.repo); |
| 491 | 508 | return Response.json(verdict); |