d1d255b048a3668a37f65e3ae9188707ea3245db diff --git a/content/games/string-calc/hidden/custom-separator.ts b/content/games/string-calc/hidden/custom-separator.ts new file mode 100644 index 0000000000000000000000000000000000000000..47bd47813e1a717af2148c0acf63e3f14d640232 --- /dev/null +++ b/content/games/string-calc/hidden/custom-separator.ts @@ -0,0 +1,10 @@ +import { test, expect } from "bun:test"; +import { add } from "./add"; + +test("HIDDEN: '//;\\n1;2' returns 3", () => { + expect(add("//;\n1;2")).toBe(3); +}); + +test("HIDDEN: '//#\\n1#2#3' returns 6", () => { + expect(add("//#\n1#2#3")).toBe(6); +}); diff --git a/content/games/string-calc/hidden/empty.ts b/content/games/string-calc/hidden/empty.ts new file mode 100644 index 0000000000000000000000000000000000000000..1a5377c1ea27c7ba1defa5f3286565c6c1749cdf --- /dev/null +++ b/content/games/string-calc/hidden/empty.ts @@ -0,0 +1,6 @@ +import { test, expect } from "bun:test"; +import { add } from "./add"; + +test("HIDDEN: empty string returns 0", () => { + expect(add("")).toBe(0); +}); diff --git a/content/games/string-calc/hidden/n-numbers.ts b/content/games/string-calc/hidden/n-numbers.ts new file mode 100644 index 0000000000000000000000000000000000000000..3346d2a838bd24b73a2728332695b8823a9e0e24 --- /dev/null +++ b/content/games/string-calc/hidden/n-numbers.ts @@ -0,0 +1,10 @@ +import { test, expect } from "bun:test"; +import { add } from "./add"; + +test("HIDDEN: '1,2,3,4' returns 10", () => { + expect(add("1,2,3,4")).toBe(10); +}); + +test("HIDDEN: '1,2,3,4,5,6,7,8,9,10' returns 55", () => { + expect(add("1,2,3,4,5,6,7,8,9,10")).toBe(55); +}); diff --git a/content/games/string-calc/hidden/negatives-throw.ts b/content/games/string-calc/hidden/negatives-throw.ts new file mode 100644 index 0000000000000000000000000000000000000000..ad9522851745115c5df7a196b4133a3d0bed81d8 --- /dev/null +++ b/content/games/string-calc/hidden/negatives-throw.ts @@ -0,0 +1,14 @@ +import { test, expect } from "bun:test"; +import { add } from "./add"; + +test("HIDDEN: single negative throws with negative listed", () => { + expect(() => add("1,-2")).toThrow(/negatives not allowed.*-2/); +}); + +test("HIDDEN: multiple negatives all listed", () => { + expect(() => add("1,-2,-3")).toThrow(/negatives not allowed.*-2.*-3/); +}); + +test("HIDDEN: positives never throw", () => { + expect(() => add("1,2,3")).not.toThrow(); +}); diff --git a/content/games/string-calc/hidden/newline-separator.ts b/content/games/string-calc/hidden/newline-separator.ts new file mode 100644 index 0000000000000000000000000000000000000000..6531a71a83732666259d1655c2c049154c8bbe17 --- /dev/null +++ b/content/games/string-calc/hidden/newline-separator.ts @@ -0,0 +1,10 @@ +import { test, expect } from "bun:test"; +import { add } from "./add"; + +test("HIDDEN: '1\\n2,3' returns 6", () => { + expect(add("1\n2,3")).toBe(6); +}); + +test("HIDDEN: '1\\n2\\n3' returns 6", () => { + expect(add("1\n2\n3")).toBe(6); +}); diff --git a/content/games/string-calc/hidden/single-number.ts b/content/games/string-calc/hidden/single-number.ts new file mode 100644 index 0000000000000000000000000000000000000000..8cf18d403e4d2b2b2b431895bd68ebd08c730fd5 --- /dev/null +++ b/content/games/string-calc/hidden/single-number.ts @@ -0,0 +1,14 @@ +import { test, expect } from "bun:test"; +import { add } from "./add"; + +test("HIDDEN: single '1' returns 1", () => { + expect(add("1")).toBe(1); +}); + +test("HIDDEN: single '42' returns 42", () => { + expect(add("42")).toBe(42); +}); + +test("HIDDEN: single '0' returns 0", () => { + expect(add("0")).toBe(0); +}); diff --git a/content/games/string-calc/hidden/two-numbers.ts b/content/games/string-calc/hidden/two-numbers.ts new file mode 100644 index 0000000000000000000000000000000000000000..ad517bff4274b5fa423e9e0d6cda8a6177b98642 --- /dev/null +++ b/content/games/string-calc/hidden/two-numbers.ts @@ -0,0 +1,14 @@ +import { test, expect } from "bun:test"; +import { add } from "./add"; + +test("HIDDEN: '1,2' returns 3", () => { + expect(add("1,2")).toBe(3); +}); + +test("HIDDEN: '10,20' returns 30", () => { + expect(add("10,20")).toBe(30); +}); + +test("HIDDEN: '0,0' returns 0", () => { + expect(add("0,0")).toBe(0); +}); diff --git a/content/games/string-calc/spec.ts b/content/games/string-calc/spec.ts index e728b013680d8d78b7806c97b49378a9b428a89b..fe1cc1b35b8cb2b09ae3caeeee8f7ad1bb94c3e7 100644 --- a/content/games/string-calc/spec.ts +++ b/content/games/string-calc/spec.ts @@ -2,13 +2,43 @@ import type { Game } from "../../../src/games"; export const spec: Game = { id: "string-calc", + signature: "add(numbers: string): number", + importPath: "./add", steps: [ - { id: "empty" }, - { id: "single-number" }, - { id: "two-numbers" }, - { id: "n-numbers" }, - { id: "newline-separator" }, - { id: "custom-separator" }, - { id: "negatives-throw" }, + { + id: "empty", + requirement: "An empty string returns 0", + hiddenTestFile: "hidden/empty.ts", + }, + { + id: "single-number", + requirement: "A single number returns its value", + hiddenTestFile: "hidden/single-number.ts", + }, + { + id: "two-numbers", + requirement: "Two comma-separated numbers return their sum", + hiddenTestFile: "hidden/two-numbers.ts", + }, + { + id: "n-numbers", + requirement: "Any count of comma-separated numbers", + hiddenTestFile: "hidden/n-numbers.ts", + }, + { + id: "newline-separator", + requirement: "Newlines are valid separators alongside commas", + hiddenTestFile: "hidden/newline-separator.ts", + }, + { + id: "custom-separator", + requirement: "//\\n header defines a single-character custom separator", + hiddenTestFile: "hidden/custom-separator.ts", + }, + { + id: "negatives-throw", + requirement: "Negative inputs throw an error listing all negatives", + hiddenTestFile: "hidden/negatives-throw.ts", + }, ], }; diff --git a/scripts/p620/tdd-md.container b/scripts/p620/tdd-md.container index 068d7ec9b32539d557ffdcc18e28db19a812b63e..90282dab67a59184cebf2469bb210dc6f827f490 100644 --- a/scripts/p620/tdd-md.container +++ b/scripts/p620/tdd-md.container @@ -30,7 +30,6 @@ Environment=GITHUB_CLIENT_ID=Ov23li9O1wWWJDjlm6dX Secret=tdd_github_client_secret,type=env,target=GITHUB_CLIENT_SECRET Secret=tdd_forgejo_admin_token,type=env,target=FORGEJO_ADMIN_TOKEN -Secret=tdd_session_secret,type=env,target=SESSION_SECRET Secret=tdd_webhook_secret,type=env,target=WEBHOOK_SECRET # Geen PublishPort — pod publisht al :44390 → :3000. diff --git a/src/db.ts b/src/db.ts index deaea29ceb719f3e6ac5ef1344009f6a08670302..13ea1b201efff83497af9e84d18de1604920eae3 100644 --- a/src/db.ts +++ b/src/db.ts @@ -28,7 +28,18 @@ export interface StepVerdict { greenSha: string | null; redFailed: boolean | null; greenPassed: boolean | null; - status: "verified" | "no-green" | "red-did-not-fail" | "green-did-not-pass"; + // Whether the kata's authoritative hidden tests pass against the agent's + // implementation at the green commit. null when no hidden tests exist + // for the step (unknown kata, or step not registered with the spec). + hiddenPassed: boolean | null; + status: + | "verified" + | "discipline-only" + | "no-green" + | "red-did-not-fail" + | "green-did-not-pass" + | "hidden-tests-failed" + | "test-deleted"; scoreDelta: number; } diff --git a/src/games.ts b/src/games.ts index 944cf662e1fbb113843a98026d6f5809c6fde04a..edb78827f8d9450a9b178b51a795bd5bde340c91 100644 --- a/src/games.ts +++ b/src/games.ts @@ -1,9 +1,21 @@ export interface Step { id: string; + requirement: string; + // Path (relative to the kata's spec.ts) of the authoritative test file. + // The judge copies this into the agent's working tree after the green + // checkout and runs it — hidden tests are how we detect cheating where + // an agent writes a tautological test like `expect(true).toBe(true)`. + hiddenTestFile: string; } export interface Game { id: string; + // Human-readable function signature the agent must export. Documented + // on the kata page so authors know what to build. + signature: string; + // The module path the hidden tests will import from. Agents must export + // their solution from this exact path (relative to repo root). + importPath: string; steps: Step[]; } diff --git a/src/judge.ts b/src/judge.ts index 01f1e02c7b73867e1bbc85a260dcfc135d03b234..5d69f4570a1610d3047e8150d61230150a959183 100644 --- a/src/judge.ts +++ b/src/judge.ts @@ -3,6 +3,7 @@ import { join } from "path"; import { tmpdir } from "os"; import { parseCommit, type Phase } from "./commits"; import { saveRun, type Verdict, type StepVerdict } from "./db"; +import { loadGame, type Game } from "./games"; const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md"; const TEST_TIMEOUT_MS = 8000; @@ -47,6 +48,51 @@ const runTests = async (cwd: string): Promise => { return !r.timedOut && r.exitCode === 0; }; +// Count `test(` / `it(` calls in tracked *.test.ts files. Used to detect +// when an agent deletes tests between red and green to make a regression +// "pass" — a cardinal TDD sin per the kata spec. +const countTests = async (cwd: string): Promise => { + const r = await runProc(["git", "ls-files", "*.test.ts"], cwd, 5000); + if (r.exitCode !== 0) return 0; + const files = r.stdout.split("\n").filter((f) => f && !f.includes("__hidden_")); + let count = 0; + for (const f of files) { + const content = await Bun.file(join(cwd, f)) + .text() + .catch(() => ""); + const matches = content.match(/\b(?:test|it)\s*\(/g); + if (matches) count += matches.length; + } + return count; +}; + +// Runs the kata's authoritative tests against the agent's implementation +// at whatever commit is currently checked out. Copies the hidden test +// file into the working tree under a __hidden__ prefix so it doesn't +// collide with the agent's filenames, runs only that file, then deletes +// it. Returns null if the kata doesn't have hidden tests for this step. +const runHiddenTests = async (cwd: string, spec: Game, stepId: string): Promise => { + const stepDef = spec.steps.find((s) => s.id === stepId); + if (!stepDef) return null; + const sourcePath = `./content/games/${spec.id}/${stepDef.hiddenTestFile}`; + const sourceFile = Bun.file(sourcePath); + if (!(await sourceFile.exists())) return null; + const content = await sourceFile.text(); + const targetName = `__hidden_${stepId}__.test.ts`; + const targetPath = join(cwd, targetName); + await Bun.write(targetPath, content); + try { + const r = await runProc(["bun", "test", targetName], cwd, TEST_TIMEOUT_MS); + return !r.timedOut && r.exitCode === 0; + } finally { + try { + rmSync(targetPath, { force: true }); + } catch { + // best effort + } + } +}; + interface CommitInfo { sha: string; phase: Phase; @@ -93,32 +139,62 @@ export const judge = async (owner: string, repo: string): Promise => { } } + // Load the kata's authoritative spec — used to fetch hidden tests + // per step. Repos that don't match a known kata get scored on red→green + // discipline only (no hidden-test verification). + let spec: Game | null = null; + try { + spec = await loadGame(repo); + } catch { + spec = null; + } + const steps: StepVerdict[] = []; for (const [stepId, redSha] of stepRed) { const greenSha = stepGreen.get(stepId) ?? null; await runProc(["git", "checkout", "--quiet", redSha], cwd, 5000); + const redTestCount = await countTests(cwd); const redPassed = await runTests(cwd); const redFailed = !redPassed; let greenPassed: boolean | null = null; + let hiddenPassed: boolean | null = null; + let testsDeleted = false; if (greenSha) { await runProc(["git", "checkout", "--quiet", greenSha], cwd, 5000); + const greenTestCount = await countTests(cwd); + testsDeleted = greenTestCount < redTestCount; greenPassed = await runTests(cwd); + if (greenPassed && spec && !testsDeleted) { + hiddenPassed = await runHiddenTests(cwd, spec, stepId); + } } + let status: StepVerdict["status"]; let scoreDelta = 0; if (greenSha === null) { status = "no-green"; + } else if (testsDeleted) { + // The kata spec calls this -∞. Stiff penalty: the entire step's + // potential gain (+20) is wiped and then some. + status = "test-deleted"; + scoreDelta = -20; } else if (!redFailed) { status = "red-did-not-fail"; scoreDelta = -5; } else if (greenPassed === false) { status = "green-did-not-pass"; scoreDelta = -5; - } else { + } else if (hiddenPassed === false) { + status = "hidden-tests-failed"; + scoreDelta = 0; + } else if (hiddenPassed === true) { status = "verified"; scoreDelta = 20; + } else { + status = "discipline-only"; + scoreDelta = 5; } - steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, status, scoreDelta }); + steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, hiddenPassed, status, scoreDelta }); } const totalScore = steps.reduce((a, s) => a + s.scoreDelta, 0); diff --git a/src/server.ts b/src/server.ts index 87dc5dda110696cb18a34b2610c08b49e9bd76cf..fb18639b222194d4297779ad0495adb996f2dd8d 100644 --- a/src/server.ts +++ b/src/server.ts @@ -322,13 +322,23 @@ const renderRepoView = async (owner: string, repo: string): Promise => } else { const stale = verdictStale ? ` · stale — newer commits not yet judged` : ""; const sign = verdict.totalScore >= 0 ? "+" : ""; + const statusClass = (status: string): string => { + if (status === "verified") return "green"; + if (status === "discipline-only") return "blue"; + if (status === "no-green") return "muted"; + return "red"; + }; const rows = verdict.steps.length === 0 ? "_No red→green pairs found yet._" - : `| step | red | green | status | points |\n|---|---|---|---|---|\n` + + : `| step | red | green | hidden | status | points |\n|---|---|---|---|---|---|\n` + verdict.steps.map((s) => { - const cls = s.status === "verified" ? "green" : s.status === "no-green" ? "muted" : "red"; + const cls = statusClass(s.status); const sign = s.scoreDelta >= 0 ? "+" : ""; - return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | ${s.status} | ${sign}${s.scoreDelta} |`; + const hiddenCell = + s.hiddenPassed === true ? `pass` : + s.hiddenPassed === false ? `fail` : + ``; + return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | ${hiddenCell} | ${s.status} | ${sign}${s.scoreDelta} |`; }).join("\n"); scoreSection = `**total: ${sign}${verdict.totalScore}** · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}`; } @@ -486,6 +496,13 @@ ${url("https://tdd.md/leaderboard", "0.7")} if (req.method !== "POST") { return new Response("method not allowed; POST to trigger a judge run", { status: 405 }); } + // Manual triggers require the admin token. Push-driven runs come + // through /api/forgejo/webhook with HMAC signature verification. + const adminToken = process.env.FORGEJO_ADMIN_TOKEN; + const provided = req.headers.get("authorization")?.replace(/^[Bb]earer\s+/, "") ?? ""; + if (!adminToken || !timingSafeEqual(provided, adminToken)) { + return new Response("unauthorized — POST with `Authorization: Bearer `", { status: 401 }); + } try { const verdict = await judge(req.params.owner, req.params.repo); return Response.json(verdict);