import { mkdtempSync, rmSync } from "fs";
import { join } from "path";
import { tmpdir } from "os";
import { parseCommit, type Phase } from "./a31_commits.ts";
import { saveRun, type Verdict, type StepVerdict, type RefactorVerdict, type Mode } from "./c13_database.ts";
import { loadGame, type Game } from "./a31_games.ts";

// Which tool executes the agent's tests; "none" means tests are never run.
type TestRunner = "bun" | "none";

// Judge settings resolved from the agent repo's tdd.config.json.
interface TddConfig {
  mode: Mode;
  testRunner: TestRunner;
}

// tdd.config.json from the agent's repo selects the scoring mode and
// test runner. Falls back to strict / bun when missing or unparseable.
//
//   { "mode": "pragmatic", "test_runner": "none" }
//
// test_runner: "none" enables trace-only judging — no checkout, no test
// execution. Useful as a CI gate on projects where Bun can't run the
// suite (e.g. .NET, Python without bun-compat tests).
//
// cwd is the directory holding the cloned repo. Unknown or malformed
// values are ignored field by field, so one bad key doesn't discard the
// other.
const readConfig = async (cwd: string): Promise<TddConfig> => {
  const config: TddConfig = { mode: "strict", testRunner: "bun" };
  const file = Bun.file(join(cwd, "tdd.config.json"));
  if (!(await file.exists())) return config;

  try {
    // The file is agent-controlled: treat it as unknown and narrow.
    const raw: unknown = await file.json();
    if (typeof raw !== "object" || raw === null) return config;

    const { mode, test_runner: runner } = raw as { mode?: unknown; test_runner?: unknown };
    if (mode === "pragmatic" || mode === "learning") config.mode = mode;
    if (runner === "none") config.testRunner = "none";
  } catch {
    // best effort — bad config falls back to defaults
  }
  return config;
};

// Penalty halving for pragmatic, zeroing for learning. Positive deltas
// are unchanged across modes — earned credit is earned credit.
//
// Halved penalties round toward zero (-5 becomes -2). A penalty that
// rounds away entirely is reported as plain 0, never as negative zero.
export const applyMode = (delta: number, mode: Mode): number => {
  if (delta >= 0) return delta;
  if (mode === "learning") return 0;
  if (mode === "pragmatic") {
    const halved = Math.ceil(delta / 2);
    // Math.ceil(-0.5) is -0; normalise so callers never see a signed zero.
    return halved === 0 ? 0 : halved;
  }
  return delta;
};

// Plain-language summary of a step verdict, written to the agent (not
// the human admin). One short paragraph; named intentionally so callers
// can see it next to the row in the score table.
const explainStep = (params: {
  status: StepVerdict["status"];
  redSha: string | null;
  greenSha: string | null;
  hiddenPassed: boolean | null;
  mode: Mode;
}): string => {
  // redSha / greenSha are accepted for call-site symmetry with the verdict
  // row; the wording currently depends only on status, hiddenPassed and mode.
  const { status, hiddenPassed, mode } = params;
  switch (status) {
    case "verified":
      return "Red failed as expected, green passes your tests, and the kata's hidden tests confirm the implementation matches the requirement.";
    case "discipline-only":
      return "Red→green discipline holds, but this kata didn't ship hidden tests for the step. Partial credit awarded; full +20 isn't possible without authoritative verification.";
    case "no-green":
      return "Red commit landed; the matching green() commit hasn't been pushed yet. Push your green to lock in the score.";
    case "red-did-not-fail":
      return mode === "pragmatic"
        ? "Combined red+green commit detected. Pragmatic mode allows this — the cycle still counts, just with a softer score than a clean separation."
        : "Red commit's tests already passed when the step was first introduced — meaning the implementation was added before the test, or the test is tautological. Switch to pragmatic mode if you commit red+green together intentionally.";
    case "green-did-not-pass":
      return "Green commit's own tests still fail. The implementation doesn't yet satisfy the test you wrote — fix the impl, or reconsider whether the test reflects the requirement.";
    case "hidden-tests-failed":
      return hiddenPassed === false
        ? "Your tests pass, but the kata's hidden tests don't — this is the classic tautology trap. Tighten your test to mirror the requirement (e.g., assert the actual return value, not just that it runs)."
        : "Your tests pass, but hidden verification was inconclusive. Re-push to retry.";
    case "test-deleted":
      return "Test count dropped between red and green for this step. Once a test exists it must keep existing — refactor it, don't delete it. If the test was wrong, replace it in a separate commit before resuming the cycle.";
    case "trace-verified":
      return "Trace-only mode: red→green pair found in the commit log. Tests weren't executed (test_runner: \"none\"). Switch to bun runner for behaviour verification.";
    case "trace-tests-shrunk":
      return "Trace-only mode: the green commit's tree has fewer test files than the red commit's tree — looks like deletion. If you renamed or split test files, the tally still drops.";
  }
};

// Agent-facing one-liner for a refactor commit: `passed` is whether the
// test suite was still green at that commit.
export const explainRefactor = (passed: boolean): string =>
  passed
    ? "Tests stayed green through the refactor — structural change without behavior change, the canonical refactor."
    : "Refactor commit broke at least one test. Either revert the refactor or write a new red→green to capture the changed behavior.";

// Base URL of the Forgejo instance repos are cloned from; overridable
// through the FORGEJO_URL environment variable.
const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md";

// Wall-clock budget for a single `bun test` invocation, in milliseconds.
const TEST_TIMEOUT_MS = 8000;

// Sandboxed env passed to git and bun subprocesses. Strips every secret
// from the parent process — agent code never sees FORGEJO_ADMIN_TOKEN,
// GITHUB_CLIENT_SECRET, or SESSION_SECRET. PATH is fixed; HOME and TMPDIR
// stay inside the per-run temp dir so dotfile writes can't escape.
const sandboxEnv = (cwd: string): Record<string, string> => ({
  PATH: "/usr/local/bin:/usr/bin:/bin",
  HOME: cwd,
  TMPDIR: cwd,
  NODE_ENV: "test",
});

// Runs `cmd` in `cwd` under the sandboxed env and captures its output.
// The child is SIGKILLed once `timeoutMs` elapses; `timedOut` reports
// whether that happened. stdout / stderr are returned trimmed.
const runProc = async (
  cmd: string[],
  cwd: string,
  timeoutMs: number,
): Promise<{ stdout: string; stderr: string; exitCode: number; timedOut: boolean }> => {
  const proc = Bun.spawn(cmd, {
    cwd,
    stdout: "pipe",
    stderr: "pipe",
    env: sandboxEnv(cwd),
  });

  let timedOut = false;
  const timer = setTimeout(() => {
    timedOut = true;
    proc.kill("SIGKILL");
  }, timeoutMs);

  // Disarm the timer the moment the child exits, so a slow pipe drain
  // can't be misreported as a timeout.
  const exited = proc.exited.finally(() => clearTimeout(timer));

  // Drain both pipes while waiting for exit rather than afterwards, so a
  // chatty child can't stall on a full pipe until the timeout kills it.
  const [exitCode, stdout, stderr] = await Promise.all([
    exited,
    new Response(proc.stdout).text(),
    new Response(proc.stderr).text(),
  ]);

  return { stdout: stdout.trim(), stderr: stderr.trim(), exitCode, timedOut };
};

// Runs the agent's own suite at whatever commit is checked out in `cwd`.
// A timeout counts as a failure.
const runTests = async (cwd: string): Promise<boolean> => {
  const r = await runProc(["bun", "test"], cwd, TEST_TIMEOUT_MS);
  // Bun test exits 0 only when all tests pass.
  return !r.timedOut && r.exitCode === 0;
};

// Conventional test-file names across ecosystems, matched against a
// repo-relative path with forward slashes.
const TEST_FILE_RE =
  /(?:^|\/)(?:[^/]*\.(?:test|spec)\.[a-z]+|[Tt]ests?\/[^/]+|test_[^/]+|[^/]+_(?:test|spec)\.[a-z]+|[^/]+[Tt]ests?\.cs|[^/]+[Tt]est\.java|[^/]+Spec\.scala)$/;

// True when `path` looks like a test file by naming convention alone.
const isTestFilePath = (path: string): boolean => TEST_FILE_RE.test(path);

// Language-agnostic test-file counter for trace-only mode. Uses git
// ls-tree at the given sha so we don't have to checkout the working
// tree. Matches conventional test-file naming across ecosystems:
// foo.test.ts, foo.spec.ts, FooTests.cs, FooTest.java, test_foo.py,
// foo_test.go, FooSpec.scala, foo_spec.rb.
//
// Returns 0 when git cannot list the tree.
const countTestFiles = async (cwd: string, sha: string): Promise<number> => {
  const r = await runProc(["git", "ls-tree", "-r", "--name-only", sha], cwd, 5000);
  if (r.exitCode !== 0) return 0;
  return r.stdout.split("\n").filter((path) => isTestFilePath(path)).length;
};

// Count `test(` / `it(` calls in tracked *.test.ts files. Used to detect
// when an agent deletes tests between red and green to make a regression
// "pass" — a cardinal TDD sin per the kata spec.
const countTests = async (cwd: string): Promise<number> => {
  const listing = await runProc(["git", "ls-files", "*.test.ts"], cwd, 5000);
  if (listing.exitCode !== 0) return 0;

  // Hidden kata tests belong to the judge, never to the agent's tally.
  const files = listing.stdout.split("\n").filter((f) => f && !f.includes("__hidden_"));

  let count = 0;
  for (const f of files) {
    // A tracked file that can't be read contributes nothing.
    const content = await Bun.file(join(cwd, f))
      .text()
      .catch(() => "");
    count += content.match(/\b(?:test|it)\s*\(/g)?.length ?? 0;
  }
  return count;
};

// Runs the kata's authoritative tests against the agent's implementation
// at whatever commit is currently checked out. Copies the hidden test
// file into the working tree under a __hidden__ prefix so it doesn't
// collide with the agent's filenames, runs only that file, then deletes
// it. Returns null if the kata doesn't have hidden tests for this step.
const runHiddenTests = async (cwd: string, spec: Game, stepId: string): Promise<boolean | null> => {
  const stepDef = spec.steps.find((s) => s.id === stepId);
  if (!stepDef || !stepDef.hiddenTestFile) return null;

  const sourcePath = `./content/games/${spec.id}/${stepDef.hiddenTestFile}`;
  const sourceFile = Bun.file(sourcePath);
  if (!(await sourceFile.exists())) return null;

  const content = await sourceFile.text();
  const targetName = `__hidden_${stepId}__.test.ts`;
  const targetPath = join(cwd, targetName);
  await Bun.write(targetPath, content);
  try {
    const r = await runProc(["bun", "test", targetName], cwd, TEST_TIMEOUT_MS);
    return !r.timedOut && r.exitCode === 0;
  } finally {
    try {
      rmSync(targetPath, { force: true });
    } catch {
      // best effort
    }
  }
};

// One commit from the agent's history, reduced to what the judge needs:
// the phase and step parsed out of its message.
interface CommitInfo {
  sha: string;
  phase: Phase;
  step: string | null;
}

// Reads the full history oldest-first. Commits are emitted as
// `<sha> US <message> RS` records (0x1f / 0x1e) so multi-line commit
// messages can't be confused with record boundaries. Returns [] when
// git log fails.
const readCommits = async (cwd: string): Promise<CommitInfo[]> => {
  const r = await runProc(["git", "log", "--reverse", "--pretty=format:%H%x1f%B%x1e"], cwd, 10000);
  if (r.exitCode !== 0) return [];

  const out: CommitInfo[] = [];
  for (const block of r.stdout.split("\x1e")) {
    const t = block.trim();
    if (!t) continue;
    const [sha, message = ""] = t.split("\x1f");
    if (!sha) continue;
    const p = parseCommit(message);
    out.push({ sha, phase: p.phase, step: p.step });
  }
  return out;
};

// Puts the working tree at `sha`. Forced, because earlier test runs write
// into the tree (HOME and TMPDIR point at it) and must not block the
// switch. Throws when git refuses: judging a stale tree would record a
// wrong verdict.
const checkoutCommit = async (cwd: string, sha: string): Promise<void> => {
  const r = await runProc(["git", "checkout", "--quiet", "--force", sha], cwd, 5000);
  if (r.timedOut || r.exitCode !== 0) {
    throw new Error(`checkout ${sha} failed: ${r.stderr || r.stdout}`);
  }
};

// First red per step + first green-after-red per step (chronological).
// Commits without a step, and greens that precede their step's red, are
// ignored.
const pairRedGreen = (
  commits: readonly CommitInfo[],
): { reds: Map<string, string>; greens: Map<string, string> } => {
  const reds = new Map<string, string>();
  const greens = new Map<string, string>();
  for (const { sha, phase, step } of commits) {
    if (!step) continue;
    if (phase === "red" && !reds.has(step)) {
      reds.set(step, sha);
    } else if (phase === "green" && reds.has(step) && !greens.has(step)) {
      greens.set(step, sha);
    }
  }
  return { reds, greens };
};

// Trace-only path: don't checkout, don't run anything. Score
// purely from the commit log + a language-agnostic test-file
// count via `git ls-tree`. Useful for non-Bun projects.
const judgeStepByTrace = async (
  cwd: string,
  stepId: string,
  redSha: string,
  greenSha: string | null,
  mode: Mode,
): Promise<StepVerdict> => {
  const redFiles = await countTestFiles(cwd, redSha);
  const greenFiles = greenSha ? await countTestFiles(cwd, greenSha) : redFiles;
  const filesShrank = greenSha !== null && greenFiles < redFiles;

  let status: StepVerdict["status"];
  let baseDelta = 0;
  if (greenSha === null) {
    status = "no-green";
  } else if (filesShrank) {
    status = "trace-tests-shrunk";
    baseDelta = -10;
  } else {
    status = "trace-verified";
    baseDelta = 10;
  }

  return {
    stepId,
    redSha,
    greenSha,
    redFailed: null,
    greenPassed: null,
    hiddenPassed: null,
    status,
    scoreDelta: applyMode(baseDelta, mode),
    explanation: explainStep({ status, redSha, greenSha, hiddenPassed: null, mode }),
  };
};

// Executed path: run the agent's suite at the red commit and, when one
// exists, at the green commit, then the kata's hidden tests. Hidden
// tests only run when green passed, the kata is known, and no tests
// were deleted.
const judgeStepByExecution = async (
  cwd: string,
  spec: Game | null,
  stepId: string,
  redSha: string,
  greenSha: string | null,
  mode: Mode,
): Promise<StepVerdict> => {
  await checkoutCommit(cwd, redSha);
  const redTestCount = await countTests(cwd);
  const redFailed = !(await runTests(cwd));

  let greenPassed: boolean | null = null;
  let hiddenPassed: boolean | null = null;
  let testsDeleted = false;
  if (greenSha) {
    await checkoutCommit(cwd, greenSha);
    testsDeleted = (await countTests(cwd)) < redTestCount;
    greenPassed = await runTests(cwd);
    if (greenPassed && spec && !testsDeleted) {
      hiddenPassed = await runHiddenTests(cwd, spec, stepId);
    }
  }

  // Order matters: the first matching condition decides the verdict.
  let status: StepVerdict["status"];
  let baseDelta = 0;
  if (greenSha === null) {
    status = "no-green";
  } else if (testsDeleted) {
    status = "test-deleted";
    baseDelta = -20;
  } else if (!redFailed) {
    status = "red-did-not-fail";
    baseDelta = -5;
  } else if (greenPassed === false) {
    status = "green-did-not-pass";
    baseDelta = -5;
  } else if (hiddenPassed === false) {
    status = "hidden-tests-failed";
    baseDelta = 0;
  } else if (hiddenPassed === true) {
    status = "verified";
    baseDelta = 20;
  } else {
    status = "discipline-only";
    baseDelta = 5;
  }

  return {
    stepId,
    redSha,
    greenSha,
    redFailed,
    greenPassed,
    hiddenPassed,
    status,
    scoreDelta: applyMode(baseDelta, mode),
    explanation: explainStep({ status, redSha, greenSha, hiddenPassed, mode }),
  };
};

// Refactor commits aren't tied to red→green pairs: the spec rewards
// any refactor that keeps the existing tests green. A broken refactor
// (tests fail at the refactor commit) costs the same as a missed
// green — discipline matters even outside red→green pairs.
const judgeRefactors = async (
  cwd: string,
  commits: readonly CommitInfo[],
  mode: Mode,
): Promise<RefactorVerdict[]> => {
  const refactors: RefactorVerdict[] = [];
  for (const c of commits) {
    if (c.phase !== "refactor") continue;
    await checkoutCommit(cwd, c.sha);
    const passed = await runTests(cwd);
    refactors.push({
      sha: c.sha,
      stepId: c.step,
      testsPassed: passed,
      scoreDelta: applyMode(passed ? 5 : -5, mode),
      explanation: explainRefactor(passed),
    });
  }
  return refactors;
};

// Clones `owner/repo` into a throwaway directory, scores every red→green
// step and refactor commit, persists the verdict via saveRun and returns
// it. Throws if the clone or a checkout fails. The temp directory is
// removed on every exit path.
export const judge = async (owner: string, repo: string): Promise<Verdict> => {
  const cwd = mkdtempSync(join(tmpdir(), `judge-${owner}-${repo}-`));
  try {
    // Agent repos default to private. Authenticate via admin token in
    // an http.extraheader so the token isn't persisted in the cloned
    // repo's config (extraheader applies to the clone request only).
    const cloneUrl = `${FORGEJO_INTERNAL}/${owner}/${repo}.git`;
    const adminToken = process.env.FORGEJO_ADMIN_TOKEN;
    const gitArgs = adminToken
      ? ["-c", `http.extraheader=Authorization: token ${adminToken}`, "clone", "--quiet", cloneUrl, "."]
      : ["clone", "--quiet", cloneUrl, "."];
    const cloneR = await runProc(["git", ...gitArgs], cwd, 30000);
    if (cloneR.exitCode !== 0) {
      throw new Error(`clone failed: ${cloneR.stderr || cloneR.stdout}`);
    }

    const commits = await readCommits(cwd);
    const headR = await runProc(["git", "rev-parse", "HEAD"], cwd, 5000);
    const headSha = headR.stdout;

    const { reds, greens } = pairRedGreen(commits);

    // Read the agent's mode + runner preferences from tdd.config.json.
    // Must happen before any checkout so the config comes from HEAD.
    const { mode, testRunner } = await readConfig(cwd);
    const traceOnly = testRunner === "none";

    // Load the kata's authoritative spec — used to fetch hidden tests
    // per step. Repos that don't match a known kata get scored on red→green
    // discipline only (no hidden-test verification).
    let spec: Game | null = null;
    try {
      spec = await loadGame(repo);
    } catch {
      spec = null;
    }

    const steps: StepVerdict[] = [];
    for (const [stepId, redSha] of reds) {
      const greenSha = greens.get(stepId) ?? null;
      steps.push(
        traceOnly
          ? await judgeStepByTrace(cwd, stepId, redSha, greenSha, mode)
          : await judgeStepByExecution(cwd, spec, stepId, redSha, greenSha, mode),
      );
    }

    // Trace-only mode promises no checkout and no test execution, so
    // refactor commits can't be verified there; they are left unscored
    // rather than penalised for a suite Bun can't run.
    const refactors: RefactorVerdict[] = traceOnly ? [] : await judgeRefactors(cwd, commits, mode);

    const totalScore =
      steps.reduce((a, s) => a + s.scoreDelta, 0) + refactors.reduce((a, r) => a + r.scoreDelta, 0);

    const verdict: Verdict = { headSha, mode, steps, refactors, totalScore, judgedAt: Date.now() };
    saveRun(owner, repo, verdict);
    return verdict;
  } finally {
    try {
      rmSync(cwd, { recursive: true, force: true });
    } catch {
      // best effort cleanup
    }
  }
};