// syntaxai/tdd.md · main · src / c14_judge.ts
import { mkdtempSync, rmSync } from "fs";
import { join } from "path";
import { tmpdir } from "os";
import { parseCommit, type Phase } from "./a31_commits.ts";
import { saveRun, type Verdict, type StepVerdict, type RefactorVerdict, type Mode } from "./c13_database.ts";
import { loadGame, type Game } from "./a31_games.ts";
// Which tool executes the agent's test suite. "none" selects the
// trace-only path in judge, where step verdicts come from the commit log.
type TestRunner = "bun" | "none";
// Settings readConfig resolves from the repo's tdd.config.json.
interface TddConfig {
// Scoring mode, passed to applyMode whenever a score delta is computed.
mode: Mode;
// Test runner selection ("test_runner" key in the JSON file).
testRunner: TestRunner;
}
// Resolves the scoring mode and test runner from tdd.config.json at the
// root of `cwd`, e.g.
//
//   { "mode": "pragmatic", "test_runner": "none" }
//
// Defaults are strict / bun. Only the values "pragmatic" and "learning"
// override the mode, and only "none" overrides the runner; anything else
// (missing file, unparseable JSON, unknown values) keeps the defaults.
//
// test_runner: "none" selects trace-only judging, intended for projects
// whose suite Bun can't run (e.g. .NET, Python without bun-compat tests).
const readConfig = async (cwd: string): Promise<TddConfig> => {
  const resolved: TddConfig = { mode: "strict", testRunner: "bun" };
  const configFile = Bun.file(join(cwd, "tdd.config.json"));
  if (!(await configFile.exists())) return resolved;
  try {
    const raw = (await configFile.json()) as { mode?: string; test_runner?: string };
    const { mode: requestedMode, test_runner: requestedRunner } = raw;
    if (requestedMode === "pragmatic" || requestedMode === "learning") {
      resolved.mode = requestedMode;
    }
    if (requestedRunner === "none") {
      resolved.testRunner = "none";
    }
  } catch {
    // best effort — a malformed config silently keeps the defaults
  }
  return resolved;
};
// Adjusts a score delta for the scoring mode.
//
//   delta — raw score change; negative values are penalties.
//   mode  — "learning" zeroes penalties, "pragmatic" halves them, any
//           other mode leaves them untouched.
//
// Non-negative deltas are returned unchanged in every mode — earned
// credit is earned credit.
export const applyMode = (delta: number, mode: Mode): number => {
  if (delta >= 0) return delta;
  if (mode === "learning") return 0;
  // Math.ceil rounds the halved penalty toward zero (-5 → -2). For
  // penalties in (-2, 0) it produces -0; adding 0 normalises that to +0
  // so callers comparing with Object.is / strict equality helpers see a
  // plain zero.
  if (mode === "pragmatic") return Math.ceil(delta / 2) + 0;
  return delta;
};
// Builds the one-paragraph, plain-language explanation attached to a
// step verdict. The text addresses the agent (not the human admin) and
// is meant to sit next to the step's row in the score table.
//
// Only `status`, `hiddenPassed` and `mode` influence the wording:
//   - "red-did-not-fail" has a softer message in pragmatic mode;
//   - "hidden-tests-failed" distinguishes a real failure
//     (hiddenPassed === false) from an inconclusive run.
const explainStep = (params: {
  status: StepVerdict["status"];
  redSha: string | null;
  greenSha: string | null;
  hiddenPassed: boolean | null;
  mode: Mode;
}): string => {
  const isPragmatic = params.mode === "pragmatic";
  const hiddenDefinitelyFailed = params.hiddenPassed === false;

  let redDidNotFailText: string;
  if (isPragmatic) {
    redDidNotFailText =
      "Combined red+green commit detected. Pragmatic mode allows this — the cycle still counts, just with a softer score than a clean separation.";
  } else {
    redDidNotFailText =
      "Red commit's tests already passed when the step was first introduced — meaning the implementation was added before the test, or the test is tautological. Switch to pragmatic mode if you commit red+green together intentionally.";
  }

  let hiddenFailedText: string;
  if (hiddenDefinitelyFailed) {
    hiddenFailedText =
      "Your tests pass, but the kata's hidden tests don't — this is the classic tautology trap. Tighten your test to mirror the requirement (e.g., assert the actual return value, not just that it runs).";
  } else {
    hiddenFailedText = "Your tests pass, but hidden verification was inconclusive. Re-push to retry.";
  }

  // One entry per status; the lookup below replaces a switch statement.
  const messageByStatus: Record<StepVerdict["status"], string> = {
    "verified":
      "Red failed as expected, green passes your tests, and the kata's hidden tests confirm the implementation matches the requirement.",
    "discipline-only":
      "Red→green discipline holds, but this kata didn't ship hidden tests for the step. Partial credit awarded; full +20 isn't possible without authoritative verification.",
    "no-green":
      "Red commit landed; the matching green(<step>) commit hasn't been pushed yet. Push your green to lock in the score.",
    "red-did-not-fail": redDidNotFailText,
    "green-did-not-pass":
      "Green commit's own tests still fail. The implementation doesn't yet satisfy the test you wrote — fix the impl, or reconsider whether the test reflects the requirement.",
    "hidden-tests-failed": hiddenFailedText,
    "test-deleted":
      "Test count dropped between red and green for this step. Once a test exists it must keep existing — refactor it, don't delete it. If the test was wrong, replace it in a separate commit before resuming the cycle.",
    "trace-verified":
      "Trace-only mode: red→green pair found in the commit log. Tests weren't executed (test_runner: \"none\"). Switch to bun runner for behaviour verification.",
    "trace-tests-shrunk":
      "Trace-only mode: the green commit's tree has fewer test files than the red commit's tree — looks like deletion. If you renamed or split test files, the tally still drops.",
  };
  return messageByStatus[params.status];
};
// Plain-language explanation for a refactor verdict: one message when
// the tests passed at the refactor commit, another when they did not.
export const explainRefactor = (passed: boolean): string => {
  if (!passed) {
    return "Refactor commit broke at least one test. Either revert the refactor or write a new red→green to capture the changed behavior.";
  }
  return "Tests stayed green through the refactor — structural change without behavior change, the canonical refactor.";
};
// Base URL repos are cloned from. The FORGEJO_URL environment variable
// overrides the default public host.
const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md";
// Time budget, in milliseconds, handed to runProc for each `bun test`
// invocation; the process is killed once it is exceeded.
const TEST_TIMEOUT_MS = 8000;
// Builds the complete environment for git and bun subprocesses. Nothing
// is inherited from the parent process, so secrets held there
// (FORGEJO_ADMIN_TOKEN, GITHUB_CLIENT_SECRET, SESSION_SECRET) are never
// exposed through the environment. PATH is a fixed list of system bin
// directories; HOME and TMPDIR both point at `cwd` (the per-run temp
// dir) so dotfile and temp writes land inside it.
const sandboxEnv = (cwd: string): Record<string, string> => {
  const fixedPath = "/usr/local/bin:/usr/bin:/bin";
  const scratchDir = cwd;
  return {
    PATH: fixedPath,
    HOME: scratchDir,
    TMPDIR: scratchDir,
    NODE_ENV: "test",
  };
};
// Spawns `cmd` in `cwd` with the sandboxed environment and waits for it
// to exit. If it is still running after `timeoutMs` milliseconds it is
// sent SIGKILL and `timedOut` is reported as true. Returns the trimmed
// stdout / stderr text together with the exit code.
const runProc = async (
  cmd: string[],
  cwd: string,
  timeoutMs: number,
): Promise<{ stdout: string; stderr: string; exitCode: number; timedOut: boolean }> => {
  const child = Bun.spawn(cmd, {
    cwd,
    env: sandboxEnv(cwd),
    stdout: "pipe",
    stderr: "pipe",
  });

  let timedOut = false;
  const onDeadline = (): void => {
    timedOut = true;
    child.kill("SIGKILL");
  };
  const watchdog = setTimeout(onDeadline, timeoutMs);

  const exitCode = await child.exited;
  // The process is gone; make sure the watchdog can no longer fire.
  clearTimeout(watchdog);

  const rawStdout = await new Response(child.stdout).text();
  const rawStderr = await new Response(child.stderr).text();
  return {
    stdout: rawStdout.trim(),
    stderr: rawStderr.trim(),
    exitCode,
    timedOut,
  };
};
// Runs `bun test` in `cwd` and reports whether the suite passed: the
// run must finish within TEST_TIMEOUT_MS and exit with code 0.
const runTests = async (cwd: string): Promise<boolean> => {
  const { timedOut, exitCode } = await runProc(["bun", "test"], cwd, TEST_TIMEOUT_MS);
  if (timedOut) return false;
  // Bun test exits 0 only when all tests pass.
  return exitCode === 0;
};
// Conventional test-file names across ecosystems, matched against a
// repo-relative path: foo.test.ts, foo.spec.ts, files directly inside a
// test/ or tests/ directory, test_foo.py, foo_test.go, foo_spec.rb,
// FooTests.cs, FooTest.java, FooSpec.scala.
const TEST_FILE_PATTERN =
  /(?:^|\/)(?:[^/]*\.(?:test|spec)\.[a-z]+|[Tt]ests?\/[^/]+|test_[^/]+|[^/]+_(?:test|spec)\.[a-z]+|[^/]+[Tt]ests?\.cs|[^/]+[Tt]est\.java|[^/]+Spec\.scala)$/;
// True when `path` looks like a test file by naming convention alone.
const isTestFilePath = (path: string): boolean => TEST_FILE_PATTERN.test(path);
// Language-agnostic test-file counter for trace-only mode. Lists the
// tree of `sha` with `git ls-tree`, so no checkout of the working tree
// is needed, and counts the paths that follow a test naming convention.
//
//   cwd — directory of the cloned repository.
//   sha — commit whose tree is inspected.
//
// Returns the number of matching paths, or 0 when git exits non-zero.
const countTestFiles = async (cwd: string, sha: string): Promise<number> => {
  const r = await runProc(["git", "ls-tree", "-r", "--name-only", sha], cwd, 5000);
  if (r.exitCode !== 0) return 0;
  return r.stdout.split("\n").filter(isTestFilePath).length;
};
// Tallies `test(` / `it(` call sites across the tracked *.test.ts files
// in `cwd`, skipping any path containing "__hidden_". The judge compares
// this number between red and green to detect tests being deleted to
// make a regression "pass" — a cardinal TDD sin per the kata spec.
// Returns 0 when `git ls-files` fails; unreadable files count as empty.
const countTests = async (cwd: string): Promise<number> => {
  const listing = await runProc(["git", "ls-files", "*.test.ts"], cwd, 5000);
  if (listing.exitCode !== 0) return 0;

  const callSite = /\b(?:test|it)\s*\(/g;
  let total = 0;
  for (const relPath of listing.stdout.split("\n")) {
    if (relPath === "" || relPath.includes("__hidden_")) continue;
    const handle = Bun.file(join(cwd, relPath));
    let source: string;
    try {
      source = await handle.text();
    } catch {
      source = "";
    }
    total += source.match(callSite)?.length ?? 0;
  }
  return total;
};
// Verifies the currently checked-out commit against the kata's
// authoritative tests for one step. The hidden test file is copied into
// the working tree as `__hidden_<stepId>__.test.ts` (so it can't collide
// with the agent's own filenames), only that file is run, and the copy
// is removed afterwards whether or not the run succeeded.
//
// Returns true / false for pass / fail (a timeout counts as fail), or
// null when the step is unknown to the kata or its hidden test file
// does not exist.
const runHiddenTests = async (cwd: string, spec: Game, stepId: string): Promise<boolean | null> => {
  const step = spec.steps.find((candidate) => candidate.id === stepId);
  if (!step) return null;

  const hiddenSource = Bun.file(`./content/games/${spec.id}/${step.hiddenTestFile}`);
  const available = await hiddenSource.exists();
  if (!available) return null;

  const hiddenCode = await hiddenSource.text();
  const injectedName = `__hidden_${stepId}__.test.ts`;
  const injectedPath = join(cwd, injectedName);
  await Bun.write(injectedPath, hiddenCode);

  try {
    const { timedOut, exitCode } = await runProc(["bun", "test", injectedName], cwd, TEST_TIMEOUT_MS);
    return exitCode === 0 && !timedOut;
  } finally {
    try {
      rmSync(injectedPath, { force: true });
    } catch {
      // best effort — a leftover copy is not worth failing the run for
    }
  }
};
// One commit of the cloned repo, as produced by readCommits.
interface CommitInfo {
// Commit hash as printed by `git log` (%H).
sha: string;
// Phase that parseCommit extracted from the commit message.
phase: Phase;
// Step id that parseCommit extracted; judge skips commits where this is
// null when pairing red and green (presumably: message names no step).
step: string | null;
}
// Lists the repo's commits oldest-first (`git log --reverse`) and
// classifies each message with parseCommit. The log format separates
// hash and message with \x1f and terminates each commit with \x1e.
// Returns an empty list when git exits non-zero.
const readCommits = async (cwd: string): Promise<CommitInfo[]> => {
  const log = await runProc(["git", "log", "--reverse", "--pretty=format:%H%x1f%B%x1e"], cwd, 10000);
  if (log.exitCode !== 0) return [];

  const commits: CommitInfo[] = [];
  log.stdout.split("\x1e").forEach((rawRecord) => {
    const record = rawRecord.trim();
    if (record === "") return;
    // A commit with an empty message yields no second field.
    const [sha, message = ""] = record.split("\x1f");
    if (!sha) return;
    const { phase, step } = parseCommit(message);
    commits.push({ sha, phase, step });
  });
  return commits;
};
// Force-checks out `sha` in the clone at `cwd` and throws when git
// fails or times out. --force discards working-tree changes left behind
// by an earlier test run, which would otherwise make the checkout fail
// and leave the next test run looking at the wrong tree.
const checkoutCommit = async (cwd: string, sha: string): Promise<void> => {
  const r = await runProc(["git", "checkout", "--quiet", "--force", sha], cwd, 5000);
  if (r.timedOut || r.exitCode !== 0) {
    throw new Error(`checkout of ${sha} failed: ${r.stderr || r.stdout}`);
  }
};
// Clones `owner/repo` into a fresh temp dir, scores its TDD history and
// persists the verdict via saveRun.
//
//   owner, repo — identify the repository under FORGEJO_INTERNAL; `repo`
//                 is also the kata id passed to loadGame.
//
// Per step (first red commit, first green commit after it):
//   - test_runner "bun": checks out red and green, runs the agent's
//     tests and, when green passes, the kata's hidden tests;
//   - test_runner "none" (trace-only): no checkout and no test
//     execution; scores from the commit log plus a test-file count.
// Refactor commits are verified only when tests can be executed.
//
// Returns the verdict (also saved). Throws when the clone or a checkout
// fails. The temp dir is removed in all cases (best effort).
export const judge = async (owner: string, repo: string): Promise<Verdict> => {
  const cwd = mkdtempSync(join(tmpdir(), `judge-${owner}-${repo}-`));
  try {
    // Agent repos default to private. Authenticate via admin token in
    // an http.extraheader so the token isn't persisted in the cloned
    // repo's config (extraheader applies to the clone request only).
    const cloneUrl = `${FORGEJO_INTERNAL}/${owner}/${repo}.git`;
    const adminToken = process.env.FORGEJO_ADMIN_TOKEN;
    const gitArgs = adminToken
      ? ["-c", `http.extraheader=Authorization: token ${adminToken}`, "clone", "--quiet", cloneUrl, "."]
      : ["clone", "--quiet", cloneUrl, "."];
    const cloneR = await runProc(["git", ...gitArgs], cwd, 30000);
    if (cloneR.exitCode !== 0) {
      throw new Error(`clone failed: ${cloneR.stderr || cloneR.stdout}`);
    }

    const commits = await readCommits(cwd);
    const headR = await runProc(["git", "rev-parse", "HEAD"], cwd, 5000);
    const headSha = headR.stdout;

    // First red per step + first green-after-red per step (chronological).
    const stepRed = new Map<string, string>();
    const stepGreen = new Map<string, string>();
    for (const c of commits) {
      if (!c.step) continue;
      if (c.phase === "red" && !stepRed.has(c.step)) {
        stepRed.set(c.step, c.sha);
      } else if (c.phase === "green" && stepRed.has(c.step) && !stepGreen.has(c.step)) {
        stepGreen.set(c.step, c.sha);
      }
    }

    // Read the agent's mode + runner preferences from tdd.config.json
    // (as present at the cloned HEAD; nothing has been checked out yet).
    const { mode, testRunner } = await readConfig(cwd);
    const traceOnly = testRunner === "none";

    // Load the kata's authoritative spec — used to fetch hidden tests
    // per step. Repos that don't match a known kata get scored on
    // red→green discipline only (no hidden-test verification).
    let spec: Game | null = null;
    try {
      spec = await loadGame(repo);
    } catch {
      spec = null;
    }

    const steps: StepVerdict[] = [];
    for (const [stepId, redSha] of stepRed) {
      const greenSha = stepGreen.get(stepId) ?? null;

      if (traceOnly) {
        // Trace-only path: don't checkout, don't run anything. Score
        // purely from the commit log + a language-agnostic test-file
        // count via `git ls-tree`. Useful for non-Bun projects.
        const redFiles = await countTestFiles(cwd, redSha);
        const greenFiles = greenSha ? await countTestFiles(cwd, greenSha) : redFiles;
        const filesShrank = greenSha !== null && greenFiles < redFiles;

        let status: StepVerdict["status"];
        let baseDelta = 0;
        if (greenSha === null) {
          status = "no-green";
        } else if (filesShrank) {
          status = "trace-tests-shrunk";
          baseDelta = -10;
        } else {
          status = "trace-verified";
          baseDelta = 10;
        }

        const scoreDelta = applyMode(baseDelta, mode);
        const explanation = explainStep({ status, redSha, greenSha, hiddenPassed: null, mode });
        steps.push({
          stepId, redSha, greenSha,
          redFailed: null, greenPassed: null, hiddenPassed: null,
          status, scoreDelta, explanation,
        });
        continue;
      }

      await checkoutCommit(cwd, redSha);
      const redTestCount = await countTests(cwd);
      const redPassed = await runTests(cwd);
      const redFailed = !redPassed;

      let greenPassed: boolean | null = null;
      let hiddenPassed: boolean | null = null;
      let testsDeleted = false;
      if (greenSha) {
        await checkoutCommit(cwd, greenSha);
        const greenTestCount = await countTests(cwd);
        testsDeleted = greenTestCount < redTestCount;
        greenPassed = await runTests(cwd);
        // Hidden tests only run on a green that passes its own suite
        // without having dropped tests.
        if (greenPassed && spec && !testsDeleted) {
          hiddenPassed = await runHiddenTests(cwd, spec, stepId);
        }
      }

      // Order matters: the first matching condition decides the status.
      let status: StepVerdict["status"];
      let baseDelta = 0;
      if (greenSha === null) {
        status = "no-green";
      } else if (testsDeleted) {
        status = "test-deleted";
        baseDelta = -20;
      } else if (!redFailed) {
        status = "red-did-not-fail";
        baseDelta = -5;
      } else if (greenPassed === false) {
        status = "green-did-not-pass";
        baseDelta = -5;
      } else if (hiddenPassed === false) {
        status = "hidden-tests-failed";
        baseDelta = 0;
      } else if (hiddenPassed === true) {
        status = "verified";
        baseDelta = 20;
      } else {
        status = "discipline-only";
        baseDelta = 5;
      }

      const scoreDelta = applyMode(baseDelta, mode);
      const explanation = explainStep({ status, redSha, greenSha, hiddenPassed, mode });
      steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, hiddenPassed, status, scoreDelta, explanation });
    }

    // Refactor commits aren't tied to red→green pairs: the spec rewards
    // any refactor that keeps the existing tests green. A broken refactor
    // (tests fail at the refactor commit) costs the same as a missed
    // green — discipline matters even outside red→green pairs.
    //
    // Verifying a refactor means checking it out and running `bun test`,
    // which trace-only mode must never do; there, refactor commits are
    // left unverified and unscored instead of being run through a test
    // runner the project has opted out of.
    const refactors: RefactorVerdict[] = [];
    if (!traceOnly) {
      for (const c of commits) {
        if (c.phase !== "refactor") continue;
        await checkoutCommit(cwd, c.sha);
        const passed = await runTests(cwd);
        const baseDelta = passed ? 5 : -5;
        refactors.push({
          sha: c.sha,
          stepId: c.step,
          testsPassed: passed,
          scoreDelta: applyMode(baseDelta, mode),
          explanation: explainRefactor(passed),
        });
      }
    }

    const totalScore =
      steps.reduce((a, s) => a + s.scoreDelta, 0) +
      refactors.reduce((a, r) => a + r.scoreDelta, 0);

    const verdict: Verdict = { headSha, mode, steps, refactors, totalScore, judgedAt: Date.now() };
    saveRun(owner, repo, verdict);
    return verdict;
  } finally {
    try {
      rmSync(cwd, { recursive: true, force: true });
    } catch {
      // best effort cleanup
    }
  }
};