syntaxai/tdd.md · commit 02961df

Add judge module: sandbox runs agent commits, scores red→green pairs

The judge clones an agent's kata repo into a per-run /tmp dir, walks
its history, and for every step that has a red commit followed by a
green for the same step:

- checks out the red sha, runs `bun test`, asserts it fails
- checks out the green sha, runs `bun test`, asserts it passes
- writes a verdict (+20 verified, -5 if red passed or green failed,
  0 if no green yet) to a SQLite trace store

Hardening for now: subprocesses run with a stripped env (no
FORGEJO_ADMIN_TOKEN, no GITHUB_CLIENT_SECRET, no SESSION_SECRET),
HOME and TMPDIR pinned to the run dir, and an 8s wallclock cap on
each test invocation. Container isolation per run is a follow-up.

Verdicts persist in /app/data/runs.db (named podman volume
tdd-md-data, :Z relabel for SELinux). The repo page reads the
latest verdict and renders a per-step scoring table next to the
phase log; if no verdict exists, the page links to the manual
trigger endpoint POST /api/judge/:owner/:repo.

git is now installed in the runtime stage of the Containerfile —
needed by the judge for clone/log/checkout.

Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
author
syntaxai <[email protected]>
date
2026-05-03 17:34:52 +01:00
parent
9d21dfa
commit
02961dfb45997448bc389c1ce35b636bf8f57276

5 files changed · +236 −1

modified Containerfile +3 −0
@@ -8,6 +8,9 @@ COPY package.json bun.lock ./
88 RUN bun install --frozen-lockfile --production
99
1010 FROM docker.io/oven/bun:1-alpine AS runtime
11+# git is needed by the judge module (clone agent repos, walk commits via
12+# `git log`/`checkout`).
13+RUN apk add --no-cache git
1114 WORKDIR /app
1215 COPY --from=deps /app/node_modules ./node_modules
1316 COPY package.json bun.lock tsconfig.json ./
modified scripts/p620/tdd-md.container +5 −0
@@ -15,6 +15,11 @@ Environment=PORT=3000
1515 Environment=NODE_ENV=production
1616 Environment=BASE_URL=https://tdd.md
1717
18+# SQLite store for judge verdicts, persisted in a named podman volume.
19+# :Z relabels the volume for SELinux (Fedora Atomic).
20+Volume=tdd-md-data:/app/data:Z
21+Environment=TDD_DB_PATH=/app/data/runs.db
22+
1823 # Talks to Forgejo over the host network (Forgejo publishes :44400 on the host).
1924 # host.containers.internal is the default rootless-podman alias for the host.
2025 Environment=FORGEJO_URL=http://host.containers.internal:44400
added src/db.ts +57 −0
@@ -0,0 +1,57 @@
1+import { Database } from "bun:sqlite";
2+
/** Database location; an in-memory database is used when TDD_DB_PATH is unset. */
const DB_PATH = process.env.TDD_DB_PATH ?? ":memory:";

/** Connection shared by every caller in this module; opened on first use. */
let db: Database | null = null;

/**
 * Returns the shared database handle, opening it and creating the `runs`
 * table and its index on first use.
 *
 * The handle is cached only after the schema statements succeed. If they
 * throw, the half-initialised handle is closed and the error propagates, so
 * the next call retries instead of reusing a connection without a schema.
 *
 * @returns The open database handle.
 */
const getDb = (): Database => {
  if (db) return db;
  const handle = new Database(DB_PATH, { create: true });
  try {
    handle.exec(`
      CREATE TABLE IF NOT EXISTS runs (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        owner TEXT NOT NULL,
        repo TEXT NOT NULL,
        head_sha TEXT NOT NULL,
        judged_at INTEGER NOT NULL,
        verdict_json TEXT NOT NULL
      );
      CREATE INDEX IF NOT EXISTS idx_runs_owner_repo
        ON runs(owner, repo, judged_at DESC);
    `);
  } catch (err) {
    handle.close();
    throw err;
  }
  db = handle;
  return handle;
};
24+
/** Outcome of judging a single step. */
export interface StepVerdict {
  /** Identifier of the step this verdict belongs to. */
  stepId: string;
  /** Sha of the step's red commit, or null. */
  redSha: string | null;
  /** Sha of the step's green commit, or null when the step has no green yet. */
  greenSha: string | null;
  /** Whether the test run at the red commit failed, or null. */
  redFailed: boolean | null;
  /** Whether the test run at the green commit passed; null when no green commit was run. */
  greenPassed: boolean | null;
  /** Classification of the step. */
  status:
    | "verified"
    | "no-green"
    | "red-did-not-fail"
    | "green-did-not-pass";
  /** Points this step contributes to the total score. */
  scoreDelta: number;
}

/** Result of one judge run over a repository. */
export interface Verdict {
  /** Sha of HEAD at the time the repository was judged. */
  headSha: string;
  /** One entry per judged step. */
  steps: StepVerdict[];
  /** Sum of every step's `scoreDelta`. */
  totalScore: number;
  /** Time of the run as a millisecond timestamp. */
  judgedAt: number;
}
41+
/**
 * Appends one judge verdict for `owner/repo` to the `runs` table.
 *
 * The head sha and timestamp are stored in their own columns and the whole
 * verdict is stored as JSON text.
 *
 * @param owner Repository owner.
 * @param repo Repository name.
 * @param verdict Verdict to persist.
 */
export const saveRun = (owner: string, repo: string, verdict: Verdict): void => {
  const store = getDb();
  const { headSha, judgedAt } = verdict;
  const serialized = JSON.stringify(verdict);
  store.run(
    `INSERT INTO runs (owner, repo, head_sha, judged_at, verdict_json) VALUES (?, ?, ?, ?, ?)`,
    [owner, repo, headSha, judgedAt, serialized],
  );
};
48+
/**
 * Loads the most recent verdict stored for `owner/repo`.
 *
 * Rows are ordered by `judged_at` and then by `id`, so two runs recorded in
 * the same millisecond still resolve to the later insert.
 *
 * @param owner Repository owner.
 * @param repo Repository name.
 * @returns The latest verdict, or null when no run is stored for the repo.
 */
export const latestRun = (owner: string, repo: string): Verdict | null => {
  const row = getDb()
    .query<{ verdict_json: string }, [string, string]>(
      `SELECT verdict_json FROM runs WHERE owner = ? AND repo = ? ORDER BY judged_at DESC, id DESC LIMIT 1`,
    )
    .get(owner, repo);
  if (!row) return null;
  // The stored JSON is whatever saveRun serialised; its shape is not re-validated here.
  return JSON.parse(row.verdict_json) as Verdict;
};
added src/judge.ts +135 −0
@@ -0,0 +1,135 @@
1+import { mkdtempSync, rmSync } from "fs";
2+import { join } from "path";
3+import { tmpdir } from "os";
4+import { parseCommit, type Phase } from "./commits";
5+import { saveRun, type Verdict, type StepVerdict } from "./db";
6+
/** Base URL that agent repositories are cloned from; FORGEJO_URL overrides the default. */
const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md";

/** Wallclock cap, in milliseconds, applied to each `bun test` invocation. */
const TEST_TIMEOUT_MS = 8_000;
9+
/**
 * Builds the environment passed to the git and bun subprocesses.
 *
 * Only the four variables below are set and nothing is copied from
 * `process.env`, so FORGEJO_ADMIN_TOKEN, GITHUB_CLIENT_SECRET and
 * SESSION_SECRET are not part of it. PATH is a fixed list; HOME and TMPDIR
 * both point at the per-run directory.
 *
 * @param cwd Per-run working directory.
 * @returns The environment for the child process.
 */
const sandboxEnv = (cwd: string): Record<string, string> => {
  const env: Record<string, string> = { PATH: "/usr/local/bin:/usr/bin:/bin" };
  env.HOME = cwd;
  env.TMPDIR = cwd;
  env.NODE_ENV = "test";
  return env;
};
20+
/**
 * Runs a command in `cwd` with the sandbox environment and captures its output.
 *
 * The child is sent SIGKILL if it has not exited after `timeoutMs`. stdout and
 * stderr are drained while waiting for the exit, not after it: a child that
 * writes more than the pipe can buffer would otherwise block on the write,
 * never exit, and be reported as a timeout.
 *
 * @param cmd Executable followed by its arguments.
 * @param cwd Working directory for the child; also used to build its environment.
 * @param timeoutMs Wallclock limit in milliseconds.
 * @returns Trimmed stdout and stderr, the exit code, and whether the timeout fired.
 */
const runProc = async (
  cmd: string[],
  cwd: string,
  timeoutMs: number,
): Promise<{ stdout: string; stderr: string; exitCode: number; timedOut: boolean }> => {
  const proc = Bun.spawn(cmd, {
    cwd,
    stdout: "pipe",
    stderr: "pipe",
    env: sandboxEnv(cwd),
  });
  let timedOut = false;
  const timer = setTimeout(() => {
    timedOut = true;
    proc.kill("SIGKILL");
  }, timeoutMs);
  // Disarm the timer as soon as the child exits, whether or not the pipes have drained yet.
  const exited = proc.exited.finally(() => clearTimeout(timer));
  const [exitCode, stdout, stderr] = await Promise.all([
    exited,
    new Response(proc.stdout).text(),
    new Response(proc.stderr).text(),
  ]);
  return { stdout: stdout.trim(), stderr: stderr.trim(), exitCode, timedOut };
};
43+
/**
 * Runs `bun test` in `cwd` under the per-test wallclock cap.
 *
 * @param cwd Checked-out working tree to test.
 * @returns True only when the run exited with code 0 without hitting the timeout.
 */
const runTests = async (cwd: string): Promise<boolean> => {
  const { timedOut, exitCode } = await runProc(["bun", "test"], cwd, TEST_TIMEOUT_MS);
  if (timedOut) return false;
  // A zero exit code is taken to mean every test passed.
  return exitCode === 0;
};
49+
/** One commit from the repository history, reduced to what the judge needs. */
interface CommitInfo {
  /** Full commit hash. */
  sha: string;
  /** Phase parsed from the commit message. */
  phase: Phase;
  /** Step parsed from the commit message, or null when there is none. */
  step: string | null;
}
55+
/**
 * Lists the repository's commits, oldest first, with the phase and step
 * parsed from each message.
 *
 * `git log` emits each commit as `<hash> 0x1f <message> 0x1e`; the output is
 * split on those separators.
 *
 * @param cwd Cloned repository to read.
 * @returns The parsed commits, or an empty list when `git log` exits non-zero.
 */
const readCommits = async (cwd: string): Promise<CommitInfo[]> => {
  const log = await runProc(
    ["git", "log", "--reverse", "--pretty=format:%H%x1f%B%x1e"],
    cwd,
    10000,
  );
  if (log.exitCode !== 0) return [];

  const records = log.stdout
    .split("\x1e")
    .map((record) => record.trim())
    .filter((record) => record.length > 0);

  const commits: CommitInfo[] = [];
  for (const record of records) {
    const [sha, message = ""] = record.split("\x1f");
    if (!sha) continue;
    const { phase, step } = parseCommit(message);
    commits.push({ sha, phase, step });
  }
  return commits;
};
70+
/**
 * Characters accepted in an owner or repo name. Both values are interpolated
 * into a temp-directory prefix and a clone URL, so path separators and URL
 * metacharacters are rejected.
 */
const SAFE_NAME = /^[A-Za-z0-9._-]+$/;

/** Throws unless `value` is a plain name that is safe to place in a path or URL segment. */
const assertSafeName = (kind: string, value: string): void => {
  if (!SAFE_NAME.test(value) || value === "." || value === "..") {
    throw new Error(`invalid ${kind}: ${JSON.stringify(value)}`);
  }
};

/**
 * Checks out `sha` in `cwd`, discarding anything a previous test run left in
 * the working tree, and throws when the checkout does not succeed so tests
 * never run against the wrong tree.
 */
const checkoutSha = async (cwd: string, sha: string): Promise<void> => {
  const r = await runProc(["git", "checkout", "--quiet", "--force", sha], cwd, 5000);
  if (r.timedOut || r.exitCode !== 0) {
    throw new Error(`checkout ${sha} failed: ${r.stderr || r.stdout || "timed out"}`);
  }
};

/**
 * Clones `owner/repo` into a fresh temp directory, runs the tests at the red
 * and green commit of every step, stores the resulting verdict and returns it.
 *
 * For each step, the first red commit and the first green commit after it are
 * used. Scoring per step: +20 when red fails and green passes, -5 when red
 * passes or green fails, 0 when the step has no green commit yet.
 *
 * @param owner Repository owner; must be a plain name.
 * @param repo Repository name; must be a plain name.
 * @returns The verdict that was saved.
 * @throws Error when a name is invalid, the clone fails, or a checkout fails.
 */
export const judge = async (owner: string, repo: string): Promise<Verdict> => {
  assertSafeName("owner", owner);
  assertSafeName("repo", repo);

  const cwd = mkdtempSync(join(tmpdir(), `judge-${owner}-${repo}-`));
  try {
    const cloneUrl = `${FORGEJO_INTERNAL}/${owner}/${repo}.git`;
    const cloneR = await runProc(["git", "clone", "--quiet", cloneUrl, "."], cwd, 30000);
    if (cloneR.exitCode !== 0) {
      throw new Error(`clone failed: ${cloneR.stderr || cloneR.stdout}`);
    }

    const commits = await readCommits(cwd);
    const headR = await runProc(["git", "rev-parse", "HEAD"], cwd, 5000);
    const headSha = headR.stdout;

    // First red per step + first green-after-red per step (chronological).
    const stepRed = new Map<string, string>();
    const stepGreen = new Map<string, string>();
    for (const c of commits) {
      if (!c.step) continue;
      if (c.phase === "red" && !stepRed.has(c.step)) {
        stepRed.set(c.step, c.sha);
      } else if (c.phase === "green" && stepRed.has(c.step) && !stepGreen.has(c.step)) {
        stepGreen.set(c.step, c.sha);
      }
    }

    const steps: StepVerdict[] = [];
    for (const [stepId, redSha] of stepRed) {
      const greenSha = stepGreen.get(stepId) ?? null;

      await checkoutSha(cwd, redSha);
      const redFailed = !(await runTests(cwd));

      let greenPassed: boolean | null = null;
      if (greenSha !== null) {
        await checkoutSha(cwd, greenSha);
        greenPassed = await runTests(cwd);
      }

      let status: StepVerdict["status"];
      let scoreDelta = 0;
      if (greenSha === null) {
        status = "no-green";
      } else if (!redFailed) {
        status = "red-did-not-fail";
        scoreDelta = -5;
      } else if (greenPassed === false) {
        status = "green-did-not-pass";
        scoreDelta = -5;
      } else {
        status = "verified";
        scoreDelta = 20;
      }
      steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, status, scoreDelta });
    }

    const totalScore = steps.reduce((a, s) => a + s.scoreDelta, 0);
    const verdict: Verdict = { headSha, steps, totalScore, judgedAt: Date.now() };
    saveRun(owner, repo, verdict);
    return verdict;
  } finally {
    try {
      rmSync(cwd, { recursive: true, force: true });
    } catch {
      // best effort cleanup
    }
  }
};
modified src/server.ts +36 −1
@@ -3,6 +3,8 @@ import * as github from "./github_oauth";
33 import * as forgejo from "./forgejo";
44 import { parseCommit, computeProgress, type Phase } from "./commits";
55 import { loadGame } from "./games";
6+import { judge } from "./judge";
7+import { latestRun } from "./db";
68
79 const HOME_MD = "./content/home.md";
810 const GAME_DIR = "./content/games";
@@ -288,6 +290,27 @@ const renderRepoView = async (owner: string, repo: string): Promise<Response> =>
288290 ? `[\`${repo}\` →](/games/${repo})`
289291 : `\`${repo}\``;
290292
293+ const verdict = latestRun(owner, repo);
294+ const headSha = commits[0]?.sha ?? null;
295+ const verdictStale = verdict !== null && headSha !== null && verdict.headSha !== headSha;
296+
297+ let scoreSection: string;
298+ if (verdict === null) {
299+ scoreSection = `> Not yet judged. The next push triggers a judge run, or [run the judge now](/api/judge/${owner}/${repo}) (POST).\n\nPhase tally: <span class="red">red ${progress.redCount}</span> · <span class="green">green ${progress.greenCount}</span> · <span class="blue">refactor ${progress.refactorCount}</span>${progress.untaggedCount > 0 ? ` · <span class="muted">untagged ${progress.untaggedCount}</span>` : ""}.`;
300+ } else {
301+ const stale = verdictStale ? ` · <span class="muted">stale — newer commits not yet judged</span>` : "";
302+ const sign = verdict.totalScore >= 0 ? "+" : "";
303+ const rows = verdict.steps.length === 0
304+ ? "_No red→green pairs found yet._"
305+ : `| step | red | green | status | points |\n|---|---|---|---|---|\n` +
306+ verdict.steps.map((s) => {
307+ const cls = s.status === "verified" ? "green" : s.status === "no-green" ? "muted" : "red";
308+ const sign = s.scoreDelta >= 0 ? "+" : "";
309+ return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | <span class="${cls}">${s.status}</span> | ${sign}${s.scoreDelta} |`;
310+ }).join("\n");
311+ scoreSection = `**total: ${sign}${verdict.totalScore}** · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}`;
312+ }
313+
291314 const body = `# ${owner} · playing ${kataLink}
292315
293316 > ${status}
@@ -299,7 +322,7 @@ ${phaseLog}
299322
300323 ## score
301324
302-> Final scoring lands when the judge module ships. Phase tally: <span class="red">red ${progress.redCount}</span> · <span class="green">green ${progress.greenCount}</span> · <span class="blue">refactor ${progress.refactorCount}</span>${progress.untaggedCount > 0 ? ` · <span class="muted">untagged ${progress.untaggedCount}</span>` : ""}.
325+${scoreSection}
303326
304327 ## clone
305328
@@ -408,6 +431,18 @@ const server = Bun.serve({
408431
409432 "/leaderboard": htmlResponse(LEADERBOARD_HTML),
410433
// Manual trigger for a judge run. Only POST is accepted; any other method
// gets a 405 that names the allowed method.
"/api/judge/:owner/:repo": async (req) => {
  if (req.method !== "POST") {
    return new Response("method not allowed; POST to trigger a judge run", {
      status: 405,
      headers: { Allow: "POST" },
    });
  }
  try {
    const verdict = await judge(req.params.owner, req.params.repo);
    return Response.json(verdict);
  } catch (err: unknown) {
    // A thrown value is not guaranteed to be an Error; narrow before reading `.message`.
    const message = err instanceof Error ? err.message : String(err);
    return Response.json({ error: message }, { status: 500 });
  }
},
445+
411446 "/auth/github/start": (_req) => {
412447 if (!github.isConfigured() || !forgejo.isConfigured()) {
413448 return errorPage("registration is not configured on this server", 503);