Add judge module: sandbox runs agent commits, scores red→green pairs
The judge clones an agent's kata repo into a per-run /tmp dir, walks its history, and for every step that has a red commit followed by a green commit for the same step:

- checks out the red sha, runs `bun test`, asserts it fails
- checks out the green sha, runs `bun test`, asserts it passes
- writes a verdict (+20 verified, -5 if red passed or green failed, 0 if no green yet) to a SQLite trace store

Hardening for now: subprocesses run with a stripped env (no FORGEJO_ADMIN_TOKEN, no GITHUB_CLIENT_SECRET, no SESSION_SECRET), HOME and TMPDIR pinned to the run dir, and an 8s wallclock cap on each test invocation. Container isolation per run is a follow-up.

Verdicts persist in /app/data/runs.db (named podman volume tdd-md-data, :Z relabel for SELinux). The repo page reads the latest verdict and renders a per-step scoring table next to the phase log; if no verdict exists, the page links to the manual trigger endpoint POST /api/judge/:owner/:repo.

git is now installed in the runtime stage of the Containerfile — needed by the judge for clone/log/checkout.

Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
5 files changed · +236 −1
Containerfile
+3
−0
| @@ -8,6 +8,9 @@ COPY package.json bun.lock ./ | ||
| 8 | 8 | RUN bun install --frozen-lockfile --production |
| 9 | 9 | |
| 10 | 10 | FROM docker.io/oven/bun:1-alpine AS runtime |
| 11 | +# git is needed by the judge module (clone agent repos, walk commits via | |
| 12 | +# `git log`/`checkout`). | |
| 13 | +RUN apk add --no-cache git | |
| 11 | 14 | WORKDIR /app |
| 12 | 15 | COPY --from=deps /app/node_modules ./node_modules |
| 13 | 16 | COPY package.json bun.lock tsconfig.json ./ |
scripts/p620/tdd-md.container
+5
−0
| @@ -15,6 +15,11 @@ Environment=PORT=3000 | ||
| 15 | 15 | Environment=NODE_ENV=production |
| 16 | 16 | Environment=BASE_URL=https://tdd.md |
| 17 | 17 | |
| 18 | +# SQLite voor judge-verdicts. Persisted in named podman volume. | |
| 19 | +# :Z relabel voor SELinux (Fedora Atomic). | |
| 20 | +Volume=tdd-md-data:/app/data:Z | |
| 21 | +Environment=TDD_DB_PATH=/app/data/runs.db | |
| 22 | + | |
| 18 | 23 | # Praat met Forgejo via host-network (Forgejo publisht :44400 op de host). |
| 19 | 24 | # host.containers.internal is de standaard rootless-podman alias voor de host. |
| 20 | 25 | Environment=FORGEJO_URL=http://host.containers.internal:44400 |
src/db.ts
+57
−0
| @@ -0,0 +1,57 @@ | ||
| 1 | +import { Database } from "bun:sqlite"; | |
| 2 | + | |
// Location of the SQLite file; falls back to a throwaway in-memory database
// when TDD_DB_PATH is not set.
const DB_PATH = process.env.TDD_DB_PATH ?? ":memory:";

// Process-wide connection, opened lazily on first use.
let db: Database | null = null;

/**
 * Return the shared SQLite connection, opening it and creating the `runs`
 * table and its lookup index on first call.
 *
 * The handle is cached only after the schema statements succeed; if they
 * throw, the half-initialised handle is closed and the error propagates, so
 * the next call retries from scratch instead of reusing a schema-less
 * connection.
 *
 * @returns the open database handle
 * @throws whatever `bun:sqlite` throws when opening the file or running the
 *   schema statements fails
 */
const getDb = (): Database => {
  if (db) return db;
  const handle = new Database(DB_PATH, { create: true });
  try {
    handle.exec(`
      CREATE TABLE IF NOT EXISTS runs (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        owner TEXT NOT NULL,
        repo TEXT NOT NULL,
        head_sha TEXT NOT NULL,
        judged_at INTEGER NOT NULL,
        verdict_json TEXT NOT NULL
      );
      CREATE INDEX IF NOT EXISTS idx_runs_owner_repo
        ON runs(owner, repo, judged_at DESC);
    `);
  } catch (err) {
    handle.close();
    throw err;
  }
  db = handle;
  return handle;
};
| 24 | + | |
/** Outcome of judging one kata step (one red commit and, if present, its green commit). */
export interface StepVerdict {
  /** Step identifier as parsed from the commit messages. */
  stepId: string;
  /** Commit tagged as the red phase of this step, or null when there is none. */
  redSha: string | null;
  /** Commit tagged as the green phase of this step, or null when there is none. */
  greenSha: string | null;
  /** Whether the test run at the red commit failed; null when it was not run. */
  redFailed: boolean | null;
  /** Whether the test run at the green commit passed; null when it was not run. */
  greenPassed: boolean | null;
  /** Classification of the step. */
  status:
    | "verified"
    | "no-green"
    | "red-did-not-fail"
    | "green-did-not-pass";
  /** Points this step contributes to the run's total score. */
  scoreDelta: number;
}
| 34 | + | |
/** Result of one judge run over a repository; this is what gets serialised into `runs.verdict_json`. */
export interface Verdict {
  /** Commit the repository's HEAD pointed at when the run was judged. */
  headSha: string;
  /** One entry per judged step. */
  steps: Array<StepVerdict>;
  /** Sum of every step's `scoreDelta`. */
  totalScore: number;
  /** Time of the run in epoch milliseconds. */
  judgedAt: number;
}
| 41 | + | |
/**
 * Append a judge run to the `runs` table.
 *
 * @param owner   repository owner
 * @param repo    repository name
 * @param verdict result to persist; stored whole as JSON, with `headSha` and
 *   `judgedAt` also copied into their own columns
 */
export const saveRun = (owner: string, repo: string, verdict: Verdict): void => {
  const { headSha, judgedAt } = verdict;
  const serialized = JSON.stringify(verdict);
  const store = getDb();
  store.run(
    `INSERT INTO runs (owner, repo, head_sha, judged_at, verdict_json) VALUES (?, ?, ?, ?, ?)`,
    [owner, repo, headSha, judgedAt, serialized],
  );
};
| 48 | + | |
/**
 * Fetch the most recent stored verdict for a repository.
 *
 * Rows are ordered by `judged_at` and then by `id`, so two runs recorded in
 * the same millisecond still resolve deterministically to the later insert.
 * The stored JSON is checked for basic shape before being returned; a row
 * that cannot be parsed or is not shaped like a verdict is logged and
 * treated as "no verdict" so callers rendering a page do not crash on it.
 *
 * @param owner repository owner
 * @param repo  repository name
 * @returns the latest verdict, or null when none is stored or the stored
 *   row is unusable
 */
export const latestRun = (owner: string, repo: string): Verdict | null => {
  const row = getDb()
    .query<{ verdict_json: string }, [string, string]>(
      `SELECT verdict_json FROM runs WHERE owner = ? AND repo = ? ORDER BY judged_at DESC, id DESC LIMIT 1`,
    )
    .get(owner, repo);
  if (!row) return null;

  let parsed: unknown;
  try {
    parsed = JSON.parse(row.verdict_json);
  } catch {
    console.error(`latestRun: unparseable verdict_json for ${owner}/${repo}`);
    return null;
  }

  // Minimal structural check: the fields the repo page reads must be present.
  const candidate = parsed as Partial<Verdict> | null;
  if (
    typeof candidate !== "object" ||
    candidate === null ||
    typeof candidate.headSha !== "string" ||
    typeof candidate.totalScore !== "number" ||
    typeof candidate.judgedAt !== "number" ||
    !Array.isArray(candidate.steps)
  ) {
    console.error(`latestRun: malformed verdict_json for ${owner}/${repo}`);
    return null;
  }
  return candidate as Verdict;
};
src/judge.ts
+135
−0
| @@ -0,0 +1,135 @@ | ||
| 1 | +import { mkdtempSync, rmSync } from "fs"; | |
| 2 | +import { join } from "path"; | |
| 3 | +import { tmpdir } from "os"; | |
| 4 | +import { parseCommit, type Phase } from "./commits"; | |
| 5 | +import { saveRun, type Verdict, type StepVerdict } from "./db"; | |
| 6 | + | |
| 7 | +const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md"; | |
| 8 | +const TEST_TIMEOUT_MS = 8000; | |
| 9 | + | |
/**
 * Build the environment passed to every git and bun subprocess.
 *
 * Nothing is inherited from the parent process, so agent code never sees
 * FORGEJO_ADMIN_TOKEN, GITHUB_CLIENT_SECRET, or SESSION_SECRET. PATH is
 * fixed; HOME and TMPDIR point at the per-run directory so dotfile and
 * temp-file writes stay inside it.
 *
 * Because HOME is the directory the agent repo is checked out into, a
 * `.gitconfig` committed at the repo root would otherwise be read as git's
 * global config by the judge's own git commands. GIT_CONFIG_GLOBAL and
 * GIT_CONFIG_NOSYSTEM turn off global and system config lookup, and
 * GIT_TERMINAL_PROMPT=0 makes git fail instead of prompting for credentials.
 *
 * @param cwd per-run working directory
 * @returns the complete environment for the child process
 */
const sandboxEnv = (cwd: string): Record<string, string> => ({
  PATH: "/usr/local/bin:/usr/bin:/bin",
  HOME: cwd,
  TMPDIR: cwd,
  NODE_ENV: "test",
  GIT_CONFIG_GLOBAL: "/dev/null",
  GIT_CONFIG_NOSYSTEM: "1",
  GIT_TERMINAL_PROMPT: "0",
});
| 20 | + | |
/**
 * Run a command in `cwd` with the sandboxed environment and a wall-clock cap.
 *
 * stdout and stderr are drained while the process runs rather than after it
 * exits: a child that writes more than the pipe buffer holds would otherwise
 * block on write and never exit until the timeout killed it. The timer is
 * cleared in `finally` so it cannot fire after an error.
 *
 * @param cmd       argv of the command to spawn
 * @param cwd       working directory (also used for HOME/TMPDIR via sandboxEnv)
 * @param timeoutMs milliseconds after which the process is sent SIGKILL
 * @returns trimmed stdout/stderr, the exit code, and whether the timeout fired
 */
const runProc = async (
  cmd: string[],
  cwd: string,
  timeoutMs: number,
): Promise<{ stdout: string; stderr: string; exitCode: number; timedOut: boolean }> => {
  const proc = Bun.spawn(cmd, {
    cwd,
    stdout: "pipe",
    stderr: "pipe",
    env: sandboxEnv(cwd),
  });
  let timedOut = false;
  const timer = setTimeout(() => {
    timedOut = true;
    proc.kill("SIGKILL");
  }, timeoutMs);
  try {
    const [exitCode, stdout, stderr] = await Promise.all([
      proc.exited,
      new Response(proc.stdout).text(),
      new Response(proc.stderr).text(),
    ]);
    return { stdout: stdout.trim(), stderr: stderr.trim(), exitCode, timedOut };
  } finally {
    clearTimeout(timer);
  }
};
| 43 | + | |
/**
 * Run `bun test` in `cwd` under the test timeout.
 *
 * @param cwd directory containing the checked-out commit
 * @returns true only when the run finished before the timeout with exit code 0
 */
const runTests = async (cwd: string): Promise<boolean> => {
  const { timedOut, exitCode } = await runProc(["bun", "test"], cwd, TEST_TIMEOUT_MS);
  // A timed-out run never counts as a pass, whatever exit code was reported.
  if (timedOut) return false;
  return exitCode === 0;
};
| 49 | + | |
/** One commit from the repository history, reduced to what the judge needs. */
interface CommitInfo {
  /** Full commit hash as printed by `git log` (`%H`). */
  sha: string;
  /** Phase returned by `parseCommit` for the commit message. */
  phase: Phase;
  /** Step returned by `parseCommit` for the commit message; may be null. */
  step: string | null;
}
| 55 | + | |
/**
 * List the repository's commits oldest-first with their parsed phase and step.
 *
 * `git log` is asked to emit `<sha> 0x1f <full message> 0x1e` per commit, so
 * records are split on the 0x1e record separator and fields on the 0x1f unit
 * separator.
 *
 * @param cwd directory of the cloned repository
 * @returns the commits in chronological order, or an empty list when
 *   `git log` exits non-zero
 */
const readCommits = async (cwd: string): Promise<CommitInfo[]> => {
  const log = await runProc(
    ["git", "log", "--reverse", "--pretty=format:%H%x1f%B%x1e"],
    cwd,
    10000,
  );
  if (log.exitCode !== 0) return [];

  return log.stdout
    .split("\x1e")
    .map((record) => record.trim())
    .filter((record) => record.length > 0)
    .flatMap((record) => {
      const fields = record.split("\x1f");
      const sha = fields[0];
      if (!sha) return [];
      const parsed = parseCommit(fields[1] ?? "");
      return [{ sha, phase: parsed.phase, step: parsed.step }];
    });
};
| 70 | + | |
// Accepted shape for owner and repo names: they are interpolated into a
// temp-dir prefix and a clone URL, so path separators, leading dots and
// leading dashes are rejected.
const SAFE_NAME = /^[A-Za-z0-9][A-Za-z0-9._-]*$/;

// Run a git subcommand and return its stdout, throwing when it fails or times out.
const gitOrThrow = async (args: string[], cwd: string, timeoutMs: number): Promise<string> => {
  const r = await runProc(["git", ...args], cwd, timeoutMs);
  if (r.exitCode !== 0) {
    const why = r.timedOut ? "timed out" : r.stderr || r.stdout || `exit ${r.exitCode}`;
    throw new Error(`git ${args[0]} failed: ${why}`);
  }
  return r.stdout;
};

// Put the working tree in exactly the state of `sha`: discard edits to tracked
// files and delete untracked/ignored files left behind by an earlier test run.
const checkoutPristine = async (sha: string, cwd: string): Promise<void> => {
  await gitOrThrow(["checkout", "--quiet", "--force", sha], cwd, 5000);
  await gitOrThrow(["clean", "-fdxq"], cwd, 10000);
};

/**
 * Judge a kata repository: clone it into a fresh temp directory, pair the
 * first red commit of each step with the first green commit that follows it,
 * run the tests at both commits, score each step, and persist the verdict.
 *
 * Scoring per step: +20 when red fails and green passes ("verified"), -5
 * when red did not fail or green did not pass, 0 when there is no green
 * commit yet.
 *
 * Every git step is checked. A failed checkout aborts the run rather than
 * letting the tests execute against whatever tree happened to be on disk,
 * and the tree is force-reset and cleaned before each test run so files
 * written by the red run cannot influence the green run.
 *
 * @param owner repository owner; must match SAFE_NAME
 * @param repo  repository name; must match SAFE_NAME
 * @returns the verdict that was saved
 * @throws Error when a name is invalid, or when the clone, a checkout, the
 *   clean, or `rev-parse HEAD` fails (the latter includes an empty repository)
 */
export const judge = async (owner: string, repo: string): Promise<Verdict> => {
  for (const [label, value] of [["owner", owner], ["repo", repo]] as const) {
    if (!SAFE_NAME.test(value)) {
      throw new Error(`invalid ${label}: ${JSON.stringify(value)}`);
    }
  }

  const cwd = mkdtempSync(join(tmpdir(), `judge-${owner}-${repo}-`));
  try {
    const cloneUrl = `${FORGEJO_INTERNAL}/${owner}/${repo}.git`;
    const cloneR = await runProc(["git", "clone", "--quiet", cloneUrl, "."], cwd, 30000);
    if (cloneR.exitCode !== 0) {
      throw new Error(`clone failed: ${cloneR.stderr || cloneR.stdout}`);
    }

    const commits = await readCommits(cwd);
    // Resolve HEAD before any checkout moves it.
    const headSha = await gitOrThrow(["rev-parse", "HEAD"], cwd, 5000);

    // First red per step + first green-after-red per step (chronological).
    const stepRed = new Map<string, string>();
    const stepGreen = new Map<string, string>();
    for (const c of commits) {
      if (!c.step) continue;
      if (c.phase === "red" && !stepRed.has(c.step)) {
        stepRed.set(c.step, c.sha);
      } else if (c.phase === "green" && stepRed.has(c.step) && !stepGreen.has(c.step)) {
        stepGreen.set(c.step, c.sha);
      }
    }

    const steps: StepVerdict[] = [];
    for (const [stepId, redSha] of stepRed) {
      const greenSha = stepGreen.get(stepId) ?? null;

      await checkoutPristine(redSha, cwd);
      const redFailed = !(await runTests(cwd));

      let greenPassed: boolean | null = null;
      if (greenSha !== null) {
        await checkoutPristine(greenSha, cwd);
        greenPassed = await runTests(cwd);
      }

      let status: StepVerdict["status"];
      let scoreDelta = 0;
      if (greenSha === null) {
        status = "no-green";
      } else if (!redFailed) {
        status = "red-did-not-fail";
        scoreDelta = -5;
      } else if (greenPassed === false) {
        status = "green-did-not-pass";
        scoreDelta = -5;
      } else {
        status = "verified";
        scoreDelta = 20;
      }
      steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, status, scoreDelta });
    }

    const totalScore = steps.reduce((sum, s) => sum + s.scoreDelta, 0);
    const verdict: Verdict = { headSha, steps, totalScore, judgedAt: Date.now() };
    saveRun(owner, repo, verdict);
    return verdict;
  } finally {
    try {
      rmSync(cwd, { recursive: true, force: true });
    } catch {
      // best effort cleanup
    }
  }
};
src/server.ts
+36
−1
| @@ -3,6 +3,8 @@ import * as github from "./github_oauth"; | ||
| 3 | 3 | import * as forgejo from "./forgejo"; |
| 4 | 4 | import { parseCommit, computeProgress, type Phase } from "./commits"; |
| 5 | 5 | import { loadGame } from "./games"; |
| 6 | +import { judge } from "./judge"; | |
| 7 | +import { latestRun } from "./db"; | |
| 6 | 8 | |
| 7 | 9 | const HOME_MD = "./content/home.md"; |
| 8 | 10 | const GAME_DIR = "./content/games"; |
| @@ -288,6 +290,27 @@ const renderRepoView = async (owner: string, repo: string): Promise<Response> => | ||
| 288 | 290 | ? `[\`${repo}\` →](/games/${repo})` |
| 289 | 291 | : `\`${repo}\``; |
| 290 | 292 | |
| 293 | + const verdict = latestRun(owner, repo); | |
| 294 | + const headSha = commits[0]?.sha ?? null; | |
| 295 | + const verdictStale = verdict !== null && headSha !== null && verdict.headSha !== headSha; | |
| 296 | + | |
| 297 | + let scoreSection: string; | |
| 298 | + if (verdict === null) { | |
| 299 | + scoreSection = `> Not yet judged. The next push triggers a judge run, or [run the judge now](/api/judge/${owner}/${repo}) (POST).\n\nPhase tally: <span class="red">red ${progress.redCount}</span> · <span class="green">green ${progress.greenCount}</span> · <span class="blue">refactor ${progress.refactorCount}</span>${progress.untaggedCount > 0 ? ` · <span class="muted">untagged ${progress.untaggedCount}</span>` : ""}.`; | |
| 300 | + } else { | |
| 301 | + const stale = verdictStale ? ` · <span class="muted">stale — newer commits not yet judged</span>` : ""; | |
| 302 | + const sign = verdict.totalScore >= 0 ? "+" : ""; | |
| 303 | + const rows = verdict.steps.length === 0 | |
| 304 | + ? "_No red→green pairs found yet._" | |
| 305 | + : `| step | red | green | status | points |\n|---|---|---|---|---|\n` + | |
| 306 | + verdict.steps.map((s) => { | |
| 307 | + const cls = s.status === "verified" ? "green" : s.status === "no-green" ? "muted" : "red"; | |
| 308 | + const sign = s.scoreDelta >= 0 ? "+" : ""; | |
| 309 | + return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | <span class="${cls}">${s.status}</span> | ${sign}${s.scoreDelta} |`; | |
| 310 | + }).join("\n"); | |
| 311 | + scoreSection = `**total: ${sign}${verdict.totalScore}** · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}`; | |
| 312 | + } | |
| 313 | + | |
| 291 | 314 | const body = `# ${owner} · playing ${kataLink} |
| 292 | 315 | |
| 293 | 316 | > ${status} |
| @@ -299,7 +322,7 @@ ${phaseLog} | ||
| 299 | 322 | |
| 300 | 323 | ## score |
| 301 | 324 | |
| 302 | -> Final scoring lands when the judge module ships. Phase tally: <span class="red">red ${progress.redCount}</span> · <span class="green">green ${progress.greenCount}</span> · <span class="blue">refactor ${progress.refactorCount}</span>${progress.untaggedCount > 0 ? ` · <span class="muted">untagged ${progress.untaggedCount}</span>` : ""}. | |
| 325 | +${scoreSection} | |
| 303 | 326 | |
| 304 | 327 | ## clone |
| 305 | 328 | |
| @@ -408,6 +431,18 @@ const server = Bun.serve({ | ||
| 408 | 431 | |
| 409 | 432 | "/leaderboard": htmlResponse(LEADERBOARD_HTML), |
| 410 | 433 | |
| 434 | + "/api/judge/:owner/:repo": async (req) => { | |
| 435 | + if (req.method !== "POST") { | |
| 436 | + return new Response("method not allowed; POST to trigger a judge run", { status: 405 }); | |
| 437 | + } | |
| 438 | + try { | |
| 439 | + const verdict = await judge(req.params.owner, req.params.repo); | |
| 440 | + return Response.json(verdict); | |
| 441 | + } catch (err) { | |
| 442 | + return Response.json({ error: (err as Error).message }, { status: 500 }); | |
| 443 | + } | |
| 444 | + }, | |
| 445 | + | |
| 411 | 446 | "/auth/github/start": (_req) => { |
| 412 | 447 | if (!github.isConfigured() || !forgejo.isConfigured()) { |
| 413 | 448 | return errorPage("registration is not configured on this server", 503); |