syntaxai/tdd.md · main · src / c14_judge.ts

c14_judge.ts 371 lines · 15447 bytes raw
import { mkdtempSync, rmSync } from "fs";
import { join } from "path";
import { tmpdir } from "os";
import { parseCommit, type Phase } from "./a31_commits.ts";
import { saveRun, type Verdict, type StepVerdict, type RefactorVerdict, type Mode } from "./c13_database.ts";
import { loadGame, type Game } from "./a31_games.ts";

// Which test runner the judge uses: "bun" checks out commits and runs
// `bun test`; "none" scores steps from the commit log without running tests.
type TestRunner = "bun" | "none";

// Judge settings resolved from a repo's tdd.config.json by readConfig.
interface TddConfig {
  // Scoring mode; passed to applyMode to scale penalties.
  mode: Mode;
  // Runner selected by the config's "test_runner" key.
  testRunner: TestRunner;
}

// Resolves the judge settings for the repo cloned at `cwd` from its
// optional tdd.config.json, e.g.
//
//   { "mode": "pragmatic", "test_runner": "none" }
//
// Anything missing, unrecognised, or unparseable keeps the defaults:
// mode "strict" and test runner "bun". Setting test_runner to "none"
// selects trace-only judging of steps (scored from the commit log rather
// than by running tests) — meant for projects whose suite Bun can't run
// (e.g. .NET, Python without bun-compat tests).
const readConfig = async (cwd: string): Promise<TddConfig> => {
  const resolved: TddConfig = { mode: "strict", testRunner: "bun" };
  const configFile = Bun.file(join(cwd, "tdd.config.json"));
  if (!(await configFile.exists())) return resolved;
  try {
    const raw = (await configFile.json()) as { mode?: string; test_runner?: string };
    // Only the two non-default modes need handling; anything else stays strict.
    if (raw.mode === "pragmatic" || raw.mode === "learning") resolved.mode = raw.mode;
    if (raw.test_runner === "none") resolved.testRunner = "none";
  } catch {
    // A malformed config is not an error: keep whatever was resolved so far.
  }
  return resolved;
};

// Scales a raw score delta for the given scoring mode.
//
//   delta  raw score change for a step or refactor
//   mode   scoring mode from tdd.config.json
//
// Positive deltas (and zero) are returned unchanged in every mode — earned
// credit is earned credit. Penalties are kept as-is in strict mode, halved
// (rounded toward zero) in pragmatic mode, and dropped to 0 in learning mode.
export const applyMode = (delta: number, mode: Mode): number => {
  if (delta >= 0) return delta;
  if (mode === "learning") return 0;
  // Math.ceil yields -0 for deltas in (-2, 0); adding 0 normalises that to
  // plain 0 so callers never see a negative zero.
  if (mode === "pragmatic") return Math.ceil(delta / 2) + 0;
  return delta;
};

// Builds the one-paragraph, plain-language explanation of a step verdict.
// The text addresses the agent (not the human admin) and is stored on the
// verdict alongside the step's score.
//
// Only `status`, `hiddenPassed`, and `mode` influence the wording:
//   - "red-did-not-fail" reads differently in pragmatic mode, where a
//     combined red+green commit is tolerated;
//   - "hidden-tests-failed" distinguishes a real failure
//     (hiddenPassed === false) from an inconclusive run.
// `redSha` and `greenSha` are accepted but not used in the message.
const explainStep = (params: {
  status: StepVerdict["status"];
  redSha: string | null;
  greenSha: string | null;
  hiddenPassed: boolean | null;
  mode: Mode;
}): string => {
  const pragmatic = params.mode === "pragmatic";
  const hiddenRejected = params.hiddenPassed === false;

  // One entry per status; the Record type keeps the table exhaustive.
  const messages: Record<StepVerdict["status"], string> = {
    "verified":
      "Red failed as expected, green passes your tests, and the kata's hidden tests confirm the implementation matches the requirement.",
    "discipline-only":
      "Red→green discipline holds, but this kata didn't ship hidden tests for the step. Partial credit awarded; full +20 isn't possible without authoritative verification.",
    "no-green":
      "Red commit landed; the matching green(<step>) commit hasn't been pushed yet. Push your green to lock in the score.",
    "red-did-not-fail": pragmatic
      ? "Combined red+green commit detected. Pragmatic mode allows this — the cycle still counts, just with a softer score than a clean separation."
      : "Red commit's tests already passed when the step was first introduced — meaning the implementation was added before the test, or the test is tautological. Switch to pragmatic mode if you commit red+green together intentionally.",
    "green-did-not-pass":
      "Green commit's own tests still fail. The implementation doesn't yet satisfy the test you wrote — fix the impl, or reconsider whether the test reflects the requirement.",
    "hidden-tests-failed": hiddenRejected
      ? "Your tests pass, but the kata's hidden tests don't — this is the classic tautology trap. Tighten your test to mirror the requirement (e.g., assert the actual return value, not just that it runs)."
      : "Your tests pass, but hidden verification was inconclusive. Re-push to retry.",
    "test-deleted":
      "Test count dropped between red and green for this step. Once a test exists it must keep existing — refactor it, don't delete it. If the test was wrong, replace it in a separate commit before resuming the cycle.",
    "trace-verified":
      "Trace-only mode: red→green pair found in the commit log. Tests weren't executed (test_runner: \"none\"). Switch to bun runner for behaviour verification.",
    "trace-tests-shrunk":
      "Trace-only mode: the green commit's tree has fewer test files than the red commit's tree — looks like deletion. If you renamed or split test files, the tally still drops.",
  };
  return messages[params.status];
};

// Agent-facing explanation of a refactor verdict: `passed` says whether the
// test suite was still green at the refactor commit.
export const explainRefactor = (passed: boolean): string => {
  if (!passed) {
    return "Refactor commit broke at least one test. Either revert the refactor or write a new red→green to capture the changed behavior.";
  }
  return "Tests stayed green through the refactor — structural change without behavior change, the canonical refactor.";
};

// Base URL that agent repos are cloned from; the FORGEJO_URL environment
// variable overrides the default host.
const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md";
// Time budget in milliseconds for a single `bun test` run before runProc
// kills it.
const TEST_TIMEOUT_MS = 8000;

// Environment handed to every git and bun subprocess. Only the four
// variables below are passed, so secrets held by the parent process
// (FORGEJO_ADMIN_TOKEN, GITHUB_CLIENT_SECRET, SESSION_SECRET) are not
// forwarded to agent code. PATH is fixed; HOME and TMPDIR point at the
// per-run directory `cwd` so dotfile and temp writes land inside it.
const sandboxEnv = (cwd: string): Record<string, string> => {
  const variables: [string, string][] = [
    ["PATH", "/usr/local/bin:/usr/bin:/bin"],
    ["HOME", cwd],
    ["TMPDIR", cwd],
    ["NODE_ENV", "test"],
  ];
  return Object.fromEntries(variables);
};

// Runs `cmd` in `cwd` under the sandboxed environment and captures its
// output.
//
//   cmd        argv of the process to spawn
//   cwd        working directory (also becomes HOME/TMPDIR via sandboxEnv)
//   timeoutMs  after this long the process is killed with SIGKILL
//
// Resolves with trimmed stdout/stderr, the exit code reported by
// `proc.exited`, and `timedOut` (true when the timer fired before the
// process exited).
const runProc = async (
  cmd: string[],
  cwd: string,
  timeoutMs: number,
): Promise<{ stdout: string; stderr: string; exitCode: number; timedOut: boolean }> => {
  const proc = Bun.spawn(cmd, {
    cwd,
    stdout: "pipe",
    stderr: "pipe",
    env: sandboxEnv(cwd),
  });
  let timedOut = false;
  const timer = setTimeout(() => {
    timedOut = true;
    proc.kill("SIGKILL");
  }, timeoutMs);
  // The timer is cancelled as soon as the process itself exits, whether or
  // not the pipes have been fully read yet.
  const waitForExit = async (): Promise<number> => {
    try {
      return await proc.exited;
    } finally {
      clearTimeout(timer);
    }
  };
  // Drain both pipes while waiting for the exit. Reading only after exit
  // can stall a child that fills a pipe buffer (large `git log` output,
  // verbose test runs): it blocks on write, never exits, and is then
  // reported as a timeout.
  const [stdout, stderr, exitCode] = await Promise.all([
    new Response(proc.stdout).text(),
    new Response(proc.stderr).text(),
    waitForExit(),
  ]);
  return { stdout: stdout.trim(), stderr: stderr.trim(), exitCode, timedOut };
};

// Runs the repo's own suite with `bun test` in `cwd`. Returns true only
// when the run finished within TEST_TIMEOUT_MS and exited with code 0
// (Bun test exits 0 only when all tests pass).
const runTests = async (cwd: string): Promise<boolean> => {
  const { exitCode, timedOut } = await runProc(["bun", "test"], cwd, TEST_TIMEOUT_MS);
  if (timedOut) return false;
  return exitCode === 0;
};

// Language-agnostic test-file counter for trace-only mode. Lists the tree
// at `sha` with `git ls-tree`, so the working tree never has to be checked
// out, and counts paths that follow conventional test-file naming:
//   foo.test.ts, foo.spec.ts, FooTests.cs, FooTest.java, test_foo.py,
//   foo_test.go, FooSpec.scala, foo_spec.rb, and any file directly inside
//   a test/ or tests/ directory.
// Returns 0 when the tree cannot be listed.
const countTestFiles = async (cwd: string, sha: string): Promise<number> => {
  // -z gives NUL-terminated, unquoted paths. Without it git wraps paths
  // containing non-ASCII or special characters in quotes, and the trailing
  // quote keeps them from matching the end-anchored pattern below.
  const r = await runProc(["git", "ls-tree", "-r", "--name-only", "-z", sha], cwd, 5000);
  if (r.exitCode !== 0) return 0;
  // `_spec.<ext>` and `Spec.scala` are included so the Ruby and Scala
  // conventions listed above are actually counted.
  const re = /(?:^|\/)(?:[^/]*\.(?:test|spec)\.[a-z]+|[Tt]ests?\/[^/]+|test_[^/]+|[^/]+_(?:test|spec)\.[a-z]+|[^/]+[Tt]ests?\.cs|[^/]+[Tt]est\.java|[^/]+Spec\.scala)$/;
  // The trailing NUL produces one empty entry, which never matches.
  return r.stdout.split("\0").filter((path) => re.test(path)).length;
};

// Counts `test(` / `it(` calls in the tracked *.test.ts files of the
// working tree at `cwd`. Used to detect when an agent deletes tests between
// red and green to make a regression "pass" — a cardinal TDD sin per the
// kata spec. Files whose path contains `__hidden_` are skipped, unreadable
// files count as empty, and a failed listing yields 0.
// NOTE(review): only *.test.ts is scanned; tests living in other file
// patterns are not seen by this counter — confirm that is intended.
const countTests = async (cwd: string): Promise<number> => {
  // -z gives NUL-terminated, unquoted paths. Without it git quotes paths
  // with non-ASCII or special characters, the quoted name can't be opened,
  // and the tests in that file silently count as 0.
  const r = await runProc(["git", "ls-files", "-z", "*.test.ts"], cwd, 5000);
  if (r.exitCode !== 0) return 0;
  const files = r.stdout.split("\0").filter((f) => f && !f.includes("__hidden_"));
  let count = 0;
  for (const f of files) {
    const content = await Bun.file(join(cwd, f))
      .text()
      .catch(() => "");
    // Textual match only: occurrences inside comments or strings count too.
    count += content.match(/\b(?:test|it)\s*\(/g)?.length ?? 0;
  }
  return count;
};

// Runs the kata's authoritative tests for `stepId` against whatever commit
// is currently checked out in `cwd`. The hidden test file is copied into
// the working tree as `__hidden_<stepId>__.test.ts` so it can't collide
// with the agent's filenames, only that file is run, and the copy is
// removed afterwards even if the run throws.
//
// Returns true/false for pass/fail (a timeout counts as a fail), or null
// when the step is unknown to the kata or its hidden test file is missing.
const runHiddenTests = async (cwd: string, spec: Game, stepId: string): Promise<boolean | null> => {
  const step = spec.steps.find((candidate) => candidate.id === stepId);
  if (!step) return null;

  // Resolved relative to the judge process's working directory.
  const hiddenSource = Bun.file(`./content/games/${spec.id}/${step.hiddenTestFile}`);
  const available = await hiddenSource.exists();
  if (!available) return null;

  const hiddenCode = await hiddenSource.text();
  const targetName = `__hidden_${stepId}__.test.ts`;
  const targetPath = join(cwd, targetName);
  await Bun.write(targetPath, hiddenCode);
  try {
    const { exitCode, timedOut } = await runProc(["bun", "test", targetName], cwd, TEST_TIMEOUT_MS);
    if (timedOut) return false;
    return exitCode === 0;
  } finally {
    try {
      rmSync(targetPath, { force: true });
    } catch {
      // Removal is best effort; the whole run directory is deleted later.
    }
  }
};

// One commit from the cloned repo's history, as classified by parseCommit.
interface CommitInfo {
  // Full commit hash (git's %H).
  sha: string;
  // TDD phase parsed from the commit message.
  phase: Phase;
  // Step parsed from the commit message; commits without one are ignored
  // when red/green pairs are matched.
  step: string | null;
}

// Reads the history of the repo at `cwd`, oldest commit first, and
// classifies every commit message with parseCommit. Git is asked to emit
// `<sha> 0x1f <message> 0x1e` per commit so messages containing newlines
// can be split unambiguously. Returns an empty list when `git log` fails.
const readCommits = async (cwd: string): Promise<CommitInfo[]> => {
  const log = await runProc(["git", "log", "--reverse", "--pretty=format:%H%x1f%B%x1e"], cwd, 10000);
  if (log.exitCode !== 0) return [];
  return log.stdout
    .split("\x1e")
    .map((record) => record.trim())
    .filter((record) => record.length > 0)
    .flatMap((record): CommitInfo[] => {
      // A record without a message part still yields a commit with "".
      const [sha, message = ""] = record.split("\x1f");
      if (!sha) return [];
      const { phase, step } = parseCommit(message);
      return [{ sha, phase, step }];
    });
};

// Checks out `sha` in the clone at `cwd`. --force lets the checkout proceed
// even when a previous test run modified tracked files or left untracked
// ones (e.g. freshly written snapshots) that the target commit also
// contains; without it git refuses and the suite would silently run against
// the previously checked-out tree. Throws when the checkout fails or times
// out, because any verdict computed afterwards would describe the wrong
// commit.
const checkoutCommit = async (cwd: string, sha: string): Promise<void> => {
  const r = await runProc(["git", "checkout", "--quiet", "--force", sha], cwd, 5000);
  if (r.timedOut || r.exitCode !== 0) {
    throw new Error(`checkout of ${sha} failed: ${r.stderr || r.stdout}`);
  }
};

// Judges the TDD discipline of `<owner>/<repo>`.
//
// Clones the repo into a fresh temp directory, pairs the first red commit
// of each step with the first green commit that follows it, scores every
// step (by running tests, or from the commit trace alone when
// tdd.config.json sets test_runner to "none"), scores refactor commits,
// stores the verdict via saveRun, and returns it. The temp directory is
// removed on the way out, whether judging succeeded or not.
//
// Throws when the clone fails or a commit cannot be checked out.
export const judge = async (owner: string, repo: string): Promise<Verdict> => {
  const cwd = mkdtempSync(join(tmpdir(), `judge-${owner}-${repo}-`));
  try {
    // Agent repos default to private. Authenticate via admin token in
    // an http.extraheader so the token isn't persisted in the cloned
    // repo's config (extraheader applies to the clone request only).
    const cloneUrl = `${FORGEJO_INTERNAL}/${owner}/${repo}.git`;
    const adminToken = process.env.FORGEJO_ADMIN_TOKEN;
    const gitArgs = adminToken
      ? ["-c", `http.extraheader=Authorization: token ${adminToken}`, "clone", "--quiet", cloneUrl, "."]
      : ["clone", "--quiet", cloneUrl, "."];
    const cloneR = await runProc(["git", ...gitArgs], cwd, 30000);
    if (cloneR.exitCode !== 0) {
      throw new Error(`clone failed: ${cloneR.stderr || cloneR.stdout}`);
    }

    const commits = await readCommits(cwd);
    const headR = await runProc(["git", "rev-parse", "HEAD"], cwd, 5000);
    const headSha = headR.stdout;

    // First red per step + first green-after-red per step (chronological).
    const stepRed = new Map<string, string>();
    const stepGreen = new Map<string, string>();
    for (const c of commits) {
      if (!c.step) continue;
      if (c.phase === "red" && !stepRed.has(c.step)) {
        stepRed.set(c.step, c.sha);
      } else if (c.phase === "green" && stepRed.has(c.step) && !stepGreen.has(c.step)) {
        stepGreen.set(c.step, c.sha);
      }
    }

    // Read the agent's mode + runner preferences from tdd.config.json.
    // This happens before any checkout, so the config at HEAD applies.
    const { mode, testRunner } = await readConfig(cwd);

    // Load the kata's authoritative spec — used to fetch hidden tests
    // per step. Repos that don't match a known kata get scored on red→green
    // discipline only (no hidden-test verification).
    let spec: Game | null = null;
    try {
      spec = await loadGame(repo);
    } catch {
      spec = null;
    }

    const steps: StepVerdict[] = [];
    for (const [stepId, redSha] of stepRed) {
      const greenSha = stepGreen.get(stepId) ?? null;

      if (testRunner === "none") {
        // Trace-only path: don't checkout, don't run anything. Score
        // purely from the commit log + a language-agnostic test-file
        // count via `git ls-tree`. Useful for non-Bun projects.
        const redFiles = await countTestFiles(cwd, redSha);
        const greenFiles = greenSha ? await countTestFiles(cwd, greenSha) : redFiles;
        const filesShrank = greenSha !== null && greenFiles < redFiles;

        let status: StepVerdict["status"];
        let baseDelta = 0;
        if (greenSha === null) {
          status = "no-green";
        } else if (filesShrank) {
          status = "trace-tests-shrunk";
          baseDelta = -10;
        } else {
          status = "trace-verified";
          baseDelta = 10;
        }
        const scoreDelta = applyMode(baseDelta, mode);
        const explanation = explainStep({ status, redSha, greenSha, hiddenPassed: null, mode });
        steps.push({
          stepId, redSha, greenSha,
          redFailed: null, greenPassed: null, hiddenPassed: null,
          status, scoreDelta, explanation,
        });
        continue;
      }

      // Executed path: the red commit's suite is expected to fail, the
      // green commit's suite is expected to pass.
      await checkoutCommit(cwd, redSha);
      const redTestCount = await countTests(cwd);
      const redPassed = await runTests(cwd);
      const redFailed = !redPassed;
      let greenPassed: boolean | null = null;
      let hiddenPassed: boolean | null = null;
      let testsDeleted = false;
      if (greenSha) {
        await checkoutCommit(cwd, greenSha);
        const greenTestCount = await countTests(cwd);
        testsDeleted = greenTestCount < redTestCount;
        greenPassed = await runTests(cwd);
        // Hidden tests only run once the agent's own suite is green and no
        // tests were deleted to get there.
        if (greenPassed && spec && !testsDeleted) {
          hiddenPassed = await runHiddenTests(cwd, spec, stepId);
        }
      }

      // Order matters: the first matching condition decides the status.
      let status: StepVerdict["status"];
      let baseDelta = 0;
      if (greenSha === null) {
        status = "no-green";
      } else if (testsDeleted) {
        status = "test-deleted";
        baseDelta = -20;
      } else if (!redFailed) {
        status = "red-did-not-fail";
        baseDelta = -5;
      } else if (greenPassed === false) {
        status = "green-did-not-pass";
        baseDelta = -5;
      } else if (hiddenPassed === false) {
        status = "hidden-tests-failed";
        baseDelta = 0;
      } else if (hiddenPassed === true) {
        status = "verified";
        baseDelta = 20;
      } else {
        // Green passed but no hidden tests were available for the step.
        status = "discipline-only";
        baseDelta = 5;
      }
      const scoreDelta = applyMode(baseDelta, mode);
      const explanation = explainStep({ status, redSha, greenSha, hiddenPassed, mode });
      steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, hiddenPassed, status, scoreDelta, explanation });
    }

    // Refactor commits aren't tied to red→green pairs: the spec rewards
    // any refactor that keeps the existing tests green. A broken refactor
    // (tests fail at the refactor commit) costs the same as a missed
    // green — discipline matters even outside red→green pairs.
    //
    // Verifying a refactor requires running the suite, so refactors are
    // left unscored in trace-only mode: that mode promises no checkout and
    // no test execution, and exists for projects whose suite `bun test`
    // can't run — scoring them from a `bun test` result would be
    // meaningless.
    const refactors: RefactorVerdict[] = [];
    if (testRunner !== "none") {
      for (const c of commits) {
        if (c.phase !== "refactor") continue;
        await checkoutCommit(cwd, c.sha);
        const passed = await runTests(cwd);
        const baseDelta = passed ? 5 : -5;
        refactors.push({
          sha: c.sha,
          stepId: c.step,
          testsPassed: passed,
          scoreDelta: applyMode(baseDelta, mode),
          explanation: explainRefactor(passed),
        });
      }
    }

    const totalScore =
      steps.reduce((a, s) => a + s.scoreDelta, 0) +
      refactors.reduce((a, r) => a + r.scoreDelta, 0);
    const verdict: Verdict = { headSha, mode, steps, refactors, totalScore, judgedAt: Date.now() };
    // NOTE(review): saveRun's result is not awaited — confirm it is synchronous.
    saveRun(owner, repo, verdict);
    return verdict;
  } finally {
    try {
      rmSync(cwd, { recursive: true, force: true });
    } catch {
      // best effort cleanup
    }
  }
};