| 5 | 5 | import { saveRun, type Verdict, type StepVerdict, type RefactorVerdict, type Mode } from "./db"; |
| 6 | 6 | import { loadGame, type Game } from "./games"; |
| 7 | 7 | |
| 8 | | -// tdd.config.json from the agent's repo selects the scoring mode. |
| 9 | | -// Falls back to strict when missing or unparseable. |
| 10 | | -const readMode = async (cwd: string): Promise<Mode> => { |
| 8 | +type TestRunner = "bun" | "none"; |
| 9 | + |
| 10 | +interface TddConfig { |
| 11 | + mode: Mode; |
| 12 | + testRunner: TestRunner; |
| 13 | +} |
| 14 | + |
| 15 | +// tdd.config.json from the agent's repo selects the scoring mode and |
| 16 | +// test runner. Each setting falls back to its default (strict / bun) when the file is missing or unparseable, or when its value is unrecognized. |
| 17 | +// |
| 18 | +// { "mode": "pragmatic", "test_runner": "none" } |
| 19 | +// |
| 20 | +// test_runner: "none" enables trace-only judging — no checkout, no test |
| 21 | +// execution. Useful as a CI gate on projects where Bun can't run the |
| 22 | +// suite (e.g. .NET, Python without bun-compat tests). |
| 23 | +const readConfig = async (cwd: string): Promise<TddConfig> => { |
| 11 | 24 | const file = Bun.file(join(cwd, "tdd.config.json")); |
| 12 | | - if (!(await file.exists())) return "strict"; |
| 13 | | - try { |
| 14 | | - const cfg = (await file.json()) as { mode?: string }; |
| 15 | | - if (cfg.mode === "pragmatic" || cfg.mode === "learning") return cfg.mode; |
| 16 | | - return "strict"; |
| 17 | | - } catch { |
| 18 | | - return "strict"; |
| 25 | + let mode: Mode = "strict"; |
| 26 | + let testRunner: TestRunner = "bun"; |
| 27 | + if (await file.exists()) { |
| 28 | + try { |
| 29 | + const cfg = (await file.json()) as { mode?: string; test_runner?: string }; |
| 30 | + if (cfg.mode === "pragmatic" || cfg.mode === "learning") mode = cfg.mode; |
| 31 | + if (cfg.test_runner === "none") testRunner = "none"; |
| 32 | + } catch { |
| 33 | + // best effort — bad config falls back to defaults |
| 34 | + } |
| 19 | 35 | } |
| 36 | + return { mode, testRunner }; |
| 20 | 37 | }; |
| 21 | 38 | |
| 22 | 39 | // Penalty halving for pragmatic, zeroing for learning. Positive deltas |
| 58 | 75 | : "Your tests pass, but hidden verification was inconclusive. Re-push to retry."; |
| 59 | 76 | case "test-deleted": |
| 60 | 77 | return "Test count dropped between red and green for this step. Once a test exists it must keep existing — refactor it, don't delete it. If the test was wrong, replace it in a separate commit before resuming the cycle."; |
| 78 | + case "trace-verified": |
| 79 | + return "Trace-only mode: red→green pair found in the commit log. Tests weren't executed (test_runner: \"none\"). Switch to bun runner for behaviour verification."; |
| 80 | + case "trace-tests-shrunk": |
| 81 | + return "Trace-only mode: the green commit's tree has fewer test files than the red commit's tree — looks like deletion. If you renamed or split test files, the tally still drops."; |
| 61 | 82 | } |
| 62 | 83 | }; |
| 63 | 84 | |
| 109 | 130 | return !r.timedOut && r.exitCode === 0; |
| 110 | 131 | }; |
| 111 | 132 | |
| 133 | +// Language-agnostic test-file counter for trace-only mode. Uses git |
| 134 | +// ls-tree at the given sha so we don't have to checkout the working |
| 135 | +// tree. Matches conventional test-file naming across ecosystems: |
| 136 | +// foo.test.ts, foo.spec.ts, FooTests.cs, FooTest.java, test_foo.py, |
| 137 | +// foo_test.go, and any file directly inside a test(s)/ or Test(s)/ directory. Not matched: FooSpec.scala, foo_spec.rb. |
| 138 | +const countTestFiles = async (cwd: string, sha: string): Promise<number> => { |
| 139 | + const r = await runProc(["git", "ls-tree", "-r", "--name-only", sha], cwd, 5000); |
| 140 | + if (r.exitCode !== 0) return 0; |
| 141 | + const re = /(?:^|\/)(?:[^/]*\.(?:test|spec)\.[a-z]+|[Tt]ests?\/[^/]+|test_[^/]+|[^/]+_test\.[a-z]+|[^/]+[Tt]ests?\.cs|[^/]+[Tt]est\.java)$/; |
| 142 | + let count = 0; |
| 143 | + for (const line of r.stdout.split("\n")) { |
| 144 | + if (re.test(line)) count++; |
| 145 | + } |
| 146 | + return count; |
| 147 | +}; |
| 148 | + |
| 112 | 149 | // Count `test(` / `it(` calls in tracked *.test.ts files. Used to detect |
| 113 | 150 | // when an agent deletes tests between red and green to make a regression |
| 114 | 151 | // "pass" — a cardinal TDD sin per the kata spec. |
| 200 | 237 | } |
| 201 | 238 | } |
| 202 | 239 | |
| 203 | | - // Read the agent's mode preference (defaults to strict). Mode |
| 204 | | - // affects penalties only — verified credits are mode-invariant. |
| 205 | | - const mode = await readMode(cwd); |
| 240 | + // Read the agent's mode + runner preferences from tdd.config.json. |
| 241 | + const { mode, testRunner } = await readConfig(cwd); |
| 206 | 242 | |
| 207 | 243 | // Load the kata's authoritative spec — used to fetch hidden tests |
| 208 | 244 | // per step. Repos that don't match a known kata get scored on red→green |
| 217 | 253 | const steps: StepVerdict[] = []; |
| 218 | 254 | for (const [stepId, redSha] of stepRed) { |
| 219 | 255 | const greenSha = stepGreen.get(stepId) ?? null; |
| 256 | + |
| 257 | + if (testRunner === "none") { |
| 258 | + // Trace-only path: don't checkout, don't run any tests. Score |
| 259 | + // purely from the commit log + a language-agnostic test-file |
| 260 | + // count via `git ls-tree`. Useful for non-Bun projects. |
| 261 | + const redFiles = await countTestFiles(cwd, redSha); |
| 262 | + const greenFiles = greenSha ? await countTestFiles(cwd, greenSha) : redFiles; |
| 263 | + const filesShrank = greenSha !== null && greenFiles < redFiles; |
| 264 | + |
| 265 | + let status: StepVerdict["status"]; |
| 266 | + let baseDelta = 0; |
| 267 | + if (greenSha === null) { |
| 268 | + status = "no-green"; |
| 269 | + } else if (filesShrank) { |
| 270 | + status = "trace-tests-shrunk"; |
| 271 | + baseDelta = -10; |
| 272 | + } else { |
| 273 | + status = "trace-verified"; |
| 274 | + baseDelta = 10; |
| 275 | + } |
| 276 | + const scoreDelta = applyMode(baseDelta, mode); |
| 277 | + const explanation = explainStep({ status, redSha, greenSha, hiddenPassed: null, mode }); |
| 278 | + steps.push({ |
| 279 | + stepId, redSha, greenSha, |
| 280 | + redFailed: null, greenPassed: null, hiddenPassed: null, |
| 281 | + status, scoreDelta, explanation, |
| 282 | + }); |
| 283 | + continue; |
| 284 | + } |
| 285 | + |
| 220 | 286 | await runProc(["git", "checkout", "--quiet", redSha], cwd, 5000); |
| 221 | 287 | const redTestCount = await countTests(cwd); |
| 222 | 288 | const redPassed = await runTests(cwd); |