496ab6181730a7bf1b8d19d3b8b8156e49002c5c diff --git a/README.md b/README.md index a77913c5c073eb7b1cb1b588a71760970c94f597..adb5e25ddfe80f6644b83c47ac72c18cf3176f9d 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,27 @@ if anything changed): State lives in podman volumes (`forgejo-data`, `tdd-md-data`) — no host pollution, survives container restarts. +## Trace-only mode (real projects, any language) + +To use tdd.md as a CI gate on a non-Bun project, set `tdd.config.json` +at the repo root: + +```json +{ "mode": "pragmatic", "test_runner": "none" } +``` + +In trace-only mode the judge skips checkout and test execution. It still: + +- walks the commit log and tags every `red:` / `green:` / `refactor:` / + `spike:` commit +- detects red→green pairings per step (+10 per pair, vs +20 with full + verification) +- counts test files (language-agnostic glob) at each commit's tree via + `git ls-tree` and flags drops as `trace-tests-shrunk` (-10) + +This works on .NET, Python, Go, Ruby — anywhere Bun can't run the suite. +Useful as a discipline gate while the AI agent is doing real work. + ## Adding a kata Drop a folder under `content/games//`: diff --git a/src/db.ts b/src/db.ts index 8636f8b95f39ad016ce0017a67393264de0fd255..4612b3b3bbcf12b6917a96b543d4252795b39c30 100644 --- a/src/db.ts +++ b/src/db.ts @@ -41,7 +41,12 @@ export interface StepVerdict { | "red-did-not-fail" | "green-did-not-pass" | "hidden-tests-failed" - | "test-deleted"; + | "test-deleted" + // Trace-only mode: tests not executed, only commit discipline checked. + // Used when test_runner: "none" — language-agnostic, useful as a + // CI gate on real projects where Bun can't run the test suite. + | "trace-verified" + | "trace-tests-shrunk"; scoreDelta: number; // Coach-style explanation of the verdict — what happened, why the score // is what it is, and (when relevant) how to improve next time. 
/** Test runners the judge understands; "none" selects trace-only judging. */
type TestRunner = "bun" | "none";

/** Settings resolved from the agent repo's tdd.config.json. */
interface TddConfig {
  mode: Mode;
  testRunner: TestRunner;
}

// tdd.config.json from the agent's repo selects the scoring mode and
// test runner. Falls back to strict / bun when the file is missing,
// unparseable, or not a JSON object; unrecognised values for either key
// fall back to that key's default individually.
//
//   { "mode": "pragmatic", "test_runner": "none" }
//
// test_runner: "none" enables trace-only judging — no checkout, no test
// execution. Useful as a CI gate on projects where Bun can't run the
// suite (e.g. .NET, Python without bun-compat tests).
//
// cwd is the directory that is expected to contain tdd.config.json.
const readConfig = async (cwd: string): Promise<TddConfig> => {
  const defaults: TddConfig = { mode: "strict", testRunner: "bun" };

  const file = Bun.file(join(cwd, "tdd.config.json"));
  if (!(await file.exists())) return defaults;

  // Keep the parsed value as `unknown` until it is checked, so a config
  // containing e.g. `null` or a bare string is rejected explicitly
  // instead of relying on a property access throwing.
  let raw: unknown;
  try {
    raw = await file.json();
  } catch {
    // best effort — bad config falls back to defaults
    return defaults;
  }
  if (typeof raw !== "object" || raw === null) return defaults;

  const cfg = raw as { mode?: unknown; test_runner?: unknown };
  return {
    // "strict" is the default, so only the two non-default modes are accepted.
    mode: cfg.mode === "pragmatic" || cfg.mode === "learning" ? cfg.mode : "strict",
    // "bun" is the default, so only "none" needs to be recognised.
    testRunner: cfg.test_runner === "none" ? "none" : "bun",
  };
};
// Language-agnostic test-file counter for trace-only mode. Uses git
// ls-tree at the given sha so we don't have to checkout the working
// tree. Matches conventional test-file naming across ecosystems:
//   foo.test.ts, foo.spec.ts, FooTests.cs, FooTest.java, test_foo.py,
//   foo_test.go, FooSpec.scala, foo_spec.rb,
// plus any file sitting directly inside a test/ or tests/ directory.
//
// cwd is the repository directory git runs in; sha is the tree-ish to
// list. Returns the number of matching paths. A failed `git ls-tree`
// also yields 0, so a failed listing is indistinguishable from a tree
// with no test files.
const countTestFiles = async (cwd: string, sha: string): Promise<number> => {
  // -z makes git emit raw, NUL-terminated paths. Without it, paths with
  // unusual characters are wrapped in double quotes and escaped, and the
  // trailing quote would stop the `$`-anchored pattern from matching.
  const r = await runProc(["git", "ls-tree", "-r", "--name-only", "-z", sha], cwd, 5000);
  if (r.exitCode !== 0) return 0;

  // Every alternative is anchored to the final path segment, except the
  // test(s)/ one, which matches the segment's immediate parent directory.
  const re =
    /(?:^|\/)(?:[^/]*\.(?:test|spec)\.[a-z]+|[Tt]ests?\/[^/]+|test_[^/]+|[^/]+_(?:test|spec)\.[a-z]+|[^/]+[Tt]ests?\.cs|[^/]+[Tt]est\.java|[^/]+Spec\.scala)$/;

  let count = 0;
  // The trailing NUL leaves one empty entry at the end; it never matches.
  for (const path of r.stdout.split("\0")) {
    if (re.test(path)) count++;
  }
  return count;
};
@@ -200,9 +237,8 @@ export const judge = async (owner: string, repo: string): Promise => { } } - // Read the agent's mode preference (defaults to strict). Mode - // affects penalties only — verified credits are mode-invariant. - const mode = await readMode(cwd); + // Read the agent's mode + runner preferences from tdd.config.json. + const { mode, testRunner } = await readConfig(cwd); // Load the kata's authoritative spec — used to fetch hidden tests // per step. Repos that don't match a known kata get scored on red→green @@ -217,6 +253,36 @@ export const judge = async (owner: string, repo: string): Promise => { const steps: StepVerdict[] = []; for (const [stepId, redSha] of stepRed) { const greenSha = stepGreen.get(stepId) ?? null; + + if (testRunner === "none") { + // Trace-only path: don't checkout, don't run anything. Score + // purely from the commit log + a language-agnostic test-file + // count via `git ls-tree`. Useful for non-Bun projects. + const redFiles = await countTestFiles(cwd, redSha); + const greenFiles = greenSha ? await countTestFiles(cwd, greenSha) : redFiles; + const filesShrank = greenSha !== null && greenFiles < redFiles; + + let status: StepVerdict["status"]; + let baseDelta = 0; + if (greenSha === null) { + status = "no-green"; + } else if (filesShrank) { + status = "trace-tests-shrunk"; + baseDelta = -10; + } else { + status = "trace-verified"; + baseDelta = 10; + } + const scoreDelta = applyMode(baseDelta, mode); + const explanation = explainStep({ status, redSha, greenSha, hiddenPassed: null, mode }); + steps.push({ + stepId, redSha, greenSha, + redFailed: null, greenPassed: null, hiddenPassed: null, + status, scoreDelta, explanation, + }); + continue; + } + await runProc(["git", "checkout", "--quiet", redSha], cwd, 5000); const redTestCount = await countTests(cwd); const redPassed = await runTests(cwd);