syntaxai/tdd.md · commit 496ab61

Add trace-only mode for non-Bun projects (CI-gate use case)

The judge can now skip test execution and score discipline from the
git log alone. Set tdd.config.json:

    { "test_runner": "none" }

In trace mode the judge:
- doesn't check out commits or run any test process
- pairs red→green commits per step (+10 each pair under strict; mode
  multipliers still apply)
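- counts test files in each step's red and green commit trees via
  `git ls-tree`, matching a language-agnostic name pattern (foo.test.ts,
  foo.spec.ts, FooTests.cs, FooTest.java, test_foo.py, foo_test.go, and
  any file directly under a test/ or tests/ directory) and flags drops as
  trace-tests-shrunk (-10). Note that *Spec.scala and *_spec.rb are only
  counted when they sit directly in such a directory; the pattern has no
  name rule for them.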

This unlocks tdd.md as a CI gate for projects where Bun can't run the
suite — the snowplaza .NET API was the driving use case. The verdict
table and explanation strings carry over; new statuses trace-verified
and trace-tests-shrunk get their own one-liners.

Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>
author
syntaxai <[email protected]>
date
2026-05-04 06:51:49 +01:00
parent
537840e
commit
496ab6181730a7bf1b8d19d3b8b8156e49002c5c

3 files changed · +106 −14

modified README.md +21 −0
@@ -81,6 +81,27 @@ if anything changed):
8181 State lives in podman volumes (`forgejo-data`, `tdd-md-data`) — no host
8282 pollution, survives container restarts.
8383
84+## Trace-only mode (real projects, any language)
85+
86+To use tdd.md as a CI gate on a non-Bun project, set `tdd.config.json`
87+at the repo root:
88+
89+```json
90+{ "mode": "pragmatic", "test_runner": "none" }
91+```
92+
93+In trace-only mode the judge skips checkout and test execution. It still:
94+
95+- walks the commit log and tags every `red:` / `green:` / `refactor:` /
96+ `spike:` commit
97+- detects red→green pairings per step (+10 per pair, vs +20 with full
98+ verification)
99+- counts test files (language-agnostic name pattern) in each step's red and
100+ green commit trees via `git ls-tree` and flags drops as `trace-tests-shrunk` (-10)
101+
102+This works on .NET, Python, Go, Ruby — anywhere Bun can't run the suite.
103+Useful as a discipline gate while the AI agent is doing real work.
104+
84105 ## Adding a kata
85106
86107 Drop a folder under `content/games/<kata-id>/`:
modified src/db.ts +6 −1
@@ -41,7 +41,12 @@ export interface StepVerdict {
4141 | "red-did-not-fail"
4242 | "green-did-not-pass"
4343 | "hidden-tests-failed"
44- | "test-deleted";
44+ | "test-deleted"
45+ // Trace-only mode: tests not executed, only commit discipline checked.
46+ // Used when test_runner: "none" — language-agnostic, useful as a
47+ // CI gate on real projects where Bun can't run the test suite.
48+ | "trace-verified"
49+ | "trace-tests-shrunk";
4550 scoreDelta: number;
4651 // Coach-style explanation of the verdict — what happened, why the score
4752 // is what it is, and (when relevant) how to improve next time.
modified src/judge.ts +79 −13
@@ -5,18 +5,35 @@ import { parseCommit, type Phase } from "./commits";
55 import { saveRun, type Verdict, type StepVerdict, type RefactorVerdict, type Mode } from "./db";
66 import { loadGame, type Game } from "./games";
77
8-// tdd.config.json from the agent's repo selects the scoring mode.
9-// Falls back to strict when missing or unparseable.
10-const readMode = async (cwd: string): Promise<Mode> => {
8+type TestRunner = "bun" | "none";
9+
10+interface TddConfig {
11+ mode: Mode;
12+ testRunner: TestRunner;
13+}
14+
15+// tdd.config.json from the agent's repo selects the scoring mode and
16+// test runner. Falls back to strict / bun when missing or unparseable.
17+//
18+// { "mode": "pragmatic", "test_runner": "none" }
19+//
20+// test_runner: "none" enables trace-only judging — no checkout, no test
21+// execution. Useful as a CI gate on projects where Bun can't run the
22+// suite (e.g. .NET, Python without bun-compat tests).
23+const readConfig = async (cwd: string): Promise<TddConfig> => {
1124 const file = Bun.file(join(cwd, "tdd.config.json"));
12- if (!(await file.exists())) return "strict";
13- try {
14- const cfg = (await file.json()) as { mode?: string };
15- if (cfg.mode === "pragmatic" || cfg.mode === "learning") return cfg.mode;
16- return "strict";
17- } catch {
18- return "strict";
25+ let mode: Mode = "strict";
26+ let testRunner: TestRunner = "bun";
27+ if (await file.exists()) {
28+ try {
29+ const cfg = (await file.json()) as { mode?: string; test_runner?: string };
30+ if (cfg.mode === "pragmatic" || cfg.mode === "learning") mode = cfg.mode;
31+ if (cfg.test_runner === "none") testRunner = "none";
32+ } catch {
33+ // best effort — bad config falls back to defaults
34+ }
1935 }
36+ return { mode, testRunner };
2037 };
2138
2239 // Penalty halving for pragmatic, zeroing for learning. Positive deltas
@@ -58,6 +75,10 @@ const explainStep = (params: {
5875 : "Your tests pass, but hidden verification was inconclusive. Re-push to retry.";
5976 case "test-deleted":
6077 return "Test count dropped between red and green for this step. Once a test exists it must keep existing — refactor it, don't delete it. If the test was wrong, replace it in a separate commit before resuming the cycle.";
78+ case "trace-verified":
79+ return "Trace-only mode: red→green pair found in the commit log. Tests weren't executed (test_runner: \"none\"). Switch to bun runner for behaviour verification.";
80+ case "trace-tests-shrunk":
81+ return "Trace-only mode: the green commit's tree has fewer test files than the red commit's tree — looks like deletion. If you renamed or split test files, the tally still drops.";
6182 }
6283 };
6384
@@ -109,6 +130,22 @@ const runTests = async (cwd: string): Promise<boolean> => {
109130 return !r.timedOut && r.exitCode === 0;
110131 };
111132
133+// Language-agnostic test-file counter for trace-only mode. Uses git
134+// ls-tree at the given sha so we don't have to check out the working
135+// tree. Matches by name: foo.test.ts, foo.spec.ts, FooTests.cs,
136+// FooTest.java, test_foo.py, foo_test.go, plus files directly under a
137+// test/ or tests/ directory. FooSpec.scala / foo_spec.rb match only there.
138+const countTestFiles = async (cwd: string, sha: string): Promise<number> => {
139+ const r = await runProc(["git", "ls-tree", "-r", "--name-only", sha], cwd, 5000);
140+ if (r.exitCode !== 0) return 0;
141+ const re = /(?:^|\/)(?:[^/]*\.(?:test|spec)\.[a-z]+|[Tt]ests?\/[^/]+|test_[^/]+|[^/]+_test\.[a-z]+|[^/]+[Tt]ests?\.cs|[^/]+[Tt]est\.java)$/;
142+ let count = 0;
143+ for (const line of r.stdout.split("\n")) {
144+ if (re.test(line)) count++;
145+ }
146+ return count;
147+};
148+
112149 // Count `test(` / `it(` calls in tracked *.test.ts files. Used to detect
113150 // when an agent deletes tests between red and green to make a regression
114151 // "pass" — a cardinal TDD sin per the kata spec.
@@ -200,9 +237,8 @@ export const judge = async (owner: string, repo: string): Promise<Verdict> => {
200237 }
201238 }
202239
203- // Read the agent's mode preference (defaults to strict). Mode
204- // affects penalties only — verified credits are mode-invariant.
205- const mode = await readMode(cwd);
240+ // Read the agent's mode + runner preferences from tdd.config.json.
241+ const { mode, testRunner } = await readConfig(cwd);
206242
207243 // Load the kata's authoritative spec — used to fetch hidden tests
208244 // per step. Repos that don't match a known kata get scored on red→green
@@ -217,6 +253,36 @@ export const judge = async (owner: string, repo: string): Promise<Verdict> => {
217253 const steps: StepVerdict[] = [];
218254 for (const [stepId, redSha] of stepRed) {
219255 const greenSha = stepGreen.get(stepId) ?? null;
256+
257+ if (testRunner === "none") {
258+ // Trace-only path: don't checkout, don't run anything. Score
259+ // purely from the commit log + a language-agnostic test-file
260+ // count via `git ls-tree`. Useful for non-Bun projects.
261+ const redFiles = await countTestFiles(cwd, redSha);
262+ const greenFiles = greenSha ? await countTestFiles(cwd, greenSha) : redFiles;
263+ const filesShrank = greenSha !== null && greenFiles < redFiles;
264+
265+ let status: StepVerdict["status"];
266+ let baseDelta = 0;
267+ if (greenSha === null) {
268+ status = "no-green";
269+ } else if (filesShrank) {
270+ status = "trace-tests-shrunk";
271+ baseDelta = -10;
272+ } else {
273+ status = "trace-verified";
274+ baseDelta = 10;
275+ }
276+ const scoreDelta = applyMode(baseDelta, mode);
277+ const explanation = explainStep({ status, redSha, greenSha, hiddenPassed: null, mode });
278+ steps.push({
279+ stepId, redSha, greenSha,
280+ redFailed: null, greenPassed: null, hiddenPassed: null,
281+ status, scoreDelta, explanation,
282+ });
283+ continue;
284+ }
285+
220286 await runProc(["git", "checkout", "--quiet", redSha], cwd, 5000);
221287 const redTestCount = await countTests(cwd);
222288 const redPassed = await runTests(cwd);