syntaxai/tdd.md · commit 496ab61

Add trace-only mode for non-Bun projects (CI-gate use case)

The judge can now skip test execution and score discipline from the
git log alone. Set tdd.config.json:

    { "test_runner": "none" }

In trace mode the judge:
- doesn't checkout commits or run any test process
- pairs red→green commits per step (+10 each pair under strict; mode
  multipliers still apply)
- counts test files at each step's red and green trees via `git ls-tree`,
  matching a language-agnostic name pattern (foo.test.ts, foo.spec.ts,
  FooTests.cs, FooTest.java, test_foo.py, foo_test.go, etc.; *Spec.scala
  and *_spec.rb are not matched by the current pattern) and flags drops as
  trace-tests-shrunk (-10)

This unlocks tdd.md as a CI gate for projects where Bun can't run the
suite — the snowplaza .NET API was the driving use case. The verdict
table and explanation strings carry over; new statuses trace-verified
and trace-tests-shrunk get their own one-liners.

Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>

author: syntaxai <[email protected]>
date: 2026-05-04 06:51:49 +01:00
parent: 537840e
commit: 496ab6181730a7bf1b8d19d3b8b8156e49002c5c

3 files changed · +106 −14

modified README.md +21 −0

@@ -81,6 +81,27 @@ if anything changed):
81	81	State lives in podman volumes (`forgejo-data`, `tdd-md-data`) — no host
82	82	pollution, survives container restarts.
83	83
	84	+## Trace-only mode (real projects, any language)
	85	+
	86	+To use tdd.md as a CI gate on a non-Bun project, set `tdd.config.json`
	87	+at the repo root:
	88	+
	89	+```json
	90	+{ "mode": "pragmatic", "test_runner": "none" }
	91	+```
	92	+
	93	+In trace-only mode the judge skips checkout and test execution. It still:
	94	+
	95	+- walks the commit log and tags every `red:` / `green:` / `refactor:` /
	96	+ `spike:` commit
	97	+- detects red→green pairings per step (+10 per pair, vs +20 with full
	98	+ verification)
	99	+- counts test files (language-agnostic name pattern) at each step's red
	100	+ and green commits via `git ls-tree` and flags drops as `trace-tests-shrunk` (-10)
	101	+
	102	+This works on .NET, Python, Go, Ruby — anywhere Bun can't run the suite.
	103	+Useful as a discipline gate while the AI agent is doing real work.
	104	+
84	105	## Adding a kata
85	106
86	107	Drop a folder under `content/games/<kata-id>/`:

modified src/db.ts +6 −1

@@ -41,7 +41,12 @@ export interface StepVerdict {
41	41	\| "red-did-not-fail"
42	42	\| "green-did-not-pass"
43	43	\| "hidden-tests-failed"
44		- \| "test-deleted";
	44	+ \| "test-deleted"
	45	+ // Trace-only mode: tests not executed, only commit discipline checked.
	46	+ // Used when test_runner: "none" — language-agnostic, useful as a
	47	+ // CI gate on real projects where Bun can't run the test suite.
	48	+ \| "trace-verified"
	49	+ \| "trace-tests-shrunk";
45	50	scoreDelta: number;
46	51	// Coach-style explanation of the verdict — what happened, why the score
47	52	// is what it is, and (when relevant) how to improve next time.

modified src/judge.ts +79 −13

@@ -5,18 +5,35 @@ import { parseCommit, type Phase } from "./commits";
5	5	import { saveRun, type Verdict, type StepVerdict, type RefactorVerdict, type Mode } from "./db";
6	6	import { loadGame, type Game } from "./games";
7	7
8		-// tdd.config.json from the agent's repo selects the scoring mode.
9		-// Falls back to strict when missing or unparseable.
10		-const readMode = async (cwd: string): Promise<Mode> => {
	8	+type TestRunner = "bun" \| "none";
	9	+
	10	+interface TddConfig {
	11	+ mode: Mode;
	12	+ testRunner: TestRunner;
	13	+}
	14	+
	15	+// tdd.config.json from the agent's repo selects the scoring mode and
	16	+// test runner. Falls back to strict / bun when missing or unparseable.
	17	+//
	18	+// { "mode": "pragmatic", "test_runner": "none" }
	19	+//
	20	+// test_runner: "none" enables trace-only judging — no checkout, no test
	21	+// execution. Useful as a CI gate on projects where Bun can't run the
	22	+// suite (e.g. .NET, Python without bun-compat tests).
	23	+const readConfig = async (cwd: string): Promise<TddConfig> => {
11	24	const file = Bun.file(join(cwd, "tdd.config.json"));
12		- if (!(await file.exists())) return "strict";
13		- try {
14		- const cfg = (await file.json()) as { mode?: string };
15		- if (cfg.mode === "pragmatic" \|\| cfg.mode === "learning") return cfg.mode;
16		- return "strict";
17		- } catch {
18		- return "strict";
	25	+ let mode: Mode = "strict";
	26	+ let testRunner: TestRunner = "bun";
	27	+ if (await file.exists()) {
	28	+ try {
	29	+ const cfg = (await file.json()) as { mode?: string; test_runner?: string };
	30	+ if (cfg.mode === "pragmatic" \|\| cfg.mode === "learning") mode = cfg.mode;
	31	+ if (cfg.test_runner === "none") testRunner = "none";
	32	+ } catch {
	33	+ // best effort — bad config falls back to defaults
	34	+ }
19	35	}
	36	+ return { mode, testRunner };
20	37	};
21	38
22	39	// Penalty halving for pragmatic, zeroing for learning. Positive deltas
@@ -58,6 +75,10 @@ const explainStep = (params: {
58	75	: "Your tests pass, but hidden verification was inconclusive. Re-push to retry.";
59	76	case "test-deleted":
60	77	return "Test count dropped between red and green for this step. Once a test exists it must keep existing — refactor it, don't delete it. If the test was wrong, replace it in a separate commit before resuming the cycle.";
	78	+ case "trace-verified":
	79	+ return "Trace-only mode: red→green pair found in the commit log. Tests weren't executed (test_runner: \"none\"). Switch to bun runner for behaviour verification.";
	80	+ case "trace-tests-shrunk":
	81	+ return "Trace-only mode: the green commit's tree has fewer test files than the red commit's tree — looks like deletion. If you renamed or split test files, the tally still drops.";
61	82	}
62	83	};
63	84
@@ -109,6 +130,22 @@ const runTests = async (cwd: string): Promise<boolean> => {
109	130	return !r.timedOut && r.exitCode === 0;
110	131	};
111	132
	133	+// Language-agnostic test-file counter for trace-only mode. Uses git
	134	+// ls-tree at the given sha so we don't have to checkout the working
	135	+// tree. Matches conventional test-file naming across ecosystems:
	136	+// foo.test.ts, foo.spec.ts, FooTests.cs, FooTest.java, test_foo.py,
	137	+// foo_test.go, tests/foo.py. Not matched: FooSpec.scala, foo_spec.rb.
	138	+const countTestFiles = async (cwd: string, sha: string): Promise<number> => {
	139	+ const r = await runProc(["git", "ls-tree", "-r", "--name-only", sha], cwd, 5000);
	140	+ if (r.exitCode !== 0) return 0;
	141	+ const re = /(?:^\|\/)(?:[^/]*\.(?:test\|spec)\.[a-z]+\|[Tt]ests?\/[^/]+\|test_[^/]+\|[^/]+_test\.[a-z]+\|[^/]+[Tt]ests?\.cs\|[^/]+[Tt]est\.java)$/;
	142	+ let count = 0;
	143	+ for (const line of r.stdout.split("\n")) {
	144	+ if (re.test(line)) count++;
	145	+ }
	146	+ return count;
	147	+};
	148	+
112	149	// Count `test(` / `it(` calls in tracked *.test.ts files. Used to detect
113	150	// when an agent deletes tests between red and green to make a regression
114	151	// "pass" — a cardinal TDD sin per the kata spec.
@@ -200,9 +237,8 @@ export const judge = async (owner: string, repo: string): Promise<Verdict> => {
200	237	}
201	238	}
202	239
203		- // Read the agent's mode preference (defaults to strict). Mode
204		- // affects penalties only — verified credits are mode-invariant.
205		- const mode = await readMode(cwd);
	240	+ // Read the agent's mode + runner preferences from tdd.config.json.
	241	+ const { mode, testRunner } = await readConfig(cwd);
206	242
207	243	// Load the kata's authoritative spec — used to fetch hidden tests
208	244	// per step. Repos that don't match a known kata get scored on red→green
@@ -217,6 +253,36 @@ export const judge = async (owner: string, repo: string): Promise<Verdict> => {
217	253	const steps: StepVerdict[] = [];
218	254	for (const [stepId, redSha] of stepRed) {
219	255	const greenSha = stepGreen.get(stepId) ?? null;
	256	+
	257	+ if (testRunner === "none") {
	258	+ // Trace-only path: don't checkout, don't run anything. Score
	259	+ // purely from the commit log + a language-agnostic test-file
	260	+ // count via `git ls-tree`. Useful for non-Bun projects.
	261	+ const redFiles = await countTestFiles(cwd, redSha);
	262	+ const greenFiles = greenSha ? await countTestFiles(cwd, greenSha) : redFiles;
	263	+ const filesShrank = greenSha !== null && greenFiles < redFiles;
	264	+
	265	+ let status: StepVerdict["status"];
	266	+ let baseDelta = 0;
	267	+ if (greenSha === null) {
	268	+ status = "no-green";
	269	+ } else if (filesShrank) {
	270	+ status = "trace-tests-shrunk";
	271	+ baseDelta = -10;
	272	+ } else {
	273	+ status = "trace-verified";
	274	+ baseDelta = 10;
	275	+ }
	276	+ const scoreDelta = applyMode(baseDelta, mode);
	277	+ const explanation = explainStep({ status, redSha, greenSha, hiddenPassed: null, mode });
	278	+ steps.push({
	279	+ stepId, redSha, greenSha,
	280	+ redFailed: null, greenPassed: null, hiddenPassed: null,
	281	+ status, scoreDelta, explanation,
	282	+ });
	283	+ continue;
	284	+ }
	285	+
220	286	await runProc(["git", "checkout", "--quiet", redSha], cwd, 5000);
221	287	const redTestCount = await countTests(cwd);
222	288	const redPassed = await runTests(cwd);

raw .diff