syntaxai/tdd.md · commit bbab5ed

Reports: live test snapshot via deploy-time bun-test bundle

Replaces the placeholder on /reports/live/tests with a real per-test
view sourced from `bun test --reporter=junit`, run at deploy time and
appended to a per-repo bundle.

  scripts/p620/snapshot-tests.ts
    Runs `bun test --reporter=junit`, parses the XML, appends the
    result to content/git-history/<owner>__<name>__tests.json. Each
    deploy adds at most one run (skipped when HEAD is already in the
    bundle). Capped at 50 runs so the file stays bounded.

  src/c14_github.ts
    loadTestBundle: reads the bundle from disk; same shape and home
    as the git-history snapshot.

  src/c32_real_tests.ts
    Aggregates the bundle into TestSnapshot[] (latest run, with
    "since" computed from the oldest run that flagged a test as
    failing) and TestStability[] (pass/fail counts per (file, name)
    across all runs, with a "deleted" flag when a test that previously
    appeared is missing from the latest run). lastBrokenBy is mapped
    via Co-Authored-By footers from the commits bundle.

  src/c21_app.ts
    /reports/live/tests now calls buildLiveTestData. When the bundle
    is empty it shows an honest unavailable-note pointing at the demo;
    once at least one run is in the bundle, real data renders.

  scripts/p620/deploy-tdd-md.sh
    Runs snapshot-tests after snapshot-git-history, before rsync.

V1 is HEAD-only per deploy: stability/flakiness data accumulates run
by run as deploys happen. No git-worktree gymnastics, no per-commit
bun-install. Bumping to historical-commit testing is a future sliver.

Co-Authored-By: Claude Opus 4.7 (1M context) <[email protected]>

author: syntaxai <[email protected]>
date: 2026-05-09 07:34:48 +01:00
parent: b91543a
commit: bbab5ed76d0cb3c9d1bec8b1db7981d8b92a0905

5 files changed · +336 −7

modified scripts/p620/deploy-tdd-md.sh +6 −0

@@ -44,6 +44,12 @@ echo "→ snapshot git history → content/git-history/"
44	44	( cd "$REPO_ROOT" && bun scripts/p620/snapshot-git-history.ts ) \
45	45	\|\| { echo "✗ snapshot-git-history mislukt"; exit 1; }
46	46
	47	+echo "→ snapshot tests (bun test --reporter=junit) → content/git-history/"
	48	+# Runs the test suite at HEAD and appends the result to the per-repo
	49	+# tests bundle. Stability data accumulates run-by-run across deploys.
	50	+( cd "$REPO_ROOT" && bun scripts/p620/snapshot-tests.ts ) \
	51	+ \|\| { echo "✗ snapshot-tests mislukt"; exit 1; }
	52	+
47	53	echo "→ source rsync naar $SSH_HOST:~/$REMOTE_SRC_DIR"
48	54	ssh "$SSH_HOST" "mkdir -p ~/$REMOTE_SRC_DIR"
49	55	# --delete zodat verwijderde files ook weggaan op remote.

added scripts/p620/snapshot-tests.ts +129 −0

@@ -0,0 +1,129 @@
	1	+#!/usr/bin/env bun
	2	+// Run `bun test` on the current HEAD and append the result to a
	3	+// per-repo bundle alongside the git-history snapshot. The container
	4	+// reads this bundle at runtime to render /reports/live/tests for the
	5	+// (private) syntaxai/tdd.md repo without needing a runtime sandbox.
	6	+//
	7	+// Strategy: HEAD-only per deploy. The bundle accumulates one run per
	8	+// deploy (capped at 50), so stability data builds organically over
	9	+// time. No git-worktree gymnastics, no per-commit bun-install.
	10	+//
	11	+// Output: content/git-history/<owner>__<name>__tests.json
	12	+// Schema: { owner, name, runs: TestRunRecord[] } — newest first.
	13	+
	14	+import { spawnSync } from "node:child_process";
	15	+import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
	16	+import { resolve } from "node:path";
	17	+
	18	+const REPO_ROOT = resolve(import.meta.dir, "..", "..");
	19	+const OWNER = "syntaxai";
	20	+const NAME = "tdd.md";
	21	+const MAX_RUNS = 50;
	22	+const JUNIT_OUT = "/tmp/tdd-md-test-junit.xml";
	23	+
	24	+const sh = (cmd: string, args: string[]): string => {
	25	+ const r = spawnSync(cmd, args, { cwd: REPO_ROOT, encoding: "utf8" });
	26	+ return (r.stdout ?? "").trim();
	27	+};
	28	+
	29	+const head = sh("git", ["rev-parse", "HEAD"]);
	30	+const branch = sh("git", ["rev-parse", "--abbrev-ref", "HEAD"]);
	31	+if (head === "") {
	32	+ console.error("could not resolve HEAD");
	33	+ process.exit(1);
	34	+}
	35	+
	36	+// Run tests. bun exits non-zero when tests fail — that's fine, we
	37	+// just need the junit XML, which it writes regardless.
	38	+spawnSync("bun", ["test", "--reporter=junit", `--reporter-outfile=${JUNIT_OUT}`], {
	39	+ cwd: REPO_ROOT,
	40	+ stdio: "inherit",
	41	+});
	42	+if (!existsSync(JUNIT_OUT)) {
	43	+ console.error(`✗ junit output missing at ${JUNIT_OUT} — bun test crashed before writing`);
	44	+ process.exit(1);
	45	+}
	46	+const xml = readFileSync(JUNIT_OUT, "utf8");
	47	+
	48	+interface TestRecord {
	49	+ name: string;
	50	+ file: string;
	51	+ status: "pass" \| "fail";
	52	+ durationMs: number;
	53	+}
	54	+
	55	+const decodeXmlEntity = (s: string): string =>
	56	+ s.replace(/&apos;/g, "'").replace(/&quot;/g, '"').replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&amp;/g, "&");
	57	+
	58	+const parseJunit = (xml: string): TestRecord[] => {
	59	+ const out: TestRecord[] = [];
	60	+ const re = /<testcase\s+([^>]*?)(\/>\|>([\s\S]*?)<\/testcase>)/g;
	61	+ let m: RegExpExecArray \| null;
	62	+ while ((m = re.exec(xml)) !== null) {
	63	+ const attrs = m[1] ?? "";
	64	+ const inner = m[3] ?? "";
	65	+ const nameRaw = /name="([^"]*)"/.exec(attrs)?.[1] ?? "";
	66	+ const file = /file="([^"]*)"/.exec(attrs)?.[1] ?? "";
	67	+ const time = parseFloat(/time="([^"]*)"/.exec(attrs)?.[1] ?? "0");
	68	+ const failed = /<failure\b/.test(inner) \|\| /<error\b/.test(inner);
	69	+ out.push({
	70	+ name: decodeXmlEntity(nameRaw),
	71	+ file,
	72	+ status: failed ? "fail" : "pass",
	73	+ durationMs: Math.round(time * 1000),
	74	+ });
	75	+ }
	76	+ return out;
	77	+};
	78	+
	79	+const tests = parseJunit(xml);
	80	+const passing = tests.filter((t) => t.status === "pass").length;
	81	+const failing = tests.length - passing;
	82	+const totalDurationMs = tests.reduce((s, t) => s + t.durationMs, 0);
	83	+
	84	+interface TestRunRecord {
	85	+ sha: string;
	86	+ branch: string;
	87	+ ranAt: number;
	88	+ total: number;
	89	+ passing: number;
	90	+ failing: number;
	91	+ durationMs: number;
	92	+ tests: TestRecord[];
	93	+}
	94	+
	95	+interface TestBundle {
	96	+ owner: string;
	97	+ name: string;
	98	+ runs: TestRunRecord[];
	99	+}
	100	+
	101	+const bundlePath = resolve(REPO_ROOT, "content", "git-history", `${OWNER}__${NAME}__tests.json`);
	102	+let bundle: TestBundle = { owner: OWNER, name: NAME, runs: [] };
	103	+if (existsSync(bundlePath)) {
	104	+ try {
	105	+ const parsed = JSON.parse(readFileSync(bundlePath, "utf8")) as TestBundle;
	106	+ if (parsed && Array.isArray(parsed.runs)) bundle = parsed;
	107	+ } catch {
	108	+ // Corrupt or unreadable bundle — start fresh, deploy isn't blocked.
	109	+ }
	110	+}
	111	+
	112	+if (bundle.runs.some((r) => r.sha === head)) {
	113	+ console.log(`✓ tests for ${head.slice(0, 7)} already in bundle (${bundle.runs.length} runs total) — nothing to add`);
	114	+} else {
	115	+ bundle.runs.unshift({
	116	+ sha: head,
	117	+ branch,
	118	+ ranAt: Date.now(),
	119	+ total: tests.length,
	120	+ passing,
	121	+ failing,
	122	+ durationMs: totalDurationMs,
	123	+ tests,
	124	+ });
	125	+ bundle.runs = bundle.runs.slice(0, MAX_RUNS);
	126	+ mkdirSync(resolve(REPO_ROOT, "content", "git-history"), { recursive: true });
	127	+ writeFileSync(bundlePath, JSON.stringify(bundle, null, 2));
	128	+ console.log(`✓ tests at ${head.slice(0, 7)} (${branch}): ${passing}/${tests.length} pass, ${failing} fail → bundle (${bundle.runs.length} runs total)`);
	129	+}

modified src/c14_github.ts +46 −0

@@ -206,3 +206,49 @@ export const fetchRepoCommits = async (
206	206	commitsCache.set(key, { fetchedAt: Date.now(), commits });
207	207	return commits;
208	208	};
	209	+
	210	+// ---------------------------------------------------------------------
	211	+// Test-results bundle. Companion to the git-history bundle above —
	212	+// scripts/p620/snapshot-tests.ts runs `bun test --reporter=junit` at
	213	+// each deploy and appends the result to this JSON file. Lets the
	214	+// container render /reports/live/tests against real data without
	215	+// running tests at runtime.
	216	+// ---------------------------------------------------------------------
	217	+
	218	+export interface TestRecord {
	219	+ name: string;
	220	+ file: string;
	221	+ status: "pass" \| "fail";
	222	+ durationMs: number;
	223	+}
	224	+
	225	+export interface TestRunRecord {
	226	+ sha: string;
	227	+ branch: string;
	228	+ ranAt: number;
	229	+ total: number;
	230	+ passing: number;
	231	+ failing: number;
	232	+ durationMs: number;
	233	+ tests: TestRecord[];
	234	+}
	235	+
	236	+export interface TestBundle {
	237	+ owner: string;
	238	+ name: string;
	239	+ runs: TestRunRecord[];
	240	+}
	241	+
	242	+export const loadTestBundle = async (
	243	+ repoOwner: string,
	244	+ repoName: string,
	245	+): Promise<TestBundle \| null> => {
	246	+ try {
	247	+ const file = Bun.file(`./content/git-history/${repoOwner}__${repoName}__tests.json`);
	248	+ if (!(await file.exists())) return null;
	249	+ const data = (await file.json()) as TestBundle;
	250	+ return Array.isArray(data.runs) ? data : null;
	251	+ } catch {
	252	+ return null;
	253	+ }
	254	+};

modified src/c21_app.ts +15 −7

@@ -36,6 +36,7 @@ import {
36	36	DEMO_STABILITY,
37	37	} from "./c31_reports_demo.ts";
38	38	import { buildLiveReports } from "./c32_real_reports.ts";
	39	+import { buildLiveTestData } from "./c32_real_tests.ts";
39	40	import { parseRepoIdentifier } from "./c31_project_config.ts";
40	41	import { judge } from "./c32_judge.ts";
41	42	import {
@@ -532,18 +533,25 @@ ${rows}
532	533	},
533	534
534	535	"/reports/live/tests": async () => {
	536	+ const data = await buildLiveTestData(LIVE_REPO_OWNER, LIVE_REPO_NAME);
	537	+ const ranOn = data.ranAt ? new Date(data.ranAt).toISOString().slice(0, 10) : null;
	538	+ const period = data.runsCount === 0
	539	+ ? "geen runs in bundle"
	540	+ : `last run ${ranOn} · ${data.runsCount} run${data.runsCount === 1 ? "" : "s"} cumulatief`;
	541	+ const unavailableNote = data.runsCount === 0
	542	+ ? "Nog geen test-runs gebundeld. De volgende deploy draait `bun test --reporter=junit` op de huidige HEAD en publiceert het resultaat hier. Stabiliteit (flaky %, deletion) bouwt zich op naarmate er meer runs in de bundle staan — de demo op [/reports/demo/tests](/reports/demo/tests) toont waar het naartoe groeit."
	543	+ : undefined;
535	544	const html = await renderPage({
536		- title: "Tests overzicht · live (placeholder) — tdd.md",
537		- description: "Placeholder voor de live test-overview — wacht op de sandbox-runner sliver.",
	545	+ title: "Tests overzicht · live — tdd.md",
	546	+ description: `Live test-snapshot van ${LIVE_REPO_OWNER}/${LIVE_REPO_NAME} — ${data.runsCount} run${data.runsCount === 1 ? "" : "s"} gebundeld.`,
538	547	bodyMarkdown: testsOverviewMd({
539		- period: "live",
	548	+ period,
540	549	bannerHtml: LIVE_BANNER_HTML,
541		- snapshots: [],
542		- stability: [],
543		- unavailableNote: "De per-repo test-snapshot en stabiliteitstabel hebben de sandbox-runner sliver nodig (block 1 vervolg). Tot dat klaar is, alleen de exec-summary + drill-down draaien op echte data; de testpagina staat in de [demo](/reports/demo/tests).",
	550	+ snapshots: data.snapshots,
	551	+ stability: data.stability,
	552	+ unavailableNote,
544	553	}),
545	554	ogPath: "https://tdd.md/reports/live/tests",
546		- noindex: true,
547	555	});
548	556	return htmlResponse(html);
549	557	},

added src/c32_real_tests.ts +140 −0

@@ -0,0 +1,140 @@
	1	+// c32 — logic: aggregate the per-deploy test bundle into the same
	2	+// TestSnapshot[] / TestStability[] shape that the demo page renders.
	3	+// HEAD-only snapshots; stability accumulates as more deploys add runs.
	4	+//
	5	+// Pure given the bundle + commits in (no I/O of its own beyond delegating
	6	+// to c14_github's bundle loader and commits fetcher).
	7	+
	8	+import { fetchRepoCommits, loadTestBundle } from "./c14_github.ts";
	9	+import type {
	10	+ AgentReport,
	11	+ TestFailure,
	12	+ TestSnapshot,
	13	+ TestStability,
	14	+} from "./c31_reports_demo.ts";
	15	+
	16	+const detectAgent = (msg: string): AgentReport["slug"] \| null => {
	17	+ if (/Co-Authored-By:.*Claude/i.test(msg)) return "claude-code";
	18	+ if (/Co-Authored-By:.*Cursor/i.test(msg)) return "cursor";
	19	+ if (/Co-Authored-By:.*Aider/i.test(msg)) return "aider";
	20	+ return null;
	21	+};
	22	+
	23	+const shortenTestLabel = (file: string, name: string): string => {
	24	+ const base = file.split("/").pop() ?? file;
	25	+ return `${base} > ${name}`;
	26	+};
	27	+
	28	+export interface LiveTestData {
	29	+ snapshots: TestSnapshot[];
	30	+ stability: TestStability[];
	31	+ runsCount: number;
	32	+ ranAt: number \| null;
	33	+ headSha: string \| null;
	34	+}
	35	+
	36	+export const buildLiveTestData = async (
	37	+ repoOwner: string,
	38	+ repoName: string,
	39	+): Promise<LiveTestData> => {
	40	+ const bundle = await loadTestBundle(repoOwner, repoName);
	41	+ if (!bundle \|\| bundle.runs.length === 0) {
	42	+ return { snapshots: [], stability: [], runsCount: 0, ranAt: null, headSha: null };
	43	+ }
	44	+ const repoSlug = `${repoOwner}/${repoName}`;
	45	+ const latest = bundle.runs[0];
	46	+ if (!latest) {
	47	+ return { snapshots: [], stability: [], runsCount: 0, ranAt: null, headSha: null };
	48	+ }
	49	+
	50	+ // For "since" we want the oldest run that has this test as failing.
	51	+ const oldestFirst = [...bundle.runs].sort((a, b) => a.ranAt - b.ranAt);
	52	+
	53	+ const failures: TestFailure[] = latest.tests
	54	+ .filter((t) => t.status === "fail")
	55	+ .map((t) => {
	56	+ const firstFail = oldestFirst.find((r) =>
	57	+ r.tests.some((x) => x.name === t.name && x.file === t.file && x.status === "fail"),
	58	+ );
	59	+ const sinceTs = firstFail?.ranAt ?? latest.ranAt;
	60	+ return { test: shortenTestLabel(t.file, t.name), since: new Date(sinceTs).toISOString().slice(0, 10) };
	61	+ });
	62	+
	63	+ const snapshot: TestSnapshot = {
	64	+ repo: repoSlug,
	65	+ branch: latest.branch,
	66	+ total: latest.total,
	67	+ passing: latest.passing,
	68	+ failing: latest.failing,
	69	+ failures,
	70	+ };
	71	+
	72	+ // Stability: count pass/fail per (file, name) across every run, with
	73	+ // "deleted" set when a previously-seen test is missing from latest.
	74	+ const commits = await fetchRepoCommits(repoOwner, repoName, 100);
	75	+ const shaToAgent = new Map<string, AgentReport["slug"] \| null>();
	76	+ for (const c of commits) shaToAgent.set(c.sha, detectAgent(c.commit.message));
	77	+
	78	+ interface Stat {
	79	+ name: string;
	80	+ file: string;
	81	+ pass: number;
	82	+ fail: number;
	83	+ lastBrokenSha: string \| null;
	84	+ lastBrokenAt: number;
	85	+ }
	86	+ const stats = new Map<string, Stat>();
	87	+ for (const run of bundle.runs) {
	88	+ for (const t of run.tests) {
	89	+ const key = `${t.file}\|${t.name}`;
	90	+ let s = stats.get(key);
	91	+ if (!s) {
	92	+ s = { name: t.name, file: t.file, pass: 0, fail: 0, lastBrokenSha: null, lastBrokenAt: 0 };
	93	+ stats.set(key, s);
	94	+ }
	95	+ if (t.status === "pass") s.pass++;
	96	+ else {
	97	+ s.fail++;
	98	+ if (run.ranAt > s.lastBrokenAt) {
	99	+ s.lastBrokenSha = run.sha;
	100	+ s.lastBrokenAt = run.ranAt;
	101	+ }
	102	+ }
	103	+ }
	104	+ }
	105	+
	106	+ const latestKeys = new Set(latest.tests.map((t) => `${t.file}\|${t.name}`));
	107	+
	108	+ // lastBrokenBy needs an agent slug; if we can't map a SHA to an agent
	109	+ // (e.g. the commit isn't in the 100-commit window we fetch), fall
	110	+ // back to the agent of the latest run, which is a defensible default
	111	+ // for the dogfood case (one agent producing the history).
	112	+ const fallbackAgent = (shaToAgent.get(latest.sha) ?? "claude-code") as AgentReport["slug"];
	113	+
	114	+ const stability: TestStability[] = Array.from(stats.values())
	115	+ .map<TestStability>((s) => {
	116	+ const mapped = s.lastBrokenSha ? shaToAgent.get(s.lastBrokenSha) : null;
	117	+ const agent = (mapped ?? fallbackAgent) as AgentReport["slug"];
	118	+ const deleted = latestKeys.has(`${s.file}\|${s.name}`) ? 0 : 1;
	119	+ const flagged = s.fail > 0 && (deleted > 0 \|\| s.fail >= Math.max(2, s.pass / 5));
	120	+ return {
	121	+ test: shortenTestLabel(s.file, s.name),
	122	+ repo: repoSlug,
	123	+ pass: s.pass,
	124	+ fail: s.fail,
	125	+ deleted,
	126	+ lastBrokenBy: agent,
	127	+ flagged,
	128	+ };
	129	+ })
	130	+ .sort((a, b) => b.fail - a.fail \|\| b.deleted - a.deleted \|\| b.pass - a.pass)
	131	+ .slice(0, 30);
	132	+
	133	+ return {
	134	+ snapshots: [snapshot],
	135	+ stability,
	136	+ runsCount: bundle.runs.length,
	137	+ ranAt: latest.ranAt,
	138	+ headSha: latest.sha,
	139	+ };
	140	+};

raw .diff