// c51 (reports) — body builders for /reports, /reports/demo,
// /reports/live, /reports/demo/agents/:slug, /reports/demo/tests. The
// builders take the dataset as an explicit ReportsContext so the same
// markdown templates serve both the synthetic demo (DEMO_* from
// a31_reports_demo) and the live tdd.md aggregation (c32_real_reports).
//
// NOTE(review): several helpers below compute CSS class names (`cls`,
// `statusCls`) that never reach their output, and their templates consist
// of bare text separated by blank lines. That strongly suggests wrapper
// markup is missing from this copy of the file. The visible text is kept
// verbatim; the unused locals are left in place so the intent is not lost
// when the markup is restored.

import {
  DEMO_REPORTS,
  type AgentReport,
  type FailureSlice,
  type TestSnapshot,
  type TestStability,
} from "./a31_reports_demo.ts";
import { escape } from "./b51_render_layout.ts";

/** Dataset + presentation inputs shared by the exec summary and the per-agent drill-down. */
export interface ReportsContext {
  reports: AgentReport[];
  period: string;
  scopeLabel: string;
  bannerHtml: string;
  // Optional narrative — present for the curated demo, omitted for live
  // where the data has to speak for itself.
  narrative?: {
    changedHeading: string;
    changedBody: string;
    doingHeading: string;
    doingBody: string;
  };
  // Trailing footer line (links). Defaults reasonable for both demo + live.
  footerLinks: string;
}

/** Inputs for the tests-overview page. */
export interface TestsOverviewContext {
  period: string;
  bannerHtml: string;
  snapshots: TestSnapshot[];
  stability: TestStability[];
  // When the runner sliver isn't wired (live mode, today), pass a
  // placeholder note instead of the snapshot+stability sections.
  unavailableNote?: string;
  // Placeholder-test detection: tests with zero `expect()` calls in
  // their body. Surfaces the failure mode from r/ClaudeCode 1qix264.
  placeholderTests?: { name: string; file: string; reason: string }[];
}

/** Maps a score delta to an arrow glyph plus a class name (up / down / flat). */
const trendArrow = (delta: number): { glyph: string; cls: string } =>
  delta > 0
    ? { glyph: "↑", cls: "up" }
    : delta < 0
      ? { glyph: "↓", cls: "down" }
      : { glyph: "→", cls: "flat" };

/** Formats a delta with an explicit `+` for positive values; zero and negatives print as-is. */
const signedDelta = (delta: number): string =>
  delta > 0 ? `+${delta}` : `${delta}`;

/**
 * Renders a series as an inline SVG polyline scaled into `width` x `height`.
 * Returns an empty string for an empty series.
 *
 * NOTE(review): the copy of this function under review computed `points` and
 * then returned an empty template, so the trend section rendered nothing. The
 * SVG wrapper below is a minimal reconstruction that relies on no stylesheet
 * class — confirm it against the intended markup.
 */
const sparkline = (values: number[], height = 60, width = 320): string => {
  if (values.length === 0) return "";
  const min = Math.min(...values);
  const max = Math.max(...values);
  // Clamp to at least 1 so a flat series does not divide by zero.
  const range = Math.max(1, max - min);
  // A single value has no segment to span; keep the divisor at 1.
  const stepX = width / Math.max(1, values.length - 1);
  const pad = 6;
  const innerH = height - pad * 2;
  const points = values
    .map((v, i) => {
      const x = (i * stepX).toFixed(1);
      // SVG's y axis grows downward, so larger values get smaller y.
      const y = (pad + innerH - ((v - min) / range) * innerH).toFixed(1);
      return `${x},${y}`;
    })
    .join(" ");
  return `<svg viewBox="0 0 ${width} ${height}" width="${width}" height="${height}" role="img" aria-label="trend"><polyline points="${points}" fill="none" stroke="currentColor" stroke-width="2" /></svg>`;
};

/** One summary tile per agent: score, signed delta, commit count, top issue. */
const tile = (a: AgentReport): string => {
  // NOTE(review): `arr.cls` is not emitted anywhere in the visible template.
  const arr = trendArrow(a.delta);
  const deltaStr = signedDelta(a.delta);
  return `

${a.score} / 100

${arr.glyph} ${escape(deltaStr)}

${a.commits.toLocaleString()} commits

top issue: ${escape(a.topIssueLabel)} (${a.topIssuePct}%)

`;
};

/** Failure-mode breakdown: one "label pct%" entry per slice. */
const bars = (mix: FailureSlice[]): string => {
  const rows = mix
    .map(
      (s) => `

${escape(s.label)} ${s.pct}%

`,
    )
    .join("\n");
  return `

${rows}

`;
};

/** Streak summary line; the wording switches when the streak was recently broken. */
const streakBox = (a: AgentReport): string => {
  // NOTE(review): `cls` is computed but never reaches the output below.
  const cls = a.streakBroken ? "broken" : a.streak >= 30 ? "long" : "";
  const label = a.streakBroken ? "recent break" : "consecutive clean cycles";
  return `${a.streak} ${label}`;
};

/**
 * Per-repo snapshot: repo @ branch, the test counts, then either an
 * all-passing line or one line per failure followed by the passing count.
 */
const snapshotBlock = (s: TestSnapshot): string => {
  const failuresHtml =
    s.failures.length === 0
      ? `

all ${s.passing} tests groen

`
      : s.failures
          .map(
            (f) => `

${escape(f.test)} ${f.flaky ? "intermittent · " : ""}sinds ${f.since}

`,
          )
          .concat([`

+ ${s.passing.toLocaleString()} passing tests

`])
          .join("\n");
  // NOTE(review): `statusCls` is computed but never reaches the output below.
  const statusCls = s.failing === 0 ? "ok" : "bad";
  return `

${escape(s.repo)} @ ${escape(s.branch)}

${s.total.toLocaleString()} tests · ${s.passing.toLocaleString()} passing${s.failing > 0 ? ` · ${s.failing.toLocaleString()} failing` : ""}

${failuresHtml}

`;
};

/**
 * Display name for an agent slug. The lookup always goes through
 * DEMO_REPORTS; a slug that is not found there is shown as-is.
 */
const agentTagHtml = (slug: AgentReport["slug"]): string => {
  const name = DEMO_REPORTS.find((r) => r.slug === slug)?.name ?? slug;
  return `${escape(name)}`;
};

/** One stability entry: test, repo, pass/fail/deleted counts, last breaker, optional ⚠. */
const stabilityRow = (s: TestStability): string => {
  // NOTE(review): `cls` is computed but never reaches the output below.
  const cls = s.flagged ? "test-stab-row flagged" : "test-stab-row";
  const warn = s.flagged ? ` ⚠` : "";
  return ` ${escape(s.test)}

${escape(s.repo)}

${s.pass} ${s.fail} ${s.deleted} ${agentTagHtml(s.lastBrokenBy)}${warn} `;
};

/** Coaching bullet for an agent: claude-code and cursor get their own post, anything else gets the Aider post. */
const coachingLink = (slug: AgentReport["slug"]): string =>
  slug === "claude-code"
    ? `[Claude Code does not do TDD by default](/blog/2026-05/claude-code-tdd) — CLAUDE.md rules + fresh-context boundaries that prevent \`red-did-not-fail\`.`
    : slug === "cursor"
      ? `[Cursor knows how to do TDD; users skip the parts that matter](/blog/2026-05/cursor-tdd) — Plan Mode, fresh chats, \`.cursor/rules\` to stop test-deletion.`
      : `[Aider is the closest agent to TDD on rails — until \`--auto-test\`](/blog/2026-05/aider-tdd) — keep auto-test off for green commits, on for refactor.`;

/** Static markdown body for the /reports landing page. */
export const reportsLandingMd = (): string => `# reports

> Per-agent TDD-discipline reporting over real project repos. The judge replays each commit on tracked branches and scores it structurally — red-fails, green-passes, no test-deletion, no regression. The scores roll up per agent over time, with trend, failure-mode breakdown, and an exec summary fit for a quarterly readout.

Two views of the same shape:

- **[/reports/live](/reports/live)** — built from real commit data on \`syntaxai/tdd.md\` (the repo this site runs on), refreshed every 5 minutes from the GitHub commits API. Agent attribution comes from \`Co-Authored-By:\` footers. Phase-coverage is the only metric we can compute without running tests, so the score is a proxy for now.
- **[/reports/demo](/reports/demo)** — the polished design preview with synthetic data for three agents and four repos. Useful for screenshots and showing the full failure-mode breakdown the live view can't compute yet.

Drill-downs:

- [live drill-down per agent](/reports/live/agents/claude-code) · [tests overview (live)](/reports/live/tests)
- [demo drill-down per agent](/reports/demo/agents/cursor) · [tests overview (demo)](/reports/demo/tests)

Want a real repo on this layer? [Register a project →](/projects) — drops \`.tdd-md.json\` at the repo root, onboards in seconds. Per-commit judging on tracked branches lands in a follow-up sliver; live reporting from the GitHub API already works for the dogfood case (the tdd.md repo itself).

## what gets measured

This layer measures **discipline**, not code-quality. Without hidden tests (those only exist on katas), tdd.md can't catch tautologies or weakened assertions on real repos.
It *can* catch:

| failure mode | what triggers it | what it costs |
|---|---|---|
| \`red-did-not-fail\` | commit tagged \`red:\` but tests pass | -5 / commit |
| \`test-deleted\` | test count drops between commits | -20 / commit |
| \`broken refactor\` | tests fail at a \`refactor:\` commit | -5 / commit |
| \`no phase tag\` | tracked-branch commit missing \`red\\|green\\|refactor:\` | counts against phase-coverage % |

The metric pair that anchors the report is **discipline-score** (0-100) + **phase-coverage %**. An agent with 0% phase-coverage doesn't *do* TDD — its score is N/A, not 0. Don't let a low-volume non-attempt look like a high-volume slip.

## reading the data

For management:

- the [exec summary](/reports/demo) gives one number per agent + a narrative paragraph. Prints to one page.

For team-leads:

- the [drill-down](/reports/demo/agents/cursor) shows trend, failure-mix, streak, and the most recent flagged commits with one-click coaching links to the [Claude Code](/blog/2026-05/claude-code-tdd) / [Cursor](/blog/2026-05/cursor-tdd) / [Aider](/blog/2026-05/aider-tdd) posts.

[← back to tdd.md](/) · [the blog](/blog) · [the katas](/games)
`;

/**
 * Exec-summary page: title, banner, period/scope line with the total commit
 * count, one tile per agent (or an empty-state line), the optional narrative,
 * a fixed caveat section and the footer links.
 */
export const execSummaryMd = (ctx: ReportsContext): string => {
  const totalCommits = ctx.reports.reduce((s, a) => s + a.commits, 0);
  const tiles =
    ctx.reports.length === 0
      ? `

No agent-attributed commits in this dataset.

`
      : ctx.reports.map(tile).join("\n");
  // Ends with a blank line so the heading that follows it in the page
  // template starts on its own line.
  const narrativeBlock = ctx.narrative
    ? `## ${ctx.narrative.changedHeading}

${ctx.narrative.changedBody}

## ${ctx.narrative.doingHeading}

${ctx.narrative.doingBody}

`
    : "";
  return `# tdd-discipline report · ${ctx.period}

${ctx.bannerHtml}

> **Period** ${ctx.period} · **Scope** ${escape(ctx.scopeLabel)} · ${totalCommits.toLocaleString()} AI-attributed commits.

${tiles}

${narrativeBlock}## what this number does *not* measure

Discipline, not code quality. Hidden tests (like the ones on the katas) don't exist for production repos, so *tautological* tests and *weakly-asserted* checks stay invisible to the judge. This number says: "the agent honours the TDD cycle". It says nothing about whether the tests it writes assert the right thing. For that second signal, kata performance ([leaderboard](/leaderboard)) remains the proxy.

---

${ctx.footerLinks}
`;
};

/**
 * Per-agent drill-down page: headline score, trend sparkline, streak,
 * failure-mode breakdown, recent flagged commits and coaching links.
 *
 * @returns the markdown body, or `null` when `slug` matches no report in `ctx`.
 */
export const agentDrilldownMd = (
  slug: AgentReport["slug"],
  ctx: ReportsContext,
): string | null => {
  const a = ctx.reports.find((r) => r.slug === slug);
  if (!a) return null;
  const arr = trendArrow(a.delta);
  const deltaStr = signedDelta(a.delta);
  const recentRows =
    a.recent.length === 0
      ? `| _no recent attributed activity_ | | | | | |`
      : a.recent
          .map(
            (r) =>
              `| ${r.date} | \`${r.repo}\` | \`${r.sha}\` | ${r.phase} | ${r.failure} | ${r.pts} |`,
          )
          .join("\n");
  return `# ${a.name} · drill-down

${ctx.bannerHtml}

> Discipline score **${a.score} / 100** ${arr.glyph} ${deltaStr} over ${ctx.period}. ${a.commits.toLocaleString()} commits analysed, phase coverage **${a.phaseCoveragePct}%**.

## trend (30 days)

${sparkline(a.trend)}

${streakBox(a)}

## failure-mode breakdown

${bars(a.failureMix)}

Top issue this quarter: **${escape(a.topIssueLabel)}** (${a.topIssuePct}% of commits).

## recent flagged

| date | repo | sha | phase | failure | pts |
|---|---|---|---|---|---|
${recentRows}

## coaching

- ${coachingLink(a.slug)}
- [Tweag's TDD handbook needs a judge](/blog/2026-05/tweag-handbook-tdd) — why local green isn't enough.

---

${ctx.footerLinks}
`;
};

/**
 * Tests-overview page. When `ctx.unavailableNote` is set, only the banner
 * and that note are rendered. Otherwise the page shows the per-repo
 * snapshots, summed totals, the placeholder-test section and the stability
 * listing.
 */
export const testsOverviewMd = (ctx: TestsOverviewContext): string => {
  if (ctx.unavailableNote) {
    // NOTE(review): both footer links point at /reports; the first one is
    // labelled "exec summary" — confirm whether it should target a summary page.
    return `# tests overview

${ctx.bannerHtml}

> ${ctx.unavailableNote}

[← exec summary](/reports) · [back to /reports](/reports)
`;
  }

  const total = ctx.snapshots.reduce((s, r) => s + r.total, 0);
  const passing = ctx.snapshots.reduce((s, r) => s + r.passing, 0);
  const failing = ctx.snapshots.reduce((s, r) => s + r.failing, 0);
  const snapshots = ctx.snapshots.map(snapshotBlock).join("\n");
  const stabRows = ctx.stability.map(stabilityRow).join("\n");
  const placeholders = ctx.placeholderTests ?? [];

  const placeholderBlock =
    placeholders.length === 0
      ? `## placeholder tests

> No placeholder tests detected at this snapshot. A placeholder is a test whose body contains zero \`expect()\` calls — covered in [the corpus post](/blog/2026-05/agentic-coding-corpus-three-patterns) as the failure mode from r/ClaudeCode 1qix264 ("90 placeholder tests, 100% pass rate"). Detection runs on every deploy.
`
      : `## placeholder tests · ⚠ ${placeholders.length} flagged

> A placeholder test is one whose body contains zero \`expect()\` calls — empty body, comment-only stub, or string-literal body. Covered in [the corpus post](/blog/2026-05/agentic-coding-corpus-three-patterns) as the failure mode from r/ClaudeCode 1qix264. The judge would refuse a merge that includes any of these.

| test | file | reason |
|---|---|---|
${placeholders.map((p) => `| ${escape(p.name)} | \`${escape(p.file)}\` | ${escape(p.reason)} |`).join("\n")}
`;

  // The column header is emitted before the stability rows; the copy under
  // review had the rows ahead of their own header.
  return `# tests overview

${ctx.bannerHtml}

> Snapshot of the current test state per repo + stability of individual tests over ${ctx.period}. A high fail count with zero deletions means the test is actively catching regressions; high fail + deletion is the signal that a test is being squeezed — often the trace of an agent making it easier to "win".

## current state · per repo

${snapshots}

**Total**: ${total.toLocaleString()} tests · ${passing.toLocaleString()} passing · ${failing.toLocaleString()} failing${placeholders.length > 0 ? ` · ${placeholders.length} placeholder ⚠` : ""}.

${placeholderBlock}

## test stability · ${ctx.period}

Top tests by failure activity this period, with pass/fail/deleted counts and the agent who last broke the test.

test\tpass\tfail\tdel\tlast broken by

${stabRows}

> ⚠ marks tests where a test-deletion or weakening event has been detected this period. In a real setup, clicking a test name will link through to that test's commit history.

## how to read this

- **Lots of pass, few fail, 0 del**: healthy. The test does what it should, nobody is sabotaging it.
- **Lots of fail, 0 del**: the test is actively catching regressions. Good news — discipline is working.
- **Fail and del > 0**: the test is under pressure. Coach the agent that broke it (click the tag icon).
- **Snapshot red + stability high**: a known, long-running broken test. Separate concern, not necessarily an agent problem.

---

[← exec summary](/reports/demo) · [back to /reports](/reports)
`;
};