// syntaxai/tdd.md · main · src / b51_render_reports.ts
// c51 (reports) — body builders for /reports, /reports/demo,
// /reports/live, /reports/demo/agents/:slug, /reports/demo/tests. The
// builders take the dataset as an explicit ReportsContext so the same
// markdown templates serve both the synthetic demo (DEMO_* from
// c31_reports_demo) and the live tdd.md aggregation (c32_real_reports).
import {
DEMO_REPORTS,
type AgentReport,
type FailureSlice,
type TestSnapshot,
type TestStability,
} from "./a31_reports_demo.ts";
import { escape } from "./b51_render_layout.ts";
/**
 * Curated story rendered by the exec summary as two `##` sections
 * (heading + body each), placed just before the "what this number does
 * *not* measure" section.
 */
export interface ReportsNarrative {
  changedHeading: string;
  changedBody: string;
  doingHeading: string;
  doingBody: string;
}

/**
 * Dataset handed to the exec-summary and agent drill-down builders, so the
 * same templates can render either the demo or the live data.
 */
export interface ReportsContext {
  /** One entry per agent; the drill-down looks an agent up here by slug. */
  reports: AgentReport[];
  /** Reporting period label, interpolated as-is into headings and blurbs. */
  period: string;
  /** Scope label shown after "Scope" in the exec summary (escaped there). */
  scopeLabel: string;
  /** Banner markup inserted verbatim directly under the page heading. */
  bannerHtml: string;
  /**
   * Optional narrative — present for the curated demo, omitted for live
   * where the data has to speak for itself.
   */
  narrative?: ReportsNarrative;
  /**
   * Trailing footer line (links), inserted verbatim after the closing
   * `---`. The field is required, so callers always supply it.
   */
  footerLinks: string;
}
/** One row of the "placeholder tests" table on the tests overview. */
export interface PlaceholderTest {
  name: string;
  file: string;
  reason: string;
}

/** Everything the tests-overview builder needs to render its page. */
export interface TestsOverviewContext {
  /** Reporting period label, interpolated as-is into the copy. */
  period: string;
  /** Banner markup inserted verbatim directly under the page heading. */
  bannerHtml: string;
  /** Current test state, one entry per repo. */
  snapshots: TestSnapshot[];
  /** Per-test pass/fail/deleted history rendered as the stability table. */
  stability: TestStability[];
  /**
   * When the runner sliver isn't wired (live mode, today), pass a
   * placeholder note instead of the snapshot+stability sections.
   */
  unavailableNote?: string;
  /**
   * Placeholder-test detection: tests with zero `expect()` calls in
   * their body. Surfaces the failure mode from r/ClaudeCode 1qix264.
   */
  placeholderTests?: PlaceholderTest[];
}
/**
 * Picks the arrow glyph and CSS modifier class for a score delta.
 *
 * @param delta Change in score; only its sign matters.
 * @returns `up` for positive, `down` for negative, `flat` otherwise.
 */
const trendArrow = (delta: number): { glyph: string; cls: string } => {
  if (delta > 0) {
    return { glyph: "↑", cls: "up" };
  }
  if (delta < 0) {
    return { glyph: "↓", cls: "down" };
  }
  // Zero lands here, and so does NaN because it fails both comparisons.
  return { glyph: "→", cls: "flat" };
};
/**
 * Renders a numeric series as an inline SVG polyline.
 *
 * The series is scaled vertically between its own min and max, leaving a
 * 6-unit pad at the top and bottom, and spread evenly across the full
 * width. The stroke uses `currentColor`, so the surrounding element
 * decides the colour.
 *
 * @param values Series to plot, oldest first.
 * @param height viewBox height (default 60).
 * @param width viewBox width (default 320).
 * @returns SVG markup, or an empty string when there is nothing to plot.
 */
const sparkline = (values: number[], height = 60, width = 320): string => {
  if (values.length === 0) return "";
  const min = Math.min(...values);
  const max = Math.max(...values);
  // Floor the range at 1 so a constant series does not divide by zero.
  const range = Math.max(1, max - min);
  const pad = 6;
  const innerH = height - pad * 2;
  // SVG y grows downwards, so larger values get smaller y coordinates.
  const yFor = (v: number): string =>
    (pad + innerH - ((v - min) / range) * innerH).toFixed(1);

  let points: string;
  if (values.length === 1) {
    // A polyline with a single point strokes nothing with the default butt
    // line cap, so draw the lone sample as a flat line across the width.
    // With one element, `min` is that element.
    const y = yFor(min);
    points = `${(0).toFixed(1)},${y} ${width.toFixed(1)},${y}`;
  } else {
    const stepX = width / (values.length - 1);
    points = values
      .map((v, i) => `${(i * stepX).toFixed(1)},${yFor(v)}`)
      .join(" ");
  }

  return `<svg class="report-sparkline" viewBox="0 0 ${width} ${height}" preserveAspectRatio="none" aria-hidden="true">
<polyline fill="none" stroke="currentColor" stroke-width="1.5" points="${points}" />
</svg>`;
};
/**
 * Builds the exec-summary tile for one agent: linked name, score out of
 * 100, trend arrow with signed delta, commit volume and top issue.
 *
 * NOTE(review): the name link always targets `/reports/demo/agents/…`,
 * whichever dataset the report came from — confirm that is intended for
 * the live view.
 *
 * @param a Agent report to summarise.
 * @returns HTML for a single `report-tile` element.
 */
const tile = (a: AgentReport): string => {
  const { glyph, cls } = trendArrow(a.delta);
  // Positive deltas get an explicit "+"; zero and negatives print as-is.
  const signedDelta = a.delta > 0 ? `+${a.delta}` : `${a.delta}`;
  const lines = [
    `<div class="report-tile">`,
    `<p class="report-tile-name"><a href="/reports/demo/agents/${a.slug}">${escape(a.name)}</a></p>`,
    `<p class="report-tile-score">${a.score}<span class="report-tile-score-suffix"> / 100</span></p>`,
    `<p class="report-tile-trend ${cls}">${glyph} ${escape(signedDelta)}</p>`,
    `<p class="report-tile-volume">${a.commits.toLocaleString()} commits</p>`,
    `<div class="report-tile-issue">top issue: <strong>${escape(a.topIssueLabel)}</strong> (${a.topIssuePct}%)</div>`,
    `</div>`,
  ];
  return lines.join("\n");
};
/**
 * Renders the failure-mode breakdown as horizontal bars, one row per
 * slice: escaped label, a track whose fill width is the slice percentage,
 * and the percentage as text.
 *
 * @param mix Failure slices in display order.
 * @returns A `report-bars` wrapper; it has no rows when `mix` is empty.
 */
const bars = (mix: FailureSlice[]): string => {
  const renderRow = (slice: FailureSlice): string =>
    [
      `<div class="report-bar-row">`,
      `<span class="report-bar-label">${escape(slice.label)}</span>`,
      // `tone` becomes a CSS modifier class on the fill element.
      `<span class="report-bar-track"><span class="report-bar-fill ${slice.tone}" style="width: ${slice.pct}%"></span></span>`,
      `<span class="report-bar-pct">${slice.pct}%</span>`,
      `</div>`,
    ].join("\n");

  const rowsHtml = mix.map((slice) => renderRow(slice)).join("\n");
  return `<div class="report-bars">${rowsHtml}</div>`;
};
/**
 * Renders the streak badge for an agent.
 *
 * A broken streak takes precedence: it gets the `broken` modifier and the
 * "recent break" caption whatever the count. Otherwise streaks of 30 or
 * more get the `long` modifier, and shorter ones get none.
 *
 * @param a Agent report supplying `streak` and `streakBroken`.
 * @returns A `report-streak` span.
 */
const streakBox = (a: AgentReport): string => {
  let modifier = "";
  let caption = "consecutive clean cycles";
  if (a.streakBroken) {
    modifier = "broken";
    caption = "recent break";
  } else if (a.streak >= 30) {
    modifier = "long";
  }
  return `<span class="report-streak ${modifier}"><span class="report-streak-num">${a.streak}</span> ${caption}</span>`;
};
/**
 * Renders one repo's current test state as a card: repo and branch,
 * total/passing/failing counts, and a list of the failing tests.
 *
 * With no failures the list is a single "all passing" item. Otherwise it
 * shows one item per failure followed by a collapsed passing count.
 *
 * NOTE(review): two user-facing fragments are Dutch ("groen" = green,
 * "sinds" = since); they are kept verbatim here.
 *
 * @param s Snapshot for a single repo/branch.
 * @returns A `test-snapshot` element marked `ok` or `bad`.
 */
const snapshotBlock = (s: TestSnapshot): string => {
  const items: string[] = [];
  if (s.failures.length === 0) {
    items.push(`<li class="test-list-pass">all ${s.passing} tests groen</li>`);
  } else {
    for (const failure of s.failures) {
      const flakyPrefix = failure.flaky ? "intermittent · " : "";
      items.push(
        `<li class="test-list-fail">${escape(failure.test)} <span class="test-list-meta">${flakyPrefix}sinds ${failure.since}</span></li>`,
      );
    }
    items.push(`<li class="test-list-collapsed">+ ${s.passing.toLocaleString()} passing tests</li>`);
  }
  const listHtml = items.join("\n");

  // The card status follows the failing count, not the failures list.
  const statusCls = s.failing === 0 ? "ok" : "bad";
  const failingHtml = s.failing > 0
    ? ` · <span class="red">${s.failing.toLocaleString()} failing</span>`
    : "";

  return [
    `<div class="test-snapshot ${statusCls}">`,
    `<p class="test-snapshot-head"><strong>${escape(s.repo)}</strong> <span class="test-snapshot-branch">@ ${escape(s.branch)}</span></p>`,
    `<p class="test-snapshot-stats">${s.total.toLocaleString()} tests · <span class="green">${s.passing.toLocaleString()} passing</span>${failingHtml}</p>`,
    `<ul class="test-list">`,
    listHtml,
    `</ul>`,
    `</div>`,
  ].join("\n");
};
/**
 * Renders an agent slug as a link to that agent's demo drill-down.
 *
 * The display name is looked up in `DEMO_REPORTS`; when no entry matches,
 * the slug itself is used as the label.
 *
 * @param slug Agent identifier.
 * @returns An `agent-tag` anchor with an escaped label.
 */
const agentTagHtml = (slug: AgentReport["slug"]): string => {
  const match = DEMO_REPORTS.find((report) => report.slug === slug);
  const label = match?.name ?? slug;
  return `<a class="agent-tag" href="/reports/demo/agents/${slug}">${escape(label)}</a>`;
};
/**
 * Renders one row of the test-stability table: test name with repo,
 * pass/fail/deleted counts, and the agent that last broke the test.
 *
 * Flagged tests get the `flagged` row class and a warning glyph. The fail
 * count turns red from 8 upwards; the deleted count turns red when it is
 * above zero.
 *
 * @param s Stability record for a single test.
 * @returns A `<tr>` element.
 */
const stabilityRow = (s: TestStability): string => {
  let rowCls = "test-stab-row";
  let warnHtml = "";
  if (s.flagged) {
    rowCls += " flagged";
    warnHtml = ` <span class="test-stab-warn" title="test-deletion or weakening this quarter">⚠</span>`;
  }
  const failTone = s.fail >= 8 ? "red" : "";
  const deletedTone = s.deleted > 0 ? "red" : "";

  const cells = [
    `<td class="test-stab-name">${escape(s.test)}<div class="test-stab-repo">${escape(s.repo)}</div></td>`,
    `<td class="test-stab-num green">${s.pass}</td>`,
    `<td class="test-stab-num ${failTone}">${s.fail}</td>`,
    `<td class="test-stab-num ${deletedTone}">${s.deleted}</td>`,
    `<td class="test-stab-by">${agentTagHtml(s.lastBrokenBy)}${warnHtml}</td>`,
  ];
  return [`<tr class="${rowCls}">`, ...cells, `</tr>`].join("\n");
};
/**
 * Markdown body for the reports landing page.
 *
 * Introduces the live and demo views, links the per-agent and tests
 * drill-downs, lists the failure modes with their penalties, and closes
 * with navigation links. Nothing is interpolated, so every call returns
 * the same string.
 *
 * @returns The landing-page markdown.
 */
export const reportsLandingMd = (): string => `# reports
> Per-agent TDD-discipline reporting over real project repos. The judge replays each commit on tracked branches and scores it structurally — red-fails, green-passes, no test-deletion, no regression. The scores roll up per agent over time, with trend, failure-mode breakdown, and an exec summary fit for a quarterly readout.
Two views of the same shape:
- **[/reports/live](/reports/live)** — built from real commit data on \`syntaxai/tdd.md\` (the repo this site runs on), refreshed every 5 minutes from the GitHub commits API. Agent attribution comes from \`Co-Authored-By:\` footers. Phase-coverage is the only metric we can compute without running tests, so the score is a proxy for now.
- **[/reports/demo](/reports/demo)** — the polished design preview with synthetic data for three agents and four repos. Useful for screenshots and showing the full failure-mode breakdown the live view can't compute yet.
Drill-downs:
- [live drill-down per agent](/reports/live/agents/claude-code) · [tests overview (live)](/reports/live/tests)
- [demo drill-down per agent](/reports/demo/agents/cursor) · [tests overview (demo)](/reports/demo/tests)
Want a real repo on this layer? [Register a project →](/projects) — drops \`.tdd-md.json\` at the repo root, onboards in seconds. Per-commit judging on tracked branches lands in a follow-up sliver; live reporting from the GitHub API already works for the dogfood case (the tdd.md repo itself).
## what gets measured
This layer measures **discipline**, not code-quality. Without hidden tests (those only exist on katas), tdd.md can't catch tautologies or weakened assertions on real repos. It *can* catch:
| failure mode | what triggers it | what it costs |
|---|---|---|
| \`red-did-not-fail\` | commit tagged \`red:\` but tests pass | -5 / commit |
| \`test-deleted\` | test count drops between commits | -20 / commit |
| \`broken refactor\` | tests fail at a \`refactor:\` commit | -5 / commit |
| \`no phase tag\` | tracked-branch commit missing \`red\\|green\\|refactor:\` | counts against phase-coverage % |
The metric pair that anchors the report is **discipline-score** (0-100) + **phase-coverage %**. An agent with 0% phase-coverage doesn't *do* TDD — its score is N/A, not 0. Don't let a low-volume non-attempt look like a high-volume slip.
## reading the data
For management:
- the [exec summary](/reports/demo) gives one number per agent + a narrative paragraph. Prints to one page.
For team-leads:
- the [drill-down](/reports/demo/agents/cursor) shows trend, failure-mix, streak, and the most recent flagged commits with one-click coaching links to the [Claude Code](/blog/2026-05/claude-code-tdd) / [Cursor](/blog/2026-05/cursor-tdd) / [Aider](/blog/2026-05/aider-tdd) posts.
[← back to tdd.md](/) · [the blog](/blog) · [the katas](/games)
`;
/**
 * Builds the exec-summary page: heading, banner, a one-line period/scope/
 * volume blurb, one tile per agent, the optional narrative, a fixed
 * caveat section and the footer links.
 *
 * `bannerHtml` and `footerLinks` are inserted verbatim; `scopeLabel` is
 * escaped. With no reports, a single empty-state tile is shown instead.
 *
 * @param ctx Dataset and surrounding copy to render.
 * @returns The page body as markdown with embedded HTML.
 */
export const execSummaryMd = (ctx: ReportsContext): string => {
  let commitTotal = 0;
  for (const agent of ctx.reports) {
    commitTotal += agent.commits;
  }

  let tilesHtml: string;
  if (ctx.reports.length === 0) {
    tilesHtml = `<div class="report-tile-empty">No agent-attributed commits in this dataset.</div>`;
  } else {
    tilesHtml = ctx.reports.map((agent) => tile(agent)).join("\n");
  }

  // The trailing empty element keeps the final newline, so the section
  // after the narrative starts on its own line.
  const story = ctx.narrative;
  const narrativeMd = story
    ? [
      `## ${story.changedHeading}`,
      `${story.changedBody}`,
      `## ${story.doingHeading}`,
      `${story.doingBody}`,
      "",
    ].join("\n")
    : "";

  return `# tdd-discipline report · ${ctx.period}
${ctx.bannerHtml}
> **Period** ${ctx.period} · **Scope** ${escape(ctx.scopeLabel)} · ${commitTotal.toLocaleString()} AI-attributed commits.
<div class="report-tiles">
${tilesHtml}
</div>
${narrativeMd}## what this number does *not* measure
Discipline, not code quality. Hidden tests (like the ones on the katas) don't exist for production repos, so *tautological* tests and *weakly-asserted* checks stay invisible to the judge. This number says: "the agent honours the TDD cycle". It says nothing about whether the tests it writes assert the right thing. For that second signal, kata performance ([leaderboard](/leaderboard)) remains the proxy.
---
${ctx.footerLinks}
`;
};
/**
 * Builds the drill-down page for one agent: score blurb, trend sparkline,
 * streak badge, failure-mode bars, recent flagged commits and coaching
 * links.
 *
 * The agent name is escaped in the heading, matching how the tile renders
 * the same field and how this page already treats `topIssueLabel`.
 * `bannerHtml` and `footerLinks` are inserted verbatim.
 *
 * NOTE(review): the recent-commit cells (`date`, `repo`, `sha`, `phase`,
 * `failure`, `pts`) are interpolated unescaped, as before — confirm they
 * can never carry markup from commit data.
 *
 * @param slug Agent to render.
 * @param ctx Dataset and surrounding copy.
 * @returns The page body, or `null` when `ctx.reports` has no such slug.
 */
export const agentDrilldownMd = (
  slug: AgentReport["slug"],
  ctx: ReportsContext,
): string | null => {
  const a = ctx.reports.find((r) => r.slug === slug);
  if (!a) return null;

  const arr = trendArrow(a.delta);
  // Positive deltas get an explicit "+"; zero and negatives print as-is.
  const deltaStr = a.delta > 0 ? `+${a.delta}` : `${a.delta}`;
  // The sparkline strokes with currentColor, so the wrapper class tints it.
  const trendTone = arr.cls === "down" ? "red" : arr.cls === "up" ? "green" : "muted";

  // An empty history still needs one row so the table keeps its six columns.
  const recentRows = a.recent.length === 0
    ? `| _no recent attributed activity_ | | | | | |`
    : a.recent
      .map(
        (r) =>
          `| ${r.date} | \`${r.repo}\` | \`${r.sha}\` | ${r.phase} | ${r.failure} | ${r.pts} |`,
      )
      .join("\n");

  // Agent-specific coaching post; any slug other than the first two gets
  // the Aider post.
  const coaching = a.slug === "claude-code"
    ? `[Claude Code does not do TDD by default](/blog/2026-05/claude-code-tdd) — CLAUDE.md rules + fresh-context boundaries that prevent \`red-did-not-fail\`.`
    : a.slug === "cursor"
    ? `[Cursor knows how to do TDD; users skip the parts that matter](/blog/2026-05/cursor-tdd) — Plan Mode, fresh chats, \`.cursor/rules\` to stop test-deletion.`
    : `[Aider is the closest agent to TDD on rails — until \`--auto-test\`](/blog/2026-05/aider-tdd) — keep auto-test off for green commits, on for refactor.`;

  return `# ${escape(a.name)} · drill-down
${ctx.bannerHtml}
> Discipline score **${a.score} / 100** <span class="report-tile-trend ${arr.cls}">${arr.glyph} ${deltaStr}</span> over ${ctx.period}. ${a.commits.toLocaleString()} commits analysed, phase coverage **${a.phaseCoveragePct}%**.
## trend (30 days)
<div class="${trendTone}">
${sparkline(a.trend)}
</div>
${streakBox(a)}
## failure-mode breakdown
${bars(a.failureMix)}
Top issue this quarter: **${escape(a.topIssueLabel)}** (${a.topIssuePct}% of commits).
## recent flagged
| date | repo | sha | phase | failure | pts |
|---|---|---|---|---|---|
${recentRows}
## coaching
- ${coaching}
- [Tweag's TDD handbook needs a judge](/blog/2026-05/tweag-handbook-tdd) — why local green isn't enough.
---
${ctx.footerLinks}
`;
};
/**
 * Builds the tests-overview page.
 *
 * When `ctx.unavailableNote` is set, the page is only heading, banner, the
 * note and navigation links. Otherwise it renders per-repo snapshots, the
 * totals line, the placeholder-test section, the stability table and the
 * reading guide.
 *
 * `bannerHtml` and `unavailableNote` are inserted verbatim; placeholder
 * names, files and reasons are escaped.
 *
 * @param ctx Snapshot and stability data plus surrounding copy.
 * @returns The page body as markdown with embedded HTML.
 */
export const testsOverviewMd = (ctx: TestsOverviewContext): string => {
  if (ctx.unavailableNote) {
    // The note is passed for the live view, so "exec summary" points at the
    // live summary. It used to link to /reports, duplicating the
    // neighbouring "back to /reports" link.
    return `# tests overview
${ctx.bannerHtml}
> ${ctx.unavailableNote}
[← exec summary](/reports/live) · [back to /reports](/reports)
`;
  }

  // Totals across every repo snapshot, shown under the snapshot cards.
  const total = ctx.snapshots.reduce((s, r) => s + r.total, 0);
  const passing = ctx.snapshots.reduce((s, r) => s + r.passing, 0);
  const failing = ctx.snapshots.reduce((s, r) => s + r.failing, 0);
  const snapshots = ctx.snapshots.map(snapshotBlock).join("\n");
  const stabRows = ctx.stability.map(stabilityRow).join("\n");

  // A missing list is treated the same as "none detected".
  const placeholders = ctx.placeholderTests ?? [];
  const placeholderBlock = placeholders.length === 0
    ? `## placeholder tests
> No placeholder tests detected at this snapshot. A placeholder is a test whose body contains zero \`expect()\` calls — covered in [the corpus post](/blog/2026-05/agentic-coding-corpus-three-patterns) as the failure mode from r/ClaudeCode 1qix264 ("90 placeholder tests, 100% pass rate"). Detection runs on every deploy.
`
    : `## placeholder tests · ⚠ ${placeholders.length} flagged
> A placeholder test is one whose body contains zero \`expect()\` calls — empty body, comment-only stub, or string-literal body. Covered in [the corpus post](/blog/2026-05/agentic-coding-corpus-three-patterns) as the failure mode from r/ClaudeCode 1qix264. The judge would refuse a merge that includes any of these.
| test | file | reason |
|---|---|---|
${placeholders.map((p) => `| ${escape(p.name)} | \`${escape(p.file)}\` | ${escape(p.reason)} |`).join("\n")}
`;

  return `# tests overview
${ctx.bannerHtml}
> Snapshot of the current test state per repo + stability of individual tests over ${ctx.period}. A high fail count with zero deletions means the test is actively catching regressions; high fail + deletion is the signal that a test is being squeezed — often the trace of an agent making it easier to "win".
## current state · per repo
<div class="test-snapshots">
${snapshots}
</div>
**Total**: ${total.toLocaleString()} tests · <span class="green">${passing.toLocaleString()} passing</span> · <span class="${failing > 0 ? "red" : "muted"}">${failing.toLocaleString()} failing</span>${placeholders.length > 0 ? ` · <span class="red">${placeholders.length} placeholder ⚠</span>` : ""}.
${placeholderBlock}
## test stability · ${ctx.period}
Top tests by failure activity this period, with pass/fail/deleted counts and the agent who last broke the test.
<table class="test-stability">
<thead>
<tr>
<th>test</th>
<th class="num">pass</th>
<th class="num">fail</th>
<th class="num">del</th>
<th>last broken by</th>
</tr>
</thead>
<tbody>
${stabRows}
</tbody>
</table>
> ⚠ marks tests where a test-deletion or weakening event has been detected this period. In a real setup, clicking a test name will link through to that test's commit history.
## how to read this
- **Lots of pass, few fail, 0 del**: healthy. The test does what it should, nobody is sabotaging it.
- **Lots of fail, 0 del**: the test is actively catching regressions. Good news — discipline is working.
- **Fail and del > 0**: the test is under pressure. Coach the agent that broke it (click the tag icon).
- **Snapshot red + stability high**: a known, long-running broken test. Separate concern, not necessarily an agent problem.
---
[← exec summary](/reports/demo) · [back to /reports](/reports)
`;
};