syntaxai/tdd.md · main · src / b51_render_reports.ts

b51_render_reports.ts 344 lines · 15171 bytes raw
// c51 (reports) — body builders for /reports, /reports/demo,
// /reports/live, /reports/demo/agents/:slug, /reports/demo/tests. The
// builders take the dataset as an explicit ReportsContext so the same
// markdown templates serve both the synthetic demo (DEMO_* from
// a31_reports_demo) and the live tdd.md aggregation (c32_real_reports).

import {
  DEMO_REPORTS,
  type AgentReport,
  type FailureSlice,
  type TestSnapshot,
  type TestStability,
} from "./a31_reports_demo.ts";
import { escape } from "./b51_render_layout.ts";

/**
 * Dataset plus page chrome handed to the report body builders
 * (`execSummaryMd`, `agentDrilldownMd`). Passing it explicitly lets the same
 * markdown templates render either the synthetic demo data or the live
 * aggregation.
 */
export interface ReportsContext {
  /**
   * Per-agent reports. The exec summary renders one tile per entry (or an
   * empty-state tile when the array is empty); the drill-down looks an entry
   * up by `slug`.
   */
  reports: AgentReport[];
  /** Reporting period label; interpolated verbatim into headings and summary lines. */
  period: string;
  /** Scope description shown in the exec-summary blockquote (HTML-escaped there). */
  scopeLabel: string;
  /** Banner markup inserted as-is below the page heading — the builders do not escape it. */
  bannerHtml: string;
  /**
   * Optional narrative — present for the curated demo, omitted for live
   * where the data has to speak for itself. When present, the exec summary
   * renders it as two `##` sections (heading + body each).
   */
  narrative?: {
    changedHeading: string;
    changedBody: string;
    doingHeading: string;
    doingBody: string;
  };
  /**
   * Trailing footer line (links), emitted as-is after the closing `---`
   * rule. The field is required: no default is applied in this module, so
   * both the demo and the live caller must supply it.
   */
  footerLinks: string;
}

/**
 * Input for `testsOverviewMd`: the per-repo test snapshots, per-test
 * stability rows and optional placeholder-test findings for one period.
 */
export interface TestsOverviewContext {
  /** Reporting period label; interpolated verbatim into the intro and the stability heading. */
  period: string;
  /** Banner markup inserted as-is below the page heading — not escaped by the builder. */
  bannerHtml: string;
  /** One entry per repo; rendered as snapshot cards and summed into the totals line. */
  snapshots: TestSnapshot[];
  /** One entry per tracked test; rendered as rows of the stability table. */
  stability: TestStability[];
  /**
   * When the runner sliver isn't wired (live mode, today), pass a
   * placeholder note instead of the snapshot+stability sections. Any
   * non-empty value short-circuits the builder: only the heading, banner,
   * this note and a footer are rendered, and `snapshots` / `stability` /
   * `placeholderTests` are ignored.
   */
  unavailableNote?: string;
  /**
   * Placeholder-test detection: tests with zero `expect()` calls in
   * their body. Surfaces the failure mode from r/ClaudeCode 1qix264.
   * Omitted is treated the same as an empty list.
   */
  placeholderTests?: { name: string; file: string; reason: string }[];
}

/**
 * Maps a score delta to the arrow glyph and CSS class used by trend badges.
 *
 * @param delta - Change in score; only its sign matters.
 * @returns `up` for a positive delta, `down` for a negative one, and `flat`
 *   for everything else (zero, and NaN since both comparisons are false).
 */
const trendArrow = (delta: number): { glyph: string; cls: string } => {
  if (delta > 0) {
    return { glyph: "↑", cls: "up" };
  }
  if (delta < 0) {
    return { glyph: "↓", cls: "down" };
  }
  return { glyph: "→", cls: "flat" };
};

/**
 * Renders a numeric series as an inline SVG sparkline (a single polyline).
 *
 * Values are spread evenly across `width` and scaled vertically between the
 * series minimum (bottom) and maximum (top), leaving a 6-unit padding above
 * and below.
 *
 * @param values - Series to plot, in display order.
 * @param height - viewBox height (default 60).
 * @param width - viewBox width (default 320).
 * @returns SVG markup, or an empty string when `values` is empty.
 */
const sparkline = (values: number[], height = 60, width = 320): string => {
  if (values.length === 0) return "";
  const min = Math.min(...values);
  const max = Math.max(...values);
  // Clamp the range to at least 1 so a flat series never divides by zero;
  // such a series ends up on the bottom baseline.
  const range = Math.max(1, max - min);
  const pad = 6;
  const innerH = height - pad * 2;
  // A polyline with a single point draws nothing, so a one-element series is
  // plotted twice: the lone value becomes a flat line across the full width.
  const series = values.length === 1 ? values.concat(values) : values;
  const stepX = width / (series.length - 1);
  const points = series
    .map((v, i) => {
      const x = (i * stepX).toFixed(1);
      // SVG y grows downwards, hence the subtraction from the bottom edge.
      const y = (pad + innerH - ((v - min) / range) * innerH).toFixed(1);
      return `${x},${y}`;
    })
    .join(" ");
  return `<svg class="report-sparkline" viewBox="0 0 ${width} ${height}" preserveAspectRatio="none" aria-hidden="true">
  <polyline fill="none" stroke="currentColor" stroke-width="1.5" points="${points}" />
</svg>`;
};

/**
 * Renders the exec-summary tile for one agent: linked name, score out of
 * 100, signed trend, commit volume and the top issue with its percentage.
 *
 * NOTE(review): the name link always targets `/reports/demo/agents/<slug>`,
 * even though the surrounding templates are also used for the live dataset —
 * confirm whether live tiles should link elsewhere.
 */
const tile = (a: AgentReport): string => {
  const { glyph, cls } = trendArrow(a.delta);
  // Positive deltas get an explicit "+"; zero and negatives print as-is.
  const signedDelta = `${a.delta > 0 ? "+" : ""}${a.delta}`;
  const inner = [
    `<p class="report-tile-name"><a href="/reports/demo/agents/${a.slug}">${escape(a.name)}</a></p>`,
    `<p class="report-tile-score">${a.score}<span class="report-tile-score-suffix"> / 100</span></p>`,
    `<p class="report-tile-trend ${cls}">${glyph} ${escape(signedDelta)}</p>`,
    `<p class="report-tile-volume">${a.commits.toLocaleString()} commits</p>`,
    `<div class="report-tile-issue">top issue: <strong>${escape(a.topIssueLabel)}</strong> (${a.topIssuePct}%)</div>`,
  ];
  return [`<div class="report-tile">`, ...inner.map((line) => `  ${line}`), `</div>`].join("\n");
};

/**
 * Renders the failure-mode breakdown as a stack of horizontal bars, one row
 * per slice: escaped label, a track whose fill width is the slice percentage
 * (with the slice tone as CSS class), and the percentage as text.
 */
const bars = (mix: FailureSlice[]): string => {
  const barRow = (slice: FailureSlice): string =>
    [
      `<div class="report-bar-row">`,
      `  <span class="report-bar-label">${escape(slice.label)}</span>`,
      `  <span class="report-bar-track"><span class="report-bar-fill ${slice.tone}" style="width: ${slice.pct}%"></span></span>`,
      `  <span class="report-bar-pct">${slice.pct}%</span>`,
      `</div>`,
    ].join("\n");
  const allRows = mix.map((slice) => barRow(slice)).join("\n");
  return `<div class="report-bars">${allRows}</div>`;
};

/**
 * Renders the streak badge for an agent. A broken streak gets the `broken`
 * modifier and the "recent break" caption; otherwise a streak of 30 or more
 * gets the `long` modifier. The streak number itself is always shown.
 */
const streakBox = (a: AgentReport): string => {
  let modifier = "";
  let caption = "consecutive clean cycles";
  if (a.streakBroken) {
    modifier = "broken";
    caption = "recent break";
  } else if (a.streak >= 30) {
    modifier = "long";
  }
  return `<span class="report-streak ${modifier}"><span class="report-streak-num">${a.streak}</span> ${caption}</span>`;
};

/**
 * Renders the "current state" card for one repo: repo/branch header, the
 * total/passing/failing counts, and a list that shows either an all-pass
 * line or each failing test followed by a collapsed passing-count line.
 *
 * The card is styled `ok` or `bad` from `s.failing`, while the list contents
 * are driven by `s.failures`; the two are expected to agree.
 *
 * NOTE(review): "groen" and "sinds" are Dutch ("green" / "since") in an
 * otherwise English page. They are runtime text and left untouched here —
 * confirm whether they should be translated.
 */
const snapshotBlock = (s: TestSnapshot): string => {
  // Format the passing count once so every mention in this card uses the same
  // locale formatting (the all-pass line previously printed the raw number).
  const passingStr = s.passing.toLocaleString();
  const failuresHtml = s.failures.length === 0
    ? `<li class="test-list-pass">all ${passingStr} tests groen</li>`
    : s.failures
        .map(
          (f) =>
            // `since` is escaped like the other data-derived text in this card;
            // String() keeps the interpolation semantics for non-string values.
            `<li class="test-list-fail">${escape(f.test)} <span class="test-list-meta">${f.flaky ? "intermittent · " : ""}sinds ${escape(String(f.since))}</span></li>`,
        )
        .concat([`<li class="test-list-collapsed">+ ${passingStr} passing tests</li>`])
        .join("\n");
  const statusCls = s.failing === 0 ? "ok" : "bad";
  return `<div class="test-snapshot ${statusCls}">
  <p class="test-snapshot-head"><strong>${escape(s.repo)}</strong> <span class="test-snapshot-branch">@ ${escape(s.branch)}</span></p>
  <p class="test-snapshot-stats">${s.total.toLocaleString()} tests · <span class="green">${passingStr} passing</span>${s.failing > 0 ? ` · <span class="red">${s.failing.toLocaleString()} failing</span>` : ""}</p>
  <ul class="test-list">
${failuresHtml}
  </ul>
</div>`;
};

/**
 * Renders the small agent link used in the stability table. The display
 * name is looked up in `DEMO_REPORTS`; an unknown slug falls back to the
 * slug itself as the label.
 *
 * NOTE(review): both the name lookup and the link target are demo-specific
 * (`DEMO_REPORTS`, `/reports/demo/agents/`) — confirm this is intended if the
 * stability table is ever rendered for live data.
 */
const agentTagHtml = (slug: AgentReport["slug"]): string => {
  const known = DEMO_REPORTS.find((report) => report.slug === slug);
  const label = known?.name ?? slug;
  const href = `/reports/demo/agents/${slug}`;
  return `<a class="agent-tag" href="${href}">${escape(label)}</a>`;
};

/**
 * Renders one row of the test-stability table: test name with its repo,
 * pass / fail / deleted counts, and the agent that last broke the test.
 *
 * The fail count is highlighted from 8 upwards and the deleted count as soon
 * as it is non-zero. Flagged rows get an extra row class and a warning glyph
 * next to the agent tag.
 */
const stabilityRow = (s: TestStability): string => {
  // Numeric cell; `tone` may be empty, which leaves a trailing space in the class list.
  const numCell = (value: number | string, tone: string): string =>
    `<td class="test-stab-num ${tone}">${value}</td>`;
  const rowClass = `test-stab-row${s.flagged ? " flagged" : ""}`;
  const warning = s.flagged
    ? ` <span class="test-stab-warn" title="test-deletion or weakening this quarter">⚠</span>`
    : "";
  const cells = [
    `<td class="test-stab-name">${escape(s.test)}<div class="test-stab-repo">${escape(s.repo)}</div></td>`,
    numCell(s.pass, "green"),
    numCell(s.fail, s.fail >= 8 ? "red" : ""),
    numCell(s.deleted, s.deleted > 0 ? "red" : ""),
    `<td class="test-stab-by">${agentTagHtml(s.lastBrokenBy)}${warning}</td>`,
  ];
  return [`<tr class="${rowClass}">`, ...cells.map((cell) => `  ${cell}`), `</tr>`].join("\n");
};

/**
 * Static markdown body for the `/reports` landing page: what the reporting
 * layer is, the live and demo views, what gets measured and how to read it.
 *
 * The page is assembled from a list of paragraphs separated by blank lines;
 * the content itself is fixed text with no inputs.
 */
export const reportsLandingMd = (): string => {
  const views = [
    `- **[/reports/live](/reports/live)** — built from real commit data on \`syntaxai/tdd.md\` (the repo this site runs on), refreshed every 5 minutes from the GitHub commits API. Agent attribution comes from \`Co-Authored-By:\` footers. Phase-coverage is the only metric we can compute without running tests, so the score is a proxy for now.`,
    `- **[/reports/demo](/reports/demo)** — the polished design preview with synthetic data for three agents and four repos. Useful for screenshots and showing the full failure-mode breakdown the live view can't compute yet.`,
  ].join("\n");

  const drillDowns = [
    `Drill-downs:`,
    `- [live drill-down per agent](/reports/live/agents/claude-code) · [tests overview (live)](/reports/live/tests)`,
    `- [demo drill-down per agent](/reports/demo/agents/cursor) · [tests overview (demo)](/reports/demo/tests)`,
  ].join("\n");

  // `\\|` emits a literal `\|` so the pipes stay inside the table cell.
  const failureModes = [
    `| failure mode | what triggers it | what it costs |`,
    `|---|---|---|`,
    `| \`red-did-not-fail\` | commit tagged \`red:\` but tests pass | -5 / commit |`,
    `| \`test-deleted\` | test count drops between commits | -20 / commit |`,
    `| \`broken refactor\` | tests fail at a \`refactor:\` commit | -5 / commit |`,
    `| \`no phase tag\` | tracked-branch commit missing \`red\\|green\\|refactor:\` | counts against phase-coverage % |`,
  ].join("\n");

  const forManagement = [
    `For management:`,
    `- the [exec summary](/reports/demo) gives one number per agent + a narrative paragraph. Prints to one page.`,
  ].join("\n");

  const forTeamLeads = [
    `For team-leads:`,
    `- the [drill-down](/reports/demo/agents/cursor) shows trend, failure-mix, streak, and the most recent flagged commits with one-click coaching links to the [Claude Code](/blog/2026-05/claude-code-tdd) / [Cursor](/blog/2026-05/cursor-tdd) / [Aider](/blog/2026-05/aider-tdd) posts.`,
  ].join("\n");

  const paragraphs = [
    `# reports`,
    `> Per-agent TDD-discipline reporting over real project repos. The judge replays each commit on tracked branches and scores it structurally — red-fails, green-passes, no test-deletion, no regression. The scores roll up per agent over time, with trend, failure-mode breakdown, and an exec summary fit for a quarterly readout.`,
    `Two views of the same shape:`,
    views,
    drillDowns,
    `Want a real repo on this layer? [Register a project →](/projects) — drops \`.tdd-md.json\` at the repo root, onboards in seconds. Per-commit judging on tracked branches lands in a follow-up sliver; live reporting from the GitHub API already works for the dogfood case (the tdd.md repo itself).`,
    `## what gets measured`,
    `This layer measures **discipline**, not code-quality. Without hidden tests (those only exist on katas), tdd.md can't catch tautologies or weakened assertions on real repos. It *can* catch:`,
    failureModes,
    `The metric pair that anchors the report is **discipline-score** (0-100) + **phase-coverage %**. An agent with 0% phase-coverage doesn't *do* TDD — its score is N/A, not 0. Don't let a low-volume non-attempt look like a high-volume slip.`,
    `## reading the data`,
    forManagement,
    forTeamLeads,
    `[← back to tdd.md](/) · [the blog](/blog) · [the katas](/games)`,
  ];

  return `${paragraphs.join("\n\n")}\n`;
};

/**
 * Builds the exec-summary markdown for a dataset: heading, banner, a
 * period/scope/commit-count line, one tile per agent, the optional narrative
 * sections, the fixed "what this number does not measure" disclaimer and the
 * footer links.
 *
 * @param ctx - Dataset and page chrome. `bannerHtml`, `period`, the
 *   narrative texts and `footerLinks` are emitted as-is; only `scopeLabel`
 *   is escaped here.
 * @returns The page body as markdown with embedded HTML.
 */
export const execSummaryMd = (ctx: ReportsContext): string => {
  const commitCount = ctx.reports.reduce((sum, report) => sum + report.commits, 0);

  // Empty datasets get a single placeholder tile instead of an empty grid.
  const tilesHtml = ctx.reports.length > 0
    ? ctx.reports.map((report) => tile(report)).join("\n")
    : `<div class="report-tile-empty">No agent-attributed commits in this dataset.</div>`;

  // Two heading/body pairs when a narrative is supplied, nothing otherwise.
  const narrativeSections = ctx.narrative
    ? [
        `## ${ctx.narrative.changedHeading}`,
        `${ctx.narrative.changedBody}`,
        `## ${ctx.narrative.doingHeading}`,
        `${ctx.narrative.doingBody}`,
      ]
    : [];

  const sections = [
    `# tdd-discipline report · ${ctx.period}`,
    `${ctx.bannerHtml}`,
    `> **Period** ${ctx.period} · **Scope** ${escape(ctx.scopeLabel)} · ${commitCount.toLocaleString()} AI-attributed commits.`,
    `<div class="report-tiles">\n${tilesHtml}\n</div>`,
    ...narrativeSections,
    `## what this number does *not* measure`,
    `Discipline, not code quality. Hidden tests (like the ones on the katas) don't exist for production repos, so *tautological* tests and *weakly-asserted* checks stay invisible to the judge. This number says: "the agent honours the TDD cycle". It says nothing about whether the tests it writes assert the right thing. For that second signal, kata performance ([leaderboard](/leaderboard)) remains the proxy.`,
    `---`,
    `${ctx.footerLinks}`,
  ];

  // Sections are separated by one blank line; the page ends with a newline.
  return `${sections.join("\n\n")}\n`;
};

/**
 * Builds the per-agent drill-down markdown: score line with trend, the
 * sparkline, streak badge, failure-mode bars, the table of recent flagged
 * commits and a pair of coaching links.
 *
 * @param slug - Agent to render; matched against `ctx.reports[].slug`.
 * @param ctx - Dataset and page chrome. `bannerHtml`, `period` and
 *   `footerLinks` are emitted as-is.
 * @returns The page body, or `null` when no report in `ctx` has that slug.
 */
export const agentDrilldownMd = (
  slug: AgentReport["slug"],
  ctx: ReportsContext,
): string | null => {
  const a = ctx.reports.find((r) => r.slug === slug);
  if (!a) return null;

  const arr = trendArrow(a.delta);
  const deltaStr = a.delta > 0 ? `+${a.delta}` : `${a.delta}`;
  // Colour of the sparkline wrapper follows the trend direction.
  const trendTone = arr.cls === "down" ? "red" : arr.cls === "up" ? "green" : "muted";

  // Repo and sha are escaped before they go into the code spans, matching how
  // the placeholder-test table treats file names. String() keeps the original
  // interpolation semantics whatever the field type is. The remaining cells
  // are interpolated verbatim, as before.
  const recentRows = a.recent.length === 0
    ? `| _no recent attributed activity_ | | | | | |`
    : a.recent
        .map(
          (r) =>
            `| ${r.date} | \`${escape(String(r.repo))}\` | \`${escape(String(r.sha))}\` | ${r.phase} | ${r.failure} | ${r.pts} |`,
        )
        .join("\n");

  // Agent-specific coaching post; any slug other than claude-code / cursor
  // gets the Aider post, as in the original ternary chain.
  let coachingLink: string;
  if (a.slug === "claude-code") {
    coachingLink = `[Claude Code does not do TDD by default](/blog/2026-05/claude-code-tdd) — CLAUDE.md rules + fresh-context boundaries that prevent \`red-did-not-fail\`.`;
  } else if (a.slug === "cursor") {
    coachingLink = `[Cursor knows how to do TDD; users skip the parts that matter](/blog/2026-05/cursor-tdd) — Plan Mode, fresh chats, \`.cursor/rules\` to stop test-deletion.`;
  } else {
    coachingLink = `[Aider is the closest agent to TDD on rails — until \`--auto-test\`](/blog/2026-05/aider-tdd) — keep auto-test off for green commits, on for refactor.`;
  }

  // The agent name is escaped here just as it is in the tile and the agent
  // tag, so a name containing markup cannot leak into the rendered page.
  return `# ${escape(a.name)} · drill-down

${ctx.bannerHtml}

> Discipline score **${a.score} / 100** <span class="report-tile-trend ${arr.cls}">${arr.glyph} ${deltaStr}</span> over ${ctx.period}. ${a.commits.toLocaleString()} commits analysed, phase coverage **${a.phaseCoveragePct}%**.

## trend (30 days)

<div class="${trendTone}">
${sparkline(a.trend)}
</div>

${streakBox(a)}

## failure-mode breakdown

${bars(a.failureMix)}

Top issue this quarter: **${escape(a.topIssueLabel)}** (${a.topIssuePct}% of commits).

## recent flagged

| date | repo | sha | phase | failure | pts |
|---|---|---|---|---|---|
${recentRows}

## coaching

- ${coachingLink}
- [Tweag's TDD handbook needs a judge](/blog/2026-05/tweag-handbook-tdd) — why local green isn't enough.

---

${ctx.footerLinks}
`;
};

/**
 * Builds the tests-overview markdown: per-repo snapshot cards with a totals
 * line, the placeholder-test section, the stability table and a reading
 * guide.
 *
 * When `ctx.unavailableNote` is set (non-empty) only the heading, banner,
 * that note and a footer are rendered, and the rest of `ctx` is ignored.
 *
 * NOTE(review): in the unavailable branch both footer links point at
 * `/reports`, while the full page links the exec summary to `/reports/demo`
 * — confirm which target the "exec summary" link should have.
 */
export const testsOverviewMd = (ctx: TestsOverviewContext): string => {
  const heading = `# tests overview`;

  if (ctx.unavailableNote) {
    const shortPage = [
      heading,
      `${ctx.bannerHtml}`,
      `> ${ctx.unavailableNote}`,
      `[← exec summary](/reports) · [back to /reports](/reports)`,
    ];
    return `${shortPage.join("\n\n")}\n`;
  }

  // Sum the three counters across all repo snapshots in one pass.
  const totals = ctx.snapshots.reduce(
    (acc, snap) => ({
      total: acc.total + snap.total,
      passing: acc.passing + snap.passing,
      failing: acc.failing + snap.failing,
    }),
    { total: 0, passing: 0, failing: 0 },
  );
  const snapshotsHtml = ctx.snapshots.map((snap) => snapshotBlock(snap)).join("\n");
  const stabilityRowsHtml = ctx.stability.map((row) => stabilityRow(row)).join("\n");

  const flagged = ctx.placeholderTests ?? [];
  const hasPlaceholders = flagged.length > 0;

  // Either a reassuring note or a flagged table. Both variants end with a
  // newline of their own, which yields an extra blank line after the section.
  let placeholderSection: string;
  if (hasPlaceholders) {
    const flaggedRows = flagged
      .map((p) => `| ${escape(p.name)} | \`${escape(p.file)}\` | ${escape(p.reason)} |`)
      .join("\n");
    placeholderSection = [
      `## placeholder tests · ⚠ ${flagged.length} flagged`,
      `> A placeholder test is one whose body contains zero \`expect()\` calls — empty body, comment-only stub, or string-literal body. Covered in [the corpus post](/blog/2026-05/agentic-coding-corpus-three-patterns) as the failure mode from r/ClaudeCode 1qix264. The judge would refuse a merge that includes any of these.`,
      `| test | file | reason |\n|---|---|---|\n${flaggedRows}`,
    ].join("\n\n") + "\n";
  } else {
    placeholderSection = [
      `## placeholder tests`,
      `> No placeholder tests detected at this snapshot. A placeholder is a test whose body contains zero \`expect()\` calls — covered in [the corpus post](/blog/2026-05/agentic-coding-corpus-three-patterns) as the failure mode from r/ClaudeCode 1qix264 ("90 placeholder tests, 100% pass rate"). Detection runs on every deploy.`,
    ].join("\n\n") + "\n";
  }

  const placeholderSuffix = hasPlaceholders
    ? ` · <span class="red">${flagged.length} placeholder ⚠</span>`
    : "";
  const totalsLine = `**Total**: ${totals.total.toLocaleString()} tests · <span class="green">${totals.passing.toLocaleString()} passing</span> · <span class="${totals.failing > 0 ? "red" : "muted"}">${totals.failing.toLocaleString()} failing</span>${placeholderSuffix}.`;

  const stabilityTable = [
    `<table class="test-stability">`,
    `<thead>`,
    `  <tr>`,
    `    <th>test</th>`,
    `    <th class="num">pass</th>`,
    `    <th class="num">fail</th>`,
    `    <th class="num">del</th>`,
    `    <th>last broken by</th>`,
    `  </tr>`,
    `</thead>`,
    `<tbody>`,
    `${stabilityRowsHtml}`,
    `</tbody>`,
    `</table>`,
  ].join("\n");

  const readingGuide = [
    `- **Lots of pass, few fail, 0 del**: healthy. The test does what it should, nobody is sabotaging it.`,
    `- **Lots of fail, 0 del**: the test is actively catching regressions. Good news — discipline is working.`,
    `- **Fail and del > 0**: the test is under pressure. Coach the agent that broke it (click the tag icon).`,
    `- **Snapshot red + stability high**: a known, long-running broken test. Separate concern, not necessarily an agent problem.`,
  ].join("\n");

  const sections = [
    heading,
    `${ctx.bannerHtml}`,
    `> Snapshot of the current test state per repo + stability of individual tests over ${ctx.period}. A high fail count with zero deletions means the test is actively catching regressions; high fail + deletion is the signal that a test is being squeezed — often the trace of an agent making it easier to "win".`,
    `## current state · per repo`,
    `<div class="test-snapshots">\n${snapshotsHtml}\n</div>`,
    totalsLine,
    placeholderSection,
    `## test stability · ${ctx.period}`,
    `Top tests by failure activity this period, with pass/fail/deleted counts and the agent who last broke the test.`,
    stabilityTable,
    `> ⚠ marks tests where a test-deletion or weakening event has been detected this period. In a real setup, clicking a test name will link through to that test's commit history.`,
    `## how to read this`,
    readingGuide,
    `---`,
    `[← exec summary](/reports/demo) · [back to /reports](/reports)`,
  ];

  // Sections are separated by one blank line; the page ends with a newline.
  return `${sections.join("\n\n")}\n`;
};