604776052d9e96ab4b6876fa462c9715a4f30a76 diff --git a/public/style.css b/public/style.css index 63119bd3212855595862f2c2ff7cc375f3029770..c5220ab9fee147104a0c0b58813d5367c7320baa 100644 --- a/public/style.css +++ b/public/style.css @@ -183,3 +183,158 @@ main.md strong { font-weight: 600; } background: var(--accent); color: var(--bg); } + +/* --- reports / dashboard ---------------------------------------------- */ +/* Styles for the report pages: mockup banner, three-column score tiles, failure-mix bars, streak badge and sparkline. All colours are read through var(--…) custom properties. */ + +.report-mockup-banner { + background: var(--code-bg); + border: 1px dashed var(--border); + padding: 0.7rem 1rem; + border-radius: 4px; + font-size: 0.85rem; + color: var(--muted); + margin: 0 0 2rem; + font-family: ui-monospace, "SF Mono", "JetBrains Mono", "Fira Code", Menlo, Consolas, monospace; +} +.report-mockup-banner a { + color: var(--muted); + text-decoration: underline; + text-underline-offset: 2px; +} +.report-mockup-banner a:hover { color: var(--fg); } + +.report-tiles { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 1rem; + margin: 1.5rem 0 2.5rem; +} + +.report-tile { + border: 1px solid var(--border); + border-radius: 6px; + padding: 1.2rem 1.2rem 1rem; + background: var(--code-bg); +} + +.report-tile-name { + font-family: ui-monospace, "SF Mono", "JetBrains Mono", "Fira Code", Menlo, Consolas, monospace; + font-size: 0.8rem; + text-transform: lowercase; + letter-spacing: 0.04em; + color: var(--muted); + margin: 0 0 0.6rem; +} +.report-tile-name a { + color: inherit; + text-decoration: none; +} +.report-tile-name a:hover { color: var(--fg); } + +.report-tile-score { + font-family: ui-monospace, "SF Mono", "JetBrains Mono", "Fira Code", Menlo, Consolas, monospace; + font-size: 2.2rem; + font-weight: 600; + letter-spacing: -0.02em; + margin: 0; + line-height: 1.1; +} +.report-tile-score-suffix { + font-size: 0.95rem; + color: var(--muted); + font-weight: 400; +} + +.report-tile-trend { + font-family: ui-monospace, "SF Mono", "JetBrains Mono", "Fira Code", Menlo, Consolas, monospace; + font-size: 0.9rem; + margin: 0.4rem 0 
0.6rem; +} +.report-tile-trend.up { color: var(--green); } +.report-tile-trend.down { color: var(--red); } +.report-tile-trend.flat { color: var(--muted); } + +.report-tile-volume { + font-family: ui-monospace, "SF Mono", "JetBrains Mono", "Fira Code", Menlo, Consolas, monospace; + font-size: 0.78rem; + color: var(--muted); + margin: 0 0 0.8rem; +} + +.report-tile-issue { + font-size: 0.82rem; + color: var(--muted); + border-top: 1px solid var(--border); + padding-top: 0.7rem; +} +.report-tile-issue strong { + color: var(--fg); + font-weight: 500; +} + +.report-bars { + margin: 1rem 0 2rem; +} +.report-bar-row { + display: grid; + grid-template-columns: 180px 1fr 50px; + align-items: center; + gap: 0.8rem; + margin: 0.5rem 0; + font-family: ui-monospace, "SF Mono", "JetBrains Mono", "Fira Code", Menlo, Consolas, monospace; + font-size: 0.85rem; +} +.report-bar-label { color: var(--muted); } +.report-bar-track { + height: 10px; + background: var(--code-bg); + border: 1px solid var(--border); + border-radius: 2px; + overflow: hidden; +} +.report-bar-fill { + display: block; + height: 100%; + background: var(--accent); +} +.report-bar-fill.red { background: var(--red); } +.report-bar-fill.green { background: var(--green); } +.report-bar-fill.muted { background: var(--muted); } +.report-bar-pct { text-align: right; color: var(--fg); } + +.report-streak { + display: inline-block; + padding: 0.4rem 0.8rem; + border: 1px solid var(--border); + border-radius: 4px; + font-family: ui-monospace, "SF Mono", "JetBrains Mono", "Fira Code", Menlo, Consolas, monospace; + font-size: 0.85rem; + color: var(--muted); + margin: 0 0 1.5rem; +} +.report-streak-num { + font-weight: 600; + color: var(--fg); +} +.report-streak.broken { + color: var(--red); + border-color: var(--red); +} +.report-streak.broken .report-streak-num { color: var(--red); } +.report-streak.long { + color: var(--green); + border-color: var(--green); +} +.report-streak.long .report-streak-num { color: var(--green); 
} + +.report-sparkline { + width: 100%; + height: 80px; + display: block; + margin: 0.5rem 0 1.2rem; +} + +/* At 600px wide or less: stack the tiles in a single column and narrow the bar label column from 180px to 130px. */ +@media (max-width: 600px) { + .report-tiles { grid-template-columns: 1fr; } + .report-bar-row { grid-template-columns: 130px 1fr 50px; } +} diff --git a/src/reports.ts b/src/reports.ts new file mode 100644 index 0000000000000000000000000000000000000000..4c11445c14b0bf3e154074e040adac53f82412b1 --- /dev/null +++ b/src/reports.ts @@ -0,0 +1,298 @@ +// Mockup reporting layer for tdd.md. +// +// All data here is FAKE — wired up only so the management/exec view and +// per-agent drill-down can be designed in the browser before the real +// project-tracking pipeline (block 1) exists. +// +// Real reporting needs: +// - GitHub App / webhook ingest of pushes on tracked branches +// - per-commit judging without hidden tests (red-fails / green-passes / +// no-test-deletion / no-regression) +// - agent attribution (commit footer convention or wrapper-driven) +// Once that exists, the same generators in this file accept real data. 
/** One flagged commit, rendered as a row of the "recent flagged" table on a drill-down page. */
interface RecentFlagged {
  date: string;
  repo: string;
  sha: string;
  phase: "red" | "green" | "refactor";
  failure: string;
  /** Point change shown in the table's `pts` column. */
  pts: number;
}

/** One row of the failure-mode bar chart. */
interface FailureSlice {
  label: string;
  /** Share of commits, used both as the bar width and as the printed percentage. */
  pct: number;
  /** Appended as a modifier class on the bar fill element. */
  tone: "red" | "green" | "muted" | "accent";
}

/** Everything the exec-summary tile and the drill-down page need for one agent. */
export interface AgentReport {
  slug: "claude-code" | "cursor" | "aider";
  name: string;
  score: number;
  /** Score change over the period; its sign selects the trend arrow. */
  delta: number;
  commits: number;
  phaseCoveragePct: number;
  streak: number;
  streakBroken: boolean;
  topIssueLabel: string;
  topIssuePct: number;
  failureMix: FailureSlice[];
  /** Score series plotted by the sparkline. */
  trend: number[];
  recent: RecentFlagged[];
}

export const DEMO_PERIOD = "2026-01-01 → 2026-03-31";
export const DEMO_ORG = "acme-corp";
export const DEMO_REPOS = 4;

/** Synthetic reports backing the /reports/demo pages. */
export const DEMO_REPORTS: AgentReport[] = [
  {
    slug: "claude-code",
    name: "Claude Code",
    score: 78,
    delta: +6,
    commits: 612,
    phaseCoveragePct: 92,
    streak: 47,
    streakBroken: false,
    topIssueLabel: "red-did-not-fail",
    topIssuePct: 8,
    failureMix: [
      { label: "clean cycles", pct: 84, tone: "green" },
      { label: "red-did-not-fail", pct: 8, tone: "red" },
      { label: "broken refactor", pct: 4, tone: "red" },
      { label: "test-deleted", pct: 2, tone: "red" },
      { label: "no phase tag", pct: 2, tone: "muted" },
    ],
    trend: [72, 73, 71, 74, 72, 75, 73, 75, 77, 76, 75, 76, 78, 77, 79, 78, 77, 79, 80, 78, 79, 80, 79, 81, 80, 82, 81, 80, 79, 78],
    recent: [
      { date: "2026-03-29", repo: "api-gateway", sha: "f1c8b3a", phase: "red", failure: "red-did-not-fail", pts: -5 },
      { date: "2026-03-24", repo: "billing-service", sha: "9d2e1f4", phase: "refactor", failure: "broken refactor", pts: -5 },
      { date: "2026-03-18", repo: "data-pipeline", sha: "62a9cb7", phase: "green", failure: "no phase tag (parent)", pts: 0 },
    ],
  },
  {
    slug: "cursor",
    name: "Cursor",
    score: 54,
    delta: -15,
    commits: 489,
    phaseCoveragePct: 71,
    streak: 3,
    streakBroken: true,
    topIssueLabel: "test-deleted in refactor",
    topIssuePct: 14,
    failureMix: [
      { label: "clean cycles", pct: 64, tone: "green" },
      { label: "test-deleted", pct: 14, tone: "red" },
      { label: "red-did-not-fail", pct: 9, tone: "red" },
      { label: "broken refactor", pct: 7, tone: "red" },
      { label: "no phase tag", pct: 6, tone: "muted" },
    ],
    trend: [69, 70, 71, 72, 70, 71, 72, 73, 72, 71, 72, 70, 68, 65, 60, 55, 50, 52, 54, 53, 56, 54, 52, 55, 53, 54, 56, 55, 54, 54],
    recent: [
      { date: "2026-03-28", repo: "api-gateway", sha: "a1b2c3d", phase: "refactor", failure: "test-deleted", pts: -20 },
      { date: "2026-03-26", repo: "api-gateway", sha: "4e5f6a7", phase: "green", failure: "broken refactor", pts: -5 },
      { date: "2026-03-23", repo: "billing-service", sha: "8b9c0d1", phase: "red", failure: "red-did-not-fail", pts: -5 },
      { date: "2026-03-21", repo: "api-gateway", sha: "2e3f4a5", phase: "refactor", failure: "test-deleted", pts: -20 },
      { date: "2026-03-19", repo: "data-pipeline", sha: "6b7c8d9", phase: "refactor", failure: "broken refactor", pts: -5 },
    ],
  },
  {
    slug: "aider",
    name: "Aider",
    score: 89,
    delta: +2,
    commits: 146,
    phaseCoveragePct: 96,
    streak: 89,
    streakBroken: false,
    topIssueLabel: "broken refactor",
    topIssuePct: 3,
    failureMix: [
      { label: "clean cycles", pct: 94, tone: "green" },
      { label: "broken refactor", pct: 3, tone: "red" },
      { label: "red-did-not-fail", pct: 2, tone: "red" },
      { label: "no phase tag", pct: 1, tone: "muted" },
    ],
    trend: [87, 88, 89, 88, 87, 89, 90, 89, 88, 89, 90, 88, 89, 90, 91, 89, 88, 89, 90, 89, 90, 91, 89, 88, 89, 90, 89, 90, 89, 89],
    recent: [
      { date: "2026-03-27", repo: "data-pipeline", sha: "3a4b5c6", phase: "refactor", failure: "broken refactor", pts: -5 },
      { date: "2026-03-15", repo: "billing-service", sha: "7d8e9f0", phase: "red", failure: "red-did-not-fail", pts: -5 },
    ],
  },
];

// NOTE(review): in the copy of this patch that was reviewed, the HTML tags
// inside the template literals below and the entity strings in `escape` were
// missing. The markup here is rebuilt from the `.report-*` class names in
// public/style.css; link targets, the sparkline wrapper element and the SVG
// attributes are best guesses — confirm against the original commit.

/**
 * Escapes text for interpolation into HTML element content or a quoted
 * attribute value. `&` is replaced first so the entities produced by the
 * later replacements are not escaped a second time.
 */
const escape = (s: string): string =>
  s
    .replace(/&/g, "&amp;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");

/**
 * Formats a count with thousands separators. The locale is pinned so the
 * generated page does not vary with the host's locale settings.
 */
const formatCount = (n: number): string => n.toLocaleString("en-US");

/** Renders a delta with an explicit `+` for positive values ("+6", "-15", "0"). */
const signed = (delta: number): string => (delta > 0 ? `+${delta}` : `${delta}`);

/** Maps the sign of `delta` to an arrow glyph and the matching `.report-tile-trend` modifier class. */
const trendArrow = (delta: number): { glyph: string; cls: string } =>
  delta > 0 ? { glyph: "↑", cls: "up" } : delta < 0 ? { glyph: "↓", cls: "down" } : { glyph: "→", cls: "flat" };

/**
 * Renders `values` as an inline SVG polyline.
 *
 * Points are spread evenly over `width`; values are scaled between the series
 * minimum and maximum with 6 units of padding above and below.
 *
 * @returns the SVG markup, or "" when `values` is empty.
 */
const sparkline = (values: number[], height = 60, width = 320): string => {
  if (values.length === 0) return "";
  const min = Math.min(...values);
  const max = Math.max(...values);
  // Clamp to 1 so a constant series does not divide by zero.
  const range = Math.max(1, max - min);
  // Clamp the divisor so a single-value series does not divide by zero.
  const stepX = width / Math.max(1, values.length - 1);
  const pad = 6;
  const innerH = height - pad * 2;
  const points = values
    .map((v, i) => {
      const x = (i * stepX).toFixed(1);
      // SVG y grows downward, so the largest value gets the smallest y.
      const y = (pad + innerH - ((v - min) / range) * innerH).toFixed(1);
      return `${x},${y}`;
    })
    .join(" ");
  return `<svg class="report-sparkline" viewBox="0 0 ${width} ${height}" preserveAspectRatio="none"><polyline fill="none" stroke="currentColor" stroke-width="1.5" points="${points}" /></svg>`;
};

/** Renders the exec-summary tile for one agent: name, score, trend, volume and top issue. */
const tile = (a: AgentReport): string => {
  const arr = trendArrow(a.delta);
  // Kept without blank lines so a markdown renderer treats the tile as one HTML block.
  return `<div class="report-tile">
  <p class="report-tile-name"><a href="/reports/demo/agents/${a.slug}">${escape(a.name)}</a></p>
  <p class="report-tile-score">${a.score} <span class="report-tile-score-suffix">/ 100</span></p>
  <p class="report-tile-trend ${arr.cls}">${arr.glyph} ${escape(signed(a.delta))}</p>
  <p class="report-tile-volume">${formatCount(a.commits)} commits</p>
  <div class="report-tile-issue">top issue: <strong>${escape(a.topIssueLabel)}</strong> (${a.topIssuePct}%)</div>
</div>`;
};

/** Renders the failure-mode breakdown as one horizontal bar per slice, each `pct` percent wide. */
const bars = (mix: FailureSlice[]): string => {
  const rows = mix
    .map(
      (s) => `<div class="report-bar-row">
  <span class="report-bar-label">${escape(s.label)}</span>
  <span class="report-bar-track"><span class="report-bar-fill ${s.tone}" style="width: ${s.pct}%"></span></span>
  <span class="report-bar-pct">${s.pct}%</span>
</div>`,
    )
    .join("\n");
  return `<div class="report-bars">
${rows}
</div>`;
};

/**
 * Renders the streak badge. A broken streak takes precedence; otherwise a
 * streak of 30 or more gets the "long" styling.
 */
const streakBox = (a: AgentReport): string => {
  const cls = a.streakBroken ? "broken" : a.streak >= 30 ? "long" : "";
  const label = a.streakBroken ? "recent break" : "consecutive clean cycles";
  // Drop the empty modifier so the class attribute has no trailing space.
  const classes = ["report-streak", cls].filter(Boolean).join(" ");
  return `<span class="${classes}"><span class="report-streak-num">${a.streak}</span> ${label}</span>`;
};

/** Banner placed at the top of every demo page to mark the data as synthetic. */
const mockBanner = `<div class="report-mockup-banner">
demo data — real reporting wires up when the project-tracking pipeline ships. <a href="/blog/tweag-handbook-tdd">why tdd.md needs this</a> · <a href="/reports">about reporting</a>
</div>`;

/** Markdown body of the /reports landing page. */
export const reportsLandingMd = (): string => `# reports

> Per-agent TDD-discipline reporting over real project repos. The judge replays each commit on tracked branches and scores it structurally — red-fails, green-passes, no test-deletion, no regression. The scores roll up per agent over time, with trend, failure-mode breakdown, and an exec summary fit for a quarterly readout.

This is a design preview. The pipeline that ingests real repos isn't wired yet; what you can navigate today is a mockup with synthetic data:

- [exec summary mockup →](/reports/demo) — single page, 1 quarter, 3 agents
- [per-agent drill-down →](/reports/demo/agents/cursor) — trend, failure mix, recent flagged commits

## what gets measured

This layer measures **discipline**, not code-quality. Without hidden tests (those only exist on katas), tdd.md can't catch tautologies or weakened assertions on real repos. It *can* catch:

| failure mode | what triggers it | what it costs |
|---|---|---|
| \`red-did-not-fail\` | commit tagged \`red:\` but tests pass | -5 / commit |
| \`test-deleted\` | test count drops between commits | -20 / commit |
| \`broken refactor\` | tests fail at a \`refactor:\` commit | -5 / commit |
| \`no phase tag\` | tracked-branch commit missing \`red\\|green\\|refactor:\` | counts against phase-coverage % |

The metric pair that anchors the report is **discipline-score** (0-100) + **phase-coverage %**. An agent with 0% phase-coverage doesn't *do* TDD — its score is N/A, not 0. Don't let a low-volume non-attempt look like a high-volume slip.

## reading the data

For management:
- the [exec summary](/reports/demo) gives one number per agent + a narrative paragraph. Prints to one page.

For team-leads:
- the [drill-down](/reports/demo/agents/cursor) shows trend, failure-mix, streak, and the most recent flagged commits with one-click coaching links to the [Claude Code](/blog/claude-code-tdd) / [Cursor](/blog/cursor-tdd) / [Aider](/blog/aider-tdd) posts.

[← back to tdd.md](/) · [the blog](/blog) · [the katas](/games)
`;

/** Markdown body of the /reports/demo exec summary: one tile per agent plus the quarter's narrative. */
export const execSummaryMd = (): string => {
  const totalCommits = DEMO_REPORTS.reduce((sum, a) => sum + a.commits, 0);
  const tiles = DEMO_REPORTS.map(tile).join("\n");
  return `# tdd-discipline rapport · q1 2026

${mockBanner}

> **Periode** ${DEMO_PERIOD} · **Scope** ${DEMO_REPOS} repos · ${formatCount(totalCommits)} AI-toegeschreven commits in ${escape(DEMO_ORG)}.

<div class="report-tiles">
${tiles}
</div>

## wat veranderde dit kwartaal

Cursor's score zakte 15 punten nadat agent-mode in maart default werd; test-deletion-incidenten stegen van 2% naar 14% van refactor-commits, geconcentreerd in de \`api-gateway\` repo. Claude Code's score steeg na invoering van phase-getagde commit-prefix in CLAUDE.md aan het einde van januari. Aider blijft stabiel hoog — auto-commit-per-edit voorkomt het meeste cross-phase bedrog vanzelf.

## wat we doen

- **Cursor in \`api-gateway\`**: agent-mode gedeactiveerd voor refactor-prompts, CONVENTIONS-regel "never delete a test in a refactor commit" gepind ([details →](/reports/demo/agents/cursor)).
- **Claude Code uitrollen**: het CLAUDE.md-template dat in \`billing-service\` werkte naar de andere drie repos kopiëren.
- **Volgende meting**: 2026-04-30, mid-Q2, om te zien of de Cursor-fix vasthoudt.

## wat dit getal *niet* meet

Discipline, niet code-kwaliteit. Hidden tests (zoals op de katas) bestaan niet voor productie-repos, dus *tautologische* tests en *zwak-geformuleerde* asserties blijven onzichtbaar voor de judge. Dit cijfer zegt: "de agent volgt de TDD-cyclus eerlijk". Het zegt niets over of de tests die hij schrijft het juiste beweren. Voor dat tweede signaal blijft kata-performance ([leaderboard](/leaderboard)) de proxy.

---

[per-agent drill-down: Claude Code](/reports/demo/agents/claude-code) · [Cursor](/reports/demo/agents/cursor) · [Aider](/reports/demo/agents/aider) · [back to /reports](/reports)
`;
};

/** First coaching bullet on the drill-down page; keyed by slug so a new agent cannot be added without one. */
const COACHING: Record<AgentReport["slug"], string> = {
  "claude-code":
    "[Claude Code does not do TDD by default](/blog/claude-code-tdd) — CLAUDE.md rules + fresh-context boundaries that prevent `red-did-not-fail`.",
  cursor:
    "[Cursor knows how to do TDD; users skip the parts that matter](/blog/cursor-tdd) — Plan Mode, fresh chats, `.cursor/rules` to stop test-deletion.",
  aider:
    "[Aider is the closest agent to TDD on rails — until `--auto-test`](/blog/aider-tdd) — keep auto-test off for green commits, on for refactor.",
};

/**
 * Markdown body of the per-agent drill-down page.
 *
 * @param slug agent identifier taken from the route.
 * @returns the page body, or `null` when no demo report has that slug.
 */
export const agentDrilldownMd = (slug: AgentReport["slug"]): string | null => {
  const a = DEMO_REPORTS.find((r) => r.slug === slug);
  if (!a) return null;
  const arr = trendArrow(a.delta);
  const recentRows = a.recent
    .map((r) => `| ${r.date} | \`${r.repo}\` | \`${r.sha}\` | ${r.phase} | ${r.failure} | ${r.pts} |`)
    .join("\n");
  return `# ${a.name} · drill-down

${mockBanner}

> Discipline-score **${a.score} / 100** ${arr.glyph} ${signed(a.delta)} over ${DEMO_PERIOD}. ${formatCount(a.commits)} commits geanalyseerd, phase-coverage **${a.phaseCoveragePct}%**.

## trend (30 dagen)

<div>
${sparkline(a.trend)}
</div>

${streakBox(a)}

## failure-mode breakdown

${bars(a.failureMix)}

Top issue dit kwartaal: **${escape(a.topIssueLabel)}** (${a.topIssuePct}% van commits).

## recent flagged

| date | repo | sha | phase | failure | pts |
|---|---|---|---|---|---|
${recentRows}

## coaching

- ${COACHING[a.slug]}
- [Tweag's TDD handbook needs a judge](/blog/tweag-handbook-tdd) — why local green isn't enough.

---

[← exec summary](/reports/demo) · [back to /reports](/reports)
`;
};

/*
 * The patch continues with src/server.ts. That part is kept verbatim below
 * (line breaks restored) as reference text, because it belongs to a different
 * file and is not valid inside this module.
 *
 * diff --git a/src/server.ts b/src/server.ts
 * index b419e499297c446a7f195c236c877618adb3d930..01703187c69b01d6cc65a9de2b1777c4851d28ff 100644
 * --- a/src/server.ts
 * +++ b/src/server.ts
 * @@ -5,6 +5,12 @@ import { parseCommit, computeProgress, type Phase } from "./commits";
 *  import { loadGame, listGames } from "./games";
 *  import { judge } from "./judge";
 *  import { latestRun, allLatestRuns } from "./db";
 * +import {
 * +  reportsLandingMd,
 * +  execSummaryMd,
 * +  agentDrilldownMd,
 * +  DEMO_REPORTS,
 * +} from "./reports";
 *
 *  const HOME_MD = "./content/home.md";
 *  const GAME_DIR = "./content/games";
 * @@ -782,6 +788,46 @@ ${rows}
 *      return htmlResponse(html);
 *    },
 *
 * +  "/reports": async () => {
 * +    const html = await renderPage({
 * +      title: "Reports — tdd.md",
 * +      description: "Per-agent TDD-discipline reporting over real project repos: trend, failure-mode breakdown, and an exec summary fit for a quarterly readout.",
 * +      bodyMarkdown: reportsLandingMd(),
 * +      ogPath: "https://tdd.md/reports",
 * +      noindex: true,
 * +    });
 * +    return htmlResponse(html);
 * +  },
 * +
 */
// Exec-summary mockup page.
"/reports/demo": async () => {
  const html = await renderPage({
    title: "TDD-discipline rapport · Q1 2026 (demo) — tdd.md",
    description: "Mockup of the management-level TDD-discipline report — single page, three agents, with trend and narrative.",
    bodyMarkdown: execSummaryMd(),
    ogPath: "https://tdd.md/reports/demo",
    noindex: true,
  });
  return htmlResponse(html);
},

// Per-agent drill-down mockup. The slug comes from the URL, so it is checked
// against DEMO_REPORTS with a single lookup and only the matched entry's slug
// is used afterwards.
"/reports/demo/agents/:slug": async (req) => {
  const slug = req.params.slug;
  const entry = DEMO_REPORTS.find((r) => r.slug === slug);
  const md = entry ? agentDrilldownMd(entry.slug) : null;
  if (!entry || !md) {
    const html = await renderNotFound(`/reports/demo/agents/${slug}`);
    return htmlResponse(html, 404);
  }
  const html = await renderPage({
    title: `${entry.name} drill-down (demo) — tdd.md`,
    description: `Per-agent drill-down mockup for ${entry.name}: trend, failure-mode breakdown, recent flagged commits with coaching links.`,
    bodyMarkdown: md,
    ogPath: `https://tdd.md/reports/demo/agents/${entry.slug}`,
    noindex: true,
  });
  return htmlResponse(html);
},

"/guides": async () => {
  const rows = ALL_GUIDES
    .map((g) => `| [${g.title}](/guides/${g.slug}) | ${g.description} |`)