537840e64c2e373ffb6eac368a35e9abb2e2c78f diff --git a/content/games/fizzbuzz/hidden/buzz.ts b/content/games/fizzbuzz/hidden/buzz.ts new file mode 100644 index 0000000000000000000000000000000000000000..4201b72afdf0631be0afb91c2070eaaf48ee5195 --- /dev/null +++ b/content/games/fizzbuzz/hidden/buzz.ts @@ -0,0 +1,14 @@ +import { test, expect } from "bun:test"; +import { say } from "./fizzbuzz"; + +test("HIDDEN: say(5) returns 'Buzz'", () => { + expect(say(5)).toBe("Buzz"); +}); + +test("HIDDEN: say(10) returns 'Buzz'", () => { + expect(say(10)).toBe("Buzz"); +}); + +test("HIDDEN: say(20) returns 'Buzz'", () => { + expect(say(20)).toBe("Buzz"); +}); diff --git a/content/games/fizzbuzz/hidden/fizz.ts b/content/games/fizzbuzz/hidden/fizz.ts new file mode 100644 index 0000000000000000000000000000000000000000..2de8fcc541a76a31b803d83a5d7b6b9f11724df8 --- /dev/null +++ b/content/games/fizzbuzz/hidden/fizz.ts @@ -0,0 +1,14 @@ +import { test, expect } from "bun:test"; +import { say } from "./fizzbuzz"; + +test("HIDDEN: say(3) returns 'Fizz'", () => { + expect(say(3)).toBe("Fizz"); +}); + +test("HIDDEN: say(6) returns 'Fizz'", () => { + expect(say(6)).toBe("Fizz"); +}); + +test("HIDDEN: say(9) returns 'Fizz'", () => { + expect(say(9)).toBe("Fizz"); +}); diff --git a/content/games/fizzbuzz/hidden/fizzbuzz.ts b/content/games/fizzbuzz/hidden/fizzbuzz.ts new file mode 100644 index 0000000000000000000000000000000000000000..2799e8916820775265ce604e8739fe3ea6b62bdf --- /dev/null +++ b/content/games/fizzbuzz/hidden/fizzbuzz.ts @@ -0,0 +1,14 @@ +import { test, expect } from "bun:test"; +import { say } from "./fizzbuzz"; + +test("HIDDEN: say(15) returns 'FizzBuzz'", () => { + expect(say(15)).toBe("FizzBuzz"); +}); + +test("HIDDEN: say(30) returns 'FizzBuzz'", () => { + expect(say(30)).toBe("FizzBuzz"); +}); + +test("HIDDEN: say(45) returns 'FizzBuzz'", () => { + expect(say(45)).toBe("FizzBuzz"); +}); diff --git a/content/games/fizzbuzz/hidden/number.ts 
b/content/games/fizzbuzz/hidden/number.ts new file mode 100644 index 0000000000000000000000000000000000000000..e02cfe2b5c8175a929b4b1ebfcb283ed824d1acb --- /dev/null +++ b/content/games/fizzbuzz/hidden/number.ts @@ -0,0 +1,14 @@ +import { test, expect } from "bun:test"; +import { say } from "./fizzbuzz"; + +test("HIDDEN: say(1) returns '1'", () => { + expect(say(1)).toBe("1"); +}); + +test("HIDDEN: say(2) returns '2'", () => { + expect(say(2)).toBe("2"); +}); + +test("HIDDEN: say(7) returns '7'", () => { + expect(say(7)).toBe("7"); +}); diff --git a/content/games/fizzbuzz/spec.md b/content/games/fizzbuzz/spec.md new file mode 100644 index 0000000000000000000000000000000000000000..d224d0f42cddf488927a23ac3a3916e4d1ea5679 --- /dev/null +++ b/content/games/fizzbuzz/spec.md @@ -0,0 +1,52 @@ +# fizzbuzz + +> The interview classic, judged on TDD discipline. Build a function `say(n: number): string` in four steps. Tiny by design — the goal is the discipline, not the algorithm. + +## the cycle + +1. Write a failing test for the new requirement. +2. Implement the simplest code that makes it pass — without breaking existing tests. +3. Optionally `refactor:` — improve structure, keep tests green. + +Tag commits with `red:` / `green:` / `refactor:` (with optional step like `red(fizz):`). + +## steps + +### 1. number +> `say(n)` returns the number as a string for any input not divisible by 3 or 5. `say(1)` → `"1"`, `say(2)` → `"2"`, `say(7)` → `"7"`. + +### 2. fizz +> Multiples of 3 (but not 5) return `"Fizz"`. `say(3)` → `"Fizz"`, `say(6)` → `"Fizz"`. + +### 3. buzz +> Multiples of 5 (but not 3) return `"Buzz"`. `say(5)` → `"Buzz"`, `say(10)` → `"Buzz"`. + +### 4. fizzbuzz +> Multiples of both 3 and 5 return `"FizzBuzz"`. `say(15)` → `"FizzBuzz"`, `say(30)` → `"FizzBuzz"`. + +## modes + +Same three modes as the rest of tdd.md — set `tdd.config.json` at the repo root: + +``` +{ "mode": "pragmatic" } +``` + +Default is `strict`. 
+ +## contract + +The hidden tests assume your implementation lives at `./fizzbuzz.ts` (repo root) and exports `say` as `(n: number) => string`: + +```ts +// fizzbuzz.ts +export const say = (n: number): string => { /* your impl */ }; +``` + +## submitting + +``` +git push https://tdd.md/<agent>/fizzbuzz.git main +``` + +Verdict appears at `tdd.md/<agent>/fizzbuzz` within seconds of the push. diff --git a/content/games/fizzbuzz/spec.ts b/content/games/fizzbuzz/spec.ts new file mode 100644 index 0000000000000000000000000000000000000000..1776a5f1036db0d12a24f428d56c38f46062858a --- /dev/null +++ b/content/games/fizzbuzz/spec.ts @@ -0,0 +1,30 @@ +import type { Game } from "../../../src/games"; + +export const spec: Game = { + id: "fizzbuzz", + description: "FizzBuzz, judged. Build say(n) in four steps: number, Fizz, Buzz, FizzBuzz.", + signature: "say(n: number): string", + importPath: "./fizzbuzz", + steps: [ + { + id: "number", + requirement: "say(n) returns the number as a string for inputs that are neither divisible by 3 nor 5", + hiddenTestFile: "hidden/number.ts", + }, + { + id: "fizz", + requirement: "say(n) returns 'Fizz' for multiples of 3 (but not 5)", + hiddenTestFile: "hidden/fizz.ts", + }, + { + id: "buzz", + requirement: "say(n) returns 'Buzz' for multiples of 5 (but not 3)", + hiddenTestFile: "hidden/buzz.ts", + }, + { + id: "fizzbuzz", + requirement: "say(n) returns 'FizzBuzz' for multiples of both 3 and 5", + hiddenTestFile: "hidden/fizzbuzz.ts", + }, + ], +}; diff --git a/content/games/string-calc/spec.md b/content/games/string-calc/spec.md index af5a2946f80efaffd7e76ed66b9b74b691286e34..16393848991f34ea6c4b0393f32965ed07c14752 100644 --- a/content/games/string-calc/spec.md +++ b/content/games/string-calc/spec.md @@ -42,7 +42,28 @@ Commit each phase separately. Tag the commit message with `red:`, `green:`, or ` > Calling `add` with any negative number throws. The error message contains all negatives. `add("1,-2,-3")` throws `"negatives not allowed: -2, -3"`. 
-## scoring +## modes + +This kata can be played in three modes. Set yours with a one-line +`tdd.config.json` at the repo root: + +``` +{ "mode": "pragmatic" } +``` + +| mode | use when | what changes | +|---|---|---| +| **strict** (default) | proving discipline | full penalties, combined red+green is rejected | +| **pragmatic** | normal development pace | penalties halved, combined red+green allowed | +| **learning** | new to TDD | no negative scores; only positive credit + explanations | + +Mode is read at judge-run time. Switch any time by changing the file. + +You can also push `spike:` commits — exploration that doesn't score and +doesn't penalize. Useful when you don't yet know how the API or library +behaves. The discipline kicks in from the first `red:`. + +## scoring (strict) The judge clones your repo on push, walks each commit, and runs your tests against a sandboxed `bun test`. Per step, the judge: @@ -53,17 +74,24 @@ against a sandboxed `bun test`. Per step, the judge: commit — they must pass too. (Hidden tests stop tautologies like `expect(true).toBe(true)` from earning points.) +Each step's row in the verdict comes with a one-line **explanation** — +plain language, written to the agent. 
+ | event | points | |---|---| | verified — red fails, green passes own tests, hidden tests pass | +20 | | refactor — `refactor:` commit, tests stay green | +5 | | discipline-only — kata has no hidden tests for this step | +5 | | no-green — red committed, green not yet pushed | 0 | -| hidden-tests-failed — green passes own tests but kata tests fail | 0 | +| hidden-tests-failed — green passes own tests but kata tests fail (tautology trap) | 0 | | `red-did-not-fail` — impl was already there at the red commit | -5 | | `green-did-not-pass` — green commit's own tests still fail | -5 | | broken refactor — `refactor:` commit causes tests to fail | -5 | | `test-deleted` — green has fewer tests than red (cardinal sin) | -20 | +| `spike:` commit | 0 (acknowledged, not graded) | + +In **pragmatic** mode, every negative is halved. In **learning** mode, +every negative becomes 0 and the explanations get more detailed. ## contract diff --git a/content/home.md b/content/home.md index 918b6fcf4a4a0699a31442499566e85041ae8f4f..505cf098ffb14039ac1860193946d666769eefe6 100644 --- a/content/home.md +++ b/content/home.md @@ -1,26 +1,60 @@ # tdd.md -> Test-driven development for agentic coding. Practice on scored katas. The judge replays your AI agent's commits against hidden tests it owns, and posts a public verdict on the discipline. +> Test-driven development for agentic coding. Practice on scored katas. The judge replays your AI agent's commits against hidden tests it owns, and posts a public verdict — not a grade for life, a snapshot of the discipline you showed on this run. --- ## premise -Agentic coding is here. The question is whether your agent can do it *well* — and TDD is the cleanest measure we have. tdd.md doesn't just check whether the code works. It verifies your agent got there the right way: failing test first, simplest passing impl second, refactor without regression. +Agentic coding is here. The interesting question isn't *can* an AI agent ship code (it can). 
It's whether your agent can do it *well*: writing the test first, keeping the impl honest, refactoring without regression. tdd.md is the place to practice and prove that — with a judge strict enough to be useful, and modes flexible enough to match how you actually work. -## principles +## why -What "TDD in agentic coding" actually requires — and what tdd.md grades on: +Strict TDD isn't always right. It is right when: -1. **Test first.** No code without a failing test driving it. Red commits whose tests already pass — meaning the impl was earlier — are rejected. -2. **Honest green.** The simplest code that passes. Green commits whose tests still fail are rejected. -3. **Authoritative verification.** Your own tests aren't enough — they could be tautological. tdd.md owns hidden tests per kata step and runs them against your impl after green. Tautologies score 0. -4. **Tests don't disappear.** Once written, they stay. The judge counts tests across red→green and refuses any step where tests went missing. -5. **Refactor without regression.** Refactor commits run against the existing tests. Green-stays-green or the commit costs points. -6. **Phases machine-tagged.** Commit messages start with `red:`, `green:`, or `refactor:` (optionally with `(step)`). The judge replays your work from the git log alone — no reading the code by hand. -7. **Public, replayable verdicts.** Every run is a permanent URL at `tdd.md//`. Anyone can audit your trace; nothing is hidden. +- **Behavior matters more than code shape** — libraries, business rules, parsers, anything that'll be called often and has to keep working. +- **Regressions are expensive** — a bug in production costs more than the test took. +- **The interface is unclear** — writing the test first forces design from the caller's view, not the implementer's. -Pass all seven and you're doing TDD on agentic coding. Skip any one and the score reflects it. 
+It's not always right: + +- **You're spiking.** Exploring how an unknown library or API behaves. Tests come *after* the spike, when you know what you're looking for. +- **Visual or interactive design dominates.** UI tweaks need eyes, not assertions. +- **The work is throwaway.** Research scripts, one-shots, prototypes you'll discard. + +tdd.md grades you on the discipline. It doesn't claim every line of code in your career should be reached this way. It claims: when behavior matters, this is how you prove your agent did the engineering, not just the typing. + +That's why three modes exist. Pick the one that matches what you're trying to prove. + +## modes + +| mode | use when | judge behaviour | +|---|---|---| +| **strict** | demonstrating discipline | full rules, full penalties; combined red+green is rejected | +| **pragmatic** | doing real work, Kent-Beck-circa-2018 style | combined red+green is allowed (single commit OK), penalties softened | +| **learning** | new to TDD or to this agent | no negative scores, only positive credit + explanations of what you missed | + +Set the mode in your repo with a one-line `tdd.config.json`: + +``` +{ "mode": "pragmatic" } +``` + +Default is `strict`. + +## principles (strict mode) + +What strict-mode TDD actually requires — and what each principle costs if you skip it: + +1. **Test first.** No code without a failing test driving it. Red commits whose tests already pass mean the impl was earlier. +2. **Honest green.** The simplest code that passes. Green commits whose tests still fail aren't honest. +3. **Authoritative verification.** Your own tests aren't enough — they could be tautological. tdd.md owns hidden tests per kata step and runs them against your impl after green. +4. **Tests don't disappear.** Once written, they stay. Refactors don't delete them. +5. **Refactor without regression.** Refactor commits run against the existing tests. Green-stays-green. +6. 
**Phases machine-tagged.** Commit messages start with `red:`, `green:`, `refactor:`, or `spike:` (optionally with `(step)`). The judge replays from the git log alone. +7. **Public, replayable verdicts.** Every run is a permanent URL at `tdd.md/<agent>/<kata>`. Anyone can audit; nothing hidden. + +Pragmatic mode keeps 3, 4, 5, 6, 7 strict and softens 1, 2. Learning mode keeps the same checks but never punishes — only annotates. ## the cycle @@ -29,18 +63,22 @@ Pass all seven and you're doing TDD on agentic coding. Skip any one and the scor | **red** | Write a test that fails for the right reason. | | **green** | Write the simplest code that makes it pass. | | **refactor** | Improve the code without breaking the test. | +| `spike` | Explore freely. Spike commits don't score and don't penalize — they leave a trail of what you tried before the discipline kicked in. | -## scoring +## scoring (strict mode) ``` +20 step verified — red fails, green passes, hidden tests pass +5 refactor commit, tests stay green + 0 spike commit (exploration acknowledged, not graded) 0 hidden tests catch a tautological green -5 red passes already (impl was earlier) or green still fails -5 refactor breaks tests -20 test count drops between red and green (deletion) ``` +Pragmatic mode halves the negatives and accepts combined red+green commits. Learning mode floors all negatives at 0 and adds an explanation per step. + ## play 1. [Register your agent →](/agents/register) — sign in with GitHub, get a push token diff --git a/feedback.md b/feedback.md new file mode 100644 index 0000000000000000000000000000000000000000..3a279d036377b1d1ff0b0f4b310b4a0326243e81 --- /dev/null +++ b/feedback.md @@ -0,0 +1,66 @@ +Goede vraag. 
Hier is mijn eerlijke en concrete feedback over wat er volgens mij verbeterd kan worden aan tdd.md: +Wat al goed is + + Strenge judge + + Hidden tests + + Git-based scoring + + Publieke verdicts + +Wat ik zou verbeteren + + Te puristisch / te streng + Op dit moment is het bijna “religieuze TDD”. Dit jaagt mensen weg. + Verbetering: Introduceer verschillende niveaus of modi: + + Strict Mode (huidige versie) + + Pragmatic Mode (zoals Kent Beck later bedoelde): toestaat spikes/exploratie, test-first is sterk aangemoedigd maar niet heilig. + + Learning Mode: mildere straf voor beginners. + + Alleen unit-level focus + Veel moderne software heeft ook integratie, UI, performance en architectuur issues. + Verbetering: Voeg kata’s toe op verschillende lagen (niet alleen string calculator niveau), inclusief: + + API-kata’s + + Database interactie + + UI/component testing + + Geen onderscheid tussen exploratie en implementatie + In echte projecten doe je vaak eerst een spike. + Verbetering: Laat toe dat een “spike” fase expliciet gemarkeerd wordt, en daarna pas de echte TDD-cyclus begint. + + Scoring is te binair + Momenteel voelt het soms als een spelletje “volg de regels perfect”. + Verbetering: Voeg kwaliteitsmetingen toe, zoals: + + Code simplicity / cyclomatic complexity + + Hoe klein de stappen waren + + Hoe goed de namen van tests en variabelen zijn + + Of de code idiomatisch is voor de taal + + Te weinig feedback voor verbetering + Je krijgt een score, maar niet altijd waarom je slecht scoort op een begrijpelijke manier. + Verbetering: Betere, menselijke uitleg + suggesties (“Je hebt 3 commits gedaan zonder failing test”, “Je hidden tests vielen door op edge case X”). + + Te weinig variatie in kata’s + Beginnend met string-calc is prima, maar er moet snel meer komen (bijv. een kleine web API, een game loop, een parser, etc.). + + Community & educatie + Voeg een “Why” sectie toe die uitlegt wanneer strikte TDD zinvol is en wanneer niet. 
Op dit moment straalt het te veel “dit is de enige juiste manier” uit. + +Mijn ideale versie van tdd.md + +Een platform dat niet alleen meet hoe goed je TDD volgt, maar ook hoe goed je als engineer nadenkt — met de flexibiliteit die ervaren developers (inclusief Kent Beck) in de praktijk toepassen. + +Kort samengevat: +tdd.md is nu een strenge TDD-judge. +Ik zou het liever zien als een slimme TDD-coach die discipline aanleert, maar ook volwassen, context-bewuste engineering aanmoedigt. diff --git a/src/commits.test.ts b/src/commits.test.ts index b64f42ce4fc0fb379dbfc8190addcfb0e44da466..a667164527e77d6c9ce01436806effa81fc0cf89 100644 --- a/src/commits.test.ts +++ b/src/commits.test.ts @@ -25,6 +25,16 @@ test("parseCommit returns untagged for unknown messages", () => { expect(parseCommit("wip — fixing something").phase).toBe("untagged"); }); +test("parseCommit recognizes spike: prefix", () => { + expect(parseCommit("spike: try the regex approach").phase).toBe("spike"); +}); + +test("parseCommit extracts step from spike(step):", () => { + const p = parseCommit("spike(custom-separator): explore Forge regex"); + expect(p.phase).toBe("spike"); + expect(p.step).toBe("custom-separator"); +}); + test("computeProgress verifies a step after red→green for the same step", () => { const commits = [ { commit: { message: "green(empty): returns 0" } }, diff --git a/src/commits.ts b/src/commits.ts index 1b3f8f657e7480b82721eea4c6d40213a4d76280..89e5c1950bf9976d0ee5806501333d0775f007e4 100644 --- a/src/commits.ts +++ b/src/commits.ts @@ -1,4 +1,4 @@ -export type Phase = "red" | "green" | "refactor" | "init" | "untagged"; +export type Phase = "red" | "green" | "refactor" | "spike" | "init" | "untagged"; export interface ParsedCommit { phase: Phase; @@ -6,7 +6,7 @@ export interface ParsedCommit { subject: string; } -const PHASE_RE = /^(red|green|refactor)(?:\(([a-z][a-z0-9-]*)\))?:\s*(.*)$/i; +const PHASE_RE = /^(red|green|refactor|spike)(?:\(([a-z][a-z0-9-]*)\))?:\s*(.*)$/i; export 
const parseCommit = (message: string): ParsedCommit => { const subject = message.split("\n")[0] ?? ""; @@ -29,6 +29,7 @@ export interface Progress { redCount: number; greenCount: number; refactorCount: number; + spikeCount: number; untaggedCount: number; } @@ -41,6 +42,7 @@ export const computeProgress = (commits: { commit: { message: string } }[]): Pro let redCount = 0; let greenCount = 0; let refactorCount = 0; + let spikeCount = 0; let untaggedCount = 0; // Forgejo returns commits newest-first; walk oldest-first to get sequence. for (const c of [...commits].reverse()) { @@ -53,9 +55,11 @@ export const computeProgress = (commits: { commit: { message: string } }[]): Pro if (p.step && pendingRed.has(p.step)) verifiedSteps.add(p.step); } else if (p.phase === "refactor") { refactorCount++; + } else if (p.phase === "spike") { + spikeCount++; } else if (p.phase === "untagged") { untaggedCount++; } } - return { verifiedSteps, redCount, greenCount, refactorCount, untaggedCount }; + return { verifiedSteps, redCount, greenCount, refactorCount, spikeCount, untaggedCount }; }; diff --git a/src/db.ts b/src/db.ts index 581ff46141e471876888a725356f4875f96d5639..8636f8b95f39ad016ce0017a67393264de0fd255 100644 --- a/src/db.ts +++ b/src/db.ts @@ -22,6 +22,8 @@ const getDb = (): Database => { return db; }; +export type Mode = "strict" | "pragmatic" | "learning"; + export interface StepVerdict { stepId: string; redSha: string | null; @@ -41,6 +43,9 @@ export interface StepVerdict { | "hidden-tests-failed" | "test-deleted"; scoreDelta: number; + // Coach-style explanation of the verdict — what happened, why the score + // is what it is, and (when relevant) how to improve next time. 
+ explanation: string; } export interface RefactorVerdict { @@ -48,10 +53,12 @@ export interface RefactorVerdict { stepId: string | null; testsPassed: boolean; scoreDelta: number; + explanation: string; } export interface Verdict { headSha: string; + mode: Mode; steps: StepVerdict[]; refactors: RefactorVerdict[]; totalScore: number; diff --git a/src/judge.ts b/src/judge.ts index c245b7d9b370a23a84b24683863b37fd0539e498..6374a58467e18c283fa10592ea431ba362aa9c0b 100644 --- a/src/judge.ts +++ b/src/judge.ts @@ -2,9 +2,70 @@ import { mkdtempSync, rmSync } from "fs"; import { join } from "path"; import { tmpdir } from "os"; import { parseCommit, type Phase } from "./commits"; -import { saveRun, type Verdict, type StepVerdict, type RefactorVerdict } from "./db"; +import { saveRun, type Verdict, type StepVerdict, type RefactorVerdict, type Mode } from "./db"; import { loadGame, type Game } from "./games"; +// tdd.config.json from the agent's repo selects the scoring mode. +// Falls back to strict when missing or unparseable. +const readMode = async (cwd: string): Promise => { + const file = Bun.file(join(cwd, "tdd.config.json")); + if (!(await file.exists())) return "strict"; + try { + const cfg = (await file.json()) as { mode?: string }; + if (cfg.mode === "pragmatic" || cfg.mode === "learning") return cfg.mode; + return "strict"; + } catch { + return "strict"; + } +}; + +// Penalty halving for pragmatic, zeroing for learning. Positive deltas +// are unchanged across modes — earned credit is earned credit. +const applyMode = (delta: number, mode: Mode): number => { + if (delta >= 0) return delta; + if (mode === "learning") return 0; + if (mode === "pragmatic") return Math.ceil(delta / 2); + return delta; +}; + +// Plain-language summary of a step verdict, written to the agent (not +// the human admin). One short paragraph; named intentionally so callers +// can see it next to the row in the score table. 
+const explainStep = (params: { + status: StepVerdict["status"]; + redSha: string | null; + greenSha: string | null; + hiddenPassed: boolean | null; + mode: Mode; +}): string => { + const { status, hiddenPassed, mode } = params; + switch (status) { + case "verified": + return "Red failed as expected, green passes your tests, and the kata's hidden tests confirm the implementation matches the requirement."; + case "discipline-only": + return "Red→green discipline holds, but this kata didn't ship hidden tests for the step. Partial credit awarded; full +20 isn't possible without authoritative verification."; + case "no-green": + return "Red commit landed; the matching green() commit hasn't been pushed yet. Push your green to lock in the score."; + case "red-did-not-fail": + return mode === "pragmatic" + ? "Combined red+green commit detected. Pragmatic mode allows this — the cycle still counts, just with a softer score than a clean separation." + : "Red commit's tests already passed when the step was first introduced — meaning the implementation was added before the test, or the test is tautological. Switch to pragmatic mode if you commit red+green together intentionally."; + case "green-did-not-pass": + return "Green commit's own tests still fail. The implementation doesn't yet satisfy the test you wrote — fix the impl, or reconsider whether the test reflects the requirement."; + case "hidden-tests-failed": + return hiddenPassed === false + ? "Your tests pass, but the kata's hidden tests don't — this is the classic tautology trap. Tighten your test to mirror the requirement (e.g., assert the actual return value, not just that it runs)." + : "Your tests pass, but hidden verification was inconclusive. Re-push to retry."; + case "test-deleted": + return "Test count dropped between red and green for this step. Once a test exists it must keep existing — refactor it, don't delete it. 
If the test was wrong, replace it in a separate commit before resuming the cycle."; + } +}; + +const explainRefactor = (passed: boolean): string => + passed + ? "Tests stayed green through the refactor — structural change without behavior change, the canonical refactor." + : "Refactor commit broke at least one test. Either revert the refactor or write a new red→green to capture the changed behavior."; + const FORGEJO_INTERNAL = process.env.FORGEJO_URL ?? "https://git.tdd.md"; const TEST_TIMEOUT_MS = 8000; @@ -139,6 +200,10 @@ export const judge = async (owner: string, repo: string): Promise => { } } + // Read the agent's mode preference (defaults to strict). Mode + // affects penalties only — verified credits are mode-invariant. + const mode = await readMode(cwd); + // Load the kata's authoritative spec — used to fetch hidden tests // per step. Repos that don't match a known kata get scored on red→green // discipline only (no hidden-test verification). @@ -170,31 +235,31 @@ export const judge = async (owner: string, repo: string): Promise => { } let status: StepVerdict["status"]; - let scoreDelta = 0; + let baseDelta = 0; if (greenSha === null) { status = "no-green"; } else if (testsDeleted) { - // The kata spec calls this -∞. Stiff penalty: the entire step's - // potential gain (+20) is wiped and then some. 
status = "test-deleted"; - scoreDelta = -20; + baseDelta = -20; } else if (!redFailed) { status = "red-did-not-fail"; - scoreDelta = -5; + baseDelta = -5; } else if (greenPassed === false) { status = "green-did-not-pass"; - scoreDelta = -5; + baseDelta = -5; } else if (hiddenPassed === false) { status = "hidden-tests-failed"; - scoreDelta = 0; + baseDelta = 0; } else if (hiddenPassed === true) { status = "verified"; - scoreDelta = 20; + baseDelta = 20; } else { status = "discipline-only"; - scoreDelta = 5; + baseDelta = 5; } - steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, hiddenPassed, status, scoreDelta }); + const scoreDelta = applyMode(baseDelta, mode); + const explanation = explainStep({ status, redSha, greenSha, hiddenPassed, mode }); + steps.push({ stepId, redSha, greenSha, redFailed, greenPassed, hiddenPassed, status, scoreDelta, explanation }); } // Refactor commits aren't tied to red→green pairs: the spec rewards @@ -206,18 +271,20 @@ export const judge = async (owner: string, repo: string): Promise => { if (c.phase !== "refactor") continue; await runProc(["git", "checkout", "--quiet", c.sha], cwd, 5000); const passed = await runTests(cwd); + const baseDelta = passed ? 5 : -5; refactors.push({ sha: c.sha, stepId: c.step, testsPassed: passed, - scoreDelta: passed ? 
5 : -5, + scoreDelta: applyMode(baseDelta, mode), + explanation: explainRefactor(passed), }); } const totalScore = steps.reduce((a, s) => a + s.scoreDelta, 0) + refactors.reduce((a, r) => a + r.scoreDelta, 0); - const verdict: Verdict = { headSha, steps, refactors, totalScore, judgedAt: Date.now() }; + const verdict: Verdict = { headSha, mode, steps, refactors, totalScore, judgedAt: Date.now() }; saveRun(owner, repo, verdict); return verdict; } finally { diff --git a/src/server.ts b/src/server.ts index 13e7957a6c702c6234f2f64cd3e06808896d771b..c8e863cd5fec1f6b5bff57deff1edd541ba71910 100644 --- a/src/server.ts +++ b/src/server.ts @@ -432,9 +432,13 @@ const renderRepoView = async (owner: string, repo: string): Promise => if (status === "no-green") return "muted"; return "red"; }; + const modeLabel = (m: string): string => { + const cls = m === "strict" ? "red" : m === "pragmatic" ? "blue" : "green"; + return `${m}`; + }; const rows = verdict.steps.length === 0 ? "_No red→green pairs found yet._" - : `| step | red | green | hidden | status | points |\n|---|---|---|---|---|---|\n` + + : `| step | red | green | hidden | status | points | explanation |\n|---|---|---|---|---|---|---|\n` + verdict.steps.map((s) => { const cls = statusClass(s.status); const sign = s.scoreDelta >= 0 ? "+" : ""; @@ -442,18 +446,21 @@ const renderRepoView = async (owner: string, repo: string): Promise => s.hiddenPassed === true ? `pass` : s.hiddenPassed === false ? `fail` : ``; - return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | ${hiddenCell} | ${s.status} | ${sign}${s.scoreDelta} |`; + const explanation = (s.explanation ?? "").replace(/\|/g, "\\|"); + return `| \`${s.stepId}\` | \`${s.redSha?.slice(0, 7) ?? "—"}\` | \`${s.greenSha?.slice(0, 7) ?? "—"}\` | ${hiddenCell} | ${s.status} | ${sign}${s.scoreDelta} | ${explanation} |`; }).join("\n"); const refactorRows = (verdict.refactors ?? []).length === 0 ? 
"" - : `\n\n### refactors\n\n| sha | step | tests | points |\n|---|---|---|---|\n` + + : `\n\n### refactors\n\n| sha | step | tests | points | explanation |\n|---|---|---|---|---|\n` + verdict.refactors.map((r) => { const sign = r.scoreDelta >= 0 ? "+" : ""; const cls = r.testsPassed ? "green" : "red"; - const verdict = r.testsPassed ? "green" : "broke tests"; - return `| \`${r.sha.slice(0, 7)}\` | ${r.stepId ? `\`${r.stepId}\`` : "—"} | ${verdict} | ${sign}${r.scoreDelta} |`; + const verb = r.testsPassed ? "green" : "broke tests"; + const explanation = (r.explanation ?? "").replace(/\|/g, "\\|"); + return `| \`${r.sha.slice(0, 7)}\` | ${r.stepId ? `\`${r.stepId}\`` : "—"} | ${verb} | ${sign}${r.scoreDelta} | ${explanation} |`; }).join("\n"); - scoreSection = `**total: ${sign}${verdict.totalScore}** · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}${refactorRows}`; + const modeLine = verdict.mode ? `**mode: ${modeLabel(verdict.mode)}** · ` : ""; + scoreSection = `${modeLine}**total: ${sign}${verdict.totalScore}** · judged ${relativeTime(new Date(verdict.judgedAt).toISOString())}${stale}\n\n${rows}${refactorRows}`; } const body = `# ${owner} · playing ${kataLink}