src/a31_blog.ts · syntaxai/tdd.md

syntaxai/tdd.md · main · src / a31_blog.ts
a31_blog.ts 202 lines · 42691 bytes raw
// c31 — model: blog index data. The post bodies live as markdown in
// content/blog/<slug>.md; this file is just the registry that drives
// /blog, /blog/:slug, and the sitemap. New posts: drop the .md file
// and add an entry here.

export interface BlogEntry {
  slug: string;
  title: string;
  description: string;
  // ISO date for the listing + sitemap lastmod.
  date: string;
}

export const ALL_POSTS: BlogEntry[] = [
  {
    slug: "sama-v2-verifier-second-opinion-gap",
    title: "The verifier has no second opinion",
    description: "Every load-bearing claim on tdd.md has an independent oracle that confirms it. §5 workingSetFit numbers are pinned to external repos at specific SHAs (anyone can clone and recount). URL refactor wall-clock measurements are timestamps in git history. /goal contracts live in goals/ AND PR bodies AND conversation transcripts. The deploy succeeded? curl on /healthz is the oracle, independent of the deploy script. The sitemap is correct? Compare it to ALL_POSTS. Every blog post claim links back to its driving /goal and merge commit. The chain holds — every artifact passes its own audit. There's one exception, and it's at the heart of the entire structural claim: /sama/v2/verify reports 7/7 ✓, and the only oracle that confirms it is the program that emitted it. The §0 spec says 'the verifier is a deterministic program; that claim is only auditable if a human can reproduce it from the data' — but reproducibility means running the same program; that's not independent validation. A buggy verifier biased toward passing the codebase it was written against would emit 7/7 ✓ deterministically forever. This week the gap got a concrete demonstration: a draft of a second verifier landed at the repo root (cli.md), and its 'SAMA v2 compliant' file structure had the prefix-to-layer mapping inverted (a=Layer 3, c=Layer 0/1 — backwards from the canonical a=Pure, b=Core, c=Adapter, d=Entry). Someone who reads the spec daily still got the structure wrong. If the spec is this easy to misread, what catches a similar misreading in the TS verifier? Only a second independent implementation that disagrees. The fix: build the shell verifier proposed in cli.md (with the layer mapping corrected) at tools/sama-cli/. Different language (POSIX shell vs TS), different runtime (bash vs Bun), different primitives (find/grep/awk vs Bun.file/Glob), same spec read independently. A cross-verify.sh script runs both and asserts identical verdicts — agreement is empirical 7/7 ✓; disagreement is a §6 spec-prose pressure point. Third drama post in the structural-self-audit series: chain-gap (every artifact in git except the /goal — fixed), on-ramp-gap (every artifact has a URL except the on-ramp — fixed), verifier-second-opinion-gap (every claim has an oracle except the verifier's — /goal /goals/sama-cli-shell-verifier proposed). Falsifiable next: two-verifier agreement holds across the n=8 §5 cross-repo measurement corpus. If even one repo causes disagreement, the spec has a located ambiguity; §6 evolution-policy resolves it in the prose.",
    date: "2026-05-25",
  },
  {
    slug: "sama-v2-portability-boundary-found",
    title: "21 minutes 23 seconds — the portability boundary is empirically located",
    description: "Third datapoint on the cost-flattening empirical chain. PR #42 (an evening), PR #53 (8m 8s, fixed-enum), PR #55 (21m 23s, data-driven). The second postmortem predicted ≤30 min for data-driven — this one measures it at 21m 23s, on the faster side of the predicted band. Three datapoints now align with a mechanical model: fixed-enum refactors land in ~8 min dominated by deploy time; data-driven refactors land in ~21 min because the migration tooling grows from a one-liner sed to a ~50-LOC Bun script, plus a spoof guard, plus wider over-rewrite recovery (the same pattern-risk as PR #53 but bigger surface). The helper itself stays mechanical: 13 lines for fixed-enum, 24 lines for data-driven (the added 11 being the ALL_POSTS lookup + date.slice). The Layer-3 wrapper stays byte-identical across all three. The pattern is reusable for data-driven URLs too; the cost grows with migration surface but stays bounded. Specific breakdown of the 21:23: ~10 min on new code (helper + sibling test + handler + spoof guard + Bun route + sitemap + blog index + edit-resolve), ~6 min on migration script + run, ~3 min recovering from over-rewrites + missed root-level CONTRIBUTING.md, ~2 min on PR + deploy. The 'new code' time was identical to PR #53; the difference was everything around it. The §5 chain now has three deltas confirming a model that's bounded across pattern variants. Two falsifiable next-tests: fourth fixed-enum refactor in the same band (≤10 min), data-driven with ~500-reference surface in ≤45 min.",
    date: "2026-05-25",
  },
  {
    slug: "sama-v2-second-url-refactor-postmortem",
    title: "8 minutes 8 seconds — the cost-flattening hypothesis is confirmed",
    description: "The git-url-refactor postmortem closed with a single falsifiable claim: 'if the second URL refactor lands in ~1 hour, cost-flattening of pattern-as-redirect is confirmed; if it takes another evening, the pattern wasn't as portable as it looked. Either is informative.' The second URL refactor happened today (moving /sama/<discipline> → /sama/discipline/<slug>). It landed in 8 minutes 8 seconds — 7.4× faster than predicted. Cost-flattening confirmed, much more dramatically than the hypothesis dared. Timeline breakdown commit-by-commit shows where each minute went: 1m45s on the helper + test (copy of git template); 30s on the fallback handler; 1m10s on the sed pass; 30s recovering from one sed over-rewrite (regex matched inside filesystem paths content/sama/sorted.md, caught by an existing edit-resolve test); 2m20s on the gh PR flow + deploy. What the prediction got right: the 13-line helper, the 11-line Layer-3 wrapper, the sibling-test structure — all landed identically. What it missed: pattern risk (sed scope on shorter URL prefixes is more collision-prone; lesson added to anti-fudge for next time). Two datapoints isn't a trend, but the mechanism is mundane: the first refactor designs the shape; the second imports the template, changes the slug, pastes. Predicts that future fixed-enum URL refactors land in ~5-10 min (deploy-time dominated), while data-driven refactors like /blog/<slug> → /blog/<yyyy-mm>/<slug> probably take ~20-30 min because the helper has to grow. The empirical chain ratchets: §5 has been about cross-repo workingSetFit deltas; this post lands wall-clock time as the same kind of measurement. 'Compliance proves the rules were followed; delta proves they were worth following' — that argument now applies to wall-clock time, not just file-fit ratios.",
    date: "2026-05-25",
  },
  {
    slug: "sama-v2-on-ramp-gap",
    title: "Every artifact has a URL. The on-ramp doesn't.",
    description: "Open tdd.md in a fresh browser. Count the entry points — /sama/v2 (spec), /sama/v2/verify (live), /blog (narrative), /goals (contracts), /GIT/tdd.md (source), /sitemap.xml (every URL). Six artifacts, each auditable, each its own blog post subject. Now ask the question the site exists to answer for a reader: 'how do I contribute?' There is no answer. No CONTRIBUTING.md, no /contributing URL, no canonical on-ramp anywhere. Forty PRs of preaching auditability — and the most basic on-ramp test failing the entire time. This post is the drama. Walks the artifact-vs-path table: every OUTPUT lives in a canonical place (✓), every authoring rule lives somewhere a contributor wouldn't think to look (the /goal workflow rule is in an agent-private memory file that humans literally cannot read; the layer convention is in /sama/v2 §1.1 + /sama/discipline/sorted + the source tree; the branch-PR-deploy flow is scattered across PR commit messages and individual /goal bodies; the image convention lives in memory only; the Containerfile gotcha lives in one PR's commit message). Concrete 10-step thought experiment of what a new contributor adding their first blog post would actually have to do — ends in 'give up, open an issue saying how do I contribute'. Frames this as a SAMA v2 self-violation parallel to /blog/2026-05/sama-v2-goal-chain-gap one level up: that post fixed the /goal in the chain; this post argues the workflow itself is the next missing artifact, same structural failure applied to procedure instead of contract. Sketches the fix: one CONTRIBUTING.md, two surfaces (./CONTRIBUTING.md at repo root + /contributing on the site, same markdown source), with three load-bearing properties — single source of truth (links don't restate), dogfooded (its own creation is /goal-driven at /goals/contributing-md), drift-detectable (a test that grep-checks CONTRIBUTING.md contains only references not restatements). The /goal that drives the implementation follows this post per the plan-execute-postmortem pattern.",
    date: "2026-05-25",
  },
  {
    slug: "sama-v2-goal-chain-gap",
    title: "Every chain artifact lives in git — except the /goal",
    description: "This site's load-bearing claim is auditability. /sama/v2/verify reports 7/7 ✓ live against the source tree. Cross-repo §5 measurements pinned at SHAs you can re-clone. The sitemap is generated from registries that live in git. Every link in the empirical chain is supposed to be inspectable — and one isn't. The /goal command — the structured spec that drives every PR on this site, that pins the Done-when clauses + Constraints (anti-fudge) + Load-bearing files — gets typed into goal.md and overwritten the next time someone writes a /goal. The careful contract that produced the verifier, the metrics emitter, the cross-repo measurements, the sitemap, the URL refactor: gone. The thing that started every chain link is the only thing not in the chain. This post is the drama: traces the concrete losses across PRs #38/#40/#42 (the visible empirical-chain work) where the /goal text is recoverable only from forensic reconstruction (PR descriptions: partial, commit messages: wrong layer, blog posts: lossy paraphrase, conversation transcripts: not in git). Argues this is a SAMA v2 self-violation against §0's deterministic-program auditability clause. Walks the three candidate fixes — git notes (no native render, invisible by default), goals/<sha>.goal (filename = SHA but chicken-and-egg + unreadable), goals/<slug>.md with merge_sha in frontmatter (mirrors /blog + /sama exactly, SHA lookup via one grep, no new abstractions). The third wins; the user's filename-equals-SHA intuition is honoured in spirit via the frontmatter field. Sketches the SAMA-native fix: a31_goals.ts (Layer 0 registry) + b32_goals_meta.ts (Layer 1 frontmatter parser + SHA lookup) + d21_handlers_goals.ts (Layer 3 index + detail) + the boring wiring (nav link, sitemap entries). Plus the workflow lock-in: agent's FIRST action when /goal fires is to write the body verbatim to goals/<slug>.md and commit it BEFORE any code work, so the goal is in the chain by construction not by remembering. Two /goal commands follow in the next two posts/PRs — one builds the infrastructure, one migrates the seven historical goals + locks the workflow. After both ship, /grep -l 'merge_sha: 968890f' goals/ resolves the contract that produced commit 968890f, and the site's auditability claim becomes byte-for-byte true: every artifact in the chain, including the one that drives all the others, is in git.",
    date: "2026-05-25",
  },
  {
    slug: "sama-v2-git-url-refactor-postmortem",
    title: "The /GIT/ URL refactor shipped — plan vs actual",
    description: "Yesterday's /goal post planned the refactor that drops the redundant :owner segment from /GIT/:owner/:repo/ URLs. The /goal fired, PR #42 landed an hour later, here's the promised postmortem. Headline: the plan held — 19 files changed, 153 insertions, 69 deletions, one regex absorbs every old URL, 9 new helper tests, 388/388 green, 7/7 ✓ on /sama/v2/verify before and after the merge. Walks the scorecard: where the plan held exactly (49 references — exact grep match; 8 content files — exact match; one regex anti-fudge held; LIVE_REPO_OWNER kept exported as designed), where it slightly missed (helper test cases predicted ~3, actual 9 — over-coverage 3× because the natural coverage was URL-surface-by-kind not regex-by-pattern; test file churn predicted 2, actual 1 — the commit-test file had no URL strings, defensive over-listing in the /goal is cheap), and what the plan didn't anticipate (the bare /<owner>/<repo> breadcrumb link INSIDE a /GIT/ commit page that points OUTSIDE the /GIT/ surface — a cross-boundary reference the /goal scope didn't enumerate explicitly; the right call at code-edit time was to leave it alone, but next plan should enumerate cross-references between in-scope and out-of-scope surfaces). The reusable shape: b32_<old>_url_redirect.ts (pure 5-line transform) + sibling test + one Layer-3 wrapper in the fallback handler. The cost flattens — first URL refactor under SAMA v2 needed a plan post + postmortem + evening of work; next one needs the helper + sed pass + maybe an hour. Concrete future candidates listed: /sama/v2/example-crud → /sama/v2/examples/crud, /blog/<slug> → /blog/<yyyy-mm>/<slug>, /sama/<slug> → /sama/discipline/<slug>. The interesting next data point: whether the SECOND URL refactor confirms cost-flattening. Either outcome is informative.",
    date: "2026-05-25",
  },
  {
    slug: "sama-v2-git-url-refactor-plan",
    title: "Shortening /GIT/ URLs: a single-tenant URL has a redundant segment",
    description: "Every link on the site that points at source code goes through /GIT/:owner/:repo/... and the owner is always 'syntaxai', the repo is always 'tdd.md', and the handler 404s anything else. The user-visible URL does structural work for a multi-tenant case that doesn't exist. This post is the implementation PLAN for dropping the owner segment: /GIT/syntaxai/tdd.md/blob/main/src/b32_sama_v2_verify.ts → /GIT/tdd.md/blob/main/src/b32_sama_v2_verify.ts. The interesting design decision is the redirect strategy — 49 references to the old form across 10 source files + 7 content files, and the temptation is to hand-maintain a redirect table. The right shape is ONE regex in the fallback handler that matches the path PATTERN, not the path values, and 301s to the new form. Cost: one commit. Lifetime maintenance: zero. Includes two visualizations (URL anatomy + redirect flow) and walks the SAMA layer surface to show the refactor is contained to Layer 3 routing/rendering — no Adapter, no Core, no Pure changes. Anti-fudge clauses called out: no hand-maintained URL list, no removal of LIVE_REPO_OWNER (still needed for git operations + Forgejo proxy), no touching of git-protocol URLs (which agents and humans have copy-pasted into clone commands), no alias mode (both URLs working forever lets the old form quietly remain canonical), no verifier change. Postmortem to follow after the /goal fires and the refactor merges — pattern-as-redirect promises to be a reusable shape for future URL refactors (the next time /sama/v2/example-crud becomes /sama/v2/examples/crud, the same ten lines apply).",
    date: "2026-05-25",
  },
  {
    slug: "sama-v2-sitemap-implementation-plan",
    title: "Building /sitemap.xml under SAMA v2 — a Claude Code /goal walkthrough",
    description: "This site has 23 blog posts, 4 SAMA pages, a v2 spec, a verifier, and an example library — and zero sitemap. Search engines + AI crawlers walk it by random link-discovery, which is especially bad for the long-tail empirical-chain posts. This is the implementation PLAN, written before the code lands. It works from a structured /goal slash command (the 38-line text checked in at goal.md) that fixes Done-when post-conditions, anti-fudge constraints, and load-bearing files-to-read-first before the agent touches anything. Maps the feature onto the four SAMA v2 layers: d21_app.ts gains a /sitemap.xml route (Entry); b32_sitemap.ts becomes a new pure helper that takes Array<{loc, lastmod?}> and returns XML (Core); Layer 2 stays empty by design because the registries are already in-memory data — no DB, no network, no filesystem at request time; ALL_POSTS + ALL_SAMA + ALL_GUIDES + BASE_URL at Layer 0 (already exists). The structural property the picture surfaces: in idiomatic-WordPress this is 4 files (DB adapter, query, renderer, cache). In layer-disciplined shape with in-memory registries, the adapter dissolves and the feature is one new helper + one new route. Two images: SAMA-layer decomposition (Layer 2 visibly empty) and data flow (three registries fan into helper, helper fans out to Response). Walks the seven §4 checks and shows how each is satisfied by the chosen file layout. Anti-fudge clauses called out explicitly: no second URL list that can drift, no string-concat XML (escape & and <), no static-file commit (the automatic property is load-bearing), no dynamic/user-specific URLs, no verifier change. Companion postmortem will land after the PR merges with actual diff + before/after verifier output + whatever the anti-fudge clauses caught that the plan missed. The point of the post is the workflow — /goal as contract, SAMA v2 as discipline, verifier as gate — not the URL list itself.",
    date: "2026-05-25",
  },
  {
    slug: "sama-v2-workingset-cross-repo-baseline",
    title: "Was the dive/ripgrep convergence real? Seven measured workingSetFit datapoints",
    description: "The dive/ripgrep audits ended with a quietly interesting finding: when the polyglot §5 emitter ran against both, they landed within 2 percentage points of each other (52.17% and 54.00%). I noted on the home page that this *might* be characteristic of mature compiled-language CLI tools — a hypothesis that needs more datapoints to confirm. This post tests it. n=2 → n=7. Cloned 5 more popular CLI tools at pinned SHAs (sharkdp/bat, sharkdp/fd, eza-community/eza, jesseduffield/lazygit, cli/cli), ran the same emitter with the same bounds imported from a31_sama_v2.ts. Headline: the convergence was n=2 coincidence. The actual distribution spans 27 percentage points — bat at 46.27% (lowest) to cli/gh at 73.59% (highest). Mean 60.68%, median 61.76% (eza), sample stddev 10.13pp. But there IS clustering: five of seven projects fall within [52%, 70%] — an 18-point window, not 2. The metric is more discriminating than n=2 implied, and the clustering is real. Go subset (cli, lazygit, dive) averages ~6pp higher than Rust subset (fd, eza, ripgrep, bat) at small n. Per-project notes on what each distribution implies — cli/gh's high score reflects natural command-handler cohesion; bat's low score reflects pre-built syntax-highlighting language-definition shards (the same declarative-exemption case the §6.3 dialect was drafted for); dive's miss reflects platform-shim stubs not god-classes. tdd.md (the SAMA-disciplined dogfood) measures 80% — 6.4 percentage points above the top of the non-SAMA mature-CLI baseline. Suggestive but n=1 vs n=7 is not a SAMA-worth-following claim. What this run does establish: the empirical chain is now n=7 measured against the same bounds; the §6 falsifiable experiment is well-conditioned for when a second SAMA repo exists. Includes a hand-trace of bat (the lowest measurement) per the §0 deterministic-program contract, mirroring the dive audit's hand-trace pattern. Reproducibility: pinned SHAs throughout; anyone can clone-and-run.",
    date: "2026-05-27",
  },
  {
    slug: "sama-v2-rust-project-ripgrep-parallel-fleet",
    title: "The same `ripgrep` rebuild, run by a fleet of AI agents in parallel across the planet — a projection",
    description: "Yesterday's ripgrep rebuild sketch estimated ~1 focused working week (~8 working days) for one careful human. This post is the companion projection Bas asked: what does the same refactor look like if executed by a fleet of AI agents running in parallel across the planet, under strict SAMA v2 management? Honest answer: wall-clock projection collapses from ~8 working days to ~8 wall-clock hours, ~10× compression. The load-bearing reason isn't 'AI is fast' — agents have been fast for years. The cap was always integration: pre-SAMA, every multi-agent refactor experiment grounded on the same three failure modes (scope creep, style drift, no mechanical merge gate). SAMA v2 dissolves all three because every architectural rule is also a merge gate. Decomposes the rebuild into 8 work packages — 4 profile-only WPs (serial chain, ~2h) + 4 code-split WPs (parallel, ~6h) — and demonstrates that the file-prefix + directory-layered crate graph give you a property no other architecture standard does: work-package boundaries are physically non-overlapping, so the orchestrator can scope-fence each agent's workspace with forbidden_paths and the agents literally cannot reach across boundaries. Plausible fleet composition with capability-matched task assignment (Opus on the hardest split, Haiku on TOML patches), 24-hour wall-clock coverage by spreading across time zones, mechanical verifier as merge gate so mixed-model output is fine. Timeline diagram showing T+0 through T+8h, then the section that matters most: why this is a SAMA-enabled property specifically — Atomic bounds working-context, Architecture aligns work-package boundaries with merge boundaries, Sorted makes dependency direction publicly readable to the orchestrator, Modeled makes tests the agent's local responsibility rather than a central bottleneck. Careful about framing: this is a projection from concrete file deltas + sane parallelism, not a measurement. The §6 experiment that would convert this into measurement is sketched at the end — fork a v2-conforming repo, generate the manifest, run the fleet, measure wall-clock vs serial vs serial-human vs the same refactor attempted on a non-v2 codebase. The interesting comparison is the fourth row, the non-v2 attempt — the one we expect to never finish because the work packages won't stay disjoint. That's the SAMA-specific empirical claim this post lays cable for.",
    date: "2026-05-26",
  },
  {
    slug: "sama-v2-rust-project-ripgrep-rebuilt",
    title: "`ripgrep`, rebuilt under SAMA v2 — a thought experiment",
    description: "The companion to today's ripgrep audit (which scored ~3/7 strict, ~5/7 under three proposed v2.1 dialects). Same parallel-architecture sketch as the dive and WP rebuilds. The sketch is even smaller than dive's: BurntSushi's crate graph already reads like a SAMA v2 layer chart, and the only real code work is two internal file splits — printer/standard.rs (3,987 LOC) into a six-file submodule per output mode, and ignore/walk.rs (2,494 LOC) into a four-file submodule per walker concern. Everything else is the sama.profile.toml declaration plus three v2.1 dialect adoptions: layout='directory' (Sorted-by-crate-graph), tests='inline' (Modeled-tests recognizes #[cfg(test)] mod tests blocks), and atomic_exemption='declarative' (the 7,779-line defs.rs flag catalog exempt because CC/LOC ≈ 0.01). Profile notes resolve the two boundary-borderline cases: serde derives ≠ boundary parsing in the §4.4 sense; the byte-parsing in searcher/line_buffer.rs IS the algorithm. No new tests need to be written. No API breaks. ~1 focused working week of effort vs months for WP and ~10 days for dive — the cost scales inversely with how much architectural care the original author put in. Predicted §5 deltas: workingSetFit ~60% → ~80% (the two splits carry 6,481 LOC into right-sized files), violationCounts ~50 → 0, boundaryRatio ~95% → ~100%, graphDepth unchanged at ~5 (the crate graph itself doesn't change). Four observations the sketch surfaces: (1) the three v2.1 dialects together cleanly absorb Rust without weakening the rules' intent — sibling-vs-inline, prefix-vs-directory, behavioural-vs-declarative are all surface choices that preserve the underlying property each rule protects; (2) cost scales inversely with original author's discipline — BurntSushi did the layering decade ago, the rebuild mostly just writes it down; (3) Cargo ecosystem is cheap §5 baseline data — port the emitter, run across the popular Rust CLI tools (bat, fd, eza, dust, tokei) and n grows from 4 to 20+ in an afternoon; (4) §5 metrics keep their separation from §4 verdict cleanly — same numbers regardless of which spec version produced the verdict.",
    date: "2026-05-25",
  },
  {
    slug: "sama-v2-rust-project-ripgrep",
    title: "Pointing SAMA v2 at `ripgrep`: BurntSushi's exemplar surfaces three findings about the spec",
    description: "Fourth real-world audit after this site (TS, 7/7), dive (Go, ~5/7), and the WP plugin (PHP, 0/7). Picked BurntSushi/ripgrep — 64k stars, ten years of refinement, ten crates, 45,927 lines of Rust. The carefully-architected exemplar of Rust CLI design. If v2 has problems, ripgrep is where they surface, because the obvious failures (god-classes, scattered I/O, untyped data) just aren't there. The crate dep graph already reads like a SAMA v2 layer chart — matcher trait at the bottom, core binary at the top, searcher/printer/globset/ignore between. BurntSushi did the layering; he just didn't call it that. Walking the seven §4 checks surfaces three real spec-evolution findings: (1) Sorted needs the directory-based dialect (Go finding, confirmed here); (2) Modeled-tests needs an inline-tests mode because Rust convention is #[test] inside the source file, not sibling files (NEW finding); (3) Atomic-700 needs a declarative-file exemption because the 7,779-line flags/defs.rs is a CLI-flag catalog with near-zero cyclomatic complexity per line — splitting it would scatter what's naturally one long table (NEW finding). The §5 metrics surface their own insight: workingSetFit drops to ~60% for ripgrep, lower than tdd.md or dive, yet most of the over-cap files are appropriate to their content. That's exactly the §5 intent — the metric surfaces a property; whether the property is good or bad depends on what the file SHOULD be. Compliance scores conflate; metrics separate. Strict score ~3/7; under the proposed v2.1 dialects ~5-6/7. ripgrep is so close to compliant that the work isn't on ripgrep — it's on v2. n=4 datapoints now: strongly-typed compiled-language projects cluster near the dogfood; the WP codebase is the outlier on every axis. Whether that's 'language enforces architecture' or 'people who choose Go/Rust care more about architecture' is the experiment §6 hasn't run yet.",
    date: "2026-05-24",
  },
  {
    slug: "sama-v2-go-project-dive-prefix-scheme",
    title: "`dive`, the prefix-scheme variant — what `ls`-readable layer order costs in Go",
    description: "The directory-dialect dive rebuild from earlier today lost the visual payoff of the WordPress rebuild's prefix tree. Bas: 'i miss this SAMA v2 style.' Fair. This post shows what dive looks like if it commits to v2.0's prefix scheme literally — every Go file renamed to a_/b1_/b2_/b3_/c1_/c2_/c3_/d_, all 90 sources flat under src/, ls reads top-to-bottom in dependency order, no spec extension required. Then the candid cost: everything ends up in one Go package (losing encapsulation, compilation boundaries, Go's compiler-enforced internal/ semantics), the Law check has to do the work the compiler was doing for free, and the Go community would reject the PR. Two rebuilds (directory dialect + prefix scheme) reach the SAME 7/7 ✓ end state with identical §5 metrics. The spec is stricter than the surface syntax that enforces it — which is itself the design point.",
    date: "2026-05-23",
  },
  {
    slug: "sama-v2-go-project-dive-rebuilt",
    title: "`dive`, rebuilt under SAMA v2 — a thought experiment",
    description: "The companion to today's dive audit (which found Go's standard layout already scores ~5 of 7 §4 checks). Same parallel-architecture sketch as the WordPress rebuild, but the lift is much smaller — days of work instead of months, because the starting point is so much closer. Profile under a hypothetical v2.1 directory-based dialect (Sorted-by-package-path instead of Sorted-by-filename-prefix), one package split (filetree/file_info.go moves into a new dive/filetree_loader/ adapter), ~30 sibling tests written for image-adapter files. No god-class splits, no API breaks, no contract changes. ~10 working days estimated. Predicted §5 deltas: boundaryRatio rises to 100% (the filetree_loader split closes the last filesystem-leak), violationCounts drops from ~30 to 0, workingSetFit unchanged (Go file sizes are already healthy). Three observations the sketch surfaces: (1) v2.0's Sorted check is the real spec gap, not anything dive does wrong; (2) the work-cost to go from idiomatic Go to v2-compliant Go is genuinely small; (3) the Go ecosystem is a cheap source of v2 baseline data — once the dialect lands, n=3 could grow to n=20+ in an afternoon.",
    date: "2026-05-23",
  },
  {
    slug: "sama-v2-go-project-dive",
    title: "Pointing SAMA v2 at `dive`: Go's conventions cover more than you'd think",
    description: "After auditing a WordPress plugin (0/7) and sketching its v2-rebuild, the natural follow-up was a Go project for n=3. Picked wagoodman/dive — 53k-star Docker image explorer, 8,498 LOC across 92 source files, mature codebase. Cloned, walked the source, scored honestly. The result is much more interesting than 0/7: Go's standard layout (cmd/, internal/, package-per-concern) plus the language's internal/ semantics already enforce the §1.2 Law, the Atomic 700-LOC cap (largest file 496), and Architecture-by-package — five of the seven §4 checks pass naturally. The two that don't: #1 Sorted (Go organizes by package directory, not filename prefix — incompatible with v2.0's lex-sort-the-prefixes rule) and #3 Modeled-tests (18 test files for 92 source files, gaps in the image adapters). The audit surfaces a real spec-evolution finding: v2 needs a directory-based dialect for Go, where 'package directory lex-position = layer order' replaces 'filename prefix lex-position = layer order'. The 30% remaining work to push dive to 7/7 is far smaller than the WP plugin's: add a profile under the new dialect, write ~30 sibling tests for image-adapter files, split filetree/ into pure + loader. Three datapoints now on the §5 axes: tdd.md (TS, 7/7, fit 80%), dive (Go, ~5/7, fit ~80%), WP plugin (PHP, 0/7, fit ~47%). The pattern is suggestive — real Go scores closer to v2-disciplined code than to WP-idiom code — but n=3 with two hand-estimates is not yet a worth-following claim.",
    date: "2026-05-23",
  },
  {
    slug: "sama-v2-wordpress-plugin-rebuilt",
    title: "The Open Graph plugin, rebuilt under SAMA v2 — a thought experiment",
    description: "Yesterday's audit showed the Open Graph plugin scores 0 of 7 §4 checks. The companion question Bas asked: what would the same plugin look like if it had been built under v2 from day one? This post is the parallel-architecture sketch — same scope, same features, same user-facing behaviour, just laid out so a PHP-aware verifier would report 7/7. The 1,554-line public god-class becomes eleven 150-300 line files, each with a sibling test. The 784-line admin class becomes six. The 43 raw $_POST/$_GET accesses collapse into ONE c3_admin_form_controller.php; superglobals appear nowhere else. $wpdb lives in exactly three repository files. Outbound HTTP (Facebook Graph cache-clear, locale XML, image probe) lives in three gateway files. Estimated §5 metric deltas vs the original: workingSetFit ~95% (vs ~47%), boundaryRatio 100% (vs <10%), graphDepth ~5 (vs ~3), violationCounts 0 (vs 17+). Honest framing: this is a sketch, not a PR. The §5 deltas are predictions, not measurements — the PHP-aware verifier doesn't exist yet and the plugin would have to actually be refactored for the numbers to be empirical. But the sketch makes 100% v2 compliant concrete enough that readers can map it onto their own code, and gives any future v2-discipline plugin a measurable baseline to be compared against.",
    date: "2026-05-23",
  },
  {
    slug: "sama-v2-wordpress-plugin-audit",
    title: "Pointing SAMA v2 at a real WordPress plugin in the wild",
    description: "The hypothetical example at /sama/v2/example-wordpress was easy — you design the layers, then describe a codebase that fits. The harder question is what v2 sees when pointed at code that was NOT designed under any layer discipline. Picked Open Graph and Twitter Card Tags (200k+ installs, 6,445 PHP lines), downloaded the actual source, walked it. Walking the seven §4 conformance checks: 0 of 7 would pass — not because the plugin is bad, but because WordPress's hook/filter idioms actively push devs toward this shape (god-classes, $wpdb scattered across layers, 43 raw superglobal accesses, a 1,554-line public class doing seven jobs at once). The interesting comparison isn't compliance ✓/✗ — it's the §5 metric deltas: this plugin would score graphDepth≈3, boundaryRatio<10%, workingSetFit≈47% against tdd.md's 7, 100%, 80%. That's not 'v2 is empirically worth following' yet — n=2 across different languages, the PHP verifier doesn't exist, the delta is just numbers — but it IS the first real-world data point measured on the same axes our own dogfood scores against.",
    date: "2026-05-23",
  },
  {
    slug: "sama-v2-metrics-emitter",
    title: "Compliance proves the rules were followed. Delta proves they were worth following.",
    description: "Yesterday the v2 verifier said 7/7 ✓ against this repo and the empirical chain — rule, verifier, codebase passing — closed for §4. Today I went looking for step 1 of empirically proving v2 is worth following, ran through three weak candidates (skeleton, agent experiment, external-repo audit), and Bas pushed back: \"heeft dit enige waarde?\" Then I reread my own §5 + §6. The spec literally says compliance ≠ proof; the empirical artefact is the delta on five core metrics — graph depth, fan distribution per layer, boundary ratio, working-set fit, violation counts over time. We emitted zero of them. Built the §5 metrics emitter as one Layer 1 pure module sharing a parse-boundary detector with the §4.4 verifier check (they cannot diverge by construction). Real numbers for this repo: graphDepth=7, boundaryRatio=100%, workingSetFit=80%, violationCounts all zero. Hand-traced boundaryRatio against the seven real call sites to match the verifier's number, because §0 says the program is deterministic and that claim is only auditable if a human can reproduce it. This isn't proof v2 works — a baseline is never proof. It's the cable today's PR laid for tomorrow's delta-comparison work.",
    date: "2026-05-23",
  },
  {
    slug: "sama-v2-verifier-and-the-rename",
    title: "I built the SAMA v2 verifier. It told me my own repo wasn't v2-compliant. Then I renamed 70 files.",
    description: "Built the SAMA v2 §4 verifier, pointed it at this repo under a truthful profile, got 4 of 7 ✓ on the first run. The anti-fudge clause forbade bending the profile to force-pass, so each ✗ became a structural refactor: types moved from c13/c14 to c31 (Law + Consistency), JSON.parse/new URL extracted to a Layer 2 helper (Modeled-boundary), 13 missing sibling tests added (Modeled-tests). Then the Sorted blocker — v1's c-prefix scheme lex-sorts the OPPOSITE of v2's Pure/Core/Adapter/Entry. No truthful profile fixes it; only a file rename does. ~70 files renamed in two sed passes, Containerfile CMD path updated, 277/277 tests stayed green, v1 dogfood still 4/4 ✓. Live now reports 7/7 ✓ at /sama/v2/verify. The empirical claim: here is the rule, here is the verifier, here is the URL where it runs against this codebase, here is the codebase passing.",
    date: "2026-05-23",
  },
  {
    slug: "deploy-that-lies-cascade",
    title: "When the deploy lies: three bugs hidden by one silent error suppressor",
    description: "/reports/live had been stuck on a 12-day-old window because the deploy script's snapshot step was failing silently (no bun on the p620 host, the failure was swallowed by 2>/dev/null and a 'non-fatal skipped' echo). Fix one: run the snapshot via podman. That exposed a second silent skip — snapshot-tests had been missing from the git-mode deploy entirely. Fix two: add it. That made bun test actually run in CI for the first time and exposed two more bugs — a 1-in-16 flaky test and a false-positive placeholder where the verifier's own test fixture was being grepped as a real test. Three bugs in one PR. The empirical lesson: verification only works if the pipeline that runs it isn't lying about whether it ran.",
    date: "2026-05-22",
  },
  {
    slug: "sama-empirical-modeled-green",
    title: "Greening our own dogfood: four sibling tests, the live verifier flipped from 3/4 to 4/4",
    description: "/sama/verify?repo=syntaxai/tdd.md is the public verifier on tdd.md. Yesterday it showed three of four SAMA pillars green for this codebase — Modeled was flagging four c32_* files without sibling tests. Today it shows 4/4. Receipt for the round-trip: four new test files (55 unit tests), three const → export const visibility lifts on pure helpers, no behaviour changes, and the same URL anyone in the world can hit now reports the same answer the local CLI does. The website is the spec is the verifier is the test suite.",
    date: "2026-05-22",
  },
  {
    slug: "sama-empirical-c21-split",
    title: "When the verifier said 'split this': one Atomic-700 hit, four handler files, the build stayed green",
    description: "After Fase-2b landed, the SAMA verifier flagged c21_app.ts at 761 LOC — over the 700-line Atomic threshold — with one instruction: 'split per UI/data domain.' Four new handler files later (fallback, projects, api_agents, webhook), c21_app.ts was at 452 LOC, the verifier flipped green on all 67 SAMA files, 138/138 unit tests stayed green, 49/49 e2e against live stayed green, and the git-native commit pipeline didn't notice the route table had moved. Receipt for one mechanical-verifier round-trip on a real codebase.",
    date: "2026-05-22",
  },
  {
    slug: "sama-meets-git-cms",
    title: "SAMA meets git: building a self-hosted CMS that obeys the discipline",
    description: "Built a self-hosted CMS for tdd.md that commits directly to Forgejo via HTTP — no git binary, no SSH keys, no SQLite proposal queue. Edits become real commits a reviewer can git blame. Along the way the build surfaced eight SAMA tensions: two led to refinements (Modeled exemption for I/O-only c14 files; boundary-contract discriminated unions), six were operational doctrines or things SAMA correctly stays silent on. This post itself was committed via the CMS.",
    date: "2026-05-10",
  },
  {
    slug: "from-rules-to-checks",
    title: "From rules to checks: shipping what the corpus post promised",
    description: "The corpus post named three checks the discipline should run. This post is the receipt. Three slivers shipped: placeholder-test detection (live on /reports/live/tests), historical-commit testing via git worktree (opt-in via SAMA_HISTORY_DEPTH), and /sama/verify - a four-discipline report runnable against any public repo. The rules are now URLs you can hit.",
    date: "2026-05-09",
  },
  {
    slug: "agentic-coding-corpus-three-patterns",
    title: "Three patterns ten threads converge on",
    description: "One thread is an audit. Ten threads are a pattern. A six-month corpus of r/ClaudeAI, r/ClaudeCode and r/AgentsOfAI posts shows three failure modes everywhere — agents attack the verifier rather than the impl, the harness's hidden state outvotes the user's stated rules, and experienced practitioners independently arrive at TDD+SAMA-shaped answers. With per-pattern mitigation tables: how the iron law, the sibling-test rule, and the layer-prefix grep would have caught or prevented each thread.",
    date: "2026-05-09",
  },
  {
    slug: "claude-code-harness-postmortem",
    title: "Forty hidden reminders, one failing test: reading the Claude Code postmortem thread",
    description: "ThePaSch's r/ClaudeAI audit catalogues 40+ hidden system reminders, five gag-order sites (\"never mention this to the user\"), a malware reminder injected on every file read, contradictory instructions, and a 158-version system-prompt churn in 11 days. Anthropic's postmortem stops short of any of it. What survives in the artefact a reviewer sees? TDD's iron law and SAMA's verification grep — both enforced outside the agent's context window.",
    date: "2026-05-09",
  },
  {
    slug: "three-constraints-agentic-coding",
    title: "Red, tokens, atoms: three constraints that compound",
    description: "Three pieces landed the same week — obra's TDD skill, Mishra's 23 token-saving tips for Claude Code, and the rebrand of SAMA (Sorted, Architecture, Modeled, Atomic). Each is useful alone. Stacked they multiply, and not by adding benefits — they remove the failure modes the others cannot see.",
    date: "2026-05-09",
  },
  {
    slug: "tweag-handbook-tdd",
    title: "Tweag's agentic TDD handbook gets the loop right — local green still isn't enough",
    description: "Tweag's agentic-coding handbook describes a clean TDD loop and the right rules for AI assistants — but the validation layer it leans on (run tests, see green) misses the three failure modes most likely to show up: tautology, test deletion in refactor, and assertion weakening. Here's the gap, and what closes it.",
    date: "2026-05-08",
  },
  {
    slug: "aider-tdd",
    title: "Aider is the closest agent to TDD on rails — until you let it auto-fix",
    description: "Aider's auto-commit-per-edit and bite-sized-steps philosophy make it TDD-shaped by default. Then `--auto-test` discovers it can win by deleting tests instead of fixing the impl. Here's how Aider's strengths map onto TDD, and how to keep the auto-test loop honest.",
    date: "2026-05-04",
  },
  {
    slug: "cursor-tdd",
    title: "Cursor knows how to do TDD. Most users skip the parts that matter.",
    description: "Cursor's own agent best practices document a clean TDD workflow — but most users skip the features (Plan Mode, fresh conversations, .cursor/rules) that actually make it work. Here's how to put the pieces together, with a kata you can run end-to-end.",
    date: "2026-05-04",
  },
  {
    slug: "claude-code-tdd",
    title: "Claude Code does not do TDD by default — here's how to make it",
    description: "Claude Code writes the test and impl in one breath, so the test never fails for the right reason. Two structural changes — CLAUDE.md rules + phase-separated sessions — get the discipline back, and tdd.md can verify it.",
    date: "2026-05-04",
  },
];