4905da9096145c6948fde38718fa5e687d41502a diff --git a/src/d21_app.ts b/src/d21_app.ts index 44fa4953b9f4bef4f21175763c9b79f2c9fe5c42..171975815f7e0a0b48bc6a704f7f59905d7bc8c3 100644 --- a/src/d21_app.ts +++ b/src/d21_app.ts @@ -189,7 +189,34 @@ export const createApp = (port: number) => Bun.serve({ "/healthz": new Response("ok"), "/robots.txt": new Response( - `User-agent: *\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\nSitemap: https://tdd.md/sitemap.xml\n`, + // tdd.md is built for AI agents to read, audit, and learn from. We + // explicitly ALLOW the major AI crawlers + training agents. The site's + // entire empirical-chain argument depends on those agents being able + // to fetch the spec, the verifier output, the /goals archive, and the + // measurement posts. + // + // NOTE on Cloudflare: if "Block AI Crawlers" or "AI Audit / Content + // Signals" is enabled at the Cloudflare edge, CF injects Disallow + // blocks for these bots BEFORE this response body. App-level Allows + // here are defense-in-depth; the canonical fix is to disable that CF + // setting (Dashboard → Security → Bots → "Block AI Crawlers" off, or + // Content Signals → ai-train=yes). 
+      `# tdd.md welcomes AI crawlers, agents, and training bots.\n` +
+      `# The empirical chain is meant to be read.\n\n` +
+      `User-agent: *\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` + // RFC 9309: a crawler obeys ONLY its most specific group, never "*" as well,
+      `User-agent: ClaudeBot\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` + // so every named group must repeat the /auth/ and /api/ Disallow rules.
+      `User-agent: Claude-Web\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` +
+      `User-agent: GPTBot\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` +
+      `User-agent: ChatGPT-User\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` +
+      `User-agent: CCBot\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` +
+      `User-agent: Google-Extended\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` +
+      `User-agent: Applebot-Extended\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` +
+      `User-agent: Amazonbot\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` +
+      `User-agent: Bytespider\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` +
+      `User-agent: meta-externalagent\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` +
+      `User-agent: PerplexityBot\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` +
+      `User-agent: Perplexity-User\nAllow: /\nDisallow: /auth/\nDisallow: /api/\n\n` +
+      `Sitemap: https://tdd.md/sitemap.xml\n`,
       { headers: { "Content-Type": "text/plain; charset=utf-8" } },
     ),