syntaxai/tdd.md · main · src / b32_anchor_extract.ts

b32_anchor_extract.ts 45 lines · 1496 bytes raw
// c32 — pure: parse rendered HTML and extract anchor entries for
// h2/h3 headings. Used by the docs layout to build the right-rail
// "on this page" navigator. No I/O; given a string in, returns a
// list of anchors out.
//
// Input shape: HTML produced by `marked` (which adds `id` attrs to
// headings via the GFM-slugger by default in our config). When an
// id is missing, we slug-ify the heading text ourselves so the
// anchor link still works.

/**
 * One entry of the "on this page" navigator, describing a single
 * h2 or h3 heading found in the rendered HTML.
 */
export interface Anchor {
  /** Heading depth: 2 for an h2 element, 3 for an h3 element. */
  level: 2 | 3;
  /** Heading text with markup removed and basic entities decoded. */
  text: string;
  /**
   * Fragment identifier to link to: the heading's own `id` attribute
   * when present, otherwise a slug derived from `text`.
   */
  id: string;
}

/**
 * Turns heading text into a URL-fragment slug: lower-cased, markup and
 * punctuation removed, whitespace runs collapsed to single hyphens.
 *
 * Letters, combining marks and digits from any script are kept, so
 * non-English headings get a usable slug instead of a mangled or empty
 * one. Output for plain ASCII input is unchanged.
 *
 * @param raw Heading text; may still contain inline tags or entities.
 * @returns The slug, or an empty string when nothing sluggable remains.
 */
const slugify = (raw: string): string =>
  raw
    .toLowerCase()
    // Inline markup contributes nothing to the slug.
    .replace(/<[^>]*>/g, "")
    // Named, decimal and hex entities act as a word break. Without the
    // numeric forms, "&#39;" would leak its digits into the slug.
    .replace(/&(?:#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);/g, " ")
    // Keep letters, marks and numbers of every script plus whitespace
    // and hyphens; everything else is punctuation and is dropped.
    .replace(/[^\p{L}\p{M}\p{N}\s-]/gu, "")
    .trim()
    .replace(/\s+/g, "-");

// Named character references that stripTags decodes.
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
]);

// Matches a decimal (group 1), hex (group 2) or supported named (group 3)
// character reference.
const ENTITY_RE = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g;

// Replacement callback for ENTITY_RE: returns the decoded character, or
// the original text when the reference does not name a valid code point.
const decodeEntity = (
  match: string,
  dec?: string,
  hex?: string,
  name?: string,
): string => {
  if (name !== undefined) return NAMED_ENTITIES.get(name) ?? match;
  const code = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
  // Reject NUL, surrogates and out-of-range values; NaN fails every test.
  const valid =
    code > 0 && code <= 0x10ffff && !(code >= 0xd800 && code <= 0xdfff);
  return valid ? String.fromCodePoint(code) : match;
};

/**
 * Reduces an HTML fragment to plain text: removes tags, decodes the basic
 * named entities and numeric character references, and trims the result.
 *
 * Entities are decoded in a single pass so that already-escaped text is
 * not decoded twice (e.g. `&amp;lt;` yields the literal text `&lt;`,
 * not `<`).
 *
 * @param s HTML fragment, typically the inner HTML of a heading.
 * @returns The plain-text content, trimmed.
 */
const stripTags = (s: string): string =>
  s
    // Tags go first so that decoded "<" / ">" are never mistaken for markup.
    .replace(/<[^>]*>/g, "")
    .replace(ENTITY_RE, decodeEntity)
    .trim();

// Matches a complete h2/h3 element: group 1 is the level digit, group 2
// the raw attribute list (if any), group 3 the inner HTML.
const HEADING_RE = /<h([23])(?:\s+([^>]*))?>([\s\S]*?)<\/h\1>/gi;

// Matches one attribute: group 1 is the name; groups 2/3/4 hold a
// double-quoted, single-quoted or unquoted value respectively.
const ATTR_RE =
  /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;

// Returns the first non-empty `id` attribute value in a tag's attribute
// list, or undefined. Attributes are tokenised one by one so that
// look-alikes such as `data-id`, or the text "id=" inside another
// attribute's quoted value, are never mistaken for the real id.
const findIdAttr = (attrs: string): string | undefined => {
  for (const attr of attrs.matchAll(ATTR_RE)) {
    if (attr[1]?.toLowerCase() !== "id") continue;
    const value = attr[2] ?? attr[3] ?? attr[4] ?? "";
    if (value) return value;
  }
  return undefined;
};

/**
 * Extracts navigator anchors for every h2/h3 heading in rendered HTML.
 *
 * For each heading the text is obtained with `stripTags`; the id is the
 * heading's own `id` attribute when it has a non-empty one, otherwise
 * `slugify(text)`. Headings with no text, or for which no id can be
 * produced, are skipped. Pure: no I/O, input is not modified.
 *
 * @param html Rendered HTML to scan.
 * @returns Anchors in document order.
 */
export const extractAnchors = (html: string): Anchor[] => {
  const out: Anchor[] = [];
  // matchAll works on a copy of the regex, so the shared global pattern
  // carries no lastIndex state between calls.
  for (const m of html.matchAll(HEADING_RE)) {
    const level: Anchor["level"] = m[1] === "3" ? 3 : 2;
    const text = stripTags(m[3] ?? "");
    if (!text) continue;
    const id = findIdAttr(m[2] ?? "") ?? slugify(text);
    if (!id) continue;
    out.push({ level, text, id });
  }
  return out;
};