syntaxai/tdd.md · main · src / a31_sxdoc_parse.ts

a31_sxdoc_parse.ts 328 lines · 11323 bytes raw
// c31 — HTML → SxDocument parser.
//
// SAMA placement: c31 because this is a parser for external input —
// Modeled.md is explicit: "every external input has a parser in a c31_*
// model — types and parse-functions colocated". HTML strings reach this
// file from the editor's save POST, from the markdown-import script, and
// from the AI-edit response — all "outside the process" → c31.
//
// Why a typed tree and not HTML strings: see the a31_sxdoc.ts header.
//
// Why node-html-parser and not Bun's HTMLRewriter: we need a tree we can
// recurse over, not a streaming filter. The dep is pure-logic (no I/O,
// no fs, no spawn) so it doesn't push the file into c14 territory.

import { parse, type HTMLElement, type Node, NodeType } from "node-html-parser";
import type { SxDocument, SxBlock, SxInline, SxMark } from "./a31_sxdoc.ts";
import { SX_DOC_VERSION } from "./a31_sxdoc.ts";

// Matches one shortcode token of the form [[sx:name key=value key="quoted"]].
//   group 1 — shortcode name (a lowercase letter, then lowercase
//             letters, digits or hyphens)
//   group 2 — the raw argument string (zero or more ` key=value` pairs)
// The `g` flag makes this regex stateful through `lastIndex`; the code
// in this file resets `lastIndex` to 0 around its uses.
const SHORTCODE_RE = /\[\[sx:([a-z][a-z0-9-]*)((?:\s+[a-z0-9_-]+=(?:"[^"]*"|[^\s"\]]+))*)\s*\]\]/g;
// Matches one key=value pair inside group 2 of SHORTCODE_RE.
//   group 1 — key, group 2 — double-quoted value (quotes stripped),
//   group 3 — bare (unquoted) value
const SHORTCODE_ARG_RE = /([a-z0-9_-]+)=(?:"([^"]*)"|([^\s"\]]+))/g;

// Heading tags recognised by the block dispatcher; the digit after the
// "h" is parsed there as the heading level.
const HEADING_TAGS = new Set(["h1", "h2", "h3", "h4", "h5", "h6"]);

// Block-level tags — used by parseListItem to know where to stop
// collecting inlines and recurse instead. Keep in sync with the
// pushBlocksFromNode dispatcher below. ("table" has no dedicated branch
// there; it lands in the dispatcher's raw-HTML escape hatch.)
const BLOCK_TAGS = new Set([
  "p", "h1", "h2", "h3", "h4", "h5", "h6",
  "ul", "ol", "blockquote", "pre",
  "img", "figure", "hr",
  "div", "section", "article", "table",
]);

// Inline formatting tag → the SxMark it contributes to the text inside it.
// This is a plain object, so a bracket lookup with an arbitrary tag name
// also sees inherited keys such as "constructor"; look tags up with an
// own-property check.
const MARK_FOR_TAG: Record<string, SxMark> = {
  b: "b", strong: "b",
  i: "i", em: "i",
  u: "u",
  s: "s", strike: "s", del: "s",
  code: "c",
};

// Entry point: turn an HTML fragment into an SxDocument.
//
// The fragment is wrapped in a synthetic <root> element so there is
// always exactly one parent whose childNodes can be walked, whether or
// not the input brought its own wrapper element. Each top-level child is
// handed to pushBlocksFromNode, which appends zero or more blocks.
//
// `script` and `style` are passed to the parser as blockTextElements
// with the value `false` — presumably so their text content is not kept;
// confirm against the node-html-parser documentation.
export const htmlToSx = (html: string): SxDocument => {
  const parsed = parse(`<root>${html}</root>`, {
    blockTextElements: { script: false, style: false },
  });
  const wrapper = parsed.firstChild as HTMLElement;
  const blocks: SxBlock[] = [];
  wrapper.childNodes.forEach((child) => {
    pushBlocksFromNode(child, blocks);
  });
  return { v: SX_DOC_VERSION, blocks };
};

// ─── block-level dispatch ────────────────────────────────────────────────

// Convert one DOM node into zero or more SxBlocks, appended to `out`.
//
//   text node        → trimmed; non-empty text goes through shortcode
//                      lifting (textWithShortcodesToBlocks)
//   non-element node → ignored
//   element          → dispatched on its lowercased tag name; tags with
//                      no dedicated branch are kept verbatim as an
//                      "html" block so nothing is lost.
const pushBlocksFromNode = (node: Node, out: SxBlock[]): void => {
  if (node.nodeType === NodeType.TEXT_NODE) {
    const trimmed = (node.text ?? "").trim();
    if (trimmed !== "") out.push(...textWithShortcodesToBlocks(trimmed, []));
    return;
  }
  if (node.nodeType !== NodeType.ELEMENT_NODE) return;

  const el = node as HTMLElement;
  const tag = el.tagName?.toLowerCase();
  if (!tag) return;

  // Headings share one branch; the digit after "h" is the level.
  if (HEADING_TAGS.has(tag)) {
    const level = parseInt(tag.slice(1), 10) as 1 | 2 | 3 | 4 | 5 | 6;
    out.push({ t: "h", level, c: parseInline(el.childNodes, []) });
    return;
  }

  switch (tag) {
    // Comments / processing-instructions surface as element nodes with a
    // tagName starting with "!" — drop them, they're not content.
    case "!":
    case "comment":
      return;

    case "p": {
      const inlines = parseInline(el.childNodes, []);
      if (inlines.length > 0) out.push(...splitShortcodesFromParagraph(inlines));
      return;
    }

    case "ul":
      out.push(parseList(el, "ul"));
      return;
    case "ol":
      out.push(parseList(el, "ol"));
      return;

    case "blockquote":
      out.push(parseQuote(el));
      return;

    case "pre":
      out.push(parseCodeBlock(el));
      return;

    case "img": {
      // parseImg returns null when the element has no usable src.
      const img = parseImg(el);
      if (img) out.push(img);
      return;
    }

    case "figure":
      out.push(parseFigure(el));
      return;

    case "hr":
      out.push({ t: "hr" });
      return;

    // Pure containers: transparent, their children become blocks of the
    // surrounding level.
    case "div":
    case "section":
    case "article":
      for (const child of el.childNodes) pushBlocksFromNode(child, out);
      return;

    // Anything else → escape hatch so round-tripping stays lossless.
    default:
      out.push({ t: "html", src: el.outerHTML });
      return;
  }
};

// ─── per-block parsers ───────────────────────────────────────────────────

// Build a list block from a <ul>/<ol> element. Only direct <li> element
// children are considered; every other child node is skipped. Each <li>
// is parsed into its own array of blocks, and items that parse to no
// blocks at all are left out.
const parseList = (el: HTMLElement, tag: "ul" | "ol"): SxBlock => {
  const isListItem = (n: Node): boolean =>
    n.nodeType === NodeType.ELEMENT_NODE &&
    (n as HTMLElement).tagName?.toLowerCase() === "li";

  const items: SxBlock[][] = el.childNodes
    .filter(isListItem)
    .map((li) => parseListItem(li as HTMLElement))
    .filter((itemBlocks) => itemBlocks.length > 0);

  return { t: tag, items };
};

// Walk an <li>'s children in source-order. Inline runs collect into
// paragraphs; block-level children (nested ul/ol/blockquote/pre/…)
// flush the current inline buffer and recurse as their own block.
// Without this split, parseInline would walk into nested <ul> and the
// inner text would leak into the outer paragraph.
//
// An inline run made up solely of text nodes containing nothing but
// ASCII whitespace is discarded: that is the indentation / newlines
// between block children in formatted HTML, and turning it into a
// paragraph would add whitespace-only blocks to the item. Runs that
// contain any element (e.g. a lone <br>) or any other character
// (including a non-breaking space) are kept.
const parseListItem = (li: HTMLElement): SxBlock[] => {
  const result: SxBlock[] = [];
  let inlineBuf: Node[] = [];

  // True for a text node that holds only ASCII whitespace (or nothing).
  const isFormattingWhitespace = (n: Node): boolean =>
    n.nodeType === NodeType.TEXT_NODE && /^[ \t\n\r\f]*$/.test(n.text ?? "");

  const flushInlines = (): void => {
    if (inlineBuf.length === 0) return;
    const run = inlineBuf;
    inlineBuf = [];
    if (run.every(isFormattingWhitespace)) return;
    const inlines = parseInline(run, []);
    if (inlines.length > 0) result.push({ t: "p", c: inlines });
  };

  for (const node of li.childNodes) {
    if (node.nodeType === NodeType.ELEMENT_NODE) {
      const t = (node as HTMLElement).tagName?.toLowerCase();
      if (t && BLOCK_TAGS.has(t)) {
        // Block child: close the paragraph collected so far, then let the
        // block dispatcher handle the child.
        flushInlines();
        pushBlocksFromNode(node, result);
        continue;
      }
    }
    inlineBuf.push(node);
  }
  flushInlines();
  return result;
};

// Build a quote block from a <blockquote>. Every child node goes through
// the block dispatcher, so bare text becomes paragraphs and nested block
// elements become their own blocks.
//
// If that yields nothing, the children are re-read as one inline run and
// wrapped in a single paragraph — but only when the run has real
// content. The fallback is reached when the dispatcher produced no
// blocks (for example a blockquote holding only whitespace), and
// wrapping whitespace-only text would add an empty-looking paragraph to
// the quote.
const parseQuote = (el: HTMLElement): SxBlock => {
  const inner: SxBlock[] = [];
  for (const child of el.childNodes) pushBlocksFromNode(child, inner);
  if (inner.length === 0) {
    const inlines = parseInline(el.childNodes, []);
    const hasContent = inlines.some((i) => !(i.t === "text" && i.v.trim() === ""));
    if (hasContent) inner.push({ t: "p", c: inlines });
  }
  return { t: "quote", c: inner };
};

// Build a code block from a <pre>.
// Canonical shape is <pre><code class="language-X">…</code></pre>; a bare
// <pre>text</pre> is accepted too, in which case the <pre> itself is
// the source of both the class attribute and the content. The language
// comes from a `language-X` class token ("" when absent) and the source
// is the element's innerHTML with the basic entities decoded.
const parseCodeBlock = (el: HTMLElement): SxBlock => {
  const source = el.querySelector("code") ?? el;
  const className = source.getAttribute("class") ?? "";
  return {
    t: "code",
    lang: parseLangFromClass(className),
    src: decodeEntities(source.innerHTML),
  };
};

// Build an image block from an <img>. Returns null when the element has
// no (or an empty) src. alt is included only when non-empty; w / h are
// included only when numAttr yields a number for width / height.
const parseImg = (el: HTMLElement): SxBlock | null => {
  const src = el.getAttribute("src") ?? "";
  if (src === "") return null;

  const alt = el.getAttribute("alt");
  const w = numAttr(el, "width");
  const h = numAttr(el, "height");

  const block: { t: "img"; src: string; alt?: string; w?: number; h?: number } = {
    t: "img",
    src,
    ...(alt ? { alt } : {}),
    ...(w !== undefined ? { w } : {}),
    ...(h !== undefined ? { h } : {}),
  };
  return block as SxBlock;
};

// Build an image block from a <figure>: the first <img> descendant
// supplies src / alt / width / height and the first <figcaption>, when
// present, supplies the caption text. A figure without an <img> that
// has a non-empty src is kept verbatim as an "html" block.
const parseFigure = (el: HTMLElement): SxBlock => {
  const img = el.querySelector("img");
  const caption = el.querySelector("figcaption");

  const src = img ? img.getAttribute("src") ?? "" : "";
  if (!img || src === "") return { t: "html", src: el.outerHTML };

  const alt = img.getAttribute("alt");
  const captionPart = caption ? { caption: caption.text } : {};
  const w = numAttr(img, "width");
  const h = numAttr(img, "height");

  const block: { t: "img"; src: string; alt?: string; caption?: string; w?: number; h?: number } = {
    t: "img",
    src,
    ...(alt ? { alt } : {}),
    ...captionPart,
    ...(w !== undefined ? { w } : {}),
    ...(h !== undefined ? { h } : {}),
  };
  return block as SxBlock;
};

// ─── inline parsing ──────────────────────────────────────────────────────

// Flatten a run of DOM nodes into SxInline entries.
//
//   nodes — the nodes to walk (undefined is treated as an empty run)
//   marks — marks inherited from enclosing formatting tags; they are
//           attached (de-duplicated) to every text entry produced here
//
//   text node       → one text entry (skipped when empty)
//   <br>            → a text entry containing "\n"
//   <a>             → a link entry whose children are parsed recursively
//                     with the inherited marks
//   formatting tag  → its children, parsed with the tag's mark added
//   any other tag   → wrapper stripped, children parsed with the marks
//                     unchanged
//   non-text, non-element nodes are ignored.
const parseInline = (nodes: Node[] | undefined, marks: SxMark[]): SxInline[] => {
  if (!nodes) return [];
  const out: SxInline[] = [];
  for (const node of nodes) {
    if (node.nodeType === NodeType.TEXT_NODE) {
      // NOTE(review): node-html-parser's `text` getter may already return
      // entity-decoded text; if so this second pass double-decodes input
      // such as "&amp;lt;" — confirm against the library version in use.
      const v = decodeEntities(node.text ?? "");
      if (v.length > 0) {
        out.push({ t: "text", v, ...(marks.length ? { m: dedupeMarks(marks) } : {}) });
      }
      continue;
    }
    if (node.nodeType !== NodeType.ELEMENT_NODE) continue;
    const el = node as HTMLElement;
    const tag = el.tagName?.toLowerCase();
    if (!tag) continue;

    if (tag === "br") {
      out.push({ t: "text", v: "\n", ...(marks.length ? { m: dedupeMarks(marks) } : {}) });
      continue;
    }

    if (tag === "a") {
      const href = el.getAttribute("href") ?? "";
      out.push({ t: "a", href, c: parseInline(el.childNodes, marks) });
      continue;
    }

    // Own-property check: MARK_FOR_TAG is a plain object, so a bare
    // MARK_FOR_TAG[tag] would also resolve inherited keys — a
    // <constructor> element would yield Object itself as a "mark".
    const mark = Object.prototype.hasOwnProperty.call(MARK_FOR_TAG, tag)
      ? MARK_FOR_TAG[tag]
      : undefined;
    if (mark) {
      out.push(...parseInline(el.childNodes, [...marks, mark]));
      continue;
    }

    // <span>, <font>, etc. — strip wrapper, keep contents.
    out.push(...parseInline(el.childNodes, marks));
  }
  return out;
};

// Return a new array with repeated marks removed, keeping the first
// occurrence of each mark in its original position (nested <b><b>…
// should not yield "b" twice). The input array is not modified.
const dedupeMarks = (marks: SxMark[]): SxMark[] => Array.from(new Set(marks));

// ─── shortcode lifting ──────────────────────────────────────────────────

// A <p> may mix ordinary inline content with [[sx:foo]] tokens. This
// splits such a paragraph into a (paragraph)(shortcode)(paragraph)
// sequence, so each shortcode is a block of its own and the document can
// be queried per shortcode instead of per paragraph-containing-a-token.
//
// Only top-level text inlines are scanned for tokens. Paragraphs whose
// inlines are all whitespace-only text are not emitted.
const splitShortcodesFromParagraph = (inlines: SxInline[]): SxBlock[] => {
  const blocks: SxBlock[] = [];
  let pending: SxInline[] = [];

  const isBlankText = (i: SxInline): boolean => i.t === "text" && i.v.trim() === "";

  // Emit the inlines gathered so far as one paragraph (unless they are
  // empty or blank) and start a fresh buffer.
  const emitPending = (): void => {
    if (!pending.every(isBlankText)) {
      blocks.push({ t: "p", c: pending });
    }
    pending = [];
  };

  for (const inline of inlines) {
    if (inline.t === "text" && SHORTCODE_RE.test(inline.v)) {
      // test() on a global regex advances lastIndex after a hit; rewind.
      SHORTCODE_RE.lastIndex = 0;
      for (const piece of textWithShortcodesToBlocks(inline.v, inline.m ?? [])) {
        if (piece.t === "shortcode") {
          emitPending();
          blocks.push(piece);
        } else if (piece.t === "p") {
          // Text between tokens rejoins the paragraph being built.
          pending.push(...piece.c);
        }
      }
    } else {
      pending.push(inline);
    }
  }

  emitPending();
  return blocks;
};

// Split a string into alternating paragraph and shortcode blocks.
//
//   text  — raw text that may contain [[sx:name key=value …]] tokens
//   marks — marks to attach to every text segment (omitted when empty)
//
// Each token becomes a { t: "shortcode", name, args } block; the text
// before, between and after tokens becomes single-text paragraphs.
// Segments that are empty or whitespace-only are dropped.
const textWithShortcodesToBlocks = (text: string, marks: SxMark[]): SxBlock[] => {
  const blocks: SxBlock[] = [];

  const pushTextSegment = (segment: string): void => {
    if (segment.trim() === "") return;
    blocks.push({ t: "p", c: [{ t: "text", v: segment, ...(marks.length ? { m: marks } : {}) }] });
  };

  let cursor = 0;
  // matchAll starts from the regex's current lastIndex, so rewind first.
  SHORTCODE_RE.lastIndex = 0;
  for (const match of text.matchAll(SHORTCODE_RE)) {
    const start = match.index ?? 0;
    pushTextSegment(text.slice(cursor, start));

    const name = match[1]!;
    const args: Record<string, string> = {};
    const rawArgs = match[2] ?? "";
    for (const pair of rawArgs.matchAll(SHORTCODE_ARG_RE)) {
      // Group 2 is the quoted value, group 3 the bare one.
      args[pair[1]!] = pair[2] ?? pair[3] ?? "";
    }
    blocks.push({ t: "shortcode", name, args });

    cursor = start + match[0].length;
  }
  pushTextSegment(text.slice(cursor));

  return blocks;
};

// ─── small helpers ───────────────────────────────────────────────────────

// Extract X from the first `language-X` token of a class attribute value.
// The token must start the string or follow whitespace; X is made of
// word characters and hyphens. Returns "" when there is no such token.
const parseLangFromClass = (cls: string): string => {
  const found = /(?:^|\s)language-([\w-]+)/.exec(cls);
  return found === null ? "" : (found[1] ?? "");
};

// Read a numeric attribute (used for width / height) as a non-negative
// integer number of pixels.
//
//   el   — element to read from
//   name — attribute name
//
// Returns undefined when the attribute is missing or empty, is not
// numeric, is negative, or is a percentage ("50%"). parseInt would
// otherwise report a percentage as 50, which a consumer would read as
// 50 pixels. A trailing unit such as "100px" is tolerated and yields 100.
const numAttr = (el: HTMLElement, name: string): number | undefined => {
  const raw = el.getAttribute(name)?.trim();
  if (!raw || raw.includes("%")) return undefined;
  const n = parseInt(raw, 10);
  return Number.isFinite(n) && n >= 0 ? n : undefined;
};

// Replacement text for each entity decodeEntities understands.
const DECODED_ENTITY_TEXT: Record<string, string> = {
  "&amp;": "&",
  "&lt;": "<",
  "&gt;": ">",
  "&quot;": '"',
  "&#39;": "'",
  "&nbsp;": " ",
};

// Decode the small set of entities this parser cares about: &amp; &lt;
// &gt; &quot; &#39; and &nbsp; (the latter to a plain space). Anything
// else is left untouched.
//
// The decode is a single pass over the input, so text produced by one
// replacement is never decoded again: "&amp;lt;" becomes the literal
// text "&lt;", not "<". A chain of .replace() calls that handles &amp;
// first would decode such input twice.
const decodeEntities = (s: string): string =>
  s.replace(/&(?:amp|lt|gt|quot|#39|nbsp);/g, (entity) => DECODED_ENTITY_TEXT[entity] ?? entity);