// c31 — HTML → SxDocument parser.
//
// SAMA placement: c31 because this is a parser for external input —
// Modeled.md is explicit: "every external input has a parser in a c31_*
// model — types and parse-functions colocated". HTML strings reach this
// file from the editor's save POST, from the markdown-import script, and
// from the AI-edit response — all "outside the process" → c31.
//
// Why a typed tree and not HTML strings: see c31_sxdoc.ts header.
// NOTE(review): the imports below point at ./a31_sxdoc.ts while this header
// refers to c31_sxdoc.ts — confirm which name is current.
//
// Why node-html-parser and not Bun's HTMLRewriter: we need a tree we can
// recurse over, not a streaming filter. The dep is pure-logic (no I/O,
// no fs, no spawn) so it doesn't push the file into c14 territory.

import { parse, type HTMLElement, type Node, NodeType } from "node-html-parser";
import type { SxDocument, SxBlock, SxInline, SxMark } from "./a31_sxdoc.ts";
import { SX_DOC_VERSION } from "./a31_sxdoc.ts";

// [[sx:name key=value key="quoted value"]] — global, used with matchAll.
const SHORTCODE_RE =
  /\[\[sx:([a-z][a-z0-9-]*)((?:\s+[a-z0-9_-]+=(?:"[^"]*"|[^\s"\]]+))*)\s*\]\]/g;
// Same pattern without the `g` flag: RegExp.prototype.test on a global regex
// advances lastIndex between calls, so membership probes use this copy.
const SHORTCODE_PROBE_RE = new RegExp(SHORTCODE_RE.source);
const SHORTCODE_ARG_RE = /([a-z0-9_-]+)=(?:"([^"]*)"|([^\s"\]]+))/g;

const HEADING_TAGS = new Set(["h1", "h2", "h3", "h4", "h5", "h6"]);

// Block-level tags — used by parseListItem to know where to stop
// collecting inlines and recurse instead. Keep in sync with the
// pushBlocksFromNode dispatcher below.
const BLOCK_TAGS = new Set([
  "p", "h1", "h2", "h3", "h4", "h5", "h6",
  "ul", "ol", "blockquote", "pre",
  "img", "figure", "hr",
  "div", "section", "article",
  "table",
]);

// HTML formatting tag → SxMark code.
const MARK_FOR_TAG: Record<string, SxMark> = {
  b: "b", strong: "b",
  i: "i", em: "i",
  u: "u",
  s: "s", strike: "s", del: "s",
  code: "c",
};

/**
 * Parse an HTML fragment into an SxDocument.
 *
 * @param html HTML fragment; it does not need a single root element.
 * @returns A document stamped with SX_DOC_VERSION whose blocks follow source order.
 */
export const htmlToSx = (html: string): SxDocument => {
  // Wrap in <div> so we always have a single parent to walk childNodes
  // of, regardless of whether the input has its own wrapper element.
  const root = parse(`<div>${html}</div>`, {
    blockTextElements: { script: false, style: false },
  });
  const rootEl = root.firstChild as HTMLElement;
  const blocks: SxBlock[] = [];
  for (const node of rootEl.childNodes) {
    pushBlocksFromNode(node, blocks);
  }
  return { v: SX_DOC_VERSION, blocks };
};

// ─── block-level dispatch ────────────────────────────────────────────────

/**
 * Convert one DOM node into zero or more blocks appended to `out`.
 * Text nodes become paragraphs / shortcodes, known block tags go to their
 * dedicated parser, div/section/article are transparent, and every other
 * element is preserved verbatim as an `html` block.
 */
const pushBlocksFromNode = (node: Node, out: SxBlock[]): void => {
  if (node.nodeType === NodeType.TEXT_NODE) {
    const text = (node.text ?? "").trim();
    if (text) out.push(...textWithShortcodesToBlocks(text, []));
    return;
  }
  if (node.nodeType !== NodeType.ELEMENT_NODE) return;

  const el = node as HTMLElement;
  const tag = el.tagName?.toLowerCase();
  if (!tag) return;

  // Comments / processing-instructions surface as element nodes with a
  // tagName starting with "!" — drop them, they're not content.
  if (tag === "!" || tag === "comment") return;

  if (tag === "p") {
    const inlines = parseInline(el.childNodes, []);
    if (inlines.length === 0) return;
    out.push(...splitShortcodesFromParagraph(inlines));
    return;
  }
  if (HEADING_TAGS.has(tag)) {
    const level = parseInt(tag.slice(1), 10) as 1 | 2 | 3 | 4 | 5 | 6;
    out.push({ t: "h", level, c: parseInline(el.childNodes, []) });
    return;
  }
  if (tag === "ul" || tag === "ol") {
    out.push(parseList(el, tag));
    return;
  }
  if (tag === "blockquote") {
    out.push(parseQuote(el));
    return;
  }
  if (tag === "pre") {
    out.push(parseCodeBlock(el));
    return;
  }
  if (tag === "img") {
    const img = parseImg(el);
    if (img) out.push(img);
    return;
  }
  if (tag === "figure") {
    out.push(parseFigure(el));
    return;
  }
  if (tag === "hr") {
    out.push({ t: "hr" });
    return;
  }
  if (tag === "div" || tag === "section" || tag === "article") {
    for (const child of el.childNodes) pushBlocksFromNode(child, out);
    return;
  }

  // Anything else → escape hatch so round-tripping stays lossless.
  out.push({ t: "html", src: el.outerHTML });
};

// ─── per-block parsers ───────────────────────────────────────────────────

/** Build a list block from the direct <li> children of a <ul>/<ol>; items that parse to nothing are dropped. */
const parseList = (el: HTMLElement, tag: "ul" | "ol"): SxBlock => {
  const items: SxBlock[][] = [];
  for (const child of el.childNodes) {
    if (child.nodeType !== NodeType.ELEMENT_NODE) continue;
    const childEl = child as HTMLElement;
    if (childEl.tagName?.toLowerCase() !== "li") continue;
    const itemBlocks = parseListItem(childEl);
    if (itemBlocks.length > 0) items.push(itemBlocks);
  }
  return { t: tag, items };
};

/** True when at least one inline is something other than whitespace-only text. */
const hasVisibleInline = (inlines: SxInline[]): boolean =>
  inlines.some((i) => !(i.t === "text" && i.v.trim() === ""));

// Walk an <li>'s children in source-order. Inline runs collect into
// paragraphs; block-level children (nested ul/ol/blockquote/pre/…)
// flush the current inline buffer and recurse as their own block.
// Without this split, parseInline would walk into nested <ul>/<ol> and the
// inner text would leak into the outer paragraph.
const parseListItem = (li: HTMLElement): SxBlock[] => {
  const result: SxBlock[] = [];
  let inlineBuf: Node[] = [];

  const flushInlines = (): void => {
    if (inlineBuf.length === 0) return;
    const inlines = parseInline(inlineBuf, []);
    inlineBuf = [];
    // Indentation / newline text nodes around nested blocks must not turn
    // into empty paragraphs — same rule the <p> path applies.
    if (hasVisibleInline(inlines)) result.push({ t: "p", c: inlines });
  };

  for (const node of li.childNodes) {
    if (node.nodeType === NodeType.ELEMENT_NODE) {
      const t = (node as HTMLElement).tagName?.toLowerCase();
      if (t && BLOCK_TAGS.has(t)) {
        flushInlines();
        pushBlocksFromNode(node, result);
        continue;
      }
    }
    inlineBuf.push(node);
  }
  flushInlines();
  return result;
};

/**
 * Build a quote block from a <blockquote>. Children go through the block
 * dispatcher; if that yields nothing, the content is retried as one inline
 * paragraph.
 */
const parseQuote = (el: HTMLElement): SxBlock => {
  // NOTE(review): loose inline elements directly inside the blockquote
  // (e.g. <blockquote>Hi <b>x</b></blockquote>) take the dispatcher's
  // escape hatch and become `html` blocks — confirm that is intended.
  const inner: SxBlock[] = [];
  for (const child of el.childNodes) pushBlocksFromNode(child, inner);
  if (inner.length === 0) {
    const inlines = parseInline(el.childNodes, []);
    // Skip whitespace-only content so an empty quote stays empty.
    if (hasVisibleInline(inlines)) inner.push({ t: "p", c: inlines });
  }
  return { t: "quote", c: inner };
};

/** Build a code block from a <pre>, reading the language from a `language-*` class. */
const parseCodeBlock = (el: HTMLElement): SxBlock => {
  // Canonical shape: <pre><code class="language-x">…</code></pre>.
  // Loose <pre>text</pre> also supported.
  const codeChild = el.querySelector("code");
  const inner = codeChild ?? el;
  const lang = parseLangFromClass(inner.getAttribute("class") ?? "");
  return { t: "code", lang, src: decodeEntities(inner.innerHTML) };
};

/** Build an image block from an <img>; returns null when `src` is missing or empty. */
const parseImg = (el: HTMLElement): SxBlock | null => {
  const src = el.getAttribute("src") ?? "";
  if (!src) return null;
  const block: { t: "img"; src: string; alt?: string; w?: number; h?: number } = { t: "img", src };
  const alt = el.getAttribute("alt");
  if (alt) block.alt = alt;
  const w = numAttr(el, "width");
  if (w !== undefined) block.w = w;
  const h = numAttr(el, "height");
  if (h !== undefined) block.h = h;
  return block as SxBlock;
};

/**
 * Build an image block (with optional caption) from a <figure>. A figure
 * without a usable <img src> is preserved verbatim as an `html` block.
 */
const parseFigure = (el: HTMLElement): SxBlock => {
  const img = el.querySelector("img");
  const caption = el.querySelector("figcaption");
  if (img) {
    const src = img.getAttribute("src") ?? "";
    if (src) {
      const block: {
        t: "img";
        src: string;
        alt?: string;
        caption?: string;
        w?: number;
        h?: number;
      } = { t: "img", src };
      const alt = img.getAttribute("alt");
      if (alt) block.alt = alt;
      if (caption) block.caption = caption.text;
      const w = numAttr(img, "width");
      if (w !== undefined) block.w = w;
      const h = numAttr(img, "height");
      if (h !== undefined) block.h = h;
      return block as SxBlock;
    }
  }
  return { t: "html", src: el.outerHTML };
};

// ─── inline parsing ──────────────────────────────────────────────────────

/**
 * Flatten DOM nodes into inline runs, carrying the marks accumulated from
 * enclosing formatting tags.
 *
 * @param nodes Child nodes to walk; `undefined` yields an empty result.
 * @param marks Marks inherited from ancestors (not mutated).
 */
const parseInline = (nodes: Node[] | undefined, marks: SxMark[]): SxInline[] => {
  if (!nodes) return [];
  const out: SxInline[] = [];
  for (const node of nodes) {
    if (node.nodeType === NodeType.TEXT_NODE) {
      // NOTE(review): node-html-parser's `text` getter may already decode
      // entities; if so this second pass double-decodes input such as
      // "&amp;lt;" — confirm against the installed version.
      const v = decodeEntities(node.text ?? "");
      if (v.length > 0) {
        out.push({ t: "text", v, ...(marks.length ? { m: dedupeMarks(marks) } : {}) });
      }
      continue;
    }
    if (node.nodeType !== NodeType.ELEMENT_NODE) continue;

    const el = node as HTMLElement;
    const tag = el.tagName?.toLowerCase();
    if (!tag) continue;

    if (tag === "br") {
      out.push({ t: "text", v: "\n", ...(marks.length ? { m: dedupeMarks(marks) } : {}) });
      continue;
    }
    if (tag === "a") {
      const href = el.getAttribute("href") ?? "";
      out.push({ t: "a", href, c: parseInline(el.childNodes, marks) });
      continue;
    }
    const mark = MARK_FOR_TAG[tag];
    if (mark) {
      out.push(...parseInline(el.childNodes, [...marks, mark]));
      continue;
    }
    // Any other element (e.g. <span>) — strip wrapper, keep contents.
    out.push(...parseInline(el.childNodes, marks));
  }
  return out;
};

/** Remove repeated marks while keeping first-seen order (nested <b><strong> → one "b"). */
const dedupeMarks = (marks: SxMark[]): SxMark[] => {
  const seen = new Set<SxMark>();
  const out: SxMark[] = [];
  for (const m of marks) {
    if (!seen.has(m)) {
      seen.add(m);
      out.push(m);
    }
  }
  return out;
};

// ─── shortcode lifting ──────────────────────────────────────────────────

// When a <p> contains [[sx:foo]] tokens mixed with text, split it into
// (paragraph)(shortcode)(paragraph) blocks so the document is queryable
// per-shortcode rather than per-paragraph-with-substring.
const splitShortcodesFromParagraph = (inlines: SxInline[]): SxBlock[] => {
  const out: SxBlock[] = [];
  let buf: SxInline[] = [];

  const flush = (): void => {
    if (hasVisibleInline(buf)) out.push({ t: "p", c: buf });
    buf = [];
  };

  for (const i of inlines) {
    // Non-global probe: no lastIndex state to reset between iterations.
    if (i.t !== "text" || !SHORTCODE_PROBE_RE.test(i.v)) {
      buf.push(i);
      continue;
    }
    const blocks = textWithShortcodesToBlocks(i.v, i.m ?? []);
    for (const b of blocks) {
      if (b.t === "shortcode") {
        flush();
        out.push(b);
      } else if (b.t === "p") {
        for (const inner of b.c) buf.push(inner);
      }
    }
  }
  flush();
  return out;
};

/**
 * Split a text run into alternating paragraph and shortcode blocks.
 * Whitespace-only text between shortcodes is dropped.
 *
 * @param text  Raw text that may contain [[sx:…]] tokens.
 * @param marks Marks applied to every text fragment that is emitted.
 */
const textWithShortcodesToBlocks = (text: string, marks: SxMark[]): SxBlock[] => {
  const out: SxBlock[] = [];
  let last = 0;
  // matchAll starts from the regex's current lastIndex; pin it to 0.
  SHORTCODE_RE.lastIndex = 0;
  for (const m of text.matchAll(SHORTCODE_RE)) {
    const idx = m.index ?? 0;
    if (idx > last) {
      const before = text.slice(last, idx);
      if (before.trim() !== "") {
        out.push({ t: "p", c: [{ t: "text", v: before, ...(marks.length ? { m: marks } : {}) }] });
      }
    }
    const name = m[1]!;
    const args: Record<string, string> = {};
    for (const a of (m[2] ?? "").matchAll(SHORTCODE_ARG_RE)) {
      // Group 2 is the quoted value, group 3 the bare one.
      args[a[1]!] = a[2] ?? a[3] ?? "";
    }
    out.push({ t: "shortcode", name, args });
    last = idx + m[0].length;
  }
  const tail = text.slice(last);
  if (tail.trim() !== "") {
    out.push({ t: "p", c: [{ t: "text", v: tail, ...(marks.length ? { m: marks } : {}) }] });
  }
  return out;
};

// ─── small helpers ───────────────────────────────────────────────────────

/** Extract `x` from a `language-x` class token; "" when there is none. */
const parseLangFromClass = (cls: string): string => {
  const m = cls.match(/(?:^|\s)language-([\w-]+)/);
  return m?.[1] ?? "";
};

/** Read an integer attribute; undefined when it is absent, empty, or not numeric. */
const numAttr = (el: HTMLElement, name: string): number | undefined => {
  const v = el.getAttribute(name);
  if (!v) return undefined;
  const n = parseInt(v, 10);
  return Number.isFinite(n) ? n : undefined;
};

// Entities this module decodes by hand.
// NOTE(review): &nbsp; is mapped to a regular space here — confirm that is
// the intended normalization rather than U+00A0.
const ENTITY_TABLE: Record<string, string> = {
  "&amp;": "&",
  "&lt;": "<",
  "&gt;": ">",
  "&quot;": '"',
  "&#39;": "'",
  "&apos;": "'",
  "&nbsp;": " ",
};
const ENTITY_RE = /&(?:amp|lt|gt|quot|apos|nbsp|#39);/g;

/**
 * Decode the handful of entities listed in ENTITY_TABLE.
 *
 * Done in a single pass so each entity is decoded exactly once: a chain of
 * replaces that handles "&amp;" first would turn "&amp;lt;" into "<"
 * instead of the literal text "&lt;".
 */
const decodeEntities = (s: string): string =>
  s.replace(ENTITY_RE, (entity) => ENTITY_TABLE[entity] ?? entity);