// syntaxai/tdd.md · main · src / a31_sxdoc_parse.ts
// c31 — HTML → SxDocument parser.
//
// SAMA placement: c31 because this is a parser for external input —
// Modeled.md is explicit: "every external input has a parser in a c31_*
// model — types and parse-functions colocated". HTML strings reach this
// file from the editor's save POST, from the markdown-import script, and
// from the AI-edit response — all "outside the process" → c31.
//
// Why a typed tree and not HTML strings: see the header of a31_sxdoc.ts
// (the types module imported below).
//
// Why node-html-parser and not Bun's HTMLRewriter: we need a tree we can
// recurse over, not a streaming filter. The dep is pure-logic (no I/O,
// no fs, no spawn) so it doesn't push the file into c14 territory.
import { parse, type HTMLElement, type Node, NodeType } from "node-html-parser";
import type { SxDocument, SxBlock, SxInline, SxMark } from "./a31_sxdoc.ts";
import { SX_DOC_VERSION } from "./a31_sxdoc.ts";
// Matches one shortcode token of the form
//   [[sx:name key=value other="quoted value"]]
// Group 1 is the shortcode name (lowercase letter, then lowercase
// letters / digits / hyphens). Group 2 is the raw, still-unparsed
// argument list including its leading whitespace (empty when there are
// no arguments).
// The regex is global, so .test()/.exec() advance lastIndex; callers in
// this file reset lastIndex to 0 before scanning a new string.
const SHORTCODE_RE = /\[\[sx:([a-z][a-z0-9-]*)((?:\s+[a-z0-9_-]+=(?:"[^"]*"|[^\s"\]]+))*)\s*\]\]/g;
// Matches one key=value pair inside group 2 of SHORTCODE_RE.
// Group 1 is the key; the value is group 2 when it was double-quoted
// (quotes stripped) or group 3 when it was a bare run of characters.
const SHORTCODE_ARG_RE = /([a-z0-9_-]+)=(?:"([^"]*)"|([^\s"\]]+))/g;
// Heading tag names. Membership routes an element to the heading branch
// of pushBlocksFromNode, where the digit after "h" is parsed as the
// heading level.
const HEADING_TAGS = new Set(["h1", "h2", "h3", "h4", "h5", "h6"]);
// Block-level tags — used by parseListItem to know where to stop
// collecting inlines and recurse instead. Keep in sync with the
// pushBlocksFromNode dispatcher defined further down in this file.
// "table" has no dedicated branch in that dispatcher, so a <table>
// ends up in its { t: "html" } escape hatch.
const BLOCK_TAGS = new Set([
"p", "h1", "h2", "h3", "h4", "h5", "h6",
"ul", "ol", "blockquote", "pre",
"img", "figure", "hr",
"div", "section", "article", "table",
]);
// Inline formatting tag → the SxMark it contributes. Several HTML
// spellings collapse onto the same mark (<b>/<strong> → "b", etc.).
//
// The table is built on a null-prototype object on purpose: it is
// indexed with tag names taken from untrusted HTML, and on an ordinary
// object literal a lookup such as MARK_FOR_TAG["constructor"] returns
// the inherited Object.prototype.constructor function instead of
// undefined, which a caller would then treat as a mark.
const MARK_FOR_TAG: Record<string, SxMark> = Object.assign<
  Record<string, SxMark>,
  Record<string, SxMark>
>(Object.create(null), {
  b: "b", strong: "b",
  i: "i", em: "i",
  u: "u",
  s: "s", strike: "s", del: "s",
  code: "c",
});
/**
 * Parse an HTML string into an SxDocument.
 *
 * The input is wrapped in a synthetic <root> element so there is always
 * exactly one parent whose childNodes can be walked, whether or not the
 * input brought its own wrapper element. Each top-level child is handed
 * to pushBlocksFromNode, which appends the resulting blocks in order.
 *
 * NOTE(review): `blockTextElements` replaces the parser's default set of
 * raw-text elements; the exact effect of `false` for script/style should
 * be confirmed against the node-html-parser documentation.
 *
 * @param html HTML source from outside the process.
 * @returns A document stamped with SX_DOC_VERSION and the parsed blocks.
 */
export const htmlToSx = (html: string): SxDocument => {
  const parsed = parse(`<root>${html}</root>`, {
    blockTextElements: { script: false, style: false },
  });
  const wrapper = parsed.firstChild as HTMLElement;
  const blocks: SxBlock[] = [];
  wrapper.childNodes.forEach((child) => {
    pushBlocksFromNode(child, blocks);
  });
  return { v: SX_DOC_VERSION, blocks };
};
// ─── block-level dispatch ────────────────────────────────────────────────
// Convert one node that sits at block level into zero or more SxBlocks,
// appended to `out` in document order.
//
//   - text node: trimmed; non-empty text becomes paragraph / shortcode
//     blocks via textWithShortcodesToBlocks
//   - <p>, headings, lists, <blockquote>, <pre>, <img>, <figure>, <hr>:
//     handed to the matching per-block parser
//   - <div>/<section>/<article>: transparent wrappers, their children
//     are dispatched in place
//   - any other element (inline tags that appear at block level
//     included): kept verbatim as an { t: "html" } block
const pushBlocksFromNode = (node: Node, out: SxBlock[]): void => {
  if (node.nodeType === NodeType.TEXT_NODE) {
    const trimmed = (node.text ?? "").trim();
    if (trimmed !== "") out.push(...textWithShortcodesToBlocks(trimmed, []));
    return;
  }
  if (node.nodeType !== NodeType.ELEMENT_NODE) return;

  const el = node as HTMLElement;
  const tag = el.tagName?.toLowerCase();
  // No tag name, or the "!" / "comment" pseudo-tags under which
  // comments and processing instructions may surface: not content.
  if (!tag || tag === "!" || tag === "comment") return;

  if (HEADING_TAGS.has(tag)) {
    out.push({
      t: "h",
      level: parseInt(tag.slice(1), 10) as 1 | 2 | 3 | 4 | 5 | 6,
      c: parseInline(el.childNodes, []),
    });
    return;
  }

  switch (tag) {
    case "p": {
      const inlines = parseInline(el.childNodes, []);
      // Empty paragraphs are dropped rather than emitted.
      if (inlines.length > 0) out.push(...splitShortcodesFromParagraph(inlines));
      return;
    }
    case "ul":
    case "ol":
      out.push(parseList(el, tag));
      return;
    case "blockquote":
      out.push(parseQuote(el));
      return;
    case "pre":
      out.push(parseCodeBlock(el));
      return;
    case "img": {
      // parseImg yields null for an <img> without a src.
      const img = parseImg(el);
      if (img) out.push(img);
      return;
    }
    case "figure":
      out.push(parseFigure(el));
      return;
    case "hr":
      out.push({ t: "hr" });
      return;
    case "div":
    case "section":
    case "article":
      for (const child of el.childNodes) pushBlocksFromNode(child, out);
      return;
    default:
      // Anything else → escape hatch so round-tripping stays lossless.
      out.push({ t: "html", src: el.outerHTML });
  }
};
// ─── per-block parsers ───────────────────────────────────────────────────
// Build a list block from a <ul>/<ol>. Only direct <li> element children
// count; every other child node is ignored. Each <li> is parsed by
// parseListItem, and items that come back with no blocks are dropped.
const parseList = (el: HTMLElement, tag: "ul" | "ol"): SxBlock => {
  const items: SxBlock[][] = el.childNodes
    .filter(
      (child) =>
        child.nodeType === NodeType.ELEMENT_NODE &&
        (child as HTMLElement).tagName?.toLowerCase() === "li",
    )
    .map((li) => parseListItem(li as HTMLElement))
    .filter((itemBlocks) => itemBlocks.length > 0);
  return { t: tag, items };
};
// Walk an <li>'s children in source order. Runs of inline nodes collect
// into paragraphs; block-level children (nested ul/ol/blockquote/pre/…,
// see BLOCK_TAGS) flush the current inline run and are dispatched as
// their own blocks. Without this split, parseInline would walk into a
// nested <ul> and the inner text would leak into the outer paragraph.
//
// Inline runs made up only of whitespace text (typically the newlines
// and indentation between block children in pretty-printed HTML) are
// discarded instead of becoming whitespace-only paragraphs. This is the
// same "has real content" test splitShortcodesFromParagraph applies to
// <p> content, so an <li> holding nothing but whitespace yields no
// blocks and is dropped by parseList.
//
// Returns the item's blocks in document order (possibly empty).
const parseListItem = (li: HTMLElement): SxBlock[] => {
  const result: SxBlock[] = [];
  let inlineBuf: Node[] = [];

  const flushInlines = (): void => {
    if (inlineBuf.length === 0) return;
    const inlines = parseInline(inlineBuf, []);
    inlineBuf = [];
    // Links and non-blank text count as content; blank text does not.
    const hasContent = inlines.some((i) => i.t !== "text" || i.v.trim() !== "");
    if (hasContent) result.push({ t: "p", c: inlines });
  };

  for (const node of li.childNodes) {
    if (node.nodeType === NodeType.ELEMENT_NODE) {
      const t = (node as HTMLElement).tagName?.toLowerCase();
      if (t && BLOCK_TAGS.has(t)) {
        // Close the paragraph in progress before the block child so
        // source order is preserved.
        flushInlines();
        pushBlocksFromNode(node, result);
        continue;
      }
    }
    inlineBuf.push(node);
  }
  flushInlines();
  return result;
};
// Build a quote block from a <blockquote>. Children are first dispatched
// as block-level nodes; only when that produces nothing is the content
// re-read as a single inline run and wrapped in one paragraph.
//
// NOTE(review): because text nodes and unknown elements both produce
// blocks in pushBlocksFromNode, the inline fallback is reached only
// when no child yielded a block. Inline elements placed directly inside
// the <blockquote> therefore go through the block dispatcher — confirm
// that this is intended.
const parseQuote = (el: HTMLElement): SxBlock => {
  const inner: SxBlock[] = [];
  el.childNodes.forEach((child) => {
    pushBlocksFromNode(child, inner);
  });
  if (inner.length > 0) return { t: "quote", c: inner };

  const inlines = parseInline(el.childNodes, []);
  return { t: "quote", c: inlines.length > 0 ? [{ t: "p", c: inlines }] : inner };
};
// Build a code block from a <pre>.
// Canonical shape is <pre><code class="language-X">…</code></pre>; a
// bare <pre>text</pre> is accepted too, in which case the <pre> itself
// supplies both the class attribute and the content. The language comes
// from a "language-…" class (empty string when absent) and the source is
// the holder's innerHTML passed through decodeEntities.
const parseCodeBlock = (el: HTMLElement): SxBlock => {
  const holder = el.querySelector("code") ?? el;
  const className = holder.getAttribute("class") ?? "";
  return {
    t: "code",
    lang: parseLangFromClass(className),
    src: decodeEntities(holder.innerHTML),
  };
};
// Build an image block from an <img>. Returns null when the element has
// no (or an empty) src. alt is copied only when non-empty; w / h are
// copied only when numAttr can read a number from width / height.
const parseImg = (el: HTMLElement): SxBlock | null => {
  const src = el.getAttribute("src") ?? "";
  if (src === "") return null;

  const alt = el.getAttribute("alt");
  const width = numAttr(el, "width");
  const height = numAttr(el, "height");

  // Optional keys are spread in only when present, so absent attributes
  // leave no key on the block (key order: t, src, alt, w, h).
  return {
    t: "img",
    src,
    ...(alt ? { alt } : {}),
    ...(width !== undefined ? { w: width } : {}),
    ...(height !== undefined ? { h: height } : {}),
  } as SxBlock;
};
// Build a block from a <figure>. When the figure contains an <img> with
// a non-empty src it becomes an image block; a <figcaption>, if present,
// contributes its text as the caption. Any other figure is preserved
// verbatim as an { t: "html" } block.
const parseFigure = (el: HTMLElement): SxBlock => {
  const img = el.querySelector("img");
  const caption = el.querySelector("figcaption");
  const src = img ? (img.getAttribute("src") ?? "") : "";
  if (!img || src === "") return { t: "html", src: el.outerHTML };

  const alt = img.getAttribute("alt");
  const width = numAttr(img, "width");
  const height = numAttr(img, "height");

  // Key order: t, src, alt, caption, w, h. The caption key is set
  // whenever a <figcaption> element exists, even if its text is empty.
  return {
    t: "img",
    src,
    ...(alt ? { alt } : {}),
    ...(caption ? { caption: caption.text } : {}),
    ...(width !== undefined ? { w: width } : {}),
    ...(height !== undefined ? { h: height } : {}),
  } as SxBlock;
};
// ─── inline parsing ──────────────────────────────────────────────────────
// Flatten a run of DOM nodes into SxInline items.
//
//   nodes  the nodes to walk (undefined is treated as empty)
//   marks  formatting marks inherited from enclosing elements
//
// Rules:
//   - text node  → { t: "text" } carrying the inherited marks (deduped);
//                  empty strings are skipped
//   - <br>       → a text item whose value is "\n"
//   - <a>        → { t: "a" } whose children are parsed with the same
//                  marks; a missing href becomes ""
//   - mark tags (b/strong/i/em/u/s/strike/del/code, see MARK_FOR_TAG)
//                → children parsed with that mark added
//   - any other element (<span>, <font>, …) → wrapper stripped,
//                  children parsed with the marks unchanged
//   - other node types are ignored
const parseInline = (nodes: Node[] | undefined, marks: SxMark[]): SxInline[] => {
  if (!nodes) return [];
  const out: SxInline[] = [];
  // Omit the `m` key entirely when there are no marks.
  const markProps = (): { m?: SxMark[] } => (marks.length ? { m: dedupeMarks(marks) } : {});

  for (const node of nodes) {
    if (node.nodeType === NodeType.TEXT_NODE) {
      // NOTE(review): depending on the parser version, node.text may
      // already be entity-decoded — confirm this does not decode twice.
      const v = decodeEntities(node.text ?? "");
      if (v.length > 0) out.push({ t: "text", v, ...markProps() });
      continue;
    }
    if (node.nodeType !== NodeType.ELEMENT_NODE) continue;

    const el = node as HTMLElement;
    const tag = el.tagName?.toLowerCase();
    if (!tag) continue;

    if (tag === "br") {
      out.push({ t: "text", v: "\n", ...markProps() });
      continue;
    }
    if (tag === "a") {
      const href = el.getAttribute("href") ?? "";
      out.push({ t: "a", href, c: parseInline(el.childNodes, marks) });
      continue;
    }

    // Own-property check: `tag` comes from untrusted HTML, and a bare
    // MARK_FOR_TAG[tag] would pick up inherited members (for example
    // "constructor") when the table is an ordinary object.
    const mark = Object.prototype.hasOwnProperty.call(MARK_FOR_TAG, tag)
      ? MARK_FOR_TAG[tag]
      : undefined;
    // Mark tags extend the mark stack; every other element is only a
    // transparent wrapper around its children.
    out.push(...parseInline(el.childNodes, mark ? [...marks, mark] : marks));
  }
  return out;
};
// Return a new array with duplicate marks removed, keeping the first
// occurrence of each mark in its original position. A Set iterates in
// insertion order, which gives exactly that.
const dedupeMarks = (marks: SxMark[]): SxMark[] => Array.from(new Set<SxMark>(marks));
// ─── shortcode lifting ──────────────────────────────────────────────────
// Split a paragraph's inlines around [[sx:foo]] tokens. A <p> that mixes
// text and shortcodes becomes (paragraph)(shortcode)(paragraph) blocks,
// so the document can be queried per shortcode instead of per
// paragraph-with-substring.
//
// Inlines accumulate in `pending`. Each shortcode closes the pending
// paragraph and is emitted as its own block. A pending run that holds
// nothing but blank text is discarded rather than emitted.
const splitShortcodesFromParagraph = (inlines: SxInline[]): SxBlock[] => {
  const out: SxBlock[] = [];
  let pending: SxInline[] = [];

  const emitPending = (): void => {
    const hasContent = pending.some((item) => item.t !== "text" || item.v.trim() !== "");
    if (hasContent) out.push({ t: "p", c: pending });
    pending = [];
  };

  for (const piece of inlines) {
    if (piece.t === "text" && SHORTCODE_RE.test(piece.v)) {
      // .test() on a global regex leaves lastIndex after the match;
      // rewind before the text is scanned again.
      SHORTCODE_RE.lastIndex = 0;
      for (const blk of textWithShortcodesToBlocks(piece.v, piece.m ?? [])) {
        if (blk.t === "shortcode") {
          emitPending();
          out.push(blk);
        } else if (blk.t === "p") {
          // Text between shortcodes rejoins the running paragraph.
          pending.push(...blk.c);
        }
      }
    } else {
      // Non-text inlines and text without a shortcode pass through.
      pending.push(piece);
    }
  }
  emitPending();
  return out;
};
// Cut a plain string into blocks around its shortcode tokens.
//
//   text   the string to scan
//   marks  marks to attach to every emitted text inline (the `m` key is
//          omitted when the list is empty)
//
// Each shortcode becomes { t: "shortcode", name, args }. Each stretch of
// text before, between or after shortcodes becomes a one-inline
// paragraph, unless it is blank, in which case it is dropped. A string
// with no shortcode yields a single paragraph (or nothing when blank).
const textWithShortcodesToBlocks = (text: string, marks: SxMark[]): SxBlock[] => {
  const blocks: SxBlock[] = [];

  const pushText = (chunk: string): void => {
    if (chunk.trim() === "") return;
    blocks.push({
      t: "p",
      c: [{ t: "text", v: chunk, ...(marks.length ? { m: marks } : {}) }],
    });
  };

  let cursor = 0;
  // matchAll starts from the regex's current lastIndex, so rewind first.
  SHORTCODE_RE.lastIndex = 0;
  for (const hit of text.matchAll(SHORTCODE_RE)) {
    const start = hit.index ?? 0;
    pushText(text.slice(cursor, start));

    const args: Record<string, string> = {};
    for (const pair of (hit[2] ?? "").matchAll(SHORTCODE_ARG_RE)) {
      // Group 2 holds a quoted value, group 3 a bare one.
      args[pair[1]!] = pair[2] ?? pair[3] ?? "";
    }
    blocks.push({ t: "shortcode", name: hit[1]!, args });

    cursor = start + hit[0].length;
  }
  pushText(text.slice(cursor));
  return blocks;
};
// ─── small helpers ───────────────────────────────────────────────────────
// Extract the language from a class attribute value: the part after
// "language-" in the first class token that starts with that prefix.
// Returns "" when no such token exists.
const parseLangFromClass = (cls: string): string => {
  const hit = /(?:^|\s)language-([\w-]+)/.exec(cls);
  if (hit === null) return "";
  return hit[1] ?? "";
};
// Read an attribute as a base-10 integer. Returns undefined when the
// attribute is missing or empty, or when parseInt cannot produce a
// finite number from it. parseInt semantics apply: trailing non-digits
// are ignored, so "120px" reads as 120.
const numAttr = (el: HTMLElement, name: string): number | undefined => {
  const raw = el.getAttribute(name);
  const parsed = raw ? Number.parseInt(raw, 10) : Number.NaN;
  return Number.isFinite(parsed) ? parsed : undefined;
};
// Named character references this module decodes, keyed by the bare
// entity name (without the leading ampersand and trailing semicolon).
// NOTE(review): nbsp becomes an ordinary space, like the replacement
// string in the previous implementation — confirm that U+00A0 is not
// wanted here.
const ENTITY_CHARS: Readonly<Record<string, string>> = {
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  nbsp: " ",
};
// One character reference: decimal (group 1), hexadecimal (group 2) or
// named (group 3), always terminated by a semicolon.
const ENTITY_RE = /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g;

// Decode HTML character references in `s`.
//
// The previous chain of .replace() calls mapped each character to
// itself, so nothing was ever decoded. This version recognises the
// named references listed in ENTITY_CHARS plus decimal and hexadecimal
// numeric references.
//
// Decoding happens in a single pass, so the output of one replacement is
// never scanned again: an escaped ampersand followed by "lt;" decodes to
// the literal text of that reference, not to "<". Unknown names and
// out-of-range or surrogate code points are left exactly as written.
//
// Returns the decoded string; input without references is unchanged.
const decodeEntities = (s: string): string =>
  s.replace(ENTITY_RE, (whole: string, dec?: string, hex?: string, named?: string): string => {
    if (named !== undefined) {
      // Own-property check so names such as "constructor" are not
      // resolved through Object.prototype.
      const known = Object.prototype.hasOwnProperty.call(ENTITY_CHARS, named)
        ? ENTITY_CHARS[named]
        : undefined;
      return known ?? whole;
    }
    const codePoint = hex !== undefined ? parseInt(hex, 16) : parseInt(dec ?? "", 10);
    const isScalar =
      codePoint > 0 && codePoint <= 0x10ffff && !(codePoint >= 0xd800 && codePoint <= 0xdfff);
    return isScalar ? String.fromCodePoint(codePoint) : whole;
  });