syntaxai/tdd.md · main · src / a31_diff_parse.ts

a31_diff_parse.ts 161 lines · 5134 bytes raw
// c31 — model: pure parser for unified-diff output. Takes the raw text
// emitted by `git diff` / Forgejo's `.diff` endpoint and produces the
// structured shape c51_render_commit consumes. Performs no I/O and makes
// no assumptions about where the text came from — handed a string,
// returns a tree.

/** Role of a single hunk body line: unchanged context, an addition, or a removal. */
export type DiffLineKind = "context" | "added" | "removed";

/** One body line of a hunk, with its position on the old and new side. */
export interface DiffLine {
  // Which side(s) of the diff this line belongs to.
  kind: DiffLineKind;
  // Line content with the leading marker column ("+", "-" or " ")
  // removed.
  text: string;
  // 1-based line numbers in the old / new file. Null for the side
  // that doesn't have this line (e.g. additions have oldNum:null,
  // removals have newNum:null). Context lines carry both.
  oldNum: number | null;
  newNum: number | null;
}

/** One changed region of a file: the values of its "@@ ... @@" header plus its body lines. */
export interface DiffHunk {
  // Old-side range from the header: first line number and line count.
  oldStart: number;
  oldLength: number;
  // New-side range from the header: first line number and line count.
  newStart: number;
  newLength: number;
  // Text that follows the closing "@@" of the header, with surrounding
  // whitespace trimmed. Often the surrounding function/section name.
  // Free text, may be empty.
  heading: string;
  // Body lines in the order they appear in the diff.
  lines: DiffLine[];
}

/** Everything parsed for one file of the diff: paths, change status, hunks and line totals. */
export interface DiffFile {
  // Path on the new side. For deletes this is the old path mirrored
  // here so one field is enough to render a row.
  path: string;
  // Path on the old side. Expected to differ from `path` only for
  // renames; equal to `path` for straightforward edits.
  oldPath: string;
  // Kind of change. "modified" is the default; the other values are
  // set when the matching file-mode / rename header line is seen.
  status: "added" | "removed" | "modified" | "renamed";
  // Hunks in the order they appear in the diff. May be empty.
  hunks: DiffHunk[];
  // Totals of "+" and "-" body lines across all hunks of this file.
  added: number;
  removed: number;
}

/** Result of parsing a whole diff: one entry per file, in the order the files appear. */
export interface ParsedDiff {
  files: DiffFile[];
}

// Matcher for a hunk header of the form
// `@@ -oldStart[,oldLength] +newStart[,newLength] @@ heading`.
// Groups: 1 = old start, 2 = old length (optional), 3 = new start,
// 4 = new length (optional), 5 = everything after the closing "@@".
const HUNK_HEADER = /^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)$/;

/**
 * Decode one hunk header line into its ranges and heading.
 *
 * @param line A single line of diff text, without its line terminator.
 * @returns The header fields (everything in `DiffHunk` except `lines`),
 *   or null when `line` is not a hunk header. A missing length defaults
 *   to 1, as the unified format allows omitting it; the heading is
 *   trimmed.
 */
const parseHunkHeader = (line: string): Omit<DiffHunk, "lines"> | null => {
  const match = HUNK_HEADER.exec(line);
  if (match === null) {
    return null;
  }

  const [, oldStartDigits, oldLengthDigits, newStartDigits, newLengthDigits, trailing] = match;

  // An omitted ",length" part means a length of exactly one line.
  const lengthOrOne = (digits: string | undefined): number =>
    digits === undefined ? 1 : parseInt(digits, 10);

  const heading = (trailing ?? "").trim();
  return {
    oldStart: parseInt(oldStartDigits!, 10),
    oldLength: lengthOrOne(oldLengthDigits),
    newStart: parseInt(newStartDigits!, 10),
    newLength: lengthOrOne(newLengthDigits),
    heading,
  };
};

/**
 * Parse unified-diff text into files, hunks and numbered lines.
 *
 * Every `diff --git` line starts a new file. Lines between that header
 * and the file's first `@@` header are treated as metadata (mode,
 * rename, index, ---/+++ lines). Once a hunk is open, every line is
 * hunk body and is classified by its first character only — so a
 * removed line whose text begins with "-- " (seen as "--- ...") or an
 * added line whose text begins with "++ " is kept as content instead of
 * being mistaken for a file header. Unrecognised lines are skipped.
 *
 * @param raw Diff text with "\n" line separators.
 * @returns The files in input order; `files` is empty when `raw`
 *   contains no `diff --git` header.
 */
export const parseUnifiedDiff = (raw: string): ParsedDiff => {
  const renameFromPrefix = "rename from ";
  const renameToPrefix = "rename to ";
  // Extracts paths from "a/X b/Y". Git quotes paths only when special
  // chars are present, which we don't expect for our markdown content;
  // a quoted header simply yields empty paths here.
  const fileHeader = /^diff --git a\/(.+) b\/(.+)$/;

  const files: DiffFile[] = [];
  let currentFile: DiffFile | null = null;
  let currentHunk: DiffHunk | null = null;
  let oldLineNo = 0;
  let newLineNo = 0;

  const lines = raw.split("\n");
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i] ?? "";

    // File boundary. No hunk body line can start with "diff --git "
    // because body lines always start with a marker character.
    if (line.startsWith("diff --git ")) {
      const m = fileHeader.exec(line);
      currentFile = {
        path: m?.[2] ?? "",
        oldPath: m?.[1] ?? "",
        status: "modified",
        hunks: [],
        added: 0,
        removed: 0,
      };
      currentHunk = null;
      files.push(currentFile);
      continue;
    }

    if (currentFile === null) continue; // preamble, skip

    if (line.startsWith("@@")) {
      const header = parseHunkHeader(line);
      if (!header) continue;
      currentHunk = { ...header, lines: [] };
      currentFile.hunks.push(currentHunk);
      oldLineNo = header.oldStart;
      newLineNo = header.newStart;
      continue;
    }

    if (currentHunk === null) {
      // Metadata section of the current file. Only the lines that
      // change what we report are inspected; index, ---/+++,
      // similarity, "Binary files" and anything else are ignored.
      if (line.startsWith("new file mode")) {
        currentFile.status = "added";
      } else if (line.startsWith("deleted file mode")) {
        currentFile.status = "removed";
      } else if (line.startsWith(renameFromPrefix)) {
        currentFile.status = "renamed";
        // The rename lines carry each path on its own, so they stay
        // correct even when a path contains " b/" and the "a/X b/Y"
        // split above guessed wrong.
        const from = line.slice(renameFromPrefix.length);
        if (from !== "") currentFile.oldPath = from;
      } else if (line.startsWith(renameToPrefix)) {
        currentFile.status = "renamed";
        const to = line.slice(renameToPrefix.length);
        if (to !== "") currentFile.path = to;
      }
      continue;
    }

    // Body lines — first char is the marker. A completely empty line
    // has no marker and is treated like a context line, subject to the
    // stray-line check below.
    const marker = line[0] ?? " ";
    const text = line.slice(1);

    if (marker === "+") {
      currentHunk.lines.push({ kind: "added", text, oldNum: null, newNum: newLineNo });
      newLineNo++;
      currentFile.added++;
    } else if (marker === "-") {
      currentHunk.lines.push({ kind: "removed", text, oldNum: oldLineNo, newNum: null });
      oldLineNo++;
      currentFile.removed++;
    } else if (marker === " ") {
      if (line === "") {
        // Skip a stray empty line that is followed by the next
        // "diff --git", by another empty line, or by the end of the
        // input (the "" left over from a trailing "\n") — it's not a
        // real context line.
        const next = lines[i + 1] ?? "";
        if (next === "" || next.startsWith("diff --git ")) continue;
      }
      currentHunk.lines.push({ kind: "context", text, oldNum: oldLineNo, newNum: newLineNo });
      oldLineNo++;
      newLineNo++;
    }
    // Any other marker, including "\" ("\ No newline at end of file"),
    // is informational and contributes no line.
  }

  return { files };
};