syntaxai/tdd.md · main · src / c14_working_set_walker.ts

c14_working_set_walker.ts 106 lines · 3959 bytes raw
// c14 — adapter: filesystem walker that produces a polyglot
// WorkingSetFile summary for an external source tree (Go or Rust).
// Recursive directory walk; counts lines of each .go / .rs file using
// the same `content.split("\n").length` rule as b32_sama_v2_metrics so
// the cross-language metric matches the TS metric byte-for-byte.
//
// Skipped directories are the conventional non-source trees that
// would otherwise inflate the denominator with vendored / generated
// / build artefacts: .git, target/ (Rust build output), vendor/ (Go
// vendored deps), node_modules/ (incidental, defensive). Every other
// dot-prefixed directory (.github, .vscode, ...) is skipped as well.
//
// The walker is hermetic — given a path that is a directory it
// resolves the file set deterministically. Calls into the pure helper
// in b32_working_set_polyglot.ts for the ratio.

import { readdirSync, readFileSync, statSync } from "node:fs";
import { resolve } from "node:path";
import {
  computeWorkingSetFitPolyglot,
  type PolyglotLanguage,
  type WorkingSetFile,
  type WorkingSetResult,
} from "./b32_working_set_polyglot.ts";

// Directory names the walker never descends into, matched against the
// bare entry name at any depth of the tree.
const SKIPPED_DIRS: ReadonlySet<string> = new Set<string>([".git", "target", "vendor", "node_modules"]);

// File-name suffix (leading dot included) that marks a source file of
// each supported language.
const EXTENSION_FOR: Record<PolyglotLanguage, string> = { go: ".go", rust: ".rs" };

// Walk `repoRoot` recursively and return one {path, locCount} pair for
// every regular file whose name ends with the extension of `lang`.
//
// - `path` is relative to `repoRoot` and always joined with "/", so it
//   is stable across machines and platforms.
// - `locCount` is `content.split("\n").length`, the same rule as
//   b32_sama_v2_metrics (a trailing newline therefore counts as one
//   extra line; an empty file counts as 1).
// - Every dot-prefixed entry is skipped, directory or file. Hidden
//   files are never real sources, but some (e.g. the `._main.go`
//   AppleDouble files macOS leaves on foreign filesystems) carry a
//   source extension and would otherwise be counted.
// - Directories named in SKIPPED_DIRS are not descended into.
// - Entries the Dirent reports as neither directory nor regular file
//   are ignored.
// - A subdirectory that cannot be listed is skipped silently so one
//   bad subtree does not abort the whole measurement.
//
// The result is sorted by `path` in UTF-16 code-unit order. This is
// deliberately not `localeCompare`: collation depends on the host
// locale, which would make the order differ between machines.
//
// Throws if `repoRoot` does not exist or is not a directory, and if a
// matching file cannot be read.
export const collectPolyglotFiles = (
  repoRoot: string,
  lang: PolyglotLanguage,
): WorkingSetFile[] => {
  const ext = EXTENSION_FOR[lang];
  const out: WorkingSetFile[] = [];

  // List a directory, or return undefined when it cannot be read
  // (permissions, removed mid-walk, ...).
  const listDir = (absDir: string) => {
    try {
      return readdirSync(absDir, { withFileTypes: true });
    } catch {
      return undefined;
    }
  };

  const walk = (absDir: string, relDir: string): void => {
    const entries = listDir(absDir);
    if (entries === undefined) return;

    for (const e of entries) {
      // Hidden directories (.git, .github, .vscode, ...) and dotfiles.
      if (e.name.startsWith(".")) continue;

      const abs = resolve(absDir, e.name);
      const relPath = relDir === "" ? e.name : `${relDir}/${e.name}`;

      if (e.isDirectory()) {
        if (!SKIPPED_DIRS.has(e.name)) walk(abs, relPath);
        continue;
      }
      if (!e.isFile() || !e.name.endsWith(ext)) continue;

      const content = readFileSync(abs, "utf8");
      // Match b32_sama_v2_metrics.ts: lines = content.split("\n").length.
      out.push({ path: relPath, locCount: content.split("\n").length });
    }
  };

  const root = resolve(repoRoot);
  if (!statSync(root).isDirectory()) {
    throw new Error(`expected a directory, got: ${repoRoot}`);
  }
  walk(root, "");

  // readdirSync order is platform-dependent; sort by code unit so the
  // output is identical on every machine regardless of locale.
  out.sort((a, b) => (a.path < b.path ? -1 : a.path > b.path ? 1 : 0));
  return out;
};

// One-shot helper for the CLI script: gather the files of `lang` under
// `repoRoot`, feed them to computeWorkingSetFitPolyglot, and hand back
// that result with the file list attached under `files`.
export const measureWorkingSetForRepo = (
  repoRoot: string,
  lang: PolyglotLanguage,
): WorkingSetResult & { files: WorkingSetFile[] } => {
  const files = collectPolyglotFiles(repoRoot, lang);
  return { ...computeWorkingSetFitPolyglot(files, lang), files };
};