scripts / smoke-large-corpus-routing.ts

import fs from "node:fs/promises";
import os from "node:os";
import path from "node:path";

import { orchestrateRagRequest } from "../packages/core/src/orchestrator";
import { createDefaultMcpRuntime } from "../packages/mcp-server/src/defaultRuntime";

function assert(condition: unknown, message: string): asserts condition {
  if (!condition) {
    throw new Error(message);
  }
}

async function main() {
  const tempRoot = await fs.mkdtemp(path.join(os.tmpdir(), "rag-v2-large-corpus-"));
  const runtime = createDefaultMcpRuntime();

  try {
    const exportDir = path.join(tempRoot, "official-export");
    await fs.mkdir(exportDir, { recursive: true });
    await fs.writeFile(
      path.join(exportDir, "chat.html"),
      "<html><body><h1>Chat export</h1><p>Conversation summary.</p></body></html>"
    );
    await fs.writeFile(
      path.join(exportDir, "image-1.png"),
      Buffer.from([0x89, 0x50, 0x4e, 0x47])
    );
    await fs.writeFile(
      path.join(exportDir, "image-2.jpg"),
      Buffer.from([0xff, 0xd8, 0xff, 0xe0])
    );
    await fs.writeFile(
      path.join(exportDir, "image-3.webp"),
      Buffer.from("RIFFWEBP")
    );

    const globalResult = await orchestrateRagRequest(
      {
        query: "What themes dominate this export overall?",
        paths: [exportDir],
        outputMode: "prepared-prompt",
      },
      runtime
    );

    assert(
      globalResult.route === "global-summary",
      "Expected overview question on directory to use global-summary route."
    );
    assert(
      globalResult.preparedPrompt.includes("manifest:"),
      "Expected prepared prompt to include generated directory manifest context."
    );

    const jsonlPath = path.join(tempRoot, "dataclaw_export.jsonl");
    const repeatedLine = JSON.stringify({
      conversation_id: "conv-1",
      role: "user",
      content: "Session tool usage and metadata sample.",
    });
    const targetLine = JSON.stringify({
      conversation_id: "conv-999",
      role: "assistant",
      content:
        "Target tool usage entry with export routing evidence and hierarchy marker.",
    });
    const largeJsonl = [
      ...Array.from({ length: 8500 }, () => repeatedLine),
      targetLine,
      ...Array.from({ length: 499 }, () => repeatedLine),
    ].join("\n");
    await fs.writeFile(jsonlPath, `${largeJsonl}\n`, "utf8");

    const localResult = await orchestrateRagRequest(
      {
        query:
          "Find the specific export routing evidence and hierarchy marker entry in this file.",
        paths: [jsonlPath],
        outputMode: "search-results",
      },
      runtime
    );

    assert(
      localResult.route === "hierarchical-retrieval",
      "Expected oversized text file with local lookup query to use hierarchical-retrieval route."
    );
    assert(
      localResult.diagnostics.notes?.some((note) => note.includes("scope=local")),
      "Expected diagnostics to record large-corpus local classification."
    );
    assert(
      localResult.diagnostics.notes?.some((note) => note.includes("Built hierarchical index")),
      "Expected diagnostics to report hierarchical index construction."
    );
    assert(
      localResult.candidates.length > 0,
      "Expected hierarchical retrieval to produce candidates."
    );
    assert(
      localResult.candidates.some(
        (candidate) =>
          candidate.content.includes("hierarchy marker") &&
          candidate.metadata?.retrievalMode === "hierarchical-retrieval"
      ),
      "Expected hierarchical retrieval to surface the target chunk with hierarchical metadata."
    );

    const cachedLocalResult = await orchestrateRagRequest(
      {
        query:
          "Find the specific export routing evidence and hierarchy marker entry in this file.",
        paths: [jsonlPath],
        outputMode: "search-results",
      },
      runtime
    );

    assert(
      cachedLocalResult.diagnostics.notes?.some((note) =>
        note.includes("Reused cached large-corpus analysis")
      ),
      "Expected repeated large-corpus request to reuse cached analysis."
    );

    const correctiveResult = await orchestrateRagRequest(
      {
        query:
          "Find the hierarchy marker entry and the nonexistent sentinel field in this file.",
        paths: [jsonlPath],
        outputMode: "search-results",
      },
      runtime
    );

    assert(
      correctiveResult.diagnostics.notes?.some((note) =>
        note.includes("Weak hierarchical evidence triggered a corrective retry")
      ),
      "Expected weak hierarchical evidence to trigger a corrective retry."
    );

    console.log("Large-corpus routing smoke test passed.");
    console.log(`Global route: ${globalResult.route}`);
    console.log(`Local route: ${localResult.route}`);
  } finally {
    await fs.rm(tempRoot, { recursive: true, force: true });
  }
}

main().catch((error) => {
  const message = error instanceof Error ? error.message : String(error);
  console.error(`Large-corpus routing smoke test failed: ${message}`);
  process.exit(1);
});