// src/local/search.ts

/**
 * @file local/search.ts
 * Bridges the local document store into the swarm search pipeline.
 *
 * Converts local document chunks into SearchHit and CrawledSource objects
 * that flow through the same scoring, deduplication, and reporting paths
 * as web-sourced content.
 *
 * Now supports:
 * - Progressive source retrieval (proprietary → internal → reference → general)
 * - Role-based auto-routing via library tags
 * - Context-enriched chunks (includes surrounding chunk text)
 * - Library priority boosting in relevance scores
 */

import { SearchHit, CrawledSource, WorkerRole, SourceTier } from "../types";
import { getGlobalStore, LocalSearchHit, LibraryPriority } from "./store";

/**
 * Maps each library priority to the source tier stamped on local sources.
 *
 * Read by localHitToCrawledSource to populate the `tier` field.
 *
 * NOTE(review): proprietary, internal and reference all resolve to
 * "reference"; only general gets its own tier. Confirm this collapse is
 * intended given the values SourceTier allows.
 */
const PRIORITY_TIER_MAP: Record<LibraryPriority, SourceTier> = {
  proprietary: "reference",
  internal: "reference",
  reference: "reference",
  general: "general",
};

/**
 * Maps each library priority to the domain score stamped on local sources.
 *
 * Read by localHitToCrawledSource to populate the `domainScore` field.
 * Higher-priority libraries receive higher scores.
 * NOTE(review): presumably on a 0–100 scale shared with web sources — confirm.
 */
const PRIORITY_DOMAIN_SCORES: Record<LibraryPriority, number> = {
  proprietary: 95,
  internal: 90,
  reference: 85,
  general: 75,
};

/**
 * Fixed freshness score given to every local source. Local hits are emitted
 * with `published: null`, so a single constant is used for all of them.
 * NOTE(review): presumably on the same scale as the domain scores — confirm.
 */
const LOCAL_FRESHNESS_SCORE = 70;

/**
 * Converts a local store hit into the lightweight SearchHit shape.
 *
 * - `url` is a synthetic `local://` address built from the library name, the
 *   file's relative path (or its file name when the relative path is empty)
 *   and the chunk index.
 * - `title` is the file name followed by the library name in parentheses.
 * - `snippet` is the first 250 characters of the chunk text with newline runs
 *   collapsed to single spaces and the ends trimmed.
 *
 * @param hit - Hit returned by the local document store.
 * @returns The equivalent SearchHit.
 */
function localHitToSearchHit(hit: LocalSearchHit): SearchHit {
  const { libraryName, fileName, chunkIndex } = hit;
  // Prefer the relative path; fall back to the bare file name when it is empty.
  const pathSegment = hit.fileRelPath || fileName;

  // Truncate first, then flatten newlines, so the preview is cut at 250 raw characters.
  const preview = hit.text.slice(0, 250);
  const flattened = preview.replace(/\n+/g, " ");

  const result: SearchHit = {
    url: `local://${libraryName}/${pathSegment}#chunk${chunkIndex}`,
    title: `${fileName} (${libraryName})`,
    snippet: flattened.trim(),
  };
  return result;
}

/**
 * Converts a local store hit into a CrawledSource so it can travel through
 * the same pipeline as crawled web pages.
 *
 * The source text is the chunk itself, optionally wrapped by the hit's
 * `contextBefore` / `contextAfter` (joined with a `---` rule), then truncated
 * to `contentLimit` characters.
 *
 * Scoring:
 * - `tier` and `domainScore` come from the library priority lookup tables.
 * - `relevanceScore` is `min(1, hit.score * 1.5)` plus a per-priority boost
 *   (proprietary 0.15, internal 0.1, reference 0.05, otherwise 0), capped at 1.
 * - `freshnessScore` is the fixed local constant; `published` is always null.
 *
 * Note: `wordCount` is copied from the hit as-is; it is not recomputed for the
 * context-enriched, truncated text.
 *
 * @param hit - Hit returned by the local document store.
 * @param query - Query that produced the hit; recorded as `sourceQuery`.
 * @param role - Worker role recorded as `workerRole`.
 * @param label - Worker label recorded as `workerLabel`.
 * @param contentLimit - Maximum number of characters kept in `text`.
 * @returns The hit expressed as a CrawledSource with `origin: "local"`.
 */
function localHitToCrawledSource(
  hit: LocalSearchHit,
  query: string,
  role: WorkerRole,
  label: string,
  contentLimit: number,
): CrawledSource {
  // Surrounding context is optional; only the sections that exist are joined.
  const sections: string[] = [];
  if (hit.contextBefore) {
    sections.push(hit.contextBefore);
  }
  sections.push(hit.text);
  if (hit.contextAfter) {
    sections.push(hit.contextAfter);
  }
  const text = sections.join("\n\n---\n\n").slice(0, contentLimit);

  const priority = hit.libraryPriority;
  const tier = PRIORITY_TIER_MAP[priority];
  const domainScore = PRIORITY_DOMAIN_SCORES[priority];

  const baseRelevance = Math.min(1, hit.score * 1.5);
  const priorityBoost =
    priority === "proprietary"
      ? 0.15
      : priority === "internal"
        ? 0.1
        : priority === "reference"
          ? 0.05
          : 0;
  const relevanceScore = Math.min(1, baseRelevance + priorityBoost);

  // Local sources are never redirected, so url and finalUrl are the same address.
  const url = `local://${hit.libraryName}/${hit.fileRelPath || hit.fileName}#chunk${hit.chunkIndex}`;

  // Separate file name and heading; previously they were fused together
  // (e.g. "guide.mdIntroduction").
  const title = hit.heading
    ? `${hit.fileName} — ${hit.heading} (${hit.libraryName})`
    : `${hit.fileName} (${hit.libraryName})`;

  return {
    url,
    finalUrl: url,
    title,
    description: text.slice(0, 250).replace(/\n+/g, " ").trim(),
    published: null,
    text,
    wordCount: hit.wordCount,
    outlinks: [],
    sourceQuery: query,
    workerRole: role,
    workerLabel: label,
    domainScore,
    freshnessScore: LOCAL_FRESHNESS_SCORE,
    tier,
    relevanceScore,
    origin: "local" as const,
  };
}

/**
 * Searches the local document store and returns the matches as SearchHit
 * objects.
 *
 * @param query - Search query forwarded to the store.
 * @param maxResults - Result cap forwarded to the store.
 * @param collectionIds - Optional collection ids forwarded to the store's
 *   search call.
 * @returns The converted hits, or an empty array when the store reports that
 *   it has no collections.
 */
export function searchLocalCollections(
  query: string,
  maxResults: number,
  collectionIds?: ReadonlyArray<string>,
): ReadonlyArray<SearchHit> {
  const documentStore = getGlobalStore();

  if (documentStore.hasCollections()) {
    const matches = documentStore.search(query, maxResults, collectionIds);
    return matches.map(localHitToSearchHit);
  }

  // Nothing indexed locally — skip the search entirely.
  return [];
}

/**
 * Searches the local document store on behalf of a worker role and returns
 * the matches as SearchHit objects.
 *
 * @param query - Search query forwarded to the store.
 * @param role - Worker role forwarded to the store's role-based search.
 * @param maxResults - Result cap forwarded to the store (defaults to 8).
 * @param roleCollectionMap - Optional role → collection ids mapping, passed
 *   through to the store's role-based search unchanged.
 * @returns The converted hits, or an empty array when the store reports that
 *   it has no collections.
 */
export function searchLocalForRole(
  query: string,
  role: WorkerRole,
  maxResults: number = 8,
  roleCollectionMap?: ReadonlyMap<string, ReadonlyArray<string>>,
): ReadonlyArray<SearchHit> {
  const documentStore = getGlobalStore();
  const hasLocalContent = documentStore.hasCollections();

  // Nothing indexed locally — skip the search entirely.
  return hasLocalContent
    ? documentStore
        .searchByRole(query, role, maxResults, roleCollectionMap)
        .map(localHitToSearchHit)
    : [];
}

/**
 * Collects up to `maxTotal` local sources for a worker by running each query
 * against the local document store.
 *
 * Collection targeting:
 * - When neither `collectionIds` nor a `roleCollectionMap` entry for `role`
 *   is supplied, each query goes through the store's progressive search
 *   (the "progressive source approach": proprietary first, then internal,
 *   reference, general — local knowledge is preferred, web fills the gaps).
 * - Otherwise a plain store search is restricted to the role's mapped
 *   collections, falling back to `collectionIds` when the role has no entry.
 *
 * Hits are de-duplicated across queries by file path + chunk index, and each
 * query only asks the store for as many results as are still needed.
 *
 * @param queries - Queries to run, in order.
 * @param role - Worker role; used for the map lookup and recorded on sources.
 * @param label - Worker label recorded on each source.
 * @param maxTotal - Maximum number of sources to return.
 * @param contentLimit - Maximum characters of text kept per source.
 * @param collectionIds - Optional explicit collection ids to search.
 * @param roleCollectionMap - Optional role → collection ids mapping.
 * @returns The harvested sources, or an empty array when the store reports
 *   that it has no collections.
 */
export function harvestLocalSources(
  queries: ReadonlyArray<string>,
  role: WorkerRole,
  label: string,
  maxTotal: number,
  contentLimit: number,
  collectionIds?: ReadonlyArray<string>,
  roleCollectionMap?: ReadonlyMap<string, ReadonlyArray<string>>,
): ReadonlyArray<CrawledSource> {
  const documentStore = getGlobalStore();
  if (!documentStore.hasCollections()) {
    return [];
  }

  // Resolve the targeting mode once; it does not change between queries.
  const roleScopedIds = roleCollectionMap?.get(role);
  const progressive = !collectionIds && !roleScopedIds;
  const targetIds = roleScopedIds ?? collectionIds;

  const harvested: CrawledSource[] = [];
  const visitedChunks = new Set<string>();

  for (const query of queries) {
    if (harvested.length >= maxTotal) {
      break;
    }

    // Only ask the store for as many hits as are still needed.
    const budget = maxTotal - harvested.length;
    const matches: ReadonlyArray<LocalSearchHit> = progressive
      ? documentStore.searchProgressive(query, budget)
      : documentStore.search(query, budget, targetIds);

    for (const match of matches) {
      if (harvested.length >= maxTotal) {
        break;
      }

      // The same chunk can surface for several queries; keep the first one.
      const chunkKey = `${match.filePath}:${match.chunkIndex}`;
      if (!visitedChunks.has(chunkKey)) {
        visitedChunks.add(chunkKey);
        harvested.push(
          localHitToCrawledSource(match, query, role, label, contentLimit),
        );
      }
    }
  }

  return harvested;
}

/**
 * Tells whether a URL points at a local document chunk.
 *
 * @param url - URL to inspect.
 * @returns True when the URL begins with the `local://` scheme prefix.
 */
export function isLocalUrl(url: string): boolean {
  const localScheme = "local://";
  const isLocal = url.startsWith(localScheme);
  return isLocal;
}