// src/retrieval/tfidf.ts

/**
 * @file retrieval/tfidf.ts
 * Lightweight TF-IDF vector engine for semantic similarity search.
 *
 * Why TF-IDF instead of embeddings?
 * - Zero external dependencies (no embedding model required)
 * - Works instantly on plugin start (no model loading)
 * - Surprisingly effective for short-text matching (memory entries are sentences)
 * - If an embedding model IS loaded, the AI relevance scorer can augment this
 *
 * Optimizations:
 * - Sparse vectors (Map<string, number>) instead of dense arrays
 * - Inverted index for O(1) document lookup per term
 * - Incremental IDF updates (no full recompute on insert)
 * - Pre-computed document norms for fast cosine similarity
 */

import { STOP_WORDS, MIN_TERM_LENGTH, MAX_VOCAB_SIZE, MAX_TERMS_PER_DOC } from "../constants";
import type { TfIdfVector } from "../types";

/** Tokenize and normalize text into terms. */
/**
 * Tokenize and normalize text into index terms.
 *
 * Lowercases, strips everything except alphanumerics / whitespace / `-` / `_`,
 * drops stop words and too-short tokens, and caps the result at
 * MAX_TERMS_PER_DOC surviving terms.
 */
export function tokenize(text: string): string[] {
  const cleaned = text.toLowerCase().replace(/[^a-z0-9\s\-_]/g, " ");
  const terms: string[] = [];
  for (const token of cleaned.split(/\s+/)) {
    // Cap applies to terms that survive filtering, not raw tokens.
    if (terms.length >= MAX_TERMS_PER_DOC) break;
    if (token.length >= MIN_TERM_LENGTH && !STOP_WORDS.has(token)) {
      terms.push(token);
    }
  }
  return terms;
}

/**
 * In-memory TF-IDF index.
 * Supports incremental inserts (no full reindex needed).
 */
/**
 * In-memory TF-IDF index.
 * Supports incremental inserts (no full reindex needed).
 *
 * IDF values and TF-IDF document norms are cached together and rebuilt
 * lazily on the first search after any add/remove, so repeated searches
 * against an unchanged index pay no recompute cost.
 */
export class TfIdfIndex {
  /** term → Set of document IDs containing this term */
  private invertedIndex = new Map<string, Set<string>>();
  /** docId → sparse TF vector (augmented TF, normalized by max in-doc frequency) */
  private docVectors = new Map<string, TfIdfVector>();
  /** docId → L2 norm of the full TF-IDF vector (rebuilt with the IDF cache) */
  private docNorms = new Map<string, number>();
  /** Total number of indexed documents */
  private docCount = 0;
  /** Cached IDF values — invalidated (with docNorms) on add/remove */
  private idfCache = new Map<string, number>();
  private idfDirty = true;

  /**
   * Add or update a document in the index.
   *
   * @param docId Stable identifier; re-adding the same id replaces the old text.
   * @param text  Raw text to tokenize and index. Text that tokenizes to
   *              nothing removes any previous version and indexes nothing.
   */
  addDocument(docId: string, text: string): void {
    // Remove old version if it exists (makes this an upsert).
    if (this.docVectors.has(docId)) {
      this.removeDocument(docId);
    }

    const terms = tokenize(text);
    if (terms.length === 0) return;

    // Raw term frequencies.
    const tf = new Map<string, number>();
    for (const term of terms) {
      tf.set(term, (tf.get(term) ?? 0) + 1);
    }

    // Augmented TF (0.5 + 0.5 * tf/maxTf) — prevents long docs from dominating.
    const maxTf = Math.max(...tf.values());
    const normalizedTf: TfIdfVector = new Map();
    for (const [term, count] of tf) {
      normalizedTf.set(term, 0.5 + 0.5 * (count / maxTf));
    }

    // Update inverted index. Terms beyond the vocabulary cap stay in the doc
    // vector but get no postings, hence zero IDF weight at search time.
    for (const term of tf.keys()) {
      if (!this.invertedIndex.has(term)) {
        if (this.invertedIndex.size >= MAX_VOCAB_SIZE) continue;
        this.invertedIndex.set(term, new Set());
      }
      this.invertedIndex.get(term)!.add(docId);
    }

    this.docVectors.set(docId, normalizedTf);
    this.docCount++;
    // Norms depend on IDF, which just changed — both are rebuilt lazily in
    // rebuildCachesIfDirty() on the next search. (Previously a TF-only norm
    // was precomputed here, but search never used it.)
    this.idfDirty = true;
  }

  /** Remove a document from the index. No-op if the id is unknown. */
  removeDocument(docId: string): void {
    const vec = this.docVectors.get(docId);
    if (!vec) return;

    for (const term of vec.keys()) {
      const docs = this.invertedIndex.get(term);
      if (docs) {
        docs.delete(docId);
        // Drop empty postings so vocab size reflects live terms.
        if (docs.size === 0) this.invertedIndex.delete(term);
      }
    }
    this.docVectors.delete(docId);
    this.docNorms.delete(docId);
    this.docCount--;
    this.idfDirty = true;
  }

  /**
   * Search the index and return document IDs sorted by TF-IDF cosine
   * similarity (descending).
   *
   * @param queryText Free text; tokenized the same way as documents.
   * @param topK      Maximum number of results (default 10).
   * @returns Array of [docId, score] pairs with score in (0, 1].
   */
  search(queryText: string, topK: number = 10): Array<[string, number]> {
    const queryTerms = tokenize(queryText);
    if (queryTerms.length === 0 || this.docCount === 0) return [];

    this.rebuildCachesIfDirty();

    // Build the query TF-IDF vector (augmented TF × IDF; out-of-vocabulary
    // terms are dropped — they can't match any document anyway).
    const queryTf = new Map<string, number>();
    for (const term of queryTerms) {
      queryTf.set(term, (queryTf.get(term) ?? 0) + 1);
    }
    const maxQueryTf = Math.max(...queryTf.values());

    const queryVec: TfIdfVector = new Map();
    for (const [term, count] of queryTf) {
      const idf = this.idfCache.get(term);
      if (idf === undefined) continue;
      queryVec.set(term, (0.5 + 0.5 * (count / maxQueryTf)) * idf);
    }

    if (queryVec.size === 0) return [];

    const queryNorm = this.computeNorm(queryVec);
    if (queryNorm === 0) return [];

    // Candidate set: only docs sharing at least one query term.
    const candidates = new Set<string>();
    for (const term of queryVec.keys()) {
      const docs = this.invertedIndex.get(term);
      if (docs) for (const docId of docs) candidates.add(docId);
    }

    // Cosine similarity. The dot product iterates only the (small) query
    // vector; document norms come from the cache rebuilt above, so scoring
    // is O(|query terms|) per candidate instead of O(|doc terms|).
    const scores: Array<[string, number]> = [];
    for (const docId of candidates) {
      const docVec = this.docVectors.get(docId);
      const docNorm = this.docNorms.get(docId) ?? 0;
      if (!docVec || docNorm === 0) continue;

      let dotProduct = 0;
      for (const [term, queryWeight] of queryVec) {
        const docTf = docVec.get(term);
        if (docTf === undefined) continue;
        dotProduct += docTf * (this.idfCache.get(term) ?? 0) * queryWeight;
      }

      const sim = dotProduct / (docNorm * queryNorm);
      if (sim > 0) scores.push([docId, sim]);
    }

    scores.sort((a, b) => b[1] - a[1]);
    return scores.slice(0, topK);
  }

  /**
   * Rebuild the IDF cache and the TF-IDF document norms if the index has
   * changed since the last search. O(vocab + total postings), amortized
   * across searches on a stable index.
   */
  private rebuildCachesIfDirty(): void {
    if (!this.idfDirty) return;

    this.idfCache.clear();
    for (const [term, docs] of this.invertedIndex) {
      // Smoothed IDF: log(1 + N/df) is always > 0 for any indexed term.
      this.idfCache.set(term, Math.log(1 + this.docCount / docs.size));
    }

    // Norms must use TF-IDF weights, not raw TF, or cosine similarity is
    // mis-scaled — so they can only be computed once IDF is known.
    for (const [docId, vec] of this.docVectors) {
      let sumSq = 0;
      for (const [term, docTf] of vec) {
        const weight = docTf * (this.idfCache.get(term) ?? 0);
        sumSq += weight * weight;
      }
      this.docNorms.set(docId, Math.sqrt(sumSq));
    }

    this.idfDirty = false;
  }

  /** Compute the L2 norm of a sparse vector. */
  private computeNorm(vec: TfIdfVector): number {
    let sumSq = 0;
    for (const w of vec.values()) sumSq += w * w;
    return Math.sqrt(sumSq);
  }

  /** Get index statistics. */
  get stats(): { vocabSize: number; docCount: number } {
    return { vocabSize: this.invertedIndex.size, docCount: this.docCount };
  }

  /** Clear the entire index. */
  clear(): void {
    this.invertedIndex.clear();
    this.docVectors.clear();
    this.docNorms.clear();
    this.idfCache.clear();
    this.idfDirty = true;
    this.docCount = 0;
  }
}