// packages/core/src/rewrite.ts

import type { RagQueryRewrite } from "./policyContracts";

/**
 * Lower-case English function and question words that are dropped when a
 * prompt is reduced to its keywords.
 */
const STOPWORDS = new Set<string>(
  (
    "a an and are as at be by for from how in is it of on or " +
    "that the to what when where which who why with"
  ).split(" ")
);

/**
 * Collapses every run of whitespace in `value` to a single space and strips
 * leading and trailing whitespace.
 *
 * @param value - Text to normalize.
 * @returns The normalized text; an empty string when `value` is blank.
 */
function normalizeWhitespace(value: string): string {
  // Collapsing first leaves at most one space at either end for trim() to drop.
  const collapsed = value.replace(/\s+/g, " ");
  return collapsed.trim();
}

/**
 * Appends `rewrite` to `results` with its text whitespace-normalized, unless
 * the normalized text is empty or already present in `results` under a
 * case-insensitive comparison.
 *
 * @param results - Accumulator that is mutated in place.
 * @param rewrite - Candidate rewrite; its `label` is kept as given.
 */
function uniquePush(results: RagQueryRewrite[], rewrite: RagQueryRewrite): void {
  const text = normalizeWhitespace(rewrite.text);
  if (text.length === 0) {
    return;
  }

  const key = text.toLowerCase();
  for (const entry of results) {
    if (entry.text.toLowerCase() === key) {
      return;
    }
  }

  results.push({ label: rewrite.label, text });
}

/**
 * Reduces a prompt to a space-separated list of its keywords.
 *
 * The prompt is lower-cased, every character that is not a letter, combining
 * mark, digit, or whitespace is replaced by a space, and the remaining tokens
 * are kept only when they are at least three characters long and not in
 * `STOPWORDS`.
 *
 * @param prompt - Raw user prompt.
 * @returns The surviving tokens joined by single spaces; an empty string when
 *   no token survives.
 */
function buildKeywordRewrite(prompt: string): string {
  return (
    prompt
      .toLowerCase()
      // Unicode-aware class: an ASCII-only [^a-z0-9\s] would cut words such as
      // "naïve" into fragments ("na", "ve") that the length filter then drops.
      .replace(/[^\p{L}\p{M}\p{N}\s]/gu, " ")
      .split(/\s+/)
      .filter((token) => token.length >= 3 && !STOPWORDS.has(token))
      .join(" ")
  );
}

/**
 * Extracts the leading clause of a compound prompt.
 *
 * The prompt is split on the whole words "and", "then", "also" (matched
 * case-insensitively) and on the characters `,`, `;`, `:`, `?`. Each piece is
 * whitespace-normalized and empty pieces are discarded.
 *
 * @param prompt - Raw user prompt.
 * @returns The first non-empty piece when the prompt yields more than one,
 *   otherwise an empty string.
 */
function buildSplitRewrite(prompt: string) {
  const fragments: string[] = [];

  for (const piece of prompt.split(/\b(?:and|then|also)\b|[,;:?]/i)) {
    const cleaned = normalizeWhitespace(piece);
    if (cleaned.length > 0) {
      fragments.push(cleaned);
    }
  }

  // A single fragment means there was nothing to decompose.
  if (fragments.length <= 1) {
    return "";
  }
  return fragments[0];
}

/**
 * Collects the text of every quoted span in the prompt.
 *
 * Double-quoted spans (`"..."`) are always recognized. Single-quoted spans
 * (`'...'`) are recognized only when the opening quote is not preceded by a
 * word character and the closing quote is not followed by one, so that
 * apostrophes in contractions and possessives ("what's", "user's") are not
 * mistaken for quote delimiters.
 *
 * @param prompt - Raw user prompt.
 * @returns The whitespace-normalized, non-empty spans joined by single
 *   spaces in order of appearance; an empty string when there are none.
 */
function buildQuotedRewrite(prompt: string): string {
  const quotedSpanPattern = /"([^"]+)"|(?<!\w)'([^']+)'(?!\w)/g;
  const quotedSpans: string[] = [];

  for (const match of prompt.matchAll(quotedSpanPattern)) {
    // Group 1 holds a double-quoted span, group 2 a single-quoted one.
    const span = normalizeWhitespace(match[1] ?? match[2] ?? "");
    if (span.length > 0) {
      quotedSpans.push(span);
    }
  }

  return quotedSpans.join(" ");
}

/**
 * Produces up to four deduplicated variants of a prompt for multi-query
 * retrieval.
 *
 * Candidates are considered in this fixed order, each added only if its text
 * is non-empty and not a case-insensitive duplicate of an earlier one:
 * `"original"`, `"keywords"`, `"decomposed"`, `"quoted-span"`.
 *
 * @param prompt - Raw user prompt.
 * @param multiQueryCount - Requested number of variants. Clamped to the range
 *   1..4; `NaN` is treated as 1 so that a malformed count still yields the
 *   original prompt instead of an empty list.
 * @returns At most `multiQueryCount` (after clamping) rewrites; empty only
 *   when the prompt itself is blank.
 */
export function generateCoreQueryRewrites(
  prompt: string,
  multiQueryCount: number
): RagQueryRewrite[] {
  const variantCap = 4;
  // Math.min/Math.max propagate NaN, and slice(0, NaN) would return nothing.
  const requested = Number.isNaN(multiQueryCount) ? 1 : multiQueryCount;
  const maxVariants = Math.max(1, Math.min(requested, variantCap));

  // Order matters: earlier candidates win deduplication and survive the cut.
  const candidates: RagQueryRewrite[] = [
    { label: "original", text: prompt },
    { label: "keywords", text: buildKeywordRewrite(prompt) },
    { label: "decomposed", text: buildSplitRewrite(prompt) },
    { label: "quoted-span", text: buildQuotedRewrite(prompt) },
  ];

  const rewrites: RagQueryRewrite[] = [];
  for (const candidate of candidates) {
    uniquePush(rewrites, candidate);
  }

  return rewrites.slice(0, maxVariants);
}