// src/services/ingestion/web.ts

import { JSDOM } from "jsdom";
import { Readability } from "@mozilla/readability";

/**
 * Builds browser-like request headers so target sites are less likely to
 * block the scrape. The Referer is set to the target URL's own origin,
 * mimicking an in-site navigation.
 */
function spoofHeaders(url: string) {
  const { origin } = new URL(url);
  const chromeDesktopUa =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36";
  const htmlAccept =
    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
  return {
    "User-Agent": chromeDesktopUa,
    "Accept": htmlAccept,
    "Referer": origin,
  };
}

export async function extractWebContent(url: string, contentLimit: number = 20000) {
    try {
        const headers = spoofHeaders(url);
        const response = await fetch(url, { headers });
        if (!response.ok) throw new Error(`Failed to fetch ${url}: ${response.statusText}`);
        
        const html = await response.text();
        const doc = new JSDOM(html, { url });
        const reader = new Readability(doc.window.document);
        const article = reader.parse();
        
        if (!article) return { title: url, content: "Could not parse content." };

        let content = article.textContent || "";
        if (content && content.length > contentLimit) {
            content = content.substring(0, contentLimit) + "... [Truncated]";
        }

        return {
            title: article.title || url,
            content: content,
            byline: article.byline,
            siteName: article.siteName
        };
    } catch (e: any) {
        throw new Error(`Web extraction failed: ${e.message}`);
    }
}