deep-swarm-research

Public
src / net / pdf-extractor.ts
/**
 * @file net/pdf-extractor.ts
 * Handles PDF detection, text extraction, and embedded image extraction.
 *
 * When a URL serves a PDF (detected via Content-Type header or URL pattern),
 * this module extracts clean text + embedded images from the PDF instead of
 * passing raw binary bytes through the HTML extractor (which produces garbled output).
 */

import { PDFParse } from "pdf-parse";
import * as fs from "node:fs";
import * as path from "node:path";
import * as os from "node:os";
import { ExtractedPage, Outlink } from "../types";
import { DESCRIPTION_FALLBACK_CHARS } from "../constants";

/**
 * Matches URLs that end in a `.pdf` extension, optionally followed by a query
 * string and/or a fragment (e.g. `paper.pdf?download=1`, `paper.pdf#page=3`).
 */
const PDF_URL_RE = /\.pdf(?:[?#].*)?$/i;

/** Additional known PDF-serving URL patterns (academic preprint servers, etc.). */
const PDF_HOST_PATH_PATTERNS: ReadonlyArray<RegExp> = [
  /arxiv\.org\/pdf\//i,
  /arxiv\.org\/ftp\//i,
  /biorxiv\.org\/content\/.*\.full\.pdf/i,
  /medrxiv\.org\/content\/.*\.full\.pdf/i,
  /papers\.ssrn\.com\/sol3\/Delivery\.cfm/i,
  /dl\.acm\.org\/doi\/pdf\//i,
  /ieeexplore\.ieee\.org\/stampPDF/i,
  /link\.springer\.com\/content\/pdf\//i,
  /pdfs\.semanticscholar\.org\//i,
  /openreview\.net\/pdf/i,
  /proceedings\.neurips\.cc\/paper_files\/.*\.pdf/i,
  /aclanthology\.org\/.*\.pdf/i,
  /pnas\.org\/doi\/pdf\//i,
  /science\.org\/doi\/pdf\//i,
  /nature\.com\/articles\/.*\.pdf/i,
  /researchgate\.net\/.*\/download/i,
];

/**
 * Checks whether a URL is likely to serve a PDF, based on the URL alone.
 *
 * A URL qualifies when it ends in `.pdf` (ignoring any query string or
 * fragment) or when it matches one of the known PDF-serving host/path
 * patterns. No network request is made.
 *
 * @param url URL string to inspect.
 * @returns `true` when the URL looks like a PDF link.
 */
export function isPdfUrl(url: string): boolean {
  return (
    PDF_URL_RE.test(url) || PDF_HOST_PATH_PATTERNS.some((re) => re.test(url))
  );
}

/**
 * Tells whether a Content-Type header value denotes a PDF payload.
 *
 * The comparison is case-insensitive and substring-based, so parameters such
 * as `; charset=...` after the media type do not prevent a match.
 *
 * @param contentType Header value; `null`, `undefined` and `""` yield `false`.
 * @returns `true` when the value mentions `application/pdf` or `application/x-pdf`.
 */
export function isPdfContentType(
  contentType: string | null | undefined,
): boolean {
  if (contentType == null || contentType === "") {
    return false;
  }
  const normalized = contentType.toLowerCase();
  const pdfMediaTypes = ["application/pdf", "application/x-pdf"];
  return pdfMediaTypes.some((mediaType) => normalized.includes(mediaType));
}

/** Describes one embedded image that `extractPdf` wrote to disk. */
export interface PdfImage {
  /** Page number reported by the parser for the page holding the image (1 when none is reported). */
  readonly page: number;
  /** Image format, also used as the file extension: "png" by default, or the subtype taken from the image's data URL. */
  readonly format: string;
  /** Path of the file the image bytes were written to, inside a temporary directory. */
  readonly filePath: string;
  /** Width as reported by the parser, or 0 when not reported (presumably pixels — confirm against pdf-parse). */
  readonly width: number;
  /** Height as reported by the parser, or 0 when not reported (presumably pixels — confirm against pdf-parse). */
  readonly height: number;
  /** Number of bytes written to `filePath`. */
  readonly byteSize: number;
}

/**
 * Extracts text, metadata and (optionally) embedded images from a PDF buffer
 * using pdf-parse v2.
 *
 * The parser is destroyed on every path, including when parsing fails.
 * Image extraction is best-effort: a failure there never fails the call, and
 * any images saved before the failure are still returned. Image files are
 * written to a new directory under the OS temp dir and are not removed by
 * this function.
 *
 * @param buffer Raw PDF bytes.
 * @param sourceUrl Returned unchanged as `url`.
 * @param finalUrl Returned unchanged as `finalUrl`; URLs on the same host are
 *   excluded from `outlinks`.
 * @param contentLimit Maximum number of characters of cleaned text kept in `text`.
 * @param extractImages When `true`, embedded images are extracted and saved.
 * @param maxImages Upper bound on the number of images saved.
 * @returns The extracted page data plus the list of saved images.
 * @throws Error with a `PDF parsing failed: ...` message when reading the
 *   document info or text fails.
 */
export async function extractPdf(
  buffer: Buffer,
  sourceUrl: string,
  finalUrl: string,
  contentLimit: number,
  extractImages: boolean = true,
  maxImages: number = 20,
): Promise<ExtractedPage & { images: ReadonlyArray<PdfImage> }> {
  const data = new Uint8Array(buffer);
  const parser = new PDFParse({ data } as any);

  let parsed: PdfTextAndMetadata;
  let images: PdfImage[] = [];
  try {
    parsed = await readPdfTextAndMetadata(parser);
    if (extractImages) {
      images = await saveEmbeddedPdfImages(parser, maxImages);
    }
  } finally {
    // Release parser resources even when parsing throws.
    await parser.destroy();
  }

  const cleanedText = cleanPdfText(parsed.rawText);
  const truncatedText = cleanedText.slice(0, contentLimit);

  // Fall back to the first substantial text line when metadata has no title.
  const title = parsed.title || inferTitleFromText(cleanedText);

  const wordCount = countWords(truncatedText);
  const description = buildDescription(
    title,
    parsed.author,
    parsed.pageCount,
    cleanedText,
  );

  return {
    url: sourceUrl,
    finalUrl,
    title,
    description,
    published: parsed.creationDate,
    text: truncatedText,
    wordCount,
    // Outlinks are taken from the full cleaned text, not the truncated one.
    outlinks: extractUrlsFromText(cleanedText, finalUrl),
    images,
  };
}

/** Raw text and document-level metadata read from the parser. */
interface PdfTextAndMetadata {
  readonly rawText: string;
  readonly title: string;
  readonly author: string;
  readonly pageCount: number;
  readonly creationDate: string | null;
}

/**
 * Reads document info and raw text from the parser, wrapping any failure in
 * a `PDF parsing failed: ...` error.
 */
async function readPdfTextAndMetadata(
  parser: PDFParse,
): Promise<PdfTextAndMetadata> {
  try {
    const info = await parser.getInfo();
    const pageCount = info.total || 0;

    let title = "";
    let author = "";
    let creationDate: string | null = null;
    if (info.info) {
      title = sanitizeMetaString(info.info.Title) || "";
      author = sanitizeMetaString(info.info.Author) || "";
      creationDate = extractDateFromPdfInfo(info.info);
    }

    const textResult = await parser.getText({
      lineEnforce: true,
      lineThreshold: 5,
    });

    return {
      rawText: textResult.text || "",
      title,
      author,
      pageCount,
      creationDate,
    };
  } catch (err) {
    throw new Error(
      `PDF parsing failed: ${err instanceof Error ? err.message : String(err)}`,
    );
  }
}

/**
 * Saves up to `maxImages` embedded images to a temp directory and returns
 * their descriptors. Never throws: on failure it returns what was saved so far.
 */
async function saveEmbeddedPdfImages(
  parser: PDFParse,
  maxImages: number,
): Promise<PdfImage[]> {
  const saved: PdfImage[] = [];

  try {
    const imageResult = await parser.getImage({
      imageThreshold: 50,
      imageDataUrl: true,
      imageBuffer: true,
    } as any);

    if (!imageResult?.pages) return saved;

    // Created lazily so documents without usable images leave no empty
    // directory behind in the OS temp dir.
    let tmpDir: string | null = null;

    for (const page of imageResult.pages) {
      if (saved.length >= maxImages) break;

      const pageImages = (page as any).images;
      if (!pageImages) continue;
      const pageNumber: number = (page as any).pageNumber || 1;

      for (const img of pageImages) {
        if (saved.length >= maxImages) break;

        let imgBuffer: Buffer | null = null;
        let format = "png";

        // Prefer the raw bytes; fall back to decoding the base64 data URL.
        if (img.data && img.data.length > 0) {
          imgBuffer = Buffer.from(img.data);
        } else if (img.dataUrl) {
          const match = (img.dataUrl as string).match(
            /^data:image\/(\w+);base64,(.+)$/,
          );
          if (match) {
            format = match[1];
            imgBuffer = Buffer.from(match[2], "base64");
          }
        }

        // Skip images with no decodable payload or fewer than 200 bytes.
        if (!imgBuffer || imgBuffer.length < 200) continue;

        if (tmpDir === null) {
          tmpDir = await fs.promises.mkdtemp(
            path.join(os.tmpdir(), "pdf-images-"),
          );
        }

        const fileName = `page${pageNumber}_img${saved.length + 1}.${format}`;
        const filePath = path.join(tmpDir, fileName);
        await fs.promises.writeFile(filePath, imgBuffer);

        saved.push({
          page: pageNumber,
          format,
          filePath,
          width: img.width || 0,
          height: img.height || 0,
          byteSize: imgBuffer.length,
        });
      }
    }
  } catch {
    // Deliberately best-effort: image extraction must not fail the whole
    // PDF extraction, so errors are dropped and partial results are kept.
  }

  return saved;
}

/**
 * Cleans up typical PDF text extraction artifacts.
 *
 * Steps, in order:
 *  1. Normalize `\r\n` and lone `\r` line endings to `\n`.
 *  2. Re-join words hyphenated across a line break (`exam-\nple` -> `example`).
 *  3. Collapse runs of spaces/tabs and trim every line.
 *  4. Drop interior lines that contain only digits (treated as page numbers).
 *     The first and last line are kept even when numeric.
 *  5. Collapse three or more consecutive newlines into one blank line.
 *
 * Blank-line collapsing runs last so that whitespace-only lines and removed
 * page-number lines cannot leave behind long runs of empty lines, and so a
 * page number sitting between two paragraphs does not merge them.
 *
 * @param raw Text as returned by the PDF parser.
 * @returns The cleaned text, trimmed at both ends.
 */
function cleanPdfText(raw: string): string {
  const lines = raw
    .replace(/\r\n?/g, "\n")
    .replace(/(\w)-\n(\w)/g, "$1$2")
    .split("\n")
    .map((line) => line.replace(/[ \t]+/g, " ").trim());

  const lastIndex = lines.length - 1;
  return lines
    .filter(
      (line, index) =>
        index === 0 || index === lastIndex || !/^\d+$/.test(line),
    )
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/**
 * Guesses a title from body text: the first line whose trimmed length
 * exceeds 5 characters, trimmed and capped at 200 characters.
 * Returns "" when no line qualifies.
 */
function inferTitleFromText(text: string): string {
  for (const rawLine of text.split("\n")) {
    const line = rawLine.trim();
    if (line.length > 5) {
      return line.slice(0, 200);
    }
  }
  return "";
}

/**
 * Normalizes a PDF metadata value: non-string input becomes "", otherwise
 * NUL characters are removed and surrounding whitespace is trimmed.
 */
function sanitizeMetaString(val: unknown): string {
  return typeof val === "string" ? val.replace(/\0/g, "").trim() : "";
}

/**
 * Derives a `YYYY-MM-DD` date from a PDF info dictionary.
 *
 * Keys are tried in order: `CreationDate`, `ModDate`, `created`, `modified`.
 * For each string value:
 *  - A PDF date (`D:YYYY[MM[DD...]]`) is used when present. Month and day are
 *    optional in the PDF date format and default to `01`. Values with a month
 *    outside 1-12 or a day outside 1-31 are rejected.
 *  - Otherwise the value is parsed with `Date` and rendered in UTC.
 *
 * @param info PDF info dictionary; non-string and empty values are ignored.
 * @returns The first date found, or `null` when no key yields a valid date.
 */
function extractDateFromPdfInfo(info: Record<string, unknown>): string | null {
  for (const key of ["CreationDate", "ModDate", "created", "modified"]) {
    const raw = info[key];
    if (typeof raw !== "string" || !raw) continue;

    const pdfDateMatch = raw.match(/D:(\d{4})(\d{2})?(\d{2})?/);
    if (pdfDateMatch) {
      const year = pdfDateMatch[1];
      const month = pdfDateMatch[2] ?? "01";
      const day = pdfDateMatch[3] ?? "01";
      const monthNum = Number(month);
      const dayNum = Number(day);
      if (monthNum >= 1 && monthNum <= 12 && dayNum >= 1 && dayNum <= 31) {
        return `${year}-${month}-${day}`;
      }
      // Out-of-range fields: fall through so the generic parser (and then
      // the next key) gets a chance instead of emitting a bogus date.
    }

    // `new Date(string)` never throws; an unparseable value yields NaN time.
    const parsed = new Date(raw);
    if (!isNaN(parsed.getTime())) return parsed.toISOString().slice(0, 10);
  }
  return null;
}

/**
 * Builds a short description for a PDF.
 *
 * The result starts with a metadata prefix assembled from whichever of
 * title, `by <author>` and `(<n> pages)` are available (space-separated and
 * terminated with ". "), followed by the beginning of the text. The whole
 * string is capped at `DESCRIPTION_FALLBACK_CHARS` characters.
 */
function buildDescription(
  title: string,
  author: string,
  pageCount: number,
  text: string,
): string {
  const segments = [
    title,
    author ? `by ${author}` : "",
    pageCount > 0 ? `(${pageCount} pages)` : "",
  ].filter((segment) => segment !== "");

  let prefix = "";
  if (segments.length > 0) {
    prefix = segments.join(" ") + ". ";
  }

  const previewEnd = DESCRIPTION_FALLBACK_CHARS - prefix.length;
  const combined = prefix + text.slice(0, previewEnd);
  return combined.slice(0, DESCRIPTION_FALLBACK_CHARS);
}

/**
 * Collects up to 20 distinct external http(s) links mentioned in the text.
 *
 * Trailing punctuation is stripped from each candidate, duplicates and
 * unparseable URLs are skipped, and links whose hostname equals that of
 * `baseUrl` are left out. When `baseUrl` cannot be parsed, no host is
 * treated as "own".
 *
 * @param text Text to scan.
 * @param baseUrl URL of the document the text came from.
 * @returns Links in order of first appearance; `text` is the hostname plus
 *   the first 60 characters of the path.
 */
function extractUrlsFromText(
  text: string,
  baseUrl: string,
): ReadonlyArray<Outlink> {
  let ownHost = "";
  try {
    ownHost = new URL(baseUrl).hostname;
  } catch {
    ownHost = "";
  }

  const collected: Outlink[] = [];
  const visited = new Set<string>();
  const candidates = text.match(/https?:\/\/[^\s)<>"']+/gi) ?? [];

  for (const candidate of candidates) {
    if (collected.length >= 20) {
      break;
    }

    const href = candidate.replace(/[.,;:!?)]+$/, "");
    if (visited.has(href)) {
      continue;
    }

    let target: URL;
    try {
      target = new URL(href);
    } catch {
      continue;
    }

    if (target.hostname !== ownHost) {
      visited.add(href);
      collected.push({
        text: target.hostname + target.pathname.slice(0, 60),
        href,
      });
    }
  }

  return collected;
}

/** Counts whitespace-separated, non-empty tokens in the text. */
function countWords(text: string): number {
  let total = 0;
  for (const token of text.split(/\s+/)) {
    if (token) {
      total += 1;
    }
  }
  return total;
}