// src/extractor.ts

/**
 * Converts an HTML document into plain text, preferring the page's main content region.
 *
 * The first of `<main>`, `<article>`, or a `<div>` whose class mentions
 * content/page/post/article/main that contains more than 100 characters of markup is
 * used as the content region. If that region yields fewer than 50 characters of text,
 * the whole document is used instead.
 *
 * @param body - Raw HTML to extract text from.
 * @param contentLimit - Maximum number of characters to return when no search terms apply.
 * @param searchTerms - Optional terms; when given and the text is longer than
 *   `contentLimit`, the result is built from context windows around the first
 *   (case-insensitive) occurrence of each term instead of the leading slice.
 * @returns The extracted text. Empty string when the document has no text, or when
 *   search terms were applied and none of them occur in the text.
 */
export const extractTextContent = (body: string, contentLimit: number, searchTerms?: string[]): string => {
	// Candidate containers for the primary content, tried in this order.
	const mainMarkers = [
		/<main[^>]*>([\s\S]*)<\/main>/i,
		/<article[^>]*>([\s\S]*)<\/article>/i,
		/<div[^>]*class="[^"]*(?:content|page|post|article|main)[^"]*"[^>]*>([\s\S]*)<\/div>/i,
	];

	// Strips markup from an HTML fragment and normalises its whitespace.
	const toPlainText = (html: string): string =>
		html
			// Drop elements whose contents are never readable text.
			.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
			.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
			.replace(/<!--[\s\S]*?-->/g, '')
			.replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
			// Closing block-level tags and <br> become line breaks.
			.replace(/<\/(div|p|h[1-6]|li|tr|br|section|article|main)>/gi, '\n')
			.replace(/<br\s*\/?>/gi, '\n')
			// Every remaining tag becomes a space so adjacent words do not fuse.
			.replace(/<[^>]+>/g, ' ')
			// Entities are decoded only after tags are gone, so an escaped "&lt;b&gt;"
			// survives as literal text instead of being stripped as a tag.
			.replace(/&nbsp;/g, ' ')
			.replace(/&lt;/g, '<')
			.replace(/&gt;/g, '>')
			.replace(/&quot;/g, '"')
			.replace(/&#39;/g, "'")
			// "&amp;" goes last so "&amp;lt;" decodes to "&lt;" rather than "<".
			.replace(/&amp;/g, '&')
			// Collapse runs of spaces/tabs, and fold blank lines plus leading
			// indentation into a single newline.
			.replace(/[ \t]+/g, ' ')
			.replace(/\n\s+/g, '\n')
			.trim();

	let contentArea = body;
	for (const marker of mainMarkers) {
		const inner = marker.exec(body)?.[1];
		if (inner !== undefined && inner.length > 100) {
			contentArea = inner;
			break;
		}
	}

	let cleanBody = toPlainText(contentArea);
	if (cleanBody.length < 50 && contentArea !== body) {
		// The chosen region was mostly markup. Fall back to the whole document directly;
		// re-running the region detection would pick the same region again.
		cleanBody = toPlainText(body);
	}
	if (!cleanBody) return '';

	if (searchTerms?.length && contentLimit < cleanBody.length) {
		// Split the budget evenly: each term gets a window before and after it.
		const padLength = Math.max(0, Math.floor(contentLimit / (searchTerms.length * 2)));
		const padding = `.{0,${padLength}}`;
		const matches = searchTerms
			.map(term => {
				const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
				// "." does not match newlines, so a window never crosses a line break.
				return new RegExp(padding + escapedTerm + padding, 'i').exec(cleanBody);
			})
			.filter((match): match is RegExpExecArray => match !== null)
			.sort((a, b) => a.index - b.index);

		let content = '';
		let nextMinIndex = 0;
		for (const match of matches) {
			const end = match.index + match[0].length;
			// A window lying entirely inside text already emitted adds nothing and must
			// not move the cursor backwards, otherwise later windows repeat text.
			if (end <= nextMinIndex) continue;
			content += match.index >= nextMinIndex
				? match[0]
				: match[0].slice(nextMinIndex - match.index);
			nextMinIndex = end;
		}
		return content;
	}

	return cleanBody.slice(0, contentLimit);
};

/**
 * Collects the hyperlinks of an HTML document as `[label, absoluteUrl]` pairs, best first.
 *
 * Only anchors with a double-quoted `href` are recognised. Relative targets are resolved
 * against `url`; anything that does not end up starting with "http" is discarded.
 * Links are ranked so that short labels/URLs and earlier positions score higher, and each
 * search term found in the label (case-insensitive) adds 1000 to the score. Duplicate
 * URLs keep only their highest-scoring occurrence.
 *
 * @param body - Raw HTML to scan.
 * @param url - Base URL used to resolve relative `href` values.
 * @param maxLinks - Maximum number of links to return.
 * @param searchTerms - Optional terms that boost links whose label contains them.
 * @returns Up to `maxLinks` `[label, link]` tuples ordered by descending score.
 */
export const extractLinks = (body: string, url: string, maxLinks: number, searchTerms?: string[]): [string, string][] => {
	// Case-insensitive so <A HREF="..."> is found; [\s\S] lets labels span any line break.
	const anchorPattern = /<a\s+[^>]*?href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/gi;
	const seenLinks = new Set<string>();

	return [...body.matchAll(anchorPattern)]
		.map(match => {
			// Attribute values are HTML-escaped; "&amp;" must become "&" to get the real URL.
			const href = (match[1] ?? '').replace(/&amp;/g, '&');
			let link = href;
			if (!href.startsWith('http')) {
				try {
					link = new URL(href, url).href;
				} catch {
					// Unresolvable target: keep the raw value, the filter below drops it.
				}
			}
			return {
				// Escaped \n/\t/\r sequences, whitespace and nested tags each become a space.
				label: (match[2] ?? '').replace(/\\[ntr]|\s|<(?:[^>"]|"[^"]*")+>/g, ' ').trim(),
				link,
			};
		})
		.filter(({ link }) => link.startsWith('http'))
		.map((entry, index, { length }) => {
			// NOTE(review): the previous code blended this value with the label's word count
			// using a "ratio" derived from digits in the link, but that ratio always
			// evaluated to 1, so the blend never took effect. The effective formula is
			// kept here unchanged — confirm whether digit weighting was ever wanted.
			const baseScore = 100 - (entry.label.length + entry.link.length + (20 * index / length));
			const lowerLabel = entry.label.toLowerCase();
			const score = (searchTerms ?? []).reduce(
				(acc, term) => acc + (lowerLabel.includes(term.toLowerCase()) ? 1000 : 0),
				baseScore,
			);
			return { ...entry, score };
		})
		.sort((a, b) => b.score - a.score)
		// After sorting, the first occurrence of a URL is its best-scoring one.
		.filter(({ link }) => {
			if (seenLinks.has(link)) return false;
			seenLinks.add(link);
			return true;
		})
		.slice(0, maxLinks)
		.map(({ label, link }) => [label, link] as [string, string]);
};

/**
 * Collects the images of an HTML document as `[alt, absoluteUrl]` pairs.
 *
 * Only `<img>` tags with a double-quoted `src` are considered. Relative sources are
 * resolved against `url`; the result must start with "http" and end in a known image
 * extension (svg, png, webp, gif, jpg/jpeg), optionally followed by a query string.
 * Images are ranked by alt-text length, plus 1000 for each search term found in the alt
 * text (case-insensitive); the top `maxImages` are returned in document order.
 *
 * @param body - Raw HTML to scan.
 * @param url - Base URL used to resolve relative `src` values.
 * @param maxImages - Maximum number of images to return.
 * @param searchTerms - Optional terms that boost images whose alt text contains them.
 * @returns Up to `maxImages` `[alt, src]` tuples in their original document order.
 */
export const extractImages = (body: string, url: string, maxImages: number, searchTerms?: string[]): [string, string][] => {
	const imageExtension = /\.(svg|png|webp|gif|jpe?g)(\?.*)?$/i;

	// Case-insensitive so <IMG SRC="..."> is recognised as well.
	return [...body.matchAll(/<img(\s+[^>]*)/gi)]
		.flatMap(([, attributes = ''], index) => {
			const alt = /\salt="([^"]+)"/i.exec(attributes)?.[1] || '';
			// Attribute values are HTML-escaped; "&amp;" must become "&" to get the real URL.
			const rawSrc = /\ssrc="([^"]+)"/i.exec(attributes)?.[1]?.replace(/&amp;/g, '&');
			// An image without a source can never be returned.
			if (rawSrc === undefined) return [];

			let src = rawSrc;
			if (!rawSrc.startsWith('http')) {
				try {
					src = new URL(rawSrc, url).href;
				} catch {
					// Unresolvable source: keep the raw value, the check below drops it.
				}
			}
			if (!src.startsWith('http') || !imageExtension.test(src)) return [];

			const lowerAlt = alt.toLowerCase();
			const score = (searchTerms ?? []).reduce(
				(acc, term) => acc + (lowerAlt.includes(term.toLowerCase()) ? 1000 : 0),
				alt.length,
			);
			return [{ index, alt, src, score }];
		})
		// Pick the best-scoring images, then restore document order.
		.sort((a, b) => b.score - a.score)
		.slice(0, maxImages)
		.sort((a, b) => a.index - b.index)
		.map(({ src, alt }) => [alt, src] as [string, string]);
};