// Forked from danielsig/visit-website
// src/extractor.ts
/**
 * Converts an HTML fragment to plain text.
 *
 * Script, style, comment and nav blocks are dropped, block-level closing tags
 * and `<br>` become line breaks, every remaining tag becomes a space, a small
 * set of character references is decoded, and whitespace is collapsed.
 */
const htmlFragmentToText = (html: string): string => {
  // `amp` must stay last so that an escaped reference such as "amp;lt;" is
  // decoded exactly once instead of collapsing all the way to "<".
  const entities: ReadonlyArray<readonly [string, string]> = [
    ['nbsp', ' '],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['#39', "'"],
    ['amp', '&'],
  ];

  const withoutTags = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<!--[\s\S]*?-->/g, '')
    .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
    .replace(/<\/(div|p|h[1-6]|li|tr|br|section|article|main)>/gi, '\n')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<[^>]+>/g, ' ');

  // Decode only after the tags are gone: decoding first would turn escaped
  // markup in the text into something the tag stripper then deletes.
  const decoded = entities
    .reduce((text, [name, char]) => text.split(`&${name};`).join(char), withoutTags)
    .replace(/\u00a0/g, ' ');

  return decoded
    .replace(/[ \t]+/g, ' ')
    .replace(/\n\s+/g, '\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim();
};

/**
 * Extracts readable text from an HTML document.
 *
 * The first of `<main>`, `<article>` or a content-like `<div>` whose inner HTML
 * is longer than 100 characters is used as the content area; otherwise the
 * whole body is used. If the chosen area yields fewer than 50 characters of
 * text, the whole body is converted instead.
 *
 * @param body - Raw HTML.
 * @param contentLimit - Maximum number of characters to return when no search
 *   terms apply. With search terms it sizes the context kept around each term.
 * @param searchTerms - Optional terms. When given and the text is longer than
 *   `contentLimit`, the result is the concatenation (in document order) of the
 *   first case-insensitive hit of each term with surrounding same-line context.
 *   Terms that do not occur contribute nothing, so the result may be empty.
 * @returns The extracted text, or `''` when the document has no text.
 */
export const extractTextContent = (body: string, contentLimit: number, searchTerms?: string[]): string => {
  const mainMarkers = [
    /<main[^>]*>([\s\S]*)<\/main>/i,
    /<article[^>]*>([\s\S]*)<\/article>/i,
    /<div[^>]*class="[^"]*(?:content|page|post|article|main)[^"]*"[^>]*>([\s\S]*)<\/div>/i,
  ];

  let contentArea = body;
  for (const marker of mainMarkers) {
    const inner = marker.exec(body)?.[1];
    if (inner !== undefined && inner.length > 100) {
      contentArea = inner;
      break;
    }
  }

  let cleanBody = htmlFragmentToText(contentArea);
  if (cleanBody.length < 50 && contentArea !== body) {
    // The selected area was mostly markup/script. Convert the whole body
    // directly; re-entering this function would select the same area again
    // and never terminate.
    cleanBody = htmlFragmentToText(body);
  }
  if (!cleanBody) return '';

  if (searchTerms?.length && contentLimit < cleanBody.length) {
    // Split the budget evenly: each term gets the same amount of leading and
    // trailing context. `.` does not cross line breaks, so context stays on
    // the line of the hit.
    const contextSize = Math.max(0, Math.floor(contentLimit / (searchTerms.length * 2)));
    const padding = `.{0,${contextSize}}`;
    const matches = searchTerms
      .map(term => {
        const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
        return new RegExp(padding + escapedTerm + padding, 'gi').exec(cleanBody);
      })
      .filter((match): match is RegExpExecArray => match !== null)
      .sort((a, b) => a.index - b.index);

    let content = '';
    let nextMinIndex = 0;
    for (const match of matches) {
      const end = match.index + match[0].length;
      // Entirely covered by text already emitted: skip it without moving the
      // cursor backwards, otherwise later snippets could repeat text.
      if (end <= nextMinIndex) continue;
      content += match.index >= nextMinIndex
        ? match[0]
        : match[0].slice(nextMinIndex - match.index);
      nextMinIndex = end;
    }
    return content;
  }

  return cleanBody.slice(0, contentLimit);
};
/**
 * Collects the hyperlinks of an HTML document, ranked by a brevity/position
 * score and optionally boosted by search terms.
 *
 * @param body - Raw HTML.
 * @param url - URL of the page; relative hrefs are resolved against it.
 * @param maxLinks - Maximum number of links to return.
 * @param searchTerms - Optional terms; each term found (case-insensitively) in
 *   a link's label adds 1000 to that link's score.
 * @returns `[label, absoluteUrl]` pairs, best score first, with duplicate URLs
 *   removed (the highest-scoring occurrence is kept). Only http(s) links are
 *   returned.
 */
export const extractLinks = (
  body: string,
  url: string,
  maxLinks: number,
  searchTerms?: string[],
): [string, string][] => {
  // Case-insensitive, accepts double- or single-quoted hrefs, and uses [\s\S]
  // for the label so anchors spanning CRLF line breaks are matched too.
  const anchorPattern = /<a\s+[^>]*?href=(?:"([^"]+)"|'([^']+)')[^>]*>([\s\S]*?)<\/a>/gi;
  // A bare "http" prefix is not enough: "http-guide.html" is a relative path.
  const isAbsoluteHttp = (candidate: string): boolean => /^https?:\/\//.test(candidate);
  const loweredTerms = (searchTerms ?? []).map(term => term.toLowerCase());

  const candidates = [...body.matchAll(anchorPattern)]
    .map(match => {
      const href = match[1] ?? match[2] ?? '';
      const label = (match[3] ?? '').replace(/\\[ntr]|\s|<(?:[^>"]|"[^"]*")+>/g, ' ').trim();
      let link = href;
      if (!isAbsoluteHttp(href)) {
        try {
          link = new URL(href, url).href;
        } catch {
          // Unresolvable href: keep the raw value, the filter below drops it.
        }
      }
      return { label, link };
    })
    .filter(({ link }) => isAbsoluteHttp(link));

  const total = candidates.length;
  const seen = new Set<string>();

  return candidates
    .map((candidate, index) => {
      // Shorter labels/URLs and earlier position score higher.
      // NOTE(review): the previous code also computed a digit-based `ratio`
      // that always evaluated to 1, so it never influenced the score. It is
      // left out here rather than "revived", which would reshuffle rankings —
      // confirm the intended weighting before reintroducing it.
      const base = 100 - (candidate.label.length + candidate.link.length + (20 * index) / total);
      const lowerLabel = candidate.label.toLowerCase();
      const score = loweredTerms.reduce(
        (acc, term) => acc + (lowerLabel.includes(term) ? 1000 : 0),
        base,
      );
      return { ...candidate, score };
    })
    .sort((a, b) => b.score - a.score)
    .filter(({ link }) => {
      if (seen.has(link)) return false;
      seen.add(link);
      return true;
    })
    .slice(0, maxLinks)
    .map(({ label, link }) => [label, link] as [string, string]);
};
/**
 * Collects the images of an HTML document.
 *
 * Images are ranked by the length of their alt text (plus 1000 per search term
 * found in it), the best `maxImages` are kept, and those are returned in
 * document order.
 *
 * @param body - Raw HTML.
 * @param url - URL of the page; relative `src` values are resolved against it.
 * @param maxImages - Maximum number of images to return.
 * @param searchTerms - Optional terms matched case-insensitively against alt text.
 * @returns `[alt, absoluteUrl]` pairs for http(s) images whose path ends in
 *   svg, png, webp, gif, jpg or jpeg (optionally followed by a query or fragment).
 */
export const extractImages = (
  body: string,
  url: string,
  maxImages: number,
  searchTerms?: string[],
): [string, string][] => {
  // A bare "http" prefix is not enough: "httpd-logo.png" is a relative path.
  const isAbsoluteHttp = (candidate: string): boolean => /^https?:\/\//.test(candidate);
  const imageExtension = /\.(svg|png|webp|gif|jpe?g)([?#].*)?$/i;
  const altPattern = /\salt=(?:"([^"]+)"|'([^']+)')/i;
  const srcPattern = /\ssrc=(?:"([^"]+)"|'([^']+)')/i;
  const loweredTerms = (searchTerms ?? []).map(term => term.toLowerCase());

  return [...body.matchAll(/<img(\s+[^>]*)/gi)]
    .flatMap((match, index) => {
      const attributes = match[1] ?? '';
      const srcMatch = attributes.match(srcPattern);
      const rawSrc = srcMatch?.[1] ?? srcMatch?.[2];
      // No src attribute: nothing to link to.
      if (!rawSrc) return [];

      let src = rawSrc;
      if (!isAbsoluteHttp(rawSrc)) {
        try {
          src = new URL(rawSrc, url).href;
        } catch {
          // Unresolvable src: keep the raw value, the filter below drops it.
        }
      }

      const altMatch = attributes.match(altPattern);
      const alt = altMatch?.[1] ?? altMatch?.[2] ?? '';
      const lowerAlt = alt.toLowerCase();
      // Longer alt text is treated as more descriptive; term hits dominate.
      const score = loweredTerms.reduce(
        (acc, term) => acc + (lowerAlt.includes(term) ? 1000 : 0),
        alt.length,
      );
      return [{ index, alt, src, score }];
    })
    .filter(({ src }) => isAbsoluteHttp(src) && imageExtension.test(src))
    .sort((a, b) => b.score - a.score)
    .slice(0, maxImages)
    // Restore document order among the images that made the cut.
    .sort((a, b) => a.index - b.index)
    .map(({ src, alt }) => [alt, src] as [string, string]);
};