// src/parsers/epubParser.ts

// @ts-ignore - epub2 doesn't have complete types
import { EPub } from "epub2";

/** Attributes of one manifest item; the media type may be stored under `media-type` or `mediaType`. */
type ManifestEntry = { [key: string]: string };

/** Media types fetched through `getChapter`; every other type is read as a raw file. */
const CHAPTER_MEDIA_TYPES: ReadonlySet<string> = new Set([
  "application/xhtml+xml",
  "image/svg+xml",
]);

/** Named character references decoded by `decodeEntities`; anything else is left untouched. */
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", " "],
]);

/** Decode numeric and the most common named character references in a single pass. */
function decodeEntities(input: string): string {
  return input.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (match: string, body: string) => {
    if (body.startsWith("#")) {
      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws on out-of-range values, so keep such references verbatim.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    }
    return NAMED_ENTITIES.get(body.toLowerCase()) ?? match;
  });
}

/** Replace markup with spaces: script/style blocks and comments go entirely, tags go, entities are decoded. */
function stripHtml(input: string): string {
  const withoutMarkup = input
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, " ")
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<[^>]*>/g, " ");
  // Decode after tag removal so that an escaped "&lt;b&gt;" is kept as literal text.
  return decodeEntities(withoutMarkup);
}

/** Return the manifest entry's media type, or "" when it is absent. */
function mediaTypeOf(entry: ManifestEntry): string {
  return entry["media-type"] || entry["mediaType"] || "";
}

/** True when the item should be fetched with `getFile` instead of `getChapter`. */
function shouldReadRaw(mediaType: string): boolean {
  return !CHAPTER_MEDIA_TYPES.has(mediaType.toLowerCase());
}

/** Collapse horizontal whitespace to single spaces and newline runs to a single newline, then trim. */
function normalizeWhitespace(input: string): string {
  return input
    // Newlines are excluded here; collapsing them first would erase every line and chapter break.
    .replace(/[^\S\n]+/g, " ")
    .replace(/ ?\n[ \n]*/g, "\n")
    .trim();
}

/**
 * Parse an EPUB file and extract its text content.
 *
 * Items are read one after another in the order given by the parser's `flow`.
 * Markup is stripped, common character references are decoded, and whitespace
 * is normalized so that line breaks survive as single newlines.
 *
 * The returned promise never rejects: parser and initialization errors are
 * logged and yield an empty string, and an item that fails to load is logged
 * and skipped.
 *
 * @param filePath - Path of the EPUB file handed to the `EPub` constructor.
 * @returns The extracted plain text, or "" when the file could not be parsed.
 */
export async function parseEPUB(filePath: string): Promise<string> {
  return new Promise((resolve) => {
    try {
      const epub = new EPub(filePath);

      epub.on("error", (error: Error) => {
        console.error(`Error parsing EPUB file ${filePath}:`, error);
        resolve("");
      });

      // Looked up on every call rather than captured once, so the value present at read time is used.
      const getManifestEntry = (chapterId: string): ManifestEntry | undefined =>
        (epub as unknown as { manifest?: Record<string, ManifestEntry> }).manifest?.[chapterId];

      const readChapter = async (chapterId: string): Promise<string> => {
        const manifestEntry = getManifestEntry(chapterId);
        if (!manifestEntry) {
          console.warn(`EPUB chapter ${chapterId} missing manifest entry in ${filePath}, skipping`);
          return "";
        }

        if (shouldReadRaw(mediaTypeOf(manifestEntry))) {
          return new Promise<string>((res, rej) => {
            epub.getFile(chapterId, (error: Error | null, data?: Buffer) => {
              if (error) {
                rej(error);
              } else {
                res(data ? stripHtml(data.toString("utf-8")) : "");
              }
            });
          });
        }

        return new Promise<string>((res, rej) => {
          epub.getChapter(chapterId, (error: Error | null, text?: string) => {
            if (error) {
              rej(error);
            } else {
              res(typeof text === "string" ? stripHtml(text) : "");
            }
          });
        });
      };

      epub.on("end", async () => {
        try {
          const textParts: string[] = [];

          for (const chapter of epub.flow ?? []) {
            const chapterId = chapter.id;
            if (!chapterId) {
              console.warn(`EPUB chapter missing id in ${filePath}, skipping`);
              continue;
            }

            try {
              textParts.push(await readChapter(chapterId));
            } catch (chapterError) {
              // Best effort: one unreadable item must not discard the rest of the book.
              console.error(`Error reading chapter ${chapterId}:`, chapterError);
            }
          }

          resolve(normalizeWhitespace(textParts.join("\n\n")));
        } catch (error) {
          console.error(`Error processing EPUB chapters:`, error);
          resolve("");
        }
      });

      epub.parse();
    } catch (error) {
      console.error(`Error initializing EPUB parser for ${filePath}:`, error);
      resolve("");
    }
  });
}