// src/services/ingestion/youtube.ts

import { YtDlp, BIN_DIR } from "ytdlp-nodejs";
import ffmpegPath from "ffmpeg-static";
import * as fs from "fs";
import * as path from "path";
import * as os from "os";
import * as crypto from "crypto";
import { spawn, execSync } from "child_process";

// Host operating system identifier as reported by os.platform(); the
// module-level code below compares it against 'win32' and 'darwin'.
const platform = os.platform();
// Path of the yt-dlp executable. Declared without an initializer because the
// module-level resolution logic below assigns it exactly once.
let ytdlpBinaryPath: string;

/**
 * Looks up an executable by name on the system PATH.
 *
 * The lookup shells out to `which` (or `where` on Windows), so `name` must be
 * a plain executable name: anything containing shell metacharacters, path
 * separators or whitespace is rejected without running a command.
 *
 * @param name - Bare executable name, e.g. `"yt-dlp"`.
 * @returns The first path reported by the lookup tool, or `null` when the name
 *          is not a plain executable name, the tool reports no match, or the
 *          lookup command itself fails.
 */
function getSystemBinary(name: string): string | null {
    // `name` is interpolated into a shell command line below; only allow a
    // conservative character set so it can never inject extra commands/options.
    if (!/^[A-Za-z0-9][A-Za-z0-9._+-]*$/.test(name)) {
        return null;
    }

    // cmd.exe has no `which`; `where` is the Windows equivalent.
    const locator = os.platform() === 'win32' ? 'where' : 'which';

    try {
        const output = execSync(`${locator} ${name}`, { stdio: 'pipe' }).toString();
        // The lookup tool may print several matches (one per line); keep the
        // first non-empty one so callers always get a single usable path.
        const firstMatch = output
            .split(/\r?\n/)
            .map((line) => line.trim())
            .find((line) => line.length > 0);
        return firstMatch ? firstMatch : null;
    } catch {
        // Non-zero exit status (not found) or the lookup tool is unavailable.
        return null;
    }
}

// Prefer a yt-dlp found on the system PATH (often the better choice on
// Fedora/Linux); otherwise fall back to the binary shipped in BIN_DIR, whose
// file name depends on the host platform.
const systemYtdlp = getSystemBinary("yt-dlp");
ytdlpBinaryPath = systemYtdlp
    ? systemYtdlp
    : path.join(
        BIN_DIR,
        platform === 'win32'
            ? 'yt-dlp.exe'
            : platform === 'darwin'
                ? 'yt-dlp_macos'
                : 'yt-dlp'
    );

/**
 * Runs an external program to completion.
 *
 * stdin is closed, and both stdout and stderr are always drained so the child
 * can never block on a full pipe. A failure to start the program (for example
 * a missing binary) rejects the promise instead of surfacing as an unhandled
 * 'error' event.
 *
 * @param label - Human-readable program name used in error messages.
 * @param command - Executable to run.
 * @param args - Arguments passed to the executable (no shell is involved).
 * @param onStdout - Optional sink for stdout chunks.
 * @param onStderr - Optional sink for stderr chunks.
 * @returns A promise that resolves when the program exits with code 0.
 * @throws Error (as a rejection) when the program cannot be started, exits
 *         with a non-zero code, or is terminated by a signal.
 */
function runToolToCompletion(
    label: string,
    command: string,
    args: string[],
    onStdout?: (chunk: Buffer) => void,
    onStderr?: (chunk: Buffer) => void
): Promise<void> {
    // Only the end of stderr is kept for diagnostics, so memory stays bounded.
    const stderrTailLimit = 2000;

    return new Promise<void>((resolve, reject) => {
        const child = spawn(command, args, { stdio: ['ignore', 'pipe', 'pipe'] });
        let stderrTail = '';

        child.stdout.on('data', (chunk: Buffer) => {
            if (onStdout) onStdout(chunk);
        });
        child.stderr.on('data', (chunk: Buffer) => {
            stderrTail = (stderrTail + chunk.toString()).slice(-stderrTailLimit);
            if (onStderr) onStderr(chunk);
        });

        // Emitted when the process could not be spawned at all; without this
        // listener the event would be thrown as an uncaught exception.
        child.on('error', (err) => {
            reject(new Error(`${label} failed to start: ${err.message}`));
        });

        child.on('close', (code, signal) => {
            if (code === 0) {
                resolve();
                return;
            }
            const status = code !== null ? String(code) : `signal ${signal}`;
            const detail = stderrTail.trim();
            reject(new Error(
                detail
                    ? `${label} exited with ${status}: ${detail}`
                    : `${label} exited with ${status}`
            ));
        });
    });
}

/**
 * Downloads the audio of a YouTube video, converts it to 16 kHz mono PCM WAV
 * and transcribes it with a local whisper.cpp-style binary.
 *
 * All intermediate files are written to `outputDir` under a random per-run
 * prefix and are removed (best effort) before the function returns or throws.
 *
 * @param url - Video URL handed to yt-dlp.
 * @param outputDir - Directory for intermediate files; created if missing.
 * @param whisperPath - Path to the whisper binary. When unset or not present
 *        on disk, `whisper-cpp`, `whisper-cli` and `whisper` are looked up on
 *        the system PATH, in that order.
 * @returns The transcript text produced by whisper.
 * @throws Error when no whisper binary or ffmpeg is available, when any of the
 *         external tools fails, or when whisper produced no transcript file.
 */
export async function downloadAndTranscribeYoutube(
    url: string,
    outputDir: string,
    whisperPath: string | undefined
): Promise<string> {
    // Resolve into a const so the value stays a definite string everywhere below.
    const whisperBinary = whisperPath && fs.existsSync(whisperPath)
        ? whisperPath
        : (getSystemBinary("whisper-cpp")
            // Newer whisper.cpp builds name their CLI `whisper-cli`.
            || getSystemBinary("whisper-cli")
            || getSystemBinary("whisper"));
    if (!whisperBinary) {
        throw new Error("Whisper binary path is not configured or does not exist. On Fedora, install it with 'sudo dnf install whisper-cpp' or set 'whisperBinaryPath' in settings.");
    }

    const ffmpegBinary = ffmpegPath;
    if (!ffmpegBinary) throw new Error("ffmpeg not found");

    const ytdlp = new YtDlp({
        binaryPath: ytdlpBinaryPath,
        ffmpegPath: ffmpegBinary,
    });

    // The tools below fail with unhelpful errors if the target directory is missing.
    fs.mkdirSync(outputDir, { recursive: true });

    const runId = crypto.randomBytes(4).toString("hex");
    const audioRaw = path.join(outputDir, `yt_${runId}_raw.wav`);
    const audio16k = path.join(outputDir, `yt_${runId}_16k.wav`);
    const transcriptPrefix = path.join(outputDir, `yt_${runId}_16k`);
    // whisper.cpp documents `-of` as the output path without extension, so the
    // transcript is expected at `<prefix>.txt`. `<prefix>.wav.txt` is checked
    // as well in case the binary derives the name from the input file instead.
    const transcriptCandidates = [`${transcriptPrefix}.wav.txt`, `${transcriptPrefix}.txt`];

    try {
        console.log(`[YouTube] Downloading ${url}...`);
        await ytdlp.downloadAsync(url, {
            format: { filter: "audioonly", type: "wav" },
            output: audioRaw,
        });

        console.log(`[YouTube] Converting to 16kHz WAV...`);
        await runToolToCompletion('ffmpeg', ffmpegBinary, [
            '-i', audioRaw,
            '-ar', '16000',
            '-ac', '1',
            '-c:a', 'pcm_s16le',
            audio16k,
        ]);

        console.log(`[YouTube] Transcribing with local Whisper...`);
        // Standard whisper.cpp CLI: <binary> -f input.wav -otxt -of <output prefix>
        await runToolToCompletion(
            'Whisper',
            whisperBinary,
            [
                '-f', audio16k,
                '-otxt', // Write a plain-text transcript
                '-of', transcriptPrefix,
            ],
            (data) => console.log(`[Whisper] ${data}`),
            (data) => console.error(`[Whisper Err] ${data}`)
        );

        const transcriptPath = transcriptCandidates.find((candidate) => fs.existsSync(candidate));
        if (!transcriptPath) {
            throw new Error("Transcription output file not found.");
        }
        return fs.readFileSync(transcriptPath, 'utf-8');
    } finally {
        // Best-effort cleanup: a failure to delete a temp file must not replace
        // the transcript being returned or the error being propagated.
        for (const file of [audioRaw, audio16k, ...transcriptCandidates]) {
            try {
                if (fs.existsSync(file)) fs.unlinkSync(file);
            } catch (err) {
                console.error(`[YouTube] Failed to remove temporary file ${file}:`, err);
            }
        }
    }
}