// dist/services/ingestion/youtube.js

"use strict";
/**
 * Module-interop helper (appears to be the standard TypeScript compiler emit —
 * confirm before hand-editing): exposes property `k` of module `m` on object
 * `o` under the name `k2`, where `k2` defaults to `k`.
 *
 * An implementation already present on `this.__createBinding` takes precedence.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    // Install a getter that reads m[k] on every access when: there is no own
    // descriptor, or the property is an accessor on a module not flagged
    // __esModule, or it is a writable/configurable data property.
    // Otherwise the descriptor found on `m` is reused unchanged.
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    // Fallback when Object.create is unavailable: copy the current value once.
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
/**
 * Module-interop helper: stores `value` as the `default` member of `target`.
 *
 * When Object.create exists the member is defined through
 * Object.defineProperty as an enumerable property; otherwise it is set by
 * plain assignment. An implementation already present on
 * `this.__setModuleDefault` takes precedence.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (!Object.create) {
        return function (target, value) {
            target["default"] = value;
        };
    }
    return function (target, value) {
        Object.defineProperty(target, "default", { value: value, enumerable: true });
    };
})();
/**
 * Module-interop helper: turns the result of `require()` into a namespace-style
 * object.
 *
 * - A truthy module flagged `__esModule` is returned as-is.
 * - Otherwise a fresh object is built: every own key of `mod` except "default"
 *   is bound onto it via `__createBinding`, and `mod` itself becomes its
 *   `default` member via `__setModuleDefault`.
 *
 * An implementation already present on `this.__importStar` takes precedence.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Lazily picks the key-listing function: the first call replaces `ownKeys`
    // with Object.getOwnPropertyNames (or a for-in/hasOwnProperty fallback) and
    // then delegates to it, so the choice is made only once.
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        // `!= null` skips both null and undefined; "default" is excluded here
        // because it is set explicitly on the next line.
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
/**
 * Module-interop helper: guarantees the returned object has a `default` member.
 *
 * A truthy module flagged `__esModule` is handed back untouched; anything else
 * (including null/undefined) is wrapped as `{ default: <value> }`.
 * An implementation already present on `this.__importDefault` takes precedence.
 */
var __importDefault = (this && this.__importDefault) || function (loaded) {
    if (loaded && loaded.__esModule) {
        return loaded;
    }
    return { "default": loaded };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.downloadAndTranscribeYoutube = downloadAndTranscribeYoutube;
const ytdlp_nodejs_1 = require("ytdlp-nodejs");
const ffmpeg_static_1 = __importDefault(require("ffmpeg-static"));
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
const os = __importStar(require("os"));
const crypto = __importStar(require("crypto"));
const child_process_1 = require("child_process");
// Platform identifier returned by os.platform(); compared against 'win32' and
// 'darwin' further down when choosing a yt-dlp binary.
const platform = os.platform();
// Path of the yt-dlp executable handed to YtDlp; assigned once at module load
// by the selection logic that follows getSystemBinary().
let ytdlpBinaryPath;
/**
 * Resolves an executable on the PATH by running `which`.
 *
 * `which` is invoked directly with `name` as a single argument, so `name` is
 * never parsed by a shell (characters such as `;`, `|` or `$()` are inert).
 *
 * @param {string} name - Bare executable name to look up, e.g. "yt-dlp".
 * @returns {string|null} The trimmed path printed by `which`, or null when the
 *   lookup fails for any reason (not found, `which` unavailable, bad argument)
 *   or prints nothing.
 */
function getSystemBinary(name) {
    try {
        const resolved = (0, child_process_1.execFileSync)('which', [name], {
            stdio: 'pipe', // keep `which` diagnostics off the parent's stderr
            encoding: 'utf8',
        }).trim();
        // An empty answer is not a usable path; report it as "not found".
        return resolved || null;
    }
    catch {
        // Deliberate best-effort: any failure simply means "no system binary".
        return null;
    }
}
// Prefer a yt-dlp found on the PATH (on Fedora/Linux the system install is
// often the better choice); otherwise use the platform-specific binary name
// inside ytdlp-nodejs's BIN_DIR.
const systemYtdlp = getSystemBinary("yt-dlp");
if (systemYtdlp) {
    ytdlpBinaryPath = systemYtdlp;
}
else {
    let bundledName;
    switch (platform) {
        case 'win32':
            bundledName = 'yt-dlp.exe';
            break;
        case 'darwin':
            bundledName = 'yt-dlp_macos';
            break;
        default:
            bundledName = 'yt-dlp';
            break;
    }
    ytdlpBinaryPath = path.join(ytdlp_nodejs_1.BIN_DIR, bundledName);
}
/**
 * Spawns an external program (no shell) and settles when it finishes.
 *
 * stdin is not connected, and both output streams are always consumed so the
 * child cannot stall on a full pipe. Without an `onStderr` handler, the last
 * few KB of stderr are retained and printed if the program fails.
 *
 * @param {string} label - Name used in error and log messages.
 * @param {string} command - Executable to run.
 * @param {string[]} args - Arguments passed verbatim to the executable.
 * @param {{ onStdout?: (chunk: Buffer) => void, onStderr?: (chunk: Buffer) => void }} [handlers]
 *   Optional per-chunk callbacks for the child's output.
 * @returns {Promise<void>} Resolves on exit code 0; rejects if the program
 *   cannot be started or exits with a non-zero code / signal.
 */
function runChildProcess(label, command, args, handlers = {}) {
    const maxRetainedStderr = 4096;
    return new Promise((resolve, reject) => {
        const child = (0, child_process_1.spawn)(command, args, { stdio: ['ignore', 'pipe', 'pipe'] });
        let stderrTail = '';
        child.stdout.on('data', (chunk) => {
            if (handlers.onStdout)
                handlers.onStdout(chunk);
        });
        child.stderr.on('data', (chunk) => {
            if (handlers.onStderr) {
                handlers.onStderr(chunk);
                return;
            }
            stderrTail = (stderrTail + chunk).slice(-maxRetainedStderr);
        });
        // Without this listener a failed spawn (missing or non-executable binary)
        // is an unhandled 'error' event and the promise would never settle.
        child.on('error', (err) => reject(new Error(`${label} failed to start: ${err.message}`, { cause: err })));
        child.on('close', (code, signal) => {
            if (code === 0) {
                resolve();
                return;
            }
            if (stderrTail)
                console.error(`[${label} Err] ${stderrTail}`);
            // `code` is null when the child was terminated by a signal.
            reject(new Error(`${label} exited with ${code !== null ? code : signal}`));
        });
    });
}
/**
 * Downloads the audio track of a YouTube URL, converts it to 16 kHz mono
 * 16-bit PCM WAV with ffmpeg, runs a local Whisper binary on it and returns
 * the transcript text.
 *
 * All intermediate files are created in `outputDir` under a random per-run
 * name and are removed again (best effort) before the function settles.
 *
 * @param {string} url - Video URL handed to yt-dlp.
 * @param {string} outputDir - Directory for temporary audio/transcript files.
 * @param {string} [whisperPath] - Path to the Whisper executable. When missing
 *   or not present on disk, `whisper-cpp`, `whisper-cli` and `whisper` are
 *   looked up on the PATH, in that order.
 * @returns {Promise<string>} The transcript as UTF-8 text.
 * @throws {Error} If no Whisper binary or ffmpeg is available, if any step
 *   fails, or if Whisper produced no transcript file.
 */
async function downloadAndTranscribeYoutube(url, outputDir, whisperPath) {
    // Resolve into a local instead of overwriting the caller-visible parameter.
    let whisperBinary = whisperPath;
    if (!whisperBinary || !fs.existsSync(whisperBinary)) {
        // `whisper-cli` is the current upstream whisper.cpp binary name. It is
        // tried before the generic `whisper`, which may be a different tool.
        whisperBinary = getSystemBinary("whisper-cpp") || getSystemBinary("whisper-cli") || getSystemBinary("whisper");
        if (!whisperBinary) {
            throw new Error("Whisper binary path is not configured or does not exist. On Fedora, install it with 'sudo dnf install whisper-cpp' or set 'whisperBinaryPath' in settings.");
        }
    }
    if (!ffmpeg_static_1.default)
        throw new Error("ffmpeg not found");
    const ytdlp = new ytdlp_nodejs_1.YtDlp({
        binaryPath: ytdlpBinaryPath,
        ffmpegPath: ffmpeg_static_1.default,
    });
    const runId = crypto.randomBytes(4).toString("hex");
    const audioRaw = path.join(outputDir, `yt_${runId}_raw.wav`);
    const outputPrefix = path.join(outputDir, `yt_${runId}_16k`);
    const audio16k = `${outputPrefix}.wav`;
    // Whisper is given `outputPrefix` via -of and is expected to write
    // `<prefix>.txt`. `<prefix>.wav.txt` is also accepted (and checked first, as
    // before) in case the binary derives the name from the input file instead.
    const transcriptCandidates = [`${outputPrefix}.wav.txt`, `${outputPrefix}.txt`];
    try {
        console.log(`[YouTube] Downloading ${url}...`);
        await ytdlp.downloadAsync(url, {
            format: { filter: "audioonly", type: "wav" },
            output: audioRaw,
        });
        console.log(`[YouTube] Converting to 16kHz WAV...`);
        await runChildProcess('ffmpeg', ffmpeg_static_1.default, ['-i', audioRaw, '-ar', '16000', '-ac', '1', '-c:a', 'pcm_s16le', audio16k]);
        console.log(`[YouTube] Transcribing with local Whisper...`);
        // whisper.cpp CLI: -f <input wav>, -otxt (write a text file), -of <output path without extension>
        await runChildProcess('Whisper', whisperBinary, ['-f', audio16k, '-otxt', '-of', outputPrefix], {
            onStdout: (data) => console.log(`[Whisper] ${data}`),
            onStderr: (data) => console.error(`[Whisper Err] ${data}`),
        });
        const transcriptPath = transcriptCandidates.find((candidate) => fs.existsSync(candidate));
        if (!transcriptPath) {
            throw new Error("Transcription output file not found.");
        }
        return fs.readFileSync(transcriptPath, 'utf-8');
    }
    finally {
        // Best-effort cleanup: a failed delete must not replace the transcript
        // or the original error with an unrelated filesystem exception.
        for (const file of [audioRaw, audio16k, ...transcriptCandidates]) {
            try {
                if (fs.existsSync(file))
                    fs.unlinkSync(file);
            }
            catch (err) {
                console.warn(`[YouTube] Could not remove temporary file ${file}: ${err.message}`);
            }
        }
    }
}