From 6343978c5fbc557267a3b0b725bd829a0f83af6f Mon Sep 17 00:00:00 2001 From: Peifan Li Date: Tue, 9 Dec 2025 22:06:31 -0500 Subject: [PATCH] feat: Add yt-dlp download functionality and helpers --- .../services/downloaders/YtDlpDownloader.ts | 1069 ++++++++++------- build-and-push.sh | 4 +- 2 files changed, 658 insertions(+), 415 deletions(-) diff --git a/backend/src/services/downloaders/YtDlpDownloader.ts b/backend/src/services/downloaders/YtDlpDownloader.ts index d728180..3cd5704 100644 --- a/backend/src/services/downloaders/YtDlpDownloader.ts +++ b/backend/src/services/downloaders/YtDlpDownloader.ts @@ -1,451 +1,694 @@ import axios from "axios"; +import { spawn } from "child_process"; import fs from "fs-extra"; import path from "path"; -import youtubedl from "youtube-dl-exec"; import { IMAGES_DIR, SUBTITLES_DIR, VIDEOS_DIR } from "../../config/paths"; import { formatVideoFilename } from "../../utils/helpers"; import * as storageService from "../storageService"; import { Video } from "../storageService"; const YT_DLP_PATH = process.env.YT_DLP_PATH || "yt-dlp"; -const PROVIDER_SCRIPT = process.env.BGUTIL_SCRIPT_PATH || path.join(process.cwd(), "bgutil-ytdlp-pot-provider/server/build/generate_once.js"); +const PROVIDER_SCRIPT = + process.env.BGUTIL_SCRIPT_PATH || + path.join( + process.cwd(), + "bgutil-ytdlp-pot-provider/server/build/generate_once.js" + ); + +/** + * Convert camelCase flag names to kebab-case CLI arguments + */ +function convertFlagToArg(flag: string): string { + return `--${flag.replace(/([A-Z])/g, "-$1").toLowerCase()}`; +} + +/** + * Convert flags object to yt-dlp CLI arguments array + */ +function flagsToArgs(flags: Record): string[] { + const args: string[] = []; + + for (const [key, value] of Object.entries(flags)) { + if (value === undefined || value === null) { + continue; + } + + // Handle special cases + if (key === "extractorArgs") { + // Support semicolon-separated extractor args (e.g., "youtube:key=value;other:key=value") + if (typeof value === "string" && value.includes(";")) { + const parts = value.split(";"); + for (const part of parts) { + if (part.trim()) { + args.push("--extractor-args", part.trim()); + } + } + } else { + args.push("--extractor-args", value); + } + continue; + } + + if (key === "addHeader") { + // addHeader is an array of "key:value" strings + if (Array.isArray(value)) { + for (const header of value) { + args.push("--add-header", header); + } + } else { + args.push("--add-header", value); + } + continue; + } + + // Convert camelCase to kebab-case + const argName = convertFlagToArg(key); + + if (typeof value === "boolean") { + if (value) { + args.push(argName); + } + } else if (typeof value === "string" || typeof value === "number") { + args.push(argName, String(value)); + } else if (Array.isArray(value)) { + // For arrays, join with comma or repeat the flag + args.push(argName, value.join(",")); + } + } + + return args; +} + +/** + * Execute yt-dlp with JSON output and return parsed result + */ +async function executeYtDlpJson( + url: string, + flags: Record = {} +): Promise { + const args = [ + "--dump-single-json", + "--no-warnings", + ...flagsToArgs(flags), + url, + ]; + + console.log(`Executing: ${YT_DLP_PATH} ${args.join(" ")}`); + + return new Promise((resolve, reject) => { + const subprocess = spawn(YT_DLP_PATH, args, { + stdio: ["ignore", "pipe", "pipe"], + }); + + let stdout = ""; + let stderr = ""; + + subprocess.stdout?.on("data", (data: Buffer) => { + stdout += data.toString(); + }); + + subprocess.stderr?.on("data", (data: Buffer) => { + stderr += data.toString(); + }); + + subprocess.on("close", (code) => { + if (code !== 0) { + const error = new Error(`yt-dlp process exited with code ${code}`); + (error as any).stderr = stderr; + reject(error); + return; + } + + if ( + stderr && + !stderr.includes("[download]") && + !stderr.includes("[info]") + ) { + console.warn("yt-dlp stderr:", stderr); + } + + try { + resolve(JSON.parse(stdout)); + } catch (parseError) { + console.error("Failed to parse yt-dlp JSON output:", parseError); + console.error("Output:", stdout); + reject(new Error("Failed to parse yt-dlp output as JSON")); + } + }); + + subprocess.on("error", (error) => { + reject(error); + }); + }); +} + +/** + * Execute yt-dlp with spawn for progress tracking + * Returns a subprocess-like object with kill() method + */ +function executeYtDlpSpawn( + url: string, + flags: Record = {} +): { + stdout: NodeJS.ReadableStream | null; + stderr: NodeJS.ReadableStream | null; + kill: (signal?: NodeJS.Signals) => boolean; + then: ( + onFulfilled?: (value: void) => void | Promise, + onRejected?: (reason: any) => void | Promise + ) => Promise; +} { + const args = [...flagsToArgs(flags), url]; + + console.log(`Spawning: ${YT_DLP_PATH} ${args.join(" ")}`); + + const subprocess = spawn(YT_DLP_PATH, args, { + stdio: ["ignore", "pipe", "pipe"], + }); + + let resolved = false; + let rejected = false; + let resolveFn: (() => void) | null = null; + let rejectFn: ((error: Error) => void) | null = null; + + const promise = new Promise((resolve, reject) => { + resolveFn = resolve; + rejectFn = reject; + + subprocess.on("close", (code) => { + if (code === 0) { + if (!resolved && !rejected) { + resolved = true; + resolve(); + } + } else { + if (!resolved && !rejected) { + rejected = true; + reject(new Error(`yt-dlp process exited with code ${code}`)); + } + } + }); + + subprocess.on("error", (error) => { + if (!resolved && !rejected) { + rejected = true; + reject(error); + } + }); + }); + + return { + stdout: subprocess.stdout, + stderr: subprocess.stderr, + kill: (signal?: NodeJS.Signals) => { + if (!subprocess.killed) { + return subprocess.kill(signal); + } + return false; + }, + then: promise.then.bind(promise), + }; +} // Helper function to extract author from XiaoHongShu page when yt-dlp doesn't provide it async function extractXiaoHongShuAuthor(url: string): Promise { - try { - console.log("Attempting to extract XiaoHongShu author from webpage..."); - const response = await axios.get(url, { - headers: { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' - }, - timeout: 10000 - }); - - const html = response.data; - - // Try to find author name in the JSON data embedded in the page - // XiaoHongShu embeds data in window.__INITIAL_STATE__ - const match = html.match(/"nickname":"([^"]+)"/); - if (match && match[1]) { - console.log("Found XiaoHongShu author:", match[1]); - return match[1]; - } - - // Alternative: try to find in user info - const userMatch = html.match(/"user":\{[^}]*"nickname":"([^"]+)"/); - if (userMatch && userMatch[1]) { - console.log("Found XiaoHongShu author (user):", userMatch[1]); - return userMatch[1]; - } - - console.log("Could not extract XiaoHongShu author from webpage"); - return null; - } catch (error) { - console.error("Error extracting XiaoHongShu author:", error); - return null; + try { + console.log("Attempting to extract XiaoHongShu author from webpage..."); + const response = await axios.get(url, { + headers: { + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + }, + timeout: 10000, + }); + + const html = response.data; + + // Try to find author name in the JSON data embedded in the page + // XiaoHongShu embeds data in window.__INITIAL_STATE__ + const match = html.match(/"nickname":"([^"]+)"/); + if (match && match[1]) { + console.log("Found XiaoHongShu author:", match[1]); + return match[1]; } + + // Alternative: try to find in user info + const userMatch = html.match(/"user":\{[^}]*"nickname":"([^"]+)"/); + if (userMatch && userMatch[1]) { + console.log("Found XiaoHongShu author (user):", userMatch[1]); + return userMatch[1]; + } + + console.log("Could not extract XiaoHongShu author from webpage"); + return null; + } catch (error) { + console.error("Error extracting XiaoHongShu author:", error); + return null; + } } export class YtDlpDownloader { - // Search for videos (primarily for YouTube, but could be adapted) - static async search(query: string): Promise { - console.log("Processing search request for query:", query); + // Search for videos (primarily for YouTube, but could be adapted) + static async search(query: string): Promise { + console.log("Processing search request for query:", query); - // Use ytsearch for searching - const searchResults = await youtubedl(`ytsearch5:${query}`, { - dumpSingleJson: true, - noWarnings: true, - skipDownload: true, - playlistEnd: 5, // Limit to 5 results - extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`, - } as any, { execPath: YT_DLP_PATH } as any); + // Use ytsearch for searching + const searchResults = await executeYtDlpJson(`ytsearch5:${query}`, { + noWarnings: true, + skipDownload: true, + playlistEnd: 5, // Limit to 5 results + extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`, + }); - if (!searchResults || !(searchResults as any).entries) { - return []; + if (!searchResults || !searchResults.entries) { + return []; + } + + // Format the search results + const formattedResults = searchResults.entries.map((entry: any) => ({ + id: entry.id, + title: entry.title, + author: entry.uploader, + thumbnailUrl: entry.thumbnail, + duration: entry.duration, + viewCount: entry.view_count, + sourceUrl: `https://www.youtube.com/watch?v=${entry.id}`, // Default to YT for search results + source: "youtube", + })); + + console.log( + `Found ${formattedResults.length} search results for "${query}"` + ); + + return formattedResults; + } + + // Get video info without downloading + static async getVideoInfo( + url: string + ): Promise<{ + title: string; + author: string; + date: string; + thumbnailUrl: string; + }> { + try { + const info = await executeYtDlpJson(url, { + noWarnings: true, + preferFreeFormats: true, + extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`, + }); + + return { + title: info.title || "Video", + author: info.uploader || "Unknown", + date: + info.upload_date || + new Date().toISOString().slice(0, 10).replace(/-/g, ""), + thumbnailUrl: info.thumbnail, + }; + } catch (error) { + console.error("Error fetching video info:", error); + return { + title: "Video", + author: "Unknown", + date: new Date().toISOString().slice(0, 10).replace(/-/g, ""), + thumbnailUrl: "", + }; + } + } + + // Get the latest video URL from a channel + static async getLatestVideoUrl(channelUrl: string): Promise { + try { + console.log("Fetching latest video for channel:", channelUrl); + + // Append /videos to channel URL to ensure we get videos and not the channel tab + let targetUrl = channelUrl; + if ( + channelUrl.includes("youtube.com/") && + !channelUrl.includes("/videos") && + !channelUrl.includes("/shorts") && + !channelUrl.includes("/streams") + ) { + // Check if it looks like a channel URL + if ( + channelUrl.includes("/@") || + channelUrl.includes("/channel/") || + channelUrl.includes("/c/") || + channelUrl.includes("/user/") + ) { + targetUrl = `${channelUrl}/videos`; + console.log("Modified channel URL to:", targetUrl); } + } - // Format the search results - const formattedResults = (searchResults as any).entries.map((entry: any) => ({ - id: entry.id, - title: entry.title, - author: entry.uploader, - thumbnailUrl: entry.thumbnail, - duration: entry.duration, - viewCount: entry.view_count, - sourceUrl: `https://www.youtube.com/watch?v=${entry.id}`, // Default to YT for search results - source: "youtube", - })); + // Use yt-dlp to get the first video in the channel (playlist) + const result = await executeYtDlpJson(targetUrl, { + playlistEnd: 5, + noWarnings: true, + flatPlaylist: true, // We only need the ID/URL, not full info + extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`, + }); - console.log( - `Found ${formattedResults.length} search results for "${query}"` + // If it's a playlist/channel, 'entries' will contain the videos + if (result.entries && result.entries.length > 0) { + // Iterate through entries to find a valid video + // Sometimes the first entry is the channel/tab itself (e.g. id starts with UC) + for (const entry of result.entries) { + // Skip entries that look like channel IDs (start with UC and are 24 chars) + // or entries without a title/url that look like metadata + if (entry.id && entry.id.startsWith("UC") && entry.id.length === 24) { + continue; + } + + const videoId = entry.id; + if (videoId) { + return `https://www.youtube.com/watch?v=${videoId}`; + } + if (entry.url) { + return entry.url; + } + } + } + return null; + } catch (error) { + console.error("Error fetching latest video URL:", error); + return null; + } + } + + // Download video + static async downloadVideo( + videoUrl: string, + downloadId?: string, + onStart?: (cancel: () => void) => void + ): Promise