feat: Add yt-dlp download functionality and helpers

This commit is contained in:
Peifan Li
2025-12-09 22:06:31 -05:00
parent e8bf51acc0
commit 6343978c5f
2 changed files with 658 additions and 415 deletions

View File

@@ -1,14 +1,214 @@
import axios from "axios";
import { spawn } from "child_process";
import fs from "fs-extra";
import path from "path";
import youtubedl from "youtube-dl-exec";
import { IMAGES_DIR, SUBTITLES_DIR, VIDEOS_DIR } from "../../config/paths";
import { formatVideoFilename } from "../../utils/helpers";
import * as storageService from "../storageService";
import { Video } from "../storageService";
/** Executable used to launch yt-dlp; overridable via the YT_DLP_PATH environment variable. */
const YT_DLP_PATH = process.env.YT_DLP_PATH || "yt-dlp";

/**
 * Script path handed to yt-dlp through the
 * `youtubepot-bgutilscript:script_path=` extractor argument.
 * Overridable via BGUTIL_SCRIPT_PATH; otherwise resolved relative to the
 * current working directory. Declared exactly once (a second `const` with the
 * same name is a redeclaration error).
 */
const PROVIDER_SCRIPT =
  process.env.BGUTIL_SCRIPT_PATH ||
  path.join(
    process.cwd(),
    "bgutil-ytdlp-pot-provider/server/build/generate_once.js"
  );
/**
 * Turn a camelCase option name into its long CLI form,
 * e.g. `mergeOutputFormat` -> `--merge-output-format`.
 *
 * Every uppercase ASCII letter is prefixed with a dash, then the whole
 * name is lower-cased and prefixed with `--`.
 */
function convertFlagToArg(flag: string): string {
  const dashed = flag.replace(/([A-Z])/g, "-$1");
  const lowered = dashed.toLowerCase();
  return "--" + lowered;
}
/**
 * Convert a flags object into a yt-dlp CLI argument array.
 *
 * Rules, applied per entry in insertion order:
 * - `undefined` / `null` values are skipped.
 * - `extractorArgs` becomes one `--extractor-args` pair per extractor. A
 *   string may bundle several extractors separated by `;`
 *   (e.g. `"youtube:a=1;other:b=2"`). Because yt-dlp also uses `;` to separate
 *   several `ARG=VAL` pairs of the SAME extractor, a segment only starts a new
 *   group when it carries its own `IE_KEY:` prefix; otherwise it is re-attached
 *   to the preceding group. An array of such strings is accepted as well.
 * - `addHeader` becomes one `--add-header` pair per header (string or array).
 * - Any other key is converted with `convertFlagToArg`: `true` emits the bare
 *   flag, `false` emits nothing, strings/numbers emit `flag value`, arrays
 *   emit `flag a,b,c`. Other value types are ignored.
 *
 * @param flags - Option name to value map (camelCase keys).
 * @returns Argument list suitable for `spawn`.
 */
function flagsToArgs(flags: Record<string, any>): string[] {
  // A segment opens a new extractor group only when a ":" appears before any
  // "=" (i.e. it looks like `IE_KEY:...`). `key=C:\path` therefore does not.
  const startsNewExtractor = /^[^=:]+:/;
  const args: string[] = [];

  for (const [key, value] of Object.entries(flags)) {
    if (value === undefined || value === null) {
      continue;
    }

    if (key === "extractorArgs") {
      const specs: unknown[] = Array.isArray(value) ? value : [value];
      for (const spec of specs) {
        const groups: string[] = [];
        for (const rawSegment of String(spec).split(";")) {
          const segment = rawSegment.trim();
          if (!segment) {
            continue;
          }
          if (groups.length === 0 || startsNewExtractor.test(segment)) {
            groups.push(segment);
          } else {
            // Same extractor, additional ARG=VAL: keep yt-dlp's ";" separator.
            groups[groups.length - 1] += `;${segment}`;
          }
        }
        for (const group of groups) {
          args.push("--extractor-args", group);
        }
      }
      continue;
    }

    if (key === "addHeader") {
      // Each header is a "Name:value" string and needs its own flag.
      const headers: unknown[] = Array.isArray(value) ? value : [value];
      for (const header of headers) {
        args.push("--add-header", String(header));
      }
      continue;
    }

    // Generic option: camelCase key -> --kebab-case flag.
    const argName = convertFlagToArg(key);
    if (typeof value === "boolean") {
      if (value) {
        args.push(argName);
      }
    } else if (typeof value === "string" || typeof value === "number") {
      args.push(argName, String(value));
    } else if (Array.isArray(value)) {
      // Arrays are passed as a single comma-joined value.
      args.push(argName, value.join(","));
    }
  }

  return args;
}
/**
 * Run yt-dlp with `--dump-single-json` and resolve with the parsed JSON.
 *
 * @param url - Final positional argument given to yt-dlp (a URL or a search
 *   expression such as `ytsearch5:...`).
 * @param flags - Extra options, converted to CLI arguments by `flagsToArgs`.
 * @returns The value produced by `JSON.parse` on the process's stdout.
 * @throws Error if the process cannot be spawned, exits with a non-zero code
 *   (the captured stderr text is attached as a `stderr` property), or prints
 *   output that is not valid JSON.
 */
async function executeYtDlpJson(
  url: string,
  flags: Record<string, any> = {}
): Promise<any> {
  const args = [
    "--dump-single-json",
    "--no-warnings",
    ...flagsToArgs(flags),
    url,
  ];

  console.log(`Executing: ${YT_DLP_PATH} ${args.join(" ")}`);

  return new Promise<any>((resolve, reject) => {
    const subprocess = spawn(YT_DLP_PATH, args, {
      stdio: ["ignore", "pipe", "pipe"],
    });

    // Keep the raw chunks and decode once at the end. Decoding chunk by chunk
    // corrupts any multi-byte UTF-8 character that straddles a chunk boundary
    // (e.g. non-ASCII titles), which then breaks or garbles the JSON.
    const stdoutChunks: Buffer[] = [];
    const stderrChunks: Buffer[] = [];

    subprocess.stdout?.on("data", (chunk: Buffer) => {
      stdoutChunks.push(chunk);
    });

    subprocess.stderr?.on("data", (chunk: Buffer) => {
      stderrChunks.push(chunk);
    });

    subprocess.on("close", (code) => {
      const stdout = Buffer.concat(stdoutChunks).toString("utf8");
      const stderr = Buffer.concat(stderrChunks).toString("utf8");

      if (code !== 0) {
        const error = new Error(`yt-dlp process exited with code ${code}`);
        // Expose stderr so callers can inspect why yt-dlp failed.
        (error as any).stderr = stderr;
        reject(error);
        return;
      }

      // Successful run: surface stderr only when it is not routine
      // progress/info output.
      if (
        stderr &&
        !stderr.includes("[download]") &&
        !stderr.includes("[info]")
      ) {
        console.warn("yt-dlp stderr:", stderr);
      }

      try {
        resolve(JSON.parse(stdout));
      } catch (parseError) {
        console.error("Failed to parse yt-dlp JSON output:", parseError);
        console.error("Output:", stdout);
        reject(new Error("Failed to parse yt-dlp output as JSON"));
      }
    });

    // Spawn failures (e.g. executable not found) arrive here. If "close"
    // fires as well, its reject is a no-op because the promise is settled.
    subprocess.on("error", (error) => {
      reject(error);
    });
  });
}
/**
 * Start yt-dlp as a child process so the caller can stream its output
 * (e.g. for progress parsing) and cancel it.
 *
 * @param url - Final positional argument given to yt-dlp.
 * @param flags - Options converted to CLI arguments by `flagsToArgs`.
 * @returns A thenable handle:
 *   - `stdout` / `stderr`: the child's piped output streams;
 *   - `kill(signal?)`: forwards to the child's `kill` unless a signal was
 *     already sent, in which case it returns `false`;
 *   - `then`: settles when the child closes — fulfilled on exit code 0,
 *     rejected on any other code or on a spawn error — so the handle can be
 *     `await`ed directly.
 */
function executeYtDlpSpawn(
  url: string,
  flags: Record<string, any> = {}
): {
  stdout: NodeJS.ReadableStream | null;
  stderr: NodeJS.ReadableStream | null;
  kill: (signal?: NodeJS.Signals) => boolean;
  then: (
    onFulfilled?: (value: void) => void | Promise<void>,
    onRejected?: (reason: any) => void | Promise<void>
  ) => Promise<void>;
} {
  const args = [...flagsToArgs(flags), url];

  console.log(`Spawning: ${YT_DLP_PATH} ${args.join(" ")}`);

  const subprocess = spawn(YT_DLP_PATH, args, {
    stdio: ["ignore", "pipe", "pipe"],
  });

  // A promise settles at most once, so whichever of "close" / "error" fires
  // first decides the outcome; later resolve/reject calls are ignored and no
  // manual bookkeeping flags are required.
  const completion = new Promise<void>((resolve, reject) => {
    subprocess.on("close", (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(`yt-dlp process exited with code ${code}`));
      }
    });

    subprocess.on("error", (error) => {
      reject(error);
    });
  });

  return {
    stdout: subprocess.stdout,
    stderr: subprocess.stderr,
    kill: (signal?: NodeJS.Signals) => {
      // `killed` is set once a signal has been delivered; do not signal twice.
      if (!subprocess.killed) {
        return subprocess.kill(signal);
      }
      return false;
    },
    then: completion.then.bind(completion),
  };
}
// Helper function to extract author from XiaoHongShu page when yt-dlp doesn't provide it
async function extractXiaoHongShuAuthor(url: string): Promise<string | null> {
@@ -16,9 +216,10 @@ async function extractXiaoHongShuAuthor(url: string): Promise<string | null> {
console.log("Attempting to extract XiaoHongShu author from webpage...");
const response = await axios.get(url, {
headers: {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
"User-Agent":
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
},
timeout: 10000
timeout: 10000,
});
const html = response.data;
@@ -52,20 +253,19 @@ export class YtDlpDownloader {
console.log("Processing search request for query:", query);
// Use ytsearch for searching
const searchResults = await youtubedl(`ytsearch5:${query}`, {
dumpSingleJson: true,
const searchResults = await executeYtDlpJson(`ytsearch5:${query}`, {
noWarnings: true,
skipDownload: true,
playlistEnd: 5, // Limit to 5 results
extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`,
} as any, { execPath: YT_DLP_PATH } as any);
});
if (!searchResults || !(searchResults as any).entries) {
if (!searchResults || !searchResults.entries) {
return [];
}
// Format the search results
const formattedResults = (searchResults as any).entries.map((entry: any) => ({
const formattedResults = searchResults.entries.map((entry: any) => ({
id: entry.id,
title: entry.title,
author: entry.uploader,
@@ -84,20 +284,27 @@ export class YtDlpDownloader {
}
// Get video info without downloading
static async getVideoInfo(url: string): Promise<{ title: string; author: string; date: string; thumbnailUrl: string }> {
static async getVideoInfo(
url: string
): Promise<{
title: string;
author: string;
date: string;
thumbnailUrl: string;
}> {
try {
const info = await youtubedl(url, {
dumpSingleJson: true,
const info = await executeYtDlpJson(url, {
noWarnings: true,
preferFreeFormats: true,
// youtubeSkipDashManifest: true, // Specific to YT, might want to keep or make conditional
extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`,
} as any, { execPath: YT_DLP_PATH } as any);
});
return {
title: info.title || "Video",
author: info.uploader || "Unknown",
date: info.upload_date || new Date().toISOString().slice(0, 10).replace(/-/g, ""),
date:
info.upload_date ||
new Date().toISOString().slice(0, 10).replace(/-/g, ""),
thumbnailUrl: info.thumbnail,
};
} catch (error) {
@@ -118,31 +325,40 @@ export class YtDlpDownloader {
// Append /videos to channel URL to ensure we get videos and not the channel tab
let targetUrl = channelUrl;
if (channelUrl.includes('youtube.com/') && !channelUrl.includes('/videos') && !channelUrl.includes('/shorts') && !channelUrl.includes('/streams')) {
if (
channelUrl.includes("youtube.com/") &&
!channelUrl.includes("/videos") &&
!channelUrl.includes("/shorts") &&
!channelUrl.includes("/streams")
) {
// Check if it looks like a channel URL
if (channelUrl.includes('/@') || channelUrl.includes('/channel/') || channelUrl.includes('/c/') || channelUrl.includes('/user/')) {
if (
channelUrl.includes("/@") ||
channelUrl.includes("/channel/") ||
channelUrl.includes("/c/") ||
channelUrl.includes("/user/")
) {
targetUrl = `${channelUrl}/videos`;
console.log("Modified channel URL to:", targetUrl);
}
}
// Use yt-dlp to get the first video in the channel (playlist)
const result = await youtubedl(targetUrl, {
dumpSingleJson: true,
const result = await executeYtDlpJson(targetUrl, {
playlistEnd: 5,
noWarnings: true,
flatPlaylist: true, // We only need the ID/URL, not full info
extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`,
} as any, { execPath: YT_DLP_PATH } as any);
});
// If it's a playlist/channel, 'entries' will contain the videos
if ((result as any).entries && (result as any).entries.length > 0) {
if (result.entries && result.entries.length > 0) {
// Iterate through entries to find a valid video
// Sometimes the first entry is the channel/tab itself (e.g. id starts with UC)
for (const entry of (result as any).entries) {
for (const entry of result.entries) {
// Skip entries that look like channel IDs (start with UC and are 24 chars)
// or entries without a title/url that look like metadata
if (entry.id && entry.id.startsWith('UC') && entry.id.length === 24) {
if (entry.id && entry.id.startsWith("UC") && entry.id.length === 24) {
continue;
}
@@ -163,7 +379,11 @@ export class YtDlpDownloader {
}
// Download video
static async downloadVideo(videoUrl: string, downloadId?: string, onStart?: (cancel: () => void) => void): Promise<Video> {
static async downloadVideo(
videoUrl: string,
downloadId?: string,
onStart?: (cancel: () => void) => void
): Promise<Video> {
console.log("Detected URL:", videoUrl);
// Create a safe base filename (without extension)
@@ -174,19 +394,25 @@ export class YtDlpDownloader {
const videoFilename = `${safeBaseFilename}.mp4`;
const thumbnailFilename = `${safeBaseFilename}.jpg`;
let videoTitle, videoAuthor, videoDate, videoDescription, thumbnailUrl, thumbnailSaved, source;
let videoTitle,
videoAuthor,
videoDate,
videoDescription,
thumbnailUrl,
thumbnailSaved,
source;
let finalVideoFilename = videoFilename;
let finalThumbnailFilename = thumbnailFilename;
let subtitles: Array<{ language: string; filename: string; path: string }> = [];
let subtitles: Array<{ language: string; filename: string; path: string }> =
[];
try {
// Get video info first
const info = await youtubedl(videoUrl, {
dumpSingleJson: true,
const info = await executeYtDlpJson(videoUrl, {
noWarnings: true,
preferFreeFormats: true,
extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`,
} as any, { execPath: YT_DLP_PATH } as any);
});
console.log("Video info:", {
title: info.title,
@@ -199,7 +425,10 @@ export class YtDlpDownloader {
videoAuthor = info.uploader || "Unknown";
// If author is unknown and it's a XiaoHongShu video, try custom extraction
if ((!info.uploader || info.uploader === "Unknown") && info.extractor === "XiaoHongShu") {
if (
(!info.uploader || info.uploader === "Unknown") &&
info.extractor === "XiaoHongShu"
) {
const customAuthor = await extractXiaoHongShuAuthor(videoUrl);
if (customAuthor) {
videoAuthor = customAuthor;
@@ -213,7 +442,11 @@ export class YtDlpDownloader {
source = info.extractor || "generic";
// Update the safe base filename with the actual title
const newSafeBaseFilename = formatVideoFilename(videoTitle, videoAuthor, videoDate);
const newSafeBaseFilename = formatVideoFilename(
videoTitle,
videoAuthor,
videoDate
);
const newVideoFilename = `${newSafeBaseFilename}.mp4`;
const newThumbnailFilename = `${newSafeBaseFilename}.jpg`;
@@ -231,35 +464,35 @@ export class YtDlpDownloader {
if (downloadId) {
storageService.updateActiveDownload(downloadId, {
filename: videoTitle,
progress: 0
progress: 0,
});
}
// Prepare flags
const flags: any = {
const flags: Record<string, any> = {
output: newVideoPath,
format: "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
mergeOutputFormat: "mp4",
writeSubs: true,
writeAutoSubs: true,
convertSubs: "vtt",
extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`,
};
// Add YouTube specific flags if it's a YouTube URL
if (videoUrl.includes("youtube.com") || videoUrl.includes("youtu.be")) {
flags.format = "bestvideo[ext=mp4][vcodec^=avc1]+bestaudio[ext=m4a][acodec=aac]/bestvideo[ext=mp4][vcodec=h264]+bestaudio[ext=m4a]/best[ext=mp4]/best";
flags['extractor-args'] = "youtube:player_client=android";
flags.format =
"bestvideo[ext=mp4][vcodec^=avc1]+bestaudio[ext=m4a][acodec=aac]/bestvideo[ext=mp4][vcodec=h264]+bestaudio[ext=m4a]/best[ext=mp4]/best";
// Combine YouTube extractor args with PO token provider
flags.extractorArgs = `youtube:player_client=android;youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`;
flags.addHeader = [
'Referer:https://www.youtube.com/',
'User-Agent:Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36'
"Referer:https://www.youtube.com/",
"User-Agent:Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
];
}
// Add PO Token provider args
flags.extractorArgs = `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`;
// Use exec to capture stdout for progress
const subprocess = youtubedl.exec(videoUrl, flags, { execPath: YT_DLP_PATH } as any);
// Use spawn to capture stdout for progress
const subprocess = executeYtDlpSpawn(videoUrl, flags);
if (onStart) {
onStart(() => {
@@ -294,10 +527,12 @@ export class YtDlpDownloader {
});
}
subprocess.stdout?.on('data', (data: Buffer) => {
subprocess.stdout?.on("data", (data: Buffer) => {
const output = data.toString();
// Parse progress: [download] 23.5% of 10.00MiB at 2.00MiB/s ETA 00:05
const progressMatch = output.match(/(\d+\.?\d*)%\s+of\s+([~\d\w.]+)\s+at\s+([~\d\w.\/]+)/);
const progressMatch = output.match(
/(\d+\.?\d*)%\s+of\s+([~\d\w.]+)\s+at\s+([~\d\w.\/]+)/
);
if (progressMatch && downloadId) {
const percentage = parseFloat(progressMatch[1]);
@@ -307,7 +542,7 @@ export class YtDlpDownloader {
storageService.updateActiveDownload(downloadId, {
progress: percentage,
totalSize: totalSize,
speed: speed
speed: speed,
});
}
});
@@ -349,7 +584,10 @@ export class YtDlpDownloader {
// Scan for subtitle files
try {
const baseFilename = newSafeBaseFilename;
const subtitleFiles = fs.readdirSync(VIDEOS_DIR).filter((file: string) =>
const subtitleFiles = fs
.readdirSync(VIDEOS_DIR)
.filter(
(file: string) =>
file.startsWith(baseFilename) && file.endsWith(".vtt")
);
@@ -357,7 +595,9 @@ export class YtDlpDownloader {
for (const subtitleFile of subtitleFiles) {
// Parse language from filename (e.g., video_123.en.vtt -> en)
const match = subtitleFile.match(/\.([a-z]{2}(?:-[A-Z]{2})?)(?:\..*?)?\.vtt$/);
const match = subtitleFile.match(
/\.([a-z]{2}(?:-[A-Z]{2})?)(?:\..*?)?\.vtt$/
);
const language = match ? match[1] : "unknown";
// Move subtitle to subtitles directory
@@ -366,19 +606,21 @@ export class YtDlpDownloader {
const destSubPath = path.join(SUBTITLES_DIR, destSubFilename);
// Read VTT file and fix alignment for centering
let vttContent = fs.readFileSync(sourceSubPath, 'utf-8');
let vttContent = fs.readFileSync(sourceSubPath, "utf-8");
// Replace align:start with align:middle for centered subtitles
// Also remove position:0% which forces left positioning
vttContent = vttContent.replace(/ align:start/g, ' align:middle');
vttContent = vttContent.replace(/ position:0%/g, '');
vttContent = vttContent.replace(/ align:start/g, " align:middle");
vttContent = vttContent.replace(/ position:0%/g, "");
// Write cleaned VTT to destination
fs.writeFileSync(destSubPath, vttContent, 'utf-8');
fs.writeFileSync(destSubPath, vttContent, "utf-8");
// Remove original file
fs.unlinkSync(sourceSubPath);
console.log(`Processed and moved subtitle ${subtitleFile} to ${destSubPath}`);
console.log(
`Processed and moved subtitle ${subtitleFile} to ${destSubPath}`
);
subtitles.push({
language,
@@ -389,7 +631,6 @@ export class YtDlpDownloader {
} catch (subtitleError) {
console.error("Error processing subtitle files:", subtitleError);
}
} catch (error) {
console.error("Error in download process:", error);
throw error;
@@ -422,7 +663,9 @@ export class YtDlpDownloader {
const finalVideoPath = path.join(VIDEOS_DIR, finalVideoFilename);
try {
const { getVideoDuration } = await import("../../services/metadataService");
const { getVideoDuration } = await import(
"../../services/metadataService"
);
const duration = await getVideoDuration(finalVideoPath);
if (duration) {
videoData.duration = duration.toString();

View File

@@ -5,8 +5,8 @@ DOCKER_PATH="/Applications/Docker.app/Contents/Resources/bin/docker"
USERNAME="franklioxygen"
VERSION=$1
BACKEND_LATEST="$USERNAME/mytube:backend-latest"
FRONTEND_LATEST="$USERNAME/mytube:frontend-latest"
BACKEND_LATEST="$USERNAME/mytube:backend-test"
FRONTEND_LATEST="$USERNAME/mytube:frontend-test"
if [ -n "$VERSION" ]; then
echo "🔖 Version specified: $VERSION"