feat: Add yt-dlp download functionality and helpers

2025-12-09 22:06:31 -05:00
parent e8bf51acc0
commit 6343978c5f
2 changed files with 658 additions and 415 deletions
--- a/backend/src/services/downloaders/YtDlpDownloader.ts
+++ b/backend/src/services/downloaders/YtDlpDownloader.ts
@@ -1,14 +1,214 @@
 import axios from "axios";
+import { spawn } from "child_process";
 import fs from "fs-extra";
 import path from "path";
-import youtubedl from "youtube-dl-exec";
 import { IMAGES_DIR, SUBTITLES_DIR, VIDEOS_DIR } from "../../config/paths";
 import { formatVideoFilename } from "../../utils/helpers";
 import * as storageService from "../storageService";
 import { Video } from "../storageService";

+// yt-dlp executable handed to spawn(); override with the YT_DLP_PATH
+// environment variable, otherwise the bare command name "yt-dlp" is used.
 const YT_DLP_PATH = process.env.YT_DLP_PATH || "yt-dlp";
-const PROVIDER_SCRIPT = process.env.BGUTIL_SCRIPT_PATH || path.join(process.cwd(), "bgutil-ytdlp-pot-provider/server/build/generate_once.js");
+// Script path passed to yt-dlp via the
+// `youtubepot-bgutilscript:script_path=` extractor argument. Taken from
+// BGUTIL_SCRIPT_PATH when set; otherwise resolved relative to the current
+// working directory of the process.
+const PROVIDER_SCRIPT =
+  process.env.BGUTIL_SCRIPT_PATH ||
+  path.join(
+    process.cwd(),
+    "bgutil-ytdlp-pot-provider/server/build/generate_once.js"
+  );
+
+/**
+ * Build the long-form CLI switch for a camelCase option name.
+ *
+ * A "-" is inserted in front of every ASCII uppercase letter, the whole
+ * string is lower-cased, and "--" is prepended
+ * (e.g. "mergeOutputFormat" -> "--merge-output-format").
+ */
+function convertFlagToArg(flag: string): string {
+  const kebab = flag
+    .replace(/[A-Z]/g, (upper) => `-${upper}`)
+    .toLowerCase();
+  return "--" + kebab;
+}
+
+/**
+ * Split a combined extractor-args string into one entry per extractor.
+ *
+ * Callers join several extractors with ";" (e.g.
+ * "youtube:player_client=android;youtubepot-bgutilscript:script_path=..."),
+ * but yt-dlp itself also uses ";" between the arguments of a single
+ * extractor ("youtube:player_client=android;skip=dash"). A segment therefore
+ * only starts a new entry when it begins with an "extractor:" prefix (a
+ * colon that appears before any "="); any other segment is re-attached to
+ * the previous entry with its ";" restored.
+ *
+ * A string without ";" is returned untouched as a single entry.
+ */
+function splitExtractorArgs(spec: string): string[] {
+  if (!spec.includes(";")) {
+    return [spec];
+  }
+
+  const extractorPrefix = /^[^=:]+:/;
+  const groups: string[] = [];
+
+  for (const rawSegment of spec.split(";")) {
+    const segment = rawSegment.trim();
+    if (!segment) {
+      continue;
+    }
+
+    if (groups.length === 0 || extractorPrefix.test(segment)) {
+      groups.push(segment);
+    } else {
+      // Continuation of the previous extractor's argument list.
+      groups[groups.length - 1] += `;${segment}`;
+    }
+  }
+
+  return groups;
+}
+
+/**
+ * Convert a flags object to a yt-dlp CLI arguments array.
+ *
+ * Entries are processed in insertion order:
+ * - `undefined` / `null` values are skipped.
+ * - `extractorArgs` emits one `--extractor-args <entry>` pair per extractor
+ *   (see splitExtractorArgs); an array contributes each element in turn.
+ * - `addHeader` emits one `--add-header <header>` pair per header; it may be
+ *   a single "key:value" string or an array of them.
+ * - Any other key is converted with convertFlagToArg. `true` emits the bare
+ *   flag, `false` emits nothing, strings and numbers emit `flag value`, and
+ *   arrays emit a single `flag a,b,c`. Values of any other type are ignored.
+ *
+ * @param flags camelCase option names mapped to their values
+ * @returns the argument list, without the executable name or the URL
+ */
+function flagsToArgs(flags: Record<string, any>): string[] {
+  const args: string[] = [];
+
+  for (const [key, value] of Object.entries(flags)) {
+    if (value === undefined || value === null) {
+      continue;
+    }
+
+    if (key === "extractorArgs") {
+      const specs = Array.isArray(value) ? value : [value];
+      for (const spec of specs) {
+        for (const group of splitExtractorArgs(String(spec))) {
+          args.push("--extractor-args", group);
+        }
+      }
+      continue;
+    }
+
+    if (key === "addHeader") {
+      const headers = Array.isArray(value) ? value : [value];
+      for (const header of headers) {
+        args.push("--add-header", String(header));
+      }
+      continue;
+    }
+
+    // Convert camelCase to kebab-case
+    const argName = convertFlagToArg(key);
+
+    if (typeof value === "boolean") {
+      if (value) {
+        args.push(argName);
+      }
+    } else if (typeof value === "string" || typeof value === "number") {
+      args.push(argName, String(value));
+    } else if (Array.isArray(value)) {
+      // Arrays are passed as one comma-separated value, not as repeated flags
+      args.push(argName, value.join(","));
+    }
+  }
+
+  return args;
+}
+
+/**
+ * Execute yt-dlp with JSON output and return the parsed result.
+ *
+ * Runs `YT_DLP_PATH --dump-single-json --no-warnings <flags> -- <url>` and
+ * parses everything written to stdout as a single JSON document.
+ *
+ * @param url   URL (or yt-dlp search expression) to query
+ * @param flags extra options, converted with flagsToArgs
+ * @returns the parsed JSON value printed by yt-dlp
+ * @throws Error when the process cannot be started, exits with a non-zero
+ *         code (the captured stderr is attached as `error.stderr`), or
+ *         prints output that is not valid JSON
+ */
+async function executeYtDlpJson(
+  url: string,
+  flags: Record<string, any> = {}
+): Promise<any> {
+  const args = [
+    "--dump-single-json",
+    "--no-warnings",
+    ...flagsToArgs(flags),
+    // End of options: a URL/ID starting with "-" must not be read as a flag.
+    "--",
+    url,
+  ];
+
+  console.log(`Executing: ${YT_DLP_PATH} ${args.join(" ")}`);
+
+  return new Promise<any>((resolve, reject) => {
+    const subprocess = spawn(YT_DLP_PATH, args, {
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+
+    // Keep raw chunks and decode once at the end: decoding chunk by chunk
+    // corrupts multi-byte UTF-8 characters that straddle a chunk boundary.
+    const stdoutChunks: Buffer[] = [];
+    const stderrChunks: Buffer[] = [];
+
+    subprocess.stdout?.on("data", (chunk: Buffer) => {
+      stdoutChunks.push(chunk);
+    });
+
+    subprocess.stderr?.on("data", (chunk: Buffer) => {
+      stderrChunks.push(chunk);
+    });
+
+    subprocess.on("close", (code) => {
+      const stdout = Buffer.concat(stdoutChunks).toString("utf8");
+      const stderr = Buffer.concat(stderrChunks).toString("utf8");
+
+      if (code !== 0) {
+        const error = new Error(`yt-dlp process exited with code ${code}`);
+        (error as any).stderr = stderr;
+        reject(error);
+        return;
+      }
+
+      // Surface unexpected stderr output, but stay quiet about routine
+      // "[download]" / "[info]" lines.
+      if (
+        stderr &&
+        !stderr.includes("[download]") &&
+        !stderr.includes("[info]")
+      ) {
+        console.warn("yt-dlp stderr:", stderr);
+      }
+
+      try {
+        resolve(JSON.parse(stdout));
+      } catch (parseError) {
+        console.error("Failed to parse yt-dlp JSON output:", parseError);
+        console.error("Output:", stdout);
+        reject(new Error("Failed to parse yt-dlp output as JSON"));
+      }
+    });
+
+    // Emitted when the process could not be spawned (e.g. binary missing).
+    subprocess.on("error", (error) => {
+      reject(error);
+    });
+  });
+}
+
+/**
+ * Execute yt-dlp with spawn for progress tracking.
+ *
+ * Starts `YT_DLP_PATH <flags> -- <url>` immediately and returns a small
+ * subprocess-like handle:
+ * - `stdout` / `stderr`: the child's piped output streams
+ * - `kill(signal?)`: forwards to the child unless a kill was already sent;
+ *   returns false in that case
+ * - `then`: makes the handle awaitable; it fulfils when the process exits
+ *   with code 0 and rejects on a spawn error or any other exit. On a
+ *   non-zero/abnormal exit the rejection Error carries `exitCode` and
+ *   `signal` properties.
+ *
+ * @param url   URL to download
+ * @param flags options, converted with flagsToArgs
+ */
+function executeYtDlpSpawn(
+  url: string,
+  flags: Record<string, any> = {}
+): {
+  stdout: NodeJS.ReadableStream | null;
+  stderr: NodeJS.ReadableStream | null;
+  kill: (signal?: NodeJS.Signals) => boolean;
+  then: (
+    onFulfilled?: (value: void) => void | Promise<void>,
+    onRejected?: (reason: any) => void | Promise<void>
+  ) => Promise<void>;
+} {
+  // "--" ends option parsing so a URL/ID starting with "-" is not a flag.
+  const args = [...flagsToArgs(flags), "--", url];
+
+  console.log(`Spawning: ${YT_DLP_PATH} ${args.join(" ")}`);
+
+  const subprocess = spawn(YT_DLP_PATH, args, {
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+
+  // A promise settles at most once, so no extra resolved/rejected
+  // bookkeeping is needed even if both "error" and "close" fire.
+  const promise = new Promise<void>((resolve, reject) => {
+    subprocess.on("close", (code, signal) => {
+      if (code === 0) {
+        resolve();
+        return;
+      }
+
+      // code is null when the child was terminated by a signal (e.g. after
+      // kill()); expose both so callers can tell a cancel from a failure.
+      reject(
+        Object.assign(new Error(`yt-dlp process exited with code ${code}`), {
+          exitCode: code,
+          signal,
+        })
+      );
+    });
+
+    subprocess.on("error", (error) => {
+      reject(error);
+    });
+  });
+
+  return {
+    stdout: subprocess.stdout,
+    stderr: subprocess.stderr,
+    kill: (signal?: NodeJS.Signals) => {
+      if (!subprocess.killed) {
+        return subprocess.kill(signal);
+      }
+      return false;
+    },
+    then: promise.then.bind(promise),
+  };
+}

 // Helper function to extract author from XiaoHongShu page when yt-dlp doesn't provide it
 async function extractXiaoHongShuAuthor(url: string): Promise<string | null> {
@@ -16,9 +216,10 @@ async function extractXiaoHongShuAuthor(url: string): Promise<string | null> {
    console.log("Attempting to extract XiaoHongShu author from webpage...");
    const response = await axios.get(url, {
      headers: {
-                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+        "User-Agent":
+          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
      },
-            timeout: 10000
+      timeout: 10000,
    });

    const html = response.data;
@@ -52,20 +253,19 @@ export class YtDlpDownloader {
    console.log("Processing search request for query:", query);

    // Use ytsearch for searching
-        const searchResults = await youtubedl(`ytsearch5:${query}`, {
-            dumpSingleJson: true,
+    const searchResults = await executeYtDlpJson(`ytsearch5:${query}`, {
      noWarnings: true,
      skipDownload: true,
      playlistEnd: 5, // Limit to 5 results
      extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`,
-        } as any, { execPath: YT_DLP_PATH } as any);
+    });

-        if (!searchResults || !(searchResults as any).entries) {
+    if (!searchResults || !searchResults.entries) {
      return [];
    }

    // Format the search results
-        const formattedResults = (searchResults as any).entries.map((entry: any) => ({
+    const formattedResults = searchResults.entries.map((entry: any) => ({
      id: entry.id,
      title: entry.title,
      author: entry.uploader,
@@ -84,20 +284,27 @@ export class YtDlpDownloader {
  }

  // Get video info without downloading
-    static async getVideoInfo(url: string): Promise<{ title: string; author: string; date: string; thumbnailUrl: string }> {
+  static async getVideoInfo(
+    url: string
+  ): Promise<{
+    title: string;
+    author: string;
+    date: string;
+    thumbnailUrl: string;
+  }> {
    try {
-            const info = await youtubedl(url, {
-                dumpSingleJson: true,
+      const info = await executeYtDlpJson(url, {
        noWarnings: true,
        preferFreeFormats: true,
-                // youtubeSkipDashManifest: true, // Specific to YT, might want to keep or make conditional
        extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`,
-            } as any, { execPath: YT_DLP_PATH } as any);
+      });

      return {
        title: info.title || "Video",
        author: info.uploader || "Unknown",
-                date: info.upload_date || new Date().toISOString().slice(0, 10).replace(/-/g, ""),
+        date:
+          info.upload_date ||
+          new Date().toISOString().slice(0, 10).replace(/-/g, ""),
        thumbnailUrl: info.thumbnail,
      };
    } catch (error) {
@@ -118,31 +325,40 @@ export class YtDlpDownloader {

      // Append /videos to channel URL to ensure we get videos and not the channel tab
      let targetUrl = channelUrl;
-            if (channelUrl.includes('youtube.com/') && !channelUrl.includes('/videos') && !channelUrl.includes('/shorts') && !channelUrl.includes('/streams')) {
+      if (
+        channelUrl.includes("youtube.com/") &&
+        !channelUrl.includes("/videos") &&
+        !channelUrl.includes("/shorts") &&
+        !channelUrl.includes("/streams")
+      ) {
        // Check if it looks like a channel URL
-                if (channelUrl.includes('/@') || channelUrl.includes('/channel/') || channelUrl.includes('/c/') || channelUrl.includes('/user/')) {
+        if (
+          channelUrl.includes("/@") ||
+          channelUrl.includes("/channel/") ||
+          channelUrl.includes("/c/") ||
+          channelUrl.includes("/user/")
+        ) {
          targetUrl = `${channelUrl}/videos`;
          console.log("Modified channel URL to:", targetUrl);
        }
      }

      // Use yt-dlp to get the first video in the channel (playlist)
-            const result = await youtubedl(targetUrl, {
-                dumpSingleJson: true,
+      const result = await executeYtDlpJson(targetUrl, {
        playlistEnd: 5,
        noWarnings: true,
        flatPlaylist: true, // We only need the ID/URL, not full info
        extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`,
-            } as any, { execPath: YT_DLP_PATH } as any);
+      });

      // If it's a playlist/channel, 'entries' will contain the videos
-            if ((result as any).entries && (result as any).entries.length > 0) {
+      if (result.entries && result.entries.length > 0) {
        // Iterate through entries to find a valid video
        // Sometimes the first entry is the channel/tab itself (e.g. id starts with UC)
-                for (const entry of (result as any).entries) {
+        for (const entry of result.entries) {
          // Skip entries that look like channel IDs (start with UC and are 24 chars)
          // or entries without a title/url that look like metadata
-                    if (entry.id && entry.id.startsWith('UC') && entry.id.length === 24) {
+          if (entry.id && entry.id.startsWith("UC") && entry.id.length === 24) {
            continue;
          }

@@ -163,7 +379,11 @@ export class YtDlpDownloader {
  }

  // Download video
-    static async downloadVideo(videoUrl: string, downloadId?: string, onStart?: (cancel: () => void) => void): Promise<Video> {
+  static async downloadVideo(
+    videoUrl: string,
+    downloadId?: string,
+    onStart?: (cancel: () => void) => void
+  ): Promise<Video> {
    console.log("Detected URL:", videoUrl);

    // Create a safe base filename (without extension)
@@ -174,19 +394,25 @@ export class YtDlpDownloader {
    const videoFilename = `${safeBaseFilename}.mp4`;
    const thumbnailFilename = `${safeBaseFilename}.jpg`;

-        let videoTitle, videoAuthor, videoDate, videoDescription, thumbnailUrl, thumbnailSaved, source;
+    let videoTitle,
+      videoAuthor,
+      videoDate,
+      videoDescription,
+      thumbnailUrl,
+      thumbnailSaved,
+      source;
    let finalVideoFilename = videoFilename;
    let finalThumbnailFilename = thumbnailFilename;
-        let subtitles: Array<{ language: string; filename: string; path: string }> = [];
+    let subtitles: Array<{ language: string; filename: string; path: string }> =
+      [];

    try {
      // Get video info first
-            const info = await youtubedl(videoUrl, {
-                dumpSingleJson: true,
+      const info = await executeYtDlpJson(videoUrl, {
        noWarnings: true,
        preferFreeFormats: true,
        extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`,
-            } as any, { execPath: YT_DLP_PATH } as any);
+      });

      console.log("Video info:", {
        title: info.title,
@@ -199,7 +425,10 @@ export class YtDlpDownloader {
      videoAuthor = info.uploader || "Unknown";

      // If author is unknown and it's a XiaoHongShu video, try custom extraction
-            if ((!info.uploader || info.uploader === "Unknown") && info.extractor === "XiaoHongShu") {
+      if (
+        (!info.uploader || info.uploader === "Unknown") &&
+        info.extractor === "XiaoHongShu"
+      ) {
        const customAuthor = await extractXiaoHongShuAuthor(videoUrl);
        if (customAuthor) {
          videoAuthor = customAuthor;
@@ -213,7 +442,11 @@ export class YtDlpDownloader {
      source = info.extractor || "generic";

      // Update the safe base filename with the actual title
-            const newSafeBaseFilename = formatVideoFilename(videoTitle, videoAuthor, videoDate);
+      const newSafeBaseFilename = formatVideoFilename(
+        videoTitle,
+        videoAuthor,
+        videoDate
+      );
      const newVideoFilename = `${newSafeBaseFilename}.mp4`;
      const newThumbnailFilename = `${newSafeBaseFilename}.jpg`;

@@ -231,35 +464,35 @@ export class YtDlpDownloader {
      if (downloadId) {
        storageService.updateActiveDownload(downloadId, {
          filename: videoTitle,
-                    progress: 0
+          progress: 0,
        });
      }

      // Prepare flags
-            const flags: any = {
+      const flags: Record<string, any> = {
        output: newVideoPath,
        format: "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
        mergeOutputFormat: "mp4",
        writeSubs: true,
        writeAutoSubs: true,
        convertSubs: "vtt",
+        extractorArgs: `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`,
      };

      // Add YouTube specific flags if it's a YouTube URL
      if (videoUrl.includes("youtube.com") || videoUrl.includes("youtu.be")) {
-                 flags.format = "bestvideo[ext=mp4][vcodec^=avc1]+bestaudio[ext=m4a][acodec=aac]/bestvideo[ext=mp4][vcodec=h264]+bestaudio[ext=m4a]/best[ext=mp4]/best";
-                 flags['extractor-args'] = "youtube:player_client=android";
+        flags.format =
+          "bestvideo[ext=mp4][vcodec^=avc1]+bestaudio[ext=m4a][acodec=aac]/bestvideo[ext=mp4][vcodec=h264]+bestaudio[ext=m4a]/best[ext=mp4]/best";
+        // Combine YouTube extractor args with PO token provider
+        flags.extractorArgs = `youtube:player_client=android;youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`;
        flags.addHeader = [
-                    'Referer:https://www.youtube.com/',
-                    'User-Agent:Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36'
+          "Referer:https://www.youtube.com/",
+          "User-Agent:Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
        ];
      }

-            // Add PO Token provider args
-            flags.extractorArgs = `youtubepot-bgutilscript:script_path=${PROVIDER_SCRIPT}`;
-
-            // Use exec to capture stdout for progress
-            const subprocess = youtubedl.exec(videoUrl, flags, { execPath: YT_DLP_PATH } as any);
+      // Use spawn to capture stdout for progress
+      const subprocess = executeYtDlpSpawn(videoUrl, flags);

      if (onStart) {
        onStart(() => {
@@ -294,10 +527,12 @@ export class YtDlpDownloader {
        });
      }

-            subprocess.stdout?.on('data', (data: Buffer) => {
+      subprocess.stdout?.on("data", (data: Buffer) => {
        const output = data.toString();
        // Parse progress: [download]  23.5% of 10.00MiB at  2.00MiB/s ETA 00:05
-                const progressMatch = output.match(/(\d+\.?\d*)%\s+of\s+([~\d\w.]+)\s+at\s+([~\d\w.\/]+)/);
+        const progressMatch = output.match(
+          /(\d+\.?\d*)%\s+of\s+([~\d\w.]+)\s+at\s+([~\d\w.\/]+)/
+        );

        if (progressMatch && downloadId) {
          const percentage = parseFloat(progressMatch[1]);
@@ -307,7 +542,7 @@ export class YtDlpDownloader {
          storageService.updateActiveDownload(downloadId, {
            progress: percentage,
            totalSize: totalSize,
-                        speed: speed
+            speed: speed,
          });
        }
      });
@@ -349,7 +584,10 @@ export class YtDlpDownloader {
      // Scan for subtitle files
      try {
        const baseFilename = newSafeBaseFilename;
-                const subtitleFiles = fs.readdirSync(VIDEOS_DIR).filter((file: string) => 
+        const subtitleFiles = fs
+          .readdirSync(VIDEOS_DIR)
+          .filter(
+            (file: string) =>
              file.startsWith(baseFilename) && file.endsWith(".vtt")
          );

@@ -357,7 +595,9 @@ export class YtDlpDownloader {

        for (const subtitleFile of subtitleFiles) {
          // Parse language from filename (e.g., video_123.en.vtt -> en)
-                    const match = subtitleFile.match(/\.([a-z]{2}(?:-[A-Z]{2})?)(?:\..*?)?\.vtt$/);
+          const match = subtitleFile.match(
+            /\.([a-z]{2}(?:-[A-Z]{2})?)(?:\..*?)?\.vtt$/
+          );
          const language = match ? match[1] : "unknown";

          // Move subtitle to subtitles directory
@@ -366,19 +606,21 @@ export class YtDlpDownloader {
          const destSubPath = path.join(SUBTITLES_DIR, destSubFilename);

          // Read VTT file and fix alignment for centering
-                    let vttContent = fs.readFileSync(sourceSubPath, 'utf-8');
+          let vttContent = fs.readFileSync(sourceSubPath, "utf-8");
          // Replace align:start with align:middle for centered subtitles
          // Also remove position:0% which forces left positioning
-                    vttContent = vttContent.replace(/ align:start/g, ' align:middle');
-                    vttContent = vttContent.replace(/ position:0%/g, '');
+          vttContent = vttContent.replace(/ align:start/g, " align:middle");
+          vttContent = vttContent.replace(/ position:0%/g, "");

          // Write cleaned VTT to destination
-                    fs.writeFileSync(destSubPath, vttContent, 'utf-8');
+          fs.writeFileSync(destSubPath, vttContent, "utf-8");

          // Remove original file
          fs.unlinkSync(sourceSubPath);

-                    console.log(`Processed and moved subtitle ${subtitleFile} to ${destSubPath}`);
+          console.log(
+            `Processed and moved subtitle ${subtitleFile} to ${destSubPath}`
+          );

          subtitles.push({
            language,
@@ -389,7 +631,6 @@ export class YtDlpDownloader {
      } catch (subtitleError) {
        console.error("Error processing subtitle files:", subtitleError);
      }
-
    } catch (error) {
      console.error("Error in download process:", error);
      throw error;
@@ -422,7 +663,9 @@ export class YtDlpDownloader {
    const finalVideoPath = path.join(VIDEOS_DIR, finalVideoFilename);

    try {
-             const { getVideoDuration } = await import("../../services/metadataService");
+      const { getVideoDuration } = await import(
+        "../../services/metadataService"
+      );
      const duration = await getVideoDuration(finalVideoPath);
      if (duration) {
        videoData.duration = duration.toString();
--- a/build-and-push.sh
+++ b/build-and-push.sh
@@ -5,8 +5,8 @@ DOCKER_PATH="/Applications/Docker.app/Contents/Resources/bin/docker"
 USERNAME="franklioxygen"
 VERSION=$1

-BACKEND_LATEST="$USERNAME/mytube:backend-latest"
-FRONTEND_LATEST="$USERNAME/mytube:frontend-latest"
+BACKEND_LATEST="$USERNAME/mytube:backend-test"
+FRONTEND_LATEST="$USERNAME/mytube:frontend-test"

 if [ -n "$VERSION" ]; then
  echo "🔖 Version specified: $VERSION"