Spaces:

Krishna346
/

Youtube-summarizer-api

Running

App Files Files Community

bskrishna2006 committed 3 days ago

Commit

b4562f5

1 Parent(s): 034b462

Add youtube_transcript_api for better cloud compatibility

Browse files

Files changed (2) hide show

requirements.txt +2 -1
services/transcript.py +134 -9

requirements.txt CHANGED Viewed

@@ -14,9 +14,10 @@ requests>=2.31.0
 httpx>=0.24.0,<0.26.0
 # =============================================================================
-# YouTube Download
 # =============================================================================
 yt-dlp>=2024.1.1
 # =============================================================================
 # Groq API for Summarization (FREE)

 httpx>=0.24.0,<0.26.0
 # =============================================================================
+# YouTube Download & Transcripts
 # =============================================================================
 yt-dlp>=2024.1.1
+youtube_transcript_api>=0.6.0
 # =============================================================================
 # Groq API for Summarization (FREE)

services/transcript.py CHANGED Viewed

@@ -2,8 +2,9 @@
 Transcript Service for YouTube Videos
 This service extracts transcripts from YouTube videos using multiple methods:
-1. First, try to get existing subtitles/captions (fastest, no model needed)
-2. If no subtitles available, fallback to audio extraction + Whisper transcription
 The fallback uses the SpeechToTextService for local Whisper transcription.
 """
@@ -12,7 +13,15 @@ import re
 import os
 import tempfile
 import logging
-from typing import Optional, Tuple
 import yt_dlp
@@ -86,9 +95,113 @@ class TranscriptService:
         return text
     def get_subtitles(self, url: str, lang: str = "en") -> Optional[dict]:
         """
-        Try to get existing subtitles from YouTube.
         Args:
             url: YouTube video URL
@@ -187,8 +300,10 @@ class TranscriptService:
         """
         Get transcript from a YouTube video.
-        First tries to get subtitles. If unavailable and use_whisper_fallback is True,
-        falls back to audio extraction and Whisper transcription.
         Args:
             url: YouTube video URL
@@ -198,14 +313,24 @@ class TranscriptService:
             Dictionary with:
                 - transcript: The transcript text
                 - language: Detected/extracted language code
-                - source: "subtitles" or "whisper"
                 - word_count: Number of words
         Raises:
             Exception: If transcript cannot be obtained
         """
-        # Try subtitles first (faster, no model needed)
-        logger.info("Attempting to get subtitles...")
         result = self.get_subtitles(url)
         if result:

 Transcript Service for YouTube Videos
 This service extracts transcripts from YouTube videos using multiple methods:
+1. First, try youtube_transcript_api (works well on cloud platforms)
+2. Then try yt-dlp subtitle extraction
+3. If no subtitles available, fallback to audio extraction + Whisper transcription
 The fallback uses the SpeechToTextService for local Whisper transcription.
 """
 import os
 import tempfile
 import logging
+from typing import Optional, Tuple, List
+# Try to import youtube_transcript_api (more reliable for cloud deployments)
+try:
+    from youtube_transcript_api import YouTubeTranscriptApi
+    from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
+    HAS_YOUTUBE_TRANSCRIPT_API = True
+except ImportError:
+    HAS_YOUTUBE_TRANSCRIPT_API = False
 import yt_dlp
         return text
+    def get_transcript_api(self, video_id: str) -> Optional[dict]:
+        """
+        Get transcript using youtube_transcript_api (works better on cloud platforms).
+        Args:
+            video_id: YouTube video ID
+        Returns:
+            Dictionary with transcript and language, or None if not available
+        """
+        if not HAS_YOUTUBE_TRANSCRIPT_API:
+            logger.info("youtube_transcript_api not installed, skipping...")
+            return None
+        try:
+            # Try to get transcript in preferred languages
+            preferred_langs = ['en', 'en-IN', 'hi', 'ta', 'te', 'kn', 'ml', 'gu', 'bn', 'mr', 'pa', 'ur']
+            try:
+                transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+                # Try to find a manual transcript first, then auto-generated
+                transcript = None
+                detected_lang = "eng"
+                # First try manual transcripts
+                for lang in preferred_langs:
+                    try:
+                        transcript = transcript_list.find_manually_created_transcript([lang])
+                        detected_lang = lang
+                        break
+                    except:
+                        pass
+                # Then try auto-generated
+                if not transcript:
+                    for lang in preferred_langs:
+                        try:
+                            transcript = transcript_list.find_generated_transcript([lang])
+                            detected_lang = lang
+                            break
+                        except:
+                            pass
+                # If still no transcript, try to get any available
+                if not transcript:
+                    for t in transcript_list:
+                        transcript = t
+                        detected_lang = t.language_code
+                        break
+                if transcript:
+                    # Fetch the actual transcript
+                    transcript_data = transcript.fetch()
+                    # Combine all text
+                    text_parts = [entry['text'] for entry in transcript_data]
+                    full_text = ' '.join(text_parts)
+                    # Clean the text
+                    clean_text = self.clean_autogen_transcript(full_text)
+                    if len(clean_text.strip()) < 50:
+                        logger.info("Transcript too short")
+                        return None
+                    # Normalize language code
+                    lang_map = {
+                        "en": "eng", "en-IN": "eng", "en-US": "eng", "en-GB": "eng",
+                        "hi": "hin", "hi-IN": "hin",
+                        "ta": "tam", "ta-IN": "tam",
+                        "te": "tel", "te-IN": "tel",
+                        "kn": "kan", "kn-IN": "kan",
+                        "ml": "mal", "ml-IN": "mal",
+                        "gu": "guj", "gu-IN": "guj",
+                        "bn": "ben", "bn-IN": "ben",
+                        "mr": "mar", "mr-IN": "mar",
+                        "pa": "pan", "pa-IN": "pan",
+                        "ur": "urd", "ur-PK": "urd",
+                    }
+                    normalized_lang = lang_map.get(detected_lang, detected_lang)
+                    logger.info(f"Transcript fetched via API (language: {normalized_lang})")
+                    return {
+                        "transcript": clean_text,
+                        "language": normalized_lang,
+                        "source": "youtube_api",
+                        "word_count": len(clean_text.split())
+                    }
+            except TranscriptsDisabled:
+                logger.info("Transcripts are disabled for this video")
+                return None
+            except NoTranscriptFound:
+                logger.info("No transcript found for this video")
+                return None
+        except Exception as e:
+            logger.warning(f"youtube_transcript_api failed: {e}")
+            return None
+        return None
     def get_subtitles(self, url: str, lang: str = "en") -> Optional[dict]:
         """
+        Try to get existing subtitles from YouTube using yt-dlp.
         Args:
             url: YouTube video URL
         """
         Get transcript from a YouTube video.
+        Tries multiple methods in order:
+        1. youtube_transcript_api (works best on cloud platforms)
+        2. yt-dlp subtitle extraction
+        3. Whisper transcription (fallback)
         Args:
             url: YouTube video URL
             Dictionary with:
                 - transcript: The transcript text
                 - language: Detected/extracted language code
+                - source: "youtube_api", "subtitles", or "whisper"
                 - word_count: Number of words
         Raises:
             Exception: If transcript cannot be obtained
         """
+        # Extract video ID for API-based methods
+        video_id = self.extract_video_id(url)
+        # Method 1: Try youtube_transcript_api first (best for cloud platforms)
+        logger.info("Attempting to get transcript via YouTube API...")
+        result = self.get_transcript_api(video_id)
+        if result:
+            return result
+        # Method 2: Try yt-dlp subtitle extraction
+        logger.info("Attempting to get subtitles via yt-dlp...")
         result = self.get_subtitles(url)
         if result: