bskrishna2006 committed on
Commit
b4562f5
·
1 Parent(s): 034b462

Add youtube_transcript_api for better cloud compatibility

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. services/transcript.py +134 -9
requirements.txt CHANGED
@@ -14,9 +14,10 @@ requests>=2.31.0
14
  httpx>=0.24.0,<0.26.0
15
 
16
  # =============================================================================
17
- # YouTube Download
18
  # =============================================================================
19
  yt-dlp>=2024.1.1
 
20
 
21
  # =============================================================================
22
  # Groq API for Summarization (FREE)
 
14
  httpx>=0.24.0,<0.26.0
15
 
16
  # =============================================================================
17
+ # YouTube Download & Transcripts
18
  # =============================================================================
19
  yt-dlp>=2024.1.1
20
+ youtube_transcript_api>=0.6.0
21
 
22
  # =============================================================================
23
  # Groq API for Summarization (FREE)
services/transcript.py CHANGED
@@ -2,8 +2,9 @@
2
  Transcript Service for YouTube Videos
3
 
4
  This service extracts transcripts from YouTube videos using multiple methods:
5
- 1. First, try to get existing subtitles/captions (fastest, no model needed)
6
- 2. If no subtitles available, fallback to audio extraction + Whisper transcription
 
7
 
8
  The fallback uses the SpeechToTextService for local Whisper transcription.
9
  """
@@ -12,7 +13,15 @@ import re
12
  import os
13
  import tempfile
14
  import logging
15
- from typing import Optional, Tuple
 
 
 
 
 
 
 
 
16
 
17
  import yt_dlp
18
 
@@ -86,9 +95,113 @@ class TranscriptService:
86
 
87
  return text
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def get_subtitles(self, url: str, lang: str = "en") -> Optional[dict]:
90
  """
91
- Try to get existing subtitles from YouTube.
92
 
93
  Args:
94
  url: YouTube video URL
@@ -187,8 +300,10 @@ class TranscriptService:
187
  """
188
  Get transcript from a YouTube video.
189
 
190
- First tries to get subtitles. If unavailable and use_whisper_fallback is True,
191
- falls back to audio extraction and Whisper transcription.
 
 
192
 
193
  Args:
194
  url: YouTube video URL
@@ -198,14 +313,24 @@ class TranscriptService:
198
  Dictionary with:
199
  - transcript: The transcript text
200
  - language: Detected/extracted language code
201
- - source: "subtitles" or "whisper"
202
  - word_count: Number of words
203
 
204
  Raises:
205
  Exception: If transcript cannot be obtained
206
  """
207
- # Try subtitles first (faster, no model needed)
208
- logger.info("Attempting to get subtitles...")
 
 
 
 
 
 
 
 
 
 
209
  result = self.get_subtitles(url)
210
 
211
  if result:
 
2
  Transcript Service for YouTube Videos
3
 
4
  This service extracts transcripts from YouTube videos using multiple methods:
5
+ 1. First, try youtube_transcript_api (works well on cloud platforms)
6
+ 2. Then try yt-dlp subtitle extraction
7
+ 3. If no subtitles available, fallback to audio extraction + Whisper transcription
8
 
9
  The fallback uses the SpeechToTextService for local Whisper transcription.
10
  """
 
13
  import os
14
  import tempfile
15
  import logging
16
+ from typing import Optional, Tuple, List
17
+
18
+ # Try to import youtube_transcript_api (more reliable for cloud deployments)
19
+ try:
20
+ from youtube_transcript_api import YouTubeTranscriptApi
21
+ from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
22
+ HAS_YOUTUBE_TRANSCRIPT_API = True
23
+ except ImportError:
24
+ HAS_YOUTUBE_TRANSCRIPT_API = False
25
 
26
  import yt_dlp
27
 
 
95
 
96
  return text
97
 
98
def get_transcript_api(self, video_id: str) -> Optional[dict]:
    """
    Get transcript using youtube_transcript_api (works better on cloud platforms).

    Selection order: a manually created transcript in one of the preferred
    languages, then an auto-generated transcript in one of those languages,
    then the first transcript the video lists, whatever its language.

    Args:
        video_id: YouTube video ID

    Returns:
        Dictionary with "transcript", "language", "source" ("youtube_api")
        and "word_count", or None if no usable transcript is available
        (library missing, transcripts disabled, none found, cleaned text
        shorter than 50 characters, or any other library failure).
    """
    # Nothing to look up without an ID; let the caller try its next method.
    if not video_id:
        return None

    if not HAS_YOUTUBE_TRANSCRIPT_API:
        logger.info("youtube_transcript_api not installed, skipping...")
        return None

    # Priority order used for both the manual and the auto-generated lookup.
    preferred_langs = ['en', 'en-IN', 'hi', 'ta', 'te', 'kn', 'ml', 'gu', 'bn', 'mr', 'pa', 'ur']

    # Maps YouTube language codes to the three-letter codes returned to callers.
    lang_map = {
        "en": "eng", "en-IN": "eng", "en-US": "eng", "en-GB": "eng",
        "hi": "hin", "hi-IN": "hin",
        "ta": "tam", "ta-IN": "tam",
        "te": "tel", "te-IN": "tel",
        "kn": "kan", "kn-IN": "kan",
        "ml": "mal", "ml-IN": "mal",
        "gu": "guj", "gu-IN": "guj",
        "bn": "ben", "bn-IN": "ben",
        "mr": "mar", "mr-IN": "mar",
        "pa": "pan", "pa-IN": "pan",
        "ur": "urd", "ur-PK": "urd",
    }

    try:
        # NOTE(review): requirements.txt only pins ">=0.6.0". Newer releases
        # of the library are understood to replace the static
        # list_transcripts() with an instance-level list() - confirm against
        # the installed version. Both call styles are supported here.
        if hasattr(YouTubeTranscriptApi, "list_transcripts"):
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        else:
            transcript_list = YouTubeTranscriptApi().list(video_id)

        # The finder methods take the language list in priority order, so
        # one call per category replaces a per-language loop. Only the
        # "nothing in these languages" error is treated as a miss; anything
        # else propagates to the handlers below instead of being swallowed.
        transcript = None
        finders = (
            transcript_list.find_manually_created_transcript,
            transcript_list.find_generated_transcript,
        )
        for find in finders:
            try:
                transcript = find(preferred_langs)
                break
            except NoTranscriptFound:
                continue

        # Last resort: take whatever transcript is listed first.
        if transcript is None:
            transcript = next(iter(transcript_list), None)

        if transcript is None:
            logger.info("No transcript found for this video")
            return None

        detected_lang = transcript.language_code

        # Fetch the actual transcript. Entries are dicts in older library
        # versions and objects exposing a .text attribute in newer ones.
        transcript_data = transcript.fetch()
        text_parts = [
            entry["text"] if isinstance(entry, dict) else entry.text
            for entry in transcript_data
        ]
        full_text = ' '.join(text_parts)

        # Clean the text
        clean_text = self.clean_autogen_transcript(full_text)

        if len(clean_text.strip()) < 50:
            logger.info("Transcript too short")
            return None

        # Normalize language code: exact match first, then the base language
        # (so e.g. "en-CA" still maps to "eng"), else keep the raw code.
        base_lang = detected_lang.split("-")[0]
        normalized_lang = lang_map.get(detected_lang) or lang_map.get(base_lang, detected_lang)

        logger.info("Transcript fetched via API (language: %s)", normalized_lang)

        return {
            "transcript": clean_text,
            "language": normalized_lang,
            "source": "youtube_api",
            "word_count": len(clean_text.split())
        }

    except TranscriptsDisabled:
        logger.info("Transcripts are disabled for this video")
        return None
    except NoTranscriptFound:
        logger.info("No transcript found for this video")
        return None
    except Exception as e:
        # Best-effort method: any other failure (network, blocked IP, parsing)
        # is logged and reported as "not available" so the caller can fall
        # back to its next extraction method.
        logger.warning("youtube_transcript_api failed: %s", e)
        return None
201
+
202
  def get_subtitles(self, url: str, lang: str = "en") -> Optional[dict]:
203
  """
204
+ Try to get existing subtitles from YouTube using yt-dlp.
205
 
206
  Args:
207
  url: YouTube video URL
 
300
  """
301
  Get transcript from a YouTube video.
302
 
303
+ Tries multiple methods in order:
304
+ 1. youtube_transcript_api (works best on cloud platforms)
305
+ 2. yt-dlp subtitle extraction
306
+ 3. Whisper transcription (fallback)
307
 
308
  Args:
309
  url: YouTube video URL
 
313
  Dictionary with:
314
  - transcript: The transcript text
315
  - language: Detected/extracted language code
316
+ - source: "youtube_api", "subtitles", or "whisper"
317
  - word_count: Number of words
318
 
319
  Raises:
320
  Exception: If transcript cannot be obtained
321
  """
322
+ # Extract video ID for API-based methods
323
+ video_id = self.extract_video_id(url)
324
+
325
+ # Method 1: Try youtube_transcript_api first (best for cloud platforms)
326
+ logger.info("Attempting to get transcript via YouTube API...")
327
+ result = self.get_transcript_api(video_id)
328
+
329
+ if result:
330
+ return result
331
+
332
+ # Method 2: Try yt-dlp subtitle extraction
333
+ logger.info("Attempting to get subtitles via yt-dlp...")
334
  result = self.get_subtitles(url)
335
 
336
  if result: