Samuel Thomas committed
Commit 0f3a132 · 1 Parent(s): 0264a40

missing dependency

Files changed (2)
  1. requirements.txt +2 -1
  2. tools.py +165 -257
requirements.txt CHANGED
@@ -25,4 +25,5 @@ sentencepiece
  nltk
  SpeechRecognition
  pandas
- openai-whisper
+ openai-whisper
+ openpyxl
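
A note on the new entry: pandas treats Excel support as an optional dependency, so reading an .xlsx file fails with an ImportError unless openpyxl is installed; that is presumably the "missing dependency" this commit fixes. A minimal sketch of the call that would need it (the file name is hypothetical, not from this repository):

    import pandas as pd

    # Without openpyxl installed, pandas raises ImportError for the missing optional
    # dependency; with it pinned in requirements.txt the same call succeeds.
    df = pd.read_excel("tasks.xlsx", engine="openpyxl")  # hypothetical file name
    print(df.head())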
tools.py CHANGED
@@ -18,7 +18,7 @@ from langchain_huggingface import HuggingFacePipeline
  from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set, Type
  import time
  from collections import Counter
- from pydantic import Field, BaseModel
  import hashlib
  import json
  import numpy as np
@@ -2151,43 +2151,45 @@ class YouTubeTranscriptExtractor(BaseTool):
  "Downloads a YouTube video and extracts the complete audio transcript using speech recognition. "
  "Use this tool for questions about what people say in YouTube videos. "
  "Input should be a dict with keys: 'youtube_url' and optional parameters. "
- "Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US'}"
  )

- # Define Pydantic fields for the attributes we need to set
  recognizer: Any = Field(default=None, exclude=True)

  class Config:
  arbitrary_types_allowed = True
- extra = "allow"

- def __init__(self, **kwargs):
  super().__init__(**kwargs)

- # Initialize directories
  self.cache_dir = '/tmp/youtube_transcript_cache/'
  self.audio_dir = '/tmp/audio/'
  self.chunks_dir = '/tmp/audio_chunks/'

- # Initialize speech recognizer
  self.recognizer = sr.Recognizer()
  self.recognizer.energy_threshold = 4000
  self.recognizer.pause_threshold = 0.8

- # Create directories
  for dir_path in [self.cache_dir, self.audio_dir, self.chunks_dir]:
  os.makedirs(dir_path, exist_ok=True)

- def _get_config(self, key: str, default_value=None, input_data: Dict[str, Any] = None):
- """Get configuration value with fallback to defaults"""
  defaults = {
  'language': 'en-US',
- 'chunk_length_ms': 30000, # 30 seconds
- 'silence_thresh': -40, # dB
  'audio_quality': 'best',
  'cache_enabled': True,
- 'min_silence_len': 500, # minimum silence length to split on
- 'overlap_ms': 1000, # 1 second overlap between chunks
  }

  if input_data and key in input_data:
@@ -2195,15 +2197,12 @@ class YouTubeTranscriptExtractor(BaseTool):
  return defaults.get(key, default_value)

  def _get_video_hash(self, url: str) -> str:
- """Generate hash for video URL for caching"""
  return hashlib.md5(url.encode()).hexdigest()

  def _get_cache_path(self, video_hash: str, cache_type: str) -> str:
- """Get cache file path"""
  return os.path.join(self.cache_dir, f"{video_hash}_{cache_type}")

  def _load_from_cache(self, cache_path: str, cache_enabled: bool = True) -> Optional[Any]:
- """Load data from cache"""
  if not cache_enabled or not os.path.exists(cache_path):
  return None
  try:
@@ -2214,7 +2213,6 @@ class YouTubeTranscriptExtractor(BaseTool):
  return None

  def _save_to_cache(self, cache_path: str, data: Any, cache_enabled: bool = True):
- """Save data to cache"""
  if not cache_enabled:
  return
  try:
@@ -2224,7 +2222,6 @@ class YouTubeTranscriptExtractor(BaseTool):
  print(f"Error saving cache: {str(e)}")

  def _clean_directory(self, directory: str):
- """Clean directory contents"""
  if os.path.exists(directory):
  for filename in os.listdir(directory):
  file_path = os.path.join(directory, filename)
@@ -2236,29 +2233,28 @@ class YouTubeTranscriptExtractor(BaseTool):
  except Exception as e:
  print(f'Failed to delete {file_path}. Reason: {e}')

- def download_youtube_audio(self, url: str, video_hash: str, input_data: Dict[str, Any] = None) -> Optional[str]:
- """Download YouTube video as audio file"""
  audio_quality = self._get_config('audio_quality', 'best', input_data)
  output_filename = f'{video_hash}.wav'
  output_path = os.path.join(self.audio_dir, output_filename)

- # Check cache
  cache_enabled = self._get_config('cache_enabled', True, input_data)
  if cache_enabled and os.path.exists(output_path):
  print(f"Using cached audio: {output_path}")
  return output_path

- # Clean directory
  self._clean_directory(self.audio_dir)

  try:
- # Updated yt-dlp configuration for better compatibility
- ydl_opts = {
  'format': 'bestaudio[ext=m4a]/bestaudio/best',
  'outtmpl': os.path.join(self.audio_dir, f'{video_hash}.%(ext)s'),
- 'quiet': False, # Set to False for debugging
  'no_warnings': False,
- 'extract_flat': False,
  'writethumbnail': False,
  'writeinfojson': False,
  'postprocessors': [{
@@ -2266,362 +2262,274 @@ class YouTubeTranscriptExtractor(BaseTool):
  'preferredcodec': 'wav',
  'preferredquality': '192' if audio_quality == 'best' else '128',
  }],
- # Add user agent and headers to avoid blocking
  'http_headers': {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  },
- # Add cookie handling
- 'cookiefile': None,
  'nocheckcertificate': True,
  }

  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
- print(f"Downloading audio from: {url}")
  ydl.download([url])
-
- # Check if the output file exists
  if os.path.exists(output_path):
  print(f"Audio downloaded successfully: {output_path}")
  return output_path
  else:
- # Look for any downloaded file with the video hash
  possible_files = glob.glob(os.path.join(self.audio_dir, f'{video_hash}.*'))
  if possible_files:
- # Convert to WAV if needed
  source_file = possible_files[0]
  if not source_file.endswith('.wav'):
  try:
  audio = AudioSegment.from_file(source_file)
  audio.export(output_path, format="wav")
- os.remove(source_file) # Clean up original
  print(f"Audio converted to WAV: {output_path}")
  return output_path
  except Exception as e:
  print(f"Error converting audio: {str(e)}")
  return None
- else:
- return source_file

- print("No audio file found after download")
  return None

  except Exception as e:
- print(f"Error downloading YouTube audio: {str(e)}")
- # Try alternative format as fallback
- try:
- print("Trying alternative download method...")
- fallback_opts = {
- 'format': 'worst[ext=mp4]',
- 'outtmpl': os.path.join(self.audio_dir, f'{video_hash}_fallback.%(ext)s'),
- 'quiet': False,
- }
-
- with yt_dlp.YoutubeDL(fallback_opts) as ydl:
- ydl.download([url])
-
- # Look for fallback file and convert
- fallback_files = glob.glob(os.path.join(self.audio_dir, f'{video_hash}_fallback.*'))
- if fallback_files:
- source_file = fallback_files[0]
- try:
- audio = AudioSegment.from_file(source_file)
- audio.export(output_path, format="wav")
- os.remove(source_file)
- print(f"Fallback audio converted: {output_path}")
- return output_path
- except Exception as conv_e:
- print(f"Error converting fallback audio: {str(conv_e)}")
-
- except Exception as fallback_e:
- print(f"Fallback download also failed: {str(fallback_e)}")
-
  return None

- def _split_audio_intelligent(self, audio_path: str, input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
- """Split audio into chunks intelligently based on silence"""
  self._clean_directory(self.chunks_dir)
-
  try:
- # Load audio
  audio = AudioSegment.from_wav(audio_path)
-
- # Get configuration
  chunk_length_ms = self._get_config('chunk_length_ms', 30000, input_data)
  silence_thresh = self._get_config('silence_thresh', -40, input_data)
  min_silence_len = self._get_config('min_silence_len', 500, input_data)
- overlap_ms = self._get_config('overlap_ms', 1000, input_data)

- # First try to split on silence
  chunks = split_on_silence(
  audio,
  min_silence_len=min_silence_len,
  silence_thresh=silence_thresh,
- keep_silence=True
  )

- # If no silence-based splits or chunks too large, split by time
- if not chunks or any(len(chunk) > chunk_length_ms * 2 for chunk in chunks):
- print("Using time-based splitting...")
- chunks = []
- for i in range(0, len(audio), chunk_length_ms - overlap_ms):
- chunk = audio[i:i + chunk_length_ms]
- if len(chunk) > 1000: # Only add chunks longer than 1 second
- chunks.append(chunk)

- # Save chunks and create metadata
- chunk_data = []
- current_time = 0

- for i, chunk in enumerate(chunks):
- if len(chunk) < 1000: # Skip very short chunks
- continue

  chunk_filename = os.path.join(self.chunks_dir, f"chunk_{i:04d}.wav")
- chunk.export(chunk_filename, format="wav")
-
- duration = len(chunk) / 1000.0 # in seconds
-
- chunk_info = {
- 'filename': chunk_filename,
- 'index': i,
- 'start_time': current_time,
- 'duration': duration,
- 'end_time': current_time + duration
- }

- chunk_data.append(chunk_info)
- current_time += duration

  print(f"Split audio into {len(chunk_data)} chunks")
  return chunk_data
-
  except Exception as e:
  print(f"Error splitting audio: {str(e)}")
- # Fallback: return original file as single chunk
- try:
  audio = AudioSegment.from_wav(audio_path)
  duration = len(audio) / 1000.0
- return [{
- 'filename': audio_path,
- 'index': 0,
- 'start_time': 0,
- 'duration': duration,
- 'end_time': duration
- }]
- except:
- return []

- def _transcribe_audio_chunk(self, chunk_info: Dict[str, Any], input_data: Dict[str, Any] = None) -> Dict[str, Any]:
- """Transcribe a single audio chunk"""
  chunk_path = chunk_info['filename']
  try:
  language = self._get_config('language', 'en-US', input_data)
-
  with sr.AudioFile(chunk_path) as source:
- # Adjust for ambient noise
- self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
  audio_data = self.recognizer.record(source)

- # Try Google Speech Recognition
  try:
  text = self.recognizer.recognize_google(audio_data, language=language)
- return {
- 'text': text,
- 'confidence': 1.0,
- 'start_time': chunk_info['start_time'],
- 'end_time': chunk_info['end_time'],
- 'duration': chunk_info['duration'],
- 'index': chunk_info['index'],
- 'success': True
- }
-
  except sr.UnknownValueError:
- # Try without language specification
- try:
  text = self.recognizer.recognize_google(audio_data)
- return {
- 'text': text,
- 'confidence': 0.8,
- 'start_time': chunk_info['start_time'],
- 'end_time': chunk_info['end_time'],
- 'duration': chunk_info['duration'],
- 'index': chunk_info['index'],
- 'success': True
- }
  except sr.UnknownValueError:
- return {
- 'text': '[INAUDIBLE]',
- 'confidence': 0.0,
- 'start_time': chunk_info['start_time'],
- 'end_time': chunk_info['end_time'],
- 'duration': chunk_info['duration'],
- 'index': chunk_info['index'],
- 'success': False
- }
  except sr.RequestError as e:
- return {
- 'text': f'[RECOGNITION_ERROR: {str(e)}]',
- 'confidence': 0.0,
- 'start_time': chunk_info['start_time'],
- 'end_time': chunk_info['end_time'],
- 'duration': chunk_info['duration'],
- 'index': chunk_info['index'],
- 'success': False,
- 'error': str(e)
- }
-
  except Exception as e:
- return {
- 'text': f'[ERROR: {str(e)}]',
- 'confidence': 0.0,
- 'start_time': chunk_info.get('start_time', 0),
- 'end_time': chunk_info.get('end_time', 0),
- 'duration': chunk_info.get('duration', 0),
- 'index': chunk_info.get('index', 0),
- 'success': False,
- 'error': str(e)
- }

- def _transcribe_chunks_parallel(self, chunk_data: List[Dict[str, Any]], input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
- """Transcribe audio chunks in parallel"""
  results = []
-
- # Use fewer workers to avoid API rate limits
- max_workers = min(3, len(chunk_data))

  with ThreadPoolExecutor(max_workers=max_workers) as executor:
  future_to_chunk = {
  executor.submit(self._transcribe_audio_chunk, chunk_info, input_data): chunk_info
  for chunk_info in chunk_data
  }
-
  for future in as_completed(future_to_chunk):
  chunk_info = future_to_chunk[future]
  try:
  result = future.result()
  results.append(result)
- if result['success']:
- preview = result['text'][:50] + "..." if len(result['text']) > 50 else result['text']
- print(f"Transcribed chunk {result['index']}: {preview}")
- else:
- print(f"Failed to transcribe chunk {result['index']}: {result['text']}")
  except Exception as e:
  print(f"Error processing chunk {chunk_info.get('index', '?')}: {str(e)}")
  results.append({
- 'text': f'[PROCESSING_ERROR: {str(e)}]',
- 'confidence': 0.0,
- 'start_time': chunk_info.get('start_time', 0),
- 'end_time': chunk_info.get('end_time', 0),
- 'duration': chunk_info.get('duration', 0),
- 'index': chunk_info.get('index', 0),
- 'success': False,
- 'error': str(e)
  })
-
- # Sort results by chunk index to maintain order
  results.sort(key=lambda x: x['index'])
  return results

- def extract_transcript(self, audio_path: str, video_hash: str, input_data: Dict[str, Any] = None) -> Dict[str, Any]:
- """Extract complete transcript from audio file"""
  cache_enabled = self._get_config('cache_enabled', True, input_data)
  cache_path = self._get_cache_path(video_hash, "transcript.json")

- # Check cache
  cached_transcript = self._load_from_cache(cache_path, cache_enabled)
  if cached_transcript:
  print("Using cached transcript")
  return cached_transcript

  try:
- # Step 1: Split audio into manageable chunks
  print("Splitting audio into chunks...")
  chunk_data = self._split_audio_intelligent(audio_path, input_data)
-
  if not chunk_data:
- return {
- 'error': 'Failed to split audio into chunks',
- 'full_transcript': '',
- 'success_rate': 0.0
- }

- # Step 2: Transcribe all chunks
  print(f"Transcribing {len(chunk_data)} audio chunks...")
  transcript_results = self._transcribe_chunks_parallel(chunk_data, input_data)
-
- # Step 3: Combine results
- successful_results = [r for r in transcript_results if r['success']]
- full_text = ' '.join([r['text'] for r in successful_results])
-
- # Calculate statistics
- total_chunks = len(transcript_results)
- successful_chunks = len(successful_results)
- success_rate = successful_chunks / total_chunks if total_chunks > 0 else 0
- word_count = len(full_text.split()) if full_text else 0
-
  final_result = {
- 'full_transcript': full_text,
- 'word_count': word_count,
- 'total_chunks': total_chunks,
- 'successful_chunks': successful_chunks,
- 'success_rate': success_rate,
- 'extraction_timestamp': time.time(),
- 'extraction_date': time.strftime('%Y-%m-%d %H:%M:%S'),
  'detailed_results': transcript_results
  }
-
- # Cache results
  self._save_to_cache(cache_path, final_result, cache_enabled)
-
  print(f"Transcript extraction completed. Success rate: {success_rate:.1%}")
  return final_result
-
  except Exception as e:
  print(f"Error during transcript extraction: {str(e)}")
- return {
- 'error': str(e),
- 'full_transcript': '',
- 'success_rate': 0.0
- }

- def _run(self, youtube_url: str, **kwargs) -> str:
- """Main execution method"""
- input_data = {
- 'youtube_url': youtube_url,
- **kwargs
- }
-
- if not youtube_url:
- return "Error: youtube_url is required."

  try:
- # Generate video hash for caching
  video_hash = self._get_video_hash(youtube_url)
-
- # Step 1: Download audio
- print(f"Downloading YouTube audio from {youtube_url}...")
  audio_path = self.download_youtube_audio(youtube_url, video_hash, input_data)
  if not audio_path or not os.path.exists(audio_path):
- return "Error: Failed to download the YouTube audio. Please check the URL and try again."

- # Step 2: Extract transcript
  print("Extracting audio transcript...")
  transcript_result = self.extract_transcript(audio_path, video_hash, input_data)

- if transcript_result.get("error"):
- return f"Error: {transcript_result['error']}"
-
- main_transcript = transcript_result.get('full_transcript', '')

- if not main_transcript:
- return "Error: No transcript could be extracted from the audio."
-
- print(f"Transcript extracted successfully. Word count: {transcript_result.get('word_count', 0)}")
- print(f"Success rate: {transcript_result.get('success_rate', 0):.1%}")

  return "TRANSCRIPT: " + main_transcript
-
  except Exception as e:
  return f"Error during transcript extraction: {str(e)}"

  # Factory function to create the tool

  from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set, Type
  import time
  from collections import Counter
+ from pydantic import Field, BaseModel, Extra
  import hashlib
  import json
  import numpy as np

  "Downloads a YouTube video and extracts the complete audio transcript using speech recognition. "
  "Use this tool for questions about what people say in YouTube videos. "
  "Input should be a dict with keys: 'youtube_url' and optional parameters. "
+ "Optional parameters include 'language' (e.g., 'en-US'), "
+ "'cookies_file_path' (path to a cookies TXT file for authentication), "
+ "or 'cookies_from_browser' (string specifying browser for cookies, e.g., 'chrome', 'firefox:profileName', 'edge+keyringName:profileName::containerName'). "
+ "Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US'} or "
+ "{'youtube_url': '...', 'cookies_file_path': '/path/to/cookies.txt'} or "
+ "{'youtube_url': '...', 'cookies_from_browser': 'chrome'}"
  )

  recognizer: Any = Field(default=None, exclude=True)

  class Config:
  arbitrary_types_allowed = True
+ extra = Extra.allow # Adjusted if pydantic v1 style

+ def __init__(self, **kwargs: Any):
  super().__init__(**kwargs)

  self.cache_dir = '/tmp/youtube_transcript_cache/'
  self.audio_dir = '/tmp/audio/'
  self.chunks_dir = '/tmp/audio_chunks/'

  self.recognizer = sr.Recognizer()
  self.recognizer.energy_threshold = 4000
  self.recognizer.pause_threshold = 0.8

  for dir_path in [self.cache_dir, self.audio_dir, self.chunks_dir]:
  os.makedirs(dir_path, exist_ok=True)

+ def _get_config(self, key: str, default_value: Any = None, input_data: Optional[Dict[str, Any]] = None) -> Any:
  defaults = {
  'language': 'en-US',
+ 'chunk_length_ms': 30000,
+ 'silence_thresh': -40,
  'audio_quality': 'best',
  'cache_enabled': True,
+ 'min_silence_len': 500,
+ 'overlap_ms': 1000,
+ 'cookies_file_path': None, # New: Path to a cookies file
+ 'cookies_from_browser': None # New: Browser string e.g., "chrome", "firefox:profile_name"
  }

  if input_data and key in input_data:

  return defaults.get(key, default_value)

  def _get_video_hash(self, url: str) -> str:
  return hashlib.md5(url.encode()).hexdigest()

  def _get_cache_path(self, video_hash: str, cache_type: str) -> str:
  return os.path.join(self.cache_dir, f"{video_hash}_{cache_type}")

  def _load_from_cache(self, cache_path: str, cache_enabled: bool = True) -> Optional[Any]:
  if not cache_enabled or not os.path.exists(cache_path):
  return None
  try:

  return None

  def _save_to_cache(self, cache_path: str, data: Any, cache_enabled: bool = True):
  if not cache_enabled:
  return
  try:

  print(f"Error saving cache: {str(e)}")

  def _clean_directory(self, directory: str):
  if os.path.exists(directory):
  for filename in os.listdir(directory):
  file_path = os.path.join(directory, filename)

  except Exception as e:
  print(f'Failed to delete {file_path}. Reason: {e}')

+ def download_youtube_audio(self, url: str, video_hash: str, input_data: Optional[Dict[str, Any]] = None) -> Optional[str]:
  audio_quality = self._get_config('audio_quality', 'best', input_data)
  output_filename = f'{video_hash}.wav'
  output_path = os.path.join(self.audio_dir, output_filename)

  cache_enabled = self._get_config('cache_enabled', True, input_data)
  if cache_enabled and os.path.exists(output_path):
  print(f"Using cached audio: {output_path}")
  return output_path

  self._clean_directory(self.audio_dir)

+ cookies_file_path = self._get_config('cookies_file_path', None, input_data)
+ cookies_from_browser_str = self._get_config('cookies_from_browser', None, input_data)
+
  try:
+ ydl_opts: Dict[str, Any] = {
  'format': 'bestaudio[ext=m4a]/bestaudio/best',
  'outtmpl': os.path.join(self.audio_dir, f'{video_hash}.%(ext)s'),
+ 'quiet': False,
  'no_warnings': False,
+ 'extract_flat': False, # Ensure this is false for actual downloads
  'writethumbnail': False,
  'writeinfojson': False,
  'postprocessors': [{

  'preferredcodec': 'wav',
  'preferredquality': '192' if audio_quality == 'best' else '128',
  }],
  'http_headers': {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  },
  'nocheckcertificate': True,
  }

+ if cookies_file_path:
+ ydl_opts['cookiefile'] = cookies_file_path
+ print(f"Using cookies from file: {cookies_file_path}")
+ elif cookies_from_browser_str:
+ parsed_browser, parsed_profile, parsed_keyring, parsed_container = None, None, None, None
+
+ temp_str = cookies_from_browser_str
+
+ if '::' in temp_str:
+ main_part_before_container, parsed_container_val = temp_str.split('::', 1)
+ parsed_container = parsed_container_val if parsed_container_val else None
+ temp_str = main_part_before_container
+
+ if ':' in temp_str:
+ browser_keyring_part, parsed_profile_val = temp_str.split(':', 1)
+ parsed_profile = parsed_profile_val if parsed_profile_val else None
+ temp_str = browser_keyring_part
+
+ if '+' in temp_str:
+ parsed_browser_val, parsed_keyring_val = temp_str.split('+', 1)
+ parsed_browser = parsed_browser_val
+ parsed_keyring = parsed_keyring_val if parsed_keyring_val else None
+ else:
+ parsed_browser = temp_str
+
+ if parsed_browser:
+ # yt-dlp expects cookiesfrombrowser as a tuple: (BROWSER, PROFILE, KEYRING, CONTAINER)
+ final_tuple: Tuple[Optional[str], ...] = (
+ parsed_browser,
+ parsed_profile,
+ parsed_keyring,
+ parsed_container
+ )
+ ydl_opts['cookiesfrombrowser'] = final_tuple
+ print(f"Attempting to use cookies from browser spec '{cookies_from_browser_str}', parsed as: {final_tuple}")
+ else:
+ print(f"Invalid or empty browser name in cookies_from_browser string: '{cookies_from_browser_str}'")
+
+
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+ print(f"Downloading audio from: {url} with options: {ydl_opts}")
  ydl.download([url])
+
  if os.path.exists(output_path):
  print(f"Audio downloaded successfully: {output_path}")
  return output_path
  else:
  possible_files = glob.glob(os.path.join(self.audio_dir, f'{video_hash}.*'))
  if possible_files:
  source_file = possible_files[0]
  if not source_file.endswith('.wav'):
  try:
  audio = AudioSegment.from_file(source_file)
  audio.export(output_path, format="wav")
+ os.remove(source_file)
  print(f"Audio converted to WAV: {output_path}")
  return output_path
  except Exception as e:
  print(f"Error converting audio: {str(e)}")
  return None
+ else: # Already a .wav, possibly due to postprocessor already creating it with a different ext pattern
+ if source_file != output_path: # if names differ due to original extension
+ shutil.move(source_file, output_path)
+ print(f"Audio file found: {output_path}")
+ return output_path

+ print(f"No audio file found at expected path after download: {output_path}")
  return None

+ except yt_dlp.utils.DownloadError as de:
+ print(f"yt-dlp DownloadError: {str(de)}")
+ if "Sign in to confirm you're not a bot" in str(de) and not (cookies_file_path or cookies_from_browser_str):
+ print("Authentication required. Consider using 'cookies_file_path' or 'cookies_from_browser' options.")
+ return None
  except Exception as e:
+ print(f"Error downloading YouTube audio: {type(e).__name__} - {str(e)}")
+ # Fallback attempt is removed as it's unlikely to succeed if the primary authenticated attempt fails due to bot detection
  return None
 
+ def _split_audio_intelligent(self, audio_path: str, input_data: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
  self._clean_directory(self.chunks_dir)
  try:
  audio = AudioSegment.from_wav(audio_path)
  chunk_length_ms = self._get_config('chunk_length_ms', 30000, input_data)
  silence_thresh = self._get_config('silence_thresh', -40, input_data)
  min_silence_len = self._get_config('min_silence_len', 500, input_data)
+ overlap_ms = self._get_config('overlap_ms', 1000, input_data) # Not used in current split_on_silence

  chunks = split_on_silence(
  audio,
  min_silence_len=min_silence_len,
  silence_thresh=silence_thresh,
+ keep_silence=True
  )

+ processed_chunks: List[AudioSegment] = [] # type: ignore
+ # Combine small chunks or re-chunk if silence splitting is ineffective
+ temp_chunk: Optional[AudioSegment] = None # type: ignore
+ for chunk in chunks:
+ if temp_chunk is None:
+ temp_chunk = chunk
+ else:
+ temp_chunk += chunk

+ if len(temp_chunk) > chunk_length_ms / 2 or chunk == chunks[-1]: # Arbitrary threshold to combine small chunks
+ processed_chunks.append(temp_chunk)
+ temp_chunk = None

+ if not processed_chunks or any(len(p_chunk) > chunk_length_ms * 1.5 for p_chunk in processed_chunks): # If still problematic
+ print("Using time-based splitting due to ineffective silence splitting or overly large chunks...")
+ processed_chunks = []
+ for i in range(0, len(audio), chunk_length_ms - overlap_ms):
+ chunk_segment = audio[i:i + chunk_length_ms]
+ if len(chunk_segment) > 1000:
+ processed_chunks.append(chunk_segment)
+
+ chunk_data = []
+ current_time_ms = 0
+ for i, chunk_segment in enumerate(processed_chunks):
+ if len(chunk_segment) < 1000: continue

  chunk_filename = os.path.join(self.chunks_dir, f"chunk_{i:04d}.wav")
+ chunk_segment.export(chunk_filename, format="wav")
+
+ duration_s = len(chunk_segment) / 1000.0
+ start_time_s = current_time_ms / 1000.0
+ end_time_s = start_time_s + duration_s

+ chunk_data.append({
+ 'filename': chunk_filename, 'index': i,
+ 'start_time': start_time_s, 'duration': duration_s, 'end_time': end_time_s
+ })
+ current_time_ms += len(chunk_segment) # Approximation, true timestamping is harder

  print(f"Split audio into {len(chunk_data)} chunks")
  return chunk_data
  except Exception as e:
  print(f"Error splitting audio: {str(e)}")
+ try: # Fallback: single chunk
  audio = AudioSegment.from_wav(audio_path)
  duration = len(audio) / 1000.0
+ return [{'filename': audio_path, 'index': 0, 'start_time': 0, 'duration': duration, 'end_time': duration}]
+ except: return []
+
 
+ def _transcribe_audio_chunk(self, chunk_info: Dict[str, Any], input_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
  chunk_path = chunk_info['filename']
+ base_result = {
+ 'start_time': chunk_info.get('start_time', 0), 'end_time': chunk_info.get('end_time', 0),
+ 'duration': chunk_info.get('duration', 0), 'index': chunk_info.get('index', -1),
+ 'success': False, 'confidence': 0.0
+ }
  try:
  language = self._get_config('language', 'en-US', input_data)
  with sr.AudioFile(chunk_path) as source:
+ self.recognizer.adjust_for_ambient_noise(source, duration=0.2) # Shorter adjustment
  audio_data = self.recognizer.record(source)

  try:
  text = self.recognizer.recognize_google(audio_data, language=language)
+ return {**base_result, 'text': text, 'confidence': 1.0, 'success': True}
  except sr.UnknownValueError:
+ try: # Try without specific language
  text = self.recognizer.recognize_google(audio_data)
+ return {**base_result, 'text': text, 'confidence': 0.8, 'success': True} # Lower confidence
  except sr.UnknownValueError:
+ return {**base_result, 'text': '[INAUDIBLE]'}
  except sr.RequestError as e:
+ return {**base_result, 'text': f'[RECOGNITION_ERROR: {str(e)}]', 'error': str(e)}
  except Exception as e:
+ return {**base_result, 'text': f'[ERROR: {str(e)}]', 'error': str(e)}
 
+ def _transcribe_chunks_parallel(self, chunk_data: List[Dict[str, Any]], input_data: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
  results = []
+ max_workers = min(os.cpu_count() or 1, 4) # Limit workers

  with ThreadPoolExecutor(max_workers=max_workers) as executor:
  future_to_chunk = {
  executor.submit(self._transcribe_audio_chunk, chunk_info, input_data): chunk_info
  for chunk_info in chunk_data
  }
  for future in as_completed(future_to_chunk):
  chunk_info = future_to_chunk[future]
  try:
  result = future.result()
  results.append(result)
+ status = "Transcribed" if result['success'] else "Failed"
+ preview = result['text'][:50] + "..." if len(result['text']) > 50 else result['text']
+ print(f"{status} chunk {result['index']}: {preview}")
  except Exception as e:
  print(f"Error processing chunk {chunk_info.get('index', '?')}: {str(e)}")
  results.append({
+ 'text': f'[PROCESSING_ERROR: {str(e)}]', 'confidence': 0.0,
+ 'start_time': chunk_info.get('start_time', 0), 'end_time': chunk_info.get('end_time', 0),
+ 'duration': chunk_info.get('duration', 0), 'index': chunk_info.get('index', 0),
+ 'success': False, 'error': str(e)
  })
  results.sort(key=lambda x: x['index'])
  return results
 
+ def extract_transcript(self, audio_path: str, video_hash: str, input_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
  cache_enabled = self._get_config('cache_enabled', True, input_data)
  cache_path = self._get_cache_path(video_hash, "transcript.json")

  cached_transcript = self._load_from_cache(cache_path, cache_enabled)
  if cached_transcript:
  print("Using cached transcript")
  return cached_transcript

  try:
  print("Splitting audio into chunks...")
  chunk_data = self._split_audio_intelligent(audio_path, input_data)
  if not chunk_data:
+ return {'error': 'Failed to split audio', 'full_transcript': '', 'success_rate': 0.0}

  print(f"Transcribing {len(chunk_data)} audio chunks...")
  transcript_results = self._transcribe_chunks_parallel(chunk_data, input_data)
+
+ successful_chunks = [r for r in transcript_results if r['success']]
+ full_text = ' '.join([r['text'] for r in successful_chunks if r['text'] and '[INAUDIBLE]' not in r['text'] and 'ERROR' not in r['text']]).strip()
+
+ total_c = len(transcript_results)
+ successful_c = len(successful_chunks)
+ success_rate = successful_c / total_c if total_c > 0 else 0.0
+
  final_result = {
+ 'full_transcript': full_text, 'word_count': len(full_text.split()),
+ 'total_chunks': total_c, 'successful_chunks': successful_c, 'success_rate': success_rate,
+ 'extraction_timestamp': time.time(), 'extraction_date': time.strftime('%Y-%m-%d %H:%M:%S'),
  'detailed_results': transcript_results
  }
  self._save_to_cache(cache_path, final_result, cache_enabled)
  print(f"Transcript extraction completed. Success rate: {success_rate:.1%}")
  return final_result
  except Exception as e:
  print(f"Error during transcript extraction: {str(e)}")
+ return {'error': str(e), 'full_transcript': '', 'success_rate': 0.0}
 
+ def _run(self, youtube_url: str, **kwargs: Any) -> str:
+ input_data = {'youtube_url': youtube_url, **kwargs}
+ if not youtube_url: return "Error: youtube_url is required."

  try:
  video_hash = self._get_video_hash(youtube_url)
+ print(f"Processing YouTube URL: {youtube_url} (Hash: {video_hash})")
+
  audio_path = self.download_youtube_audio(youtube_url, video_hash, input_data)
  if not audio_path or not os.path.exists(audio_path):
+ return "Error: Failed to download YouTube audio. Check URL or authentication (cookies)."

  print("Extracting audio transcript...")
  transcript_result = self.extract_transcript(audio_path, video_hash, input_data)

+ if transcript_result.get("error"): return f"Error: {transcript_result['error']}"

+ main_transcript = transcript_result.get('full_transcript', '')
+ if not main_transcript: return "Error: No transcript could be extracted."

+ print(f"Transcript extracted. Word count: {transcript_result.get('word_count',0)}. Success: {transcript_result.get('success_rate',0):.1%}")
  return "TRANSCRIPT: " + main_transcript
  except Exception as e:
+ print(f"Unhandled error in _run: {str(e)}") # For debugging
  return f"Error during transcript extraction: {str(e)}"

  # Factory function to create the tool
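
For context on the cookie options added in this commit: the parsing block above builds the tuple that yt-dlp's Python API accepts for its cookiesfrombrowser option (mirroring the CLI's --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] syntax), while cookiefile points at an exported cookies TXT file. A minimal usage sketch outside the tool class; the profile name and URL are placeholders taken from the tool description, not values used by this repository:

    import yt_dlp

    ydl_opts = {
        "format": "bestaudio/best",
        # tuple form: (browser, profile, keyring, container); unused slots stay None
        "cookiesfrombrowser": ("firefox", "profileName", None, None),
        # or, with an exported Netscape-format cookies file instead:
        # "cookiefile": "/path/to/cookies.txt",
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download(["https://youtube.com/watch?v=xyz"])  # placeholder URL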