Spaces:
Sleeping
Sleeping
Samuel Thomas
committed on
Commit
·
0f3a132
1
Parent(s):
0264a40
missing dependency
Browse files
- requirements.txt +2 -1
- tools.py +165 -257
requirements.txt
CHANGED
@@ -25,4 +25,5 @@ sentencepiece
|
|
25 |
nltk
|
26 |
SpeechRecognition
|
27 |
pandas
|
28 |
-
openai-whisper
|
|
|
|
25 |
nltk
|
26 |
SpeechRecognition
|
27 |
pandas
|
28 |
+
openai-whisper
|
29 |
+
openpyxl
|
tools.py
CHANGED
@@ -18,7 +18,7 @@ from langchain_huggingface import HuggingFacePipeline
|
|
18 |
from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set, Type
|
19 |
import time
|
20 |
from collections import Counter
|
21 |
-
from pydantic import Field, BaseModel
|
22 |
import hashlib
|
23 |
import json
|
24 |
import numpy as np
|
@@ -2151,43 +2151,45 @@ class YouTubeTranscriptExtractor(BaseTool):
|
|
2151 |
"Downloads a YouTube video and extracts the complete audio transcript using speech recognition. "
|
2152 |
"Use this tool for questions about what people say in YouTube videos. "
|
2153 |
"Input should be a dict with keys: 'youtube_url' and optional parameters. "
|
2154 |
-
"
|
|
|
|
|
|
|
|
|
|
|
2155 |
)
|
2156 |
|
2157 |
-
# Define Pydantic fields for the attributes we need to set
|
2158 |
recognizer: Any = Field(default=None, exclude=True)
|
2159 |
|
2160 |
class Config:
|
2161 |
arbitrary_types_allowed = True
|
2162 |
-
extra =
|
2163 |
|
2164 |
-
def __init__(self, **kwargs):
|
2165 |
super().__init__(**kwargs)
|
2166 |
|
2167 |
-
# Initialize directories
|
2168 |
self.cache_dir = '/tmp/youtube_transcript_cache/'
|
2169 |
self.audio_dir = '/tmp/audio/'
|
2170 |
self.chunks_dir = '/tmp/audio_chunks/'
|
2171 |
|
2172 |
-
# Initialize speech recognizer
|
2173 |
self.recognizer = sr.Recognizer()
|
2174 |
self.recognizer.energy_threshold = 4000
|
2175 |
self.recognizer.pause_threshold = 0.8
|
2176 |
|
2177 |
-
# Create directories
|
2178 |
for dir_path in [self.cache_dir, self.audio_dir, self.chunks_dir]:
|
2179 |
os.makedirs(dir_path, exist_ok=True)
|
2180 |
|
2181 |
-
def _get_config(self, key: str, default_value=None, input_data: Dict[str, Any] = None):
|
2182 |
-
"""Get configuration value with fallback to defaults"""
|
2183 |
defaults = {
|
2184 |
'language': 'en-US',
|
2185 |
-
'chunk_length_ms': 30000,
|
2186 |
-
'silence_thresh': -40,
|
2187 |
'audio_quality': 'best',
|
2188 |
'cache_enabled': True,
|
2189 |
-
'min_silence_len': 500,
|
2190 |
-
'overlap_ms': 1000,
|
|
|
|
|
2191 |
}
|
2192 |
|
2193 |
if input_data and key in input_data:
|
@@ -2195,15 +2197,12 @@ class YouTubeTranscriptExtractor(BaseTool):
|
|
2195 |
return defaults.get(key, default_value)
|
2196 |
|
2197 |
def _get_video_hash(self, url: str) -> str:
|
2198 |
-
"""Generate hash for video URL for caching"""
|
2199 |
return hashlib.md5(url.encode()).hexdigest()
|
2200 |
|
2201 |
def _get_cache_path(self, video_hash: str, cache_type: str) -> str:
|
2202 |
-
"""Get cache file path"""
|
2203 |
return os.path.join(self.cache_dir, f"{video_hash}_{cache_type}")
|
2204 |
|
2205 |
def _load_from_cache(self, cache_path: str, cache_enabled: bool = True) -> Optional[Any]:
|
2206 |
-
"""Load data from cache"""
|
2207 |
if not cache_enabled or not os.path.exists(cache_path):
|
2208 |
return None
|
2209 |
try:
|
@@ -2214,7 +2213,6 @@ class YouTubeTranscriptExtractor(BaseTool):
|
|
2214 |
return None
|
2215 |
|
2216 |
def _save_to_cache(self, cache_path: str, data: Any, cache_enabled: bool = True):
|
2217 |
-
"""Save data to cache"""
|
2218 |
if not cache_enabled:
|
2219 |
return
|
2220 |
try:
|
@@ -2224,7 +2222,6 @@ class YouTubeTranscriptExtractor(BaseTool):
|
|
2224 |
print(f"Error saving cache: {str(e)}")
|
2225 |
|
2226 |
def _clean_directory(self, directory: str):
|
2227 |
-
"""Clean directory contents"""
|
2228 |
if os.path.exists(directory):
|
2229 |
for filename in os.listdir(directory):
|
2230 |
file_path = os.path.join(directory, filename)
|
@@ -2236,29 +2233,28 @@ class YouTubeTranscriptExtractor(BaseTool):
|
|
2236 |
except Exception as e:
|
2237 |
print(f'Failed to delete {file_path}. Reason: {e}')
|
2238 |
|
2239 |
-
def download_youtube_audio(self, url: str, video_hash: str, input_data: Dict[str, Any] = None) -> Optional[str]:
|
2240 |
-
"""Download YouTube video as audio file"""
|
2241 |
audio_quality = self._get_config('audio_quality', 'best', input_data)
|
2242 |
output_filename = f'{video_hash}.wav'
|
2243 |
output_path = os.path.join(self.audio_dir, output_filename)
|
2244 |
|
2245 |
-
# Check cache
|
2246 |
cache_enabled = self._get_config('cache_enabled', True, input_data)
|
2247 |
if cache_enabled and os.path.exists(output_path):
|
2248 |
print(f"Using cached audio: {output_path}")
|
2249 |
return output_path
|
2250 |
|
2251 |
-
# Clean directory
|
2252 |
self._clean_directory(self.audio_dir)
|
2253 |
|
|
|
|
|
|
|
2254 |
try:
|
2255 |
-
|
2256 |
-
ydl_opts = {
|
2257 |
'format': 'bestaudio[ext=m4a]/bestaudio/best',
|
2258 |
'outtmpl': os.path.join(self.audio_dir, f'{video_hash}.%(ext)s'),
|
2259 |
-
'quiet': False,
|
2260 |
'no_warnings': False,
|
2261 |
-
'extract_flat': False,
|
2262 |
'writethumbnail': False,
|
2263 |
'writeinfojson': False,
|
2264 |
'postprocessors': [{
|
@@ -2266,362 +2262,274 @@ class YouTubeTranscriptExtractor(BaseTool):
|
|
2266 |
'preferredcodec': 'wav',
|
2267 |
'preferredquality': '192' if audio_quality == 'best' else '128',
|
2268 |
}],
|
2269 |
-
# Add user agent and headers to avoid blocking
|
2270 |
'http_headers': {
|
2271 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
2272 |
},
|
2273 |
-
# Add cookie handling
|
2274 |
-
'cookiefile': None,
|
2275 |
'nocheckcertificate': True,
|
2276 |
}
|
2277 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2278 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
2279 |
-
print(f"Downloading audio from: {url}")
|
2280 |
ydl.download([url])
|
2281 |
-
|
2282 |
-
# Check if the output file exists
|
2283 |
if os.path.exists(output_path):
|
2284 |
print(f"Audio downloaded successfully: {output_path}")
|
2285 |
return output_path
|
2286 |
else:
|
2287 |
-
# Look for any downloaded file with the video hash
|
2288 |
possible_files = glob.glob(os.path.join(self.audio_dir, f'{video_hash}.*'))
|
2289 |
if possible_files:
|
2290 |
-
# Convert to WAV if needed
|
2291 |
source_file = possible_files[0]
|
2292 |
if not source_file.endswith('.wav'):
|
2293 |
try:
|
2294 |
audio = AudioSegment.from_file(source_file)
|
2295 |
audio.export(output_path, format="wav")
|
2296 |
-
os.remove(source_file)
|
2297 |
print(f"Audio converted to WAV: {output_path}")
|
2298 |
return output_path
|
2299 |
except Exception as e:
|
2300 |
print(f"Error converting audio: {str(e)}")
|
2301 |
return None
|
2302 |
-
else:
|
2303 |
-
|
|
|
|
|
|
|
2304 |
|
2305 |
-
print("No audio file found after download")
|
2306 |
return None
|
2307 |
|
|
|
|
|
|
|
|
|
|
|
2308 |
except Exception as e:
|
2309 |
-
print(f"Error downloading YouTube audio: {str(e)}")
|
2310 |
-
#
|
2311 |
-
try:
|
2312 |
-
print("Trying alternative download method...")
|
2313 |
-
fallback_opts = {
|
2314 |
-
'format': 'worst[ext=mp4]',
|
2315 |
-
'outtmpl': os.path.join(self.audio_dir, f'{video_hash}_fallback.%(ext)s'),
|
2316 |
-
'quiet': False,
|
2317 |
-
}
|
2318 |
-
|
2319 |
-
with yt_dlp.YoutubeDL(fallback_opts) as ydl:
|
2320 |
-
ydl.download([url])
|
2321 |
-
|
2322 |
-
# Look for fallback file and convert
|
2323 |
-
fallback_files = glob.glob(os.path.join(self.audio_dir, f'{video_hash}_fallback.*'))
|
2324 |
-
if fallback_files:
|
2325 |
-
source_file = fallback_files[0]
|
2326 |
-
try:
|
2327 |
-
audio = AudioSegment.from_file(source_file)
|
2328 |
-
audio.export(output_path, format="wav")
|
2329 |
-
os.remove(source_file)
|
2330 |
-
print(f"Fallback audio converted: {output_path}")
|
2331 |
-
return output_path
|
2332 |
-
except Exception as conv_e:
|
2333 |
-
print(f"Error converting fallback audio: {str(conv_e)}")
|
2334 |
-
|
2335 |
-
except Exception as fallback_e:
|
2336 |
-
print(f"Fallback download also failed: {str(fallback_e)}")
|
2337 |
-
|
2338 |
return None
|
2339 |
|
2340 |
-
def _split_audio_intelligent(self, audio_path: str, input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
|
2341 |
-
"""Split audio into chunks intelligently based on silence"""
|
2342 |
self._clean_directory(self.chunks_dir)
|
2343 |
-
|
2344 |
try:
|
2345 |
-
# Load audio
|
2346 |
audio = AudioSegment.from_wav(audio_path)
|
2347 |
-
|
2348 |
-
# Get configuration
|
2349 |
chunk_length_ms = self._get_config('chunk_length_ms', 30000, input_data)
|
2350 |
silence_thresh = self._get_config('silence_thresh', -40, input_data)
|
2351 |
min_silence_len = self._get_config('min_silence_len', 500, input_data)
|
2352 |
-
overlap_ms = self._get_config('overlap_ms', 1000, input_data)
|
2353 |
|
2354 |
-
# First try to split on silence
|
2355 |
chunks = split_on_silence(
|
2356 |
audio,
|
2357 |
min_silence_len=min_silence_len,
|
2358 |
silence_thresh=silence_thresh,
|
2359 |
-
keep_silence=True
|
2360 |
)
|
2361 |
|
2362 |
-
|
2363 |
-
|
2364 |
-
|
2365 |
-
|
2366 |
-
|
2367 |
-
|
2368 |
-
|
2369 |
-
|
2370 |
|
2371 |
-
|
2372 |
-
|
2373 |
-
|
2374 |
|
2375 |
-
|
2376 |
-
|
2377 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2378 |
|
2379 |
chunk_filename = os.path.join(self.chunks_dir, f"chunk_{i:04d}.wav")
|
2380 |
-
|
2381 |
-
|
2382 |
-
|
2383 |
-
|
2384 |
-
|
2385 |
-
'filename': chunk_filename,
|
2386 |
-
'index': i,
|
2387 |
-
'start_time': current_time,
|
2388 |
-
'duration': duration,
|
2389 |
-
'end_time': current_time + duration
|
2390 |
-
}
|
2391 |
|
2392 |
-
chunk_data.append(
|
2393 |
-
|
|
|
|
|
|
|
2394 |
|
2395 |
print(f"Split audio into {len(chunk_data)} chunks")
|
2396 |
return chunk_data
|
2397 |
-
|
2398 |
except Exception as e:
|
2399 |
print(f"Error splitting audio: {str(e)}")
|
2400 |
-
# Fallback:
|
2401 |
-
try:
|
2402 |
audio = AudioSegment.from_wav(audio_path)
|
2403 |
duration = len(audio) / 1000.0
|
2404 |
-
return [{
|
2405 |
-
|
2406 |
-
|
2407 |
-
'start_time': 0,
|
2408 |
-
'duration': duration,
|
2409 |
-
'end_time': duration
|
2410 |
-
}]
|
2411 |
-
except:
|
2412 |
-
return []
|
2413 |
|
2414 |
-
def _transcribe_audio_chunk(self, chunk_info: Dict[str, Any], input_data: Dict[str, Any] = None) -> Dict[str, Any]:
|
2415 |
-
"""Transcribe a single audio chunk"""
|
2416 |
chunk_path = chunk_info['filename']
|
|
|
|
|
|
|
|
|
|
|
2417 |
try:
|
2418 |
language = self._get_config('language', 'en-US', input_data)
|
2419 |
-
|
2420 |
with sr.AudioFile(chunk_path) as source:
|
2421 |
-
|
2422 |
-
self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
|
2423 |
audio_data = self.recognizer.record(source)
|
2424 |
|
2425 |
-
# Try Google Speech Recognition
|
2426 |
try:
|
2427 |
text = self.recognizer.recognize_google(audio_data, language=language)
|
2428 |
-
return {
|
2429 |
-
'text': text,
|
2430 |
-
'confidence': 1.0,
|
2431 |
-
'start_time': chunk_info['start_time'],
|
2432 |
-
'end_time': chunk_info['end_time'],
|
2433 |
-
'duration': chunk_info['duration'],
|
2434 |
-
'index': chunk_info['index'],
|
2435 |
-
'success': True
|
2436 |
-
}
|
2437 |
-
|
2438 |
except sr.UnknownValueError:
|
2439 |
-
# Try without language
|
2440 |
-
try:
|
2441 |
text = self.recognizer.recognize_google(audio_data)
|
2442 |
-
return {
|
2443 |
-
'text': text,
|
2444 |
-
'confidence': 0.8,
|
2445 |
-
'start_time': chunk_info['start_time'],
|
2446 |
-
'end_time': chunk_info['end_time'],
|
2447 |
-
'duration': chunk_info['duration'],
|
2448 |
-
'index': chunk_info['index'],
|
2449 |
-
'success': True
|
2450 |
-
}
|
2451 |
except sr.UnknownValueError:
|
2452 |
-
return {
|
2453 |
-
'text': '[INAUDIBLE]',
|
2454 |
-
'confidence': 0.0,
|
2455 |
-
'start_time': chunk_info['start_time'],
|
2456 |
-
'end_time': chunk_info['end_time'],
|
2457 |
-
'duration': chunk_info['duration'],
|
2458 |
-
'index': chunk_info['index'],
|
2459 |
-
'success': False
|
2460 |
-
}
|
2461 |
except sr.RequestError as e:
|
2462 |
-
return {
|
2463 |
-
'text': f'[RECOGNITION_ERROR: {str(e)}]',
|
2464 |
-
'confidence': 0.0,
|
2465 |
-
'start_time': chunk_info['start_time'],
|
2466 |
-
'end_time': chunk_info['end_time'],
|
2467 |
-
'duration': chunk_info['duration'],
|
2468 |
-
'index': chunk_info['index'],
|
2469 |
-
'success': False,
|
2470 |
-
'error': str(e)
|
2471 |
-
}
|
2472 |
-
|
2473 |
except Exception as e:
|
2474 |
-
return {
|
2475 |
-
'text': f'[ERROR: {str(e)}]',
|
2476 |
-
'confidence': 0.0,
|
2477 |
-
'start_time': chunk_info.get('start_time', 0),
|
2478 |
-
'end_time': chunk_info.get('end_time', 0),
|
2479 |
-
'duration': chunk_info.get('duration', 0),
|
2480 |
-
'index': chunk_info.get('index', 0),
|
2481 |
-
'success': False,
|
2482 |
-
'error': str(e)
|
2483 |
-
}
|
2484 |
|
2485 |
-
def _transcribe_chunks_parallel(self, chunk_data: List[Dict[str, Any]], input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
|
2486 |
-
"""Transcribe audio chunks in parallel"""
|
2487 |
results = []
|
2488 |
-
|
2489 |
-
# Use fewer workers to avoid API rate limits
|
2490 |
-
max_workers = min(3, len(chunk_data))
|
2491 |
|
2492 |
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
2493 |
future_to_chunk = {
|
2494 |
executor.submit(self._transcribe_audio_chunk, chunk_info, input_data): chunk_info
|
2495 |
for chunk_info in chunk_data
|
2496 |
}
|
2497 |
-
|
2498 |
for future in as_completed(future_to_chunk):
|
2499 |
chunk_info = future_to_chunk[future]
|
2500 |
try:
|
2501 |
result = future.result()
|
2502 |
results.append(result)
|
2503 |
-
if result['success']
|
2504 |
-
|
2505 |
-
|
2506 |
-
else:
|
2507 |
-
print(f"Failed to transcribe chunk {result['index']}: {result['text']}")
|
2508 |
except Exception as e:
|
2509 |
print(f"Error processing chunk {chunk_info.get('index', '?')}: {str(e)}")
|
2510 |
results.append({
|
2511 |
-
'text': f'[PROCESSING_ERROR: {str(e)}]',
|
2512 |
-
'
|
2513 |
-
'
|
2514 |
-
'
|
2515 |
-
'duration': chunk_info.get('duration', 0),
|
2516 |
-
'index': chunk_info.get('index', 0),
|
2517 |
-
'success': False,
|
2518 |
-
'error': str(e)
|
2519 |
})
|
2520 |
-
|
2521 |
-
# Sort results by chunk index to maintain order
|
2522 |
results.sort(key=lambda x: x['index'])
|
2523 |
return results
|
2524 |
|
2525 |
-
def extract_transcript(self, audio_path: str, video_hash: str, input_data: Dict[str, Any] = None) -> Dict[str, Any]:
|
2526 |
-
"""Extract complete transcript from audio file"""
|
2527 |
cache_enabled = self._get_config('cache_enabled', True, input_data)
|
2528 |
cache_path = self._get_cache_path(video_hash, "transcript.json")
|
2529 |
|
2530 |
-
# Check cache
|
2531 |
cached_transcript = self._load_from_cache(cache_path, cache_enabled)
|
2532 |
if cached_transcript:
|
2533 |
print("Using cached transcript")
|
2534 |
return cached_transcript
|
2535 |
|
2536 |
try:
|
2537 |
-
# Step 1: Split audio into manageable chunks
|
2538 |
print("Splitting audio into chunks...")
|
2539 |
chunk_data = self._split_audio_intelligent(audio_path, input_data)
|
2540 |
-
|
2541 |
if not chunk_data:
|
2542 |
-
return {
|
2543 |
-
'error': 'Failed to split audio into chunks',
|
2544 |
-
'full_transcript': '',
|
2545 |
-
'success_rate': 0.0
|
2546 |
-
}
|
2547 |
|
2548 |
-
# Step 2: Transcribe all chunks
|
2549 |
print(f"Transcribing {len(chunk_data)} audio chunks...")
|
2550 |
transcript_results = self._transcribe_chunks_parallel(chunk_data, input_data)
|
2551 |
-
|
2552 |
-
|
2553 |
-
|
2554 |
-
|
2555 |
-
|
2556 |
-
|
2557 |
-
|
2558 |
-
|
2559 |
-
success_rate = successful_chunks / total_chunks if total_chunks > 0 else 0
|
2560 |
-
word_count = len(full_text.split()) if full_text else 0
|
2561 |
-
|
2562 |
final_result = {
|
2563 |
-
'full_transcript': full_text,
|
2564 |
-
'
|
2565 |
-
'
|
2566 |
-
'successful_chunks': successful_chunks,
|
2567 |
-
'success_rate': success_rate,
|
2568 |
-
'extraction_timestamp': time.time(),
|
2569 |
-
'extraction_date': time.strftime('%Y-%m-%d %H:%M:%S'),
|
2570 |
'detailed_results': transcript_results
|
2571 |
}
|
2572 |
-
|
2573 |
-
# Cache results
|
2574 |
self._save_to_cache(cache_path, final_result, cache_enabled)
|
2575 |
-
|
2576 |
print(f"Transcript extraction completed. Success rate: {success_rate:.1%}")
|
2577 |
return final_result
|
2578 |
-
|
2579 |
except Exception as e:
|
2580 |
print(f"Error during transcript extraction: {str(e)}")
|
2581 |
-
return {
|
2582 |
-
'error': str(e),
|
2583 |
-
'full_transcript': '',
|
2584 |
-
'success_rate': 0.0
|
2585 |
-
}
|
2586 |
|
2587 |
-
def _run(self, youtube_url: str, **kwargs) -> str:
|
2588 |
-
|
2589 |
-
|
2590 |
-
'youtube_url': youtube_url,
|
2591 |
-
**kwargs
|
2592 |
-
}
|
2593 |
-
|
2594 |
-
if not youtube_url:
|
2595 |
-
return "Error: youtube_url is required."
|
2596 |
|
2597 |
try:
|
2598 |
-
# Generate video hash for caching
|
2599 |
video_hash = self._get_video_hash(youtube_url)
|
2600 |
-
|
2601 |
-
|
2602 |
-
print(f"Downloading YouTube audio from {youtube_url}...")
|
2603 |
audio_path = self.download_youtube_audio(youtube_url, video_hash, input_data)
|
2604 |
if not audio_path or not os.path.exists(audio_path):
|
2605 |
-
return "Error: Failed to download
|
2606 |
|
2607 |
-
# Step 2: Extract transcript
|
2608 |
print("Extracting audio transcript...")
|
2609 |
transcript_result = self.extract_transcript(audio_path, video_hash, input_data)
|
2610 |
|
2611 |
-
if transcript_result.get("error"):
|
2612 |
-
return f"Error: {transcript_result['error']}"
|
2613 |
-
|
2614 |
-
main_transcript = transcript_result.get('full_transcript', '')
|
2615 |
|
2616 |
-
|
2617 |
-
|
2618 |
-
|
2619 |
-
print(f"Transcript extracted successfully. Word count: {transcript_result.get('word_count', 0)}")
|
2620 |
-
print(f"Success rate: {transcript_result.get('success_rate', 0):.1%}")
|
2621 |
|
|
|
2622 |
return "TRANSCRIPT: " + main_transcript
|
2623 |
-
|
2624 |
except Exception as e:
|
|
|
2625 |
return f"Error during transcript extraction: {str(e)}"
|
2626 |
|
2627 |
# Factory function to create the tool
|
|
|
18 |
from typing import TypedDict, List, Optional, Dict, Any, Annotated, Literal, Union, Tuple, Set, Type
|
19 |
import time
|
20 |
from collections import Counter
|
21 |
+
from pydantic import Field, BaseModel, Extra
|
22 |
import hashlib
|
23 |
import json
|
24 |
import numpy as np
|
|
|
2151 |
"Downloads a YouTube video and extracts the complete audio transcript using speech recognition. "
|
2152 |
"Use this tool for questions about what people say in YouTube videos. "
|
2153 |
"Input should be a dict with keys: 'youtube_url' and optional parameters. "
|
2154 |
+
"Optional parameters include 'language' (e.g., 'en-US'), "
|
2155 |
+
"'cookies_file_path' (path to a cookies TXT file for authentication), "
|
2156 |
+
"or 'cookies_from_browser' (string specifying browser for cookies, e.g., 'chrome', 'firefox:profileName', 'edge+keyringName:profileName::containerName'). "
|
2157 |
+
"Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US'} or "
|
2158 |
+
"{'youtube_url': '...', 'cookies_file_path': '/path/to/cookies.txt'} or "
|
2159 |
+
"{'youtube_url': '...', 'cookies_from_browser': 'chrome'}"
|
2160 |
)
|
2161 |
|
|
|
2162 |
recognizer: Any = Field(default=None, exclude=True)
|
2163 |
|
2164 |
class Config:
|
2165 |
arbitrary_types_allowed = True
|
2166 |
+
extra = Extra.allow # Adjusted if pydantic v1 style
|
2167 |
|
2168 |
+
def __init__(self, **kwargs: Any):
|
2169 |
super().__init__(**kwargs)
|
2170 |
|
|
|
2171 |
self.cache_dir = '/tmp/youtube_transcript_cache/'
|
2172 |
self.audio_dir = '/tmp/audio/'
|
2173 |
self.chunks_dir = '/tmp/audio_chunks/'
|
2174 |
|
|
|
2175 |
self.recognizer = sr.Recognizer()
|
2176 |
self.recognizer.energy_threshold = 4000
|
2177 |
self.recognizer.pause_threshold = 0.8
|
2178 |
|
|
|
2179 |
for dir_path in [self.cache_dir, self.audio_dir, self.chunks_dir]:
|
2180 |
os.makedirs(dir_path, exist_ok=True)
|
2181 |
|
2182 |
+
def _get_config(self, key: str, default_value: Any = None, input_data: Optional[Dict[str, Any]] = None) -> Any:
|
|
|
2183 |
defaults = {
|
2184 |
'language': 'en-US',
|
2185 |
+
'chunk_length_ms': 30000,
|
2186 |
+
'silence_thresh': -40,
|
2187 |
'audio_quality': 'best',
|
2188 |
'cache_enabled': True,
|
2189 |
+
'min_silence_len': 500,
|
2190 |
+
'overlap_ms': 1000,
|
2191 |
+
'cookies_file_path': None, # New: Path to a cookies file
|
2192 |
+
'cookies_from_browser': None # New: Browser string e.g., "chrome", "firefox:profile_name"
|
2193 |
}
|
2194 |
|
2195 |
if input_data and key in input_data:
|
|
|
2197 |
return defaults.get(key, default_value)
|
2198 |
|
2199 |
def _get_video_hash(self, url: str) -> str:
|
|
|
2200 |
return hashlib.md5(url.encode()).hexdigest()
|
2201 |
|
2202 |
def _get_cache_path(self, video_hash: str, cache_type: str) -> str:
|
|
|
2203 |
return os.path.join(self.cache_dir, f"{video_hash}_{cache_type}")
|
2204 |
|
2205 |
def _load_from_cache(self, cache_path: str, cache_enabled: bool = True) -> Optional[Any]:
|
|
|
2206 |
if not cache_enabled or not os.path.exists(cache_path):
|
2207 |
return None
|
2208 |
try:
|
|
|
2213 |
return None
|
2214 |
|
2215 |
def _save_to_cache(self, cache_path: str, data: Any, cache_enabled: bool = True):
|
|
|
2216 |
if not cache_enabled:
|
2217 |
return
|
2218 |
try:
|
|
|
2222 |
print(f"Error saving cache: {str(e)}")
|
2223 |
|
2224 |
def _clean_directory(self, directory: str):
|
|
|
2225 |
if os.path.exists(directory):
|
2226 |
for filename in os.listdir(directory):
|
2227 |
file_path = os.path.join(directory, filename)
|
|
|
2233 |
except Exception as e:
|
2234 |
print(f'Failed to delete {file_path}. Reason: {e}')
|
2235 |
|
2236 |
+
def download_youtube_audio(self, url: str, video_hash: str, input_data: Optional[Dict[str, Any]] = None) -> Optional[str]:
|
|
|
2237 |
audio_quality = self._get_config('audio_quality', 'best', input_data)
|
2238 |
output_filename = f'{video_hash}.wav'
|
2239 |
output_path = os.path.join(self.audio_dir, output_filename)
|
2240 |
|
|
|
2241 |
cache_enabled = self._get_config('cache_enabled', True, input_data)
|
2242 |
if cache_enabled and os.path.exists(output_path):
|
2243 |
print(f"Using cached audio: {output_path}")
|
2244 |
return output_path
|
2245 |
|
|
|
2246 |
self._clean_directory(self.audio_dir)
|
2247 |
|
2248 |
+
cookies_file_path = self._get_config('cookies_file_path', None, input_data)
|
2249 |
+
cookies_from_browser_str = self._get_config('cookies_from_browser', None, input_data)
|
2250 |
+
|
2251 |
try:
|
2252 |
+
ydl_opts: Dict[str, Any] = {
|
|
|
2253 |
'format': 'bestaudio[ext=m4a]/bestaudio/best',
|
2254 |
'outtmpl': os.path.join(self.audio_dir, f'{video_hash}.%(ext)s'),
|
2255 |
+
'quiet': False,
|
2256 |
'no_warnings': False,
|
2257 |
+
'extract_flat': False, # Ensure this is false for actual downloads
|
2258 |
'writethumbnail': False,
|
2259 |
'writeinfojson': False,
|
2260 |
'postprocessors': [{
|
|
|
2262 |
'preferredcodec': 'wav',
|
2263 |
'preferredquality': '192' if audio_quality == 'best' else '128',
|
2264 |
}],
|
|
|
2265 |
'http_headers': {
|
2266 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
2267 |
},
|
|
|
|
|
2268 |
'nocheckcertificate': True,
|
2269 |
}
|
2270 |
|
2271 |
+
if cookies_file_path:
|
2272 |
+
ydl_opts['cookiefile'] = cookies_file_path
|
2273 |
+
print(f"Using cookies from file: {cookies_file_path}")
|
2274 |
+
elif cookies_from_browser_str:
|
2275 |
+
parsed_browser, parsed_profile, parsed_keyring, parsed_container = None, None, None, None
|
2276 |
+
|
2277 |
+
temp_str = cookies_from_browser_str
|
2278 |
+
|
2279 |
+
if '::' in temp_str:
|
2280 |
+
main_part_before_container, parsed_container_val = temp_str.split('::', 1)
|
2281 |
+
parsed_container = parsed_container_val if parsed_container_val else None
|
2282 |
+
temp_str = main_part_before_container
|
2283 |
+
|
2284 |
+
if ':' in temp_str:
|
2285 |
+
browser_keyring_part, parsed_profile_val = temp_str.split(':', 1)
|
2286 |
+
parsed_profile = parsed_profile_val if parsed_profile_val else None
|
2287 |
+
temp_str = browser_keyring_part
|
2288 |
+
|
2289 |
+
if '+' in temp_str:
|
2290 |
+
parsed_browser_val, parsed_keyring_val = temp_str.split('+', 1)
|
2291 |
+
parsed_browser = parsed_browser_val
|
2292 |
+
parsed_keyring = parsed_keyring_val if parsed_keyring_val else None
|
2293 |
+
else:
|
2294 |
+
parsed_browser = temp_str
|
2295 |
+
|
2296 |
+
if parsed_browser:
|
2297 |
+
# yt-dlp expects cookiesfrombrowser as a tuple: (BROWSER, PROFILE, KEYRING, CONTAINER)
|
2298 |
+
final_tuple: Tuple[Optional[str], ...] = (
|
2299 |
+
parsed_browser,
|
2300 |
+
parsed_profile,
|
2301 |
+
parsed_keyring,
|
2302 |
+
parsed_container
|
2303 |
+
)
|
2304 |
+
ydl_opts['cookiesfrombrowser'] = final_tuple
|
2305 |
+
print(f"Attempting to use cookies from browser spec '{cookies_from_browser_str}', parsed as: {final_tuple}")
|
2306 |
+
else:
|
2307 |
+
print(f"Invalid or empty browser name in cookies_from_browser string: '{cookies_from_browser_str}'")
|
2308 |
+
|
2309 |
+
|
2310 |
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
2311 |
+
print(f"Downloading audio from: {url} with options: {ydl_opts}")
|
2312 |
ydl.download([url])
|
2313 |
+
|
|
|
2314 |
if os.path.exists(output_path):
|
2315 |
print(f"Audio downloaded successfully: {output_path}")
|
2316 |
return output_path
|
2317 |
else:
|
|
|
2318 |
possible_files = glob.glob(os.path.join(self.audio_dir, f'{video_hash}.*'))
|
2319 |
if possible_files:
|
|
|
2320 |
source_file = possible_files[0]
|
2321 |
if not source_file.endswith('.wav'):
|
2322 |
try:
|
2323 |
audio = AudioSegment.from_file(source_file)
|
2324 |
audio.export(output_path, format="wav")
|
2325 |
+
os.remove(source_file)
|
2326 |
print(f"Audio converted to WAV: {output_path}")
|
2327 |
return output_path
|
2328 |
except Exception as e:
|
2329 |
print(f"Error converting audio: {str(e)}")
|
2330 |
return None
|
2331 |
+
else: # Already a .wav, possibly due to postprocessor already creating it with a different ext pattern
|
2332 |
+
if source_file != output_path: # if names differ due to original extension
|
2333 |
+
shutil.move(source_file, output_path)
|
2334 |
+
print(f"Audio file found: {output_path}")
|
2335 |
+
return output_path
|
2336 |
|
2337 |
+
print(f"No audio file found at expected path after download: {output_path}")
|
2338 |
return None
|
2339 |
|
2340 |
+
except yt_dlp.utils.DownloadError as de:
|
2341 |
+
print(f"yt-dlp DownloadError: {str(de)}")
|
2342 |
+
if "Sign in to confirm you're not a bot" in str(de) and not (cookies_file_path or cookies_from_browser_str):
|
2343 |
+
print("Authentication required. Consider using 'cookies_file_path' or 'cookies_from_browser' options.")
|
2344 |
+
return None
|
2345 |
except Exception as e:
|
2346 |
+
print(f"Error downloading YouTube audio: {type(e).__name__} - {str(e)}")
|
2347 |
+
# Fallback attempt is removed as it's unlikely to succeed if the primary authenticated attempt fails due to bot detection
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2348 |
return None
|
2349 |
|
2350 |
+
def _split_audio_intelligent(self, audio_path: str, input_data: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
|
|
|
2351 |
self._clean_directory(self.chunks_dir)
|
|
|
2352 |
try:
|
|
|
2353 |
audio = AudioSegment.from_wav(audio_path)
|
|
|
|
|
2354 |
chunk_length_ms = self._get_config('chunk_length_ms', 30000, input_data)
|
2355 |
silence_thresh = self._get_config('silence_thresh', -40, input_data)
|
2356 |
min_silence_len = self._get_config('min_silence_len', 500, input_data)
|
2357 |
+
overlap_ms = self._get_config('overlap_ms', 1000, input_data) # Not used in current split_on_silence
|
2358 |
|
|
|
2359 |
chunks = split_on_silence(
|
2360 |
audio,
|
2361 |
min_silence_len=min_silence_len,
|
2362 |
silence_thresh=silence_thresh,
|
2363 |
+
keep_silence=True
|
2364 |
)
|
2365 |
|
2366 |
+
processed_chunks: List[AudioSegment] = [] # type: ignore
|
2367 |
+
# Combine small chunks or re-chunk if silence splitting is ineffective
|
2368 |
+
temp_chunk: Optional[AudioSegment] = None # type: ignore
|
2369 |
+
for chunk in chunks:
|
2370 |
+
if temp_chunk is None:
|
2371 |
+
temp_chunk = chunk
|
2372 |
+
else:
|
2373 |
+
temp_chunk += chunk
|
2374 |
|
2375 |
+
if len(temp_chunk) > chunk_length_ms / 2 or chunk == chunks[-1]: # Arbitrary threshold to combine small chunks
|
2376 |
+
processed_chunks.append(temp_chunk)
|
2377 |
+
temp_chunk = None
|
2378 |
|
2379 |
+
if not processed_chunks or any(len(p_chunk) > chunk_length_ms * 1.5 for p_chunk in processed_chunks): # If still problematic
|
2380 |
+
print("Using time-based splitting due to ineffective silence splitting or overly large chunks...")
|
2381 |
+
processed_chunks = []
|
2382 |
+
for i in range(0, len(audio), chunk_length_ms - overlap_ms):
|
2383 |
+
chunk_segment = audio[i:i + chunk_length_ms]
|
2384 |
+
if len(chunk_segment) > 1000:
|
2385 |
+
processed_chunks.append(chunk_segment)
|
2386 |
+
|
2387 |
+
chunk_data = []
|
2388 |
+
current_time_ms = 0
|
2389 |
+
for i, chunk_segment in enumerate(processed_chunks):
|
2390 |
+
if len(chunk_segment) < 1000: continue
|
2391 |
|
2392 |
chunk_filename = os.path.join(self.chunks_dir, f"chunk_{i:04d}.wav")
|
2393 |
+
chunk_segment.export(chunk_filename, format="wav")
|
2394 |
+
|
2395 |
+
duration_s = len(chunk_segment) / 1000.0
|
2396 |
+
start_time_s = current_time_ms / 1000.0
|
2397 |
+
end_time_s = start_time_s + duration_s
|
|
|
|
|
|
|
|
|
|
|
|
|
2398 |
|
2399 |
+
chunk_data.append({
|
2400 |
+
'filename': chunk_filename, 'index': i,
|
2401 |
+
'start_time': start_time_s, 'duration': duration_s, 'end_time': end_time_s
|
2402 |
+
})
|
2403 |
+
current_time_ms += len(chunk_segment) # Approximation, true timestamping is harder
|
2404 |
|
2405 |
print(f"Split audio into {len(chunk_data)} chunks")
|
2406 |
return chunk_data
|
|
|
2407 |
except Exception as e:
|
2408 |
print(f"Error splitting audio: {str(e)}")
|
2409 |
+
try: # Fallback: single chunk
|
|
|
2410 |
audio = AudioSegment.from_wav(audio_path)
|
2411 |
duration = len(audio) / 1000.0
|
2412 |
+
return [{'filename': audio_path, 'index': 0, 'start_time': 0, 'duration': duration, 'end_time': duration}]
|
2413 |
+
except: return []
|
2414 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
2415 |
|
2416 |
+
def _transcribe_audio_chunk(self, chunk_info: Dict[str, Any], input_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
|
|
2417 |
chunk_path = chunk_info['filename']
|
2418 |
+
base_result = {
|
2419 |
+
'start_time': chunk_info.get('start_time', 0), 'end_time': chunk_info.get('end_time', 0),
|
2420 |
+
'duration': chunk_info.get('duration', 0), 'index': chunk_info.get('index', -1),
|
2421 |
+
'success': False, 'confidence': 0.0
|
2422 |
+
}
|
2423 |
try:
|
2424 |
language = self._get_config('language', 'en-US', input_data)
|
|
|
2425 |
with sr.AudioFile(chunk_path) as source:
|
2426 |
+
self.recognizer.adjust_for_ambient_noise(source, duration=0.2) # Shorter adjustment
|
|
|
2427 |
audio_data = self.recognizer.record(source)
|
2428 |
|
|
|
2429 |
try:
|
2430 |
text = self.recognizer.recognize_google(audio_data, language=language)
|
2431 |
+
return {**base_result, 'text': text, 'confidence': 1.0, 'success': True}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2432 |
except sr.UnknownValueError:
|
2433 |
+
try: # Try without specific language
|
|
|
2434 |
text = self.recognizer.recognize_google(audio_data)
|
2435 |
+
return {**base_result, 'text': text, 'confidence': 0.8, 'success': True} # Lower confidence
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2436 |
except sr.UnknownValueError:
|
2437 |
+
return {**base_result, 'text': '[INAUDIBLE]'}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2438 |
except sr.RequestError as e:
|
2439 |
+
return {**base_result, 'text': f'[RECOGNITION_ERROR: {str(e)}]', 'error': str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2440 |
except Exception as e:
|
2441 |
+
return {**base_result, 'text': f'[ERROR: {str(e)}]', 'error': str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2442 |
|
2443 |
+
def _transcribe_chunks_parallel(self, chunk_data: List[Dict[str, Any]], input_data: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
|
|
|
2444 |
results = []
|
2445 |
+
max_workers = min(os.cpu_count() or 1, 4) # Limit workers
|
|
|
|
|
2446 |
|
2447 |
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
2448 |
future_to_chunk = {
|
2449 |
executor.submit(self._transcribe_audio_chunk, chunk_info, input_data): chunk_info
|
2450 |
for chunk_info in chunk_data
|
2451 |
}
|
|
|
2452 |
for future in as_completed(future_to_chunk):
|
2453 |
chunk_info = future_to_chunk[future]
|
2454 |
try:
|
2455 |
result = future.result()
|
2456 |
results.append(result)
|
2457 |
+
status = "Transcribed" if result['success'] else "Failed"
|
2458 |
+
preview = result['text'][:50] + "..." if len(result['text']) > 50 else result['text']
|
2459 |
+
print(f"{status} chunk {result['index']}: {preview}")
|
|
|
|
|
2460 |
except Exception as e:
|
2461 |
print(f"Error processing chunk {chunk_info.get('index', '?')}: {str(e)}")
|
2462 |
results.append({
|
2463 |
+
'text': f'[PROCESSING_ERROR: {str(e)}]', 'confidence': 0.0,
|
2464 |
+
'start_time': chunk_info.get('start_time', 0), 'end_time': chunk_info.get('end_time', 0),
|
2465 |
+
'duration': chunk_info.get('duration', 0), 'index': chunk_info.get('index', 0),
|
2466 |
+
'success': False, 'error': str(e)
|
|
|
|
|
|
|
|
|
2467 |
})
|
|
|
|
|
2468 |
results.sort(key=lambda x: x['index'])
|
2469 |
return results
|
2470 |
|
2471 |
+
def extract_transcript(self, audio_path: str, video_hash: str, input_data: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """Build the full transcript for an audio file, using the cache if possible.

    The audio is split into chunks, the chunks are transcribed in parallel
    and the text of the successful chunks is joined in chunk order.

    Args:
        audio_path: Path of the audio file to transcribe.
        video_hash: Identifier used to derive the cache file path.
        input_data: Optional per-call settings (e.g. ``'cache_enabled'``)
            forwarded to the helper methods.

    Returns:
        On success, a dict with ``'full_transcript'``, ``'word_count'``,
        ``'total_chunks'``, ``'successful_chunks'``, ``'success_rate'``,
        ``'extraction_timestamp'``, ``'extraction_date'`` and
        ``'detailed_results'``. On failure, a dict with ``'error'``, an
        empty ``'full_transcript'`` and a ``'success_rate'`` of ``0.0``.
    """
    cache_enabled = self._get_config('cache_enabled', True, input_data)
    cache_path = self._get_cache_path(video_hash, "transcript.json")

    cached_transcript = self._load_from_cache(cache_path, cache_enabled)
    if cached_transcript:
        print("Using cached transcript")
        return cached_transcript

    try:
        print("Splitting audio into chunks...")
        chunk_data = self._split_audio_intelligent(audio_path, input_data)
        if not chunk_data:
            return {'error': 'Failed to split audio', 'full_transcript': '', 'success_rate': 0.0}

        print(f"Transcribing {len(chunk_data)} audio chunks...")
        transcript_results = self._transcribe_chunks_parallel(chunk_data, input_data)

        successful_chunks = [r for r in transcript_results if r['success']]
        # Placeholder markers are filtered out so they never leak into the text.
        full_text = ' '.join(
            r['text'] for r in successful_chunks
            if r['text'] and '[INAUDIBLE]' not in r['text'] and 'ERROR' not in r['text']
        ).strip()

        total_c = len(transcript_results)
        successful_c = len(successful_chunks)
        success_rate = successful_c / total_c if total_c > 0 else 0.0

        final_result = {
            'full_transcript': full_text,
            'word_count': len(full_text.split()),
            'total_chunks': total_c,
            'successful_chunks': successful_c,
            'success_rate': success_rate,
            'extraction_timestamp': time.time(),
            'extraction_date': time.strftime('%Y-%m-%d %H:%M:%S'),
            'detailed_results': transcript_results,
        }

        # Only cache a usable transcript. Caching an empty one (e.g. after a
        # transient recognition-service outage) would be served on every
        # later call and make the failure permanent.
        if full_text:
            self._save_to_cache(cache_path, final_result, cache_enabled)

        print(f"Transcript extraction completed. Success rate: {success_rate:.1%}")
        return final_result
    except Exception as e:
        # Best-effort boundary: report the failure as data for the caller.
        print(f"Error during transcript extraction: {str(e)}")
        return {'error': str(e), 'full_transcript': '', 'success_rate': 0.0}
|
|
|
|
|
|
|
|
|
2508 |
|
2509 |
+
def _run(self, youtube_url: str, **kwargs: Any) -> str:
|
2510 |
+
input_data = {'youtube_url': youtube_url, **kwargs}
|
2511 |
+
if not youtube_url: return "Error: youtube_url is required."
|
|
|
|
|
|
|
|
|
|
|
|
|
2512 |
|
2513 |
try:
|
|
|
2514 |
video_hash = self._get_video_hash(youtube_url)
|
2515 |
+
print(f"Processing YouTube URL: {youtube_url} (Hash: {video_hash})")
|
2516 |
+
|
|
|
2517 |
audio_path = self.download_youtube_audio(youtube_url, video_hash, input_data)
|
2518 |
if not audio_path or not os.path.exists(audio_path):
|
2519 |
+
return "Error: Failed to download YouTube audio. Check URL or authentication (cookies)."
|
2520 |
|
|
|
2521 |
print("Extracting audio transcript...")
|
2522 |
transcript_result = self.extract_transcript(audio_path, video_hash, input_data)
|
2523 |
|
2524 |
+
if transcript_result.get("error"): return f"Error: {transcript_result['error']}"
|
|
|
|
|
|
|
2525 |
|
2526 |
+
main_transcript = transcript_result.get('full_transcript', '')
|
2527 |
+
if not main_transcript: return "Error: No transcript could be extracted."
|
|
|
|
|
|
|
2528 |
|
2529 |
+
print(f"Transcript extracted. Word count: {transcript_result.get('word_count',0)}. Success: {transcript_result.get('success_rate',0):.1%}")
|
2530 |
return "TRANSCRIPT: " + main_transcript
|
|
|
2531 |
except Exception as e:
|
2532 |
+
print(f"Unhandled error in _run: {str(e)}") # For debugging
|
2533 |
return f"Error during transcript extraction: {str(e)}"
|
2534 |
|
2535 |
# Factory function to create the tool
|