Samuel Thomas committed · Commit 0264a40 · 1 Parent(s): 7ab4cd0
youtube transcript correction

tools.py CHANGED
@@ -2124,42 +2124,58 @@ def create_enhanced_youtube_qa_tool(**kwargs):
     """Factory function to create the enhanced tool with custom parameters"""
     return EnhancedYoutubeScreenshotQA(**kwargs)

+import os
+import json
+import hashlib
+import time
+import shutil
+import glob
+from typing import Dict, Any, List, Optional
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import yt_dlp
+import speech_recognition as sr
+from pydantic import Field
+from pydantic.v1 import BaseModel
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+
+
+class BaseTool(BaseModel):
+    name: str
+    description: str
+

 class YouTubeTranscriptExtractor(BaseTool):
     name: str = "youtube_transcript_extractor"
     description: str = (
-        "Downloads a YouTube video and extracts the complete audio transcript using speech recognition"
-        "Use this tool for questions like 'what does jim say in response to a question in this video',"
+        "Downloads a YouTube video and extracts the complete audio transcript using speech recognition. "
+        "Use this tool for questions about what people say in YouTube videos. "
         "Input should be a dict with keys: 'youtube_url' and optional parameters. "
-        #"'silence_thresh' (default: -40), 'use_enhanced_model' (default: True), 'audio_quality' (default: 'best'), "
-        #"'enable_speaker_id' (default: True), 'max_speakers' (default: 5), 'speaker_min_duration' (default: 2.0). "
-        "Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US', 'enable_speaker_id': True}"
+        "Example: {'youtube_url': 'https://youtube.com/watch?v=xyz', 'language': 'en-US'}"
     )

     # Define Pydantic fields for the attributes we need to set
     recognizer: Any = Field(default=None, exclude=True)

     class Config:
-        # Allow arbitrary types
         arbitrary_types_allowed = True
-        # Allow extra fields to be set
         extra = "allow"

     def __init__(self, **kwargs):
         super().__init__(**kwargs)

         # Initialize directories
-        cache_dir = '/tmp/youtube_transcript_cache/'
-        audio_dir = '/tmp/audio/'
-        chunks_dir = '/tmp/audio_chunks/'
+        self.cache_dir = '/tmp/youtube_transcript_cache/'
+        self.audio_dir = '/tmp/audio/'
+        self.chunks_dir = '/tmp/audio_chunks/'

         # Initialize speech recognizer
         self.recognizer = sr.Recognizer()
+        self.recognizer.energy_threshold = 4000
+        self.recognizer.pause_threshold = 0.8

         # Create directories
-        for dir_path in [cache_dir, audio_dir, chunks_dir]:
+        for dir_path in [self.cache_dir, self.audio_dir, self.chunks_dir]:
             os.makedirs(dir_path, exist_ok=True)

     def _get_config(self, key: str, default_value=None, input_data: Dict[str, Any] = None):
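Note on the Config block kept above: the commit now imports BaseModel from pydantic.v1 and defines a local BaseTool shim, so the v1-style Config still applies. A minimal sketch of what the two settings buy (the Demo class is hypothetical, not part of the commit):

from pydantic.v1 import BaseModel

class Demo(BaseModel):
    name: str

    class Config:
        arbitrary_types_allowed = True  # fields may hold types pydantic cannot validate (e.g. sr.Recognizer)
        extra = "allow"                 # attributes beyond declared fields may be set, as __init__ does above

d = Demo(name="x")
d.recognizer = object()  # rejected by pydantic v1 without extra = "allow"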
@@ -2168,19 +2184,10 @@ class YouTubeTranscriptExtractor(BaseTool):
             'language': 'en-US',
             'chunk_length_ms': 30000, # 30 seconds
             'silence_thresh': -40, # dB
-            'use_enhanced_model': True,
             'audio_quality': 'best',
             'cache_enabled': True,
-            'parallel_processing': True,
-            'overlap_ms': 1000, # 1 second overlap between chunks
             'min_silence_len': 500, # minimum silence length to split on
-            'energy_threshold': 4000, # recognizer energy threshold
-            'pause_threshold': 0.8, # recognizer pause threshold
-            'enable_speaker_id': True, # enable speaker identification
-            'max_speakers': 5, # maximum number of speakers to identify
-            'speaker_min_duration': 2.0, # minimum duration (seconds) for speaker segment
-            'speaker_confidence_threshold': 0.6, # confidence threshold for speaker assignment
-            'voice_activity_threshold': 0.01 # threshold for voice activity detection
+            'overlap_ms': 1000, # 1 second overlap between chunks
         }

         if input_data and key in input_data:
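For clarity, the lookup order these defaults feed into: a key present in the per-call input_data dict wins, otherwise the class default applies (and presumably default_value as a last resort; the tail of the method falls outside this hunk). Illustrative calls:

tool = YouTubeTranscriptExtractor()
tool._get_config('silence_thresh')                                       # -> -40, the default above
tool._get_config('silence_thresh', input_data={'silence_thresh': -35})   # -> -35, caller override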
@@ -2193,8 +2200,7 @@ class YouTubeTranscriptExtractor(BaseTool):

     def _get_cache_path(self, video_hash: str, cache_type: str) -> str:
         """Get cache file path"""
-        cache_dir = '/tmp/youtube_transcript_cache/'
-        return os.path.join(cache_dir, f"{video_hash}_{cache_type}")
+        return os.path.join(self.cache_dir, f"{video_hash}_{cache_type}")

     def _load_from_cache(self, cache_path: str, cache_enabled: bool = True) -> Optional[Any]:
         """Load data from cache"""
@@ -2217,12 +2223,24 @@ class YouTubeTranscriptExtractor(BaseTool):
         except Exception as e:
             print(f"Error saving cache: {str(e)}")

+    def _clean_directory(self, directory: str):
+        """Clean directory contents"""
+        if os.path.exists(directory):
+            for filename in os.listdir(directory):
+                file_path = os.path.join(directory, filename)
+                try:
+                    if os.path.isfile(file_path) or os.path.islink(file_path):
+                        os.unlink(file_path)
+                    elif os.path.isdir(file_path):
+                        shutil.rmtree(file_path)
+                except Exception as e:
+                    print(f'Failed to delete {file_path}. Reason: {e}')
+
     def download_youtube_audio(self, url: str, video_hash: str, input_data: Dict[str, Any] = None) -> Optional[str]:
         """Download YouTube video as audio file"""
-        audio_dir = '/tmp/audio/'
         audio_quality = self._get_config('audio_quality', 'best', input_data)
         output_filename = f'{video_hash}.wav'
-        output_path = os.path.join(audio_dir, output_filename)
+        output_path = os.path.join(self.audio_dir, output_filename)

         # Check cache
         cache_enabled = self._get_config('cache_enabled', True, input_data)
@@ -2231,147 +2249,97 @@ class YouTubeTranscriptExtractor(BaseTool):
             return output_path

         # Clean directory
-        self._clean_directory(audio_dir)
+        self._clean_directory(self.audio_dir)

         try:
-            temp_video_path = os.path.join(audio_dir, f'{video_hash}_temp.%(ext)s')
-
+            # Updated yt-dlp configuration for better compatibility
             ydl_opts = {
-                'format': 'bestaudio/best',
-                'outtmpl': temp_video_path,
-                'quiet': True,
+                'format': 'bestaudio[ext=m4a]/bestaudio/best',
+                'outtmpl': os.path.join(self.audio_dir, f'{video_hash}.%(ext)s'),
+                'quiet': False, # Set to False for debugging
+                'no_warnings': False,
+                'extract_flat': False,
+                'writethumbnail': False,
+                'writeinfojson': False,
+                'postprocessors': [{
+                    'key': 'FFmpegExtractAudio',
+                    'preferredcodec': 'wav',
+                    'preferredquality': '192' if audio_quality == 'best' else '128',
+                }],
+                # Add user agent and headers to avoid blocking
+                'http_headers': {
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+                },
+                # Add cookie handling
+                'cookiefile': None,
+                'nocheckcertificate': True,
             }

             with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                print(f"Downloading audio from: {url}")
                 ydl.download([url])

-            temp_files = glob.glob(os.path.join(audio_dir, f'{video_hash}_temp.*'))
-            if not temp_files:
-                print("No temporary audio file found")
-                return None
-
-            temp_file = temp_files[0]
-
-            # Convert to WAV if not already
-            if not temp_file.endswith('.wav'):
-                try:
-                    audio = AudioSegment.from_file(temp_file)
-                    audio.export(output_path, format="wav")
-                    os.remove(temp_file) # Clean up temp file
-                except Exception as e:
-                    print(f"Error converting audio: {str(e)}")
-                    # Try to rename if it's already the right format
-                    if os.path.exists(temp_file):
-                        os.rename(temp_file, output_path)
-            else:
-                os.rename(temp_file, output_path)
-
+            # Check if the output file exists
             if os.path.exists(output_path):
+                print(f"Audio downloaded successfully: {output_path}")
                 return output_path
             else:
+                # Look for any downloaded file with the video hash
+                possible_files = glob.glob(os.path.join(self.audio_dir, f'{video_hash}.*'))
+                if possible_files:
+                    # Convert to WAV if needed
+                    source_file = possible_files[0]
+                    if not source_file.endswith('.wav'):
+                        try:
+                            audio = AudioSegment.from_file(source_file)
+                            audio.export(output_path, format="wav")
+                            os.remove(source_file) # Clean up original
+                            print(f"Audio converted to WAV: {output_path}")
+                            return output_path
+                        except Exception as e:
+                            print(f"Error converting audio: {str(e)}")
+                            return None
+                    else:
+                        return source_file
+
+                print("No audio file found after download")
                 return None

         except Exception as e:
             print(f"Error downloading YouTube audio: {str(e)}")
+            # Try alternative format as fallback
+            try:
+                print("Trying alternative download method...")
+                fallback_opts = {
+                    'format': 'worst[ext=mp4]',
+                    'outtmpl': os.path.join(self.audio_dir, f'{video_hash}_fallback.%(ext)s'),
+                    'quiet': False,
+                }
+
+                with yt_dlp.YoutubeDL(fallback_opts) as ydl:
+                    ydl.download([url])
+
+                # Look for fallback file and convert
+                fallback_files = glob.glob(os.path.join(self.audio_dir, f'{video_hash}_fallback.*'))
+                if fallback_files:
+                    source_file = fallback_files[0]
+                    try:
+                        audio = AudioSegment.from_file(source_file)
+                        audio.export(output_path, format="wav")
+                        os.remove(source_file)
+                        print(f"Fallback audio converted: {output_path}")
+                        return output_path
+                    except Exception as conv_e:
+                        print(f"Error converting fallback audio: {str(conv_e)}")
+
+            except Exception as fallback_e:
+                print(f"Fallback download also failed: {str(fallback_e)}")
+
             return None

-    def _clean_directory(self, directory: str):
-        """Clean directory contents"""
-        if os.path.exists(directory):
-            for filename in os.listdir(directory):
-                file_path = os.path.join(directory, filename)
-                try:
-                    if os.path.isfile(file_path) or os.path.islink(file_path):
-                        os.unlink(file_path)
-                    elif os.path.isdir(file_path):
-                        shutil.rmtree(file_path)
-                except Exception as e:
-                    print(f'Failed to delete {file_path}. Reason: {e}')
-
-    def _extract_voice_features(self, audio_path: str) -> Optional[np.ndarray]:
-        """Extract voice features for speaker identification using librosa"""
-        try:
-            # Load audio with librosa
-            y, sr = librosa.load(audio_path, sr=None)
-
-            # Extract MFCC features (commonly used for speaker identification)
-            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
-
-            # Extract additional features
-            spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)
-            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
-            zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
-
-            # Combine features and take mean across time
-            features = np.concatenate([
-                np.mean(mfccs, axis=1),
-                np.mean(spectral_centroids),
-                np.mean(spectral_rolloff),
-                np.mean(zero_crossing_rate)
-            ])
-
-            return features
-
-        except Exception as e:
-            print(f"Error extracting voice features from {audio_path}: {str(e)}")
-            return None
-
-    def _detect_voice_activity(self, audio_path: str, input_data: Dict[str, Any] = None) -> List[Tuple[float, float]]:
-        """Detect voice activity in audio chunk"""
-        try:
-            y, sr = librosa.load(audio_path, sr=None)
-
-            # Simple voice activity detection based on energy
-            frame_length = int(0.025 * sr) # 25ms frames
-            hop_length = int(0.010 * sr) # 10ms hop
-
-            # Calculate short-time energy
-            energy = []
-            for i in range(0, len(y) - frame_length, hop_length):
-                frame = y[i:i + frame_length]
-                energy.append(np.sum(frame ** 2))
-
-            energy = np.array(energy)
-            threshold = self._get_config('voice_activity_threshold', 0.01, input_data)
-
-            # Find voice segments
-            voice_frames = energy > (np.max(energy) * threshold)
-
-            # Convert frame indices to time segments
-            voice_segments = []
-            in_voice = False
-            start_time = 0
-
-            for i, is_voice in enumerate(voice_frames):
-                time_sec = i * hop_length / sr
-                if is_voice and not in_voice:
-                    start_time = time_sec
-                    in_voice = True
-                elif not is_voice and in_voice:
-                    voice_segments.append((start_time, time_sec))
-                    in_voice = False
-
-            # Close last segment if needed
-            if in_voice:
-                voice_segments.append((start_time, len(y) / sr))
-
-            return voice_segments
-
-        except Exception as e:
-            print(f"Error in voice activity detection: {str(e)}")
-            return [(0, librosa.get_duration(filename=audio_path))]
-
     def _split_audio_intelligent(self, audio_path: str, input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
         """Split audio into chunks intelligently based on silence"""
-        chunks_dir = '/tmp/audio_chunks/'
-        self._clean_directory(chunks_dir)
+        self._clean_directory(self.chunks_dir)

         try:
             # Load audio
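The core of this hunk: the FFmpegExtractAudio postprocessor now converts the download to WAV inside yt-dlp, replacing the old manual temp-file conversion. A minimal standalone sketch of that path (placeholder URL and output directory; assumes yt-dlp and ffmpeg are installed):

import yt_dlp

opts = {
    'format': 'bestaudio[ext=m4a]/bestaudio/best',
    'outtmpl': '/tmp/audio/%(id)s.%(ext)s',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',  # ffmpeg runs after the download...
        'preferredcodec': 'wav',      # ...so the final artifact is already a .wav file
    }],
}
with yt_dlp.YoutubeDL(opts) as ydl:
    ydl.download(['https://youtube.com/watch?v=xyz'])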
@@ -2402,40 +2370,46 @@ class YouTubeTranscriptExtractor(BaseTool):

             # Save chunks and create metadata
             chunk_data = []
+            current_time = 0
+
             for i, chunk in enumerate(chunks):
                 if len(chunk) < 1000: # Skip very short chunks
                     continue

-                chunk_filename = os.path.join(chunks_dir, f"chunk_{i:04d}.wav")
+                chunk_filename = os.path.join(self.chunks_dir, f"chunk_{i:04d}.wav")
                 chunk.export(chunk_filename, format="wav")

-                # Calculate timing information
-                start_time = sum(len(chunks[j]) for j in range(i)) / 1000.0 # in seconds
                 duration = len(chunk) / 1000.0 # in seconds

                 chunk_info = {
                     'filename': chunk_filename,
                     'index': i,
-                    'start_time': start_time,
+                    'start_time': current_time,
                     'duration': duration,
-                    'end_time': start_time + duration
+                    'end_time': current_time + duration
                 }

                 chunk_data.append(chunk_info)
+                current_time += duration

             print(f"Split audio into {len(chunk_data)} chunks")
             return chunk_data

         except Exception as e:
             print(f"Error splitting audio: {str(e)}")
-            # Fallback: return original file
+            # Fallback: return original file as single chunk
+            try:
+                audio = AudioSegment.from_wav(audio_path)
+                duration = len(audio) / 1000.0
+                return [{
+                    'filename': audio_path,
+                    'index': 0,
+                    'start_time': 0,
+                    'duration': duration,
+                    'end_time': duration
+                }]
+            except:
+                return []

     def _transcribe_audio_chunk(self, chunk_info: Dict[str, Any], input_data: Dict[str, Any] = None) -> Dict[str, Any]:
         """Transcribe a single audio chunk"""
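Worth noting about the timing change above: the old per-chunk sum counted all prior chunks, including the short ones the loop skips, while the running current_time only advances for chunks that are actually exported. Illustrative numbers:

# chunk lengths in ms; the 800 ms chunk is skipped by the < 1000 check
chunk_lengths = [30000, 800, 30000]

old_start_of_third = sum(chunk_lengths[:2]) / 1000.0  # 30.8 s, includes the skipped chunk
new_start_of_third = chunk_lengths[0] / 1000.0        # 30.0 s, cumulative over kept chunks only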
@@ -2443,375 +2417,115 @@ class YouTubeTranscriptExtractor(BaseTool):
         try:
             language = self._get_config('language', 'en-US', input_data)

-            # Configure recognizer
-            self.recognizer.energy_threshold = self._get_config('energy_threshold', 4000, input_data)
-            self.recognizer.pause_threshold = self._get_config('pause_threshold', 0.8, input_data)
-
             with sr.AudioFile(chunk_path) as source:
                 # Adjust for ambient noise
                 self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
                 audio_data = self.recognizer.record(source)

             # Try Google Speech Recognition
             try:
                 text = self.recognizer.recognize_google(audio_data, language=language)
-                result = {
+                return {
                     'text': text,
                     'confidence': 1.0,
-                    'method': 'google',
-                    'chunk': os.path.basename(chunk_path),
                     'start_time': chunk_info['start_time'],
                     'end_time': chunk_info['end_time'],
                     'duration': chunk_info['duration'],
-                    'index': chunk_info['index']
+                    'index': chunk_info['index'],
+                    'success': True
                 }

-                # Extract voice features if speaker ID is enabled
-                if self._get_config('enable_speaker_id', True, input_data):
-                    features = self._extract_voice_features(chunk_path)
-                    result['voice_features'] = features.tolist() if features is not None else None
-
-                return result
-
             except sr.UnknownValueError:
-                # Try with alternative language detection
+                # Try without language specification
                 try:
                     text = self.recognizer.recognize_google(audio_data)
-                    result = {
+                    return {
                         'text': text,
                         'confidence': 0.8,
-                        'method': 'google_auto',
-                        'chunk': os.path.basename(chunk_path),
                         'start_time': chunk_info['start_time'],
                         'end_time': chunk_info['end_time'],
                         'duration': chunk_info['duration'],
-                        'index': chunk_info['index']
+                        'index': chunk_info['index'],
+                        'success': True
                     }
-
-                    if self._get_config('enable_speaker_id', True, input_data):
-                        features = self._extract_voice_features(chunk_path)
-                        result['voice_features'] = features.tolist() if features is not None else None
-
-                    return result
-
                 except sr.UnknownValueError:
                     return {
                         'text': '[INAUDIBLE]',
                         'confidence': 0.0,
-                        'method': 'failed',
-                        'chunk': os.path.basename(chunk_path),
                         'start_time': chunk_info['start_time'],
                         'end_time': chunk_info['end_time'],
                         'duration': chunk_info['duration'],
                         'index': chunk_info['index'],
+                        'success': False
                     }
             except sr.RequestError as e:
-                print(f"Google Speech Recognition error: {e}")
                 return {
-                    'text': '[RECOGNITION_ERROR]',
+                    'text': f'[RECOGNITION_ERROR: {str(e)}]',
                     'confidence': 0.0,
-                    'method': 'error',
-                    'chunk': os.path.basename(chunk_path),
                     'start_time': chunk_info['start_time'],
                     'end_time': chunk_info['end_time'],
                     'duration': chunk_info['duration'],
                     'index': chunk_info['index'],
+                    'success': False,
+                    'error': str(e)
                 }

         except Exception as e:
-            print(f"Error transcribing chunk {chunk_path}: {str(e)}")
             return {
-                'text': '[ERROR]',
+                'text': f'[ERROR: {str(e)}]',
                 'confidence': 0.0,
-                'method': 'error',
-                'chunk': os.path.basename(chunk_path),
                 'start_time': chunk_info.get('start_time', 0),
                 'end_time': chunk_info.get('end_time', 0),
                 'duration': chunk_info.get('duration', 0),
                 'index': chunk_info.get('index', 0),
+                'success': False,
+                'error': str(e)
             }

-    def _identify_speakers(self, transcript_results: List[Dict[str, Any]], input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
-        """Identify speakers using voice features clustering"""
-        enable_speaker_id = self._get_config('enable_speaker_id', True, input_data)
-        if not enable_speaker_id:
-            # Add default speaker tags
-            for result in transcript_results:
-                result['speaker_id'] = 'SPEAKER_1'
-                result['speaker_confidence'] = 1.0
-            return transcript_results
-
-        try:
-            # Filter results with valid voice features and text
-            valid_results = []
-            features_list = []
-
-            for result in transcript_results:
-                if (result.get('voice_features') is not None and
-                    result['text'] not in ['[INAUDIBLE]', '[RECOGNITION_ERROR]', '[ERROR]', '[PROCESSING_ERROR]']):
-                    valid_results.append(result)
-                    features_list.append(result['voice_features'])
-
-            if len(features_list) < 2:
-                # Not enough data for clustering
-                for result in transcript_results:
-                    result['speaker_id'] = 'SPEAKER_1'
-                    result['speaker_confidence'] = 1.0
-                return transcript_results
-
-            # Normalize features
-            features_array = np.array(features_list)
-            scaler = StandardScaler()
-            normalized_features = scaler.fit_transform(features_array)
-
-            # Determine optimal number of speakers
-            max_speakers = min(self._get_config('max_speakers', 5, input_data), len(features_list))
-
-            # Use elbow method to find optimal clusters (simplified)
-            best_k = 1
-            if len(features_list) > 1:
-                best_score = float('inf')
-                for k in range(1, min(max_speakers + 1, len(features_list) + 1)):
-                    try:
-                        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
-                        labels = kmeans.fit_predict(normalized_features)
-                        if k > 1:
-                            score = kmeans.inertia_
-                            if score < best_score:
-                                best_score = score
-                                best_k = k
-                    except:
-                        continue
-
-            # Don't use too many clusters for short audio
-            if len(features_list) < 10:
-                best_k = min(best_k, 2)
-
-            # Perform final clustering
-            kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
-            speaker_labels = kmeans.fit_predict(normalized_features)
-
-            # Calculate speaker assignment confidence
-            distances = kmeans.transform(normalized_features)
-            confidences = []
-            for i, label in enumerate(speaker_labels):
-                # Confidence based on distance to assigned cluster vs. nearest other cluster
-                dist_to_assigned = distances[i][label]
-                other_distances = np.delete(distances[i], label)
-                if len(other_distances) > 0:
-                    dist_to_nearest_other = np.min(other_distances)
-                    confidence = max(0.1, min(1.0, dist_to_nearest_other / (dist_to_assigned + 1e-6)))
-                else:
-                    confidence = 1.0
-                confidences.append(confidence)
-
-            # Assign speaker IDs back to results
-            valid_idx = 0
-            speaker_duration = {} # Track duration per speaker
-
-            for result in transcript_results:
-                if (result.get('voice_features') is not None and
-                    result['text'] not in ['[INAUDIBLE]', '[RECOGNITION_ERROR]', '[ERROR]', '[PROCESSING_ERROR]']):
-
-                    speaker_label = speaker_labels[valid_idx]
-                    confidence = confidences[valid_idx]
-
-                    # Filter by confidence threshold
-                    conf_threshold = self._get_config('speaker_confidence_threshold', 0.6, input_data)
-                    if confidence < conf_threshold:
-                        speaker_id = 'SPEAKER_UNKNOWN'
-                    else:
-                        speaker_id = f'SPEAKER_{speaker_label + 1}'
-
-                    result['speaker_id'] = speaker_id
-                    result['speaker_confidence'] = confidence
-
-                    # Track speaker duration
-                    if speaker_id in speaker_duration:
-                        speaker_duration[speaker_id] += result['duration']
-                    else:
-                        speaker_duration[speaker_id] = result['duration']
-
-                    valid_idx += 1
-                else:
-                    # Handle invalid results
-                    result['speaker_id'] = 'SPEAKER_UNKNOWN'
-                    result['speaker_confidence'] = 0.0
-
-            # Filter out speakers with insufficient duration
-            min_duration = self._get_config('speaker_min_duration', 2.0, input_data)
-            speakers_to_merge = [s for s, d in speaker_duration.items() if d < min_duration and s != 'SPEAKER_UNKNOWN']
-
-            # Merge low-duration speakers into SPEAKER_UNKNOWN
-            for result in transcript_results:
-                if result['speaker_id'] in speakers_to_merge:
-                    result['speaker_id'] = 'SPEAKER_UNKNOWN'
-                    result['speaker_confidence'] = 0.3
-
-            print(f"Identified {best_k} speakers based on voice characteristics")
-            return transcript_results
-
-        except Exception as e:
-            print(f"Error in speaker identification: {str(e)}")
-            # Fallback: assign all to single speaker
-            for result in transcript_results:
-                result['speaker_id'] = 'SPEAKER_1'
-                result['speaker_confidence'] = 1.0
-            return transcript_results
-
     def _transcribe_chunks_parallel(self, chunk_data: List[Dict[str, Any]], input_data: Dict[str, Any] = None) -> List[Dict[str, Any]]:
         """Transcribe audio chunks in parallel"""
         results = []
-            result = self._transcribe_audio_chunk(chunk_info, input_data)
-            results.append(result)
-            print(f"Transcribed {result['chunk']}: {result['text'][:50]}..." if len(result['text']) > 50 else f"Transcribed {result['chunk']}: {result['text']}")
+
+        # Use fewer workers to avoid API rate limits
+        max_workers = min(3, len(chunk_data))
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_chunk = {
+                executor.submit(self._transcribe_audio_chunk, chunk_info, input_data): chunk_info
+                for chunk_info in chunk_data
+            }
+
+            for future in as_completed(future_to_chunk):
+                chunk_info = future_to_chunk[future]
+                try:
+                    result = future.result()
+                    results.append(result)
+                    if result['success']:
+                        preview = result['text'][:50] + "..." if len(result['text']) > 50 else result['text']
+                        print(f"Transcribed chunk {result['index']}: {preview}")
+                    else:
+                        print(f"Failed to transcribe chunk {result['index']}: {result['text']}")
+                except Exception as e:
+                    print(f"Error processing chunk {chunk_info.get('index', '?')}: {str(e)}")
+                    results.append({
+                        'text': f'[PROCESSING_ERROR: {str(e)}]',
+                        'confidence': 0.0,
+                        'start_time': chunk_info.get('start_time', 0),
+                        'end_time': chunk_info.get('end_time', 0),
+                        'duration': chunk_info.get('duration', 0),
+                        'index': chunk_info.get('index', 0),
+                        'success': False,
+                        'error': str(e)
+                    })

         # Sort results by chunk index to maintain order
         results.sort(key=lambda x: x['index'])
         return results

-    def _post_process_transcript(self, transcript_results: List[Dict[str, Any]], input_data: Dict[str, Any] = None) -> Dict[str, Any]:
-        """Post-process and analyze transcript results with speaker information"""
-        enable_speaker_id = self._get_config('enable_speaker_id', True, input_data)
-
-        # Identify speakers if enabled
-        if enable_speaker_id:
-            transcript_results = self._identify_speakers(transcript_results, input_data)
-
-        # Combine text with speaker tags
-        full_text_parts = []
-        speaker_tagged_text = []
-        successful_chunks = 0
-        total_confidence = 0.0
-        method_counts = {}
-        speaker_stats = {}
-
-        current_speaker = None
-        current_speaker_text = []
-
-        for result in transcript_results:
-            text = result['text']
-            speaker = result.get('speaker_id', 'SPEAKER_1')
-            start_time = result.get('start_time', 0)
-
-            if text not in ['[INAUDIBLE]', '[RECOGNITION_ERROR]', '[ERROR]', '[PROCESSING_ERROR]']:
-                full_text_parts.append(text)
-                successful_chunks += 1
-                total_confidence += result['confidence']
-
-                # Handle speaker transitions
-                if enable_speaker_id:
-                    if current_speaker != speaker:
-                        # Save previous speaker's text
-                        if current_speaker and current_speaker_text:
-                            combined_text = ' '.join(current_speaker_text)
-                            speaker_tagged_text.append(f"[{current_speaker}]: {combined_text}")
-
-                        # Start new speaker
-                        current_speaker = speaker
-                        current_speaker_text = [text]
-                    else:
-                        # Continue with same speaker
-                        current_speaker_text.append(text)
-                else:
-                    speaker_tagged_text.append(text)
-
-                # Update speaker statistics
-                if speaker in speaker_stats:
-                    speaker_stats[speaker]['duration'] += result.get('duration', 0)
-                    speaker_stats[speaker]['word_count'] += len(text.split())
-                    speaker_stats[speaker]['segments'] += 1
-                else:
-                    speaker_stats[speaker] = {
-                        'duration': result.get('duration', 0),
-                        'word_count': len(text.split()),
-                        'segments': 1,
-                        'confidence': result.get('speaker_confidence', 1.0)
-                    }
-
-            method = result['method']
-            method_counts[method] = method_counts.get(method, 0) + 1
-
-        # Add final speaker text
-        if enable_speaker_id and current_speaker and current_speaker_text:
-            combined_text = ' '.join(current_speaker_text)
-            speaker_tagged_text.append(f"[{current_speaker}]: {combined_text}")
-
-        # Combine texts
-        combined_text = ' '.join(full_text_parts)
-        speaker_formatted_text = combined_text
-
-        # Calculate statistics
-        word_count = len(combined_text.split()) if combined_text else 0
-        char_count = len(combined_text)
-        avg_confidence = total_confidence / max(1, successful_chunks)
-        success_rate = successful_chunks / len(transcript_results) if transcript_results else 0
-
-        # Estimate speaking duration (rough approximation: 150 words per minute)
-        estimated_duration_minutes = word_count / 150 if word_count > 0 else 0
-
-        return {
-            'full_transcript': combined_text,
-            'speaker_tagged_transcript': speaker_formatted_text,
-            'word_count': word_count,
-            'character_count': char_count,
-            'chunk_count': len(transcript_results),
-            'successful_chunks': successful_chunks,
-            'success_rate': success_rate,
-            'average_confidence': avg_confidence,
-            'method_distribution': method_counts,
-            'estimated_duration_minutes': estimated_duration_minutes,
-            'speaker_identification_enabled': enable_speaker_id,
-            'speaker_statistics': speaker_stats,
-            'total_speakers': len([s for s in speaker_stats.keys() if s != 'SPEAKER_UNKNOWN']),
-            'detailed_results': transcript_results
-        }
-
     def extract_transcript(self, audio_path: str, video_hash: str, input_data: Dict[str, Any] = None) -> Dict[str, Any]:
         """Extract complete transcript from audio file"""
         cache_enabled = self._get_config('cache_enabled', True, input_data)
-
-        cache_suffix = "transcript_with_speakers.json" if enable_speaker_id else "transcript.json"
-        cache_path = self._get_cache_path(video_hash, cache_suffix)
+        cache_path = self._get_cache_path(video_hash, "transcript.json")

         # Check cache
         cached_transcript = self._load_from_cache(cache_path, cache_enabled)
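The schema change in this hunk replaces sentinel-string matching with an explicit success flag, which is what extract_transcript filters on below. A small illustration with made-up chunk results:

chunks = [
    {'text': 'hello there', 'success': True, 'index': 0},
    {'text': '[INAUDIBLE]', 'success': False, 'index': 1},
]
transcript = ' '.join(c['text'] for c in chunks if c['success'])
assert transcript == 'hello there'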
@@ -2828,7 +2542,6 @@ class YouTubeTranscriptExtractor(BaseTool):
                 return {
                     'error': 'Failed to split audio into chunks',
                     'full_transcript': '',
-                    'speaker_tagged_transcript': '',
                     'success_rate': 0.0
                 }

@@ -2836,17 +2549,31 @@ class YouTubeTranscriptExtractor(BaseTool):
             print(f"Transcribing {len(chunk_data)} audio chunks...")
             transcript_results = self._transcribe_chunks_parallel(chunk_data, input_data)

+            # Step 3: Combine results
+            successful_results = [r for r in transcript_results if r['success']]
+            full_text = ' '.join([r['text'] for r in successful_results])
+
+            # Calculate statistics
+            total_chunks = len(transcript_results)
+            successful_chunks = len(successful_results)
+            success_rate = successful_chunks / total_chunks if total_chunks > 0 else 0
+            word_count = len(full_text.split()) if full_text else 0
+
+            final_result = {
+                'full_transcript': full_text,
+                'word_count': word_count,
+                'total_chunks': total_chunks,
+                'successful_chunks': successful_chunks,
+                'success_rate': success_rate,
+                'extraction_timestamp': time.time(),
+                'extraction_date': time.strftime('%Y-%m-%d %H:%M:%S'),
+                'detailed_results': transcript_results
+            }

             # Cache results
             self._save_to_cache(cache_path, final_result, cache_enabled)
+
+            print(f"Transcript extraction completed. Success rate: {success_rate:.1%}")
             return final_result

         except Exception as e:
@@ -2854,7 +2581,6 @@ class YouTubeTranscriptExtractor(BaseTool):
             return {
                 'error': str(e),
                 'full_transcript': '',
-                'speaker_tagged_transcript': '',
                 'success_rate': 0.0
             }

@@ -2876,7 +2602,7 @@ class YouTubeTranscriptExtractor(BaseTool):
             print(f"Downloading YouTube audio from {youtube_url}...")
             audio_path = self.download_youtube_audio(youtube_url, video_hash, input_data)
             if not audio_path or not os.path.exists(audio_path):
-                return "Error: Failed to download the YouTube audio."
+                return "Error: Failed to download the YouTube audio. Please check the URL and try again."

             # Step 2: Extract transcript
             print("Extracting audio transcript...")
@@ -2885,18 +2611,19 @@ class YouTubeTranscriptExtractor(BaseTool):
             if transcript_result.get("error"):
                 return f"Error: {transcript_result['error']}"

+            main_transcript = transcript_result.get('full_transcript', '')
+
+            if not main_transcript:
+                return "Error: No transcript could be extracted from the audio."
+
+            print(f"Transcript extracted successfully. Word count: {transcript_result.get('word_count', 0)}")
+            print(f"Success rate: {transcript_result.get('success_rate', 0):.1%}")

             return "TRANSCRIPT: " + main_transcript

         except Exception as e:
             return f"Error during transcript extraction: {str(e)}"

 # Factory function to create the tool
 def create_youtube_transcript_tool(**kwargs):
     """Factory function to create the transcript extraction tool with custom parameters"""