"""Fetch basic info about a YouTube video: id, title, description, captions."""

import random
import time
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup
from youtube_transcript_api import (
    TranscriptsDisabled,
    VideoUnavailable,
    YouTubeTranscriptApi,
)


def extract_video_id(url: str) -> str:
    """Return the video id from a YouTube URL.

    Handles both ``watch?v=<id>`` URLs (query string) and short-form
    ``youtu.be/<id>`` URLs (last path segment as fallback).
    """
    qs = parse_qs(urlparse(url).query)
    # Fall back to the last path component for youtu.be-style links.
    return qs.get('v', [urlparse(url).path.split('/')[-1]])[0]


def scrape_metadata(url: str) -> tuple:
    """Fetch *url* and return ``(title, description)`` scraped from the HTML.

    Retries up to 3 times with exponential backoff + jitter on HTTP 429;
    any other non-200 status raises via ``raise_for_status``.

    Returns:
        (title, description) — empty strings when the tags are absent.

    Raises:
        requests.HTTPError: on a non-retryable or persistent error status.
        requests.Timeout: if the server does not respond within the timeout.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }
    for attempt in range(3):
        # timeout prevents an unresponsive server from hanging us forever
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            break
        if r.status_code == 429:
            # Rate limited: exponential backoff with jitter before retrying.
            wait = (2 ** attempt) + random.random()
            time.sleep(wait)
        else:
            r.raise_for_status()
    # If all retries were exhausted on 429 this raises; a 200 is a no-op.
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    title_tag = soup.title
    desc_tag = soup.find("meta", {"name": "description"})
    return (
        # .string is None for a <title> with nested markup — normalize to "".
        (title_tag.string or "") if title_tag else "",
        # .get avoids KeyError when the meta tag has no content attribute.
        desc_tag.get("content", "") if desc_tag else "",
    )


def fetch_transcript(video_id: str) -> list:
    """Return the transcript for *video_id* as a list of dicts.

    Returns an empty list when captions are disabled or the video is
    unavailable (best-effort: those conditions are expected, not errors).
    """
    try:
        fetched = YouTubeTranscriptApi().fetch(video_id)
        return fetched.to_raw_data()  # JSON-friendly list of dicts
    except (TranscriptsDisabled, VideoUnavailable):
        return []


def get_youtube_info(url: str) -> dict:
    """Aggregate id, scraped metadata, and captions for a YouTube *url*."""
    vid = extract_video_id(url)
    title, desc = scrape_metadata(url)
    captions = fetch_transcript(vid)
    return {"videoId": vid, "title": title, "description": desc, "captions": captions}