Spaces:
Sleeping
Sleeping
import requests, random | |
from bs4 import BeautifulSoup | |
import requests, time | |
from urllib.parse import urlparse, parse_qs | |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable | |
def extract_video_id(url: str) -> str:
    """Return the YouTube video ID embedded in *url*.

    The ``v`` query parameter is used when present (``watch?v=<id>`` URLs).
    Otherwise the last non-empty path segment is returned, which covers
    links of the form ``youtu.be/<id>`` or ``/embed/<id>``. An empty string
    is returned when neither source yields an ID.
    """
    parsed = urlparse(url)  # parse once and reuse for both query and path
    v_values = parse_qs(parsed.query).get("v")
    if v_values:
        return v_values[0]
    # Strip trailing slashes so "https://youtu.be/<id>/" does not yield "".
    return parsed.path.rstrip("/").rsplit("/", 1)[-1]
def scrape_metadata(url, *, timeout=10.0):
    """Fetch *url* and return its ``(title, description)`` pair.

    The page is requested with a desktop-browser User-Agent. HTTP 429
    responses are retried with exponential backoff plus random jitter, for
    at most three attempts in total.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A ``(title, description)`` tuple of strings. ``title`` is the text
        of the ``<title>`` element and ``description`` is the ``content``
        attribute of ``<meta name="description">``; each is ``""`` when the
        element (or its text/attribute) is missing.

    Raises:
        requests.HTTPError: If the final response has a 4xx/5xx status,
            including a 429 that persisted through every attempt.
        requests.RequestException: On connection failures or timeouts.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }
    max_attempts = 3
    for attempt in range(max_attempts):
        # Without a timeout a stalled server would block this call forever.
        r = requests.get(url, headers=headers, timeout=timeout)
        if r.status_code != 429:
            # Only rate limiting is worth retrying; everything else is
            # settled by raise_for_status() below.
            break
        if attempt < max_attempts - 1:
            # Exponential backoff with jitter; skipped after the last
            # attempt because no further request follows it.
            time.sleep((2 ** attempt) + random.random())
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    title_tag = soup.title
    desc_tag = soup.find("meta", {"name": "description"})

    # .string is None for an empty <title> or one with several children.
    title_text = title_tag.string if title_tag is not None else None
    # .get() tolerates a description <meta> that has no content attribute.
    description = desc_tag.get("content", "") if desc_tag is not None else ""
    return (
        str(title_text) if title_text is not None else "",
        description,
    )
def fetch_transcript(video_id: str):
    """Return the transcript of *video_id* as a JSON-friendly list of dicts.

    An empty list is returned when the video is unavailable, has
    transcripts disabled, or has no transcript for the library's default
    language selection. Any other error raised by
    ``youtube_transcript_api`` propagates to the caller.
    """
    # Imported here because the module-level import does not include it.
    from youtube_transcript_api import NoTranscriptFound

    try:
        fetched = YouTubeTranscriptApi().fetch(video_id)
    except (TranscriptsDisabled, VideoUnavailable, NoTranscriptFound):
        # Missing captions are an expected outcome, not a failure.
        return []
    return fetched.to_raw_data()
def get_youtube_info(url: str):
    """Collect the video ID, page metadata and captions for a YouTube URL.

    Returns a dict with the keys ``videoId``, ``title``, ``description``
    and ``captions``. Exceptions raised by the helper functions propagate
    unchanged.
    """
    video_id = extract_video_id(url)
    info = {"videoId": video_id}
    # Metadata is scraped before the transcript is fetched, so a failing
    # page request aborts before any transcript lookup is made.
    info["title"], info["description"] = scrape_metadata(url)
    info["captions"] = fetch_transcript(video_id)
    return info