# video-fit-score / youtube_scraper.py
# Author: naveenus — commit e80b0b5 ("Update youtube_scraper.py", verified)
import random
import time
from urllib.parse import parse_qs, urlparse

import requests
from bs4 import BeautifulSoup
from youtube_transcript_api import (
    NoTranscriptFound,
    TranscriptsDisabled,
    VideoUnavailable,
    YouTubeTranscriptApi,
)
def extract_video_id(url: str) -> str:
    """Extract the video id from a YouTube URL.

    Supports both query-style URLs (``watch?v=<id>``) and path-style
    short URLs (``youtu.be/<id>``); for the latter the id is taken as
    the last path segment.
    """
    parsed = urlparse(url)
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]
    # No ?v= parameter: fall back to the final path component.
    return parsed.path.split("/")[-1]
def scrape_metadata(url):
    """Fetch a YouTube watch page and scrape its title and description.

    Retries up to 3 times with exponential backoff + jitter when the
    server answers HTTP 429 (rate limited); any other error status fails
    fast instead of burning retries on a non-retryable response.

    Args:
        url: Full URL of the page to scrape.

    Returns:
        tuple[str, str]: (page <title> text, description meta content),
        with "" for either value when the tag is missing.

    Raises:
        requests.HTTPError: for a non-retryable error status, or if all
            retry attempts are still rate limited.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }
    r = None
    for attempt in range(3):
        # timeout so a stalled connection cannot hang the caller forever
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            break
        if r.status_code == 429:
            # Exponential backoff with jitter before retrying a rate limit.
            time.sleep((2 ** attempt) + random.random())
        else:
            # 404, 500, ... — retrying will not help, so fail immediately.
            r.raise_for_status()
    # Raises if every attempt was rate limited (last response is not 200).
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    title_tag = soup.title
    desc_tag = soup.find("meta", {"name": "description"})
    return (
        # .string can be None even when a <title> tag exists; normalise to "".
        (title_tag.string or "") if title_tag else "",
        # .get avoids a KeyError when the meta tag has no content attribute.
        desc_tag.get("content", "") if desc_tag else "",
    )
def fetch_transcript(video_id: str):
    """Fetch the transcript for *video_id* as a JSON-friendly list of dicts.

    Missing captions are an expected condition, not an error: returns []
    when transcripts are disabled, no transcript exists for the video, or
    the video itself is unavailable, so callers never need to handle
    transcript-specific exceptions.
    """
    try:
        fetched = YouTubeTranscriptApi().fetch(video_id)
        # to_raw_data() converts the fetched transcript into a plain,
        # JSON-serialisable list of segment dicts.
        return fetched.to_raw_data()
    except (TranscriptsDisabled, NoTranscriptFound, VideoUnavailable):
        # NoTranscriptFound added: a live video with no captions previously
        # escaped this handler and crashed the caller.
        return []
def get_youtube_info(url: str):
    """Aggregate id, page metadata, and captions for a YouTube URL.

    Returns:
        dict: {"videoId", "title", "description", "captions"} where
        captions is the raw transcript list ([] when unavailable).
    """
    video_id = extract_video_id(url)
    page_title, page_description = scrape_metadata(url)
    return {
        "videoId": video_id,
        "title": page_title,
        "description": page_description,
        "captions": fetch_transcript(video_id),
    }