Spaces:
Sleeping
Sleeping
File size: 1,737 Bytes
e80b0b5 18e7e25 eef9091 18e7e25 eef9091 18e7e25 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import requests, random
from bs4 import BeautifulSoup
import requests, time
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
def extract_video_id(url: str) -> str:
    """Extract the YouTube video id from *url*.

    Prefers the ``v`` query parameter (standard watch URLs); otherwise
    falls back to the last path segment (youtu.be / embed style URLs).
    """
    parsed = urlparse(url)
    params = parse_qs(parsed.query)
    if 'v' in params:
        return params['v'][0]
    return parsed.path.split('/')[-1]
def scrape_metadata(url):
    """Fetch *url* and return ``(title, meta_description)`` as strings.

    Retries up to 3 times with exponential backoff plus jitter when the
    server answers HTTP 429 (rate limited). Any other non-200 response
    raises ``requests.HTTPError`` immediately; a 429 on the final attempt
    raises after the loop.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }
    for attempt in range(3):
        # timeout keeps a stalled connection from hanging the caller forever
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            break
        if r.status_code == 429:
            # exponential backoff with jitter before retrying the rate limit
            time.sleep((2 ** attempt) + random.random())
        else:
            r.raise_for_status()
    # raises if the last attempt was still rate-limited (or otherwise failed)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    title_tag = soup.title
    desc_tag = soup.find("meta", {"name": "description"})
    return (
        # get_text() always yields a str; .string is None for nested markup
        title_tag.get_text() if title_tag else "",
        # .get() avoids a KeyError on a malformed <meta> with no content attr
        desc_tag.get("content", "") if desc_tag else ""
    )
def fetch_transcript(video_id: str):
    """Return the captions for *video_id* as a JSON-friendly list of dicts.

    Yields an empty list when the video has transcripts disabled or is
    unavailable.
    """
    try:
        transcript = YouTubeTranscriptApi().fetch(video_id)
    except (TranscriptsDisabled, VideoUnavailable):
        return []
    return transcript.to_raw_data()
def get_youtube_info(url: str):
    """Bundle the video id, scraped page metadata, and captions for *url*."""
    video_id = extract_video_id(url)
    page_title, page_description = scrape_metadata(url)
    return {
        "videoId": video_id,
        "title": page_title,
        "description": page_description,
        "captions": fetch_transcript(video_id),
    }
|