"""Fetch a YouTube video's ID, page title/description, and caption track."""

import random
import time
from urllib.parse import urlparse, parse_qs

import requests
from bs4 import BeautifulSoup
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound, VideoUnavailable

def extract_video_id(url: str) -> str:
    """Return the video ID from a watch URL (?v=...) or a short-link path (youtu.be/...)."""
    qs = parse_qs(urlparse(url).query)
    return qs.get("v", [urlparse(url).path.split("/")[-1]])[0]

def scrape_metadata(url):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }
    for attempt in range(3):
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            break
        if r.status_code == 429:
            # Rate limited: exponential backoff with jitter before retrying
            time.sleep((2 ** attempt) + random.random())
        else:
            r.raise_for_status()
    r.raise_for_status()  # raise if every attempt was rate-limited
    soup = BeautifulSoup(r.text, "html.parser")
    title_tag = soup.title
    desc_tag = soup.find("meta", {"name": "description"})
    return (
        title_tag.get_text(strip=True) if title_tag else "",
        desc_tag.get("content", "") if desc_tag else ""
    )


def fetch_transcript(video_id: str):
    try:
        fetched = YouTubeTranscriptApi().fetch(video_id)
        return fetched.to_raw_data()  # JSON-friendly list of {"text", "start", "duration"} dicts
    except (TranscriptsDisabled, NoTranscriptFound, VideoUnavailable):
        return []

def get_youtube_info(url: str):
    vid = extract_video_id(url)
    title, desc = scrape_metadata(url)
    captions = fetch_transcript(vid)
    return {"videoId": vid, "title": title, "description": desc, "captions": captions}