naveenus committed on
Commit
18e7e25
·
verified ·
1 Parent(s): 5e5049b

Create youtube_scraper.py

Browse files
Files changed (1) hide show
  1. youtube_scraper.py +28 -0
youtube_scraper.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ from urllib.parse import urlparse, parse_qs
4
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
5
+
6
def extract_video_id(url: str) -> str:
    """Return the video ID embedded in a YouTube-style *url*.

    The ``v`` query parameter is preferred (``.../watch?v=<id>``). When it is
    absent, the last non-empty segment of the URL path is used instead, which
    covers path-style links such as ``youtu.be/<id>`` or ``/embed/<id>``.

    Args:
        url: The video URL to inspect.

    Returns:
        The extracted ID, or an empty string when the URL has neither a ``v``
        parameter nor a path segment.
    """
    parsed = urlparse(url)  # parse once; both the query and the path are needed

    v_values = parse_qs(parsed.query).get('v')
    if v_values:
        return v_values[0]

    # Strip trailing slashes so ".../embed/<id>/" yields "<id>" rather than "".
    return parsed.path.rstrip('/').split('/')[-1]
9
+
10
def scrape_metadata(url: str, timeout: float = 10.0):
    """Download the page at *url* and return its title and meta description.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; without
            one the request could block indefinitely on an unresponsive server.

    Returns:
        A ``(title, description)`` tuple. Each element falls back to ``""``
        when the page has no usable ``<title>`` text or no
        ``<meta name="description">`` tag with a ``content`` attribute.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` or from
            ``raise_for_status()`` when the response has an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.title is None when the document has no <title>; .string is None
    # when the tag does not contain exactly one text node.
    title_tag = soup.title
    title = (title_tag.string if title_tag is not None else None) or ""

    desc_tag = soup.find('meta', {'name': 'description'})
    # Use .get() so a meta tag without a "content" attribute yields "" instead
    # of raising KeyError.
    description = desc_tag.get('content', "") if desc_tag is not None else ""

    return title, description
16
+
17
def fetch_transcript(video_id: str):
    """Fetch the transcript for *video_id*, best-effort.

    Args:
        video_id: The YouTube video ID to look up.

    Returns:
        The result of ``to_raw_data()`` on the fetched transcript (a
        JSON-friendly list of dicts), or an empty list when transcripts are
        disabled, the video is unavailable, or no matching transcript exists.
    """
    # Imported here so this function works without touching the module-level
    # import list.
    from youtube_transcript_api import NoTranscriptFound

    try:
        fetched = YouTubeTranscriptApi().fetch(video_id)
    except (TranscriptsDisabled, VideoUnavailable, NoTranscriptFound):
        # Missing captions are an expected condition, not an error for callers.
        return []
    return fetched.to_raw_data()
23
+
24
def get_youtube_info(url: str):
    """Collect the ID, page metadata and captions for the video at *url*.

    Returns:
        A dict with the keys ``videoId``, ``title``, ``description`` and
        ``captions``, filled from ``extract_video_id``, ``scrape_metadata``
        and ``fetch_transcript`` respectively.
    """
    video_id = extract_video_id(url)
    page_title, page_description = scrape_metadata(url)
    # The transcript lookup runs last, after the metadata scrape has succeeded.
    return {
        "videoId": video_id,
        "title": page_title,
        "description": page_description,
        "captions": fetch_transcript(video_id),
    }