naveenus committed on
Commit
eef9091
·
verified ·
1 Parent(s): 554e5f5

Update youtube_scraper.py

Browse files
Files changed (1) hide show
  1. youtube_scraper.py +27 -6
youtube_scraper.py CHANGED
@@ -1,5 +1,6 @@
1
  import requests
2
  from bs4 import BeautifulSoup
 
3
  from urllib.parse import urlparse, parse_qs
4
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
5
 
@@ -7,12 +8,32 @@ def extract_video_id(url: str) -> str:
7
  qs = parse_qs(urlparse(url).query)
8
  return qs.get('v', [urlparse(url).path.split('/')[-1]])[0]
9
 
10
def scrape_metadata(url: str):
    """Download *url* and return its ``(title, description)`` pair.

    Args:
        url: Address of the page to fetch.

    Returns:
        A 2-tuple of strings: the text of the page's ``<title>`` element and
        the ``content`` attribute of its ``<meta name="description">`` tag.
        Either element is ``""`` when the corresponding tag or value is
        missing.

    Raises:
        requests.HTTPError: If the response status is a 4xx/5xx error.
        requests.RequestException: On connection failure or timeout.
    """
    # A timeout keeps a stalled connection from blocking the caller forever.
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # soup.title is None when the document has no <title>; .string is None
    # when the tag is empty or has more than one child node.
    title_tag = soup.title
    title = (title_tag.string or "") if title_tag else ""

    # .get() avoids a KeyError for a description tag without a content attribute.
    desc = soup.find('meta', {'name': 'description'})
    description = desc.get('content', "") if desc else ""
    return title, description
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  def fetch_transcript(video_id: str):
18
  try:
 
1
  import requests
2
  from bs4 import BeautifulSoup
3
+ import requests, time
4
  from urllib.parse import urlparse, parse_qs
5
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
6
 
 
8
  qs = parse_qs(urlparse(url).query)
9
  return qs.get('v', [urlparse(url).path.split('/')[-1]])[0]
10
 
11
def scrape_metadata(url: str):
    """Download *url* and return its ``(title, description)`` pair.

    The request is sent with a desktop Chrome ``User-Agent`` header
    (presumably so the server does not treat it as an automated client --
    confirm against the target site's behaviour). A response with HTTP 429
    is retried, up to three attempts in total, with exponential backoff
    plus random jitter between attempts.

    Args:
        url: Address of the page to fetch.

    Returns:
        A 2-tuple of strings: the text of the page's ``<title>`` element and
        the ``content`` attribute of its ``<meta name="description">`` tag.
        Either element is ``""`` when the corresponding tag or value is
        missing.

    Raises:
        requests.HTTPError: If the final response status is a 4xx/5xx error
            (including a 429 that persisted through every attempt).
        requests.RequestException: On connection failure or timeout.
    """
    # Imported locally: the file's import block provides `requests` and
    # `time` but not `random`, which the jitter below needs.
    import random

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/115.0.0.0 Safari/537.36"
        )
    }
    max_attempts = 3
    for attempt in range(max_attempts):
        # A timeout keeps a stalled connection from blocking the caller forever.
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code != 429:
            # Success or a non-retryable status: stop and let
            # raise_for_status() below decide.
            break
        if attempt < max_attempts - 1:
            # Exponential backoff (1s, 2s, ...) plus up to 1s of jitter.
            # Skipped after the last attempt, since no retry follows it.
            time.sleep((2 ** attempt) + random.random())
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")

    # soup.title is None when the document has no <title>; .string is None
    # when the tag is empty or has more than one child node.
    title_tag = soup.title
    title = (title_tag.string or "") if title_tag else ""

    # .get() avoids a KeyError for a description tag without a content attribute.
    desc_tag = soup.find("meta", {"name": "description"})
    description = desc_tag.get("content", "") if desc_tag else ""
    return title, description
36
+
37
 
38
  def fetch_transcript(video_id: str):
39
  try: