Spaces:

naveenus
/

video-fit-score

Sleeping

naveenus committed on Jul 3

Commit

eef9091

verified ·

1 Parent(s): 554e5f5

Update youtube_scraper.py

Files changed (1) hide show

youtube_scraper.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs
 from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
@@ -7,12 +8,32 @@ def extract_video_id(url: str) -> str:
     qs = parse_qs(urlparse(url).query)
     return qs.get('v', [urlparse(url).path.split('/')[-1]])[0]
-def scrape_metadata(url: str):
-    r = requests.get(url); r.raise_for_status()
-    soup = BeautifulSoup(r.text, 'html.parser')
-    title = soup.title.string or ""
-    desc  = soup.find('meta', {'name':'description'})
-    return title, (desc['content'] if desc else "")
 def fetch_transcript(video_id: str):
     try:

 import requests
 from bs4 import BeautifulSoup
+import requests, time
 from urllib.parse import urlparse, parse_qs
 from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
     qs = parse_qs(urlparse(url).query)
     return qs.get('v', [urlparse(url).path.split('/')[-1]])[0]
+def scrape_metadata(url):
+    """Fetch a page and return its ``(title, description)`` pair.
+
+    A browser-like User-Agent is sent with the request. When the server
+    answers HTTP 429 the request is retried (three attempts in total) with
+    exponential backoff plus random jitter between attempts.
+
+    Args:
+        url: Address of the page to scrape.
+
+    Returns:
+        A ``(title, description)`` tuple of strings. Either element is ""
+        when the page has no usable ``<title>`` text or no
+        ``<meta name="description">`` content.
+
+    Raises:
+        requests.HTTPError: On a 4xx/5xx status other than a retried 429,
+            or when the final attempt is still rate limited.
+        requests.RequestException: On connection errors or timeouts.
+    """
+    # Local import: the module header imports ``time`` but not ``random``,
+    # so the jitter below would otherwise raise NameError on the first 429.
+    import random
+
+    headers = {
+        "User-Agent": (
+            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/115.0.0.0 Safari/537.36"
+        )
+    }
+    max_attempts = 3
+    for attempt in range(max_attempts):
+        # A timeout keeps a stalled connection from hanging the caller.
+        r = requests.get(url, headers=headers, timeout=10)
+        if r.status_code == 200:
+            break
+        if r.status_code != 429:
+            r.raise_for_status()
+        elif attempt < max_attempts - 1:
+            # Exponential backoff with jitter; skipped after the last
+            # attempt because no further request follows it.
+            time.sleep((2 ** attempt) + random.random())
+    # Surfaces the error if every attempt was rate limited.
+    r.raise_for_status()
+    soup = BeautifulSoup(r.text, "html.parser")
+    title_tag = soup.title
+    desc_tag = soup.find("meta", {"name": "description"})
+    # ``.string`` is None for an empty <title> or one with nested markup.
+    title = (title_tag.string or "") if title_tag is not None else ""
+    # ``.get`` tolerates a description tag that lacks a ``content`` attribute.
+    description = desc_tag.get("content", "") if desc_tag is not None else ""
+    return title, description
 def fetch_transcript(video_id: str):
     try: