Spaces:

nuseAI
/

fastAPIv2

Sleeping

App Files Community

ragV98 committed on Jul 19

Commit

c72e054

1 Parent(s): 76187cf

undid the changes

Browse files

Files changed (1) hide show

components/fetchers/scraper.py +17 -61

components/fetchers/scraper.py CHANGED Viewed

@@ -12,68 +12,36 @@ HEADERS = {
     )
 }
 def clean_text(text: str) -> str:
-    """Remove HTML tags and collapse whitespace."""
     soup = BeautifulSoup(text, "html.parser")
     cleaned = soup.get_text(separator=" ", strip=True)
-    return " ".join(cleaned.split())
 def is_low_quality(text: str) -> bool:
-    """Heuristic to detect low-value content like navbars, footers, etc."""
-    if not text or len(text.split()) < 50:
         return True
     junk_markers = [
-        "subscribe", "click here", "latest headlines", "more from",
-        "privacy policy", "video", "terms of service", "back to top",
-        "all rights reserved", "advertisement", "read more", "sign in"
     ]
     return any(marker in text.lower() for marker in junk_markers)
-def fallback_html_extract(html: str) -> Optional[str]:
-    """Very basic content extractor as a last resort."""
-    try:
-        soup = BeautifulSoup(html, "html.parser")
-        paragraphs = soup.find_all("p")
-        text = " ".join(p.get_text(strip=True) for p in paragraphs)
-        cleaned = clean_text(text)
-        return cleaned if len(cleaned.split()) >= 50 else None
-    except Exception as e:
-        print(f"⚠️ Fallback extract failed: {e}")
-        return None
 def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
-    """Extract meaningful text from a given URL using multiple methods."""
     try:
         response = requests.get(url, timeout=timeout, headers=HEADERS)
-        if response.status_code != 200:
-            print(f"⚠️ Bad status ({response.status_code}) for {url}")
-            return None
-        html = response.text
-        # Attempt trafilatura
-        extracted = trafilatura.extract(
-            html,
-            include_comments=False,
-            include_tables=False,
-            no_fallback=False
-        )
-        if extracted:
-            text = clean_text(extracted)
-            if not is_low_quality(text):
-                return text
-            else:
-                print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
-        else:
-            print(f"⚠️ Trafilatura extraction failed or empty: {url}")
     except Exception as e:
         print(f"⚠️ Trafilatura failed for {url}: {e}")
@@ -88,19 +56,7 @@ def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
                 return text
             else:
                 print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
-        else:
-            print(f"⚠️ Newspaper3k extracted no text: {url}")
     except Exception as e:
         print(f"⚠️ Newspaper3k failed for {url}: {e}")
-    # Final fallback to basic HTML parsing
-    try:
-        if html:
-            fallback = fallback_html_extract(html)
-            if fallback:
-                print(f"✅ Used fallback extractor for: {url}")
-                return fallback
-    except Exception as e:
-        print(f"⚠️ Final fallback failed for {url}: {e}")
     return None

     )
 }
 def clean_text(text: str) -> str:
+    # Remove HTML tags, collapse whitespace
     soup = BeautifulSoup(text, "html.parser")
     cleaned = soup.get_text(separator=" ", strip=True)
+    cleaned = " ".join(cleaned.split())
+    return cleaned
 def is_low_quality(text: str) -> bool:
+    """Detect navigation garbage, footers, or low-word-count dumps."""
+    if not text or len(text.split()) < 120:
         return True
     junk_markers = [
+        "subscribe", "click here", "latest headlines", "more from", "privacy policy",
+        "video", "terms of service", "back to top", "all rights reserved"
     ]
     return any(marker in text.lower() for marker in junk_markers)
 def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
+    # Try Trafilatura first
     try:
         response = requests.get(url, timeout=timeout, headers=HEADERS)
+        if response.status_code == 200:
+            html = response.text
+            extracted = trafilatura.extract(html, include_comments=False, include_tables=False)
+            if extracted:
+                text = clean_text(extracted)
+                if not is_low_quality(text):
+                    return text
+                else:
+                    print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
     except Exception as e:
         print(f"⚠️ Trafilatura failed for {url}: {e}")
                 return text
             else:
                 print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
     except Exception as e:
         print(f"⚠️ Newspaper3k failed for {url}: {e}")
     return None