improvements 1
- components/fetchers/scraper.py +61 -17
- components/generators/daily_feed.py +5 -5
- pipeline/news_ingest.py +1 -1
components/fetchers/scraper.py
CHANGED
@@ -12,36 +12,68 @@ HEADERS = {
 )
 }
 
+
 def clean_text(text: str) -> str:
-    …
+    """Remove HTML tags and collapse whitespace."""
     soup = BeautifulSoup(text, "html.parser")
     cleaned = soup.get_text(separator=" ", strip=True)
-    …
+    return " ".join(cleaned.split())
+
 
 def is_low_quality(text: str) -> bool:
-    """…
-    if not text or len(text.split()) < …
+    """Heuristic to detect low-value content like navbars, footers, etc."""
+    if not text or len(text.split()) < 50:
         return True
+
     junk_markers = [
-        "subscribe", "click here", "latest headlines", "more from",
-        "video", "terms of service", "back to top",
+        "subscribe", "click here", "latest headlines", "more from",
+        "privacy policy", "video", "terms of service", "back to top",
+        "all rights reserved", "advertisement", "read more", "sign in"
     ]
+
     return any(marker in text.lower() for marker in junk_markers)
 
+
+def fallback_html_extract(html: str) -> Optional[str]:
+    """Very basic content extractor as a last resort."""
+    try:
+        soup = BeautifulSoup(html, "html.parser")
+        paragraphs = soup.find_all("p")
+        text = " ".join(p.get_text(strip=True) for p in paragraphs)
+        cleaned = clean_text(text)
+        return cleaned if len(cleaned.split()) >= 50 else None
+    except Exception as e:
+        print(f"⚠️ Fallback extract failed: {e}")
+        return None
+
+
 def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
-    …
+    """Extract meaningful text from a given URL using multiple methods."""
    try:
         response = requests.get(url, timeout=timeout, headers=HEADERS)
-        if response.status_code …
-        …
+        if response.status_code != 200:
+            print(f"⚠️ Bad status ({response.status_code}) for {url}")
+            return None
+
+        html = response.text
+
+        # Attempt trafilatura
+        extracted = trafilatura.extract(
+            html,
+            include_comments=False,
+            include_tables=False,
+            no_fallback=False
+        )
+
+        if extracted:
+            text = clean_text(extracted)
+            if not is_low_quality(text):
+                return text
+            else:
+                print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
+        else:
+            print(f"⚠️ Trafilatura extraction failed or empty: {url}")
+
     except Exception as e:
         print(f"⚠️ Trafilatura failed for {url}: {e}")
 
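Taken in isolation, the two helpers above are easy to sanity-check. A minimal sketch, assuming `bs4` is installed (the function bodies are copied from this hunk, with the marker list trimmed for brevity):

```python
from bs4 import BeautifulSoup

def clean_text(text: str) -> str:
    """Remove HTML tags and collapse whitespace."""
    soup = BeautifulSoup(text, "html.parser")
    cleaned = soup.get_text(separator=" ", strip=True)
    return " ".join(cleaned.split())

def is_low_quality(text: str) -> bool:
    """Reject short or boilerplate-heavy text (marker list trimmed here)."""
    if not text or len(text.split()) < 50:
        return True
    junk_markers = ["subscribe", "click here", "advertisement"]
    return any(marker in text.lower() for marker in junk_markers)

nav = "<nav><a href='#'>Click here</a> to subscribe</nav>"
print(clean_text(nav))                  # -> Click here to subscribe
print(is_low_quality(clean_text(nav)))  # -> True (well under the 50-word floor)
```

One caveat worth noting: a single marker anywhere in the body rejects the entire article, and phrases like "read more" or "video" show up in plenty of legitimate pieces, so the expanded list trades some recall for precision.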
@@ -56,7 +88,19 @@ def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
                 return text
             else:
                 print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
+        else:
+            print(f"⚠️ Newspaper3k extracted no text: {url}")
     except Exception as e:
         print(f"⚠️ Newspaper3k failed for {url}: {e}")
 
+    # Final fallback to basic HTML parsing
+    try:
+        if html:
+            fallback = fallback_html_extract(html)
+            if fallback:
+                print(f"✅ Used fallback extractor for: {url}")
+                return fallback
+    except Exception as e:
+        print(f"⚠️ Final fallback failed for {url}: {e}")
+
     return None
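With both hunks applied, `scrape_url` degrades through three extractors in order: trafilatura, Newspaper3k, and finally the bare `<p>`-joining fallback. A minimal caller sketch (the URL is a placeholder):

```python
# Placeholder URL; scrape_url is the function defined in this file.
from components.fetchers.scraper import scrape_url

text = scrape_url("https://example.com/some-article", timeout=10)
if text:
    print(f"Got {len(text.split())} words of article text")
else:
    print("All three extractors failed or returned low-quality text")
```

One subtlety: `html` is only bound when the initial `requests.get` succeeds, so if that request raises, the final `if html:` check hits a NameError that the surrounding `try` absorbs as "Final fallback failed". Initializing `html = None` before the first `try` would make that path explicit.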
components/generators/daily_feed.py
CHANGED
@@ -34,13 +34,13 @@ HEADERS = {
 def build_prompt(content: str, topic: str) -> str:
     base_instruction = (
         "You are Nuse’s official news summarizer — insightful, punchy, and always on point. 🧠✨\n"
-        "Your job is to scan the content below and extract the key news items. For each item, craft a crisp summary (…
-        "List each summary on a new line starting with a dash (-). This is how Nuse keeps it clean and scannable.\n"
+        "Your job is to scan the content below and extract the key news items. For each item, craft a crisp summary (15–20 words), add 1–2 fitting emojis, and make it pop.\n"
+        "List each summary on a new line starting with a dash (-) and no numbers. This is how Nuse keeps it clean and scannable.\n"
         "\n"
         "Example format:\n"
-        "- India stuns Australia in a last-ball thriller at the World Cup finals 🏏🇮🇳\n"
-        "- U.S. imposes sweeping tariffs on Chinese tech giants, rattling global markets 📉🇺🇸\n"
-        "- Ceasefire breakthrough: Netanyahu bows to pressure after week-long escalation 🔥🕊️\n"
+        "- India stuns Australia in a last-ball thriller at the World Cup finals 🏏🇮🇳\n (15–20 words)"
+        "- U.S. imposes sweeping tariffs on Chinese tech giants, rattling global markets 📉🇺🇸\n (15–20 words)"
+        "- Ceasefire breakthrough: Netanyahu bows to pressure after week-long escalation 🔥🕊️\n (15–20 words)"
         "\n"
         "Be sharp. Be brief. No fluff. No preambles. Just the summaries.\n"
         "Return only the final summary block — no extra commentary, no prompt repetition."
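This hunk only touches `base_instruction`; the rest of `build_prompt` (presumably appending the topic and scraped content) sits outside the diff. Downstream, the dash-per-line contract keeps the model output trivially parseable. A sketch, assuming the model honors the format:

```python
# Splitting the summary block the prompt asks for.
# Assumes the model returns one "- ..." line per item, as instructed.
raw_output = (
    "- India stuns Australia in a last-ball thriller at the World Cup finals 🏏🇮🇳\n"
    "- U.S. imposes sweeping tariffs on Chinese tech giants, rattling global markets 📉🇺🇸\n"
)
summaries = [line[2:] for line in raw_output.splitlines() if line.startswith("- ")]
print(summaries)  # two clean summary strings, dashes stripped
```

Note that the new `(15–20 words)` hints sit after the `\n` inside each example literal, so in the assembled prompt they prefix the next line instead of annotating their own; moving them before the `\n` would read more cleanly.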
pipeline/news_ingest.py
CHANGED
@@ -63,7 +63,7 @@ async def main():
     for query in QUERIES:
         print(f"🔍 Searching for: {query}")
         try:
-            results = fetch_google_news(query, API_KEY, CSE_ID, num_results=…
+            results = fetch_google_news(query, API_KEY, CSE_ID, num_results=30)
             print(f" ✅ Found {len(results)} links for '{query}'.")
 
             for item in results:
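The hunk cuts off at the loop body, which is presumably where the scraper above takes over. A sketch of the hand-off, with the caveat that the `"link"` key on each result item is an assumption about `fetch_google_news`'s return shape, not something this diff confirms:

```python
# Hypothetical loop body: feed each search result to the scraper.
from components.fetchers.scraper import scrape_url

results = [{"link": "https://example.com/a"}, {"link": "https://example.com/b"}]  # stand-in data
for item in results:
    text = scrape_url(item["link"])
    if text:
        print(f"✅ {item['link']}: {len(text.split())} words")
```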