Spaces:

nelsonjq
/

meltwater-remove-duplicate-sources

Sleeping

App Files Files Community

nelsonjq committed on Jul 28, 2024

Commit

6ae41c9

verified ·

1 Parent(s): ca05614

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -1

app.py CHANGED Viewed

@@ -1,10 +1,59 @@
 import gradio as gr
 import urllib.request
 from bs4 import BeautifulSoup
 from duckduckgo_search import DDGS
 import re
 import markdown  # Import markdown library to convert Markdown to HTML
 # Function to fetch and parse HTML content using urllib
 def fetch_metadata(url):
     req = urllib.request.Request(url)
@@ -27,6 +76,19 @@ def fetch_metadata(url):
             # Extracting og:title meta tag
             og_title = soup.find('meta', property='og:title')
             og_title = og_title['content'] if og_title else title_tag
             # Formatting the result
             markdown_text = f"[{og_title}]({url}) ({site_name})"
@@ -53,7 +115,15 @@ def duckduckgo_search_fallback(url):
                 result_source = "The Guardian"
             elif "apnews.com" in result_url:
                 result_source = "AP News"
             markdown_text = f"[{result_title}]({result_url}) ({result_source})"
             return markdown_text
         else:
@@ -80,7 +150,7 @@ def process_urls(urls):
         # Use the DuckDuckGo chat to format the output, but fallback to raw_markdown if unsuccessful
         try:
-            ddgschat = DDGS().chat(f"""Please rewrite the following markdown string so the news headline is capitalized as a sentence, only proper nouns or names should have capital initials. Also check the correct capitalization of the source name (the Guardian -> The Guardian).
             Then, please check if the source name (surrounded by round parenthesis) is repeated inside the headline (surrounded by square brackets); if it is repeated, please remove the source name mention from the headline keeping the URL and the source name in parentheses outside the headline.
             Please answer only with one line of the markdown output.
             Example input = [Montana Is a Frontier for Deep Carbon Storage, Mr. António Guterres from the United Nations Claims for Urgent Action - Inside Climate News](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)

 import gradio as gr
 import urllib.request
+from urllib.parse import urlparse
 from bs4 import BeautifulSoup
 from duckduckgo_search import DDGS
 import re
 import markdown  # Import markdown library to convert Markdown to HTML
+# Function to correct custom source names
def correct_source_names(site_name):
    """Return the preferred display name for a news source.

    The lookup is case-insensitive. An alias only matches when it appears in
    ``site_name`` as a whole word (not embedded in a longer run of letters or
    digits), so "AP News" and "ap" map to "Associated Press" while names that
    merely contain those letters, such as "snapchat" or "newspaper", are left
    alone.

    Args:
        site_name: Source name as scraped from a page or taken from a domain
            label, e.g. "the Guardian", "apnews" or "dw".

    Returns:
        The canonical name when an alias matches. Otherwise ``site_name``
        unchanged if it already contains an uppercase letter, or
        ``site_name.capitalize()`` if it is entirely lowercase (as bare
        domain labels such as "reuters" are).
    """
    # Order matters: the first matching alias wins, so longer/more specific
    # aliases are listed before the short abbreviation they contain.
    source_aliases = {
        "dw": "DW",
        "dw.com": "DW",
        "ap news": "Associated Press",
        # Fused domain label from apnews.com; whole-word matching on "ap"
        # alone would no longer catch it.
        "apnews": "Associated Press",
        "ap": "Associated Press",
        "mongabay environmental news": "Mongabay",
        "mongabay": "Mongabay",
        "guardian": "The Guardian",
        # Fused domain label from theguardian.com.
        "theguardian": "The Guardian",
        "guardian.com": "The Guardian",
        "bbc": "BBC",
        "bbc.com/news": "BBC",
    }
    site_name_lower = site_name.lower()
    for alias, display_name in source_aliases.items():
        # Lookarounds reject matches that sit inside a longer word, which a
        # plain substring test ("ap" in "snapchat") would accept.
        pattern = rf"(?<!\w){re.escape(alias)}(?!\w)"
        if re.search(pattern, site_name_lower):
            return display_name
    # str.capitalize() lowercases everything after the first character, which
    # would turn "NPR" into "Npr"; only apply it to all-lowercase names.
    if any(ch.isupper() for ch in site_name):
        return site_name
    return site_name.capitalize()
+# Function to infer the source name from the URL
def infer_source_from_url(url):
    """Guess a display name for a news source from the host part of a URL.

    The host is reduced to its registrable label (e.g. "reuters" for
    "www.reuters.com", "bbc" for "www.bbc.co.uk") and that label is passed
    through ``correct_source_names``.

    Args:
        url: Absolute URL of the article.

    Returns:
        The inferred source name, or "Unknown Source" when the URL has no
        host component or cannot be parsed.
    """
    # The whole body stays inside the try so the function keeps its
    # never-raises contract for malformed or non-string input.
    try:
        # .hostname (unlike .netloc) drops userinfo and port and is lowercased.
        hostname = urlparse(url).hostname
        if not hostname:
            # e.g. "" or a scheme-less "example.com/news", where everything
            # lands in the path and there is no host to inspect.
            return "Unknown Source"
        labels = hostname.split('.')
        # Remove known subdomains like 'www', 'm', 'mobile'
        if len(labels) > 2 and labels[0] in ('www', 'm', 'mobile'):
            labels.pop(0)
        # Heuristic for country-code hosts such as bbc.co.uk or abc.net.au:
        # the label before a two-letter TLD is a generic category, so the
        # site name is one label further left.
        generic_second_level = ('co', 'com', 'org', 'net', 'gov', 'ac', 'edu')
        if (
            len(labels) > 2
            and len(labels[-1]) == 2
            and labels[-2] in generic_second_level
        ):
            primary_domain = labels[-3]
        else:
            # Focus on the second-to-last part for the main domain
            primary_domain = labels[-2] if len(labels) > 1 else labels[0]
        return correct_source_names(primary_domain)
    except (ValueError, AttributeError, TypeError):
        # ValueError: malformed URL (e.g. unbalanced IPv6 brackets);
        # AttributeError/TypeError: url is not a str.
        return "Unknown Source"
 # Function to fetch and parse HTML content using urllib
 def fetch_metadata(url):
     req = urllib.request.Request(url)
             # Extracting og:title meta tag
             og_title = soup.find('meta', property='og:title')
             og_title = og_title['content'] if og_title else title_tag
+            # Attempt to infer the source from the URL
+            if site_name == "Unknown Source":
+              if "reuters.com" in url:
+                  site_name = "Reuters"
+              elif "guardian.com" in url:
+                  site_name = "The Guardian"
+              elif "apnews.com" in url:
+                  site_name = "Associated Press"
+              elif "bbc.com/news" in url:
+                  site_name = "BBC"
+              else:
+                  site_name = infer_source_from_url(url)
             # Formatting the result
             markdown_text = f"[{og_title}]({url}) ({site_name})"
                 result_source = "The Guardian"
             elif "apnews.com" in result_url:
                 result_source = "AP News"
+            elif "bbc.com/news" in result_url:
+                result_source = "BBC"
+            else:
+                result_source = infer_source_from_url(result_url)
+            # Correct source name with custom dictionary
+            result_source = correct_source_names(result_source)
             markdown_text = f"[{result_title}]({result_url}) ({result_source})"
             return markdown_text
         else:
         # Use the DuckDuckGo chat to format the output, but fallback to raw_markdown if unsuccessful
         try:
+            ddgschat = DDGS().chat(f"""Please rewrite the following markdown string so the news headline is capitalized as a sentence, only proper nouns or names should have capital initials. Also correct capitalization of the source name if necessary.
             Then, please check if the source name (surrounded by round parenthesis) is repeated inside the headline (surrounded by square brackets); if it is repeated, please remove the source name mention from the headline keeping the URL and the source name in parentheses outside the headline.
             Please answer only with one line of the markdown output.
             Example input = [Montana Is a Frontier for Deep Carbon Storage, Mr. António Guterres from the United Nations Claims for Urgent Action - Inside Climate News](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)