Spaces:

nelsonjq
/

meltwater-remove-duplicate-sources

Sleeping

App Files Files Community

nelsonjq committed on Jul 28, 2024

Commit

12ee044

verified ·

1 Parent(s): f7e9cb9

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -46

app.py CHANGED Viewed

@@ -1,53 +1,131 @@
 import gradio as gr
 import re
-import markdown  # Import markdown library to convert markdown to HTML
-#def html_to_markdown(html_text):
-    # Convert HTML to Markdown
-#   return html2text.html2text(html_text)
-def process_markdown(text):
-    # Regex patterns for each case
-    patterns = [
-        re.compile(r'\[([^\]]+?) - ([^\(]+)\]\((http://[^\)]+?)\) \(([^)]+)\)'),
-        re.compile(r'\[([^\]]+?) \| ([^\(]+)\]\((http://[^\)]+?)\) \(([^)]+)\)'),
-        re.compile(r'\[([^\]]+?) \(([^\)]+)\)\]\((http://[^\)]+?)\) \(([^)]+)\)')
-    ]
-    def process_line(match):
-        title = match.group(1)  # The title part inside the square brackets
-        url = match.group(3)    # The URL inside the parentheses
-        second_mention_source = match.group(4) # The source name in parentheses
-        # Convert apostrophes to HTML entity
-        title = title.replace("'", "’")
-        # Return the formatted Markdown line
-        return f'[{title}]({url}) ({second_mention_source})'
-    # Process each pattern
-    for pattern in patterns:
-        text = pattern.sub(process_line, text)
-    return text
-def convert_and_process(html_text):
-    # Convert HTML to Markdown
-    #markdown_text = html_to_markdown(html_text)
-    # Process the Markdown text
-    processed_markdown = process_markdown(html_text)
-    # Convert the processed Markdown back to HTML
-    html_output = markdown.markdown(processed_markdown)
-    return html_output
 # Create the Gradio interface
 iface = gr.Interface(
-    fn=convert_and_process,
-    inputs=gr.Textbox(label="Paste your markdown text here ⤵️"),
-    outputs=gr.HTML(label="Processed HTML Output"),  # Output as HTML
-    title="Removing duplicated sources from Meltwater headlines",
-    description="1. Convert your text copied from Meltwater (with hyperlinks) into Markdown here: https://euangoddard.github.io/clipboard2markdown/\n\n 2. Paste your markdown text here to clean up the double source mentions, and submit to display the output as HTML."
 )
-# Launch the interface
-iface.launch(share=True)

 import gradio as gr
+import urllib.request
+from bs4 import BeautifulSoup
+from duckduckgo_search import DDGS
 import re
+import markdown  # Import markdown library to convert Markdown to HTML
+# Function to fetch and parse HTML content using urllib
+def fetch_metadata(url):
+    req = urllib.request.Request(url)
+    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
+    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
+    req.add_header('Accept-Language', 'en-US,en;q=0.5')
+    try:
+        with urllib.request.urlopen(req) as response:
+            html = response.read()
+            soup = BeautifulSoup(html, 'html.parser')
+            # Extracting title tag
+            title_tag = soup.title.string if soup.title else 'No title available'
+            # Extracting og:site_name meta tag
+            site_name = soup.find('meta', property='og:site_name')
+            site_name = site_name['content'] if site_name else 'Unknown Source'
+            # Extracting og:title meta tag
+            og_title = soup.find('meta', property='og:title')
+            og_title = og_title['content'] if og_title else title_tag
+            # Formatting the result
+            markdown_text = f"[{og_title}]({url}) ({site_name})"
+            return markdown_text
+    except Exception as e:
+        return duckduckgo_search_fallback(url)
+# Function to perform DuckDuckGo search as a fallback
+def duckduckgo_search_fallback(url):
+    try:
+        # Use the library to perform the search
+        results = DDGS().text(url, max_results=1)
+        if results:
+            top_result = results[0]
+            result_title = top_result.get('title', 'No title found')
+            result_url = top_result.get('href', 'No URL found')
+            result_source = "Unknown Source"  # Default fallback
+            # Attempt to infer the source from the URL
+            if "reuters.com" in result_url:
+                result_source = "Reuters"
+            elif "guardian.com" in result_url:
+                result_source = "The Guardian"
+            elif "apnews.com" in result_url:
+                result_source = "AP News"
+            markdown_text = f"[{result_title}]({result_url}) ({result_source})"
+            return markdown_text
+        else:
+            return f"No search results found for {url}"
+    except Exception as e:
+        return f"Failed to fetch data for {url}: {str(e)}"
+# Function to trim and format the output
+def trimming_chat_answer(text):
+    pattern = r'\[.*?\]\(.*?\) \([^\)]+\)'
+    match = re.search(pattern, text)
+    return match.group(0) if match else "No match found"
+# Integrated function to process URLs and format the output
+def process_urls(urls):
+    results = []
+    for url in urls:
+        # Try to fetch metadata directly from the URL
+        raw_markdown = fetch_metadata(url)
+        # If raw_markdown is a fallback message, try DuckDuckGo search
+        if "Failed to fetch data" in raw_markdown or "No search results" in raw_markdown:
+            raw_markdown = duckduckgo_search_fallback(url)
+        # Use the DuckDuckGo chat to format the output, but fallback to raw_markdown if unsuccessful
+        try:
+            ddgschat = DDGS().chat(f"""Please rewrite the following markdown string so the news headline is capitalized as a sentence, only proper nouns or names should have capital initials. Also check the correct capitalization of the source name (the Guardian -> The Guardian).
+            Then, please check if the source name (surrounded by round parenthesis) is repeated inside the headline (surrounded by square brackets); if it is repeated, please remove the source name mention from the headline keeping the URL and the source name in parentheses outside the headline.
+            Please answer only with one line of the markdown output.
+            Example input = [Montana Is a Frontier for Deep Carbon Storage, Mr. António Guterres from the United Nations Claims for Urgent Action - Inside Climate News](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)
+            Example output = [Montana is a frontier for deep carbon storage, Mr. António Guterres from the United Nations claims for urgent action](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)
+            Input:
+            {raw_markdown}
+            """, model='claude-3-haiku')
+        except Exception as e:
+            ddgschat = raw_markdown  # If there's an error with the chat API, use raw_markdown
+        # Trim the result using the chat output or fallback to raw_markdown
+        clean_markdown = trimming_chat_answer(ddgschat)
+        if clean_markdown == "No match found":
+            clean_markdown = raw_markdown
+        results.append(clean_markdown)
+    return results
+# Gradio app function
+def gradio_interface(input_text):
+    # Split the input text by line breaks to get URLs
+    urls = input_text.strip().split('\n')
+    # Process the URLs and get the results
+    results = process_urls(urls)
+    # Join the results with line breaks for display
+    html_output = "<br>".join(results)
+    return markdown.markdown(html_output)
 # Create the Gradio interface
+# gradio_interface receives the textarea content and its return value is
+# shown through the "html" output component.
 iface = gr.Interface(
+    fn=gradio_interface,
+    inputs="textarea",
+    outputs="html",
+    title="News headline and title scraper - Melty 2.0 🔭",
+    description="""Enter URLs separated by line breaks to fetch metadata and format it into markdown.\n\n
+    \t\t
+    👀 Example input:\n
+    \t\thttps://www.example1.com\n
+    \t\thttps://www.example2.org\n\n
+    🎯 Example output:\n
+    \t\t[Headline 1](https://www.example1.com) (Source)\n
+    \t\t[Headline 2](https://www.example2.org) (Source)"""
 )
+# Launch the Gradio app
+iface.launch()