|
import gradio as gr |
|
import urllib.request |
|
from bs4 import BeautifulSoup |
|
from duckduckgo_search import DDGS |
|
import re |
|
import markdown |
|
|
|
|
|
def fetch_metadata(url, timeout=10):
    """Return a markdown citation ``[title](url) (site name)`` for *url*.

    The page is downloaded with browser-like request headers and parsed for
    its ``og:title`` and ``og:site_name`` meta tags. The ``<title>`` element
    is used when ``og:title`` is missing or empty; the placeholders
    ``'No title available'`` and ``'Unknown Source'`` are used when nothing
    usable is found.

    Args:
        url: Address of the page to describe.
        timeout: Seconds to wait on the connection before giving up, so one
            unresponsive server cannot stall the whole batch.

    Returns:
        The markdown string, or the result of
        ``duckduckgo_search_fallback(url)`` when the page cannot be
        downloaded or parsed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    }

    def normalise(text):
        # Collapse newlines and repeated whitespace: the citation must stay on
        # a single line to be a valid markdown link.
        return ' '.join(text.split()) if text else ''

    try:
        # Building the Request already validates the URL and raises ValueError
        # for malformed input, so it belongs inside the try block too.
        request = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            html = response.read()

        soup = BeautifulSoup(html, 'html.parser')

        def meta_content(prop):
            # A <meta> tag may exist without a content attribute.
            tag = soup.find('meta', property=prop)
            return normalise(tag.get('content')) if tag else ''

        # get_text() never returns None, unlike .string on a <title> that is
        # empty or contains nested markup.
        page_title = normalise(soup.title.get_text()) if soup.title else ''
        headline = meta_content('og:title') or page_title or 'No title available'
        site_name = meta_content('og:site_name') or 'Unknown Source'

        return f"[{headline}]({url}) ({site_name})"
    except Exception:
        # Deliberately broad: any download or parse problem degrades to the
        # search-based lookup instead of failing the request.
        return duckduckgo_search_fallback(url)
|
|
|
|
|
# Hostname -> display name for the outlets the fallback recognises.
# The Guardian is served from theguardian.com (the previous substring test on
# "guardian.com" only matched it by accident).
_KNOWN_SOURCES = (
    ("reuters.com", "Reuters"),
    ("theguardian.com", "The Guardian"),
    ("apnews.com", "AP News"),
)


def duckduckgo_search_fallback(url):
    """Build a markdown citation for *url* from the top DuckDuckGo text result.

    Args:
        url: The address to search for.

    Returns:
        ``[title](href) (source)`` built from the first search result, where
        *source* is a known outlet name or ``"Unknown Source"``;
        ``"No search results found for <url>"`` when the search is empty; or
        ``"Failed to fetch data for <url>: <error>"`` when the search raises.
        The function itself never raises.
    """
    # Local import: the file only imports urllib.request at module level.
    from urllib.parse import urlparse

    try:
        results = DDGS().text(url, max_results=1)
        if not results:
            return f"No search results found for {url}"

        top_result = results[0]
        result_title = top_result.get('title') or 'No title found'
        # Link back to the queried URL rather than emitting a broken link when
        # the result carries no href.
        result_url = top_result.get('href') or url

        # Match on the hostname only, so an outlet's domain appearing in a
        # path or query string of an unrelated site is not mislabelled.
        host = (urlparse(result_url).hostname or '').lower()
        result_source = "Unknown Source"
        for domain, display_name in _KNOWN_SOURCES:
            if host == domain or host.endswith('.' + domain):
                result_source = display_name
                break

        return f"[{result_title}]({result_url}) ({result_source})"
    except Exception as e:
        # Best-effort lookup: report the problem as text so callers can keep
        # processing the remaining URLs.
        return f"Failed to fetch data for {url}: {str(e)}"
|
|
|
|
|
# Matches one citation of the form "[headline](url) (source)".
# Compiled once at import time instead of on every call.
_CITATION_RE = re.compile(r'\[.*?\]\(.*?\) \([^\)]+\)')


def trimming_chat_answer(text):
    """Extract the first ``[headline](url) (source)`` citation from *text*.

    Args:
        text: Free-form chat answer that may wrap the citation in extra prose.

    Returns:
        The matched citation, or the sentinel string ``"No match found"`` when
        *text* contains no citation or is not a string at all.
    """
    # A missing or non-string answer is treated like an answer without a
    # citation rather than raising TypeError inside the regex engine.
    if not isinstance(text, str):
        return "No match found"

    match = _CITATION_RE.search(text)
    return match.group(0) if match else "No match found"
|
|
|
|
|
# Prefixes of the plain-text error messages produced by the search fallback.
_FAILURE_PREFIXES = ("Failed to fetch data", "No search results")

# Instructions sent to the chat model; ``{raw_markdown}`` is filled per URL.
_HEADLINE_PROMPT = """Please rewrite the following markdown string so the news headline is capitalized as a sentence, only proper nouns or names should have capital initials. Also check the correct capitalization of the source name (the Guardian -> The Guardian).
Then, please check if the source name (surrounded by round parenthesis) is repeated inside the headline (surrounded by square brackets); if it is repeated, please remove the source name mention from the headline keeping the URL and the source name in parentheses outside the headline.
Please answer only with one line of the markdown output.
Example input = [Montana Is a Frontier for Deep Carbon Storage, Mr. António Guterres from the United Nations Claims for Urgent Action - Inside Climate News](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)
Example output = [Montana is a frontier for deep carbon storage, Mr. António Guterres from the United Nations claims for urgent action](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)

Input:
{raw_markdown}
"""


def _is_failure_message(text):
    """Return True when *text* is a fallback error message, not a citation."""
    # Real citations start with "[", so a prefix test cannot misfire on a
    # headline that merely contains one of these phrases.
    return text.startswith(_FAILURE_PREFIXES)


def process_urls(urls):
    """Turn each URL into a cleaned ``[headline](url) (source)`` citation.

    For every non-blank entry the page metadata is fetched, retried through
    the search fallback when that failed, and then passed to the chat model
    to normalise capitalisation and remove a repeated source name. The raw
    citation is kept whenever the chat call fails or its answer contains no
    citation.

    Args:
        urls: Iterable of URL strings; blank or whitespace-only entries are
            skipped.

    Returns:
        A list with one string per processed URL: the citation, or the
        fallback's error message when no metadata could be obtained.
    """
    results = []
    for url in urls:
        url = url.strip()
        if not url:
            # Blank lines in pasted input are not URLs.
            continue

        raw_markdown = fetch_metadata(url)

        # One more attempt through the search fallback if the first pass
        # ended in an error message.
        if _is_failure_message(raw_markdown):
            raw_markdown = duckduckgo_search_fallback(url)

        if _is_failure_message(raw_markdown):
            # Never send an error message to the chat model: its answer could
            # be mistaken for a genuine headline.
            results.append(raw_markdown)
            continue

        try:
            chat_answer = DDGS().chat(
                _HEADLINE_PROMPT.format(raw_markdown=raw_markdown),
                model='claude-3-haiku',
            )
        except Exception:
            # The clean-up step is optional; keep the raw citation on failure.
            chat_answer = raw_markdown

        clean_markdown = trimming_chat_answer(chat_answer)
        if clean_markdown == "No match found":
            clean_markdown = raw_markdown

        results.append(clean_markdown)

    return results
|
|
|
|
|
def gradio_interface(input_text):
    """Gradio callback: convert newline-separated URLs into HTML citations.

    Args:
        input_text: Contents of the textarea, one URL per line. ``None``,
            empty input, blank lines and surrounding whitespace (including
            ``\\r`` from Windows line endings) are tolerated.

    Returns:
        HTML rendered from the markdown citations joined with ``<br>``, or an
        empty string when the input contains no URLs.
    """
    lines = (input_text or "").splitlines()
    urls = [line.strip() for line in lines if line.strip()]
    if not urls:
        # Nothing to look up; avoid processing an empty string as a URL.
        return ""

    results = process_urls(urls)

    # NOTE(review): headlines come from scraped pages and are rendered as HTML
    # without escaping; confirm whether they should be sanitised here.
    return markdown.markdown("<br>".join(results))
|
|
|
|
|
# Web UI: a textarea of URLs in, rendered HTML citations out.
# The emoji in the title and description are written as real UTF-8 characters;
# they had been corrupted into mis-decoded CJK text.
iface = gr.Interface(
    fn=gradio_interface,
    inputs="textarea",
    outputs="html",
    title="News headline and title scraper - Melty 2.0 🔭",
    description="""Enter URLs separated by line breaks to fetch metadata and format it into markdown.\n\n
\t\t
👀 Example input:\n
\t\thttps://www.example1.com\n
\t\thttps://www.example2.org\n\n
🎯 Example output:\n
\t\t[Headline 1](https://www.example1.com) (Source)\n
\t\t[Headline 2](https://www.example2.org) (Source)"""
)

# Start the app as soon as the module is executed.
iface.launch()
|
|