|
import gradio as gr |
|
import urllib.request |
|
from urllib.parse import urlparse |
|
from bs4 import BeautifulSoup |
|
from duckduckgo_search import DDGS |
|
import re |
|
import markdown |
|
|
|
|
|
|
|
def correct_source_names(site_name):
    """Normalize a news-source name to its canonical display form.

    Looks the lowercased name up against a table of known aliases;
    names with no alias are returned with each word capitalized.

    Args:
        site_name: Raw source name, e.g. "bbc", "ap news", "theguardian".

    Returns:
        Canonical source name, e.g. "BBC", "Associated Press".
    """
    dict_source_names = {
        "dw": "DW",
        "dw.com": "DW",
        "ap news": "Associated Press",
        "ap": "Associated Press",
        "mongabay environmental news": "Mongabay",
        "mongabay": "Mongabay",
        "guardian": "The Guardian",
        "guardian.com": "The Guardian",
        "bbc": "BBC",
        "bbc.com/news": "BBC",
    }

    site_name_lower = site_name.lower()

    # Exact alias hit first.
    if site_name_lower in dict_source_names:
        return dict_source_names[site_name_lower]

    for key, value in dict_source_names.items():
        if len(key) <= 3:
            # Short aliases ("ap", "dw", "bbc") need word boundaries:
            # a plain substring test made "ap" match inside "japan times".
            if re.search(r'\b' + re.escape(key) + r'\b', site_name_lower):
                return value
        elif key in site_name_lower:
            # Longer aliases may appear embedded, e.g. "guardian"
            # inside the domain label "theguardian".
            return value

    # Title-case each word ("inside climate news" -> "Inside Climate News");
    # str.capitalize() would lowercase everything after the first letter.
    return site_name.title()
|
|
|
|
|
def infer_source_from_url(url):
    """Guess a human-readable source name from a URL's domain.

    Strips a leading "www"/"m"/"mobile" label, takes the second-level
    domain (e.g. "bbc" from "www.bbc.com") and normalizes it through
    correct_source_names().

    Args:
        url: Full article URL.

    Returns:
        Normalized source name, or "Unknown Source" when the URL
        cannot be parsed.
    """
    try:
        domain = urlparse(url).netloc

        # Drop a common subdomain prefix so the registrable domain is
        # what we inspect.  (Removed a leftover debug print of `domain`.)
        domain_parts = domain.split('.')
        if len(domain_parts) > 2 and domain_parts[0] in ('www', 'm', 'mobile'):
            domain_parts.pop(0)

        # Second-level domain, e.g. "bbc" in "bbc.com"; a bare host
        # falls back to its single label.
        primary_domain = domain_parts[-2] if len(domain_parts) > 1 else domain_parts[0]

        return correct_source_names(primary_domain)
    except Exception:
        # Best-effort helper: any parsing failure degrades to a placeholder.
        return "Unknown Source"
|
|
|
|
|
|
|
def fetch_metadata(url):
    """Fetch a page and build a markdown link: "[title](url) (Source)".

    Sends a browser-like request (some news sites reject the default
    Python user agent), reads the Open Graph title/site-name meta tags,
    and normalizes the source name.  On any failure the DuckDuckGo
    search fallback is used instead.

    Args:
        url: Article URL to scrape.

    Returns:
        One-line markdown string, or the fallback's result on error.
    """
    req = urllib.request.Request(url)
    # Mimic desktop Firefox; several outlets return 403 to urllib's default UA.
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0')
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8')
    req.add_header('Accept-Language', 'en-US,en;q=0.5')

    try:
        with urllib.request.urlopen(req) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')

        # <title> can exist but be empty, making .string None — guard both.
        title_tag = (soup.title.string if soup.title and soup.title.string
                     else 'No title available')

        # Use .get() so a meta tag without a content attribute does not
        # raise KeyError (which would needlessly trigger the web-search
        # fallback via the broad except below).
        site_name_tag = soup.find('meta', property='og:site_name')
        site_name = (site_name_tag.get('content') if site_name_tag else None) or 'Unknown Source'

        og_title_tag = soup.find('meta', property='og:title')
        og_title = (og_title_tag.get('content') if og_title_tag else None) or title_tag

        # Known outlets that omit og:site_name; anything else is
        # inferred from the URL's domain.
        if site_name == "Unknown Source":
            if "reuters.com" in url:
                site_name = "Reuters"
            elif "guardian.com" in url:
                site_name = "The Guardian"
            elif "apnews.com" in url:
                site_name = "Associated Press"
            elif "bbc.com/news" in url:
                site_name = "BBC"
            else:
                site_name = infer_source_from_url(url)

        site_name = correct_source_names(site_name)

        return f"[{og_title}]({url}) ({site_name})"
    except Exception:
        # Network or parse failure: try a web search for the same URL.
        return duckduckgo_search_fallback(url)
|
|
|
|
|
def duckduckgo_search_fallback(url):
    """Recover metadata for a URL via a DuckDuckGo text search.

    Used when direct scraping fails (paywall, 403, network error):
    searches for the URL itself, takes the top hit, and formats it as
    "[title](url) (Source)".

    Args:
        url: Original article URL that could not be scraped.

    Returns:
        Markdown string for the top result, a "No search results..."
        notice, or a "Failed to fetch data..." error message.
    """
    try:
        results = DDGS().text(url, max_results=1)

        if not results:
            return f"No search results found for {url}"

        top_result = results[0]
        result_title = top_result.get('title', 'No title found')
        result_url = top_result.get('href', 'No URL found')

        # Same well-known-outlet shortcuts as fetch_metadata; anything
        # else is inferred from the result's domain.
        if "reuters.com" in result_url:
            result_source = "Reuters"
        elif "guardian.com" in result_url:
            result_source = "The Guardian"
        elif "apnews.com" in result_url:
            # "Associated Press" for consistency with fetch_metadata
            # (correct_source_names normalizes either spelling).
            result_source = "Associated Press"
        elif "bbc.com/news" in result_url:
            result_source = "BBC"
        else:
            result_source = infer_source_from_url(result_url)

        result_source = correct_source_names(result_source)

        return f"[{result_title}]({result_url}) ({result_source})"
    except Exception as e:
        return f"Failed to fetch data for {url}: {str(e)}"
|
|
|
|
|
def trimming_chat_answer(text):
    """Extract the first "[title](url) (Source)" markdown line from *text*.

    The chat model sometimes wraps its answer in extra prose; this pulls
    out only the markdown citation.  Returns "No match found" when the
    pattern is absent.
    """
    found = re.search(r'\[.*?\]\(.*?\) \([^\)]+\)', text)
    if found is None:
        return "No match found"
    return found.group(0)
|
|
|
|
|
def process_urls(urls):
    """Turn a list of URLs into cleaned one-line markdown citations.

    For each URL: scrape metadata (fetch_metadata falls back to a
    DuckDuckGo search internally on failure), then ask a DuckDuckGo chat
    model to fix headline capitalization and remove a source name
    duplicated inside the headline.  The model's reply is trimmed back
    down to a single markdown line; on any model failure the raw
    markdown is kept.

    Args:
        urls: Iterable of URL strings; blank entries are skipped.

    Returns:
        List of markdown strings, one per non-empty input URL.
    """
    results = []
    for url in urls:
        url = url.strip()
        if not url:
            continue

        # fetch_metadata already invokes duckduckgo_search_fallback on
        # failure, so its result is used as-is: re-running the fallback
        # here only repeated the same failing network request.
        raw_markdown = fetch_metadata(url)

        try:
            # Let the chat model normalize capitalization and strip a
            # duplicated source-name mention from the headline.
            ddgschat = DDGS().chat(f"""Please rewrite the following markdown string so the news headline is capitalized as a sentence, only proper nouns or names should have capital initials. Also correct capitalization of the source name if necessary.

Then, please check if the source name (surrounded by round parenthesis) is repeated inside the headline (surrounded by square brackets); if it is repeated, please remove the source name mention from the headline keeping the URL and the source name in parentheses outside the headline.

Please answer only with one line of the markdown output.

Example input = [Montana Is a Frontier for Deep Carbon Storage, Mr. António Guterres from the United Nations Claims for Urgent Action - Inside Climate News](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)

Example output = [Montana is a frontier for deep carbon storage, Mr. António Guterres from the United Nations claims for urgent action](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)


Input:

{raw_markdown}

""", model='claude-3-haiku')
        except Exception:
            # Chat model unavailable: fall back to the unpolished markdown.
            ddgschat = raw_markdown

        # The model may wrap the answer in prose; keep only the markdown
        # line, or the raw version when none can be found.
        clean_markdown = trimming_chat_answer(ddgschat)
        if clean_markdown == "No match found":
            clean_markdown = raw_markdown

        results.append(clean_markdown)

    return results
|
|
|
|
|
def gradio_interface(input_text):
    """Gradio handler: newline-separated URLs in, rendered HTML out."""
    url_list = input_text.strip().split('\n')
    markdown_lines = process_urls(url_list)
    joined = "<br>".join(markdown_lines)
    return markdown.markdown(joined)
|
|
|
|
|
# Gradio UI: a textarea of newline-separated URLs in, rendered HTML
# (one markdown citation per line) out.
iface = gr.Interface(
    fn=gradio_interface,
    inputs="textarea",
    outputs="html",
    title="News headline and title scraper - Melty 2.0 🔭",
    description="""Enter URLs separated by line breaks to fetch metadata and format it into markdown.\n\n
\t\t
👀 Example input:\n
\t\thttps://www.example1.com\n
\t\thttps://www.example2.org\n\n
🎯 Example output:\n
\t\t[Headline 1](https://www.example1.com) (Source)\n
\t\t[Headline 2](https://www.example2.org) (Source)"""
)

# Start the local web server (blocks until interrupted).
iface.launch()
|
|