Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,59 @@
|
|
1 |
import gradio as gr
|
2 |
import urllib.request
|
|
|
3 |
from bs4 import BeautifulSoup
|
4 |
from duckduckgo_search import DDGS
|
5 |
import re
|
6 |
import markdown # Import markdown library to convert Markdown to HTML
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
# Function to fetch and parse HTML content using urllib
|
9 |
def fetch_metadata(url):
|
10 |
req = urllib.request.Request(url)
|
@@ -27,6 +76,19 @@ def fetch_metadata(url):
|
|
27 |
# Extracting og:title meta tag
|
28 |
og_title = soup.find('meta', property='og:title')
|
29 |
og_title = og_title['content'] if og_title else title_tag
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
# Formatting the result
|
32 |
markdown_text = f"[{og_title}]({url}) ({site_name})"
|
@@ -53,7 +115,15 @@ def duckduckgo_search_fallback(url):
|
|
53 |
result_source = "The Guardian"
|
54 |
elif "apnews.com" in result_url:
|
55 |
result_source = "AP News"
|
|
|
|
|
56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
markdown_text = f"[{result_title}]({result_url}) ({result_source})"
|
58 |
return markdown_text
|
59 |
else:
|
@@ -80,7 +150,7 @@ def process_urls(urls):
|
|
80 |
|
81 |
# Use the DuckDuckGo chat to format the output, but fallback to raw_markdown if unsuccessful
|
82 |
try:
|
83 |
-
ddgschat = DDGS().chat(f"""Please rewrite the following markdown string so the news headline is capitalized as a sentence, only proper nouns or names should have capital initials. Also
|
84 |
Then, please check if the source name (surrounded by round parenthesis) is repeated inside the headline (surrounded by square brackets); if it is repeated, please remove the source name mention from the headline keeping the URL and the source name in parentheses outside the headline.
|
85 |
Please answer only with one line of the markdown output.
|
86 |
Example input = [Montana Is a Frontier for Deep Carbon Storage, Mr. Ant贸nio Guterres from the United Nations Claims for Urgent Action - Inside Climate News](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)
|
|
|
1 |
import gradio as gr
|
2 |
import urllib.request
|
3 |
+
from urllib.parse import urlparse
|
4 |
from bs4 import BeautifulSoup
|
5 |
from duckduckgo_search import DDGS
|
6 |
import re
|
7 |
import markdown # Import markdown library to convert Markdown to HTML
|
8 |
|
9 |
+
|
10 |
+
# Function to correct custom source names
|
11 |
+
def correct_source_names(site_name):
    """Map a raw site or domain name to its preferred display name.

    The lookup is case-insensitive. An alias matches when it equals the
    whole name, or when it appears in the name as a whole word (delimited
    by non-word characters or the string edges). Matching on whole words
    keeps short aliases such as "ap" or "dw" from firing inside unrelated
    names like "newspaper" or "goodwill".

    Args:
        site_name: Site name (e.g. an ``og:site_name`` value) or a bare
            domain label such as ``"apnews"``.

    Returns:
        The canonical display name when an alias matches. Otherwise the
        input is returned unchanged if it already contains upper-case
        characters, or with its first letter capitalised if it is entirely
        lower-case (the typical shape of a domain label).
    """
    dict_source_names = {
        "dw": "DW",
        "dw.com": "DW",
        "ap news": "Associated Press",
        "ap": "Associated Press",
        # Domain label of apnews.com; listed explicitly because "ap" is
        # only matched as a whole word.
        "apnews": "Associated Press",
        "mongabay environmental news": "Mongabay",
        "mongabay": "Mongabay",
        "guardian": "The Guardian",
        # Domain label of theguardian.com; listed explicitly because
        # "guardian" is only matched as a whole word.
        "theguardian": "The Guardian",
        "guardian.com": "The Guardian",
        "bbc": "BBC",
        "bbc.com/news": "BBC",
    }

    # Convert the site_name to lowercase for case-insensitive comparison
    site_name_lower = site_name.lower()

    # Exact alias: cheapest and unambiguous.
    if site_name_lower in dict_source_names:
        return dict_source_names[site_name_lower]

    # Otherwise accept an alias only where it stands as a whole word.
    for key, value in dict_source_names.items():
        if re.search(rf"(?<!\w){re.escape(key)}(?!\w)", site_name_lower):
            return value

    # No alias matched. A name that already carries its own capitalisation
    # is kept as-is; str.capitalize() would lower-case every character
    # after the first and mangle multi-word or acronym names.
    if site_name != site_name_lower:
        return site_name
    return site_name.capitalize()
|
35 |
+
|
36 |
+
# Function to infer the source name from the URL
|
37 |
+
def infer_source_from_url(url):
    """Derive a human-readable source name from a URL's host name.

    The host is taken from the URL (port and credentials excluded), a
    leading ``www``/``m``/``mobile`` label is dropped, and the registrable
    label is selected and passed through ``correct_source_names``.

    Args:
        url: Absolute URL of the article.

    Returns:
        The corrected source name, or ``"Unknown Source"`` when the URL
        cannot be parsed or carries no host (e.g. it has no scheme).
    """
    try:
        # .hostname (unlike .netloc) excludes any "user:pass@" prefix and
        # ":port" suffix, and is None when the URL has no network location.
        host = urlparse(url).hostname
        # A fully-qualified host may end with a dot ("example.com.").
        host = host.rstrip(".") if host else ""
    except (AttributeError, TypeError, ValueError):
        return "Unknown Source"

    if not host:
        return "Unknown Source"

    # Remove known subdomains like 'www', 'm', 'mobile'
    domain_parts = host.split(".")
    if len(domain_parts) > 2 and domain_parts[0] in ("www", "m", "mobile"):
        domain_parts = domain_parts[1:]

    # Heuristic for country-code domains such as "bbc.co.uk" or
    # "abc.net.au": when a two-letter TLD follows a generic second-level
    # label, the site name is the label before that pair.
    generic_second_level = ("co", "com", "org", "net", "gov", "ac", "edu")
    if (
        len(domain_parts) >= 3
        and len(domain_parts[-1]) == 2
        and domain_parts[-2] in generic_second_level
    ):
        primary_domain = domain_parts[-3]
    elif len(domain_parts) > 1:
        # Focus on the second-to-last part for the main domain
        primary_domain = domain_parts[-2]
    else:
        primary_domain = domain_parts[0]

    # Correct source names
    return correct_source_names(primary_domain)
|
55 |
+
|
56 |
+
|
57 |
# Function to fetch and parse HTML content using urllib
|
58 |
def fetch_metadata(url):
|
59 |
req = urllib.request.Request(url)
|
|
|
76 |
# Extracting og:title meta tag
|
77 |
og_title = soup.find('meta', property='og:title')
|
78 |
og_title = og_title['content'] if og_title else title_tag
|
79 |
+
|
80 |
+
# Attempt to infer the source from the URL
|
81 |
+
if site_name == "Unknown Source":
|
82 |
+
if "reuters.com" in url:
|
83 |
+
site_name = "Reuters"
|
84 |
+
elif "guardian.com" in url:
|
85 |
+
site_name = "The Guardian"
|
86 |
+
elif "apnews.com" in url:
|
87 |
+
site_name = "Associated Press"
|
88 |
+
elif "bbc.com/news" in url:
|
89 |
+
site_name = "BBC"
|
90 |
+
else:
|
91 |
+
site_name = infer_source_from_url(url)
|
92 |
|
93 |
# Formatting the result
|
94 |
markdown_text = f"[{og_title}]({url}) ({site_name})"
|
|
|
115 |
result_source = "The Guardian"
|
116 |
elif "apnews.com" in result_url:
|
117 |
result_source = "AP News"
|
118 |
+
elif "bbc.com/news" in result_url:
|
119 |
+
result_source = "BBC"
|
120 |
|
121 |
+
else:
|
122 |
+
result_source = infer_source_from_url(result_url)
|
123 |
+
|
124 |
+
# Correct source name with custom dictionary
|
125 |
+
result_source = correct_source_names(result_source)
|
126 |
+
|
127 |
markdown_text = f"[{result_title}]({result_url}) ({result_source})"
|
128 |
return markdown_text
|
129 |
else:
|
|
|
150 |
|
151 |
# Use the DuckDuckGo chat to format the output, but fallback to raw_markdown if unsuccessful
|
152 |
try:
|
153 |
+
ddgschat = DDGS().chat(f"""Please rewrite the following markdown string so the news headline is capitalized as a sentence, only proper nouns or names should have capital initials. Also correct capitalization of the source name if necessary.
|
154 |
Then, please check if the source name (surrounded by round parenthesis) is repeated inside the headline (surrounded by square brackets); if it is repeated, please remove the source name mention from the headline keeping the URL and the source name in parentheses outside the headline.
|
155 |
Please answer only with one line of the markdown output.
|
156 |
Example input = [Montana Is a Frontier for Deep Carbon Storage, Mr. Ant贸nio Guterres from the United Nations Claims for Urgent Action - Inside Climate News](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)
|