nelsonjq committed on
Commit
6ae41c9
verified
1 Parent(s): ca05614

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -1
app.py CHANGED
@@ -1,10 +1,59 @@
1
  import gradio as gr
2
  import urllib.request
 
3
  from bs4 import BeautifulSoup
4
  from duckduckgo_search import DDGS
5
  import re
6
  import markdown # Import markdown library to convert Markdown to HTML
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  # Function to fetch and parse HTML content using urllib
9
  def fetch_metadata(url):
10
  req = urllib.request.Request(url)
@@ -27,6 +76,19 @@ def fetch_metadata(url):
27
  # Extracting og:title meta tag
28
  og_title = soup.find('meta', property='og:title')
29
  og_title = og_title['content'] if og_title else title_tag
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  # Formatting the result
32
  markdown_text = f"[{og_title}]({url}) ({site_name})"
@@ -53,7 +115,15 @@ def duckduckgo_search_fallback(url):
53
  result_source = "The Guardian"
54
  elif "apnews.com" in result_url:
55
  result_source = "AP News"
 
 
56
 
 
 
 
 
 
 
57
  markdown_text = f"[{result_title}]({result_url}) ({result_source})"
58
  return markdown_text
59
  else:
@@ -80,7 +150,7 @@ def process_urls(urls):
80
 
81
  # Use the DuckDuckGo chat to format the output, but fallback to raw_markdown if unsuccessful
82
  try:
83
- ddgschat = DDGS().chat(f"""Please rewrite the following markdown string so the news headline is capitalized as a sentence, only proper nouns or names should have capital initials. Also check the correct capitalization of the source name (the Guardian -> The Guardian).
84
  Then, please check if the source name (surrounded by round parenthesis) is repeated inside the headline (surrounded by square brackets); if it is repeated, please remove the source name mention from the headline keeping the URL and the source name in parentheses outside the headline.
85
  Please answer only with one line of the markdown output.
86
  Example input = [Montana Is a Frontier for Deep Carbon Storage, Mr. Ant贸nio Guterres from the United Nations Claims for Urgent Action - Inside Climate News](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)
 
1
  import gradio as gr
2
  import urllib.request
3
+ from urllib.parse import urlparse
4
  from bs4 import BeautifulSoup
5
  from duckduckgo_search import DDGS
6
  import re
7
  import markdown # Import markdown library to convert Markdown to HTML
8
 
9
+
10
+ # Function to correct custom source names
11
def correct_source_names(site_name):
    """Return the preferred display form of a news source name.

    The name is compared case-insensitively against a table of known
    aliases. An alias matches only when it appears as a whole token, i.e.
    it is not directly preceded or followed by another letter or digit, so
    short aliases such as "ap" or "dw" do not fire inside unrelated words
    ("Japan Times", "goodwill").

    Args:
        site_name: Source name as a string, e.g. an ``og:site_name`` value
            or a domain label such as ``"nytimes"``.

    Returns:
        The canonical name from the alias table when an alias matches.
        Otherwise the input itself, with its first letter upper-cased only
        when the input is entirely lower-case (so existing capitalisation
        such as "Inside Climate News" or "CNN" is preserved).
    """
    dict_source_names = {
        "dw": "DW",
        "dw.com": "DW",
        "ap news": "Associated Press",
        "ap": "Associated Press",
        # Domain label of apnews.com; needed because "ap" no longer
        # matches as a bare substring.
        "apnews": "Associated Press",
        "mongabay environmental news": "Mongabay",
        "mongabay": "Mongabay",
        "guardian": "The Guardian",
        # Domain label of theguardian.com; same reason as "apnews".
        "theguardian": "The Guardian",
        "guardian.com": "The Guardian",
        "bbc": "BBC",
        "bbc.com/news": "BBC",
    }

    # Lower-case once for case-insensitive comparison.
    site_name_lower = site_name.lower()

    for key, value in dict_source_names.items():
        # Require a non-alphanumeric boundary (or string edge) on both sides.
        pattern = rf"(?<![a-z0-9]){re.escape(key)}(?![a-z0-9])"
        if re.search(pattern, site_name_lower):
            return value

    # No alias matched. str.capitalize() lower-cases everything after the
    # first character, so apply it only to all-lower-case input.
    if site_name.islower():
        return site_name.capitalize()
    return site_name
35
+
36
+ # Function to infer the source name from the URL
37
def infer_source_from_url(url):
    """Infer a display name for a news source from the host of a URL.

    The host is split on dots, a leading "www", "m" or "mobile" label is
    dropped, and the label that names the site is passed through
    ``correct_source_names``.

    Args:
        url: The URL to inspect.

    Returns:
        The corrected source name, or ``"Unknown Source"`` when the URL has
        no host component or cannot be parsed.
    """
    # Labels that sit between the site name and a two-letter country-code
    # TLD (e.g. "bbc.co.uk", "abc.net.au"). This is a heuristic, not a full
    # public-suffix list.
    generic_second_level = {"co", "com", "org", "net", "gov", "edu", "ac"}

    try:
        # .hostname, unlike .netloc, excludes credentials and port and is
        # lower-cased.
        hostname = urlparse(url).hostname
        if not hostname:
            return "Unknown Source"

        domain_parts = [part for part in hostname.split(".") if part]
        if not domain_parts:
            return "Unknown Source"

        # Remove known subdomains like 'www', 'm', 'mobile'.
        if len(domain_parts) > 2 and domain_parts[0] in ("www", "m", "mobile"):
            domain_parts.pop(0)

        if (
            len(domain_parts) > 2
            and len(domain_parts[-1]) == 2
            and domain_parts[-2] in generic_second_level
        ):
            # e.g. bbc.co.uk -> "bbc" rather than "co".
            primary_domain = domain_parts[-3]
        elif len(domain_parts) > 1:
            # Usual case: the label just before the TLD.
            primary_domain = domain_parts[-2]
        else:
            primary_domain = domain_parts[0]

        return correct_source_names(primary_domain)
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError on malformed hosts; non-string input
        # surfaces as TypeError/AttributeError. Keep the best-effort
        # contract of never raising.
        return "Unknown Source"
55
+
56
+
57
  # Function to fetch and parse HTML content using urllib
58
  def fetch_metadata(url):
59
  req = urllib.request.Request(url)
 
76
  # Extracting og:title meta tag
77
  og_title = soup.find('meta', property='og:title')
78
  og_title = og_title['content'] if og_title else title_tag
79
+
80
+ # Attempt to infer the source from the URL
81
+ if site_name == "Unknown Source":
82
+ if "reuters.com" in url:
83
+ site_name = "Reuters"
84
+ elif "guardian.com" in url:
85
+ site_name = "The Guardian"
86
+ elif "apnews.com" in url:
87
+ site_name = "Associated Press"
88
+ elif "bbc.com/news" in url:
89
+ site_name = "BBC"
90
+ else:
91
+ site_name = infer_source_from_url(url)
92
 
93
  # Formatting the result
94
  markdown_text = f"[{og_title}]({url}) ({site_name})"
 
115
  result_source = "The Guardian"
116
  elif "apnews.com" in result_url:
117
  result_source = "AP News"
118
+ elif "bbc.com/news" in result_url:
119
+ result_source = "BBC"
120
 
121
+ else:
122
+ result_source = infer_source_from_url(result_url)
123
+
124
+ # Correct source name with custom dictionary
125
+ result_source = correct_source_names(result_source)
126
+
127
  markdown_text = f"[{result_title}]({result_url}) ({result_source})"
128
  return markdown_text
129
  else:
 
150
 
151
  # Use the DuckDuckGo chat to format the output, but fallback to raw_markdown if unsuccessful
152
  try:
153
+ ddgschat = DDGS().chat(f"""Please rewrite the following markdown string so the news headline is capitalized as a sentence, only proper nouns or names should have capital initials. Also correct capitalization of the source name if necessary.
154
  Then, please check if the source name (surrounded by round parenthesis) is repeated inside the headline (surrounded by square brackets); if it is repeated, please remove the source name mention from the headline keeping the URL and the source name in parentheses outside the headline.
155
  Please answer only with one line of the markdown output.
156
  Example input = [Montana Is a Frontier for Deep Carbon Storage, Mr. Ant贸nio Guterres from the United Nations Claims for Urgent Action - Inside Climate News](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)