nelsonjq committed on
Commit
12ee044
·
verified ·
1 Parent(s): f7e9cb9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -46
app.py CHANGED
@@ -1,53 +1,131 @@
1
  import gradio as gr
 
 
 
2
  import re
3
- import markdown # Import markdown library to convert markdown to HTML
4
-
5
- #def html_to_markdown(html_text):
6
- # Convert HTML to Markdown
7
- # return html2text.html2text(html_text)
8
-
9
# Each pattern matches "[Title <sep> Source](URL) (Source)" where <sep> is
# " - ", " | " or a parenthesised "(Source)" at the end of the title.
# Group 1 = headline, group 3 = URL, group 4 = source after the link.
# Compiled once at import time; both http:// and https:// links are accepted.
_DUPLICATE_SOURCE_PATTERNS = (
    re.compile(r'\[([^\]]+?) - ([^\(]+)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
    re.compile(r'\[([^\]]+?) \| ([^\(]+)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
    re.compile(r'\[([^\]]+?) \(([^\)]+)\)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
)


def process_markdown(text):
    """Remove the source name that is duplicated inside a markdown headline.

    Rewrites every ``[Headline - Source](url) (Source)``,
    ``[Headline | Source](url) (Source)`` and
    ``[Headline (Source)](url) (Source)`` occurrence to
    ``[Headline](url) (Source)``. Text that matches none of the patterns is
    returned unchanged.

    Args:
        text: Markdown text, possibly containing several links.

    Returns:
        The markdown text with the in-headline source mentions removed and
        straight apostrophes in the affected headlines replaced by ``’``.
    """

    def process_line(match):
        title = match.group(1)  # headline without the trailing source
        url = match.group(3)
        second_mention_source = match.group(4)  # source kept after the link

        # Use a typographic apostrophe in the headline.
        title = title.replace("'", "’")

        return f'[{title}]({url}) ({second_mention_source})'

    for pattern in _DUPLICATE_SOURCE_PATTERNS:
        text = pattern.sub(process_line, text)

    return text
33
-
34
def convert_and_process(html_text):
    """Clean duplicated source mentions, then render the markdown as HTML.

    Despite the parameter name, the input is passed straight to
    ``process_markdown`` and is therefore treated as markdown text.

    Args:
        html_text: Markdown text pasted by the user.

    Returns:
        The HTML produced by ``markdown.markdown`` for the cleaned text.
    """
    cleaned_markdown = process_markdown(html_text)
    return markdown.markdown(cleaned_markdown)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
# Create the Gradio interface
iface = gr.Interface(
    fn=convert_and_process,
    inputs=gr.Textbox(label="Paste your markdown text here ⤵️"),
    outputs=gr.HTML(label="Processed HTML Output"),  # Output as HTML
    title="Removing duplicated sources from Meltwater headlines",
    description="1. Convert your text copied from Meltwater (with hyperlinks) into Markdown here: https://euangoddard.github.io/clipboard2markdown/\n\n 2. Paste your markdown text here to clean up the double source mentions, and submit to display the output as HTML.",
)

# Launch the interface only when executed as a script, so importing this
# module (e.g. for testing) does not start a web server.
if __name__ == "__main__":
    iface.launch(share=True)
 
1
  import gradio as gr
2
+ import urllib.request
3
+ from bs4 import BeautifulSoup
4
+ from duckduckgo_search import DDGS
5
  import re
6
+ import markdown # Import markdown library to convert Markdown to HTML
7
+
8
def fetch_metadata(url, timeout=10):
    """Build a ``[title](url) (site)`` markdown link from a page's metadata.

    The page is downloaded with urllib and parsed with BeautifulSoup. The
    headline comes from ``og:title`` (falling back to ``<title>``) and the
    source from ``og:site_name``. Any failure while downloading or parsing
    falls back to ``duckduckgo_search_fallback``.

    Args:
        url: Address of the article to describe.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A one-line markdown string, or whatever
        ``duckduckgo_search_fallback(url)`` returns when the page cannot be
        fetched or parsed.
    """
    url = url.strip()

    # urllib also opens file:// and ftp:// URLs; only web pages are wanted
    # here, so anything else goes straight to the search fallback.
    if not url.lower().startswith(("http://", "https://")):
        return duckduckgo_search_fallback(url)

    req = urllib.request.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
    })

    def one_line(value):
        # Collapse newlines/runs of whitespace so the result stays one line.
        return ' '.join((value or '').split())

    try:
        with urllib.request.urlopen(req, timeout=timeout) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')

        def meta_content(prop):
            # Content of <meta property=prop>, or '' if absent/empty.
            tag = soup.find('meta', property=prop)
            return one_line(tag.get('content')) if tag else ''

        # <title>.string is None when the tag is empty or has nested markup.
        page_title = soup.title.string if soup.title else None
        title_tag = one_line(page_title) or 'No title available'

        site_name = meta_content('og:site_name') or 'Unknown Source'
        og_title = meta_content('og:title') or title_tag
    except Exception:
        # Best effort: any network or parsing problem uses the search fallback.
        return duckduckgo_search_fallback(url)

    return f"[{og_title}]({url}) ({site_name})"
36
+
37
def duckduckgo_search_fallback(url):
    """Describe *url* using the top DuckDuckGo text-search result.

    Args:
        url: The address that could not be fetched directly; it is used as
            the search query.

    Returns:
        ``[title](href) (source)`` for the first result,
        ``"No search results found for <url>"`` when the search is empty, or
        ``"Failed to fetch data for <url>: <error>"`` when the search raises.
    """
    from urllib.parse import urlparse

    # (domain, display name) pairs used to infer the source from the host.
    known_sources = (
        ("reuters.com", "Reuters"),
        ("theguardian.com", "The Guardian"),
        ("guardian.com", "The Guardian"),
        ("apnews.com", "AP News"),
    )

    try:
        results = DDGS().text(url, max_results=1)
        if not results:
            return f"No search results found for {url}"

        top_result = results[0]
        result_title = top_result.get('title', 'No title found')
        # Link back to the requested URL when the result carries no href.
        result_url = top_result.get('href') or url
    except Exception as e:
        return f"Failed to fetch data for {url}: {str(e)}"

    # Match on the host name only, so a domain that merely appears in the
    # path or query string of another site is not mistaken for the source.
    try:
        host = (urlparse(result_url).hostname or "").lower()
    except ValueError:
        host = ""

    result_source = "Unknown Source"  # Default fallback
    for domain, display_name in known_sources:
        if host == domain or host.endswith("." + domain):
            result_source = display_name
            break

    return f"[{result_title}]({result_url}) ({result_source})"
63
+
64
def trimming_chat_answer(text):
    """Extract the first ``[title](url) (source)`` snippet from *text*.

    Args:
        text: Free-form answer that should contain one markdown link followed
            by a source name in parentheses.

    Returns:
        The matched snippet, or the sentinel string ``"No match found"`` when
        *text* is not a string or contains no such snippet.
    """
    # A missing or non-string answer is treated like an answer with no match.
    if not isinstance(text, str):
        return "No match found"

    pattern = r'\[.*?\]\(.*?\) \([^\)]+\)'
    match = re.search(pattern, text)
    return match.group(0) if match else "No match found"
69
+
70
# Prompt sent to the DuckDuckGo chat; ``{raw_markdown}`` is filled per URL.
_HEADLINE_CLEANUP_PROMPT = """Please rewrite the following markdown string so the news headline is capitalized as a sentence, only proper nouns or names should have capital initials. Also check the correct capitalization of the source name (the Guardian -> The Guardian).
Then, please check if the source name (surrounded by round parenthesis) is repeated inside the headline (surrounded by square brackets); if it is repeated, please remove the source name mention from the headline keeping the URL and the source name in parentheses outside the headline.
Please answer only with one line of the markdown output.
Example input = [Montana Is a Frontier for Deep Carbon Storage, Mr. António Guterres from the United Nations Claims for Urgent Action - Inside Climate News](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)
Example output = [Montana is a frontier for deep carbon storage, Mr. António Guterres from the United Nations claims for urgent action](https://insideclimatenews.org/news/18072024/montana-deep-carbon-storage-controversies/) (Inside Climate News)

Input:
{raw_markdown}
"""

# Prefixes of the messages duckduckgo_search_fallback returns on failure.
_FETCH_FAILURE_PREFIXES = ("Failed to fetch data", "No search results")


def process_urls(urls):
    """Turn each URL into a cleaned ``[headline](url) (source)`` markdown line.

    For every non-blank URL the metadata is fetched, the search fallback is
    retried once if a failure message came back, and the result is sent to the
    DuckDuckGo chat for headline clean-up. If the chat fails or its answer
    contains no markdown link, the unprocessed line is used instead.

    Args:
        urls: Iterable of URL strings; blank or whitespace-only entries are
            skipped.

    Returns:
        A list with one string per processed URL: the cleaned markdown line,
        or the failure message when nothing could be fetched.
    """
    results = []
    for url in urls:
        url = url.strip()
        if not url:
            continue

        raw_markdown = fetch_metadata(url)

        # Failure messages always start with one of the known prefixes;
        # matching the prefix avoids mistaking a headline that merely
        # contains those words for a failure.
        if raw_markdown.startswith(_FETCH_FAILURE_PREFIXES):
            raw_markdown = duckduckgo_search_fallback(url)

        if raw_markdown.startswith(_FETCH_FAILURE_PREFIXES):
            # Nothing to clean up; do not hand an error message to the chat
            # model, which could answer with an invented headline.
            results.append(raw_markdown)
            continue

        try:
            chat_answer = DDGS().chat(
                _HEADLINE_CLEANUP_PROMPT.format(raw_markdown=raw_markdown),
                model='claude-3-haiku',
            )
        except Exception:
            # If there's an error with the chat API, use raw_markdown.
            chat_answer = raw_markdown

        clean_markdown = trimming_chat_answer(chat_answer)
        if clean_markdown == "No match found":
            clean_markdown = raw_markdown

        results.append(clean_markdown)

    return results
103
+
104
def gradio_interface(input_text):
    """Gradio callback: convert newline-separated URLs into HTML links.

    Args:
        input_text: Text from the input box, one URL per line. Blank lines
            and surrounding whitespace (including ``\\r``) are ignored.

    Returns:
        HTML with one ``headline (source)`` link per URL separated by
        ``<br>``, or an empty string when no URL was entered.
    """
    urls = [line.strip() for line in (input_text or "").splitlines() if line.strip()]
    if not urls:
        # Nothing to look up; avoid fetching or searching for an empty URL.
        return ""

    results = process_urls(urls)
    # Join the results with line breaks for display
    html_output = "<br>".join(results)
    return markdown.markdown(html_output)
113
 
114
# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_interface,
    inputs="textarea",
    outputs="html",
    title="News headline and title scraper - Melty 2.0 🔭",
    description="""Enter URLs separated by line breaks to fetch metadata and format it into markdown.\n\n
\t\t
👀 Example input:\n
\t\thttps://www.example1.com\n
\t\thttps://www.example2.org\n\n
🎯 Example output:\n
\t\t[Headline 1](https://www.example1.com) (Source)\n
\t\t[Headline 2](https://www.example2.org) (Source)""",
)

# Launch the Gradio app only when executed as a script, so importing this
# module (e.g. for testing) does not start a web server.
if __name__ == "__main__":
    iface.launch()