Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import re
|
3 |
+
import markdown # Import markdown library to convert markdown to HTML
|
4 |
+
|
5 |
+
#def html_to_markdown(html_text):
|
6 |
+
# Convert HTML to Markdown
|
7 |
+
# return html2text.html2text(html_text)
|
8 |
+
|
9 |
+
# Compiled once at import time. Each pattern matches a Markdown link whose
# bracketed text ends with a source name and which is followed by a source
# name in parentheses: ``[Title<sep>Source](url) (Source)``.
# Groups: 1 = title, 2 = source inside the brackets (discarded), 3 = URL,
# 4 = source in the trailing parentheses (kept).
# The in-bracket source group excludes ``]`` so a match can never run past the
# end of the link text and swallow unrelated text up to a later link.
_DUPLICATED_SOURCE_PATTERNS = (
    # [Title - Source](url) (Source)
    re.compile(r'\[([^\]]+?) - ([^\]\(]+)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
    # [Title | Source](url) (Source)
    re.compile(r'\[([^\]]+?) \| ([^\]\(]+)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
    # [Title (Source)](url) (Source)
    re.compile(r'\[([^\]]+?) \(([^\]\)]+)\)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
)


def process_markdown(text: str) -> str:
    """Remove the source name duplicated inside Markdown link titles.

    Rewrites links of the forms ``[Title - Source](url) (Source)``,
    ``[Title | Source](url) (Source)`` and ``[Title (Source)](url) (Source)``
    to ``[Title](url) (Source)``. Both ``http://`` and ``https://`` URLs are
    recognised. Straight apostrophes in the kept title are replaced with the
    typographic apostrophe ``’``. Text that matches none of the patterns is
    returned unchanged.

    Args:
        text: Markdown text to clean up.

    Returns:
        The Markdown text with the in-title source mentions removed.
    """

    def process_line(match: "re.Match[str]") -> str:
        # Group 2 (the source inside the brackets) is intentionally dropped.
        title = match.group(1).replace("'", "’")
        url = match.group(3)
        second_mention_source = match.group(4)
        return f'[{title}]({url}) ({second_mention_source})'

    # Apply the patterns one after another; a link rewritten by one pattern no
    # longer contains a separator, so later patterns leave it alone.
    for pattern in _DUPLICATED_SOURCE_PATTERNS:
        text = pattern.sub(process_line, text)

    return text
|
33 |
+
|
34 |
+
def convert_and_process(html_text):
    """Clean up the pasted text and render it as HTML.

    Despite the parameter name, the input is handed straight to
    ``process_markdown`` as Markdown; no HTML-to-Markdown conversion is
    performed here. The cleaned Markdown is then rendered to HTML with
    ``markdown.markdown``.

    Args:
        html_text: The text pasted by the user (treated as Markdown).

    Returns:
        The HTML produced from the cleaned Markdown.
    """
    deduplicated = process_markdown(html_text)
    return markdown.markdown(deduplicated)
|
42 |
+
|
43 |
+
# Create the Gradio interface
|
44 |
+
# Usage instructions shown in the UI; the two numbered steps are separated by
# a blank line.
_DESCRIPTION = (
    "1. Convert your text copied from Meltwater (with hyperlinks) into "
    "Markdown here: https://euangoddard.github.io/clipboard2markdown/\n\n"
    " 2. Paste your markdown text here to clean up the double source "
    "mentions, and submit to display the output as HTML."
)

# Single-textbox UI: the pasted Markdown goes through convert_and_process and
# the result is displayed by an HTML output component.
iface = gr.Interface(
    fn=convert_and_process,
    inputs=gr.Textbox(label="Paste your markdown text here ⤵️"),
    outputs=gr.HTML(label="Processed HTML Output"),
    title="Removing duplicated sources from Meltwater headlines",
    description=_DESCRIPTION,
)

# Start the app; share=True additionally requests a public share link.
iface.launch(share=True)
|