File size: 2,043 Bytes
4362da8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import gradio as gr
import re
import markdown # Import markdown library to convert markdown to HTML
# Disabled helper, kept for reference (html2text is not imported above):
# def html_to_markdown(html_text):
#     # Convert HTML to Markdown
#     return html2text.html2text(html_text)
def process_markdown(text: str) -> str:
    """Remove the source name that is duplicated inside Markdown link titles.

    Rewrites links shaped like ``[Title <sep> Source](URL) (Source)`` as
    ``[Title](URL) (Source)``, where ``<sep> Source`` is one of:

    * `` - Source``
    * `` | Source``
    * `` (Source)``

    The source found inside the square brackets is dropped without being
    compared to the trailing one. Straight apostrophes in the kept title are
    replaced with the typographic apostrophe ``’``.

    Args:
        text: Markdown text to clean up.

    Returns:
        The text with every matching link rewritten. Text matching none of
        the patterns is returned unchanged.
    """
    # One pattern per title/source separator. Groups: 1 = title,
    # 2 = source inside the brackets (discarded), 3 = URL, 4 = trailing source.
    # The URL group accepts both http:// and https:// so secure links are
    # cleaned as well.
    patterns = [
        re.compile(r'\[([^\]]+?) - ([^\(]+)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
        re.compile(r'\[([^\]]+?) \| ([^\(]+)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
        re.compile(r'\[([^\]]+?) \(([^\)]+)\)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
    ]

    def process_line(match):
        """Build the replacement link for a single regex match."""
        title = match.group(1)  # The title part inside the square brackets
        url = match.group(3)  # The URL inside the parentheses
        second_mention_source = match.group(4)  # The source name in parentheses
        # Swap straight apostrophes for the typographic one
        title = title.replace("'", "’")
        return f'[{title}]({url}) ({second_mention_source})'

    # Patterns are applied one after another, so a later pattern also sees
    # the output of the earlier ones.
    for pattern in patterns:
        text = pattern.sub(process_line, text)
    return text
def convert_and_process(html_text):
    """Clean up the pasted Markdown and return it rendered as HTML.

    The input is handed to ``process_markdown`` as-is (it is not converted
    from HTML first), and the cleaned Markdown is then passed to
    ``markdown.markdown``, whose result is returned.
    """
    return markdown.markdown(process_markdown(html_text))
# Create the Gradio interface: a text box in, rendered HTML out.
iface = gr.Interface(
    fn=convert_and_process,
    inputs=gr.Textbox(label="Paste your markdown text here ⤵️"),
    outputs=gr.HTML(label="Processed HTML Output"),  # Output as HTML
    title="Removing duplicated sources from Meltwater headlines",
    # Split across lines for readability; the resulting string is unchanged.
    description=(
        "1. Convert your text copied from Meltwater (with hyperlinks) into "
        "Markdown here: https://euangoddard.github.io/clipboard2markdown/\n\n"
        " 2. Paste your markdown text here to clean up the double source "
        "mentions, and submit to display the output as HTML."
    ),
)

# Launch the interface; share=True asks Gradio for a public share link.
iface.launch(share=True)