"""Gradio app that strips the in-link source mention from Markdown headlines.

A line such as ``[Headline - BBC](https://example.com/x) (BBC)`` is rewritten
to ``[Headline](https://example.com/x) (BBC)`` and the result is rendered as
HTML.
"""

import re

import gradio as gr
import markdown  # Import markdown library to convert markdown to HTML


# Markdown links whose bracketed text carries a source after the title,
# followed by a parenthesised source after the link. Capture groups:
#   1 = title, 2 = source inside the brackets (discarded),
#   3 = URL (http or https), 4 = source in the trailing parentheses.
# Compiled once at import time instead of on every call.
_DUPLICATED_SOURCE_PATTERNS = (
    # [Title - Source](url) (Source)
    re.compile(r'\[([^\]]+?) - ([^\(]+)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
    # [Title | Source](url) (Source)
    re.compile(r'\[([^\]]+?) \| ([^\(]+)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
    # [Title (Source)](url) (Source)
    re.compile(r'\[([^\]]+?) \(([^\)]+)\)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
)


def _strip_inline_source(match: "re.Match[str]") -> str:
    """Rebuild one matched link without the source inside the brackets.

    Args:
        match: A match produced by one of ``_DUPLICATED_SOURCE_PATTERNS``.

    Returns:
        ``[title](url) (source)`` where ``source`` is the trailing
        parenthesised mention (group 4); the in-bracket mention (group 2)
        is dropped.
    """
    title = match.group(1)
    url = match.group(3)
    trailing_source = match.group(4)

    # Swap straight apostrophes in the title for the typographic
    # apostrophe (U+2019). Only the title is touched, not the source.
    title = title.replace("'", "’")

    return f'[{title}]({url}) ({trailing_source})'


def process_markdown(text: str) -> str:
    """Remove the in-link source mention from every matching Markdown link.

    The three supported link shapes are ``[Title - Source](url) (Source)``,
    ``[Title | Source](url) (Source)`` and ``[Title (Source)](url) (Source)``.
    Both ``http://`` and ``https://`` URLs are recognised. Text that matches
    none of the patterns is returned unchanged.

    Args:
        text: Markdown text to clean up.

    Returns:
        The text with each matching link rewritten as
        ``[Title](url) (Source)``.
    """
    # Patterns are applied one after another, each on the previous result.
    for pattern in _DUPLICATED_SOURCE_PATTERNS:
        text = pattern.sub(_strip_inline_source, text)
    return text


def convert_and_process(html_text: str) -> str:
    """Clean up duplicated sources and render the result as HTML.

    Args:
        html_text: The pasted text. Despite the parameter name it is
            treated as Markdown: it is passed straight to
            ``process_markdown`` without any HTML-to-Markdown step.

    Returns:
        The HTML produced by ``markdown.markdown`` from the cleaned text.
    """
    processed_markdown = process_markdown(html_text)
    return markdown.markdown(processed_markdown)


# Create the Gradio interface
iface = gr.Interface(
    fn=convert_and_process,
    inputs=gr.Textbox(label="Paste your markdown text here ⤵️"),
    outputs=gr.HTML(label="Processed HTML Output"),  # Output as HTML
    title="Removing duplicated sources from Meltwater headlines",
    description=(
        "1. Convert your text copied from Meltwater (with hyperlinks) into "
        "Markdown here: https://euangoddard.github.io/clipboard2markdown/"
        "\n\n 2. Paste your markdown text here to clean up the double source "
        "mentions, and submit to display the output as HTML."
    ),
)

# Launch the interface (share=True requests a public share link from Gradio)
iface.launch(share=True)