Spaces:

nelsonjq
/

meltwater-remove-duplicate-sources

Sleeping

File size: 2,043 Bytes

4362da8

import gradio as gr
import re
import markdown  # Import markdown library to convert markdown to HTML

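# Disabled: optional HTML-to-Markdown conversion step, kept for reference.
# Re-enabling it would require importing the `html2text` package.
# def html_to_markdown(html_text):
#     # Convert HTML to Markdown
#     return html2text.html2text(html_text)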

def process_markdown(text: str) -> str:
    """Remove the source name embedded in Markdown link titles.

    Rewrites links of the following three shapes, each followed by a
    parenthesised source mention, so that only the trailing mention is kept:

        [Title - Source](url) (Source)   ->  [Title](url) (Source)
        [Title | Source](url) (Source)   ->  [Title](url) (Source)
        [Title (Source)](url) (Source)   ->  [Title](url) (Source)

    Both ``http://`` and ``https://`` URLs are recognised. The source text
    inside the title is dropped without being compared to the trailing one.
    Straight apostrophes in the kept title are replaced with the typographic
    apostrophe (U+2019).

    Args:
        text: Markdown text to clean up.

    Returns:
        The text with every matching link rewritten; text that matches none
        of the patterns is returned unchanged.
    """
    # One pattern per title/source separator style. Groups are:
    #   1 = title, 2 = source inside the title (discarded),
    #   3 = URL,   4 = source mentioned after the link.
    patterns = [
        re.compile(r'\[([^\]]+?) - ([^\(]+)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
        re.compile(r'\[([^\]]+?) \| ([^\(]+)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
        re.compile(r'\[([^\]]+?) \(([^\)]+)\)\]\((https?://[^\)]+?)\) \(([^)]+)\)'),
    ]

    def process_line(match):
        title = match.group(1)  # The title part inside the square brackets
        url = match.group(3)    # The URL inside the parentheses
        second_mention_source = match.group(4)  # The source name after the link

        # Swap straight apostrophes for the typographic one (U+2019).
        title = title.replace("'", "’")

        # Rebuild the link without the in-title source mention.
        return f'[{title}]({url}) ({second_mention_source})'

    # Apply the patterns in order; each pass works on the previous result.
    for pattern in patterns:
        text = pattern.sub(process_line, text)

    return text

def convert_and_process(html_text):
    """Clean up the pasted text and render it as HTML.

    Despite the parameter name, the input is handed to ``process_markdown``
    as-is (no HTML-to-Markdown conversion is performed here), and the
    cleaned Markdown is then rendered with ``markdown.markdown``.

    Args:
        html_text: Text pasted by the user; treated as Markdown.

    Returns:
        The HTML produced from the cleaned Markdown.
    """
    return markdown.markdown(process_markdown(html_text))

# --- Gradio UI -------------------------------------------------------------
# Input: a single textbox where the user pastes Markdown.
markdown_input = gr.Textbox(label="Paste your markdown text here ⤵️")

# Output: the cleaned text rendered as HTML.
html_result = gr.HTML(label="Processed HTML Output")

# Wire the processing function to the input/output components.
iface = gr.Interface(
    fn=convert_and_process,
    inputs=markdown_input,
    outputs=html_result,
    title="Removing duplicated sources from Meltwater headlines",
    description=(
        "1. Convert your text copied from Meltwater (with hyperlinks) into Markdown here: "
        "https://euangoddard.github.io/clipboard2markdown/\n\n"
        " 2. Paste your markdown text here to clean up the double source mentions, "
        "and submit to display the output as HTML."
    ),
)

# Start the app; share=True asks Gradio for a public share link.
iface.launch(share=True)