Spaces:

blasisd
/

transum-feed

Sleeping

App Files Files Community

blasisd committed on Mar 5

Commit

58bde27

1 Parent(s): 8fc416b

Initial commit

Browse files

Files changed (5) hide show

requirements.txt +10 -0
src/config.py +83 -0
src/logging_conf.py +56 -0
src/task_management.py +250 -0
src/transum_app.py +232 -0

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+bs4
+feedparser
+gradio
+protobuf
+pydantic
+python-dotenv
+sentencepiece
+torch
+spaces
+transformers

src/config.py ADDED Viewed

	@@ -0,0 +1,83 @@

+# Languages the application supports: two-letter language code -> English display name.
+# The keys are offered as dropdown choices in the UI and the values form the set of
+# languages accepted by TaskManager.translate.
+LANGUAGES = {
+    "el": "Greek",
+    "en": "English",
+    "es": "Spanish",
+    "fr": "French",
+    "de": "German",
+    "it": "Italian",
+}
+# English language name -> language code handed to the translation pipeline as
+# src_lang / tgt_lang (see TaskManager.translate). The codes have the form
+# "<language>_<Script>"; this table covers more languages than LANGUAGES exposes.
+LANG_LEX_2_CODE = {
+    "English": "eng_Latn",
+    "French": "fra_Latn",
+    "Spanish": "spa_Latn",
+    "Italian": "ita_Latn",
+    "German": "deu_Latn",
+    "Greek": "ell_Grek",
+    "Chinese": "zho_Hans",
+    "Japanese": "jpn_Jpan",
+    "Russian": "rus_Cyrl",
+    "Arabic": "arb_Arab",
+    "Portuguese": "por_Latn",
+    "Dutch": "nld_Latn",
+    "Turkish": "tur_Latn",
+    "Hindi": "hin_Deva",
+    "Korean": "kor_Hang",
+    "Vietnamese": "vie_Latn",
+    "Thai": "tha_Thai",
+    "Polish": "pol_Latn",
+    "Swedish": "swe_Latn",
+    "Finnish": "fin_Latn",
+    "Danish": "dan_Latn",
+    "Norwegian": "nob_Latn",
+    "Czech": "ces_Latn",
+    "Hungarian": "hun_Latn",
+    "Romanian": "ron_Latn",
+    "Hebrew": "heb_Hebr",
+    "Ukrainian": "ukr_Cyrl",
+    "Bulgarian": "bul_Cyrl",
+    "Indonesian": "ind_Latn",
+    "Malay": "zsm_Latn",
+    "Tamil": "tam_Taml",
+    "Telugu": "tel_Telu",
+    "Urdu": "urd_Arab",
+}
+# SUMMARIZATION_PREFIXES = {
+#     "en": "summarize: ",  # English
+#     "fr": "résume: ",  # French
+#     "es": "resume: ",  # Spanish
+#     "it": "riassumi: ",  # Italian
+#     "de": "fasse zusammen: ",  # German
+#     "el": "σύνοψη: ",  # Greek
+#     "zh": "总结: ",  # Chinese (Simplified)
+#     "ja": "要約: ",  # Japanese
+#     "ru": "резюме: ",  # Russian
+#     "ar": "لخص: ",  # Arabic
+#     "pt": "resuma: ",  # Portuguese
+#     "nl": "vat samen: ",  # Dutch
+#     "tr": "özetle: ",  # Turkish
+#     "hi": "सारांश: ",  # Hindi
+#     "ko": "요약: ",  # Korean
+#     "vi": "tóm tắt: ",  # Vietnamese
+#     "th": "สรุป: ",  # Thai
+#     "pl": "podsumuj: ",  # Polish
+#     "sv": "sammanfatta: ",  # Swedish
+#     "fi": "tiivistä: ",  # Finnish
+#     "da": "opsummer: ",  # Danish
+#     "no": "oppsummer: ",  # Norwegian
+#     "cs": "shrnutí: ",  # Czech
+#     "hu": "összefoglalás: ",  # Hungarian
+#     "ro": "rezumă: ",  # Romanian
+#     "he": "לסכם: ",  # Hebrew
+#     "uk": "резюме: ",  # Ukrainian
+#     "bg": "резюме: ",  # Bulgarian
+#     "id": "ringkasan: ",  # Indonesian
+#     "ms": "ringkasan: ",  # Malay
+#     "ta": "சுருக்கம்: ",  # Tamil
+#     "te": "సారాంశం: ",  # Telugu
+#     "ur": "خلاصہ: ",  # Urdu
+#     # Add more languages as needed
+# }

src/logging_conf.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# Logging setup in the dictionary schema consumed by logging.config.dictConfig
+# (applied at import time by task_management.py).
+LOGGING_CONFIG = {
+    "version": 1,
+    # Loggers created before this configuration is applied stay enabled
+    "disable_existing_loggers": False,
+    # Message layouts, from least to most verbose: simple < standard < detailed
+    "formatters": {
+        "standard": {
+            "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        },
+        "detailed": {
+            "format": "%(asctime)s - %(name)s - %(levelname)s - %(module)s - %(lineno)d - %(message)s",
+        },
+        "simple": {
+            "format": "%(levelname)s - %(message)s",
+        },
+    },
+    # Output destinations; the file handlers use relative file names, so the
+    # log files are created relative to the process's working directory
+    "handlers": {
+        "console": {
+            "class": "logging.StreamHandler",
+            "formatter": "simple",
+            "level": "INFO",
+        },
+        "file_info": {
+            "class": "logging.FileHandler",
+            "filename": "info.log",
+            "formatter": "standard",
+            "level": "INFO",
+        },
+        "file_debug": {
+            "class": "logging.FileHandler",
+            "filename": "debug.log",
+            "formatter": "detailed",
+            "level": "DEBUG",
+        },
+        "file_error": {
+            "class": "logging.FileHandler",
+            "filename": "error.log",
+            "formatter": "detailed",
+            "level": "ERROR",
+        },
+    },
+    "loggers": {
+        "": {  # root logger
+            "handlers": ["console", "file_info"],
+            "level": "INFO",
+        },
+        # Task-management logger: accepts DEBUG and above. debug.log receives
+        # everything, while the console handler still filters at INFO.
+        # propagate=False keeps its records away from the root handlers.
+        "src.task_management": {
+            "handlers": ["console", "file_debug"],
+            "level": "DEBUG",
+            "propagate": False,
+        },
+        # UI-module logger: only ERROR and above, written to the console and error.log.
+        # NOTE(review): no getLogger("src.transum_app") call is visible in this
+        # commit - confirm this logger is actually used.
+        "src.transum_app": {
+            "handlers": ["console", "file_error"],
+            "level": "ERROR",
+            "propagate": False,
+        },
+    },
+}

src/task_management.py ADDED Viewed

	@@ -0,0 +1,250 @@

+import logging
+import logging.config
+from typing import Dict, List
+import feedparser
+import torch
+from bs4 import BeautifulSoup
+from functools import wraps
+from time import time
+from pydantic import HttpUrl
+from transformers import (
+    AutoConfig,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    pipeline,
+)
+from config import LANGUAGES, LANG_LEX_2_CODE
+from logging_conf import LOGGING_CONFIG
+# Apply the project-wide logging configuration as a side effect of importing this
+# module, then fetch the logger that LOGGING_CONFIG declares for task management
+logging.config.dictConfig(LOGGING_CONFIG)
+logger = logging.getLogger("src.task_management")
+def proc_timer(f):
+    """Decorator that logs how long each call to the wrapped function takes
+    Args:
+        f (Callable): the function to time
+    Returns:
+        Callable: a wrapper that calls ``f``, logs its name, arguments and the
+        elapsed wall-clock time (INFO level) and returns ``f``'s result unchanged
+    """
+    @wraps(f)
+    def wrapper(*args, **kw):
+        ts = time()
+        result = f(*args, **kw)
+        te = time()
+        # The duration goes through a real "%2.4f" placeholder. The previous f-string
+        # kept the spec outside the braces and logged a literal ":%2.4f" after the raw float.
+        # Lazy %-style arguments also skip the formatting work when INFO is disabled.
+        logger.info(
+            "func:%s args:[%s, %s] took: %2.4f sec", f.__name__, args, kw, te - ts
+        )
+        return result
+    return wrapper
+class TaskManager:
+    """Manage the summarization, translation, feed-parsing and other
+    processing tasks needed by the application
+    """
+    def __init__(self):
+        """Load the summarization model and tokenizer and the translation pipeline"""
+        # The supported, by our application, translation languages
+        self.supported_langs = LANGUAGES.values()
+        # Load the bart-large-cnn model and tokenizer
+        summarization_model_name = "facebook/bart-large-cnn"
+        # Move model for summarization to GPU if available
+        self.summarization_device = torch.device(
+            "cuda" if torch.cuda.is_available() else "cpu"
+        )
+        # The config supplies the model's default max_length / min_length used in summarize()
+        self.summarization_config = AutoConfig.from_pretrained(summarization_model_name)
+        self.summarizer = AutoModelForSeq2SeqLM.from_pretrained(
+            summarization_model_name
+        ).to(self.summarization_device)
+        self.summarization_tokenizer = AutoTokenizer.from_pretrained(
+            summarization_model_name
+        )
+        # Translation is pinned to the CPU (CUDA auto-detection is deliberately not used here)
+        self.translation_device = "cpu"
+        # Load translation pipeline for model facebook/nllb-200-distilled-1.3B
+        self.translator = pipeline(
+            "translation",
+            model="facebook/nllb-200-distilled-1.3B",
+            device=self.translation_device,
+        )
+    # @proc_timer
+    def summarize(
+        self, txt_to_summarize: str, max_length: int = 30, min_length: int = 10
+    ) -> str:
+        """Summarization task, used for summarizing the provided text
+        Args:
+            txt_to_summarize (str): the text that needs to be summarized
+            max_length (int, optional): lower bound for the summary's maximum length; only
+                consulted for longer texts, where the larger of this value and the model
+                config's max_length is used. Defaults to 30.
+            min_length (int, optional): kept for interface compatibility; the effective
+                minimum is always derived from the model config. Defaults to 10.
+        Returns:
+            str: the summarized text, or an empty string when there is nothing to summarize
+        """
+        # Blank input would lead to a zero (or meaningless) max_length, so skip generation
+        if not txt_to_summarize or not txt_to_summarize.strip():
+            return ""
+        # NOTE(review): this is a character count, while the generate() limits below are
+        # token counts - the 0.3 / 0.5 factors are heuristics on mixed units
+        full_text_length = len(txt_to_summarize)
+        # Short texts: cap the summary at 30% of the text length (never below 1).
+        # Longer texts: use the larger of the requested and the configured maximum.
+        max_perc_init_length = max(1, round(full_text_length * 0.3))
+        max_length = (
+            max_perc_init_length
+            if self.summarization_config.max_length > 0.5 * full_text_length
+            else max(max_length, self.summarization_config.max_length)
+        )
+        # Min length is the minimum of the following two:
+        # the min to max default config values factor, multiplied by real max
+        # the default config minimum value
+        min_to_max_perc = (
+            self.summarization_config.min_length / self.summarization_config.max_length
+        )
+        min_length = min(
+            round(min_to_max_perc * max_length), self.summarization_config.min_length
+        )
+        # Tokenize input (truncated to 1024 tokens)
+        inputs = self.summarization_tokenizer(
+            txt_to_summarize, return_tensors="pt", max_length=1024, truncation=True
+        ).to(self.summarization_device)
+        # Generate the summary; the attention mask is passed explicitly so the
+        # model does not have to infer it
+        summary_ids = self.summarizer.generate(
+            inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            max_length=max_length,
+            min_length=min_length,
+            num_beams=4,  # Use beam search
+            early_stopping=True,  # Stop early if EOS is reached
+        )
+        # Decode the summary
+        summary_txt = self.summarization_tokenizer.decode(
+            summary_ids[0], skip_special_tokens=True
+        )
+        return summary_txt
+    # @proc_timer
+    def translate(self, txt_to_translate: str, src_lang: str, tgt_lang: str) -> str:
+        """Translate the provided text from a source language to a target language
+        Args:
+            txt_to_translate (str): the text to translate
+            src_lang (str): the source language name of the initial text (e.g. "English")
+            tgt_lang (str): the target language name the initial text should be translated to
+        Raises:
+            RuntimeError: error in case of unsupported source language
+            RuntimeError: error in case of unsupported target language
+            RuntimeError: error in case of translation failure (empty result)
+        Returns:
+            str: the translated text
+        """
+        # Raise error in case of unsupported languages
+        if src_lang not in self.supported_langs:
+            raise RuntimeError("Unsupported source language.")
+        if tgt_lang not in self.supported_langs:
+            raise RuntimeError("Unsupported target language.")
+        # Map the language names to the codes the NLLB model expects
+        src_lang = LANG_LEX_2_CODE.get(src_lang, src_lang)
+        tgt_lang = LANG_LEX_2_CODE.get(tgt_lang, tgt_lang)
+        translated_text = self.translator(
+            txt_to_translate, src_lang=src_lang, tgt_lang=tgt_lang, batch_size=10
+        )[0]["translation_text"]
+        # If something goes wrong with the translation raise error
+        if not translated_text:
+            raise RuntimeError("Failed to generate translation.")
+        return translated_text
+    def _translate_if_needed(self, text: str, src_lang: str, tgt_lang: str) -> str:
+        """Translate ``text`` unless it is blank or both languages are the same"""
+        if src_lang == tgt_lang or not text or not text.strip():
+            return text
+        return self.translate(text, src_lang=src_lang, tgt_lang=tgt_lang)
+    @staticmethod
+    def _extract_entry_text(entry) -> str:
+        """Return the body of a feed entry as plain text, with HTML markup removed"""
+        raw = entry.get("summary") or entry.get("content") or entry.get("description") or ""
+        # feedparser exposes "content" as a list of dicts carrying the text under "value"
+        if isinstance(raw, (list, tuple)):
+            raw = " ".join(
+                str(part.get("value", "")) if isinstance(part, dict) else str(part)
+                for part in raw
+            )
+        return BeautifulSoup(raw, features="html.parser").get_text()
+    def parse_and_process_feed(
+        self,
+        rss_url: HttpUrl,
+        src_lang: str,
+        tgt_lang: str,
+        entries_limit: int = None,
+    ) -> List[Dict]:
+        """Parse the input feed, and process the feed entries keeping the important information,
+        summarizing and translating it
+        Args:
+            rss_url (HttpUrl): the feed url to parse
+            src_lang (str): the feed's initial language (language code or language name)
+            tgt_lang (str): the target language to which the content will be translated
+            entries_limit (int, optional): the number of feed-entries to be processed. Defaults to None (process all).
+        Raises:
+            RuntimeError: propagated from translate() for unsupported languages or failed translations
+        Returns:
+            List[Dict]: a list of dictionaries, each one containing the processed info regarding
+            title, author, content and link for the respective feed entry
+        """
+        # Accept both language codes ("en") and language names ("English")
+        src_lang = LANGUAGES.get(src_lang, src_lang)
+        tgt_lang = LANGUAGES.get(tgt_lang, tgt_lang)
+        default_lang = LANGUAGES.get("en", "en")
+        # Hand feedparser a plain string even if a URL object was supplied
+        feed = feedparser.parse(str(rss_url))
+        if feed.get("bozo") and not feed.entries:
+            logger.warning(
+                "Feed %s could not be parsed: %s", rss_url, feed.get("bozo_exception")
+            )
+        # Return the maximum number of entries in case entries is None or exceeding entries length
+        processed_entries = feed.entries[:entries_limit]
+        # Iterate over each entry in the feed
+        for entry in processed_entries:
+            title = entry.get("title", "")
+            author = entry.get("author", "")
+            link = entry.get("link", "")
+            content = self._extract_entry_text(entry)
+            # If source language is not English, first translate it to English to summarize
+            content = self._translate_if_needed(content, src_lang, default_lang)
+            # Summarize the content
+            summarized_content = self.summarize(content, max_length=30, min_length=10)
+            # Translate the title; skipped for blank titles and identical languages
+            translated_title = self._translate_if_needed(title, src_lang, tgt_lang)
+            # Unless the target language is already the default, translate the summary
+            translated_content = self._translate_if_needed(
+                summarized_content, default_lang, tgt_lang
+            )
+            # Update entry
+            entry.update(
+                {
+                    "title": translated_title,
+                    "content": translated_content,
+                    "author": author,
+                    "link": link,
+                }
+            )
+        return processed_entries

src/transum_app.py ADDED Viewed

	@@ -0,0 +1,232 @@

+import gradio as gr
+# import spaces
+from typing import Dict, List, Tuple
+from pydantic import HttpUrl
+from task_management import TaskManager
+from config import LANGUAGES
+# Gradio interface
+# Shared TaskManager instance: constructing one loads both models, so it is
+# built lazily on the first request and reused afterwards
+_task_manager = None
+def _get_task_manager() -> TaskManager:
+    """Return the shared TaskManager, creating it on first use"""
+    global _task_manager
+    if _task_manager is None:
+        _task_manager = TaskManager()
+    return _task_manager
+# @spaces.GPU
+def process_rss(
+    rss_url: HttpUrl,
+    source_lang: str,
+    target_lang: str,
+    entries_limit: int = None,
+) -> Tuple[List[Dict], int]:
+    """The wrapper to the respective task management function to retrieve the
+    summarized and translated entries from the feed
+    Args:
+        rss_url (HttpUrl): the feed url
+        source_lang (str): the feed's language
+        target_lang (str): the language the entries are translated to
+        entries_limit (int, optional): the maximum number of entries to process. Defaults to None (process all).
+    Raises:
+        gr.Error: if loading the models or parsing/processing the feed fails
+    Returns:
+        Tuple[List[Dict], int]: the processed entries and how many of them there are
+    """
+    try:
+        processed_entries = _get_task_manager().parse_and_process_feed(
+            rss_url, source_lang, target_lang, entries_limit
+        )
+    except Exception as e:
+        # Pass gr.Error a message string and keep the original exception as the cause
+        raise gr.Error(str(e)) from e
+    return processed_entries, len(processed_entries)
+# Custom css: bold text for the "entries retrieved" textbox (#messOut) and a
+# white background for the summaries tab (#entriesTab)
+custom_css = """
+#messOut textarea {
+    font-weight: bold;
+}
+#entriesTab {
+    background-color: white;
+}
+"""
+# Build the application layout; statement order below determines component order
+with gr.Blocks(
+    theme=gr.themes.Soft(),
+    css=custom_css,
+) as demo:
+    # Add a title using Markdown
+    gr.Markdown("# RSS Feed Summarizer and Translator")
+    # Add a description using Markdown
+    gr.Markdown(
+        "Input an RSS feed URL and specify the source and target languages to get summarized and translated content."
+    )
+    # State holding the processed feed entries from the latest submission
+    rss_entries = gr.State([])
+    with gr.Row():
+        # Step for starting points and options' steps for entries' dropdowns (retrieve and view)
+        step = 5
+        # Left column: user inputs and action buttons
+        with gr.Column():
+            rss_url = gr.Textbox(label="RSS Feed URL")
+            # Language codes (keys of LANGUAGES) are offered as dropdown choices
+            languages_lst = LANGUAGES.keys()
+            source_lang = gr.Dropdown(
+                choices=languages_lst,
+                value="",
+                label="Source Language",
+            )
+            target_lang = gr.Dropdown(
+                choices=languages_lst,
+                value="",
+                label="Target Language",
+            )
+            # Retrieval limits offered to the user: 5, 10, ..., 200
+            options_lst = list(range(5, 205, 5))
+            entries_to_retrieve = gr.Dropdown(
+                choices=options_lst,
+                value=options_lst[0],
+                label="Max Entries To Retrieve",
+            )
+            with gr.Row():
+                clear_btn = gr.ClearButton(value="Clear")  # Clear button
+                submit_btn = gr.Button("Submit", variant="primary")
+        # Right column: results
+        with gr.Column():
+            # Message for feed entries retrieved and spinner purposes
+            message_output = gr.Textbox(
+                label="Entries Retrieved: ",
+                interactive=False,
+                elem_id="messOut",
+            )
+            def submit_request(
+                feed_url: HttpUrl,
+                src_lang: str,
+                tgt_lang: str,
+                entries_limit: int,
+                latest_entries_num: int,
+            ) -> Tuple[List[Dict], int, str]:
+                """Handle a press of the submit button: retrieve and process the feed
+                entries through process_rss, then render them with
+                format_processed_entries for the output component
+                Args:
+                    feed_url (HttpUrl): the feed url
+                    src_lang (str): source language
+                    tgt_lang (str): target language
+                    entries_limit (int): the maximum number of entries to retrieve
+                    latest_entries_num (int): the count shown from a previous submission, if any (currently unused)
+                Returns:
+                    Tuple[List[Dict], int, str]: the processed entries, how many there are, and their formatted rendering
+                """
+                retrieved = process_rss(feed_url, src_lang, tgt_lang, entries_limit)
+                entries, total = retrieved
+                # No view limit is passed, so every retrieved entry is rendered
+                rendered = format_processed_entries(entries)
+                return entries, total, rendered
+            with gr.Tab("Feed Summaries:", visible=True, elem_id="entriesTab"):
+                # Markdown area that displays the formatted feed entries
+                markdown_output = gr.Markdown(height="400px")
+                # Limits how many entries are rendered; starts with a single option and
+                # is repopulated by update_view_dropdown whenever rss_entries changes
+                entries_to_view = gr.Dropdown(
+                    choices=[options_lst[0]],
+                    value=options_lst[0],
+                    label="Max Entries To View",
+                )
+                @gr.on(
+                    [entries_to_view.change],
+                    inputs=[rss_entries, entries_to_view],
+                    outputs=[markdown_output],
+                )
+                def format_processed_entries(
+                    processed_entries: List[Dict], entries_limit: int = None
+                ) -> str:
+                    """Render the processed entries as a single Markdown string
+                    Args:
+                        processed_entries (List[Dict]): the processed entries retrieved from the feed
+                        entries_limit (int): how many entries to render; a falsy value renders all of them
+                    Returns:
+                        str: the Markdown output, one section per entry
+                    """
+                    # A falsy limit falls back to the full length; None keeps the
+                    # slice open-ended when the list itself is empty
+                    limit = entries_limit or len(processed_entries) or None
+                    sections = []
+                    for item in processed_entries[:limit]:
+                        parts = [
+                            f"### {item.get('title', '---')}\n\n",
+                            f"**Author:** {item.get('author', '-')}\n\n",
+                            f"{item.get('content', '')}\n\n",
+                        ]
+                        url = item.get("link", "")
+                        # Only entries that carry a link get a "Read more" line
+                        if url:
+                            parts.append(f"[Read more]({url})\n\n")
+                        parts.append("---\n\n")
+                        sections.append("".join(parts))
+                    return "".join(sections)
+                # Function to handle dropdown options for viewing entries
+                @gr.on(
+                    [rss_entries.change],
+                    inputs=[rss_entries],
+                    outputs=[entries_to_view],
+                )
+                def update_view_dropdown(view_entries: List[Dict]) -> gr.Dropdown:
+                    """Update the options for view dropdown
+                    Args:
+                        view_entries (List[Dict]): the view entries list
+                    Returns:
+                        gr.Dropdown: a dropdown component whose options run in multiples of
+                        ``step`` up to the first multiple that covers all entries; an empty
+                        list still yields the single option ``step``
+                    """
+                    # Plain count: the previous "len(...) or None" produced "None + step"
+                    # (TypeError) whenever the entries list was empty
+                    max_entries_shown = len(view_entries or [])
+                    # Update the dropdown options with the new length
+                    dropdown_options = list(
+                        range(step, max_entries_shown + step, step)
+                    ) or [step]
+                    # Return outputs to update components
+                    return gr.Dropdown(
+                        choices=dropdown_options,
+                        value=entries_to_view.value,
+                        label="Entries to view",
+                    )
+    # Link the function to the button: the current text of message_output is passed
+    # as submit_request's last argument, and the three returned values update the
+    # entries state, the retrieved-count textbox and the Markdown view
+    submit_btn.click(
+        submit_request,
+        inputs=[rss_url, source_lang, target_lang, entries_to_retrieve, message_output],
+        outputs=[rss_entries, message_output, markdown_output],
+    )
+    # Link the Clear button to reset inputs and outputs
+    # (rss_entries and message_output are not in the list below)
+    clear_btn.add(
+        components=[
+            rss_url,
+            source_lang,
+            target_lang,
+            markdown_output,
+            entries_to_view,
+            entries_to_retrieve,
+        ]
+    )
+# Launch the interface (runs as soon as this module is executed)
+demo.launch()