Spaces:
Sleeping
Sleeping
Initial commit
Browse files- requirements.txt +10 -0
- src/config.py +83 -0
- src/logging_conf.py +56 -0
- src/task_management.py +250 -0
- src/transum_app.py +232 -0
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
bs4
|
2 |
+
feedparser
|
3 |
+
gradio
|
4 |
+
protobuf
|
5 |
+
pydantic
|
6 |
+
python-dotenv
|
7 |
+
sentencepiece
|
8 |
+
torch
|
9 |
+
spaces
|
10 |
+
transformers
|
src/config.py
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
LANGUAGES = {
|
2 |
+
"el": "Greek",
|
3 |
+
"en": "English",
|
4 |
+
"es": "Spanish",
|
5 |
+
"fr": "French",
|
6 |
+
"de": "German",
|
7 |
+
"it": "Italian",
|
8 |
+
}
|
9 |
+
|
10 |
+
|
11 |
+
LANG_LEX_2_CODE = {
|
12 |
+
"English": "eng_Latn",
|
13 |
+
"French": "fra_Latn",
|
14 |
+
"Spanish": "spa_Latn",
|
15 |
+
"Italian": "ita_Latn",
|
16 |
+
"German": "deu_Latn",
|
17 |
+
"Greek": "ell_Grek",
|
18 |
+
"Chinese": "zho_Hans",
|
19 |
+
"Japanese": "jpn_Jpan",
|
20 |
+
"Russian": "rus_Cyrl",
|
21 |
+
"Arabic": "arb_Arab",
|
22 |
+
"Portuguese": "por_Latn",
|
23 |
+
"Dutch": "nld_Latn",
|
24 |
+
"Turkish": "tur_Latn",
|
25 |
+
"Hindi": "hin_Deva",
|
26 |
+
"Korean": "kor_Hang",
|
27 |
+
"Vietnamese": "vie_Latn",
|
28 |
+
"Thai": "tha_Thai",
|
29 |
+
"Polish": "pol_Latn",
|
30 |
+
"Swedish": "swe_Latn",
|
31 |
+
"Finnish": "fin_Latn",
|
32 |
+
"Danish": "dan_Latn",
|
33 |
+
"Norwegian": "nob_Latn",
|
34 |
+
"Czech": "ces_Latn",
|
35 |
+
"Hungarian": "hun_Latn",
|
36 |
+
"Romanian": "ron_Latn",
|
37 |
+
"Hebrew": "heb_Hebr",
|
38 |
+
"Ukrainian": "ukr_Cyrl",
|
39 |
+
"Bulgarian": "bul_Cyrl",
|
40 |
+
"Indonesian": "ind_Latn",
|
41 |
+
"Malay": "zsm_Latn",
|
42 |
+
"Tamil": "tam_Taml",
|
43 |
+
"Telugu": "tel_Telu",
|
44 |
+
"Urdu": "urd_Arab",
|
45 |
+
}
|
46 |
+
|
47 |
+
|
48 |
+
# SUMMARIZATION_PREFIXES = {
|
49 |
+
# "en": "summarize: ", # English
|
50 |
+
# "fr": "résume: ", # French
|
51 |
+
# "es": "resume: ", # Spanish
|
52 |
+
# "it": "riassumi: ", # Italian
|
53 |
+
# "de": "fasse zusammen: ", # German
|
54 |
+
# "el": "σύνοψη: ", # Greek
|
55 |
+
# "zh": "总结: ", # Chinese (Simplified)
|
56 |
+
# "ja": "要約: ", # Japanese
|
57 |
+
# "ru": "резюме: ", # Russian
|
58 |
+
# "ar": "لخص: ", # Arabic
|
59 |
+
# "pt": "resuma: ", # Portuguese
|
60 |
+
# "nl": "vat samen: ", # Dutch
|
61 |
+
# "tr": "özetle: ", # Turkish
|
62 |
+
# "hi": "सारांश: ", # Hindi
|
63 |
+
# "ko": "요약: ", # Korean
|
64 |
+
# "vi": "tóm tắt: ", # Vietnamese
|
65 |
+
# "th": "สรุป: ", # Thai
|
66 |
+
# "pl": "podsumuj: ", # Polish
|
67 |
+
# "sv": "sammanfatta: ", # Swedish
|
68 |
+
# "fi": "tiivistä: ", # Finnish
|
69 |
+
# "da": "opsummer: ", # Danish
|
70 |
+
# "no": "oppsummer: ", # Norwegian
|
71 |
+
# "cs": "shrnutí: ", # Czech
|
72 |
+
# "hu": "összefoglalás: ", # Hungarian
|
73 |
+
# "ro": "rezumă: ", # Romanian
|
74 |
+
# "he": "לסכם: ", # Hebrew
|
75 |
+
# "uk": "резюме: ", # Ukrainian
|
76 |
+
# "bg": "резюме: ", # Bulgarian
|
77 |
+
# "id": "ringkasan: ", # Indonesian
|
78 |
+
# "ms": "ringkasan: ", # Malay
|
79 |
+
# "ta": "சுருக்கம்: ", # Tamil
|
80 |
+
# "te": "సారాంశం: ", # Telugu
|
81 |
+
# "ur": "خلاصہ: ", # Urdu
|
82 |
+
# # Add more languages as needed
|
83 |
+
# }
|
src/logging_conf.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Dictionary-schema logging configuration, consumed via
# ``logging.config.dictConfig(LOGGING_CONFIG)``.
#
# Layout:
#   * formatters - three message layouts of increasing verbosity.
#   * handlers   - one console stream plus three log files written to the
#                  current working directory.
#   * loggers    - the root logger and two named application loggers. The
#                  named loggers do not propagate, so their records are not
#                  handled a second time by the root logger.
LOGGING_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "standard": {
            "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        },
        "detailed": {
            "format": "%(asctime)s - %(name)s - %(levelname)s - %(module)s - %(lineno)d - %(message)s",
        },
        "simple": {
            "format": "%(levelname)s - %(message)s",
        },
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "formatter": "simple",
            "level": "INFO",
        },
        # Keys other than class/level/formatter/filters are passed to the
        # handler constructor. The files are pinned to UTF-8 so that records
        # containing non-ASCII text do not depend on the platform's default
        # encoding.
        "file_info": {
            "class": "logging.FileHandler",
            "filename": "info.log",
            "formatter": "standard",
            "level": "INFO",
            "encoding": "utf-8",
        },
        "file_debug": {
            "class": "logging.FileHandler",
            "filename": "debug.log",
            "formatter": "detailed",
            "level": "DEBUG",
            "encoding": "utf-8",
        },
        "file_error": {
            "class": "logging.FileHandler",
            "filename": "error.log",
            "formatter": "detailed",
            "level": "ERROR",
            "encoding": "utf-8",
        },
    },
    "loggers": {
        "": {  # root logger
            "handlers": ["console", "file_info"],
            "level": "INFO",
        },
        "src.task_management": {
            "handlers": ["console", "file_debug"],
            "level": "DEBUG",
            "propagate": False,
        },
        "src.transum_app": {
            "handlers": ["console", "file_error"],
            "level": "ERROR",
            "propagate": False,
        },
    },
}
|
src/task_management.py
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import logging.config
|
3 |
+
|
4 |
+
from typing import Dict, List
|
5 |
+
|
6 |
+
import feedparser
|
7 |
+
import torch
|
8 |
+
|
9 |
+
from bs4 import BeautifulSoup
|
10 |
+
from functools import wraps
|
11 |
+
from time import time
|
12 |
+
from pydantic import HttpUrl
|
13 |
+
from transformers import (
|
14 |
+
AutoConfig,
|
15 |
+
AutoModelForSeq2SeqLM,
|
16 |
+
AutoTokenizer,
|
17 |
+
pipeline,
|
18 |
+
)
|
19 |
+
|
20 |
+
from config import LANGUAGES, LANG_LEX_2_CODE
|
21 |
+
from logging_conf import LOGGING_CONFIG
|
22 |
+
|
23 |
+
|
24 |
+
logging.config.dictConfig(LOGGING_CONFIG)
|
25 |
+
logger = logging.getLogger("src.task_management")
|
26 |
+
|
27 |
+
|
28 |
+
def proc_timer(f):
    """Decorator that logs the wall-clock duration of every call to ``f``.

    The wrapped function's return value is passed through unchanged. After a
    successful call, one INFO record is written to the module-level ``logger``
    with the function name, the positional and keyword arguments, and the
    elapsed time in seconds (four decimals). If ``f`` raises, the exception
    propagates and nothing is logged.

    Args:
        f: the callable to time.

    Returns:
        A wrapper with the same signature and metadata as ``f``.
    """

    @wraps(f)
    def wrapper(*args, **kw):
        ts = time()
        result = f(*args, **kw)
        te = time()
        # Lazy %-style arguments: the duration is really formatted as a float
        # (the previous f-string emitted a literal ":%2.4f"), and the message
        # is only built when the record is actually emitted.
        logger.info(
            "func:%s args:[%s, %s] took: %2.4f sec", f.__name__, args, kw, te - ts
        )
        return result

    return wrapper
|
38 |
+
|
39 |
+
|
40 |
+
class TaskManager:
    """Run the feed pipeline: parse an RSS feed, summarize and translate entries.

    ``__init__`` loads two models:
        * ``facebook/bart-large-cnn`` (config, model, tokenizer) for
          summarization, placed on CUDA when available, otherwise on the CPU;
        * ``facebook/nllb-200-distilled-1.3B`` as a ``transformers``
          translation pipeline, placed on the CPU.

    ``supported_langs`` holds the language names (the values of ``LANGUAGES``)
    that ``translate`` accepts.
    """

    def __init__(self):
        # The translation languages supported by the application
        self.supported_langs = LANGUAGES.values()

        # Load the bart-large-cnn config, model and tokenizer
        summarization_model_name = "facebook/bart-large-cnn"

        # Run summarization on the GPU if one is available
        self.summarization_device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )

        self.summarization_config = AutoConfig.from_pretrained(summarization_model_name)

        self.summarizer = AutoModelForSeq2SeqLM.from_pretrained(
            summarization_model_name
        ).to(self.summarization_device)

        self.summarization_tokenizer = AutoTokenizer.from_pretrained(
            summarization_model_name
        )

        # Translation is pinned to the CPU
        self.translation_device = (
            "cpu"  # torch.device("cuda" if torch.cuda.is_available() else "cpu")
        )

        # Load translation pipeline for model facebook/nllb-200-distilled-1.3B
        self.translator = pipeline(
            "translation",
            model="facebook/nllb-200-distilled-1.3B",
            device=self.translation_device,
        )

    @staticmethod
    def _summary_length_bounds(
        text_length: int, max_length: int, cfg_min_length: int, cfg_max_length: int
    ):
        """Compute the ``(max_length, min_length)`` pair passed to ``generate``.

        * If the configured maximum exceeds half of the text length, the
          maximum becomes 30% of the text length; otherwise it is the larger
          of the requested ``max_length`` and the configured maximum.
        * The maximum is clamped to at least 1 so very short inputs never
          produce a zero generation budget.
        * The minimum keeps the configured min/max ratio relative to the
          chosen maximum, capped at the configured minimum.

        NOTE(review): ``text_length`` is a character count while the limits
        given to ``generate`` are presumably token counts - confirm the
        heuristic is intended.
        """
        if cfg_max_length > 0.5 * text_length:
            max_length = round(text_length * 0.3)
        else:
            max_length = max(max_length, cfg_max_length)
        max_length = max(max_length, 1)

        min_to_max_perc = cfg_min_length / cfg_max_length
        min_length = min(round(min_to_max_perc * max_length), cfg_min_length)
        return max_length, min_length

    # @proc_timer
    def summarize(
        self, txt_to_summarize: str, max_length: int = 30, min_length: int = 10
    ) -> str:
        """Summarize the provided text with the bart-large-cnn model.

        Args:
            txt_to_summarize (str): the text that needs to be summarized
            max_length (int, optional): requested upper limit for the summary;
                only used when the text is long enough (see
                ``_summary_length_bounds``). Defaults to 30.
            min_length (int, optional): kept for interface compatibility; the
                effective minimum is always derived from the model
                configuration. Defaults to 10.

        Returns:
            str: the summarized text, or an empty string for blank input
        """
        # Blank input: skip the model call, which would otherwise be asked to
        # generate with a zero-length budget.
        if not txt_to_summarize or not txt_to_summarize.strip():
            return ""

        max_length, min_length = self._summary_length_bounds(
            len(txt_to_summarize),
            max_length,
            self.summarization_config.min_length,
            self.summarization_config.max_length,
        )

        # Tokenize input, truncating to at most 1024 tokens
        inputs = self.summarization_tokenizer(
            txt_to_summarize, return_tensors="pt", max_length=1024, truncation=True
        ).to(self.summarization_device)

        # Generate the summary with beam search inside the computed bounds
        summary_ids = self.summarizer.generate(
            inputs["input_ids"],
            max_length=max_length,
            min_length=min_length,
            num_beams=4,
            early_stopping=True,
        )

        # Decode the summary
        return self.summarization_tokenizer.decode(
            summary_ids[0], skip_special_tokens=True
        )

    def _ensure_supported(self, src_lang: str, tgt_lang: str) -> None:
        """Raise ``RuntimeError`` unless both language names are supported."""
        if src_lang not in self.supported_langs:
            raise RuntimeError("Unsupported source language.")
        if tgt_lang not in self.supported_langs:
            raise RuntimeError("Unsupported target language.")

    # @proc_timer
    def translate(self, txt_to_translate: str, src_lang: str, tgt_lang: str) -> str:
        """Translate the provided text from a source language to a target language.

        Args:
            txt_to_translate (str): the text to translate
            src_lang (str): name of the text's language (a value of ``LANGUAGES``)
            tgt_lang (str): name of the language to translate to

        Raises:
            RuntimeError: error in case of unsupported source language
            RuntimeError: error in case of unsupported target language
            RuntimeError: error in case the translation comes back empty

        Returns:
            str: the translated text
        """
        self._ensure_supported(src_lang, tgt_lang)

        # Map the language names to the codes expected by the NLLB model
        src_code = LANG_LEX_2_CODE.get(src_lang, src_lang)
        tgt_code = LANG_LEX_2_CODE.get(tgt_lang, tgt_lang)
        translated_text = self.translator(
            txt_to_translate, src_lang=src_code, tgt_lang=tgt_code, batch_size=10
        )[0]["translation_text"]

        # If something goes wrong with the translation raise error
        if not translated_text:
            raise RuntimeError("Failed to generate translation.")

        return translated_text

    def _translate_if_needed(self, text: str, src_lang: str, tgt_lang: str) -> str:
        """Translate ``text`` unless it is blank or the two languages are equal.

        Blank text is returned as is, because ``translate`` treats an empty
        result as a failure and one entry with a missing field would abort
        the whole feed.
        """
        if src_lang == tgt_lang or not text or not text.strip():
            return text
        return self.translate(text, src_lang=src_lang, tgt_lang=tgt_lang)

    @staticmethod
    def _raw_entry_content(entry) -> str:
        """Return the first non-empty body text of a feed entry.

        Looks at ``summary``, ``content`` and ``description`` in that order.
        feedparser exposes ``content`` as a list of dicts with the text under
        ``"value"``, so list values are flattened to a single string.
        """
        for key in ("summary", "content", "description"):
            value = entry.get(key)
            if isinstance(value, (list, tuple)):
                value = "".join(
                    (part.get("value") or "") for part in value if hasattr(part, "get")
                )
            if value:
                return str(value)
        return ""

    def parse_and_process_feed(
        self,
        rss_url: HttpUrl,
        src_lang: str,
        tgt_lang: str,
        entries_limit: int = None,
    ) -> List[Dict]:
        """Parse the feed and summarize/translate its entries.

        For each entry, the body is stripped of HTML, translated to English
        when the source language differs, summarized, then translated to the
        target language when that is not English. The title is translated
        directly from source to target. Each entry is updated in place.

        Args:
            rss_url (HttpUrl): the feed url to parse
            src_lang (str): the feed's language, as a ``LANGUAGES`` key or name
            tgt_lang (str): the language to translate the content to, as a
                ``LANGUAGES`` key or name
            entries_limit (int, optional): the number of feed entries to be
                processed. Defaults to None (process all).

        Raises:
            RuntimeError: if either language is unsupported (checked before
                the feed is fetched) or a translation comes back empty

        Returns:
            List[Dict]: the processed feed entries, each with its ``title``,
                ``content``, ``author`` and ``link`` keys set to the processed
                values
        """
        src_lang = LANGUAGES.get(src_lang, src_lang)
        tgt_lang = LANGUAGES.get(tgt_lang, tgt_lang)
        default_lang = LANGUAGES.get("en", "en")

        # Fail fast on an unsupported language before doing any network work
        self._ensure_supported(src_lang, tgt_lang)

        feed = feedparser.parse(rss_url)

        # Slicing with None (or a limit beyond the length) keeps every entry
        processed_entries = feed.entries[:entries_limit]

        for entry in processed_entries:
            title = entry.get("title", "")
            author = entry.get("author", "")
            link = entry.get("link", "")

            # Strip the markup, keeping only the text
            soup = BeautifulSoup(self._raw_entry_content(entry), features="html.parser")
            content = soup.get_text()

            # The summarizer works on English text: translate first if needed
            content = self._translate_if_needed(content, src_lang, default_lang)
            summarized_content = self.summarize(content, max_length=30, min_length=10)

            translated_title = self._translate_if_needed(title, src_lang, tgt_lang)
            translated_content = self._translate_if_needed(
                summarized_content, default_lang, tgt_lang
            )

            entry.update(
                {
                    "title": translated_title,
                    "content": translated_content,
                    "author": author,
                    "link": link,
                }
            )

        return processed_entries
|
src/transum_app.py
ADDED
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
# import spaces
|
4 |
+
|
5 |
+
from typing import Dict, List, Tuple
|
6 |
+
from pydantic import HttpUrl
|
7 |
+
|
8 |
+
from task_management import TaskManager
|
9 |
+
from config import LANGUAGES
|
10 |
+
|
11 |
+
|
12 |
+
# Gradio interface
|
13 |
+
# @spaces.GPU
|
14 |
+
def process_rss(
    rss_url: HttpUrl,
    source_lang: str,
    target_lang: str,
    entries_limit: int = None,
) -> Tuple[List[Dict], int]:
    """Retrieve the summarized and translated entries of a feed.

    Thin wrapper around ``TaskManager.parse_and_process_feed`` that turns any
    failure into a ``gr.Error``.

    Args:
        rss_url (HttpUrl): the feed url
        source_lang (str): the language of the feed
        target_lang (str): the language to translate the entries to
        entries_limit (int, optional): maximum number of entries to process.
            Defaults to None (process all).

    Raises:
        gr.Error: if parsing, summarizing or translating fails; the original
            exception is chained as the cause

    Returns:
        Tuple[List[Dict], int]: the processed entries and their count
    """
    try:
        processed_entries = _get_task_manager().parse_and_process_feed(
            rss_url, source_lang, target_lang, entries_limit
        )
    except Exception as e:
        # gr.Error takes a message; keep the original exception as the cause
        raise gr.Error(str(e)) from e

    return processed_entries, len(processed_entries)


# Shared TaskManager, built on first use. Constructing one loads the
# summarization and translation models, so it must not happen per request.
_TASK_MANAGER = None


def _get_task_manager():
    """Return the shared TaskManager, creating it on the first call."""
    global _TASK_MANAGER
    if _TASK_MANAGER is None:
        _TASK_MANAGER = TaskManager()
    return _TASK_MANAGER
|
44 |
+
|
45 |
+
|
46 |
+
# Custom css
|
47 |
+
custom_css = """
|
48 |
+
#messOut textarea {
|
49 |
+
font-weight: bold;
|
50 |
+
}
|
51 |
+
|
52 |
+
#entriesTab {
|
53 |
+
background-color: white;
|
54 |
+
}
|
55 |
+
"""
|
56 |
+
|
57 |
+
# Build the Gradio UI: inputs, controls and the scrollable output area
|
58 |
+
with gr.Blocks(
|
59 |
+
theme=gr.themes.Soft(),
|
60 |
+
css=custom_css,
|
61 |
+
) as demo:
|
62 |
+
# Add a title using Markdown
|
63 |
+
gr.Markdown("# RSS Feed Summarizer and Translator")
|
64 |
+
|
65 |
+
# Add a description using Markdown
|
66 |
+
gr.Markdown(
|
67 |
+
"Input an RSS feed URL and specify the source and target languages to get summarized and translated content."
|
68 |
+
)
|
69 |
+
|
70 |
+
rss_entries = gr.State([])
|
71 |
+
|
72 |
+
with gr.Row():
|
73 |
+
# Step for starting points and options' steps for entries' dropdowns (retrieve and view)
|
74 |
+
step = 5
|
75 |
+
|
76 |
+
with gr.Column():
|
77 |
+
rss_url = gr.Textbox(label="RSS Feed URL")
|
78 |
+
|
79 |
+
languages_lst = LANGUAGES.keys()
|
80 |
+
|
81 |
+
source_lang = gr.Dropdown(
|
82 |
+
choices=languages_lst,
|
83 |
+
value="",
|
84 |
+
label="Source Language",
|
85 |
+
)
|
86 |
+
target_lang = gr.Dropdown(
|
87 |
+
choices=languages_lst,
|
88 |
+
value="",
|
89 |
+
label="Target Language",
|
90 |
+
)
|
91 |
+
|
92 |
+
options_lst = list(range(5, 205, 5))
|
93 |
+
entries_to_retrieve = gr.Dropdown(
|
94 |
+
choices=options_lst,
|
95 |
+
value=options_lst[0],
|
96 |
+
label="Max Entries To Retrieve",
|
97 |
+
)
|
98 |
+
|
99 |
+
with gr.Row():
|
100 |
+
clear_btn = gr.ClearButton(value="Clear") # Clear button
|
101 |
+
submit_btn = gr.Button("Submit", variant="primary")
|
102 |
+
|
103 |
+
with gr.Column():
|
104 |
+
# Message for feed entries retrieved and spinner purposes
|
105 |
+
message_output = gr.Textbox(
|
106 |
+
label="Entries Retrieved: ",
|
107 |
+
interactive=False,
|
108 |
+
elem_id="messOut",
|
109 |
+
)
|
110 |
+
|
111 |
+
def submit_request(
    feed_url: HttpUrl,
    src_lang: str,
    tgt_lang: str,
    entries_limit: int,
    latest_entries_num: int,
) -> Tuple[List[Dict], int, str]:
    """Handle a press of the submit button.

    Retrieves the feed entries through process_rss and renders them with
    format_processed_entries so they can be shown in the output component.

    Args:
        feed_url (HttpUrl): the feed url
        src_lang (str): source language
        tgt_lang (str): target language
        entries_limit (int): the maximum number of entries to retrieve
        latest_entries_num (int): the number of entries retrieved by the
            previous submission, if any (currently not used)

    Returns:
        Tuple[List[Dict], int, str]: the retrieved entries, how many there
            are, and the entries formatted for display
    """
    retrieved = process_rss(feed_url, src_lang, tgt_lang, entries_limit)
    entries, count = retrieved
    return entries, count, format_processed_entries(entries)
|
139 |
+
|
140 |
+
with gr.Tab("Feed Summaries:", visible=True, elem_id="entriesTab"):
|
141 |
+
# Create a scrollable Markdown component
|
142 |
+
markdown_output = gr.Markdown(height="400px")
|
143 |
+
|
144 |
+
entries_to_view = gr.Dropdown(
|
145 |
+
choices=[options_lst[0]],
|
146 |
+
value=options_lst[0],
|
147 |
+
label="Max Entries To View",
|
148 |
+
)
|
149 |
+
|
150 |
+
@gr.on(
|
151 |
+
[entries_to_view.change],
|
152 |
+
inputs=[
|
153 |
+
rss_entries,
|
154 |
+
entries_to_view,
|
155 |
+
],
|
156 |
+
outputs=[markdown_output],
|
157 |
+
)
|
158 |
+
def format_processed_entries(
    processed_entries: List[Dict], entries_limit: int = None
) -> str:
    """Format the processed entries as Markdown.

    Each entry becomes a level-3 heading with the title, an author line, the
    content, an optional "Read more" link and a horizontal rule.

    Args:
        processed_entries (List[Dict]): the entries retrieved from the feed
            that have been processed; ``None`` is treated as an empty list
        entries_limit (int, optional): a limit for the entries to view; a
            falsy value (None or 0) shows all of them

    Returns:
        str: the formatted output containing the entries
    """
    shown = (processed_entries or [])[: entries_limit or None]

    blocks = []
    for entry in shown:
        # ``or`` rather than a .get() default: the keys are usually present
        # but may hold an empty string, which should still get a placeholder.
        parts = [
            f"### {entry.get('title') or '---'}\n\n",
            f"**Author:** {entry.get('author') or '-'}\n\n",
            f"{entry.get('content', '')}\n\n",
        ]
        link = entry.get("link", "")
        if link:
            parts.append(f"[Read more]({link})\n\n")
        parts.append("---\n\n")
        blocks.append("".join(parts))

    return "".join(blocks)
|
184 |
+
|
185 |
+
# Function to handle dropdown options for viewing entries
|
186 |
+
@gr.on(
|
187 |
+
[rss_entries.change],
|
188 |
+
inputs=[rss_entries],
|
189 |
+
outputs=[entries_to_view],
|
190 |
+
)
|
191 |
+
def update_view_dropdown(view_entries: List[Dict]) -> gr.Dropdown:
    """Update the options of the "entries to view" dropdown.

    The options run from ``step`` up to the first multiple of ``step`` that
    covers the number of entries. An empty (or missing) entry list yields the
    single option ``step`` instead of failing.

    ``step`` and ``entries_to_view`` are taken from the surrounding UI code.

    Args:
        view_entries (List[Dict]): the view entries list

    Returns:
        gr.Dropdown: a dropdown component with the updated options regarding
            view entries
    """
    entry_count = len(view_entries or [])

    # range() is empty when there are no entries; always keep one option
    dropdown_options = list(range(step, entry_count + step, step)) or [step]

    # NOTE(review): ``entries_to_view.value`` is read from the component
    # object rather than received as an event input - confirm this is the
    # value that should stay selected.
    return gr.Dropdown(
        choices=dropdown_options,
        value=entries_to_view.value,
        label="Entries to view",
    )
|
211 |
+
|
212 |
+
# Link the function to the button
|
213 |
+
submit_btn.click(
|
214 |
+
submit_request,
|
215 |
+
inputs=[rss_url, source_lang, target_lang, entries_to_retrieve, message_output],
|
216 |
+
outputs=[rss_entries, message_output, markdown_output],
|
217 |
+
)
|
218 |
+
|
219 |
+
# Link the Clear button to reset inputs and outputs
|
220 |
+
clear_btn.add(
|
221 |
+
components=[
|
222 |
+
rss_url,
|
223 |
+
source_lang,
|
224 |
+
target_lang,
|
225 |
+
markdown_output,
|
226 |
+
entries_to_view,
|
227 |
+
entries_to_retrieve,
|
228 |
+
]
|
229 |
+
)
|
230 |
+
|
231 |
+
# Launch the interface
|
232 |
+
demo.launch()
|