Spaces:

pratik-19
/

VerbaLink

Sleeping

App Files Community

pratik-19 committed on Jan 25

Commit

40e2d84

1 Parent(s): 7334e78

minor changes

Browse files

Files changed (1) hide show

app.py +27 -27

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from transformers import MBart50Tokenizer, AutoModelForSeq2SeqLM, pipeline
 from langdetect import detect
 def load_models():
     tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50")
@@ -26,30 +27,10 @@ def detect_language(text):
     return lang_code
-def summarize_text(text, lang_code):
-    mbart_lang_code = LANGUAGE_CODES.get(lang_code, "en_XX")  # Default to English if unsupported
-    inputs = tokenizer(
-        f"<{mbart_lang_code}>{text}",
-        return_tensors="pt",
-        max_length=1024,
-        truncation=True
-    )
-    summary_ids = summarizer.model.generate(
-        inputs["input_ids"],
-        max_length=100,
-        min_length=30,
-        length_penalty=2.0,
-        num_beams=4
-    )
-    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-    summary = summary.replace(f"< >", "").strip()
-    return summary
-def translate_to_english(text, lang_code):
-    # Set the language to English explicitly for translation
-    mbart_lang_code = "en_XX"  # Always translate to English
     # Encode the input text for translation
     inputs = tokenizer(
         f"<{mbart_lang_code}>{text}",
@@ -70,11 +51,28 @@ def translate_to_english(text, lang_code):
     translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
     # Remove any special language code tokens like "<en_XX>"
-    translated_text = translated_text.replace("< >< >", "").strip()
     return translated_text
 st.title("Multilingual Summarization and Translation App")
 st.markdown("""This app detects the language of the input text, summarizes it in the same language, and translates it into English.""")
@@ -91,11 +89,13 @@ if st.button("Process Text"):
             st.warning(f"The detected language ({lang_code}) is not supported by the model.")
         else:
             try:
                 summary = summarize_text(user_input, lang_code)
                 st.write(f"### Summarized Text ({lang_code}):")
                 st.write(summary)
-                translation = translate_to_english(summary, lang_code)
                 st.write("### Translated Text (English):")
                 st.write(translation)

 from transformers import MBart50Tokenizer, AutoModelForSeq2SeqLM, pipeline
 from langdetect import detect
+import re
 def load_models():
     tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50")
     return lang_code
+def translate_to_english(text):
+    # Always translate to English (en_XX)
+    mbart_lang_code = "en_XX"
     # Encode the input text for translation
     inputs = tokenizer(
         f"<{mbart_lang_code}>{text}",
     translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
     # Remove any special language code tokens like "<en_XX>"
+    translated_text = re.sub(r"<[^>]+>", "", translated_text).strip()
     return translated_text
+def summarize_text(text, lang_code):
+    mbart_lang_code = LANGUAGE_CODES.get(lang_code, "en_XX")  # Default to English if unsupported
+    inputs = tokenizer(
+        f"<{mbart_lang_code}>{text}",
+        return_tensors="pt",
+        max_length=1024,
+        truncation=True
+    )
+    summary_ids = summarizer.model.generate(
+        inputs["input_ids"],
+        max_length=100,
+        min_length=30,
+        length_penalty=2.0,
+        num_beams=4
+    )
+    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
+    summary = summary.replace(f"<>", "").strip()
+    return summary
 st.title("Multilingual Summarization and Translation App")
 st.markdown("""This app detects the language of the input text, summarizes it in the same language, and translates it into English.""")
             st.warning(f"The detected language ({lang_code}) is not supported by the model.")
         else:
             try:
+                # First summarize the text
                 summary = summarize_text(user_input, lang_code)
                 st.write(f"### Summarized Text ({lang_code}):")
                 st.write(summary)
+                # Then translate the summary to English
+                translation = translate_to_english(summary)
                 st.write("### Translated Text (English):")
                 st.write(translation)