pratik-19 committed on
Commit
40e2d84
·
1 Parent(s): 7334e78

minor changes

Browse files
Files changed (1) hide show
  1. app.py +27 -27
app.py CHANGED
@@ -1,5 +1,6 @@
1
  from transformers import MBart50Tokenizer, AutoModelForSeq2SeqLM, pipeline
2
  from langdetect import detect
 
3
 
4
  def load_models():
5
  tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50")
@@ -26,30 +27,10 @@ def detect_language(text):
26
  return lang_code
27
 
28
 
29
- def summarize_text(text, lang_code):
30
- mbart_lang_code = LANGUAGE_CODES.get(lang_code, "en_XX") # Default to English if unsupported
31
- inputs = tokenizer(
32
- f"<{mbart_lang_code}>{text}",
33
- return_tensors="pt",
34
- max_length=1024,
35
- truncation=True
36
- )
37
- summary_ids = summarizer.model.generate(
38
- inputs["input_ids"],
39
- max_length=100,
40
- min_length=30,
41
- length_penalty=2.0,
42
- num_beams=4
43
- )
44
- summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
45
- summary = summary.replace(f"< >", "").strip()
46
- return summary
47
-
48
-
49
- def translate_to_english(text, lang_code):
50
- # Set the language to English explicitly for translation
51
- mbart_lang_code = "en_XX" # Always translate to English
52
-
53
  # Encode the input text for translation
54
  inputs = tokenizer(
55
  f"<{mbart_lang_code}>{text}",
@@ -70,11 +51,28 @@ def translate_to_english(text, lang_code):
70
  translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
71
 
72
  # Remove any special language code tokens like "<en_XX>"
73
- translated_text = translated_text.replace("< >< >", "").strip()
74
 
75
  return translated_text
76
 
77
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  st.title("Multilingual Summarization and Translation App")
80
  st.markdown("""This app detects the language of the input text, summarizes it in the same language, and translates it into English.""")
@@ -91,11 +89,13 @@ if st.button("Process Text"):
91
  st.warning(f"The detected language ({lang_code}) is not supported by the model.")
92
  else:
93
  try:
 
94
  summary = summarize_text(user_input, lang_code)
95
  st.write(f"### Summarized Text ({lang_code}):")
96
  st.write(summary)
97
 
98
- translation = translate_to_english(summary, lang_code)
 
99
  st.write("### Translated Text (English):")
100
  st.write(translation)
101
 
 
1
  from transformers import MBart50Tokenizer, AutoModelForSeq2SeqLM, pipeline
2
  from langdetect import detect
3
+ import re
4
 
5
  def load_models():
6
  tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50")
 
27
  return lang_code
28
 
29
 
30
+ def translate_to_english(text):
31
+ # Always translate to English (en_XX)
32
+ mbart_lang_code = "en_XX"
33
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # Encode the input text for translation
35
  inputs = tokenizer(
36
  f"<{mbart_lang_code}>{text}",
 
51
  translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
52
 
53
  # Remove any special language code tokens like "<en_XX>"
54
+ translated_text = re.sub(r"<[^>]+>", "", translated_text).strip()
55
 
56
  return translated_text
57
 
58
def summarize_text(text, lang_code):
    """Summarize ``text`` using the module-level tokenizer and summarizer.

    Args:
        text: The text to summarize.
        lang_code: Language code looked up in ``LANGUAGE_CODES`` to obtain
            the mBART language tag; falls back to ``"en_XX"`` when the code
            is not present.

    Returns:
        The decoded summary with any ``<...>`` tag fragments removed and
        surrounding whitespace stripped.
    """
    # Local import keeps this function self-contained regardless of the
    # state of the file's top-level import block.
    import re

    mbart_lang_code = LANGUAGE_CODES.get(lang_code, "en_XX")

    # The language tag is prepended to the input as plain text.
    inputs = tokenizer(
        f"<{mbart_lang_code}>{text}",
        return_tensors="pt",
        max_length=1024,
        truncation=True,
    )
    summary_ids = summarizer.model.generate(
        inputs["input_ids"],
        max_length=100,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # The previous cleanup only removed the literal "<>", so a tag such as
    # "<en_XX>" echoed into the output was left in place. Strip any
    # angle-bracket tag (including the empty "<>"), matching the cleanup
    # done in translate_to_english.
    return re.sub(r"<[^>]*>", "", summary).strip()
76
 
77
  st.title("Multilingual Summarization and Translation App")
78
  st.markdown("""This app detects the language of the input text, summarizes it in the same language, and translates it into English.""")
 
89
  st.warning(f"The detected language ({lang_code}) is not supported by the model.")
90
  else:
91
  try:
92
+ # First summarize the text
93
  summary = summarize_text(user_input, lang_code)
94
  st.write(f"### Summarized Text ({lang_code}):")
95
  st.write(summary)
96
 
97
+ # Then translate the summary to English
98
+ translation = translate_to_english(summary)
99
  st.write("### Translated Text (English):")
100
  st.write(translation)
101