pratik-19 committed on
Commit
ed34fa1
·
1 Parent(s): 40e2d84

minor changes

Browse files
Files changed (1) hide show
  1. app.py +15 -13
app.py CHANGED
@@ -27,34 +27,35 @@ def detect_language(text):
27
  return lang_code
28
 
29
 
30
- def translate_to_english(text):
31
- # Always translate to English (en_XX)
32
- mbart_lang_code = "en_XX"
33
 
34
- # Encode the input text for translation
35
  inputs = tokenizer(
36
- f"<{mbart_lang_code}>{text}",
37
  return_tensors="pt",
38
  max_length=1024,
39
  truncation=True
40
  )
41
-
42
- # Perform the translation
43
- translated_ids = translator.model.generate(
44
  inputs["input_ids"],
45
  max_length=100,
46
  length_penalty=2.0,
47
- num_beams=4
 
 
48
  )
49
 
50
  # Decode the translated text
51
  translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
52
-
53
- # Remove any special language code tokens like "<en_XX>"
54
  translated_text = re.sub(r"<[^>]+>", "", translated_text).strip()
55
 
56
  return translated_text
57
 
 
58
  def summarize_text(text, lang_code):
59
  mbart_lang_code = LANGUAGE_CODES.get(lang_code, "en_XX") # Default to English if unsupported
60
  inputs = tokenizer(
@@ -71,9 +72,10 @@ def summarize_text(text, lang_code):
71
  num_beams=4
72
  )
73
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
74
- summary = summary.replace(f"<>", "").strip()
75
  return summary
76
 
 
77
  st.title("Multilingual Summarization and Translation App")
78
  st.markdown("""This app detects the language of the input text, summarizes it in the same language, and translates it into English.""")
79
 
@@ -95,7 +97,7 @@ if st.button("Process Text"):
95
  st.write(summary)
96
 
97
  # Then translate the summary to English
98
- translation = translate_to_english(summary)
99
  st.write("### Translated Text (English):")
100
  st.write(translation)
101
 
 
27
  return lang_code
28
 
29
 
30
+ def translate_to_english(text, src_lang):
31
+ # Define the target language as English
32
+ tgt_lang = "en_XX"
33
 
34
+ # Tokenize the input text with the appropriate source and target language tokens
35
  inputs = tokenizer(
36
+ text,
37
  return_tensors="pt",
38
  max_length=1024,
39
  truncation=True
40
  )
41
+
42
+ # Specify the source language and target language in the generation call
43
+ translated_ids = model.generate(
44
  inputs["input_ids"],
45
  max_length=100,
46
  length_penalty=2.0,
47
+ num_beams=4,
48
+ decoder_start_token_id=tokenizer.lang_code_to_id[tgt_lang], # Explicitly set the target language
49
+ forced_bos_token_id=tokenizer.lang_code_to_id[src_lang] # Set the source language
50
  )
51
 
52
  # Decode the translated text
53
  translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
 
 
54
  translated_text = re.sub(r"<[^>]+>", "", translated_text).strip()
55
 
56
  return translated_text
57
 
58
+
59
  def summarize_text(text, lang_code):
60
  mbart_lang_code = LANGUAGE_CODES.get(lang_code, "en_XX") # Default to English if unsupported
61
  inputs = tokenizer(
 
72
  num_beams=4
73
  )
74
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
75
+ summary = re.sub(r"<[^>]+>", "", summary).strip()
76
  return summary
77
 
78
+
79
  st.title("Multilingual Summarization and Translation App")
80
  st.markdown("""This app detects the language of the input text, summarizes it in the same language, and translates it into English.""")
81
 
 
97
  st.write(summary)
98
 
99
  # Then translate the summary to English
100
+ translation = translate_to_english(summary, LANGUAGE_CODES.get(lang_code, "en_XX"))
101
  st.write("### Translated Text (English):")
102
  st.write(translation)
103