Rajut committed on
Commit
bb6ed43
·
verified ·
1 Parent(s): 2076542

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -0
app.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import tempfile
from multiprocessing import Pool, cpu_count

import fitz
import streamlit as st
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration, pipeline

# Load summarization pipeline.
# NOTE(review): model weights are downloaded/loaded at import time, so the
# first run of this script blocks until both checkpoints are available.
summarizer = pipeline("summarization", model="Falconsai/text_summarization")

# Load translation model and tokenizer.
# mBART-50 "one-to-many" translates FROM English into 50 target languages;
# src_lang="en_XX" fixes the source side accordingly.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")

# Define max chunk length (in characters) for text fed to the models.
max_chunk_length = 1024
# Function to chunk text
def chunk_text(text, max_chunk_length):
    """Split *text* into chunks of roughly ``max_chunk_length`` characters.

    The text is split on "." and sentences are greedily packed into chunks,
    joined by single spaces.  Period characters are dropped (original
    contract, preserved here).

    Args:
        text: Input text to split.
        max_chunk_length: Target maximum characters per chunk.  A single
            sentence longer than this limit still becomes its own
            (over-length) chunk rather than being truncated.

    Returns:
        list[str]: Non-empty chunks in original order.  Empty input yields
        an empty list.
    """
    chunks = []
    current_chunk = ""
    for raw in text.split("."):
        sentence = raw.strip()
        # Bug fix: skip empty fragments (trailing periods, "..") which the
        # original folded in as stray spaces.
        if not sentence:
            continue
        # +1 accounts for the joining space.
        if len(current_chunk) + len(sentence) + 1 <= max_chunk_length:
            if current_chunk:
                current_chunk += " "
            current_chunk += sentence
        else:
            # Bug fix: the original appended an empty string chunk when the
            # very first sentence already exceeded the limit.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
32
+
# Summarize one chunk, then translate the summary.
def summarize_and_translate_chunk(chunk, lang):
    """Condense *chunk* with the summarizer, then translate it to *lang*."""
    # Summarization first: the pipeline returns a list of dicts; take the
    # single summary string.
    condensed = summarizer(chunk, max_length=150, min_length=30, do_sample=False)
    summary_text = condensed[0]['summary_text']
    # Hand the condensed text to the translation stage and return its output.
    return translate_summary(summary_text, lang)
41
+
# Translate a summary string into the requested target language.
def translate_summary(summary, lang):
    """Translate *summary* into the mBART-50 language code *lang*.

    Summaries longer than ``max_chunk_length`` are re-chunked so each piece
    fits the model input window; translated pieces are joined with spaces.
    """
    # Re-chunk only when needed; otherwise translate the summary whole.
    pieces = chunk_text(summary, max_chunk_length) if len(summary) > max_chunk_length else [summary]

    translations = []
    for piece in pieces:
        encoded = tokenizer(piece, return_tensors="pt", padding=True, truncation=True)
        # forced_bos_token_id steers mBART toward the target language.
        generated = model.generate(
            **encoded,
            forced_bos_token_id=tokenizer.lang_code_to_id[lang],
            max_length=1024,
            num_beams=4,
            early_stopping=True,
            length_penalty=2.0,
        )
        decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
        translations.append(decoded[0])

    return " ".join(translations)
65
+
66
+
67
+
# Function to read PDF and summarize and translate chunk by chunk
def summarize_and_translate_pdf(uploaded_file, lang):
    """Read an uploaded PDF, then summarize and translate it chunk by chunk.

    Args:
        uploaded_file: Streamlit UploadedFile holding the PDF bytes.
        lang: mBART-50 target language code (e.g. "fr_XX").

    Returns:
        list[str]: One translated summary per text chunk; empty list when
        the PDF could not be opened.
    """
    # Save uploaded PDF to a temporary file so PyMuPDF can open it by path.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.read())
        temp_file_path = temp_file.name

    try:
        try:
            doc = fitz.open(temp_file_path)
        except FileNotFoundError:
            st.error("File not found. Please make sure the file path is correct.")
            return []

        try:
            # Hard-split every page's text into model-sized windows.
            chunks = []
            for page_index in range(len(doc)):
                text = doc.load_page(page_index).get_text()
                chunks.extend(
                    text[offset:offset + max_chunk_length]
                    for offset in range(0, len(text), max_chunk_length)
                )
        finally:
            doc.close()  # bug fix: the original never closed the document

        # Use multiprocessing to parallelize summarize+translate per chunk.
        with Pool(cpu_count()) as pool:
            return pool.starmap(
                summarize_and_translate_chunk,
                [(chunk, lang) for chunk in chunks],
            )
    finally:
        # Bug fix: the original leaked the temp file on the early-return
        # error path, and referenced `os` without importing it.
        os.unlink(temp_file_path)
97
+
98
+
# ---------------- Streamlit UI ----------------
st.title("PDF Summarization and Translation")

# PDF upload widget; everything below only renders once a file is present.
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file:
    # Echo the uploaded file name back to the user.
    st.write("Uploaded PDF file:", uploaded_file.name)

    # Human-readable names mapped to mBART-50 target language codes.
    languages = {
        "Arabic": "ar_AR", "Czech": "cs_CZ", "German": "de_DE", "English": "en_XX",
        "Spanish": "es_XX", "Estonian": "et_EE", "Finnish": "fi_FI", "French": "fr_XX",
        "Gujarati": "gu_IN", "Hindi": "hi_IN", "Italian": "it_IT", "Japanese": "ja_XX",
        "Kazakh": "kk_KZ", "Korean": "ko_KR", "Lithuanian": "lt_LT", "Latvian": "lv_LV",
        "Burmese": "my_MM", "Nepali": "ne_NP", "Dutch": "nl_XX", "Romanian": "ro_RO",
        "Russian": "ru_RU", "Sinhala": "si_LK", "Turkish": "tr_TR", "Vietnamese": "vi_VN",
        "Chinese": "zh_CN", "Afrikaans": "af_ZA", "Azerbaijani": "az_AZ", "Bengali": "bn_IN",
        "Persian": "fa_IR", "Hebrew": "he_IL", "Croatian": "hr_HR", "Indonesian": "id_ID",
        "Georgian": "ka_GE", "Khmer": "km_KH", "Macedonian": "mk_MK", "Malayalam": "ml_IN",
        "Mongolian": "mn_MN", "Marathi": "mr_IN", "Polish": "pl_PL", "Pashto": "ps_AF",
        "Portuguese": "pt_XX", "Swedish": "sv_SE", "Swahili": "sw_KE", "Tamil": "ta_IN",
        "Telugu": "te_IN", "Thai": "th_TH", "Tagalog": "tl_XX", "Ukrainian": "uk_UA",
        "Urdu": "ur_PK", "Xhosa": "xh_ZA", "Galician": "gl_ES", "Slovene": "sl_SI"
    }

    # Let the user pick a target language by its display name.
    lang = st.selectbox("Select language for translation", list(languages.keys()))

    # Kick off the pipeline on button press and render the results.
    if st.button("Summarize and Translate"):
        translated_chunks = summarize_and_translate_pdf(uploaded_file, languages[lang])

        st.header("Translated Summary")
        for chunk in translated_chunks:
            st.write(chunk)