Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,64 @@
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
from transformers import MarianTokenizer, MarianMTModel
|
3 |
+
from ar_correct import ar_correct
|
4 |
+
import mishkal.tashkeel
|
5 |
+
from transformers import BertTokenizer, AutoModelForSeq2SeqLM, pipeline
|
6 |
+
from arabert.preprocess import ArabertPreprocessor
|
# --- Module-level model initialization (runs once at import time) ---
# Each from_pretrained() call downloads/loads model weights, so app startup
# cost is paid here rather than per request.

# Mishkal diacritizer: adds tashkeel (vowel marks) to Arabic text.
vocalizer = mishkal.tashkeel.TashkeelClass()

# Marian seq2seq model for translation (marefa-nlp English<->Arabic checkpoint).
mname = "marefa-nlp/marefa-mt-en-ar"
tokenizer = MarianTokenizer.from_pretrained(mname)
model = MarianMTModel.from_pretrained(mname)

# mBERT2mBERT encoder-decoder for Arabic text summarization.
model_name = "malmarjeh/mbert2mbert-arabic-text-summarization"
# NOTE(review): model_name="" gives the AraBERT preprocessor no model hint;
# the summarizer's model card uses the same empty-string call — verify intentional.
preprocessor = ArabertPreprocessor(model_name="")
tokenizer_summarization = BertTokenizer.from_pretrained(model_name)
model_summarization = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipeline_summarization = pipeline("text2text-generation", model=model_summarization, tokenizer=tokenizer_summarization)
def main():
    """Streamlit demo page for Arabic NLP tasks.

    Renders a single text area and, when it is non-empty, runs four
    independent sections against the module-level models:
    diacritization (Mishkal), translation (Marian), spelling
    correction (ar_correct), and summarization (mBERT2mBERT).
    """
    st.title("U3reb Demo")

    # Text Input
    input_text = st.text_area("Enter Arabic Text:")

    # Diacritization — labelled "Tokenization" in the UI.
    st.subheader("Tokenization (Mishkal)")
    if input_text:
        text_mishkal = vocalizer.tashkeel(input_text)
        st.write("Tokenized Text (with diacritics):", text_mishkal)

    # Translation
    st.subheader("Translation")
    if input_text:
        # FIX: tokenizer.prepare_seq2seq_batch() was deprecated and has been
        # removed from recent transformers releases (a likely cause of the
        # Space's build/runtime error). Calling the tokenizer directly is the
        # documented replacement and yields the same input_ids/attention_mask.
        batch = tokenizer([input_text], return_tensors="pt", padding=True)
        translated_tokens = model.generate(**batch)
        translated_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens]
        st.write("Translated Text:", translated_text)

    # Arabic Text Correction
    st.subheader("Arabic Text Correction (ar_correct)")
    if input_text:
        corrected_text = ar_correct(input_text)
        st.write("Corrected Text:", corrected_text)

    # Text Summarization
    st.subheader("Text Summarization")
    if input_text:
        # AraBERT preprocessing normalizes the text the way the summarizer
        # was trained on.
        preprocessed_text = preprocessor.preprocess(input_text)
        # NOTE(review): BertTokenizer defines no EOS token, so
        # tokenizer_summarization.eos_token_id is None here; generate() then
        # falls back to its default padding id. Kept as-is to match the
        # model card — confirm against the summarizer's documentation.
        result = pipeline_summarization(
            preprocessed_text,
            pad_token_id=tokenizer_summarization.eos_token_id,
            num_beams=3,
            repetition_penalty=3.0,
            max_length=200,
            length_penalty=1.0,
            no_repeat_ngram_size=3,
        )[0]['generated_text']
        st.write("Summarized Text:", result)
# Script entry point: launch the Streamlit page when run directly
# (e.g. `streamlit run app.py`), but not when imported as a module.
if __name__ == "__main__":
    main()