moThecarpenter80 committed on
Commit
d075e6c
·
verified ·
1 Parent(s): 94f9b65

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -2
app.py CHANGED
@@ -1,4 +1,64 @@
1
  import streamlit as st
 
 
 
 
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from transformers import MarianTokenizer, MarianMTModel
3
+ from ar_correct import ar_correct
4
+ import mishkal.tashkeel
5
+ from transformers import BertTokenizer, AutoModelForSeq2SeqLM, pipeline
6
+ from arabert.preprocess import ArabertPreprocessor
7
 
8
+ # Initialize Mishkal vocalizer
9
+ vocalizer = mishkal.tashkeel.TashkeelClass()
10
+
11
+ # Initialize Marian tokenizer and model for translation
12
+ mname = "marefa-nlp/marefa-mt-en-ar"
13
+ tokenizer = MarianTokenizer.from_pretrained(mname)
14
+ model = MarianMTModel.from_pretrained(mname)
15
+
16
+ # Initialize BERT tokenizer and model for summarization
17
+ model_name = "malmarjeh/mbert2mbert-arabic-text-summarization"
18
+ preprocessor = ArabertPreprocessor(model_name="")
19
+ tokenizer_summarization = BertTokenizer.from_pretrained(model_name)
20
+ model_summarization = AutoModelForSeq2SeqLM.from_pretrained(model_name)
21
+ pipeline_summarization = pipeline("text2text-generation", model=model_summarization, tokenizer=tokenizer_summarization)
22
+
23
+
24
+ def main():
25
+ st.title("U3reb Demo")
26
+
27
+ # Text Input
28
+ input_text = st.text_area("Enter Arabic Text:")
29
+
30
+ # Tokenization
31
+ st.subheader("Tokenization (Mishkal)")
32
+ if input_text:
33
+ text_mishkal = vocalizer.tashkeel(input_text)
34
+ st.write("Tokenized Text (with diacritics):", text_mishkal)
35
+
36
+ # Translation
37
+ st.subheader("Translation")
38
+ if input_text:
39
+ translated_tokens = model.generate(**tokenizer.prepare_seq2seq_batch([input_text], return_tensors="pt"))
40
+ translated_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokens]
41
+ st.write("Translated Text:", translated_text)
42
+
43
+ # Arabic Text Correction
44
+ st.subheader("Arabic Text Correction (ar_correct)")
45
+ if input_text:
46
+ corrected_text = ar_correct(input_text)
47
+ st.write("Corrected Text:", corrected_text)
48
+
49
+ # Text Summarization
50
+ st.subheader("Text Summarization")
51
+ if input_text:
52
+ preprocessed_text = preprocessor.preprocess(input_text)
53
+ result = pipeline_summarization(preprocessed_text,
54
+ pad_token_id=tokenizer_summarization.eos_token_id,
55
+ num_beams=3,
56
+ repetition_penalty=3.0,
57
+ max_length=200,
58
+ length_penalty=1.0,
59
+ no_repeat_ngram_size=3)[0]['generated_text']
60
+ st.write("Summarized Text:", result)
61
+
62
+
63
+ if __name__ == "__main__":
64
+ main()