ramysaidagieb committed on
Commit
41804af
·
verified ·
1 Parent(s): 07a57f2

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -246
app.py DELETED
@@ -1,246 +0,0 @@
1
- import gradio as gr
2
- import os
3
- import docx
4
- import fitz # PyMuPDF
5
- from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, pipeline
6
- from arabert.preprocess import ArabertPreprocessor
7
- from datasets import Dataset
8
- import uuid
9
- import re
10
- import logging
11
- from datetime import datetime
12
- import gradio.routes
13
-
14
# --- Module-level setup -----------------------------------------------------
# Logging configuration for the whole app.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Arabic NLP stack: AraBERT v2 preprocessor, tokenizer, and QA model.
model_name = "aubmindlab/bert-base-arabertv2"
arabert_preprocessor = ArabertPreprocessor(model_name=model_name, keep_emojis=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Directory where the fine-tuned model and tokenizer are persisted.
MODEL_SAVE_PATH = "./fine_tuned_model"
26
-
27
- # Function to extract text from .docx
28
def extract_text_docx(file_path):
    """Return the text of a .docx file: non-empty paragraphs joined by newlines.

    Returns an empty string (and logs the error) on any extraction failure.
    """
    logger.info(f"{datetime.now()}: Extracting text from .docx file: {file_path}")
    try:
        document = docx.Document(file_path)
        paragraphs = [para.text for para in document.paragraphs if para.text.strip()]
        text = "\n".join(paragraphs)
    except Exception as e:
        logger.error(f"{datetime.now()}: Error extracting text from .docx: {e}")
        return ""
    logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .docx")
    return text
38
-
39
- # Function to extract text from .pdf
40
def extract_text_pdf(file_path):
    """Return the concatenated text of every page of a .pdf file (via PyMuPDF).

    Returns an empty string (and logs the error) on any extraction failure.
    """
    logger.info(f"{datetime.now()}: Extracting text from .pdf file: {file_path}")
    try:
        document = fitz.open(file_path)
        text = "".join(page.get_text() for page in document)
    except Exception as e:
        logger.error(f"{datetime.now()}: Error extracting text from .pdf: {e}")
        return ""
    logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .pdf")
    return text
52
-
53
- # Function to clean and preprocess text
54
def preprocess_text(text):
    """Collapse whitespace and normalize Arabic text with the AraBERT preprocessor."""
    logger.info(f"{datetime.now()}: Preprocessing text (length: {len(text)} characters)")
    collapsed = re.sub(r'\s+', ' ', text)  # collapse runs of whitespace into single spaces
    normalized = arabert_preprocessor.preprocess(collapsed)  # AraBERT normalization
    logger.info(f"{datetime.now()}: Text preprocessed, new length: {len(normalized)} characters")
    return normalized.strip()
60
-
61
- # Function to chunk text for dataset
62
def chunk_text(text, max_length=512):
    """Split whitespace-delimited text into chunks of roughly max_length characters.

    A chunk is flushed as soon as its accumulated size (each word plus one
    separator character) reaches max_length, so a chunk may slightly exceed it.
    A non-empty trailing remainder becomes the final chunk.
    """
    logger.info(f"{datetime.now()}: Chunking text into segments")
    chunks = []
    buffer = []
    size = 0
    for token in text.split():
        buffer.append(token)
        size += len(token) + 1  # +1 accounts for the joining space
        if size >= max_length:
            chunks.append(" ".join(buffer))
            buffer = []
            size = 0
    if buffer:
        chunks.append(" ".join(buffer))
    logger.info(f"{datetime.now()}: Created {len(chunks)} text chunks")
    return chunks
79
-
80
- # Function to prepare dataset
81
def prepare_dataset(text):
    """Wrap the chunked text in a Hugging Face Dataset with a single "text" column."""
    logger.info(f"{datetime.now()}: Preparing dataset")
    dataset = Dataset.from_dict({"text": chunk_text(text)})
    logger.info(f"{datetime.now()}: Dataset prepared with {len(dataset)} examples")
    return dataset
88
-
89
- # Function to tokenize dataset
90
def tokenize_dataset(dataset):
    """Tokenize every "text" example to fixed-length (padded/truncated to 512) inputs."""
    logger.info(f"{datetime.now()}: Tokenizing dataset")

    def _tokenize(examples):
        # Batched map: `examples["text"]` is a list of chunk strings.
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    tokenized_dataset = dataset.map(_tokenize, batched=True)
    logger.info(f"{datetime.now()}: Dataset tokenized")
    return tokenized_dataset
97
-
98
- # Function to fine-tune model
99
def fine_tune_model(dataset):
    """Fine-tune the global QA model on `dataset`, then save model and tokenizer.

    Artifacts are written to MODEL_SAVE_PATH; training checkpoints go to ./results.
    NOTE(review): the tokenized dataset carries no start/end answer-position
    labels, which a QA head normally needs to compute a loss — confirm Trainer
    behaves as intended here.
    """
    logger.info(f"{datetime.now()}: Starting model fine-tuning")
    args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        logging_steps=200,
    )
    Trainer(model=model, args=args, train_dataset=dataset).train()
    model.save_pretrained(MODEL_SAVE_PATH)
    tokenizer.save_pretrained(MODEL_SAVE_PATH)
    logger.info(f"{datetime.now()}: Model fine-tuned and saved to {MODEL_SAVE_PATH}")
121
-
122
- # Function to handle file upload and training
123
def upload_and_train(files, progress=gr.Progress()):
    """Extract text from uploaded books, fine-tune the QA model, and report progress.

    Returns a (status/log message, list of uploaded file names) tuple matching
    the two Gradio output components.
    """
    uploaded_files = []
    all_text = ""
    training_log = []

    def log_and_update(step, desc, progress_value):
        # Append to the on-screen log and advance the progress bar.
        # NOTE(review): `step` is accepted but never used in the message.
        msg = f"{datetime.now()}: {desc}"
        logger.info(msg)
        training_log.append(msg)
        progress(progress_value, desc=desc)
        return "\n".join(training_log)

    log_and_update("Starting upload", "Loading books...", 0.1)
    for file in files:
        file_name = os.path.basename(file.name)
        uploaded_files.append(file_name)
        if file_name.endswith(".docx"):
            text = extract_text_docx(file.name)
        elif file_name.endswith(".pdf"):
            text = extract_text_pdf(file.name)
        else:
            # Unsupported extension: listed as uploaded but contributes no text.
            continue
        all_text += text + "\n"

    # Bail out early when nothing usable was extracted.
    if not all_text.strip():
        msg = f"{datetime.now()}: No valid text extracted from uploaded files."
        logger.error(msg)
        training_log.append(msg)
        return "\n".join(training_log), uploaded_files

    log_and_update("Text extraction complete", "Extracting ideas...", 0.4)
    cleaned_text = preprocess_text(all_text)

    log_and_update("Preprocessing complete", "Preparing dataset...", 0.6)
    dataset = prepare_dataset(cleaned_text)
    tokenized_dataset = tokenize_dataset(dataset)

    log_and_update("Dataset preparation complete", "Training in progress...", 0.8)
    fine_tune_model(tokenized_dataset)

    log_and_update("Training complete", "Training completed!", 1.0)

    # Sanity-check the freshly saved model with a single example question.
    qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
    example_question = "ما هو قانون الإيمان وفقًا للكتاب؟"
    example_answer = qa_pipeline(question=example_question, context=cleaned_text[:512])["answer"]

    final_message = (
        f"Training process finished: Enter your question\n\n"
        f"**مثال لسؤال**: {example_question}\n"
        f"**الإجابة**: {example_answer}\n\n"
        f"**سجل التدريب**:\n" + "\n".join(training_log)
    )
    return final_message, uploaded_files
177
-
178
- # Function to answer questions
179
def answer_question(question, context):
    """Answer an Arabic question against `context` using the fine-tuned model.

    Returns an Arabic "not trained yet" message if no fine-tuned model exists.
    """
    if not os.path.exists(MODEL_SAVE_PATH):
        return "النظام لم يتم تدريبه بعد. الرجاء رفع الكتب وتدريب النظام أولاً."

    qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
    # Only the first 512 characters of the context are passed to the model.
    return qa_pipeline(question=question, context=context[:512])["answer"]
186
-
187
- # Main Gradio Interface (for training and QA)
188
# Main interface: book upload + training on the left, Q&A below.
with gr.Blocks() as main_demo:
    gr.Markdown("# نظام ذكاء اصطناعي لتحليل الكتب باللغة العربية")

    with gr.Row():
        with gr.Column():
            file_upload = gr.File(file_types=[".docx", ".pdf"], label="رفع الكتب", file_count="multiple")
            upload_button = gr.Button("رفع وتدريب")
            uploaded_files = gr.Textbox(label="الكتب المرفوعة")

        with gr.Column():
            training_status = gr.Textbox(label="حالة التدريب", lines=10)

    with gr.Row():
        question_input = gr.Textbox(label="أدخل سؤالك بالعربية", placeholder="مثال: ما هو قانون الإيمان؟")
        answer_output = gr.Textbox(label="الإجابة")
        ask_button = gr.Button("طرح السؤال")

    # Wire the buttons to their handlers.
    upload_button.click(
        fn=upload_and_train,
        inputs=[file_upload],
        outputs=[training_status, uploaded_files]
    )

    # NOTE(review): the context passed to answer_question is an empty
    # gr.State(""), so answers are produced from an empty context — confirm
    # this is intended rather than the trained books' text.
    ask_button.click(
        fn=answer_question,
        inputs=[question_input, gr.State(value="")],
        outputs=[answer_output]
    )
217
-
218
- # Question-Answering Only Interface
219
# Secondary interface: question answering only (mounted at /answer below).
with gr.Blocks() as answer_demo:
    gr.Markdown("# طرح الأسئلة على نظام تحليل الكتب باللغة العربية")
    gr.Markdown("أدخل سؤالك بالعربية وسيتم الإجابة بناءً على محتوى الكتب المدربة.")

    question_input = gr.Textbox(label="أدخل سؤالك", placeholder="مثال: ما هو قانون الإيمان؟")
    answer_output = gr.Textbox(label="الإجابة")
    ask_button = gr.Button("طرح السؤال")

    # Same handler as the main interface; context is an empty gr.State("").
    ask_button.click(
        fn=answer_question,
        inputs=[question_input, gr.State(value="")],
        outputs=[answer_output]
    )
232
-
233
- # Combine both interfaces with routes
234
# Mount both Blocks apps onto one ASGI app: main UI at "/" and Q&A at "/answer".
# NOTE(review): gradio.routes.App.create_app() is normally called with a Blocks
# instance; a zero-argument call may fail depending on the pinned Gradio
# version — verify against the deployed gradio release.
app = gr.mount_gradio_app(
    gradio.routes.App.create_app(),
    main_demo,
    path="/",
)
app = gr.mount_gradio_app(
    app,
    answer_demo,
    path="/answer",
)
244
-
245
if __name__ == "__main__":
    # Direct execution launches only the main interface; the mounted `app`
    # above is used when serving through an external ASGI server instead.
    main_demo.launch()