ramysaidagieb committed on
Commit 3582f77 · verified · 1 Parent(s): 3eea176

Delete app.py

Files changed (1)
  1. app.py +0 -251
app.py DELETED
@@ -1,251 +0,0 @@
- import gradio as gr
- import os
- import docx
- import fitz  # PyMuPDF
- from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, pipeline
- from datasets import Dataset
- import re
- import logging
- from datetime import datetime
- import gradio.routes
-
- # Setup logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Initialize tokenizer
- model_name = "aubmindlab/bert-base-arabertv2"
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForQuestionAnswering.from_pretrained(model_name)
-
- # Directory to save fine-tuned model
- MODEL_SAVE_PATH = "./fine_tuned_model"
-
- # Custom Arabic text preprocessing function
- def preprocess_arabic_text(text):
-     logger.info(f"{datetime.now()}: Preprocessing text (length: {len(text)} characters)")
-     # Remove Arabic diacritics
-     diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
-     text = diacritics.sub('', text)
-     # Normalize Arabic characters
-     text = re.sub(r'[أإآ]', 'ا', text)
-     text = re.sub(r'ى', 'ي', text)
-     text = re.sub(r'ة', 'ه', text)
-     # Remove extra spaces and non-essential characters
-     text = re.sub(r'\s+', ' ', text)
-     text = re.sub(r'[^\w\s]', '', text)
-     logger.info(f"{datetime.now()}: Text preprocessed, new length: {len(text)} characters")
-     return text.strip()
-
- # Function to extract text from .docx
- def extract_text_docx(file_path):
-     logger.info(f"{datetime.now()}: Extracting text from .docx file: {file_path}")
-     try:
-         doc = docx.Document(file_path)
-         text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-         logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .docx")
-         return text
-     except Exception as e:
-         logger.error(f"{datetime.now()}: Error extracting text from .docx: {e}")
-         return ""
-
- # Function to extract text from .pdf
- def extract_text_pdf(file_path):
-     logger.info(f"{datetime.now()}: Extracting text from .pdf file: {file_path}")
-     try:
-         doc = fitz.open(file_path)
-         text = ""
-         for page in doc:
-             text += page.get_text()
-         logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .pdf")
-         return text
-     except Exception as e:
-         logger.error(f"{datetime.now()}: Error extracting text from .pdf: {e}")
-         return ""
-
- # Function to chunk text for dataset
- def chunk_text(text, max_length=512):
-     logger.info(f"{datetime.now()}: Chunking text into segments")
-     words = text.split()
-     chunks = []
-     current_chunk = []
-     current_length = 0
-     for word in words:
-         current_chunk.append(word)
-         current_length += len(word) + 1
-         if current_length >= max_length:
-             chunks.append(" ".join(current_chunk))
-             current_chunk = []
-             current_length = 0
-     if current_chunk:
-         chunks.append(" ".join(current_chunk))
-     logger.info(f"{datetime.now()}: Created {len(chunks)} text chunks")
-     return chunks
-
- # Function to prepare dataset
- def prepare_dataset(text):
-     logger.info(f"{datetime.now()}: Preparing dataset")
-     chunks = chunk_text(text)
-     data = {"text": chunks}
-     dataset = Dataset.from_dict(data)
-     logger.info(f"{datetime.now()}: Dataset prepared with {len(dataset)} examples")
-     return dataset
-
- # Function to tokenize dataset
- def tokenize_dataset(dataset):
-     logger.info(f"{datetime.now()}: Tokenizing dataset")
-     def tokenize_function(examples):
-         return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
-     tokenized_dataset = dataset.map(tokenize_function, batched=True)
-     logger.info(f"{datetime.now()}: Dataset tokenized")
-     return tokenized_dataset
-
- # Function to fine-tune model
- def fine_tune_model(dataset):
-     logger.info(f"{datetime.now()}: Starting model fine-tuning")
-     training_args = TrainingArguments(
-         output_dir="./results",
-         num_train_epochs=1,
-         per_device_train_batch_size=4,
-         save_steps=10_000,
-         save_total_limit=2,
-         logging_dir='./logs',
-         logging_steps=200,
-     )
-
-     trainer = Trainer(
-         model=model,
-         args=training_args,
-         train_dataset=dataset,
-     )
-
-     trainer.train()
-     model.save_pretrained(MODEL_SAVE_PATH)
-     tokenizer.save_pretrained(MODEL_SAVE_PATH)
-     logger.info(f"{datetime.now()}: Model fine-tuned and saved to {MODEL_SAVE_PATH}")
-
- # Function to handle file upload and training
- def upload_and_train(files, progress=gr.Progress()):
-     uploaded_files = []
-     all_text = ""
-     training_log = []
-
-     def log_and_update(step, desc, progress_value):
-         msg = f"{datetime.now()}: {desc}"
-         logger.info(msg)
-         training_log.append(msg)
-         progress(progress_value, desc=desc)
-         return "\n".join(training_log)
-
-     log_and_update("Starting upload", "Loading books...", 0.1)
-     for file in files:
-         file_name = os.path.basename(file.name)
-         uploaded_files.append(file_name)
-         if file_name.endswith(".docx"):
-             text = extract_text_docx(file.name)
-         elif file_name.endswith(".pdf"):
-             text = extract_text_pdf(file.name)
-         else:
-             continue
-         all_text += text + "\n"
-
-     if not all_text.strip():
-         msg = f"{datetime.now()}: No valid text extracted from uploaded files."
-         logger.error(msg)
-         training_log.append(msg)
-         return "\n".join(training_log), uploaded_files
-
-     log_and_update("Text extraction complete", "Extracting ideas...", 0.4)
-     cleaned_text = preprocess_arabic_text(all_text)
-
-     log_and_update("Preprocessing complete", "Preparing dataset...", 0.6)
-     dataset = prepare_dataset(cleaned_text)
-     tokenized_dataset = tokenize_dataset(dataset)
-
-     log_and_update("Dataset preparation complete", "Training in progress...", 0.8)
-     fine_tune_model(tokenized_dataset)
-
-     log_and_update("Training complete", "Training completed!", 1.0)
-
-     # Example QA
-     qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
-     example_question = "ما هو قانون الإيمان وفقًا للكتاب؟"
-     example_answer = qa_pipeline(question=example_question, context=cleaned_text[:512])["answer"]
-
-     final_message = (
-         f"Training process finished: Enter your question\n\n"
-         f"**مثال لسؤال**: {example_question}\n"
-         f"**الإجابة**: {example_answer}\n\n"
-         f"**سجل التدريب**:\n" + "\n".join(training_log)
-     )
-     return final_message, uploaded_files
-
- # Function to answer questions
- def answer_question(question, context):
-     if not os.path.exists(MODEL_SAVE_PATH):
-         return "النظام لم يتم تدريبه بعد. الرجاء رفع الكتب وتدريب النظام أولاً."
-
-     qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
-     answer = qa_pipeline(question=question, context=context[:512])["answer"]
-     return answer
-
- # Main Gradio Interface (for training and QA)
- with gr.Blocks() as main_demo:
-     gr.Markdown("# نظام ذكاء اصطناعي لتحليل الكتب باللغة العربية")
-
-     with gr.Row():
-         with gr.Column():
-             file_upload = gr.File(file_types=[".docx", ".pdf"], label="رفع الكتب", file_count="multiple")
-             upload_button = gr.Button("رفع وتدريب")
-             uploaded_files = gr.Textbox(label="الكتب المرفوعة")
-
-         with gr.Column():
-             training_status = gr.Textbox(label="حالة التدريب", lines=10)
-
-     with gr.Row():
-         question_input = gr.Textbox(label="أدخل سؤالك بالعربية", placeholder="مثال: ما هو قانون الإيمان؟")
-         answer_output = gr.Textbox(label="الإجابة")
-         ask_button = gr.Button("طرح السؤال")
-
-     # Event handlers
-     upload_button.click(
-         fn=upload_and_train,
-         inputs=[file_upload],
-         outputs=[training_status, uploaded_files]
-     )
-
-     ask_button.click(
-         fn=answer_question,
-         inputs=[question_input, gr.State(value="")],
-         outputs=[answer_output]
-     )
-
- # Question-Answering Only Interface
- with gr.Blocks() as answer_demo:
-     gr.Markdown("# طرح الأسئلة على نظام تحليل الكتب باللغة العربية")
-     gr.Markdown("أدخل سؤالك بالعربية وسيتم الإجابة بناءً على محتوى الكتب المدربة.")
-
-     question_input = gr.Textbox(label="أدخل سؤالك", placeholder="مثال: ما هو قانون الإيمان؟")
-     answer_output = gr.Textbox(label="الإجابة")
-     ask_button = gr.Button("طرح السؤال")
-
-     ask_button.click(
-         fn=answer_question,
-         inputs=[question_input, gr.State(value="")],
-         outputs=[answer_output]
-     )
-
- # Combine both interfaces with routes
- app = gr.mount_gradio_app(
-     gradio.routes.App.create_app(),
-     main_demo,
-     path="/",
- )
- app = gr.mount_gradio_app(
-     app,
-     answer_demo,
-     path="/answer",
- )
-
- if __name__ == "__main__":
-     main_demo.launch()
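
Note on the routing block at the end of the removed file: `gradio.routes.App.create_app()` is called there without the `Blocks` argument it takes in most gradio versions, so the two `gr.mount_gradio_app` calls would likely fail as written. Below is a minimal sketch (not part of this commit) of the usual pattern for serving both interfaces from one server, reusing the `main_demo` and `answer_demo` Blocks defined in the deleted file and assuming `fastapi` and `uvicorn` are installed; the host and port values are illustrative.

```python
# Sketch only: mount both Blocks apps on a plain FastAPI instance.
from fastapi import FastAPI
import gradio as gr
import uvicorn

app = FastAPI()

# Mount the more specific path first so the root mount does not shadow it.
app = gr.mount_gradio_app(app, answer_demo, path="/answer")  # QA-only interface
app = gr.mount_gradio_app(app, main_demo, path="/")          # training + QA interface

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
```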