ramysaidagieb committed on
Commit 27e9342 · verified · 1 Parent(s): 513a1ff

Delete app.py

Files changed (1)
  1. app.py +0 -261
app.py DELETED
@@ -1,261 +0,0 @@
- import gradio as gr
- import os
- import docx
- import fitz  # PyMuPDF
- from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, pipeline
- from datasets import Dataset
- import re
- import logging
- from datetime import datetime
- import gradio.routes
- import warnings
-
- # Suppress FutureWarning from huggingface_hub
- warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub.file_download")
-
- # Setup logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Initialize tokenizer and model with error handling
- model_name = "aubmindlab/bert-base-arabertv2"
- try:
-     logger.info(f"{datetime.now()}: Loading tokenizer for {model_name}")
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-     logger.info(f"{datetime.now()}: Loading model for {model_name}")
-     model = AutoModelForQuestionAnswering.from_pretrained(model_name)
- except Exception as e:
-     logger.error(f"{datetime.now()}: Failed to load model/tokenizer: {e}")
-     raise
-
- # Directory to save fine-tuned model
- MODEL_SAVE_PATH = "./fine_tuned_model"
-
- # Custom Arabic text preprocessing function
- def preprocess_arabic_text(text):
-     logger.info(f"{datetime.now()}: Preprocessing text (length: {len(text)} characters)")
-     # Remove Arabic diacritics
-     diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
-     text = diacritics.sub('', text)
-     # Normalize Arabic characters
-     text = re.sub(r'[أإآ]', 'ا', text)
-     text = re.sub(r'ى', 'ي', text)
-     text = re.sub(r'ة', 'ه', text)
-     # Remove extra spaces and non-essential characters
-     text = re.sub(r'\s+', ' ', text)
-     text = re.sub(r'[^\w\s]', '', text)
-     logger.info(f"{datetime.now()}: Text preprocessed, new length: {len(text)} characters")
-     return text.strip()
-
- # Function to extract text from .docx
- def extract_text_docx(file_path):
-     logger.info(f"{datetime.now()}: Extracting text from .docx file: {file_path}")
-     try:
-         doc = docx.Document(file_path)
-         text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-         logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .docx")
-         return text
-     except Exception as e:
-         logger.error(f"{datetime.now()}: Error extracting text from .docx: {e}")
-         return ""
-
- # Function to extract text from .pdf
- def extract_text_pdf(file_path):
-     logger.info(f"{datetime.now()}: Extracting text from .pdf file: {file_path}")
-     try:
-         doc = fitz.open(file_path)
-         text = ""
-         for page in doc:
-             text += page.get_text()
-         logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .pdf")
-         return text
-     except Exception as e:
-         logger.error(f"{datetime.now()}: Error extracting text from .pdf: {e}")
-         return ""
-
- # Function to chunk text for dataset
- def chunk_text(text, max_length=512):
-     logger.info(f"{datetime.now()}: Chunking text into segments")
-     words = text.split()
-     chunks = []
-     current_chunk = []
-     current_length = 0
-     for word in words:
-         current_chunk.append(word)
-         current_length += len(word) + 1
-         if current_length >= max_length:
-             chunks.append(" ".join(current_chunk))
-             current_chunk = []
-             current_length = 0
-     if current_chunk:
-         chunks.append(" ".join(current_chunk))
-     logger.info(f"{datetime.now()}: Created {len(chunks)} text chunks")
-     return chunks
-
- # Function to prepare dataset
- def prepare_dataset(text):
-     logger.info(f"{datetime.now()}: Preparing dataset")
-     chunks = chunk_text(text)
-     data = {"text": chunks}
-     dataset = Dataset.from_dict(data)
-     logger.info(f"{datetime.now()}: Dataset prepared with {len(dataset)} examples")
-     return dataset
-
- # Function to tokenize dataset
- def tokenize_dataset(dataset):
-     logger.info(f"{datetime.now()}: Tokenizing dataset")
-     def tokenize_function(examples):
-         return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
-     tokenized_dataset = dataset.map(tokenize_function, batched=True)
-     logger.info(f"{datetime.now()}: Dataset tokenized")
-     return tokenized_dataset
-
- # Function to fine-tune model
- def fine_tune_model(dataset):
-     logger.info(f"{datetime.now()}: Starting model fine-tuning")
-     training_args = TrainingArguments(
-         output_dir="./results",
-         num_train_epochs=1,
-         per_device_train_batch_size=4,
-         save_steps=10_000,
-         save_total_limit=2,
-         logging_dir='./logs',
-         logging_steps=200,
-     )
-
-     trainer = Trainer(
-         model=model,
-         args=training_args,
-         train_dataset=dataset,
-     )
-
-     trainer.train()
-     model.save_pretrained(MODEL_SAVE_PATH)
-     tokenizer.save_pretrained(MODEL_SAVE_PATH)
-     logger.info(f"{datetime.now()}: Model fine-tuned and saved to {MODEL_SAVE_PATH}")
-
- # Function to handle file upload and training
- def upload_and_train(files, progress=gr.Progress()):
-     uploaded_files = []
-     all_text = ""
-     training_log = []
-
-     def log_and_update(step, desc, progress_value):
-         msg = f"{datetime.now()}: {desc}"
-         logger.info(msg)
-         training_log.append(msg)
-         progress(progress_value, desc=desc)
-         return "\n".join(training_log)
-
-     log_and_update("Starting upload", "Loading books...", 0.1)
-     for file in files:
-         file_name = os.path.basename(file.name)
-         uploaded_files.append(file_name)
-         if file_name.endswith(".docx"):
-             text = extract_text_docx(file.name)
-         elif file_name.endswith(".pdf"):
-             text = extract_text_pdf(file.name)
-         else:
-             continue
-         all_text += text + "\n"
-
-     if not all_text.strip():
-         msg = f"{datetime.now()}: No valid text extracted from uploaded files."
-         logger.error(msg)
-         training_log.append(msg)
-         return "\n".join(training_log), uploaded_files
-
-     log_and_update("Text extraction complete", "Extracting ideas...", 0.4)
-     cleaned_text = preprocess_arabic_text(all_text)
-
-     log_and_update("Preprocessing complete", "Preparing dataset...", 0.6)
-     dataset = prepare_dataset(cleaned_text)
-     tokenized_dataset = tokenize_dataset(dataset)
-
-     log_and_update("Dataset preparation complete", "Training in progress...", 0.8)
-     fine_tune_model(tokenized_dataset)
-
-     log_and_update("Training complete", "Training completed!", 1.0)
-
-     # Example QA
-     qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
-     example_question = "ما هو قانون الإيمان وفقًا للكتاب؟"
-     example_answer = qa_pipeline(question=example_question, context=cleaned_text[:512])["answer"]
-
-     final_message = (
-         f"Training process finished: Enter your question\n\n"
-         f"**مثال لسؤال**: {example_question}\n"
-         f"**الإجابة**: {example_answer}\n\n"
-         f"**سجل التدريب**:\n" + "\n".join(training_log)
-     )
-     return final_message, uploaded_files
-
- # Function to answer questions
- def answer_question(question, context):
-     if not os.path.exists(MODEL_SAVE_PATH):
-         return "النظام لم يتم تدريبه بعد. الرجاء رفع الكتب وتدريب النظام أولاً."
-
-     qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
-     answer = qa_pipeline(question=question, context=context[:512])["answer"]
-     return answer
-
- # Main Gradio Interface (for training and QA)
- with gr.Blocks() as main_demo:
-     gr.Markdown("# نظام ذكاء اصطناعي لتحليل الكتب باللغة العربية")
-
-     with gr.Row():
-         with gr.Column():
-             file_upload = gr.File(file_types=[".docx", ".pdf"], label="رفع الكتب", file_count="multiple")
-             upload_button = gr.Button("رفع وتدريب")
-             uploaded_files = gr.Textbox(label="الكتب المرفوعة")
-
-         with gr.Column():
-             training_status = gr.Textbox(label="حالة التدريب", lines=10)
-
-     with gr.Row():
-         question_input = gr.Textbox(label="أدخل سؤالك بالعربية", placeholder="مثال: ما هو قانون الإيمان؟")
-         answer_output = gr.Textbox(label="الإجابة")
-         ask_button = gr.Button("طرح السؤال")
-
-     # Event handlers
-     upload_button.click(
-         fn=upload_and_train,
-         inputs=[file_upload],
-         outputs=[training_status, uploaded_files]
-     )
-
-     ask_button.click(
-         fn=answer_question,
-         inputs=[question_input, gr.State(value="")],
-         outputs=[answer_output]
-     )
-
- # Question-Answering Only Interface
- with gr.Blocks() as answer_demo:
-     gr.Markdown("# طرح الأسئلة على نظام تحليل الكتب باللغة العربية")
-     gr.Markdown("أدخل سؤالك بالعربية وسيتم الإجابة بناءً على محتوى الكتب المدربة.")
-
-     question_input = gr.Textbox(label="أدخل سؤالك", placeholder="مثال: ما هو قانون الإيمان؟")
-     answer_output = gr.Textbox(label="الإجابة")
-     ask_button = gr.Button("طرح السؤال")
-
-     ask_button.click(
-         fn=answer_question,
-         inputs=[question_input, gr.State(value="")],
-         outputs=[answer_output]
-     )
-
- # Combine both interfaces with routes
- app = gr.mount_gradio_app(
-     gradio.routes.App.create_app(),
-     main_demo,
-     path="/",
- )
- app = gr.mount_gradio_app(
-     app,
-     answer_demo,
-     path="/answer",
- )
-
- if __name__ == "__main__":
-     main_demo.launch()