ramysaidagieb committed on
Commit 8779ba9 · verified · 1 Parent(s): ca16700

Delete app.py

Files changed (1)
  1. app.py +0 -247
app.py DELETED
@@ -1,247 +0,0 @@
- import gradio as gr
- import os
- import docx
- import fitz # PyMuPDF
- from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, pipeline
- from datasets import Dataset
- import re
- import logging
- from datetime import datetime
- import warnings
-
- # Suppress FutureWarning from huggingface_hub
- warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub.file_download")
-
- # Setup logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Initialize tokenizer and model with error handling
- model_name = "aubmindlab/bert-base-arabertv2"
- try:
-     logger.info(f"{datetime.now()}: Loading tokenizer for {model_name}")
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-     logger.info(f"{datetime.now()}: Loading model for {model_name}")
-     model = AutoModelForQuestionAnswering.from_pretrained(model_name)
- except Exception as e:
-     logger.error(f"{datetime.now()}: Failed to load model/tokenizer: {e}")
-     raise
-
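- # Note: "aubmindlab/bert-base-arabertv2" is a base pre-trained checkpoint, so the
- # question-answering head is newly initialized here; a transformers warning about
- # freshly initialized weights is expected on load.
-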
- # Directory to save fine-tuned model
- MODEL_SAVE_PATH = "./fine_tuned_model"
-
- # Custom Arabic text preprocessing function
- def preprocess_arabic_text(text):
-     logger.info(f"{datetime.now()}: Preprocessing text (length: {len(text)} characters)")
-     # Remove Arabic diacritics
-     diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
-     text = diacritics.sub('', text)
-     # Normalize Arabic characters
-     text = re.sub(r'[أإآ]', 'ا', text)
-     text = re.sub(r'ى', 'ي', text)
-     text = re.sub(r'ة', 'ه', text)
-     # Remove extra spaces and non-essential characters
-     text = re.sub(r'\s+', ' ', text)
-     text = re.sub(r'[^\w\s]', '', text)
-     logger.info(f"{datetime.now()}: Text preprocessed, new length: {len(text)} characters")
-     return text.strip()
-
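- # Illustrative example of the normalization above:
- #   preprocess_arabic_text("الإِيمَانُ قُوَّةٌ") -> "الايمان قوه"
- # (diacritics stripped, hamza forms and taa marbuta normalized, punctuation removed)
-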
- # Function to extract text from .docx
- def extract_text_docx(file_path):
-     logger.info(f"{datetime.now()}: Extracting text from .docx file: {file_path}")
-     try:
-         doc = docx.Document(file_path)
-         text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-         logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .docx")
-         return text
-     except Exception as e:
-         logger.error(f"{datetime.now()}: Error extracting text from .docx: {e}")
-         return ""
-
- # Function to extract text from .pdf
- def extract_text_pdf(file_path):
-     logger.info(f"{datetime.now()}: Extracting text from .pdf file: {file_path}")
-     try:
-         doc = fitz.open(file_path)
-         text = ""
-         for page in doc:
-             text += page.get_text()
-         logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .pdf")
-         return text
-     except Exception as e:
-         logger.error(f"{datetime.now()}: Error extracting text from .pdf: {e}")
-         return ""
-
- # Function to chunk text for dataset
- def chunk_text(text, max_length=512):
-     logger.info(f"{datetime.now()}: Chunking text into segments")
-     words = text.split()
-     chunks = []
-     current_chunk = []
-     current_length = 0
-     for word in words:
-         current_chunk.append(word)
-         current_length += len(word) + 1
-         if current_length >= max_length:
-             chunks.append(" ".join(current_chunk))
-             current_chunk = []
-             current_length = 0
-     if current_chunk:
-         chunks.append(" ".join(current_chunk))
-     logger.info(f"{datetime.now()}: Created {len(chunks)} text chunks")
-     return chunks
-
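- # Note: current_length counts characters, not tokens, so a nominal 512-character chunk can
- # exceed the model's 512-token limit after subword tokenization; the tokenizer below
- # truncates such chunks.
-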
- # Function to prepare dataset
- def prepare_dataset(text):
-     logger.info(f"{datetime.now()}: Preparing dataset")
-     chunks = chunk_text(text)
-     data = {"text": chunks}
-     dataset = Dataset.from_dict(data)
-     logger.info(f"{datetime.now()}: Dataset prepared with {len(dataset)} examples")
-     return dataset
-
- # Function to tokenize dataset
- def tokenize_dataset(dataset):
-     logger.info(f"{datetime.now()}: Tokenizing dataset")
-     def tokenize_function(examples):
-         return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
-     tokenized_dataset = dataset.map(tokenize_function, batched=True)
-     logger.info(f"{datetime.now()}: Dataset tokenized")
-     return tokenized_dataset
-
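- # Caveat: extractive QA fine-tuning normally requires start_positions/end_positions labels.
- # The tokenized chunks above contain no answer spans, so the model produces no loss and
- # Trainer.train() below is likely to fail; a labelled (question, context, answer) dataset
- # or a different training objective would be needed.
-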
- # Function to fine-tune model
- def fine_tune_model(dataset):
-     logger.info(f"{datetime.now()}: Starting model fine-tuning")
-     training_args = TrainingArguments(
-         output_dir="./results",
-         num_train_epochs=1,
-         per_device_train_batch_size=4,
-         save_steps=10_000,
-         save_total_limit=2,
-         logging_dir='./logs',
-         logging_steps=200,
-     )
-
-     trainer = Trainer(
-         model=model,
-         args=training_args,
-         train_dataset=dataset,
-     )
-
-     trainer.train()
-     model.save_pretrained(MODEL_SAVE_PATH)
-     tokenizer.save_pretrained(MODEL_SAVE_PATH)
-     logger.info(f"{datetime.now()}: Model fine-tuned and saved to {MODEL_SAVE_PATH}")
-
- # Function to handle file upload and training
- def upload_and_train(files, progress=gr.Progress()):
-     uploaded_files = []
-     all_text = ""
-     training_log = []
-
-     def log_and_update(step, desc, progress_value):
-         msg = f"{datetime.now()}: {desc}"
-         logger.info(msg)
-         training_log.append(msg)
-         progress(progress_value, desc=desc)
-         return "\n".join(training_log)
-
-     log_and_update("Starting upload", "Loading books...", 0.1)
-     for file in files:
-         file_name = os.path.basename(file.name)
-         uploaded_files.append(file_name)
-         if file_name.endswith(".docx"):
-             text = extract_text_docx(file.name)
-         elif file_name.endswith(".pdf"):
-             text = extract_text_pdf(file.name)
-         else:
-             continue
-         all_text += text + "\n"
-
-     if not all_text.strip():
-         msg = f"{datetime.now()}: No valid text extracted from uploaded files."
-         logger.error(msg)
-         training_log.append(msg)
-         return "\n".join(training_log), uploaded_files
-
-     log_and_update("Text extraction complete", "Extracting ideas...", 0.4)
-     cleaned_text = preprocess_arabic_text(all_text)
-
-     log_and_update("Preprocessing complete", "Preparing dataset...", 0.6)
-     dataset = prepare_dataset(cleaned_text)
-     tokenized_dataset = tokenize_dataset(dataset)
-
-     log_and_update("Dataset preparation complete", "Training in progress...", 0.8)
-     fine_tune_model(tokenized_dataset)
-
-     log_and_update("Training complete", "Training completed!", 1.0)
-
-     # Example QA
-     qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
-     example_question = "ما هو قانون الإيمان وفقًا للكتاب؟"
-     example_answer = qa_pipeline(question=example_question, context=cleaned_text[:512])["answer"]
-
-     final_message = (
-         f"Training process finished: Enter your question\n\n"
-         f"**مثال لسؤال**: {example_question}\n"
-         f"**الإجابة**: {example_answer}\n\n"
-         f"**سجل التدريب**:\n" + "\n".join(training_log)
-     )
-     return final_message, uploaded_files
-
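- # Note: pipeline(...) reloads the fine-tuned model from disk on every call, here and in
- # answer_question below; caching the pipeline in a module-level variable would avoid
- # repeated loads.
-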
- # Function to answer questions
- def answer_question(question, context):
-     if not os.path.exists(MODEL_SAVE_PATH):
-         return "النظام لم يتم تدريبه بعد. الرجاء رفع الكتب وتدريب النظام أولاً."
-
-     qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
-     answer = qa_pipeline(question=question, context=context[:512])["answer"]
-     return answer
-
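- # Note: in the interface below, both ask buttons pass gr.State(value="") as the context,
- # so answer_question always receives an empty string; persisting the cleaned book text at
- # training time (e.g. to a file) and loading it here would give the model real context to
- # answer from.
-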
- # Gradio Interface with Tabs
- with gr.Blocks(title="Arabic Book Analysis AI") as demo:
-     gr.Markdown("# نظام ذكاء اصطناعي لتحليل الكتب باللغة العربية")
-
-     with gr.Tabs():
-         with gr.TabItem("التدريب والسؤال"):
-             with gr.Row():
-                 with gr.Column():
-                     file_upload = gr.File(file_types=[".docx", ".pdf"], label="رفع الكتب", file_count="multiple")
-                     upload_button = gr.Button("رفع وتدريب")
-                     uploaded_files = gr.Textbox(label="الكتب المرفوعة")
-
-                 with gr.Column():
-                     training_status = gr.Textbox(label="حالة التدريب", lines=10)
-
-             with gr.Row():
-                 question_input = gr.Textbox(label="أدخل سؤالك بالعربية", placeholder="مثال: ما هو قانون الإيمان؟")
-                 answer_output = gr.Textbox(label="الإجابة")
-                 ask_button = gr.Button("طرح السؤال")
-
-             # Event handlers
-             upload_button.click(
-                 fn=upload_and_train,
-                 inputs=[file_upload],
-                 outputs=[training_status, uploaded_files]
-             )
-
-             ask_button.click(
-                 fn=answer_question,
-                 inputs=[question_input, gr.State(value="")],
-                 outputs=[answer_output]
-             )
-
-         with gr.TabItem("طرح الأسئلة فقط"):
-             gr.Markdown("أدخل سؤالك بالعربية وسيتم الإجابة بناءً على محتوى الكتب المدربة.")
-             question_input_qa = gr.Textbox(label="أدخل سؤالك", placeholder="مثال: ما هو قانون الإيمان؟")
-             answer_output_qa = gr.Textbox(label="الإجابة")
-             ask_button_qa = gr.Button("طرح السؤال")
-
-             ask_button_qa.click(
-                 fn=answer_question,
-                 inputs=[question_input_qa, gr.State(value="")],
-                 outputs=[answer_output_qa]
-             )
-
- if __name__ == "__main__":
-     demo.launch()