Spaces:
Configuration error
Configuration error
Delete app.py
Browse files
app.py
DELETED
@@ -1,247 +0,0 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
import os
|
3 |
-
import docx
|
4 |
-
import fitz # PyMuPDF
|
5 |
-
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, pipeline
|
6 |
-
from datasets import Dataset
|
7 |
-
import re
|
8 |
-
import logging
|
9 |
-
from datetime import datetime
|
10 |
-
import warnings
|
11 |
-
|
12 |
-
# Silence the FutureWarning raised from huggingface_hub's file_download module.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    module="huggingface_hub.file_download",
)

# Root logging at INFO; the functions in this file log through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Checkpoint that is loaded at import time and later passed to the Trainer.
model_name = "aubmindlab/bert-base-arabertv2"

# Directory that receives the fine-tuned model and tokenizer.
MODEL_SAVE_PATH = "./fine_tuned_model"


def _load_tokenizer_and_model(checkpoint):
    """Load the tokenizer and the question-answering model for ``checkpoint``.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        Exception: whatever ``from_pretrained`` raised; the failure is logged
        first and then re-raised unchanged.
    """
    try:
        logger.info(f"{datetime.now()}: Loading tokenizer for {checkpoint}")
        loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        logger.info(f"{datetime.now()}: Loading model for {checkpoint}")
        loaded_model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
    except Exception as exc:
        logger.error(f"{datetime.now()}: Failed to load model/tokenizer: {exc}")
        raise
    return loaded_tokenizer, loaded_model


tokenizer, model = _load_tokenizer_and_model(model_name)
|
32 |
-
|
33 |
-
# Custom Arabic text preprocessing function
|
34 |
-
# Diacritic marks removed from the text (U+0617-U+061A and U+064B-U+0652).
_ARABIC_DIACRITICS = re.compile(r'[\u0617-\u061A\u064B-\u0652]')

# Letter normalizations, applied in a single pass by str.translate.
_ARABIC_LETTER_MAP = str.maketrans({
    "\u0622": "\u0627",  # alef with madda above -> bare alef
    "\u0623": "\u0627",  # alef with hamza above -> bare alef
    "\u0625": "\u0627",  # alef with hamza below -> bare alef
    "\u0649": "\u064a",  # alef maksura -> yeh
    "\u0629": "\u0647",  # teh marbuta -> heh
})


def preprocess_arabic_text(text):
    """Normalize Arabic text before chunking and tokenization.

    Steps, in order:
      1. remove diacritic marks;
      2. fold alef variants to bare alef, alef maksura to yeh and
         teh marbuta to heh;
      3. delete every character that is neither a word character nor
         whitespace (i.e. punctuation and symbols);
      4. collapse each whitespace run to one space and trim both ends.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; words are separated by exactly one space.
    """
    logger.info(f"{datetime.now()}: Preprocessing text (length: {len(text)} characters)")
    text = _ARABIC_DIACRITICS.sub('', text)
    text = text.translate(_ARABIC_LETTER_MAP)
    # Punctuation must go BEFORE whitespace is collapsed: deleting a symbol
    # that sits between two spaces ("a , b") would otherwise leave a double
    # space in the result.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Logged after trimming so the reported length is the returned length.
    logger.info(f"{datetime.now()}: Text preprocessed, new length: {len(text)} characters")
    return text
|
48 |
-
|
49 |
-
# Function to extract text from .docx
|
50 |
-
def extract_text_docx(file_path):
    """Return the non-blank paragraphs of a .docx file joined by newlines.

    Any exception raised while opening or reading the document is logged and
    swallowed; in that case an empty string is returned.
    """
    logger.info(f"{datetime.now()}: Extracting text from .docx file: {file_path}")
    try:
        document = docx.Document(file_path)
        kept_paragraphs = []
        for paragraph in document.paragraphs:
            content = paragraph.text
            # Skip paragraphs that are empty or whitespace-only.
            if content.strip():
                kept_paragraphs.append(content)
        extracted = "\n".join(kept_paragraphs)
        logger.info(f"{datetime.now()}: Successfully extracted {len(extracted)} characters from .docx")
    except Exception as error:
        logger.error(f"{datetime.now()}: Error extracting text from .docx: {error}")
        return ""
    return extracted
|
60 |
-
|
61 |
-
# Function to extract text from .pdf
|
62 |
-
def extract_text_pdf(file_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The text of all pages in page order, or an empty string if the file
        could not be opened or read (the error is logged, not raised).
    """
    logger.info(f"{datetime.now()}: Extracting text from .pdf file: {file_path}")
    try:
        # Open the document as a context manager so the underlying file is
        # closed even when a page fails to render; previously it was never
        # closed at all.
        with fitz.open(file_path) as doc:
            text = "".join(page.get_text() for page in doc)
    except Exception as e:
        logger.error(f"{datetime.now()}: Error extracting text from .pdf: {e}")
        return ""
    logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .pdf")
    return text
|
74 |
-
|
75 |
-
# Function to chunk text for dataset
|
76 |
-
def chunk_text(text, max_length=512):
    """Split ``text`` into space-joined chunks of at most ``max_length`` characters.

    The text is split on whitespace and words are packed greedily. A chunk is
    closed *before* a word that would push it past ``max_length``, so chunks
    never exceed the limit; the only exception is a single word that is
    itself longer than ``max_length``, which becomes a chunk of its own
    because words are never split.

    Args:
        text: Text to split.
        max_length: Maximum chunk length in characters (not tokens).

    Returns:
        A list of chunk strings in original word order; empty when ``text``
        contains no words.
    """
    logger.info(f"{datetime.now()}: Chunking text into segments")
    chunks = []
    current_chunk = []
    current_length = 0  # always equals len(" ".join(current_chunk))
    for word in text.split():
        # Length of the joined chunk if this word were appended: one
        # separating space is needed unless the chunk is still empty.
        projected = current_length + len(word) + (1 if current_chunk else 0)
        if current_chunk and projected > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            projected = len(word)
        current_chunk.append(word)
        current_length = projected
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    logger.info(f"{datetime.now()}: Created {len(chunks)} text chunks")
    return chunks
|
93 |
-
|
94 |
-
# Function to prepare dataset
|
95 |
-
def prepare_dataset(text):
    """Chunk ``text`` and wrap the chunks in a Dataset with one "text" column."""
    logger.info(f"{datetime.now()}: Preparing dataset")
    segments = chunk_text(text)
    examples = Dataset.from_dict({"text": segments})
    logger.info(f"{datetime.now()}: Dataset prepared with {len(examples)} examples")
    return examples
|
102 |
-
|
103 |
-
# Function to tokenize dataset
|
104 |
-
def tokenize_dataset(dataset):
    """Tokenize the "text" column of ``dataset`` with the module-level tokenizer.

    Returns the result of a batched ``dataset.map`` over the encoder below.
    """
    logger.info(f"{datetime.now()}: Tokenizing dataset")

    def _encode_batch(batch):
        # Pad to max_length and truncate, both with a 512 limit.
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )

    encoded = dataset.map(_encode_batch, batched=True)
    logger.info(f"{datetime.now()}: Dataset tokenized")
    return encoded
|
111 |
-
|
112 |
-
# Function to fine-tune model
|
113 |
-
def fine_tune_model(dataset):
    """Train the module-level model on ``dataset`` for one epoch and save it.

    After training, both the model and the tokenizer are written to
    ``MODEL_SAVE_PATH``. Nothing is returned.

    NOTE(review): the dataset built in this file only carries a tokenized
    "text" column and no answer-span labels; presumably the Trainer needs
    such labels to compute a loss for a question-answering model -- TODO
    confirm that training actually runs with this input.
    """
    logger.info(f"{datetime.now()}: Starting model fine-tuning")

    hyperparameters = dict(
        output_dir="./results",
        num_train_epochs=1,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        logging_steps=200,
    )
    trainer = Trainer(
        model=model,
        args=TrainingArguments(**hyperparameters),
        train_dataset=dataset,
    )
    trainer.train()

    # Persist the model first, then the tokenizer, into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(MODEL_SAVE_PATH)
    logger.info(f"{datetime.now()}: Model fine-tuned and saved to {MODEL_SAVE_PATH}")
|
135 |
-
|
136 |
-
# Function to handle file upload and training
|
137 |
-
def upload_and_train(files, progress=gr.Progress()):
    """Extract text from uploaded books, fine-tune the model and run a sample query.

    Args:
        files: Uploaded files as delivered by the Gradio file component. May
            be ``None`` or empty when nothing was uploaded. Each entry may be
            a path string or an object exposing the path as ``.name``.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        A ``(status_text, uploaded_file_names)`` tuple. ``status_text`` is
        either the training log ending in an error line (when no usable text
        was found) or the final report with the example question, its answer
        and the training log. ``uploaded_file_names`` lists the base name of
        every uploaded file, including files of unsupported type that were
        skipped.
    """
    uploaded_files = []
    extracted_texts = []
    training_log = []

    def log_and_update(step, desc, progress_value):
        # `step` only labels the call site; `desc` is what gets logged and shown.
        msg = f"{datetime.now()}: {desc}"
        logger.info(msg)
        training_log.append(msg)
        progress(progress_value, desc=desc)
        return "\n".join(training_log)

    def abort_without_text():
        # Shared exit for "nothing to train on": log it and return the log so far.
        msg = f"{datetime.now()}: No valid text extracted from uploaded files."
        logger.error(msg)
        training_log.append(msg)
        return "\n".join(training_log), uploaded_files

    log_and_update("Starting upload", "Loading books...", 0.1)
    # `files` is None when the button is pressed with nothing uploaded;
    # iterating it directly would raise TypeError.
    for file in files or []:
        # Accept both plain path strings and objects carrying the path in .name.
        file_path = file if isinstance(file, str) else file.name
        file_name = os.path.basename(file_path)
        uploaded_files.append(file_name)
        # Compare the extension case-insensitively so "BOOK.PDF" is not skipped.
        extension = os.path.splitext(file_name)[1].lower()
        if extension == ".docx":
            text = extract_text_docx(file_path)
        elif extension == ".pdf":
            text = extract_text_pdf(file_path)
        else:
            continue
        extracted_texts.append(text)

    all_text = "".join(text + "\n" for text in extracted_texts)
    if not all_text.strip():
        return abort_without_text()

    log_and_update("Text extraction complete", "Extracting ideas...", 0.4)
    cleaned_text = preprocess_arabic_text(all_text)
    # Preprocessing strips punctuation, so text made only of symbols ends up
    # empty; stop here instead of training on nothing and querying the
    # pipeline with an empty context.
    if not cleaned_text:
        return abort_without_text()

    log_and_update("Preprocessing complete", "Preparing dataset...", 0.6)
    dataset = prepare_dataset(cleaned_text)
    tokenized_dataset = tokenize_dataset(dataset)

    log_and_update("Dataset preparation complete", "Training in progress...", 0.8)
    fine_tune_model(tokenized_dataset)

    log_and_update("Training complete", "Training completed!", 1.0)

    # Example QA against the first 512 characters of the cleaned text.
    qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
    example_question = "ما هو قانون الإيمان وفقًا للكتاب؟"
    example_answer = qa_pipeline(question=example_question, context=cleaned_text[:512])["answer"]

    final_message = (
        f"Training process finished: Enter your question\n\n"
        f"**مثال لسؤال**: {example_question}\n"
        f"**الإجابة**: {example_answer}\n\n"
        f"**سجل التدريب**:\n" + "\n".join(training_log)
    )
    return final_message, uploaded_files
|
191 |
-
|
192 |
-
# Function to answer questions
|
193 |
-
def answer_question(question, context):
    """Answer ``question`` from ``context`` using the fine-tuned model.

    Args:
        question: The user's question.
        context: Text to search for the answer; only its first 512
            characters are passed to the pipeline.

    Returns:
        The pipeline's "answer" string, or an Arabic explanatory message
        when the model directory does not exist yet or when the question or
        the context is empty.
    """
    if not os.path.exists(MODEL_SAVE_PATH):
        return "النظام لم يتم تدريبه بعد. الرجاء رفع الكتب وتدريب النظام أولاً."

    # Guard blank inputs here: a None context would fail on the slice below,
    # and the question-answering pipeline refuses empty question/context
    # strings, which would surface to the user as a raw error.
    if not question or not question.strip():
        return "الرجاء إدخال سؤال."
    if not context or not context.strip():
        return "لا يوجد نص متاح للإجابة عن السؤال."

    qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
    answer = qa_pipeline(question=question, context=context[:512])["answer"]
    return answer
|
200 |
-
|
201 |
-
# Gradio Interface with Tabs
|
202 |
-
# Gradio interface: one tab for training plus asking, one tab for asking only.
with gr.Blocks(title="Arabic Book Analysis AI") as demo:
    gr.Markdown("# نظام ذكاء اصطناعي لتحليل الكتب باللغة العربية")

    with gr.Tabs():
        # --- Tab 1: upload books, train, then ask -------------------------
        with gr.TabItem("التدريب والسؤال"):
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(
                        file_types=[".docx", ".pdf"],
                        label="رفع الكتب",
                        file_count="multiple",
                    )
                    upload_button = gr.Button("رفع وتدريب")
                    uploaded_files = gr.Textbox(label="الكتب المرفوعة")

                with gr.Column():
                    training_status = gr.Textbox(label="حالة التدريب", lines=10)

            with gr.Row():
                question_input = gr.Textbox(
                    label="أدخل سؤالك بالعربية",
                    placeholder="مثال: ما هو قانون الإيمان؟",
                )
                answer_output = gr.Textbox(label="الإجابة")
                ask_button = gr.Button("طرح السؤال")

            # Training fills the status box and the list of uploaded books.
            upload_button.click(
                fn=upload_and_train,
                inputs=[file_upload],
                outputs=[training_status, uploaded_files],
            )

            # NOTE(review): the context fed to answer_question is a State
            # holding an empty string and nothing in this file updates it --
            # TODO confirm where the book text is supposed to come from.
            training_tab_context = gr.State(value="")
            ask_button.click(
                fn=answer_question,
                inputs=[question_input, training_tab_context],
                outputs=[answer_output],
            )

        # --- Tab 2: ask questions only ------------------------------------
        with gr.TabItem("طرح الأسئلة فقط"):
            gr.Markdown("أدخل سؤالك بالعربية وسيتم الإجابة بناءً على محتوى الكتب المدربة.")
            question_input_qa = gr.Textbox(
                label="أدخل سؤالك",
                placeholder="مثال: ما هو قانون الإيمان؟",
            )
            answer_output_qa = gr.Textbox(label="الإجابة")
            ask_button_qa = gr.Button("طرح السؤال")

            # Same handler as the first tab, again with an empty-string State
            # as the context input.
            qa_tab_context = gr.State(value="")
            ask_button_qa.click(
                fn=answer_question,
                inputs=[question_input_qa, qa_tab_context],
                outputs=[answer_output_qa],
            )

# Start the app only when the file is executed directly.
if __name__ == "__main__":
    demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|