import gradio as gr
import os
import docx
import fitz  # PyMuPDF
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, pipeline
from datasets import Dataset
import re
import logging
from datetime import datetime
import warnings
# Suppress FutureWarning from huggingface_hub
warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub.file_download")
# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Initialize tokenizer and model with error handling
model_name = "aubmindlab/bert-base-arabertv2"
try:
    logger.info(f"{datetime.now()}: Loading tokenizer for {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info(f"{datetime.now()}: Loading model for {model_name}")
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
except Exception as e:
    logger.error(f"{datetime.now()}: Failed to load model/tokenizer: {e}")
    raise
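# Note: aubmindlab/bert-base-arabertv2 ships only the pretrained encoder, so the
# question-answering head loaded here starts from fresh weights (transformers
# typically logs a "newly initialized" warning); it only becomes useful after fine-tuning.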
# Directory to save fine-tuned model
MODEL_SAVE_PATH = "./fine_tuned_model"
# Custom Arabic text preprocessing function
def preprocess_arabic_text(text):
    logger.info(f"{datetime.now()}: Preprocessing text (length: {len(text)} characters)")
    # Remove Arabic diacritics
    diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = diacritics.sub('', text)
    # Normalize Arabic characters
    text = re.sub(r'[أإآ]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'ة', 'ه', text)
    # Remove extra spaces and non-essential characters
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    logger.info(f"{datetime.now()}: Text preprocessed, new length: {len(text)} characters")
    return text.strip()
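# Illustrative example of what the normalization above does:
#   preprocess_arabic_text("ما هو قانونُ الإيمانِ؟")  ->  "ما هو قانون الايمان"
# (diacritics and punctuation are dropped, and alef variants are unified).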
# Function to extract text from .docx
def extract_text_docx(file_path):
    logger.info(f"{datetime.now()}: Extracting text from .docx file: {file_path}")
    try:
        doc = docx.Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
        logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .docx")
        return text
    except Exception as e:
        logger.error(f"{datetime.now()}: Error extracting text from .docx: {e}")
        return ""
# Function to extract text from .pdf
def extract_text_pdf(file_path):
    logger.info(f"{datetime.now()}: Extracting text from .pdf file: {file_path}")
    try:
        text = ""
        with fitz.open(file_path) as doc:  # ensure the document handle is closed
            for page in doc:
                text += page.get_text()
        logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .pdf")
        return text
    except Exception as e:
        logger.error(f"{datetime.now()}: Error extracting text from .pdf: {e}")
        return ""
# Function to chunk text into roughly max_length-character segments for the dataset
def chunk_text(text, max_length=512):
    logger.info(f"{datetime.now()}: Chunking text into segments")
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        current_chunk.append(word)
        current_length += len(word) + 1  # +1 for the separating space
        if current_length >= max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    logger.info(f"{datetime.now()}: Created {len(chunks)} text chunks")
    return chunks
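# Note: max_length here counts characters, not tokens; chunks of roughly 512
# characters typically stay well under the 512-token limit applied during
# tokenization below, and truncation=True guards the rest.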
# Function to prepare dataset
def prepare_dataset(text):
    logger.info(f"{datetime.now()}: Preparing dataset")
    chunks = chunk_text(text)
    data = {"text": chunks}
    dataset = Dataset.from_dict(data)
    logger.info(f"{datetime.now()}: Dataset prepared with {len(dataset)} examples")
    return dataset
# Function to tokenize dataset
def tokenize_dataset(dataset):
    logger.info(f"{datetime.now()}: Tokenizing dataset")
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    logger.info(f"{datetime.now()}: Dataset tokenized")
    return tokenized_dataset
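# Note: dataset.map() keeps the original "text" column and adds the tokenizer
# outputs (input_ids, token_type_ids, attention_mask); by default the Trainer
# later drops any columns the model's forward() does not accept.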
# Function to fine-tune model
def fine_tune_model(dataset):
    logger.info(f"{datetime.now()}: Starting model fine-tuning")
    # NOTE: a question-answering head only computes a loss when the batch carries
    # start_positions/end_positions; the unlabeled chunks produced above do not
    # include them (see the labelled-example sketch below).
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=1,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        logging_steps=200,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )
    trainer.train()
    model.save_pretrained(MODEL_SAVE_PATH)
    tokenizer.save_pretrained(MODEL_SAVE_PATH)
    logger.info(f"{datetime.now()}: Model fine-tuned and saved to {MODEL_SAVE_PATH}")
# Function to handle file upload and training
def upload_and_train(files, progress=gr.Progress()):
    uploaded_files = []
    all_text = ""
    training_log = []
    def log_and_update(step, desc, progress_value):
        msg = f"{datetime.now()}: {step} - {desc}"
        logger.info(msg)
        training_log.append(msg)
        progress(progress_value, desc=desc)
        return "\n".join(training_log)
    log_and_update("Starting upload", "Loading books...", 0.1)
    for file in files:
        file_name = os.path.basename(file.name)
        uploaded_files.append(file_name)
        if file_name.endswith(".docx"):
            text = extract_text_docx(file.name)
        elif file_name.endswith(".pdf"):
            text = extract_text_pdf(file.name)
        else:
            continue  # skip unsupported file types
        all_text += text + "\n"
    if not all_text.strip():
        msg = f"{datetime.now()}: No valid text extracted from uploaded files."
        logger.error(msg)
        training_log.append(msg)
        return "\n".join(training_log), ", ".join(uploaded_files), ""
    log_and_update("Text extraction complete", "Extracting ideas...", 0.4)
    cleaned_text = preprocess_arabic_text(all_text)
    log_and_update("Preprocessing complete", "Preparing dataset...", 0.6)
    dataset = prepare_dataset(cleaned_text)
    tokenized_dataset = tokenize_dataset(dataset)
    log_and_update("Dataset preparation complete", "Training in progress...", 0.8)
    fine_tune_model(tokenized_dataset)
    log_and_update("Training complete", "Training completed!", 1.0)
    # Example QA
    qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
    example_question = "ما هو قانون الإيمان وفقًا للكتاب؟"  # "What is the law of faith according to the book?"
    example_answer = qa_pipeline(question=example_question, context=cleaned_text[:512])["answer"]
    final_message = (
        f"Training process finished: Enter your question\n\n"
        f"**مثال لسؤال**: {example_question}\n"
        f"**الإجابة**: {example_answer}\n\n"
        f"**سجل التدريب**:\n" + "\n".join(training_log)
    )
    # Also return the cleaned text so later questions have a context to search
    return final_message, ", ".join(uploaded_files), cleaned_text
# Function to answer questions
def answer_question(question, context):
    if not os.path.exists(MODEL_SAVE_PATH):
        return "النظام لم يتم تدريبه بعد. الرجاء رفع الكتب وتدريب النظام أولاً."  # "The system has not been trained yet."
    if not context.strip():
        return "لا يوجد نص متاح للإجابة. الرجاء رفع الكتب وتدريب النظام أولاً."  # guard: the QA pipeline cannot run on an empty context
    qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
    answer = qa_pipeline(question=question, context=context[:512])["answer"]
    return answer
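# Sketch (assumption, not used by the UI): the call above only searches the first
# 512 characters of the stored context; scoring every chunk and keeping the
# highest-scoring span would cover the whole book.
def answer_question_over_chunks(question, context):
    qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
    best = {"score": 0.0, "answer": ""}
    for chunk in chunk_text(context):
        result = qa_pipeline(question=question, context=chunk)
        if result["score"] > best["score"]:
            best = result
    return best["answer"]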
# Gradio Interface with Tabs
with gr.Blocks(title="Arabic Book Analysis AI") as demo:
    gr.Markdown("# نظام ذكاء اصطناعي لتحليل الكتب باللغة العربية")  # "An AI system for analysing Arabic-language books"
    # Holds the preprocessed book text so the QA handlers have a context to search
    context_state = gr.State(value="")
    with gr.Tabs():
        with gr.TabItem("التدريب والسؤال"):  # "Training and questions"
            with gr.Row():
                with gr.Column():
                    file_upload = gr.File(file_types=[".docx", ".pdf"], label="رفع الكتب", file_count="multiple")
                    upload_button = gr.Button("رفع وتدريب")
                    uploaded_files = gr.Textbox(label="الكتب المرفوعة")
                with gr.Column():
                    training_status = gr.Textbox(label="حالة التدريب", lines=10)
            with gr.Row():
                question_input = gr.Textbox(label="أدخل سؤالك بالعربية", placeholder="مثال: ما هو قانون الإيمان؟")
                answer_output = gr.Textbox(label="الإجابة")
            ask_button = gr.Button("طرح السؤال")
            # Event handlers
            upload_button.click(
                fn=upload_and_train,
                inputs=[file_upload],
                outputs=[training_status, uploaded_files, context_state]
            )
            ask_button.click(
                fn=answer_question,
                inputs=[question_input, context_state],
                outputs=[answer_output]
            )
        with gr.TabItem("طرح الأسئلة فقط"):  # "Ask questions only"
            gr.Markdown("أدخل سؤالك بالعربية وسيتم الإجابة بناءً على محتوى الكتب المدربة.")
            question_input_qa = gr.Textbox(label="أدخل سؤالك", placeholder="مثال: ما هو قانون الإيمان؟")
            answer_output_qa = gr.Textbox(label="الإجابة")
            ask_button_qa = gr.Button("طرح السؤال")
            ask_button_qa.click(
                fn=answer_question,
                inputs=[question_input_qa, context_state],
                outputs=[answer_output_qa]
            )
if __name__ == "__main__":
    demo.launch()