ramysaidagieb committed on
Commit 8779ba9 · verified · 1 Parent(s): ca16700

Delete app.py

Files changed (1)
  1. app.py +0 -247
app.py DELETED
@@ -1,247 +0,0 @@
- import gradio as gr
- import os
- import docx
- import fitz # PyMuPDF
- from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, pipeline
- from datasets import Dataset
- import re
- import logging
- from datetime import datetime
- import warnings
-
- # Suppress FutureWarning from huggingface_hub
- warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub.file_download")
-
- # Setup logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # Initialize tokenizer and model with error handling
- model_name = "aubmindlab/bert-base-arabertv2"
- try:
-     logger.info(f"{datetime.now()}: Loading tokenizer for {model_name}")
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-     logger.info(f"{datetime.now()}: Loading model for {model_name}")
-     model = AutoModelForQuestionAnswering.from_pretrained(model_name)
- except Exception as e:
-     logger.error(f"{datetime.now()}: Failed to load model/tokenizer: {e}")
-     raise
-
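- # Note: "aubmindlab/bert-base-arabertv2" is a base pre-trained checkpoint, so the
- # question-answering head is newly initialized here; a transformers warning about
- # freshly initialized weights is expected on load.
-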
- # Directory to save fine-tuned model
- MODEL_SAVE_PATH = "./fine_tuned_model"
-
- # Custom Arabic text preprocessing function
- def preprocess_arabic_text(text):
-     logger.info(f"{datetime.now()}: Preprocessing text (length: {len(text)} characters)")
-     # Remove Arabic diacritics
-     diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
-     text = diacritics.sub('', text)
-     # Normalize Arabic characters
-     text = re.sub(r'[أإآ]', 'ا', text)
-     text = re.sub(r'ى', 'ي', text)
-     text = re.sub(r'ة', 'ه', text)
-     # Remove extra spaces and non-essential characters
-     text = re.sub(r'\s+', ' ', text)
-     text = re.sub(r'[^\w\s]', '', text)
-     logger.info(f"{datetime.now()}: Text preprocessed, new length: {len(text)} characters")
-     return text.strip()
-
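- # Illustrative example of the normalization above:
- #   preprocess_arabic_text("الإِيمَانُ قُوَّةٌ") -> "الايمان قوه"
- # (diacritics stripped, hamza forms and taa marbuta normalized, punctuation removed)
-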
- # Function to extract text from .docx
- def extract_text_docx(file_path):
-     logger.info(f"{datetime.now()}: Extracting text from .docx file: {file_path}")
-     try:
-         doc = docx.Document(file_path)
-         text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-         logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .docx")
-         return text
-     except Exception as e:
-         logger.error(f"{datetime.now()}: Error extracting text from .docx: {e}")
-         return ""
-
- # Function to extract text from .pdf
- def extract_text_pdf(file_path):
-     logger.info(f"{datetime.now()}: Extracting text from .pdf file: {file_path}")
-     try:
-         doc = fitz.open(file_path)
-         text = ""
-         for page in doc:
-             text += page.get_text()
-         logger.info(f"{datetime.now()}: Successfully extracted {len(text)} characters from .pdf")
-         return text
-     except Exception as e:
-         logger.error(f"{datetime.now()}: Error extracting text from .pdf: {e}")
-         return ""
-
- # Function to chunk text for dataset
- def chunk_text(text, max_length=512):
-     logger.info(f"{datetime.now()}: Chunking text into segments")
-     words = text.split()
-     chunks = []
-     current_chunk = []
-     current_length = 0
-     for word in words:
-         current_chunk.append(word)
-         current_length += len(word) + 1
-         if current_length >= max_length:
-             chunks.append(" ".join(current_chunk))
-             current_chunk = []
-             current_length = 0
-     if current_chunk:
-         chunks.append(" ".join(current_chunk))
-     logger.info(f"{datetime.now()}: Created {len(chunks)} text chunks")
-     return chunks
-
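- # Note: current_length counts characters, not tokens, so a nominal 512-character chunk can
- # exceed the model's 512-token limit after subword tokenization; the tokenizer below
- # truncates such chunks.
-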
- # Function to prepare dataset
- def prepare_dataset(text):
-     logger.info(f"{datetime.now()}: Preparing dataset")
-     chunks = chunk_text(text)
-     data = {"text": chunks}
-     dataset = Dataset.from_dict(data)
-     logger.info(f"{datetime.now()}: Dataset prepared with {len(dataset)} examples")
-     return dataset
-
- # Function to tokenize dataset
- def tokenize_dataset(dataset):
-     logger.info(f"{datetime.now()}: Tokenizing dataset")
-     def tokenize_function(examples):
-         return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
-     tokenized_dataset = dataset.map(tokenize_function, batched=True)
-     logger.info(f"{datetime.now()}: Dataset tokenized")
-     return tokenized_dataset
-
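- # Caveat: extractive QA fine-tuning normally requires start_positions/end_positions labels.
- # The tokenized chunks above contain no answer spans, so the model produces no loss and
- # Trainer.train() below is likely to fail; a labelled (question, context, answer) dataset
- # or a different training objective would be needed.
-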
- # Function to fine-tune model
- def fine_tune_model(dataset):
-     logger.info(f"{datetime.now()}: Starting model fine-tuning")
-     training_args = TrainingArguments(
-         output_dir="./results",
-         num_train_epochs=1,
-         per_device_train_batch_size=4,
-         save_steps=10_000,
-         save_total_limit=2,
-         logging_dir='./logs',
-         logging_steps=200,
-     )
-
-     trainer = Trainer(
-         model=model,
-         args=training_args,
-         train_dataset=dataset,
-     )
-
-     trainer.train()
-     model.save_pretrained(MODEL_SAVE_PATH)
-     tokenizer.save_pretrained(MODEL_SAVE_PATH)
-     logger.info(f"{datetime.now()}: Model fine-tuned and saved to {MODEL_SAVE_PATH}")
-
- # Function to handle file upload and training
- def upload_and_train(files, progress=gr.Progress()):
-     uploaded_files = []
-     all_text = ""
-     training_log = []
-
-     def log_and_update(step, desc, progress_value):
-         msg = f"{datetime.now()}: {desc}"
-         logger.info(msg)
-         training_log.append(msg)
-         progress(progress_value, desc=desc)
-         return "\n".join(training_log)
-
-     log_and_update("Starting upload", "Loading books...", 0.1)
-     for file in files:
-         file_name = os.path.basename(file.name)
-         uploaded_files.append(file_name)
-         if file_name.endswith(".docx"):
-             text = extract_text_docx(file.name)
-         elif file_name.endswith(".pdf"):
-             text = extract_text_pdf(file.name)
-         else:
-             continue
-         all_text += text + "\n"
-
-     if not all_text.strip():
-         msg = f"{datetime.now()}: No valid text extracted from uploaded files."
-         logger.error(msg)
-         training_log.append(msg)
-         return "\n".join(training_log), uploaded_files
-
-     log_and_update("Text extraction complete", "Extracting ideas...", 0.4)
-     cleaned_text = preprocess_arabic_text(all_text)
-
-     log_and_update("Preprocessing complete", "Preparing dataset...", 0.6)
-     dataset = prepare_dataset(cleaned_text)
-     tokenized_dataset = tokenize_dataset(dataset)
-
-     log_and_update("Dataset preparation complete", "Training in progress...", 0.8)
-     fine_tune_model(tokenized_dataset)
-
-     log_and_update("Training complete", "Training completed!", 1.0)
-
-     # Example QA
-     qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
-     example_question = "ما هو قانون الإيمان وفقًا للكتاب؟"
-     example_answer = qa_pipeline(question=example_question, context=cleaned_text[:512])["answer"]
-
-     final_message = (
-         f"Training process finished: Enter your question\n\n"
-         f"**مثال لسؤال**: {example_question}\n"
-         f"**الإجابة**: {example_answer}\n\n"
-         f"**سجل التدريب**:\n" + "\n".join(training_log)
-     )
-     return final_message, uploaded_files
-
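- # Note: pipeline(...) reloads the fine-tuned model from disk on every call, here and in
- # answer_question below; caching the pipeline in a module-level variable would avoid
- # repeated loads.
-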
- # Function to answer questions
- def answer_question(question, context):
-     if not os.path.exists(MODEL_SAVE_PATH):
-         return "النظام لم يتم تدريبه بعد. الرجاء رفع الكتب وتدريب النظام أولاً."
-
-     qa_pipeline = pipeline("question-answering", model=MODEL_SAVE_PATH, tokenizer=MODEL_SAVE_PATH)
-     answer = qa_pipeline(question=question, context=context[:512])["answer"]
-     return answer
-
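- # Note: in the interface below, both ask buttons pass gr.State(value="") as the context,
- # so answer_question always receives an empty string; persisting the cleaned book text at
- # training time (e.g. to a file) and loading it here would give the model real context to
- # answer from.
-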
- # Gradio Interface with Tabs
- with gr.Blocks(title="Arabic Book Analysis AI") as demo:
-     gr.Markdown("# نظام ذكاء اصطناعي لتحليل الكتب باللغة العربية")
-
-     with gr.Tabs():
-         with gr.TabItem("التدريب والسؤال"):
-             with gr.Row():
-                 with gr.Column():
-                     file_upload = gr.File(file_types=[".docx", ".pdf"], label="رفع الكتب", file_count="multiple")
-                     upload_button = gr.Button("رفع وتدريب")
-                     uploaded_files = gr.Textbox(label="الكتب المرفوعة")
-
-                 with gr.Column():
-                     training_status = gr.Textbox(label="حالة التدريب", lines=10)
-
-             with gr.Row():
-                 question_input = gr.Textbox(label="أدخل سؤالك بالعربية", placeholder="مثال: ما هو قانون الإيمان؟")
-                 answer_output = gr.Textbox(label="الإجابة")
-                 ask_button = gr.Button("طرح السؤال")
-
-             # Event handlers
-             upload_button.click(
-                 fn=upload_and_train,
-                 inputs=[file_upload],
-                 outputs=[training_status, uploaded_files]
-             )
-
-             ask_button.click(
-                 fn=answer_question,
-                 inputs=[question_input, gr.State(value="")],
-                 outputs=[answer_output]
-             )
-
-         with gr.TabItem("طرح الأسئلة فقط"):
-             gr.Markdown("أدخل سؤالك بالعربية وسيتم الإجابة بناءً على محتوى الكتب المدربة.")
-             question_input_qa = gr.Textbox(label="أدخل سؤالك", placeholder="مثال: ما هو قانون الإيمان؟")
-             answer_output_qa = gr.Textbox(label="الإجابة")
-             ask_button_qa = gr.Button("طرح السؤال")
-
-             ask_button_qa.click(
-                 fn=answer_question,
-                 inputs=[question_input_qa, gr.State(value="")],
-                 outputs=[answer_output_qa]
-             )
-
- if __name__ == "__main__":
-     demo.launch()