Spaces:

ramysaidagieb
/

Answer1

Sleeping

App Files Files Community

ramysaidagieb committed on Apr 27

Commit

66289a9

verified ·

1 Parent(s): d28c712

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -113

app.py CHANGED Viewed

@@ -1,122 +1,105 @@
 import gradio as gr
 import os
 import tempfile
 import faiss
-import torch
-from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.vectorstores import FAISS
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.prompts import PromptTemplate
-from langchain.chains import RetrievalQA
-from langchain.llms import HuggingFacePipeline
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-from pdfminer.high_level import extract_text as extract_pdf_text
-import docx
-import nltk
-nltk.download('punkt')
-from nltk.tokenize import sent_tokenize
-uploaded_texts = []
-vector_store = None
-qa_chain = None
-embedding_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
-embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
-model_name = "csebuetnlp/mT5_small"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
-llm = HuggingFacePipeline(pipeline=pipe)
-ARABIC_PROMPT_TEMPLATE = """
-أنت نظام ذكي يجيب بناءً فقط على المعلومات المستخرجة من الكتب.
-لا تستخدم أي معلومات خارجية.
-السؤال: {question}
-الإجابة:
-"""
-def format_arabic_prompt(question):
-    return ARABIC_PROMPT_TEMPLATE.format(question=question)
-def extract_text_from_file(file_path):
-    if file_path.endswith(".pdf"):
-        return extract_pdf_text(file_path)
-    elif file_path.endswith(".docx") or file_path.endswith(".doc"):
-        doc = docx.Document(file_path)
-        return "\n".join([para.text for para in doc.paragraphs])
-    else:
-        raise ValueError("Unsupported file format")
-def arabic_split_text(text):
-    sentences = sent_tokenize(text, language='arabic')
-    chunks = []
-    chunk = ""
-    for sentence in sentences:
-        if len(chunk) + len(sentence) <= 500:
-            chunk += " " + sentence
         else:
-            chunks.append(chunk.strip())
-            chunk = sentence
-    if chunk:
-        chunks.append(chunk.strip())
-    return chunks
-def train_from_texts(texts):
-    global vector_store, qa_chain
-    splitter = RecursiveCharacterTextSplitter(
-        chunk_size=500,
-        chunk_overlap=100,
-        length_function=len,
-    )
-    all_chunks = []
     for text in texts:
-        chunks = arabic_split_text(text)
-        all_chunks.extend(chunks)
-    vectors = embeddings.embed_documents(all_chunks)
-    dimension = len(vectors[0])
-    index = faiss.IndexFlatL2(dimension)
-    vector_store = FAISS(embedding_function=embeddings, index=index, documents=all_chunks)
-    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})
-    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
-def upload_book(file, progress=gr.Progress()):
-    with tempfile.NamedTemporaryFile(delete=False) as tmp:
-        tmp.write(file.read())
-        tmp_path = tmp.name
-    progress(0.2, desc="تحميل الملف...")
-    extracted_text = extract_text_from_file(tmp_path)
-    uploaded_texts.append(extracted_text)
-    progress(0.5, desc="معالجة النص...")
-    train_from_texts(uploaded_texts)
-    progress(1.0, desc="اكتمل التدريب!")
-    return "النظام جاهز للإجابة على أسئلتك"
-def answer_question(user_question):
-    if qa_chain is None:
-        return "الرجاء رفع كتاب أولاً."
-    prompt = format_arabic_prompt(user_question)
-    result = qa_chain.run(prompt)
-    return result
 with gr.Blocks() as demo:
-    with gr.Tab("تحميل الكتب"):
-        upload_button = gr.File(label="ارفع كتابك (.pdf .docx .doc)", file_types=[".pdf", ".docx", ".doc"])
-        upload_output = gr.Textbox(label="حالة النظام")
-        upload_button.upload(upload_book, inputs=upload_button, outputs=upload_output)
-    with gr.Tab("اسأل الكتاب"):
-        question = gr.Textbox(label="اكتب سؤالك بالعربية")
-        answer = gr.Textbox(label="الإجابة")
-        ask_button = gr.Button("إرسال السؤال")
-        ask_button.click(answer_question, inputs=question, outputs=answer)
-demo.launch(share=True)

 import gradio as gr
 import os
 import tempfile
+import pdfminer.high_level
+import docx2txt
 import faiss
+import numpy as np
+from tqdm import tqdm
+from sentence_transformers import SentenceTransformer
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+# Search state shared by process_files() and answer_question():
+#   texts - the text chunks, in the same order as the vectors stored in `index`
+#   index - FAISS index over the chunk embeddings; None until books are processed
+texts = []
+index = None
+# Multilingual sentence-embedding model, loaded once when the module is imported
+# and reused for both the book chunks and the user's questions.
+embedding_model = SentenceTransformer(
+    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
+)
+def extract_text_from_pdf(pdf_path):
+    """Return the text pdfminer extracts from the PDF file at ``pdf_path``."""
+    pdf_text = pdfminer.high_level.extract_text(pdf_path)
+    return pdf_text
+def extract_text_from_docx(docx_path):
+    """Return the text docx2txt extracts from the document at ``docx_path``."""
+    document_text = docx2txt.process(docx_path)
+    return document_text
+# Function to process uploaded files
+def process_files(files, progress=gr.Progress()):
+    """Extract, chunk, embed and index the uploaded books.
+
+    Args:
+        files: The value of the multi-file upload component. Each item is
+            either a path string or an object whose ``name`` attribute is the
+            path of the uploaded file on disk. May be ``None`` or empty.
+        progress: Gradio progress tracker used to report the four stages.
+
+    Returns:
+        A status message for the UI. On success the module-level ``index``
+        and ``texts`` are replaced with the newly built index and its chunks;
+        on any early return they are left untouched.
+    """
+    global index, texts
+    if not files:
+        return "❗ من فضلك قم بتحميل الكتب أولاً."
+    # Step 1: Extract text
+    progress(0.1, desc="جارٍ استخراج النصوص من الكتب...")
+    documents = []
+    for file in files:
+        # Read the upload straight from its path on disk. Copying it through
+        # os.path.join(temp_dir, file.name) is unsafe: when file.name is an
+        # absolute path the join returns that same path, and opening it with
+        # "wb" truncates the upload before it can be read.
+        file_path = file if isinstance(file, str) else file.name
+        extension = os.path.splitext(file_path)[1].lower()
+        if extension == ".pdf":
+            text = extract_text_from_pdf(file_path)
+        elif extension in (".docx", ".doc"):
+            # NOTE(review): docx2txt targets .docx; legacy .doc files may fail
+            # here - confirm or drop ".doc" from the accepted file types.
+            text = extract_text_from_docx(file_path)
         else:
+            continue
+        # Skip files that yielded no usable text (e.g. scanned PDFs).
+        if text and text.strip():
+            documents.append(text)
+    # Step 2: Chunk the text
+    progress(0.4, desc="تقطيع النصوص إلى فقرات...")
+    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    chunks = []
+    for document in documents:
+        chunks.extend(splitter.split_text(document))
+    if not chunks:
+        # Nothing to embed: without this guard embeddings.shape[1] would fail.
+        return "❗ لم يتم العثور على نص قابل للاستخراج في الملفات المرفوعة."
+    # Step 3: Embed the text
+    progress(0.7, desc="تحويل الفقرات إلى متجهات...")
+    embeddings = np.asarray(
+        embedding_model.encode(chunks, show_progress_bar=True),
+        dtype="float32",  # FAISS indexes operate on float32 vectors
+    )
+    # Step 4: Build FAISS index
+    progress(0.9, desc="بناء قاعدة بيانات البحث...")
+    new_index = faiss.IndexFlatL2(embeddings.shape[1])
+    new_index.add(embeddings)
+    # Publish both globals together, only after everything succeeded, so a
+    # failure above never leaves `index` and `texts` out of sync.
+    index = new_index
+    texts = chunks
+    return "✅ النظام جاهز للإجابة على أسئلتك"
+# Function to answer Arabic questions
+def answer_question(question):
+    """Return the indexed chunks most similar to ``question``.
+
+    Args:
+        question: The user's question as typed in the UI.
+
+    Returns:
+        Up to five retrieved chunks joined by newlines, a prompt to upload
+        books when nothing has been indexed yet, or an empty string when the
+        question is blank.
+    """
+    global index, texts
+    if index is None or not texts:
+        return "❗ من فضلك قم بتحميل الكتب أولاً."
+    if not question or not question.strip():
+        # A blank query would only retrieve arbitrary chunks.
+        return ""
+    # Embed the question
+    question_embedding = np.asarray(
+        embedding_model.encode([question]), dtype="float32"
+    )
+    # Search in FAISS; never ask for more neighbours than there are chunks.
+    top_k = min(5, len(texts))
+    _distances, indices = index.search(question_embedding, k=top_k)
+    # FAISS pads missing results with label -1; texts[-1] would silently
+    # return the last chunk, so keep only labels that are real positions.
+    retrieved_chunks = [texts[i] for i in indices[0] if 0 <= i < len(texts)]
+    # Simple answer: concatenate most relevant chunks
+    return "\n".join(retrieved_chunks)
+# Gradio UI: one tab to upload and index books, one tab to query them.
 with gr.Blocks() as demo:
+    gr.Markdown("# 📚 محرك محاكاة دماغ المؤلف - Arabic Book Brain AI")
+    # Upload tab: file picker, a button that starts indexing, and a status box.
+    with gr.Tab("رفع الكتب"):
+        upload = gr.File(
+            file_types=[".pdf", ".docx", ".doc"],
+            file_count="multiple",
+        )
+        train_button = gr.Button("ابدأ التدريب على الكتب")
+        training_output = gr.Textbox(label="حالة التدريب")
+    # Question tab: question box, answer box, and a submit button.
+    with gr.Tab("اسأل الكتب"):
+        question_input = gr.Textbox(label="اكتب سؤالك هنا باللغة العربية")
+        answer_output = gr.Textbox(label="الإجابة")
+        ask_button = gr.Button("أرسل السؤال")
+    # Event wiring: each button passes its input component to its handler and
+    # shows the returned string in the matching output box.
+    train_button.click(
+        fn=process_files,
+        inputs=[upload],
+        outputs=[training_output],
+    )
+    ask_button.click(
+        fn=answer_question,
+        inputs=[question_input],
+        outputs=[answer_output],
+    )
+# Start the app; share=True requests a public share link from Gradio.
+demo.launch(share=True)