Spaces:
Sleeping
Sleeping
File size: 4,221 Bytes
8876843 d28c712 8876843 d28c712 66289a9 8876843 d28c712 8876843 66289a9 8876843 66289a9 8876843 66289a9 8876843 66289a9 8876843 66289a9 8876843 66289a9 d28c712 66289a9 8876843 66289a9 d28c712 66289a9 d28c712 66289a9 8876843 66289a9 8876843 66289a9 8876843 66289a9 8876843 66289a9 8876843 66289a9 8876843 66289a9 8876843 66289a9 8876843 66289a9 8876843 66289a9 8876843 66289a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
# Creating your fully corrected Hugging Face Space project
# app.py
import os
import tempfile
import gradio as gr
import faiss
import numpy as np
from transformers import AutoModel, AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pdfminer.high_level import extract_text
from docx import Document
# Sentence-embedding model used for both the book chunks and the questions.
# The checkpoint id names a multilingual model; the UI below is written in Arabic.
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Search state shared by the handlers: process_files() assigns both names and
# answer_question() reads them. `index` stays None until a build has finished.
texts = []
index = None
def extract_text_from_pdf(pdf_path):
    """Return the text that pdfminer's ``extract_text`` reads from ``pdf_path``."""
    pdf_text = extract_text(pdf_path)
    return pdf_text
def extract_text_from_docx(docx_path):
    """Return the paragraph texts of the document at ``docx_path``, newline-joined.

    The file is opened with ``docx.Document``; only ``Document.paragraphs`` is
    read, in order, and the ``.text`` of each paragraph is joined with "\\n".
    """
    document = Document(docx_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
def _resolve_upload_path(upload, scratch_dir):
    """Return a readable filesystem path for one uploaded item.

    Accepts either a path (``str`` / ``os.PathLike``) or an object exposing
    ``.name``. When ``.name`` already points at a file on disk it is used as
    is; otherwise the object's ``.file`` stream is copied into ``scratch_dir``.
    """
    if isinstance(upload, (str, os.PathLike)):
        return os.fspath(upload)

    source_name = upload.name
    if os.path.isfile(source_name):
        # Never re-open this path for writing: that would truncate the upload
        # before it has been read.
        return source_name

    # Only the base name is used so the copy always lands inside scratch_dir.
    target_path = os.path.join(scratch_dir, os.path.basename(source_name))
    with open(target_path, "wb") as out:
        out.write(upload.file.read())
    return target_path


def process_files(files, progress=gr.Progress()):
    """Build the search index from the uploaded books.

    Steps: extract text from every ``.pdf`` / ``.docx`` / ``.doc`` upload
    (other extensions are skipped, matching is case-insensitive), split the
    text into 500-character chunks with 50 characters of overlap, embed the
    chunks with ``embedding_model`` and store them in a ``faiss.IndexFlatL2``.

    The module globals ``index`` and ``texts`` are replaced together, and only
    after the new index has been fully built, so a failed or empty run leaves
    the previous search state untouched.

    Args:
        files: Uploaded items from the ``gr.File`` component; may be ``None``
            or empty. Each item is a path or an object with ``.name``
            (and optionally ``.file``).
        progress: Gradio progress tracker, called with a fraction and a label.

    Returns:
        A status message (Arabic) for the output textbox.
    """
    global index, texts

    no_text_message = (
        "لم يتم العثور على نص قابل للقراءة في الملفات المرفوعة. "
        "يرجى رفع ملفات PDF أو DOCX صالحة."
    )
    if not files:
        return no_text_message

    # Step 1: Extract text
    progress(0.1, desc="\u062c\u0627\u0631\u0650 \u0627\u0633\u062a\u062e\u0631\u0627\u062c \u0627\u0644\u0646\u0635\u0648\u0635 \u0645\u0646 \u0627\u0644\u0643\u062a\u0628...")
    documents = []
    # The scratch directory is removed automatically once extraction is done.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for upload in files:
            file_path = _resolve_upload_path(upload, scratch_dir)
            suffix = os.path.splitext(file_path)[1].lower()
            if suffix == ".pdf":
                text = extract_text_from_pdf(file_path)
            elif suffix in (".docx", ".doc"):
                # NOTE(review): python-docx documents support for .docx only;
                # a legacy binary .doc will presumably raise here - confirm.
                text = extract_text_from_docx(file_path)
            else:
                continue
            documents.append(text)

    # Step 2: Chunk the text
    progress(0.4, desc="\u062a\u0642\u0637\u064a\u0639 \u0627\u0644\u0646\u0635\u0648\u0635 \u0625\u0644\u0649 \u0641\u0642\u0631\u0627\u062a...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = []
    for text in documents:
        chunks.extend(splitter.split_text(text))

    # Without chunks there is no embedding matrix to size the index from.
    if not chunks:
        return no_text_message

    # Step 3: Embed the text
    progress(0.7, desc="\u062a\u062d\u0648\u064a\u0644 \u0627\u0644\u0641\u0642\u0631\u0627\u062a \u0625\u0644\u0649 \u0645\u062a\u062c\u0647\u0627\u062a...")
    embeddings = embedding_model.encode(chunks, show_progress_bar=True)

    # Step 4: Build FAISS index
    progress(0.9, desc="\u0628\u0646\u0627\u0621 \u0642\u0627\u0639\u062f\u0629 \u0628\u064a\u0627\u0646\u0627\u062a \u0627\u0644\u0628\u062d\u062b...")
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)

    # Publish both pieces of state together so row i of the index always
    # corresponds to texts[i].
    index, texts = new_index, chunks
    return "\u2705 \u0627\u0644\u0646\u0638\u0627\u0645 \u062c\u0627\u0647\u0632 \u0644\u0644\u0625\u062c\u0627\u0628\u0629 \u0639\u0644\u0649 \u0623\u0633\u0626\u0644\u062a\u0643"
def answer_question(question):
    """Return the indexed text chunk closest to ``question``.

    The question is embedded with ``embedding_model`` and the single nearest
    neighbour is looked up in the module-level FAISS ``index``; the matching
    entry of ``texts`` is returned verbatim.

    Args:
        question: The user's question as typed in the UI.

    Returns:
        The best-matching chunk, a prompt to build the index first when
        ``index`` is ``None``, or a "no answer found" message when the
        question is blank or the search yields no usable hit.
    """
    not_found = "\u0644\u0645 \u064a\u062a\u0645 \u0627\u0644\u0639\u062b\u0648\u0631 \u0639\u0644\u0649 \u0625\u062c\u0627\u0628\u0629."

    if index is None:
        return "\u064a\u0631\u062c\u0649 \u062a\u062d\u0645\u064a\u0644 \u0643\u062a\u0628 \u0648\u0627\u0644\u0646\u0642\u0631 \u0639\u0644\u0649 \"\u0627\u0628\u062f\u0623 \u0627\u0644\u062a\u062f\u0631\u064a\u0628\" \u0623\u0648\u0644\u0627"

    # A blank question has nothing to match against.
    if not question or not question.strip():
        return not_found

    embedded_question = embedding_model.encode([question]).astype(np.float32)
    _, neighbours = index.search(embedded_question, k=1)

    # FAISS reports a missing neighbour as -1 rather than returning a shorter
    # row, and a negative value would silently index from the end of `texts`.
    best = int(neighbours[0][0]) if len(neighbours[0]) else -1
    if not 0 <= best < len(texts):
        return not_found
    return texts[best]
# Gradio UI: an upload section that calls process_files() and a question box
# that calls answer_question().
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # "\U0001F4DA" is the books emoji. A "\ud83d\udcda" pair would be two lone
    # surrogates in a Python string, which cannot be encoded as UTF-8.
    gr.Markdown("# \U0001F4DA محاكاة دماغ المؤلف بناءً على الكتب المرفوعة")

    # NOTE(review): which widgets sit inside this row is reconstructed - confirm
    # against the intended layout.
    with gr.Row():
        files = gr.File(label="ارفع ملفات الكتب", file_types=[".pdf", ".docx", ".doc"], file_count="multiple")
        upload_button = gr.Button("ابدأ التدريب على الكتب")

    output_text = gr.Textbox(label="مخرجات التدريب", interactive=False)
    upload_button.click(fn=process_files, inputs=[files], outputs=[output_text])

    gr.Markdown("## اطرح سؤالك بعد إكمال التدريب:")
    question = gr.Textbox(label="سؤالك بالعربية")
    answer = gr.Textbox(label="الإجابة", interactive=False)
    ask_button = gr.Button("أجب عن سؤالي")
    ask_button.click(fn=answer_question, inputs=[question], outputs=[answer])

demo.launch(share=True)
|