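"""Arabic book question-answering demo (Hugging Face Space).

Pipeline: upload a .pdf/.docx book -> extract its text -> pack sentences
into ~500-character chunks -> embed the chunks with CAMeLBERT into a FAISS
index -> answer questions through a LangChain RetrievalQA chain backed by
an mT5 text2text pipeline.
"""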
import re

import docx
import gradio as gr
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from pdfminer.high_level import extract_text as extract_pdf_text
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Global state, rebuilt whenever a new book is uploaded.
uploaded_texts = []
vector_store = None
qa_chain = None

# Arabic embeddings via CAMeLBERT (mean-pooled by HuggingFaceEmbeddings).
embedding_model_name = "CAMeL-Lab/bert-base-arabic-camelbert-mix"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Generator LLM. "google/mt5-small" is the standard small mT5 checkpoint,
# assumed here; a model fine-tuned for Arabic QA would answer far better
# than the raw pretrained one.
model_name = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)
llm = HuggingFacePipeline(pipeline=pipe)

# Instruction prompt for the QA chain. Translation: "You are an intelligent
# system that answers based only on the information extracted from the books.
# Do not use any external information. Context / Question / Answer."
ARABIC_PROMPT_TEMPLATE = """
أنت نظام ذكي يجيب بناءً فقط على المعلومات المستخرجة من الكتب.
لا تستخدم أي معلومات خارجية.
السياق: {context}
السؤال: {question}
الإجابة:
"""
# The {context} slot lets RetrievalQA inject the retrieved chunks.
ARABIC_PROMPT = PromptTemplate(template=ARABIC_PROMPT_TEMPLATE, input_variables=["context", "question"])

def extract_text_from_file(file_path):
    if file_path.endswith(".pdf"):
        return extract_pdf_text(file_path)
    elif file_path.endswith(".docx"):
        # python-docx reads .docx only; the legacy binary .doc format is not supported.
        doc = docx.Document(file_path)
        return "\n".join(para.text for para in doc.paragraphs)
    else:
        raise ValueError("Unsupported file format (expected .pdf or .docx)")

def arabic_split_text(text):
    # NLTK's punkt tokenizer ships no Arabic model, so sentences are split
    # on Arabic and Latin sentence-ending punctuation with a regex instead.
    sentences = re.split(r"(?<=[.!?؟؛])\s+", text)
    chunks = []
    chunk = ""
    for sentence in sentences:
        # Pack consecutive sentences into chunks of at most ~500 characters.
        if len(chunk) + len(sentence) <= 500:
            chunk += " " + sentence
        else:
            chunks.append(chunk.strip())
            chunk = sentence
    if chunk:
        chunks.append(chunk.strip())
    return chunks
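
# Example: arabic_split_text("جملة أولى. جملة ثانية؟ جملة ثالثة!") yields a
# single chunk holding all three sentences, since together they fall well
# under the 500-character budget.
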
def train_from_texts(texts):
    global vector_store, qa_chain
    all_chunks = []
    for text in texts:
        all_chunks.extend(arabic_split_text(text))
    # FAISS.from_texts embeds every chunk and builds the index in one step.
    vector_store = FAISS.from_texts(all_chunks, embeddings)
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 10})
    # The "stuff" chain concatenates the retrieved chunks into the {context}
    # slot of the Arabic prompt.
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": ARABIC_PROMPT},
    )

def upload_book(file, progress=gr.Progress()):
    # gr.File passes a filepath string in recent Gradio versions and a
    # tempfile wrapper (with a .name attribute) in older ones; either way
    # the temp path keeps the original extension, so it can be read directly.
    file_path = file if isinstance(file, str) else file.name
    progress(0.2, desc="تحميل الملف...")  # "Loading the file..."
    extracted_text = extract_text_from_file(file_path)
    uploaded_texts.append(extracted_text)
    progress(0.5, desc="معالجة النص...")  # "Processing the text..."
    train_from_texts(uploaded_texts)
    progress(1.0, desc="اكتمل التدريب!")  # "Training complete!"
    return "النظام جاهز للإجابة على أسئلتك"  # "The system is ready to answer your questions"

def answer_question(user_question):
    if qa_chain is None:
        return "الرجاء رفع كتاب أولاً."  # "Please upload a book first."
    # The raw question goes straight to the chain: the Arabic prompt is
    # applied inside RetrievalQA, and embedding the bare question (rather
    # than a full instruction prompt) keeps the similarity search clean.
    return qa_chain.run(user_question)

with gr.Blocks() as demo:
    with gr.Tab("تحميل الكتب"):  # "Upload books"
        upload_button = gr.File(label="ارفع كتابك (.pdf / .docx)", file_types=[".pdf", ".docx"])
        upload_output = gr.Textbox(label="حالة النظام")  # "System status"
        upload_button.upload(upload_book, inputs=upload_button, outputs=upload_output)
    with gr.Tab("اسأل الكتاب"):  # "Ask the book"
        question = gr.Textbox(label="اكتب سؤالك بالعربية")  # "Write your question in Arabic"
        answer = gr.Textbox(label="الإجابة")  # "The answer"
        ask_button = gr.Button("إرسال السؤال")  # "Send the question"
        ask_button.click(answer_question, inputs=question, outputs=answer)

demo.launch(share=True)  # share=True is ignored when the app runs on Spaces
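
# A plausible requirements.txt for this Space (exact versions unknown):
#   gradio
#   langchain
#   transformers
#   torch
#   sentencepiece
#   faiss-cpu
#   pdfminer.six
#   python-docx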