import os

import gradio as gr
from openai import OpenAI
import weaviate
from weaviate.classes.init import Auth
import pypdf
import docx
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

# Only Prompt_template_User_document_prompt is used in this file; the other
# templates are imported here but not referenced below.
from prompt_template import (
    Prompt_template_translation,
    Prompt_template_LLM_Generation,
    Prompt_template_Reranker,
    Prompt_template_Wisal,
    Prompt_template_Halluciations,
    Prompt_template_paraphrasing,
    Prompt_template_Translate_to_original,
    Prompt_template_relevance,
    Prompt_template_User_document_prompt,
)

# Load credentials from a local .env file. The environment variable names are
# assumed to match the Python names used below.
load_dotenv()
DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

# DeepInfra exposes an OpenAI-compatible endpoint, so the OpenAI SDK works unchanged.
openai = OpenAI(
    api_key=DEEPINFRA_API_KEY,
    base_url="https://api.deepinfra.com/v1/openai",
)

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)
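
# The ingest and query code below assumes a "Book" collection with a "text"
# property already exists. This guard is a minimal sketch (not part of the
# original flow) that creates it on first run; vectors are supplied
# client-side, so no server-side vectorizer is configured.
from weaviate.classes.config import DataType, Property

if not client.collections.exists("Book"):
    client.collections.create(
        name="Book",
        properties=[Property(name="text", data_type=DataType.TEXT)],
    )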


def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF, DOCX, or TXT file."""
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        text = ""
        with open(file_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            for page in reader.pages:
                # extract_text() can return None for image-only pages
                page_text = page.extract_text() or ""
                text += page_text + "\n"
    elif ext == ".docx":
        doc = docx.Document(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
    elif ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
    return text
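

# Character-based chunks with a 200-character overlap, so sentences that
# straddle a chunk boundary are not lost to retrieval.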
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " "],
)
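

# The embedding endpoint rejects overly large requests, so inputs are embedded
# in client-side batches (batch_size=70 is this script's choice; the actual
# limit depends on the provider).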
def embed_texts(texts: list[str], batch_size: int = 70) -> list[list[float]]:
    """Embed texts in batches to avoid API limits."""
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        resp = openai.embeddings.create(
            model="Qwen/Qwen3-Embedding-8B",
            input=batch,
            encoding_format="float",
        )
        all_embeddings.extend(item.embedding for item in resp.data)
    return all_embeddings


def ingest_file(file_path: str) -> str:
    """Chunk a document, embed the chunks, and store them in Weaviate."""
    raw = extract_text(file_path)
    texts = splitter.split_text(raw)  # already a list of plain strings
    vectors = embed_texts(texts)

    # Each object carries its pre-computed embedding; the dynamic batch
    # sizes its requests automatically.
    with client.batch.dynamic() as batch:
        for txt, vec in zip(texts, vectors):
            batch.add_object(
                collection="Book",
                properties={"text": txt},
                vector=vec,
            )
    return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"


def answer_question(question: str) -> str:
    """Retrieve the chunks closest to the question and answer from them."""
    q_vec = embed_texts([question])[0]
    documents = client.collections.get("Book")
    response = documents.query.near_vector(
        near_vector=q_vec,
        limit=5,
        return_metadata=["distance"],
    )
    hits = response.objects
    context = "\n\n".join(hit.properties["text"] for hit in hits)
    print(context)  # debug output: inspect the retrieved context

    user_document_prompt = Prompt_template_User_document_prompt.format(
        new_query=question, document=context
    )
    chat = openai.chat.completions.create(
        model="Qwen/Qwen3-32B",
        messages=[{"role": "user", "content": user_document_prompt}],
        temperature=0,
        # DeepInfra-specific option: disables Qwen3's "thinking" output
        reasoning_effort="none",
    )
    return chat.choices[0].message.content


with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
    gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")
    with gr.Row():
        up = gr.File(label="Select document")
        btn = gr.Button("Ingest")
    out = gr.Textbox(label="Status", interactive=False)
    # Guard against clicking "Ingest" before a file is chosen.
    btn.click(
        fn=lambda f: ingest_file(f.name) if f else "Please select a file first.",
        inputs=up,
        outputs=out,
    )
    with gr.Row():
        q = gr.Textbox(placeholder="Your question...", lines=2)
        ask = gr.Button("Ask")
    ans = gr.Textbox(label="Answer", lines=6, interactive=False)
    ask.click(fn=answer_question, inputs=q, outputs=ans)

if __name__ == "__main__":
    demo.launch(debug=True)
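    # Cleanup sketch: launch() blocks until the server exits, so closing the
    # Weaviate client here releases its connections on the way out.
    client.close()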