# Wisal_QA / User_Specific_Documents.py
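"""
Gradio app for user-specific document Q&A: upload a PDF, DOCX, or TXT, split it
into chunks, embed the chunks with a Qwen embedding model via DeepInfra's
OpenAI-compatible API, index them in a Weaviate Cloud "Book" collection, and
answer questions against the retrieved chunks with Qwen3-32B.
"""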
import os
import gradio as gr
from openai import OpenAI
import weaviate
from weaviate.classes.init import Auth
import pypdf # Replaced PyPDF2
import docx
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from prompt_template import (
    Prompt_template_translation,
    Prompt_template_LLM_Generation,
    Prompt_template_Reranker,
    Prompt_template_Wisal,
    Prompt_template_Halluciations,
    Prompt_template_paraphrasing,
    Prompt_template_Translate_to_original,
    Prompt_template_relevance,
    Prompt_template_User_document_prompt,
)
# ─── Configuration ─────────────────────────────────────────────────────────────
load_dotenv()

# Assumption: the .env keys match the variable names used below.
DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

# DeepInfra exposes an OpenAI-compatible endpoint, so the OpenAI SDK is reused here.
openai = OpenAI(
    api_key=DEEPINFRA_API_KEY,
    base_url="https://api.deepinfra.com/v1/openai",
)

# Initialize Weaviate client (Weaviate Cloud, v4 client)
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)
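# A minimal .env sketch for the variables read above; the key names are an
# assumption chosen to mirror the Python identifiers, so adjust them to match
# your deployment:
#
#   DEEPINFRA_API_KEY=...
#   WEAVIATE_URL=...
#   WEAVIATE_API_KEY=...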
# ─── Utility: Extract raw text ──────────────────────────────────────────────────
def extract_text(file_path: str) -> str:
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        text = ""
        with open(file_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text() or ""
                text += page_text + "\n"
    elif ext == ".docx":
        doc = docx.Document(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
    elif ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
    return text
# ─── Chunker & Embed ──────────────────────────────────────────────────────────
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " "],
)
def embed_texts(texts: list[str], batch_size: int = 70) -> list[list[float]]:
    """Embed texts in batches to avoid API limits."""
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        resp = openai.embeddings.create(
            model="Qwen/Qwen3-Embedding-8B",
            input=batch,
            encoding_format="float",
        )
        all_embeddings.extend([item.embedding for item in resp.data])
    return all_embeddings
# ─── Ingest & Index ───────────────────────────────────────────────────────────
def ingest_file(file_path: str) -> str:
    raw = extract_text(file_path)
    texts = splitter.split_text(raw)
    vectors = embed_texts(texts)
    # Batch insert with the v4 client API; the "Book" collection is assumed to
    # already exist with a "text" property.
    with client.batch.dynamic() as batch:
        for txt, vec in zip(texts, vectors):
            batch.add_object(
                collection="Book",
                properties={"text": txt},
                vector=vec,
            )
    return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"
# ─── Query & Answer ───────────────────────────────────────────────────────────
def answer_question(question: str) -> str:
    q_vec = embed_texts([question])[0]
    documents = client.collections.get("Book")
    response = documents.query.near_vector(
        near_vector=q_vec,
        limit=5,
        return_metadata=["distance"],
    )
    hits = response.objects
    context = "\n\n".join(hit.properties["text"] for hit in hits)
    print(context)  # debug: show the retrieved context
    UserSpecificDocument_prompt = Prompt_template_User_document_prompt.format(
        new_query=question, document=context
    )
    chat = openai.chat.completions.create(
        model="Qwen/Qwen3-32B",
        messages=[{"role": "user", "content": UserSpecificDocument_prompt}],
        temperature=0,
        reasoning_effort="none",
    )
    return chat.choices[0].message.content
# ─── Gradio Interface ─────────────────────────────────────────────────────────
with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
    gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")
    with gr.Row():
        up = gr.File(label="Select document")
        btn = gr.Button("Ingest")
    out = gr.Textbox(label="Status", interactive=False)
    btn.click(fn=lambda f: ingest_file(f.name), inputs=up, outputs=out)

    with gr.Row():
        q = gr.Textbox(placeholder="Your question...", lines=2)
        ask = gr.Button("Ask")
    ans = gr.Textbox(label="Answer", lines=6, interactive=False)
    ask.click(fn=answer_question, inputs=q, outputs=ans)

if __name__ == "__main__":
    demo.launch(debug=True)