import os

import gradio as gr
from openai import OpenAI
import weaviate
from weaviate.classes.init import Auth
import pypdf
import docx
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

# Only Prompt_template_User_document_prompt is used in this file; the other
# templates are imported here but not referenced below.
from prompt_template import (
    Prompt_template_translation,
    Prompt_template_LLM_Generation,
    Prompt_template_Reranker,
    Prompt_template_Wisal,
    Prompt_template_Halluciations,
    Prompt_template_paraphrasing,
    Prompt_template_Translate_to_original,
    Prompt_template_relevance,
    Prompt_template_User_document_prompt,
)

# Load credentials from a local .env file. The environment variable names are
# assumed to match the Python names used below.
load_dotenv()
DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

# DeepInfra exposes an OpenAI-compatible endpoint, so the OpenAI SDK works unchanged.
openai = OpenAI(
    api_key=DEEPINFRA_API_KEY,
    base_url="https://api.deepinfra.com/v1/openai",
)

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)
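
# The ingest and query code below assumes a "Book" collection with a "text"
# property already exists. This guard is a minimal sketch (not part of the
# original flow) that creates it on first run; vectors are supplied
# client-side, so no server-side vectorizer is configured.
from weaviate.classes.config import DataType, Property

if not client.collections.exists("Book"):
    client.collections.create(
        name="Book",
        properties=[Property(name="text", data_type=DataType.TEXT)],
    )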


def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF, DOCX, or TXT file."""
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        text = ""
        with open(file_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            for page in reader.pages:
                # extract_text() can return None for image-only pages
                page_text = page.extract_text() or ""
                text += page_text + "\n"
    elif ext == ".docx":
        doc = docx.Document(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
    elif ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
    return text
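

# Character-based chunks with a 200-character overlap, so sentences that
# straddle a chunk boundary are not lost to retrieval.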
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " "],
)
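

# The embedding endpoint rejects overly large requests, so inputs are embedded
# in client-side batches (batch_size=70 is this script's choice; the actual
# limit depends on the provider).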
def embed_texts(texts: list[str], batch_size: int = 70) -> list[list[float]]:
    """Embed texts in batches to avoid API limits."""
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        resp = openai.embeddings.create(
            model="Qwen/Qwen3-Embedding-8B",
            input=batch,
            encoding_format="float",
        )
        all_embeddings.extend(item.embedding for item in resp.data)
    return all_embeddings


def ingest_file(file_path: str) -> str:
    """Chunk a document, embed the chunks, and store them in Weaviate."""
    raw = extract_text(file_path)
    texts = splitter.split_text(raw)  # already a list of plain strings
    vectors = embed_texts(texts)

    # Each object carries its pre-computed embedding; the dynamic batch
    # sizes its requests automatically.
    with client.batch.dynamic() as batch:
        for txt, vec in zip(texts, vectors):
            batch.add_object(
                collection="Book",
                properties={"text": txt},
                vector=vec,
            )
    return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"


def answer_question(question: str) -> str:
    """Retrieve the chunks closest to the question and answer from them."""
    q_vec = embed_texts([question])[0]
    documents = client.collections.get("Book")
    response = documents.query.near_vector(
        near_vector=q_vec,
        limit=5,
        return_metadata=["distance"],
    )
    hits = response.objects
    context = "\n\n".join(hit.properties["text"] for hit in hits)
    print(context)  # debug output: inspect the retrieved context

    user_document_prompt = Prompt_template_User_document_prompt.format(
        new_query=question, document=context
    )
    chat = openai.chat.completions.create(
        model="Qwen/Qwen3-32B",
        messages=[{"role": "user", "content": user_document_prompt}],
        temperature=0,
        # DeepInfra-specific option: disables Qwen3's "thinking" output
        reasoning_effort="none",
    )
    return chat.choices[0].message.content


with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
    gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")
    with gr.Row():
        up = gr.File(label="Select document")
        btn = gr.Button("Ingest")
    out = gr.Textbox(label="Status", interactive=False)
    # Guard against clicking "Ingest" before a file is chosen.
    btn.click(
        fn=lambda f: ingest_file(f.name) if f else "Please select a file first.",
        inputs=up,
        outputs=out,
    )
    with gr.Row():
        q = gr.Textbox(placeholder="Your question...", lines=2)
        ask = gr.Button("Ask")
    ans = gr.Textbox(label="Answer", lines=6, interactive=False)
    ask.click(fn=answer_question, inputs=q, outputs=ans)

if __name__ == "__main__":
    demo.launch(debug=True)
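    # Cleanup sketch: launch() blocks until the server exits, so closing the
    # Weaviate client here releases its connections on the way out.
    client.close()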