import os

import gradio as gr
from openai import OpenAI
import weaviate
from weaviate.classes.init import Auth
import pypdf  # Replaced PyPDF2
import docx
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

from prompt_template import (
    Prompt_template_translation,
    Prompt_template_LLM_Generation,
    Prompt_template_Reranker,
    Prompt_template_Wisal,
    Prompt_template_Halluciations,
    Prompt_template_paraphrasing,
    Prompt_template_Translate_to_original,
    Prompt_template_relevance,
)

# ─── Configuration ─────────────────────────────────────────────────────────────
# Load credentials from a .env file; the original referenced these names
# without ever defining them.
load_dotenv()
DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

# DeepInfra exposes an OpenAI-compatible endpoint, so the OpenAI SDK works
# against it with a custom base_url.
openai = OpenAI(
    api_key=DEEPINFRA_API_KEY,
    base_url="https://api.deepinfra.com/v1/openai",
)

# Initialize Weaviate client
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)

# ─── Utility: Extract raw text ─────────────────────────────────────────────────
def extract_text(file_path: str) -> str:
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        text = ""
        with open(file_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text() or ""
                text += page_text + "\n"
    elif ext == ".docx":
        doc = docx.Document(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
    elif ext == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
    return text

# ─── Chunker & Embed ───────────────────────────────────────────────────────────
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " "],
)

def embed_texts(texts: list[str], batch_size: int = 50) -> list[list[float]]:
    """Embed texts in batches to avoid API limits."""
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        resp = openai.embeddings.create(
            model="Qwen/Qwen3-Embedding-8B",
            input=batch,
            encoding_format="float",
        )
        all_embeddings.extend(item.embedding for item in resp.data)
    return all_embeddings

# ─── Ingest & Index ────────────────────────────────────────────────────────────
def ingest_file(file_path: str) -> str:
    raw = extract_text(file_path)
    texts = splitter.split_text(raw)
    vectors = embed_texts(texts)

    # Batch insert with the v4 client API; each chunk is stored together with
    # its precomputed embedding as the object vector.
    with client.batch.dynamic() as batch:
        for txt, vec in zip(texts, vectors):
            batch.add_object(
                collection="books",
                properties={"text": txt},
                vector=vec,
            )
    return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"

# ─── Query & Answer ────────────────────────────────────────────────────────────
def answer_question(question: str) -> str:
    # Embed the question, retrieve the five nearest chunks, and answer from them.
    q_vec = embed_texts([question])[0]
    documents = client.collections.get("books")
    response = documents.query.near_vector(
        near_vector=q_vec,
        limit=5,
        return_metadata=["distance"],
    )
    context = "\n\n".join(hit.properties["text"] for hit in response.objects)
    print(context)  # Debug: show the retrieved context

    wisal_prompt = Prompt_template_Wisal.format(new_query=question, document=context)
    chat = openai.chat.completions.create(
        model="Qwen/Qwen3-32B",
        messages=[{"role": "user", "content": wisal_prompt}],
        temperature=0,
        reasoning_effort="none",
    )
    return chat.choices[0].message.content
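
# ─── Collection bootstrap (sketch) ─────────────────────────────────────────────
# The code above assumes a "books" collection already exists in the cluster.
# Below is a minimal sketch of how it could be created once with the v4 client,
# using self-provided vectors since the embeddings come from DeepInfra. The
# helper name and the single-property schema are assumptions, not part of the
# original script.
from weaviate.classes.config import Configure, DataType, Property

def ensure_books_collection() -> None:
    """Create the "books" collection on first run if it is missing."""
    if not client.collections.exists("books"):
        client.collections.create(
            name="books",
            vectorizer_config=Configure.Vectorizer.none(),  # vectors supplied at insert time
            properties=[Property(name="text", data_type=DataType.TEXT)],
        )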
# ─── Gradio Interface ──────────────────────────────────────────────────────────
with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
    gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")

    with gr.Row():
        up = gr.File(label="Select document")
        btn = gr.Button("Ingest")
    out = gr.Textbox(label="Status", interactive=False)
    btn.click(fn=lambda f: ingest_file(f.name), inputs=up, outputs=out)

    with gr.Row():
        q = gr.Textbox(placeholder="Your question...", lines=2)
        ask = gr.Button("Ask")
    ans = gr.Textbox(label="Answer", lines=6, interactive=False)
    ask.click(fn=answer_question, inputs=q, outputs=ans)

if __name__ == "__main__":
    demo.launch(debug=True)
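
# ─── Cleanup (sketch) ──────────────────────────────────────────────────────────
# The v4 Weaviate client warns if it is garbage-collected while still
# connected. A minimal sketch: register client.close() to run at interpreter
# exit. In practice this registration would sit right after the client is
# created, before the UI launches.
import atexit
atexit.register(client.close)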