afouda committed on
Commit
326b67d
Β·
verified Β·
1 Parent(s): 52581b8

Update User_Specific_Documents.py

Browse files
Files changed (1) hide show
  1. User_Specific_Documents.py +133 -130
User_Specific_Documents.py CHANGED
@@ -1,131 +1,134 @@
1
- import os
2
- import gradio as gr
3
- from openai import OpenAI
4
- import weaviate
5
- from weaviate.classes.init import Auth
6
- import pypdf # Replaced PyPDF2
7
- import docx
8
- from langchain.text_splitter import RecursiveCharacterTextSplitter
9
- from dotenv import load_dotenv
10
- from prompt_template import (
11
- Prompt_template_translation,
12
- Prompt_template_LLM_Generation,
13
- Prompt_template_Reranker,
14
- Prompt_template_Wisal,
15
- Prompt_template_Halluciations,
16
- Prompt_template_paraphrasing,
17
- Prompt_template_Translate_to_original,
18
- Prompt_template_relevance,
19
- Prompt_template_User_document_prompt
20
- )
21
- # ─── Configuration ─────────────────────────────────────────────────────────────
22
- from dotenv import load_dotenv
23
- load_dotenv()
24
- DEEPINFRA_TOKEN = os.getenv("DEEPINFRA_API_KEY")
25
- WEAVIATE_URL = os.getenv("WEAVIATE_URL")
26
- WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
27
- if not DEEPINFRA_TOKEN or not WEAVIATE_URL or not WEAVIATE_API_KEY:
28
- raise ValueError("Please set DEEPINFRA_TOKEN, WEAVIATE_URL, and WEAVIATE_API_KEY in .env or environment.")
29
- # Initialize DeepInfra-compatible OpenAI client
30
- openai = OpenAI(
31
- api_key=DEEPINFRA_TOKEN,
32
- base_url="https://api.deepinfra.com/v1/openai",
33
- )
34
- # Initialize Weaviate client
35
- client = weaviate.connect_to_weaviate_cloud(
36
- cluster_url=WEAVIATE_URL,
37
- auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
38
- )
39
- # ─── Utility: Extract raw text ──────────────────────────────────────────────────
40
- def extract_text(file_path: str) -> str:
41
- ext = os.path.splitext(file_path)[1].lower()
42
- if ext == ".pdf":
43
- text = ""
44
- with open(file_path, "rb") as f:
45
- reader = pypdf.PdfReader(f)
46
- for page in reader.pages:
47
- page_text = page.extract_text() or ""
48
- text += page_text + "\n"
49
- elif ext == ".docx":
50
- doc = docx.Document(file_path)
51
- text = "\n".join(p.text for p in doc.paragraphs)
52
- elif ext == ".txt":
53
- with open(file_path, "r", encoding="utf-8") as f:
54
- text = f.read()
55
- else:
56
- raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
57
- return text
58
- # ─── Chunker & Embed ──────────────────────────────────────────────────────────
59
- splitter = RecursiveCharacterTextSplitter(
60
- chunk_size=1000,
61
- chunk_overlap=200,
62
- separators=["\n\n", "\n", " "],
63
- )
64
- def embed_texts(texts: list[str], batch_size: int = 70) -> list[list[float]]:
65
- """Embed texts in batches to avoid API limits."""
66
- all_embeddings = []
67
- for i in range(0, len(texts), batch_size):
68
- batch = texts[i:i + batch_size]
69
- resp = openai.embeddings.create(
70
- model="Qwen/Qwen3-Embedding-8B",
71
- input=batch,
72
- encoding_format="float"
73
- )
74
- all_embeddings.extend([item.embedding for item in resp.data])
75
- return all_embeddings
76
- # ─── Ingest & Index ───────────────────────────────────────────────────────────
77
- def ingest_file(file_path: str) -> str:
78
- raw = extract_text(file_path)
79
- docs = splitter.split_text(raw)
80
- texts = [chunk for chunk in docs]
81
- vectors = embed_texts(texts)
82
- # Get the collection
83
- documents = client.collections.get("Book")
84
- # Batch insert with new API
85
- with client.batch.dynamic() as batch:
86
- for txt, vec in zip(texts, vectors):
87
- batch.add_object(
88
- collection="Book",
89
- properties={"text": txt},
90
- vector=vec
91
- )
92
- return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"
93
- # ───────────────────────────────────────────── Query & Answer ───────────────────────────────────────────────────────────
94
- def answer_question(question: str) -> str:
95
- q_vec = embed_texts([question])[0]
96
- documents = client.collections.get("Book")
97
- response = documents.query.near_vector(
98
- near_vector=q_vec,
99
- limit=5,
100
- return_metadata=["distance"]
101
- )
102
- hits = response.objects
103
- context = "\n\n".join(hit.properties["text"] for hit in hits)
104
- print(context)
105
-
106
- UserSpecificDocument_prompt = Prompt_template_User_document_prompt.format(new_query=question, document=context)
107
- chat = openai.chat.completions.create(
108
- model="Qwen/Qwen3-32B",
109
- messages=[
110
- {"role": "user", "content": UserSpecificDocument_prompt
111
- }
112
- ],
113
- temperature=0,
114
- reasoning_effort="none"
115
- )
116
- return chat.choices[0].message.content
117
- # ─── Gradio Interface ─────────────────────────────────────────────────────────
118
- with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
119
- gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")
120
- with gr.Row():
121
- up = gr.File(label="Select document")
122
- btn = gr.Button("Ingest")
123
- out = gr.Textbox(label="Status", interactive=False)
124
- btn.click(fn=lambda f: ingest_file(f.name), inputs=up, outputs=out)
125
- with gr.Row():
126
- q = gr.Textbox(placeholder="Your question...", lines=2)
127
- ask = gr.Button("Ask")
128
- ans = gr.Textbox(label="Answer", lines=6, interactive=False)
129
- ask.click(fn=answer_question, inputs=q, outputs=ans)
130
- if __name__ == "__main__":
 
 
 
131
  demo.launch(debug=True)
 
1
+ import os
2
+ import gradio as gr
3
+ from openai import OpenAI
4
+ import weaviate
5
+ from weaviate.classes.init import Auth
6
+ import pypdf # Replaced PyPDF2
7
+ import docx
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from dotenv import load_dotenv
10
+ from prompt_template import (
11
+ Prompt_template_translation,
12
+ Prompt_template_LLM_Generation,
13
+ Prompt_template_Reranker,
14
+ Prompt_template_Wisal,
15
+ Prompt_template_Halluciations,
16
+ Prompt_template_paraphrasing,
17
+ Prompt_template_Translate_to_original,
18
+ Prompt_template_relevance,
19
+ Prompt_template_User_document_prompt
20
+ )
21
# ─── Configuration ─────────────────────────────────────────────────────────────
# SECURITY: never hard-code API keys in source — the previous revision leaked
# live credentials into version control. Load all secrets from the
# environment (.env via python-dotenv) and rotate any key that was committed.
load_dotenv()

DEEPINFRA_TOKEN = os.getenv("DEEPINFRA_API_KEY")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

if not DEEPINFRA_TOKEN or not WEAVIATE_URL or not WEAVIATE_API_KEY:
    raise ValueError(
        "Please set DEEPINFRA_API_KEY, WEAVIATE_URL, and WEAVIATE_API_KEY "
        "in .env or environment."
    )

# DeepInfra exposes an OpenAI-compatible endpoint, so the stock OpenAI
# client works against it with a custom base_url.
openai = OpenAI(
    api_key=DEEPINFRA_TOKEN,
    base_url="https://api.deepinfra.com/v1/openai",
)

# Initialize Weaviate Cloud client (v4 API).
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)
42
# ─── Utility: Extract raw text ──────────────────────────────────────────────────
def extract_text(file_path: str) -> str:
    """Return the raw text content of a PDF, DOCX, or TXT file.

    Args:
        file_path: Path to the document; the extension selects the parser.

    Returns:
        The extracted text (one trailing newline per PDF page).

    Raises:
        ValueError: if the extension is not .pdf, .docx, or .txt.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == ".pdf":
        with open(file_path, "rb") as handle:
            reader = pypdf.PdfReader(handle)
            # Pages with no extractable text contribute an empty string.
            return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    if extension == ".docx":
        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    if extension == ".txt":
        with open(file_path, "r", encoding="utf-8") as handle:
            return handle.read()

    raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
61
# ─── Chunker & Embed ──────────────────────────────────────────────────────────
# Overlapping ~1000-char chunks preserve context across chunk boundaries;
# splitting prefers paragraph breaks, then line breaks, then spaces.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " "],
)
67
def embed_texts(texts: list[str], batch_size: int = 70) -> list[list[float]]:
    """Embed *texts* with Qwen3-Embedding-8B, batching to stay under API limits.

    Args:
        texts: Strings to embed.
        batch_size: Maximum inputs per embeddings request.

    Returns:
        One embedding vector per input string, in order.
    """
    vectors: list[list[float]] = []
    start = 0
    while start < len(texts):
        chunk = texts[start:start + batch_size]
        response = openai.embeddings.create(
            model="Qwen/Qwen3-Embedding-8B",
            input=chunk,
            encoding_format="float",
        )
        vectors.extend(item.embedding for item in response.data)
        start += batch_size
    return vectors
79
# ─── Ingest & Index ───────────────────────────────────────────────────────────
def ingest_file(file_path: str) -> str:
    """Extract, chunk, embed, and index a document into the "Book" collection.

    Args:
        file_path: Path to a PDF, DOCX, or TXT document.

    Returns:
        A human-readable status string for the UI.

    Raises:
        ValueError: propagated from extract_text for unsupported formats.
    """
    raw = extract_text(file_path)
    # split_text already returns list[str]; no copy needed
    # (the previous `[chunk for chunk in docs]` was a redundant identity copy,
    # and the unused `client.collections.get("Book")` lookup is removed).
    texts = splitter.split_text(raw)
    vectors = embed_texts(texts)

    # Batch insert with the Weaviate v4 client; vectors are supplied
    # explicitly, so the collection needs no server-side vectorizer.
    with client.batch.dynamic() as batch:
        for text, vector in zip(texts, vectors):
            batch.add_object(
                collection="Book",
                properties={"text": text},
                vector=vector,
            )
    return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"
96
# ───────────────────────────────────────────── Query & Answer ───────────────────────────────────────────────────────────
def answer_question(question: str) -> str:
    """Answer *question* from the 5 nearest "Book" chunks using Qwen3-32B.

    Args:
        question: The user's natural-language question.

    Returns:
        The model's answer grounded in the retrieved document context.
    """
    query_vector = embed_texts([question])[0]
    book = client.collections.get("Book")
    result = book.query.near_vector(
        near_vector=query_vector,
        limit=5,
        return_metadata=["distance"],
    )
    context = "\n\n".join(obj.properties["text"] for obj in result.objects)
    print(context)  # NOTE(review): debug output — consider a logger instead

    prompt = Prompt_template_User_document_prompt.format(
        new_query=question, document=context
    )
    completion = openai.chat.completions.create(
        model="Qwen/Qwen3-32B",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        reasoning_effort="none",
    )
    return completion.choices[0].message.content
120
# ─── Gradio Interface ─────────────────────────────────────────────────────────
with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
    gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")

    # Ingestion row: pick a file, index it, show the status.
    with gr.Row():
        upload_box = gr.File(label="Select document")
        ingest_btn = gr.Button("Ingest")
    status_box = gr.Textbox(label="Status", interactive=False)
    ingest_btn.click(
        fn=lambda f: ingest_file(f.name),
        inputs=upload_box,
        outputs=status_box,
    )

    # Q&A row: type a question, get an answer from the indexed chunks.
    with gr.Row():
        question_box = gr.Textbox(placeholder="Your question...", lines=2)
        ask_btn = gr.Button("Ask")
    answer_box = gr.Textbox(label="Answer", lines=6, interactive=False)
    ask_btn.click(fn=answer_question, inputs=question_box, outputs=answer_box)

if __name__ == "__main__":
    demo.launch(debug=True)