File size: 5,322 Bytes
52581b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b98196
52581b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e57beb8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import os
import gradio as gr
from openai import OpenAI
import weaviate
from weaviate.classes.init import Auth
import pypdf  # Replaced PyPDF2
import docx
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from prompt_template import (
    Prompt_template_translation,
    Prompt_template_LLM_Generation,
    Prompt_template_Reranker,
    Prompt_template_Wisal,
    Prompt_template_Halluciations,
    Prompt_template_paraphrasing,
    Prompt_template_Translate_to_original,
    Prompt_template_relevance
)
# ─── Configuration ─────────────────────────────────────────────────────────────
# Load credentials from a local .env file; the original referenced these names
# without ever defining them, which raised NameError at import time.
load_dotenv()

DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

# DeepInfra exposes an OpenAI-compatible API, so the official OpenAI SDK is
# pointed at their base URL.
openai = OpenAI(
    api_key=DEEPINFRA_API_KEY,
    base_url="https://api.deepinfra.com/v1/openai",
)
# Initialize Weaviate Cloud client (vector store for document chunks).
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)
# ─── Utility: Extract raw text ──────────────────────────────────────────────────
def extract_text(file_path: str) -> str:
    """Return the plain-text content of a PDF, DOCX, or TXT file.

    PDF pages are concatenated with a trailing newline per page; DOCX
    paragraphs are newline-joined; TXT files are read as UTF-8 verbatim.

    Raises:
        ValueError: if the file extension is not .pdf, .docx, or .txt.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    if suffix == ".txt":
        with open(file_path, "r", encoding="utf-8") as handle:
            return handle.read()
    if suffix == ".docx":
        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    if suffix == ".pdf":
        pages: list[str] = []
        with open(file_path, "rb") as handle:
            for page in pypdf.PdfReader(handle).pages:
                # extract_text() may return None for image-only pages.
                pages.append((page.extract_text() or "") + "\n")
        return "".join(pages)
    raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
# ─── Chunker & Embed ──────────────────────────────────────────────────────────
# Recursive splitter: ~1000-char chunks with 200-char overlap, preferring to
# break on paragraph, then line, then word boundaries.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " "],
)
def embed_texts(texts: list[str], batch_size: int = 50) -> list[list[float]]:
    """Embed *texts* with the Qwen3 embedding model, batching requests.

    Batches of at most *batch_size* strings are sent per API call to stay
    under request-size limits. Returns one embedding vector per input text,
    in order; an empty input yields an empty list.
    """
    embeddings: list[list[float]] = []
    start = 0
    while start < len(texts):
        chunk = texts[start:start + batch_size]
        response = openai.embeddings.create(
            model="Qwen/Qwen3-Embedding-8B",
            input=chunk,
            encoding_format="float",
        )
        embeddings.extend(item.embedding for item in response.data)
        start += batch_size
    return embeddings
# ─── Ingest & Index ───────────────────────────────────────────────────────────
def ingest_file(file_path: str) -> str:
    """Extract, chunk, embed, and index a document into the "books" collection.

    Args:
        file_path: path to a PDF, DOCX, or TXT file.

    Returns:
        A human-readable status line with the number of chunks stored.

    Raises:
        ValueError: propagated from extract_text for unsupported formats.
    """
    raw = extract_text(file_path)
    chunks = splitter.split_text(raw)
    vectors = embed_texts(chunks)
    # Batch insert; the dynamic batcher sizes and flushes requests itself.
    # (Removed an unused `client.collections.get("books")` call and a
    # pointless list copy of the splitter output.)
    with client.batch.dynamic() as batch:
        for txt, vec in zip(chunks, vectors):
            batch.add_object(
                collection="books",
                properties={"text": txt},
                vector=vec,
            )
    return f"Ingested {len(chunks)} chunks from {os.path.basename(file_path)}"
# ─── Query & Answer ───────────────────────────────────────────────────────────
def answer_question(question: str) -> str:
    """Answer *question* via retrieval-augmented generation.

    Embeds the question, pulls the 5 nearest chunks from the "books"
    collection, and asks the chat model to answer from that context using
    the Wisal prompt template.

    Returns:
        The model's answer text.
    """
    q_vec = embed_texts([question])[0]
    documents = client.collections.get("books")
    response = documents.query.near_vector(
        near_vector=q_vec,
        limit=5,
        return_metadata=["distance"],
    )
    # Removed a leftover debug print() that dumped retrieved document text
    # to stdout on every query.
    context = "\n\n".join(hit.properties["text"] for hit in response.objects)
    wisal_prompt = Prompt_template_Wisal.format(new_query=question, document=context)
    chat = openai.chat.completions.create(
        model="Qwen/Qwen3-32B",
        messages=[{"role": "user", "content": wisal_prompt}],
        temperature=0,  # deterministic answers for Q&A
        reasoning_effort="none",
    )
    return chat.choices[0].message.content
# ─── Gradio Interface ─────────────────────────────────────────────────────────
with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
    gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")
    with gr.Row():
        up = gr.File(label="Select document")
        btn = gr.Button("Ingest")
        out = gr.Textbox(label="Status", interactive=False)
    # Guard against clicking "Ingest" with no file selected: gr.File yields
    # None in that case and the original `f.name` raised AttributeError.
    btn.click(
        fn=lambda f: ingest_file(f.name) if f is not None else "Please select a file first.",
        inputs=up,
        outputs=out,
    )
    with gr.Row():
        q = gr.Textbox(placeholder="Your question...", lines=2)
        ask = gr.Button("Ask")
        ans = gr.Textbox(label="Answer", lines=6, interactive=False)
    ask.click(fn=answer_question, inputs=q, outputs=ans)
if __name__ == "__main__":
    demo.launch(debug=True)