afouda committed on
Commit
326b67d
Β·
verified Β·
1 Parent(s): 52581b8

Update User_Specific_Documents.py

Browse files
Files changed (1) hide show
  1. User_Specific_Documents.py +133 -130
User_Specific_Documents.py CHANGED
@@ -1,131 +1,134 @@
1
- import os
2
- import gradio as gr
3
- from openai import OpenAI
4
- import weaviate
5
- from weaviate.classes.init import Auth
6
- import pypdf # Replaced PyPDF2
7
- import docx
8
- from langchain.text_splitter import RecursiveCharacterTextSplitter
9
- from dotenv import load_dotenv
10
- from prompt_template import (
11
- Prompt_template_translation,
12
- Prompt_template_LLM_Generation,
13
- Prompt_template_Reranker,
14
- Prompt_template_Wisal,
15
- Prompt_template_Halluciations,
16
- Prompt_template_paraphrasing,
17
- Prompt_template_Translate_to_original,
18
- Prompt_template_relevance,
19
- Prompt_template_User_document_prompt
20
- )
21
- # ─── Configuration ─────────────────────────────────────────────────────────────
22
- from dotenv import load_dotenv
23
- load_dotenv()
24
- DEEPINFRA_TOKEN = os.getenv("DEEPINFRA_API_KEY")
25
- WEAVIATE_URL = os.getenv("WEAVIATE_URL")
26
- WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
27
- if not DEEPINFRA_TOKEN or not WEAVIATE_URL or not WEAVIATE_API_KEY:
28
- raise ValueError("Please set DEEPINFRA_TOKEN, WEAVIATE_URL, and WEAVIATE_API_KEY in .env or environment.")
29
- # Initialize DeepInfra-compatible OpenAI client
30
- openai = OpenAI(
31
- api_key=DEEPINFRA_TOKEN,
32
- base_url="https://api.deepinfra.com/v1/openai",
33
- )
34
- # Initialize Weaviate client
35
- client = weaviate.connect_to_weaviate_cloud(
36
- cluster_url=WEAVIATE_URL,
37
- auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
38
- )
39
- # ─── Utility: Extract raw text ──────────────────────────────────────────────────
40
- def extract_text(file_path: str) -> str:
41
- ext = os.path.splitext(file_path)[1].lower()
42
- if ext == ".pdf":
43
- text = ""
44
- with open(file_path, "rb") as f:
45
- reader = pypdf.PdfReader(f)
46
- for page in reader.pages:
47
- page_text = page.extract_text() or ""
48
- text += page_text + "\n"
49
- elif ext == ".docx":
50
- doc = docx.Document(file_path)
51
- text = "\n".join(p.text for p in doc.paragraphs)
52
- elif ext == ".txt":
53
- with open(file_path, "r", encoding="utf-8") as f:
54
- text = f.read()
55
- else:
56
- raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
57
- return text
58
- # ─── Chunker & Embed ──────────────────────────────────────────────────────────
59
- splitter = RecursiveCharacterTextSplitter(
60
- chunk_size=1000,
61
- chunk_overlap=200,
62
- separators=["\n\n", "\n", " "],
63
- )
64
- def embed_texts(texts: list[str], batch_size: int = 70) -> list[list[float]]:
65
- """Embed texts in batches to avoid API limits."""
66
- all_embeddings = []
67
- for i in range(0, len(texts), batch_size):
68
- batch = texts[i:i + batch_size]
69
- resp = openai.embeddings.create(
70
- model="Qwen/Qwen3-Embedding-8B",
71
- input=batch,
72
- encoding_format="float"
73
- )
74
- all_embeddings.extend([item.embedding for item in resp.data])
75
- return all_embeddings
76
- # ─── Ingest & Index ───────────────────────────────────────────────────────────
77
- def ingest_file(file_path: str) -> str:
78
- raw = extract_text(file_path)
79
- docs = splitter.split_text(raw)
80
- texts = [chunk for chunk in docs]
81
- vectors = embed_texts(texts)
82
- # Get the collection
83
- documents = client.collections.get("Book")
84
- # Batch insert with new API
85
- with client.batch.dynamic() as batch:
86
- for txt, vec in zip(texts, vectors):
87
- batch.add_object(
88
- collection="Book",
89
- properties={"text": txt},
90
- vector=vec
91
- )
92
- return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"
93
- # ───────────────────────────────────────────── Query & Answer ───────────────────────────────────────────────────────────
94
- def answer_question(question: str) -> str:
95
- q_vec = embed_texts([question])[0]
96
- documents = client.collections.get("Book")
97
- response = documents.query.near_vector(
98
- near_vector=q_vec,
99
- limit=5,
100
- return_metadata=["distance"]
101
- )
102
- hits = response.objects
103
- context = "\n\n".join(hit.properties["text"] for hit in hits)
104
- print(context)
105
-
106
- UserSpecificDocument_prompt = Prompt_template_User_document_prompt.format(new_query=question, document=context)
107
- chat = openai.chat.completions.create(
108
- model="Qwen/Qwen3-32B",
109
- messages=[
110
- {"role": "user", "content": UserSpecificDocument_prompt
111
- }
112
- ],
113
- temperature=0,
114
- reasoning_effort="none"
115
- )
116
- return chat.choices[0].message.content
117
- # ─── Gradio Interface ─────────────────────────────────────────────────────────
118
- with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
119
- gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")
120
- with gr.Row():
121
- up = gr.File(label="Select document")
122
- btn = gr.Button("Ingest")
123
- out = gr.Textbox(label="Status", interactive=False)
124
- btn.click(fn=lambda f: ingest_file(f.name), inputs=up, outputs=out)
125
- with gr.Row():
126
- q = gr.Textbox(placeholder="Your question...", lines=2)
127
- ask = gr.Button("Ask")
128
- ans = gr.Textbox(label="Answer", lines=6, interactive=False)
129
- ask.click(fn=answer_question, inputs=q, outputs=ans)
130
- if __name__ == "__main__":
 
 
 
131
  demo.launch(debug=True)
 
1
+ import os
2
+ import gradio as gr
3
+ from openai import OpenAI
4
+ import weaviate
5
+ from weaviate.classes.init import Auth
6
+ import pypdf # Replaced PyPDF2
7
+ import docx
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from dotenv import load_dotenv
10
+ from prompt_template import (
11
+ Prompt_template_translation,
12
+ Prompt_template_LLM_Generation,
13
+ Prompt_template_Reranker,
14
+ Prompt_template_Wisal,
15
+ Prompt_template_Halluciations,
16
+ Prompt_template_paraphrasing,
17
+ Prompt_template_Translate_to_original,
18
+ Prompt_template_relevance,
19
+ Prompt_template_User_document_prompt
20
+ )
21
# ─── Configuration ─────────────────────────────────────────────────────────────
# SECURITY: never hard-code API keys in source — the previous revision leaked
# live credentials into version control. Load all secrets from the
# environment (.env via python-dotenv) and rotate any key that was committed.
load_dotenv()

DEEPINFRA_TOKEN = os.getenv("DEEPINFRA_API_KEY")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

if not DEEPINFRA_TOKEN or not WEAVIATE_URL or not WEAVIATE_API_KEY:
    raise ValueError(
        "Please set DEEPINFRA_API_KEY, WEAVIATE_URL, and WEAVIATE_API_KEY "
        "in .env or environment."
    )

# DeepInfra exposes an OpenAI-compatible endpoint, so the stock OpenAI
# client works against it with a custom base_url.
openai = OpenAI(
    api_key=DEEPINFRA_TOKEN,
    base_url="https://api.deepinfra.com/v1/openai",
)

# Initialize Weaviate Cloud client (v4 API).
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)
42
# ─── Utility: Extract raw text ──────────────────────────────────────────────────
def extract_text(file_path: str) -> str:
    """Return the raw text content of a PDF, DOCX, or TXT file.

    Args:
        file_path: Path to the document; the extension selects the parser.

    Returns:
        The extracted text (one trailing newline per PDF page).

    Raises:
        ValueError: if the extension is not .pdf, .docx, or .txt.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == ".pdf":
        with open(file_path, "rb") as handle:
            reader = pypdf.PdfReader(handle)
            # Pages with no extractable text contribute an empty string.
            return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    if extension == ".docx":
        document = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    if extension == ".txt":
        with open(file_path, "r", encoding="utf-8") as handle:
            return handle.read()

    raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")
61
# ─── Chunker & Embed ──────────────────────────────────────────────────────────
# Overlapping ~1000-char chunks preserve context across chunk boundaries;
# splitting prefers paragraph breaks, then line breaks, then spaces.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " "],
)
67
def embed_texts(texts: list[str], batch_size: int = 70) -> list[list[float]]:
    """Embed *texts* with Qwen3-Embedding-8B, batching to stay under API limits.

    Args:
        texts: Strings to embed.
        batch_size: Maximum inputs per embeddings request.

    Returns:
        One embedding vector per input string, in order.
    """
    vectors: list[list[float]] = []
    start = 0
    while start < len(texts):
        chunk = texts[start:start + batch_size]
        response = openai.embeddings.create(
            model="Qwen/Qwen3-Embedding-8B",
            input=chunk,
            encoding_format="float",
        )
        vectors.extend(item.embedding for item in response.data)
        start += batch_size
    return vectors
79
# ─── Ingest & Index ───────────────────────────────────────────────────────────
def ingest_file(file_path: str) -> str:
    """Extract, chunk, embed, and index a document into the "Book" collection.

    Args:
        file_path: Path to a PDF, DOCX, or TXT document.

    Returns:
        A human-readable status string for the UI.

    Raises:
        ValueError: propagated from extract_text for unsupported formats.
    """
    raw = extract_text(file_path)
    # split_text already returns list[str]; no copy needed
    # (the previous `[chunk for chunk in docs]` was a redundant identity copy,
    # and the unused `client.collections.get("Book")` lookup is removed).
    texts = splitter.split_text(raw)
    vectors = embed_texts(texts)

    # Batch insert with the Weaviate v4 client; vectors are supplied
    # explicitly, so the collection needs no server-side vectorizer.
    with client.batch.dynamic() as batch:
        for text, vector in zip(texts, vectors):
            batch.add_object(
                collection="Book",
                properties={"text": text},
                vector=vector,
            )
    return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"
96
# ───────────────────────────────────────────── Query & Answer ───────────────────────────────────────────────────────────
def answer_question(question: str) -> str:
    """Answer *question* from the 5 nearest "Book" chunks using Qwen3-32B.

    Args:
        question: The user's natural-language question.

    Returns:
        The model's answer grounded in the retrieved document context.
    """
    query_vector = embed_texts([question])[0]
    book = client.collections.get("Book")
    result = book.query.near_vector(
        near_vector=query_vector,
        limit=5,
        return_metadata=["distance"],
    )
    context = "\n\n".join(obj.properties["text"] for obj in result.objects)
    print(context)  # NOTE(review): debug output — consider a logger instead

    prompt = Prompt_template_User_document_prompt.format(
        new_query=question, document=context
    )
    completion = openai.chat.completions.create(
        model="Qwen/Qwen3-32B",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        reasoning_effort="none",
    )
    return completion.choices[0].message.content
120
# ─── Gradio Interface ─────────────────────────────────────────────────────────
with gr.Blocks(title="Document Q&A with Qwen & Weaviate") as demo:
    gr.Markdown("## Upload a PDF, DOCX, or TXT and then ask away!")

    # Ingestion row: pick a file, index it, show the status.
    with gr.Row():
        upload_box = gr.File(label="Select document")
        ingest_btn = gr.Button("Ingest")
    status_box = gr.Textbox(label="Status", interactive=False)
    ingest_btn.click(
        fn=lambda f: ingest_file(f.name),
        inputs=upload_box,
        outputs=status_box,
    )

    # Q&A row: type a question, get an answer from the indexed chunks.
    with gr.Row():
        question_box = gr.Textbox(placeholder="Your question...", lines=2)
        ask_btn = gr.Button("Ask")
    answer_box = gr.Textbox(label="Answer", lines=6, interactive=False)
    ask_btn.click(fn=answer_question, inputs=question_box, outputs=answer_box)

if __name__ == "__main__":
    demo.launch(debug=True)