Update Old_Document.py
Browse files- Old_Document.py +140 -134
Old_Document.py
CHANGED
@@ -1,135 +1,141 @@
|
|
1 |
-
import os
|
2 |
-
import asyncio
|
3 |
-
from dotenv import load_dotenv
|
4 |
-
import gradio as gr
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
return
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
if
|
68 |
-
return
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
demo.launch(debug=True)
|
|
|
import os
import asyncio
from dotenv import load_dotenv
import gradio as gr

# Load credentials from the environment / .env file instead of hard-coding them.
# SECURITY: an earlier revision committed literal API keys in this file; any
# key that ever appeared in version control must be rotated/revoked.
load_dotenv()

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_URL = os.getenv("QDRANT_URL")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
DEEPINFRA_API_KEY = os.getenv("DEEPINFRA_API_KEY")
# Default matches the base URL previously hard-coded for the DeepInfra client.
DEEPINFRA_BASE_URL = os.getenv("DEEPINFRA_BASE_URL", "https://api.deepinfra.com/v1/openai")

# Fail fast when the keys this app actually uses are missing (this check was
# previously commented out and referenced the undefined name DEEPINFRA_TOKEN).
if not (DEEPINFRA_API_KEY and WEAVIATE_URL and WEAVIATE_API_KEY):
    raise ValueError("Please set all required keys in .env")
# DeepInfra client — DeepInfra exposes an OpenAI-compatible endpoint, so the
# standard OpenAI SDK client is reused with a custom base_url.
from openai import OpenAI

# BUG FIX: the previous revision passed the undefined name DEEPINFRA_TOKEN
# (NameError at import time); the module defines DEEPINFRA_API_KEY.
# DEEPINFRA_BASE_URL holds the same URL that was hard-coded here before.
openai = OpenAI(
    api_key=DEEPINFRA_API_KEY,
    base_url=DEEPINFRA_BASE_URL,
)
# Weaviate client
import weaviate
from weaviate.classes.init import Auth
from contextlib import contextmanager

@contextmanager
def weaviate_client():
    """Yield a connection to the Weaviate Cloud cluster, always closing it.

    Connects using the module-level WEAVIATE_URL / WEAVIATE_API_KEY settings;
    the connection is closed even if the body raises.
    """
    conn = weaviate.connect_to_weaviate_cloud(
        cluster_url=WEAVIATE_URL,
        auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    )
    try:
        yield conn
    finally:
        conn.close()
# Global path tracker
# Remembers the most recently uploaded document path across Gradio callbacks.
# NOTE(review): nothing in this module re-reads the file at this path — it is
# only echoed back in status messages; confirm whether real (re-)ingestion of
# the saved file is still TODO.
last_uploaded_path = None
# Embed function
def embed_texts(texts: list[str], batch_size: int = 50) -> list[list[float]]:
    """Embed *texts* in batches via the DeepInfra embeddings endpoint.

    A batch that fails contributes one empty list per input, so the result
    always lines up 1:1 with *texts*.
    """
    vectors: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        try:
            response = openai.embeddings.create(
                model="Qwen/Qwen3-Embedding-8B",
                input=chunk,
                encoding_format="float",
            )
            vectors.extend(item.embedding for item in response.data)
        except Exception as err:
            # Best-effort: log and pad with placeholders so indices stay aligned.
            print(f"Embedding error: {err}")
            vectors.extend([] for _ in chunk)
    return vectors
def encode_query(query: str) -> list[float] | None:
    """Embed a single query string; return None when embedding failed."""
    vectors = embed_texts([query], batch_size=1)
    if not vectors:
        return None
    # embed_texts pads failures with an empty list — treat that as None too.
    return vectors[0] or None
async def old_Document(query: str, top_k: int = 1) -> dict:
    """Retrieve up to *top_k* matching text chunks from 'Old_Documents'.

    Returns {"answer": [chunk, ...]}; the list is empty when the query cannot
    be embedded, nothing matches, or the vector search fails.
    """
    empty = {"answer": []}

    vector = encode_query(query)
    if not vector:
        return empty

    try:
        with weaviate_client() as client:
            collection = client.collections.get("Old_Documents")
            result = collection.query.near_vector(
                near_vector=vector,
                limit=top_k,
                return_properties=["text"],
            )
            hits = getattr(result, "objects", None)
            if not hits:
                return empty
            texts = [hit.properties.get("text", "[No Text]") for hit in hits]
            return {"answer": texts}
    except Exception as err:
        # Best-effort: any failure in search or result handling yields no answer.
        print("RAG Error:", err)
        return empty
# New functions to support Gradio app
def ingest_file(path: str) -> str:
    """Record *path* as the active document and return a status message.

    Only updates the module-level ``last_uploaded_path`` tracker; the file
    itself is not parsed here.
    """
    global last_uploaded_path
    last_uploaded_path = path
    filename = os.path.basename(path)
    return f"Old document ingested: {filename}"
def answer_question(query: str) -> str:
    """Run the RAG lookup synchronously and format the hits as a bullet list.

    Any failure (embedding, search, event loop) is reported as an error
    string rather than raised, so the Gradio callback never crashes.
    """
    try:
        response = asyncio.run(old_Document(query))
        passages = response.get("answer", [])
        if not passages:
            return "Sorry, I couldn't find relevant content in the old document."
        return "\n".join(f"- {passage}" for passage in passages)
    except Exception as err:
        return f"Error processing your request: {err}"
# Gradio interface for Old Documents
# Single-page UI: a question box, an optional file upload, and an answer box.
with gr.Blocks(title="Old Documents RAG") as demo:
    gr.Markdown("## Old Documents RAG")
    query = gr.Textbox(placeholder="Your question...", lines=2, label="Ask about Old Documents")
    doc_file = gr.File(label="Upload Old Document (PDF, DOCX, TXT)")
    btn = gr.Button("Submit")
    out = gr.Textbox(label="Answer from Old Documents", lines=8, interactive=False)

    def process_old_doc(query, doc_file):
        # Submit handler: optionally saves a newly uploaded file, then answers
        # the question. NOTE(review): the uploaded file's content is never
        # parsed or embedded anywhere in this module — answers come solely
        # from the pre-existing 'Old_Documents' Weaviate collection.
        if doc_file:
            # Save and ingest the uploaded file
            upload_dir = os.path.join(os.path.dirname(__file__), "uploaded_docs")
            os.makedirs(upload_dir, exist_ok=True)
            # basename() strips any directory components from the client name.
            safe_filename = os.path.basename(doc_file.name)
            save_path = os.path.join(upload_dir, safe_filename)
            # NOTE(review): assumes doc_file is a file-like object exposing
            # .name and .read(); recent Gradio versions pass gr.File values as
            # a filepath string, which has neither — confirm against the
            # pinned gradio version.
            with open(save_path, "wb") as f:
                f.write(doc_file.read())
            status = ingest_file(save_path)
            answer = answer_question(query)
            return f"{status}\n\n{answer}"
        else:
            # Use last uploaded file or return error if none exists
            if last_uploaded_path:
                answer = answer_question(query)
                return f"[Using previously uploaded document: {os.path.basename(last_uploaded_path)}]\n\n{answer}"
            else:
                return "No document uploaded. Please upload an old document to proceed."

    btn.click(fn=process_old_doc, inputs=[query, doc_file], outputs=out)

if __name__ == "__main__":
    # debug=True enables Gradio's error traces in the browser.
    demo.launch(debug=True)