afouda commited on
Commit
163369c
·
verified ·
1 Parent(s): bf0ba5b

Update Old_Document.py

Browse files
Files changed (1) hide show
  1. Old_Document.py +140 -134
Old_Document.py CHANGED
@@ -1,135 +1,141 @@
1
- import os
2
- import asyncio
3
- from dotenv import load_dotenv
4
- import gradio as gr
5
-
6
- # Load env variables
7
- load_dotenv()
8
- DEEPINFRA_TOKEN = os.getenv("DEEPINFRA_API_KEY")
9
- WEAVIATE_URL = os.getenv("WEAVIATE_URL")
10
- WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
11
-
12
- if not (DEEPINFRA_TOKEN and WEAVIATE_URL and WEAVIATE_API_KEY):
13
- raise ValueError("Please set all required keys in .env")
14
-
15
- # DeepInfra client
16
- from openai import OpenAI
17
- openai = OpenAI(
18
- api_key=DEEPINFRA_TOKEN,
19
- base_url="https://api.deepinfra.com/v1/openai",
20
- )
21
-
22
- # Weaviate client
23
- import weaviate
24
- from weaviate.classes.init import Auth
25
- from contextlib import contextmanager
26
-
27
- @contextmanager
28
- def weaviate_client():
29
- client = weaviate.connect_to_weaviate_cloud(
30
- cluster_url=WEAVIATE_URL,
31
- auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
32
- )
33
- try:
34
- yield client
35
- finally:
36
- client.close()
37
-
38
- # Global path tracker
39
- last_uploaded_path = None
40
-
41
- # Embed function
42
- def embed_texts(texts: list[str], batch_size: int = 50) -> list[list[float]]:
43
- all_embeddings = []
44
- for i in range(0, len(texts), batch_size):
45
- batch = texts[i : i + batch_size]
46
- try:
47
- resp = openai.embeddings.create(
48
- model="Qwen/Qwen3-Embedding-8B",
49
- input=batch,
50
- encoding_format="float"
51
- )
52
- batch_embs = [item.embedding for item in resp.data]
53
- all_embeddings.extend(batch_embs)
54
- except Exception as e:
55
- print(f"Embedding error: {e}")
56
- all_embeddings.extend([[] for _ in batch])
57
- return all_embeddings
58
-
59
- def encode_query(query: str) -> list[float] | None:
60
- embs = embed_texts([query], batch_size=1)
61
- if embs and embs[0]:
62
- return embs[0]
63
- return None
64
-
65
- async def old_Document(query: str, top_k: int = 1) -> dict:
66
- qe = encode_query(query)
67
- if not qe:
68
- return {"answer": []}
69
-
70
- try:
71
- with weaviate_client() as client:
72
- coll = client.collections.get("Old_Documents")
73
- res = coll.query.near_vector(
74
- near_vector=qe,
75
- limit=top_k,
76
- return_properties=["text"]
77
- )
78
- if not getattr(res, "objects", None):
79
- return {"answer": []}
80
- return {
81
- "answer": [obj.properties.get("text", "[No Text]") for obj in res.objects]
82
- }
83
- except Exception as e:
84
- print("RAG Error:", e)
85
- return {"answer": []}
86
-
87
- # New functions to support Gradio app
88
- def ingest_file(path: str) -> str:
89
- global last_uploaded_path
90
- last_uploaded_path = path
91
- return f"Old document ingested: {os.path.basename(path)}"
92
-
93
- def answer_question(query: str) -> str:
94
- try:
95
- rag_resp = asyncio.run(old_Document(query))
96
- chunks = rag_resp.get("answer", [])
97
- if not chunks:
98
- return "Sorry, I couldn't find relevant content in the old document."
99
-
100
- return "\n".join(f"- {c}" for c in chunks)
101
- except Exception as e:
102
- return f"Error processing your request: {e}"
103
-
104
- # Gradio interface for Old Documents
105
- with gr.Blocks(title="Old Documents RAG") as demo:
106
- gr.Markdown("## Old Documents RAG")
107
- query = gr.Textbox(placeholder="Your question...", lines=2, label="Ask about Old Documents")
108
- doc_file = gr.File(label="Upload Old Document (PDF, DOCX, TXT)")
109
- btn = gr.Button("Submit")
110
- out = gr.Textbox(label="Answer from Old Documents", lines=8, interactive=False)
111
-
112
- def process_old_doc(query, doc_file):
113
- if doc_file:
114
- # Save and ingest the uploaded file
115
- upload_dir = os.path.join(os.path.dirname(__file__), "uploaded_docs")
116
- os.makedirs(upload_dir, exist_ok=True)
117
- safe_filename = os.path.basename(doc_file.name)
118
- save_path = os.path.join(upload_dir, safe_filename)
119
- with open(save_path, "wb") as f:
120
- f.write(doc_file.read())
121
- status = ingest_file(save_path)
122
- answer = answer_question(query)
123
- return f"{status}\n\n{answer}"
124
- else:
125
- # Use last uploaded file or return error if none exists
126
- if last_uploaded_path:
127
- answer = answer_question(query)
128
- return f"[Using previously uploaded document: {os.path.basename(last_uploaded_path)}]\n\n{answer}"
129
- else:
130
- return "No document uploaded. Please upload an old document to proceed."
131
-
132
- btn.click(fn=process_old_doc, inputs=[query, doc_file], outputs=out)
133
-
134
- if __name__ == "__main__":
 
 
 
 
 
 
135
  demo.launch(debug=True)
 
1
import os
import asyncio
from dotenv import load_dotenv
import gradio as gr

# Load configuration from the environment / a local .env file.
# SECURITY: a previous revision of this file committed live API keys
# (Gemini, Tavily, OpenAI, Qdrant, Weaviate, DeepInfra) as string literals.
# Those keys remain in git history and MUST be rotated/revoked; never
# hard-code credentials in source.
load_dotenv()

# DeepInfra bearer token (also consumed by the OpenAI-compatible client below).
DEEPINFRA_TOKEN = os.getenv("DEEPINFRA_API_KEY")
# Base URL is overridable but defaults to DeepInfra's OpenAI-compatible endpoint.
DEEPINFRA_BASE_URL = os.getenv("DEEPINFRA_BASE_URL", "https://api.deepinfra.com/v1/openai")
# Weaviate Cloud cluster URL and API key used by weaviate_client().
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")

# Fail fast at import time if any required credential is missing, instead of
# failing later with an opaque auth error mid-request.
if not (DEEPINFRA_TOKEN and WEAVIATE_URL and WEAVIATE_API_KEY):
    raise ValueError(
        "Please set DEEPINFRA_API_KEY, WEAVIATE_URL and WEAVIATE_API_KEY in .env"
    )
21
# DeepInfra client — DeepInfra exposes an OpenAI-compatible API, so the
# official OpenAI SDK is reused with a custom base_url.
from openai import OpenAI

openai = OpenAI(
    # Read the token from the environment here; the committed revision
    # referenced an undefined name DEEPINFRA_TOKEN, which raised a
    # NameError as soon as the module was imported.
    api_key=os.getenv("DEEPINFRA_API_KEY"),
    base_url="https://api.deepinfra.com/v1/openai",
)
27
+
28
# Weaviate client
import weaviate
from weaviate.classes.init import Auth
from contextlib import contextmanager


@contextmanager
def weaviate_client():
    """Yield a connected Weaviate Cloud client, closing it on exit.

    Guarantees ``close()`` runs even when the body raises, so no
    connection is leaked.
    """
    conn = weaviate.connect_to_weaviate_cloud(
        cluster_url=WEAVIATE_URL,
        auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    )
    try:
        yield conn
    finally:
        conn.close()
43
+
44
# Global path tracker
# Remembers the filesystem path of the most recently uploaded document so a
# later question can be answered without re-uploading.
# NOTE(review): module-level mutable state — shared across all Gradio
# sessions/requests; confirm single-user usage or move to per-session state.
last_uploaded_path = None
46
+
47
# Embed function
def embed_texts(texts: list[str], batch_size: int = 50) -> list[list[float]]:
    """Embed *texts* with DeepInfra's Qwen3 embedding model.

    Requests are issued in batches of *batch_size*. When a batch fails, the
    error is printed and one empty list per text is appended, so the output
    always lines up one-to-one with the input.
    """
    vectors: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start : start + batch_size]
        try:
            response = openai.embeddings.create(
                model="Qwen/Qwen3-Embedding-8B",
                input=chunk,
                encoding_format="float",
            )
            # Materialize fully before extending so a mid-batch attribute
            # failure still falls through to the padding branch cleanly.
            embeddings = [item.embedding for item in response.data]
            vectors.extend(embeddings)
        except Exception as err:
            # Best-effort: keep going, pad with empties for this batch.
            print(f"Embedding error: {err}")
            vectors.extend([] for _ in chunk)
    return vectors
64
+
65
def encode_query(query: str) -> list[float] | None:
    """Embed a single query string; return None when embedding failed."""
    result = embed_texts([query], batch_size=1)
    # embed_texts pads failures with an empty list, so check truthiness.
    return result[0] if result and result[0] else None
70
+
71
async def old_Document(query: str, top_k: int = 1) -> dict:
    """Vector-search the 'Old_Documents' Weaviate collection for *query*.

    Returns ``{"answer": [text, ...]}``; the list is empty when the query
    could not be embedded, nothing matched, or the search raised.
    """
    query_vector = encode_query(query)
    if not query_vector:
        return {"answer": []}

    try:
        with weaviate_client() as client:
            collection = client.collections.get("Old_Documents")
            result = collection.query.near_vector(
                near_vector=query_vector,
                limit=top_k,
                return_properties=["text"],
            )
            hits = getattr(result, "objects", None)
            if not hits:
                return {"answer": []}
            texts = [obj.properties.get("text", "[No Text]") for obj in hits]
            return {"answer": texts}
    except Exception as err:
        # Degrade to "no answer" rather than surfacing an exception to the UI.
        print("RAG Error:", err)
        return {"answer": []}
92
+
93
+ # New functions to support Gradio app
94
+ def ingest_file(path: str) -> str:
95
+ global last_uploaded_path
96
+ last_uploaded_path = path
97
+ return f"Old document ingested: {os.path.basename(path)}"
98
+
99
def answer_question(query: str) -> str:
    """Run the RAG pipeline for *query* and format the hits as a bullet list."""
    try:
        response = asyncio.run(old_Document(query))
        hits = response.get("answer", [])
        if not hits:
            return "Sorry, I couldn't find relevant content in the old document."
        return "\n".join(f"- {hit}" for hit in hits)
    except Exception as err:
        # Surface the failure as text so the Gradio textbox still renders.
        return f"Error processing your request: {err}"
109
+
110
# Gradio interface for Old Documents
with gr.Blocks(title="Old Documents RAG") as demo:
    gr.Markdown("## Old Documents RAG")
    query = gr.Textbox(placeholder="Your question...", lines=2, label="Ask about Old Documents")
    doc_file = gr.File(label="Upload Old Document (PDF, DOCX, TXT)")
    btn = gr.Button("Submit")
    out = gr.Textbox(label="Answer from Old Documents", lines=8, interactive=False)

    # Handle one submit: persist a fresh upload (if any), then answer the
    # question via answer_question(); falls back to the last uploaded file.
    def process_old_doc(query, doc_file):
        if doc_file:
            # Save and ingest the uploaded file
            upload_dir = os.path.join(os.path.dirname(__file__), "uploaded_docs")
            os.makedirs(upload_dir, exist_ok=True)
            # basename() strips any client-supplied directory components.
            safe_filename = os.path.basename(doc_file.name)
            save_path = os.path.join(upload_dir, safe_filename)
            # NOTE(review): assumes doc_file is a file-like object exposing
            # .read(); recent Gradio releases pass an uploaded file as a
            # tempfile *path* instead — confirm against the installed
            # gradio version before relying on this branch.
            with open(save_path, "wb") as f:
                f.write(doc_file.read())
            status = ingest_file(save_path)
            answer = answer_question(query)
            return f"{status}\n\n{answer}"
        else:
            # Use last uploaded file or return error if none exists
            if last_uploaded_path:
                answer = answer_question(query)
                return f"[Using previously uploaded document: {os.path.basename(last_uploaded_path)}]\n\n{answer}"
            else:
                return "No document uploaded. Please upload an old document to proceed."

    btn.click(fn=process_old_doc, inputs=[query, doc_file], outputs=out)

# Launch the UI only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch(debug=True)