bluenevus committed on
Commit
e33167d
·
1 Parent(s): 3858bb7

Update app.py via AI Editor

Browse files
Files changed (1) hide show
  1. app.py +49 -76
app.py CHANGED
@@ -13,11 +13,6 @@ import openai
13
  import base64
14
  import datetime
15
  from werkzeug.utils import secure_filename
16
- import chromadb
17
- from chromadb.config import Settings
18
- from langchain.embeddings.openai import OpenAIEmbeddings
19
- from langchain_community.vectorstores import Chroma
20
- from langchain.text_splitter import RecursiveCharacterTextSplitter
21
 
22
  logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(threadName)s %(message)s")
23
  logger = logging.getLogger("AskTricare")
@@ -27,64 +22,9 @@ SESSION_DATA = {}
27
  SESSION_LOCKS = {}
28
  SESSION_DIR_BASE = os.path.join(tempfile.gettempdir(), "asktricare_sessions")
29
  os.makedirs(SESSION_DIR_BASE, exist_ok=True)
30
- VECTOR_DB_DIR = os.path.join(os.getcwd(), "vector_db")
31
- DOCS_DIR = os.path.join(os.getcwd(), "doc")
32
- os.makedirs(DOCS_DIR, exist_ok=True)
33
- os.makedirs(VECTOR_DB_DIR, exist_ok=True)
34
 
35
  openai.api_key = os.environ.get("OPENAI_API_KEY")
36
 
37
- chroma_client = chromadb.Client(Settings(
38
- chroma_db_impl="duckdb+parquet",
39
- persist_directory=VECTOR_DB_DIR,
40
- ))
41
- embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", openai_api_key=openai.api_key)
42
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
43
-
44
- def ingest_docs():
45
- logger.info("Starting document ingestion...")
46
- file_paths = []
47
- for root, _, files in os.walk(DOCS_DIR):
48
- for f in files:
49
- if f.lower().endswith(('.txt', '.pdf', '.md', '.docx')):
50
- file_paths.append(os.path.join(root, f))
51
- documents = []
52
- metadatas = []
53
- ids = []
54
- for path in file_paths:
55
- try:
56
- with open(path, "r", encoding="utf-8", errors="ignore") as infile:
57
- content = infile.read()
58
- chunks = text_splitter.split_text(content)
59
- for idx, chunk in enumerate(chunks):
60
- documents.append(chunk)
61
- metadatas.append({"source": path, "chunk": idx})
62
- ids.append(f"{os.path.basename(path)}_{idx}")
63
- except Exception as e:
64
- logger.error(f"Error ingesting {path}: {e}")
65
- if documents:
66
- vectordb = Chroma(
67
- collection_name="asktricare",
68
- embedding_function=embeddings,
69
- persist_directory=VECTOR_DB_DIR,
70
- client_settings=Settings(chroma_db_impl="duckdb+parquet", persist_directory=VECTOR_DB_DIR),
71
- )
72
- vectordb.add_texts(documents, metadatas=metadatas, ids=ids)
73
- vectordb.persist()
74
- logger.info(f"Ingested {len(documents)} chunks from {len(file_paths)} files.")
75
- else:
76
- logger.info("No new documents to ingest.")
77
-
78
- if not os.listdir(VECTOR_DB_DIR):
79
- ingest_docs()
80
-
81
- vectordb = Chroma(
82
- collection_name="asktricare",
83
- embedding_function=embeddings,
84
- persist_directory=VECTOR_DB_DIR,
85
- client_settings=Settings(chroma_db_impl="duckdb+parquet", persist_directory=VECTOR_DB_DIR),
86
- )
87
-
88
  def get_session_id():
89
  sid = flask_request.cookies.get("asktricare_session_id")
90
  if not sid:
@@ -106,6 +46,7 @@ def get_session_state(session_id):
106
  SESSION_DATA[session_id] = {
107
  "messages": [],
108
  "uploads": [],
 
109
  "created": datetime.datetime.utcnow().isoformat()
110
  }
111
  return SESSION_DATA[session_id]
@@ -233,6 +174,27 @@ app.layout = html.Div([
233
  ], style={"display": "flex"})
234
  ])
235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  @app.callback(
237
  Output("session-id", "data"),
238
  Input("url", "href"),
@@ -243,7 +205,6 @@ def assign_session_id(_):
243
  d = get_session_dir(sid)
244
  load_session_state(sid)
245
  logger.info(f"Assigned session id: {sid}")
246
- resp = dash.no_update
247
  return sid
248
 
249
  @app.callback(
@@ -284,6 +245,13 @@ def main_callback(session_id, send_clicks, file_contents, file_names, user_input
284
  with open(fp, "wb") as f:
285
  f.write(base64.b64decode(data))
286
  uploads.append({"name": fname, "is_img": is_img, "path": fp})
 
 
 
 
 
 
 
287
  state["uploads"].extend(uploads)
288
  save_session_state(session_id)
289
  logger.info(f"Session {session_id}: Uploaded files {[u['name'] for u in uploads]}")
@@ -292,27 +260,32 @@ def main_callback(session_id, send_clicks, file_contents, file_names, user_input
292
  loading = True
293
  state["messages"].append({"role": "user", "content": user_input})
294
  try:
295
- docs = []
296
- try:
297
- retr = vectordb.similarity_search(user_input, k=3)
298
- docs = [d.page_content for d in retr]
299
- except Exception as e:
300
- logger.warning(f"Vector search failed: {e}")
301
- context = "\n\n".join(docs)
302
  system_prompt = load_system_prompt()
303
  messages = [
304
  {"role": "system", "content": system_prompt},
305
  ]
306
  for m in state["messages"]:
307
  messages.append({"role": m["role"], "content": m["content"]})
308
- if context.strip():
309
- messages.append({"role": "system", "content": f"Relevant reference material:\n{context}"})
310
- response = openai.ChatCompletion.create(
311
- model="gpt-3.5-turbo",
312
- messages=messages,
313
- max_tokens=700,
314
- temperature=0.2,
315
- )
 
 
 
 
 
 
 
 
 
 
316
  reply = response.choices[0].message.content
317
  state["messages"].append({"role": "assistant", "content": reply})
318
  logger.info(f"Session {session_id}: User: {user_input} | Assistant: {reply}")
 
13
  import base64
14
  import datetime
15
  from werkzeug.utils import secure_filename
 
 
 
 
 
16
 
17
  logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(threadName)s %(message)s")
18
  logger = logging.getLogger("AskTricare")
 
22
  SESSION_LOCKS = {}
23
  SESSION_DIR_BASE = os.path.join(tempfile.gettempdir(), "asktricare_sessions")
24
  os.makedirs(SESSION_DIR_BASE, exist_ok=True)
 
 
 
 
25
 
26
  openai.api_key = os.environ.get("OPENAI_API_KEY")
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def get_session_id():
29
  sid = flask_request.cookies.get("asktricare_session_id")
30
  if not sid:
 
46
  SESSION_DATA[session_id] = {
47
  "messages": [],
48
  "uploads": [],
49
+ "openai_file_ids": [],
50
  "created": datetime.datetime.utcnow().isoformat()
51
  }
52
  return SESSION_DATA[session_id]
 
174
  ], style={"display": "flex"})
175
  ])
176
 
177
+ def _upload_file_to_openai(file_path, purpose="assistants"):
178
+ try:
179
+ with open(file_path, 'rb') as f:
180
+ res = openai.File.create(
181
+ file=f,
182
+ purpose=purpose
183
+ )
184
+ logger.info(f"Uploaded file to OpenAI: {res.id}")
185
+ return res.id
186
+ except Exception as e:
187
+ logger.error(f"Failed to upload file to OpenAI: {e}")
188
+ return None
189
+
190
+ def _get_openai_file_ids(session_state):
191
+ return session_state.get("openai_file_ids", [])
192
+
193
+ def _is_supported_doc(filename):
194
+ ext = os.path.splitext(filename)[1].lower()
195
+ # OpenAI supports: txt, pdf, docx, md for assistants file search
196
+ return ext in [".txt", ".pdf", ".md", ".docx"]
197
+
198
  @app.callback(
199
  Output("session-id", "data"),
200
  Input("url", "href"),
 
205
  d = get_session_dir(sid)
206
  load_session_state(sid)
207
  logger.info(f"Assigned session id: {sid}")
 
208
  return sid
209
 
210
  @app.callback(
 
245
  with open(fp, "wb") as f:
246
  f.write(base64.b64decode(data))
247
  uploads.append({"name": fname, "is_img": is_img, "path": fp})
248
+ # If document is supported, upload to OpenAI
249
+ if _is_supported_doc(fname):
250
+ file_id = _upload_file_to_openai(fp)
251
+ if file_id:
252
+ if "openai_file_ids" not in state:
253
+ state["openai_file_ids"] = []
254
+ state["openai_file_ids"].append(file_id)
255
  state["uploads"].extend(uploads)
256
  save_session_state(session_id)
257
  logger.info(f"Session {session_id}: Uploaded files {[u['name'] for u in uploads]}")
 
260
  loading = True
261
  state["messages"].append({"role": "user", "content": user_input})
262
  try:
263
+ # Use OpenAI's file search tool via ChatCompletion if files exist
264
+ file_ids = _get_openai_file_ids(state)
 
 
 
 
 
265
  system_prompt = load_system_prompt()
266
  messages = [
267
  {"role": "system", "content": system_prompt},
268
  ]
269
  for m in state["messages"]:
270
  messages.append({"role": m["role"], "content": m["content"]})
271
+ if file_ids:
272
+ # Use 'tools' for file_search (RAG) if supported
273
+ response = openai.ChatCompletion.create(
274
+ model="gpt-3.5-turbo-1106",
275
+ messages=messages,
276
+ tools=[{"type": "file_search"}],
277
+ tool_choice="file_search",
278
+ file_ids=file_ids,
279
+ max_tokens=700,
280
+ temperature=0.2,
281
+ )
282
+ else:
283
+ response = openai.ChatCompletion.create(
284
+ model="gpt-3.5-turbo",
285
+ messages=messages,
286
+ max_tokens=700,
287
+ temperature=0.2,
288
+ )
289
  reply = response.choices[0].message.content
290
  state["messages"].append({"role": "assistant", "content": reply})
291
  logger.info(f"Session {session_id}: User: {user_input} | Assistant: {reply}")