AYS11231 committed
Commit 7fd4998 · verified · 1 parent: a4cc9cd

Upload folder using huggingface_hub

Files changed (1)
  1. app.py +113 -71
app.py CHANGED
@@ -5,8 +5,9 @@ import os
 import requests
 from pypdf import PdfReader
 import gradio as gr
-import chromadb
 import numpy as np
+import pickle
+import os
 
 load_dotenv(override=True)
 
@@ -105,21 +106,34 @@ class Me:
         self.openai = OpenAI()
         self.name = "Alexandre Saadoun"
 
-        # Initialize Chroma connection
-        self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
+        # Initialize simple vector store
+        self.vector_store_path = "./vector_store.pkl"
+        self.knowledge_base = {"documents": [], "embeddings": [], "metadata": []}
 
         # Initialize RAG system - this will auto-load all files in me/
-        self._setup_chroma_collection()
+        self._setup_vector_store()
         self._populate_initial_data()
 
-    def _setup_chroma_collection(self):
-        """Setup Chroma collection for RAG"""
+    def _setup_vector_store(self):
+        """Setup simple vector store for RAG"""
+        try:
+            if os.path.exists(self.vector_store_path):
+                with open(self.vector_store_path, 'rb') as f:
+                    self.knowledge_base = pickle.load(f)
+                print("✅ Loaded existing knowledge base")
+            else:
+                print("✅ Created new knowledge base")
+        except Exception as e:
+            print(f"Error loading knowledge base: {e}")
+            self.knowledge_base = {"documents": [], "embeddings": [], "metadata": []}
+
+    def _save_vector_store(self):
+        """Save vector store to disk"""
         try:
-            self.collection = self.chroma_client.get_collection(name="knowledge_base")
-            print("✅ Loaded existing knowledge base")
-        except:
-            self.collection = self.chroma_client.create_collection(name="knowledge_base")
-            print("✅ Created new knowledge base")
+            with open(self.vector_store_path, 'wb') as f:
+                pickle.dump(self.knowledge_base, f)
+        except Exception as e:
+            print(f"Error saving knowledge base: {e}")
 
     def _get_embedding(self, text):
         """Get embedding for text using OpenAI"""
@@ -130,9 +144,9 @@ class Me:
         return response.data[0].embedding
 
     def _populate_initial_data(self):
-        """Store initial knowledge in Chroma"""
+        """Store initial knowledge in vector store"""
         # Check if data already exists
-        count = self.collection.count()
+        count = len(self.knowledge_base["documents"])
 
         if count == 0: # Only populate if empty
             print("Auto-loading all files from me/ directory...")
@@ -192,14 +206,20 @@ class Me:
 
         # Clear existing me/ content
         try:
-            # Get all documents from me/ files
-            results = self.collection.get(include=["metadatas"])
-            me_ids = [results["ids"][i] for i, metadata in enumerate(results["metadatas"])
-                      if metadata.get("source", "").startswith("me_")]
+            indices_to_remove = []
+            for i, metadata in enumerate(self.knowledge_base["metadata"]):
+                if metadata.get("source", "").startswith("me_"):
+                    indices_to_remove.append(i)
 
-            if me_ids:
-                self.collection.delete(ids=me_ids)
-                print(f"Cleared {len(me_ids)} existing files from me/")
+            # Remove in reverse order to maintain indices
+            for i in reversed(indices_to_remove):
+                del self.knowledge_base["documents"][i]
+                del self.knowledge_base["embeddings"][i]
+                del self.knowledge_base["metadata"][i]
+
+            if indices_to_remove:
+                print(f"Cleared {len(indices_to_remove)} existing files from me/")
+                self._save_vector_store()
        except Exception as e:
            print(f"Error clearing existing data: {e}")
 
@@ -210,20 +230,40 @@ class Me:
     def _search_knowledge(self, query, limit=3):
         """Search for relevant knowledge using vector similarity"""
         try:
-            results = self.collection.query(
-                query_texts=[query],
-                n_results=limit,
-                include=["documents", "metadatas", "distances"]
-            )
+            if not self.knowledge_base["documents"]:
+                return []
+
+            # Get query embedding
+            query_embedding = self._get_embedding(query)
+            query_vector = np.array(query_embedding)
+
+            # Calculate cosine similarities
+            similarities = []
+            for i, doc_embedding in enumerate(self.knowledge_base["embeddings"]):
+                doc_vector = np.array(doc_embedding)
+
+                # Cosine similarity
+                dot_product = np.dot(query_vector, doc_vector)
+                norm_query = np.linalg.norm(query_vector)
+                norm_doc = np.linalg.norm(doc_vector)
+
+                if norm_query > 0 and norm_doc > 0:
+                    similarity = dot_product / (norm_query * norm_doc)
+                else:
+                    similarity = 0.0
+
+                similarities.append((similarity, i))
+
+            # Sort by similarity and get top results
+            similarities.sort(reverse=True)
 
             search_results = []
-            if results["documents"] and results["documents"][0]:
-                for i, doc in enumerate(results["documents"][0]):
-                    search_results.append({
-                        "content": doc,
-                        "type": results["metadatas"][0][i].get("type", "unknown") if results["metadatas"] else "unknown",
-                        "score": 1 - results["distances"][0][i] if results["distances"] else 1.0
-                    })
+            for similarity, idx in similarities[:limit]:
+                search_results.append({
+                    "content": self.knowledge_base["documents"][idx],
+                    "type": self.knowledge_base["metadata"][idx].get("type", "unknown"),
+                    "score": similarity
+                })
 
             return search_results
         except Exception as e:
@@ -231,18 +271,19 @@ class Me:
             return []
 
     def _store_new_knowledge(self, information, context=""):
-        """Store new information in Chroma"""
+        """Store new information in vector store"""
         try:
-            doc_id = f"conv_{len(self.collection.get()['ids'])}"
-            self.collection.add(
-                documents=[information],
-                metadatas=[{
-                    "type": "conversation",
-                    "context": context,
-                    "timestamp": str(np.datetime64('now'))
-                }],
-                ids=[doc_id]
-            )
+            embedding = self._get_embedding(information)
+
+            self.knowledge_base["documents"].append(information)
+            self.knowledge_base["embeddings"].append(embedding)
+            self.knowledge_base["metadata"].append({
+                "type": "conversation",
+                "context": context,
+                "timestamp": str(np.datetime64('now'))
+            })
+
+            self._save_vector_store()
         except Exception as e:
             print(f"Error storing knowledge: {e}")
 
@@ -268,25 +309,19 @@ class Me:
 
         # Store each chunk
         try:
-            documents = []
-            metadatas = []
-            ids = []
-
             for i, chunk in enumerate(chunks):
-                documents.append(chunk)
-                metadatas.append({
+                embedding = self._get_embedding(chunk)
+
+                self.knowledge_base["documents"].append(chunk)
+                self.knowledge_base["embeddings"].append(embedding)
+                self.knowledge_base["metadata"].append({
                     "type": "text_content",
                     "source": source_name,
                     "chunk_index": i,
                     "timestamp": str(np.datetime64('now'))
                 })
-                ids.append(f"{source_name}_chunk_{i}")
 
-            self.collection.add(
-                documents=documents,
-                metadatas=metadatas,
-                ids=ids
-            )
+            self._save_vector_store()
         except Exception as e:
             print(f"Error storing chunks: {e}")
 
@@ -340,22 +375,31 @@ class Me:
         """
         try:
             if knowledge_type:
-                # Get documents of specific type
-                results = self.collection.get(include=["metadatas"])
-                type_ids = [results["ids"][i] for i, metadata in enumerate(results["metadatas"])
-                            if metadata.get("type") == knowledge_type]
+                # Remove documents of specific type
+                indices_to_remove = []
+                for i, metadata in enumerate(self.knowledge_base["metadata"]):
+                    if metadata.get("type") == knowledge_type:
+                        indices_to_remove.append(i)
+
+                # Remove in reverse order to maintain indices
+                for i in reversed(indices_to_remove):
+                    del self.knowledge_base["documents"][i]
+                    del self.knowledge_base["embeddings"][i]
+                    del self.knowledge_base["metadata"][i]
 
-                if type_ids:
-                    self.collection.delete(ids=type_ids)
-                    print(f"Deleted {len(type_ids)} {knowledge_type} documents")
+                if indices_to_remove:
+                    print(f"Deleted {len(indices_to_remove)} {knowledge_type} documents")
+                    self._save_vector_store()
                 else:
                     print(f"No {knowledge_type} documents found")
             else:
-                # Clear entire collection
-                all_ids = self.collection.get()["ids"]
-                if all_ids:
-                    self.collection.delete(ids=all_ids)
-                    print(f"Deleted {len(all_ids)} documents")
+                # Clear entire knowledge base
+                count = len(self.knowledge_base["documents"])
+                self.knowledge_base = {"documents": [], "embeddings": [], "metadata": []}
+
+                if count > 0:
+                    print(f"Deleted {count} documents")
+                    self._save_vector_store()
                 else:
                     print("No documents to delete")
 
@@ -365,12 +409,10 @@ class Me:
     def get_knowledge_stats(self):
         """Get statistics about the knowledge base"""
         try:
-            results = self.collection.get(include=["metadatas"])
-
             stats = {}
-            total = len(results["ids"])
+            total = len(self.knowledge_base["documents"])
 
-            for metadata in results["metadatas"]:
+            for metadata in self.knowledge_base["metadata"]:
                 doc_type = metadata.get("type", "unknown")
                 stats[doc_type] = stats.get(doc_type, 0) + 1
 
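
Note on the new _search_knowledge: it scores each stored embedding in a Python loop. The same ranking can be expressed as a single matrix operation; the sketch below is illustrative only (rank_by_cosine is not part of this commit) and assumes the stored embeddings are equal-length lists of floats, as in the diff.

# Illustrative sketch only (not part of the commit): the per-document cosine
# loop in _search_knowledge, expressed as one vectorized NumPy operation.
import numpy as np

def rank_by_cosine(query_embedding, embeddings, limit=3):
    """Return (score, index) pairs for the `limit` most similar embeddings."""
    if not embeddings:
        return []
    query = np.asarray(query_embedding, dtype=float)
    matrix = np.asarray(embeddings, dtype=float)             # shape (n_docs, dim)
    denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    denom[denom == 0] = 1.0                                   # zero vectors score 0.0
    scores = matrix @ query / denom                           # cosine similarity per doc
    top = np.argsort(scores)[::-1][:limit]                    # indices of best matches
    return [(float(scores[i]), int(i)) for i in top]

This reproduces the loop's behaviour (zero-norm vectors score 0.0) without rebuilding a NumPy array per document.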
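
One caveat on the pickle-backed persistence: pickle.load runs arbitrary code if the file is untrusted, so vector_store.pkl should only ever be local, trusted state, and an interrupted _save_vector_store can leave a truncated file behind. A minimal sketch of an atomic save, illustrative only (save_atomically is not part of this commit):

# Illustrative sketch only (not part of the commit): write to a temp file in the
# same directory, then rename, so an interrupted save never corrupts the store.
import os
import pickle
import tempfile

def save_atomically(knowledge_base, path="./vector_store.pkl"):
    directory = os.path.dirname(os.path.abspath(path))
    fd, tmp_path = tempfile.mkstemp(dir=directory, suffix=".tmp")
    try:
        with os.fdopen(fd, "wb") as f:
            pickle.dump(knowledge_base, f)
        os.replace(tmp_path, path)   # atomic rename on the same filesystem
    except Exception:
        os.unlink(tmp_path)
        raise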