AYS11231 committed
Commit a4cc9cd · verified · Parent(s): 12ec789

Upload folder using huggingface_hub

Files changed (1): app.py +121 -104
app.py CHANGED
@@ -5,7 +5,7 @@ import os
 import requests
 from pypdf import PdfReader
 import gradio as gr
-from neo4j import GraphDatabase
+import chromadb
 import numpy as np
 
 load_dotenv(override=True)
@@ -105,31 +105,21 @@ class Me:
         self.openai = OpenAI()
         self.name = "Alexandre Saadoun"
 
-        # Initialize Neo4j connection
-        self.neo4j_driver = GraphDatabase.driver(
-            os.getenv("NEO4J_URI", "bolt://localhost:7687"),
-            auth=(os.getenv("NEO4J_USER", "neo4j"), os.getenv("NEO4J_PASSWORD", "password"))
-        )
+        # Initialize Chroma connection
+        self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
 
         # Initialize RAG system - this will auto-load all files in me/
-        self._setup_neo4j_schema()
+        self._setup_chroma_collection()
         self._populate_initial_data()
 
-    def _setup_neo4j_schema(self):
-        """Setup Neo4j schema for RAG"""
-        with self.neo4j_driver.session() as session:
-            # Create vector index for embeddings
-            try:
-                session.run("""
-                    CREATE VECTOR INDEX knowledge_embeddings IF NOT EXISTS
-                    FOR (n:Knowledge) ON (n.embedding)
-                    OPTIONS {indexConfig: {
-                        `vector.dimensions`: 1536,
-                        `vector.similarity_function`: 'cosine'
-                    }}
-                """)
-            except Exception as e:
-                print(f"Index might already exist: {e}")
+    def _setup_chroma_collection(self):
+        """Setup Chroma collection for RAG"""
+        try:
+            self.collection = self.chroma_client.get_collection(name="knowledge_base")
+            print("✅ Loaded existing knowledge base")
+        except:
+            self.collection = self.chroma_client.create_collection(name="knowledge_base")
+            print("✅ Created new knowledge base")
 
     def _get_embedding(self, text):
         """Get embedding for text using OpenAI"""
@@ -140,15 +130,13 @@ class Me:
         return response.data[0].embedding
 
     def _populate_initial_data(self):
-        """Store initial knowledge in Neo4j"""
-        with self.neo4j_driver.session() as session:
-            # Check if data already exists
-            result = session.run("MATCH (n:Knowledge) RETURN count(n) as count")
-            count = result.single()["count"]
-
-            if count == 0: # Only populate if empty
-                print("Auto-loading all files from me/ directory...")
-                self._auto_load_me_directory()
+        """Store initial knowledge in Chroma"""
+        # Check if data already exists
+        count = self.collection.count()
+
+        if count == 0: # Only populate if empty
+            print("Auto-loading all files from me/ directory...")
+            self._auto_load_me_directory()
 
     def _auto_load_me_directory(self):
         """Automatically load and process all files in the me/ directory"""
@@ -203,16 +191,17 @@ class Me:
         print("Reloading me/ directory...")
 
         # Clear existing me/ content
-        with self.neo4j_driver.session() as session:
-            result = session.run("""
-                MATCH (n:Knowledge)
-                WHERE n.source STARTS WITH 'me_'
-                DELETE n
-                RETURN count(n) as deleted
-            """)
-            deleted = result.single()["deleted"]
-            if deleted > 0:
-                print(f"Cleared {deleted} existing files from me/")
+        try:
+            # Get all documents from me/ files
+            results = self.collection.get(include=["metadatas"])
+            me_ids = [results["ids"][i] for i, metadata in enumerate(results["metadatas"])
+                      if metadata.get("source", "").startswith("me_")]
+
+            if me_ids:
+                self.collection.delete(ids=me_ids)
+                print(f"Cleared {len(me_ids)} existing files from me/")
+        except Exception as e:
+            print(f"Error clearing existing data: {e}")
 
         # Reload everything
        self._auto_load_me_directory()
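The client-side filter here does prefix matching on `source`, which Chroma's `where` clauses do not support; `where` handles exact matches and operators such as `$in`, so pulling only metadatas and filtering in Python is a reasonable workaround. For exact-match cleanup, a server-side delete is more direct. A hedged sketch, with an invented source value:

```python
import chromadb

# Sketch: exact-match metadata deletes can skip the client-side scan.
# "me_resume.pdf" is a hypothetical source value, not from the commit.
collection = chromadb.PersistentClient(path="./chroma_db").get_or_create_collection(name="knowledge_base")
collection.delete(where={"source": "me_resume.pdf"})  # exact match only
```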
@@ -220,33 +209,42 @@ class Me:
 
     def _search_knowledge(self, query, limit=3):
         """Search for relevant knowledge using vector similarity"""
-        query_embedding = self._get_embedding(query)
-
-        with self.neo4j_driver.session() as session:
-            result = session.run("""
-                CALL db.index.vector.queryNodes('knowledge_embeddings', $limit, $query_embedding)
-                YIELD node, score
-                RETURN node.content as content, node.type as type, score
-                ORDER BY score DESC
-            """, query_embedding=query_embedding, limit=limit)
-
-            return [{"content": record["content"], "type": record["type"], "score": record["score"]}
-                    for record in result]
+        try:
+            results = self.collection.query(
+                query_texts=[query],
+                n_results=limit,
+                include=["documents", "metadatas", "distances"]
+            )
+
+            search_results = []
+            if results["documents"] and results["documents"][0]:
+                for i, doc in enumerate(results["documents"][0]):
+                    search_results.append({
+                        "content": doc,
+                        "type": results["metadatas"][0][i].get("type", "unknown") if results["metadatas"] else "unknown",
+                        "score": 1 - results["distances"][0][i] if results["distances"] else 1.0
+                    })
+
+            return search_results
+        except Exception as e:
+            print(f"Search error: {e}")
+            return []
 
     def _store_new_knowledge(self, information, context=""):
-        """Store new information in Neo4j"""
-        embedding = self._get_embedding(information)
-
-        with self.neo4j_driver.session() as session:
-            session.run("""
-                CREATE (n:Knowledge {
-                    content: $content,
-                    type: 'conversation',
-                    context: $context,
-                    embedding: $embedding,
-                    timestamp: datetime()
-                })
-            """, content=information, context=context, embedding=embedding)
+        """Store new information in Chroma"""
+        try:
+            doc_id = f"conv_{len(self.collection.get()['ids'])}"
+            self.collection.add(
+                documents=[information],
+                metadatas=[{
+                    "type": "conversation",
+                    "context": context,
+                    "timestamp": str(np.datetime64('now'))
+                }],
+                ids=[doc_id]
+            )
+        except Exception as e:
+            print(f"Error storing knowledge: {e}")
 
     def bulk_load_text_content(self, text_content, source_name="raw_text", chunk_size=800):
         """
@@ -269,24 +267,28 @@ class Me:
         print(f"Created {len(chunks)} chunks")
 
         # Store each chunk
-        with self.neo4j_driver.session() as session:
+        try:
+            documents = []
+            metadatas = []
+            ids = []
+
             for i, chunk in enumerate(chunks):
-                embedding = self._get_embedding(chunk)
-
-                session.run("""
-                    CREATE (n:Knowledge {
-                        content: $content,
-                        type: 'text_content',
-                        source: $source,
-                        chunk_index: $chunk_index,
-                        embedding: $embedding,
-                        timestamp: datetime()
-                    })
-                """,
-                    content=chunk,
-                    source=source_name,
-                    chunk_index=i,
-                    embedding=embedding)
+                documents.append(chunk)
+                metadatas.append({
+                    "type": "text_content",
+                    "source": source_name,
+                    "chunk_index": i,
+                    "timestamp": str(np.datetime64('now'))
+                })
+                ids.append(f"{source_name}_chunk_{i}")
+
+            self.collection.add(
+                documents=documents,
+                metadatas=metadatas,
+                ids=ids
+            )
+        except Exception as e:
+            print(f"Error storing chunks: {e}")
 
         print(f"Loaded {len(chunks)} chunks from {source_name}")
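Batching documents, metadatas, and ids into one `add` call, as this hunk does, lets Chroma embed the whole batch at once. A hypothetical usage sketch of the loader (the file name and instance are invented; `Me()` assumes the `.env` configuration, including OPENAI_API_KEY, is in place):

```python
# Hypothetical usage: chunk and index a raw text file in one batch.
me = Me()
with open("notes.txt") as f:  # hypothetical file
    me.bulk_load_text_content(f.read(), source_name="notes", chunk_size=800)
me.get_knowledge_stats()
```

Because the IDs are `{source_name}_chunk_{i}`, re-running the loader on the same source collides with existing IDs; `collection.upsert` would be the idempotent alternative.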
 
@@ -334,38 +336,53 @@ class Me:
         Clear all or specific type of knowledge from the database
 
         Args:
-            knowledge_type: If specified, only delete nodes of this type
+            knowledge_type: If specified, only delete documents of this type
         """
-        with self.neo4j_driver.session() as session:
+        try:
             if knowledge_type:
-                result = session.run("MATCH (n:Knowledge {type: $type}) DELETE n RETURN count(n) as deleted",
-                                     type=knowledge_type)
+                # Get documents of specific type
+                results = self.collection.get(include=["metadatas"])
+                type_ids = [results["ids"][i] for i, metadata in enumerate(results["metadatas"])
+                            if metadata.get("type") == knowledge_type]
+
+                if type_ids:
+                    self.collection.delete(ids=type_ids)
+                    print(f"Deleted {len(type_ids)} {knowledge_type} documents")
+                else:
+                    print(f"No {knowledge_type} documents found")
             else:
-                result = session.run("MATCH (n:Knowledge) DELETE n RETURN count(n) as deleted")
-
-            deleted_count = result.single()["deleted"]
-            print(f"Deleted {deleted_count} knowledge nodes")
+                # Clear entire collection
+                all_ids = self.collection.get()["ids"]
+                if all_ids:
+                    self.collection.delete(ids=all_ids)
+                    print(f"Deleted {len(all_ids)} documents")
+                else:
+                    print("No documents to delete")
+
+        except Exception as e:
+            print(f"Error clearing knowledge base: {e}")
 
     def get_knowledge_stats(self):
         """Get statistics about the knowledge base"""
-        with self.neo4j_driver.session() as session:
-            result = session.run("""
-                MATCH (n:Knowledge)
-                RETURN n.type as type, count(n) as count
-                ORDER BY count DESC
-            """)
+        try:
+            results = self.collection.get(include=["metadatas"])
 
             stats = {}
-            total = 0
-            for record in result:
-                stats[record["type"]] = record["count"]
-                total += record["count"]
+            total = len(results["ids"])
+
+            for metadata in results["metadatas"]:
+                doc_type = metadata.get("type", "unknown")
+                stats[doc_type] = stats.get(doc_type, 0) + 1
 
             print(f"Knowledge Base Stats (Total: {total} documents):")
-            for doc_type, count in stats.items():
+            for doc_type, count in sorted(stats.items(), key=lambda x: x[1], reverse=True):
                 print(f" {doc_type}: {count}")
 
             return stats
+
+        except Exception as e:
+            print(f"Error getting stats: {e}")
+            return {}
 
     def handle_tool_call(self, tool_calls):
         results = []
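Both methods in this hunk pull every record with `get()` and aggregate client-side, which is fine at this scale. For the type-scoped delete, Chroma can also filter server-side with an exact-match `where`, avoiding materializing all metadatas. A sketch under the same metadata schema the commit writes (the `"type"` key):

```python
import chromadb

# Sketch: server-side exact-match filtering for a typed delete.
collection = chromadb.PersistentClient(path="./chroma_db").get_or_create_collection(name="knowledge_base")
matches = collection.get(where={"type": "conversation"})
if matches["ids"]:
    collection.delete(ids=matches["ids"])
print(f"Remaining documents: {collection.count()}")
```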
@@ -420,9 +437,9 @@ If you learn new relevant information during conversations, use the store_conver
         return response.choices[0].message.content
 
     def __del__(self):
-        """Close Neo4j connection"""
-        if hasattr(self, 'neo4j_driver'):
-            self.neo4j_driver.close()
+        """Clean up Chroma connection"""
+        # Chroma client doesn't need explicit closing
+        pass
 
 
 if __name__ == "__main__":
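The no-op `__del__` is consistent with how `PersistentClient` behaves: writes go to the on-disk path as they happen, so there is no connection to close, unlike the Neo4j driver it replaces. A small sketch of that persistence assumption:

```python
import chromadb

# Sketch: data written through a PersistentClient survives restarts;
# a fresh process sees the same count without any explicit close/flush.
client = chromadb.PersistentClient(path="./chroma_db")
print(client.get_or_create_collection(name="knowledge_base").count())
```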
 