Spaces:

AYS11231
/

BioChat2

Sleeping

App Files Files Community

AYS11231 committed 15 days ago

Commit

a4cc9cd

verified ·

1 Parent(s): 12ec789

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +121 -104

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import os
 import requests
 from pypdf import PdfReader
 import gradio as gr
-from neo4j import GraphDatabase
 import numpy as np
 load_dotenv(override=True)
@@ -105,31 +105,21 @@ class Me:
         self.openai = OpenAI()
         self.name = "Alexandre Saadoun"
-        # Initialize Neo4j connection
-        self.neo4j_driver = GraphDatabase.driver(
-            os.getenv("NEO4J_URI", "bolt://localhost:7687"),
-            auth=(os.getenv("NEO4J_USER", "neo4j"), os.getenv("NEO4J_PASSWORD", "password"))
-        )
         # Initialize RAG system - this will auto-load all files in me/
-        self._setup_neo4j_schema()
         self._populate_initial_data()
-    def _setup_neo4j_schema(self):
-        """Setup Neo4j schema for RAG"""
-        with self.neo4j_driver.session() as session:
-            # Create vector index for embeddings
-            try:
-                session.run("""
-                    CREATE VECTOR INDEX knowledge_embeddings IF NOT EXISTS
-                    FOR (n:Knowledge) ON (n.embedding)
-                    OPTIONS {indexConfig: {
-                        `vector.dimensions`: 1536,
-                        `vector.similarity_function`: 'cosine'
-                    }}
-                """)
-            except Exception as e:
-                print(f"Index might already exist: {e}")
     def _get_embedding(self, text):
         """Get embedding for text using OpenAI"""
@@ -140,15 +130,13 @@ class Me:
         return response.data[0].embedding
     def _populate_initial_data(self):
-        """Store initial knowledge in Neo4j"""
-        with self.neo4j_driver.session() as session:
-            # Check if data already exists
-            result = session.run("MATCH (n:Knowledge) RETURN count(n) as count")
-            count = result.single()["count"]
-            if count == 0:  # Only populate if empty
-                print("Auto-loading all files from me/ directory...")
-                self._auto_load_me_directory()
     def _auto_load_me_directory(self):
         """Automatically load and process all files in the me/ directory"""
@@ -203,16 +191,17 @@ class Me:
         print("Reloading me/ directory...")
         # Clear existing me/ content
-        with self.neo4j_driver.session() as session:
-            result = session.run("""
-                MATCH (n:Knowledge)
-                WHERE n.source STARTS WITH 'me_'
-                DELETE n
-                RETURN count(n) as deleted
-            """)
-            deleted = result.single()["deleted"]
-            if deleted > 0:
-                print(f"Cleared {deleted} existing files from me/")
         # Reload everything
         self._auto_load_me_directory()
@@ -220,33 +209,42 @@ class Me:
     def _search_knowledge(self, query, limit=3):
         """Search for relevant knowledge using vector similarity"""
-        query_embedding = self._get_embedding(query)
-        with self.neo4j_driver.session() as session:
-            result = session.run("""
-                CALL db.index.vector.queryNodes('knowledge_embeddings', $limit, $query_embedding)
-                YIELD node, score
-                RETURN node.content as content, node.type as type, score
-                ORDER BY score DESC
-            """, query_embedding=query_embedding, limit=limit)
-            return [{"content": record["content"], "type": record["type"], "score": record["score"]}
-                   for record in result]
     def _store_new_knowledge(self, information, context=""):
-        """Store new information in Neo4j"""
-        embedding = self._get_embedding(information)
-        with self.neo4j_driver.session() as session:
-            session.run("""
-                CREATE (n:Knowledge {
-                    content: $content,
-                    type: 'conversation',
-                    context: $context,
-                    embedding: $embedding,
-                    timestamp: datetime()
-                })
-            """, content=information, context=context, embedding=embedding)
     def bulk_load_text_content(self, text_content, source_name="raw_text", chunk_size=800):
         """
@@ -269,24 +267,28 @@ class Me:
         print(f"Created {len(chunks)} chunks")
         # Store each chunk
-        with self.neo4j_driver.session() as session:
             for i, chunk in enumerate(chunks):
-                embedding = self._get_embedding(chunk)
-                session.run("""
-                    CREATE (n:Knowledge {
-                        content: $content,
-                        type: 'text_content',
-                        source: $source,
-                        chunk_index: $chunk_index,
-                        embedding: $embedding,
-                        timestamp: datetime()
-                    })
-                """,
-                content=chunk,
-                source=source_name,
-                chunk_index=i,
-                embedding=embedding)
         print(f"Loaded {len(chunks)} chunks from {source_name}")
@@ -334,38 +336,53 @@ class Me:
         Clear all or specific type of knowledge from the database
         Args:
-            knowledge_type: If specified, only delete nodes of this type
         """
-        with self.neo4j_driver.session() as session:
             if knowledge_type:
-                result = session.run("MATCH (n:Knowledge {type: $type}) DELETE n RETURN count(n) as deleted",
-                                   type=knowledge_type)
             else:
-                result = session.run("MATCH (n:Knowledge) DELETE n RETURN count(n) as deleted")
-            deleted_count = result.single()["deleted"]
-            print(f"Deleted {deleted_count} knowledge nodes")
     def get_knowledge_stats(self):
         """Get statistics about the knowledge base"""
-        with self.neo4j_driver.session() as session:
-            result = session.run("""
-                MATCH (n:Knowledge)
-                RETURN n.type as type, count(n) as count
-                ORDER BY count DESC
-            """)
             stats = {}
-            total = 0
-            for record in result:
-                stats[record["type"]] = record["count"]
-                total += record["count"]
             print(f"Knowledge Base Stats (Total: {total} documents):")
-            for doc_type, count in stats.items():
                 print(f"  {doc_type}: {count}")
             return stats
     def handle_tool_call(self, tool_calls):
         results = []
@@ -420,9 +437,9 @@ If you learn new relevant information during conversations, use the store_conver
         return response.choices[0].message.content
     def __del__(self):
-        """Close Neo4j connection"""
-        if hasattr(self, 'neo4j_driver'):
-            self.neo4j_driver.close()
 if __name__ == "__main__":

 import requests
 from pypdf import PdfReader
 import gradio as gr
+import chromadb
 import numpy as np
 load_dotenv(override=True)
         self.openai = OpenAI()
         self.name = "Alexandre Saadoun"
+        # Initialize Chroma connection
+        self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
         # Initialize RAG system - this will auto-load all files in me/
+        self._setup_chroma_collection()
         self._populate_initial_data()
+    def _setup_chroma_collection(self):
+        """Setup Chroma collection for RAG"""
+        try:
+            self.collection = self.chroma_client.get_collection(name="knowledge_base")
+            print("✅ Loaded existing knowledge base")
+        except:
+            self.collection = self.chroma_client.create_collection(name="knowledge_base")
+            print("✅ Created new knowledge base")
     def _get_embedding(self, text):
         """Get embedding for text using OpenAI"""
         return response.data[0].embedding
     def _populate_initial_data(self):
+        """Store initial knowledge in Chroma"""
         # Seeds the collection only when it reports zero stored documents; a
         # non-empty collection is left untouched and nothing is reloaded here.
+        # Check if data already exists
+        count = self.collection.count()
+        if count == 0:  # Only populate if empty
+            print("Auto-loading all files from me/ directory...")
+            self._auto_load_me_directory()
     def _auto_load_me_directory(self):
         """Automatically load and process all files in the me/ directory"""
         print("Reloading me/ directory...")
         # Clear existing me/ content
+        try:
+            # Get all documents from me/ files
+            results = self.collection.get(include=["metadatas"])
+            me_ids = [results["ids"][i] for i, metadata in enumerate(results["metadatas"])
+                     if metadata.get("source", "").startswith("me_")]
+            if me_ids:
+                self.collection.delete(ids=me_ids)
+                print(f"Cleared {len(me_ids)} existing files from me/")
+        except Exception as e:
+            print(f"Error clearing existing data: {e}")
         # Reload everything
         self._auto_load_me_directory()
     def _search_knowledge(self, query, limit=3):
         """Search for relevant knowledge using vector similarity"""
         # Returns a list of {"content", "type", "score"} dicts, at most `limit`
         # entries as requested via n_results; any exception is printed and turned
         # into an empty list.
+        try:
         # NOTE(review): the raw text goes in through `query_texts`, so embedding
         # is left to the collection rather than to self._get_embedding - confirm
         # that this is intended.
+            results = self.collection.query(
+                query_texts=[query],
+                n_results=limit,
+                include=["documents", "metadatas", "distances"]
+            )
+            search_results = []
         # Exactly one query text was sent, so index 0 selects that query's hits.
+            if results["documents"] and results["documents"][0]:
+                for i, doc in enumerate(results["documents"][0]):
+                    search_results.append({
+                        "content": doc,
+                        "type": results["metadatas"][0][i].get("type", "unknown") if results["metadatas"] else "unknown",
         # NOTE(review): `1 - distance` only reads as a 0..1 similarity if the
         # collection's distance metric is bounded by 1 - confirm the metric in use.
+                        "score": 1 - results["distances"][0][i] if results["distances"] else 1.0
+                    })
+            return search_results
+        except Exception as e:
+            print(f"Search error: {e}")
+            return []
     def _store_new_knowledge(self, information, context=""):
+        """Store new information in Chroma"""
+        try:
+            doc_id = f"conv_{len(self.collection.get()['ids'])}"
+            self.collection.add(
+                documents=[information],
+                metadatas=[{
+                    "type": "conversation",
+                    "context": context,
+                    "timestamp": str(np.datetime64('now'))
+                }],
+                ids=[doc_id]
+            )
+        except Exception as e:
+            print(f"Error storing knowledge: {e}")
     def bulk_load_text_content(self, text_content, source_name="raw_text", chunk_size=800):
         """
         print(f"Created {len(chunks)} chunks")
         # Store each chunk
+        try:
+            documents = []
+            metadatas = []
+            ids = []
             for i, chunk in enumerate(chunks):
+                documents.append(chunk)
+                metadatas.append({
+                    "type": "text_content",
+                    "source": source_name,
+                    "chunk_index": i,
+                    "timestamp": str(np.datetime64('now'))
+                })
+                ids.append(f"{source_name}_chunk_{i}")
+            self.collection.add(
+                documents=documents,
+                metadatas=metadatas,
+                ids=ids
+            )
+        except Exception as e:
+            print(f"Error storing chunks: {e}")
         print(f"Loaded {len(chunks)} chunks from {source_name}")
         Clear all or specific type of knowledge from the database
         Args:
+            knowledge_type: If specified, only delete documents of this type
         """
+        try:
             if knowledge_type:
+                # Get documents of specific type
+                results = self.collection.get(include=["metadatas"])
+                type_ids = [results["ids"][i] for i, metadata in enumerate(results["metadatas"])
+                           if metadata.get("type") == knowledge_type]
+                if type_ids:
+                    self.collection.delete(ids=type_ids)
+                    print(f"Deleted {len(type_ids)} {knowledge_type} documents")
+                else:
+                    print(f"No {knowledge_type} documents found")
             else:
+                # Clear entire collection
+                all_ids = self.collection.get()["ids"]
+                if all_ids:
+                    self.collection.delete(ids=all_ids)
+                    print(f"Deleted {len(all_ids)} documents")
+                else:
+                    print("No documents to delete")
+        except Exception as e:
+            print(f"Error clearing knowledge base: {e}")
     def get_knowledge_stats(self):
         """Get statistics about the knowledge base"""
         # Returns a dict mapping each metadata "type" value to its document count,
         # or an empty dict if anything below raises.
+        try:
         # Only metadata is requested; document text is not needed for counting.
+            results = self.collection.get(include=["metadatas"])
             stats = {}
+            total = len(results["ids"])
+            for metadata in results["metadatas"]:
         # Documents without a "type" key are counted under "unknown".
+                doc_type = metadata.get("type", "unknown")
+                stats[doc_type] = stats.get(doc_type, 0) + 1
             print(f"Knowledge Base Stats (Total: {total} documents):")
         # Sorting affects the printed order only; the returned dict is not reordered.
+            for doc_type, count in sorted(stats.items(), key=lambda x: x[1], reverse=True):
                 print(f"  {doc_type}: {count}")
             return stats
+        except Exception as e:
+            print(f"Error getting stats: {e}")
+            return {}
     def handle_tool_call(self, tool_calls):
         results = []
         return response.choices[0].message.content
     def __del__(self):
+        """Clean up Chroma connection"""
         # Intentionally a no-op: nothing is released or closed here.
+        # Chroma client doesn't need explicit closing
+        pass
 if __name__ == "__main__":