AYS11231 committed on
Commit
839e960
·
verified ·
1 Parent(s): 0af0679

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. app.py +431 -0
  2. bulk_loader_script.py +1 -1
app.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ from openai import OpenAI
3
+ import json
4
+ import os
5
+ import requests
6
+ from pypdf import PdfReader
7
+ import gradio as gr
8
+ import neo4j
9
+ from neo4j import GraphDatabase
10
+ import numpy as np
11
+
12
+ load_dotenv(override=True)
13
+
14
def push(text):
    """Send ``text`` as a Pushover notification.

    The API token and user key are read from the ``PUSHOVER_TOKEN`` and
    ``PUSHOVER_USER`` environment variables.

    Args:
        text: Message body to deliver.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    requests.post(
        "https://api.pushover.net/1/messages.json",
        data={
            "token": os.getenv("PUSHOVER_TOKEN"),
            "user": os.getenv("PUSHOVER_USER"),
            "message": text,
        },
        # Without a timeout, requests waits forever on a stalled connection,
        # which would hang the chat turn that triggered the notification.
        timeout=10,
    )
23
+
24
+
25
def record_user_details(email, name="Name not provided", notes="not provided"):
    """Notify the site owner that a visitor left contact details.

    Args:
        email: The visitor's email address.
        name: The visitor's name, when supplied.
        notes: Extra context about the conversation.

    Returns:
        A dict acknowledging that the details were recorded.
    """
    notification = f"Recording {name} with email {email} and notes {notes}"
    push(notification)
    acknowledgement = {"recorded": "ok"}
    return acknowledgement
28
+
29
def record_unknown_question(question):
    """Notify the site owner about a question that could not be answered.

    Args:
        question: The question text to forward.

    Returns:
        A dict acknowledging that the question was recorded.
    """
    notification = f"Recording {question}"
    push(notification)
    acknowledgement = {"recorded": "ok"}
    return acknowledgement
32
+
33
def store_conversation_info(information, context=""):
    """Acknowledge a piece of information learned during a conversation.

    This module-level function only echoes the information back; ``context``
    is accepted for signature compatibility and is not used here.

    Args:
        information: The information to acknowledge.
        context: Optional description of how the information was learned.

    Returns:
        A dict with a ``stored`` status and the echoed ``info``.
    """
    acknowledgement = dict(stored="ok", info=information)
    return acknowledgement
36
+
37
# Function-calling schema for the record_user_details tool: only the email is
# required; name and notes are optional strings.
_user_details_properties = {
    "email": {
        "type": "string",
        "description": "The email address of this user",
    },
    "name": {
        "type": "string",
        "description": "The user's name, if they provided it",
    },
    "notes": {
        "type": "string",
        "description": "Any additional information about the conversation that's worth recording to give context",
    },
}

record_user_details_json = {
    "name": "record_user_details",
    "description": "Use this tool to record that a user is interested in being in touch and provided an email address",
    "parameters": {
        "type": "object",
        "properties": _user_details_properties,
        "required": ["email"],
        "additionalProperties": False,
    },
}
61
+
62
# Function-calling schema for the record_unknown_question tool: a single
# required string holding the question that could not be answered.
_unknown_question_properties = {
    "question": {
        "type": "string",
        "description": "The question that couldn't be answered",
    },
}

record_unknown_question_json = {
    "name": "record_unknown_question",
    "description": "Always use this tool to record any question that couldn't be answered as you didn't know the answer",
    "parameters": {
        "type": "object",
        "properties": _unknown_question_properties,
        "required": ["question"],
        "additionalProperties": False,
    },
}
77
+
78
# Function-calling schema for the store_conversation_info tool: the
# information itself is required, the context is optional.
_conversation_info_properties = {
    "information": {
        "type": "string",
        "description": "The new information to store",
    },
    "context": {
        "type": "string",
        "description": "Context about when/how this information was learned",
    },
}

store_conversation_info_json = {
    "name": "store_conversation_info",
    "description": "Store new information learned during conversations for future reference",
    "parameters": {
        "type": "object",
        "properties": _conversation_info_properties,
        "required": ["information"],
        "additionalProperties": False,
    },
}
97
+
98
# Tool list handed to the chat completion call: each schema is wrapped in the
# {"type": "function", "function": ...} envelope, in the same order as before.
tools = [
    {"type": "function", "function": schema}
    for schema in (
        record_user_details_json,
        record_unknown_question_json,
        store_conversation_info_json,
    )
]
101
+
102
+
103
class Me:
    """Chat persona backed by OpenAI and a Neo4j vector knowledge base.

    On construction the instance connects to OpenAI and Neo4j, creates the
    vector index if needed and, when the knowledge base is empty, loads every
    supported file found in the ``me/`` directory.
    """

    # Module-level functions the model may invoke by name. Dispatch is limited
    # to this allowlist so a hallucinated or injected tool name can never call
    # an arbitrary global (a class, an imported module, ...).
    _MODULE_LEVEL_TOOLS = frozenset({"record_user_details", "record_unknown_question"})

    def __init__(self):
        """Open the OpenAI and Neo4j connections and prepare the knowledge base."""
        self.openai = OpenAI()
        self.name = "Alexandre Saadoun"

        # Connection settings come from the environment, with local defaults.
        self.neo4j_driver = GraphDatabase.driver(
            os.getenv("NEO4J_URI", "bolt://localhost:7687"),
            auth=(os.getenv("NEO4J_USER", "neo4j"), os.getenv("NEO4J_PASSWORD", "password")),
        )

        # Create the index, then auto-load me/ when the store is empty.
        self._setup_neo4j_schema()
        self._populate_initial_data()

    def _setup_neo4j_schema(self):
        """Create the vector index over ``Knowledge.embedding`` if it is missing."""
        with self.neo4j_driver.session() as session:
            try:
                session.run("""
                    CREATE VECTOR INDEX knowledge_embeddings IF NOT EXISTS
                    FOR (n:Knowledge) ON (n.embedding)
                    OPTIONS {indexConfig: {
                        `vector.dimensions`: 1536,
                        `vector.similarity_function`: 'cosine'
                    }}
                """)
            except Exception as e:
                # Best-effort: a failure here is reported but does not abort start-up.
                print(f"Index might already exist: {e}")

    def _get_embedding(self, text):
        """Return the OpenAI ``text-embedding-3-small`` embedding for ``text``."""
        response = self.openai.embeddings.create(
            model="text-embedding-3-small",
            input=text,
        )
        return response.data[0].embedding

    def _populate_initial_data(self):
        """Load the ``me/`` directory when no ``Knowledge`` node exists yet."""
        with self.neo4j_driver.session() as session:
            result = session.run("MATCH (n:Knowledge) RETURN count(n) as count")
            count = result.single()["count"]

            if count == 0:  # Only populate an empty knowledge base.
                print("Auto-loading all files from me/ directory...")
                self._auto_load_me_directory()

    @staticmethod
    def _read_document(file_path):
        """Return the text of a supported file, or ``None`` if unsupported.

        ``.pdf`` files are read page by page with ``PdfReader``; ``.txt`` and
        ``.md`` files are read as UTF-8. The extension check is
        case-insensitive so ``CV.PDF`` is treated like ``cv.pdf``.
        """
        lowered = file_path.lower()
        if lowered.endswith(".pdf"):
            reader = PdfReader(file_path)
            # extract_text() may yield nothing for a page; treat that as empty.
            return "".join(page.extract_text() or "" for page in reader.pages)
        if lowered.endswith((".txt", ".md")):
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read()
        return None

    def _auto_load_me_directory(self):
        """Load every supported file directly inside ``me/`` into the knowledge base.

        Sub-directories and unsupported file types are skipped. A failure on
        one file is printed and does not stop the remaining files.
        """
        import glob

        me_dir = "me/"
        if not os.path.exists(me_dir):
            print(f"Directory {me_dir} not found")
            return

        processed_files = []
        # Sorted so the load order (and the log output) is reproducible.
        for file_path in sorted(glob.glob(os.path.join(me_dir, "*"))):
            if not os.path.isfile(file_path):  # Skip directories.
                continue

            filename = os.path.basename(file_path)
            print(f"Auto-processing: {filename}")

            try:
                content = self._read_document(file_path)
                if content is None:
                    print(f"Skipping unsupported file type: {filename}")
                    continue

                if content.strip():  # Only process files that have content.
                    self.bulk_load_text_content(content, f"me_{filename}")
                    processed_files.append(filename)
            except Exception as e:
                print(f"Error processing {filename}: {e}")

        if processed_files:
            print(f"✅ Auto-loaded {len(processed_files)} files: {', '.join(processed_files)}")
        else:
            print("No files found to process in me/ directory")

    def reload_me_directory(self):
        """Delete all nodes loaded from ``me/`` and load the directory again."""
        print("Reloading me/ directory...")

        # Nodes loaded from me/ carry a source that starts with 'me_'.
        with self.neo4j_driver.session() as session:
            result = session.run("""
                MATCH (n:Knowledge)
                WHERE n.source STARTS WITH 'me_'
                DELETE n
                RETURN count(n) as deleted
            """)
            deleted = result.single()["deleted"]
            if deleted > 0:
                # The count is of chunk nodes, not of files.
                print(f"Cleared {deleted} existing chunks from me/")

        self._auto_load_me_directory()
        print("✅ me/ directory reloaded!")

    def _search_knowledge(self, query, limit=3):
        """Return up to ``limit`` knowledge entries most similar to ``query``.

        Returns:
            A list of dicts with ``content``, ``type`` and ``score`` keys,
            ordered by descending score.
        """
        query_embedding = self._get_embedding(query)

        with self.neo4j_driver.session() as session:
            result = session.run("""
                CALL db.index.vector.queryNodes('knowledge_embeddings', $limit, $query_embedding)
                YIELD node, score
                RETURN node.content as content, node.type as type, score
                ORDER BY score DESC
            """, query_embedding=query_embedding, limit=limit)

            return [
                {"content": record["content"], "type": record["type"], "score": record["score"]}
                for record in result
            ]

    def _store_new_knowledge(self, information, context=""):
        """Embed ``information`` and store it as a ``conversation`` node."""
        embedding = self._get_embedding(information)

        with self.neo4j_driver.session() as session:
            session.run("""
                CREATE (n:Knowledge {
                    content: $content,
                    type: 'conversation',
                    context: $context,
                    embedding: $embedding,
                    timestamp: datetime()
                })
            """, content=information, context=context, embedding=embedding)

    @staticmethod
    def _chunk_text(text_content, chunk_size):
        """Split ``text_content`` into stripped, non-empty chunks.

        Chunks are consecutive ``chunk_size``-character slices; slices that
        are empty after stripping whitespace are dropped.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
        pieces = (
            text_content[start:start + chunk_size].strip()
            for start in range(0, len(text_content), chunk_size)
        )
        return [piece for piece in pieces if piece]

    def bulk_load_text_content(self, text_content, source_name="raw_text", chunk_size=800):
        """Load raw text into the vector database as ``text_content`` chunks.

        Args:
            text_content: Raw text string (summary, report, etc.).
            source_name: Name/identifier stored on every chunk.
            chunk_size: Number of characters per chunk.

        Raises:
            ValueError: If ``chunk_size`` is not positive.
        """
        print(f"Processing text content: {source_name}")

        chunks = self._chunk_text(text_content, chunk_size)
        print(f"Created {len(chunks)} chunks")

        with self.neo4j_driver.session() as session:
            for i, chunk in enumerate(chunks):
                embedding = self._get_embedding(chunk)

                session.run("""
                    CREATE (n:Knowledge {
                        content: $content,
                        type: 'text_content',
                        source: $source,
                        chunk_index: $chunk_index,
                        embedding: $embedding,
                        timestamp: datetime()
                    })
                """,
                    content=chunk,
                    source=source_name,
                    chunk_index=i,
                    embedding=embedding)

        print(f"Loaded {len(chunks)} chunks from {source_name}")

    def load_text_files(self, file_paths, chunk_size=800):
        """Load UTF-8 text files into the database, one source per file.

        A failure on one file is printed and does not stop the others.

        Args:
            file_paths: List of text file paths.
            chunk_size: Number of characters per chunk.
        """
        for file_path in file_paths:
            print(f"Loading {file_path}...")

            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()

                # The bare file name identifies the source of each chunk.
                source_name = os.path.basename(file_path)
                self.bulk_load_text_content(content, source_name, chunk_size)
            except Exception as e:
                print(f"Error loading {file_path}: {e}")

    def load_directory(self, directory_path, chunk_size=800):
        """Load all ``.txt`` files found directly inside ``directory_path``.

        Args:
            directory_path: Path to the directory containing text files.
            chunk_size: Number of characters per chunk.
        """
        import glob

        # Sorted so files are loaded in a reproducible order.
        txt_files = sorted(glob.glob(os.path.join(directory_path, "*.txt")))
        if txt_files:
            print(f"Found {len(txt_files)} text files in {directory_path}")
            self.load_text_files(txt_files, chunk_size)
        else:
            print(f"No .txt files found in {directory_path}")

    def clear_knowledge_base(self, knowledge_type=None):
        """Delete all knowledge nodes, or only those of ``knowledge_type``.

        Args:
            knowledge_type: If given, only nodes with this ``type`` are deleted.
        """
        with self.neo4j_driver.session() as session:
            if knowledge_type:
                result = session.run(
                    "MATCH (n:Knowledge {type: $type}) DELETE n RETURN count(n) as deleted",
                    type=knowledge_type,
                )
            else:
                result = session.run("MATCH (n:Knowledge) DELETE n RETURN count(n) as deleted")

            deleted_count = result.single()["deleted"]
            print(f"Deleted {deleted_count} knowledge nodes")

    def get_knowledge_stats(self):
        """Print and return the number of knowledge nodes per ``type``.

        Returns:
            A dict mapping each node type to its count.
        """
        with self.neo4j_driver.session() as session:
            result = session.run("""
                MATCH (n:Knowledge)
                RETURN n.type as type, count(n) as count
                ORDER BY count DESC
            """)

            stats = {record["type"]: record["count"] for record in result}
            total = sum(stats.values())

            print(f"Knowledge Base Stats (Total: {total} documents):")
            for doc_type, count in stats.items():
                print(f" {doc_type}: {count}")

            return stats

    def _run_tool(self, tool_name, raw_arguments):
        """Execute one tool call and return its JSON-serialisable result.

        Malformed arguments produce an ``{"error": ...}`` result instead of an
        exception, so the model gets feedback and the chat loop keeps running.
        Unknown tool names yield an empty dict.
        """
        try:
            # An empty argument string is treated as "no arguments".
            arguments = json.loads(raw_arguments or "{}")
        except (TypeError, ValueError) as exc:
            return {"error": f"invalid tool arguments: {exc}"}
        if not isinstance(arguments, dict):
            return {"error": "tool arguments must be a JSON object"}

        if tool_name == "store_conversation_info":
            information = arguments.get("information")
            if not information:
                return {"error": "missing required argument: information"}
            # This tool is the one that actually persists to Neo4j.
            self._store_new_knowledge(information, arguments.get("context", ""))
            return {"stored": "ok", "info": information}

        if tool_name not in self._MODULE_LEVEL_TOOLS:
            return {}
        tool = globals().get(tool_name)
        return tool(**arguments) if callable(tool) else {}

    def handle_tool_call(self, tool_calls):
        """Run the requested tools and build the ``tool`` reply messages.

        Args:
            tool_calls: Tool-call objects exposing ``id``, ``function.name``
                and ``function.arguments`` (a JSON string).

        Returns:
            One ``{"role": "tool", ...}`` message per tool call, in order.
        """
        results = []
        for tool_call in tool_calls:
            tool_name = tool_call.function.name
            print(f"Tool called: {tool_name}", flush=True)
            result = self._run_tool(tool_name, tool_call.function.arguments)
            results.append({"role": "tool", "content": json.dumps(result), "tool_call_id": tool_call.id})
        return results

    def system_prompt(self, relevant_knowledge=""):
        """Build the system prompt, optionally including retrieved knowledge.

        Args:
            relevant_knowledge: Text added under a background-information
                heading when non-empty.
        """
        system_prompt = (
            f"You are acting as {self.name}. You are answering questions on {self.name}'s website, "
            f"particularly questions related to {self.name}'s career, background, skills and experience. "
            f"Your responsibility is to represent {self.name} for interactions on the website as faithfully as possible. "
            "Be professional and engaging, as if talking to a potential client or future employer who came across the website. "
            "If you don't know the answer to any question, use your record_unknown_question tool to record the question that you couldn't answer, even if it's about something trivial or unrelated to career. "
            "If the user is engaging in discussion, try to steer them towards getting in touch via email; ask for their email and record it using your record_user_details tool. "
            "If you learn new relevant information during conversations, use the store_conversation_info tool to remember it for future interactions."
        )

        if relevant_knowledge:
            system_prompt += f"\n\n## Relevant Background Information:\n{relevant_knowledge}"

        system_prompt += f"\n\nWith this context, please chat with the user, always staying in character as {self.name}."
        return system_prompt

    def chat(self, message, history):
        """Answer ``message`` given ``history``, running tools as requested.

        Knowledge entries scoring above 0.7 are added to the system prompt,
        each truncated to 200 characters. The completion is repeated until the
        model stops asking for tool calls.

        Returns:
            The content of the model's final message.
        """
        relevant_docs = self._search_knowledge(message)
        relevant_knowledge = "\n".join(
            f"- {doc['content'][:200]}..." for doc in relevant_docs if doc["score"] > 0.7
        )

        messages = (
            [{"role": "system", "content": self.system_prompt(relevant_knowledge)}]
            + history
            + [{"role": "user", "content": message}]
        )
        while True:
            response = self.openai.chat.completions.create(model="gpt-4o-mini", messages=messages, tools=tools)
            choice = response.choices[0]
            if choice.finish_reason != "tool_calls":
                return choice.message.content

            # Feed the tool results back so the model can continue.
            results = self.handle_tool_call(choice.message.tool_calls)
            messages.append(choice.message)
            messages.extend(results)

    def close(self):
        """Close the Neo4j driver if one was created."""
        driver = getattr(self, "neo4j_driver", None)
        if driver is not None:
            driver.close()

    def __del__(self):
        """Close the Neo4j connection as a last resort."""
        try:
            self.close()
        except Exception:
            # Finalizers may run during interpreter shutdown, when the driver's
            # own dependencies are already gone; nothing useful can be done.
            pass
427
+
428
+
429
if __name__ == "__main__":
    # Build the persona (connects to OpenAI and Neo4j) and serve the chat UI.
    me = Me()
    try:
        gr.ChatInterface(me.chat, type="messages").launch()
    finally:
        # Release the Neo4j connections explicitly; relying on __del__ alone is
        # not guaranteed to run when the interpreter exits.
        me.neo4j_driver.close()
bulk_loader_script.py CHANGED
@@ -4,7 +4,7 @@ Simple bulk loader for raw text summaries and reports
4
  Just drop your .txt files in a folder and run this script
5
  """
6
 
7
- from enhanced_app_rag import Me
8
  import os
9
 
10
  def main():
 
4
  Just drop your .txt files in a folder and run this script
5
  """
6
 
7
+ from app import Me
8
  import os
9
 
10
  def main():