Princeaka committed on
Commit 51e7f28 · verified · 1 Parent(s): 6f9d4af

Update multimodal_module.py

Files changed (1)
  1. multimodal_module.py +1253 -717
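The rewritten module described in the diff below introduces a Close-to-Human Brain (CHB) layer that, per its docstring, only answers when its confidence score reaches chb_min_confidence (0.90) and otherwise returns only the certain subset of claims. As a rough, illustrative sketch of that gating idea only (not code from this commit; gate_answer and the (statement, confidence) claim list are hypothetical):

    # Illustrative sketch only (not part of this commit) of the CHB confidence gate
    # described in the new module's docstring: answer in full only when confidence
    # reaches the 0.90 threshold, otherwise return just the individually certain claims.
    CHB_MIN_CONFIDENCE = 0.90  # mirrors MODEL_CONFIG["chb_min_confidence"] in the diff

    def gate_answer(claims):
        """claims: list of (statement, confidence) pairs from some verifier (hypothetical)."""
        if not claims:
            return "No verified information available."
        overall = sum(c for _, c in claims) / len(claims)
        if overall >= CHB_MIN_CONFIDENCE:
            return " ".join(s for s, _ in claims)  # confident: return the full answer
        certain = [s for s, c in claims if c >= CHB_MIN_CONFIDENCE]
        return " ".join(certain) or "Not confident enough to answer yet."

    # Example: only the high-confidence claim is returned.
    print(gate_answer([("Paris is the capital of France.", 0.99),
                       ("The museum opens at 9am.", 0.55)]))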
multimodal_module.py CHANGED
@@ -1,767 +1,1303 @@
- # multimodal_module.py
- import os
- import pickle
- import subprocess
- import tempfile
- import shutil
  import asyncio
8
- import logging
9
- from datetime import datetime
10
- from typing import Dict, List, Optional, Any, Union
11
- import uuid
12
- import numpy as np
13
-
14
- # Configure logging
15
- logging.basicConfig(
16
- level=logging.INFO,
17
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
18
- )
19
- logger = logging.getLogger("MultiModalModule")
20
-
21
- # Space-specific environment configuration
22
- os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
23
- os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
24
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
25
-
26
- # Core ML Imports
27
- import torch
28
- from transformers import (
29
- pipeline,
30
- AutoModelForSeq2SeqLM,
31
- AutoTokenizer,
32
- Wav2Vec2Processor,
33
- Wav2Vec2ForSequenceClassification,
34
- AutoModelForCausalLM
35
- )
36
- from diffusers import (
37
- StableDiffusionPipeline,
38
- StableDiffusionInpaintPipeline
39
- )
40
- from huggingface_hub import hf_hub_download, snapshot_download
41
-
42
- # Audio Processing
43
- import librosa
44
- import soundfile as sf
45
- from gtts import gTTS
46
- import speech_recognition as sr
47
- import webrtcvad
48
-
49
- # Image/Video Processing
50
- from PIL import Image
51
- import imageio
52
- import imageio_ffmpeg
53
- import moviepy.editor as mp
54
- import cv2
55
-
56
- # Document Processing
57
- import fitz # PyMuPDF
58
- from langdetect import detect, DetectorFactory
59
- DetectorFactory.seed = 0
60
-
61
- # Configuration
62
- USE_SAFETY_CHECKER = False
63
- MAX_HISTORY_LENGTH = 100
64
- TEMP_DIR = "tmp"
65
- MODEL_CACHE_DIR = "model_cache"
66
-
67
- class MultiModalChatModule:
68
- """Complete multimodal module optimized for Hugging Face Spaces"""
69
-
70
- def __init__(self, chat_history_file: str = "chat_histories.pkl"):
71
- """Initialize with Space optimizations"""
72
- # Create required directories
73
- os.makedirs(TEMP_DIR, exist_ok=True)
74
- os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
75
-
76
- # Device configuration
77
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
78
- self.torch_dtype = torch.float16 if "cuda" in self.device else torch.float32
79
- logger.info(f"Initialized on {self.device.upper()} with dtype {self.torch_dtype}")
80
-
81
- # Model registry
82
- self.model_names = {
83
- "voice_emotion_processor": "facebook/hubert-large-ls960-ft",
84
- "voice_emotion_model": "superb/hubert-base-superb-er",
85
- "translation_model": "facebook/nllb-200-distilled-600M",
86
- "chatbot_tokenizer": "facebook/blenderbot-400M-distill",
87
- "chatbot_model": "facebook/blenderbot-400M-distill",
88
- "image_captioner": "Salesforce/blip-image-captioning-base",
89
- "sd_inpaint": "runwayml/stable-diffusion-inpainting",
90
- "sd_text2img": "runwayml/stable-diffusion-v1-5",
91
- "code_model": "bigcode/starcoder",
92
- }
93
-
94
- # Model placeholders
95
- self._voice_processor = None
96
- self._voice_emotion_model = None
97
- self._translator = None
98
- self._chat_tokenizer = None
99
- self._chat_model = None
100
- self._image_captioner = None
101
- self._sd_pipe = None
102
- self._sd_inpaint = None
103
- self._code_tokenizer = None
104
- self._code_model = None
105
-
106
- # Helpers
107
- self._sr_recognizer = sr.Recognizer()
108
- self.vad = webrtcvad.Vad(3)
109
- self.chat_history_file = chat_history_file
110
- self.user_chat_histories = self._load_chat_histories()
111
-
112
- # Load tracking
113
- self._loaded = {
114
- "voice": False,
115
- "translation": False,
116
- "chat": False,
117
- "image_caption": False,
118
- "sd": False,
119
- "code": False,
120
- }
121
-
122
- # ----------------------
123
- # Core Utilities
124
- # ----------------------
125
- def _tmp_path(self, suffix: str = "") -> str:
126
- """Generate space-compatible temp file path"""
127
- path = os.path.join(TEMP_DIR, f"{uuid.uuid4().hex}{suffix}")
128
- os.makedirs(os.path.dirname(path), exist_ok=True)
129
- return path
130
-
131
- def _cleanup(self, *paths: str) -> None:
132
- """Safely remove files/directories"""
133
- for path in paths:
134
- try:
135
- if path and os.path.exists(path):
136
- if os.path.isfile(path):
137
- os.remove(path)
138
- elif os.path.isdir(path):
139
- shutil.rmtree(path)
140
- except Exception as e:
141
- logger.warning(f"Cleanup failed for {path}: {e}")
142
 
143
- def _load_chat_histories(self) -> Dict[int, List[dict]]:
- """Load chat histories from file"""
145
  try:
146
- with open(self.chat_history_file, "rb") as f:
147
- return pickle.load(f)
 
148
  except Exception as e:
149
- logger.warning(f"Failed loading chat history: {e}")
150
- return {}
151
-
152
- def _save_chat_histories(self) -> None:
153
- """Persist chat histories to file"""
154
- try:
155
- with open(self.chat_history_file, "wb") as f:
156
- pickle.dump(self.user_chat_histories, f)
157
- except Exception as e:
158
- logger.error(f"Failed saving chat history: {e}")
159
-
160
- def _update_history(self, user_id: int, role: str, content: Any, lang: str = "en") -> None:
161
- """Update conversation history"""
162
- if user_id not in self.user_chat_histories:
163
- self.user_chat_histories[user_id] = []
164
-
165
- self.user_chat_histories[user_id].append({
166
- "timestamp": datetime.now().isoformat(),
167
- "role": role,
168
- "content": content,
169
- "language": lang
170
- })
171
-
172
- # Enforce max history length
173
- self.user_chat_histories[user_id] = self.user_chat_histories[user_id][-MAX_HISTORY_LENGTH:]
174
- self._save_chat_histories()
175
-
176
- # ----------------------
177
- # Model Loading
178
- # ----------------------
179
- def _load_voice_models(self) -> None:
180
- """Load voice processing models"""
181
- if self._loaded["voice"]:
182
- return
183
-
184
  try:
185
- logger.info("Loading voice models...")
186
- self._voice_processor = Wav2Vec2Processor.from_pretrained(
187
- self.model_names["voice_emotion_processor"],
188
- cache_dir=MODEL_CACHE_DIR
189
- )
190
- self._voice_emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained(
191
- self.model_names["voice_emotion_model"],
192
- cache_dir=MODEL_CACHE_DIR
193
- ).to(self.device)
194
- self._loaded["voice"] = True
195
- logger.info("Voice models loaded successfully")
196
  except Exception as e:
197
- logger.error(f"Failed loading voice models: {e}")
198
-
199
- def _load_translation(self) -> None:
200
- """Load translation pipeline"""
201
- if self._loaded["translation"]:
202
- return
203
-
204
  try:
205
- logger.info("Loading translation model...")
206
- device = 0 if self.device == "cuda" else -1
207
- self._translator = pipeline(
208
- "translation",
209
- model=self.model_names["translation_model"],
210
- device=device,
211
- cache_dir=MODEL_CACHE_DIR
212
- )
213
- self._loaded["translation"] = True
214
- logger.info("Translation model loaded successfully")
215
- except Exception as e:
216
- logger.error(f"Failed loading translation model: {e}")
 
 
 
 
 
 
 
 
 
217
 
218
- def _load_chatbot(self) -> None:
219
- """Load chatbot models"""
220
- if self._loaded["chat"]:
221
- return
223
  try:
224
- logger.info("Loading chatbot models...")
225
- self._chat_tokenizer = AutoTokenizer.from_pretrained(
226
- self.model_names["chatbot_tokenizer"],
227
- cache_dir=MODEL_CACHE_DIR
228
- )
229
- self._chat_model = AutoModelForSeq2SeqLM.from_pretrained(
230
- self.model_names["chatbot_model"],
231
- cache_dir=MODEL_CACHE_DIR
232
- ).to(self.device)
233
- self._loaded["chat"] = True
234
- logger.info("Chatbot models loaded successfully")
235
  except Exception as e:
236
- logger.error(f"Failed loading chatbot models: {e}")
237
-
238
- def _load_image_captioner(self) -> None:
239
- """Load image captioning model"""
240
- if self._loaded["image_caption"]:
241
- return
 
 
242
 
 
243
  try:
244
- logger.info("Loading image captioner...")
245
- device = 0 if self.device == "cuda" else -1
246
- self._image_captioner = pipeline(
247
- "image-to-text",
248
- model=self.model_names["image_captioner"],
249
- device=device,
250
- cache_dir=MODEL_CACHE_DIR
251
- )
252
- self._loaded["image_caption"] = True
253
- logger.info("Image captioner loaded successfully")
254
  except Exception as e:
255
- logger.error(f"Failed loading image captioner: {e}")
256
-
257
- def _load_sd(self) -> None:
258
- """Load Stable Diffusion models"""
259
- if self._loaded["sd"]:
260
- return
261
 
 
262
  try:
263
- logger.info("Loading Stable Diffusion models...")
264
-
265
- # Text-to-image
266
- self._sd_pipe = StableDiffusionPipeline.from_pretrained(
267
- self.model_names["sd_text2img"],
268
- torch_dtype=self.torch_dtype,
269
- safety_checker=None if not USE_SAFETY_CHECKER else None,
270
- cache_dir=MODEL_CACHE_DIR
271
- ).to(self.device)
272
-
273
- # Inpainting
274
- self._sd_inpaint = StableDiffusionInpaintPipeline.from_pretrained(
275
- self.model_names["sd_inpaint"],
276
- torch_dtype=self.torch_dtype,
277
- cache_dir=MODEL_CACHE_DIR
278
- ).to(self.device)
279
-
280
- self._loaded["sd"] = True
281
- logger.info("Stable Diffusion models loaded successfully")
282
- except Exception as e:
283
- logger.error(f"Failed loading Stable Diffusion models: {e}")
284
- self._sd_pipe = None
285
- self._sd_inpaint = None
286
-
287
- def _load_code_model(self) -> None:
288
- """Load code generation model"""
289
- if self._loaded["code"]:
290
- return
291
-
292
  try:
293
- logger.info("Loading code model...")
294
- self._code_tokenizer = AutoTokenizer.from_pretrained(
295
- self.model_names["code_model"],
296
- cache_dir=MODEL_CACHE_DIR
297
- )
298
- self._code_model = AutoModelForCausalLM.from_pretrained(
299
- self.model_names["code_model"],
300
- cache_dir=MODEL_CACHE_DIR
301
- ).to(self.device)
302
- self._loaded["code"] = True
303
- logger.info("Code model loaded successfully")
 
304
  except Exception as e:
305
- logger.error(f"Failed loading code model: {e}")
306
- self._code_tokenizer = None
307
- self._code_model = None
308
-
309
- # ----------------------
310
- # Audio Processing
311
- # ----------------------
312
- async def analyze_voice_emotion(self, audio_path: str) -> str:
313
- """Analyze emotion from voice audio"""
314
- self._load_voice_models()
315
- if not self._voice_processor or not self._voice_emotion_model:
316
- return "unknown"
 
 
318
  try:
319
- speech, sr = librosa.load(audio_path, sr=16000)
320
- inputs = self._voice_processor(
321
- speech,
322
- sampling_rate=sr,
323
- return_tensors="pt",
324
- padding=True
325
- ).to(self.device)
326
-
327
- with torch.no_grad():
328
- logits = self._voice_emotion_model(**inputs).logits
329
-
330
- emotions = {
331
- 0: "happy", 1: "sad", 2: "angry",
332
- 3: "fearful", 4: "calm", 5: "surprised"
333
- }
334
- return emotions.get(torch.argmax(logits).item(), "unknown")
335
- except Exception as e:
336
- logger.error(f"Voice emotion analysis failed: {e}")
337
- return "error"
338
-
339
- async def process_voice_message(self, voice_file, user_id: int) -> Dict[str, Any]:
340
- """Process voice message to text with emotion analysis"""
341
- ogg_path = self._tmp_path(".ogg")
342
- wav_path = self._tmp_path(".wav")
343
-
344
  try:
345
- # Save and convert audio
346
- await voice_file.download_to_drive(ogg_path)
347
-
348
- # Convert to WAV
349
- ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
350
- cmd = [
351
- ffmpeg_path, "-y", "-i", ogg_path,
352
- "-ar", "16000", "-ac", "1", wav_path
353
- ]
354
- subprocess.run(cmd, check=True, capture_output=True)
355
-
356
- # Analyze audio
357
- speech, sr = librosa.load(wav_path, sr=16000)
358
-
359
- # Voice Activity Detection
360
- is_speech = self.vad.is_speech(
361
- (speech * 32767).astype(np.int16).tobytes(),
362
- sample_rate=sr
363
- )
364
-
365
- # Transcription
366
- text = ""
367
- lang = "en"
368
- if is_speech:
369
- with sr.AudioFile(wav_path) as source:
370
- audio = self._sr_recognizer.record(source)
371
- try:
372
- text = self._sr_recognizer.recognize_google(audio, language="en-US")
373
- except sr.UnknownValueError:
374
- pass
375
- except Exception as e:
376
- logger.warning(f"Speech recognition failed: {e}")
377
-
378
- # Emotion analysis
379
- emotion = await self.analyze_voice_emotion(wav_path) if is_speech else "no_speech"
380
-
381
- # Update history
382
- result = {
383
- "text": text,
384
- "language": lang,
385
- "emotion": emotion,
386
- "is_speech": is_speech
387
- }
388
- self._update_history(user_id, "user", result, lang)
389
-
390
- return result
391
  except Exception as e:
392
- logger.error(f"Voice message processing failed: {e}")
393
  return {"error": str(e)}
394
- finally:
395
- self._cleanup(ogg_path, wav_path)
396
-
397
- async def generate_voice_reply(self, text: str, user_id: int, fmt: str = "ogg") -> str:
398
- """Generate audio from text (TTS)"""
399
- mp3_path = self._tmp_path(".mp3")
400
- out_path = self._tmp_path(f".{fmt}")
401
 
 
 
 
 
 
 
402
  try:
403
- # Generate TTS
404
- tts = gTTS(text=text, lang='en')
405
- tts.save(mp3_path)
406
-
407
- # Convert format
408
- ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
409
- if fmt == "ogg":
410
- subprocess.run([
411
- ffmpeg_path, "-y", "-i", mp3_path,
412
- "-c:a", "libopus", out_path
413
- ], check=True)
414
- elif fmt == "wav":
415
- subprocess.run([
416
- ffmpeg_path, "-y", "-i", mp3_path, out_path
417
- ], check=True)
418
- else:
419
- shutil.move(mp3_path, out_path)
420
-
421
- # Update history
422
- self._update_history(user_id, "assistant", f"[Voice reply: {fmt}]")
423
-
424
- return out_path
425
  except Exception as e:
426
- logger.error(f"Voice reply generation failed: {e}")
427
- raise RuntimeError(f"TTS failed: {e}")
428
- finally:
429
- if fmt != "mp3" and os.path.exists(mp3_path):
430
- self._cleanup(mp3_path)
431
-
432
- # ----------------------
433
- # Text Processing
434
- # ----------------------
435
- async def generate_response(self, text: str, user_id: int, lang: str = "en") -> str:
436
- """Generate conversational response with context"""
437
- self._load_chatbot()
438
- self._load_translation()
439
-
440
- # Update history
441
- self._update_history(user_id, "user", text, lang)
442
-
443
- # Prepare context
444
- context = []
445
- for msg in self.user_chat_histories[user_id][-5:]:
446
- if msg["language"] != "en":
447
- try:
448
- translated = self._translator(msg["content"])[0]["translation_text"]
449
- context.append(f"{msg['role']}: {translated}")
450
- except Exception:
451
- context.append(f"{msg['role']}: {msg['content']}")
452
- else:
453
- context.append(f"{msg['role']}: {msg['content']}")
454
-
455
- # Generate response
456
- input_text = f"Context:\n{' '.join(context)}\nUser: {text}"
457
- inputs = self._chat_tokenizer(input_text, return_tensors="pt").to(self.device)
458
-
 
 
459
  try:
460
- outputs = self._chat_model.generate(
461
- **inputs,
462
- max_new_tokens=200,
463
- do_sample=True,
464
- temperature=0.7
465
- )
466
- response = self._chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
 
 
467
  except Exception as e:
468
- logger.error(f"Response generation failed: {e}")
469
- response = "I couldn't generate a response. Please try again."
470
-
471
- # Translate if needed
472
- if lang != "en":
473
  try:
474
- response = self._translator(response)[0]["translation_text"]
 
 
 
 
 
 
475
  except Exception:
476
- pass
477
-
478
- # Update history
479
- self._update_history(user_id, "assistant", response, lang)
480
- return response
481
-
482
- # ----------------------
483
- # Image Processing
484
- # ----------------------
485
- async def process_image_message(self, image_file, user_id: int) -> str:
486
- """Generate caption for an image"""
487
- img_path = self._tmp_path(".jpg")
488
 
 
 
 
 
 
489
  try:
490
- # Save and load image
491
- await image_file.download_to_drive(img_path)
492
- image = Image.open(img_path).convert("RGB")
493
-
494
- # Generate caption
495
- self._load_image_captioner()
496
- caption = self._image_captioner(image)[0]["generated_text"]
497
-
498
- # Update history
499
- self._update_history(user_id, "user", "[Image]", "en")
500
- self._update_history(user_id, "assistant", f"Image description: {caption}", "en")
501
-
502
- return caption
503
- except Exception as e:
504
- logger.error(f"Image processing failed: {e}")
505
- return f"Error processing image: {str(e)}"
506
- finally:
507
- self._cleanup(img_path)
508
-
509
- async def generate_image_from_text(self, prompt: str, user_id: int,
510
- width: int = 512, height: int = 512,
511
- steps: int = 30) -> str:
512
- """Generate image from text prompt"""
513
- self._load_sd()
514
- if not self._sd_pipe:
515
- raise RuntimeError("Image generation unavailable")
516
-
517
- out_path = self._tmp_path(".png")
518
 
 
 
519
  try:
520
- # Generate image
521
- result = self._sd_pipe(
522
- prompt,
523
- num_inference_steps=steps,
524
- height=height,
525
- width=width
526
- )
527
- result.images[0].save(out_path)
528
-
529
- # Update history
530
- self._update_history(user_id, "user", f"[Image request: {prompt}]", "en")
531
- self._update_history(user_id, "assistant", f"[Generated image]", "en")
532
-
533
- return out_path
534
  except Exception as e:
535
- logger.error(f"Image generation failed: {e}")
536
- raise RuntimeError(f"Image generation failed: {e}")
537
-
538
- async def edit_image_inpaint(self, image_file, mask_file=None,
539
- prompt: str = "", user_id: int = 0) -> str:
540
- """Edit image using inpainting"""
541
- self._load_sd()
542
- if not self._sd_inpaint:
543
- raise RuntimeError("Image editing unavailable")
544
-
545
- img_path = self._tmp_path(".png")
546
- mask_path = self._tmp_path("_mask.png") if mask_file else None
547
- out_path = self._tmp_path("_edited.png")
548
 
 
 
549
  try:
550
- # Save inputs
551
- await image_file.download_to_drive(img_path)
552
- if mask_file:
553
- await mask_file.download_to_drive(mask_path)
554
-
555
- # Prepare images
556
- init_image = Image.open(img_path).convert("RGB")
557
- mask_image = Image.open(mask_path).convert("L") if mask_path else Image.new("L", init_image.size, 255)
558
-
559
- # Inpaint
560
- result = self._sd_inpaint(
561
- prompt=prompt if prompt else " ",
562
- image=init_image,
563
- mask_image=mask_image,
564
- guidance_scale=7.5,
565
- num_inference_steps=30
566
- )
567
- result.images[0].save(out_path)
568
-
569
- # Update history
570
- self._update_history(user_id, "user", "[Image edit request]", "en")
571
- self._update_history(user_id, "assistant", "[Edited image]", "en")
572
-
573
- return out_path
574
  except Exception as e:
575
- logger.error(f"Image editing failed: {e}")
576
- raise RuntimeError(f"Inpainting failed: {e}")
577
- finally:
578
- self._cleanup(img_path, mask_path)
579
-
580
- # ----------------------
581
- # Video Processing
582
- # ----------------------
583
- async def process_video(self, video_file, user_id: int, max_frames: int = 4) -> Dict[str, Any]:
584
- """Process video file to extract audio and keyframes"""
585
- vid_path = self._tmp_path(".mp4")
586
- audio_path = self._tmp_path(".wav")
587
-
 
 
 
588
  try:
589
- # Save video
590
- await video_file.download_to_drive(vid_path)
591
-
592
- # Extract audio
593
- clip = mp.VideoFileClip(vid_path)
594
- clip.audio.write_audiofile(audio_path, logger=None)
595
- duration = clip.duration
596
- fps = clip.fps
597
-
598
- # Transcribe audio
599
- transcribed = ""
600
- try:
601
- with sr.AudioFile(audio_path) as source:
602
- audio = self._sr_recognizer.record(source)
603
- transcribed = self._sr_recognizer.recognize_google(audio)
604
- except Exception as e:
605
- logger.warning(f"Audio transcription failed: {e}")
606
-
607
- # Extract frames
608
- frames = []
609
- captions = []
610
- try:
611
- reader = imageio.get_reader(vid_path)
612
- total_frames = reader.count_frames()
613
- step = max(1, total_frames // max_frames)
614
-
615
- for i in range(0, total_frames, step):
616
- try:
617
- frame = reader.get_data(i)
618
- frame_path = self._tmp_path(f"_frame{i}.jpg")
619
- Image.fromarray(frame).save(frame_path)
620
- frames.append(frame_path)
621
-
622
- if len(frames) >= max_frames:
623
- break
624
- except Exception:
625
- continue
626
-
627
- # Generate captions
628
- if frames and self._load_image_captioner():
629
- for frame_path in frames:
630
- try:
631
- caption = self._image_captioner(Image.open(frame_path))[0]["generated_text"]
632
- captions.append(caption)
633
- except Exception:
634
- captions.append("")
635
- finally:
636
- self._cleanup(frame_path)
637
- except Exception as e:
638
- logger.warning(f"Frame extraction failed: {e}")
639
-
640
- # Update history
641
- result = {
642
- "duration": duration,
643
- "fps": fps,
644
- "transcription": transcribed,
645
- "captions": captions
646
- }
647
- self._update_history(user_id, "user", "[Video upload]", "en")
648
- self._update_history(user_id, "assistant", result, "en")
649
-
650
- return result
651
  except Exception as e:
652
- logger.error(f"Video processing failed: {e}")
653
  return {"error": str(e)}
654
- finally:
655
- self._cleanup(vid_path, audio_path)
656
-
657
- # ----------------------
658
- # File Processing
659
- # ----------------------
660
- async def process_file(self, file_obj, user_id: int) -> Dict[str, Any]:
661
- """Process document files (PDF, DOCX, TXT)"""
662
- fpath = self._tmp_path()
663
-
664
- try:
665
- # Save file
666
- await file_obj.download_to_drive(fpath)
667
 
668
- # Read based on type
669
- text = ""
670
- if fpath.lower().endswith(".pdf"):
671
- try:
672
- with fitz.open(fpath) as doc:
673
- text = "\n".join([page.get_text() for page in doc])
674
- except Exception as e:
675
- text = f"[PDF error: {e}]"
676
- elif fpath.lower().endswith((".txt", ".csv")):
677
- try:
678
- with open(fpath, "r", encoding="utf-8", errors="ignore") as f:
679
- text = f.read()
680
- except Exception as e:
681
- text = f"[Text error: {e}]"
682
- elif fpath.lower().endswith(".docx"):
683
- try:
684
- import docx
685
- doc = docx.Document(fpath)
686
- text = "\n".join([p.text for p in doc.paragraphs])
687
- except Exception as e:
688
- text = f"[DOCX error: {e}]"
689
- else:
690
- text = "[Unsupported file type]"
691
 
692
- # Summarize
693
- summary = text[:500] + ("..." if len(text) > 500 else "")
694
-
695
- # Update history
696
- result = {
697
- "summary": summary,
698
- "length": len(text),
699
- "type": os.path.splitext(fpath)[1]
700
- }
701
- self._update_history(user_id, "user", f"[File upload: {result['type']}]", "en")
702
- self._update_history(user_id, "assistant", result, "en")
703
-
704
- return result
705
  except Exception as e:
706
- logger.error(f"File processing failed: {e}")
707
- return {"error": str(e)}
708
- finally:
709
- self._cleanup(fpath)
710
-
711
- # ----------------------
712
- # Code Processing
713
- # ----------------------
714
- async def code_complete(self, prompt: str, max_tokens: int = 512,
715
- temperature: float = 0.2) -> str:
716
- """Generate code completions"""
717
- self._load_code_model()
718
- if not self._code_model or not self._code_tokenizer:
719
- raise RuntimeError("Code model not available")
 
 
720
 
721
  try:
722
- inputs = self._code_tokenizer(prompt, return_tensors="pt").to(self.device)
723
- outputs = self._code_model.generate(
724
- **inputs,
725
- max_new_tokens=max_tokens,
726
- temperature=temperature,
727
- do_sample=True
728
- )
729
- return self._code_tokenizer.decode(outputs[0], skip_special_tokens=True)
730
- except Exception as e:
731
- logger.error(f"Code completion failed: {e}")
732
- raise RuntimeError(f"Code generation error: {e}")
 
 
733
 
734
- async def execute_python_code(self, code: str, timeout: int = 5) -> Dict[str, str]:
735
- """Execute Python code in sandbox (DANGER: Unsecure)"""
736
- temp_dir = self._tmp_path()
737
- script_path = os.path.join(temp_dir, "script.py")
 
738
 
739
- try:
740
- # Create temp dir
741
- os.makedirs(temp_dir, exist_ok=True)
742
-
743
- # Write script
744
- with open(script_path, "w") as f:
745
- f.write(code)
746
-
747
- # Execute
748
- proc = await asyncio.create_subprocess_exec(
749
- "python3", script_path,
750
- stdout=asyncio.subprocess.PIPE,
751
- stderr=asyncio.subprocess.PIPE
752
- )
753
 
 
 
754
  try:
755
- stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
756
- return {
757
- "stdout": stdout.decode("utf-8", errors="ignore"),
758
- "stderr": stderr.decode("utf-8", errors="ignore")
759
- }
760
- except asyncio.TimeoutError:
761
- proc.kill()
762
- return {"error": "Execution timed out"}
763
- except Exception as e:
764
- logger.error(f"Code execution failed: {e}")
765
- return {"error": str(e)}
766
- finally:
767
- self._cleanup(temp_dir)
+ # multimodal_superagent.py
+ """
+ Multimodal SuperAgent v6.0 — Single-file master implementation
+ COPY & PASTE into multimodal_superagent.py
+
+ WHAT'S NEW vs v5.0 (upgrade, not a rewrite):
+ - Close-to-Human Brain (CHB) layer: the only module allowed to speak to the user.
+ - Gathers evidence from memory, web, tools, models.
+ - Verifies claims with a Versioned Fact Store (VFS).
+ - Resolves contradictions (old vs new knowledge).
+ - Computes confidence; only answers when >= 0.90. Otherwise returns the certain subset.
+ - Versioned Fact Store (JSON): timelines of facts, supersedes history, provenance & citations.
+ - Confidence Scorer: reliability + recency + agreement + self-consistency + retrieval strength – contradictions.
+ - Web search: local HTML scraping with 24h cache (title/url/snippet<=100 chars). No paid tokens needed.
+ - Multimodal perception glue:
+ - Emojis/emotion tags for text (lightweight).
+ - Image caption hook (BLIP/ViT if configured) → stored as evidence.
+ - Video analyzer (keyframes). Basic FPS “interpolation” without external binaries (frame duplication).
+ - Safer math & schema checks during verification (SymPy + simple validators).
+ - Long-term memory, KG, agents, image & voice pipelines retained and improved with better error paths.
+ - Citations and confidence optionally attached to replies (text mode).
+ - Runs CPU or GPU (auto-detect). No hard dependencies on cloud APIs.
+ """
+
+ from __future__ import annotations
+ import os, sys, time, json, uuid, shutil, tempfile, subprocess, logging, math, hashlib, re
+ from dataclasses import dataclass, field, asdict
+ from typing import Any, Dict, List, Optional, Tuple, Union
  import asyncio
30
 
31
+ # --- lazy imports for heavy libs; imported when needed ---
32
+ try:
33
+ import torch
34
+ except Exception:
35
+ torch = None
36
+ try:
37
+ import numpy as np
38
+ except Exception:
39
+ np = None
40
+
41
+ # Logging
42
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
43
+ logger = logging.getLogger("SuperAgent")
44
+
45
+ # ---------------------------
46
+ # CONFIG — edit BEFORE running
47
+ # ---------------------------
48
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
49
+ TMP_DIR = os.getenv("SUPERAGENT_TMP", os.path.join(BASE_DIR, "tmp"))
50
+ CACHE_DIR = os.getenv("SUPERAGENT_CACHE", os.path.join(BASE_DIR, "model_cache"))
51
+ os.makedirs(TMP_DIR, exist_ok=True)
52
+ os.makedirs(CACHE_DIR, exist_ok=True)
53
+
54
+ # Central configuration: change model paths here or provide config.yaml and set env SUPERAGENT_CONFIG
55
+ MODEL_CONFIG = {
56
+ "device": "auto", # "auto" | "cpu" | "cuda"
57
+ "llm": { # prefer local HF model id or path; optionally set 'backend' to 'llamacpp'
58
+ "model_path": None, # e.g., "/path/to/Mistral-7B-Instruct"
59
+ "backend": "transformers", # "transformers" | "llamacpp"
60
+ "quantize": None, # e.g., "bitsandbytes" or GGUF for llamacpp
61
+ },
62
+ "embedder": "sentence-transformers/all-MiniLM-L6-v2",
63
+ "faiss_index_dir": os.path.join(CACHE_DIR, "faiss_index"),
64
+ "sdxl_base": None, # e.g., "/path/to/sdxl-base"
65
+ "sdxl_refiner": None, # optional
66
+ "sdxl_inpaint": None, # optional
67
+ "blip_caption": None, # e.g., "Salesforce/blip-image-captioning-base" (optional)
68
+ "piper_binary": "/usr/local/bin/piper", # optional
69
+ "piper_voice": None, # optional
70
+ "allow_web_search": False, # opt-in
71
+ "safety_blocklist": ["terror", "explosive", "harm"],
72
+ "knowledge_graph_path": os.path.join(CACHE_DIR, "kg.json"),
73
+ "memory_persist": True,
74
+ "memory_file": os.path.join(CACHE_DIR, "longterm_memory.json"),
75
+ "embed_dim": 384, # MiniLM-L6-v2
76
+ "auto_learn": True,
77
+ "vfs_file": os.path.join(CACHE_DIR, "versioned_facts.json"),
78
+ "web_cache_file": os.path.join(CACHE_DIR, "webcache.json"),
79
+ "chb_min_confidence": 0.90,
80
+ "max_memory_items": 10000,
81
+ "kg_max_nodes": 500,
82
+ "short_memory_turns": 50,
83
+ }
84
+
85
+ # ---------------------------
86
+ # Helpers
87
+ # ---------------------------
88
+ def uid(prefix="asset"):
89
+ return f"{prefix}_{uuid.uuid4().hex[:8]}"
90
+
91
+ def tmp_path(suffix=""):
92
+ p = os.path.join(TMP_DIR, f"{uuid.uuid4().hex}{suffix}")
93
+ os.makedirs(os.path.dirname(p), exist_ok=True)
94
+ return p
95
+
96
+ def cleanup(*paths):
97
+ for p in paths:
98
  try:
99
+ if not p: continue
100
+ if os.path.isfile(p): os.remove(p)
101
+ elif os.path.isdir(p): shutil.rmtree(p)
102
  except Exception as e:
103
+ logger.debug("cleanup failed %s: %s", p, e)
104
+
105
+ def now_ts():
106
+ return float(time.time())
107
+
108
+ # ---------------------------
109
+ # Provenance & Watermark
110
+ # ---------------------------
111
+ @dataclass
112
+ class Provenance:
113
+ model: str
114
+ version: str
115
+ synthetic: bool = True
116
+ consent: Optional[str] = None
117
+ created_at: float = field(default_factory=now_ts)
118
+ extra: Dict[str, Any] = field(default_factory=dict)
119
+
120
+ class ProvenanceManager:
121
+ def attach(self, filepath: str, meta: Union[Provenance, dict]):
 
 
122
  try:
123
+ meta_out = asdict(meta) if isinstance(meta, Provenance) else meta
124
+ with open(filepath + ".prov.json", "w", encoding="utf-8") as f:
125
+ json.dump(meta_out, f, indent=2, ensure_ascii=False)
 
 
 
 
 
 
 
 
126
  except Exception as e:
127
+ logger.warning("provenance attach failed: %s", e)
128
+
129
+ def watermark_audio(self, wav_bytes: bytes) -> bytes:
130
+ # Non-destructive placeholder; does not alter content.
131
+ return wav_bytes
132
+
133
+ def watermark_image(self, pil_img):
134
+ # Return as-is. Replace with robust watermark if desired.
135
+ return pil_img
136
+
137
+ # ---------------------------
138
+ # Safety & Consent
139
+ # ---------------------------
140
+ @dataclass
141
+ class ConsentRecord:
142
+ user_id: str
143
+ attestation: str
144
+ ts: float = field(default_factory=now_ts)
145
+
146
+ class SafetyManager:
147
+ def __init__(self, blocklist=None):
148
+ self.blocklist = blocklist or MODEL_CONFIG["safety_blocklist"]
149
+ self.consent_log: List[ConsentRecord] = []
150
+
151
+ def record_consent(self, user_id: str, attestation: str) -> str:
152
+ token = hashlib.sha256(f"{user_id}-{attestation}-{time.time()}".encode()).hexdigest()
153
+ self.consent_log.append(ConsentRecord(user_id=user_id, attestation=attestation))
154
+ return token
155
+
156
+ def is_allowed(self, text: str) -> bool:
157
+ t = (text or "").lower()
158
+ return not any(b in t for b in self.blocklist)
159
+
160
+ def check_public_figure(self, embedding) -> bool:
161
+ # Placeholder: implement real similarity against protected embeddings if available
162
+ return False
163
+
164
+ # ---------------------------
165
+ # Short-term memory (conversation buffer)
166
+ # ---------------------------
167
+ class ShortTermMemory:
168
+ def __init__(self, max_turns=None):
169
+ self.max_turns = max_turns or MODEL_CONFIG["short_memory_turns"]
170
+ self.store: Dict[str, List[Dict[str,Any]]] = {}
171
+
172
+ def push(self, user_id: str, role: str, content: Any):
173
+ k = str(user_id)
174
+ arr = self.store.setdefault(k, [])
175
+ arr.append({"ts":now_ts(), "role":role, "content":content})
176
+ if len(arr) > self.max_turns:
177
+ self.store[k] = arr[-self.max_turns:]
178
+
179
+ def recent(self, user_id: str, k=10):
180
+ return self.store.get(str(user_id), [])[-k:]
181
+
182
+ # ---------------------------
183
+ # Long-term memory (FAISS + Sentence-Transformers)
184
+ # ---------------------------
185
+ class LongTermMemory:
186
+ def __init__(self, index_dir=None, embed_model_name=None):
187
+ self.index_dir = index_dir or MODEL_CONFIG["faiss_index_dir"]
188
+ os.makedirs(self.index_dir, exist_ok=True)
189
+ self.memfile = MODEL_CONFIG.get("memory_file")
190
+ self.embed_model_name = embed_model_name or MODEL_CONFIG["embedder"]
191
+ self.data: List[Dict[str,Any]] = []
192
+ self.index = None
193
+ self.embedder = None
194
+ self.dim = MODEL_CONFIG.get("embed_dim", 384)
195
+ self._load()
196
+
197
+ def _load(self):
198
+ if os.path.exists(self.memfile):
199
+ try:
200
+ with open(self.memfile, "r", encoding="utf-8") as f:
201
+ self.data = json.load(f)
202
+ except Exception:
203
+ self.data = []
204
+ # load faiss if available
205
  try:
206
+ import faiss # type: ignore
207
+ idx_path = os.path.join(self.index_dir, "index.faiss")
208
+ if os.path.exists(idx_path):
209
+ try:
210
+ self.index = faiss.read_index(idx_path)
211
+ except Exception:
212
+ self.index = None
213
+ else:
214
+ self.index = None
215
+ except Exception:
216
+ self.index = None
217
+
218
+ def _save(self):
219
+ with open(self.memfile, "w", encoding="utf-8") as f:
220
+ json.dump(self.data, f, indent=2, ensure_ascii=False)
221
+ if self.index is not None:
222
+ try:
223
+ import faiss # type: ignore
224
+ faiss.write_index(self.index, os.path.join(self.index_dir, "index.faiss"))
225
+ except Exception:
226
+ pass
227
 
228
+ def _get_embedder(self):
229
+ if self.embedder is None:
230
+ try:
231
+ from sentence_transformers import SentenceTransformer
232
+ self.embedder = SentenceTransformer(self.embed_model_name)
233
+ except Exception as e:
234
+ logger.debug("embedder load failed: %s", e)
235
+ self.embedder = None
236
+ return self.embedder
237
 
238
+ def _ensure_index(self, dim:int):
239
+ if self.index is None:
240
+ try:
241
+ import faiss # type: ignore
242
+ self.index = faiss.IndexFlatL2(dim)
243
+ except Exception:
244
+ self.index = None
245
+
246
+ def add(self, user_id: str, text: str, kind: str="turn"):
247
+ if not text: return
248
+ # prune if oversized
249
+ if len(self.data) >= MODEL_CONFIG["max_memory_items"]:
250
+ self.data = self.data[int(0.1*len(self.data)):] # drop oldest 10%
251
+ item = {"id": uid("m"), "user_id": str(user_id), "text": text, "kind": kind, "ts": now_ts()}
252
+ self.data.append(item)
253
+ emb = self._get_embedder()
254
+ if emb is not None:
255
+ vec = emb.encode([text])
256
+ import numpy as _np
257
+ self._ensure_index(vec.shape[1])
258
+ if self.index is not None:
259
+ self.index.add(_np.asarray(vec, dtype="float32"))
260
+ if MODEL_CONFIG.get("memory_persist"):
261
+ self._save()
262
+
263
+ def search(self, query: str, top_k=5) -> List[Dict[str,Any]]:
264
+ if not self.data:
265
+ return []
266
+ emb = self._get_embedder()
267
+ if emb is None or self.index is None:
268
+ # fallback naive search
269
+ q = (query or "").lower()
270
+ scored=[]
271
+ for i,m in enumerate(self.data):
272
+ s = sum(1 for tok in q.split() if tok in m["text"].lower())
273
+ if s>0: scored.append((s,i))
274
+ scored.sort(reverse=True)
275
+ return [self.data[i] for _,i in scored[:top_k]]
276
+ qv = emb.encode([query])
277
+ import numpy as _np
278
+ D,I = self.index.search(_np.asarray(qv, dtype="float32"), min(top_k, len(self.data)))
279
+ res=[]
280
+ used=set()
281
+ for idx in I[0]:
282
+ if 0 <= idx < len(self.data) and int(idx) not in used:
283
+ used.add(int(idx))
284
+ res.append(self.data[int(idx)])
285
+ return res
286
+
287
+ def export_all(self):
288
+ return {"count": len(self.data), "items": self.data}
289
+
290
+ def import_bulk(self, items: List[Dict[str,Any]]):
291
+ self.data = items or []
292
+ emb = self._get_embedder()
293
+ if emb is not None and self.data:
294
+ vecs = emb.encode([m["text"] for m in self.data])
295
+ import numpy as _np
296
+ self._ensure_index(vecs.shape[1])
297
+ if self.index is not None:
298
+ self.index.reset()
299
+ self.index.add(_np.asarray(vecs, dtype="float32"))
300
+ self._save()
301
+
302
+ # ---------------------------
303
+ # Knowledge Graph (simple JSON triples)
304
+ # ---------------------------
305
+ class KnowledgeGraph:
306
+ def __init__(self, path=None):
307
+ self.path = path or MODEL_CONFIG["knowledge_graph_path"]
308
+ self.graph = {}
309
+ self._load()
310
+
311
+ def _load(self):
312
+ if os.path.exists(self.path):
313
+ try:
314
+ with open(self.path,"r",encoding="utf-8") as f:
315
+ self.graph = json.load(f)
316
+ except Exception:
317
+ self.graph = {}
318
+
319
+ def add_fact(self, subj: str, pred: str, obj: str):
320
+ # cap nodes to avoid unbounded growth
321
+ if len(self.graph) > MODEL_CONFIG["kg_max_nodes"]:
322
+ # basic prune: drop oldest 10% of keys by insertion order
323
+ keys = list(self.graph.keys())
324
+ for k in keys[:max(1, len(keys)//10)]:
325
+ self.graph.pop(k, None)
326
+ k = f"{subj}::{pred}"
327
+ self.graph.setdefault(k, []).append(obj)
328
+ self._save()
329
+
330
+ def query(self, subj: str, pred: str):
331
+ return self.graph.get(f"{subj}::{pred}", [])
332
+
333
+ def _save(self):
334
  try:
335
+ with open(self.path,"w",encoding="utf-8") as f:
336
+ json.dump(self.graph, f, indent=2, ensure_ascii=False)
 
 
 
 
 
 
 
 
 
337
  except Exception as e:
338
+ logger.debug("kg save failed: %s", e)
339
+
340
+ # ---------------------------
341
+ # Versioned Fact Store (VFS)
342
+ # ---------------------------
343
+ class VersionedFactStore:
344
+ """
345
+ Stores facts with versioning & provenance.
346
+ Fact schema:
347
+ {
348
+ "id": str, "claim": str, "value": str, "scope": str|None,
349
+ "first_seen": ts, "verified_at": ts, "confidence": float,
350
+ "sources": [{"type": "web|memory|kg|vision|audio|manual", "ref": str, "title": str|None, "time": ts}],
351
+ "supersedes": str|None, "valid_from": ts|None, "valid_to": ts|None
352
+ }
353
+ """
354
+ def __init__(self, path=None):
355
+ self.path = path or MODEL_CONFIG["vfs_file"]
356
+ self.facts: List[Dict[str,Any]] = []
357
+ self._load()
358
+
359
+ def _load(self):
360
+ if os.path.exists(self.path):
361
+ try:
362
+ with open(self.path, "r", encoding="utf-8") as f:
363
+ self.facts = json.load(f)
364
+ except Exception:
365
+ self.facts = []
366
 
367
+ def _save(self):
368
  try:
369
+ with open(self.path, "w", encoding="utf-8") as f:
370
+ json.dump(self.facts, f, indent=2, ensure_ascii=False)
 
 
 
 
 
 
 
 
371
  except Exception as e:
372
+ logger.debug("vfs save failed: %s", e)
373
+
374
+ def add_or_update(self, claim: str, value: str, sources: List[Dict[str,Any]], confidence: float,
375
+ scope: Optional[str]=None, supersedes: Optional[str]=None,
376
+ valid_from: Optional[float]=None, valid_to: Optional[float]=None) -> Dict[str,Any]:
377
+ new_id = uid("fact")
378
+ rec = {
379
+ "id": new_id, "claim": claim, "value": value, "scope": scope,
380
+ "first_seen": now_ts(), "verified_at": now_ts(), "confidence": float(confidence),
381
+ "sources": sources or [], "supersedes": supersedes,
382
+ "valid_from": valid_from, "valid_to": valid_to
383
+ }
384
+ self.facts.append(rec)
385
+ self._save()
386
+ return rec
387
+
388
+ def find(self, claim: str) -> List[Dict[str,Any]]:
389
+ c = claim.strip().lower()
390
+ return [f for f in self.facts if f.get("claim","").strip().lower()==c]
391
+
392
+ def latest(self, claim: str) -> Optional[Dict[str,Any]]:
393
+ items = self.find(claim)
394
+ if not items: return None
395
+ # return the most recent verified_at
396
+ return sorted(items, key=lambda x: x.get("verified_at", 0), reverse=True)[0]
397
+
398
+ def all(self) -> List[Dict[str,Any]]:
399
+ return self.facts
400
+
401
+ # ---------------------------
402
+ # Web search (HTML scrape + 24h cache)
403
+ # ---------------------------
404
+ class WebSearch:
405
+ def __init__(self, enabled: bool=False, cache_file: Optional[str]=None):
406
+ self.enabled = bool(enabled)
407
+ self.cache_file = cache_file or MODEL_CONFIG["web_cache_file"]
408
+ self.cache = {}
409
+ self._load_cache()
410
+
411
+ def _load_cache(self):
412
+ if os.path.exists(self.cache_file):
413
+ try:
414
+ with open(self.cache_file, "r", encoding="utf-8") as f:
415
+ self.cache = json.load(f)
416
+ except Exception:
417
+ self.cache = {}
418
 
419
+ def _save_cache(self):
420
  try:
421
+ with open(self.cache_file, "w", encoding="utf-8") as f:
422
+ json.dump(self.cache, f, indent=2, ensure_ascii=False)
423
+ except Exception:
424
+ pass
425
+
426
+ def search(self, query: str, max_results: int=3) -> List[Dict[str,str]]:
427
+ if not self.enabled:
428
+ return []
429
+ key = hashlib.sha1(query.encode()).hexdigest()
430
+ # 24h TTL
431
+ if key in self.cache and (now_ts() - self.cache[key]["ts"]) < 86400:
432
+ return self.cache[key]["hits"][:max_results]
 
 
433
  try:
434
+ import requests
435
+ from bs4 import BeautifulSoup
436
+ url = f"https://duckduckgo.com/html/?q={requests.utils.quote(query)}"
437
+ headers = {"User-Agent":"Mozilla/5.0"}
438
+ r = requests.get(url, headers=headers, timeout=8)
439
+ soup = BeautifulSoup(r.text, "html.parser")
440
+ results=[]
441
+ # capture title/link/snippet (≤100 chars)
442
+ containers = soup.select(".result")[:max_results]
443
+ for c in containers:
444
+ a = c.select_one(".result__a")
445
+ s = c.select_one(".result__snippet")
446
+ title = a.get_text(strip=True) if a else ""
447
+ link = a.get("href") if a else ""
448
+ snippet = (s.get_text(" ", strip=True) if s else "")[:100]
449
+ if title and link:
450
+ results.append({"title": title, "link": link, "snippet": snippet})
451
+ self.cache[key] = {"ts": now_ts(), "hits": results}
452
+ self._save_cache()
453
+ return results
454
  except Exception as e:
455
+ logger.debug("websearch failed: %s", e)
456
+ return []
457
+
458
+ # ---------------------------
459
+ # LLM wrapper (transformers / llama.cpp)
460
+ # ---------------------------
461
+ class LocalLLM:
462
+ def __init__(self, model_path=None, backend="transformers", device="auto", quantize=None):
463
+ self.model_path = model_path
464
+ self.backend = backend
465
+ self.quantize = quantize
466
+ self.device = device if device!="auto" else ("cuda" if torch and torch.cuda.is_available() else "cpu")
467
+ self.model = None
468
+ self.tokenizer = None
469
+ self._loaded = False
470
+
471
+ def load(self):
472
+ if self._loaded: return
473
+ if self.backend == "transformers":
474
+ try:
475
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
476
+ if not self.model_path:
477
+ raise ValueError("MODEL_CONFIG['llm']['model_path'] not set")
478
+ logger.info("Loading transformers model %s", self.model_path)
479
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, cache_dir=CACHE_DIR)
480
+ # device_map="auto" keeps it simple; will use GPU if available
481
+ self.model = AutoModelForCausalLM.from_pretrained(
482
+ self.model_path, device_map="auto", cache_dir=CACHE_DIR
483
+ )
484
+ self._loaded = True
485
+ except Exception as e:
486
+ logger.error("transformers LLM load failed: %s", e)
487
+ self._loaded = False
488
+ elif self.backend == "llamacpp":
489
+ try:
490
+ import llama_cpp
491
+ if not self.model_path:
492
+ raise ValueError("MODEL_CONFIG['llm']['model_path'] not set")
493
+ self.model = llama_cpp.Llama(model_path=self.model_path)
494
+ self._loaded = True
495
+ except Exception as e:
496
+ logger.error("llamacpp load failed: %s", e)
497
+ self._loaded = False
498
+ else:
499
+ logger.error("Unknown backend %s", self.backend)
500
+ self._loaded = False
501
+
502
+ def ready(self) -> bool:
503
+ return self._loaded
504
+
505
+ def generate(self, prompt: str, max_tokens: int=256, temperature: float=0.7) -> str:
506
+ if not self._loaded:
507
+ raise RuntimeError("LLM not loaded")
508
+ if self.backend == "transformers":
509
+ try:
510
+ inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
511
+ out = self.model.generate(
512
+ **inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature,
513
+ pad_token_id=self.tokenizer.eos_token_id
514
+ )
515
+ return self.tokenizer.decode(out[0], skip_special_tokens=True)
516
+ except Exception as e:
517
+ logger.error("LLM generation failed: %s", e)
518
+ return "[llm-error]"
519
+ elif self.backend == "llamacpp":
520
+ try:
521
+ result = self.model.create(prompt=prompt, max_tokens=max_tokens, temperature=temperature)
522
+ if isinstance(result, dict) and "choices" in result:
523
+ return result["choices"][0]["text"]
524
+ return str(result)
525
+ except Exception as e:
526
+ logger.error("llamacpp generation failed: %s", e)
527
+ return "[llm-error]"
528
+ else:
529
+ return "[backend-not-implemented]"
530
+
531
+ # ---------------------------
532
+ # Image engine (SDXL/diffusers + caption)
533
+ # ---------------------------
534
+ class ImageEngine:
535
+ def __init__(self, base=None, refiner=None, inpaint=None, blip=None, device="auto"):
536
+ self.paths = {"base": base, "refiner": refiner, "inpaint": inpaint, "blip": blip}
537
+ self.device = device if device!="auto" else ("cuda" if torch and torch.cuda.is_available() else "cpu")
538
+ self.pipe = None
539
+ self.refiner = None
540
+ self.inpaint_pipe = None
541
+ self.captioner = None
542
+
543
+ def load_pipes(self):
544
+ if self.pipe is None and self.paths.get("base"):
545
+ try:
546
+ from diffusers import StableDiffusionXLPipeline
547
+ dtype = torch.float16 if self.device=="cuda" else torch.float32
548
+ self.pipe = StableDiffusionXLPipeline.from_pretrained(self.paths["base"], torch_dtype=dtype, cache_dir=CACHE_DIR)
549
+ self.pipe.to(self.device)
550
+ except Exception as e:
551
+ logger.debug("sdxl base load failed: %s", e)
552
+ self.pipe = None
553
+ if self.refiner is None and self.paths.get("refiner"):
554
+ try:
555
+ from diffusers import StableDiffusionXLImg2ImgPipeline
556
+ dtype = torch.float16 if self.device=="cuda" else torch.float32
557
+ self.refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(self.paths["refiner"], torch_dtype=dtype, cache_dir=CACHE_DIR)
558
+ self.refiner.to(self.device)
559
+ except Exception as e:
560
+ logger.debug("refiner load failed: %s", e)
561
+ self.refiner = None
562
+ if self.inpaint_pipe is None and self.paths.get("inpaint"):
563
+ try:
564
+ from diffusers import StableDiffusionXLInpaintPipeline
565
+ dtype = torch.float16 if self.device=="cuda" else torch.float32
566
+ self.inpaint_pipe = StableDiffusionXLInpaintPipeline.from_pretrained(self.paths["inpaint"], torch_dtype=dtype, cache_dir=CACHE_DIR)
567
+ self.inpaint_pipe.to(self.device)
568
+ except Exception as e:
569
+ logger.debug("inpaint load failed: %s", e)
570
+ self.inpaint_pipe = None
571
+ if self.captioner is None and self.paths.get("blip"):
572
+ try:
573
+ from transformers import pipeline
574
+ device_idx = 0 if self.device=="cuda" else -1
575
+ self.captioner = pipeline("image-to-text", model=self.paths["blip"], device=device_idx, cache_dir=CACHE_DIR)
576
+ except Exception as e:
577
+ logger.debug("captioner load failed: %s", e)
578
+ self.captioner = None
579
+
580
+ def generate(self, prompt: str, negative: Optional[str]=None, steps:int=30, width:int=1024, height:int=1024, seed:Optional[int]=None):
581
+ self.load_pipes()
582
+ if not self.pipe:
583
+ raise RuntimeError("SDXL text2img pipeline unavailable. Provide path in MODEL_CONFIG['sdxl_base']")
584
+ generator = None
585
+ if torch and seed is not None:
586
+ generator = torch.Generator(device=self.device); generator.manual_seed(int(seed))
587
+ out = self.pipe(prompt=prompt, negative_prompt=negative or "", num_inference_steps=steps,
588
+ width=width, height=height, generator=generator).images[0]
589
+ if self.refiner:
590
+ try:
591
+ out = self.refiner(prompt=prompt, image=out, strength=0.2).images[0]
592
+ except Exception:
593
+ pass
594
+ path = tmp_path(".png")
595
+ out.save(path)
596
+ return path
597
 
598
+ def inpaint(self, image_path: str, mask_path: Optional[str], prompt: str=""):
599
+ self.load_pipes()
600
+ if not self.inpaint_pipe:
601
+ raise RuntimeError("Inpaint pipeline not available.")
602
+ from PIL import Image
603
+ init = Image.open(image_path).convert("RGB")
604
+ mask = Image.open(mask_path).convert("L") if mask_path else Image.new("L", init.size, 255)
605
+ res = self.inpaint_pipe(prompt=prompt or " ", image=init, mask_image=mask, guidance_scale=7.5, num_inference_steps=30)
606
+ out = tmp_path("_inpaint.png"); res.images[0].save(out); return out
607
+
608
+ def caption(self, image_path: str) -> str:
609
+ self.load_pipes()
610
+ if not self.captioner: return ""
611
+ from PIL import Image
612
+ img = Image.open(image_path).convert("RGB")
613
  try:
614
+ return self.captioner(img)[0].get("generated_text","")
615
+ except Exception:
616
+ return ""
617
+
618
+ # ---------------------------
619
+ # Video engine (keyframes) + functional FPS "interpolation"
620
+ # ---------------------------
621
+ class VideoEngine:
622
+ def __init__(self):
623
+ pass
624
+
625
+ def analyze(self, video_path: str, max_frames:int=6):
 
 
626
  try:
627
+ import imageio
628
+ from PIL import Image
629
+ reader = imageio.get_reader(video_path)
630
+ total = reader.count_frames()
631
+ step = max(1, total // max_frames) if total else 1
632
+ frames=[]
633
+ for i in range(0,total,step):
634
+ try:
635
+ arr = reader.get_data(i)
636
+ p = tmp_path(f"_frame{i}.jpg")
637
+ Image.fromarray(arr).save(p)
638
+ frames.append(p)
639
+ if len(frames) >= max_frames: break
640
+ except Exception:
641
+ continue
642
+ return {"frames": frames, "count": len(frames)}
 
 
643
  except Exception as e:
 
644
  return {"error": str(e)}
 
 
 
 
 
 
 
645
 
646
+ def interpolate_fps(self, video_path: str, factor:int=2) -> Dict[str,Any]:
647
+ """
648
+ Functional, dependency-light interpolation by frame duplication.
649
+ This increases frame count (and apparent FPS) without optical flow.
650
+ Produces a new MP4 if imageio-ffmpeg is available; else returns extracted frames.
651
+ """
652
  try:
653
+ import imageio, imageio.v3 as iio
654
+ reader = imageio.get_reader(video_path)
655
+ fps = reader.get_meta_data().get("fps", 24)
656
+ new_fps = max(1, int(fps*factor))
657
+ out_path = tmp_path(".mp4")
658
+ try:
659
+ writer = imageio.get_writer(out_path, fps=new_fps)
660
+ for frame in reader:
661
+ # duplicate frames "factor" times
662
+ for _ in range(factor):
663
+ writer.append_data(frame)
664
+ writer.close()
665
+ return {"status":"ok", "output": out_path, "fps": new_fps}
666
+ except Exception as e:
667
+ return {"status":"partial", "reason": f"writer failed: {e}"}
 
 
 
 
 
 
 
668
  except Exception as e:
669
+ return {"status":"error", "reason": str(e)}
670
+
671
+ # ---------------------------
672
+ # Voice engine (register, embed, TTS)
673
+ # ---------------------------
674
+ class VoiceEngine:
675
+ def __init__(self, provenance: ProvenanceManager, safety: SafetyManager, piper_bin=None, piper_voice=None):
676
+ self.prov = provenance
677
+ self.safety = safety
678
+ self.piper_bin = piper_bin
679
+ self.piper_voice = piper_voice
680
+ self.profiles: Dict[str,Dict[str,Any]] = {}
681
+
682
+ def extract_embedding(self, wav_path: str) -> Optional[List[float]]:
683
+ try:
684
+ with open(wav_path,"rb") as f:
685
+ b = f.read()
686
+ h = hashlib.sha256(b).digest()
687
+ return [float(x)/255.0 for x in h[:192]]
688
+ except Exception:
689
+ return None
690
+
691
+ def register(self, user_id: str, wav_path: str, consent_text: str, block_public=True):
692
+ token = self.safety.record_consent(user_id, consent_text)
693
+ emb = self.extract_embedding(wav_path)
694
+ if block_public and self.safety.check_public_figure(emb):
695
+ return {"status":"rejected","reason":"protected_speaker"}
696
+ vid = f"voice_{hashlib.sha1((user_id+str(time.time())).encode()).hexdigest()[:10]}"
697
+ self.profiles[vid] = {"user_id": user_id, "embedding": emb, "consent": token}
698
+ return {"status":"ok", "voice_id": vid}
699
+
700
+ def synthesize(self, voice_id:Optional[str], text:str, emotion:Optional[str]=None, rate:float=1.0, fmt:str="wav"):
701
+ if not self.safety.is_allowed(text):
702
+ return None
703
+ out = tmp_path(f".{fmt}")
704
+ # Prefer Piper if configured (local CLI)
705
+ if self.piper_bin and self.piper_voice and os.path.exists(self.piper_bin) and os.path.exists(self.piper_voice):
706
+ try:
707
+ p = subprocess.Popen([self.piper_bin, "-m", self.piper_voice, "-f", out],
708
+ stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
709
+ p.communicate(input=text, timeout=60)
710
+ if os.path.exists(out):
711
+ with open(out,"rb") as f:
712
+ b = f.read()
713
+ b2 = self.prov.watermark_audio(b)
714
+ with open(out,"wb") as f: f.write(b2)
715
+ self.prov.attach(out, Provenance(model="piper", version="1.0",
716
+ consent=self.profiles.get(voice_id, {}).get("consent")))
717
+ return out
718
+ except Exception as e:
719
+ logger.debug("piper fail: %s", e)
720
+ # Fallback pyttsx3
721
  try:
722
+ import pyttsx3
723
+ eng = pyttsx3.init()
724
+ try: eng.setProperty("rate", int(200*rate))
725
+ except Exception: pass
726
+ eng.save_to_file(text, out); eng.runAndWait()
727
+ with open(out,"rb") as f:
728
+ b = f.read()
729
+ b2 = self.prov.watermark_audio(b)
730
+ with open(out,"wb") as f: f.write(b2)
731
+ self.prov.attach(out, Provenance(model="pyttsx3", version="1.0"))
732
+ return out
733
  except Exception as e:
734
+ logger.debug("pyttsx3 fail: %s", e)
735
+ # fallback: silent wav (keeps pipeline functional)
 
 
 
736
  try:
737
+ import wave, struct
738
+ fr=16000; dur=max(1, min(5, int(len(text)/10)))
739
+ with wave.open(out,'w') as wf:
740
+ wf.setnchannels(1); wf.setsampwidth(2); wf.setframerate(fr)
741
+ for _ in range(fr*dur): wf.writeframes(struct.pack('<h',0))
742
+ self.prov.attach(out, Provenance(model="silence", version="0.1"))
743
+ return out
744
  except Exception:
745
+ return None
 
 
746
 
747
+# ---------------------------
+# MathEngine (SymPy)
+# ---------------------------
+class MathEngine:
+    def __init__(self):
         try:
+            import sympy as sp
+            self.sp = sp
+        except Exception:
+            self.sp = None
+
+    def solve(self, expr: str) -> str:
+        if not self.sp: return "[sympy missing]"
         try:
+            parsed = self.sp.sympify(expr)
+            return str(self.sp.simplify(parsed))
         except Exception as e:
+            return f"[math-error] {e}"
+
+    def eval_numeric(self, expr: str) -> str:
+        if not self.sp: return "[sympy missing]"
         try:
+            return str(self.sp.N(self.sp.sympify(expr)))
         except Exception as e:
+            return f"[math-error] {e}"
+
+    def safe_eval(self, text: str) -> Optional[str]:
+        # detect simple math patterns like "2+2", "sqrt(2)", etc.
+        if re.fullmatch(r"[0-9\.\+\-\*\/\(\)\s^sqrtpiE]+", text.replace("**", "^")):
+            return self.eval_numeric(text)
+        return None
+
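+# Worked example (sketch, assuming SymPy is installed):
+#     MathEngine().solve("2*x + 4*x")        # -> "6*x"
+#     MathEngine().eval_numeric("2*(3+4)")   # -> "14.0000000000000"
+#     MathEngine().safe_eval("sqrt(2)")      # -> numeric string, or None if the pattern is rejected
+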
+# ---------------------------
+# Code Sandbox (lightweight)
+# ---------------------------
+class CodeSandbox:
+    def __init__(self):
+        pass
+
+    def run(self, code: str, timeout: int = 5) -> Dict[str, Any]:
+        # VERY light sandbox; for real isolation use subprocess + rlimits in your env.
         try:
+            import io, contextlib
+            buf = io.StringIO()
+            ns = {}
+            with contextlib.redirect_stdout(buf):
+                exec(code, {"__builtins__": {"print": print, "range": range, "len": len}}, ns)
+            out = buf.getvalue()
+            return {"stdout": out, "keys": list(ns.keys())}
         except Exception as e:
             return {"error": str(e)}
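+
+# Example (sketch): the sandbox captures stdout and returns the top-level names the code
+# defined. Note that the 'timeout' argument is accepted but not enforced by this
+# lightweight implementation.
+#
+#     CodeSandbox().run("x = 2 + 2\nprint(x)")
+#     # -> {"stdout": "4\n", "keys": ["x"]}
+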
+# ---------------------------
+# Agents
+# ---------------------------
+class Agent:
+    def __init__(self, name: str, model: LocalLLM, tools: Dict[str, Any] = None):
+        self.name = name
+        self.model = model
+        self.tools = tools or {}
+
+    def act(self, prompt: str, max_tokens=256, temperature=0.7):
+        try:
+            if not self.model.ready():
+                self.model.load()
+            return self.model.generate(prompt, max_tokens=max_tokens, temperature=temperature)
         except Exception as e:
+            logger.error("Agent %s failed: %s", self.name, e)
+            return f"[{self.name}-error] {e}"
+
+class AgentHub:
+    def __init__(self, llm: LocalLLM, img_engine: ImageEngine, vid_engine: VideoEngine):
+        self.research = Agent("ResearchAgent", llm, {"web": None})
+        self.coder = Agent("CoderAgent", llm, {"exec": CodeSandbox()})
+        self.designer = Agent("DesignerAgent", llm, {"image": img_engine})
+        self.vid = Agent("VideoAgent", llm, {"video": vid_engine})
+
+    def coordinate(self, user_request: str):
+        if any(k in user_request.lower() for k in ["code", "implement", "script"]):
+            return self.coder.act(f"Write code for: {user_request}", max_tokens=512)
+        if any(k in user_request.lower() for k in ["design", "image", "generate"]):
+            return self.designer.act(f"Create an image plan: {user_request}", max_tokens=256)
+        return self.research.act(f"Research and summarize: {user_request}", max_tokens=512)
+
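+# Routing sketch (not executed), where `hub` is an AgentHub instance: coordinate() picks an
+# agent from plain keyword matching, falling back to research:
+#     hub.coordinate("implement a merge sort script")  # -> CoderAgent
+#     hub.coordinate("generate a poster design")       # -> DesignerAgent
+#     hub.coordinate("compare two battery chemistries")  # -> ResearchAgent (default)
+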
+# ---------------------------
+# Perception (lightweight multimodal glue)
+# ---------------------------
+class Perception:
+    EMOJI_MAP = {
+        "😂": "joy", "😅": "relief", "😭": "sad", "😡": "anger", "😍": "affection", "😎": "confident",
+        "🤔": "thinking", "😴": "tired", "🙄": "annoyed", "😇": "polite", "😁": "happy", "🤣": "joy"
+    }
+
+    def text_emotion_tags(self, text: str) -> List[str]:
+        tags = set()
+        for ch in text:
+            if ch in self.EMOJI_MAP:
+                tags.add(self.EMOJI_MAP[ch])
+        if re.search(r"\b(sad|upset|angry|frustrated)\b", text.lower()):
+            tags.add("negative")
+        if re.search(r"\b(happy|great|awesome|love)\b", text.lower()):
+            tags.add("positive")
+        return sorted(tags)
+
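+# Example (sketch): Perception().text_emotion_tags("I love this 😂")  # -> ['joy', 'positive']
+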
+# ---------------------------
+# Confidence Scorer
+# ---------------------------
+class ConfidenceScorer:
+    def score(self, *,
+              source_reliability: float,
+              recency: float,
+              agreement: float,
+              self_consistency: float,
+              retrieval_strength: float,
+              contradictions: float) -> float:
+        # conservative weighting; each in [0,1]
+        w1, w2, w3, w4, w5, w6 = 0.22, 0.18, 0.18, 0.18, 0.14, 0.10
+        s = (w1 * source_reliability +
+             w2 * recency +
+             w3 * agreement +
+             w4 * self_consistency +
+             w5 * retrieval_strength -
+             w6 * contradictions)
+        return max(0.0, min(1.0, s))
+
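+# Worked example: with source_reliability=0.65, recency=0.9, agreement=1.0,
+# self_consistency=0.8, retrieval_strength=0.6 and contradictions=0.0, the weighted sum is
+# 0.22*0.65 + 0.18*0.9 + 0.18*1.0 + 0.18*0.8 + 0.14*0.6 = 0.713, so score() returns ~0.713.
+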
+# ---------------------------
+# Close-to-Human Brain (CHB)
+# ---------------------------
+class CloseToHumanBrain:
+    def __init__(self, llm: LocalLLM, longmem: LongTermMemory, kg: KnowledgeGraph,
+                 vfs: VersionedFactStore, web: WebSearch, img: ImageEngine):
+        self.llm = llm
+        self.long = longmem
+        self.kg = kg
+        self.vfs = vfs
+        self.web = web
+        self.img = img
+        self.math = MathEngine()
+        self.perc = Perception()
+        self.scorer = ConfidenceScorer()
+
+    # ---- Retrieval
+    def retrieve(self, user_text: str, k: int = 5) -> Tuple[str, List[Dict[str, str]]]:
+        mem_hits = self.long.search(user_text, top_k=min(8, k))
+        mem_ctx = "\n".join([h["text"] for h in mem_hits]) if mem_hits else ""
+        web_hits = self.web.search(user_text, max_results=3) if self.web.enabled else []
+        return mem_ctx, web_hits
+
+    # ---- Drafting
+    def multi_draft(self, prompt_base: str, drafts: int = 3, max_tokens: int = 384) -> List[str]:
+        outs = []
+        temps = [0.4, 0.7, 1.0][:max(1, drafts)]
+        if not self.llm.ready():
+            self.llm.load()
+        for t in temps:
+            out = self.llm.generate(prompt_base, max_tokens=max_tokens, temperature=t)
+            outs.append(out)
+        return outs
+
+    # ---- Verification helpers
+    def _estimate_reliability(self, sources: List[Dict[str, Any]]) -> float:
+        if not sources: return 0.4
+        rel = 0.0
+        for s in sources:
+            t = s.get("type", "")
+            if t == "memory": rel += 0.6
+            elif t == "kg": rel += 0.7
+            elif t == "web": rel += 0.65
+            elif t in ("vision", "audio", "video"): rel += 0.55
+            else: rel += 0.5
+        return min(1.0, rel / max(1, len(sources)))
+
+    def _recency(self, sources: List[Dict[str, Any]]) -> float:
+        if not sources: return 0.3
+        ages = []
+        now = now_ts()
+        for s in sources:
+            ts = s.get("time") or now
+            ages.append(max(0.0, now - ts))
+        avg = sum(ages) / len(ages)
+        # map age (in seconds) to [0,1] with simple decay (~1 day half-life)
+        day = 86400.0
+        return max(0.0, min(1.0, 1.0 / (1.0 + (avg / day))))
+
+    def _agreement(self, claims: List[str]) -> float:
+        # crude token overlap agreement
+        if not claims: return 0.0
+        base = set(re.findall(r"\w+", claims[0].lower()))
+        agree = 1
+        for c in claims[1:]:
+            toks = set(re.findall(r"\w+", c.lower()))
+            if len(base & toks) > 0: agree += 1
+        return agree / len(claims)
+
+    def _self_consistency(self, drafts: List[str]) -> float:
+        # measure average pairwise Jaccard of word sets
+        if not drafts: return 0.0
+        sets = [set(re.findall(r"\w+", d.lower())) for d in drafts]
+        if len(sets) == 1: return 1.0
+        pair_scores = []
+        for i in range(len(sets)):
+            for j in range(i + 1, len(sets)):
+                a, b = sets[i], sets[j]
+                inter = len(a & b); union = len(a | b) or 1
+                pair_scores.append(inter / union)
+        return sum(pair_scores) / len(pair_scores)
+
+    def _retrieval_strength(self, mem_ctx: str) -> float:
+        if not mem_ctx: return 0.4
+        # simple function of context length
+        L = len(mem_ctx.split())
+        return max(0.4, min(1.0, math.log10(1 + L) / 2))
+
+    # ---- Verify & Synthesize
+    def verify_and_respond(self, user_id: str, user_text: str,
+                           preferred_lang: Optional[str] = None) -> Dict[str, Any]:
+        # Perception
+        emotion_tags = self.perc.text_emotion_tags(user_text)
+
+        # Retrieve evidence
+        mem_ctx, web_hits = self.retrieve(user_text, k=6)
+        web_ctx = "\n".join([f"{h['title']} ({h['link']}) — {h.get('snippet','')}" for h in web_hits]) if web_hits else ""
+        citations = [{"type": "web", "ref": h["link"], "title": h["title"], "time": now_ts()} for h in web_hits]
+
+        # Construct base prompt
+        prompt = (
+            "Persona: helpful, precise assistant.\n"
+            "Use given memory and web snippets as *evidence*.\n"
+            "If a claim is uncertain, narrow it or state limits.\n\n"
+            f"Memory Evidence:\n{mem_ctx}\n\nWeb Evidence:\n{web_ctx}\n\n"
+            f"User: {user_text}\nAssistant:"
+        )
+
+        # Multi-draft
+        drafts = self.multi_draft(prompt, drafts=3, max_tokens=512)
+
+        # Self-consistency + basic math check (if any plain expression present)
+        math_value = None
+        m = re.search(r"(?:calculate|solve)\s*([0-9\.\+\-\*\/\(\)\s^sqrtpiE]+)", user_text, re.I)
+        if m:
+            math_value = self.math.safe_eval(m.group(1))
+
+        # Build sources list (memory + web)
+        sources = citations[:]
+        if mem_ctx:
+            sources.append({"type": "memory", "ref": "longterm_memory", "title": "long-term memory", "time": now_ts()})
+
+        # Score
+        source_rel = self._estimate_reliability(sources)
+        recency = self._recency(sources)
+        agree = self._agreement(drafts)
+        self_cons = self._self_consistency(drafts)
+        retr = self._retrieval_strength(mem_ctx)
+        contradictions = 0.0
+
+        # Compare with VFS (detect contradictions for simple exact match claims)
+        # We extract simple "X is Y" patterns from drafts; very lightweight.
+        simple_claims = []
+        for d in drafts:
+            for sent in re.split(r"[.\n]", d):
+                m2 = re.search(r"^([\w\s\-]{3,})\s+is\s+([\w\s\-\%\.]{2,})$", sent.strip(), re.I)
+                if m2:
+                    c = f"{m2.group(1).strip()} is"
+                    v = m2.group(2).strip()
+                    simple_claims.append((c, v))
+        # check against latest
+        for c, v in simple_claims:
+            latest = self.vfs.latest(c)
+            if latest and latest.get("value") and latest["value"].strip().lower() != v.lower():
+                contradictions += 0.5  # penalize disagreement with stored fact
+
+        conf = self.scorer.score(
+            source_reliability=source_rel,
+            recency=recency,
+            agreement=agree,
+            self_consistency=self_cons,
+            retrieval_strength=retr,
+            contradictions=contradictions
+        )
+
+        # Choose the most concise draft
+        best = min(drafts, key=lambda s: len(s) if s else 1e9)
+
+        # If math was requested & computed, splice it in with highest certainty
+        if math_value and "[math-error]" not in math_value and "sympy missing" not in math_value:
+            best = f"{best}\n\nMath check: {math_value}"
+
+        # If confidence < threshold, trim to certain subset
+        min_conf = MODEL_CONFIG["chb_min_confidence"]
+        if conf < min_conf:
+            # Provide narrowed/certain answer: we extract sentences with highest overlap across drafts.
+            sent_scores = []
+            sents = [s.strip() for s in re.split(r"(?<=[\.\!\?])\s+", best) if s.strip()]
+            for s in sents:
+                count = sum(1 for d in drafts if s.lower() in d.lower())
+                sent_scores.append((count, s))
+            sent_scores.sort(reverse=True)
+            certain = " ".join([s for cnt, s in sent_scores if cnt >= 2])  # present in >=2 drafts
+            if not certain:
+                certain = "I'm not fully confident. Here's what is most certain from the evidence I have."
+            best = certain
+
+        # Record any simple claims to VFS as new knowledge (with provenance)
+        for c, v in simple_claims[:3]:
+            self.vfs.add_or_update(claim=c, value=v, sources=sources, confidence=float(conf))
+
+        # Build final message with optional citations
+        if citations:
+            cites = "\n".join([f"- {c['title']} — {c['ref']}" for c in citations])
+            best_out = f"{best}\n\nConfidence: {conf:.2f}\nSources:\n{cites}"
+        else:
+            best_out = f"{best}\n\nConfidence: {conf:.2f}"
+
+        return {"reply": best_out, "confidence": conf, "citations": citations}
+
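+# Pipeline sketch (not executed): one verify_and_respond() call retrieves memory/web
+# evidence, drafts three candidate answers at different temperatures, scores them, and
+# returns a dict of the shape
+#     {"reply": "<answer + confidence + sources>", "confidence": <0.0-1.0>, "citations": [...]}
+# The concrete reply text depends on the configured local LLM, so the shape above is the
+# only stable contract.
+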
+# ---------------------------
+# Orchestrator (ties everything with CHB)
+# ---------------------------
+class SuperAgent:
+    def __init__(self, config: dict):
+        self.cfg = config
+        self.prov = ProvenanceManager()
+        self.safety = SafetyManager(blocklist=config.get("safety_blocklist"))
+        self.short = ShortTermMemory()
+        self.long = LongTermMemory(index_dir=config.get("faiss_index_dir"), embed_model_name=config.get("embedder"))
+        self.kg = KnowledgeGraph(config.get("knowledge_graph_path"))
+        self.vfs = VersionedFactStore(config.get("vfs_file"))
+        self.web = WebSearch(enabled=bool(config.get("allow_web_search")), cache_file=config.get("web_cache_file"))
+        # LLM
+        self.llm = LocalLLM(model_path=config.get("llm", {}).get("model_path"),
+                            backend=config.get("llm", {}).get("backend", "transformers"),
+                            device=config.get("device", "auto"),
+                            quantize=config.get("llm", {}).get("quantize", None))
+        # image & video engines
+        self.image = ImageEngine(base=config.get("sdxl_base"), refiner=config.get("sdxl_refiner"),
+                                 inpaint=config.get("sdxl_inpaint"), blip=config.get("blip_caption"),
+                                 device=config.get("device", "auto"))
+        self.video = VideoEngine()
+        self.voice = VoiceEngine(self.prov, self.safety, piper_bin=config.get("piper_binary"), piper_voice=config.get("piper_voice"))
+        # agents
+        self.agents = AgentHub(self.llm, self.image, self.video)
+        # Close-to-Human Brain
+        self.chb = CloseToHumanBrain(self.llm, self.long, self.kg, self.vfs, self.web, self.image)
+
+    def detect_intent(self, text: str) -> str:
+        t = (text or "").lower().strip()
+        if t.startswith("/img ") or t.startswith("/image "): return "image"
+        if t.startswith("/inpaint "): return "inpaint"
+        if t.startswith("/tts "): return "tts"
+        if t.startswith("/video "): return "video"
+        if t.startswith("/vidinterp "): return "vidinterp"
+        if t.startswith("/kg "): return "kg"
+        if t.startswith("/agent "): return "agent"
+        if any(k in t for k in ["solve", "calculate", "integrate", "differentiate"]): return "math"
+        return "chat"
+
+    def handle(self, user_id: str, text: str, preferred_lang: Optional[str] = None) -> Dict[str, Any]:
+        if not self.safety.is_allowed(text):
+            return {"status": "blocked", "reason": "policy"}
+        self.short.push(user_id, "user", text)
+
+        intent = self.detect_intent(text)
+        reply = ""
+        payload: Dict[str, Any] = {}
+
         try:
+            if intent == "math":
+                me = self.chb.math
+                expr = re.sub(r"^(solve|calculate)\s*", "", text, flags=re.I).strip()
+                res = {"exact": me.solve(expr), "numeric": me.eval_numeric(expr)}
+                reply = json.dumps(res, ensure_ascii=False, indent=2)
+
+            elif intent == "image":
+                prompt = text.split(" ", 1)[1] if " " in text else text
+                path = self.image.generate(prompt=prompt)
+                reply = f"[image] {path}"
+
+            elif intent == "inpaint":
+                reply = "Use /v1/image/inpaint API with 'image' and optional 'mask' files."
+
+            elif intent == "tts":
+                content = text.split(" ", 1)[1] if " " in text else text
+                path = self.voice.synthesize(None, content)
+                reply = f"[tts] {path}"
+
+            elif intent == "video":
+                reply = "Upload video via /v1/video/analyze to extract keyframes."
+
+            elif intent == "vidinterp":
+                parts = text.split(" ", 2)
+                factor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 2
+                reply = f"Use /v1/video/interpolate (factor={factor}) with a video file."
+
+            elif intent == "kg":
+                reply = "Use /v1/kg endpoints (not exposed in this single file demo) or extend as needed."
+
+            elif intent == "agent":
+                task = text.split(" ", 1)[1] if " " in text else ""
+                out = self.agents.coordinate(task)
+                # CHB is the final boss: pass agent output through CHB to verify/format
+                verified = self.chb.verify_and_respond(user_id, f"{task}\n\nAgentDraft:\n{out}", preferred_lang)
+                reply = verified["reply"]
+                payload.update({"confidence": verified["confidence"]})
+
+            else:
+                # Default conversational path goes through CHB (final arbiter)
+                verified = self.chb.verify_and_respond(user_id, text, preferred_lang)
+                reply = verified["reply"]
+                payload.update({"confidence": verified["confidence"]})
+
+        except Exception as e:
+            logger.error("Handle failed: %s", e)
+            reply = f"[error] {e}"
+
+        # Learning: store turn + (optionally) extracted facts handled inside CHB
+        if MODEL_CONFIG.get("auto_learn"):
             try:
+                self.long.add(user_id, text, kind="turn")
+            except Exception as e:
+                logger.debug("long-term add failed: %s", e)
+
+        self.short.push(user_id, "assistant", reply)
+        return {"status": "ok", "reply": reply, **payload}
+
+# ---------------------------
+# FastAPI + Gradio integration
+# ---------------------------
+try:
+    from fastapi import FastAPI, UploadFile, File, Form
+    from fastapi.responses import FileResponse, JSONResponse
+    from pydantic import BaseModel
+    _FASTAPI = True
+except Exception:
+    _FASTAPI = False
+
+try:
+    import gradio as gr
+    _GRADIO = True
+except Exception:
+    _GRADIO = False
+
+app = FastAPI(title="Multimodal SuperAgent") if _FASTAPI else None
+_AGENT_SINGLETON: Optional[SuperAgent] = None
+
+def get_agent() -> SuperAgent:
+    global _AGENT_SINGLETON
+    if _AGENT_SINGLETON is None:
+        # merge env config file if present
+        cfg_path = os.environ.get("SUPERAGENT_CONFIG")
+        cfg = MODEL_CONFIG.copy()
+        if cfg_path and os.path.exists(cfg_path):
+            try:
+                import yaml
+                with open(cfg_path, "r", encoding="utf-8") as f:
+                    y = yaml.safe_load(f) or {}
+                cfg.update(y)
+            except Exception:
+                pass
+        _AGENT_SINGLETON = SuperAgent(cfg)
+    return _AGENT_SINGLETON
+
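+# Config sketch: SUPERAGENT_CONFIG may point at a YAML file whose keys override
+# MODEL_CONFIG. The keys below are ones SuperAgent.__init__ reads; the values are
+# placeholders, not recommendations:
+#
+#     # superagent.yaml
+#     allow_web_search: false
+#     device: auto
+#     llm:
+#       model_path: /models/your-local-model
+#       backend: transformers
+#       quantize: null
+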
+if _FASTAPI and app is not None:
+    class ChatIn(BaseModel):
+        user_id: str
+        text: str
+        preferred_lang: Optional[str] = None
+
+    @app.post("/v1/chat")
+    async def api_chat(inp: ChatIn):
+        return get_agent().handle(inp.user_id, inp.text, inp.preferred_lang)
+
+    @app.post("/v1/register_voice")
+    async def api_reg_voice(user_id: str = Form(...), consent: str = Form(...), file: UploadFile = File(...)):
+        tmpf = tmp_path("_voice.wav")
+        with open(tmpf, "wb") as f:
+            f.write(await file.read())
+        return get_agent().voice.register(user_id, tmpf, consent)
+
+    @app.post("/v1/tts")
+    async def api_tts(voice_id: Optional[str] = Form(None), text: str = Form(...), fmt: str = Form("wav")):
+        p = get_agent().voice.synthesize(voice_id, text, fmt=fmt)
+        if p and os.path.exists(p):
+            return FileResponse(p, media_type="audio/wav")
+        return JSONResponse({"error": "tts_failed"})
+
+    @app.post("/v1/image/generate")
+    async def api_img_gen(prompt: str = Form(...)):
+        p = get_agent().image.generate(prompt)
+        if p and os.path.exists(p):
+            return FileResponse(p, media_type="image/png")
+        return JSONResponse({"error": "image_unavailable"})
+
+    @app.post("/v1/image/inpaint")
+    async def api_img_inpaint(prompt: str = Form(...), image: UploadFile = File(...), mask: UploadFile = File(None)):
+        img_tmp = tmp_path("_img")
+        with open(img_tmp, "wb") as f:
+            f.write(await image.read())
+        mask_tmp = None
+        if mask:
+            mask_tmp = tmp_path("_mask")
+            open(mask_tmp, "wb").write(await mask.read())
+        p = get_agent().image.inpaint(img_tmp, mask_tmp, prompt)
+        if p and os.path.exists(p):
+            return FileResponse(p, media_type="image/png")
+        return JSONResponse({"error": "inpaint_failed"})
+
+    @app.post("/v1/video/analyze")
+    async def api_vid(file: UploadFile = File(...)):
+        tmpf = tmp_path("_vid")
+        with open(tmpf, "wb") as f:
+            f.write(await file.read())
+        return get_agent().video.analyze(tmpf)
+
+    @app.post("/v1/video/interpolate")
+    async def api_vid_interp(factor: int = Form(2), file: UploadFile = File(...)):
+        tmpf = tmp_path("_vid")
+        with open(tmpf, "wb") as f:
+            f.write(await file.read())
+        return get_agent().video.interpolate_fps(tmpf, factor=max(2, int(factor)))
+
+    @app.post("/v1/memory/export")
+    async def mem_export():
+        return get_agent().long.export_all()
+
+    @app.post("/v1/memory/import")
+    async def mem_import(items: List[Dict[str, Any]]):
+        get_agent().long.import_bulk(items)
+        return {"status": "ok", "count": len(items)}
+
+    @app.post("/v1/web/toggle")
+    async def web_toggle(enabled: bool = Form(...)):
+        get_agent().web.enabled = bool(enabled)
+        return {"enabled": get_agent().web.enabled}
+
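+# HTTP usage sketch (assuming the app is served on localhost:8000; see the run notes at the
+# end of the file):
+#
+#     curl -X POST localhost:8000/v1/chat \
+#          -H "Content-Type: application/json" \
+#          -d '{"user_id": "user1", "text": "calculate 2*(3+4)"}'
+#
+#     curl -X POST localhost:8000/v1/tts -F text="hello there" -o hello.wav
+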
+# ---------------------------
+# Optional Gradio demo (runs when module executed)
+# ---------------------------
+def launch_gradio():
+    if not _GRADIO:
+        logger.warning("Gradio not installed")
+        return
+    agent = get_agent()
+    with gr.Blocks(title="Multimodal SuperAgent") as demo:
+        gr.Markdown("# Multimodal SuperAgent v6.0 (CHB)")
+        with gr.Row():
+            user_id = gr.Textbox(value="user1", label="User ID")
+            prompt = gr.Textbox(label="Prompt")
+        btn = gr.Button("Send")
+        out = gr.Markdown(label="Reply")
+
+        def send(u, p):
+            res = agent.handle(u, p)
+            return res.get("reply", "")
+
+        btn.click(send, [user_id, prompt], out)
+    demo.launch(server_name="0.0.0.0", server_port=7860)
+
+# ---------------------------
+# CLI
+# ---------------------------
+if __name__ == "__main__":
+    import argparse
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--demo", action="store_true")
+    ap.add_argument("--gradio", action="store_true")
+    args = ap.parse_args()
+    if args.demo:
+        a = get_agent()
+        print("Chat:", a.handle("user1", "Hello 😁, calculate 2*(3+4).")["reply"])
+        print("Math:", a.handle("user1", "solve 2*x+1=5")["reply"])
+        print("Interp hint:", a.handle("user1", "/vidinterp 2")["reply"])
+    if args.gradio:
+        launch_gradio()
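+
+# Run sketch (assumptions: the file is importable as multimodal_module and uvicorn is installed):
+#     python multimodal_module.py --demo        # scripted smoke test of the chat/math paths
+#     python multimodal_module.py --gradio      # Gradio UI on port 7860
+#     uvicorn multimodal_module:app --port 8000  # REST API (only when FastAPI is available)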