Update multimodal_module.py

multimodal_module.py  CHANGED  (+1253, -717)
@@ -1,767 +1,1303 @@
-#
-import os
-import shutil
-
-import asyncio
-import logging
-from datetime import datetime
-from typing import Dict, List, Optional, Any, Union
-import uuid
-import numpy as np
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger("MultiModalModule")
-
-# Space-specific environment configuration
-os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-# Core ML Imports
-import torch
-from transformers import (
-    pipeline,
-    AutoModelForSeq2SeqLM,
-    AutoTokenizer,
-    Wav2Vec2Processor,
-    Wav2Vec2ForSequenceClassification,
-    AutoModelForCausalLM
-)
-from diffusers import (
-    StableDiffusionPipeline,
-    StableDiffusionInpaintPipeline
-)
-from huggingface_hub import hf_hub_download, snapshot_download
-
-# Audio Processing
-import librosa
-import soundfile as sf
-from gtts import gTTS
-import speech_recognition as sr
-import webrtcvad
-
-# Image/Video Processing
-from PIL import Image
-import imageio
-import imageio_ffmpeg
-import moviepy.editor as mp
-import cv2
-
-# Document Processing
-import fitz  # PyMuPDF
-from langdetect import detect, DetectorFactory
-DetectorFactory.seed = 0
-
-# Configuration
-USE_SAFETY_CHECKER = False
-MAX_HISTORY_LENGTH = 100
-TEMP_DIR = "tmp"
-MODEL_CACHE_DIR = "model_cache"
-
-class MultiModalChatModule:
-    """Complete multimodal module optimized for Hugging Face Spaces"""
-
-    def __init__(self, chat_history_file: str = "chat_histories.pkl"):
-        """Initialize with Space optimizations"""
-        # Create required directories
-        os.makedirs(TEMP_DIR, exist_ok=True)
-        os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
-
-        # Device configuration
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.torch_dtype = torch.float16 if "cuda" in self.device else torch.float32
-        logger.info(f"Initialized on {self.device.upper()} with dtype {self.torch_dtype}")
-
-        # Model registry
-        self.model_names = {
-            "voice_emotion_processor": "facebook/hubert-large-ls960-ft",
-            "voice_emotion_model": "superb/hubert-base-superb-er",
-            "translation_model": "facebook/nllb-200-distilled-600M",
-            "chatbot_tokenizer": "facebook/blenderbot-400M-distill",
-            "chatbot_model": "facebook/blenderbot-400M-distill",
-            "image_captioner": "Salesforce/blip-image-captioning-base",
-            "sd_inpaint": "runwayml/stable-diffusion-inpainting",
-            "sd_text2img": "runwayml/stable-diffusion-v1-5",
-            "code_model": "bigcode/starcoder",
-        }
-
-        # Model placeholders
-        self._voice_processor = None
-        self._voice_emotion_model = None
-        self._translator = None
-        self._chat_tokenizer = None
-        self._chat_model = None
-        self._image_captioner = None
-        self._sd_pipe = None
-        self._sd_inpaint = None
-        self._code_tokenizer = None
-        self._code_model = None
-
-        # Helpers
-        self._sr_recognizer = sr.Recognizer()
-        self.vad = webrtcvad.Vad(3)
-        self.chat_history_file = chat_history_file
-        self.user_chat_histories = self._load_chat_histories()
-
-        # Load tracking
-        self._loaded = {
-            "voice": False,
-            "translation": False,
-            "chat": False,
-            "image_caption": False,
-            "sd": False,
-            "code": False,
-        }
-
-    # ----------------------
-    # Core Utilities
-    # ----------------------
-    def _tmp_path(self, suffix: str = "") -> str:
-        """Generate space-compatible temp file path"""
-        path = os.path.join(TEMP_DIR, f"{uuid.uuid4().hex}{suffix}")
-        os.makedirs(os.path.dirname(path), exist_ok=True)
-        return path
-
-    def _cleanup(self, *paths: str) -> None:
-        """Safely remove files/directories"""
-        for path in paths:
-            try:
-                if path and os.path.exists(path):
-                    if os.path.isfile(path):
-                        os.remove(path)
-                    elif os.path.isdir(path):
-                        shutil.rmtree(path)
-            except Exception as e:
-                logger.warning(f"Cleanup failed for {path}: {e}")
-
-        try:
-
-        except Exception as e:
-            logger.
-
-                "content": content,
-                "language": lang
-            })
-
-            # Enforce max history length
-            self.user_chat_histories[user_id] = self.user_chat_histories[user_id][-MAX_HISTORY_LENGTH:]
-            self._save_chat_histories()
-
-    # ----------------------
-    # Model Loading
-    # ----------------------
-    def _load_voice_models(self) -> None:
-        """Load voice processing models"""
-        if self._loaded["voice"]:
-            return
-
-        try:
-            self._voice_processor = Wav2Vec2Processor.from_pretrained(
-                self.model_names["voice_emotion_processor"],
-                cache_dir=MODEL_CACHE_DIR
-            )
-            self._voice_emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained(
-                self.model_names["voice_emotion_model"],
-                cache_dir=MODEL_CACHE_DIR
-            ).to(self.device)
-            self._loaded["voice"] = True
-            logger.info("Voice models loaded successfully")
-        except Exception as e:
-            logger.
-
-    def
-
-        try:
-
-    def
-
-        try:
-            self._chat_tokenizer = AutoTokenizer.from_pretrained(
-                self.model_names["chatbot_tokenizer"],
-                cache_dir=MODEL_CACHE_DIR
-            )
-            self._chat_model = AutoModelForSeq2SeqLM.from_pretrained(
-                self.model_names["chatbot_model"],
-                cache_dir=MODEL_CACHE_DIR
-            ).to(self.device)
-            self._loaded["chat"] = True
-            logger.info("Chatbot models loaded successfully")
-        except Exception as e:
-            logger.
-
-
-        try:
-
-            self._image_captioner = pipeline(
-                "image-to-text",
-                model=self.model_names["image_captioner"],
-                device=device,
-                cache_dir=MODEL_CACHE_DIR
-            )
-            self._loaded["image_caption"] = True
-            logger.info("Image captioner loaded successfully")
-        except Exception as e:
-            logger.
-
-    def
-
-
-        try:
-
-            self._sd_inpaint = StableDiffusionInpaintPipeline.from_pretrained(
-                self.model_names["sd_inpaint"],
-                torch_dtype=self.torch_dtype,
-                cache_dir=MODEL_CACHE_DIR
-            ).to(self.device)
-
-            self._loaded["sd"] = True
-            logger.info("Stable Diffusion models loaded successfully")
-        except Exception as e:
-            logger.error(f"Failed loading Stable Diffusion models: {e}")
-            self._sd_pipe = None
-            self._sd_inpaint = None
-
-    def _load_code_model(self) -> None:
-        """Load code generation model"""
-        if self._loaded["code"]:
-            return
-
-        try:
-
-            )
-
-        except Exception as e:
-            logger.
-
-            self.
-
-
-        try:
-
-            emotions = {
-                0: "happy", 1: "sad", 2: "angry",
-                3: "fearful", 4: "calm", 5: "surprised"
-            }
-            return emotions.get(torch.argmax(logits).item(), "unknown")
-        except Exception as e:
-            logger.error(f"Voice emotion analysis failed: {e}")
-            return "error"
-
-    async def process_voice_message(self, voice_file, user_id: int) -> Dict[str, Any]:
-        """Process voice message to text with emotion analysis"""
-        ogg_path = self._tmp_path(".ogg")
-        wav_path = self._tmp_path(".wav")
-
-        try:
-
-                (speech * 32767).astype(np.int16).tobytes(),
-                sample_rate=sr
-            )
-
-            # Transcription
-            text = ""
-            lang = "en"
-            if is_speech:
-                with sr.AudioFile(wav_path) as source:
-                    audio = self._sr_recognizer.record(source)
-                try:
-                    text = self._sr_recognizer.recognize_google(audio, language="en-US")
-                except sr.UnknownValueError:
-                    pass
-                except Exception as e:
-                    logger.warning(f"Speech recognition failed: {e}")
-
-            # Emotion analysis
-            emotion = await self.analyze_voice_emotion(wav_path) if is_speech else "no_speech"
-
-            # Update history
-            result = {
-                "text": text,
-                "language": lang,
-                "emotion": emotion,
-                "is_speech": is_speech
-            }
-            self._update_history(user_id, "user", result, lang)
-
-            return result
-        except Exception as e:
-            logger.error(f"Voice message processing failed: {e}")
-            return {"error": str(e)}
-        finally:
-            self._cleanup(ogg_path, wav_path)
-
-    async def generate_voice_reply(self, text: str, user_id: int, fmt: str = "ogg") -> str:
-        """Generate audio from text (TTS)"""
-        mp3_path = self._tmp_path(".mp3")
-        out_path = self._tmp_path(f".{fmt}")
-
-        try:
-
-            else:
-                shutil.move(mp3_path, out_path)
-
-            # Update history
-            self._update_history(user_id, "assistant", f"[Voice reply: {fmt}]")
-
-            return out_path
-        except Exception as e:
-
-            self.
-
-        try:
-
-            )
-
-        except Exception as e:
-            logger.
-
-        # Translate if needed
-        if lang != "en":
-            try:
-
-            except Exception:
-
-        # Update history
-        self._update_history(user_id, "assistant", response, lang)
-        return response
-
-    # ----------------------
-    # Image Processing
-    # ----------------------
-    async def process_image_message(self, image_file, user_id: int) -> str:
-        """Generate caption for an image"""
-        img_path = self._tmp_path(".jpg")
-
-        try:
-
-            # Generate caption
-            self._load_image_captioner()
-            caption = self._image_captioner(image)[0]["generated_text"]
-
-            # Update history
-            self._update_history(user_id, "user", "[Image]", "en")
-            self._update_history(user_id, "assistant", f"Image description: {caption}", "en")
-
-            return caption
-        except Exception as e:
-            logger.error(f"Image processing failed: {e}")
-            return f"Error processing image: {str(e)}"
-        finally:
-            self._cleanup(img_path)
-
-    async def generate_image_from_text(self, prompt: str, user_id: int,
-                                       width: int = 512, height: int = 512,
-                                       steps: int = 30) -> str:
-        """Generate image from text prompt"""
-        self._load_sd()
-        if not self._sd_pipe:
-            raise RuntimeError("Image generation unavailable")
-
-        out_path = self._tmp_path(".png")
-
-        try:
-            result = self._sd_pipe(
-                prompt,
-                num_inference_steps=steps,
-                height=height,
-                width=width
-            )
-            result.images[0].save(out_path)
-
-            # Update history
-            self._update_history(user_id, "user", f"[Image request: {prompt}]", "en")
-            self._update_history(user_id, "assistant", f"[Generated image]", "en")
-
-            return out_path
-        except Exception as e:
-
-            raise RuntimeError(f"Image generation failed: {e}")
-
-    async def edit_image_inpaint(self, image_file, mask_file=None,
-                                 prompt: str = "", user_id: int = 0) -> str:
-        """Edit image using inpainting"""
-        self._load_sd()
-        if not self._sd_inpaint:
-            raise RuntimeError("Image editing unavailable")
-
-        img_path = self._tmp_path(".png")
-        mask_path = self._tmp_path("_mask.png") if mask_file else None
-        out_path = self._tmp_path("_edited.png")
-
-        try:
-
-            await image_file.download_to_drive(img_path)
-            if mask_file:
-                await mask_file.download_to_drive(mask_path)
-
-            # Prepare images
-            init_image = Image.open(img_path).convert("RGB")
-            mask_image = Image.open(mask_path).convert("L") if mask_path else Image.new("L", init_image.size, 255)
-
-            # Inpaint
-            result = self._sd_inpaint(
-                prompt=prompt if prompt else " ",
-                image=init_image,
-                mask_image=mask_image,
-                guidance_scale=7.5,
-                num_inference_steps=30
-            )
-            result.images[0].save(out_path)
-
-            # Update history
-            self._update_history(user_id, "user", "[Image edit request]", "en")
-            self._update_history(user_id, "assistant", "[Edited image]", "en")
-
-            return out_path
-        except Exception as e:
-
-        try:
-
-            fps = clip.fps
-
-            # Transcribe audio
-            transcribed = ""
-            try:
-                with sr.AudioFile(audio_path) as source:
-                    audio = self._sr_recognizer.record(source)
-                    transcribed = self._sr_recognizer.recognize_google(audio)
-            except Exception as e:
-                logger.warning(f"Audio transcription failed: {e}")
-
-            # Extract frames
-            frames = []
-            captions = []
-            try:
-                reader = imageio.get_reader(vid_path)
-                total_frames = reader.count_frames()
-                step = max(1, total_frames // max_frames)
-
-                for i in range(0, total_frames, step):
-                    try:
-                        frame = reader.get_data(i)
-                        frame_path = self._tmp_path(f"_frame{i}.jpg")
-                        Image.fromarray(frame).save(frame_path)
-                        frames.append(frame_path)
-
-                        if len(frames) >= max_frames:
-                            break
-                    except Exception:
-                        continue
-
-                # Generate captions
-                if frames and self._load_image_captioner():
-                    for frame_path in frames:
-                        try:
-                            caption = self._image_captioner(Image.open(frame_path))[0]["generated_text"]
-                            captions.append(caption)
-                        except Exception:
-                            captions.append("")
-                        finally:
-                            self._cleanup(frame_path)
-            except Exception as e:
-                logger.warning(f"Frame extraction failed: {e}")
-
-            # Update history
-            result = {
-                "duration": duration,
-                "fps": fps,
-                "transcription": transcribed,
-                "captions": captions
-            }
-            self._update_history(user_id, "user", "[Video upload]", "en")
-            self._update_history(user_id, "assistant", result, "en")
-
-            return result
-        except Exception as e:
-            logger.error(f"Video processing failed: {e}")
-            return {"error": str(e)}
-        finally:
-            self._cleanup(vid_path, audio_path)
-
-    # ----------------------
-    # File Processing
-    # ----------------------
-
async def process_file(self, file_obj, user_id: int) -> Dict[str, Any]:
|
661 |
-
"""Process document files (PDF, DOCX, TXT)"""
|
662 |
-
fpath = self._tmp_path()
|
663 |
-
|
664 |
-
try:
|
665 |
-
# Save file
|
666 |
-
await file_obj.download_to_drive(fpath)
|
667 |
|
668 |
-
|
669 |
-
|
670 |
-
|
671 |
-
|
672 |
-
|
673 |
-
|
674 |
-
|
675 |
-
|
676 |
-
elif fpath.lower().endswith((".txt", ".csv")):
|
677 |
-
try:
|
678 |
-
with open(fpath, "r", encoding="utf-8", errors="ignore") as f:
|
679 |
-
text = f.read()
|
680 |
-
except Exception as e:
|
681 |
-
text = f"[Text error: {e}]"
|
682 |
-
elif fpath.lower().endswith(".docx"):
|
683 |
-
try:
|
684 |
-
import docx
|
685 |
-
doc = docx.Document(fpath)
|
686 |
-
text = "\n".join([p.text for p in doc.paragraphs])
|
687 |
-
except Exception as e:
|
688 |
-
text = f"[DOCX error: {e}]"
|
689 |
-
else:
|
690 |
-
text = "[Unsupported file type]"
|
691 |
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
697 |
-
"summary": summary,
|
698 |
-
"length": len(text),
|
699 |
-
"type": os.path.splitext(fpath)[1]
|
700 |
-
}
|
701 |
-
self._update_history(user_id, "user", f"[File upload: {result['type']}]", "en")
|
702 |
-
self._update_history(user_id, "assistant", result, "en")
|
703 |
-
|
704 |
-
return result
|
705 |
except Exception as e:
|
706 |
-
logger.error(
|
707 |
-
return {
|
708 |
-
|
709 |
-
|
710 |
-
|
711 |
-
|
712 |
-
|
713 |
-
|
714 |
-
|
715 |
-
|
716 |
-
|
717 |
-
|
718 |
-
|
719 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
720 |
|
721 |
try:
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
|
727 |
-
|
728 |
-
|
729 |
-
|
730 |
-
|
731 |
-
|
732 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
733 |
|
734 |
-
|
735 |
-
|
736 |
-
|
737 |
-
|
|
|
738 |
|
739 |
-
|
740 |
-
|
741 |
-
|
742 |
-
|
743 |
-
# Write script
|
744 |
-
with open(script_path, "w") as f:
|
745 |
-
f.write(code)
|
746 |
-
|
747 |
-
# Execute
|
748 |
-
proc = await asyncio.create_subprocess_exec(
|
749 |
-
"python3", script_path,
|
750 |
-
stdout=asyncio.subprocess.PIPE,
|
751 |
-
stderr=asyncio.subprocess.PIPE
|
752 |
-
)
|
753 |
|
|
|
|
|
754 |
try:
|
755 |
-
|
756 |
-
|
757 |
-
|
758 |
-
|
759 |
-
|
760 |
-
|
761 |
-
|
762 |
-
|
763 |
-
|
764 |
-
|
765 |
-
|
766 |
-
|
767 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+# multimodal_superagent.py
+"""
+Multimodal SuperAgent v6.0 — Single-file master implementation
+COPY & PASTE into multimodal_superagent.py
+
+WHAT'S NEW vs v5.0 (an upgrade, not a rewrite):
+- Close-to-Human Brain (CHB) layer: the only module allowed to speak to the user.
+  - Gathers evidence from memory, web, tools, and models.
+  - Verifies claims with a Versioned Fact Store (VFS).
+  - Resolves contradictions (old vs. new knowledge).
+  - Computes confidence; answers only when >= 0.90, otherwise returns the certain subset.
+- Versioned Fact Store (JSON): timelines of facts, supersedes history, provenance & citations.
+- Confidence Scorer: reliability + recency + agreement + self-consistency + retrieval strength - contradictions
+  (see the sketch after this docstring).
+- Web search: local HTML scraping with a 24h cache (title/url/snippet <= 100 chars). No paid tokens needed.
+- Multimodal perception glue:
+  - Emoji/emotion tags for text (lightweight).
+  - Image caption hook (BLIP/ViT if configured) → stored as evidence.
+  - Video analyzer (keyframes). Basic FPS “interpolation” without external binaries (frame duplication).
+- Safer math & schema checks during verification (SymPy + simple validators).
+- Long-term memory, KG, agents, and the image & voice pipelines retained and improved with better error paths.
+- Citations and confidence optionally attached to replies (text mode).
+- Runs on CPU or GPU (auto-detect). No hard dependencies on cloud APIs.
+"""
+
+from __future__ import annotations
+import os, sys, time, json, uuid, shutil, tempfile, subprocess, logging, math, hashlib, re
+from dataclasses import dataclass, field, asdict
+from typing import Any, Dict, List, Optional, Tuple, Union
+import asyncio
+
+# --- lazy imports for heavy libs; imported when needed ---
+try:
+    import torch
+except Exception:
+    torch = None
+try:
+    import numpy as np
+except Exception:
+    np = None
+
+# Logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
+logger = logging.getLogger("SuperAgent")
+
+# ---------------------------
+# CONFIG — edit BEFORE running
+# ---------------------------
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+TMP_DIR = os.getenv("SUPERAGENT_TMP", os.path.join(BASE_DIR, "tmp"))
+CACHE_DIR = os.getenv("SUPERAGENT_CACHE", os.path.join(BASE_DIR, "model_cache"))
+os.makedirs(TMP_DIR, exist_ok=True)
+os.makedirs(CACHE_DIR, exist_ok=True)
+
+# Central configuration: change model paths here or provide config.yaml and set env SUPERAGENT_CONFIG
+MODEL_CONFIG = {
+    "device": "auto",                        # "auto" | "cpu" | "cuda"
+    "llm": {                                 # prefer a local HF model id or path; optionally set 'backend' to 'llamacpp'
+        "model_path": None,                  # e.g., "/path/to/Mistral-7B-Instruct"
+        "backend": "transformers",           # "transformers" | "llamacpp"
+        "quantize": None,                    # e.g., "bitsandbytes", or GGUF for llamacpp
+    },
+    "embedder": "sentence-transformers/all-MiniLM-L6-v2",
+    "faiss_index_dir": os.path.join(CACHE_DIR, "faiss_index"),
+    "sdxl_base": None,                       # e.g., "/path/to/sdxl-base"
+    "sdxl_refiner": None,                    # optional
+    "sdxl_inpaint": None,                    # optional
+    "blip_caption": None,                    # e.g., "Salesforce/blip-image-captioning-base" (optional)
+    "piper_binary": "/usr/local/bin/piper",  # optional
+    "piper_voice": None,                     # optional
+    "allow_web_search": False,               # opt-in
+    "safety_blocklist": ["terror", "explosive", "harm"],
+    "knowledge_graph_path": os.path.join(CACHE_DIR, "kg.json"),
+    "memory_persist": True,
+    "memory_file": os.path.join(CACHE_DIR, "longterm_memory.json"),
+    "embed_dim": 384,                        # MiniLM-L6-v2
+    "auto_learn": True,
+    "vfs_file": os.path.join(CACHE_DIR, "versioned_facts.json"),
+    "web_cache_file": os.path.join(CACHE_DIR, "webcache.json"),
+    "chb_min_confidence": 0.90,
+    "max_memory_items": 10000,
+    "kg_max_nodes": 500,
+    "short_memory_turns": 50,
+}
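Everything below reads its settings from this dict, so pointing the agent at local models is a plain dict update before anything is constructed; a minimal sketch (both paths are placeholders, not real model locations):

MODEL_CONFIG["llm"]["model_path"] = "/models/mistral-7b-instruct"  # placeholder path
MODEL_CONFIG["sdxl_base"] = "/models/sdxl-base"                    # placeholder path
MODEL_CONFIG["allow_web_search"] = True                            # opt in to web scraping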
+
+# ---------------------------
+# Helpers
+# ---------------------------
+def uid(prefix="asset"):
+    return f"{prefix}_{uuid.uuid4().hex[:8]}"
+
+def tmp_path(suffix=""):
+    p = os.path.join(TMP_DIR, f"{uuid.uuid4().hex}{suffix}")
+    os.makedirs(os.path.dirname(p), exist_ok=True)
+    return p
+
+def cleanup(*paths):
+    for p in paths:
+        try:
+            if not p: continue
+            if os.path.isfile(p): os.remove(p)
+            elif os.path.isdir(p): shutil.rmtree(p)
+        except Exception as e:
+            logger.debug("cleanup failed %s: %s", p, e)
+
+def now_ts():
+    return float(time.time())
+
+# ---------------------------
+# Provenance & Watermark
+# ---------------------------
+@dataclass
+class Provenance:
+    model: str
+    version: str
+    synthetic: bool = True
+    consent: Optional[str] = None
+    created_at: float = field(default_factory=now_ts)
+    extra: Dict[str, Any] = field(default_factory=dict)
+
+class ProvenanceManager:
+    def attach(self, filepath: str, meta: Union[Provenance, dict]):
+        try:
+            meta_out = asdict(meta) if isinstance(meta, Provenance) else meta
+            with open(filepath + ".prov.json", "w", encoding="utf-8") as f:
+                json.dump(meta_out, f, indent=2, ensure_ascii=False)
+        except Exception as e:
+            logger.warning("provenance attach failed: %s", e)
+
+    def watermark_audio(self, wav_bytes: bytes) -> bytes:
+        # Non-destructive placeholder; does not alter content.
+        return wav_bytes
+
+    def watermark_image(self, pil_img):
+        # Return as-is. Replace with a robust watermark if desired.
+        return pil_img
+
+# ---------------------------
+# Safety & Consent
+# ---------------------------
+@dataclass
+class ConsentRecord:
+    user_id: str
+    attestation: str
+    ts: float = field(default_factory=now_ts)
+
+class SafetyManager:
+    def __init__(self, blocklist=None):
+        self.blocklist = blocklist or MODEL_CONFIG["safety_blocklist"]
+        self.consent_log: List[ConsentRecord] = []
+
+    def record_consent(self, user_id: str, attestation: str) -> str:
+        token = hashlib.sha256(f"{user_id}-{attestation}-{time.time()}".encode()).hexdigest()
+        self.consent_log.append(ConsentRecord(user_id=user_id, attestation=attestation))
+        return token
+
+    def is_allowed(self, text: str) -> bool:
+        t = (text or "").lower()
+        return not any(b in t for b in self.blocklist)
+
+    def check_public_figure(self, embedding) -> bool:
+        # Placeholder: implement real similarity against protected embeddings if available
+        return False
+
+# ---------------------------
+# Short-term memory (conversation buffer)
+# ---------------------------
+class ShortTermMemory:
+    def __init__(self, max_turns=None):
+        self.max_turns = max_turns or MODEL_CONFIG["short_memory_turns"]
+        self.store: Dict[str, List[Dict[str, Any]]] = {}
+
+    def push(self, user_id: str, role: str, content: Any):
+        k = str(user_id)
+        arr = self.store.setdefault(k, [])
+        arr.append({"ts": now_ts(), "role": role, "content": content})
+        if len(arr) > self.max_turns:
+            self.store[k] = arr[-self.max_turns:]
+
+    def recent(self, user_id: str, k=10):
+        return self.store.get(str(user_id), [])[-k:]
+
+# ---------------------------
+# Long-term memory (FAISS + Sentence-Transformers)
+# ---------------------------
+class LongTermMemory:
+    def __init__(self, index_dir=None, embed_model_name=None):
+        self.index_dir = index_dir or MODEL_CONFIG["faiss_index_dir"]
+        os.makedirs(self.index_dir, exist_ok=True)
+        self.memfile = MODEL_CONFIG.get("memory_file")
+        self.embed_model_name = embed_model_name or MODEL_CONFIG["embedder"]
+        self.data: List[Dict[str, Any]] = []
+        self.index = None
+        self.embedder = None
+        self.dim = MODEL_CONFIG.get("embed_dim", 384)
+        self._load()
+
+    def _load(self):
+        if os.path.exists(self.memfile):
+            try:
+                with open(self.memfile, "r", encoding="utf-8") as f:
+                    self.data = json.load(f)
+            except Exception:
+                self.data = []
+        # load faiss if available
+        try:
+            import faiss  # type: ignore
+            idx_path = os.path.join(self.index_dir, "index.faiss")
+            if os.path.exists(idx_path):
+                try:
+                    self.index = faiss.read_index(idx_path)
+                except Exception:
+                    self.index = None
+            else:
+                self.index = None
+        except Exception:
+            self.index = None
+
+    def _save(self):
+        with open(self.memfile, "w", encoding="utf-8") as f:
+            json.dump(self.data, f, indent=2, ensure_ascii=False)
+        if self.index is not None:
+            try:
+                import faiss  # type: ignore
+                faiss.write_index(self.index, os.path.join(self.index_dir, "index.faiss"))
+            except Exception:
+                pass
+
+    def _get_embedder(self):
+        if self.embedder is None:
+            try:
+                from sentence_transformers import SentenceTransformer
+                self.embedder = SentenceTransformer(self.embed_model_name)
+            except Exception as e:
+                logger.debug("embedder load failed: %s", e)
+                self.embedder = None
+        return self.embedder
+
+    def _ensure_index(self, dim: int):
+        if self.index is None:
+            try:
+                import faiss  # type: ignore
+                self.index = faiss.IndexFlatL2(dim)
+            except Exception:
+                self.index = None
+
+    def add(self, user_id: str, text: str, kind: str = "turn"):
+        if not text: return
+        # prune if oversized
+        if len(self.data) >= MODEL_CONFIG["max_memory_items"]:
+            self.data = self.data[int(0.1 * len(self.data)):]  # drop the oldest 10%
+        item = {"id": uid("m"), "user_id": str(user_id), "text": text, "kind": kind, "ts": now_ts()}
+        self.data.append(item)
+        emb = self._get_embedder()
+        if emb is not None:
+            vec = emb.encode([text])
+            import numpy as _np
+            self._ensure_index(vec.shape[1])
+            if self.index is not None:
+                self.index.add(_np.asarray(vec, dtype="float32"))
+        if MODEL_CONFIG.get("memory_persist"):
+            self._save()
+
+    def search(self, query: str, top_k=5) -> List[Dict[str, Any]]:
+        if not self.data:
+            return []
+        emb = self._get_embedder()
+        if emb is None or self.index is None:
+            # fallback naive keyword search
+            q = (query or "").lower()
+            scored = []
+            for i, m in enumerate(self.data):
+                s = sum(1 for tok in q.split() if tok in m["text"].lower())
+                if s > 0: scored.append((s, i))
+            scored.sort(reverse=True)
+            return [self.data[i] for _, i in scored[:top_k]]
+        qv = emb.encode([query])
+        import numpy as _np
+        D, I = self.index.search(_np.asarray(qv, dtype="float32"), min(top_k, len(self.data)))
+        res = []
+        used = set()
+        for idx in I[0]:
+            if 0 <= idx < len(self.data) and int(idx) not in used:
+                used.add(int(idx))
+                res.append(self.data[int(idx)])
+        return res
+
+    def export_all(self):
+        return {"count": len(self.data), "items": self.data}
+
+    def import_bulk(self, items: List[Dict[str, Any]]):
+        self.data = items or []
+        emb = self._get_embedder()
+        if emb is not None and self.data:
+            vecs = emb.encode([m["text"] for m in self.data])
+            import numpy as _np
+            self._ensure_index(vecs.shape[1])
+            if self.index is not None:
+                self.index.reset()
+                self.index.add(_np.asarray(vecs, dtype="float32"))
+        self._save()
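A usage sketch for the memory class above (editor's illustration, not part of this commit); when sentence-transformers or faiss is missing, search() degrades to the naive keyword scoring shown in its fallback branch:

ltm = LongTermMemory()
ltm.add("user1", "The user prefers metric units.", kind="preference")
for m in ltm.search("units preference", top_k=3):
    print(m["ts"], m["text"])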
+
+# ---------------------------
+# Knowledge Graph (simple JSON triples)
+# ---------------------------
+class KnowledgeGraph:
+    def __init__(self, path=None):
+        self.path = path or MODEL_CONFIG["knowledge_graph_path"]
+        self.graph = {}
+        self._load()
+
+    def _load(self):
+        if os.path.exists(self.path):
+            try:
+                with open(self.path, "r", encoding="utf-8") as f:
+                    self.graph = json.load(f)
+            except Exception:
+                self.graph = {}
+
+    def add_fact(self, subj: str, pred: str, obj: str):
+        # cap nodes to avoid unbounded growth
+        if len(self.graph) > MODEL_CONFIG["kg_max_nodes"]:
+            # basic prune: drop the oldest 10% of keys by insertion order
+            keys = list(self.graph.keys())
+            for k in keys[:max(1, len(keys) // 10)]:
+                self.graph.pop(k, None)
+        k = f"{subj}::{pred}"
+        self.graph.setdefault(k, []).append(obj)
+        self._save()
+
+    def query(self, subj: str, pred: str):
+        return self.graph.get(f"{subj}::{pred}", [])
+
+    def _save(self):
+        try:
+            with open(self.path, "w", encoding="utf-8") as f:
+                json.dump(self.graph, f, indent=2, ensure_ascii=False)
+        except Exception as e:
+            logger.debug("kg save failed: %s", e)
+
+# ---------------------------
+# Versioned Fact Store (VFS)
+# ---------------------------
+class VersionedFactStore:
+    """
+    Stores facts with versioning & provenance.
+    Fact schema:
+    {
+        "id": str, "claim": str, "value": str, "scope": str|None,
+        "first_seen": ts, "verified_at": ts, "confidence": float,
+        "sources": [{"type": "web|memory|kg|vision|audio|manual", "ref": str, "title": str|None, "time": ts}],
+        "supersedes": str|None, "valid_from": ts|None, "valid_to": ts|None
+    }
+    """
+    def __init__(self, path=None):
+        self.path = path or MODEL_CONFIG["vfs_file"]
+        self.facts: List[Dict[str, Any]] = []
+        self._load()
+
+    def _load(self):
+        if os.path.exists(self.path):
+            try:
+                with open(self.path, "r", encoding="utf-8") as f:
+                    self.facts = json.load(f)
+            except Exception:
+                self.facts = []
+
+    def _save(self):
+        try:
+            with open(self.path, "w", encoding="utf-8") as f:
+                json.dump(self.facts, f, indent=2, ensure_ascii=False)
+        except Exception as e:
+            logger.debug("vfs save failed: %s", e)
+
+    def add_or_update(self, claim: str, value: str, sources: List[Dict[str, Any]], confidence: float,
+                      scope: Optional[str] = None, supersedes: Optional[str] = None,
+                      valid_from: Optional[float] = None, valid_to: Optional[float] = None) -> Dict[str, Any]:
+        new_id = uid("fact")
+        rec = {
+            "id": new_id, "claim": claim, "value": value, "scope": scope,
+            "first_seen": now_ts(), "verified_at": now_ts(), "confidence": float(confidence),
+            "sources": sources or [], "supersedes": supersedes,
+            "valid_from": valid_from, "valid_to": valid_to
+        }
+        self.facts.append(rec)
+        self._save()
+        return rec
+
+    def find(self, claim: str) -> List[Dict[str, Any]]:
+        c = claim.strip().lower()
+        return [f for f in self.facts if f.get("claim", "").strip().lower() == c]
+
+    def latest(self, claim: str) -> Optional[Dict[str, Any]]:
+        items = self.find(claim)
+        if not items: return None
+        # return the most recent verified_at
+        return sorted(items, key=lambda x: x.get("verified_at", 0), reverse=True)[0]
+
+    def all(self) -> List[Dict[str, Any]]:
+        return self.facts
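A round-trip sketch matching the schema in the class docstring (not part of this commit):

vfs = VersionedFactStore()
vfs.add_or_update(
    claim="capital of France",
    value="Paris",
    sources=[{"type": "manual", "ref": "editor", "title": None, "time": now_ts()}],
    confidence=0.99,
)
latest = vfs.latest("capital of France")  # the most recent verified_at wins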
+
+# ---------------------------
+# Web search (HTML scrape + 24h cache)
+# ---------------------------
+class WebSearch:
+    def __init__(self, enabled: bool = False, cache_file: Optional[str] = None):
+        self.enabled = bool(enabled)
+        self.cache_file = cache_file or MODEL_CONFIG["web_cache_file"]
+        self.cache = {}
+        self._load_cache()
+
+    def _load_cache(self):
+        if os.path.exists(self.cache_file):
+            try:
+                with open(self.cache_file, "r", encoding="utf-8") as f:
+                    self.cache = json.load(f)
+            except Exception:
+                self.cache = {}
+
+    def _save_cache(self):
+        try:
+            with open(self.cache_file, "w", encoding="utf-8") as f:
+                json.dump(self.cache, f, indent=2, ensure_ascii=False)
+        except Exception:
+            pass
+
+    def search(self, query: str, max_results: int = 3) -> List[Dict[str, str]]:
+        if not self.enabled:
+            return []
+        key = hashlib.sha1(query.encode()).hexdigest()
+        # 24h TTL
+        if key in self.cache and (now_ts() - self.cache[key]["ts"]) < 86400:
+            return self.cache[key]["hits"][:max_results]
+        try:
+            import requests
+            from bs4 import BeautifulSoup
+            url = f"https://duckduckgo.com/html/?q={requests.utils.quote(query)}"
+            headers = {"User-Agent": "Mozilla/5.0"}
+            r = requests.get(url, headers=headers, timeout=8)
+            soup = BeautifulSoup(r.text, "html.parser")
+            results = []
+            # capture title/link/snippet (≤100 chars)
+            containers = soup.select(".result")[:max_results]
+            for c in containers:
+                a = c.select_one(".result__a")
+                s = c.select_one(".result__snippet")
+                title = a.get_text(strip=True) if a else ""
+                link = a.get("href") if a else ""
+                snippet = (s.get_text(" ", strip=True) if s else "")[:100]
+                if title and link:
+                    results.append({"title": title, "link": link, "snippet": snippet})
+            self.cache[key] = {"ts": now_ts(), "hits": results}
+            self._save_cache()
+            return results
+        except Exception as e:
+            logger.debug("websearch failed: %s", e)
+            return []
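search() is a no-op unless the instance is enabled, mirroring the allow_web_search opt-in; a usage sketch (not part of this commit), with the caveat that results depend on DuckDuckGo's current HTML layout, exactly as the scraper above does:

ws = WebSearch(enabled=True)
for hit in ws.search("faiss IndexFlatL2 tutorial", max_results=3):
    print(hit["title"], hit["link"], hit["snippet"])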
+
+# ---------------------------
+# LLM wrapper (transformers / llama.cpp)
+# ---------------------------
+class LocalLLM:
+    def __init__(self, model_path=None, backend="transformers", device="auto", quantize=None):
+        self.model_path = model_path
+        self.backend = backend
+        self.quantize = quantize
+        self.device = device if device != "auto" else ("cuda" if torch and torch.cuda.is_available() else "cpu")
+        self.model = None
+        self.tokenizer = None
+        self._loaded = False
+
+    def load(self):
+        if self._loaded: return
+        if self.backend == "transformers":
+            try:
+                from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+                if not self.model_path:
+                    raise ValueError("MODEL_CONFIG['llm']['model_path'] not set")
+                logger.info("Loading transformers model %s", self.model_path)
+                self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, cache_dir=CACHE_DIR)
+                # device_map="auto" keeps it simple; will use the GPU if available
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.model_path, device_map="auto", cache_dir=CACHE_DIR
+                )
+                self._loaded = True
+            except Exception as e:
+                logger.error("transformers LLM load failed: %s", e)
+                self._loaded = False
+        elif self.backend == "llamacpp":
+            try:
+                import llama_cpp
+                if not self.model_path:
+                    raise ValueError("MODEL_CONFIG['llm']['model_path'] not set")
+                self.model = llama_cpp.Llama(model_path=self.model_path)
+                self._loaded = True
+            except Exception as e:
+                logger.error("llamacpp load failed: %s", e)
+                self._loaded = False
+        else:
+            logger.error("Unknown backend %s", self.backend)
+            self._loaded = False
+
+    def ready(self) -> bool:
+        return self._loaded
+
+    def generate(self, prompt: str, max_tokens: int = 256, temperature: float = 0.7) -> str:
+        if not self._loaded:
+            raise RuntimeError("LLM not loaded")
+        if self.backend == "transformers":
+            try:
+                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+                out = self.model.generate(
+                    **inputs, max_new_tokens=max_tokens, do_sample=True, temperature=temperature,
+                    pad_token_id=self.tokenizer.eos_token_id
+                )
+                return self.tokenizer.decode(out[0], skip_special_tokens=True)
+            except Exception as e:
+                logger.error("LLM generation failed: %s", e)
+                return "[llm-error]"
+        elif self.backend == "llamacpp":
+            try:
+                result = self.model.create(prompt=prompt, max_tokens=max_tokens, temperature=temperature)
+                if isinstance(result, dict) and "choices" in result:
+                    return result["choices"][0]["text"]
+                return str(result)
+            except Exception as e:
+                logger.error("llamacpp generation failed: %s", e)
+                return "[llm-error]"
+        else:
+            return "[backend-not-implemented]"
+
+# ---------------------------
+# Image engine (SDXL/diffusers + caption)
+# ---------------------------
+class ImageEngine:
+    def __init__(self, base=None, refiner=None, inpaint=None, blip=None, device="auto"):
+        self.paths = {"base": base, "refiner": refiner, "inpaint": inpaint, "blip": blip}
+        self.device = device if device != "auto" else ("cuda" if torch and torch.cuda.is_available() else "cpu")
+        self.pipe = None
+        self.refiner = None
+        self.inpaint_pipe = None
+        self.captioner = None
+
+    def load_pipes(self):
+        if self.pipe is None and self.paths.get("base"):
+            try:
+                from diffusers import StableDiffusionXLPipeline
+                dtype = torch.float16 if self.device == "cuda" else torch.float32
+                self.pipe = StableDiffusionXLPipeline.from_pretrained(self.paths["base"], torch_dtype=dtype, cache_dir=CACHE_DIR)
+                self.pipe.to(self.device)
+            except Exception as e:
+                logger.debug("sdxl base load failed: %s", e)
+                self.pipe = None
+        if self.refiner is None and self.paths.get("refiner"):
+            try:
+                from diffusers import StableDiffusionXLImg2ImgPipeline
+                dtype = torch.float16 if self.device == "cuda" else torch.float32
+                self.refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(self.paths["refiner"], torch_dtype=dtype, cache_dir=CACHE_DIR)
+                self.refiner.to(self.device)
+            except Exception as e:
+                logger.debug("refiner load failed: %s", e)
+                self.refiner = None
+        if self.inpaint_pipe is None and self.paths.get("inpaint"):
+            try:
+                from diffusers import StableDiffusionXLInpaintPipeline
+                dtype = torch.float16 if self.device == "cuda" else torch.float32
+                self.inpaint_pipe = StableDiffusionXLInpaintPipeline.from_pretrained(self.paths["inpaint"], torch_dtype=dtype, cache_dir=CACHE_DIR)
+                self.inpaint_pipe.to(self.device)
+            except Exception as e:
+                logger.debug("inpaint load failed: %s", e)
+                self.inpaint_pipe = None
+        if self.captioner is None and self.paths.get("blip"):
+            try:
+                from transformers import pipeline
+                device_idx = 0 if self.device == "cuda" else -1
+                self.captioner = pipeline("image-to-text", model=self.paths["blip"], device=device_idx, cache_dir=CACHE_DIR)
+            except Exception as e:
+                logger.debug("captioner load failed: %s", e)
+                self.captioner = None
+
+    def generate(self, prompt: str, negative: Optional[str] = None, steps: int = 30, width: int = 1024, height: int = 1024, seed: Optional[int] = None):
+        self.load_pipes()
+        if not self.pipe:
+            raise RuntimeError("SDXL text2img pipeline unavailable. Provide a path in MODEL_CONFIG['sdxl_base']")
+        generator = None
+        if torch and seed is not None:
+            generator = torch.Generator(device=self.device); generator.manual_seed(int(seed))
+        out = self.pipe(prompt=prompt, negative_prompt=negative or "", num_inference_steps=steps,
+                        width=width, height=height, generator=generator).images[0]
+        if self.refiner:
+            try:
+                out = self.refiner(prompt=prompt, image=out, strength=0.2).images[0]
+            except Exception:
+                pass
+        path = tmp_path(".png")
+        out.save(path)
+        return path
+
+    def inpaint(self, image_path: str, mask_path: Optional[str], prompt: str = ""):
+        self.load_pipes()
+        if not self.inpaint_pipe:
+            raise RuntimeError("Inpaint pipeline not available.")
+        from PIL import Image
+        init = Image.open(image_path).convert("RGB")
+        mask = Image.open(mask_path).convert("L") if mask_path else Image.new("L", init.size, 255)
+        res = self.inpaint_pipe(prompt=prompt or " ", image=init, mask_image=mask, guidance_scale=7.5, num_inference_steps=30)
+        out = tmp_path("_inpaint.png"); res.images[0].save(out); return out
+
+    def caption(self, image_path: str) -> str:
+        self.load_pipes()
+        if not self.captioner: return ""
+        from PIL import Image
+        img = Image.open(image_path).convert("RGB")
+        try:
+            return self.captioner(img)[0].get("generated_text", "")
+        except Exception:
+            return ""
+
+# ---------------------------
+# Video engine (keyframes) + functional FPS "interpolation"
+# ---------------------------
+class VideoEngine:
+    def __init__(self):
+        pass
+
+    def analyze(self, video_path: str, max_frames: int = 6):
+        try:
+            import imageio
+            from PIL import Image
+            reader = imageio.get_reader(video_path)
+            total = reader.count_frames()
+            step = max(1, total // max_frames) if total else 1
+            frames = []
+            for i in range(0, total, step):
+                try:
+                    arr = reader.get_data(i)
+                    p = tmp_path(f"_frame{i}.jpg")
+                    Image.fromarray(arr).save(p)
+                    frames.append(p)
+                    if len(frames) >= max_frames: break
+                except Exception:
+                    continue
+            return {"frames": frames, "count": len(frames)}
+        except Exception as e:
+            return {"error": str(e)}
+
+    def interpolate_fps(self, video_path: str, factor: int = 2) -> Dict[str, Any]:
+        """
+        Functional, dependency-light interpolation by frame duplication.
+        This increases frame count (and apparent FPS) without optical flow.
+        Produces a new MP4 if imageio-ffmpeg is available; else returns extracted frames.
+        """
+        try:
+            import imageio, imageio.v3 as iio
+            reader = imageio.get_reader(video_path)
+            fps = reader.get_meta_data().get("fps", 24)
+            new_fps = max(1, int(fps * factor))
+            out_path = tmp_path(".mp4")
+            try:
+                writer = imageio.get_writer(out_path, fps=new_fps)
+                for frame in reader:
+                    # duplicate each frame "factor" times
+                    for _ in range(factor):
+                        writer.append_data(frame)
+                writer.close()
+                return {"status": "ok", "output": out_path, "fps": new_fps}
+            except Exception as e:
+                return {"status": "partial", "reason": f"writer failed: {e}"}
+        except Exception as e:
+            return {"status": "error", "reason": str(e)}
|
670 |
+

# ---------------------------
# Voice engine (register, embed, TTS)
# ---------------------------
class VoiceEngine:
    def __init__(self, provenance: ProvenanceManager, safety: SafetyManager, piper_bin=None, piper_voice=None):
        self.prov = provenance
        self.safety = safety
        self.piper_bin = piper_bin
        self.piper_voice = piper_voice
        self.profiles: Dict[str,Dict[str,Any]] = {}

    def extract_embedding(self, wav_path: str) -> Optional[List[float]]:
        # Placeholder "embedding": a deterministic hash of the file bytes, not a
        # real speaker embedding; swap in a proper speaker encoder for production.
        try:
            with open(wav_path,"rb") as f:
                b = f.read()
            h = hashlib.sha256(b).digest()
            return [float(x)/255.0 for x in h[:192]]
        except Exception:
            return None

    def register(self, user_id: str, wav_path: str, consent_text: str, block_public=True):
        token = self.safety.record_consent(user_id, consent_text)
        emb = self.extract_embedding(wav_path)
        if block_public and self.safety.check_public_figure(emb):
            return {"status":"rejected","reason":"protected_speaker"}
        vid = f"voice_{hashlib.sha1((user_id+str(time.time())).encode()).hexdigest()[:10]}"
        self.profiles[vid] = {"user_id": user_id, "embedding": emb, "consent": token}
        return {"status":"ok", "voice_id": vid}

    def synthesize(self, voice_id:Optional[str], text:str, emotion:Optional[str]=None, rate:float=1.0, fmt:str="wav"):
        # `emotion` is accepted for API symmetry but not applied by any backend yet;
        # `rate` is honored only by the pyttsx3 fallback.
        if not self.safety.is_allowed(text):
            return None
        out = tmp_path(f".{fmt}")
        # Prefer Piper if configured (local CLI)
        if self.piper_bin and self.piper_voice and os.path.exists(self.piper_bin) and os.path.exists(self.piper_voice):
            try:
                p = subprocess.Popen([self.piper_bin, "-m", self.piper_voice, "-f", out],
                                     stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
                p.communicate(input=text, timeout=60)
                if os.path.exists(out):
                    with open(out,"rb") as f:
                        b = f.read()
                    b2 = self.prov.watermark_audio(b)
                    with open(out,"wb") as f: f.write(b2)
                    self.prov.attach(out, Provenance(model="piper", version="1.0",
                                                     consent=self.profiles.get(voice_id, {}).get("consent")))
                    return out
            except Exception as e:
                logger.debug("piper fail: %s", e)
        # Fallback: pyttsx3 (offline TTS)
        try:
            import pyttsx3
            eng = pyttsx3.init()
            try: eng.setProperty("rate", int(200*rate))
            except Exception: pass
            eng.save_to_file(text, out); eng.runAndWait()
            with open(out,"rb") as f:
                b = f.read()
            b2 = self.prov.watermark_audio(b)
            with open(out,"wb") as f: f.write(b2)
            self.prov.attach(out, Provenance(model="pyttsx3", version="1.0"))
            return out
        except Exception as e:
            logger.debug("pyttsx3 fail: %s", e)
        # Last resort: a silent wav (keeps the pipeline functional)
        try:
            import wave, struct
            fr=16000; dur=max(1, min(5, int(len(text)/10)))
            with wave.open(out,'w') as wf:
                wf.setnchannels(1); wf.setsampwidth(2); wf.setframerate(fr)
                for _ in range(fr*dur): wf.writeframes(struct.pack('<h',0))
            self.prov.attach(out, Provenance(model="silence", version="0.1"))
            return out
        except Exception:
            return None
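
# Usage sketch (illustrative, never called; "sample.wav" is a placeholder path,
# and the ProvenanceManager/SafetyManager wiring mirrors SuperAgent.__init__
# further down):
def _example_voice_roundtrip():
    voice = VoiceEngine(ProvenanceManager(), SafetyManager(blocklist=None))
    res = voice.register("user1", "sample.wav", consent_text="I consent to cloning my voice")
    if res.get("status") == "ok":
        # Returns a path to a watermarked wav, or None if all backends failed.
        return voice.synthesize(res["voice_id"], "Hello from the voice engine.")
    return None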

# ---------------------------
# MathEngine (SymPy)
# ---------------------------
class MathEngine:
    def __init__(self):
        try:
            import sympy as sp
            self.sp = sp
        except Exception:
            self.sp = None

    def solve(self, expr: str) -> str:
        # Note: despite the name, this simplifies an expression. Equations
        # containing "=" are not valid sympify input and fall through to the
        # error branch.
        if not self.sp: return "[sympy missing]"
        try:
            parsed = self.sp.sympify(expr)
            return str(self.sp.simplify(parsed))
        except Exception as e:
            return f"[math-error] {e}"

    def eval_numeric(self, expr: str) -> str:
        if not self.sp: return "[sympy missing]"
        try:
            return str(self.sp.N(self.sp.sympify(expr)))
        except Exception as e:
            return f"[math-error] {e}"

    def safe_eval(self, text: str) -> Optional[str]:
        # detect simple math patterns like "2+2", "sqrt(2)", etc.
        if re.fullmatch(r"[0-9\.\+\-\*\/\(\)\s^sqrtpiE]+", text.replace("**","^")):
            return self.eval_numeric(text)
        return None
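
# Worked examples (illustrative, assuming SymPy is installed):
#   MathEngine().solve("2*x + 3*x")      -> "5*x"
#   MathEngine().eval_numeric("2*(3+4)") -> "14.0000000000000"
#   MathEngine().safe_eval("sqrt(2)")    -> approximately "1.41421356237310"
# safe_eval returns None for anything outside the simple numeric pattern,
# so arbitrary text never reaches SymPy.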

# ---------------------------
# Code Sandbox (lightweight)
# ---------------------------
class CodeSandbox:
    def __init__(self):
        pass

    def run(self, code:str, timeout: int=5) -> Dict[str,Any]:
        # VERY light sandbox: a stripped-down builtins dict only. Note that
        # `timeout` is accepted for API symmetry but not enforced here, since
        # exec() runs in-process; for real isolation use a subprocess with
        # rlimits (see the sketch below this class).
        try:
            import io, contextlib
            buf = io.StringIO()
            ns = {}
            with contextlib.redirect_stdout(buf):
                exec(code, {"__builtins__": {"print": print, "range": range, "len": len}}, ns)
            out = buf.getvalue()
            return {"stdout": out, "keys": list(ns.keys())}
        except Exception as e:
            return {"error": str(e)}
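
# A stronger variant (sketch only; not used by CodeSandbox above): run the
# snippet in a child interpreter so a crash or infinite loop cannot take down
# this process, and enforce the wall-clock timeout for real. POSIX-only
# CPU/memory rlimits (resource.setrlimit via preexec_fn) are omitted here.
def _subprocess_sandbox(code: str, timeout: int = 5) -> Dict[str, Any]:
    import subprocess, sys
    try:
        # -I runs Python in isolated mode (no site-packages, no env vars).
        proc = subprocess.run([sys.executable, "-I", "-c", code],
                              capture_output=True, text=True, timeout=timeout)
        return {"stdout": proc.stdout, "stderr": proc.stderr, "returncode": proc.returncode}
    except subprocess.TimeoutExpired:
        return {"error": f"timed out after {timeout}s"}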

# ---------------------------
# Agents
# ---------------------------
class Agent:
    def __init__(self, name:str, model: LocalLLM, tools:Dict[str,Any]=None):
        self.name = name
        self.model = model
        self.tools = tools or {}

    def act(self, prompt:str, max_tokens=256, temperature=0.7):
        try:
            if not self.model.ready():
                self.model.load()
            return self.model.generate(prompt, max_tokens=max_tokens, temperature=temperature)
        except Exception as e:
            logger.error("Agent %s failed: %s", self.name, e)
            return f"[{self.name}-error] {e}"

class AgentHub:
    def __init__(self, llm: LocalLLM, img_engine: ImageEngine, vid_engine: VideoEngine):
        self.research = Agent("ResearchAgent", llm, {"web": None})
        self.coder = Agent("CoderAgent", llm, {"exec": CodeSandbox()})
        self.designer = Agent("DesignerAgent", llm, {"image": img_engine})
        self.vid = Agent("VideoAgent", llm, {"video": vid_engine})

    def coordinate(self, user_request:str):
        if any(k in user_request.lower() for k in ["code","implement","script"]):
            return self.coder.act(f"Write code for: {user_request}", max_tokens=512)
        if any(k in user_request.lower() for k in ["design","image","generate"]):
            return self.designer.act(f"Create an image plan: {user_request}", max_tokens=256)
        return self.research.act(f"Research and summarize: {user_request}", max_tokens=512)
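
# Routing examples (illustrative): coordinate() keys off simple substrings,
# checked in order, so a request matching both branches goes to the coder:
#   "implement a sorting script"   -> CoderAgent
#   "generate a poster design"     -> DesignerAgent
#   "what changed in HTTP/3?"      -> ResearchAgent (default)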

# ---------------------------
# Perception (lightweight multimodal glue)
# ---------------------------
class Perception:
    EMOJI_MAP = {
        "😂":"joy","😅":"relief","😭":"sad","😡":"anger","😍":"affection","😎":"confident",
        "🤔":"thinking","😴":"tired","🙄":"annoyed","😇":"polite","😁":"happy","🤣":"joy"
    }

    def text_emotion_tags(self, text:str) -> List[str]:
        tags = set()
        for ch in text:
            if ch in self.EMOJI_MAP: tags.add(self.EMOJI_MAP[ch])
        if re.search(r"\b(sad|upset|angry|frustrated)\b", text.lower()): tags.add("negative")
        if re.search(r"\b(happy|great|awesome|love)\b", text.lower()): tags.add("positive")
        return sorted(tags)
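
# Example (illustrative):
#   Perception().text_emotion_tags("I love this 😂 but I'm so frustrated")
#   -> ['joy', 'negative', 'positive']
# Tags are the sorted union of emoji and keyword hits, so mixed sentiment
# surfaces as multiple tags rather than a single label.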

# ---------------------------
# Confidence Scorer
# ---------------------------
class ConfidenceScorer:
    def score(self, *,
              source_reliability: float,
              recency: float,
              agreement: float,
              self_consistency: float,
              retrieval_strength: float,
              contradictions: float) -> float:
        # conservative weighting; each input in [0,1]
        w1, w2, w3, w4, w5, w6 = 0.22, 0.18, 0.18, 0.18, 0.14, 0.10
        s = (w1*source_reliability +
             w2*recency +
             w3*agreement +
             w4*self_consistency +
             w5*retrieval_strength -
             w6*contradictions)
        return max(0.0, min(1.0, s))
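
# Worked example: source_reliability=0.65, recency=0.8, agreement=1.0,
# self_consistency=0.6, retrieval_strength=0.5, contradictions=0.0 gives
#   s = 0.22*0.65 + 0.18*0.8 + 0.18*1.0 + 0.18*0.6 + 0.14*0.5 - 0.10*0.0
#     = 0.143 + 0.144 + 0.18 + 0.108 + 0.07 = 0.645
# Since the positive weights sum to 0.90, even perfect evidence tops out at
# 0.90 rather than 1.0 - deliberately conservative.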

# ---------------------------
# Close-to-Human Brain (CHB)
# ---------------------------
class CloseToHumanBrain:
    def __init__(self, llm: LocalLLM, longmem: LongTermMemory, kg: KnowledgeGraph,
                 vfs: VersionedFactStore, web: WebSearch, img: ImageEngine):
        self.llm = llm
        self.long = longmem
        self.kg = kg
        self.vfs = vfs
        self.web = web
        self.img = img
        self.math = MathEngine()
        self.perc = Perception()
        self.scorer = ConfidenceScorer()

    # ---- Retrieval
    def retrieve(self, user_text: str, k:int=5) -> Tuple[str, List[Dict[str,str]]]:
        mem_hits = self.long.search(user_text, top_k=min(8, k))
        mem_ctx = "\n".join([h["text"] for h in mem_hits]) if mem_hits else ""
        web_hits = self.web.search(user_text, max_results=3) if self.web.enabled else []
        return mem_ctx, web_hits

    # ---- Drafting
    def multi_draft(self, prompt_base: str, drafts:int=3, max_tokens:int=384) -> List[str]:
        outs = []
        temps = [0.4, 0.7, 1.0][:max(1, drafts)]
        if not self.llm.ready():
            self.llm.load()
        for t in temps:
            out = self.llm.generate(prompt_base, max_tokens=max_tokens, temperature=t)
            outs.append(out)
        return outs

    # ---- Verification helpers
    def _estimate_reliability(self, sources: List[Dict[str,Any]]) -> float:
        if not sources: return 0.4
        rel = 0.0
        for s in sources:
            t = s.get("type","")
            if t == "memory": rel += 0.6
            elif t == "kg": rel += 0.7
            elif t == "web": rel += 0.65
            elif t in ("vision","audio","video"): rel += 0.55
            else: rel += 0.5
        return min(1.0, rel / max(1, len(sources)))

    def _recency(self, sources: List[Dict[str,Any]]) -> float:
        if not sources: return 0.3
        ages = []
        now = now_ts()
        for s in sources:
            ts = s.get("time") or now
            ages.append(max(0.0, now - ts))
        avg = sum(ages) / len(ages)
        # map age (in seconds) to [0,1] with simple decay (~1 day half-life)
        day = 86400.0
        return max(0.0, min(1.0, 1.0 / (1.0 + (avg / day))))

    def _agreement(self, claims: List[str]) -> float:
        # crude token-overlap agreement
        if not claims: return 0.0
        base = set(re.findall(r"\w+", claims[0].lower()))
        agree = 1
        for c in claims[1:]:
            toks = set(re.findall(r"\w+", c.lower()))
            if len(base & toks) > 0: agree += 1
        return agree / len(claims)

    def _self_consistency(self, drafts: List[str]) -> float:
        # measure average pairwise Jaccard similarity of word sets
        if not drafts: return 0.0
        sets = [set(re.findall(r"\w+", d.lower())) for d in drafts]
        if len(sets) == 1: return 1.0
        pair_scores = []
        for i in range(len(sets)):
            for j in range(i + 1, len(sets)):
                a, b = sets[i], sets[j]
                inter = len(a & b); union = len(a | b) or 1
                pair_scores.append(inter / union)
        return sum(pair_scores) / len(pair_scores)

    def _retrieval_strength(self, mem_ctx: str) -> float:
        if not mem_ctx: return 0.4
        # simple function of context length
        L = len(mem_ctx.split())
        return max(0.4, min(1.0, math.log10(1 + L) / 2))
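
    # Calibration notes (illustrative values): _recency maps average source age
    # through 1/(1 + age/day), so a 1-hour-old source scores ~0.96, a 1-day-old
    # one 0.5, and a week-old one 0.125. _self_consistency is mean pairwise
    # Jaccard over word sets: identical drafts score 1.0, disjoint drafts 0.0.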

    # ---- Verify & Synthesize
    def verify_and_respond(self, user_id:str, user_text: str,
                           preferred_lang: Optional[str]=None) -> Dict[str,Any]:
        # Perception (informational only for now; tags are not fed into the
        # prompt, and preferred_lang is accepted but not yet used)
        emotion_tags = self.perc.text_emotion_tags(user_text)

        # Retrieve evidence
        mem_ctx, web_hits = self.retrieve(user_text, k=6)
        web_ctx = "\n".join([f"{h['title']} ({h['link']}) — {h.get('snippet','')}" for h in web_hits]) if web_hits else ""
        citations = [{"type":"web","ref": h["link"], "title": h["title"], "time": now_ts()} for h in web_hits]

        # Construct base prompt
        prompt = (
            "Persona: helpful, precise assistant.\n"
            "Use given memory and web snippets as *evidence*.\n"
            "If a claim is uncertain, narrow it or state limits.\n\n"
            f"Memory Evidence:\n{mem_ctx}\n\nWeb Evidence:\n{web_ctx}\n\n"
            f"User: {user_text}\nAssistant:"
        )

        # Multi-draft
        drafts = self.multi_draft(prompt, drafts=3, max_tokens=512)

        # Basic math check (if the request carries a plain numeric expression)
        math_value = None
        m = re.search(r"(?:calculate|solve)\s*([0-9\.\+\-\*\/\(\)\s^sqrtpiE]+)", user_text, re.I)
        if m:
            math_value = self.math.safe_eval(m.group(1))

        # Build sources list (memory + web)
        sources = citations[:]
        if mem_ctx:
            sources.append({"type":"memory","ref":"longterm_memory","title":"long-term memory","time": now_ts()})

        # Score
        source_rel = self._estimate_reliability(sources)
        recency = self._recency(sources)
        agree = self._agreement(drafts)
        self_cons = self._self_consistency(drafts)
        retr = self._retrieval_strength(mem_ctx)
        contradictions = 0.0

        # Compare with VFS (detect contradictions for simple exact-match claims).
        # We extract simple "X is Y" patterns from drafts; very lightweight.
        simple_claims = []
        for d in drafts:
            for sent in re.split(r"[.\n]", d):
                m2 = re.search(r"^([\w\s\-]{3,})\s+is\s+([\w\s\-\%\.]{2,})$", sent.strip(), re.I)
                if m2:
                    c = f"{m2.group(1).strip()} is"
                    v = m2.group(2).strip()
                    simple_claims.append((c, v))
        # check each claim against the latest stored version
        for c, v in simple_claims:
            latest = self.vfs.latest(c)
            if latest and latest.get("value") and latest["value"].strip().lower() != v.lower():
                contradictions += 0.5  # penalize disagreement with a stored fact

        conf = self.scorer.score(
            source_reliability=source_rel,
            recency=recency,
            agreement=agree,
            self_consistency=self_cons,
            retrieval_strength=retr,
            contradictions=contradictions
        )

        # Choose the most concise draft
        best = min(drafts, key=lambda s: len(s) if s else 1e9)

        # If math was requested & computed, splice it in with highest certainty
        if math_value and "[math-error]" not in math_value and "sympy missing" not in math_value:
            best = f"{best}\n\nMath check: {math_value}"

        # If confidence < threshold, trim to the certain subset: keep only the
        # sentences that appear in at least two of the drafts.
        min_conf = MODEL_CONFIG["chb_min_confidence"]
        if conf < min_conf:
            sent_scores = []
            sents = [s.strip() for s in re.split(r"(?<=[\.\!\?])\s+", best) if s.strip()]
            for s in sents:
                count = sum(1 for d in drafts if s.lower() in d.lower())
                sent_scores.append((count, s))
            sent_scores.sort(reverse=True)
            certain = " ".join([s for cnt, s in sent_scores if cnt >= 2])  # present in >=2 drafts
            if not certain:
                certain = "I'm not fully confident. Here's what is most certain from the evidence I have."
            best = certain

        # Record any simple claims to the VFS as new knowledge (with provenance)
        for c, v in simple_claims[:3]:
            self.vfs.add_or_update(claim=c, value=v, sources=sources, confidence=float(conf))

        # Build the final message with optional citations
        if citations:
            cites = "\n".join([f"- {c['title']} — {c['ref']}" for c in citations])
            best_out = f"{best}\n\nConfidence: {conf:.2f}\nSources:\n{cites}"
        else:
            best_out = f"{best}\n\nConfidence: {conf:.2f}"

        return {"reply": best_out, "confidence": conf, "citations": citations}
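
# verify_and_respond returns a dict shaped like (values illustrative):
#   {"reply": "<answer>\n\nConfidence: 0.64\nSources:\n- <title> — <link>",
#    "confidence": 0.64,
#    "citations": [{"type": "web", "ref": "<link>", "title": "<title>", "time": 1700000000.0}]}
# Callers below (SuperAgent.handle and the /v1/chat endpoint) rely only on
# "reply" and "confidence".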

# ---------------------------
# Orchestrator (ties everything with CHB)
# ---------------------------
class SuperAgent:
    def __init__(self, config:dict):
        self.cfg = config
        self.prov = ProvenanceManager()
        self.safety = SafetyManager(blocklist=config.get("safety_blocklist"))
        self.short = ShortTermMemory()
        self.long = LongTermMemory(index_dir=config.get("faiss_index_dir"), embed_model_name=config.get("embedder"))
        self.kg = KnowledgeGraph(config.get("knowledge_graph_path"))
        self.vfs = VersionedFactStore(config.get("vfs_file"))
        self.web = WebSearch(enabled=bool(config.get("allow_web_search")), cache_file=config.get("web_cache_file"))
        # LLM
        self.llm = LocalLLM(model_path=config.get("llm",{}).get("model_path"),
                            backend=config.get("llm",{}).get("backend","transformers"),
                            device=config.get("device","auto"),
                            quantize=config.get("llm",{}).get("quantize",None))
        # image & video engines
        self.image = ImageEngine(base=config.get("sdxl_base"), refiner=config.get("sdxl_refiner"),
                                 inpaint=config.get("sdxl_inpaint"), blip=config.get("blip_caption"),
                                 device=config.get("device","auto"))
        self.video = VideoEngine()
        self.voice = VoiceEngine(self.prov, self.safety, piper_bin=config.get("piper_binary"), piper_voice=config.get("piper_voice"))
        # agents
        self.agents = AgentHub(self.llm, self.image, self.video)
        # Close-to-Human Brain
        self.chb = CloseToHumanBrain(self.llm, self.long, self.kg, self.vfs, self.web, self.image)

    def detect_intent(self, text:str) -> str:
        t = (text or "").lower().strip()
        if t.startswith("/img ") or t.startswith("/image "): return "image"
        if t.startswith("/inpaint "): return "inpaint"
        if t.startswith("/tts "): return "tts"
        if t.startswith("/video "): return "video"
        if t.startswith("/vidinterp "): return "vidinterp"
        if t.startswith("/kg "): return "kg"
        if t.startswith("/agent "): return "agent"
        if any(k in t for k in ["solve", "calculate", "integrate", "differentiate"]): return "math"
        return "chat"
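
    # Intent examples (illustrative): "/img a red fox" -> "image",
    # "/tts hello" -> "tts", "calculate 2*(3+4)" -> "math", and anything
    # else -> "chat". Slash commands are checked before the math keywords,
    # so "/agent solve this" routes to the AgentHub rather than to math.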

    def handle(self, user_id:str, text:str, preferred_lang:Optional[str]=None) -> Dict[str,Any]:
        if not self.safety.is_allowed(text):
            return {"status":"blocked","reason":"policy"}
        self.short.push(user_id, "user", text)

        intent = self.detect_intent(text)
        reply = ""
        payload: Dict[str,Any] = {}

        try:
            if intent == "math":
                me = self.chb.math
                expr = re.sub(r"^(solve|calculate)\s*","", text, flags=re.I).strip()
                res = {"exact": me.solve(expr), "numeric": me.eval_numeric(expr)}
                reply = json.dumps(res, ensure_ascii=False, indent=2)

            elif intent == "image":
                prompt = text.split(" ",1)[1] if " " in text else text
                path = self.image.generate(prompt=prompt)
                reply = f"[image] {path}"

            elif intent == "inpaint":
                reply = "Use /v1/image/inpaint API with 'image' and optional 'mask' files."

            elif intent == "tts":
                content = text.split(" ",1)[1] if " " in text else text
                path = self.voice.synthesize(None, content)
                reply = f"[tts] {path}"

            elif intent == "video":
                reply = "Upload video via /v1/video/analyze to extract keyframes."

            elif intent == "vidinterp":
                parts = text.split(" ",2)
                factor = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 2
                reply = f"Use /v1/video/interpolate (factor={factor}) with a video file."

            elif intent == "kg":
                reply = "Use /v1/kg endpoints (not exposed in this single file demo) or extend as needed."

            elif intent == "agent":
                task = text.split(" ",1)[1] if " " in text else ""
                out = self.agents.coordinate(task)
                # CHB is the final boss: pass agent output through CHB to verify/format
                verified = self.chb.verify_and_respond(user_id, f"{task}\n\nAgentDraft:\n{out}", preferred_lang)
                reply = verified["reply"]
                payload.update({"confidence": verified["confidence"]})

            else:
                # Default conversational path goes through CHB (final arbiter)
                verified = self.chb.verify_and_respond(user_id, text, preferred_lang)
                reply = verified["reply"]
                payload.update({"confidence": verified["confidence"]})

        except Exception as e:
            logger.error("Handle failed: %s", e)
            reply = f"[error] {e}"

        # Learning: store turn + (optionally) extracted facts handled inside CHB
        if MODEL_CONFIG.get("auto_learn"):
            try:
                self.long.add(user_id, text, kind="turn")
            except Exception as e:
                logger.debug("long-term add failed: %s", e)

        self.short.push(user_id, "assistant", reply)
        return {"status":"ok", "reply": reply, **payload}

# ---------------------------
# FastAPI + Gradio integration
# ---------------------------
try:
    from fastapi import FastAPI, UploadFile, File, Form
    from fastapi.responses import FileResponse, JSONResponse
    from pydantic import BaseModel
    _FASTAPI = True
except Exception:
    _FASTAPI = False

try:
    import gradio as gr
    _GRADIO = True
except Exception:
    _GRADIO = False

app = FastAPI(title="Multimodal SuperAgent") if _FASTAPI else None
_AGENT_SINGLETON: Optional[SuperAgent] = None

def get_agent() -> SuperAgent:
    global _AGENT_SINGLETON
    if _AGENT_SINGLETON is None:
        # merge an env-pointed config file if present
        cfg_path = os.environ.get("SUPERAGENT_CONFIG")
        cfg = MODEL_CONFIG.copy()
        if cfg_path and os.path.exists(cfg_path):
            try:
                import yaml
                with open(cfg_path,"r",encoding="utf-8") as f:
                    y = yaml.safe_load(f) or {}
                cfg.update(y)
            except Exception:
                pass
        _AGENT_SINGLETON = SuperAgent(cfg)
    return _AGENT_SINGLETON
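
# Example SUPERAGENT_CONFIG file (YAML; illustrative values, keys mirror what
# SuperAgent.__init__ and MODEL_CONFIG read, paths are placeholders):
#   device: auto
#   allow_web_search: true
#   chb_min_confidence: 0.55
#   llm:
#     model_path: /models/my-llm
#     backend: transformers
#   piper_binary: /usr/local/bin/piper
#   piper_voice: /models/en_US-voice.onnx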

if _FASTAPI and app is not None:
    class ChatIn(BaseModel):
        user_id: str
        text: str
        preferred_lang: Optional[str] = None

    @app.post("/v1/chat")
    async def api_chat(inp: ChatIn):
        return get_agent().handle(inp.user_id, inp.text, inp.preferred_lang)

    @app.post("/v1/register_voice")
    async def api_reg_voice(user_id: str = Form(...), consent: str = Form(...), file: UploadFile = File(...)):
        tmpf = tmp_path("_voice.wav")
        with open(tmpf,"wb") as f: f.write(await file.read())
        return get_agent().voice.register(user_id, tmpf, consent)

    @app.post("/v1/tts")
    async def api_tts(voice_id: Optional[str] = Form(None), text: str = Form(...), fmt: str = Form("wav")):
        p = get_agent().voice.synthesize(voice_id, text, fmt=fmt)
        if p and os.path.exists(p): return FileResponse(p, media_type="audio/wav")
        return JSONResponse({"error":"tts_failed"})

    @app.post("/v1/image/generate")
    async def api_img_gen(prompt: str = Form(...)):
        p = get_agent().image.generate(prompt)
        if p and os.path.exists(p): return FileResponse(p, media_type="image/png")
        return JSONResponse({"error":"image_unavailable"})

    @app.post("/v1/image/inpaint")
    async def api_img_inpaint(prompt: str = Form(...), image: UploadFile = File(...), mask: UploadFile = File(None)):
        img_tmp = tmp_path("_img")
        with open(img_tmp,"wb") as f: f.write(await image.read())
        mask_tmp = None
        if mask:
            mask_tmp = tmp_path("_mask")
            with open(mask_tmp,"wb") as f: f.write(await mask.read())
        p = get_agent().image.inpaint(img_tmp, mask_tmp, prompt)
        if p and os.path.exists(p): return FileResponse(p, media_type="image/png")
        return JSONResponse({"error":"inpaint_failed"})

    @app.post("/v1/video/analyze")
    async def api_vid(file: UploadFile = File(...)):
        tmpf = tmp_path("_vid")
        with open(tmpf,"wb") as f: f.write(await file.read())
        return get_agent().video.analyze(tmpf)

    @app.post("/v1/video/interpolate")
    async def api_vid_interp(factor: int = Form(2), file: UploadFile = File(...)):
        tmpf = tmp_path("_vid")
        with open(tmpf,"wb") as f: f.write(await file.read())
        return get_agent().video.interpolate_fps(tmpf, factor=max(2, int(factor)))

    @app.post("/v1/memory/export")
    async def mem_export():
        return get_agent().long.export_all()

    @app.post("/v1/memory/import")
    async def mem_import(items: List[Dict[str,Any]]):
        get_agent().long.import_bulk(items)
        return {"status":"ok","count":len(items)}

    @app.post("/v1/web/toggle")
    async def web_toggle(enabled: bool = Form(...)):
        get_agent().web.enabled = bool(enabled)
        return {"enabled": get_agent().web.enabled}
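
# Example calls (illustrative; assumes the app is served on localhost:8000):
#   curl -X POST localhost:8000/v1/chat -H 'Content-Type: application/json' \
#        -d '{"user_id": "user1", "text": "calculate 2*(3+4)"}'
#   curl -X POST localhost:8000/v1/tts -F text='hello there' -o out.wav
#   curl -X POST localhost:8000/v1/video/interpolate -F factor=2 -F file=@in.mp4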

# ---------------------------
# Optional Gradio demo (runs when module executed)
# ---------------------------
def launch_gradio():
    if not _GRADIO:
        logger.warning("Gradio not installed")
        return
    agent = get_agent()
    with gr.Blocks(title="Multimodal SuperAgent") as demo:
        gr.Markdown("# Multimodal SuperAgent v6.0 (CHB)")
        with gr.Row():
            user_id = gr.Textbox(value="user1", label="User ID")
            prompt = gr.Textbox(label="Prompt")
        btn = gr.Button("Send")
        out = gr.Markdown(label="Reply")

        def send(u, p):
            res = agent.handle(u, p)
            return res.get("reply", "")

        btn.click(send, [user_id, prompt], out)
    demo.launch(server_name="0.0.0.0", server_port=7860)

# ---------------------------
# CLI
# ---------------------------
if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("--demo", action="store_true")
    ap.add_argument("--gradio", action="store_true")
    args = ap.parse_args()
    if args.demo:
        a = get_agent()
        print("Chat:", a.handle("user1","Hello 😁, calculate 2*(3+4).")["reply"])
        # MathEngine simplifies expressions rather than solving equations, so
        # pass an expression here ("solve 2*x+1=5" would fail to sympify on "=").
        print("Math:", a.handle("user1","solve 2*x + 3*x")["reply"])
        print("Interp hint:", a.handle("user1","/vidinterp 2")["reply"])
    if args.gradio:
        launch_gradio()
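
# Quick start (illustrative; gradio, fastapi and uvicorn are optional extras):
#   python multimodal_module.py --demo      # scripted smoke test
#   python multimodal_module.py --gradio    # Gradio UI on port 7860
#   uvicorn multimodal_module:app --port 8000   # serve the /v1/* endpoints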