Spaces:

Princeaka
/

multimodal_module

Running

App Files Files Community

Princeaka committed 10 days ago

Commit

d45eebc

verified ·

1 Parent(s): 5ebcc12

Update multimodal_module.py

Browse files

Files changed (1) hide show

multimodal_module.py +645 -644

multimodal_module.py CHANGED Viewed

@@ -1,644 +1,645 @@
-# multimodal_module.py
-import os
-import pickle
-import subprocess
-import tempfile
-import shutil
-import asyncio
-from datetime import datetime
-from typing import Dict, List, Optional, Any
-import io
-import uuid
-# Core ML libs
-import torch
-from transformers import (
-    pipeline,
-    AutoModelForSeq2SeqLM,
-    AutoTokenizer,
-    Wav2Vec2Processor,
-    Wav2Vec2ForSequenceClassification,
-)
-from diffusers import StableDiffusionPipeline, StableDiffusionInpaintPipeline
-from transformers import AutoModelForCausalLM, AutoTokenizer as HFTokenizer
-# Audio / speech
-import librosa
-import speech_recognition as sr
-from gtts import gTTS
-# Image, video, files
-from PIL import Image, ImageOps
-import imageio_ffmpeg as ffmpeg
-import imageio
-import moviepy.editor as mp
-import fitz  # PyMuPDF for PDFs
-# Misc
-from langdetect import DetectorFactory
-DetectorFactory.seed = 0
-# Optional: safety-check toggles
-USE_SAFETY_CHECKER = False
-# Helper for temp files
-def _tmp_path(suffix=""):
-    return os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}{suffix}")
-class MultiModalChatModule:
-    """
-    Full-power multimodal module.
-    - Lazy-loads big models on first use.
-    - Methods are async-friendly.
-    """
-    def __init__(self, chat_history_file: str = "chat_histories.pkl"):
-        self.user_chat_histories: Dict[int, List[dict]] = self._load_chat_histories(chat_history_file)
-        self.chat_history_file = chat_history_file
-        # device
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"[MultiModal] device: {self.device}")
-        # placeholders for large models (lazy)
-        self._voice_processor = None
-        self._voice_emotion_model = None
-        self._translator = None
-        self._chat_tokenizer = None
-        self._chat_model = None
-        self._chat_model_name = "bigscience/bloom"  # placeholder; will set proper below
-        self._image_captioner = None
-        self._sd_pipe = None
-        self._sd_inpaint = None
-        self._code_tokenizer = None
-        self._code_model = None
-        # other small helpers
-        self._sr_recognizer = sr.Recognizer()
-        # set common model names (you can change)
-        self.model_names = {
-            "voice_emotion_processor": "facebook/hubert-large-ls960-ft",
-            "voice_emotion_model": "superb/hubert-base-superb-er",
-            "translation_model": "facebook/nllb-200-distilled-600M",
-            "chatbot_tokenizer": "facebook/blenderbot-400M-distill",
-            "chatbot_model": "facebook/blenderbot-400M-distill",
-            "image_captioner": "Salesforce/blip-image-captioning-base",
-            "sd_inpaint": "runwayml/stable-diffusion-inpainting",
-            "sd_text2img": "runwayml/stable-diffusion-v1-5",
-            "code_model": "bigcode/starcoder",  # Or use a specific StarCoder checkpoint on HF
-        }
-        # keep track of which heavy groups are loaded
-        self._loaded = {
-            "voice": False,
-            "translation": False,
-            "chat": False,
-            "image_caption": False,
-            "sd": False,
-            "code": False,
-        }
-    # ----------------------
-    # persistence
-    # ----------------------
-    def _load_chat_histories(self, fn: str) -> Dict[int, List[dict]]:
-        try:
-            with open(fn, "rb") as f:
-                return pickle.load(f)
-        except Exception:
-            return {}
-    def _save_chat_histories(self):
-        try:
-            with open(self.chat_history_file, "wb") as f:
-                pickle.dump(self.user_chat_histories, f)
-        except Exception as e:
-            print("[MultiModal] Warning: failed to save chat histories:", e)
-    # ----------------------
-    # Lazy loaders
-    # ----------------------
-    def _load_voice_models(self):
-        if self._loaded["voice"]:
-            return
-        print("[MultiModal] Loading voice/emotion models...")
-        self._voice_processor = Wav2Vec2Processor.from_pretrained(self.model_names["voice_emotion_processor"])
-        self._voice_emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained(self.model_names["voice_emotion_model"]).to(self.device)
-        self._loaded["voice"] = True
-        print("[MultiModal] Voice models loaded.")
-    def _load_translation(self):
-        if self._loaded["translation"]:
-            return
-        print("[MultiModal] Loading translation pipeline...")
-        device_idx = 0 if self.device == "cuda" else -1
-        self._translator = pipeline("translation", model=self.model_names["translation_model"], device=device_idx)
-        self._loaded["translation"] = True
-        print("[MultiModal] Translation loaded.")
-    def _load_chatbot(self):
-        if self._loaded["chat"]:
-            return
-        print("[MultiModal] Loading chatbot model...")
-        # chatbot: keep current blenderbot to preserve behaviour
-        self._chat_tokenizer = AutoTokenizer.from_pretrained(self.model_names["chatbot_tokenizer"])
-        self._chat_model = AutoModelForSeq2SeqLM.from_pretrained(self.model_names["chatbot_model"]).to(self.device)
-        self._loaded["chat"] = True
-        print("[MultiModal] Chatbot loaded.")
-    def _load_image_captioner(self):
-        if self._loaded["image_caption"]:
-            return
-        print("[MultiModal] Loading image captioner...")
-        device_idx = 0 if self.device == "cuda" else -1
-        self._image_captioner = pipeline("image-to-text", model=self.model_names["image_captioner"], device=device_idx)
-        self._loaded["image_caption"] = True
-        print("[MultiModal] Image captioner loaded.")
-    def _load_sd(self):
-        if self._loaded["sd"]:
-            return
-        print("[MultiModal] Loading Stable Diffusion pipelines...")
-        # text2img
-        sd_model = self.model_names["sd_text2img"]
-        sd_inpaint_model = self.model_names["sd_inpaint"]
-        # Use float16 on GPU for speed
-        torch_dtype = torch.float16 if self.device == "cuda" else torch.float32
-        try:
-            self._sd_pipe = StableDiffusionPipeline.from_pretrained(sd_model, torch_dtype=torch_dtype)
-            self._sd_pipe = self._sd_pipe.to(self.device)
-        except Exception as e:
-            print("[MultiModal] Warning loading text2img:", e)
-            self._sd_pipe = None
-        try:
-            self._sd_inpaint = StableDiffusionInpaintPipeline.from_pretrained(sd_inpaint_model, torch_dtype=torch_dtype)
-            self._sd_inpaint = self._sd_inpaint.to(self.device)
-        except Exception as e:
-            print("[MultiModal] Warning loading inpaint:", e)
-            self._sd_inpaint = None
-        self._loaded["sd"] = True
-        print("[MultiModal] Stable Diffusion loaded (where possible).")
-    def _load_code_model(self):
-        if self._loaded["code"]:
-            return
-        print("[MultiModal] Loading code model...")
-        # StarCoder style model (may require HF_TOKEN or large memory)
-        try:
-            self._code_tokenizer = HFTokenizer.from_pretrained(self.model_names["code_model"])
-            self._code_model = AutoModelForCausalLM.from_pretrained(self.model_names["code_model"]).to(self.device)
-            self._loaded["code"] = True
-            print("[MultiModal] Code model loaded.")
-        except Exception as e:
-            print("[MultiModal] Warning: could not load code model:", e)
-            self._code_tokenizer = None
-            self._code_model = None
-    # ----------------------
-    # Voice: analyze emotion, transcribe
-    # ----------------------
-    async def analyze_voice_emotion(self, audio_path: str) -> str:
-        self._load_voice_models()
-        speech, sr_ = librosa.load(audio_path, sr=16000)
-        inputs = self._voice_processor(speech, sampling_rate=sr_, return_tensors="pt", padding=True).to(self.device)
-        with torch.no_grad():
-            logits = self._voice_emotion_model(**inputs).logits
-        predicted_class = torch.argmax(logits).item()
-        return {
-            0: "😊 Happy",
-            1: "😢 Sad",
-            2: "😠 Angry",
-            3: "😨 Fearful",
-            4: "😌 Calm",
-            5: "😲 Surprised",
-        }.get(predicted_class, "🤔 Unknown")
-    async def process_voice_message(self, voice_file, user_id: int) -> dict:
-        """
-        voice_file: Starlette UploadFile or object with get_file() used previously in your code.
-        Returns: {text, language, emotion}
-        """
-        # Save OGG locally
-        ogg_path = _tmp_path(".ogg")
-        wav_path = _tmp_path(".wav")
-        tf = await voice_file.get_file()
-        await tf.download_to_drive(ogg_path)
-        # Convert to WAV via ffmpeg
-        try:
-            ffmpeg_path = ffmpeg.get_ffmpeg_exe()
-            subprocess.run([ffmpeg_path, "-y", "-i", ogg_path, wav_path], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        except Exception as e:
-            # fallback: try ffmpeg in PATH
-            try:
-                subprocess.run(["ffmpeg", "-y", "-i", ogg_path, wav_path], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            except Exception as ee:
-                raise RuntimeError(f"ffmpeg conversion failed: {e} / {ee}")
-        # Transcribe using SpeechRecognition Google STT (as before) -- or you can integrate whisper
-        recognizer = self._sr_recognizer
-        with sr.AudioFile(wav_path) as source:
-            audio = recognizer.record(source)
-        detected_lang = None
-        detected_text = ""
-        # tried languages set
-        lang_map = {
-            "zh": {"stt": "zh-CN"},
-            "ja": {"stt": "ja-JP"},
-            "ko": {"stt": "ko-KR"},
-            "en": {"stt": "en-US"},
-            "es": {"stt": "es-ES"},
-            "fr": {"stt": "fr-FR"},
-            "de": {"stt": "de-DE"},
-            "it": {"stt": "it-IT"},
-        }
-        for lang_code, lang_data in lang_map.items():
-            try:
-                detected_text = recognizer.recognize_google(audio, language=lang_data["stt"])
-                detected_lang = lang_code
-                break
-            except sr.UnknownValueError:
-                continue
-            except Exception:
-                continue
-        if not detected_lang:
-            # If not recognized, try fallback: detect from small chunk via langdetect
-            detected_lang = "en"
-            detected_text = ""
-        # emotion
-        emotion = await self.analyze_voice_emotion(wav_path)
-        # remove temp files
-        try:
-            os.remove(ogg_path)
-            os.remove(wav_path)
-        except Exception:
-            pass
-        return {"text": detected_text, "language": detected_lang, "emotion": emotion}
-    # ----------------------
-    # Text chat with translation & history
-    # ----------------------
-    async def generate_response(self, text: str, user_id: int, lang: str = "en") -> str:
-        # Ensure chat model loaded
-        self._load_chatbot()
-        self._load_translation()
-        if user_id not in self.user_chat_histories:
-            self.user_chat_histories[user_id] = []
-        self.user_chat_histories[user_id].append({"timestamp": datetime.now().isoformat(), "role": "user", "text": text, "language": lang})
-        self.user_chat_histories[user_id] = self.user_chat_histories[user_id][-100:]
-        self._save_chat_histories()
-        # Build context: translate last few msgs to English for consistency
-        context_texts = []
-        for msg in self.user_chat_histories[user_id][-5:]:
-            if msg.get("language", "en") != "en":
-                try:
-                    translated = self._translator(msg["text"])[0]["translation_text"]
-                except Exception:
-                    translated = msg["text"]
-            else:
-                translated = msg["text"]
-            context_texts.append(f"{msg['role']}: {translated}")
-        context = "\n".join(context_texts)
-        input_text = f"Context:\n{context}\nUser: {text if lang == 'en' else context_texts[-1].split(': ', 1)[1]}"
-        # Tokenize + generate
-        inputs = self._chat_tokenizer.encode(input_text, return_tensors="pt").to(self.device)
-        outputs = self._chat_model.generate(inputs, max_length=1000)
-        response_en = self._chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
-        # Translate back to user's language if needed
-        if lang != "en":
-            try:
-                response = self._translator(response_en)[0]["translation_text"]
-            except Exception:
-                response = response_en
-        else:
-            response = response_en
-        self.user_chat_histories[user_id].append({"timestamp": datetime.now().isoformat(), "role": "bot", "text": response, "language": lang})
-        self._save_chat_histories()
-        return response
-    # ----------------------
-    # Image captioning (existing)
-    # ----------------------
-    async def process_image_message(self, image_file, user_id: int) -> str:
-        # Save image
-        img_path = _tmp_path(".jpg")
-        tf = await image_file.get_file()
-        await tf.download_to_drive(img_path)
-        # load captioner
-        self._load_image_captioner()
-        try:
-            image = Image.open(img_path).convert("RGB")
-            description = self._image_captioner(image)[0]["generated_text"]
-        except Exception as e:
-            description = f"[Error generating caption: {e}]"
-        # cleanup
-        try:
-            os.remove(img_path)
-        except Exception:
-            pass
-        # store in history
-        if user_id not in self.user_chat_histories:
-            self.user_chat_histories[user_id] = []
-        self.user_chat_histories[user_id].append({"timestamp": datetime.now().isoformat(), "role": "user", "text": "[Image]", "language": "en"})
-        self.user_chat_histories[user_id].append({"timestamp": datetime.now().isoformat(), "role": "bot", "text": f"Image description: {description}", "language": "en"})
-        self._save_chat_histories()
-        return description
-    # ----------------------
-    # Voice reply (TTS)
-    # ----------------------
-    async def generate_voice_reply(self, text: str, user_id: int, fmt: str = "ogg") -> str:
-        """
-        Generate TTS audio reply using gTTS (or swap out to another TTS if you have).
-        Returns path to audio file.
-        """
-        mp3_path = _tmp_path(".mp3")
-        out_path = _tmp_path(f".{fmt}")
-        try:
-            tts = gTTS(text)
-            tts.save(mp3_path)
-            # convert to requested format using ffmpeg (ogg/opus for Telegram voice)
-            ffmpeg_path = ffmpeg.get_ffmpeg_exe()
-            if fmt == "ogg":
-                # convert mp3 -> ogg (opus)
-                subprocess.run([ffmpeg_path, "-y", "-i", mp3_path, "-c:a", "libopus", out_path], check=True)
-            elif fmt == "wav":
-                subprocess.run([ffmpeg_path, "-y", "-i", mp3_path, out_path], check=True)
-            else:
-                # default: return mp3
-                shutil.move(mp3_path, out_path)
-        except Exception as e:
-            # fallback: raise
-            raise RuntimeError(f"TTS failed: {e}")
-        finally:
-            try:
-                if os.path.exists(mp3_path) and os.path.exists(out_path) and mp3_path != out_path:
-                    os.remove(mp3_path)
-            except Exception:
-                pass
-        return out_path
-    # ----------------------
-    # Image generation (text -> image)
-    # ----------------------
-    async def generate_image_from_text(self, prompt: str, user_id: int, width: int = 512, height: int = 512, steps: int = 30) -> str:
-        self._load_sd()
-        if self._sd_pipe is None:
-            raise RuntimeError("Stable Diffusion pipeline not available.")
-        out_path = _tmp_path(".png")
-        try:
-            # diffusion pipeline uses CPU/GPU internally
-            result = self._sd_pipe(prompt, num_inference_steps=steps, height=height, width=width)
-            image = result.images[0]
-            image.save(out_path)
-        except Exception as e:
-            raise RuntimeError(f"Image generation failed: {e}")
-        return out_path
-    # ----------------------
-    # Image editing (inpainting)
-    # ----------------------
-    async def edit_image_inpaint(self, image_file, mask_file=None, prompt: str = "", user_id: int = 0) -> str:
-        self._load_sd()
-        if self._sd_inpaint is None:
-            raise RuntimeError("Inpainting pipeline not available.")
-        # Save files
-        img_path = _tmp_path(".png")
-        tf = await image_file.get_file()
-        await tf.download_to_drive(img_path)
-        if mask_file:
-            mask_path = _tmp_path(".png")
-            m_tf = await mask_file.get_file()
-            await m_tf.download_to_drive(mask_path)
-            mask_image = Image.open(mask_path).convert("L")
-        else:
-            # default mask (edit entire image)
-            mask_image = Image.new("L", Image.open(img_path).size, color=255)
-            mask_path = None
-        init_image = Image.open(img_path).convert("RGB")
-        # run inpaint
-        out_path = _tmp_path(".png")
-        try:
-            result = self._sd_inpaint(prompt=prompt if prompt else " ", image=init_image, mask_image=mask_image, guidance_scale=7.5, num_inference_steps=30)
-            edited = result.images[0]
-            edited.save(out_path)
-        except Exception as e:
-            raise RuntimeError(f"Inpainting failed: {e}")
-        finally:
-            try:
-                os.remove(img_path)
-                if mask_path:
-                    os.remove(mask_path)
-            except Exception:
-                pass
-        return out_path
-    # ----------------------
-    # Video processing: extract audio, frames, summarize
-    # ----------------------
-    async def process_video(self, video_file, user_id: int, max_frames: int = 4) -> dict:
-        """
-        Accepts uploaded video file, extracts audio (for transcription) and sample frames,
-        returns summary: {duration, fps, transcriptions, captions}
-        """
-        vid_path = _tmp_path(".mp4")
-        tf = await video_file.get_file()
-        await tf.download_to_drive(vid_path)
-        # Extract audio
-        audio_path = _tmp_path(".wav")
-        try:
-            clip = mp.VideoFileClip(vid_path)
-            clip.audio.write_audiofile(audio_path, logger=None)
-            duration = clip.duration
-            fps = clip.fps
-        except Exception as e:
-            raise RuntimeError(f"Video processing failed: {e}")
-        # Transcribe audio using the same process_voice_message flow: use SpeechRecognition or integrate Whisper
-        # For now we'll try SpeechRecognition on the audio
-        recognizer = sr.Recognizer()
-        with sr.AudioFile(audio_path) as source:
-            audio = recognizer.record(source)
-        transcribed = ""
-        try:
-            transcribed = recognizer.recognize_google(audio)
-        except Exception:
-            transcribed = ""
-        # Extract a few frames evenly
-        frames = []
-        try:
-            clip_reader = imageio.get_reader(vid_path, "ffmpeg")
-            total_frames = clip_reader.count_frames()
-            step = max(1, total_frames // max_frames)
-            for i in range(0, total_frames, step):
-                try:
-                    frame = clip_reader.get_data(i)
-                    pil = Image.fromarray(frame)
-                    ppath = _tmp_path(".jpg")
-                    pil.save(ppath)
-                    frames.append(ppath)
-                    if len(frames) >= max_frames:
-                        break
-                except Exception:
-                    continue
-            clip_reader.close()
-        except Exception:
-            pass
-        # Use image captioner on the frames
-        captions = []
-        if frames:
-            self._load_image_captioner()
-            for p in frames:
-                try:
-                    img = Image.open(p).convert("RGB")
-                    c = self._image_captioner(img)[0]["generated_text"]
-                    captions.append(c)
-                except Exception:
-                    captions.append("")
-                finally:
-                    try:
-                        os.remove(p)
-                    except Exception:
-                        pass
-        # cleanup
-        try:
-            os.remove(vid_path)
-            os.remove(audio_path)
-        except Exception:
-            pass
-        return {"duration": duration, "fps": fps, "transcription": transcribed, "captions": captions}
-    # ----------------------
-    # File processing (PDF, DOCX, TXT, CSV)
-    # ----------------------
-    async def process_file(self, file_obj, user_id: int) -> dict:
-        """
-        Reads a file, extracts text (supports PDF/TXT/CSV/DOCX if python-docx added),
-        and returns a short summary.
-        """
-        # Save file
-        fpath = _tmp_path()
-        tf = await file_obj.get_file()
-        await tf.download_to_drive(fpath)
-        lower = fpath.lower()
-        text = ""
-        if fpath.endswith(".pdf"):
-            try:
-                doc = fitz.open(fpath)
-                for page in doc:
-                    text += page.get_text()
-            except Exception as e:
-                text = f"[PDF read error: {e}]"
-        elif fpath.endswith((".txt", ".csv")):
-            try:
-                with open(fpath, "r", encoding="utf-8", errors="ignore") as fh:
-                    text = fh.read()
-            except Exception as e:
-                text = f"[File read error: {e}]"
-        elif fpath.endswith(".docx"):
-            try:
-                import docx
-                doc = docx.Document(fpath)
-                text = "\n".join([p.text for p in doc.paragraphs])
-            except Exception as e:
-                text = f"[DOCX read error: {e}]"
-        else:
-            text = "[Unsupported file type]"
-        # Summarize: simple heuristic or use translator/chat model to summarize (but that costs compute)
-        summary = text[:300] + ("..." if len(text) > 300 else "")
-        try:
-            os.remove(fpath)
-        except Exception:
-            pass
-        return {"summary": summary, "full_text_length": len(text)}
-    # ----------------------
-    # Code assistance: generate / explain code
-    # ----------------------
-    async def code_complete(self, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str:
-        """
-        Uses a code LLM (StarCoder or similar) to complete or generate code.
-        """
-        self._load_code_model()
-        if not self._code_model or not self._code_tokenizer:
-            raise RuntimeError("Code model not available.")
-        input_ids = self._code_tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
-        gen = self._code_model.generate(input_ids, max_new_tokens=max_tokens, do_sample=False)
-        out = self._code_tokenizer.decode(gen[0], skip_special_tokens=True)
-        return out
-    # ----------------------
-    # Optional: execute Python code in sandbox (WARNING: security risk)
-    # ----------------------
-    async def execute_python_code(self, code: str, timeout: int = 5) -> dict:
-        """
-        Execute Python code in a very limited sandbox subprocess.
-        WARNING: Running arbitrary code is dangerous. Use only with trusted inputs or stronger sandboxing (containers).
-        """
-        # Create temp dir
-        d = tempfile.mkdtemp()
-        file_path = os.path.join(d, "main.py")
-        with open(file_path, "w", encoding="utf-8") as f:
-            f.write(code)
-        # run with timeout
-        try:
-            proc = await asyncio.create_subprocess_exec(
-                "python3", file_path,
-                stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.PIPE,
-            )
-            try:
-                stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
-            except asyncio.TimeoutError:
-                proc.kill()
-                return {"error": "Execution timed out"}
-            return {"stdout": stdout.decode("utf-8", errors="ignore"), "stderr": stderr.decode("utf-8", errors="ignore")}
-        finally:
-            try:
-                shutil.rmtree(d)
-            except Exception:
-                pass

+# multimodal_module.py
+import os
+import pickle
+import subprocess
+import tempfile
+import shutil
+import asyncio
+from datetime import datetime
+from huggingface_hub import hf_hub_download, snapshot_download
+from typing import Dict, List, Optional, Any
+import io
+import uuid
+# Core ML libs
+import torch
+from transformers import (
+    pipeline,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    Wav2Vec2Processor,
+    Wav2Vec2ForSequenceClassification,
+)
+from diffusers import StableDiffusionPipeline, StableDiffusionInpaintPipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer as HFTokenizer
+# Audio / speech
+import librosa
+import speech_recognition as sr
+from gtts import gTTS
+# Image, video, files
+from PIL import Image, ImageOps
+import imageio_ffmpeg as ffmpeg
+import imageio
+import moviepy.editor as mp
+import fitz  # PyMuPDF for PDFs
+# Misc
+from langdetect import DetectorFactory
+DetectorFactory.seed = 0
+# Optional: safety-check toggles
+USE_SAFETY_CHECKER = False
+# Helper for temp files
+def _tmp_path(suffix=""):
+    return os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}{suffix}")
+class MultiModalChatModule:
+    """
+    Full-power multimodal module.
+    - Lazy-loads big models on first use.
+    - Methods are async-friendly.
+    """
+    def __init__(self, chat_history_file: str = "chat_histories.pkl"):
+        self.user_chat_histories: Dict[int, List[dict]] = self._load_chat_histories(chat_history_file)
+        self.chat_history_file = chat_history_file
+        # device
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"[MultiModal] device: {self.device}")
+        # placeholders for large models (lazy)
+        self._voice_processor = None
+        self._voice_emotion_model = None
+        self._translator = None
+        self._chat_tokenizer = None
+        self._chat_model = None
+        self._chat_model_name = "bigscience/bloom"  # placeholder; will set proper below
+        self._image_captioner = None
+        self._sd_pipe = None
+        self._sd_inpaint = None
+        self._code_tokenizer = None
+        self._code_model = None
+        # other small helpers
+        self._sr_recognizer = sr.Recognizer()
+        # set common model names (you can change)
+        self.model_names = {
+            "voice_emotion_processor": "facebook/hubert-large-ls960-ft",
+            "voice_emotion_model": "superb/hubert-base-superb-er",
+            "translation_model": "facebook/nllb-200-distilled-600M",
+            "chatbot_tokenizer": "facebook/blenderbot-400M-distill",
+            "chatbot_model": "facebook/blenderbot-400M-distill",
+            "image_captioner": "Salesforce/blip-image-captioning-base",
+            "sd_inpaint": "runwayml/stable-diffusion-inpainting",
+            "sd_text2img": "runwayml/stable-diffusion-v1-5",
+            "code_model": "bigcode/starcoder",  # Or use a specific StarCoder checkpoint on HF
+        }
+        # keep track of which heavy groups are loaded
+        self._loaded = {
+            "voice": False,
+            "translation": False,
+            "chat": False,
+            "image_caption": False,
+            "sd": False,
+            "code": False,
+        }
+    # ----------------------
+    # persistence
+    # ----------------------
+    def _load_chat_histories(self, fn: str) -> Dict[int, List[dict]]:
+        try:
+            with open(fn, "rb") as f:
+                return pickle.load(f)
+        except Exception:
+            return {}
+    def _save_chat_histories(self):
+        try:
+            with open(self.chat_history_file, "wb") as f:
+                pickle.dump(self.user_chat_histories, f)
+        except Exception as e:
+            print("[MultiModal] Warning: failed to save chat histories:", e)
+    # ----------------------
+    # Lazy loaders
+    # ----------------------
+    def _load_voice_models(self):
+        if self._loaded["voice"]:
+            return
+        print("[MultiModal] Loading voice/emotion models...")
+        self._voice_processor = Wav2Vec2Processor.from_pretrained(self.model_names["voice_emotion_processor"])
+        self._voice_emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained(self.model_names["voice_emotion_model"]).to(self.device)
+        self._loaded["voice"] = True
+        print("[MultiModal] Voice models loaded.")
+    def _load_translation(self):
+        if self._loaded["translation"]:
+            return
+        print("[MultiModal] Loading translation pipeline...")
+        device_idx = 0 if self.device == "cuda" else -1
+        self._translator = pipeline("translation", model=self.model_names["translation_model"], device=device_idx)
+        self._loaded["translation"] = True
+        print("[MultiModal] Translation loaded.")
+    def _load_chatbot(self):
+        if self._loaded["chat"]:
+            return
+        print("[MultiModal] Loading chatbot model...")
+        # chatbot: keep current blenderbot to preserve behaviour
+        self._chat_tokenizer = AutoTokenizer.from_pretrained(self.model_names["chatbot_tokenizer"])
+        self._chat_model = AutoModelForSeq2SeqLM.from_pretrained(self.model_names["chatbot_model"]).to(self.device)
+        self._loaded["chat"] = True
+        print("[MultiModal] Chatbot loaded.")
+    def _load_image_captioner(self):
+        if self._loaded["image_caption"]:
+            return
+        print("[MultiModal] Loading image captioner...")
+        device_idx = 0 if self.device == "cuda" else -1
+        self._image_captioner = pipeline("image-to-text", model=self.model_names["image_captioner"], device=device_idx)
+        self._loaded["image_caption"] = True
+        print("[MultiModal] Image captioner loaded.")
+    def _load_sd(self):
+        if self._loaded["sd"]:
+            return
+        print("[MultiModal] Loading Stable Diffusion pipelines...")
+        # text2img
+        sd_model = self.model_names["sd_text2img"]
+        sd_inpaint_model = self.model_names["sd_inpaint"]
+        # Use float16 on GPU for speed
+        torch_dtype = torch.float16 if self.device == "cuda" else torch.float32
+        try:
+            self._sd_pipe = StableDiffusionPipeline.from_pretrained(sd_model, torch_dtype=torch_dtype)
+            self._sd_pipe = self._sd_pipe.to(self.device)
+        except Exception as e:
+            print("[MultiModal] Warning loading text2img:", e)
+            self._sd_pipe = None
+        try:
+            self._sd_inpaint = StableDiffusionInpaintPipeline.from_pretrained(sd_inpaint_model, torch_dtype=torch_dtype)
+            self._sd_inpaint = self._sd_inpaint.to(self.device)
+        except Exception as e:
+            print("[MultiModal] Warning loading inpaint:", e)
+            self._sd_inpaint = None
+        self._loaded["sd"] = True
+        print("[MultiModal] Stable Diffusion loaded (where possible).")
+    def _load_code_model(self):
+        if self._loaded["code"]:
+            return
+        print("[MultiModal] Loading code model...")
+        # StarCoder style model (may require HF_TOKEN or large memory)
+        try:
+            self._code_tokenizer = HFTokenizer.from_pretrained(self.model_names["code_model"])
+            self._code_model = AutoModelForCausalLM.from_pretrained(self.model_names["code_model"]).to(self.device)
+            self._loaded["code"] = True
+            print("[MultiModal] Code model loaded.")
+        except Exception as e:
+            print("[MultiModal] Warning: could not load code model:", e)
+            self._code_tokenizer = None
+            self._code_model = None
+    # ----------------------
+    # Voice: analyze emotion, transcribe
+    # ----------------------
+    async def analyze_voice_emotion(self, audio_path: str) -> str:
+        self._load_voice_models()
+        speech, sr_ = librosa.load(audio_path, sr=16000)
+        inputs = self._voice_processor(speech, sampling_rate=sr_, return_tensors="pt", padding=True).to(self.device)
+        with torch.no_grad():
+            logits = self._voice_emotion_model(**inputs).logits
+        predicted_class = torch.argmax(logits).item()
+        return {
+            0: "😊 Happy",
+            1: "😢 Sad",
+            2: "😠 Angry",
+            3: "😨 Fearful",
+            4: "😌 Calm",
+            5: "😲 Surprised",
+        }.get(predicted_class, "🤔 Unknown")
+    async def process_voice_message(self, voice_file, user_id: int) -> dict:
+        """
+        voice_file: Starlette UploadFile or object with get_file() used previously in your code.
+        Returns: {text, language, emotion}
+        """
+        # Save OGG locally
+        ogg_path = _tmp_path(".ogg")
+        wav_path = _tmp_path(".wav")
+        tf = await voice_file.get_file()
+        await tf.download_to_drive(ogg_path)
+        # Convert to WAV via ffmpeg
+        try:
+            ffmpeg_path = ffmpeg.get_ffmpeg_exe()
+            subprocess.run([ffmpeg_path, "-y", "-i", ogg_path, wav_path], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        except Exception as e:
+            # fallback: try ffmpeg in PATH
+            try:
+                subprocess.run(["ffmpeg", "-y", "-i", ogg_path, wav_path], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            except Exception as ee:
+                raise RuntimeError(f"ffmpeg conversion failed: {e} / {ee}")
+        # Transcribe using SpeechRecognition Google STT (as before) -- or you can integrate whisper
+        recognizer = self._sr_recognizer
+        with sr.AudioFile(wav_path) as source:
+            audio = recognizer.record(source)
+        detected_lang = None
+        detected_text = ""
+        # tried languages set
+        lang_map = {
+            "zh": {"stt": "zh-CN"},
+            "ja": {"stt": "ja-JP"},
+            "ko": {"stt": "ko-KR"},
+            "en": {"stt": "en-US"},
+            "es": {"stt": "es-ES"},
+            "fr": {"stt": "fr-FR"},
+            "de": {"stt": "de-DE"},
+            "it": {"stt": "it-IT"},
+        }
+        for lang_code, lang_data in lang_map.items():
+            try:
+                detected_text = recognizer.recognize_google(audio, language=lang_data["stt"])
+                detected_lang = lang_code
+                break
+            except sr.UnknownValueError:
+                continue
+            except Exception:
+                continue
+        if not detected_lang:
+            # If not recognized, try fallback: detect from small chunk via langdetect
+            detected_lang = "en"
+            detected_text = ""
+        # emotion
+        emotion = await self.analyze_voice_emotion(wav_path)
+        # remove temp files
+        try:
+            os.remove(ogg_path)
+            os.remove(wav_path)
+        except Exception:
+            pass
+        return {"text": detected_text, "language": detected_lang, "emotion": emotion}
+    # ----------------------
+    # Text chat with translation & history
+    # ----------------------
+    async def generate_response(self, text: str, user_id: int, lang: str = "en") -> str:
+        # Ensure chat model loaded
+        self._load_chatbot()
+        self._load_translation()
+        if user_id not in self.user_chat_histories:
+            self.user_chat_histories[user_id] = []
+        self.user_chat_histories[user_id].append({"timestamp": datetime.now().isoformat(), "role": "user", "text": text, "language": lang})
+        self.user_chat_histories[user_id] = self.user_chat_histories[user_id][-100:]
+        self._save_chat_histories()
+        # Build context: translate last few msgs to English for consistency
+        context_texts = []
+        for msg in self.user_chat_histories[user_id][-5:]:
+            if msg.get("language", "en") != "en":
+                try:
+                    translated = self._translator(msg["text"])[0]["translation_text"]
+                except Exception:
+                    translated = msg["text"]
+            else:
+                translated = msg["text"]
+            context_texts.append(f"{msg['role']}: {translated}")
+        context = "\n".join(context_texts)
+        input_text = f"Context:\n{context}\nUser: {text if lang == 'en' else context_texts[-1].split(': ', 1)[1]}"
+        # Tokenize + generate
+        inputs = self._chat_tokenizer.encode(input_text, return_tensors="pt").to(self.device)
+        outputs = self._chat_model.generate(inputs, max_length=1000)
+        response_en = self._chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Translate back to user's language if needed
+        if lang != "en":
+            try:
+                response = self._translator(response_en)[0]["translation_text"]
+            except Exception:
+                response = response_en
+        else:
+            response = response_en
+        self.user_chat_histories[user_id].append({"timestamp": datetime.now().isoformat(), "role": "bot", "text": response, "language": lang})
+        self._save_chat_histories()
+        return response
+    # ----------------------
+    # Image captioning (existing)
+    # ----------------------
+    async def process_image_message(self, image_file, user_id: int) -> str:
+        # Save image
+        img_path = _tmp_path(".jpg")
+        tf = await image_file.get_file()
+        await tf.download_to_drive(img_path)
+        # load captioner
+        self._load_image_captioner()
+        try:
+            image = Image.open(img_path).convert("RGB")
+            description = self._image_captioner(image)[0]["generated_text"]
+        except Exception as e:
+            description = f"[Error generating caption: {e}]"
+        # cleanup
+        try:
+            os.remove(img_path)
+        except Exception:
+            pass
+        # store in history
+        if user_id not in self.user_chat_histories:
+            self.user_chat_histories[user_id] = []
+        self.user_chat_histories[user_id].append({"timestamp": datetime.now().isoformat(), "role": "user", "text": "[Image]", "language": "en"})
+        self.user_chat_histories[user_id].append({"timestamp": datetime.now().isoformat(), "role": "bot", "text": f"Image description: {description}", "language": "en"})
+        self._save_chat_histories()
+        return description
+    # ----------------------
+    # Voice reply (TTS)
+    # ----------------------
+    async def generate_voice_reply(self, text: str, user_id: int, fmt: str = "ogg") -> str:
+        """
+        Generate TTS audio reply using gTTS (or swap out to another TTS if you have).
+        Returns path to audio file.
+        """
+        mp3_path = _tmp_path(".mp3")
+        out_path = _tmp_path(f".{fmt}")
+        try:
+            tts = gTTS(text)
+            tts.save(mp3_path)
+            # convert to requested format using ffmpeg (ogg/opus for Telegram voice)
+            ffmpeg_path = ffmpeg.get_ffmpeg_exe()
+            if fmt == "ogg":
+                # convert mp3 -> ogg (opus)
+                subprocess.run([ffmpeg_path, "-y", "-i", mp3_path, "-c:a", "libopus", out_path], check=True)
+            elif fmt == "wav":
+                subprocess.run([ffmpeg_path, "-y", "-i", mp3_path, out_path], check=True)
+            else:
+                # default: return mp3
+                shutil.move(mp3_path, out_path)
+        except Exception as e:
+            # fallback: raise
+            raise RuntimeError(f"TTS failed: {e}")
+        finally:
+            try:
+                if os.path.exists(mp3_path) and os.path.exists(out_path) and mp3_path != out_path:
+                    os.remove(mp3_path)
+            except Exception:
+                pass
+        return out_path
+    # ----------------------
+    # Image generation (text -> image)
+    # ----------------------
+    async def generate_image_from_text(self, prompt: str, user_id: int, width: int = 512, height: int = 512, steps: int = 30) -> str:
+        self._load_sd()
+        if self._sd_pipe is None:
+            raise RuntimeError("Stable Diffusion pipeline not available.")
+        out_path = _tmp_path(".png")
+        try:
+            # diffusion pipeline uses CPU/GPU internally
+            result = self._sd_pipe(prompt, num_inference_steps=steps, height=height, width=width)
+            image = result.images[0]
+            image.save(out_path)
+        except Exception as e:
+            raise RuntimeError(f"Image generation failed: {e}")
+        return out_path
+    # ----------------------
+    # Image editing (inpainting)
+    # ----------------------
+    async def edit_image_inpaint(self, image_file, mask_file=None, prompt: str = "", user_id: int = 0) -> str:
+        self._load_sd()
+        if self._sd_inpaint is None:
+            raise RuntimeError("Inpainting pipeline not available.")
+        # Save files
+        img_path = _tmp_path(".png")
+        tf = await image_file.get_file()
+        await tf.download_to_drive(img_path)
+        if mask_file:
+            mask_path = _tmp_path(".png")
+            m_tf = await mask_file.get_file()
+            await m_tf.download_to_drive(mask_path)
+            mask_image = Image.open(mask_path).convert("L")
+        else:
+            # default mask (edit entire image)
+            mask_image = Image.new("L", Image.open(img_path).size, color=255)
+            mask_path = None
+        init_image = Image.open(img_path).convert("RGB")
+        # run inpaint
+        out_path = _tmp_path(".png")
+        try:
+            result = self._sd_inpaint(prompt=prompt if prompt else " ", image=init_image, mask_image=mask_image, guidance_scale=7.5, num_inference_steps=30)
+            edited = result.images[0]
+            edited.save(out_path)
+        except Exception as e:
+            raise RuntimeError(f"Inpainting failed: {e}")
+        finally:
+            try:
+                os.remove(img_path)
+                if mask_path:
+                    os.remove(mask_path)
+            except Exception:
+                pass
+        return out_path
+    # ----------------------
+    # Video processing: extract audio, frames, summarize
+    # ----------------------
+    async def process_video(self, video_file, user_id: int, max_frames: int = 4) -> dict:
+        """
+        Accepts uploaded video file, extracts audio (for transcription) and sample frames,
+        returns summary: {duration, fps, transcriptions, captions}
+        """
+        vid_path = _tmp_path(".mp4")
+        tf = await video_file.get_file()
+        await tf.download_to_drive(vid_path)
+        # Extract audio
+        audio_path = _tmp_path(".wav")
+        try:
+            clip = mp.VideoFileClip(vid_path)
+            clip.audio.write_audiofile(audio_path, logger=None)
+            duration = clip.duration
+            fps = clip.fps
+        except Exception as e:
+            raise RuntimeError(f"Video processing failed: {e}")
+        # Transcribe audio using the same process_voice_message flow: use SpeechRecognition or integrate Whisper
+        # For now we'll try SpeechRecognition on the audio
+        recognizer = sr.Recognizer()
+        with sr.AudioFile(audio_path) as source:
+            audio = recognizer.record(source)
+        transcribed = ""
+        try:
+            transcribed = recognizer.recognize_google(audio)
+        except Exception:
+            transcribed = ""
+        # Extract a few frames evenly
+        frames = []
+        try:
+            clip_reader = imageio.get_reader(vid_path, "ffmpeg")
+            total_frames = clip_reader.count_frames()
+            step = max(1, total_frames // max_frames)
+            for i in range(0, total_frames, step):
+                try:
+                    frame = clip_reader.get_data(i)
+                    pil = Image.fromarray(frame)
+                    ppath = _tmp_path(".jpg")
+                    pil.save(ppath)
+                    frames.append(ppath)
+                    if len(frames) >= max_frames:
+                        break
+                except Exception:
+                    continue
+            clip_reader.close()
+        except Exception:
+            pass
+        # Use image captioner on the frames
+        captions = []
+        if frames:
+            self._load_image_captioner()
+            for p in frames:
+                try:
+                    img = Image.open(p).convert("RGB")
+                    c = self._image_captioner(img)[0]["generated_text"]
+                    captions.append(c)
+                except Exception:
+                    captions.append("")
+                finally:
+                    try:
+                        os.remove(p)
+                    except Exception:
+                        pass
+        # cleanup
+        try:
+            os.remove(vid_path)
+            os.remove(audio_path)
+        except Exception:
+            pass
+        return {"duration": duration, "fps": fps, "transcription": transcribed, "captions": captions}
+    # ----------------------
+    # File processing (PDF, DOCX, TXT, CSV)
+    # ----------------------
+    async def process_file(self, file_obj, user_id: int) -> dict:
+        """
+        Reads a file, extracts text (supports PDF/TXT/CSV/DOCX if python-docx added),
+        and returns a short summary.
+        """
+        # Save file
+        fpath = _tmp_path()
+        tf = await file_obj.get_file()
+        await tf.download_to_drive(fpath)
+        lower = fpath.lower()
+        text = ""
+        if fpath.endswith(".pdf"):
+            try:
+                doc = fitz.open(fpath)
+                for page in doc:
+                    text += page.get_text()
+            except Exception as e:
+                text = f"[PDF read error: {e}]"
+        elif fpath.endswith((".txt", ".csv")):
+            try:
+                with open(fpath, "r", encoding="utf-8", errors="ignore") as fh:
+                    text = fh.read()
+            except Exception as e:
+                text = f"[File read error: {e}]"
+        elif fpath.endswith(".docx"):
+            try:
+                import docx
+                doc = docx.Document(fpath)
+                text = "\n".join([p.text for p in doc.paragraphs])
+            except Exception as e:
+                text = f"[DOCX read error: {e}]"
+        else:
+            text = "[Unsupported file type]"
+        # Summarize: simple heuristic or use translator/chat model to summarize (but that costs compute)
+        summary = text[:300] + ("..." if len(text) > 300 else "")
+        try:
+            os.remove(fpath)
+        except Exception:
+            pass
+        return {"summary": summary, "full_text_length": len(text)}
+    # ----------------------
+    # Code assistance: generate / explain code
+    # ----------------------
+    async def code_complete(self, prompt: str, max_tokens: int = 512, temperature: float = 0.2) -> str:
+        """
+        Uses a code LLM (StarCoder or similar) to complete or generate code.
+        """
+        self._load_code_model()
+        if not self._code_model or not self._code_tokenizer:
+            raise RuntimeError("Code model not available.")
+        input_ids = self._code_tokenizer(prompt, return_tensors="pt").input_ids.to(self.device)
+        gen = self._code_model.generate(input_ids, max_new_tokens=max_tokens, do_sample=False)
+        out = self._code_tokenizer.decode(gen[0], skip_special_tokens=True)
+        return out
+    # ----------------------
+    # Optional: execute Python code in sandbox (WARNING: security risk)
+    # ----------------------
+    async def execute_python_code(self, code: str, timeout: int = 5) -> dict:
+        """
+        Execute Python code in a very limited sandbox subprocess.
+        WARNING: Running arbitrary code is dangerous. Use only with trusted inputs or stronger sandboxing (containers).
+        """
+        # Create temp dir
+        d = tempfile.mkdtemp()
+        file_path = os.path.join(d, "main.py")
+        with open(file_path, "w", encoding="utf-8") as f:
+            f.write(code)
+        # run with timeout
+        try:
+            proc = await asyncio.create_subprocess_exec(
+                "python3", file_path,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            try:
+                stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
+            except asyncio.TimeoutError:
+                proc.kill()
+                return {"error": "Execution timed out"}
+            return {"stdout": stdout.decode("utf-8", errors="ignore"), "stderr": stderr.decode("utf-8", errors="ignore")}
+        finally:
+            try:
+                shutil.rmtree(d)
+            except Exception:
+                pass