Princeaka committed
Commit 3647f22 · verified · 1 Parent(s): 8195d80

Upload deepseek_python_20250811_1f44d6.py

Files changed (1)
  1. deepseek_python_20250811_1f44d6.py +767 -0
deepseek_python_20250811_1f44d6.py ADDED
@@ -0,0 +1,767 @@
# multimodal_module.py
import os
import pickle
import subprocess
import tempfile
import shutil
import asyncio
import logging
from datetime import datetime
from typing import Dict, List, Optional, Any, Union
import uuid
import numpy as np

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger("MultiModalModule")

# Space-specific environment configuration
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Core ML Imports
import torch
from transformers import (
    pipeline,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Wav2Vec2Processor,
    Wav2Vec2ForSequenceClassification,
    AutoModelForCausalLM
)
from diffusers import (
    StableDiffusionPipeline,
    StableDiffusionInpaintPipeline
)
from huggingface_hub import hf_hub_download, snapshot_download

# Audio Processing
import librosa
import soundfile as sf
from gtts import gTTS
import speech_recognition as sr
import webrtcvad

# Image/Video Processing
from PIL import Image
import imageio
import imageio_ffmpeg
import moviepy.editor as mp
import cv2

# Document Processing
import fitz  # PyMuPDF
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

# Configuration
USE_SAFETY_CHECKER = False
MAX_HISTORY_LENGTH = 100
TEMP_DIR = "tmp"
MODEL_CACHE_DIR = "model_cache"

class MultiModalChatModule:
    """Complete multimodal module optimized for Hugging Face Spaces"""

    def __init__(self, chat_history_file: str = "chat_histories.pkl"):
        """Initialize with Space optimizations"""
        # Create required directories
        os.makedirs(TEMP_DIR, exist_ok=True)
        os.makedirs(MODEL_CACHE_DIR, exist_ok=True)

        # Device configuration
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if "cuda" in self.device else torch.float32
        logger.info(f"Initialized on {self.device.upper()} with dtype {self.torch_dtype}")

        # Model registry
        self.model_names = {
            "voice_emotion_processor": "facebook/hubert-large-ls960-ft",
            "voice_emotion_model": "superb/hubert-base-superb-er",
            "translation_model": "facebook/nllb-200-distilled-600M",
            "chatbot_tokenizer": "facebook/blenderbot-400M-distill",
            "chatbot_model": "facebook/blenderbot-400M-distill",
            "image_captioner": "Salesforce/blip-image-captioning-base",
            "sd_inpaint": "runwayml/stable-diffusion-inpainting",
            "sd_text2img": "runwayml/stable-diffusion-v1-5",
            "code_model": "bigcode/starcoder",
        }

        # Model placeholders
        self._voice_processor = None
        self._voice_emotion_model = None
        self._translator = None
        self._chat_tokenizer = None
        self._chat_model = None
        self._image_captioner = None
        self._sd_pipe = None
        self._sd_inpaint = None
        self._code_tokenizer = None
        self._code_model = None

        # Helpers
        self._sr_recognizer = sr.Recognizer()
        self.vad = webrtcvad.Vad(3)
        self.chat_history_file = chat_history_file
        self.user_chat_histories = self._load_chat_histories()

        # Load tracking
        self._loaded = {
            "voice": False,
            "translation": False,
            "chat": False,
            "image_caption": False,
            "sd": False,
            "code": False,
        }

    # ----------------------
    # Core Utilities
    # ----------------------
    def _tmp_path(self, suffix: str = "") -> str:
        """Generate Space-compatible temp file path"""
        path = os.path.join(TEMP_DIR, f"{uuid.uuid4().hex}{suffix}")
        os.makedirs(os.path.dirname(path), exist_ok=True)
        return path

    def _cleanup(self, *paths: str) -> None:
        """Safely remove files/directories"""
        for path in paths:
            try:
                if path and os.path.exists(path):
                    if os.path.isfile(path):
                        os.remove(path)
                    elif os.path.isdir(path):
                        shutil.rmtree(path)
            except Exception as e:
                logger.warning(f"Cleanup failed for {path}: {e}")

    def _load_chat_histories(self) -> Dict[int, List[dict]]:
        """Load chat histories from file"""
        try:
            with open(self.chat_history_file, "rb") as f:
                return pickle.load(f)
        except Exception as e:
            logger.warning(f"Failed loading chat history: {e}")
            return {}

    def _save_chat_histories(self) -> None:
        """Persist chat histories to file"""
        try:
            with open(self.chat_history_file, "wb") as f:
                pickle.dump(self.user_chat_histories, f)
        except Exception as e:
            logger.error(f"Failed saving chat history: {e}")

    def _update_history(self, user_id: int, role: str, content: Any, lang: str = "en") -> None:
        """Update conversation history"""
        if user_id not in self.user_chat_histories:
            self.user_chat_histories[user_id] = []

        self.user_chat_histories[user_id].append({
            "timestamp": datetime.now().isoformat(),
            "role": role,
            "content": content,
            "language": lang
        })

        # Enforce max history length
        self.user_chat_histories[user_id] = self.user_chat_histories[user_id][-MAX_HISTORY_LENGTH:]
        self._save_chat_histories()

    # ----------------------
    # Model Loading
    # ----------------------
    def _load_voice_models(self) -> None:
        """Load voice processing models"""
        if self._loaded["voice"]:
            return

        try:
            logger.info("Loading voice models...")
            self._voice_processor = Wav2Vec2Processor.from_pretrained(
                self.model_names["voice_emotion_processor"],
                cache_dir=MODEL_CACHE_DIR
            )
            self._voice_emotion_model = Wav2Vec2ForSequenceClassification.from_pretrained(
                self.model_names["voice_emotion_model"],
                cache_dir=MODEL_CACHE_DIR
            ).to(self.device)
            self._loaded["voice"] = True
            logger.info("Voice models loaded successfully")
        except Exception as e:
            logger.error(f"Failed loading voice models: {e}")

    def _load_translation(self) -> None:
        """Load translation pipeline"""
        if self._loaded["translation"]:
            return

        try:
            logger.info("Loading translation model...")
            device = 0 if self.device == "cuda" else -1
            # cache_dir is not a direct pipeline() argument; pass it via model_kwargs
            self._translator = pipeline(
                "translation",
                model=self.model_names["translation_model"],
                device=device,
                model_kwargs={"cache_dir": MODEL_CACHE_DIR}
            )
            self._loaded["translation"] = True
            logger.info("Translation model loaded successfully")
        except Exception as e:
            logger.error(f"Failed loading translation model: {e}")

    def _load_chatbot(self) -> None:
        """Load chatbot models"""
        if self._loaded["chat"]:
            return

        try:
            logger.info("Loading chatbot models...")
            self._chat_tokenizer = AutoTokenizer.from_pretrained(
                self.model_names["chatbot_tokenizer"],
                cache_dir=MODEL_CACHE_DIR
            )
            self._chat_model = AutoModelForSeq2SeqLM.from_pretrained(
                self.model_names["chatbot_model"],
                cache_dir=MODEL_CACHE_DIR
            ).to(self.device)
            self._loaded["chat"] = True
            logger.info("Chatbot models loaded successfully")
        except Exception as e:
            logger.error(f"Failed loading chatbot models: {e}")

    def _load_image_captioner(self) -> None:
        """Load image captioning model"""
        if self._loaded["image_caption"]:
            return

        try:
            logger.info("Loading image captioner...")
            device = 0 if self.device == "cuda" else -1
            self._image_captioner = pipeline(
                "image-to-text",
                model=self.model_names["image_captioner"],
                device=device,
                model_kwargs={"cache_dir": MODEL_CACHE_DIR}
            )
            self._loaded["image_caption"] = True
            logger.info("Image captioner loaded successfully")
        except Exception as e:
            logger.error(f"Failed loading image captioner: {e}")

    def _load_sd(self) -> None:
        """Load Stable Diffusion models"""
        if self._loaded["sd"]:
            return

        try:
            logger.info("Loading Stable Diffusion models...")

            # Text-to-image; disable the safety checker only when configured to
            sd_kwargs = {} if USE_SAFETY_CHECKER else {"safety_checker": None}
            self._sd_pipe = StableDiffusionPipeline.from_pretrained(
                self.model_names["sd_text2img"],
                torch_dtype=self.torch_dtype,
                cache_dir=MODEL_CACHE_DIR,
                **sd_kwargs
            ).to(self.device)

            # Inpainting
            self._sd_inpaint = StableDiffusionInpaintPipeline.from_pretrained(
                self.model_names["sd_inpaint"],
                torch_dtype=self.torch_dtype,
                cache_dir=MODEL_CACHE_DIR
            ).to(self.device)

            self._loaded["sd"] = True
            logger.info("Stable Diffusion models loaded successfully")
        except Exception as e:
            logger.error(f"Failed loading Stable Diffusion models: {e}")
            self._sd_pipe = None
            self._sd_inpaint = None

    def _load_code_model(self) -> None:
        """Load code generation model"""
        if self._loaded["code"]:
            return

        try:
            logger.info("Loading code model...")
            self._code_tokenizer = AutoTokenizer.from_pretrained(
                self.model_names["code_model"],
                cache_dir=MODEL_CACHE_DIR
            )
            self._code_model = AutoModelForCausalLM.from_pretrained(
                self.model_names["code_model"],
                cache_dir=MODEL_CACHE_DIR
            ).to(self.device)
            self._loaded["code"] = True
            logger.info("Code model loaded successfully")
        except Exception as e:
            logger.error(f"Failed loading code model: {e}")
            self._code_tokenizer = None
            self._code_model = None

    # ----------------------
    # Audio Processing
    # ----------------------
    async def analyze_voice_emotion(self, audio_path: str) -> str:
        """Analyze emotion from voice audio"""
        self._load_voice_models()
        if not self._voice_processor or not self._voice_emotion_model:
            return "unknown"

        try:
            # "sample_rate", not "sr": a local named "sr" would shadow the
            # speech_recognition module imported under that name
            speech, sample_rate = librosa.load(audio_path, sr=16000)
            inputs = self._voice_processor(
                speech,
                sampling_rate=sample_rate,
                return_tensors="pt",
                padding=True
            ).to(self.device)

            with torch.no_grad():
                logits = self._voice_emotion_model(**inputs).logits

            emotions = {
                0: "happy", 1: "sad", 2: "angry",
                3: "fearful", 4: "calm", 5: "surprised"
            }
            return emotions.get(torch.argmax(logits).item(), "unknown")
        except Exception as e:
            logger.error(f"Voice emotion analysis failed: {e}")
            return "error"

    async def process_voice_message(self, voice_file, user_id: int) -> Dict[str, Any]:
        """Process voice message to text with emotion analysis"""
        ogg_path = self._tmp_path(".ogg")
        wav_path = self._tmp_path(".wav")

        try:
            # Save and convert audio
            await voice_file.download_to_drive(ogg_path)

            # Convert to WAV
            ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
            cmd = [
                ffmpeg_path, "-y", "-i", ogg_path,
                "-ar", "16000", "-ac", "1", wav_path
            ]
            subprocess.run(cmd, check=True, capture_output=True)

            # Analyze audio
            speech, sample_rate = librosa.load(wav_path, sr=16000)

            # Voice Activity Detection: webrtcvad only accepts 10/20/30 ms
            # frames of 16-bit PCM, so scan the audio in 30 ms windows
            pcm = (speech * 32767).astype(np.int16).tobytes()
            frame_bytes = int(sample_rate * 0.03) * 2  # 30 ms of 16-bit samples
            is_speech = any(
                self.vad.is_speech(pcm[i:i + frame_bytes], sample_rate)
                for i in range(0, len(pcm) - frame_bytes + 1, frame_bytes)
            )

            # Transcription
            text = ""
            lang = "en"
            if is_speech:
                with sr.AudioFile(wav_path) as source:
                    audio = self._sr_recognizer.record(source)
                    try:
                        text = self._sr_recognizer.recognize_google(audio, language="en-US")
                    except sr.UnknownValueError:
                        pass
                    except Exception as e:
                        logger.warning(f"Speech recognition failed: {e}")

            # Emotion analysis
            emotion = await self.analyze_voice_emotion(wav_path) if is_speech else "no_speech"

            # Update history
            result = {
                "text": text,
                "language": lang,
                "emotion": emotion,
                "is_speech": is_speech
            }
            self._update_history(user_id, "user", result, lang)

            return result
        except Exception as e:
            logger.error(f"Voice message processing failed: {e}")
            return {"error": str(e)}
        finally:
            self._cleanup(ogg_path, wav_path)

    async def generate_voice_reply(self, text: str, user_id: int, fmt: str = "ogg") -> str:
        """Generate audio from text (TTS)"""
        mp3_path = self._tmp_path(".mp3")
        out_path = self._tmp_path(f".{fmt}")

        try:
            # Generate TTS
            tts = gTTS(text=text, lang='en')
            tts.save(mp3_path)

            # Convert format
            ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()
            if fmt == "ogg":
                subprocess.run([
                    ffmpeg_path, "-y", "-i", mp3_path,
                    "-c:a", "libopus", out_path
                ], check=True)
            elif fmt == "wav":
                subprocess.run([
                    ffmpeg_path, "-y", "-i", mp3_path, out_path
                ], check=True)
            else:
                shutil.move(mp3_path, out_path)

            # Update history
            self._update_history(user_id, "assistant", f"[Voice reply: {fmt}]")

            return out_path
        except Exception as e:
            logger.error(f"Voice reply generation failed: {e}")
            raise RuntimeError(f"TTS failed: {e}")
        finally:
            if fmt != "mp3" and os.path.exists(mp3_path):
                self._cleanup(mp3_path)

    # ----------------------
    # Text Processing
    # ----------------------
    async def generate_response(self, text: str, user_id: int, lang: str = "en") -> str:
        """Generate conversational response with context"""
        self._load_chatbot()
        self._load_translation()

        # Update history
        self._update_history(user_id, "user", text, lang)

        # Prepare context (note: NLLB normally expects src_lang/tgt_lang
        # FLORES-200 codes; calling the translator without them relies on
        # the pipeline defaults)
        context = []
        for msg in self.user_chat_histories[user_id][-5:]:
            if msg["language"] != "en":
                try:
                    translated = self._translator(msg["content"])[0]["translation_text"]
                    context.append(f"{msg['role']}: {translated}")
                except Exception:
                    context.append(f"{msg['role']}: {msg['content']}")
            else:
                context.append(f"{msg['role']}: {msg['content']}")

        # Generate response
        input_text = f"Context:\n{' '.join(context)}\nUser: {text}"
        inputs = self._chat_tokenizer(input_text, return_tensors="pt").to(self.device)

        try:
            outputs = self._chat_model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=True,
                temperature=0.7
            )
            response = self._chat_tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Response generation failed: {e}")
            response = "I couldn't generate a response. Please try again."

        # Translate back if needed
        if lang != "en":
            try:
                response = self._translator(response)[0]["translation_text"]
            except Exception:
                pass

        # Update history
        self._update_history(user_id, "assistant", response, lang)
        return response

    # ----------------------
    # Image Processing
    # ----------------------
    async def process_image_message(self, image_file, user_id: int) -> str:
        """Generate caption for an image"""
        img_path = self._tmp_path(".jpg")

        try:
            # Save and load image
            await image_file.download_to_drive(img_path)
            image = Image.open(img_path).convert("RGB")

            # Generate caption
            self._load_image_captioner()
            caption = self._image_captioner(image)[0]["generated_text"]

            # Update history
            self._update_history(user_id, "user", "[Image]", "en")
            self._update_history(user_id, "assistant", f"Image description: {caption}", "en")

            return caption
        except Exception as e:
            logger.error(f"Image processing failed: {e}")
            return f"Error processing image: {str(e)}"
        finally:
            self._cleanup(img_path)

    async def generate_image_from_text(self, prompt: str, user_id: int,
                                       width: int = 512, height: int = 512,
                                       steps: int = 30) -> str:
        """Generate image from text prompt"""
        self._load_sd()
        if not self._sd_pipe:
            raise RuntimeError("Image generation unavailable")

        out_path = self._tmp_path(".png")

        try:
            # Generate image
            result = self._sd_pipe(
                prompt,
                num_inference_steps=steps,
                height=height,
                width=width
            )
            result.images[0].save(out_path)

            # Update history
            self._update_history(user_id, "user", f"[Image request: {prompt}]", "en")
            self._update_history(user_id, "assistant", "[Generated image]", "en")

            return out_path
        except Exception as e:
            logger.error(f"Image generation failed: {e}")
            raise RuntimeError(f"Image generation failed: {e}")

    async def edit_image_inpaint(self, image_file, mask_file=None,
                                 prompt: str = "", user_id: int = 0) -> str:
        """Edit image using inpainting"""
        self._load_sd()
        if not self._sd_inpaint:
            raise RuntimeError("Image editing unavailable")

        img_path = self._tmp_path(".png")
        mask_path = self._tmp_path("_mask.png") if mask_file else None
        out_path = self._tmp_path("_edited.png")

        try:
            # Save inputs
            await image_file.download_to_drive(img_path)
            if mask_file:
                await mask_file.download_to_drive(mask_path)

            # Prepare images; with no mask, inpaint the full frame
            init_image = Image.open(img_path).convert("RGB")
            mask_image = Image.open(mask_path).convert("L") if mask_path else Image.new("L", init_image.size, 255)

            # Inpaint
            result = self._sd_inpaint(
                prompt=prompt if prompt else " ",
                image=init_image,
                mask_image=mask_image,
                guidance_scale=7.5,
                num_inference_steps=30
            )
            result.images[0].save(out_path)

            # Update history
            self._update_history(user_id, "user", "[Image edit request]", "en")
            self._update_history(user_id, "assistant", "[Edited image]", "en")

            return out_path
        except Exception as e:
            logger.error(f"Image editing failed: {e}")
            raise RuntimeError(f"Inpainting failed: {e}")
        finally:
            self._cleanup(img_path, mask_path)

    # ----------------------
    # Video Processing
    # ----------------------
    async def process_video(self, video_file, user_id: int, max_frames: int = 4) -> Dict[str, Any]:
        """Process video file to extract audio and keyframes"""
        vid_path = self._tmp_path(".mp4")
        audio_path = self._tmp_path(".wav")

        try:
            # Save video
            await video_file.download_to_drive(vid_path)

            # Extract audio (videos without an audio track have clip.audio == None)
            clip = mp.VideoFileClip(vid_path)
            has_audio = clip.audio is not None
            if has_audio:
                clip.audio.write_audiofile(audio_path, logger=None)
            duration = clip.duration
            fps = clip.fps

            # Transcribe audio
            transcribed = ""
            if has_audio:
                try:
                    with sr.AudioFile(audio_path) as source:
                        audio = self._sr_recognizer.record(source)
                        transcribed = self._sr_recognizer.recognize_google(audio)
                except Exception as e:
                    logger.warning(f"Audio transcription failed: {e}")

            # Extract frames
            frames = []
            captions = []
            try:
                reader = imageio.get_reader(vid_path)
                total_frames = reader.count_frames()
                step = max(1, total_frames // max_frames)

                for i in range(0, total_frames, step):
                    try:
                        frame = reader.get_data(i)
                        frame_path = self._tmp_path(f"_frame{i}.jpg")
                        Image.fromarray(frame).save(frame_path)
                        frames.append(frame_path)

                        if len(frames) >= max_frames:
                            break
                    except Exception:
                        continue

                # Generate captions (the loader returns None, so check the
                # pipeline attribute rather than the loader's return value)
                self._load_image_captioner()
                if frames and self._image_captioner:
                    for frame_path in frames:
                        try:
                            caption = self._image_captioner(Image.open(frame_path))[0]["generated_text"]
                            captions.append(caption)
                        except Exception:
                            captions.append("")
                        finally:
                            self._cleanup(frame_path)
            except Exception as e:
                logger.warning(f"Frame extraction failed: {e}")

            # Update history
            result = {
                "duration": duration,
                "fps": fps,
                "transcription": transcribed,
                "captions": captions
            }
            self._update_history(user_id, "user", "[Video upload]", "en")
            self._update_history(user_id, "assistant", result, "en")

            return result
        except Exception as e:
            logger.error(f"Video processing failed: {e}")
            return {"error": str(e)}
        finally:
            self._cleanup(vid_path, audio_path)

    # ----------------------
    # File Processing
    # ----------------------
    async def process_file(self, file_obj, user_id: int) -> Dict[str, Any]:
        """Process document files (PDF, DOCX, TXT)"""
        # Keep the original extension, if the file object exposes one, so the
        # type checks below can match (a bare _tmp_path() has no suffix)
        orig_name = getattr(file_obj, "file_path", "") or getattr(file_obj, "name", "") or ""
        fpath = self._tmp_path(os.path.splitext(orig_name)[1])

        try:
            # Save file
            await file_obj.download_to_drive(fpath)

            # Read based on type
            text = ""
            if fpath.lower().endswith(".pdf"):
                try:
                    with fitz.open(fpath) as doc:
                        text = "\n".join([page.get_text() for page in doc])
                except Exception as e:
                    text = f"[PDF error: {e}]"
            elif fpath.lower().endswith((".txt", ".csv")):
                try:
                    with open(fpath, "r", encoding="utf-8", errors="ignore") as f:
                        text = f.read()
                except Exception as e:
                    text = f"[Text error: {e}]"
            elif fpath.lower().endswith(".docx"):
                try:
                    import docx
                    doc = docx.Document(fpath)
                    text = "\n".join([p.text for p in doc.paragraphs])
                except Exception as e:
                    text = f"[DOCX error: {e}]"
            else:
                text = "[Unsupported file type]"

            # Summarize (simple truncation preview)
            summary = text[:500] + ("..." if len(text) > 500 else "")

            # Update history
            result = {
                "summary": summary,
                "length": len(text),
                "type": os.path.splitext(fpath)[1]
            }
            self._update_history(user_id, "user", f"[File upload: {result['type']}]", "en")
            self._update_history(user_id, "assistant", result, "en")

            return result
        except Exception as e:
            logger.error(f"File processing failed: {e}")
            return {"error": str(e)}
        finally:
            self._cleanup(fpath)

    # ----------------------
    # Code Processing
    # ----------------------
    async def code_complete(self, prompt: str, max_tokens: int = 512,
                            temperature: float = 0.2) -> str:
        """Generate code completions"""
        self._load_code_model()
        if not self._code_model or not self._code_tokenizer:
            raise RuntimeError("Code model not available")

        try:
            inputs = self._code_tokenizer(prompt, return_tensors="pt").to(self.device)
            outputs = self._code_model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True
            )
            return self._code_tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Code completion failed: {e}")
            raise RuntimeError(f"Code generation error: {e}")

    async def execute_python_code(self, code: str, timeout: int = 5) -> Dict[str, str]:
        """Execute Python code in a subprocess (DANGER: insecure, not a real sandbox)"""
        temp_dir = self._tmp_path()
        script_path = os.path.join(temp_dir, "script.py")

        try:
            # Create temp dir
            os.makedirs(temp_dir, exist_ok=True)

            # Write script
            with open(script_path, "w") as f:
                f.write(code)

            # Execute
            proc = await asyncio.create_subprocess_exec(
                "python3", script_path,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            try:
                stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
                return {
                    "stdout": stdout.decode("utf-8", errors="ignore"),
                    "stderr": stderr.decode("utf-8", errors="ignore")
                }
            except asyncio.TimeoutError:
                proc.kill()
                await proc.communicate()  # reap the killed process
                return {"error": "Execution timed out"}
        except Exception as e:
            logger.error(f"Code execution failed: {e}")
            return {"error": str(e)}
        finally:
            self._cleanup(temp_dir)
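
For readers trying the module out, here is a minimal usage sketch (not part of the commit). It assumes the file is saved as multimodal_module.py, as its first comment line suggests, and that a local sample.ogg exists; LocalFile is a hypothetical stand-in for a bot-framework file object that exposes an async download_to_drive() method.

# usage_sketch.py — illustrative only, not part of the committed file
import asyncio
import shutil

from multimodal_module import MultiModalChatModule


class LocalFile:
    """Hypothetical stand-in for a bot-framework file object."""

    def __init__(self, path: str):
        self.file_path = path

    async def download_to_drive(self, dest: str) -> None:
        # Copy the local file where the module asked it to be saved
        shutil.copy(self.file_path, dest)


async def main() -> None:
    module = MultiModalChatModule()

    # Text chat round-trip (loads BlenderBot + NLLB lazily on first call)
    reply = await module.generate_response("Hello there!", user_id=1)
    print("Bot:", reply)

    # Voice note -> transcription + emotion (assumes sample.ogg exists)
    voice = LocalFile("sample.ogg")
    print(await module.process_voice_message(voice, user_id=1))


if __name__ == "__main__":
    asyncio.run(main())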