brightlembo committed on
Commit 3267fb2 · verified · 1 Parent(s): 940db7c

Update app.py

Files changed (1)
  1. app.py +122 -84
app.py CHANGED
@@ -7,106 +7,144 @@ from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM
 )
-from modelscope.pipelines import pipeline as ms_pipeline
 from PIL import Image
+import os
+import logging
 
-def load_models():
-    # Load the models
-    blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-    blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
-
-    # Audio transcription model
-    audio_transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")
-
-    # Text generation model (free GPT-2)
-    text_generator = pipeline("text-generation", model="gpt2")
-
-    return blip_processor, blip_model, audio_transcriber, text_generator
-
-def analyze_image(image, blip_processor, blip_model):
-    # Questions for the image analysis
-    questions = [
-        "What is in the picture?",
-        "What are the main colors?",
-        "What is the setting or background?",
-        "What is happening in the image?",
-    ]
-
-    responses = {}
-    for question in questions:
-        inputs = blip_processor(images=image, text=question, return_tensors="pt")
-        outputs = blip_model.generate(**inputs)
-        answer = blip_processor.decode(outputs[0], skip_special_tokens=True)
-        responses[question] = answer
-
-    description = f"This image shows {responses['What is in the picture?']}. "
-    description += f"The main colors are {responses['What are the main colors?']}. "
-    description += f"The setting is {responses['What is the setting or background?']}. "
-    description += f"In the scene, {responses['What is happening in the image?']}."
-
-    return description
-
-def process_inputs(image, audio, text, models):
-    blip_processor, blip_model, audio_transcriber, text_generator = models
-
-    final_prompt = ""
-
-    # Analyze the image if provided
-    if image is not None:
-        image_description = analyze_image(image, blip_processor, blip_model)
-        final_prompt += f"Visual description: {image_description}\n"
-
-    # Transcribe the audio if provided
-    if audio is not None:
-        audio_text = audio_transcriber(audio)["text"]
-        final_prompt += f"Audio content: {audio_text}\n"
-
-    # Append the extra text if provided
-    if text:
-        final_prompt += f"Additional context: {text}\n"
-
-    # Generate the enhanced prompt with GPT-2
-    prompt_enhancement = text_generator(
-        final_prompt,
-        max_length=200,
-        num_return_sequences=1
-    )[0]["generated_text"]
-
-    # Create the video with ModelScope
-    video_pipeline = ms_pipeline(
-        'text-to-video-synthesis',
-        model='damo/text-to-video-synthesis'
-    )
-
-    result = video_pipeline({
-        'text': prompt_enhancement,
-        'output_video_path': 'output.mp4'
-    })
-
-    return 'output.mp4', prompt_enhancement
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class MultimodalProcessor:
+    def __init__(self):
+        self.load_models()
+
+    def load_models(self):
+        """Load the models, with error handling"""
+        try:
+            logger.info("Chargement des modèles...")
+            self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+            self.blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
+            self.audio_transcriber = pipeline("automatic-speech-recognition",
+                                              model="openai/whisper-small")
+            self.text_generator = pipeline("text-generation",
+                                           model="gpt2")
+            logger.info("Modèles chargés avec succès")
+        except Exception as e:
+            logger.error(f"Erreur lors du chargement des modèles: {str(e)}")
+            raise
+
+    def analyze_image(self, image):
+        """Analyze an image and return a description"""
+        try:
+            if image is None:
+                return ""
+
+            questions = [
+                "What is in the picture?",
+                "What are the main colors?",
+                "What is the setting or background?",
+                "What is happening in the image?",
+            ]
+
+            responses = {}
+            for question in questions:
+                inputs = self.blip_processor(images=image, text=question, return_tensors="pt")
+                outputs = self.blip_model.generate(**inputs)
+                answer = self.blip_processor.decode(outputs[0], skip_special_tokens=True)
+                responses[question] = answer
+
+            description = (
+                f"This image shows {responses['What is in the picture?']}. "
+                f"The main colors are {responses['What are the main colors?']}. "
+                f"The setting is {responses['What is the setting or background?']}. "
+                f"In the scene, {responses['What is happening in the image?']}"
+            )
+
+            return description
+        except Exception as e:
+            logger.error(f"Erreur lors de l'analyse de l'image: {str(e)}")
+            return "Erreur lors de l'analyse de l'image."
+
+    def transcribe_audio(self, audio_path):
+        """Transcribe an audio file"""
+        try:
+            if audio_path is None:
+                return ""
+            return self.audio_transcriber(audio_path)["text"]
+        except Exception as e:
+            logger.error(f"Erreur lors de la transcription audio: {str(e)}")
+            return "Erreur lors de la transcription audio."
+
+    def generate_text(self, prompt):
+        """Generate text from a prompt"""
+        try:
+            if not prompt:
+                return ""
+            response = self.text_generator(prompt,
+                                           max_length=200,
+                                           num_return_sequences=1)[0]["generated_text"]
+            return response
+        except Exception as e:
+            logger.error(f"Erreur lors de la génération de texte: {str(e)}")
+            return "Erreur lors de la génération de texte."
+
+    def process_inputs(self, image, audio, text):
+        """Process the multimodal inputs"""
+        try:
+            # Image analysis
+            image_description = self.analyze_image(image) if image is not None else ""
+
+            # Audio transcription
+            audio_text = self.transcribe_audio(audio) if audio is not None else ""
+
+            # Combine the inputs
+            combined_input = ""
+            if image_description:
+                combined_input += f"Visual description: {image_description}\n"
+            if audio_text:
+                combined_input += f"Audio content: {audio_text}\n"
+            if text:
+                combined_input += f"Additional context: {text}\n"
+
+            # Generate the final prompt
+            if combined_input:
+                final_prompt = self.generate_text(combined_input)
+            else:
+                final_prompt = "Aucune entrée fournie."
+
+            return final_prompt
+
+        except Exception as e:
+            logger.error(f"Erreur lors du traitement des entrées: {str(e)}")
+            return "Une erreur est survenue lors du traitement des entrées."
 
-# Gradio interface
 def create_interface():
-    models = load_models()
+    """Create the Gradio interface"""
+    processor = MultimodalProcessor()
 
     interface = gr.Interface(
-        fn=lambda img, audio, txt: process_inputs(img, audio, txt, models),
+        fn=processor.process_inputs,
         inputs=[
-            gr.Image(type="pil", label="Upload Image"),
-            gr.Audio(type="filepath", label="Upload Audio"),
-            gr.Textbox(label="Enter Additional Text")
+            gr.Image(type="pil", label="Télécharger une image"),
+            gr.Audio(type="filepath", label="Télécharger un fichier audio"),
+            gr.Textbox(label="Entrez du texte additionnel")
         ],
         outputs=[
-            gr.Video(label="Generated Video"),
-            gr.Textbox(label="Generated Prompt")
+            gr.Textbox(label="Description générée")
         ],
-        title="Multimodal Content to Video Generator",
-        description="Upload an image, audio, or text (or any combination) to generate a video."
+        title="Analyseur de Contenu Multimodal",
+        description="""
+        Cette application analyse vos contenus multimodaux :
+        - Images : génère une description détaillée
+        - Audio : transcrit le contenu
+        - Texte : enrichit la description
+
+        La sortie combine toutes ces informations en une description cohérente.
+        """
     )
 
     return interface
 
-# Launch the application
 if __name__ == "__main__":
     interface = create_interface()
     interface.launch()
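
For anyone trying the updated file outside the Space, here is a minimal sketch of how the new MultimodalProcessor could be exercised without the Gradio UI. It assumes app.py is importable, that transformers, gradio, torch and Pillow are installed, and that the models download successfully; the file paths are placeholders.

# Minimal usage sketch (paths below are placeholders, not part of the commit).
from PIL import Image
from app import MultimodalProcessor

processor = MultimodalProcessor()      # loads BLIP, Whisper and GPT-2 once
image = Image.open("example.jpg")      # placeholder image path
description = processor.process_inputs(
    image=image,
    audio="example.wav",               # placeholder path; gr.Audio passes a filepath
    text="A short extra hint about the scene",
)
print(description)

Calling process_inputs directly mirrors what the Gradio interface does with its three inputs, so it is a convenient way to smoke-test model loading before launching the app.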