K00B404 committed
Commit e63873f · verified · 1 Parent(s): ed43390

Update app.py

Files changed (1)
  1. app.py +116 -1
app.py CHANGED
@@ -1,4 +1,119 @@
  from huggingfaceinferenceclient import HuggingFaceInferenceClient
  from outpaintprocessor import DynamicImageOutpainter
  from aivideopipeline import AIImageVideoPipeline
- from mmig import MultiModelImageGenerator
+ from mmig import MultiModelImageGenerator
+
+
+ import os
+ import tempfile
+ import requests
+ from PIL import Image
+ from io import BytesIO
+ from huggingface_hub import InferenceClient
+ import gradio as gr
+
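+ # Note: the Bearer tokens below are placeholders; in practice the token would usually
+ # be supplied via an environment variable (for example os.getenv("HF_TOKEN")) rather
+ # than hardcoded in the source.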
+ # Whisper for Speech-to-Text
+ WHISPER_API_URL = "https://api-inference.huggingface.co/models/distil-whisper/distil-large-v2"
+ WHISPER_HEADERS = {"Authorization": "Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
+
+ def speech_to_text(filename):
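+     # Send the raw audio bytes to the hosted Whisper endpoint; on success the
+     # serverless Inference API responds with JSON containing a "text" field.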
+     with open(filename, "rb") as f:
+         data = f.read()
+     response = requests.post(WHISPER_API_URL, headers=WHISPER_HEADERS, data=data)
+     if response.status_code == 200:
+         return response.json().get("text", "Could not recognize speech")
+     else:
+         print(f"Error: {response.status_code} - {response.text}")
+         return None
+
+ # Chatbot Logic with Hugging Face InferenceClient
+ client = InferenceClient(api_key="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
+
+ def chatbot_logic(input_text):
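+     # Wrap the user's transcript in a single-turn chat request to the hosted
+     # Mistral-Nemo model and return the assistant's reply text.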
+     messages = [{"role": "user", "content": input_text}]
+     try:
+         completion = client.chat.completions.create(
+             model="mistralai/Mistral-Nemo-Instruct-2407",
+             messages=messages,
+             max_tokens=500
+         )
+         return completion.choices[0].message["content"]
+     except Exception as e:
+         print(f"Error: {e}")
+         return None
+
+ # Bark for Text-to-Speech
+ BARK_API_URL = "https://api-inference.huggingface.co/models/suno/bark"
+ BARK_HEADERS = {"Authorization": "Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
+
+ def text_to_speech(text):
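+     # Ask the hosted Bark model to synthesize speech; the response body is the
+     # raw audio bytes, which are returned as-is to the caller.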
+     payload = {"inputs": text}
+     response = requests.post(BARK_API_URL, headers=BARK_HEADERS, json=payload)
+     if response.status_code == 200:
+         return response.content
+     else:
+         print(f"Error: {response.status_code} - {response.text}")
+         return None
+
+ # Flux for Image Generation
+ FLUX_API_URL = "https://api-inference.huggingface.co/models/enhanceaiteam/Flux-uncensored"
+ FLUX_HEADERS = {"Authorization": "Bearer hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}
+
+ def generate_image(prompt):
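+     # The text-to-image endpoint returns the encoded image bytes directly, so the
+     # response body can be opened with PIL.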
+     data = {"inputs": prompt}
+     response = requests.post(FLUX_API_URL, headers=FLUX_HEADERS, json=data)
+     if response.status_code == 200:
+         image_bytes = BytesIO(response.content)
+         return Image.open(image_bytes)
+     else:
+         print(f"Error: {response.status_code} - {response.text}")
+         return None
+
+ # Gradio Interface for Chatbot and Image Generator
+ def create_ui():
+     def process_chat(audio_file):
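+         # End-to-end pipeline: uploaded audio -> transcript -> chat reply ->
+         # synthesized speech + generated image.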
+         # Step 1: Speech to Text
+         recognized_text = speech_to_text(audio_file)
+         if not recognized_text:
+             return "Could not recognize speech", None, None
+
+         # Step 2: Chatbot Logic
+         response_text = chatbot_logic(recognized_text)
+         if not response_text:
+             return f"Error generating response for: {recognized_text}", None, None
+
+         # Step 3: Text to Speech
+         audio_output = text_to_speech(response_text)
+         if not audio_output:
+             return f"Error synthesizing response: {response_text}", None, None
+
+         # Step 4: Image Generation
+         generated_image = generate_image(response_text)
+
+         # Save the synthesized audio to a temporary file so the gr.Audio output can
+         # play it (the serverless TTS endpoint typically returns FLAC-encoded bytes)
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".flac") as tmp:
+             tmp.write(audio_output)
+
+         return response_text, tmp.name, generated_image
+
+     with gr.Blocks(title="Voice-to-Voice Chatbot with Image Generation") as ui:
+         gr.Markdown("## Voice-to-Voice Chatbot with Image Generation\nUpload an audio file to interact with the chatbot.")
+
+         audio_input = gr.Audio(source="upload", type="filepath", label="Input Audio File")
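+         # Note: source="upload" is the Gradio 3.x argument; on Gradio 4.x this
+         # parameter is sources=["upload"].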
+         submit_button = gr.Button("Process")
+
+         with gr.Row():
+             chatbot_response = gr.Textbox(label="Chatbot Response", lines=2)
+
+         with gr.Row():
+             audio_output = gr.Audio(label="Generated Audio Response")
+             image_output = gr.Image(label="Generated Image")
+
+         submit_button.click(
+             fn=process_chat,
+             inputs=audio_input,
+             outputs=[chatbot_response, audio_output, image_output],
+             show_progress=True
+         )
+
+     return ui
+
+ # Run the Gradio Interface
+ if __name__ == "__main__":
+     create_ui().launch(debug=True)