K00B404 committed
Commit 0ecd9af · verified · 1 Parent(s): e7693f3

Update app.py

Files changed (1):
  1. app.py +134 -80
app.py CHANGED
@@ -1,9 +1,3 @@
- #from huggingfaceinferenceclient import HuggingFaceInferenceClient
- #from outpaintprocessor import DynamicImageOutpainter
- #from aivideopipeline import AIImageVideoPipeline
- #from mmig import MultiModelImageGenerator
-
-
  import os
  import requests
  from PIL import Image
@@ -12,100 +6,161 @@ from huggingface_hub import InferenceClient
  from IPython.display import Audio, display
  import gradio as gr

  read_token = os.getenv('HF_READ')
  write_token = os.getenv('HF_WRITE')
- #chatmodel
- chatmodel="mistralai/Mistral-Nemo-Instruct-2407"
- # Whisper for Speech-to-Text
- WHISPER_API_URL = "https://api-inference.huggingface.co/models/distil-whisper/distil-large-v2"
- WHISPER_HEADERS = {"Authorization": "Bearer " + read_token}
- # Bark for Text-to-Speech
- BARK_API_URL = "https://api-inference.huggingface.co/models/suno/bark"
- BARK_HEADERS = {"Authorization": "Bearer "+read_token}
- # Flux for Image Generation
- FLUX_API_URL = "https://api-inference.huggingface.co/models/enhanceaiteam/Flux-uncensored"
- FLUX_HEADERS = {"Authorization": "Bearer "+read_token}

- def speech_to_text(filename):
-     with open(filename, "rb") as f:
-         data = f.read()
-     response = requests.post(WHISPER_API_URL, headers=WHISPER_HEADERS, data=data)
-     if response.status_code == 200:
-         return response.json().get("text", "Could not recognize speech")
-     else:
-         print(f"Error: {response.status_code} - {response.text}")
-         return None
-
- # Chatbot Logic with Hugging Face InferenceClient
  client = InferenceClient(api_key=read_token)
  def chatbot_logic(input_text):
-     messages = [{"role": "user", "content": input_text}]
      try:
          completion = client.chat.completions.create(
-             model=chatmodel,
-             messages=messages,
              max_tokens=500
          )
-         return completion.choices[0].message["content"]
-     except Exception as e:
-         print(f"Error: {e}")
-         return None

  def text_to_speech(text):
-     payload = {"inputs": text}
-     response = requests.post(BARK_API_URL, headers=BARK_HEADERS, json=payload)
-     if response.status_code == 200:
-         return response.content
-     else:
-         print(f"Error: {response.status_code} - {response.text}")
-         return None

  def generate_image(prompt):
-     data = {"inputs": prompt}
-     response = requests.post(FLUX_API_URL, headers=FLUX_HEADERS, json=data)
-     if response.status_code == 200:
-         image_bytes = BytesIO(response.content)
-         return Image.open(image_bytes)
-     else:
-         print(f"Error: {response.status_code} - {response.text}")
-         return None
-
- # Gradio Interface for Chatbot and Image Generator
- def create_ui():
-     def process_chat(audio_file):
-         # Step 1: Speech to Text
-         recognized_text = speech_to_text(audio_file)
-         if not recognized_text:
-             return "Could not recognize speech", None, None
-
-         # Step 2: Chatbot Logic
-         response_text = chatbot_logic(recognized_text)
-         if not response_text:
-             return f"Error generating response for: {recognized_text}", None, None
-
-         # Step 3: Text to Speech
-         audio_output = text_to_speech(response_text)
-         if not audio_output:
-             return f"Error synthesizing response: {response_text}", None, None
-
-         # Step 4: Image Generation
-         generated_image = generate_image(response_text)
-
-         return response_text, Audio(audio_output, autoplay=True), generated_image

-     with gr.Blocks(title="Voice-to-Voice Chatbot with Image Generation") as ui:
-         gr.Markdown("## Voice-to-Voice Chatbot with Image Generation\nUpload an audio file to interact with the chatbot.")

          audio_input = gr.Audio(source="upload", type="filepath", label="Input Audio File")
-         submit_button = gr.Button("Process")

          with gr.Row():
-             chatbot_response = gr.Textbox(label="Chatbot Response", lines=2)
-
          with gr.Row():
-             audio_output = gr.Audio(label="Generated Audio Response")
              image_output = gr.Image(label="Generated Image")

          submit_button.click(
@@ -117,6 +172,5 @@ def create_ui():

      return ui

- # Run the Gradio Interface
  if __name__ == "__main__":
      create_ui().launch(debug=True)
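Both the removed code above and the updated file below talk to the hosted Inference API the same way: POST the payload to `https://api-inference.huggingface.co/models/<model-id>` with a Bearer token, then branch on the status code. A minimal standalone sketch of that shared pattern, using the Whisper model the script configures (the input filename is hypothetical):

```python
import os
import requests

# Same endpoint shape the script builds; distil-whisper is one of its models.
API_URL = "https://api-inference.huggingface.co/models/distil-whisper/distil-large-v2"
HEADERS = {"Authorization": f"Bearer {os.getenv('HF_READ')}"}

with open("sample.wav", "rb") as f:  # hypothetical local audio file
    response = requests.post(API_URL, headers=HEADERS, data=f.read())

if response.status_code == 200:
    print(response.json().get("text", ""))
else:
    print(f"Error: {response.status_code} - {response.text}")
```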
  import os
  import requests
  from PIL import Image

  from IPython.display import Audio, display
  import gradio as gr

+ # Tokens for Hugging Face API
  read_token = os.getenv('HF_READ')
  write_token = os.getenv('HF_WRITE')

+ # Model configurations
+ HEADERS = {"Authorization": f"Bearer {read_token}"}
+ BASE_URL='https://api-inference.huggingface.co/models/'
+ CHAT_MODEL = "mistralai/Mistral-Nemo-Instruct-2407"
+ WHISPER_API_URL = "distil-whisper/distil-large-v2"
+ BARK_API_URL = "suno/bark"
+ FLUX_API_URL = "enhanceaiteam/Flux-uncensored"
+
+ # Initialize Hugging Face Inference Client
  client = InferenceClient(api_key=read_token)

+ # Chatbot system prompt
+ system_prompt = """
+ You are an empathetic and knowledgeable AI assistant designed to engage in meaningful conversations,
+ assist with tasks, and provide accurate information.
+ You can also generate vivid visuals!
+ To request an image, include a description between the IMG tags, like this:
+ ##IMG: A serene forest at dawn with a golden glow:IMG##
+ """
+
+ chat_history = []
+
+ def tagger(bot_response):
+     """
+     Extract tags from the bot response and return the filtered response text and tags.
+
+     Args:
+         bot_response (str): The full response text from the chatbot.
+
+     Returns:
+         tuple: A tuple containing:
+             - filtered_response (str): The response text with tags removed.
+             - tags (dict): A dictionary of extracted tags and their values.
+     """
+     import re
+
+     tags = {}
+     filtered_response = bot_response
+
+     # Match patterns like ##IMG: ... :IMG##
+     img_pattern = r"##IMG:(.+?):IMG##"
+     img_matches = re.findall(img_pattern, bot_response)
+
+     if img_matches:
+         tags['images'] = img_matches
+         # Remove image tags from the response text
+         filtered_response = re.sub(img_pattern, "", filtered_response).strip()
+
+     # Additional tags can be added here as needed
+     # For example, to support ##AUDIO: ... :AUDIO## tags:
+     # audio_pattern = r"##AUDIO:(.+?):AUDIO##"
+     # audio_matches = re.findall(audio_pattern, bot_response)
+     # if audio_matches:
+     #     tags['audio'] = audio_matches
+     #     filtered_response = re.sub(audio_pattern, "", filtered_response).strip()
+
+     return filtered_response, tags
+
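For reference, a quick check of how `tagger` behaves on a reply that follows the system prompt's convention. Note the captured prompt keeps its leading space, which the downstream image call tolerates:

```python
# Assumes the tagger() defined above is in scope.
reply = "Here you go! ##IMG: A serene forest at dawn with a golden glow:IMG##"
clean, tags = tagger(reply)
print(clean)           # Here you go!
print(tags["images"])  # [' A serene forest at dawn with a golden glow']
```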
+ def speech_to_text(filename):
+     """Convert speech to text using Whisper API."""
+     try:
+         with open(filename, "rb") as f:
+             data = f.read()
+         response = requests.post(BASE_URL+WHISPER_API_URL, headers=HEADERS, data=data)
+         if response.status_code == 200:
+             return response.json().get("text", "Could not recognize speech")
+         print(f"Whisper Error: {response.status_code} - {response.text}")
+     except Exception as e:
+         print(f"Exception in speech_to_text: {e}")
+     return None
+
  def chatbot_logic(input_text):
+     """Generate a response from the chatbot and handle tags."""
+     global chat_history
+     chat_history.append({"role": "user", "content": input_text})
+     messages = [{"role": "system", "content": system_prompt}] + chat_history
+
      try:
          completion = client.chat.completions.create(
+             model=CHAT_MODEL,
+             messages=messages,
              max_tokens=500
          )
+         response_text = completion.choices[0].message["content"]
+
+         # Use tagger to process tags and clean response text
+         response_text, tags = tagger(response_text)
+         chat_history.append({"role": "assistant", "content": response_text})
+
+         # Extract image prompt from tags if present
+         image_prompt = tags.get("images")[0] if "images" in tags else None

+         return response_text, image_prompt
+     except Exception as e:
+         print(f"Chatbot Error: {e}")
+         return None, None
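`chat_history` is module-global and grows without bound, so long sessions will eventually exceed the model's context window. One hypothetical guard (not part of this commit) is to keep only the most recent turns:

```python
# Hypothetical: trim the rolling history before building `messages`.
MAX_TURNS = 20  # assumed budget; tune to the model's context size

def trim_history(history, max_turns=MAX_TURNS):
    """Drop the oldest entries once the history exceeds max_turns."""
    return history[-max_turns:]
```

Calling `chat_history = trim_history(chat_history)` at the top of `chatbot_logic` would keep each request bounded.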

  def text_to_speech(text):
+     """Convert text to speech using Bark API."""
+     try:
+         response = requests.post(BASE_URL+BARK_API_URL, headers=HEADERS, json={"inputs": text})
+         if response.status_code == 200:
+             return response.content
+         print(f"Bark Error: {response.status_code} - {response.text}")
+     except Exception as e:
+         print(f"Exception in text_to_speech: {e}")
+     return None

  def generate_image(prompt):
+     """Generate an image using the Flux API."""
+     try:
+         response = requests.post(BASE_URL+FLUX_API_URL, headers=HEADERS, json={"inputs": prompt})
+         if response.status_code == 200:
+             return Image.open(BytesIO(response.content))
+         print(f"Flux Error: {response.status_code} - {response.text}")
+     except Exception as e:
+         print(f"Exception in generate_image: {e}")
+     return None
+
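Since the script already constructs an `InferenceClient`, image generation could also route through its `text_to_image` helper instead of raw `requests`; a sketch, assuming a recent `huggingface_hub` release where that helper returns a `PIL.Image` directly:

```python
def generate_image_via_client(prompt):
    """Alternative sketch: same Flux model, routed through InferenceClient."""
    try:
        return client.text_to_image(prompt, model=FLUX_API_URL)
    except Exception as e:
        print(f"Exception in generate_image_via_client: {e}")
        return None
```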
+ def process_chat(audio_file):
+     """Process user input, generate response, and optionally create media."""
+     # Step 1: Speech-to-text
+     recognized_text = speech_to_text(audio_file)
+     if not recognized_text:
+         return "Speech recognition failed.", None, None
+
+     # Step 2: Chatbot response
+     response_text, image_prompt = chatbot_logic(recognized_text)
+     if not response_text:
+         return "Failed to generate chatbot response.", None, None
+
+     # Step 3: Text-to-speech
+     audio_response = text_to_speech(response_text)

+     # Step 4: Optional image generation
+     generated_image = generate_image(image_prompt) if image_prompt else None
+
+     return response_text, Audio(audio_response, autoplay=True), generated_image
+
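`process_chat` hands IPython's `Audio` object to a `gr.Audio` output; that type is meant for notebook display rather than Gradio rendering. A Gradio-friendly alternative (a sketch, assuming `gr.Audio` accepts a filepath, and that the `.wav` suffix matches what Bark returns) writes the raw bytes to a temporary file:

```python
import tempfile

def bytes_to_audio_path(audio_bytes):
    """Persist raw audio bytes so gr.Audio can serve them as a file."""
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)  # suffix is an assumption
    tmp.write(audio_bytes)
    tmp.close()
    return tmp.name
```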
+ def create_ui():
+     """Build and launch the Gradio interface."""
+     with gr.Blocks(title="Enhanced Voice-to-Voice Chatbot with Images") as ui:
+         gr.Markdown("## Voice-to-Voice AI Chatbot\nTalk to the AI and see its responses, including images it generates!")

          audio_input = gr.Audio(source="upload", type="filepath", label="Input Audio File")
+         submit_button = gr.Button("Submit")

          with gr.Row():
+             chatbot_response = gr.Textbox(label="Chatbot Response", lines=4)
          with gr.Row():
+             audio_output = gr.Audio(label="Audio Response")
              image_output = gr.Image(label="Generated Image")

          submit_button.click(

      return ui

  if __name__ == "__main__":
      create_ui().launch(debug=True)