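"""Voice-to-voice chatbot demo.

Pipeline: Whisper transcribes an uploaded audio clip, a Mistral chat model
generates a reply, Bark synthesizes the reply as speech, and a Flux endpoint
renders an image from the same text. A Gradio UI ties the stages together.
"""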


import os
import tempfile
from io import BytesIO

import requests
import gradio as gr
from huggingface_hub import InferenceClient
from PIL import Image

# One Hugging Face token for every hosted endpoint, read from the environment
# rather than hard-coded; the placeholder is kept as a fallback.
HF_TOKEN = os.environ.get("HF_TOKEN", "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

# Whisper for Speech-to-Text
WHISPER_API_URL = "https://api-inference.huggingface.co/models/distil-whisper/distil-large-v2"
WHISPER_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}

def speech_to_text(filename):
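    """Transcribe an audio file by POSTing its raw bytes to the Whisper endpoint."""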
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(WHISPER_API_URL, headers=WHISPER_HEADERS, data=data)
    if response.status_code == 200:
        return response.json().get("text", "Could not recognize speech")
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

# Chatbot Logic with Hugging Face InferenceClient
client = InferenceClient(api_key=HF_TOKEN)

def chatbot_logic(input_text):
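    """Generate a chat reply for the user's text via the hosted Mistral-Nemo model."""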
    messages = [{"role": "user", "content": input_text}]
    try:
        completion = client.chat.completions.create(
            model="mistralai/Mistral-Nemo-Instruct-2407", 
            messages=messages, 
            max_tokens=500
        )
        return completion.choices[0].message.content
    except Exception as e:
        print(f"Error: {e}")
        return None

# Bark for Text-to-Speech
BARK_API_URL = "https://api-inference.huggingface.co/models/suno/bark"
BARK_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}

def text_to_speech(text):
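    """Synthesize `text` with the Bark endpoint; returns raw audio bytes or None on error."""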
    payload = {"inputs": text}
    response = requests.post(BARK_API_URL, headers=BARK_HEADERS, json=payload)
    if response.status_code == 200:
        return response.content
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

# Flux for Image Generation
FLUX_API_URL = "https://api-inference.huggingface.co/models/enhanceaiteam/Flux-uncensored"
FLUX_HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}

def generate_image(prompt):
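    """Generate an image for `prompt` via the Flux endpoint and return it as a PIL Image."""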
    data = {"inputs": prompt}
    response = requests.post(FLUX_API_URL, headers=FLUX_HEADERS, json=data)
    if response.status_code == 200:
        image_bytes = BytesIO(response.content)
        return Image.open(image_bytes)
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

# Gradio Interface for Chatbot and Image Generator
def create_ui():
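    """Build the Gradio Blocks UI that chains the four pipeline stages."""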
    def process_chat(audio_file):
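        """Run one upload through speech-to-text, chat, text-to-speech, and image generation."""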
        # Step 1: Speech to Text
        recognized_text = speech_to_text(audio_file)
        if not recognized_text:
            return "Could not recognize speech", None, None
        
        # Step 2: Chatbot Logic
        response_text = chatbot_logic(recognized_text)
        if not response_text:
            return f"Error generating response for: {recognized_text}", None, None
        
        # Step 3: Text to Speech
        audio_output = text_to_speech(response_text)
        if not audio_output:
            return f"Error synthesizing response: {response_text}", None, None
        
        # Step 4: Image Generation
        generated_image = generate_image(response_text)

        # gr.Audio output components expect a filepath (or numpy data), not an
        # IPython Audio object, so write Bark's raw bytes to a temporary file.
        with tempfile.NamedTemporaryFile(suffix=".flac", delete=False) as tmp:
            tmp.write(audio_output)
            audio_path = tmp.name

        return response_text, audio_path, generated_image

    with gr.Blocks(title="Voice-to-Voice Chatbot with Image Generation") as ui:
        gr.Markdown("## Voice-to-Voice Chatbot with Image Generation\nUpload an audio file to interact with the chatbot.")
        
        audio_input = gr.Audio(sources=["upload"], type="filepath", label="Input Audio File")
        submit_button = gr.Button("Process")
        
        with gr.Row():
            chatbot_response = gr.Textbox(label="Chatbot Response", lines=2)
        
        with gr.Row():
            audio_output = gr.Audio(label="Generated Audio Response")
            image_output = gr.Image(label="Generated Image")
        
        submit_button.click(
            fn=process_chat,
            inputs=audio_input,
            outputs=[chatbot_response, audio_output, image_output],
            show_progress="full"
        )
    
    return ui

# Run the Gradio Interface
if __name__ == "__main__":
    create_ui().launch(debug=True)