# NOTE(review): the lines that preceded this file's imports were Hugging Face
# Spaces page-scrape residue (status text, "File size", commit hashes, and a
# line-number gutter), not Python source; they have been removed so the
# module parses.
import whisper
import gradio as gr
from groq import Groq
from deep_translator import GoogleTranslator
from diffusers import StableDiffusionPipeline
import os
import torch
import openai
from huggingface_hub import InferenceApi
# --- Module-level configuration and model setup ---

# Groq API key is read from the environment; the Groq client is constructed
# eagerly so every handler shares one client instance.
api_key = os.getenv("GROQ_API_KEY")
client = Groq(api_key=api_key)

# Use half precision only when a GPU is available: float16 inference is not
# generally supported on CPU and would fail at pipeline call time.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# The Stable Diffusion model id is read from the (confusingly named) API_KEY
# env var. Fall back to the same model hard-coded in process_audio so the app
# still starts when the variable is unset, instead of from_pretrained(None)
# raising an opaque error.
model_id1 = os.getenv("API_KEY") or "dreamlike-art/dreamlike-diffusion-1.0"
pipe = StableDiffusionPipeline.from_pretrained(
    model_id1,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
# NOTE(review): pinned to CPU even when CUDA is available — confirm intended.
pipe = pipe.to('cpu')
# Creative-text helper built on the Groq chat-completions API.
def generate_creative_text(prompt):
    """Ask the Groq chat model for a creative response to *prompt*.

    Parameters
    ----------
    prompt : str
        User text forwarded verbatim as a single chat message.

    Returns
    -------
    str
        Content of the first completion choice.
    """
    completion = client.chat.completions.create(
        model="llama-3.2-90b-text-preview",
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content
def process_audio(audio_path, image_option, creative_text_option):
    """Transcribe Tamil audio, translate it to English, and optionally
    generate creative text and an image from the translation.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path to the uploaded audio file (Gradio "filepath" type).
    image_option : str
        "Generate Image" to run Stable Diffusion on the translation;
        anything else skips image generation.
    creative_text_option : str
        "Generate Creative Text" to ask the Groq chat model; anything else
        skips that step.

    Returns
    -------
    tuple
        (tamil_text, translation, creative_text, image). Slots hold error
        messages or None when a step fails or is skipped.
    """
    if audio_path is None:
        return "Please upload an audio file.", None, None, None

    # Step 1: Transcribe audio with Groq's hosted Whisper model.
    try:
        with open(audio_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(os.path.basename(audio_path), file.read()),
                model="whisper-large-v3",
                language="ta",
                response_format="verbose_json",
            )
        tamil_text = transcription.text
    except Exception as e:
        return f"An error occurred during transcription: {str(e)}", None, None, None

    # Step 2: Translate Tamil to English.
    try:
        translator = GoogleTranslator(source='ta', target='en')
        translation = translator.translate(tamil_text)
    except Exception as e:
        return tamil_text, f"An error occurred during translation: {str(e)}", None, None

    # Step 3: Generate creative text (if selected). Wrapped like the other
    # steps so a Groq API failure degrades to a message instead of crashing
    # the whole handler (previously this was the only unguarded step).
    creative_text = None
    if creative_text_option == "Generate Creative Text":
        try:
            creative_text = generate_creative_text(translation)
        except Exception as e:
            creative_text = f"An error occurred during text generation: {str(e)}"

    # Step 4: Generate image (if selected).
    image = None
    if image_option == "Generate Image":
        try:
            sd_model = "dreamlike-art/dreamlike-diffusion-1.0"
            # Use the module-level torch_dtype rather than hard-coded
            # torch.float16: the pipeline is moved to CPU below, and fp16
            # inference is unsupported on CPU, so the old hard-coded dtype
            # failed at generation time on CPU-only hosts.
            sd_pipe = StableDiffusionPipeline.from_pretrained(
                sd_model, torch_dtype=torch_dtype, use_safetensors=True
            )
            sd_pipe = sd_pipe.to('cpu')
            image = sd_pipe(translation).images[0]
        except Exception as e:
            return tamil_text, translation, creative_text, f"An error occurred during image generation: {str(e)}"

    return tamil_text, translation, creative_text, image
# --- Gradio UI wiring ---
with gr.Blocks(theme=gr.themes.Base()) as iface:
    gr.Markdown("# Audio Transcription, Translation, Image & Creative Text Generation")
    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            audio_in = gr.Audio(type="filepath", label="Upload Audio File")
            img_choice = gr.Dropdown(
                ["Generate Image", "Skip Image"],
                label="Image Generation",
                value="Generate Image",
            )
            text_choice = gr.Dropdown(
                ["Generate Creative Text", "Skip Creative Text"],
                label="Creative Text Generation",
                value="Generate Creative Text",
            )
            run_btn = gr.Button("Process Audio")
        # Right column: one output widget per pipeline stage.
        with gr.Column():
            out_tamil = gr.Textbox(label="Tamil Transcription")
            out_english = gr.Textbox(label="English Translation")
            out_creative = gr.Textbox(label="Creative Text")
            out_image = gr.Image(label="Generated Image")
    run_btn.click(
        fn=process_audio,
        inputs=[audio_in, img_choice, text_choice],
        outputs=[out_tamil, out_english, out_creative, out_image],
    )

# Start the web app.
iface.launch()