File size: 4,208 Bytes
10734c0
 
 
 
 
4b526a8
10734c0
4b526a8
 
 
 
10734c0
f08ad33
9fc2ff8
10734c0
 
f08ad33
 
 
 
 
 
 
 
 
 
 
 
7318986
abccd5b
 
 
4b526a8
 
 
 
 
 
 
 
 
abccd5b
10734c0
4b526a8
10734c0
 
 
5c7b9f9
10734c0
5c7b9f9
 
 
 
 
 
 
 
10734c0
 
4b526a8
10734c0
 
 
 
 
 
 
4b526a8
 
 
 
 
 
10734c0
 
 
4b526a8
 
 
10734c0
 
4b526a8
10734c0
4b526a8
10734c0
 
 
4b526a8
10734c0
 
 
 
4b526a8
10734c0
 
4b526a8
10734c0
4b526a8
10734c0
 
 
4b526a8
 
10734c0
 
 
 
4b526a8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import whisper
import gradio as gr
from groq import Groq
from deep_translator import GoogleTranslator
from diffusers import StableDiffusionPipeline
import os
import torch
import openai

# # Replace with your OpenAI API key
# openai.api_key = "https://huggingface.co/EleutherAI/gpt-neo-2.7B/resolve/main/model.safetensors"

# Set up Groq API key
# The key is read from the environment so the secret never lives in the source.
api_key = os.getenv("GROQ_API_KEY")
client = Groq(api_key=api_key)

# Retrieve Hugging Face API key from environment variable
HF_API_KEY = os.getenv("HF_API_KEY")

if HF_API_KEY is None:
    raise ValueError("Hugging Face API key not found. Please set it as an environment variable.")

# Login to Hugging Face
# NOTE(review): `login` is not imported in the visible part of this file, so this
# call currently raises NameError, which the broad `except` below turns into a
# printed message. Presumably `from huggingface_hub import login` is intended --
# confirm and add it to the import block.
try:
    login(HF_API_KEY)
    print("Login successful!")
except Exception as e:
    # Best effort: a failed login is reported but does not stop start-up.
    print(f"Error during Hugging Face login: {str(e)}")

# Set device: CUDA if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Half precision is only used on the GPU; on a CPU-only host fall back to
# float32, since many CPU kernels do not support float16.
pipe_dtype = torch.float16 if device.type == "cuda" else torch.float32

# Load the text-to-image pipeline once at start-up and place it on the device
# selected above (previously hard-coded to "cuda", which crashed without a GPU).
model_id1 = "dreamlike-art/dreamlike-diffusion-1.0"
pipe = StableDiffusionPipeline.from_pretrained(model_id1, torch_dtype=pipe_dtype, use_safetensors=True)
pipe = pipe.to(device)

…            temperature=0.7,
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        return f"An error occurred during text generation: {str(e)}"


def process_audio(audio_path, image_option, creative_text_option):
    """Transcribe a Tamil audio file, translate it to English, and optionally
    generate creative text and an image from the translation.

    Args:
        audio_path: Path of the uploaded audio file, or None if nothing was
            uploaded.
        image_option: "Generate Image" to render an image from the
            translation; any other value skips image generation.
        creative_text_option: "Generate Creative Text" to call
            ``generate_creative_text`` on the translation; any other value
            skips it.

    Returns:
        A 4-tuple ``(tamil_text, translation, creative_text, image)`` matching
        the four Gradio output components. Failures are reported as messages
        in the text slots rather than raised:

        * no upload / transcription failure -> message in the first slot,
          remaining slots None;
        * translation failure -> transcription in the first slot, message in
          the second, remaining slots None;
        * image failure -> image is None and the message is placed in (or
          appended to) the creative-text slot, because the image slot feeds a
          ``gr.Image`` component that would treat a string as a file path.
    """
    if audio_path is None:
        return "Please upload an audio file.", None, None, None

    # Step 1: Transcribe audio
    try:
        with open(audio_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(os.path.basename(audio_path), file.read()),
                model="whisper-large-v3",
                language="ta",
                response_format="verbose_json",
            )
        tamil_text = transcription.text
    except Exception as e:
        return f"An error occurred during transcription: {str(e)}", None, None, None

    # Step 2: Translate Tamil to English
    try:
        translator = GoogleTranslator(source='ta', target='en')
        translation = translator.translate(tamil_text)
    except Exception as e:
        return tamil_text, f"An error occurred during translation: {str(e)}", None, None

    # Step 3: Generate creative text (if selected)
    creative_text = None
    if creative_text_option == "Generate Creative Text":
        creative_text = generate_creative_text(translation)

    # Step 4: Generate image (if selected)
    image = None
    if image_option == "Generate Image":
        try:
            # Reuse the pipeline loaded once at module level instead of
            # re-loading the model weights on every request.
            image = pipe(translation).images[0]
        except Exception as e:
            # Keep the image slot empty and surface the error as text so the
            # transcription and translation are still shown to the user.
            image_error = f"An error occurred during image generation: {str(e)}"
            creative_text = f"{creative_text}\n\n{image_error}" if creative_text else image_error

    return tamil_text, translation, creative_text, image

# Create Gradio interface
# Layout: one row with two columns -- inputs and the submit button on the left,
# the four result widgets on the right.
with gr.Blocks(theme=gr.themes.Base()) as iface:
    gr.Markdown("# Audio Transcription, Translation, Image & Creative Text Generation")
    with gr.Row():
        with gr.Column():
            # type="filepath" hands process_audio a path string (or None).
            audio_input = gr.Audio(type="filepath", label="Upload Audio File")
            image_option = gr.Dropdown(["Generate Image", "Skip Image"], label="Image Generation", value="Generate Image")
            creative_text_option = gr.Dropdown(["Generate Creative Text", "Skip Creative Text"], label="Creative Text Generation", value="Generate Creative Text")
            submit_button = gr.Button("Process Audio")
        with gr.Column():
            # All four outputs share one indentation level; the first line was
            # previously mis-indented, which made the module fail to parse.
            tamil_text_output = gr.Textbox(label="Tamil Transcription")
            translation_output = gr.Textbox(label="English Translation")
            creative_text_output = gr.Textbox(label="Creative Text")
            image_output = gr.Image(label="Generated Image")
    # The order of `outputs` must match the 4-tuple returned by process_audio.
    submit_button.click(
        fn=process_audio,
        inputs=[audio_input, image_option, creative_text_option],
        outputs=[tamil_text_output, translation_output, creative_text_output, image_output]
    )

# Launch the interface
iface.launch()