File size: 6,266 Bytes
7ae35c9
dbe4319
5a834fc
 
4786e02
 
9bea5a2
 
7a59ca0
9bea5a2
 
 
 
 
4c35324
9bea5a2
 
7d90627
3638d85
9bea5a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e395658
 
9bea5a2
3638d85
e395658
9bea5a2
3f22c4a
394d306
 
9bea5a2
96f47bd
6700b95
 
 
96f47bd
9bea5a2
e95a1cd
7d90627
e95a1cd
b9d9615
3f22c4a
 
5a834fc
 
e95a1cd
 
9bea5a2
3638d85
 
 
 
 
 
 
9bea5a2
5a834fc
e95a1cd
3f22c4a
 
 
 
7d90627
9bea5a2
4c35324
 
 
e95a1cd
4c35324
e95a1cd
3638d85
9bea5a2
 
e95a1cd
5a834fc
 
 
e95a1cd
 
9bea5a2
3638d85
 
 
 
 
 
 
 
 
 
 
3f22c4a
3638d85
 
 
9bea5a2
 
 
 
 
 
 
3638d85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9bea5a2
 
 
 
 
 
 
 
 
 
 
 
3638d85
 
3f22c4a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import asyncio
import os

import aiohttp
import gradio as gr
import scipy.io.wavfile
import torch
from deepgram import Deepgram
from diffusers import StableDiffusionPipeline
from dotenv import load_dotenv
from groq import Groq
from moviepy.editor import (
    AudioFileClip,
    CompositeVideoClip,
    ImageClip,
    ImageSequenceClip,
    VideoFileClip,
    concatenate_videoclips,
)
from PIL import Image, ImageDraw, ImageFont
from transformers import pipeline
from TTS.api import TTS

# Load environment variables from a local .env file so the API keys below
# can be supplied without hard-coding them.
load_dotenv()

# Initialize API clients. Both keys are read from the environment; if either
# variable is unset, os.getenv returns None and the first API call will fail.
# NOTE(review): the Deepgram v2 SDK constructor takes its key as the first
# positional argument — confirm the `api_key=` keyword is accepted by the
# installed SDK version.
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
deepgram_client = Deepgram(api_key=os.getenv("DEEPGRAM_API_KEY"))

# Use the Groq chat API (Mixtral) for comedy-script generation
async def generate_comedy_script(prompt):
    """Generate a short, funny script for *prompt* via the Groq chat API.

    Args:
        prompt: Free-text description of the comedy topic.

    Returns:
        The generated script text (capped at 200 tokens by the request).
    """
    # BUG FIX: the synchronous Groq client's .create() returns a completion
    # object, not an awaitable, so `await`-ing it raised a TypeError at
    # runtime. Call it directly; switch to groq.AsyncGroq if truly
    # non-blocking I/O is needed inside this coroutine.
    chat_completion = groq_client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a comedy writer. Generate a short, funny script based on the given prompt."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        # NOTE(review): confirm this model id is still served by Groq.
        model="mixtral-8x7b-32768",
        max_tokens=200
    )
    # Return only the assistant message text from the first choice.
    return chat_completion.choices[0].message.content

# Use Coqui TTS for text-to-speech. Loaded once at import time; CPU-only.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=False)

# Use MusicGen (via the transformers text-to-audio pipeline) for music
# generation. CPU-only; this download/load can be slow on first run.
music_generator = pipeline("text-to-audio", model="facebook/musicgen-small", device="cpu")

# Use Fluently Anime (Stable Diffusion) for anime image generation.
# NOTE: running Stable Diffusion on CPU is very slow; acceptable for demos.
model_id = "fluently/Fluently-anime"
anime_image_generator = StableDiffusionPipeline.from_pretrained(model_id).to("cpu")

# Convert text to speech with the module-level Coqui TTS model.
def text_to_speech(script):
    """Synthesize *script* into 'output.wav' and return that path.

    Note: the output path is fixed, so successive calls overwrite it.
    """
    wav_path = 'output.wav'
    tts.tts_to_file(text=script, file_path=wav_path)
    return wav_path

# Render one anime image per sentence using the Stable Diffusion pipeline.
def create_images_from_script(script):
    """Generate one image per '. '-separated sentence of *script*.

    Returns:
        List of PNG file paths, one per sentence, saved under /tmp.
    """
    saved_paths = []
    for idx, sentence in enumerate(script.split('. ')):
        frame = anime_image_generator(sentence).images[0]
        frame_path = f'/tmp/anime_image_{idx}.png'
        frame.save(frame_path)
        saved_paths.append(frame_path)
    return saved_paths

# Generate a music track with MusicGen and persist it as a WAV file.
def generate_fun_music(prompt, output_music_file="fun_music.wav"):
    """Generate music for *prompt* and write it to *output_music_file*.

    Returns:
        The path the WAV file was written to (same as *output_music_file*).
    """
    result = music_generator(prompt)
    scipy.io.wavfile.write(
        output_music_file,
        rate=result["sampling_rate"],
        data=result["audio"],
    )
    return output_music_file

# Create a video from the generated anime images.
def generate_text_video(script):
    """Render the per-sentence anime images for *script* into an mp4.

    Returns:
        Path of the written video file (/tmp/final_video.mp4).
    """
    image_paths = create_images_from_script(script)
    # BUG FIX: ImageSequenceClip was never imported, so this line raised
    # NameError at runtime; it is now imported from moviepy.editor at the
    # top of the file.
    video_clip = ImageSequenceClip(image_paths, fps=24)
    video_path = "/tmp/final_video.mp4"
    video_clip.write_videofile(video_path, codec='libx264')
    return video_path

# Attach an audio track to a video file.
def combine_audio_video(video_file, audio_file):
    """Return a moviepy clip of *video_file* with *audio_file* as its audio.

    The combined clip is returned in memory; the caller is responsible for
    writing it to disk.
    """
    soundtrack = AudioFileClip(audio_file)
    clip = VideoFileClip(video_file)
    return clip.set_audio(soundtrack)

# Main function: script -> speech -> images/video -> music -> combined video.
async def generate_comedy_and_animation(prompt):
    """Generate a comedy script, narration audio, and a music-backed video.

    Args:
        prompt: Comedy topic entered by the user.

    Returns:
        Tuple of (script text, narration WAV path, final video file path) —
        the shapes Gradio expects for Textbox, Audio, and Video outputs.
    """
    script = await generate_comedy_script(prompt)
    audio_file = text_to_speech(script)
    video_file = generate_text_video(script)
    fun_music = generate_fun_music(prompt)
    final_clip = combine_audio_video(video_file, fun_music)
    # BUG FIX: gr.Video needs a file path, but the raw moviepy clip object
    # was being returned; write the combined clip to disk and return the path.
    final_path = "/tmp/comedy_animation.mp4"
    final_clip.write_videofile(final_path, codec='libx264', audio_codec='aac')
    return script, audio_file, final_path

# Generate a 5-second kids music animation for a theme.
def generate_kids_content(theme):
    """Create a kids music track plus a simple 5-frame text animation.

    Args:
        theme: Theme text used both for music generation and the caption.

    Returns:
        Tuple of (music WAV path, animation mp4 path).
    """
    music_file = generate_fun_music(theme, output_music_file="kids_music.wav")
    frame_clips = []
    for idx in range(5):
        # Draw a solid blue frame with a yellow caption.
        canvas = Image.new('RGB', (800, 400), color=(0, 0, 255))
        painter = ImageDraw.Draw(canvas)
        painter.text(
            (10, 180),
            f"Kids Music: {theme}",
            font=ImageFont.load_default(),
            fill=(255, 255, 0),
        )
        frame_path = f'/tmp/kids_temp_{idx}.png'
        canvas.save(frame_path)
        clip = ImageClip(frame_path).set_duration(1)
        frame_clips.append(clip.set_position(('center', 'center')))
    # One second per frame, with the generated music as the soundtrack.
    animation = concatenate_videoclips(frame_clips, method="compose")
    animation = animation.set_audio(AudioFileClip(music_file))
    animation.write_videofile("/tmp/kids_animation.mp4", fps=24)
    return music_file, "/tmp/kids_animation.mp4"

# New function for speech-to-text
async def transcribe_audio(audio_file):
    """Transcribe a WAV file with Deepgram's prerecorded API.

    Args:
        audio_file: Path to a WAV file on disk.
            NOTE(review): the wired gr.Audio input must be configured with
            type="filepath" for this to receive a path — confirm at the UI.

    Returns:
        The transcript string of the first channel's top alternative.
    """
    with open(audio_file, 'rb') as audio:
        # Deepgram v2 SDK source dict: raw buffer plus its mimetype.
        source = {'buffer': audio, 'mimetype': 'audio/wav'}
        response = await deepgram_client.transcription.prerecorded(source, {'smart_format': True, 'model': 'general'})
        return response['results']['channels'][0]['alternatives'][0]['transcript']

# Gradio Interface: three tabs wiring the generators above to the UI.
# Gradio supports async handlers, so the async functions can be passed
# directly to .click().
with gr.Blocks() as app:
    gr.Markdown("## AI Comedy and Kids Content Generator")

    # Comedy Animation Tab
    with gr.Tab("Generate Comedy Animation"):
        prompt_input = gr.Textbox(label="Comedy Prompt")
        generate_btn = gr.Button("Generate Comedy Script and Animation")
        comedy_script = gr.Textbox(label="Generated Script")
        comedy_audio = gr.Audio(label="Generated Audio")
        comedy_video = gr.Video(label="Generated Animation")

        generate_btn.click(
            generate_comedy_and_animation,
            inputs=prompt_input,
            outputs=[comedy_script, comedy_audio, comedy_video]
        )

    # Kids Music Animation Tab
    with gr.Tab("Generate Kids Music Animation"):
        theme_input = gr.Textbox(label="Kids Music Theme")
        generate_music_btn = gr.Button("Generate Kids Music and Animation")
        kids_music_audio = gr.Audio(label="Generated Music")
        kids_music_video = gr.Video(label="Generated Kids Animation")

        generate_music_btn.click(
            generate_kids_content,
            inputs=theme_input,
            outputs=[kids_music_audio, kids_music_video]
        )

    # Speech-to-Text Tab
    with gr.Tab("Speech-to-Text"):
        # BUG FIX: without type="filepath", gr.Audio delivers a
        # (sample_rate, ndarray) tuple, which transcribe_audio then tried
        # to open() as a path; request a file path instead.
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
        transcribe_btn = gr.Button("Transcribe Audio")
        transcription_output = gr.Textbox(label="Transcription")

        transcribe_btn.click(
            transcribe_audio,
            inputs=audio_input,
            outputs=transcription_output
        )

app.launch()