SameerArz committed on
Commit
c0d2d56
·
verified ·
1 Parent(s): e1f39a2

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +244 -0
app.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import streamlit as st
3
+ from gradio_client import Client
4
+ from PIL import Image
5
+ import moviepy.editor as mp
6
+ from natsort import natsorted
7
+ from pydantic import BaseModel, Field
8
+ from typing import List, Dict, Type, Optional, TypedDict
9
+ from langgraph.graph import StateGraph, START, END
10
+ from langchain_groq import ChatGroq
11
+ from langchain_core.messages import SystemMessage
12
+ import os
13
+ from dotenv import load_dotenv
14
+
15
# Load environment variables (python-dotenv) before any of them are read below
load_dotenv()

# Credentials read from the environment; either may be None when unset
HF_TOKEN = os.environ.get("HF_TOKEN")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Hugging Face Space used by generate_image
IMAGE_GENERATION_SPACE_NAME = "habib926653/stabilityai-stable-diffusion-3.5-large-turbo"

# Audio file extensions; not referenced by the code visible in this file
SUPPORTED_FORMATS = [
    "mp3",
    "wav",
    "ogg",
    "flac",
    "aac",
    "m4a",
]
23
+
24
+ # Pydantic Models
25
# Pydantic model for one scene: a segment of the story plus the image prompts
# that illustrate it.
# NOTE(review): the Field descriptions are presumably surfaced to the LLM through
# with_structured_output, so their text is runtime behavior -- do not reword them
# casually. Comments are used here instead of a class docstring on purpose, since
# a docstring would become part of the model's schema description.
class SingleScene(BaseModel):
    # The scene's text, taken from the complete story
    text: str = Field(description="Actual Segment of text(a scene) from the complete story")
    # One or more prompts, each describing an image for this scene
    image_prompts: List[str] = Field(
        description="""List of detailed and descriptive image prompts for the segment
prompt format: [theme: {atmosphere/mood}] [style: {artistic/photorealistic}] [focus: {main subject}] [details: {specific elements}] [lighting: {day/night/mystic}] [perspective: {close-up/wide-angle}]"
Example: "theme: eerie forest | style: cinematic realism | focus: abandoned cabin | details: broken windows, overgrown vines | lighting: moonlit fog | perspective: wide-angle shot"
"""
    )
33
+
34
# Top-level structured response: the story as an ordered list of scenes.
# This is the schema get_scenes passes to StructuredOutputExtractor.
class ScenesResponseSchema(BaseModel):
    # Scenes in the order the model returned them
    scenes: List[SingleScene]
36
+
37
+ # Structured Output Extractor
38
# State passed through the extraction graph built by StructuredOutputExtractor.
class State(TypedDict):
    messages: list  # input messages; the "extract" node reads the last one's content
    output: Optional[BaseModel]  # structured result, or None when extraction failed
41
+
42
class StructuredOutputExtractor:
    """Turn a free-text prompt into an instance of a pydantic schema.

    The prompt is sent to a Groq chat model configured for structured output.
    The call is wrapped in a one-node LangGraph graph
    (START -> "extract" -> END) whose state carries the result under the
    ``output`` key.
    """

    def __init__(self, response_schema: Type[BaseModel]):
        """Create the model, bind it to ``response_schema`` and build the graph."""
        self.response_schema = response_schema
        self.llm = ChatGroq(model="deepseek-r1-distill-llama-70b", api_key=GROQ_API_KEY)
        self.structured_llm = self.llm.with_structured_output(response_schema)
        self._build_graph()

    def _build_graph(self):
        """Compile the single-node graph and store it on ``self.graph``."""
        builder = StateGraph(State)
        builder.add_node("extract", self._extract_structured_info)
        builder.add_edge(START, "extract")
        builder.add_edge("extract", END)
        self.graph = builder.compile()

    def _extract_structured_info(self, state: dict):
        """Graph node: run the structured model on the last message's content.

        Returns a partial state update ``{"output": ...}``; the value is None
        (and an error is shown in the Streamlit UI) when the model call fails.
        """
        # Read outside the try so a malformed state still raises to the caller.
        prompt_text = state['messages'][-1].content
        try:
            structured = self.structured_llm.invoke(prompt_text)
        except Exception as e:
            st.error(f"Error during extraction: {e}")
            return {"output": None}
        else:
            return {"output": structured}

    def extract(self, query: str) -> Optional[BaseModel]:
        """Run ``query`` through the graph and return the structured output, if any."""
        initial_state = {"messages": [SystemMessage(content=query)]}
        final_state = self.graph.invoke(initial_state)
        return final_state.get('output')
68
+
69
+ # Utility Functions
70
def calculate_read_time(text: str, words_per_minute: int = 155) -> str:
    """Estimate how long ``text`` takes to read aloud and describe it in words.

    Args:
        text: The text to measure; words are counted by whitespace splitting.
        words_per_minute: Assumed reading speed.

    Returns:
        A sentence such as ``"Reading time: 1 minute(s) and 5 second(s)."``.
        Invalid input and unexpected errors are reported as a message string
        rather than raised.
    """
    try:
        if not (text and isinstance(text, str)):
            return "Invalid input: Text must be a non-empty string."

        total_seconds = (len(text.split()) / words_per_minute) * 60
        hours = int(total_seconds // 3600)
        minutes = int((total_seconds % 3600) // 60)
        seconds = int(total_seconds % 60)

        # Only mention the larger units when they are non-zero.
        if hours > 0:
            return f"Reading time: {hours} hour(s), {minutes} minute(s), and {seconds} second(s)."
        if minutes > 0:
            return f"Reading time: {minutes} minute(s) and {seconds} second(s)."
        return f"Reading time: {seconds} second(s)."
    except Exception as e:
        return f"An error occurred: {e}"
88
+
89
def get_scenes(text_script: str):
    """Split a story into scenes with image prompts using the LLM.

    Builds the instruction prompt (including the estimated read time of the
    story), runs it through a StructuredOutputExtractor bound to
    ScenesResponseSchema, and returns the result as a plain dict.

    Returns:
        The dumped model, or an empty dict when extraction produced nothing.
    """
    prompt = f"""
ROLE: Story to Scene Generator
Tasks: For the given story
1. Read it Completely and Understand the Complete Context
2. Rewrite the story in tiny scenes(but without even changing a word) with highly detailed and context aware list of image prompts to visualize each scene
3. Never Describe complete scene in a single image prompt use multiple prompts
RULE OF THUMB: 12 image prompts / 1 min audio

Estimated Read Time: {calculate_read_time(text_script)}\n\n
Complete Story: {text_script}
"""
    scene_extractor = StructuredOutputExtractor(response_schema=ScenesResponseSchema)
    parsed = scene_extractor.extract(prompt)
    if not parsed:
        return {}
    return parsed.model_dump()
105
+
106
def generate_audio(text, language_code, speaker, path='test_audio.mp3'):
    """Synthesize speech for ``text`` via the TTS Space and save it to ``path``.

    Args:
        text: Text to speak.
        language_code: Language value forwarded to the Space.
        speaker: Speaker/voice value forwarded to the Space.
        path: Destination file for the audio bytes.

    Returns:
        ``{"audio_file": path}`` on success, or ``{"error": message}`` on
        failure (the error is also shown in the Streamlit UI).
    """
    try:
        tts_client = Client("habib926653/Multilingual-TTS")
        response = tts_client.predict(
            text=text,
            language_code=language_code,
            speaker=speaker,
            api_name="/text_to_speech_edge",
        )
        # The second element of the response is used as the generated file's path.
        with open(response[1], 'rb') as source:
            payload = source.read()
        with open(path, 'wb') as target:
            target.write(payload)
    except Exception as e:
        st.error(f"Error during audio generation: {e}")
        return {"error": str(e)}
    return {"audio_file": path}
124
+
125
def generate_image(prompt, path='test_image.png'):
    """Generate a 1280x720 image for ``prompt`` and save a copy at ``path``.

    Args:
        prompt: Text prompt sent to the image-generation Space.
        path: Destination file; the format follows the file extension.

    Returns:
        The value returned by the Space (used here as the path of the
        generated image) on success, or ``{"error": message}`` on failure
        (the error is also shown in the Streamlit UI).
    """
    try:
        client = Client(IMAGE_GENERATION_SPACE_NAME, hf_token=HF_TOKEN)
        result = client.predict(
            prompt=prompt,
            width=1280,
            height=720,
            api_name="/generate_image"
        )
        # Use the image as a context manager so the underlying file handle is
        # released right after the copy is written; this function is called
        # once per prompt, so unclosed handles would otherwise pile up.
        with Image.open(result) as image:
            image.save(path)
        return result
    except Exception as e:
        st.error(f"Error during image generation: {e}")
        return {"error": str(e)}
140
+
141
def generate_video_assets(scenes: Dict, language: str, speaker: str, base_path: str = "media") -> str:
    """Generate the audio and images for every scene into a fresh folder.

    Layout created under ``base_path``::

        video_<n>/audio/scene_<i>.mp3
        video_<n>/images/scene_<i>/scene_<i>_image_<j>.png

    Args:
        scenes: Dict with a ``"scenes"`` list; each scene has ``"text"`` and
            ``"image_prompts"``.
        language: Language value forwarded to generate_audio.
        speaker: Speaker value forwarded to generate_audio.
        base_path: Root directory that holds all ``video_<n>`` folders.

    Returns:
        Path of the new ``video_<n>`` folder, or ``""`` on failure (the error
        is also shown in the Streamlit UI).
    """
    try:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(base_path, exist_ok=True)
        scenes_list = scenes.get("scenes", [])

        # Start from the entry count, then skip names that are already taken.
        # Counting alone can point at an existing folder (e.g. after an older
        # one was deleted), which would mix stale assets into the new video.
        video_index = len(os.listdir(base_path)) + 1
        video_folder = os.path.join(base_path, f"video_{video_index}")
        while os.path.exists(video_folder):
            video_index += 1
            video_folder = os.path.join(base_path, f"video_{video_index}")

        images_folder = os.path.join(video_folder, "images")
        audio_folder = os.path.join(video_folder, "audio")
        os.makedirs(images_folder, exist_ok=True)
        os.makedirs(audio_folder, exist_ok=True)

        for scene_number, scene in enumerate(scenes_list, start=1):
            text = scene.get("text", "")
            image_prompts = scene.get("image_prompts", [])

            audio_path = os.path.join(audio_folder, f"scene_{scene_number}.mp3")
            audio_result = generate_audio(text, language, speaker, path=audio_path)
            if "error" in audio_result:
                # Without narration the scene cannot be rendered; skip its images.
                continue

            scene_images_folder = os.path.join(images_folder, f"scene_{scene_number}")
            os.makedirs(scene_images_folder, exist_ok=True)
            for image_number, prompt in enumerate(image_prompts, start=1):
                image_path = os.path.join(
                    scene_images_folder,
                    f"scene_{scene_number}_image_{image_number}.png",
                )
                # Image failures are reported by generate_image itself.
                generate_image(prompt=prompt, path=image_path)

        return video_folder
    except Exception as e:
        st.error(f"Error during video asset generation: {e}")
        return ""
170
+
171
def generate_video(video_folder: str, output_filename: str = "final_video.mp4"):
    """Assemble the final video from the assets in ``video_folder``.

    For each ``images/scene_<i>`` folder that has a matching
    ``audio/scene_<i>.mp3``, the scene's images are shown for equal shares of
    the audio duration. Scenes are concatenated in natural sort order and
    written as ``output_filename`` inside ``video_folder``.

    Returns:
        Path of the written video, or None when no scene could be built or an
        error occurred (errors are shown in the Streamlit UI).
    """
    # Clips that hold external resources and must be closed on every path.
    opened_clips = []
    try:
        audio_folder = os.path.join(video_folder, "audio")
        images_folder = os.path.join(video_folder, "images")
        final_clips = []
        scene_folders = [
            os.path.join(images_folder, scene)
            for scene in natsorted(os.listdir(images_folder))
            if os.path.isdir(os.path.join(images_folder, scene))
        ]
        for scene_path in scene_folders:
            scene_name = os.path.basename(scene_path)
            audio_path = os.path.join(audio_folder, f"{scene_name}.mp3")
            if not os.path.exists(audio_path):
                continue
            image_files = natsorted([
                os.path.join(scene_path, img)
                for img in os.listdir(scene_path)
                if img.lower().endswith(('.png', '.jpg', '.jpeg'))
            ])
            if not image_files:
                continue
            audio_clip = mp.AudioFileClip(audio_path)
            opened_clips.append(audio_clip)
            # Split the narration time evenly across the scene's images.
            duration_per_image = audio_clip.duration / len(image_files)
            image_clips = [mp.ImageClip(img).set_duration(duration_per_image) for img in image_files]
            scene_video = mp.concatenate_videoclips(image_clips, method="compose").set_audio(audio_clip)
            final_clips.append(scene_video)
        if not final_clips:
            st.error("No valid scenes processed.")
            return None
        final_video = mp.concatenate_videoclips(final_clips, method="compose")
        opened_clips.append(final_video)
        output_path = os.path.join(video_folder, output_filename)
        final_video.write_videofile(output_path, fps=24, codec='libx264')
        return output_path
    except Exception as e:
        st.error(f"Error during video generation: {e}")
        return None
    finally:
        # Release readers/file handles; close the composite before its sources.
        for clip in reversed(opened_clips):
            try:
                clip.close()
            except Exception:
                # Best-effort cleanup: a failed close must not mask the result.
                pass
208
+
209
+ # Streamlit App
210
def main():
    """Render the Streamlit page and run the text-to-video pipeline on demand.

    The page collects a script, a language and a speaker. On "Generate Video"
    it runs get_scenes -> generate_video_assets -> generate_video and shows
    the resulting video. An example script is displayed at the bottom.
    """
    st.markdown("<h1 style='text-align: center;'>Text to Video Generator</h1>", unsafe_allow_html=True)
    st.markdown("<p style='text-align: center;'>Leave a Like if it works for you! ❤️</p>", unsafe_allow_html=True)

    text_script = st.text_area("Enter your script/story (max 1500 characters):", max_chars=1500)
    language = st.selectbox("Choose Language:", ["Urdu", "English"])

    # The speaker list comes from a remote Space; report a failure in the UI
    # instead of letting the exception take down the whole page.
    try:
        client = Client("habib926653/Multilingual-TTS")
        speakers_response = client.predict(language=language, api_name="/get_speakers")
        speakers = [choice[0] for choice in speakers_response["choices"]]
    except Exception as e:
        st.error(f"Error fetching speakers: {e}")
        speakers = []
    selected_speaker = st.selectbox("Choose Speaker:", speakers)

    if st.button("Generate Video"):
        if not text_script or not text_script.strip():
            # Whitespace-only input is treated as empty rather than sent to the LLM.
            st.warning("Please enter some text to generate a video.")
        elif not selected_speaker:
            st.warning("No speaker is available, so audio cannot be generated.")
        else:
            with st.spinner("Generating video... This may take a few minutes."):
                scenes = get_scenes(text_script)
                if not scenes:
                    st.error("Failed to generate scenes.")
                else:
                    video_assets_folder = generate_video_assets(scenes, language, selected_speaker)
                    if video_assets_folder:
                        generated_video_path = generate_video(video_assets_folder)
                        if generated_video_path:
                            st.success("Video generated successfully!")
                            st.video(generated_video_path)

    st.markdown("### 🔥 See How It Works (Example)")
    example_script = """
One hot summer day, a thirsty crow was flying in search of water. He looked everywhere, but he couldn't find a single drop. Tired and exhausted, he finally spotted a clay pot with a little water at the bottom.
"""
    st.markdown(f"**Example Script:** {example_script}")
242
+
243
# Script entry point: build the page only when this module is the one being run.
if __name__ == "__main__":
    main()