garyuzair committed on
Commit
87d325d
·
verified ·
1 Parent(s): 682bf09

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +548 -0
app.py ADDED
@@ -0,0 +1,548 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gc
3
+ import torch
4
+ import streamlit as st
5
+ import tempfile
6
+ import json
7
+ import subprocess
8
+ import shutil
9
+ from datetime import datetime
10
+ from io import BytesIO
11
+ import random
12
+ from PIL import Image
13
+
14
+ # --- Hugging Face Model Libraries (Local Models) ---
15
+ from transformers import AutoTokenizer, AutoModelForCausalLM
16
+ from parler_tts import ParlerTTSForConditionalGeneration
17
+
18
+ # --- Google Generative AI (Gemini API) ---
19
# The Gemini SDK is optional at import time: when it is missing, the flag
# below is False and the UI reports the problem instead of crashing.
# NOTE(review): later code calls `genai.Client(...)` and
# `genai_types.GenerateContentConfig`, which appear to belong to the newer
# `google-genai` package (`from google import genai`) rather than
# `google-generativeai` — confirm which SDK is actually installed.
try:
    from google import generativeai as genai
    from google.generativeai import types as genai_types  # For GenerateContentConfig
except ImportError:
    # Error will be handled in UI
    google_gemini_sdk_available = False
else:
    google_gemini_sdk_available = True
26
+
27
+ # --- Application Configuration ---
28
# Page setup must be the first Streamlit call in the script.
st.set_page_config(page_title="🌟 AI POV Story Weaver v2", layout="wide")

# --- Model IDs ---
SCRIPT_LLM_MODEL_ID = "openai-community/gpt2-medium"  # Stand-in for "Tinglama"
TTS_MODEL_ID = "parler-tts/parler-tts-mini-v1.1"

# --- Gemini API Configuration (from Streamlit Secrets) ---
# Both values are None when the corresponding secret is absent.
GEMINI_API_KEY = st.secrets.get("GEMINI_API_KEY")
GEMINI_IMAGE_MODEL_ID = st.secrets.get("GEMINI_IMAGE_MODEL_ID")  # User's specified model

# --- Hugging Face Cache ---
# One temp-dir cache shared by every Hugging Face download in this app.
CACHE_DIR = os.path.join(tempfile.gettempdir(), "hf_cache_story_weaver_v2")
os.makedirs(CACHE_DIR, exist_ok=True)
# NOTE(review): these variables are set after `transformers` was imported;
# the loaders below also pass `cache_dir=CACHE_DIR` explicitly.
for _cache_env_var in ("HUGGINGFACE_HUB_CACHE", "HF_HOME", "TRANSFORMERS_CACHE"):
    os.environ[_cache_env_var] = CACHE_DIR

# --- Session State Initialization ---
# Only seed keys that are missing so values survive Streamlit reruns.
if "run_id" not in st.session_state:
    st.session_state.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
for _state_key in ("generated_data", "temp_base_dir"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = None
53
+
54
+ # --- Utility Functions (largely same as before) ---
55
def get_session_temp_dir():
    """Return this session's scratch directory, creating it when needed.

    The path is remembered in ``st.session_state.temp_base_dir``; it is
    rebuilt from the current ``run_id`` when unset or no longer on disk.
    """
    remembered = st.session_state.temp_base_dir
    if remembered and os.path.exists(remembered):
        return remembered

    run_dir_name = f"story_weaver_v2_run_{st.session_state.run_id}"
    session_dir = os.path.join(tempfile.gettempdir(), run_dir_name)
    os.makedirs(session_dir, exist_ok=True)
    st.session_state.temp_base_dir = session_dir
    return session_dir
62
+
63
def cleanup_temp_files():
    """Delete the session scratch directory, if one exists.

    Best effort: a failure to delete is surfaced as a UI warning rather
    than raised, and the remembered path is only cleared on success.
    """
    target = st.session_state.get("temp_base_dir")
    if not target or not os.path.exists(target):
        return
    try:
        shutil.rmtree(target)
        st.session_state.temp_base_dir = None
    except Exception as exc:
        st.warning(f"Warning: Could not clean up temp dir {target}: {exc}")
71
+
72
def clear_gpu_cache():
    """Run the garbage collector and, when CUDA is present, empty its cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
76
+
77
+ # --- Model Loading Functions (Cached) ---
78
@st.cache_resource
def load_script_llm_resources(model_id):
    """Load (and cache) the causal LM and tokenizer used for script writing.

    Args:
        model_id: Hugging Face model identifier.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    st.write(f"Loading LLM for script generation: {model_id}...")

    # Half precision only when a GPU is available.
    weights_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=weights_dtype,
        device_map="auto",
        cache_dir=CACHE_DIR,
    )

    # Fall back to EOS as the padding token when the tokenizer has none.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

    st.write("LLM for script generation loaded.")
    return model, tokenizer
91
+
92
@st.cache_resource
def load_tts_resources(model_id):
    """Load (and cache) the ParlerTTS model and its tokenizer.

    Args:
        model_id: Hugging Face model identifier.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    st.write(f"Loading TTS model: {model_id}...")

    # Half precision only when a GPU is available.
    weights_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model = ParlerTTSForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=weights_dtype,
        device_map="auto",
        cache_dir=CACHE_DIR,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)

    st.write("TTS model loaded.")
    return model, tokenizer
102
+
103
@st.cache_resource
def _create_gemini_client(api_key):
    """Build a Gemini client; cached per distinct API key."""
    client = genai.Client(api_key=api_key)
    st.write("Gemini SDK Client initialized.")
    return client


def get_gemini_sdk_client(_api_key):
    """Return a Gemini SDK client, or None when one cannot be created.

    Validation and error reporting run on every call; only a successfully
    built client is cached. Previously the whole function was cached, and
    because the parameter name starts with an underscore Streamlit did not
    hash it, so an early ``None`` result kept being returned even after
    the key was fixed.

    Args:
        _api_key: Gemini API key (may be None or empty).

    Returns:
        The client instance, or None after showing an error in the UI.
    """
    if not google_gemini_sdk_available:
        st.error("Google Generative AI SDK (`google-generativeai`) is not installed.")
        return None
    if not _api_key:
        st.error("GEMINI_API_KEY not found in secrets.toml.")
        return None
    try:
        return _create_gemini_client(_api_key)
    except Exception as e:
        # Construction failures are reported in the UI; nothing is cached
        # for a call that raised, so the next run retries.
        st.error(f"Error initializing Gemini SDK Client: {e}")
        return None
123
+
124
+ # --- Core Generation Functions ---
125
+
126
def _extract_json_candidate(generated_text):
    """Return the part of *generated_text* most likely to be the JSON payload.

    Prefers the body of a "```json" fenced block. Otherwise takes the first
    complete JSON value starting at the first "{". Returns "" when the text
    contains neither.
    """
    fence_open = "```json"
    fence_at = generated_text.find(fence_open)
    if fence_at != -1:
        body_start = fence_at + len(fence_open)
        # Search for the closing fence only after the opening one, so the
        # opening fence can never be mistaken for the end marker.
        fence_close = generated_text.find("```", body_start)
        if fence_close == -1:
            return generated_text[body_start:].strip()
        return generated_text[body_start:fence_close].strip()

    brace_at = generated_text.find("{")
    if brace_at == -1:
        return ""
    try:
        # raw_decode stops at the end of the first complete JSON value and,
        # unlike brace counting, is not confused by braces inside strings.
        _, end_at = json.JSONDecoder().raw_decode(generated_text, brace_at)
    except json.JSONDecodeError:
        # Hand back the rest so the caller can report the parse error.
        return generated_text[brace_at:].strip()
    return generated_text[brace_at:end_at]


def generate_story_and_prompts(main_pov_prompt: str, num_scenes: int):
    """Generate a multi-scene story with per-scene image prompts.

    Args:
        main_pov_prompt: The user's point-of-view story idea.
        num_scenes: Number of scenes the story must contain.

    Returns:
        A dict with a "scenes" list of exactly ``num_scenes`` entries
        (truncated or padded with placeholders as needed), or None when
        generation or parsing fails. Failures are reported in the UI.
    """
    st.info(f"Generating story and image prompts for '{main_pov_prompt}'...")
    model, tokenizer = load_script_llm_resources(SCRIPT_LLM_MODEL_ID)

    # --- Enhanced Prompt for Script LLM ---
    structured_prompt = f"""
You are an expert visual storyteller and AI prompt engineer. Your task is to generate a multi-scene story based on a user's Point-of-View (POV) prompt.
The story must be divided into exactly {num_scenes} distinct scenes.

For each scene, you must provide:
1. "scene_number": An integer representing the scene order (e.g., 1, 2, ...).
2. "scene_narration": A short paragraph (2-4 sentences, ~30-60 words). This narration should be from the first-person POV, be engaging, and suitable for text-to-speech. Convey emotion or atmosphere where appropriate.
3. "image_generation_prompt": A highly descriptive and creative prompt (1-3 sentences, ~40-75 words) tailored for an advanced AI image generator like Gemini Flash. This prompt should generate a single, compelling image for the scene. Include:
* **Subject & Action:** Clearly define the main subject(s) and what they are doing from the POV.
* **Setting & Environment:** Describe the location, time of day, and key environmental details.
* **Visual Style & Medium:** Suggest an artistic style (e.g., "photorealistic," "cinematic with dramatic lighting," "fantasy digital art," "impressionistic oil painting," "cyberpunk anime concept art," "vintage photograph").
* **Camera View & Composition:** Specify camera angle if important (e.g., "first-person POV looking through a visor," "low-angle shot emphasizing scale," "close-up on a mysterious object," "wide establishing shot").
* **Lighting & Color:** Describe the lighting conditions (e.g., "soft morning light," "neon glow," "moonlit night," "dramatic chiaroscuro") and dominant colors or color palette.
* **Mood & Atmosphere:** Indicate the desired feeling (e.g., "mysterious and eerie," "hopeful and adventurous," "tense and suspenseful," "serene and peaceful").
* **Key Details:** Mention any specific objects, textures, or elements crucial to the scene.

User's main POV prompt: "{main_pov_prompt}"

Output the result STRICTLY as a single JSON object. The JSON object should have a key "title" (a concise title derived from the main POV prompt) and a key "scenes" which is a list of scene objects. Each scene object must contain the keys "scene_number", "scene_narration", and "image_generation_prompt".

Example of a single scene object within the "scenes" list:
{{
"scene_number": 1,
"scene_narration": "My metallic fingers traced the glowing hieroglyphs on the alien console. A low hum resonated through the derelict starship, promising either discovery or doom.",
"image_generation_prompt": "First-person POV of a sleek, silver robotic hand touching intricate, glowing blue hieroglyphs on a dark, alien control panel. The background shows the dimly lit, derelict interior of a starship, with faint starlight filtering through a cracked viewport. Style: Cinematic sci-fi, photorealistic textures on the robot hand and console, mysterious and suspenseful atmosphere. Focus on the interaction between hand and console."
}}

Begin JSON output now:
```json
"""

    input_ids = tokenizer.encode(structured_prompt, return_tensors="pt").to(model.device)
    prompt_length = input_ids.shape[-1]
    estimated_output_tokens = num_scenes * 180 + 150  # Increased estimate for richer prompts
    max_new_tokens = min(estimated_output_tokens, 1200)  # Slightly increased cap

    result_text = None
    try:
        # Never ask for more tokens than the context window can still hold.
        context_limit = getattr(model.config, "max_position_embeddings", None)
        if context_limit:
            remaining = context_limit - prompt_length
            if remaining <= 0:
                st.error("Prompt is too long for the script LLM's context window; cannot generate.")
                return None
            max_new_tokens = min(max_new_tokens, remaining)

        output = model.generate(
            input_ids, max_new_tokens=max_new_tokens, do_sample=True,
            temperature=0.7, top_k=60, pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        # `generate` returns a batch. Decode only the first sequence, and only
        # the newly generated tokens, so the prompt's own fence and example
        # object cannot be mistaken for the model's answer.
        result_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

        json_str_content = _extract_json_candidate(result_text)
        if not json_str_content:
            st.error("LLM did not produce detectable JSON content.")
            st.text_area("LLM Full Raw Output:", result_text, height=300)
            return None

        try:
            parsed_json = json.loads(json_str_content)
        except json.JSONDecodeError as e:
            st.error(f"LLM output JSON parsing error: {e}")
            st.text_area("Attempted JSON content:", json_str_content, height=200)
            st.text_area("LLM Full Raw Output (for debugging):", result_text, height=300)
            return None

        if not isinstance(parsed_json, dict) or not isinstance(parsed_json.get("scenes"), list):
            st.error("LLM output JSON structure is not as expected (missing 'scenes' list or not a dict).")
            st.json(parsed_json)
            return None

        scenes = parsed_json["scenes"]
        if len(scenes) != num_scenes:
            st.warning(f"LLM generated {len(scenes)} scenes, but {num_scenes} were requested. Adjusting...")
            scenes = scenes[:num_scenes]
            while len(scenes) < num_scenes:  # Pad if too few (basic)
                scenes.append({
                    "scene_number": len(scenes) + 1,
                    "scene_narration": "Error: Scene data missing from LLM.",
                    "image_generation_prompt": "Error: Image prompt missing from LLM."
                })
            parsed_json["scenes"] = scenes

        st.success("Story and image prompts generated successfully!")
        return parsed_json

    except Exception as e:
        st.error(f"Error during LLM story/prompt generation: {e}")
        st.text_area(
            "LLM Full Raw Output (on exception):",
            result_text if result_text is not None else "N/A",
            height=300,
        )
        return None
    finally:
        # The model stays cached by Streamlit; this only releases transient
        # generation buffers.
        clear_gpu_cache()
241
+
242
+
243
def _gemini_response_parts(response):
    """Return the content parts of a Gemini response as a list.

    Looks at ``response.parts`` first, then at the first candidate whose
    ``content.parts`` is non-empty. Returns [] when neither is present.
    """
    direct_parts = getattr(response, "parts", None)
    if direct_parts:
        return list(direct_parts)
    for candidate in getattr(response, "candidates", None) or []:
        content = getattr(candidate, "content", None)
        candidate_parts = getattr(content, "parts", None)
        if candidate_parts:
            return list(candidate_parts)
    return []


def _image_from_part(part):
    """Decode a PIL image from a response part's inline data, else None."""
    inline = getattr(part, "inline_data", None)
    payload = getattr(inline, "data", None)
    if not payload:
        return None
    # The MIME type is carried by the inline blob; the part-level attribute
    # is checked only as a fallback.
    mime_type = getattr(inline, "mime_type", None) or getattr(part, "mime_type", None) or ""
    if not mime_type.startswith("image/"):
        return None
    return Image.open(BytesIO(payload))


def generate_images_via_gemini(story_data):
    """Request one image per scene from the configured Gemini model.

    Args:
        story_data: Dict with a "scenes" list; each scene may provide
            "image_generation_prompt" and "scene_number".

    Returns:
        A list aligned with the scenes, holding a PIL image or None per
        scene, or None when the client or model ID is unavailable or no
        image at all was produced.
    """
    st.info("Generating images with Gemini API...")
    sdk_client = get_gemini_sdk_client(GEMINI_API_KEY)  # Use the new client getter

    if not sdk_client:
        st.error("Gemini SDK Client not initialized. Cannot generate images.")
        return None

    if not GEMINI_IMAGE_MODEL_ID:
        st.error("`GEMINI_IMAGE_MODEL_ID` is not set in secrets.toml. Cannot generate images.")
        return None

    st.markdown(f"**Using Gemini Model for Images:** `{GEMINI_IMAGE_MODEL_ID}`")
    st.warning(f"""
**Note on Image Generation with `{GEMINI_IMAGE_MODEL_ID}`:**
- This uses your specified model and API call structure.
- Image characteristics (size, style nuances) are determined by this model.
- The 'Seed' input from the UI is not directly used in this specific Gemini API call structure.
""")

    images_pil = []
    for i, scene_obj in enumerate(story_data["scenes"]):
        image_prompt_text = scene_obj.get("image_generation_prompt", "A beautiful, abstract scene.")  # Fallback
        scene_num = scene_obj.get("scene_number", i + 1)

        st.write(f"Requesting image for Scene {scene_num} with prompt: \"{image_prompt_text[:150]}...\"")
        try:
            # `client.models.generate_content` takes its settings through the
            # `config` keyword; image output is requested via response modalities.
            response = sdk_client.models.generate_content(
                model=GEMINI_IMAGE_MODEL_ID,
                contents=[image_prompt_text],
                config=genai_types.GenerateContentConfig(
                    response_modalities=["TEXT", "IMAGE"]
                ),
            )

            generated_image = None
            response_text_parts = []

            parts = _gemini_response_parts(response)
            for part in parts:
                part_text = getattr(part, "text", None)
                if part_text:
                    response_text_parts.append(part_text)
                generated_image = _image_from_part(part)
                if generated_image is not None:
                    st.success(f"Image for Scene {scene_num} received from Gemini.")
                    break  # Found an image
            if not parts:
                # No parts at all: keep any top-level text (error or text-only reply).
                fallback_text = getattr(response, "text", None)
                if fallback_text:
                    response_text_parts.append(fallback_text)

            if generated_image:
                images_pil.append(generated_image)
                if response_text_parts:
                    st.caption(f"Accompanying text from Gemini for Scene {scene_num}: {' '.join(response_text_parts)}")
            else:
                st.warning(f"No image data explicitly found from Gemini for Scene {scene_num}.")
                if response_text_parts:
                    st.text_area(f"Gemini Text Response (Scene {scene_num}):", value=' '.join(response_text_parts), height=100)
                else:
                    st.text(f"Raw Gemini Response (Scene {scene_num}): {response}")  # Log raw response if no image
                prompt_feedback = getattr(response, "prompt_feedback", None)
                if prompt_feedback:
                    st.warning(f"Prompt Feedback for scene {scene_num}: {prompt_feedback}")
                images_pil.append(None)

        except Exception as e:
            # One failed scene must not abort the rest: record None and go on.
            error_text = str(e)
            st.error(f"Error generating image for Scene {scene_num} with Gemini: {e}")
            st.error(f"Model used: {GEMINI_IMAGE_MODEL_ID}. Prompt: '{image_prompt_text[:100]}...'")
            if "API key not valid" in error_text or "PERMISSION_DENIED" in error_text:
                st.error("Gemini API Key error. Check your key and its permissions for this model.")
            elif "Could not find model" in error_text or "MODEL_NAME_INVALID" in error_text:
                st.error(f"Gemini Model '{GEMINI_IMAGE_MODEL_ID}' not found or invalid. Verify the model name.")
            elif "response_modalities" in error_text:
                st.error("The `response_modalities` config might not be supported by your version of `google.generativeai.types.GenerateContentConfig` or the model endpoint. This part of the code is based on your provided 'working snippet'.")
            images_pil.append(None)

    if not any(images_pil):
        st.error("No images were successfully generated by Gemini.")
        return None

    st.success("Image generation step completed.")
    return images_pil
354
+
355
+
356
def _save_waveform_as_wav(file_path, waveform, sample_rate):
    """Write a float waveform tensor to *file_path* as mono 16-bit PCM WAV."""
    import wave  # stdlib; local import keeps this block self-contained

    samples = waveform.detach().to(torch.float32).cpu().flatten().clamp(-1.0, 1.0)
    pcm16 = (samples * 32767.0).round().to(torch.int16)
    with wave.open(file_path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)  # 2 bytes per sample = 16-bit PCM
        wav_file.setframerate(int(sample_rate))
        # WAV sample data is little-endian regardless of host byte order.
        wav_file.writeframes(pcm16.numpy().astype("<i2").tobytes())


def generate_audio_narrations(story_data):
    """Synthesize one narration WAV per scene with ParlerTTS.

    Args:
        story_data: Dict with a "scenes" list; each scene may provide
            "scene_narration" and "scene_number".

    Returns:
        A list aligned with the scenes, holding the WAV path or None for
        scenes whose synthesis failed.
    """
    st.info("Generating audio narrations with ParlerTTS...")
    tts_model, tts_tokenizer = load_tts_resources(TTS_MODEL_ID)
    audio_dir = os.path.join(get_session_temp_dir(), "audio_files")
    os.makedirs(audio_dir, exist_ok=True)
    audio_file_paths = []
    description = "A clear and engaging narrator tells a story with enthusiasm."

    for i, scene_obj in enumerate(story_data["scenes"]):
        narration_text = scene_obj.get("scene_narration", "Narration unavailable.")
        scene_num = scene_obj.get("scene_number", i + 1)
        st.write(f"Generating audio for Scene {scene_num}...")
        try:
            # NOTE(review): the same tokenizer encodes both the voice
            # description and the narration; confirm against the model card
            # whether this checkpoint expects a separate description tokenizer.
            input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(tts_model.device)
            prompt_input_ids = tts_tokenizer(narration_text, return_tensors="pt").input_ids.to(tts_model.device)
            generation = tts_model.generate(
                input_ids=input_ids, prompt_input_ids=prompt_input_ids,
                do_sample=True, temperature=0.7,  # Slightly warmer for more expression
                repetition_penalty=1.2, guidance_scale=3.0  # Experiment with ParlerTTS params
            ).to(torch.float32)
            file_path = os.path.join(audio_dir, f"s_{scene_num}_audio.wav")
            # Previously written via an `sf` name that was never imported,
            # which raised NameError for every scene.
            _save_waveform_as_wav(file_path, generation, tts_model.config.sampling_rate)
            audio_file_paths.append(file_path)
            st.success(f"Audio for Scene {scene_num} created.")
        except Exception as e:
            # Keep list positions aligned with scenes even when one fails.
            st.error(f"Audio error (Scene {scene_num}): {e}")
            audio_file_paths.append(None)

    # The model stays cached by Streamlit; this only releases transient buffers.
    del tts_model
    del tts_tokenizer
    clear_gpu_cache()
    st.success("Audio narration step completed.")
    return audio_file_paths
387
+
388
+
389
def _wav_duration_seconds(wav_path):
    """Return the duration of a PCM WAV file in seconds."""
    import wave  # stdlib; local import keeps this block self-contained

    with wave.open(wav_path, "rb") as wav_file:
        frame_rate = wav_file.getframerate()
        return wav_file.getnframes() / float(frame_rate) if frame_rate else 0.0


def create_final_video(image_pil_objects, audio_paths):
    """Combine per-scene images and narration into one 720p MP4 via ffmpeg.

    Args:
        image_pil_objects: List of PIL images (or None) per scene.
        audio_paths: List of WAV paths (or None) per scene; must have the
            same length as ``image_pil_objects``.

    Returns:
        Path of the final video, or None when inputs are mismatched,
        ffmpeg is unavailable, or no clip could be produced.
    """
    st.info("Creating final video...")
    if not image_pil_objects or not audio_paths or len(image_pil_objects) != len(audio_paths):
        st.error("Asset mismatch for video. Cannot create.")
        return None
    try:
        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
    except (OSError, subprocess.SubprocessError):
        # OSError: binary missing / not executable; SubprocessError: non-zero exit.
        st.error("FFMPEG not found.")
        return None

    # Persist each image to disk so ffmpeg can read it.
    temp_image_dir = os.path.join(get_session_temp_dir(), "vid_frames")
    os.makedirs(temp_image_dir, exist_ok=True)
    img_paths_for_vid = []
    for idx, img_pil in enumerate(image_pil_objects):
        if img_pil:
            frame_path = os.path.join(temp_image_dir, f"f_{idx:03d}.png")
            img_pil.save(frame_path)
            img_paths_for_vid.append(frame_path)
        else:
            img_paths_for_vid.append(None)

    temp_clips_dir = os.path.join(get_session_temp_dir(), "temp_vid_clips")
    os.makedirs(temp_clips_dir, exist_ok=True)
    vid_clip_paths = []

    for i, (img_p, aud_p) in enumerate(zip(img_paths_for_vid, audio_paths)):
        s_num = i + 1
        if not (img_p and aud_p):
            st.warning(f"Skipping Scene {s_num} in video (missing asset).")
            continue
        try:
            # Duration was previously read through an `sf` name that was
            # never imported.
            aud_dur = max(_wav_duration_seconds(aud_p), 0.5)  # Min clip duration
            clip_out_p = os.path.join(temp_clips_dir, f"c_{s_num:03d}.mp4")
            cmd = [
                "ffmpeg", "-y", "-loop", "1", "-i", img_p, "-i", aud_p,
                "-c:v", "libx264", "-preset", "fast", "-tune", "stillimage",  # Faster preset
                "-vf", "scale=1280:720:force_original_aspect_ratio=decrease,pad=1280:720:(ow-iw)/2:(oh-ih)/2,setsar=1",  # Scale and pad to 720p
                "-c:a", "aac", "-b:a", "192k", "-pix_fmt", "yuv420p",
                "-t", str(aud_dur), "-shortest", clip_out_p
            ]
            res = subprocess.run(cmd, capture_output=True, text=True)
            if res.returncode != 0:
                st.error(f"FFMPEG clip error (S{s_num}):\n{res.stderr}")
                continue
            vid_clip_paths.append(clip_out_p)
            st.write(f"Video clip for Scene {s_num} processed.")
        except Exception as e:
            # One bad scene must not abort the remaining clips.
            st.error(f"Video processing error (S{s_num}): {e}")

    if not vid_clip_paths:
        st.error("No valid video clips. Cannot create final video.")
        return None

    # The concat list uses bare file names; ffmpeg runs with cwd set to the
    # clips directory so they resolve.
    concat_list_f = os.path.join(temp_clips_dir, "concat_list.txt")
    with open(concat_list_f, "w") as f:
        f.writelines(f"file '{os.path.basename(clip_p)}'\n" for clip_p in vid_clip_paths)

    final_vid_out_p = os.path.join(get_session_temp_dir(), "final_story_video_720p.mp4")
    concat_cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", concat_list_f, "-c", "copy", final_vid_out_p]
    st.write("Concatenating video clips...")
    res = subprocess.run(concat_cmd, capture_output=True, text=True, cwd=temp_clips_dir)
    if res.returncode != 0:
        st.error(f"FFMPEG concat error:\n{res.stderr}")
        return None

    st.success("Final video created successfully!")
    return final_vid_out_p
443
+
444
+ # --- Streamlit UI (largely same, check key names and logic) ---
445
st.title("🌟 AI POV Story Weaver v2 ✨")
st.markdown("Craft unique POV stories with refined AI-generated scripts, Gemini images, voiceovers, and a final video!")
st.markdown("---")

# Sidebar: story inputs, configuration warnings, and the two action buttons.
with st.sidebar:
    st.header("🛠️ Story Configuration")
    user_main_prompt = st.text_area(
        "Enter your main POV story idea:",
        st.session_state.get("user_main_prompt_val", "POV: I'm a lone astronaut discovering an ancient, bioluminescent forest on a new planet."),
        height=120, key="main_prompt_input_v2"
    )
    num_scenes_input = st.slider(
        "Number of Scenes:", 1, 4,  # Max 4 for resource management
        st.session_state.get("num_scenes_input_val", 2), key="num_scenes_slider_v2"
    )
    st.caption(f"Script LLM: `{SCRIPT_LLM_MODEL_ID}`")
    st.caption(f"TTS Model: `{TTS_MODEL_ID}`")
    st.caption(f"Image Model (Gemini): `{GEMINI_IMAGE_MODEL_ID}` (from secrets)")

    if not google_gemini_sdk_available:
        st.error("Google SDK missing (`pip install google-generativeai`)")
    if not GEMINI_API_KEY:
        st.error("`GEMINI_API_KEY` not set in secrets.")
    # Short-circuit order matters: the `in` checks only run when the ID is set.
    if not GEMINI_IMAGE_MODEL_ID or "your-gemini" in GEMINI_IMAGE_MODEL_ID or "flash-preview" not in GEMINI_IMAGE_MODEL_ID:
        st.warning(f"Model ID '{GEMINI_IMAGE_MODEL_ID}' might be placeholder or not your specific preview model. Ensure it's correct in secrets for image generation.")

    st.markdown("---")
    can_generate = google_gemini_sdk_available and GEMINI_API_KEY and GEMINI_IMAGE_MODEL_ID
    if st.button("🎬 Weave My Story! (v2)", type="primary", use_container_width=True, disabled=not can_generate):
        # New run: fresh run id, drop the previous run's files, and record
        # the inputs for the generation phase.
        st.session_state.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        cleanup_temp_files()
        st.session_state.generated_data = {}
        st.session_state.user_main_prompt_val = user_main_prompt
        st.session_state.num_scenes_input_val = num_scenes_input
        st.session_state.trigger_generation_v2 = True

    st.markdown("---")
    if st.button("🧹 Clear All & Reset", use_container_width=True):
        cleanup_temp_files()
        keys_to_clear = ['generated_data', 'trigger_generation_v2', 'user_main_prompt_val', 'num_scenes_input_val']
        for key in keys_to_clear:
            if key in st.session_state:
                del st.session_state[key]
        st.cache_resource.clear()
        st.success("Cleared temp files, state, and model cache. Reload on next run.")
        st.rerun()
487
+
488
+ # --- Main Area for Generation and Display ---
489
def _run_generation_pipeline():
    """Run script, image, audio and video stages, rendering each in turn.

    Each stage stores its result in ``st.session_state.generated_data``.
    The pipeline stops at the first stage that yields nothing usable and
    shows the matching error.
    """
    results = st.session_state.generated_data
    if results is None:
        results = st.session_state.generated_data = {}

    with st.spinner("📜 Phase 1: Generating enhanced story script and image prompts..."):
        story_json_data = generate_story_and_prompts(
            st.session_state.user_main_prompt_val, st.session_state.num_scenes_input_val
        )
    results['story_json'] = story_json_data
    if not story_json_data:
        st.error("Story script generation failed. Cannot proceed.")
        return

    st.header("📜 Generated Story & Image Prompts")
    st.json(story_json_data)
    st.markdown("---")

    with st.spinner(f"🎨 Phase 2: Generating images with Gemini ({GEMINI_IMAGE_MODEL_ID})..."):
        pil_images = generate_images_via_gemini(story_json_data)
    results['pil_images'] = pil_images
    # Require a non-empty list containing at least one real image.
    if not (pil_images and any(pil_images)):
        st.error("Image generation failed (no images returned). Cannot proceed.")
        return

    st.header("🖼️ Generated Images")
    cols = st.columns(min(len(pil_images), 3))
    for i, img in enumerate(pil_images):
        if img:
            with cols[i % len(cols)]:
                st.image(img, caption=f"Scene {story_json_data['scenes'][i].get('scene_number', i+1)}")
    st.markdown("---")

    with st.spinner("🔊 Phase 3: Generating audio narrations..."):
        audio_paths = generate_audio_narrations(story_json_data)
    results['audio_paths'] = audio_paths
    if not (audio_paths and any(audio_paths)):
        st.error("Audio generation failed. Cannot proceed to video.")
        return

    st.header("🎀 Generated Audio Narrations")
    st.markdown("---")

    # Both asset lists are known to be non-empty here, so the video step
    # always runs.
    with st.spinner("🎬 Final Phase: Weaving the video masterpiece..."):
        final_video_path = create_final_video(pil_images, audio_paths)
    results['final_video_path'] = final_video_path

    if final_video_path:
        st.header("🎉 Your Story Video is Ready! 🎉")
        st.video(final_video_path)
        st.balloons()
    else:
        st.error("Video creation failed. Check FFMPEG logs if any were shown.")


# --- Main Area for Generation and Display ---
if st.session_state.get("trigger_generation_v2"):
    try:
        _run_generation_pipeline()
    finally:
        # Always reset the trigger, even when a stage raises, so a failed
        # run is not repeated on every later rerun.
        st.session_state.trigger_generation_v2 = False

elif not st.session_state.get("user_main_prompt_val"):
    st.info("👋 Welcome to the AI Story Weaver v2! Configure your story in the sidebar and click 'Weave My Story!'")