Spaces:

saq1b
/

podcastgen

Running

App Files Files Community

saq1b committed on Mar 7

Commit

d8bebcf

verified ·

1 Parent(s): 0d730da

Upload app.py

Browse files

Files changed (1) hide show

app.py +166 -76

app.py CHANGED Viewed

@@ -4,20 +4,19 @@ from google import genai
 from google.genai import types
 import json
 import uuid
-import io
 import edge_tts
 import asyncio
 import aiofiles
-import pypdf
 import os
 import time
 from typing import List, Dict
 class PodcastGenerator:
     def __init__(self):
         pass
-    async def generate_script(self, prompt: str, language: str, api_key: str) -> Dict:
         example = """
 {
     "topic": "AGI",
@@ -229,47 +228,81 @@ Follow this example structure:
 """
         user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
-        messages = [
-            {"role": "user", "parts": [user_prompt]}
-        ]
         client = genai.Client(api_key=api_key)
         safety_settings = [
             {
-            "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
-            "threshold": "BLOCK_NONE"
             },
             {
-            "category": "HARM_CATEGORY_HARASSMENT",
-            "threshold": "BLOCK_NONE"
             },
             {
-            "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
-            "threshold": "BLOCK_NONE"
             },
             {
-            "category": "HARM_CATEGORY_HATE_SPEECH",
-            "threshold": "BLOCK_NONE"
             }
         ]
         try:
-            response = await client.aio.models.generate_content(
-                model="gemini-2.0-flash",
-                contents=messages,
-                config=types.GenerateContentConfig(
-                    temperature=1,
-                    response_mime_type="application/json",
-                    safety_settings=[
-                        types.SafetySetting(
-                            category=safety_setting["category"],
-                            threshold=safety_setting["threshold"]
-                        ) for safety_setting in safety_settings
-                    ],
-                    system_instruction=system_prompt
-                )
             )
         except Exception as e:
             if "API key not valid" in str(e):
                 raise gr.Error("Invalid API key. Please provide a valid Gemini API key.")
@@ -280,7 +313,27 @@ Follow this example structure:
         print(f"Generated podcast script:\n{response.text}")
         return json.loads(response.text)
     async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
         voice = speaker1 if speaker == 1 else speaker2
@@ -288,14 +341,22 @@ Follow this example structure:
         temp_filename = f"temp_{uuid.uuid4()}.wav"
         try:
-            await speech.save(temp_filename)
             return temp_filename
         except Exception as e:
             if os.path.exists(temp_filename):
                 os.remove(temp_filename)
             raise e
-    async def combine_audio_files(self, audio_files: List[str]) -> str:
         combined_audio = AudioSegment.empty()
         for audio_file in audio_files:
             combined_audio += AudioSegment.from_file(audio_file)
@@ -303,44 +364,59 @@ Follow this example structure:
         output_filename = f"output_{uuid.uuid4()}.wav"
         combined_audio.export(output_filename, format="wav")
         return output_filename
-    async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str) -> str:
-        start_time = time.time()
-        podcast_json = await self.generate_script(input_text, language, api_key)
-        end_time = time.time()
-        start_time = time.time()
-        audio_files = await asyncio.gather(*[self.tts_generate(item['line'], item['speaker'], speaker1, speaker2) for item in podcast_json['podcast']])
-        end_time = time.time()
-        combined_audio = await self.combine_audio_files(audio_files)
         return combined_audio
-class TextExtractor:
-    @staticmethod
-    async def extract_from_pdf(file_path: str) -> str:
-        async with aiofiles.open(file_path, 'rb') as file:
-            content = await file.read()
-            pdf_reader = pypdf.PdfReader(io.BytesIO(content))
-            return "\n\n".join(page.extract_text() for page in pdf_reader.pages if page.extract_text())
-    @staticmethod
-    async def extract_from_txt(file_path: str) -> str:
-        async with aiofiles.open(file_path, 'r') as file:
-            return await file.read()
-    @classmethod
-    async def extract_text(cls, file_path: str) -> str:
-        _, file_extension = os.path.splitext(file_path)
-        if file_extension.lower() == '.pdf':
-            return await cls.extract_from_pdf(file_path)
-        elif file_extension.lower() == '.txt':
-            return await cls.extract_from_txt(file_path)
-        else:
-            raise gr.Error(f"Unsupported file type: {file_extension}")
-async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "") -> str:
     start_time = time.time()
     voice_names = {
@@ -357,20 +433,32 @@ async def process_input(input_text: str, input_file, language: str, speaker1: st
     speaker1 = voice_names[speaker1]
     speaker2 = voice_names[speaker2]
-    if input_file:
-        input_text = await TextExtractor.extract_text(input_file.name)
-    if not api_key:
-        api_key = os.getenv("GENAI_API_KEY")
-    podcast_generator = PodcastGenerator()
-    podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key)
-    end_time = time.time()
-    return podcast
-# Define Gradio interface
 iface = gr.Interface(
     fn=process_input,
     inputs=[
@@ -422,8 +510,10 @@ iface = gr.Interface(
     ],
     title="PodcastGen 🎙️",
     description="Generate a 2-speaker podcast from text input or documents!",
-    allow_flagging="never"
 )
 if __name__ == "__main__":
-    iface.launch()

 from google.genai import types
 import json
 import uuid
 import edge_tts
 import asyncio
 import aiofiles
 import os
 import time
+import mimetypes
 from typing import List, Dict
 class PodcastGenerator:
     def __init__(self):
         pass
+    async def generate_script(self, prompt: str, language: str, api_key: str, file_obj=None, progress=None) -> Dict:
         example = """
 {
     "topic": "AGI",
 """
         user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
+        messages = []
+        # If a file is provided, add it to the messages ahead of the text prompt
+        if file_obj:
+            file_data = await self._read_file_bytes(file_obj)
+            mime_type = self._get_mime_type(file_obj.name)
+            messages.append(
+                types.Content(
+                    role="user",
+                    parts=[
+                        types.Part.from_bytes(
+                            data=file_data,
+                            mime_type=mime_type,
+                        )
+                    ],
+                )
+            )
+        # Add text prompt
+        messages.append(
+            types.Content(
+                role="user",
+                parts=[
+                    types.Part.from_text(text=user_prompt)
+                ],
+            )
+        )
         client = genai.Client(api_key=api_key)
         safety_settings = [
             {
+                "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
+                "threshold": "BLOCK_NONE"
             },
             {
+                "category": "HARM_CATEGORY_HARASSMENT",
+                "threshold": "BLOCK_NONE"
             },
             {
+                "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
+                "threshold": "BLOCK_NONE"
             },
             {
+                "category": "HARM_CATEGORY_HATE_SPEECH",
+                "threshold": "BLOCK_NONE"
             }
         ]
         try:
+            if progress:
+                progress(0.3, "Generating podcast script...")
+            # Add timeout to the API call
+            response = await asyncio.wait_for(
+                client.aio.models.generate_content(
+                    model="gemini-2.0-flash",
+                    contents=messages,
+                    config=types.GenerateContentConfig(
+                        temperature=1,
+                        response_mime_type="application/json",
+                        safety_settings=[
+                            types.SafetySetting(
+                                category=safety_setting["category"],
+                                threshold=safety_setting["threshold"]
+                            ) for safety_setting in safety_settings
+                        ],
+                        system_instruction=system_prompt
+                    )
+                ),
+                timeout=60  # 60 seconds timeout
             )
+        except asyncio.TimeoutError:
+            raise gr.Error("The script generation request timed out. Please try again later.")
         except Exception as e:
             if "API key not valid" in str(e):
                 raise gr.Error("Invalid API key. Please provide a valid Gemini API key.")
         print(f"Generated podcast script:\n{response.text}")
+        if progress:
+            progress(0.4, "Script generated successfully!")
         return json.loads(response.text)
+    async def _read_file_bytes(self, file_obj) -> bytes:
+        """Read file bytes from a file object"""
+        async with aiofiles.open(file_obj.name, 'rb') as f:
+            return await f.read()
+    def _get_mime_type(self, filename: str) -> str:
+        """Determine MIME type based on file extension"""
+        ext = os.path.splitext(filename)[1].lower()
+        if ext == '.pdf':
+            return "application/pdf"
+        elif ext == '.txt':
+            return "text/plain"
+        else:
+            # Fallback to the default mime type detector
+            mime_type, _ = mimetypes.guess_type(filename)
+            return mime_type or "application/octet-stream"
     async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
         voice = speaker1 if speaker == 1 else speaker2
         temp_filename = f"temp_{uuid.uuid4()}.wav"
         try:
+            # Add timeout to TTS generation
+            await asyncio.wait_for(speech.save(temp_filename), timeout=30)  # 30 seconds timeout
             return temp_filename
+        except asyncio.TimeoutError:
+            if os.path.exists(temp_filename):
+                os.remove(temp_filename)
+            raise gr.Error("Text-to-speech generation timed out. Please try with a shorter text.")
         except Exception as e:
             if os.path.exists(temp_filename):
                 os.remove(temp_filename)
             raise e
+    async def combine_audio_files(self, audio_files: List[str], progress=None) -> str:
+        if progress:
+            progress(0.9, "Combining audio files...")
         combined_audio = AudioSegment.empty()
         for audio_file in audio_files:
             combined_audio += AudioSegment.from_file(audio_file)
         output_filename = f"output_{uuid.uuid4()}.wav"
         combined_audio.export(output_filename, format="wav")
+        if progress:
+            progress(1.0, "Podcast generated successfully!")
         return output_filename
+    async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_obj=None, progress=None) -> str:
+        try:
+            if progress:
+                progress(0.1, "Starting podcast generation...")
+            # Set overall timeout for the entire process
+            return await asyncio.wait_for(
+                self._generate_podcast_internal(input_text, language, speaker1, speaker2, api_key, file_obj, progress),
+                timeout=600  # 10 minutes total timeout
+            )
+        except asyncio.TimeoutError:
+            raise gr.Error("The podcast generation process timed out. Please try with shorter text or try again later.")
+        except Exception as e:
+            raise gr.Error(f"Error generating podcast: {str(e)}")
+    async def _generate_podcast_internal(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_obj=None, progress=None) -> str:
+        if progress:
+            progress(0.2, "Generating podcast script...")
+        podcast_json = await self.generate_script(input_text, language, api_key, file_obj, progress)
+        if progress:
+            progress(0.5, "Converting text to speech...")
+        # Process TTS sequentially, one script line at a time, to prevent overwhelming the system
+        audio_files = []
+        total_lines = len(podcast_json['podcast'])
+        for i, item in enumerate(podcast_json['podcast']):
+            if progress:
+                current_progress = 0.5 + (0.4 * (i / total_lines))
+                progress(current_progress, f"Processing speech {i+1}/{total_lines}...")
+            try:
+                audio_file = await self.tts_generate(item['line'], item['speaker'], speaker1, speaker2)
+                audio_files.append(audio_file)
+            except Exception as e:
+                # Clean up any files already created
+                for file in audio_files:
+                    if os.path.exists(file):
+                        os.remove(file)
+                raise gr.Error(f"Error generating speech for line {i+1}: {str(e)}")
+        combined_audio = await self.combine_audio_files(audio_files, progress)
         return combined_audio
+async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "", progress=gr.Progress()) -> str:
     start_time = time.time()
     voice_names = {
     speaker1 = voice_names[speaker1]
     speaker2 = voice_names[speaker2]
+    try:
+        progress(0.05, "Processing input...")
+        if not api_key:
+            api_key = os.getenv("GENAI_API_KEY")
+            if not api_key:
+                raise gr.Error("No API key provided. Please provide a Gemini API key.")
+        podcast_generator = PodcastGenerator()
+        podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key, input_file, progress)
+        end_time = time.time()
+        print(f"Total podcast generation time: {end_time - start_time:.2f} seconds")
+        return podcast
+    except Exception as e:
+        # Ensure we show a user-friendly error
+        error_msg = str(e)
+        if "rate limit" in error_msg.lower():
+            raise gr.Error("Rate limit exceeded. Please try again later or use your own API key.")
+        elif "timeout" in error_msg.lower():
+            raise gr.Error("The request timed out. This could be due to server load or the length of your input. Please try again with shorter text.")
+        else:
+            raise gr.Error(f"Error: {error_msg}")
+# Define Gradio interface with concurrency control
 iface = gr.Interface(
     fn=process_input,
     inputs=[
     ],
     title="PodcastGen 🎙️",
     description="Generate a 2-speaker podcast from text input or documents!",
+    allow_flagging="never",
+    concurrency_limit=3,  # Limit concurrent requests to prevent overload
+    concurrency_id="podcast_gen"  # Identifier for concurrency group
 )
 if __name__ == "__main__":
+    iface.queue(max_size=10).launch()  # Set maximum queue size