Update src/interface.py

src/interface.py  +158 -77  CHANGED
@@ -1,6 +1,42 @@

@@ -13,124 +49,169 @@ def create_interface():

Removed lines (old version) include:
-                        - 🎧 High-quality audio synthesis with voice selection
-                        - ⚙️ Configurable processing parameters
-                        - ⬇️ Downloadable audio output
-                        2. 🔍 **Content Processing**
-                            - 🤖 Transforms text using LLM optimization
-                            - 📝 Generates optimized audio scripts
-                        3. 🎵 **Audio Synthesis**
-                            - 🗣️ Converts scripts to natural speech
-                            - 🎙️ Multiple voice models available
-                        - 📊 Chunk size directly impacts processing time
-                        - 🔄 Higher temperatures increase LLM compute time
-                        - ⏱️ Audio synthesis scales with script length
-                file_input, api_key, voice_select, style_select,
src/interface.py (new version):

import gradio as gr
from .processor import process_document

SYNTHESIS_MODES = {
    "narration": {
        "description": "Simple document narration with clear voice and natural pacing",
        "styles": ["Technical", "Narrative", "Instructional", "Descriptive"],
        "default_temp": 0.7,
        "default_chunks": 300,
        "system_prompt": """Convert this content into clear narration.
Format:
- Clear sentence structures
- Natural pauses (...)
- Term definitions when needed
- Proper transitions"""
    },
    "podcast": {
        "description": "Conversational style with engaging tone and dynamic pacing",
        "styles": ["Casual", "Interview", "Educational", "Commentary"],
        "default_temp": 0.8,
        "default_chunks": 400,
        "system_prompt": """Transform this content into engaging podcast-style speech."""
    },
    "presentation": {
        "description": "Professional presentation style with clear structure",
        "styles": ["Business", "Academic", "Sales", "Training"],
        "default_temp": 0.6,
        "default_chunks": 250,
        "system_prompt": """Convert this content into a presentation format."""
    },
    "storytelling": {
        "description": "Narrative style with emotional engagement",
        "styles": ["Dynamic", "Dramatic", "Calm", "Energetic"],
        "default_temp": 0.9,
        "default_chunks": 500,
        "system_prompt": """Transform this content into an engaging story."""
    }
}
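# Each mode above seeds the UI defaults (style choices, chunk size, temperature)
# and carries the system prompt that update_interface() forwards to
# process_document(); update_mode() swaps these values in whenever the
# "Output Mode" radio selection changes.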
def create_interface():
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        gr.HTML(
            # ... header HTML unchanged; not shown in this diff ...
            """
        )

        # Overview Row
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What does it do?", open=True):
                    gr.Markdown("""
                        - 📄 Document processing - 🧠 Content transformation
                        - 🎧 Audio synthesis - ⚙️ Multiple output styles
                    """)
            with gr.Column():
                with gr.Accordion("⚡ How does it work?", open=True):
                    gr.Markdown("""
                        1. 📑 **Processing:** Token-based segmentation
                        2. 🔍 **Analysis:** LLM optimization & scripts
                        3. 🎵 **Synthesis:** Multiple voice options
                    """)

        synthesis_mode = gr.State(SYNTHESIS_MODES["narration"])

        # Main Input Row
        with gr.Row():
            # Left Column - Core Inputs
            with gr.Column(scale=1):
                with gr.Row():
                    api_key = gr.Textbox(
                        label="🔑 OpenAI API Key",
                        placeholder="sk-...",
                        type="password",
                        scale=2
                    )
                    file_input = gr.File(
                        label="📁 Input PDF",
                        file_types=[".pdf"],
                        scale=1
                    )

            # Right Column - Mode Selection
            with gr.Column(scale=1):
                mode_select = gr.Radio(
                    choices=list(SYNTHESIS_MODES.keys()),
                    value="narration",
                    label="🎭 Output Mode",
                    info="Select output type"
                )
                mode_description = gr.Markdown(
                    SYNTHESIS_MODES["narration"]["description"],
                    elem_classes=["mode-description"]
                )

        # Parameters Row
        with gr.Row():
            # Voice and Style Column
            with gr.Column():
                with gr.Box():
                    gr.Markdown("### 🎛️ Voice & Style")
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="🎙️ Voice",
                        interactive=True
                    )
                    style_select = gr.Radio(
                        choices=SYNTHESIS_MODES["narration"]["styles"],
                        value=SYNTHESIS_MODES["narration"]["styles"][0],
                        label="💫 Style",
                        interactive=True
                    )
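                    # NOTE: gr.Box is a Gradio 3.x layout container; it was removed
                    # in Gradio 4.0, where gr.Group is the closest replacement.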
            # Processing Parameters Column
            with gr.Column():
                with gr.Box():
                    gr.Markdown("### ⚙️ Processing")
                    with gr.Row():
                        chunk_size = gr.Slider(
                            minimum=100, maximum=1000,
                            value=SYNTHESIS_MODES["narration"]["default_chunks"],
                            step=50,
                            label="📏 Chunk Size"
                        )
                        temperature = gr.Slider(
                            minimum=0, maximum=1,
                            value=SYNTHESIS_MODES["narration"]["default_temp"],
                            step=0.1,
                            label="🌡️ Temperature"
                        )
                        max_tokens = gr.Slider(
                            minimum=100, maximum=1000,
                            value=300,
                            step=50,
                            label="📊 Tokens"
                        )

        # Process Button Row
        with gr.Row():
            process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
            status_output = gr.Textbox(label="📋 Status", scale=1)

        # Output Tabs Row
        with gr.Tabs():
            with gr.Tab("📝 Content"):
                output_table = gr.Dataframe(
                    headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                    wrap=True
                )
            with gr.Tab("🎧 Audio"):
                with gr.Row():
                    audio_output = gr.Audio(
                        label="🔊 Output",
                        type="filepath",
                        show_download_button=True
                    )
                    with gr.Column():
                        gr.Markdown("""
                            ### 📚 Quick Tips
                            - 🎯 Lower temperature = more consistent output
                            - 📏 Smaller chunks = more precise control
                            - 🎙️ Try different voices for best results
                        """)

        # Footer
        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Docs</a>
                    | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">HF Space</a>
                </p>
            </div>
            """
        )

        def update_mode(mode_name):
            mode = SYNTHESIS_MODES[mode_name]
            return (
                gr.update(choices=mode["styles"], value=mode["styles"][0]),
                gr.update(value=mode["default_chunks"]),
                gr.update(value=mode["default_temp"]),
                mode["description"]
            )

        mode_select.change(
            update_mode,
            inputs=[mode_select],
            outputs=[style_select, chunk_size, temperature, mode_description]
        )

        def update_interface(pdf_file, api_key, mode_name, voice, style, chunk_size, temperature, max_tokens):
            mode = SYNTHESIS_MODES[mode_name]
            return process_document(
                pdf_file=pdf_file,
                api_key=api_key,
                voice_choice=voice,
                style_choice=style,
                chunk_size=chunk_size,
                temperature=temperature,
                max_tokens=max_tokens,
                system_prompt=mode["system_prompt"]
            )

        process_btn.click(
            update_interface,
            inputs=[
                file_input, api_key, mode_select, voice_select, style_select,
                chunk_size, temperature, max_tokens
            ],
            outputs=[output_table, audio_output, status_output]
        )