Spaces:

Pixeltable
/

Document-to-Audio-Synthesis

Sleeping

App Files Files Community

PierreBrunelle committed on Oct 23, 2024

Commit

58c6bad

verified ·

1 Parent(s): 5a88b24

Update src/interface.py

Browse files

Files changed (1) hide show

src/interface.py +43 -43

src/interface.py CHANGED Viewed

@@ -8,115 +8,115 @@ def create_interface():
             <div style="margin-bottom: 1rem;">
                 <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
                      alt="Pixeltable" style="max-width: 150px;" />
-                <h1>Document to Audio Synthesis</h1>
             </div>
             """
         )
         with gr.Row():
             with gr.Column():
-                with gr.Accordion("What does it do?", open=True):
                     gr.Markdown("""
-                        - PDF document processing and text extraction
-                        - Intelligent content transformation and summarization
-                        - High-quality audio synthesis with voice selection
-                        - Configurable processing parameters
-                        - Downloadable audio output
                     """)
             with gr.Column():
-                with gr.Accordion("How does it work?", open=True):
                     gr.Markdown("""
-                        1. **Document Processing**
-                           - Chunks document using token-based segmentation
-                           - Maintains document structure and context
-                        2. **Content Processing**
-                           - Transforms text using LLM optimization
-                           - Generates optimized audio scripts
-                        3. **Audio Synthesis**
-                           - Converts scripts to natural speech
-                           - Multiple voice models available
                     """)
         with gr.Row():
             with gr.Column():
                 api_key = gr.Textbox(
-                    label="OpenAI API Key",
                     placeholder="sk-...",
                     type="password"
                 )
                 file_input = gr.File(
-                    label="Input Document (PDF)",
                     file_types=[".pdf"]
                 )
-                with gr.Accordion("Synthesis Parameters", open=True):
                     voice_select = gr.Radio(
                         choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                         value="onyx",
-                        label="Voice Model",
                         info="TTS voice model selection"
                     )
                     style_select = gr.Radio(
                         choices=["Technical", "Narrative", "Instructional", "Descriptive"],
                         value="Technical",
-                        label="Processing Style",
                         info="Content processing approach"
                     )
-                with gr.Accordion("Processing Parameters", open=False):
                     chunk_size = gr.Slider(
                         minimum=100, maximum=1000, value=300, step=50,
-                        label="Chunk Size (tokens)",
                         info="Text segmentation size"
                     )
                     temperature = gr.Slider(
                         minimum=0, maximum=1, value=0.7, step=0.1,
-                        label="Temperature",
                         info="LLM randomness factor"
                     )
                     max_tokens = gr.Slider(
                         minimum=100, maximum=1000, value=300, step=50,
-                        label="Max Tokens",
                         info="Maximum output token limit"
                     )
-                process_btn = gr.Button("Process Document", variant="primary")
-                status_output = gr.Textbox(label="Status")
         with gr.Tabs():
-            with gr.TabItem("Content Processing"):
                 output_table = gr.Dataframe(
-                    headers=["Segment", "Processed Content", "Audio Script"],
                     wrap=True
                 )
-            with gr.TabItem("Audio Output"):
                 audio_output = gr.Audio(
-                    label="Synthesized Audio",
                     type="filepath",
                     show_download_button=True
                 )
         gr.Markdown("""
-            ### Technical Notes
-            - Token limit affects processing speed and memory usage
-            - Temperature values > 0.8 may introduce content variations
-            - Audio synthesis has a 4096 token limit per segment
-            ### Performance Considerations
-            - Chunk size directly impacts processing time
-            - Higher temperatures increase LLM compute time
-            - Audio synthesis scales with script length
         """)
         gr.HTML(
             """
             <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                 <p style="margin: 0; color: #666; font-size: 0.8em;">
-                    Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
-                    | <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Documentation</a>
-                    | <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">Hugging Face Space</a>
                 </p>
             </div>
             """

             <div style="margin-bottom: 1rem;">
                 <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
                      alt="Pixeltable" style="max-width: 150px;" />
+                <h1>📄 Document to Audio Synthesis 🎧</h1>
             </div>
             """
         )
         with gr.Row():
             with gr.Column():
+                with gr.Accordion("🎯 What does it do?", open=True):
                     gr.Markdown("""
+                        - 📄 PDF document processing and text extraction
+                        - 🧠 Intelligent content transformation and summarization
+                        - 🎧 High-quality audio synthesis with voice selection
+                        - ⚙️ Configurable processing parameters
+                        - ⬇️ Downloadable audio output
                     """)
             with gr.Column():
+                with gr.Accordion("⚡ How does it work?", open=True):
                     gr.Markdown("""
+                        1. 📑 **Document Processing**
+                           - 📊 Chunks document using token-based segmentation
+                           - 🔄 Maintains document structure and context
+                        2. 🔍 **Content Processing**
+                           - 🤖 Transforms text using LLM optimization
+                           - 📝 Generates optimized audio scripts
+                        3. 🎵 **Audio Synthesis**
+                           - 🗣️ Converts scripts to natural speech
+                           - 🎙️ Multiple voice models available
                     """)
         with gr.Row():
             with gr.Column():
                 api_key = gr.Textbox(
+                    label="🔑 OpenAI API Key",
                     placeholder="sk-...",
                     type="password"
                 )
                 file_input = gr.File(
+                    label="📁 Input Document (PDF)",
                     file_types=[".pdf"]
                 )
+                with gr.Accordion("🎛️ Synthesis Parameters", open=True):
                     voice_select = gr.Radio(
                         choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                         value="onyx",
+                        label="🎙️ Voice Model",
                         info="TTS voice model selection"
                     )
                     style_select = gr.Radio(
                         choices=["Technical", "Narrative", "Instructional", "Descriptive"],
                         value="Technical",
+                        label="💫 Processing Style",
                         info="Content processing approach"
                     )
+                with gr.Accordion("⚙️ Processing Parameters", open=False):
                     chunk_size = gr.Slider(
                         minimum=100, maximum=1000, value=300, step=50,
+                        label="📏 Chunk Size (tokens)",
                         info="Text segmentation size"
                     )
                     temperature = gr.Slider(
                         minimum=0, maximum=1, value=0.7, step=0.1,
+                        label="🌡️ Temperature",
                         info="LLM randomness factor"
                     )
                     max_tokens = gr.Slider(
                         minimum=100, maximum=1000, value=300, step=50,
+                        label="📊 Max Tokens",
                         info="Maximum output token limit"
                     )
+                process_btn = gr.Button("🚀 Process Document", variant="primary")
+                status_output = gr.Textbox(label="📋 Status")
         with gr.Tabs():
+            with gr.TabItem("📝 Content Processing"):
                 output_table = gr.Dataframe(
+                    headers=["🔍 Segment", "📄 Processed Content", "🎭 Audio Script"],
                     wrap=True
                 )
+            with gr.TabItem("🎧 Audio Output"):
                 audio_output = gr.Audio(
+                    label="🔊 Synthesized Audio",
                     type="filepath",
                     show_download_button=True
                 )
         gr.Markdown("""
+            ### 📚 Technical Notes
+            - ⚡ Token limit affects processing speed and memory usage
+            - 🎯 Temperature values > 0.8 may introduce content variations
+            - 🔊 Audio synthesis has a 4096 token limit per segment
+            ### ⚙️ Performance Considerations
+            - 📊 Chunk size directly impacts processing time
+            - 🔄 Higher temperatures increase LLM compute time
+            - ⏱️ Audio synthesis scales with script length
         """)
         gr.HTML(
             """
             <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                 <p style="margin: 0; color: #666; font-size: 0.8em;">
+                    🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
+                    | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Documentation</a>
+                    | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">Hugging Face Space</a>
                 </p>
             </div>
             """