# NOTE: The following lines are Hugging Face file-viewer page metadata
# (author, commit message, commit hash, file size) that were captured along
# with the raw file; kept as comments so the module parses as valid Python.
# PierreBrunelle's picture
# Update src/interface.py
# f0a88ca verified
# raw
# history blame
# 8.95 kB
import gradio as gr
from .processor import process_document
# Preset configuration for each audio-synthesis output mode, keyed by the
# mode name shown in the "Output Mode" radio. Each entry provides:
#   description    - blurb rendered under the mode selector in the UI
#   styles         - style choices offered for the mode (first one is the
#                    default selection)
#   default_temp   - initial value for the temperature slider
#   default_chunks - initial value for the chunk-size slider
#   system_prompt  - instruction string forwarded to process_document as
#                    the LLM system prompt for rewriting the document text
SYNTHESIS_MODES = {
    "narration": {
        "description": "Simple document narration with clear voice and natural pacing",
        "styles": ["Technical", "Narrative", "Instructional", "Descriptive"],
        "default_temp": 0.7,
        "default_chunks": 300,
        "system_prompt": """Convert this content into clear narration.
Format:
- Clear sentence structures
- Natural pauses (...)
- Term definitions when needed
- Proper transitions"""
    },
    "podcast": {
        "description": "Conversational style with engaging tone and dynamic pacing",
        "styles": ["Casual", "Interview", "Educational", "Commentary"],
        "default_temp": 0.8,
        "default_chunks": 400,
        "system_prompt": """Transform this content into engaging podcast-style speech."""
    },
    "presentation": {
        "description": "Professional presentation style with clear structure",
        "styles": ["Business", "Academic", "Sales", "Training"],
        "default_temp": 0.6,
        "default_chunks": 250,
        "system_prompt": """Convert this content into a presentation format."""
    },
    "storytelling": {
        "description": "Narrative style with emotional engagement",
        "styles": ["Dynamic", "Dramatic", "Calm", "Energetic"],
        "default_temp": 0.9,
        "default_chunks": 500,
        "system_prompt": """Transform this content into an engaging story."""
    }
}
def create_interface() -> "gr.Blocks":
    """Build and return the Gradio Blocks app for document-to-audio synthesis.

    Layout (top to bottom): header HTML, two overview accordions, an input
    row (OpenAI API key + PDF upload), mode selection with a live
    description, voice/style and processing-parameter controls, a process
    button with a status box, tabbed outputs (content table + audio player),
    and a footer. Two callbacks are wired up: mode changes re-seed the
    style/chunk/temperature controls from ``SYNTHESIS_MODES``, and the
    process button forwards everything to ``process_document``.

    Returns:
        The assembled (not yet launched) ``gr.Blocks`` demo.
    """
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        # Header: project logo + page title.
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
            <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
            alt="Pixeltable" style="max-width: 150px;" />
            <h1>📄 Document to Audio Synthesis 🎧</h1>
            </div>
            """
        )
        # Overview Row: two side-by-side accordions describing the app.
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What does it do?", open=True):
                    gr.Markdown("""
                    - 📄 Document processing - 🧠 Content transformation
                    - 🎧 Audio synthesis - ⚙️ Multiple output styles
                    """)
            with gr.Column():
                with gr.Accordion("⚡ How does it work?", open=True):
                    gr.Markdown("""
                    1. 📑 **Processing:** Token-based segmentation
                    2. 🔍 **Analysis:** LLM optimization & scripts
                    3. 🎵 **Synthesis:** Multiple voice options
                    """)
        # NOTE(review): this State holds the initial mode config but is never
        # read or updated by the callbacks below (they look modes up by name
        # in SYNTHESIS_MODES instead) — appears to be dead state; confirm.
        synthesis_mode = gr.State(SYNTHESIS_MODES["narration"])
        # Main Input Row
        with gr.Row():
            # Left Column - Core Inputs (API key masked as a password field,
            # file input restricted to PDFs).
            with gr.Column(scale=1):
                with gr.Row():
                    api_key = gr.Textbox(
                        label="🔑 OpenAI API Key",
                        placeholder="sk-...",
                        type="password",
                        scale=2
                    )
                    file_input = gr.File(
                        label="📁 Input PDF",
                        file_types=[".pdf"],
                        scale=1
                    )
            # Right Column - Mode Selection; the Markdown below it mirrors
            # the selected mode's description (kept in sync by update_mode).
            with gr.Column(scale=1):
                mode_select = gr.Radio(
                    choices=list(SYNTHESIS_MODES.keys()),
                    value="narration",
                    label="🎭 Output Mode",
                    info="Select output type"
                )
                mode_description = gr.Markdown(
                    SYNTHESIS_MODES["narration"]["description"],
                    elem_classes=["mode-description"]
                )
        # Parameters Row
        # NOTE(review): gr.Box was removed in Gradio 4.x (replaced by
        # gr.Group) — confirm the pinned gradio version still provides it.
        with gr.Row():
            # Voice and Style Column
            with gr.Column():
                with gr.Box():
                    gr.Markdown("### 🎛️ Voice & Style")
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="🎙️ Voice",
                        interactive=True
                    )
                    # Style choices are mode-dependent; update_mode swaps
                    # them whenever the mode radio changes.
                    style_select = gr.Radio(
                        choices=SYNTHESIS_MODES["narration"]["styles"],
                        value=SYNTHESIS_MODES["narration"]["styles"][0],
                        label="💫 Style",
                        interactive=True
                    )
            # Processing Parameters Column: chunk size and temperature are
            # seeded from the mode's defaults; max_tokens starts fixed at 300.
            with gr.Column():
                with gr.Box():
                    gr.Markdown("### ⚙️ Processing")
                    with gr.Row():
                        chunk_size = gr.Slider(
                            minimum=100, maximum=1000,
                            value=SYNTHESIS_MODES["narration"]["default_chunks"],
                            step=50,
                            label="📏 Chunk Size"
                        )
                        temperature = gr.Slider(
                            minimum=0, maximum=1,
                            value=SYNTHESIS_MODES["narration"]["default_temp"],
                            step=0.1,
                            label="🌡️ Temperature"
                        )
                        max_tokens = gr.Slider(
                            minimum=100, maximum=1000,
                            value=300,
                            step=50,
                            label="📊 Tokens"
                        )
        # Process Button Row
        with gr.Row():
            process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
            status_output = gr.Textbox(label="📋 Status", scale=1)
        # Output Tabs Row: segment/script table in one tab, audio in the other.
        with gr.Tabs():
            with gr.Tab("📝 Content"):
                output_table = gr.Dataframe(
                    headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                    wrap=True
                )
            with gr.Tab("🎧 Audio"):
                with gr.Row():
                    audio_output = gr.Audio(
                        label="🔊 Output",
                        type="filepath",
                        show_download_button=True
                    )
                    with gr.Column():
                        gr.Markdown("""
                        ### 📚 Quick Tips
                        - 🎯 Lower temperature = more consistent output
                        - 📏 Smaller chunks = more precise control
                        - 🎙️ Try different voices for best results
                        """)
        # Footer
        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
            <p style="margin: 0; color: #666; font-size: 0.8em;">
            🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
            | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Docs</a>
            | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">HF Space</a>
            </p>
            </div>
            """
        )

        def update_mode(mode_name: str):
            """Re-seed mode-dependent controls when the mode radio changes.

            Returns updates for, in order: style choices (reset to the
            mode's first style), chunk-size slider, temperature slider, and
            the description Markdown.
            """
            mode = SYNTHESIS_MODES[mode_name]
            return (
                gr.update(choices=mode["styles"], value=mode["styles"][0]),
                gr.update(value=mode["default_chunks"]),
                gr.update(value=mode["default_temp"]),
                mode["description"]
            )

        mode_select.change(
            update_mode,
            inputs=[mode_select],
            outputs=[style_select, chunk_size, temperature, mode_description]
        )

        def update_interface(pdf_file, api_key, mode_name, voice, style, chunk_size, temperature, max_tokens):
            """Resolve the selected mode's system prompt and run the pipeline.

            Thin wrapper around ``process_document``; its return value is
            routed to (output_table, audio_output, status_output), so it is
            expected to yield a 3-tuple — confirm against processor module.
            """
            mode = SYNTHESIS_MODES[mode_name]
            return process_document(
                pdf_file=pdf_file,
                api_key=api_key,
                voice_choice=voice,
                style_choice=style,
                chunk_size=chunk_size,
                temperature=temperature,
                max_tokens=max_tokens,
                system_prompt=mode["system_prompt"]
            )

        process_btn.click(
            update_interface,
            inputs=[
                file_input, api_key, mode_select, voice_select, style_select,
                chunk_size, temperature, max_tokens
            ],
            outputs=[output_table, audio_output, status_output]
        )

    return demo