import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter
from pixeltable.functions import openai
import os
import requests
import tempfile
import gradio as gr


def process_document(pdf_file, api_key, voice_choice, style_choice, chunk_size,
                     temperature, max_tokens, system_prompt, progress=gr.Progress()):
    try:
        os.environ['OPENAI_API_KEY'] = api_key

        progress(0.1, desc="Initializing...")
        # Start from a clean slate: drop any tables from a previous run, then recreate the directory.
        pxt.drop_dir('document_audio', force=True)
        pxt.create_dir('document_audio')

        docs = pxt.create_table(
            'document_audio.documents',
            {
                'document': pxt.Document,
                'voice': pxt.String,
                'style': pxt.String,
                'mode_prompt': pxt.String
            }
        )

        progress(0.2, desc="Processing document...")
        docs.insert([{
            'document': pdf_file.name,
            'voice': voice_choice,
            'style': style_choice,
            'mode_prompt': system_prompt
        }])

        # Split the document into token-limited chunks; each chunk becomes a row of the view.
        chunks = pxt.create_view(
            'document_audio.chunks',
            docs,
            iterator=DocumentSplitter.create(
                document=docs.document,
                separators='token_limit',
                limit=chunk_size
            )
        )

        progress(0.4, desc="Text processing...")
        chunks['content_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': docs.mode_prompt  # Use the mode-specific prompt
                },
                {'role': 'user', 'content': chunks.text}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['content'] = chunks.content_response['choices'][0]['message']['content']

        progress(0.6, desc="Script generation...")
        chunks['script_response'] = openai.chat_completions(
            messages=[
                {
                    'role': 'system',
                    'content': f"""Convert content to audio script.
                    Style: {docs.style}
                    Format:
                    - Clear sentence structures
                    - Natural pauses (...)
                    - Term definitions when needed
                    - Proper transitions"""
                },
                {'role': 'user', 'content': chunks.content}
            ],
            model='gpt-4o-mini-2024-07-18',
            max_tokens=max_tokens,
            temperature=temperature
        )
        chunks['script'] = chunks.script_response['choices'][0]['message']['content']

        progress(0.8, desc="Audio synthesis...")

        @pxt.udf(return_type=pxt.Audio)
        def generate_audio(script: str, voice: str):
            if not script or not voice:
                return None
            try:
                response = requests.post(
                    "https://api.openai.com/v1/audio/speech",
                    headers={"Authorization": f"Bearer {api_key}"},
                    json={"model": "tts-1", "input": script, "voice": voice}
                )
                if response.status_code == 200:
                    # Persist the MP3 bytes to a temp file; the path feeds the Audio column.
                    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
                    temp_file.write(response.content)
                    temp_file.close()
                    return temp_file.name
            except Exception as e:
                print(f"Error in audio synthesis: {e}")
            return None

        chunks['audio'] = generate_audio(chunks.script, docs.voice)
        audio_path = chunks.select(chunks.audio).tail(1)['audio'][0]

        results = chunks.select(
            chunks.content,
            chunks.script
        ).collect()

        display_data = [
            [f"Segment {idx + 1}", row['content'], row['script']]
            for idx, row in enumerate(results)
        ]

        progress(1.0, desc="Complete")
        return display_data, audio_path, "Processing complete"

    except Exception as e:
        return None, None, f"Error: {str(e)}"
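

# --- Usage sketch (assumption) ---
# The snippet above stops at the function definition. The Gradio wiring below is a
# minimal, hypothetical example of how process_document could be driven from a UI;
# the component names, dropdown choices, and default values are illustrative and
# not part of the original code.
with gr.Blocks() as demo:
    with gr.Row():
        pdf_input = gr.File(label="PDF document", file_types=[".pdf"])
        api_key_input = gr.Textbox(label="OpenAI API key", type="password")
    voice_input = gr.Dropdown(
        ["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
        value="alloy", label="Voice"
    )
    style_input = gr.Dropdown(
        ["conversational", "formal", "technical"],
        value="conversational", label="Style"
    )
    chunk_size_input = gr.Slider(100, 1000, value=300, step=50, label="Chunk size (tokens)")
    temperature_input = gr.Slider(0.0, 1.0, value=0.7, step=0.1, label="Temperature")
    max_tokens_input = gr.Slider(100, 1000, value=300, step=50, label="Max tokens")
    system_prompt_input = gr.Textbox(
        value="Summarize the content clearly for listeners.",
        label="System prompt"
    )
    run_button = gr.Button("Generate audio")

    # Outputs match process_document's return tuple: (display_data, audio_path, status).
    segments_output = gr.Dataframe(headers=["Segment", "Content", "Script"])
    audio_output = gr.Audio(label="Generated audio")
    status_output = gr.Textbox(label="Status")

    run_button.click(
        process_document,
        inputs=[pdf_input, api_key_input, voice_input, style_input,
                chunk_size_input, temperature_input, max_tokens_input,
                system_prompt_input],
        outputs=[segments_output, audio_output, status_output],
    )

if __name__ == "__main__":
    demo.launch()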