documentaitestv4

Build error

File size: 6,311 Bytes

import gradio as gr
import asyncio
from websockets import connect, Data, ClientConnection
from dotenv import load_dotenv
import json
import os 
import threading
import numpy as np
import base64
import soundfile as sf
import io
from pydub import AudioSegment
import time
import uuid

class LogColors:
    """ANSI escape sequences used to colorize console log lines."""

    # Informational messages (blue).
    OK = "\033[94m"
    # Successful operations (green).
    SUCCESS = "\033[92m"
    # Recoverable problems (yellow).
    WARNING = "\033[93m"
    # Failures (red).
    ERROR = "\033[91m"
    # Reset sequence; append after a colored segment.
    ENDC = "\033[0m"

# Merge variables from a local .env file (if present) into the environment
# before the API key is looked up.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail at startup rather than on the first WebSocket handshake.
    raise ValueError("OPENAI_API_KEY environment variable must be set")

# Endpoint used by every WebSocketClient created in this module.
WEBSOCKET_URI = "wss://api.openai.com/v1/realtime?intent=transcription"

# Headers sent with the WebSocket handshake.
WEBSOCKET_HEADERS = {
    "Authorization": f"Bearer {OPENAI_API_KEY}",
    "OpenAI-Beta": "realtime=v1",
}

# Custom stylesheet handed to gr.Blocks; intentionally (almost) empty.
css = """
"""

# Registry of live clients, keyed by the per-session client id.
connections = {}

class WebSocketClient:
    """Bridge between one UI session and an OpenAI realtime WebSocket.

    Audio chunks are handed over from other threads with
    ``enqueue_audio_chunk``; they are converted and sent by
    ``send_audio_chunks`` on the client's own event loop, while
    ``receive_messages`` accumulates transcription text in ``transcript``.
    """

    # Sample rate (Hz) every chunk is resampled to before being sent.
    TARGET_SAMPLE_RATE = 24000

    def __init__(self, uri: str, headers: dict, client_id: str):
        """Store connection parameters; no network activity happens here.

        Args:
            uri: WebSocket endpoint to connect to.
            headers: Extra HTTP headers for the WebSocket handshake.
            client_id: Key under which this client is stored in ``connections``.
        """
        self.uri = uri
        self.headers = headers
        self.websocket: ClientConnection = None
        # Bounded so a stalled connection drops audio instead of growing memory.
        self.queue = asyncio.Queue(maxsize=10)
        # Set by run(); None until the worker thread has started.
        self.loop = None
        self.client_id = client_id
        self.transcript = ""

    async def connect(self):
        """Open the socket, send the session settings, then pump both directions.

        Any exception is logged rather than propagated, so the thread running
        this coroutine terminates quietly.
        """
        try:
            self.websocket = await connect(self.uri, additional_headers=self.headers)
            print(f"{LogColors.SUCCESS}Connected to OpenAI WebSocket{LogColors.ENDC}\n")

            # Send session settings to OpenAI
            with open("openai_transcription_settings.json", "r", encoding="utf-8") as f:
                settings = f.read()
            await self.websocket.send(settings)

            await asyncio.gather(self.receive_messages(), self.send_audio_chunks())
        except Exception as e:
            print(f"{LogColors.ERROR}WebSocket Connection Error: {e}{LogColors.ENDC}")

    def run(self):
        """Create an event loop for the calling thread and run ``connect`` on it."""
        self.loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self.loop)
        try:
            self.loop.run_until_complete(self.connect())
        finally:
            # Release the loop's resources once the connection has ended.
            self.loop.close()

    def process_websocket_message(self, message: Data):
        """Log one server message and fold transcription events into ``transcript``.

        Malformed or unexpected frames are logged and skipped so that a single
        bad message does not tear down the whole connection.

        Args:
            message: Raw JSON frame received from the WebSocket.
        """
        try:
            message_object = json.loads(message)
        except (TypeError, ValueError) as e:
            print(f"{LogColors.ERROR}Error: could not decode message: {e}{LogColors.ENDC}")
            return

        message_type = message_object.get("type") if isinstance(message_object, dict) else None
        if message_type == "error":
            print(f"{LogColors.ERROR}Error: {message}{LogColors.ENDC}")
            return

        print(f"{LogColors.OK}Received message: {LogColors.ENDC} {message}")
        if message_type == "conversation.item.input_audio_transcription.delta":
            self.transcript += message_object.get("delta") or ""
        elif message_type == "conversation.item.input_audio_transcription.completed":
            # Separate consecutive utterances with exactly one space.
            if self.transcript and not self.transcript.endswith(" "):
                self.transcript += " "

    @staticmethod
    def _encode_audio_chunk(sample_rate, audio_array):
        """Convert one chunk to base64 raw 16-bit mono PCM at TARGET_SAMPLE_RATE.

        Returns None for an empty chunk (nothing to send).
        """
        audio_array = np.asarray(audio_array)
        if audio_array.size == 0:
            return None

        # Convert to mono if stereo
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # Convert to float32 and peak-normalize this chunk (silence is left as-is)
        audio_array = audio_array.astype(np.float32)
        peak = np.max(np.abs(audio_array))
        if peak > 0:
            audio_array /= peak

        # Convert to 16-bit PCM
        audio_array_int16 = (audio_array * 32767).astype(np.int16)

        # Round-trip through an in-memory WAV so pydub can resample it.
        audio_buffer = io.BytesIO()
        sf.write(audio_buffer, audio_array_int16, sample_rate, format='WAV', subtype='PCM_16')
        audio_buffer.seek(0)
        audio_segment = AudioSegment.from_file(audio_buffer, format="wav")
        resampled_audio = audio_segment.set_frame_rate(WebSocketClient.TARGET_SAMPLE_RATE)

        # Send the bare samples, not a WAV container: the API's supported
        # input formats are headerless, so a per-chunk WAV header would be
        # decoded as audio.
        # NOTE(review): assumes the session settings file selects pcm16 -
        # that file is not visible here; confirm.
        return base64.b64encode(resampled_audio.raw_data).decode("utf-8")

    async def send_audio_chunks(self):
        """Forever: take queued chunks, encode them, and append them to the input buffer."""
        while True:
            sample_rate, audio_array = await self.queue.get()
            if not self.websocket:
                continue

            base64_audio = self._encode_audio_chunk(sample_rate, audio_array)
            if base64_audio is None:
                # Empty chunk: nothing to transmit.
                continue

            await self.websocket.send(json.dumps({"type": "input_audio_buffer.append", "audio": base64_audio}))
            print(f"{LogColors.OK}Sent audio chunk{LogColors.ENDC}")

    async def receive_messages(self):
        """Process every incoming frame until the connection ends."""
        async for message in self.websocket:
            self.process_websocket_message(message)

    def enqueue_audio_chunk(self, sample_rate: int, chunk_array: np.ndarray):
        """Hand an audio chunk to the client's event loop from another thread.

        The chunk is dropped (with a warning) when the loop has not started
        yet, has already finished, or the queue is full.

        Args:
            sample_rate: Sample rate of ``chunk_array`` in Hz.
            chunk_array: Audio samples for this chunk.
        """
        loop = self.loop
        if loop is None or not loop.is_running():
            # run() has not started yet or the connection has already ended;
            # scheduling now would raise or leave a never-awaited coroutine.
            print(f"{LogColors.WARNING}Connection is not ready, dropping audio chunk{LogColors.ENDC}")
            return

        if self.queue.full():
            print(f"{LogColors.WARNING}Queue is full, dropping audio chunk{LogColors.ENDC}")
            return

        put_coro = self.queue.put((sample_rate, chunk_array))
        try:
            asyncio.run_coroutine_threadsafe(put_coro, loop)
        except RuntimeError:
            # The loop was closed between the check above and scheduling.
            put_coro.close()
            print(f"{LogColors.WARNING}Connection is not ready, dropping audio chunk{LogColors.ENDC}")

    async def close(self):
        """Close the socket (if open) and remove this client from ``connections``."""
        if self.websocket:
            await self.websocket.close()
        # Tolerate repeated calls and clients whose socket was never opened.
        connections.pop(self.client_id, None)
        print(f"{LogColors.WARNING}WebSocket connection closed{LogColors.ENDC}")


def send_audio_chunk(new_chunk: gr.Audio, client_id: str):
    """Stream handler: queue one audio chunk and return the current transcript.

    Args:
        new_chunk: ``(sample_rate, samples)`` pair from the streaming audio
            component, or ``None`` when the component has no audio to deliver.
        client_id: Session id returned by ``create_new_websocket_connection``.

    Returns:
        The transcript accumulated so far, or a placeholder message when no
        client is registered for ``client_id``.
    """
    # Single lookup: avoids a KeyError if the entry disappears between
    # a membership test and the later indexing.
    client = connections.get(client_id)
    if client is None:
        return "Connection is being established, please try again in a few seconds."

    # The audio component may deliver None; unpacking it would raise TypeError.
    if new_chunk is not None:
        sr, y = new_chunk
        client.enqueue_audio_chunk(sr, y)
    return client.transcript

def create_new_websocket_connection():
    """Register a new WebSocketClient and start it on a daemon thread.

    Returns:
        The freshly generated UUID string identifying the client in
        ``connections``.
    """
    client_id = str(uuid.uuid4())
    client = WebSocketClient(WEBSOCKET_URI, WEBSOCKET_HEADERS, client_id)
    connections[client_id] = client

    # Daemon thread so a lingering connection never blocks interpreter exit.
    worker = threading.Thread(target=client.run, daemon=True)
    worker.start()
    return client_id

def clear_transcript(client_id):
    """Reset the stored transcript for ``client_id`` and return an empty string.

    Unknown ids are ignored; the return value is always ``""``.
    """
    client = connections.get(client_id)
    if client is not None:
        client.transcript = ""
    return ""

if __name__ == "__main__":
    # Build the page: transcript on top, microphone input and a clear button below.
    with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Realtime transcription demo")

        with gr.Row(), gr.Column():
            output_textbox = gr.Textbox(
                label="Transcript",
                value="",
                lines=7,
                interactive=False,
                autoscroll=True,
            )

        with gr.Row():
            with gr.Column(scale=5):
                audio_input = gr.Audio(streaming=True, format="wav")
            with gr.Column():
                clear_button = gr.Button("Clear")

        # Per-session state holding the id returned by create_new_websocket_connection.
        client_id = gr.State()

        clear_button.click(
            fn=clear_transcript,
            inputs=[client_id],
            outputs=[output_textbox],
        )
        # Every 0.5 s of audio is forwarded and the textbox refreshed with the transcript.
        audio_input.stream(
            fn=send_audio_chunk,
            inputs=[audio_input, client_id],
            outputs=[output_textbox],
            stream_every=0.5,
            concurrency_limit=None,
        )
        # A new WebSocket client is created when the page loads.
        demo.load(fn=create_new_websocket_connection, outputs=[client_id])

    demo.launch()