# app.py: Gradio demo that drives the Gemini Live API for text, camera, and
# screen input and plays back the spoken response
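# Dependencies (assumed pip package names): gradio, google-genai, opencv-python,
# mss, pyaudio, pillow, numpy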
import gradio as gr
import asyncio
import base64
import io
import cv2
import numpy as np
import pyaudio
import PIL.Image
import mss
from google import genai
from google.genai import types
# Configuration (FORMAT, CHANNELS, SEND_SAMPLE_RATE, and CHUNK_SIZE are reserved
# for microphone streaming and are not used by the request/response flow below)
FORMAT = pyaudio.paInt16
CHANNELS = 1
SEND_SAMPLE_RATE = 16000
RECEIVE_SAMPLE_RATE = 24000  # the Live API returns 24 kHz, 16-bit mono PCM audio
CHUNK_SIZE = 1024
MODEL = "models/gemini-2.0-flash-exp"
class GeminiTTS:
def __init__(self, api_key):
self.client = genai.Client(http_options={"api_version": "v1alpha"}, api_key=api_key)
        self.pya = pyaudio.PyAudio()
        # Queues and stream handle are reserved for a full bidirectional
        # streaming setup; only self.session is used by this demo
        self.audio_in_queue = asyncio.Queue()
        self.out_queue = asyncio.Queue(maxsize=5)
        self.session = None
        self.audio_stream = None
self.config = types.LiveConnectConfig(
response_modalities=["audio"],
speech_config=types.SpeechConfig(
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Puck")
)
),
system_instruction=types.Content(
                parts=[types.Part.from_text(text="Repeat back exactly what the user says, with no extra words or explanation.")],
role="user"
),
)
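        # Note: voice_name above accepts other prebuilt voices; at the time of
        # writing, "Charon", "Kore", "Fenrir", and "Aoede" are also available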
    async def _get_frame(self, cap):
        # Grab a single webcam frame (cap.read() blocks, which is fine for a one-shot demo)
        ret, frame = cap.read()
        if not ret:
            return None
        # OpenCV captures BGR; convert to RGB for PIL, then downscale before upload
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = PIL.Image.fromarray(frame_rgb)
        img.thumbnail([1024, 1024])
image_io = io.BytesIO()
img.save(image_io, format="jpeg")
image_io.seek(0)
return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
    async def _get_screen(self):
        # Capture the full virtual screen; monitors[0] spans every attached display.
        # Use mss as a context manager so the capture resources are released.
        with mss.mss() as sct:
            monitor = sct.monitors[0]
            shot = sct.grab(monitor)
        img = PIL.Image.open(io.BytesIO(mss.tools.to_png(shot.rgb, shot.size)))
        image_io = io.BytesIO()
        img.save(image_io, format="jpeg")
        image_io.seek(0)
        return {"mime_type": "image/jpeg", "data": base64.b64encode(image_io.read()).decode()}
    async def process_input(self, text=None, mode="text"):
        """Open a Live session, send one input, and return PCM audio bytes or text."""
        try:
            async with self.client.aio.live.connect(model=MODEL, config=self.config) as session:
                self.session = session
                if mode == "text" and text:
                    await session.send(input=text, end_of_turn=True)
                elif mode == "camera":
                    cap = cv2.VideoCapture(0)
                    frame = await self._get_frame(cap)
                    cap.release()
                    if frame:
                        # end_of_turn=True signals the model to respond to the image now
                        await session.send(input=frame, end_of_turn=True)
                elif mode == "screen":
                    frame = await self._get_screen()
                    if frame:
                        await session.send(input=frame, end_of_turn=True)
                # Audio arrives as a stream of PCM chunks; accumulate the whole
                # turn instead of returning only the first chunk
                audio = bytearray()
                text_parts = []
                turn = session.receive()
                async for response in turn:
                    if response.data:
                        audio.extend(response.data)
                    if response.text:
                        text_parts.append(response.text)
                if audio:
                    return bytes(audio)
                if text_parts:
                    return "".join(text_parts)
                return "No response received"
        except Exception as e:
            return f"Error: {e}"
def create_gradio_interface():
    tts_handler = None
    def init_tts(api_key):
        nonlocal tts_handler
        if not api_key:
            raise gr.Error("Please enter a Gemini API key")
        tts_handler = GeminiTTS(api_key)
        return "Gemini TTS Initialized!"
    async def generate_response(text, mode):
        if not tts_handler:
            raise gr.Error("Please initialize the TTS system first with your API key")
        result = await tts_handler.process_input(text, mode)
        if isinstance(result, bytes):
            # Audio response: raw 16-bit PCM at 24 kHz. gr.Audio expects a
            # (sample_rate, numpy array) tuple, not raw bytes
            return (RECEIVE_SAMPLE_RATE, np.frombuffer(result, dtype=np.int16))
        # A string here is either a text reply or an error message; the Audio
        # output component cannot render it, so surface it to the user instead
        raise gr.Error(result)
    with gr.Blocks(title="Gemini TTS Interface") as demo:
        gr.Markdown("# 🎀 Gemini Text-to-Speech Interface")
        with gr.Row():
            api_key = gr.Textbox(label="Gemini API Key", type="password")
            init_btn = gr.Button("Initialize TTS")
        init_output = gr.Textbox(label="Initialization Status", interactive=False)
        init_btn.click(init_tts, inputs=api_key, outputs=init_output)
        with gr.Tab("Text Input"):
            with gr.Row():
                text_input = gr.Textbox(label="Enter Text", lines=3)
                text_btn = gr.Button("Generate Speech")
            text_output = gr.Audio(label="Generated Speech")
            # gr.State carries the fixed mode string; cleaner than hidden Textboxes
            text_btn.click(generate_response, inputs=[text_input, gr.State("text")], outputs=text_output)
        with gr.Tab("Camera Input"):
            camera_btn = gr.Button("Capture and Process")
            camera_output = gr.Audio(label="Generated Speech from Camera")
            camera_btn.click(generate_response, inputs=[gr.State(""), gr.State("camera")], outputs=camera_output)
        with gr.Tab("Screen Capture"):
            screen_btn = gr.Button("Capture Screen and Process")
            screen_output = gr.Audio(label="Generated Speech from Screen")
            screen_btn.click(generate_response, inputs=[gr.State(""), gr.State("screen")], outputs=screen_output)
return demo
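# Minimal sketch of driving GeminiTTS without the UI (assumes a valid key in a
# GEMINI_API_KEY environment variable; adjust to taste):
#
#   import os
#   tts = GeminiTTS(os.environ["GEMINI_API_KEY"])
#   pcm = asyncio.run(tts.process_input("Hello there", mode="text"))
#   open("reply.wav", "wb").write(tts.pcm_to_wav_bytes(pcm))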
if __name__ == "__main__":
demo = create_gradio_interface()
demo.launch()