# AI-Teller / app.py
# NOTE(review): the following metadata was Hugging Face Spaces file-viewer
# residue pasted into the source ("Warlord-K's picture / Update app.py /
# f8ecbb8 verified") — kept here as a comment so the file parses.
# Standard library
import asyncio
import base64
import tempfile
from io import BytesIO

# Third-party
import fal_client
import gradio as gr
import httpx
from PIL import Image
# System prompt for the vision LLM. It fixes the tone and length of the
# compliment; the required output language is appended per request in
# process_image, so this constant stays language-agnostic.
SYSTEM_PROMPT = """
You are a kind assistant that compliments people's appearances and acknowledges their hard work for their families or the work they do as a part of the Ultratech Cement Family (if they are a man).
Look at the image and create a short, sincere compliment about:
1. Something positive about how they look (facial features, smile, eyes, etc.)
2. Acknowledge the hard work they likely do for their family
Be warm, genuine, and uplifting without being overly flowery or artificial.
Keep the response to 2-3 sentences maximum. Do not say anything apart from the compliment.
"""
async def process_image(image, language):
    """Generate a spoken compliment for the person in *image*.

    Pipeline: upload the photo to fal.ai -> vision LLM writes a short
    compliment in *language* -> Kokoro TTS speaks it -> the audio is
    downloaded to a local temp file for Gradio to play.

    Parameters
    ----------
    image : str
        Filepath of the uploaded photo (Gradio ``type="filepath"``).
    language : str
        "English" or "Hindi"; selects both the LLM output language and
        the TTS model/voice.

    Returns
    -------
    tuple[str, str | None]
        ``(compliment_text, audio_path)`` on success; ``(compliment_text,
        None)`` when the audio download fails; ``(error_message, None)``
        when any API call raises.
    """
    # Upload so the vision model can fetch the image by URL.
    # NOTE(review): upload_file is a blocking call inside an async function —
    # acceptable for a single-user demo, but it stalls the event loop.
    img_url = fal_client.upload_file(image)

    # Prompt for the vision model (runtime string unchanged; dropped the
    # needless f-prefix — the original had no placeholders).
    prompt = "Please look at this person and give them a sincere compliment about their appearance and acknowledge their hard work for their family and work for ultratech cement (If they are a man)."

    try:
        # Vision LLM: image -> compliment text in the requested language.
        result = await fal_client.async_client.run(
            "fal-ai/any-llm-open",
            arguments={
                "prompt": prompt,
                "system_prompt": SYSTEM_PROMPT + f"Make sure to only respond in the {language} language.",
                "model": "google/gemini-2.0-flash-001",
                "image_url": img_url,
            },
        )
        compliment_text = result["output"]

        # Select the TTS model and voice for the output language.
        if language == "Hindi":
            tts_model, voice = "fal-ai/kokoro/hindi", "hf_alpha"
        else:
            tts_model, voice = "fal-ai/kokoro/american-english", "af_heart"

        # TTS: compliment text -> hosted audio URL.
        tts_result = await fal_client.async_client.run(
            tts_model,
            arguments={
                "prompt": compliment_text,
                "voice": voice,
            },
        )
        audio_url = tts_result["audio"]["url"]

        # Download the audio so Gradio can serve it from a local path.
        async with httpx.AsyncClient() as client:
            response = await client.get(audio_url)
        if response.status_code != 200:
            # Preserve original behavior: still show the text, no audio.
            return compliment_text, None

        # BUG FIX: the original wrote to a fixed "temp_audio.wav", which
        # concurrent Gradio requests clobber; use a unique temp file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(response.content)
        return compliment_text, f.name
    except Exception as e:
        # Boundary handler: surface the failure in the UI rather than crash.
        return f"Error: {str(e)}", None
def process_image_sync(image, language):
    """Blocking bridge so Gradio's synchronous callback can drive the
    async fal.ai pipeline; returns whatever process_image returns."""
    coro = process_image(image, language)
    return asyncio.run(coro)
# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Face Reader")
    gr.Markdown("Upload a photo of someone, and the app will generate a prediction about them using AI")

    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            photo = gr.Image(type="filepath", label="Upload Photo")
            lang = gr.Radio(["English", "Hindi"], label="Output Language", value="English")
            go_btn = gr.Button("Generate Prediction")
        # Right column: outputs.
        with gr.Column():
            reply_box = gr.Textbox(label="AI Response")
            speech_out = gr.Audio(label="AI Prediction", type="filepath")

    # Wire the button through the sync bridge around the async pipeline.
    go_btn.click(
        fn=process_image_sync,
        inputs=[photo, lang],
        outputs=[reply_box, speech_out],
    )

# Launch only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()