File size: 3,834 Bytes
88bd911
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8ecbb8
88bd911
8bac54d
88bd911
 
 
 
 
 
 
c3eca6a
88bd911
 
c3eca6a
88bd911
 
 
 
 
c3eca6a
 
88bd911
 
 
 
 
 
 
 
 
 
feb2aaf
 
 
 
 
 
 
88bd911
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8bac54d
88bd911
 
 
feb2aaf
 
88bd911
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import asyncio
import base64
import tempfile
from io import BytesIO

import fal_client
import gradio as gr
import httpx
from PIL import Image

# System prompt sent with every vision-model request. It fixes the model's
# persona (warm, 2-3 sentence compliments) and scope; the per-request language
# instruction is appended to this string at call time in process_image().
# NOTE: this text is part of runtime behavior — edits here change model output.
SYSTEM_PROMPT = """
You are a kind assistant that compliments people's appearances and acknowledges their hard work for their families or the work they do as a part of the Ultratech Cement Family (if they are a man).
Look at the image and create a short, sincere compliment about:
1. Something positive about how they look (facial features, smile, eyes, etc.)
2. Acknowledge the hard work they likely do for their family
Be warm, genuine, and uplifting without being overly flowery or artificial.
Keep the response to 2-3 sentences maximum. Do not say anything apart from the compliment.
"""

async def process_image(image, language):
    """Generate a spoken compliment for the person in *image*.

    Pipeline: upload the image to FAL storage, ask a vision LLM for a short
    compliment in *language*, synthesize it with a language-matched TTS
    model, and download the resulting audio to a local file.

    Args:
        image: Local filesystem path to the uploaded photo (Gradio
            ``type="filepath"``).
        language: Either "English" or "Hindi"; selects both the prompt
            language and the TTS voice.

    Returns:
        (compliment_text, audio_path) on success; (compliment_text, None)
        if the audio download fails; (error_message, None) on any exception.
    """
    # Upload the image to FAL storage and get back a URL the model can fetch.
    # Use the async variant so we don't block the event loop on the upload.
    img_url = await fal_client.upload_file_async(image)

    # Per-request instruction for the vision model (constant text; the
    # SYSTEM_PROMPT carries the detailed persona and constraints).
    prompt = (
        "Please look at this person and give them a sincere compliment about "
        "their appearance and acknowledge their hard work for their family "
        "and work for ultratech cement (If they are a man)."
    )

    try:
        # Vision LLM call: image + prompt -> compliment text.
        result = await fal_client.async_client.run(
            "fal-ai/any-llm-open",
            arguments={
                "prompt": prompt,
                "system_prompt": SYSTEM_PROMPT + f"Make sure to only respond in the {language} language.",
                "model": "google/gemini-2.0-flash-001",
                "image_url": img_url
            }
        )

        compliment_text = result["output"]

        # Pick a TTS model/voice matching the requested output language.
        if language == "Hindi":
            tts_model = "fal-ai/kokoro/hindi"
            voice = "hf_alpha"
        else:
            tts_model = "fal-ai/kokoro/american-english"
            voice = "af_heart"

        # Text-to-speech call: compliment text -> hosted audio URL.
        tts_result = await fal_client.async_client.run(
            tts_model,
            arguments={
                "prompt": compliment_text,
                "voice": voice,
            }
        )

        audio_url = tts_result["audio"]["url"]

        # Download the synthesized audio so Gradio can serve it locally.
        async with httpx.AsyncClient() as client:
            response = await client.get(audio_url)
            if response.status_code == 200:
                # Write to a unique temp file: the previous fixed name
                # ("temp_audio.wav") raced/overwrote across concurrent
                # requests. delete=False because Gradio reads the path later.
                with tempfile.NamedTemporaryFile(
                    suffix=".wav", delete=False
                ) as f:
                    f.write(response.content)
                return compliment_text, f.name
            # Download failed: still return the text so the user sees it.
            return compliment_text, None

    except Exception as e:
        # Boundary handler: surface the error in the UI instead of crashing
        # the Gradio worker. (Intentionally broad — this is the top level of
        # the request pipeline.)
        return f"Error: {str(e)}", None

def process_image_sync(image, language):
    """Blocking adapter so Gradio's click handler can drive the async pipeline.

    Spins up an event loop, runs process_image(image, language) to
    completion, and returns its (text, audio_path) result unchanged.
    """
    coroutine = process_image(image, language)
    return asyncio.run(coroutine)

# Gradio UI definition. NOTE: component creation order inside the context
# managers determines the rendered layout — do not reorder these statements.
with gr.Blocks() as demo:
    gr.Markdown("# Face Reader")
    gr.Markdown("Upload a photo of someone, and the app will generate a prediction about them using AI")
    
    with gr.Row():
        with gr.Column():
            # Input components: photo path is handed straight to
            # process_image (type="filepath"), language picks the TTS voice.
            image_input = gr.Image(type="filepath", label="Upload Photo")
            language_selector = gr.Radio(["English", "Hindi"], label="Output Language", value="English")
            submit_button = gr.Button("Generate Prediction")
        
        with gr.Column():
            # Output components: compliment text plus the downloaded audio
            # file path returned by the pipeline.
            text_output = gr.Textbox(label="AI Response")
            audio_output = gr.Audio(label="AI Prediction", type="filepath")
    
    # Wire the button to the sync wrapper; its (text, audio_path) tuple maps
    # positionally onto the two output components.
    submit_button.click(
        fn=process_image_sync,
        inputs=[image_input, language_selector],
        outputs=[text_output, audio_output]
    )

# Launch the app only when run as a script (not when imported as a module).
if __name__ == "__main__":
    demo.launch()