Commit e94ed44 · Init (initial commit, 0 parents)

Files changed:
- README.md (+48, -0)
- app.py (+160, -0)
- requirements.txt (+9, -0)
README.md
ADDED
@@ -0,0 +1,48 @@
---
title: AsianMOM
emoji: 💢
colorFrom: red
colorTo: red
sdk: gradio
sdk_version: 4.31.2
app_file: app.py
pinned: true
---

# AsianMOM 💢

**AsianMOM** is a fun, interactive Gradio Space that uses your webcam to observe what you're doing and then roasts you like a stereotypical Asian mom—complete with high expectations, cousin comparisons, and slipper threats!

## 🚀 Features

- **Live Webcam Feed**: Observes your actions in real time.
- **Vision Model**: Describes what it sees using BLIP image captioning.
- **Roast Generation**: Uses Meta's Llama-3.2-1B-Instruct to generate witty, culturally inspired "mom roasts".
- **Text-to-Speech**: Delivers the roast in a mature, motherly voice using Parler-TTS.
- **Fully Automated**: No button presses needed—just let AsianMOM do her thing!

## 🛠️ How It Works

1. **Webcam Capture**: The app streams your webcam feed.
2. **Image Captioning**: The BLIP model generates a description of what you're doing.
3. **Roast Generation**: Llama-3.2-1B-Instruct crafts a humorous, mom-style roast based on the caption.
4. **Voice Output**: Parler-TTS reads the roast aloud in a fitting voice (the chain is sketched below).
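In code, the whole chain reduces to three calls into the functions defined in `app.py` below. This is a condensed sketch of the per-frame flow without the Gradio wiring; `frame` stands for a single captured webcam image:

```python
# One pass of the observe -> roast -> speak pipeline (see app.py).
vision = initialize_vision_model()
llm = initialize_llm()
tts = initialize_tts_model()

caption = analyze_image(frame, vision)    # e.g. "a man sitting at a desk"
roast = generate_roast(caption, llm)      # short mom-style roast text
rate, audio = text_to_speech(roast, tts)  # (sampling rate, waveform array)
```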
## 📦 Setup & Usage

1. **Clone or fork this Space.**
2. Make sure the Space runs on GPU hardware (T4 or better recommended).
3. All dependencies are managed via `requirements.txt`.
4. Launch the Space and allow webcam access.
5. Enjoy being roasted by AsianMOM!

Note: Llama-3.2-1B-Instruct is a gated model, so you may need to accept Meta's license on Hugging Face and provide an access token to the Space.

## 🧩 Models Used

- [BLIP Image Captioning](https://huggingface.co/Salesforce/blip-image-captioning-base)
- [Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)
- [Parler-TTS Mini Expresso](https://huggingface.co/parler-tts/parler-tts-mini-expresso)

## 🙏 Credits

- Inspired by classic Asian mom humor and memes
- Built with [Gradio](https://gradio.app/)
- Powered by Hugging Face models

## ⚠️ Disclaimer

This app is for entertainment purposes only. Stereotypes are used in a lighthearted, humorous way—please use responsibly and respectfully.
app.py
ADDED
@@ -0,0 +1,160 @@
import os
import time

import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BlipForConditionalGeneration,
    BlipProcessor,
)

# Parler-TTS ships its own model class; the generic transformers
# "text-to-speech" pipeline does not support this architecture.
from parler_tts import ParlerTTSForConditionalGeneration

# Silence the fork warning from the Hugging Face tokenizers library.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def initialize_vision_model():
    # BLIP base for image captioning: lightweight but effective.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

    return {
        "processor": processor,
        "model": model,
    }


def analyze_image(image, vision_components):
    processor = vision_components["processor"]
    model = vision_components["model"]

    # Gradio delivers webcam frames as numpy arrays; BLIP's processor wants PIL.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    inputs = processor(image, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=30)

    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption
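def _demo_caption_check():
    # Editor's sketch for testing the captioning step in isolation (run it
    # manually; nothing calls it). "photo.jpg" is a placeholder path, not a
    # file shipped with this Space.
    components = initialize_vision_model()
    print(analyze_image(Image.open("photo.jpg"), components))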
def initialize_llm():
    # meta-llama checkpoints are gated: downloading this model requires Meta's
    # license to be accepted and a Hugging Face token to be available.
    model_id = "meta-llama/Llama-3.2-1B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    return {
        "model": model,
        "tokenizer": tokenizer,
    }
def generate_roast(caption, llm_components):
    model = llm_components["model"]
    tokenizer = llm_components["tokenizer"]

    system_prompt = (
        "You are AsianMOM, a stereotypical Asian mother who always has high expectations. "
        "Respond with a short, humorous roast (maximum 2-3 sentences) in the style of a "
        "stereotypical Asian mother. Include at least one of these elements:\n"
        "- Comparison to more successful relatives/cousins\n"
        "- High expectations about academic success\n"
        "- Mild threats about using slippers\n"
        "- Questioning life choices\n"
        "- Asking when they'll get married or have kids\n"
        "- Commenting on appearance\n"
        '- Saying "back in my day" and describing hardship\n\n'
        "Be funny but not hurtful. Keep it brief."
    )
    user_prompt = f'You just observed your child doing this: "{caption}"'

    # Llama 3.x uses its own chat format rather than Llama 2's [INST] tags, so
    # build the prompt with the tokenizer's chat template.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=120,  # max_length would count the prompt tokens too
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )

    # Decode only the newly generated tokens, not the echoed prompt.
    response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response.strip()
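def _demo_roast_check():
    # Editor's sketch for exercising the roast generator by itself (run it
    # manually; nothing calls it). The caption is an illustrative example of
    # what BLIP might emit.
    llm = initialize_llm()
    print(generate_roast("a man eating instant noodles at his desk", llm))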
def initialize_tts_model():
    # The generic transformers "text-to-speech" pipeline does not support the
    # Parler-TTS architecture, so load its dedicated model class directly.
    model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-expresso")
    tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-expresso")
    return {"model": model, "tokenizer": tokenizer}


def text_to_speech(text, tts_components):
    model = tts_components["model"]
    tokenizer = tts_components["tokenizer"]

    # Parler-TTS steers the voice with a free-text description rather than
    # inline control tags.
    description = "A mature female speaker delivers her words in a stern, slightly annoyed tone with very clear audio."

    input_ids = tokenizer(description, return_tensors="pt").input_ids
    prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids

    with torch.no_grad():
        generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)

    audio = generation.cpu().numpy().squeeze()
    return (model.config.sampling_rate, audio)
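def _demo_tts_check():
    # Editor's sketch for auditioning the voice outside Gradio (run it
    # manually; nothing calls it). Assumes the soundfile package is
    # installed; it is not listed in requirements.txt.
    import soundfile as sf

    rate, wav = text_to_speech("Aiyah, why are you not a doctor yet?", initialize_tts_model())
    sf.write("roast.wav", wav, rate)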
def process_frame(image, vision_components, llm_components, tts_components):
    # Step 1: Describe what's in the frame.
    caption = analyze_image(image, vision_components)

    # Step 2: Generate a roast based on the caption.
    roast = generate_roast(caption, llm_components)

    # Step 3: Convert the roast to speech.
    audio = text_to_speech(roast, tts_components)

    return caption, roast, audio
def setup_processing_chain(video_feed, analysis_output, roast_output, audio_output):
    # Initialize all models once, up front.
    vision_components = initialize_vision_model()
    llm_components = initialize_llm()
    tts_components = initialize_tts_model()

    last_process_time = time.time() - 10  # offset so the first frame is processed
    processing_interval = 5  # process at most one frame every 5 seconds

    def process_webcam(image):
        nonlocal last_process_time

        current_time = time.time()
        if current_time - last_process_time >= processing_interval and image is not None:
            last_process_time = current_time
            caption, roast, audio = process_frame(
                image,
                vision_components,
                llm_components,
                tts_components,
            )
            return caption, roast, audio

        # Between processing windows, leave the outputs as they are
        # (returning None would clear them).
        return gr.update(), gr.update(), gr.update()

    # With a streaming webcam image, Gradio 4.x fires the `stream` event per
    # frame; `change` is meant for discrete value updates. The frame is not
    # echoed back into the webcam component, which avoids a feedback loop.
    video_feed.stream(
        process_webcam,
        inputs=[video_feed],
        outputs=[analysis_output, roast_output, audio_output],
    )
def create_app():
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown("# AsianMOM: Asian Mother Observer & Mocker")
        gr.Markdown("### The camera captures what you're doing and your Asian mom responds appropriately")

        with gr.Row():
            with gr.Column():
                # Gradio 4.x replaced gr.Image's `source` argument with `sources`.
                video_feed = gr.Image(sources=["webcam"], streaming=True, label="Camera Feed")

            with gr.Column():
                analysis_output = gr.Textbox(label="What AsianMOM Sees", lines=2)
                roast_output = gr.Textbox(label="AsianMOM's Thoughts", lines=4)
                audio_output = gr.Audio(label="AsianMOM Says", autoplay=True)

        # Wire the webcam stream into the caption -> roast -> speech chain.
        setup_processing_chain(video_feed, analysis_output, roast_output, audio_output)

    return app


if __name__ == "__main__":
    app = create_app()
    app.launch()
requirements.txt
ADDED
@@ -0,0 +1,9 @@
gradio==4.31.2  # matches sdk_version in README.md
torch==2.1.0
torchvision
transformers>=4.45.0  # Llama 3.2 support landed in transformers 4.45
pillow
numpy
accelerate
git+https://github.com/huggingface/parler-tts.git  # provides ParlerTTSForConditionalGeneration