import gradio as gr
from PIL import Image
from ultralyticsplus import YOLO
from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist
# YOLOv8s detection model from the Hugging Face Hub; class id 0 is "person"
model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names
default_bot_voice = "おはようございます"  # "Good morning" in Japanese
area_thres = 0.3  # minimum box-to-image area ratio before the bot greets
def infer(image):
    """Find the closest person in the frame and, if close enough, return a greeting."""
    # gr.Image(type="pil") already delivers a PIL.Image, so no extra conversion is needed
    results = model.predict(image, show=False)[0]
    boxes = results.boxes
    area_image = image.width * image.height
    voice_bot = None
    most_close = 0
    out_img = None
    diff_value = 0.5  # placeholder; see the get_hist sketch at the end of the file
    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            # only person detections matter for the greeting
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # fraction of the frame covered by this person's bounding box
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((128, 128))
                most_close = area_rate
    print(most_close, diff_value)
    # greet only when the closest person fills enough of the frame
    if most_close >= area_thres and diff_value >= 0.5:
        voice_bot = tts(default_bot_voice, language="ja")
    return out_img, voice_bot
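# Quick local smoke test of infer (hypothetical, not part of the original app;
# "person.jpg" is a placeholder path):
#
#     crop, voice = infer(Image.open("person.jpg"))
#     print(voice is not None, crop)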
# NOTE: the original file passes an `examples` list that is never defined;
# the entry below is a hypothetical placeholder and should point at real sample images.
examples = [["sample.jpg"]]

iface = gr.Interface(
    fn=infer,
    title="aisatsu api",
    inputs=[gr.Image(label="image", type="pil", shape=(960, 640))],
    outputs=[gr.Image(label="output image"), gr.Textbox(label="output voice")],
    cache_examples=True,
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
    examples=examples,
)
iface.launch(enable_queue=True)
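# ---------------------------------------------------------------------------
# The helpers imported from utils are defined in a separate module not shown
# here. Below is a minimal sketch of what tts and get_hist might look like,
# inferred from the libraries the original file imported (gTTS, BytesIO,
# base64, cv2, numpy); the real utils module may differ.
#
#     from gtts import gTTS
#     from io import BytesIO
#     import base64
#     import cv2
#     import numpy as np
#
#     def tts(text, language="ja"):
#         """Synthesize text to MP3 with gTTS and return it base64-encoded."""
#         buf = BytesIO()
#         gTTS(text=text, lang=language).write_to_fp(buf)
#         return base64.b64encode(buf.getvalue()).decode("utf-8")
#
#     def get_hist(image):
#         """Normalized grayscale histogram, e.g. for comparing consecutive frames."""
#         gray = np.asarray(image.convert("L"))
#         hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
#         return cv2.normalize(hist, hist).flatten()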