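# "aisatsu api" -- a Gradio Space that detects the closest person in an image
# with YOLOv8 and, when that person fills enough of the frame, returns a
# synthesized Japanese greeting ("aisatsu" means "greeting").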
import gradio as gr

from ultralyticsplus import YOLO
# read_image_as_pil is called below but was never imported in the original
# file; it is assumed here to come from sahi, a dependency of ultralyticsplus.
from sahi.utils.cv import read_image_as_pil
from utils import tts
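
# `tts` comes from the Space's local utils module, which is not shown in this
# file. A minimal sketch of what it plausibly does, given the gtts/BytesIO/
# base64 imports in the original file, is below; the real utils.tts may differ.
#
#   from io import BytesIO
#   import base64
#   from gtts import gTTS
#
#   def tts(text: str, language: str = "ja") -> str:
#       fp = BytesIO()
#       gTTS(text=text, lang=language).write_to_fp(fp)
#       return base64.b64encode(fp.getvalue()).decode("utf-8")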

# Person detector; class names come from the underlying YOLOv8 model.
model = YOLO('ultralyticsplus/yolov8s')
CLASS = model.model.names
default_bot_voice = "おはようございます"  # "Good morning" in Japanese
area_thres = 0.3  # minimum fraction of the frame a person box must cover to trigger the greeting

def infer(image):
    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    boxes = results.boxes
    area_image = image.width * image.height
    voice_bot = None
    most_close = 0
    out_img = None
    diff_value = 0.5  # placeholder similarity score; always passes the check below
    if boxes is not None:
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            # Class 0 is "person" in the COCO label set; ignore everything else.
            if int(cls) != 0:
                continue
            box = xyxy.tolist()
            # Fraction of the frame covered by this detection.
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            # Keep the crop of the largest (i.e. closest) person seen so far.
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((128, 128))
                most_close = area_rate
    print(most_close, diff_value)
    # Greet only when the closest person fills enough of the frame.
    if most_close >= area_thres and diff_value >= 0.5:
        voice_bot = tts(default_bot_voice, language="ja")
    return voice_bot, out_img
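
# Quick local sanity check (hypothetical path; any test image works, since
# model.predict and read_image_as_pil both accept file paths):
#   voice, crop = infer("person.jpg")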
    
# Example inputs for the demo; the image path below is a placeholder and
# should point to a sample image shipped with the Space.
examples = [["sample.jpg"]]

# `infer` returns (voice, image), so outputs must be listed in that order.
iface = gr.Interface(
    fn=infer,
    title="aisatsu api",
    inputs=[gr.Image(label="image", type="pil", shape=(960, 640))],
    outputs=[gr.Textbox(label="output voice"), gr.Image(label="output image")],
    cache_examples=True,
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>.",
    examples=examples,
).launch(enable_queue=True)