import gradio as gr
import numpy as np
import torch
from pathlib import Path
from transformers import pipeline
from transformers.utils import logging
# Log
#logging.set_verbosity_debug()
logger = logging.get_logger("transformers")
# Pipelines
## Automatic Speech Recognition
## https://huggingface.co/docs/transformers/task_summary#automatic-speech-recognition
## Requires ffmpeg to be installed
asr_device = "cuda:0" if torch.cuda.is_available() else "cpu"
asr_torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
asr_model = "openai/whisper-tiny"
asr = pipeline(
    "automatic-speech-recognition",
    model=asr_model,
    torch_dtype=asr_torch_dtype,
    device=asr_device
)
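## A hedged usage sketch (not executed here): the pipeline also accepts a
## filepath directly; "sample.wav" is a hypothetical file, and decoding
## relies on ffmpeg being installed.
# transcript = asr("sample.wav")
# print(transcript["text"])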
## Token Classification / Named Entity Recognition
## https://huggingface.co/docs/transformers/task_summary#token-classification
tc_device = 0 if torch.cuda.is_available() else "cpu"
tc_model = "dslim/distilbert-NER"
tc = pipeline(
    "token-classification",  # ner
    model=tc_model,
    device=tc_device
)
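## A possible refinement (an assumption, not enabled here): passing
## aggregation_strategy="simple" merges subword tokens into whole entities;
## the output then uses the "entity_group" key instead of "entity".
# tc = pipeline("token-classification", model=tc_model, device=tc_device,
#               aggregation_strategy="simple")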
# ---
# Transformers
# https://www.gradio.app/main/docs/gradio/audio#behavior
# As input component (type="numpy", the default), gr.Audio passes a tuple of
# (sample rate in Hz, audio data as a numpy array); the signature below also
# accepts the other formats documented for the component:
# - a str or pathlib.Path filepath
# - or URL to an audio file,
# - or a bytes object (recommended for streaming),
# - or a tuple of (sample rate in Hz, audio data as numpy array)
def transcribe(audio: str | Path | bytes | tuple[int, np.ndarray] | None):
    if audio is None:
        return "..."
    logger.debug("====> Transcribe")
    # https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__
    # Whisper's input format for a tuple differs from the output provided by
    # the gradio audio component, so convert before calling the pipeline
    if asr_model.startswith("openai/whisper") and isinstance(audio, tuple):
        sampling_rate, raw = audio
        # Convert to mono if stereo
        if raw.ndim > 1:
            raw = raw.mean(axis=1)
        # The Whisper feature extractor expects float32 audio normalized to [-1, 1]
        raw = raw.astype(np.float32)
        peak = np.max(np.abs(raw))
        if peak > 0:  # Guard against division by zero on silent input
            raw /= peak
        inputs = {"sampling_rate": sampling_rate, "raw": raw}
    else:
        # str/Path filepaths and bytes can be passed to the pipeline as-is
        inputs = str(audio) if isinstance(audio, Path) else audio
    logger.debug(inputs)
    transcript = asr(inputs)
    text = transcript["text"]
    logger.debug("====> Tokenize:[" + text + "]")
    entities = tc(text)
    # logger.debug("Classify:[" + str(entities) + "]")
    # TODO Add Text Classification for sentiment analysis
    return {"text": text, "entities": entities}
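## A minimal sketch for the sentiment-analysis TODO above (an assumption, not
## wired into the interface; the model choice is illustrative):
## https://huggingface.co/docs/transformers/task_summary#text-classification
# sa = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english", device=tc_device)
# sentiment = sa(text)  # e.g. [{"label": "POSITIVE", "score": 0.99}]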
# ---
# Gradio
## Interfaces
# https://www.gradio.app/main/docs/gradio/audio
input_audio = gr.Audio(
    sources=["upload", "microphone"],
    show_share_button=False
)
## App
gradio_app = gr.Interface(
    transcribe,
    inputs=[
        input_audio
    ],
    outputs=[
        gr.HighlightedText()
    ],
    title="ASRNERSBX",
    description=(
        "Transcribe, Tokenize, Classify"
    ),
    flagging_mode="never"
)
## Start!
gradio_app.launch()