Commit: added examples

app.py CHANGED
@@ -7,12 +7,16 @@ from moviepy import VideoFileClip
 from transformers import pipeline
 import torchaudio
 from speechbrain.pretrained.interfaces import foreign_class
+from transformers import WhisperForConditionalGeneration, WhisperProcessor
 
 # Load Whisper model to confirm English
 whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device="cpu")
 
 classifier = foreign_class(source="Jzuluaga/accent-id-commonaccent_xlsr-en-english", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")
 
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
+processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
+
 
 ACCENT_LABELS = {
     "us": "American Accent",
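A note on this hunk: `pipeline("automatic-speech-recognition", model="openai/whisper-tiny")` already builds its own model and processor internally, so the added `from_pretrained` calls load whisper-tiny a second time. A minimal sketch of one way to share a single copy, using standard `transformers.pipeline` keyword arguments (this refactor is a suggestion, not part of the commit):

```python
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline

# Load whisper-tiny once...
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")

# ...and pass the same objects to the pipeline instead of the checkpoint
# name, so it does not instantiate a second copy of the weights.
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device="cpu",
)
```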
@@ -69,10 +73,26 @@ def extract_audio(video_path):
     clip.audio.write_audiofile(audio_path, codec='pcm_s16le')
     return audio_path
 
+
+def detect_language(audio_path):
+    audio, sr = torchaudio.load(audio_path)
+    inputs = processor(audio[0], sampling_rate=sr, return_tensors="pt")
+    logits = model.forward(**inputs).logits
+    predicted_ids = torch.argmax(logits, dim=-1)
+    decoded = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
+    return decoded[0]  # crude approximation
+
+
 def transcribe(audio_path):
     result = whisper_pipe(audio_path, return_language=True)
     print(result)
-
+
+    lang = result['chunks'][0]['language']
+
+    if lang == None:
+        lang = detect_language(audio_path)
+
+    return result['text'], lang
 
 def analyze_accent(url_or_file):
     try:
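A few caveats on this hunk. `detect_language` calls `torch.argmax`, but the file as shown never imports `torch`; `WhisperForConditionalGeneration.forward` requires decoder inputs, so calling it with only the audio features will raise; and `torchaudio.load` returns the file's native sample rate, while Whisper's feature extractor expects 16 kHz. Finally, `lang == None` is more idiomatically written `lang is None`. Below is a sketch of a variant that addresses these points by reading Whisper's language token off the first decoding step; it assumes a recent `transformers` where `generate` predicts the `<|xx|>` language token right after `<|startoftranscript|>` when no language is forced, so verify against the pinned version:

```python
import torch
import torchaudio

def detect_language(audio_path):
    # Whisper is trained on 16 kHz mono audio; resample if needed.
    audio, sr = torchaudio.load(audio_path)
    if sr != 16000:
        audio = torchaudio.functional.resample(audio, sr, 16000)
    inputs = processor(audio[0], sampling_rate=16000, return_tensors="pt")
    # One decoding step is enough: Whisper emits a language token such as
    # <|en|> immediately after <|startoftranscript|>.
    with torch.no_grad():
        ids = model.generate(inputs.input_features, max_new_tokens=1)
    lang_token = processor.tokenizer.decode(ids[0, 1])  # e.g. "<|en|>"
    return lang_token.strip("<|>")  # e.g. "en"
```

With a helper like this, `transcribe` can keep the same fallback shape as the commit, guarded as `if lang is None: lang = detect_language(audio_path)`.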
|