Spaces:
Sleeping
Sleeping
File size: 4,077 Bytes
9a3e0af 93224e1 9a3e0af 93224e1 9a3e0af 93224e1 9a3e0af 4778447 9a3e0af f8a135a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import gradio as gr
import torch
import tempfile
import os
import requests
from moviepy import VideoFileClip
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, Wav2Vec2Processor, Wav2Vec2Model
import torchaudio
# Load Whisper model to confirm English
# NOTE(review): whisper-tiny on CPU is the lightest/fastest checkpoint; it is
# used below only to produce a transcript (and implicitly sanity-check that
# speech was understood) — confirm its accuracy is acceptable before shipping.
whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device="cpu")
# Placeholder accent classifier (replace with real one or your own logic)
def classify_accent(audio_tensor, sample_rate):
    """Return a stub accent-classification result.

    Placeholder for demonstration: a real implementation would feed
    ``audio_tensor`` (at ``sample_rate``) through wav2vec2 embeddings or a
    fine-tuned classifier. Both arguments are currently ignored and a
    fixed answer is returned.
    """
    summary = "The speaker uses rhotic pronunciation and North American intonation."
    return {
        "accent": "American",
        "confidence": 87.2,
        "summary": summary,
    }
def download_video(url):
    """Download the video at *url* into a temporary .mp4 file.

    Streams the response in 1 MiB chunks so large files are never held
    fully in memory.  Returns the temp-file path; the caller is
    responsible for deleting it.  Raises ``requests.HTTPError`` on a
    non-2xx status and ``requests.Timeout`` if the server stalls.
    """
    # delete=False keeps the path valid after the handle closes; the
    # original leaked the open NamedTemporaryFile handle.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        video_path = tmp.name
    response = requests.get(url, stream=True, timeout=60)
    # Fail fast on 4xx/5xx instead of silently saving an HTML error page.
    response.raise_for_status()
    with open(video_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            if chunk:  # skip keep-alive chunks
                f.write(chunk)
    return video_path
def extract_audio(video_path):
    """Extract the audio track of *video_path* into a temporary 16-bit PCM WAV.

    Returns the temp-file path; the caller is responsible for deleting it.
    """
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        audio_path = tmp.name
    # Use the clip as a context manager so its ffmpeg reader is always
    # released — the original never closed it and leaked the subprocess.
    with VideoFileClip(video_path) as clip:
        clip.audio.write_audiofile(audio_path, codec='pcm_s16le')
    return audio_path
def transcribe(audio_path):
    """Run the module-level Whisper ASR pipeline on *audio_path* and return the transcript text."""
    return whisper_pipe(audio_path)["text"]
def analyze_accent(url_or_file):
    """End-to-end pipeline: fetch the video, extract audio, transcribe, classify accent.

    Parameters
    ----------
    url_or_file : str or file-like
        Either an http(s) URL to an MP4, a local path string, or a Gradio
        ``File`` upload (an object carrying the path in ``.name``).

    Returns a Markdown-formatted result string on success, or a
    human-readable error string — this function never raises, so the UI
    always gets something to display.
    """
    video_path = None
    audio_path = None
    is_url = False
    try:
        # A Gradio File component passes an object, not a str; unwrap it.
        if not isinstance(url_or_file, str):
            url_or_file = url_or_file.name
        is_url = url_or_file.startswith("http")
        # BUG FIX: the original printed `video_path` before it was ever
        # assigned, raising NameError and making EVERY call return an error.
        video_path = download_video(url_or_file) if is_url else url_or_file
        print("Video path:", video_path)
        audio_path = extract_audio(video_path)
        print("Audio path:", audio_path)
        # Load audio with torchaudio for the (placeholder) classifier.
        waveform, sample_rate = torchaudio.load(audio_path)
        # Transcription (to verify English / that speech was understood).
        transcript = transcribe(audio_path)
        if len(transcript.strip()) < 3:
            return "Could not understand speech. Please try another video."
        # Accent classification
        result = classify_accent(waveform, sample_rate)
        output = f"**Accent**: {result['accent']}\n\n"
        output += f"**Confidence**: {result['confidence']}%\n\n"
        output += f"**Explanation**: {result['summary']}\n\n"
        output += f"**Transcript** (first 200 chars): {transcript[:200]}..."
        return output
    except Exception as e:
        return f"❌ Error: {str(e)}"
    finally:
        # Clean up temp files even when an intermediate step failed
        # (the original only cleaned up on the fully-successful path).
        if is_url and video_path and os.path.exists(video_path):
            os.remove(video_path)
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)
# gr.Interface(
# fn=analyze_accent,
# inputs=gr.Textbox(label="Public Video URL (e.g. MP4)", placeholder="https://..."),
# outputs=gr.Markdown(label="Accent Analysis Result"),
# title="English Accent Classifier",
# description="Paste a video URL (MP4) to extract audio, transcribe speech, and classify the English accent (e.g., American, British, etc.).",
# examples=[
# ["https://example.com/sample.mp4"], # example URL
# [open("cleo-abram.mp4", "rb")] # local file example
# ],
# live=True
# ).launch()
# Two-tab UI: analyze either a remote MP4 URL or an uploaded MP4 file.
# Both tabs funnel into the same analyze_accent() entry point.
with gr.Blocks() as demo:
    gr.Markdown("# English Accent Classifier")
    with gr.Tab("From URL"):
        url_input = gr.Textbox(label="Video URL (MP4)")
        url_output = gr.Markdown()
        gr.Button("Analyze").click(fn=analyze_accent, inputs=url_input, outputs=url_output)
    with gr.Tab("From File"):
        file_input = gr.File(label="Upload MP4 Video", file_types=[".mp4"])
        file_output = gr.Markdown()
        gr.Button("Analyze").click(fn=analyze_accent, inputs=file_input, outputs=file_output)
        gr.Examples(
            examples=[
                # os.path.join is portable across OSes; "+ '/'" concatenation is not.
                [os.path.join(os.getcwd(), "examples", "cleo-abram.mp4")],
            ],
            inputs=file_input,
            outputs=file_output,
            fn=analyze_accent,
            label="Example MP4 Videos",
        )
demo.launch()