fahadqazi committed
Commit ab2a3d2 · 1 Parent(s): 059cd4b

accent detection

Files changed (3):
  1. .gitignore +2 -0
  2. app.py +18 -8
  3. requirements.txt +3 -1
.gitignore ADDED
@@ -0,0 +1,2 @@
+pretrained_models/
+wav2vec2_checkpoints/
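These ignores match where the new classifier caches its downloads: SpeechBrain's foreign_class saves fetched checkpoints under ./pretrained_models/ by default unless a savedir is passed. A small illustration of that knob; the savedir value here is a hypothetical choice, not something this commit sets:

from speechbrain.pretrained.interfaces import foreign_class

# Default cache lands in ./pretrained_models/ (hence the .gitignore entry);
# savedir would redirect it, e.g. into the also-ignored wav2vec2_checkpoints/.
classifier = foreign_class(
    source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    savedir="wav2vec2_checkpoints/accent-id",  # hypothetical path
)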
app.py CHANGED
@@ -4,16 +4,20 @@ import tempfile
 import os
 import requests
 from moviepy import VideoFileClip
-from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, Wav2Vec2Processor, Wav2Vec2Model
+from transformers import pipeline
 import torchaudio
+from speechbrain.pretrained.interfaces import foreign_class

 # Load Whisper model to confirm English
 whisper_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device="cpu")

+classifier = foreign_class(source="Jzuluaga/accent-id-commonaccent_xlsr-en-english", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")
+
 # Placeholder accent classifier (replace with real one or your own logic)
 def classify_accent(audio_tensor, sample_rate):
-    # In a real case, you'd use a fine-tuned model or wav2vec2 embeddings
-    # We'll fake a classification here for demonstration
+    out_prob, score, index, text_lab = classifier.classify_batch([audio_tensor])
+    print(out_prob, score, index, text_lab)
+
     return {
         "accent": "American",
         "confidence": 87.2,
@@ -36,8 +40,9 @@ def extract_audio(video_path):
     return audio_path

 def transcribe(audio_path):
-    result = whisper_pipe(audio_path)
-    return result['text']
+    result = whisper_pipe(audio_path, return_language=True)
+    print(result)
+    return result['text'], result['chunks'][0]['language']

 def analyze_accent(url_or_file):
     try:
@@ -61,16 +66,21 @@ def analyze_accent(url_or_file):

     # Transcription (to verify English)
     transcript = transcribe(audio_path)
-    if len(transcript.strip()) < 3:
+    if len(transcript[0].strip()) < 3:
         return "Could not understand speech. Please try another video."

     # Accent classification
     result = classify_accent(waveform, sample_rate)

-    output = f"**Accent**: {result['accent']}\n\n"
+    output = f"**Language**: {transcript[1]}\n\n"
+
+    if transcript[1].lower() != "en" and transcript[1].lower() != "english":
+        return "The video is not in English. Please provide an English video."
+
+    output += f"**Accent**: {result['accent']}\n\n"
     output += f"**Confidence**: {result['confidence']}%\n\n"
     output += f"**Explanation**: {result['summary']}\n\n"
-    output += f"**Transcript** (first 200 chars): {transcript[:200]}..."
+    output += f"**Transcript** (first 200 chars): {transcript[0][:200]}..."

     # Clean up temp files
     if url_or_file.startswith("http"):
requirements.txt CHANGED
@@ -5,4 +5,6 @@ torchaudio
 moviepy
 ffmpeg-python
 requests
-yt_dlp
+yt_dlp
+soundfile
+speechbrain==0.5.14
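One note on the exact pin: the speechbrain.pretrained import path used in app.py was reorganized in speechbrain 1.x, so pinning 0.5.14 keeps foreign_class importable from the location the app uses. A brief illustration; the 1.x path shown commented out is stated as an assumption, not something this app exercises:

from speechbrain.pretrained.interfaces import foreign_class   # 0.5.x layout, as pinned here
# from speechbrain.inference.interfaces import foreign_class  # assumed 1.x equivalent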