wesam0099 committed
Commit a50c417 · verified · Parent(s): 764d7db

Upload 3 files

Files changed (3)
  1. src/agent.py +28 -0
  2. src/audio_utils.py +40 -0
  3. src/deep_model.py +25 -0
src/agent.py ADDED
@@ -0,0 +1,28 @@
# agent.py

from audio_utils import record_audio, transcribe_audio
from deep_model import predict_accent

class AccentAgent:
    def __init__(self, duration=5):
        self.duration = duration
        self.audio_path = None
        self.transcription = ""
        self.accent = ""

    def run(self):
        print("[Agent] Starting recording...")
        self.audio_path = record_audio(duration=self.duration)
        print("[Agent] Audio recorded at:", self.audio_path)

        print("[Agent] Predicting accent...")
        self.accent = predict_accent(self.audio_path)

        print("[Agent] Transcribing audio...")
        self.transcription = transcribe_audio(self.audio_path)

        return {
            "audio_path": self.audio_path,
            "accent": self.accent,
            "transcription": self.transcription
        }
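A minimal sketch of how the agent might be driven; the main.py file name and the __main__ guard are assumptions, not part of this commit:

# main.py (hypothetical entry point, not part of this commit)
from agent import AccentAgent

if __name__ == "__main__":
    agent = AccentAgent(duration=5)  # record 5 seconds from the default microphone
    result = agent.run()
    print("Accent:", result["accent"])
    print("Transcription:", result["transcription"])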
src/audio_utils.py ADDED
@@ -0,0 +1,40 @@
# audio_utils.py

from transformers import pipeline
from pydub import AudioSegment
import os
import uuid
import sounddevice as sd
from scipy.io.wavfile import write
import tempfile

# Load the Whisper model
whisper_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-base")

def convert_to_wav(audio_file):
    sound = AudioSegment.from_file(audio_file)
    temp_filename = f"temp_{uuid.uuid4()}.wav"
    sound.export(temp_filename, format="wav")
    return temp_filename

def transcribe_audio(audio_path):
    # Convert to WAV if needed, tracking the temporary copy so that only it
    # is deleted afterwards, never the caller's original audio file.
    temp_path = None
    if not audio_path.endswith(".wav"):
        temp_path = convert_to_wav(audio_path)
        audio_path = temp_path

    result = whisper_pipeline(audio_path)
    text = result["text"]

    # The temporary converted file can be deleted after transcription
    if temp_path is not None and os.path.exists(temp_path):
        os.remove(temp_path)

    return text

def record_audio(duration=5, fs=16000):
    """Record audio from the microphone for a fixed duration."""
    recording = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='int16')
    sd.wait()

    temp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_wav.close()  # close the handle before writing to it (required on Windows)
    write(temp_wav.name, fs, recording)
    return temp_wav.name
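The two helpers also work standalone; a rough usage sketch, assuming a working microphone and that ffmpeg is installed for pydub's non-WAV conversions (speech.mp3 is a hypothetical input file):

from audio_utils import record_audio, transcribe_audio

wav_path = record_audio(duration=3)    # 3-second clip, 16 kHz mono WAV
print(transcribe_audio(wav_path))      # Whisper transcription of the recording
print(transcribe_audio("speech.mp3"))  # hypothetical MP3; converted to WAV via pydub first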
src/deep_model.py ADDED
@@ -0,0 +1,25 @@
# deep_model.py

import torch
import librosa
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

MODEL_ID = "ylacombe/accent-classifier"
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
model = AutoModelForAudioClassification.from_pretrained(MODEL_ID)

# Note: these IDs depend on the ordering of the model's own labels
label_map = {
    4: "england",
    14: "us"
}

def predict_accent(audio_path: str) -> str:
    audio, sr = librosa.load(audio_path, sr=16000)
    inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="pt")

    with torch.no_grad():
        logits = model(**inputs).logits
        predicted_id = torch.argmax(logits, dim=-1).item()

    return label_map.get(predicted_id, f"Unknown (ID: {predicted_id})")
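Since the hardcoded IDs are fragile, the full mapping can be read from the checkpoint itself; a minimal sketch, assuming the model config ships the standard id2label table:

from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained("ylacombe/accent-classifier")
print(model.config.id2label)  # full id -> label table, e.g. 4: "england", 14: "us"

# predict_accent could then fall back to this table instead of returning "Unknown":
# model.config.id2label.get(predicted_id, str(predicted_id))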