Spaces:

ash-171
/

accent-detection

Sleeping

App Files Files Community

ash-171 committed on May 30

Commit

5a8c370

verified ·

1 Parent(s): 766f3ce

Upload 5 files

Browse files

Files changed (5) hide show

src/app/__pycache__/main_agent.cpython-310.pyc +0 -0
src/app/main_agent.py +81 -0
src/custom_interface.py +158 -0
src/tools/__pycache__/accent_tool.cpython-310.pyc +0 -0
src/tools/accent_tool.py +52 -0

src/app/__pycache__/main_agent.cpython-310.pyc ADDED Viewed

Binary file (1.91 kB). View file

src/app/main_agent.py ADDED Viewed

	@@ -0,0 +1,81 @@

+# from langchain_core.messages import HumanMessage, AIMessage
+# from langgraph.graph import MessageGraph
+# from langchain_core.runnables import RunnableLambda
+# from langchain_core.messages import BaseMessage
+# from langchain.tools import Tool
+# from langchain_core.runnables import RunnableLambda, Runnable
+# import re
+# def create_agent(accent_tool_obj) -> 'Runnable':
+#     accent_tool = Tool(
+#         name="AccentAnalyzer",
+#         func=accent_tool_obj.analyze,
+#         description="Analyze a public MP4 video URL and determine the English accent with transcription."
+#     )
+#     def analyze_node(messages: list[BaseMessage]) -> AIMessage:
+#         last_input = messages[-1].content
+#         match = re.search(r'https?://\S+', last_input)
+#         if match:
+#             url = match.group()
+#             result = accent_tool.func(url)
+#         else:
+#             result = "No valid video URL found in your message."
+#         return AIMessage(content=result)
+#     graph = MessageGraph()
+#     graph.add_node("analyze_accent", RunnableLambda(analyze_node))
+#     graph.set_entry_point("analyze_accent")
+#     graph.set_finish_point("analyze_accent")
+#     return graph.compile()
+#  --------------------------------------
+from langchain_core.messages import BaseMessage, AIMessage
+from langchain_core.runnables import RunnableLambda, Runnable
+from langchain_community.llms import Ollama
+from langchain.tools import Tool
+from langgraph.graph import MessageGraph
+import re
+# Module-level Ollama client; create_agent()'s follow-up node calls llm.invoke()
+# on it. The trailing "llama3.1" note presumably names an alternative model
+# that was tried — TODO confirm.
+llm = Ollama(model="gemma3", temperature=0.0) # llama3.1
+def create_agent(accent_tool_obj) -> tuple[Runnable, Runnable]:
+    accent_tool = Tool(
+        name="AccentAnalyzer",
+        func=accent_tool_obj.analyze,
+        description="Analyze a public MP4 video URL and determine the English accent with transcription."
+    )
+    def analyze_node(messages: list[BaseMessage]) -> AIMessage:
+        last_input = messages[-1].content
+        match = re.search(r'https?://\S+', last_input)
+        if match:
+            url = match.group()
+            result = accent_tool.func(url)
+        else:
+            result = "No valid video URL found in your message."
+        return AIMessage(content=result)
+    graph = MessageGraph()
+    graph.add_node("analyze_accent", RunnableLambda(analyze_node))
+    graph.set_entry_point("analyze_accent")
+    graph.set_finish_point("analyze_accent")
+    analysis_agent = graph.compile()
+    # Follow-up agent that uses transcript and responds to questions
+    def follow_up_node(messages: list[BaseMessage]) -> AIMessage:
+        user_question = messages[-1].content
+        transcript = accent_tool_obj.last_transcript or ""
+        prompt = f"""You are given this transcript of a video:
+        \"\"\"{transcript}\"\"\"
+        Now respond to the user's follow-up question: {user_question}
+        """
+        response = llm.invoke(prompt)
+        return AIMessage(content=response)
+    follow_up_agent = RunnableLambda(follow_up_node)
+    return analysis_agent, follow_up_agent

src/custom_interface.py ADDED Viewed

	@@ -0,0 +1,158 @@

+import torch
+from speechbrain.pretrained import Pretrained
+class CustomEncoderWav2vec2Classifier(Pretrained):
+    """A ready-to-use class for utterance-level classification (e.g, speaker-id,
+    language-id, emotion recognition, keyword spotting, etc).
+    The class assumes that a self-supervised encoder like wav2vec2/hubert and a
+    classifier model are defined in the yaml file. The methods below read
+    ``mods.wav2vec2``, ``mods.avg_pool`` and ``mods.output_mlp`` as well as
+    ``hparams.softmax`` and ``hparams.label_encoder``; the classification
+    methods call ``hparams.label_encoder.decode_torch`` unconditionally, so the
+    label encoder must be available.
+    The class can be used either to run only the encoder (encode_batch()) to
+    extract embeddings or to run a classification step (classify_batch()).
+    Example
+    -------
+    The snippet below uses the stock ``EncoderClassifier`` interface, which
+    exposes the same ``encode_batch`` / ``classify_batch`` entry points.
+    >>> import torchaudio
+    >>> from speechbrain.pretrained import EncoderClassifier
+    >>> # Model is downloaded from the speechbrain HuggingFace repo
+    >>> tmpdir = getfixture("tmpdir")
+    >>> classifier = EncoderClassifier.from_hparams(
+    ...     source="speechbrain/spkrec-ecapa-voxceleb",
+    ...     savedir=tmpdir,
+    ... )
+    >>> # Compute embeddings
+    >>> signal, fs = torchaudio.load("samples/audio_samples/example1.wav")
+    >>> embeddings = classifier.encode_batch(signal)
+    >>> # Classification
+    >>> prediction = classifier.classify_batch(signal)
+    """
+    # Pass-through constructor: every argument is forwarded to Pretrained.
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+    def encode_batch(self, wavs, wav_lens=None, normalize=False):
+        """Encodes the input audio into a single vector embedding.
+        The waveforms should already be in the model's desired format.
+        You can call:
+        ``normalized = <this>.normalizer(signal, sample_rate)``
+        to get a correctly converted signal in most cases.
+        Arguments
+        ---------
+        wavs : torch.tensor
+            Batch of waveforms [batch, time, channels] or [batch, time]
+            depending on the model. Make sure the sample rate is fs=16000 Hz.
+            A 1-D tensor is treated as a single waveform and gets a batch
+            dimension prepended.
+        wav_lens : torch.tensor
+            Lengths of the waveforms relative to the longest one in the
+            batch, tensor of shape [batch]. The longest one should have
+            relative length 1.0 and others len(waveform) / max_length.
+            Used for ignoring padding. Defaults to all ones when omitted.
+        normalize : bool
+            Accepted for interface compatibility, but this implementation
+            never reads it: no embedding normalization is applied.
+        Returns
+        -------
+        torch.tensor
+            The pooled encoder output, flattened to two dimensions
+            ([batch, -1]).
+        """
+        # Manage single waveforms in input
+        if len(wavs.shape) == 1:
+            wavs = wavs.unsqueeze(0)
+        # Assign full length if wav_lens is not assigned
+        if wav_lens is None:
+            wav_lens = torch.ones(wavs.shape[0], device=self.device)
+        # Move both tensors to the model's device and cast the audio to float
+        wavs, wav_lens = wavs.to(self.device), wav_lens.to(self.device)
+        wavs = wavs.float()
+        # Computing features and embeddings
+        outputs = self.mods.wav2vec2(wavs)
+        # Pool the encoder output; wav_lens is handed to avg_pool
+        # (presumably so padded frames are ignored — TODO confirm)
+        outputs = self.mods.avg_pool(outputs, wav_lens)
+        # Flatten everything after the batch dimension
+        outputs = outputs.view(outputs.shape[0], -1)
+        return outputs
+    def classify_batch(self, wavs, wav_lens=None):
+        """Performs classification on the top of the encoded features.
+        It returns the class scores, the best score and index, and the text
+        label decoded with ``hparams.label_encoder``.
+        Arguments
+        ---------
+        wavs : torch.tensor
+            Batch of waveforms [batch, time, channels] or [batch, time]
+            depending on the model. Make sure the sample rate is fs=16000 Hz.
+        wav_lens : torch.tensor
+            Lengths of the waveforms relative to the longest one in the
+            batch, tensor of shape [batch]. The longest one should have
+            relative length 1.0 and others len(waveform) / max_length.
+            Used for ignoring padding.
+        Returns
+        -------
+        out_prob
+            Output of ``hparams.softmax`` for each class. NOTE(review): these
+            are log posteriors only if the yaml configures a log-softmax —
+            confirm against the hparams file.
+        score:
+            The maximum of ``out_prob`` along the last dimension.
+        index
+            The indexes of the best class (argmax along the last dimension).
+        text_lab:
+            List with the text labels corresponding to the indexes.
+            (label encoder must be provided).
+        """
+        outputs = self.encode_batch(wavs, wav_lens)
+        outputs = self.mods.output_mlp(outputs)
+        out_prob = self.hparams.softmax(outputs)
+        score, index = torch.max(out_prob, dim=-1)
+        text_lab = self.hparams.label_encoder.decode_torch(index)
+        return out_prob, score, index, text_lab
+    def classify_file(self, path):
+        """Classifies the given audiofile into the given set of labels.
+        The file is loaded with ``load_audio`` and wrapped in a batch of one.
+        Arguments
+        ---------
+        path : str
+            Path to audio file to classify.
+        Returns
+        -------
+        out_prob
+            Output of ``hparams.softmax`` for each class. NOTE(review): these
+            are log posteriors only if the yaml configures a log-softmax —
+            confirm against the hparams file.
+        score:
+            The maximum of ``out_prob`` along the last dimension.
+        index
+            The indexes of the best class (argmax along the last dimension).
+        text_lab:
+            List with the text labels corresponding to the indexes.
+            (label encoder must be provided).
+        """
+        waveform = self.load_audio(path)
+        # Fake a batch:
+        batch = waveform.unsqueeze(0)
+        # Single item, so its relative length is 1.0
+        rel_length = torch.tensor([1.0])
+        outputs = self.encode_batch(batch, rel_length)
+        # Unlike classify_batch, dimension 1 is squeezed here before the softmax
+        outputs = self.mods.output_mlp(outputs).squeeze(1)
+        out_prob = self.hparams.softmax(outputs)
+        score, index = torch.max(out_prob, dim=-1)
+        text_lab = self.hparams.label_encoder.decode_torch(index)
+        return out_prob, score, index, text_lab
+    def forward(self, wavs, wav_lens=None, normalize=False):
+        """Delegate to encode_batch() with the same arguments."""
+        return self.encode_batch(
+            wavs=wavs, wav_lens=wav_lens, normalize=normalize
+        )

src/tools/__pycache__/accent_tool.cpython-310.pyc ADDED Viewed

Binary file (2.25 kB). View file

src/tools/accent_tool.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import os, requests, shutil
+from pydub import AudioSegment
+import whisper
+from speechbrain.pretrained.interfaces import foreign_class
+class AccentAnalyzerTool:
+    def __init__(self):
+        self.whisper_model = whisper.load_model("medium")
+        self.accent_model = foreign_class(
+            source="Jzuluaga/accent-id-commonaccent_xlsr-en-english",
+            pymodule_file="custom_interface.py",
+            classname="CustomEncoderWav2vec2Classifier"
+        )
+        self.last_transcript = None
+    def log(self, msg):
+        print(f"[AccentAnalyzerTool] {msg}")
+    def analyze(self, url: str) -> str:
+        try:
+            self.log("Downloading video...")
+            tmp_dir = "tmp"
+            os.makedirs(tmp_dir, exist_ok=True)
+            video_path = os.path.join(tmp_dir, "video.mp4")
+            r = requests.get(url)
+            with open(video_path, "wb") as f:
+                f.write(r.content)
+            self.log("Extracting audio...")
+            audio_path = os.path.join(tmp_dir, "audio.wav")
+            AudioSegment.from_file(video_path).export(audio_path, format="wav")
+            self.log("Classifying accent...")
+            _, score, _, label = self.accent_model.classify_file(audio_path)
+            accent = label[0].upper() if label[0] == 'us' else label[0].capitalize()
+            confidence = round(float(score) * 100, 2)
+            self.log("Transcribing...")
+            transcript = self.whisper_model.transcribe(audio_path)["text"]
+            self.last_transcript = transcript
+            summary = (
+                f"The speaker has a **{accent} English accent** "
+                f"with **{confidence}% confidence**.\n\n"
+                f"**Transcript of the audio:**\n\n *{transcript.strip(' ')}*"
+            )
+            shutil.rmtree(tmp_dir, ignore_errors=True)
+            return summary
+        except Exception as e:
+            return f"Error analyzing accent: {str(e)}"