Dionyssos committed on
Commit fb65e18 · 1 Parent(s): ad493ec
Files changed (2)
  1. README.md +1 -1
  2. app.py +108 -39
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Speech analysis
-emoji:
+emoji: 🌀
 colorFrom: gray
 colorTo: gray
 sdk: gradio
app.py CHANGED
@@ -1,5 +1,5 @@
 import typing
-
+import types  # fusion of forward() of Wav2Vec2
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
@@ -58,16 +58,95 @@ class AgeGenderModel(Wav2Vec2PreTrainedModel):
 
     def forward(
             self,
-            input_values,
+            frozen_cnn7,
     ):
 
-        outputs = self.wav2vec2(input_values)
-        hidden_states = outputs[0]
+        hidden_states = self.wav2vec2(frozen_cnn7)  # skips the frozen CNN; runs feature_projection and the Transformer
+
         hidden_states = torch.mean(hidden_states, dim=1)
         logits_age = self.age(hidden_states)
         logits_gender = torch.softmax(self.gender(hidden_states), dim=1)
 
         return hidden_states, logits_age, logits_gender
+
+
+
+# == Fusion: redefine the age model's Wav2Vec2Model.forward to accept CNN7 features already computed by the expression model
+def _forward(
+        self,
+        extract_features,
+        attention_mask=None, mask_time_indices=None):
+    # extract_features: CNN7 features of wav2vec2, as computed by its CNN7 feature extractor
+
+
+    if attention_mask is not None:
+        # compute reduced attention_mask corresponding to feature vectors
+        attention_mask = self._get_feature_vector_attention_mask(
+            extract_features.shape[1], attention_mask, add_adapter=False
+        )
+
+    hidden_states, extract_features = self.feature_projection(extract_features)
+    hidden_states = self._mask_hidden_states(
+        hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+    )
+
+    encoder_outputs = self.encoder(
+        hidden_states,
+        attention_mask=attention_mask,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    )
+
+    hidden_states = encoder_outputs[0]
+
+    if self.adapter is not None:
+        raise ValueError
+        hidden_states = self.adapter(hidden_states)
+
+    return hidden_states
+# ===============================================
+
+
+# ================== Forward & CNN features
+def _forward_and_cnn7(
+        self,
+        input_values,
+        attention_mask=None, mask_time_indices=None
+):
+
+
+    frozen_cnn7 = self.feature_extractor(input_values)
+    frozen_cnn7 = frozen_cnn7.transpose(1, 2)
+
+    if attention_mask is not None:
+        # compute reduced attention_mask corresponding to feature vectors
+        attention_mask = self._get_feature_vector_attention_mask(
+            frozen_cnn7.shape[1], attention_mask, add_adapter=False
+        )
+
+    hidden_states, extract_features = self.feature_projection(frozen_cnn7)  # feature_projection has grad enabled (not frozen)
+    hidden_states = self._mask_hidden_states(
+        hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+    )
+
+    encoder_outputs = self.encoder(
+        hidden_states,
+        attention_mask=attention_mask,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    )
+
+    hidden_states = encoder_outputs[0]
+
+    if self.adapter is not None:
+        raise ValueError
+        hidden_states = self.adapter(hidden_states)
+
+    return hidden_states, frozen_cnn7  # feature_projection is trainable, so we cannot reuse the projected hidden states from the official wav2vec2.forward
+
+# =============================
 
 
 class ExpressionHead(nn.Module):
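For orientation: the tensor handed between the two patched forwards defined above (_forward_and_cnn7 produces it, _forward consumes it) is the output of wav2vec2's seven-layer convolutional feature encoder ("CNN7"), transposed to (batch, frames, channels) before it enters feature_projection. A minimal sketch of that shape contract, assuming torch and transformers are installed and using the generic facebook/wav2vec2-base checkpoint as a stand-in for the checkpoints loaded in app.py (512 is the conv width of the standard wav2vec2 configs; the frame count depends on input length):

import torch
from transformers import Wav2Vec2Model

w2v = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")  # stand-in, not the app.py checkpoints
w2v.eval()

x = torch.zeros(1, 16000)                      # one second of 16 kHz audio, batch size 1
with torch.no_grad():
    cnn7 = w2v.feature_extractor(x)            # (1, 512, 49): CNN7 features, channels first
    cnn7 = cnn7.transpose(1, 2)                # (1, 49, 512): what _forward_and_cnn7 returns as frozen_cnn7
    hidden, _ = w2v.feature_projection(cnn7)   # (1, 49, hidden_size): what _forward projects before the encoder
print(cnn7.shape, hidden.shape)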
@@ -106,12 +185,11 @@ class ExpressionModel(Wav2Vec2PreTrainedModel):
         self.init_weights()
 
     def forward(self, input_values):
-        outputs = self.wav2vec2(input_values)
-        hidden_states = outputs[0]
+        hidden_states, frozen_cnn7 = self.wav2vec2(input_values)
         hidden_states = torch.mean(hidden_states, dim=1)
         logits = self.classifier(hidden_states)
 
-        return hidden_states, logits
+        return hidden_states, logits, frozen_cnn7
 
 
 # Load models from hub
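The next hunk wires the fusion up by rebinding forward on each model's wav2vec2 submodule with types.MethodType. A self-contained sketch of that rebinding pattern (toy classes, not the app.py models), showing why the function must be bound to the submodule instance whose attributes it uses:

import types

class Backbone:
    def __init__(self):
        self.scale = 2.0

    def forward(self, x):
        return self.scale * x

class Wrapper:
    def __init__(self):
        self.backbone = Backbone()

def _patched_forward(self, x):
    # 'self' must be the Backbone instance so that self.scale resolves
    return self.scale * x + 1.0

w = Wrapper()
# bind to the submodule that owns the attributes the patched function touches
w.backbone.forward = types.MethodType(_patched_forward, w.backbone)
print(w.backbone.forward(3.0))  # 7.0

For torch nn.Module instances the same idea applies: module(x) dispatches through the instance's forward attribute (outside of tracing), so rebinding model.wav2vec2.forward changes what self.wav2vec2(...) executes.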
@@ -120,46 +198,37 @@ age_gender_model = AgeGenderModel.from_pretrained(age_gender_model_name)
 expression_processor = Wav2Vec2Processor.from_pretrained(expression_model_name)
 expression_model = ExpressionModel.from_pretrained(expression_model_name)
 
+# The expression (emotion) model computes the CNN7 features once; rebind both forwards
+
+age_gender_model.wav2vec2.forward = types.MethodType(_forward, age_gender_model.wav2vec2)
+expression_model.wav2vec2.forward = types.MethodType(_forward_and_cnn7, expression_model.wav2vec2)
 
 def process_func(x: np.ndarray, sampling_rate: int) -> typing.Tuple[str, dict, str]:
-    r"""Predict age and gender or extract embeddings from raw audio signal."""
-    # run through processor to normalize signal
-    # always returns a batch, so we just get the first entry
-    # then we put it on the device
-    results = []
-    for processor, model in zip(
-        [age_gender_processor, expression_processor],
-        [age_gender_model, expression_model],
-    ):
-        y = processor(x, sampling_rate=sampling_rate)
-        y = y['input_values'][0]
-        y = y.reshape(1, -1)
-        y = torch.from_numpy(y).to(device)
-
-        # run through model
-        with torch.no_grad():
-            y = model(y)
-            if len(y) == 3:
-                # Age-gender model
-                y = torch.hstack([y[1], y[2]])
-            else:
-                # Expression model
-                y = y[1]
-
-        # convert to numpy
-        y = y.detach().cpu().numpy()
-        results.append(y[0])
+
+    # normalize and batch the audio with the expression processor
+    y = expression_processor(x, sampling_rate=sampling_rate)
+    y = y['input_values'][0]
+    y = y.reshape(1, -1)
+    y = torch.from_numpy(y).to(device)
+
+    # run the expression model, then reuse its frozen CNN7 features for age/gender
+    with torch.no_grad():
+        _, logits_expression, frozen_cnn7 = expression_model(y)
+
+        _, logits_age, logits_gender = age_gender_model(frozen_cnn7=frozen_cnn7)
 
     # Plot A/D/V values
-    plot_expression(results[1][0], results[1][1], results[1][2])
+    plot_expression(logits_expression[0, 0].item(),  # .item() implies detach().cpu()
+                    logits_expression[0, 1].item(),
+                    logits_expression[0, 2].item())
     expression_file = "expression.png"
     plt.savefig(expression_file)
     return (
-        f"{round(100 * results[0][0])} years",  # age
+        f"{round(100 * logits_age[0, 0].item())} years",  # age
         {
-            "female": results[0][1],
-            "male": results[0][2],
-            "child": results[0][3],
+            "female": logits_gender[0, 0].item(),
+            "male": logits_gender[0, 1].item(),
+            "child": logits_gender[0, 2].item(),
         },
         expression_file,
     )
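A hypothetical smoke test for the fused pipeline (not part of the commit); it assumes the snippet is appended to app.py, where device, plot_expression and the model/processor globals are already defined, and that the models expect 16 kHz mono input:

import numpy as np

if __name__ == "__main__":
    sr = 16000
    t = np.linspace(0, 1.0, sr, endpoint=False)
    x = (0.1 * np.sin(2 * np.pi * 220.0 * t)).astype(np.float32)  # 1 s test tone

    age, gender_probs, png = process_func(x, sr)
    print(age)           # e.g. "25 years": the age head output, scaled by 100 and rounded
    print(gender_probs)  # dict with "female" / "male" / "child" scores
    print(png)           # "expression.png", the saved arousal/dominance/valence plot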