Dionyssos committed
Commit 6d576da · 1 Parent(s): fb65e18
Files changed (1)
  1. app.py  +56 -60

app.py CHANGED
@@ -71,82 +71,78 @@ class AgeGenderModel(Wav2Vec2PreTrainedModel):
-    # == Fusion: define the Age Wav2Vec2Model forward() to accept already computed CNN7 features from the Emotion model
-    def _forward(
-            self,
-            extract_features,
-            attention_mask=None):
-        # extract_features: CNN7 features of wav2vec2, as computed by its CNN7 feature extractor
-
-        if attention_mask is not None:
-            # compute reduced attention_mask corresponding to feature vectors
-            attention_mask = self._get_feature_vector_attention_mask(
-                extract_features.shape[1], attention_mask, add_adapter=False
-            )
-
-        hidden_states, extract_features = self.feature_projection(extract_features)
-        hidden_states = self._mask_hidden_states(
-            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
-        )
-
-        encoder_outputs = self.encoder(
-            hidden_states,
-            attention_mask=attention_mask,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        hidden_states = encoder_outputs[0]
-
-        if self.adapter is not None:
-            raise ValueError
-            hidden_states = self.adapter(hidden_states)
-
-        return hidden_states
-    # ===============================================
-
-    # ================== Forward & CNN7 features
-    def _forward_and_cnn7(
-            self,
-            input_values,
-            attention_mask=None):
-
-        frozen_cnn7 = self.feature_extractor(input_values)
-        frozen_cnn7 = frozen_cnn7.transpose(1, 2)
-
-        if attention_mask is not None:
-            # compute reduced attention_mask corresponding to feature vectors
-            attention_mask = self._get_feature_vector_attention_mask(
-                frozen_cnn7.shape[1], attention_mask, add_adapter=False
-            )
-
-        hidden_states, extract_features = self.feature_projection(frozen_cnn7)  # grad=True, not frozen
-        hidden_states = self._mask_hidden_states(
-            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
-        )
-
-        encoder_outputs = self.encoder(
-            hidden_states,
-            attention_mask=attention_mask,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        hidden_states = encoder_outputs[0]
-
-        if self.adapter is not None:
-            raise ValueError
-            hidden_states = self.adapter(hidden_states)
-
-        return hidden_states, frozen_cnn7  # feature_projection is trainable, so we cannot reuse the projected hidden states from the official wav2vec2.forward
-
-    # =============================
-
+    # Fusion: the Age Wav2Vec2Model forward() accepts already computed CNN7 features from the Expression model's forward()
+    def _forward(
+            self,
+            extract_features,
+            attention_mask=None):
+        # extract_features: CNN7 features of wav2vec2, as computed by its CNN7 feature extractor
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1], attention_mask, add_adapter=False
+            )
+
+        hidden_states, extract_features = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.adapter is not None:
+            raise ValueError
+            hidden_states = self.adapter(hidden_states)
+
+        return hidden_states
+
+    def _forward_and_cnn7(
+            self,
+            input_values,
+            attention_mask=None):
+
+        frozen_cnn7 = self.feature_extractor(input_values)
+        frozen_cnn7 = frozen_cnn7.transpose(1, 2)
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                frozen_cnn7.shape[1], attention_mask, add_adapter=False
+            )
+
+        hidden_states, extract_features = self.feature_projection(frozen_cnn7)  # grad=True, not frozen
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.adapter is not None:
+            raise ValueError
+            hidden_states = self.adapter(hidden_states)
+
+        return hidden_states, frozen_cnn7  # feature_projection is trainable, so frozen_cnn7 must be taken before the projection layer
+
+    # Fusion ============================= End
+
 class ExpressionHead(nn.Module):
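
Note: the hunk shows only the patched forwards, not the call site that fuses the two models, and as committed both methods reference mask_time_indices, output_attentions, output_hidden_states and return_dict without defining them in their signatures, so they depend on those names being supplied elsewhere in app.py. Below is a minimal, hypothetical sketch of the same "compute CNN7 once, reuse for both heads" idea using only the public transformers Wav2Vec2Model API; every name in it (the config, the two backbones, the fake waveform) is illustrative and not taken from the app.

```python
import torch
from transformers import Wav2Vec2Config, Wav2Vec2Model

config = Wav2Vec2Config()                        # default "base" architecture, random weights
expression_w2v = Wav2Vec2Model(config).eval()    # stand-in for the Expression backbone
age_gender_w2v = Wav2Vec2Model(config).eval()    # stand-in for the Age/Gender backbone

waveform = torch.randn(1, 16000)                 # 1 s of fake 16 kHz audio

with torch.no_grad():
    # CNN7 features are computed once, on one branch only (cf. _forward_and_cnn7).
    cnn7 = expression_w2v.feature_extractor(waveform).transpose(1, 2)   # (1, 49, 512)

    # Expression branch: its own feature projection + transformer encoder.
    expr_hidden = expression_w2v.encoder(
        expression_w2v.feature_projection(cnn7)[0]
    ).last_hidden_state

    # Age/gender branch: reuses the same CNN7 features (cf. _forward) instead of
    # re-running its own CNN feature extractor on the raw waveform.
    age_hidden = age_gender_w2v.encoder(
        age_gender_w2v.feature_projection(cnn7)[0]
    ).last_hidden_state

print(expr_hidden.shape, age_hidden.shape)       # both torch.Size([1, 49, 768])
```

This mirrors why _forward_and_cnn7 returns the pre-projection features: each backbone keeps its own trainable feature_projection, so only the frozen CNN7 output is shareable between the two models.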
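
On the _get_feature_vector_attention_mask calls: the sample-level attention mask has to be reduced to the frame rate of the CNN7 output. Assuming the standard wav2vec 2.0 feature-encoder kernels and strides (the hunk does not show this model's config), the frame count can be sanity-checked with the made-up helper below, which is not part of app.py.

```python
# Frame count after the seven conv layers of the wav2vec 2.0 feature encoder,
# assuming the default kernels/strides (10,3,3,3,3,2,2) / (5,2,2,2,2,2,2).
def cnn7_num_frames(num_samples: int) -> int:
    kernels = (10, 3, 3, 3, 3, 2, 2)
    strides = (5, 2, 2, 2, 2, 2, 2)
    length = num_samples
    for k, s in zip(kernels, strides):
        length = (length - k) // s + 1   # conv1d output length, no padding
    return length

print(cnn7_num_frames(16000))  # 49 frames for 1 s of 16 kHz audio (~320x downsampling)
```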