Spaces:
Runtime error
Runtime error
| /* | |
| Copyright 2015 Google Inc. All rights reserved. | |
| Licensed under the Apache License, Version 2.0 (the "License"); | |
| you may not use this file except in compliance with the License. | |
| You may obtain a copy of the License at | |
| http://www.apache.org/licenses/LICENSE-2.0 | |
| Unless required by applicable law or agreed to in writing, software | |
| distributed under the License is distributed on an "AS IS" BASIS, | |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| See the License for the specific language governing permissions and | |
| limitations under the License. | |
| */ | |
| // Author: dtalkin@google.com (David Talkin) | |
| // Implementation of the EpochTracker class. This does all of the | |
| // processing necessary to estimate the F0, voicing state and epochs | |
| // (glottal-closure instants) in human speech signals. See | |
| // epoch_tracker.h for details. | |
| // Minimum sample rate; presumably in Hz, and presumably the lower bound that | |
| // Init() enforces on its sample_rate argument -- TODO confirm. | |
| const int kMinSampleRate = 6000; | |
| EpochTracker::EpochTracker(void) : sample_rate_(-1.0) { | |
| SetParameters(); | |
| } | |
| EpochTracker::~EpochTracker(void) { | |
| CleanUp(); | |
| } | |
// Converts val to an integer by adding one half and truncating toward zero.
// For non-negative inputs this rounds to the nearest integer (halves go up);
// negative inputs are NOT rounded to nearest because of the truncation.
static inline int32_t RoundUp(float val) {
  const double shifted = val + 0.5;
  return static_cast<int32_t>(shifted);
}
| void EpochTracker::CleanUp(void) { | |
| for (size_t i = 0; i < resid_peaks_.size(); ++i) { | |
| for (size_t j = 0; j < resid_peaks_[i].future.size(); ++j) { | |
| delete resid_peaks_[i].future[j]; | |
| } | |
| } | |
| resid_peaks_.clear(); | |
| output_.clear(); | |
| best_corr_.clear(); | |
| } | |
| void EpochTracker::SetParameters(void) { | |
| // Externally-settable control parameters: | |
| // Period for the returned F0 signal. | |
| external_frame_interval_ = kExternalFrameInterval; | |
| do_highpass_ = kDoHighpass; // Enables highpassing of input signal. | |
| // Enables Hilbert transformation of the input data. | |
| do_hilbert_transform_ = kDoHilbertTransform; | |
| max_f0_search_ = kMaxF0Search; // Maximum F0 to search for. | |
| min_f0_search_ = kMinF0Search; // Minimum F0 to search for. | |
| // Pulse spacing to use in unvoiced regions of the returned epoch signal. | |
| unvoiced_pulse_interval_ = kUnvoicedPulseInterval; | |
| debug_name_ = kDebugName; // base path for all debugging signals. | |
| // Internal feature-computation parameters: | |
| // For internal feature computations | |
| internal_frame_interval_ = kInternalFrameInterval; | |
| // for the high-pass filter | |
| corner_frequency_ = 80.0; | |
| filter_duration_ = 0.05; | |
| // for the LPC inverse filter. | |
| frame_duration_ = 0.02; // window size (sec) | |
| lpc_frame_interval_ = 0.01; // (sec) | |
| preemphasis_ = 0.98; // preemphasis for LPC analysis | |
| noise_floor_ = 70.0; // SNR in dB simulated during LPC analysis. | |
| // for computing LPC residual peak quality. | |
| peak_delay_ = 0.0004; // for measuring prominence | |
| skew_delay_ = 0.00015; // for measuring shape | |
| peak_val_wt_ = 0.1; | |
| peak_prominence_wt_ = 0.3; | |
| peak_skew_wt_ = 0.1; | |
| peak_quality_floor_ = 0.01; | |
| // for computing voice-transition pseudo-probabilities | |
| time_span_ = 0.020; // the interval (sec) centered on the | |
| // measurement point, used to compute parameter | |
| // deltas. | |
| level_change_den_ = 30.0; // max. dB level change expected over | |
| // time_span_ for bandpassed RMS for | |
| // computing pseudo-probability of | |
| // voicing. | |
| min_rms_db_ = 20.0; // level floor in dB | |
| // window size for computing amplitude-normalizing RMS | |
| ref_dur_ = 0.02; | |
| // low and high frequency limits for bandpassed RMS used in voicing indicator | |
| min_freq_for_rms_ = 100.0; | |
| max_freq_for_rms_ = 1000.0; | |
| // duration of integrator for bandpassed RMS | |
| rms_window_dur_ = 0.025; | |
| // window duration, in seconds, for NCCF computations | |
| correlation_dur_ = 0.0075; | |
| // ignore any NCCF peaks less than this | |
| correlation_thresh_ = 0.2; | |
| // Parametrs used by the dynamic-programming tracker: | |
| // reward for inserting another period | |
| reward_ = -1.5; | |
| // weight given to deviation of inter-pulse interval from the | |
| // closest NCCF peak lag | |
| period_deviation_wt_ = 1.0; | |
| // weight given to the quality of the residual peak | |
| peak_quality_wt_ = 1.3; | |
| // cost of the unvoiced hypothesis | |
| unvoiced_cost_ = kUnvoicedCost; | |
| // cost of high NCCF values in hypothetical unvoiced regions | |
| nccf_uv_peak_wt_ = 0.9; | |
| // weight given to period length | |
| period_wt_ = 0.0002; | |
| // weight given to the pseudo-probability of voicing feature | |
| level_wt_ = 0.8; | |
| // weight given to period-length differences between adjacent periods. | |
| freq_trans_wt_ = 1.8; | |
| // cost of switching between voicing states; modulated by voicing | |
| // onset/offset probs. | |
| voice_transition_factor_ = 1.4; | |
| // Parameters used to generate final outputs: | |
| // pad time in seconds to add to the last measured period during | |
| // output of periodically-resampled data | |
| endpoint_padding_ = 0.01; | |
| } | |
| bool EpochTracker::Init(const int16_t* input, int32_t n_input, float sample_rate, | |
| float min_f0_search, float max_f0_search, | |
| bool do_highpass, bool do_hilbert_transform) { | |
| if (input && (sample_rate > 6000.0) && (n_input > (sample_rate * 0.05)) && | |
| (min_f0_search < max_f0_search) && (min_f0_search > 0.0)) { | |
| CleanUp(); | |
| min_f0_search_ = min_f0_search; | |
| max_f0_search_ = max_f0_search; | |
| sample_rate_ = sample_rate; | |
| int16_t* input_p = const_cast<int16_t *>(input); | |
| if (do_highpass) { | |
| input_p = HighpassFilter(input_p, n_input, sample_rate, | |
| corner_frequency_, filter_duration_); | |
| } | |
| signal_.resize(n_input); | |
| if (do_hilbert_transform) { | |
| HilbertTransform(input_p, n_input, &(signal_.front())); | |
| } else { | |
| for (int32_t i = 0; i < n_input; ++i) { | |
| signal_[i] = input_p[i]; | |
| } | |
| } | |
| if (input_p != input) { | |
| delete [] input_p; | |
| } | |
| return true; | |
| } | |
| return false; | |
| } | |
| void EpochTracker::HilbertTransform(int16_t* input, int32_t n_input, | |
| float* output) { | |
| FFT ft = FFT(FFT::fft_pow2_from_window_size(n_input)); | |
| int32_t n_fft = ft.get_fftSize(); | |
| float* re = new float[n_fft]; | |
| float* im = new float[n_fft]; | |
| for (int i = 0; i < n_input; ++i) { | |
| re[i] = input[i]; | |
| im[i] = 0.0; | |
| } | |
| for (int i = n_input; i < n_fft; ++i) { | |
| re[i] = 0.0; | |
| im[i] = 0.0; | |
| } | |
| ft.fft(re, im); | |
| for (int i = 1; i < n_fft/2; ++i) { | |
| float tmp = im[i]; | |
| im[i] = -re[i]; | |
| re[i] = tmp; | |
| } | |
| re[0] = im[0] = 0.0; | |
| for (int i = n_fft/2 + 1; i < n_fft; ++i) { | |
| float tmp = im[i]; | |
| im[i] = re[i]; | |
| re[i] = -tmp; | |
| } | |
| ft.ifft(re, im); | |
| for (int i = 0; i < n_input; ++i) { | |
| output[i] = re[i] / n_fft; | |
| } | |
| delete [] re; | |
| delete [] im; | |
| } | |
| int16_t* EpochTracker::HighpassFilter(int16_t* input, int32_t n_input, | |
| float sample_rate, float corner_freq, | |
| float fir_duration) { | |
| FdFilter filter(sample_rate, corner_freq, true, fir_duration, false); | |
| int16_t* filtered_data = new int16_t[n_input]; | |
| int32_t max_buffer_size = filter.GetMaxInputSize(); | |
| int32_t to_process = n_input; | |
| bool start = true; | |
| bool end = false; | |
| int32_t input_index = 0; | |
| int32_t output_index = 0; | |
| while (to_process > 0) { | |
| int32_t to_send = to_process; | |
| if (to_send > max_buffer_size) { | |
| to_send = max_buffer_size; | |
| } else { | |
| end = true; | |
| } | |
| int32_t samples_returned = filter.FilterArray(input + input_index, to_send, | |
| start, end, | |
| filtered_data + output_index, | |
| n_input - output_index); | |
| input_index += to_send; | |
| to_process -= to_send; | |
| output_index += samples_returned; | |
| start = false; | |
| } | |
| return filtered_data; | |
| } | |
// Returns the sum of lpc[0..order] (order + 1 coefficients) when that sum is
// strictly positive; otherwise returns 1.0 so callers can divide by the
// result safely.
static float LpcDcGain(float* lpc, int32_t order) {
  float total = 0.0;
  int32_t k = 0;
  while (k <= order) {
    total += lpc[k];
    ++k;
  }
  return (total > 0.0) ? total : 1.0f;
}
// For each of the `size` elements, stores in deltas[] the per-step increment
// that moves now[] to next[] in n_steps equal steps:
//   deltas[k] = (next[k] - now[k]) / n_steps.
static void MakeDeltas(float* now, float* next, int32_t size, int32_t n_steps,
                       float* deltas) {
  for (int32_t k = 0; k < size; ++k) {
    const float span = next[k] - now[k];
    deltas[k] = span / n_steps;
  }
}
| bool EpochTracker::GetLpcResidual(const std::vector<float>& input, float sample_rate, | |
| std::vector<float>* output) { | |
| int32_t n_input = input.size(); | |
| if (!((n_input > 0) && (sample_rate > 0.0) && output)) { | |
| return false; | |
| } | |
| output->resize(n_input); | |
| int32_t frame_step = RoundUp(sample_rate * lpc_frame_interval_); | |
| int32_t frame_size = RoundUp(sample_rate * frame_duration_); | |
| int32_t n_frames = 1 + ((n_input - frame_size) / frame_step); | |
| int32_t n_analyzed = ((n_frames - 1) * frame_step) + frame_size; | |
| // Must have one more than frame size to do a complete frame. | |
| if (n_analyzed <= n_input) { | |
| n_frames--; | |
| if (n_frames <= 0) { | |
| return false; | |
| } | |
| } | |
| LpcAnalyzer lp; | |
| int32_t order = lp.GetLpcOrder(sample_rate); | |
| float* lpc = new float[order + 1]; | |
| float* old_lpc = new float[order + 1]; | |
| float* delta_lpc = new float[order + 1]; | |
| float norm_error = 0.0; | |
| float preemp_rms = 0.0; | |
| if (!lp.ComputeLpc(order, noise_floor_, frame_size, &(input.front()), | |
| old_lpc, NULL, NULL, &norm_error, &preemp_rms, | |
| preemphasis_)) { | |
| RELEASE_MEMORY(); | |
| return false; | |
| } | |
| for (int32_t i = 0; i <= order; ++i) { | |
| delta_lpc[i] = 0.0; | |
| (*output)[i] = 0.0; | |
| } | |
| float old_gain = LpcDcGain(old_lpc, order); | |
| float new_gain = 1.0; | |
| int32_t n_to_filter = (frame_size / 2) - order; // How many samples | |
| // to process before | |
| // computing the next | |
| // LPC frame. | |
| int32_t input_p = 0; // Where to get the next frame for LPC analysis | |
| int32_t output_p = order; // where to store output samples. | |
| int32_t proc_p = 0; // Where to pick up samples for input to the filter | |
| // Main processing loop: | |
| // Compute a new frame of LPC | |
| // Compute the DC gain for the new LPC | |
| // Compute delta DC gain. | |
| // Compute the LPC deltas. | |
| // For each point in the frame: | |
| // Use old_lpc to produce an output point. | |
| // Update the old LPCs and the DC gain | |
| // As soon as the center of the current frame is reached, compute | |
| // the LPC for the next frame. | |
| for ( ; n_frames > 0; --n_frames, input_p += frame_step, | |
| n_to_filter = frame_step) { | |
| if (!lp.ComputeLpc(order, noise_floor_, frame_size, | |
| (&(input.front())) + input_p, lpc, NULL, NULL, | |
| &norm_error, &preemp_rms, preemphasis_)) { | |
| RELEASE_MEMORY(); | |
| return false; | |
| } | |
| new_gain = LpcDcGain(lpc, order); | |
| float delta_gain = (new_gain - old_gain) / n_to_filter; | |
| MakeDeltas(old_lpc, lpc, order+1, n_to_filter, delta_lpc); | |
| for (int32_t sample = 0; sample < n_to_filter; ++sample, ++proc_p, | |
| ++output_p) { | |
| float sum = 0.0; | |
| int32_t mem = proc_p; | |
| for (int32_t k = order; k > 0; --k, ++mem) { | |
| sum += (old_lpc[k] * input[mem]); | |
| old_lpc[k] += delta_lpc[k]; | |
| } | |
| sum += input[mem]; // lpc[0] is always 1.0 | |
| (*output)[output_p] = sum / old_gain; | |
| old_gain += delta_gain; | |
| } | |
| } | |
| RELEASE_MEMORY(); | |
| return true; | |
| } | |
| // Note that GetResidualPulses assumes the LPC residual is in the | |
| // "correct" polarity, with the GCI pulses of interest being negative | |
| // pulses with a gradual fall and an abrupt rise. | |
| void EpochTracker::GetResidualPulses(void) { | |
| int32_t peak_ind = RoundUp(peak_delay_ * sample_rate_); | |
| int32_t skew_ind = RoundUp(skew_delay_ * sample_rate_); | |
| float min_peak = -1.0; // minimum value that will be considered as a peak | |
| int32_t limit = norm_residual_.size() - peak_ind; | |
| resid_peaks_.resize(0); | |
| peaks_debug_.resize(residual_.size()); | |
| for (size_t i = 0; i < peaks_debug_.size(); ++i) { | |
| peaks_debug_[i] = 0.0; | |
| } | |
| for (int32_t i = peak_ind; i < limit; ++i) { | |
| float val = norm_residual_[i]; | |
| if (val > min_peak) { | |
| continue; | |
| } | |
| if ((norm_residual_[i-1] > val) && (val <= norm_residual_[i+1])) { | |
| float vm_peak = norm_residual_[i - peak_ind]; | |
| float vp_peak = norm_residual_[i + peak_ind]; | |
| if ((vm_peak < val) || (vp_peak < val)) { | |
| continue; | |
| } | |
| float vm_skew = norm_residual_[i - skew_ind]; | |
| float vp_skew = norm_residual_[i + skew_ind]; | |
| float sharp = (0.5 * (vp_peak + vm_peak)) - val; | |
| float skew = -(vm_skew - vp_skew); | |
| ResidPeak p; | |
| p.resid_index = i; | |
| float time = static_cast<float>(i) / sample_rate_; | |
| p.frame_index = RoundUp(time / internal_frame_interval_); | |
| if (p.frame_index >= n_feature_frames_) { | |
| p.frame_index = n_feature_frames_ - 1; | |
| } | |
| p.peak_quality = (-val * peak_val_wt_) + (skew * peak_skew_wt_) + | |
| (sharp * peak_prominence_wt_); | |
| if (p.peak_quality < peak_quality_floor_) { | |
| p.peak_quality = peak_quality_floor_; | |
| } | |
| resid_peaks_.push_back(p); | |
| peaks_debug_[i] = p.peak_quality; | |
| } | |
| } | |
| } | |
| void EpochTracker::GetVoiceTransitionFeatures(void) { | |
| int32_t frame_offset = RoundUp(0.5 * time_span_ / internal_frame_interval_); | |
| if (frame_offset <= 0) { | |
| frame_offset = 1; | |
| } | |
| voice_onset_prob_.resize(n_feature_frames_); | |
| voice_offset_prob_.resize(n_feature_frames_); | |
| int32_t limit = n_feature_frames_ - frame_offset; | |
| for (int32_t frame = frame_offset; frame < limit; ++frame) { | |
| float delta_rms = (bandpassed_rms_[frame + frame_offset] - | |
| bandpassed_rms_[frame - frame_offset]) / level_change_den_; | |
| if (delta_rms > 1.0) { | |
| delta_rms = 1.0; | |
| } else { | |
| if (delta_rms < -1.0) { | |
| delta_rms = -1.0; | |
| } | |
| } | |
| float prob_onset = delta_rms; | |
| float prob_offset = -prob_onset; | |
| if (prob_onset > 1.0) { | |
| prob_onset = 1.0; | |
| } else { | |
| if (prob_onset < 0.0) { | |
| prob_onset = 0.0; | |
| } | |
| } | |
| if (prob_offset > 1.0) { | |
| prob_offset = 1.0; | |
| } else { | |
| if (prob_offset < 0.0) { | |
| prob_offset = 0.0; | |
| } | |
| } | |
| voice_onset_prob_[frame] = prob_onset; | |
| voice_offset_prob_[frame] = prob_offset; | |
| } | |
| // Just set the onset and offset probs to zero in the end zones. | |
| for (int32_t frame = 0; frame < frame_offset; ++frame) { | |
| int32_t bframe = n_feature_frames_ - 1 - frame; | |
| voice_onset_prob_[frame] = voice_offset_prob_[frame] = 0.0; | |
| voice_onset_prob_[bframe] = voice_offset_prob_[bframe] = 0.0; | |
| } | |
| } | |
| void EpochTracker::GetRmsVoicingModulator(void) { | |
| float min_val = bandpassed_rms_[0]; | |
| float max_val = min_val; | |
| prob_voiced_.resize(bandpassed_rms_.size()); | |
| // Find the max and min over the whole RMS array. Scale and offset | |
| // the RMS values to all fall in th range of 0.0 to 1.0. | |
| for (size_t i = 1; i < bandpassed_rms_.size(); ++i) { | |
| float val = bandpassed_rms_[i]; | |
| if (val < min_val) { | |
| min_val = val; | |
| } else { | |
| if (val > max_val) { | |
| max_val = val; | |
| } | |
| } | |
| } | |
| if (min_val < min_rms_db_) { | |
| min_val = min_rms_db_; | |
| } | |
| float range = max_val - min_val; | |
| if (range < 1.0) { | |
| range = 1.0; | |
| } | |
| for (size_t i = 0; i < bandpassed_rms_.size(); ++i) { | |
| prob_voiced_[i] = (bandpassed_rms_[i] - min_val) / range; | |
| if (prob_voiced_[i] < 0.0) { | |
| prob_voiced_[i] = 0.0; | |
| } | |
| } | |
| } | |
| int32_t EpochTracker::FindNccfPeaks(const std::vector<float>& input, float thresh, | |
| std::vector<int16_t>* output) { | |
| int32_t limit = input.size() - 1; | |
| uint32_t n_peaks = 0; | |
| float max_val = 0.0; | |
| int16_t max_index = 1; | |
| int16_t max_out_index = 0; | |
| output->resize(0); | |
| for (int16_t i = 1; i < limit; ++i) { | |
| float val = input[i]; | |
| if ((val > thresh) && (val > input[i-1]) && (val >= input[i+1])) { | |
| if (val > max_val) { | |
| max_val = val; | |
| max_out_index = n_peaks; | |
| max_index = i; | |
| } | |
| n_peaks++; | |
| output->push_back(i); | |
| } | |
| } | |
| // Be sure the highest peak is the first one in the array. | |
| if ((n_peaks > 1) && (max_out_index > 0)) { | |
| int16_t hold = (*output)[0]; | |
| (*output)[0] = (*output)[max_out_index]; | |
| (*output)[max_out_index] = hold; | |
| } else { | |
| if (n_peaks <= 0) { | |
| n_peaks = 1; | |
| output->push_back(max_index); | |
| } | |
| } | |
| return n_peaks; | |
| } | |
| void EpochTracker::CrossCorrelation(const std::vector<float>& data, int32_t start, | |
| int32_t first_lag, int32_t n_lags, | |
| int32_t size, std::vector<float>* corr) { | |
| const float* input = (&(data.front())) + start; | |
| corr->resize(n_lags); | |
| float energy = 0.0; // Zero-lag energy part of the normalizer. | |
| for (int32_t i = 0; i < size; ++i) { | |
| energy += input[i] * input[i]; | |
| } | |
| if (energy == 0.0) { // Bail out if no energy is found. | |
| for (int32_t i = 0; i < n_lags; ++i) { | |
| (*corr)[i] = 0.0; | |
| } | |
| return; | |
| } | |
| int32_t limit = first_lag + size; | |
| double lag_energy = 0.0; // Energy at the period hypothesis lag. | |
| for (int32_t i = first_lag; i < limit; ++i) { | |
| lag_energy += input[i] * input[i]; | |
| } | |
| int32_t last_lag = first_lag + n_lags; | |
| int32_t oind = 0; // Index for storing output values. | |
| for (int32_t lag = first_lag; lag < last_lag; ++lag, ++oind) { | |
| float sum = 0.0; | |
| int32_t lag_ind = lag; | |
| for (int32_t i = 0; i < size; ++i, ++lag_ind) { | |
| sum += input[i] * input[lag_ind]; | |
| } | |
| if (lag_energy <= 0.0) | |
| lag_energy = 1.0; | |
| (*corr)[oind] = sum / sqrt(energy * lag_energy); | |
| lag_energy -= input[lag] * input[lag]; // Discard old sample. | |
| lag_energy += input[lag_ind] * input[lag_ind]; // Pick up the new sample. | |
| } | |
| return; | |
| } | |
| void EpochTracker::GetPulseCorrelations(float window_dur, float peak_thresh) { | |
| first_nccf_lag_ = RoundUp(sample_rate_ / max_f0_search_); | |
| int32_t max_lag = RoundUp(sample_rate_ / min_f0_search_); | |
| n_nccf_lags_ = max_lag - first_nccf_lag_; | |
| int32_t window_size = RoundUp(window_dur * sample_rate_); | |
| int32_t half_wind = window_size / 2; | |
| int32_t frame_size = window_size + max_lag; | |
| std::vector<float> mixture; | |
| mixture.resize(residual_.size()); | |
| const float kMinCorrelationStep = 0.001; // Pulse separation | |
| // before computing new | |
| // correlation values. | |
| const float kResidFract = 0.7; // Fraction of the residual to use. | |
| const float kPcmFract = 1.0 - kResidFract; // Fraction of the input to use. | |
| for (size_t i = 0; i < residual_.size(); ++i) { | |
| mixture[i] = (kResidFract * residual_[i]) + (kPcmFract * signal_[i]); | |
| } | |
| int32_t min_step = RoundUp(sample_rate_ * kMinCorrelationStep); | |
| int32_t old_start = - (2.0 * min_step); | |
| for (size_t peak = 0; peak < resid_peaks_.size(); ++peak) { | |
| int32_t start = resid_peaks_[peak].resid_index - half_wind; | |
| if (start < 0) { | |
| start = 0; | |
| } | |
| size_t end = start + frame_size; | |
| if ((end >= mixture.size()) || ((start - old_start) < min_step)) { | |
| resid_peaks_[peak].nccf = resid_peaks_[peak - 1].nccf; | |
| resid_peaks_[peak].nccf_periods = resid_peaks_[peak - 1].nccf_periods; | |
| } else { | |
| CrossCorrelation(mixture, start, first_nccf_lag_, n_nccf_lags_, | |
| window_size, &(resid_peaks_[peak].nccf)); | |
| FindNccfPeaks(resid_peaks_[peak].nccf, peak_thresh, | |
| &(resid_peaks_[peak].nccf_periods)); | |
| // Turn the peak indices from FindNccfPeaks into NCCF period hyps. | |
| for (size_t i = 0; i < resid_peaks_[peak].nccf_periods.size(); ++i) { | |
| resid_peaks_[peak].nccf_periods[i] += first_nccf_lag_; | |
| } | |
| old_start = start; | |
| } | |
| } | |
| } | |
| void EpochTracker::Window(const std::vector<float>& input, int32_t offset, size_t size, | |
| float* output) { | |
| if (size != window_.size()) { | |
| window_.resize(size); | |
| float arg = 2.0 * M_PI / size; | |
| for (size_t i = 0; i < size; ++i) { | |
| window_[i] = 0.5 - (0.5 * cos((i + 0.5) * arg)); | |
| } | |
| } | |
| const float* data = (&(input.front())) + offset; | |
| for (size_t i = 0; i < size; ++i) { | |
| output[i] = data[i] * window_[i]; | |
| } | |
| } | |
| bool EpochTracker::GetBandpassedRmsSignal(const std::vector<float>& input, | |
| float sample_rate, | |
| float low_limit, float high_limit, | |
| float frame_interval, | |
| float frame_dur, | |
| std::vector<float>* output_rms) { | |
| size_t frame_step = RoundUp(sample_rate * frame_interval); | |
| size_t frame_size = RoundUp(sample_rate * frame_dur); | |
| size_t n_frames = 1 + ((input.size() - frame_size) / frame_step); | |
| if (n_frames < 2) { | |
| fprintf(stderr, "input too small (%d) in GetBandpassedRmsSignal\n", | |
| static_cast<int>(input.size())); | |
| output_rms->resize(0); | |
| return false; | |
| } | |
| output_rms->resize(n_frames); | |
| FFT ft(FFT::fft_pow2_from_window_size(frame_size)); | |
| int32_t fft_size = ft.get_fftSize(); | |
| int32_t first_bin = RoundUp(fft_size * low_limit / sample_rate); | |
| int32_t last_bin = RoundUp(fft_size * high_limit / sample_rate); | |
| float* re = new float[fft_size]; | |
| float* im = new float[fft_size]; | |
| size_t first_frame = frame_size / (2 * frame_step); | |
| if ((first_frame * 2 * frame_step) < frame_size) { | |
| first_frame++; | |
| } | |
| for (size_t frame = first_frame; frame < n_frames; ++frame) { | |
| Window(input, (frame - first_frame) * frame_step, frame_size, re); | |
| for (size_t i = 0; i < frame_size; ++i) { | |
| im[i] = 0.0; | |
| } | |
| for (int32_t i = frame_size; i < fft_size; ++i) { | |
| re[i] = im[i] = 0.0; | |
| } | |
| ft.fft(re, im); | |
| float rms = 20.0 * | |
| log10(1.0 + ft.get_band_rms(re, im, first_bin, last_bin)); | |
| (*output_rms)[frame] = rms; | |
| if (frame == first_frame) { | |
| for (size_t bframe = 0; bframe < first_frame; ++bframe) { | |
| (*output_rms)[bframe] = rms; | |
| } | |
| } | |
| } | |
| delete [] re; | |
| delete [] im; | |
| return true; | |
| } | |
| void EpochTracker::GetSymmetryStats(const std::vector<float>& data, float* pos_rms, | |
| float* neg_rms, float* mean) { | |
| int32_t n_input = data.size(); | |
| double p_sum = 0.0; | |
| double n_sum = 0.0; | |
| double sum = 0.0; | |
| int32_t n_p = 0; | |
| int32_t n_n = 0; | |
| for (int32_t i = 0; i < n_input; ++i) { | |
| sum += data[i]; | |
| } | |
| *mean = sum / n_input; | |
| for (int32_t i = 0; i < n_input; ++i) { | |
| double val = data[i] - *mean; | |
| if (val > 0.0) { | |
| p_sum += (val * val); | |
| n_p++; | |
| } else { | |
| if (val < 0.0) { | |
| n_sum += (val * val); | |
| n_n++; | |
| } | |
| } | |
| } | |
| *pos_rms = sqrt(p_sum / n_p); | |
| *neg_rms = sqrt(n_sum / n_n); | |
| } | |
| void EpochTracker::NormalizeAmplitude(const std::vector<float>& input, | |
| float sample_rate, | |
| std::vector<float>* output) { | |
| int32_t n_input = input.size(); | |
| int32_t ref_size = RoundUp(sample_rate * ref_dur_); | |
| std::vector<float> wind; | |
| output->resize(n_input); | |
| // Just calling Window here to create a Hann window in window_. | |
| Window(input, 0, ref_size, &(output->front())); | |
| int32_t ref_by_2 = ref_size / 2; | |
| int32_t frame_step = RoundUp(sample_rate * internal_frame_interval_); | |
| int32_t limit = n_input - ref_size; | |
| int32_t frame_limit = ref_by_2; | |
| int32_t data_p = 0; | |
| int32_t frame_p = 0; | |
| double old_inv_rms = 0.0; | |
| while (frame_p < limit) { | |
| double ref_energy = 1.0; // to prevent divz | |
| for (int32_t i = 0; i < ref_size; ++i) { | |
| double val = window_[i] * input[i + frame_p]; | |
| ref_energy += (val * val); | |
| } | |
| double inv_rms = sqrt(static_cast<double>(ref_size) / ref_energy); | |
| double delta_inv_rms = 0.0; | |
| if (frame_p > 0) { | |
| delta_inv_rms = (inv_rms - old_inv_rms) / frame_step; | |
| } else { | |
| old_inv_rms = inv_rms; | |
| } | |
| for (int i = 0; i < frame_limit; ++i, ++data_p) { | |
| (*output)[data_p] = input[data_p] * old_inv_rms; | |
| old_inv_rms += delta_inv_rms; | |
| } | |
| frame_limit = frame_step; | |
| frame_p += frame_step; | |
| } | |
| for ( ; data_p < n_input; ++data_p) { | |
| (*output)[data_p] = input[data_p] * old_inv_rms; | |
| } | |
| } | |
| bool EpochTracker::ComputePolarity(int *polarity) { | |
| if (sample_rate_ <= 0.0) { | |
| fprintf(stderr, "EpochTracker not initialized in ComputeFeatures\n"); | |
| return false; | |
| } | |
| if (!GetBandpassedRmsSignal(signal_, sample_rate_, min_freq_for_rms_, | |
| max_freq_for_rms_, internal_frame_interval_, | |
| rms_window_dur_, &bandpassed_rms_)) { | |
| fprintf(stderr, "Failure in GetBandpassedRmsSignal\n"); | |
| return false; | |
| } | |
| if (!GetLpcResidual(signal_, sample_rate_, &residual_)) { | |
| fprintf(stderr, "Failure in GetLpcResidual\n"); | |
| return false; | |
| } | |
| float mean = 0.0; | |
| GetSymmetryStats(residual_, &positive_rms_, &negative_rms_, &mean); | |
| *polarity = -1; | |
| if (positive_rms_ > negative_rms_) { | |
| *polarity = 1; | |
| } | |
| return true; | |
| } | |
| bool EpochTracker::ComputeFeatures(void) { | |
| if (sample_rate_ <= 0.0) { | |
| fprintf(stderr, "EpochTracker not initialized in ComputeFeatures\n"); | |
| return false; | |
| } | |
| if (!GetBandpassedRmsSignal(signal_, sample_rate_, min_freq_for_rms_, | |
| max_freq_for_rms_, internal_frame_interval_, | |
| rms_window_dur_, &bandpassed_rms_)) { | |
| fprintf(stderr, "Failure in GetBandpassedRmsSignal\n"); | |
| return false; | |
| } | |
| if (!GetLpcResidual(signal_, sample_rate_, &residual_)) { | |
| fprintf(stderr, "Failure in GetLpcResidual\n"); | |
| return false; | |
| } | |
| n_feature_frames_ = bandpassed_rms_.size(); | |
| float mean = 0.0; | |
| GetSymmetryStats(residual_, &positive_rms_, &negative_rms_, &mean); | |
| fprintf(stdout, "Residual symmetry: P:%f N:%f MEAN:%f\n", | |
| positive_rms_, negative_rms_, mean); | |
| if (positive_rms_ > negative_rms_) { | |
| fprintf(stdout, "Inverting signal\n"); | |
| for (size_t i = 0; i < residual_.size(); ++i) { | |
| residual_[i] = -residual_[i]; | |
| signal_[i] = -signal_[i]; | |
| } | |
| } | |
| NormalizeAmplitude(residual_, sample_rate_, &norm_residual_); | |
| GetResidualPulses(); | |
| GetPulseCorrelations(correlation_dur_, correlation_thresh_); | |
| GetVoiceTransitionFeatures(); | |
| GetRmsVoicingModulator(); | |
| return true; | |
| } | |
| bool EpochTracker::TrackEpochs(void) { | |
| CreatePeriodLattice(); | |
| DoDynamicProgramming(); | |
| return BacktrackAndSaveOutput(); | |
| } | |
| void EpochTracker::CreatePeriodLattice(void) { | |
| int32_t low_period = RoundUp(sample_rate_ / max_f0_search_); | |
| int32_t high_period = RoundUp(sample_rate_ / min_f0_search_); | |
| int32_t total_cands = 0; | |
| // For each pulse in the normalized residual... | |
| for (size_t peak = 0; peak < resid_peaks_.size(); ++peak) { | |
| size_t frame_index = resid_peaks_[peak].frame_index; | |
| size_t resid_index = resid_peaks_[peak].resid_index; | |
| int32_t min_period = resid_index + low_period; | |
| int32_t max_period = resid_index + high_period; | |
| float lowest_cost = 1.0e30; | |
| float time = resid_index / sample_rate_; | |
| int32_t best_nccf_period = resid_peaks_[peak].nccf_periods[0]; | |
| float best_cc_val = | |
| resid_peaks_[peak].nccf[best_nccf_period - first_nccf_lag_]; | |
| best_corr_.push_back(time); | |
| best_corr_.push_back(best_cc_val); | |
| EpochCand* uv_cand = new EpochCand; // pre-allocate an unvoiced candidate. | |
| uv_cand->voiced = false; | |
| uv_cand->start_peak = peak; | |
| uv_cand->cost_sum = 0.0; | |
| uv_cand->local_cost = 0.0; | |
| uv_cand->best_prev_cand = -1; | |
| int32_t next_cands_created = 0; | |
| // For each of the next residual pulses in the search range... | |
| for (size_t npeak = peak + 1; npeak < resid_peaks_.size(); ++npeak) { | |
| int32_t iperiod = resid_peaks_[npeak].resid_index - resid_index; | |
| if (resid_peaks_[npeak].resid_index >= min_period) { | |
| float fperiod = iperiod; | |
| // Find the NCCF period that most closely matches. | |
| int32_t cc_peak = 0; | |
| float min_period_diff = fabs(log(fperiod / best_nccf_period)); | |
| for (size_t cc_peak_ind = 1; | |
| cc_peak_ind < resid_peaks_[peak].nccf_periods.size(); | |
| ++cc_peak_ind) { | |
| int32_t nccf_period = resid_peaks_[peak].nccf_periods[cc_peak_ind]; | |
| float test_diff = fabs(log(fperiod / nccf_period)); | |
| if (test_diff < min_period_diff) { | |
| min_period_diff = test_diff; | |
| cc_peak = cc_peak_ind; | |
| } | |
| } | |
| // Generate a forward-period candidate. Grade the candidate | |
| // on closeness to a NCCF period hyp, value of the NCCF, | |
| // values of the candidate endpoint peaks. | |
| EpochCand* v_cand = new EpochCand; | |
| v_cand->voiced = true; | |
| v_cand->period = iperiod; | |
| int32_t cc_index = iperiod - first_nccf_lag_; | |
| float cc_value = 0.0; | |
| // If this period is in the normal search range, retrieve the | |
| // actual NCCF value for that lag. | |
| if ((cc_index >= 0) && (cc_index < n_nccf_lags_)) { | |
| cc_value = resid_peaks_[peak].nccf[cc_index]; | |
| } else { // punt and use the "closest" nccf peak | |
| int32_t peak_cc_index = resid_peaks_[peak].nccf_periods[cc_peak] - | |
| first_nccf_lag_; | |
| cc_value = resid_peaks_[peak].nccf[peak_cc_index]; | |
| } | |
| float per_dev_cost = period_deviation_wt_ * min_period_diff; | |
| float level_cost = level_wt_ * (1.0 - prob_voiced_[frame_index]); | |
| float period_cost = fperiod * period_wt_; | |
| float peak_qual_cost = peak_quality_wt_ / | |
| (resid_peaks_[npeak].peak_quality + resid_peaks_[peak].peak_quality); | |
| float local_cost = (1.0 - cc_value) + per_dev_cost + peak_qual_cost + | |
| level_cost + period_cost + reward_; | |
| v_cand->local_cost = local_cost; | |
| if (local_cost < lowest_cost) { | |
| lowest_cost = local_cost; | |
| // Evaluate this best voiced period as an unvoiced | |
| // hypothesis. (There are always plenty of poor | |
| // voiced candidates!) | |
| uv_cand->period = iperiod; | |
| level_cost = level_wt_ * prob_voiced_[frame_index]; | |
| uv_cand->local_cost = (nccf_uv_peak_wt_ * cc_value) + | |
| level_cost + unvoiced_cost_ + reward_; | |
| uv_cand->end_peak = npeak; | |
| uv_cand->closest_nccf_period = | |
| resid_peaks_[peak].nccf_periods[cc_peak]; | |
| } | |
| v_cand->start_peak = peak; | |
| v_cand->end_peak = npeak; | |
| v_cand->closest_nccf_period = resid_peaks_[peak].nccf_periods[cc_peak]; | |
| v_cand->cost_sum = 0.0; | |
| v_cand->best_prev_cand = -1; | |
| resid_peaks_[peak].future.push_back(v_cand); | |
| resid_peaks_[npeak].past.push_back(v_cand); | |
| total_cands++; | |
| next_cands_created++; | |
| if (resid_peaks_[npeak].resid_index >= max_period) { | |
| break; // Exit the search only after at least one peak has | |
| // been found, even if it is necessary to go beyond | |
| // the nominal maximum period. | |
| } | |
| } // end if this period is >= minimum search period. | |
| } // end for each next pulse in the global period-search range. | |
| // Install the unvoiced candidate for this pulse. | |
| if (next_cands_created) { // Register the unvoiced hyp iff there | |
| // was at least one voiced hyp. | |
| resid_peaks_[peak].future.push_back(uv_cand); | |
| resid_peaks_[uv_cand->end_peak].past.push_back(uv_cand); | |
| total_cands++; | |
| } else { | |
| delete uv_cand; | |
| } | |
| // Now all residual-pulse period hyps that start at the current | |
| // pulse have been generated. | |
| // If this pulse is one of the first few in the residual that had | |
| // no possible preceeding periods, mark it as an origin. | |
| if (resid_peaks_[peak].past.size() == 0) { // Is this pulse an origin? | |
| for (size_t pp = 0; pp < resid_peaks_[peak].future.size(); ++pp) { | |
| resid_peaks_[peak].future[pp]->cost_sum = | |
| resid_peaks_[peak].future[pp]->local_cost; | |
| resid_peaks_[peak].future[pp]->best_prev_cand = -1; | |
| } | |
| } else { // There are previous period hyps to consider... | |
| // Check if at least one UV hyp is included in the period hyps | |
| // that end on this peak. If there are none, generate one by | |
| // cloning the best voiced hyp in the collection, but score it | |
| // as unvoiced. | |
| int32_t uv_hyps_found = 0; | |
| float lowest_cost = resid_peaks_[peak].past[0]->local_cost; | |
| size_t lowest_index = 0; | |
| for (size_t pcand = 0; pcand < resid_peaks_[peak].past.size(); ++pcand) { | |
| if (!resid_peaks_[peak].past[pcand]->voiced) { | |
| uv_hyps_found++; | |
| } else { | |
| if (resid_peaks_[peak].past[pcand]->local_cost < lowest_cost) { | |
| lowest_index = pcand; | |
| lowest_cost = resid_peaks_[peak].past[pcand]->local_cost; | |
| } | |
| } | |
| } | |
| if (!uv_hyps_found) { // clone an UV hyp from the best V hyp found. | |
| size_t start_peak = resid_peaks_[peak].past[lowest_index]->start_peak; | |
| EpochCand* uv_cand = new EpochCand; | |
| uv_cand->voiced = false; | |
| uv_cand->start_peak = start_peak; | |
| uv_cand->end_peak = peak; | |
| uv_cand->period = resid_peaks_[peak].past[lowest_index]->period; | |
| uv_cand->closest_nccf_period = | |
| resid_peaks_[peak].past[lowest_index]->closest_nccf_period; | |
| uv_cand->cost_sum = 0.0; | |
| uv_cand->local_cost = 0.0; | |
| uv_cand->best_prev_cand = -1; | |
| float llevel_cost = level_wt_ * | |
| prob_voiced_[resid_peaks_[start_peak].frame_index]; | |
| int32_t lcc_index = uv_cand->period - first_nccf_lag_; | |
| float lcc_value = 0.0; | |
| // If this period is in the normal search range, retrieve the | |
| // actual NCCF value for that lag. | |
| if ((lcc_index >= 0) && (lcc_index < n_nccf_lags_)) { | |
| lcc_value = resid_peaks_[start_peak].nccf[lcc_index]; | |
| } else { | |
| int32_t peak_cc_index = uv_cand->closest_nccf_period - first_nccf_lag_; | |
| lcc_value = resid_peaks_[start_peak].nccf[peak_cc_index]; | |
| } | |
| uv_cand->local_cost = (nccf_uv_peak_wt_ * lcc_value) + llevel_cost + | |
| unvoiced_cost_ + reward_; | |
| resid_peaks_[start_peak].future.push_back(uv_cand); | |
| resid_peaks_[peak].past.push_back(uv_cand); | |
| total_cands++; | |
| } | |
| } | |
| } // end of the first pass at all pulses in the residual. | |
| // All forward period hypotheses that start on all residual pulses | |
| // in the signal have now been generated, and both voiced and | |
| // unvoiced continuity throughout the lattice of hyps have been | |
| // assured. | |
| } | |
| void EpochTracker::DoDynamicProgramming(void) { | |
| // Perform the dynamic programming iterations over all pulses in | |
| // the residual. | |
| // For each pulse in the residual.... | |
| for (size_t peak = 0; peak < resid_peaks_.size(); ++peak) { | |
| if (resid_peaks_[peak].past.size() == 0) { // Is this peak an origin? | |
| continue; | |
| } | |
| // For each forward period hypothesis starting at this pulse... | |
| for (size_t fhyp = 0; fhyp < resid_peaks_[peak].future.size(); ++fhyp) { | |
| float min_cost = 1.0e30; // huge | |
| size_t min_index = 0; | |
| float forward_period = resid_peaks_[peak].future[fhyp]->period; | |
| // For each of the previous period hyps ending on this pulse... | |
| for (size_t phyp = 0; phyp < resid_peaks_[peak].past.size(); ++phyp) { | |
| float sum_cost = 0.0; | |
| // There are 4 voicing hyps to consider: V->V V->UV UV->V UV->UV | |
| if (resid_peaks_[peak].future[fhyp]->voiced && | |
| resid_peaks_[peak].past[phyp]->voiced) { // v->v | |
| float f_trans_cost = freq_trans_wt_ * | |
| fabs(log(forward_period / resid_peaks_[peak].past[phyp]->period)); | |
| sum_cost = f_trans_cost + resid_peaks_[peak].past[phyp]->cost_sum; | |
| } else { | |
| if (resid_peaks_[peak].future[fhyp]->voiced && | |
| !resid_peaks_[peak].past[phyp]->voiced) { // uv->v | |
| float v_transition_cost = voice_transition_factor_ * | |
| (1.0 - voice_onset_prob_[resid_peaks_[peak].frame_index]); | |
| sum_cost = resid_peaks_[peak].past[phyp]->cost_sum + | |
| v_transition_cost; | |
| } else { | |
| if ((!resid_peaks_[peak].future[fhyp]->voiced) && | |
| resid_peaks_[peak].past[phyp]->voiced) { // v->uv | |
| float v_transition_cost = voice_transition_factor_ * | |
| (1.0 - voice_offset_prob_[resid_peaks_[peak].frame_index]); | |
| sum_cost = resid_peaks_[peak].past[phyp]->cost_sum + | |
| v_transition_cost; | |
| } else { // UV->UV | |
| sum_cost = resid_peaks_[peak].past[phyp]->cost_sum; | |
| } | |
| } | |
| } | |
| if (sum_cost < min_cost) { | |
| min_cost = sum_cost; | |
| min_index = phyp; | |
| } | |
| } // end for each previous period hyp | |
| resid_peaks_[peak].future[fhyp]->cost_sum = | |
| resid_peaks_[peak].future[fhyp]->local_cost + min_cost; | |
| resid_peaks_[peak].future[fhyp]->best_prev_cand = min_index; | |
| } // end for each foreward period hyp | |
| } // end for each pulse in the residual signal. | |
| // Here ends the dynamic programming. | |
| } | |
| bool EpochTracker::BacktrackAndSaveOutput(void) { | |
| if (resid_peaks_.size() == 0) { | |
| fprintf(stderr, "Can't backtrack with no residual peaks\n"); | |
| return false; | |
| } | |
| // Now find the best period hypothesis at the end of the signal, | |
| // and backtrack from there. | |
| float min_cost = 1.0e30; | |
| int32_t min_index = 0; | |
| // First, find a terminal peak which is the end of more than one | |
| // period candidate. | |
| size_t end = 0; | |
| for (size_t peak = resid_peaks_.size() - 1; peak > 0; --peak) { | |
| if ((resid_peaks_[peak].past.size() > 1)) { | |
| for (size_t ind = 0; ind < resid_peaks_[peak].past.size(); ++ind) { | |
| if (resid_peaks_[peak].past[ind]->cost_sum < min_cost) { | |
| min_cost = resid_peaks_[peak].past[ind]->cost_sum; | |
| min_index = ind; | |
| } | |
| } | |
| end = peak; | |
| break; | |
| } | |
| } | |
| if (end == 0) { | |
| fprintf(stderr, "No terminal peak found in DynamicProgramming\n"); | |
| return false; | |
| } | |
| output_.clear(); | |
| // Backtrack through the best pointers to retrieve the optimum | |
| // period and voicing candidates. Save the GCI and voicing | |
| // estimates. | |
| while (1) { | |
| int32_t start_peak = resid_peaks_[end].past[min_index]->start_peak; | |
| TrackerResults tr; | |
| tr.resid_index = resid_peaks_[start_peak].resid_index; | |
| if (resid_peaks_[end].past[min_index]->voiced) { | |
| float nccf_period = | |
| resid_peaks_[end].past[min_index]->closest_nccf_period; | |
| // TODO(dtalkin) If the closest NCCF period is more than epsilon | |
| // different from the inter-pulse interval, use the inter-pulse | |
| // interval instead. | |
| tr.f0 = sample_rate_ / nccf_period; | |
| tr.voiced = true; | |
| } else { | |
| tr.f0 = 0.0; | |
| tr.voiced = false; | |
| } | |
| int32_t cc_index = resid_peaks_[end].past[min_index]->period - | |
| first_nccf_lag_; | |
| // If this period is in the normal search range, retrieve the | |
| // actual NCCF value for that lag. | |
| if ((cc_index >= 0) && (cc_index < n_nccf_lags_)) { | |
| tr.nccf_value = resid_peaks_[start_peak].nccf[cc_index]; | |
| } else { | |
| int32_t peak_cc_index = | |
| resid_peaks_[end].past[min_index]->closest_nccf_period - | |
| first_nccf_lag_; | |
| tr.nccf_value = resid_peaks_[start_peak].nccf[peak_cc_index]; | |
| } | |
| output_.push_back(tr); | |
| size_t new_end = resid_peaks_[end].past[min_index]->start_peak; | |
| min_index = resid_peaks_[end].past[min_index]->best_prev_cand; | |
| if (min_index < 0) { // Has an origin pulse been reached? | |
| break; | |
| } | |
| end = new_end; | |
| } | |
| // NOTE: The output_ array is in reverse time order! | |
| return true; | |
| } | |
| void EpochTracker::GetFilledEpochs(float unvoiced_pm_interval, | |
| std::vector<float>* times, | |
| std::vector<int16_t>* voicing) { | |
| times->clear(); | |
| voicing->clear(); | |
| float final_time = norm_residual_.size() / sample_rate_; | |
| int32_t limit = output_.size() - 1; | |
| int32_t i = limit; | |
| // Produce the output in normal time order. | |
| while (i >= 0) { | |
| int32_t i_old = i; | |
| float time = output_[i].resid_index / sample_rate_; | |
| // Note that the pulse locations of both the beginning and end | |
| // of any voiced period are of interest. | |
| if (output_[i].voiced || ((i < limit) && (output_[i+1].voiced))) { | |
| times->push_back(time); | |
| voicing->push_back(1); | |
| i--; | |
| } | |
| if (i == limit) { | |
| time = 0.0; | |
| } | |
| if ((i > 0) && (!output_[i].voiced) && (time < final_time)) { | |
| for ( ; i > 0; --i) { | |
| if (output_[i].voiced) { | |
| break; | |
| } | |
| } | |
| float next_time = final_time; | |
| int32_t fill_ind = 1; | |
| if (i > 0) { | |
| next_time = (output_[i].resid_index / sample_rate_) - | |
| (1.0 / max_f0_search_); | |
| } | |
| float now = time + (fill_ind * unvoiced_pm_interval); | |
| while (now < next_time) { | |
| times->push_back(now); | |
| voicing->push_back(0); | |
| fill_ind++; | |
| now = time + (fill_ind * unvoiced_pm_interval); | |
| } | |
| } | |
| if (i == i_old) { | |
| i--; | |
| } | |
| } | |
| } | |
| bool EpochTracker::ResampleAndReturnResults(float resample_interval, | |
| std::vector<float>* f0, | |
| std::vector<float>* correlations) { | |
| if ((sample_rate_ <= 0.0) || (output_.size() == 0)) { | |
| fprintf(stderr, | |
| "Un-initialized EpochTracker or no output_ in ResampleAndReturnF0\n"); | |
| return false; | |
| } | |
| if (resample_interval <= 0.0) { | |
| fprintf(stderr, "resample_interval <= 0.0 in ResampleAndReturnF0\n"); | |
| return false; | |
| } | |
| float last_time = (output_[0].resid_index / sample_rate_) + endpoint_padding_; | |
| int32_t n_frames = RoundUp(last_time / resample_interval); | |
| f0->resize(0); | |
| correlations->resize(0); | |
| f0->insert(f0->begin(), n_frames, 0.0); | |
| correlations->insert(correlations->begin(), n_frames, 0.0); | |
| int32_t limit = output_.size() - 1; | |
| int32_t prev_frame = 0; | |
| float prev_f0 = output_[limit].f0; | |
| float prev_corr = output_[limit].nccf_value; | |
| for (int32_t i = limit; i >= 0; --i) { | |
| int32_t frame = RoundUp(output_[i].resid_index / | |
| (sample_rate_ * resample_interval)); | |
| (*f0)[frame] = output_[i].f0; | |
| (*correlations)[frame] = output_[i].nccf_value; | |
| if ((frame - prev_frame) > 1) { | |
| for (int32_t fr = prev_frame + 1; fr < frame; ++fr) { | |
| (*f0)[fr] = prev_f0; | |
| (*correlations)[fr] = prev_corr; | |
| } | |
| } | |
| prev_frame = frame; | |
| prev_corr = output_[i].nccf_value; | |
| prev_f0 = output_[i].f0; | |
| } | |
| for (int32_t frame = prev_frame; frame < n_frames; ++frame) { | |
| (*f0)[frame] = prev_f0; | |
| (*correlations)[frame] = prev_corr; | |
| } | |
| return true; | |
| } | |
| bool EpochTracker::WriteDebugData(const std::vector<float>& data, | |
| const std::string& extension) { | |
| if (debug_name_.empty()) { | |
| return true; | |
| } | |
| std::string filename = debug_name_ + "." + extension; | |
| if (data.size() == 0) { | |
| fprintf(stdout, "Data size==0 for %s in WriteDebugData\n", | |
| filename.c_str()); | |
| return false; | |
| } | |
| FILE* out = fopen(filename.c_str(), "w"); | |
| if (!out) { | |
| fprintf(stderr, "Can't open %s for debug output\n", filename.c_str()); | |
| return false; | |
| } | |
| size_t written = fwrite(&(data.front()), sizeof(data.front()), | |
| data.size(), out); | |
| fclose(out); | |
| if (written != data.size()) { | |
| fprintf(stderr, "Problems writing debug data (%d %d)\n", | |
| static_cast<int>(written), static_cast<int>(data.size())); | |
| return false; | |
| } | |
| return true; | |
| } | |
| bool EpochTracker::WriteDiagnostics(const std::string& file_base) { | |
| if (!file_base.empty()) { | |
| set_debug_name(file_base); | |
| } | |
| WriteDebugData(signal_, "pcm"); | |
| WriteDebugData(residual_, "resid"); | |
| WriteDebugData(norm_residual_, "nresid"); | |
| WriteDebugData(bandpassed_rms_, "bprms"); | |
| WriteDebugData(voice_onset_prob_, "onsetp"); | |
| WriteDebugData(voice_offset_prob_, "offsetp"); | |
| WriteDebugData(peaks_debug_, "pvals"); | |
| WriteDebugData(prob_voiced_, "pvoiced"); | |
| // best_corr_ is only available after CreatePeriodLattice. | |
| WriteDebugData(best_corr_, "bestcorr"); | |
| // NOTE: if WriteDiagnostics is called before the | |
| // DynamicProgramming, there will be nothing in output_. | |
| if ((!debug_name_.empty()) && (output_.size() > 2)) { | |
| std::string pm_name = debug_name_ + ".pmlab"; | |
| FILE* pmfile = fopen(pm_name.c_str(), "w"); | |
| fprintf(pmfile, "#\n"); | |
| std::vector<float> f0; | |
| int32_t limit = output_.size() - 1; | |
| // Produce debug output in normal time order. | |
| for (int32_t i = limit; i >= 0; --i) { | |
| float time = output_[i].resid_index / sample_rate_; | |
| // Note that the pulse locations of both the beginning and end | |
| // of any voiced period are of interest. | |
| if (output_[i].voiced || ((i < limit) && (output_[i+1].voiced))) { | |
| fprintf(pmfile, "%f blue \n", time); | |
| } else { | |
| fprintf(pmfile, "%f red \n", time); | |
| } | |
| f0.push_back(time); | |
| f0.push_back(output_[i].f0); | |
| f0.push_back(output_[i].nccf_value); | |
| } | |
| fclose(pmfile); | |
| WriteDebugData(f0, "f0ap"); | |
| } | |
| return true; | |
| } | |