Spaces:
Runtime error
Runtime error
| /* | |
| Copyright 2015 Google Inc. All rights reserved. | |
| Licensed under the Apache License, Version 2.0 (the "License"); | |
| you may not use this file except in compliance with the License. | |
| You may obtain a copy of the License at | |
| http://www.apache.org/licenses/LICENSE-2.0 | |
| Unless required by applicable law or agreed to in writing, software | |
| distributed under the License is distributed on an "AS IS" BASIS, | |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| See the License for the specific language governing permissions and | |
| limitations under the License. | |
| */ | |
| // Author: [email protected] (David Talkin) | |
| // EpochTracker estimates the location of glottal closure instants | |
| // (GCI), also known as "epochs" from digitized acoustic speech | |
| // signals. It simultaneously estimates the local fundamental | |
| // frequency (F0) and voicing state of the speech on a per-epoch | |
| // basis. Various output methods are available for retrieving the | |
| // results. | |
| // | |
| // The processing stages are: | |
| // * Optionally highpass the signal at 80 Hz to remove rumble, etc. | |
| // * Compute the LPC residual, obtaining an approximation of the | |
| // differentiated glottal flow. | |
| // * Normalize the amplitude of the residual by a local RMS measure. | |
| // * Pick the prominent peaks in the glottal flow, and grade them by | |
| // peakiness, skew and relative amplitude. | |
| // * Compute correlates of voicing to serve as pseudo-probabilities | |
| // of voicing, voicing onset and voicing offset. | |
| // * For every peak selected from the residual, compute a normalized | |
| // cross-correlation function (NCCF) of the LPC residual with a | |
| // relatively short reference window centered on the peak. | |
| // * For each peak in the residual, hypothesize all following peaks | |
| // within a specified F0 search range, that might be the end of a | |
| // period starting on that peak. | |
| // * Grade each of these hypothesized periods on local measures of | |
| // "voicedness" using the NCCF and the pseudo-probability of voicing | |
| // feature. | |
| // * Generate an unvoiced hypothesis for each period and grade it | |
| // for "voicelessness". | |
| // * Do a dynamic programming iteration to grade the goodness of | |
| // continuity between all hypotheses that start on a peak and | |
| // those that end on the same peak. For voiced-voiced | |
| // connections add a cost for F0 transitions. For | |
| // unvoiced-voiced and voiced-unvoiced transitions add a cost | |
| // that is modulated by the voicing onset or offset inverse | |
| // pseudo-probability. Unvoiced-unvoiced transitions incur no cost. | |
| // * Backtrack through the lowest-cost path developed during the | |
| // dynamic-programming stage to determine the best peak collection | |
| // in the residual. At each voiced peak, find the peak in the NCCF | |
| // (computed above) that corresponds to the duration closest to the | |
| // inter-peak interval, and use that as the inverse F0 for the | |
| // peak. | |
| // | |
| // A typical calling sequence might look like: | |
| /* ============================================================== | |
| EpochTracker et; | |
| et.Init(); // Prepare the instance for, possibly, multiple calls. | |
| Track* f0; // for returning the F0 track | |
| Track* pm; // for returning the epoch track | |
| if (!et.ComputeEpochs(my_input_waveform, &pm, &f0)) { | |
| exit(-2); // problems in the epoch computations | |
| } | |
| DoSomethingWithTracks(f0, pm); | |
| delete f0; | |
| delete pm; | |
| ============================================================== */ | |
| // | |
| // NOTE: Any client of this code inherits the Google command-line flags | |
| // defined in epoch_tracker.cc. These flags are processed in the Init() | |
| // method, and override both default and params-sourced settings. | |
| // | |
| // As currently written, this is a batch process. Very little has | |
| // been done to conserve either memory or CPU. The aim was simply to | |
| // make the best possible tracker. As will be seen in the | |
| // implementation, there are many parameters that can be adjusted to | |
| // influence the processing. It is very unlikely that the best | |
| // parameter setting is currently expressed in the code! However, the | |
| // performance, as written, appears to be quite good on a variety of voices. | |
// File-scope default settings.  The names parallel the client-adjustable
// EpochTracker members (external_frame_interval_, min_f0_search_,
// max_f0_search_, unvoiced_pulse_interval_, unvoiced_cost_, do_highpass_,
// do_hilbert_transform_, debug_name_).
// NOTE(review): presumably these are the values used to seed those members;
// the code that consumes them is not visible in this header -- confirm
// against the implementation file.

// Frame intervals; presumably in seconds, matching the "(sec)" convention
// used for the interval members of EpochTracker -- TODO confirm.
static const float kExternalFrameInterval = 0.005;
static const float kInternalFrameInterval = 0.002;

// Bounds of the F0 search; presumably in Hz, like min_f0_search_ and
// max_f0_search_ -- TODO confirm.
static const float kMinF0Search = 40.0;
static const float kMaxF0Search = 500.0;

// Spacing of the pulses inserted in unvoiced regions; presumably in seconds
// (see GetFilledEpochs' unvoiced_pm_interval) -- TODO confirm.
static const float kUnvoicedPulseInterval = 0.01;

// Cost of the unvoiced hypothesis (see unvoiced_cost_).
static const float kUnvoicedCost = 0.9;

// Optional input-conditioning stages: highpass on, Hilbert transform off.
static const bool kDoHighpass = true;
static const bool kDoHilbertTransform = false;

// Empty debug base name.  Per the set_debug_name() documentation, an empty
// name means no debug signals are written.
static const char kDebugName[] = "";
// EpochTracker bundles the input signal, the intermediate feature signals,
// the period-hypothesis lattice and all tuning parameters declared below,
// and exposes the individual processing stages (filtering, LPC residual,
// NCCF, lattice construction, dynamic programming, backtracking and result
// retrieval) as methods.  Only declarations appear in this class body; the
// non-inline methods are defined elsewhere.
//
// Instances are not copyable: the lattice in resid_peaks_ stores raw
// EpochCand pointers, so a member-wise copy would leave two trackers
// referring to the same candidates.
class EpochTracker {
 public:
  EpochTracker(void);
  virtual ~EpochTracker(void);

  // Set the default operating parameters of the tracker.
  void SetParameters(void);

  // NOTE: The following methods are exposed primarily for algorithm
  // development purposes, where EpochTracker is used in a developer's test
  // harness.  These need not/should not be called directly in normal use.

  // Prepare the instance for use.  Some sanity check is made on the
  // parameters, and the instance is reset so it can be reused
  // multiple times by simply calling Init() for each new input
  // signal.  frame_interval determines the framing for some of the
  // internal feature computations, and for the periodic resampling of
  // F0 that will occur during final tracking result output.  min_
  // and max_f0_search are the bounding values, in Hz, for the F0
  // search.
  // NOTE(review): frame_interval is described above but is not a parameter
  // of this signature -- confirm which setting the text refers to.
  // NOTE: This Init method is DEPRECATED, and is only retained to
  // support legacy code.  IT MAY GO AWAY SOON.  This is NOT to be
  // used with ComputeEpochs().
  bool Init(const int16_t* input, int32_t n_input, float sample_rate,
            float min_f0_search, float max_f0_search,
            bool do_highpass, bool do_hilbert_transform);

  // Set the name for various intermediate features and other signals
  // that may be written to files used during debug and development of
  // the algorithm.  If this is set to the empty std::string, no debug
  // signals will be output.
  void set_debug_name(const std::string& debug_name) {
    debug_name_ = debug_name;
  }

  // Return a copy of the current debug base name.  Declared const so it can
  // be queried through a const reference; it only reads debug_name_.
  std::string debug_name(void) const { return debug_name_; }

  // Compute the Hilbert transform of the signal in input, and place
  // the floating-point results in output.  output must be at least
  // n_input samples long.  TODO(dtalkin): Make these vector inputs
  // and outputs.
  void HilbertTransform(int16_t* input, int32_t n_input, float* output);

  // Apply a highpass filter to the signal in input.  The filter
  // corner frequency is corner_freq, and the duration, in seconds, of
  // the Hann-truncated symmetric FIR is in fir_duration.  The
  // transition bandwidth is the inverse of fir_duration.  The return
  // value is a pointer to the filtered result, which is the same
  // length as the input (n_input).  It is up to the caller to free
  // this returned memory.  TODO(dtalkin): Make this vector I/O and
  // supply the output as floats.
  int16_t* HighpassFilter(int16_t* input, int32_t n_input,
                          float sample_rate, float corner_freq,
                          float fir_duration);

  // Compute the LPC residual of the speech signal in input.
  // sample_rate is the rate of both the input and the residual to be
  // placed in output.  The order of the LPC analysis is automatically
  // set to be appropriate for the sample rate, and the output is
  // integrated so it approximates the derivative of the glottal flow.
  bool GetLpcResidual(const std::vector<float>& input, float sample_rate,
                      std::vector<float>* output);

  // Compute the normalized cross-correlation function (NCCF) of the
  // signal in data, starting at sample start.  size is the number of
  // samples to include in the inner product.  Compute n_lags
  // contiguous correlations starting at a delay of first_lag samples.
  // Return the resulting n_lags correlation values in corr.  Note
  // that the NCCF is bounded by +-1.0.
  void CrossCorrelation(const std::vector<float>& data, int32_t start,
                        int32_t first_lag, int32_t n_lags,
                        int32_t size, std::vector<float>* corr);

  // Compute the band-limited RMS of the signal in input, which is
  // sampled at sample_rate.  low_limit and high_limit are the
  // frequency bounds, in Hz, within which the RMS is measured.
  // frame_interval is the period of the RMS signal returned in
  // output_rms.  frame_dur is the duration, in seconds, of the
  // Hanning window used for each measurement.
  bool GetBandpassedRmsSignal(const std::vector<float>& input,
                              float sample_rate, float low_limit,
                              float high_limit, float frame_interval,
                              float frame_dur,
                              std::vector<float>* output_rms);

  // Compute the RMS of positive and negative signal values separately.
  // The signal is made to be zero mean before this computation.  Any
  // imbalance in these measures is an indication of asymmetrical peak
  // distribution, which is characteristic of the LPC residual of voiced
  // speech.
  void GetSymmetryStats(const std::vector<float>& data, float* pos_rms,
                        float* neg_rms, float* mean);

  // Normalize the input signal based on a local measure of its RMS.
  void NormalizeAmplitude(const std::vector<float>& input, float sample_rate,
                          std::vector<float>* output);

  // Apply a Hann weighting to the signal in input starting at
  // sample index offset.  The window will contain size samples, and
  // the windowed signal is placed in output.
  void Window(const std::vector<float>& input, int32_t offset, size_t size,
              float* output);

  // Computes signal polarity (-1 for negative, +1 for
  // positive).  Requires data to be initialized via Init(...).  Returns
  // false if there's an error.
  bool ComputePolarity(int *polarity);

  // Compute NCCF, NCCF peak locations and values, bandpass RMS,
  // residual, symmetry statistics (and invert residual, if necessary),
  // normalized residual, residual peaks and values.  Finally, generate
  // the pulse working array in preparation for dynamic programming.
  bool ComputeFeatures(void);

  // Write all of data to a file, the name of which is
  // debug_name_ + "." + extension.  If debug_name_ is empty, do nothing.
  bool WriteDebugData(const std::vector<float>& data,
                      const std::string& extension);

  // Write a collection of debugging signals to separate files with
  // various, internally-defined name extensions.  If file_base is not
  // empty, use this as the base path for all of the files.  If file
  // base is empty, use debug_name_ as the base path.  If both are
  // empty, do nothing.
  bool WriteDiagnostics(const std::string& file_base);

  // After Init, ComputeFeatures and CreatePeriodLattice have been
  // successfully called, TrackEpochs should be called to do the
  // actual tracking of epochs (GCI) and to estimate the corresponding
  // F0.  This method integrates the information from all of the
  // features, including the LPC residual peaks and the NCCF values,
  // to find the optimum period assignments and voicing state
  // assignments over the entire signal.  The results are left in
  // internal storage, pending retrieval by other methods.
  bool TrackEpochs(void);

  // Create a lattice of glottal period hypotheses in preparation for
  // dynamic programming.  This fills out most of the data fields in
  // resid_peaks_.  This must be called after ComputeFeatures.
  void CreatePeriodLattice(void);

  // Apply the Viterbi dynamic programming algorithm to find the best
  // path through the period hypothesis lattice created by
  // CreatePeriodLattice.  The backpointers and cumulative scores are
  // left in the relevant fields in resid_peaks_.
  void DoDynamicProgramming(void);

  // Backtrack through the best pointers in the period hypothesis
  // lattice created by CreatePeriodLattice and processed by
  // DoDynamicProgramming.  The estimated GCI locations
  // (epochs) and the corresponding F0 and voicing-states are placed
  // in the output_ array pending retrieval using other methods.
  bool BacktrackAndSaveOutput(void);

  // Resample the per-period F0 and correlation data that results from
  // the tracker to a periodic signal at an interval of
  // resample_interval seconds.  Samples returned are those nearest in
  // time to an epoch.  Thus, if the resample_interval is greater than
  // the local epoch interval, some epochs, and their period
  // information, will be skipped.  Conversely, if the
  // resample_interval is less than the local epoch interval,
  // measurements will be replicated as required.
  bool ResampleAndReturnResults(float resample_interval,
                                std::vector<float>* f0,
                                std::vector<float>* correlations);

  // Convert the raw backtracking results in output_ into
  // normal-time-order epoch markers.  In unvoiced regions, fill with
  // regularly-spaced pulses separated by unvoiced_pm_interval
  // seconds.  The epoch/pulse times are returned in times re the
  // utterance beginning, and the corresponding voicing states in
  // voicing (0=unvoiced; 1=voiced).  This can only be called after
  // TrackEpochs.
  void GetFilledEpochs(float unvoiced_pm_interval, std::vector<float>* times,
                       std::vector<int16_t>* voicing);

  // Setters.  Each one simply overwrites the corresponding control
  // parameter member.
  void set_do_hilbert_transform(bool v) { do_hilbert_transform_ = v; }
  void set_do_highpass(bool v) { do_highpass_ = v; }
  void set_external_frame_interval(float v) { external_frame_interval_ = v; }
  void set_unvoiced_pulse_interval(float v) { unvoiced_pulse_interval_ = v; }
  void set_min_f0_search(float v) { min_f0_search_ = v; }
  void set_max_f0_search(float v) { max_f0_search_ = v; }
  void set_unvoiced_cost(float v) { unvoiced_cost_ = v; }

 private:
  // Copying is disabled.  These are declared private and intentionally left
  // undefined (a form that works for any C++ standard level): the lattice
  // holds raw EpochCand pointers that must not be shared between instances.
  EpochTracker(const EpochTracker&);
  EpochTracker& operator=(const EpochTracker&);

  // Search the signal in norm_residual_ for prominent negative peaks.
  // Grade the peaks on a combination of amplitude, "peakiness" and
  // skew.  (It is expected that the glottal pulses will
  // have a relatively slow fall, and a rapid rise.)  Place the
  // selected and graded pulses in resid_peaks_.
  void GetResidualPulses(void);

  // Create pseudo-probability functions in voice_onset_prob_ and
  // voice_offset_prob_ that attempt to indicate the time-varying
  // probability that a voice onset or offset is occurring.
  // Presently, this is based solely on the derivative of the
  // bandpassed RMS signal, bandpassed_rms_.
  void GetVoiceTransitionFeatures(void);

  // Generate a pseudo-probability function that attempts to correspond
  // to the probability that voicing is occurring.  This is presently
  // based solely on the bandpassed RMS signal, bandpassed_rms_.
  void GetRmsVoicingModulator(void);

  // Free memory, and prepare the instance for a new signal.
  void CleanUp(void);

  // Scan the signal in input searching for all local maxima that
  // exceed thresh.  The indices corresponding to the location of the
  // peaks are placed in output.  The first entry in output is always
  // the location of the largest maximum found.
  int32_t FindNccfPeaks(const std::vector<float>& input, float thresh,
                        std::vector<int16_t>* output);

  // Compute the NCCF with the reference window centered on each of
  // the residual pulses identified in GetResidualPulses.  window_dur
  // is the duration in seconds of the correlation inner product.
  // After the NCCF for each residual pulse is computed, it is
  // searched for local maxima that exceed peak_thresh.  These peak
  // locations and the full NCCF are saved in the corresponding
  // elements of the resid_peaks_ array of structures.
  void GetPulseCorrelations(float window_dur, float peak_thresh);

 private:
  // EpochCand stores all of the period hypotheses that can be
  // generated from the peaks found in the LPC residual.  It also
  // maintains the cumulative path costs and backpointers generated
  // during dynamic programming.
  struct EpochCand {
    int32_t period;  // # of samples in this period candidate
    float local_cost;  // cost of calling this a period (or unvoiced)
    float cost_sum;  // cumulative cost from DP
    int32_t start_peak;  // index in resid_peaks_ where this period hyp starts
    int32_t end_peak;  // where this period ends
    int32_t best_prev_cand;  // backpointer used after DP
    int32_t closest_nccf_period;  // per. implied by the closest correlation peak
    bool voiced;  // hypothesized voicing state for this cand.
  };

  // A list of candidate pointers.  The vector itself does not manage the
  // lifetime of the pointed-to EpochCand objects.
  typedef std::vector<EpochCand*> CandList;

  // The ResidPeak stores data for each residual impulse.  The array
  // of these in resid_peaks_ serves as input to the dynamic
  // programming search for GCI, voicing state and F0.
  struct ResidPeak {
    int32_t resid_index;  // index into the resid_ array of this peak
    int32_t frame_index;  // index into the feature arrays for this peak
    float peak_quality;  // "goodness" measure for this peak
    std::vector<float> nccf;  // the NCCF computed centered on this peak
    std::vector<int16_t> nccf_periods;  // periods implied by major peaks in nccf
    CandList future;  // period candidates that start on this peak
    CandList past;  // period candidates that end on this peak
  };

  // One entry of the tracker's result list (see output_).
  struct TrackerResults {
    bool voiced;
    float f0;
    int32_t resid_index;
    float nccf_value;
  };

  typedef std::vector<TrackerResults> TrackerOutput;

 protected:
  std::vector<ResidPeak> resid_peaks_;  // array of structures used to
                                        // store the peak search lattice
  TrackerOutput output_;  // Array of time stamped results of the tracker.

  // signal_, residual_, norm_residual and peaks_debug_ are all
  // sampled at the original signal input sample_rate_.
  std::vector<float> signal_;  // floating version of input speech signal
  std::vector<float> residual_;  // LPC residual normalized for constant DC.
  std::vector<float> norm_residual_;  // LPC residual normalized by its local RMS.
  std::vector<float> peaks_debug_;  // for debug output of residual peak candidates

  // bandpassed_rms_, voice_onset_prob_, voice_offset_prob_ and
  // prob_voiced_ are all sampled with a period of internal_frame_interval_.
  std::vector<float> bandpassed_rms_;  // RMS sampled at internal_frame_interval_
  std::vector<float> voice_onset_prob_;  // prob that a voice onset is occurring
  std::vector<float> voice_offset_prob_;  // prob that a voice offset is occurring
  std::vector<float> prob_voiced_;  // prob that voicing is occurring
  std::vector<float> best_corr_;  // An array of best NCCF vals for all resid peaks.
  std::vector<float> window_;  // Hann weighting array for Window()
  float sample_rate_;  // original input signal sample rate in Hz
  float positive_rms_;  // RMS of all positive, non-zero samples in residual_
  float negative_rms_;  // RMS of all negative, non-zero samples in residual_
  int32_t n_feature_frames_;  // The number of feature frames available
                              // for all features computed at
                              // internal_frame_interval_.
  int32_t first_nccf_lag_;  // The index of the first correlation of the
                            // NCCF.  This is determined by
                            // max_f0_search_.
  int32_t n_nccf_lags_;  // The number of correlations computed at each
                         // residual peak candidate.  This is determined
                         // by max_f0_search_ and min_f0_search_.
  std::string debug_name_;  // The base path name for all debug output files.

  // Below are all of the parameters that control the functioning of
  // the tracker.  These are all set to default known-to-work values in
  // SetParameters().

  // Control parameters available to clients of EpochTracker.
  float external_frame_interval_;  // Frame interval for final output of F0.
  float unvoiced_pulse_interval_;  // Pulse interval in unvoiced regions
  float min_f0_search_;  // minimum F0 to search for (Hz)
  float max_f0_search_;  // maximum F0 to search for (Hz)
  bool do_highpass_;  // Highpass input signal iff true.
  bool do_hilbert_transform_;  // Hilbert trans. input data iff true.

  // Internal feature-computation Parameters:
  float internal_frame_interval_;  // interval, in seconds, between frame onsets

  // for the high-pass filter
  float corner_frequency_;
  float filter_duration_;

  // for the LPC inverse filter.
  float frame_duration_;  // window size (sec)
  float lpc_frame_interval_;  // (sec)
  float preemphasis_;  // preemphasis for LPC analysis
  float noise_floor_;  // SNR in dB simulated during LPC analysis.

  // for computing LPC residual peak quality.
  float peak_delay_;  // for measuring prominence
  float skew_delay_;  // for measuring shape
  float peak_val_wt_;
  float peak_prominence_wt_;
  float peak_skew_wt_;
  float peak_quality_floor_;

  // for computing voice-transition pseudo-probabilities
  float time_span_;  // the interval (sec) centered on the
                     // measurement point, used to
                     // compute parameter deltas
  float level_change_den_;  // max. dB level change
                            // expected over time_span_ for
                            // bandpassed RMS

  // for computing pseudo-probability of voicing
  float min_rms_db_;  // level floor in dB

  // window size for computing amplitude-normalizing RMS
  float ref_dur_;

  // low and high frequency limits for bandpassed RMS used in voicing indicator
  float min_freq_for_rms_;
  float max_freq_for_rms_;

  // duration of integrator for bandpassed RMS
  float rms_window_dur_;

  // window duration, in seconds, for NCCF computations
  float correlation_dur_;

  // ignore any NCCF peaks less than this
  float correlation_thresh_;

  // Parameters used by the dynamic-programming tracker:

  // reward for inserting another period
  float reward_;

  // weight given to deviation of inter-pulse interval from the
  // closest NCCF peak lag
  float period_deviation_wt_;

  // weight given to the quality of the residual peak
  float peak_quality_wt_;

  // cost of the unvoiced hypothesis
  float unvoiced_cost_;

  // cost of high NCCF values in hypothetical unvoiced regions
  float nccf_uv_peak_wt_;

  // weight given to period length
  float period_wt_;

  // weight given to the pseudo-probability of voicing feature
  float level_wt_;

  // weight given to period-length differences between adjacent periods.
  float freq_trans_wt_;

  // cost of switching between voicing states; modulated by voicing
  // onset/offset probs.
  float voice_transition_factor_;

  // Parameters used to generate final outputs:

  // pad time in seconds to add to the last measured period during
  // output of periodically-resampled data
  float endpoint_padding_;
};