62 #include "fe_internal.h"
/* Half-width, in channels, of the averaging window used by fe_weight_smooth(). */
65 #define SMOOTH_WINDOW 4
/* Weight kept on the previous power estimate when fe_track_snr() smooths the spectrum. */
66 #define LAMBDA_POWER 0.7
/* Weight kept on the previous envelope value in fe_lower_envelope() when the input is at or above it. */
67 #define LAMBDA_A 0.995
/* NOTE(review): LAMBDA_B, LAMBDA_T, MU_T and MAX_GAIN are used further down but are defined on lines missing from this listing. */
/* Weight kept on slow_peak_sum when the current frame sum does not exceed it. */
72 #define SLOW_PEAK_FORGET_FACTOR 0.9995
/* Weight kept on slow_peak_sum when the current frame sum exceeds it. */
73 #define SLOW_PEAK_LEARN_FACTOR 0.9
/* fe_is_frame_quiet() flags a frame whose sum is more than this far below slow_peak_sum. */
74 #define SPEECH_VOLUME_RANGE 8.0
/* Debug output stream: opened on the file vad_debug in write mode further down and written with fprintf per frame. */
/* NOTE(review): presumably only compiled in a debug build; the guarding preprocessor lines are not shown here. */
78 static FILE *vad_stats;
/* Frame counters reported through E_INFO as Low SNR and Low volume; the increments are on lines not shown here. */
79 static int64 low_snr = 0;
80 static int64 low_volume = 0;
/* The declarations below are accessed as noise_stats-> members elsewhere in this file; */
/* the enclosing struct lines and several sibling members are missing from this listing. */
/* Smoothed peak level of the per-frame sum, updated by fe_is_frame_quiet(). */
99 powspec_t slow_peak_sum;
/* Weight on the previous power estimate and its complement; set from LAMBDA_POWER in fe_init_noisestats(). */
102 powspec_t lambda_power;
103 powspec_t comp_lambda_power;
/* Complement weights paired with lambda_a and lambda_b in fe_lower_envelope(). */
105 powspec_t comp_lambda_a;
107 powspec_t comp_lambda_b;
/* Lower bound applied to the per-channel gain in fe_track_snr(); set from 1.0 / MAX_GAIN. */
111 powspec_t inv_max_gain;
/* Scaling table indexed by window length; fe_weight_smooth() reads smooth_scaling[l2 - l1 + 1]. */
113 powspec_t smooth_scaling[2 * SMOOTH_WINDOW + 3];
/*
 * Update the running envelope floor_buf[] from the current values buf[],
 * one channel at a time.
 *
 * Each case below appears twice: once as a weighted sum of products and
 * once using fe_log_add() with the weights added to the operands. The
 * preprocessor lines that select between the two forms are missing from
 * this listing (presumably a floating-point build versus a fixed-point,
 * log-domain build -- TODO confirm against the full file).
 *
 * noise_stats  source of lambda_a / comp_lambda_a and lambda_b / comp_lambda_b
 * buf          current per-channel values; only read on the visible lines
 * floor_buf    envelope values; assigned in the fe_log_add() form below
 * num_filt     number of channels processed
 */
117 fe_lower_envelope(
noise_stats_t *noise_stats, powspec_t * buf, powspec_t * floor_buf, int32 num_filt)
121 for (i = 0; i < num_filt; i++) {
/* Input at or above the envelope: blend using the lambda_a pair. */
123 if (buf[i] >= floor_buf[i]) {
/* Right-hand side only; the assignment target is on a line missing from this listing. */
125 noise_stats->lambda_a * floor_buf[i] + noise_stats->comp_lambda_a * buf[i];
/* Same blend with the lambda_b pair; presumably the else branch (the else line is not shown). */
129 noise_stats->lambda_b * floor_buf[i] + noise_stats->comp_lambda_b * buf[i];
/* Second form of the same update, combining the weighted terms with fe_log_add(). */
132 if (buf[i] >= floor_buf[i]) {
133 floor_buf[i] = fe_log_add(noise_stats->lambda_a + floor_buf[i],
134 noise_stats->comp_lambda_a + buf[i]);
137 floor_buf[i] = fe_log_add(noise_stats->lambda_b + floor_buf[i],
138 noise_stats->comp_lambda_b + buf[i]);
/*
 * Decide whether the current frame is quiet relative to a slowly
 * tracked peak level, and update that peak level as a side effect.
 *
 * noise_stats  holds slow_peak_sum, which is read and rewritten here
 * buf          per-channel values accumulated into a frame sum
 * num_filt     number of channels in buf
 *
 * The return statement is missing from this listing; fe_track_snr()
 * assigns the result to its own is_quiet, so presumably the is_quiet
 * value computed below is returned -- TODO confirm.
 */
146 fe_is_frame_quiet(
noise_stats_t *noise_stats, powspec_t *buf, int32 num_filt)
151 double smooth_factor;
154 for (i = 0; i < num_filt; i++) {
/* Accumulate the frame sum with fe_log_add(); any alternative accumulation form is not shown. */
158 sum = fe_log_add(sum, buf[i]);
/* Use the learn factor when the frame sum exceeds the tracked peak, the forget factor otherwise. */
164 smooth_factor = (sum > noise_stats->slow_peak_sum) ? SLOW_PEAK_LEARN_FACTOR : SLOW_PEAK_FORGET_FACTOR;
/* Weighted blend of the old peak and the new frame sum. */
165 noise_stats->slow_peak_sum = noise_stats->slow_peak_sum * smooth_factor +
166 sum * (1 - smooth_factor);
/* Debug trace of the peak and the frame sum, in two format variants (floating point and integer). */
170 fprintf(vad_stats,
"%.3f %.3f ", noise_stats->slow_peak_sum, sum);
172 fprintf(vad_stats,
"%d %d ", noise_stats->slow_peak_sum, sum);
/* Quiet when the frame sum is more than SPEECH_VOLUME_RANGE below the tracked peak. */
176 is_quiet = noise_stats->slow_peak_sum - SPEECH_VOLUME_RANGE > sum;
/* Same test with the range converted through FLOAT2FIX. */
178 is_quiet = noise_stats->slow_peak_sum - FLOAT2FIX(SPEECH_VOLUME_RANGE) > sum;
/*
 * Per-channel masking of buf[] against a decaying peak[] track.
 *
 * noise_stats  source of lambda_t (applied to the peak) and mu_t
 *              (applied to the value written into buf)
 * buf          per-channel values; may be overwritten
 * peak         per-channel peak track; updated in place
 * num_filt     number of channels processed
 *
 * Two arithmetic forms are visible: one multiplies by lambda_t / mu_t,
 * the other adds them. The preprocessor lines choosing between them are
 * missing from this listing.
 */
185 fe_temp_masking(
noise_stats_t *noise_stats, powspec_t * buf, powspec_t * peak, int32 num_filt)
190 for (i = 0; i < num_filt; i++) {
/* Multiplicative form: apply lambda_t to the peak, then compare buf with lambda_t times that result. */
194 peak[i] *= noise_stats->lambda_t;
195 if (buf[i] < noise_stats->lambda_t * peak[i])
196 buf[i] = peak[i] * noise_stats->mu_t;
/* Additive form of the same three steps. */
198 peak[i] += noise_stats->lambda_t;
199 if (buf[i] < noise_stats->lambda_t + peak[i])
200 buf[i] = peak[i] + noise_stats->mu_t;
/* cur_in is set on a line missing from this listing (presumably the value of buf[i] before masking); */
/* the statement guarded by this test is also not shown. */
203 if (cur_in > peak[i])
/*
 * Scale each buf[i] by a value derived from coefs[] over a window of
 * neighbouring channels.
 *
 * noise_stats  source of the smooth_scaling table used in the second form
 * buf          per-channel values; rewritten in place
 * coefs        per-channel coefficients combined over the window
 * num_filt     number of channels
 *
 * Two forms are visible: one multiplies buf[i] by coef divided by the
 * window length, the other adds coef and subtracts a smooth_scaling
 * entry. The preprocessor lines choosing between them are missing from
 * this listing.
 */
210 fe_weight_smooth(
noise_stats_t *noise_stats, powspec_t * buf, powspec_t * coefs, int32 num_filt)
216 for (i = 0; i < num_filt; i++) {
/* Window is [i - SMOOTH_WINDOW, i + SMOOTH_WINDOW], clamped to [0, num_filt - 1]. */
217 l1 = ((i - SMOOTH_WINDOW) > 0) ? (i - SMOOTH_WINDOW) : 0;
218 l2 = ((i + SMOOTH_WINDOW) <
219 (num_filt - 1)) ? (i + SMOOTH_WINDOW) : (num_filt - 1);
/* Body of this loop is missing from the listing; presumably it accumulates coefs[j] into coef. */
223 for (j = l1; j <= l2; j++) {
/* Multiply by coef divided by the number of channels in the window. */
226 buf[i] = buf[i] * (coef / (l2 - l1 + 1));
/* Second form: combine the window with fe_log_add() instead. */
229 for (j = l1; j <= l2; j++) {
230 coef = fe_log_add(coef, coefs[j]);
/* Subtract the table entry for this window length (the table is filled in fe_init_noisestats()). */
232 buf[i] = buf[i] + coef - noise_stats->smooth_scaling[l2 - l1 + 1];
/*
 * Set up a noise_stats object for num_filters channels.
 *
 * The lines that allocate noise_stats itself, the assignment targets of
 * the four array allocations, and the return statement are missing from
 * this listing. Parameter setup appears twice: once storing the
 * constants directly and once storing FLOAT2FIX(log(constant)); the
 * preprocessor lines selecting between them are not shown.
 */
239 fe_init_noisestats(
int num_filters)
/* Four arrays of num_filters powspec_t each. Presumably the power, noise, floor and peak */
/* members used by fe_track_snr() -- TODO confirm, the left-hand sides are not shown. */
247 (powspec_t *)
ckd_calloc(num_filters,
sizeof(powspec_t));
249 (powspec_t *)
ckd_calloc(num_filters,
sizeof(powspec_t));
251 (powspec_t *)
ckd_calloc(num_filters,
sizeof(powspec_t));
253 (powspec_t *)
ckd_calloc(num_filters,
sizeof(powspec_t));
/* Mark the statistics as not yet seeded; fe_track_snr() seeds them from the first frame it sees. */
255 noise_stats->undefined = TRUE;
256 noise_stats->num_filters = num_filters;
/* First form: store each weight and its complement directly. */
259 noise_stats->lambda_power = LAMBDA_POWER;
260 noise_stats->comp_lambda_power = 1 - LAMBDA_POWER;
261 noise_stats->lambda_a = LAMBDA_A;
262 noise_stats->comp_lambda_a = 1 - LAMBDA_A;
263 noise_stats->lambda_b = LAMBDA_B;
264 noise_stats->comp_lambda_b = 1 - LAMBDA_B;
265 noise_stats->lambda_t = LAMBDA_T;
266 noise_stats->mu_t = MU_T;
267 noise_stats->max_gain = MAX_GAIN;
268 noise_stats->inv_max_gain = 1.0 / MAX_GAIN;
/* Fills entries 1 .. 2 * SMOOTH_WINDOW with reciprocals. */
/* NOTE(review): this bound differs from the second loop below (+ 1 versus + 3); the visible */
/* multiplicative form of fe_weight_smooth() does not read the table, so confirm whether it matters. */
270 for (i = 1; i < 2 * SMOOTH_WINDOW + 1; i++) {
271 noise_stats->smooth_scaling[i] = 1.0 / i;
/* Second form: store the natural log of each value converted through FLOAT2FIX. */
274 noise_stats->lambda_power = FLOAT2FIX(log(LAMBDA_POWER));
275 noise_stats->comp_lambda_power = FLOAT2FIX(log(1 - LAMBDA_POWER));
276 noise_stats->lambda_a = FLOAT2FIX(log(LAMBDA_A));
277 noise_stats->comp_lambda_a = FLOAT2FIX(log(1 - LAMBDA_A));
278 noise_stats->lambda_b = FLOAT2FIX(log(LAMBDA_B));
279 noise_stats->comp_lambda_b = FLOAT2FIX(log(1 - LAMBDA_B));
280 noise_stats->lambda_t = FLOAT2FIX(log(LAMBDA_T));
281 noise_stats->mu_t = FLOAT2FIX(log(MU_T));
282 noise_stats->max_gain = FLOAT2FIX(log(MAX_GAIN));
283 noise_stats->inv_max_gain = FLOAT2FIX(log(1.0 / MAX_GAIN));
/* Fills entries 1 .. 2 * SMOOTH_WINDOW + 2 with log(i); entry 0 is not written by either loop. */
285 for (i = 1; i < 2 * SMOOTH_WINDOW + 3; i++) {
286 noise_stats->smooth_scaling[i] = FLOAT2FIX(log(i));
/* Open the debug trace file for writing. */
/* NOTE(review): the result is not checked on the visible lines, yet later fprintf calls use it. */
291 vad_stats = fopen(
"vad_debug",
"w");
301 noise_stats->undefined = TRUE;
314 E_INFO(
"Low SNR [%ld] frames; Low volume [%ld] frames\n", (
long)low_snr, (
long)low_volume);
/*
 * Per-frame noise tracking: updates the statistics in fe->noise_stats
 * from the current spectrum mfspec[], derives a speech / non-speech
 * decision, and rescales mfspec[] with a smoothed per-channel gain.
 *
 * fe         front-end state; remove_noise, remove_silence, vad_threshold
 *            and noise_stats are read on the visible lines
 * in_speech  decision flag; printed below, but the lines that assign it
 *            are missing from this listing
 *
 * mfspec is used throughout without a visible declaration (presumably a
 * local pointing at the spectrum held in fe -- TODO confirm). Most
 * arithmetic appears twice, once as multiply / divide and once as
 * add / subtract with fe_log_add() / fe_log_sub(); the preprocessor
 * lines selecting between the forms are not shown. The lines releasing
 * signal and gain are also not shown.
 */
324 fe_track_snr(
fe_t * fe, int32 *in_speech)
/* Neither noise removal nor silence removal requested; the body of this test is not shown. */
334 if (!(fe->remove_noise || fe->remove_silence)) {
339 noise_stats = fe->noise_stats;
341 num_filts = noise_stats->num_filters;
/* Per-frame scratch buffer, one entry per channel. */
343 signal = (powspec_t *)
ckd_calloc(num_filts,
sizeof(powspec_t));
/* First frame after init or reset: seed every statistic from the current spectrum. */
345 if (noise_stats->undefined) {
/* NOTE(review): FIX2FLOAT here versus FLOAT2FIX for lrt below; the value is zero either way, but confirm the intended macro. */
346 noise_stats->slow_peak_sum = FIX2FLOAT(0.0);
347 for (i = 0; i < num_filts; i++) {
348 noise_stats->power[i] = mfspec[i];
/* Noise and floor start at the spectrum reduced by max_gain; peak starts at zero. */
350 noise_stats->noise[i] = mfspec[i] / noise_stats->max_gain;
351 noise_stats->floor[i] = mfspec[i] / noise_stats->max_gain;
352 noise_stats->peak[i] = 0.0;
/* Same seeding in the subtractive form, with peak set to MIN_FIXLOG. */
354 noise_stats->noise[i] = mfspec[i] - noise_stats->max_gain;;
355 noise_stats->floor[i] = mfspec[i] - noise_stats->max_gain;
356 noise_stats->peak[i] = MIN_FIXLOG;
359 noise_stats->undefined = FALSE;
/* Smooth the power estimate: weighted blend of the previous estimate and the current spectrum. */
363 for (i = 0; i < num_filts; i++) {
365 noise_stats->power[i] =
366 noise_stats->lambda_power * noise_stats->power[i] + noise_stats->comp_lambda_power * mfspec[i];
368 noise_stats->power[i] = fe_log_add(noise_stats->lambda_power + noise_stats->power[i],
369 noise_stats->comp_lambda_power + mfspec[i]);
/* Track the noise estimate as the envelope of the smoothed power. */
374 fe_lower_envelope(noise_stats, noise_stats->power, noise_stats->noise, num_filts);
/* lrt is compared against fe->vad_threshold below; the lines that update it from snr are not shown. */
376 lrt = FLOAT2FIX(0.0);
377 for (i = 0; i < num_filts; i++) {
/* signal is power with the noise estimate removed; snr is the log of the power-to-noise ratio. */
379 signal[i] = noise_stats->power[i] - noise_stats->noise[i];
382 snr = log(noise_stats->power[i] / noise_stats->noise[i]);
384 signal[i] = fe_log_sub(noise_stats->power[i], noise_stats->noise[i]);
385 snr = noise_stats->power[i] - noise_stats->noise[i];
390 is_quiet = fe_is_frame_quiet(noise_stats, signal, num_filts);
/* Body not shown; presumably where the low_snr counter is incremented -- TODO confirm. */
393 if (lrt < fe->vad_threshold)
/* Silence-removal test in two variants (threshold used directly, or converted through FLOAT2FIX); bodies not shown. */
400 if (fe->remove_silence && (lrt < fe->vad_threshold || is_quiet)) {
402 if (fe->remove_silence && (lrt < FLOAT2FIX(fe->vad_threshold) || is_quiet)) {
/* Debug trace of lrt and the decision flag, in floating-point and integer format variants. */
411 fprintf(vad_stats,
"%.3f %d\n", lrt, *in_speech);
413 fprintf(vad_stats,
"%d %d\n", lrt, *in_speech);
/* Update the floor envelope from the noise-subtracted signal, then apply temporal masking to it. */
417 fe_lower_envelope(noise_stats, signal, noise_stats->floor, num_filts);
419 fe_temp_masking(noise_stats, signal, noise_stats->peak, num_filts);
/* Noise removal disabled; the body of this test is not shown. */
421 if (!fe->remove_noise) {
/* Keep the signal from dropping below the tracked floor. */
427 for (i = 0; i < num_filts; i++) {
428 if (signal[i] < noise_stats->floor[i])
429 signal[i] = noise_stats->floor[i];
432 gain = (powspec_t *)
ckd_calloc(num_filts,
sizeof(powspec_t));
/* Gain is signal relative to power, limited to the range [inv_max_gain, max_gain]. */
434 for (i = 0; i < num_filts; i++) {
435 if (signal[i] < noise_stats->max_gain * noise_stats->power[i])
436 gain[i] = signal[i] / noise_stats->power[i];
438 gain[i] = noise_stats->max_gain;
439 if (gain[i] < noise_stats->inv_max_gain)
440 gain[i] = noise_stats->inv_max_gain;
/* Same gain computation in the subtractive form. */
443 for (i = 0; i < num_filts; i++) {
444 gain[i] = signal[i] - noise_stats->power[i];
445 if (gain[i] > noise_stats->max_gain)
446 gain[i] = noise_stats->max_gain;
447 if (gain[i] < noise_stats->inv_max_gain)
448 gain[i] = noise_stats->inv_max_gain;
/* Apply the gain to mfspec, smoothed across neighbouring channels. */
453 fe_weight_smooth(noise_stats, mfspec, gain, num_filts);
/*
 * Maintain the in_speech state in fe->vad_data using frame counters,
 * and feed the pre-speech buffer while not in speech.
 *
 * fe         front-end state; vad_data, start_speech, post_speech and
 *            spch are read on the visible lines
 * feat       frame passed to fe_prespch_write_cep()
 * is_speech  per-frame decision; not referenced on the visible lines.
 *            Presumably it selects between the two counter branches
 *            below -- the if / else lines are missing from this listing
 * store_pcm  not referenced on the visible lines; presumably guards the
 *            fe_prespch_write_pcm() call -- TODO confirm
 */
460 fe_vad_hangover(
fe_t * fe, mfcc_t * feat, int32 is_speech, int32 store_pcm)
/* Not in speech: write this frame into the pre-speech buffer. */
462 if (!fe->vad_data->in_speech) {
463 fe_prespch_write_cep(fe->vad_data->prespch_buf, feat);
465 fe_prespch_write_pcm(fe->vad_data->prespch_buf, fe->spch);
/* First counter branch: clear the post-speech count and count frames toward entering speech. */
470 fe->vad_data->post_speech_frames = 0;
471 if (!fe->vad_data->in_speech) {
472 fe->vad_data->pre_speech_frames++;
/* Enter speech once the count reaches fe->start_speech. */
474 if (fe->vad_data->pre_speech_frames >= fe->start_speech) {
475 fe->vad_data->pre_speech_frames = 0;
476 fe->vad_data->in_speech = 1;
/* Second counter branch: clear the pre-speech count and count frames toward leaving speech. */
480 fe->vad_data->pre_speech_frames = 0;
481 if (fe->vad_data->in_speech) {
482 fe->vad_data->post_speech_frames++;
/* Leave speech once the count reaches fe->post_speech, and reset both pre-speech buffers. */
484 if (fe->vad_data->post_speech_frames >= fe->post_speech) {
485 fe->vad_data->post_speech_frames = 0;
486 fe->vad_data->in_speech = 0;
487 fe_prespch_reset_cep(fe->vad_data->prespch_buf);
488 fe_prespch_reset_pcm(fe->vad_data->prespch_buf);
Miscellaneous useful string functions.
#define E_INFO(...)
Print logging information to standard error stream.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Sphinx's memory allocation/deallocation routines.
Basic type definitions used in Sphinx.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Implementation of logging routines.
Structure for the front-end computation.