SphinxBase  5prealpha
fe_noise.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2013 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
38 /* This noise removal algorithm is inspired by the following papers
39  * Computationally Efficient Speech Enchancement by Spectral Minina Tracking
40  * by G. Doblinger
41  *
42  * Power-Normalized Cepstral Coefficients (PNCC) for Robust Speech Recognition
43  * by C. Kim.
44  *
45  * For the recent research and state of art see papers about IMRCA and
46  * A Minimum-Mean-Square-Error Noise Reduction Algorithm On Mel-Frequency
47  * Cepstra For Robust Speech Recognition by Dong Yu and others
48  */
49 
50 #ifdef HAVE_CONFIG_H
51 #include <config.h>
52 #endif
53 
54 #include <math.h>
55 
56 #include "sphinxbase/prim_type.h"
57 #include "sphinxbase/ckd_alloc.h"
58 #include "sphinxbase/strfuncs.h"
59 #include "sphinxbase/err.h"
60 
61 #include "fe_noise.h"
62 #include "fe_internal.h"
63 
64 /* Noise supression constants */
65 #define SMOOTH_WINDOW 4
66 #define LAMBDA_POWER 0.7
67 #define LAMBDA_A 0.995
68 #define LAMBDA_B 0.5
69 #define LAMBDA_T 0.85
70 #define MU_T 0.2
71 #define MAX_GAIN 20
72 #define SLOW_PEAK_FORGET_FACTOR 0.9995
73 #define SLOW_PEAK_LEARN_FACTOR 0.9
74 #define SPEECH_VOLUME_RANGE 8.0
75 
76 /* define VAD_DEBUG 1 */
77 #ifdef VAD_DEBUG
78 static FILE *vad_stats;
79 static int64 low_snr = 0;
80 static int64 low_volume = 0;
81 #endif
82 
83 struct noise_stats_s {
84  /* Smoothed power */
85  powspec_t *power;
86  /* Noise estimate */
87  powspec_t *noise;
88  /* Signal floor estimate */
89  powspec_t *floor;
90  /* Peak for temporal masking */
91  powspec_t *peak;
92 
93  /* Initialize it next time */
94  uint8 undefined;
95  /* Number of items to process */
96  uint32 num_filters;
97 
98  /* Sum of slow peaks for VAD */
99  powspec_t slow_peak_sum;
100 
101  /* Precomputed constants */
102  powspec_t lambda_power;
103  powspec_t comp_lambda_power;
104  powspec_t lambda_a;
105  powspec_t comp_lambda_a;
106  powspec_t lambda_b;
107  powspec_t comp_lambda_b;
108  powspec_t lambda_t;
109  powspec_t mu_t;
110  powspec_t max_gain;
111  powspec_t inv_max_gain;
112 
113  powspec_t smooth_scaling[2 * SMOOTH_WINDOW + 3];
114 };
115 
116 static void
117 fe_lower_envelope(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * floor_buf, int32 num_filt)
118 {
119  int i;
120 
121  for (i = 0; i < num_filt; i++) {
122 #ifndef FIXED_POINT
123  if (buf[i] >= floor_buf[i]) {
124  floor_buf[i] =
125  noise_stats->lambda_a * floor_buf[i] + noise_stats->comp_lambda_a * buf[i];
126  }
127  else {
128  floor_buf[i] =
129  noise_stats->lambda_b * floor_buf[i] + noise_stats->comp_lambda_b * buf[i];
130  }
131 #else
132  if (buf[i] >= floor_buf[i]) {
133  floor_buf[i] = fe_log_add(noise_stats->lambda_a + floor_buf[i],
134  noise_stats->comp_lambda_a + buf[i]);
135  }
136  else {
137  floor_buf[i] = fe_log_add(noise_stats->lambda_b + floor_buf[i],
138  noise_stats->comp_lambda_b + buf[i]);
139  }
140 #endif
141  }
142 }
143 
144 /* update slow peaks, check if max signal level big enough compared to peak */
145 static int16
146 fe_is_frame_quiet(noise_stats_t *noise_stats, powspec_t *buf, int32 num_filt)
147 {
148  int i;
149  int16 is_quiet;
150  powspec_t sum;
151  double smooth_factor;
152 
153  sum = 0.0;
154  for (i = 0; i < num_filt; i++) {
155 #ifndef FIXED_POINT
156  sum += buf[i];
157 #else
158  sum = fe_log_add(sum, buf[i]);
159 #endif
160  }
161 #ifndef FIXED_POINT
162  sum = log(sum);
163 #endif
164  smooth_factor = (sum > noise_stats->slow_peak_sum) ? SLOW_PEAK_LEARN_FACTOR : SLOW_PEAK_FORGET_FACTOR;
165  noise_stats->slow_peak_sum = noise_stats->slow_peak_sum * smooth_factor +
166  sum * (1 - smooth_factor);
167 
168 #ifdef VAD_DEBUG
169 #ifndef FIXED_POINT
170  fprintf(vad_stats, "%.3f %.3f ", noise_stats->slow_peak_sum, sum);
171 #else
172  fprintf(vad_stats, "%d %d ", noise_stats->slow_peak_sum, sum);
173 #endif
174 #endif
175 #ifndef FIXED_POINT
176  is_quiet = noise_stats->slow_peak_sum - SPEECH_VOLUME_RANGE > sum;
177 #else
178  is_quiet = noise_stats->slow_peak_sum - FLOAT2FIX(SPEECH_VOLUME_RANGE) > sum;
179 #endif
180  return is_quiet;
181 }
182 
183 /* temporal masking */
184 static void
185 fe_temp_masking(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * peak, int32 num_filt)
186 {
187  powspec_t cur_in;
188  int i;
189 
190  for (i = 0; i < num_filt; i++) {
191  cur_in = buf[i];
192 
193 #ifndef FIXED_POINT
194  peak[i] *= noise_stats->lambda_t;
195  if (buf[i] < noise_stats->lambda_t * peak[i])
196  buf[i] = peak[i] * noise_stats->mu_t;
197 #else
198  peak[i] += noise_stats->lambda_t;
199  if (buf[i] < noise_stats->lambda_t + peak[i])
200  buf[i] = peak[i] + noise_stats->mu_t;
201 #endif
202 
203  if (cur_in > peak[i])
204  peak[i] = cur_in;
205  }
206 }
207 
208 /* spectral weight smoothing */
209 static void
210 fe_weight_smooth(noise_stats_t *noise_stats, powspec_t * buf, powspec_t * coefs, int32 num_filt)
211 {
212  int i, j;
213  int l1, l2;
214  powspec_t coef;
215 
216  for (i = 0; i < num_filt; i++) {
217  l1 = ((i - SMOOTH_WINDOW) > 0) ? (i - SMOOTH_WINDOW) : 0;
218  l2 = ((i + SMOOTH_WINDOW) <
219  (num_filt - 1)) ? (i + SMOOTH_WINDOW) : (num_filt - 1);
220 
221 #ifndef FIXED_POINT
222  coef = 0;
223  for (j = l1; j <= l2; j++) {
224  coef += coefs[j];
225  }
226  buf[i] = buf[i] * (coef / (l2 - l1 + 1));
227 #else
228  coef = MIN_FIXLOG;
229  for (j = l1; j <= l2; j++) {
230  coef = fe_log_add(coef, coefs[j]);
231  }
232  buf[i] = buf[i] + coef - noise_stats->smooth_scaling[l2 - l1 + 1];
233 #endif
234 
235  }
236 }
237 
239 fe_init_noisestats(int num_filters)
240 {
241  int i;
242  noise_stats_t *noise_stats;
243 
244  noise_stats = (noise_stats_t *) ckd_calloc(1, sizeof(noise_stats_t));
245 
246  noise_stats->power =
247  (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
248  noise_stats->noise =
249  (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
250  noise_stats->floor =
251  (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
252  noise_stats->peak =
253  (powspec_t *) ckd_calloc(num_filters, sizeof(powspec_t));
254 
255  noise_stats->undefined = TRUE;
256  noise_stats->num_filters = num_filters;
257 
258 #ifndef FIXED_POINT
259  noise_stats->lambda_power = LAMBDA_POWER;
260  noise_stats->comp_lambda_power = 1 - LAMBDA_POWER;
261  noise_stats->lambda_a = LAMBDA_A;
262  noise_stats->comp_lambda_a = 1 - LAMBDA_A;
263  noise_stats->lambda_b = LAMBDA_B;
264  noise_stats->comp_lambda_b = 1 - LAMBDA_B;
265  noise_stats->lambda_t = LAMBDA_T;
266  noise_stats->mu_t = MU_T;
267  noise_stats->max_gain = MAX_GAIN;
268  noise_stats->inv_max_gain = 1.0 / MAX_GAIN;
269 
270  for (i = 1; i < 2 * SMOOTH_WINDOW + 1; i++) {
271  noise_stats->smooth_scaling[i] = 1.0 / i;
272  }
273 #else
274  noise_stats->lambda_power = FLOAT2FIX(log(LAMBDA_POWER));
275  noise_stats->comp_lambda_power = FLOAT2FIX(log(1 - LAMBDA_POWER));
276  noise_stats->lambda_a = FLOAT2FIX(log(LAMBDA_A));
277  noise_stats->comp_lambda_a = FLOAT2FIX(log(1 - LAMBDA_A));
278  noise_stats->lambda_b = FLOAT2FIX(log(LAMBDA_B));
279  noise_stats->comp_lambda_b = FLOAT2FIX(log(1 - LAMBDA_B));
280  noise_stats->lambda_t = FLOAT2FIX(log(LAMBDA_T));
281  noise_stats->mu_t = FLOAT2FIX(log(MU_T));
282  noise_stats->max_gain = FLOAT2FIX(log(MAX_GAIN));
283  noise_stats->inv_max_gain = FLOAT2FIX(log(1.0 / MAX_GAIN));
284 
285  for (i = 1; i < 2 * SMOOTH_WINDOW + 3; i++) {
286  noise_stats->smooth_scaling[i] = FLOAT2FIX(log(i));
287  }
288 #endif
289 
290 #ifdef VAD_DEBUG
291  vad_stats = fopen("vad_debug", "w");
292 #endif
293 
294  return noise_stats;
295 }
296 
297 void
298 fe_reset_noisestats(noise_stats_t * noise_stats)
299 {
300  if (noise_stats)
301  noise_stats->undefined = TRUE;
302 }
303 
304 void
305 fe_free_noisestats(noise_stats_t * noise_stats)
306 {
307  ckd_free(noise_stats->power);
308  ckd_free(noise_stats->noise);
309  ckd_free(noise_stats->floor);
310  ckd_free(noise_stats->peak);
311  ckd_free(noise_stats);
312 #ifdef VAD_DEBUG
313  fclose(vad_stats);
314  E_INFO("Low SNR [%ld] frames; Low volume [%ld] frames\n", (long)low_snr, (long)low_volume);
315 #endif
316 
317 }
318 
323 void
324 fe_track_snr(fe_t * fe, int32 *in_speech)
325 {
326  powspec_t *signal;
327  powspec_t *gain;
328  noise_stats_t *noise_stats;
329  powspec_t *mfspec;
330  int32 i, num_filts;
331  int16 is_quiet;
332  powspec_t lrt, snr;
333 
334  if (!(fe->remove_noise || fe->remove_silence)) {
335  *in_speech = TRUE;
336  return;
337  }
338 
339  noise_stats = fe->noise_stats;
340  mfspec = fe->mfspec;
341  num_filts = noise_stats->num_filters;
342 
343  signal = (powspec_t *) ckd_calloc(num_filts, sizeof(powspec_t));
344 
345  if (noise_stats->undefined) {
346  noise_stats->slow_peak_sum = FIX2FLOAT(0.0);
347  for (i = 0; i < num_filts; i++) {
348  noise_stats->power[i] = mfspec[i];
349 #ifndef FIXED_POINT
350  noise_stats->noise[i] = mfspec[i] / noise_stats->max_gain;
351  noise_stats->floor[i] = mfspec[i] / noise_stats->max_gain;
352  noise_stats->peak[i] = 0.0;
353 #else
354  noise_stats->noise[i] = mfspec[i] - noise_stats->max_gain;;
355  noise_stats->floor[i] = mfspec[i] - noise_stats->max_gain;
356  noise_stats->peak[i] = MIN_FIXLOG;
357 #endif
358  }
359  noise_stats->undefined = FALSE;
360  }
361 
362  /* Calculate smoothed power */
363  for (i = 0; i < num_filts; i++) {
364 #ifndef FIXED_POINT
365  noise_stats->power[i] =
366  noise_stats->lambda_power * noise_stats->power[i] + noise_stats->comp_lambda_power * mfspec[i];
367 #else
368  noise_stats->power[i] = fe_log_add(noise_stats->lambda_power + noise_stats->power[i],
369  noise_stats->comp_lambda_power + mfspec[i]);
370 #endif
371  }
372 
373  /* Noise estimation and vad decision */
374  fe_lower_envelope(noise_stats, noise_stats->power, noise_stats->noise, num_filts);
375 
376  lrt = FLOAT2FIX(0.0);
377  for (i = 0; i < num_filts; i++) {
378 #ifndef FIXED_POINT
379  signal[i] = noise_stats->power[i] - noise_stats->noise[i];
380  if (signal[i] < 1.0)
381  signal[i] = 1.0;
382  snr = log(noise_stats->power[i] / noise_stats->noise[i]);
383 #else
384  signal[i] = fe_log_sub(noise_stats->power[i], noise_stats->noise[i]);
385  snr = noise_stats->power[i] - noise_stats->noise[i];
386 #endif
387  if (snr > lrt)
388  lrt = snr;
389  }
390  is_quiet = fe_is_frame_quiet(noise_stats, signal, num_filts);
391 
392 #ifdef VAD_DEBUG
393  if (lrt < fe->vad_threshold)
394  low_snr++;
395  else if (is_quiet)
396  low_volume++;
397 #endif
398 
399 #ifndef FIXED_POINT
400  if (fe->remove_silence && (lrt < fe->vad_threshold || is_quiet)) {
401 #else
402  if (fe->remove_silence && (lrt < FLOAT2FIX(fe->vad_threshold) || is_quiet)) {
403 #endif
404  *in_speech = FALSE;
405  } else {
406  *in_speech = TRUE;
407  }
408 
409 #ifdef VAD_DEBUG
410 #ifndef FIXED_POINT
411  fprintf(vad_stats, "%.3f %d\n", lrt, *in_speech);
412 #else
413  fprintf(vad_stats, "%d %d\n", lrt, *in_speech);
414 #endif
415 #endif
416 
417  fe_lower_envelope(noise_stats, signal, noise_stats->floor, num_filts);
418 
419  fe_temp_masking(noise_stats, signal, noise_stats->peak, num_filts);
420 
421  if (!fe->remove_noise) {
422  /* no need for further calculations if noise cancellation disabled */
423  ckd_free(signal);
424  return;
425  }
426 
427  for (i = 0; i < num_filts; i++) {
428  if (signal[i] < noise_stats->floor[i])
429  signal[i] = noise_stats->floor[i];
430  }
431 
432  gain = (powspec_t *) ckd_calloc(num_filts, sizeof(powspec_t));
433 #ifndef FIXED_POINT
434  for (i = 0; i < num_filts; i++) {
435  if (signal[i] < noise_stats->max_gain * noise_stats->power[i])
436  gain[i] = signal[i] / noise_stats->power[i];
437  else
438  gain[i] = noise_stats->max_gain;
439  if (gain[i] < noise_stats->inv_max_gain)
440  gain[i] = noise_stats->inv_max_gain;
441  }
442 #else
443  for (i = 0; i < num_filts; i++) {
444  gain[i] = signal[i] - noise_stats->power[i];
445  if (gain[i] > noise_stats->max_gain)
446  gain[i] = noise_stats->max_gain;
447  if (gain[i] < noise_stats->inv_max_gain)
448  gain[i] = noise_stats->inv_max_gain;
449  }
450 #endif
451 
452  /* Weight smoothing and time frequency normalization */
453  fe_weight_smooth(noise_stats, mfspec, gain, num_filts);
454 
455  ckd_free(gain);
456  ckd_free(signal);
457 }
458 
459 void
460 fe_vad_hangover(fe_t * fe, mfcc_t * feat, int32 is_speech, int32 store_pcm)
461 {
462  if (!fe->vad_data->in_speech) {
463  fe_prespch_write_cep(fe->vad_data->prespch_buf, feat);
464  if (store_pcm)
465  fe_prespch_write_pcm(fe->vad_data->prespch_buf, fe->spch);
466  }
467 
468  /* track vad state and deal with cepstrum prespeech buffer */
469  if (is_speech) {
470  fe->vad_data->post_speech_frames = 0;
471  if (!fe->vad_data->in_speech) {
472  fe->vad_data->pre_speech_frames++;
473  /* check for transition sil->speech */
474  if (fe->vad_data->pre_speech_frames >= fe->start_speech) {
475  fe->vad_data->pre_speech_frames = 0;
476  fe->vad_data->in_speech = 1;
477  }
478  }
479  } else {
480  fe->vad_data->pre_speech_frames = 0;
481  if (fe->vad_data->in_speech) {
482  fe->vad_data->post_speech_frames++;
483  /* check for transition speech->sil */
484  if (fe->vad_data->post_speech_frames >= fe->post_speech) {
485  fe->vad_data->post_speech_frames = 0;
486  fe->vad_data->in_speech = 0;
487  fe_prespch_reset_cep(fe->vad_data->prespch_buf);
488  fe_prespch_reset_pcm(fe->vad_data->prespch_buf);
489  }
490  }
491  }
492 }
Miscellaneous useful string functions.
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
Sphinx&#39;s memory allocation/deallocation routines.
Basic type definitions used in Sphinx.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
Implementation of logging routines.
Structure for the front-end computation.
Definition: fe_internal.h:117