48 #include "sphinxbase/byteorder.h"
49 #include "sphinxbase/fixpoint.h"
55 #include "fe_internal.h"
/*
 * Argument definition table for the front end.  The entries are supplied by
 * waveform_to_cepstral_command_line_macro() and are followed by an
 * all-NULL/zero entry (presumably the list terminator -- confirm against the
 * arg_t parser).  The closing brace is not visible in this excerpt.
 */
58 static const arg_t fe_args[] = {
59 waveform_to_cepstral_command_line_macro(),
60 { NULL, 0, NULL, NULL }
/*
 * Fragment of the general front-end parameter parsing routine (presumably
 * fe_parse_general_params(); the signature, local declarations and the
 * error-path returns are not visible in this excerpt).
 * Reads settings from `config` into `fe` and validates them.
 */
/* Sampling rate and frame rate come from "-samprate" and "-frate". */
69 fe->sampling_rate = cmd_ln_float32_r(config,
"-samprate");
70 frate = cmd_ln_int32_r(config,
"-frate");
/*
 * Reject a frame rate that does not fit in int16, exceeds the sampling rate,
 * or is below 1.  Note the message only mentions the sampling-rate case.
 */
71 if (frate > MAX_INT16 || frate > fe->sampling_rate || frate < 1) {
73 (
"Frame rate %d can not be bigger than sample rate %.02f\n",
74 frate, fe->sampling_rate);
/*
 * Narrowing cast; the MAX_INT16 check above is what keeps it in range,
 * assuming the failing branch bails out (not visible here).
 */
78 fe->frame_rate = (int16)frate;
/* Seed for the dither generator, from "-seed". */
81 fe->dither_seed = cmd_ln_int32_r(config,
"-seed");
/*
 * Byte-swap flag: 0 when "-input_endian" matches the string compared below,
 * 1 otherwise.  With WORDS_BIGENDIAN defined the comparison string is "big";
 * the second assignment compares against "little" (presumably the #else
 * branch -- the directive itself is not visible).
 */
83 #ifdef WORDS_BIGENDIAN
84 fe->swap = strcmp(
"big",
cmd_ln_str_r(config,
"-input_endian")) == 0 ? 0 : 1;
86 fe->swap = strcmp(
"little",
cmd_ln_str_r(config,
"-input_endian")) == 0 ? 0 : 1;
/* Window length ("-wlen") and pre-emphasis coefficient ("-alpha"). */
88 fe->window_length = cmd_ln_float32_r(config,
"-wlen");
89 fe->pre_emphasis_alpha = cmd_ln_float32_r(config,
"-alpha");
/* The int32 option values are narrowed to uint8 / int16 without a range check here. */
91 fe->num_cepstra = (uint8)cmd_ln_int32_r(config,
"-ncep");
92 fe->fft_size = (int16)cmd_ln_int32_r(config,
"-nfft");
/*
 * Derive fft_order by halving fft_size down to 1, counting the steps.
 * An odd intermediate value (or a non-positive fft_size) means the size is
 * not a power of two, which is reported as an error.
 */
95 for (j = fe->fft_size, fe->fft_order = 0; j > 1; j >>= 1, fe->fft_order++) {
96 if (((j % 2) != 0) || (fe->fft_size <= 0)) {
97 E_ERROR(
"fft: number of points must be a power of 2 (is %d)\n",
/* The FFT must be able to hold one analysis window (window_length * sampling_rate samples, truncated). */
103 if (fe->fft_size < (
int)(fe->window_length * fe->sampling_rate)) {
104 E_ERROR(
"FFT: Number of points must be greater or equal to frame size (%d samples)\n",
105 (
int)(fe->window_length * fe->sampling_rate));
/* Voice-activity-detection settings; the int32 values are narrowed to int16. */
109 fe->pre_speech = (int16)cmd_ln_int32_r(config,
"-vad_prespeech");
110 fe->post_speech = (int16)cmd_ln_int32_r(config,
"-vad_postspeech");
111 fe->start_speech = (int16)cmd_ln_int32_r(config,
"-vad_startspeech");
112 fe->vad_threshold = cmd_ln_float32_r(config,
"-vad_threshold");
/* Map the "-transform" string onto the transform enum; any other value is an error. */
118 if (0 == strcmp(
cmd_ln_str_r(config,
"-transform"),
"dct"))
119 fe->transform = DCT_II;
120 else if (0 == strcmp(
cmd_ln_str_r(config,
"-transform"),
"legacy"))
121 fe->transform = LEGACY_DCT;
122 else if (0 == strcmp(
cmd_ln_str_r(config,
"-transform"),
"htk"))
123 fe->transform = DCT_HTK;
125 E_ERROR(
"Invalid transform type (values are 'dct', 'legacy', 'htk')\n");
/* log_spec is set to one of two modes; the conditions selecting each are not visible here. */
130 fe->log_spec = RAW_LOG_SPEC;
132 fe->log_spec = SMOOTH_LOG_SPEC;
/*
 * Fragment of the mel filterbank parameter setup (presumably
 * fe_parse_melfb_params(); the signature is not visible in this excerpt).
 * Fills `mel` from values already stored in `fe` and from `config`.
 */
/* Values shared with the front end are copied over; the filter count comes from "-nfilt". */
140 mel->sampling_rate = fe->sampling_rate;
141 mel->fft_size = fe->fft_size;
142 mel->num_cepstra = fe->num_cepstra;
143 mel->num_filters = cmd_ln_int32_r(config,
"-nfilt");
/*
 * The output feature dimension is either the number of filters or the number
 * of cepstra; the condition choosing between them is not visible here.
 */
146 fe->feature_dimension = mel->num_filters;
148 fe->feature_dimension = fe->num_cepstra;
/* Filterbank frequency limits from "-upperf" / "-lowerf". */
150 mel->upper_filt_freq = cmd_ln_float32_r(config,
"-upperf");
151 mel->lower_filt_freq = cmd_ln_float32_r(config,
"-lowerf");
/* Stores the pointer returned by cmd_ln_str_r() directly; no copy is made here. */
156 mel->warp_params =
cmd_ln_str_r(config,
"-warp_params");
157 mel->lifter_val = cmd_ln_int32_r(config,
"-lifter");
/*
 * Select the warping function by mel->warp_type (assigned on a line not
 * visible here) and log an error unless fe_warp_set() reports FE_SUCCESS.
 */
162 if (fe_warp_set(mel, mel->warp_type) != FE_SUCCESS) {
163 E_ERROR(
"Failed to initialize the warping function.\n");
/* Hand the warp parameter string and the sampling rate to the selected warp. */
166 fe_warp_set_parameters(mel, mel->warp_params, mel->sampling_rate);
/*
 * Log the current front-end configuration through E_INFO.
 *
 * fe: front-end object to report on; it is only read (const).  fe->mel_fb is
 *     dereferenced without a check here.
 *
 * Return type, braces and some conditionals are not visible in this excerpt.
 */
171 fe_print_current(
fe_t const *fe)
173 E_INFO(
"Current FE Parameters:\n");
174 E_INFO(
"\tSampling Rate: %f\n", fe->sampling_rate);
175 E_INFO(
"\tFrame Size: %d\n", fe->frame_size);
176 E_INFO(
"\tFrame Shift: %d\n", fe->frame_shift);
177 E_INFO(
"\tFFT Size: %d\n", fe->fft_size);
178 E_INFO(
"\tLower Frequency: %g\n",
179 fe->mel_fb->lower_filt_freq);
180 E_INFO(
"\tUpper Frequency: %g\n",
181 fe->mel_fb->upper_filt_freq);
182 E_INFO(
"\tNumber of filters: %d\n", fe->mel_fb->num_filters);
183 E_INFO(
"\tNumber of Overflow Samps: %d\n", fe->num_overflow_samps);
/* Boolean options are rendered by inserting "" or "not " into the sentence. */
184 E_INFO(
"Will %sremove DC offset at frame level\n",
185 fe->remove_dc ?
"" :
"not ");
/* Dither status; the condition selecting between the two variants is not visible here. */
187 E_INFO(
"Will add dither to audio\n");
188 E_INFO(
"Dither seeded with %d\n", fe->dither_seed);
191 E_INFO(
"Will not add dither to audio\n");
/* Liftering is only reported when a non-zero lifter value is configured. */
193 if (fe->mel_fb->lifter_val) {
194 E_INFO(
"Will apply sine-curve liftering, period %d\n",
195 fe->mel_fb->lifter_val);
197 E_INFO(
"Will %snormalize filters to unit area\n",
198 fe->mel_fb->unit_area ?
"" :
"not ");
199 E_INFO(
"Will %sround filter frequencies to DFT points\n",
200 fe->mel_fb->round_filters ?
"" :
"not ");
201 E_INFO(
"Will %suse double bandwidth in mel filter\n",
202 fe->mel_fb->doublewide ?
"" :
"not ");
/*
 * Fragment of the front-end construction routine (its signature, the
 * allocation of `fe` and of fe->vad_data, and the error-path cleanup are not
 * visible in this excerpt).  Parses the configuration, derives frame
 * geometry, and allocates the working buffers.
 */
215 int prespch_frame_len;
/* Parse general parameters using a retained reference to `config`; a negative result means failure. */
221 if (fe_parse_general_params(
cmd_ln_retain(config), fe) < 0) {
/* Frame shift and frame size in samples; adding 0.5 before the cast rounds to nearest. */
230 fe->frame_shift = (int32) (fe->sampling_rate / fe->frame_rate + 0.5);
231 fe->frame_size = (int32) (fe->window_length * fe->sampling_rate + 0.5);
232 fe->pre_emphasis_prior = 0;
/* NOTE(review): this is only enforced in builds where assert() is active. */
236 assert (fe->frame_shift > 1);
/* A frame must span at least one frame shift. */
238 if (fe->frame_size < fe->frame_shift) {
240 (
"Frame size %d (-wlen) must be greater than frame shift %d (-frate)\n",
241 fe->frame_size, fe->frame_shift);
/* A frame must fit inside the FFT. */
247 if (fe->frame_size > (fe->fft_size)) {
249 (
"Number of FFT points has to be a power of 2 higher than %d, it is %d\n",
250 fe->frame_size, fe->fft_size);
/* Any condition guarding this call is not visible here. */
256 fe_init_dither(fe->dither_seed);
/* Overflow buffer holds up to one frame of int16 samples carried between calls. */
259 fe->overflow_samps =
ckd_calloc(fe->frame_size,
sizeof(int16));
/*
 * NOTE(review): only frame_size/2 window entries are allocated while
 * fe_create_hamming() is given the full frame_size -- presumably just one
 * half of a symmetric window is stored; confirm against fe_create_hamming().
 */
260 fe->hamming_window =
ckd_calloc(fe->frame_size/2,
sizeof(window_t));
263 fe_create_hamming(fe->hamming_window, fe->frame_size);
/* Zero-initialised mel filterbank descriptor, then filled from the configuration. */
266 fe->mel_fb =
ckd_calloc(1,
sizeof(*fe->mel_fb));
269 fe_parse_melfb_params(config, fe, fe->mel_fb);
/* The upper filter edge may exceed half the sampling rate by at most 1.0. */
271 if (fe->mel_fb->upper_filt_freq > fe->sampling_rate / 2 + 1.0) {
272 E_ERROR(
"Upper frequency %.1f is higher than samprate/2 (%.1f)\n",
273 fe->mel_fb->upper_filt_freq, fe->sampling_rate / 2);
278 fe_build_melfilters(fe->mel_fb);
280 fe_compute_melcosine(fe->mel_fb);
/* Noise statistics are only created when noise or silence removal is enabled. */
281 if (fe->remove_noise || fe->remove_silence)
282 fe->noise_stats = fe_init_noisestats(fe->mel_fb->num_filters);
/*
 * Width of one buffered pre-speech frame: num_filters when log_spec is
 * RAW_LOG_SPEC, num_cepstra otherwise.  The buffer is created for
 * pre_speech + 1 frames.
 */
285 prespch_frame_len = fe->log_spec != RAW_LOG_SPEC ? fe->num_cepstra : fe->mel_fb->num_filters;
286 fe->vad_data->prespch_buf = fe_prespch_init(fe->pre_speech + 1, prespch_frame_len, fe->frame_shift);
/* Per-frame work buffers, sized by frame_size, fft_size and the filter count. */
290 fe->spch =
ckd_calloc(fe->frame_size,
sizeof(*fe->spch));
291 fe->frame =
ckd_calloc(fe->fft_size,
sizeof(*fe->frame));
292 fe->spec =
ckd_calloc(fe->fft_size,
sizeof(*fe->spec));
293 fe->mfspec =
ckd_calloc(fe->mel_fb->num_filters,
sizeof(*fe->mfspec));
/* Two tables of fft_size/4 entries each, filled in by fe_create_twiddle(). */
296 fe->ccc =
ckd_calloc(fe->fft_size / 4,
sizeof(*fe->ccc));
297 fe->sss =
ckd_calloc(fe->fft_size / 4,
sizeof(*fe->sss));
298 fe_create_twiddle(fe);
/* Log the resulting configuration. */
301 fe_print_current(fe);
/*
 * Accessor taking the front-end object `fe`.  Only the parameter list is
 * visible in this excerpt; the return type and body are not shown
 * (presumably returns the configuration stored on `fe` -- confirm).
 */
316 fe_get_config(
fe_t *fe)
/*
 * Initialise dithering with the given seed.
 *
 * seed: value reported in the log message below.  The statement that
 *       actually seeds the random generator is not visible in this excerpt.
 */
322 fe_init_dither(int32 seed)
324 E_INFO(
"Using %d as the seed.\n", seed);
/*
 * Fragment that resets voice-activity-detection state (presumably the body of
 * fe_reset_vad_data(); the signature is not visible in this excerpt).
 * Clears the in-speech flag and both frame counters, then resets the
 * pre-speech buffer through fe_prespch_reset_cep().
 */
331 vad_data->in_speech = 0;
332 vad_data->pre_speech_frames = 0;
333 vad_data->post_speech_frames = 0;
334 fe_prespch_reset_cep(vad_data->prespch_buf);
/*
 * Prepare the front end for a new utterance.
 *
 * fe: front-end object whose per-utterance state is reset.
 *
 * Discards carried-over samples, zeroes the overflow buffer, clears the
 * pre-emphasis history and resets the VAD state.  The return statement is
 * not visible in this excerpt.
 */
338 fe_start_utt(
fe_t * fe)
340 fe->num_overflow_samps = 0;
/* The overflow buffer holds frame_size int16 samples. */
341 memset(fe->overflow_samps, 0, fe->frame_size *
sizeof(int16));
342 fe->pre_emphasis_prior = 0;
343 fe_reset_vad_data(fe->vad_data);
/*
 * Prepare the front end for a new input stream.
 *
 * fe: front-end object; its processed-sample counter is zeroed and its noise
 *     statistics are reset via fe_reset_noisestats().
 *
 * NOTE(review): fe->noise_stats is passed unconditionally; whether it may be
 * NULL here depends on code not visible in this excerpt.
 */
348 fe_start_stream(
fe_t *fe)
350 fe->num_processed_samps = 0;
351 fe_reset_noisestats(fe->noise_stats);
/*
 * Return the dimensionality of one output feature vector.
 *
 * fe: front-end object.
 * Returns fe->feature_dimension converted to int.
 */
355 fe_get_output_size(
fe_t *fe)
357 return (
int)fe->feature_dimension;
/*
 * Report the frame geometry in samples.
 *
 * fe:              front-end object.
 * out_frame_shift: receives fe->frame_shift.
 * out_frame_size:  receives fe->frame_size (its parameter declaration is not
 *                  visible in this excerpt).
 *
 * Any NULL guards around the two stores are not visible here.
 */
361 fe_get_input_size(
fe_t *fe,
int *out_frame_shift,
365 *out_frame_shift = fe->frame_shift;
367 *out_frame_size = fe->frame_size;
/*
 * Return the current voice-activity state.
 *
 * fe: front-end object.
 * Returns fe->vad_data->in_speech.
 */
371 fe_get_vad_state(
fe_t *fe)
373 return fe->vad_data->in_speech;
/*
 * Convenience wrapper around fe_process_frames_ext().
 *
 * All arguments are forwarded unchanged, with NULL passed for the two extra
 * parameters that the extended variant takes before out_frameidx.
 * The buf_cep and out_frameidx parameter declarations are not visible in
 * this excerpt.
 *
 * Returns whatever fe_process_frames_ext() returns.
 */
377 fe_process_frames(
fe_t *fe,
378 int16
const **inout_spch,
379 size_t *inout_nsamps,
381 int32 *inout_nframes,
384 return fe_process_frames_ext(fe, inout_spch, inout_nsamps, buf_cep, inout_nframes, NULL, NULL, out_frameidx);
/*
 * Move buffered pre-speech frames into the caller's output array.
 *
 * fe:            front-end object owning the pre-speech buffer.
 * inout_nframes: remaining capacity of buf_cep; the loop stops when it is
 *                no longer positive.
 * buf_cep:       output frame array.
 * outidx:        index of the next free slot in buf_cep.
 *
 * Each iteration reads one frame into buf_cep[outidx] and continues while
 * fe_prespch_read_cep() returns a positive value.  The loop body and return
 * statement are not visible in this excerpt (callers assign the result to
 * outidx).
 */
392 fe_copy_from_prespch(
fe_t *fe, int32 *inout_nframes, mfcc_t **buf_cep,
int outidx)
394 while ((*inout_nframes) > 0 && fe_prespch_read_cep(fe->vad_data->prespch_buf, buf_cep[outidx]) > 0) {
/*
 * Post-frame bookkeeping tied to the voice-activity state.
 *
 * fe:            front-end object.
 * inout_nframes: remaining output capacity.
 * buf_cep:       output frame array.
 * outidx:        index of the next free output slot.
 * out_frameidx:  receives a frame index when buffered pre-speech is flushed.
 * inout_nsamps:  samples still unconsumed in the current call.
 * orig_nsamps:   sample count originally passed to the current call.
 *
 * Several lines (else branches, counter updates, the return) are not visible
 * in this excerpt; callers assign the result to outidx.
 */
405 fe_check_prespeech(
fe_t *fe, int32 *inout_nframes, mfcc_t **buf_cep,
int outidx, int32 *out_frameidx,
size_t *inout_nsamps,
int orig_nsamps)
/* When speech is active and pre-speech frames are buffered, flush them to the output. */
407 if (fe->vad_data->in_speech) {
408 if (fe_prespch_ncep(fe->vad_data->prespch_buf) > 0) {
412 outidx = fe_copy_from_prespch(fe, inout_nframes, buf_cep, outidx);
/*
 * Frame index: samples consumed so far (previous calls plus this one)
 * divided by the frame shift, moved back by the pre_speech frame count.
 */
416 *out_frameidx = (fe->num_processed_samps + orig_nsamps - *inout_nsamps) / fe->frame_shift - fe->pre_speech;
/* One frame shift's worth of carried-over samples has now been used up. */
424 if (fe->num_overflow_samps > 0)
425 fe->num_overflow_samps -= fe->frame_shift;
/*
 * Core streaming routine: consume samples from *inout_spch and write feature
 * frames to buf_cep.
 *
 * fe:                 front-end object (carries the overflow buffer between calls).
 * inout_spch:         in/out pointer to the input samples; advanced past consumed data.
 * inout_nsamps:       in/out count of samples remaining at *inout_spch.
 * inout_nframes:      in: capacity of buf_cep; out: number of frames produced.
 * voiced_spch_nsamps: see the declaration; not used on the lines visible here.
 *
 * Several parameters (buf_cep, voiced_spch, out_frameidx), local declarations,
 * else branches and return statements are not visible in this excerpt.
 */
431 fe_process_frames_ext(
fe_t *fe,
432 int16
const **inout_spch,
433 size_t *inout_nsamps,
435 int32 *inout_nframes,
437 int32 *voiced_spch_nsamps,
440 int outidx, n_overflow, orig_n_overflow;
441 int16
const *orig_spch;
/*
 * Query mode: with no output buffer only a frame count is computed and
 * returned.  Buffered pre-speech frames are added while not in speech.
 */
450 if (buf_cep == NULL) {
451 if (*inout_nsamps + fe->num_overflow_samps < (
size_t)fe->frame_size)
455 + ((*inout_nsamps + fe->num_overflow_samps - fe->frame_size)
457 if (!fe->vad_data->in_speech)
458 *inout_nframes += fe_prespch_ncep(fe->vad_data->prespch_buf);
459 return *inout_nframes;
/*
 * Not enough data for a full frame even with the carried-over samples:
 * append the input to the overflow buffer and mark it consumed.
 */
466 if (*inout_nsamps + fe->num_overflow_samps < (
size_t)fe->frame_size) {
467 if (*inout_nsamps > 0) {
469 memcpy(fe->overflow_samps + fe->num_overflow_samps,
470 *inout_spch, *inout_nsamps * (
sizeof(int16)));
471 fe->num_overflow_samps += *inout_nsamps;
472 fe->num_processed_samps += *inout_nsamps;
473 *inout_spch += *inout_nsamps;
/* Caller offered no output capacity (handling not visible here). */
482 if (*inout_nframes < 1) {
/*
 * Flush buffered pre-speech frames first; if that exhausts the output
 * capacity, report the number of frames written so far.
 */
491 if (fe->vad_data->in_speech && fe_prespch_ncep(fe->vad_data->prespch_buf) > 0) {
492 outidx = fe_copy_from_prespch(fe, inout_nframes, buf_cep, outidx);
493 if ((*inout_nframes) < 1) {
495 *inout_nframes = outidx;
/* Remember the starting state; the overflow bookkeeping below is relative to it. */
501 orig_spch = *inout_spch;
502 orig_nsamps = *inout_nsamps;
503 orig_n_overflow = fe->num_overflow_samps;
/*
 * First frame.  With carried-over samples, top the overflow buffer up to a
 * full frame from the input and read the frame from there; the direct read
 * from the input below is presumably the else branch (not visible).
 */
506 if (fe->num_overflow_samps > 0) {
507 int offset = fe->frame_size - fe->num_overflow_samps;
509 memcpy(fe->overflow_samps + fe->num_overflow_samps,
510 *inout_spch, offset *
sizeof(**inout_spch));
511 fe_read_frame(fe, fe->overflow_samps, fe->frame_size);
513 *inout_spch += offset;
514 *inout_nsamps -= offset;
516 fe_read_frame(fe, *inout_spch, fe->frame_size);
518 *inout_spch += fe->frame_size;
519 *inout_nsamps -= fe->frame_size;
/* Emit the first frame, then run the VAD/pre-speech bookkeeping. */
522 fe_write_frame(fe, buf_cep[outidx], voiced_spch != NULL);
523 outidx = fe_check_prespeech(fe, inout_nframes, buf_cep, outidx, out_frameidx, inout_nsamps, orig_nsamps);
/* Further frames: one per frame shift while output room and input samples remain. */
526 while (*inout_nframes > 0 && *inout_nsamps >= (
size_t)fe->frame_shift) {
527 fe_shift_frame(fe, *inout_spch, fe->frame_shift);
528 fe_write_frame(fe, buf_cep[outidx], voiced_spch != NULL);
530 outidx = fe_check_prespeech(fe, inout_nframes, buf_cep, outidx, out_frameidx, inout_nsamps, orig_nsamps);
533 *inout_spch += fe->frame_shift;
534 *inout_nsamps -= fe->frame_shift;
/*
 * Rebuild the overflow buffer.  Case 1: nothing of the old overflow is still
 * needed.  Keep the tail of the last frame (frame_size - frame_shift samples,
 * capped at what was consumed from this call's input) plus up to one frame
 * shift of the remaining input.
 */
538 if (fe->num_overflow_samps <= 0) {
540 n_overflow = *inout_nsamps;
541 if (n_overflow > fe->frame_shift)
542 n_overflow = fe->frame_shift;
543 fe->num_overflow_samps = fe->frame_size - fe->frame_shift;
545 if (fe->num_overflow_samps > *inout_spch - orig_spch)
546 fe->num_overflow_samps = *inout_spch - orig_spch;
547 fe->num_overflow_samps += n_overflow;
548 if (fe->num_overflow_samps > 0) {
549 memcpy(fe->overflow_samps,
550 *inout_spch - (fe->frame_size - fe->frame_shift),
551 fe->num_overflow_samps *
sizeof(**inout_spch));
553 *inout_spch += n_overflow;
554 *inout_nsamps -= n_overflow;
/*
 * Case 2 (presumably the else branch): part of the old overflow is still
 * needed.  Slide it to the front of the buffer (memmove, since source and
 * destination lie in the same buffer), then append samples from the start of
 * this call's input, capped at the free space in the buffer.
 */
559 memmove(fe->overflow_samps,
560 fe->overflow_samps + orig_n_overflow - fe->num_overflow_samps,
561 fe->num_overflow_samps *
sizeof(*fe->overflow_samps));
563 n_overflow = *inout_spch - orig_spch + *inout_nsamps;
564 if (n_overflow > fe->frame_size - fe->num_overflow_samps)
565 n_overflow = fe->frame_size - fe->num_overflow_samps;
566 memcpy(fe->overflow_samps + fe->num_overflow_samps,
567 orig_spch, n_overflow *
sizeof(*orig_spch));
568 fe->num_overflow_samps += n_overflow;
/* If more was buffered than had been consumed, mark the extra samples consumed too. */
570 if (n_overflow > *inout_spch - orig_spch) {
571 n_overflow -= (*inout_spch - orig_spch);
572 *inout_spch += n_overflow;
573 *inout_nsamps -= n_overflow;
/* Report the frames written and add the samples consumed by this call to the running total. */
579 *inout_nframes = outidx;
580 fe->num_processed_samps += orig_nsamps - *inout_nsamps;
/*
 * Process a whole block of samples, allocating the output array.
 *
 * fe:        front-end object.
 * spch:      input samples.
 * nsamps:    number of input samples.
 * cep_block: receives the allocated frame array (the store is not visible in
 *            this excerpt).
 * nframes:   receives the number of frames.
 *
 * Local declarations, the condition choosing between the two allocations and
 * the return statement are not visible in this excerpt.
 */
586 fe_process_utt(
fe_t * fe, int16
const * spch,
size_t nsamps,
587 mfcc_t *** cep_block, int32 * nframes)
/* With NULL sample and output buffers this call only fills in *nframes. */
593 fe_process_frames(fe, NULL, &nsamps, NULL, nframes, NULL);
/* Allocate *nframes rows of feature_dimension values, or a single row in the alternative case. */
596 cep = (mfcc_t **)
ckd_calloc_2d(*nframes, fe->feature_dimension,
sizeof(**cep));
598 cep = (mfcc_t **)
ckd_calloc_2d(1, fe->feature_dimension,
sizeof(**cep));
/* Second pass performs the actual feature extraction into cep. */
600 rv = fe_process_frames(fe, &spch, &nsamps, cep, nframes, NULL);
/*
 * Finish an utterance, processing any samples still held in the overflow
 * buffer.
 *
 * fe:        front-end object.
 * cepvector: receives the final frame when leftover samples exist.
 * nframes:   output frame count (its assignments are not visible in this
 *            excerpt).
 */
608 fe_end_utt(
fe_t * fe, mfcc_t * cepvector, int32 * nframes)
/* Leftover samples form a last, shorter frame: only num_overflow_samps samples are read. */
612 if (fe->num_overflow_samps > 0) {
613 fe_read_frame(fe, fe->overflow_samps, fe->num_overflow_samps);
614 fe_write_frame(fe, cepvector, FALSE);
/* The statement guarded by this condition is not visible here. */
615 if (fe->vad_data->in_speech)
/* The overflow buffer is considered empty afterwards. */
620 fe->num_overflow_samps = 0;
/*
 * Fragment of the front-end release routine (signature and most of the
 * individual frees are not visible in this excerpt).
 */
/* Drop one reference; while others remain the object is kept (statement not visible). */
637 if (--fe->refcount > 0)
/* The cosine table is released with fe_free_2d() only when it was allocated. */
642 if (fe->mel_fb->mel_cosine)
643 fe_free_2d((
void *) fe->mel_fb->mel_cosine);
/* NOTE(review): any guards around the two calls below are not visible here. */
661 fe_free_noisestats(fe->noise_stats);
664 fe_prespch_free(fe->vad_data->prespch_buf);
/*
 * Convert a block of mfcc_t features to float32.
 *
 * fe:      front-end object (supplies feature_dimension).
 * input:   source frames.
 * output:  destination frames; may be the same array as input.
 * nframes: number of frames.
 *
 * Returns nframes * feature_dimension on the in-place path; the final return
 * is not visible in this excerpt.  The conditions under which these lines are
 * compiled (if any) are not visible either.
 */
678 fe_mfcc_to_float(
fe_t * fe,
679 mfcc_t ** input, float32 ** output, int32 nframes)
/* Same array for input and output: return the value count without converting. */
684 if ((
void *) input == (
void *) output)
685 return nframes * fe->feature_dimension;
/*
 * All values are addressed through row 0 as one flat run -- assumes the rows
 * are stored contiguously (as ckd_calloc_2d presumably does; confirm).
 */
687 for (i = 0; i < nframes * fe->feature_dimension; ++i)
688 output[0][i] = MFCC2FLOAT(input[0][i]);
/*
 * Convert a block of float32 features to mfcc_t.
 *
 * fe:      front-end object (supplies feature_dimension).
 * input:   source frames.
 * output:  destination frames; may be the same array as input.
 * nframes: number of frames.
 *
 * Returns nframes * feature_dimension on the in-place path; the final return
 * is not visible in this excerpt.  The conditions under which these lines are
 * compiled (if any) are not visible either.
 */
697 fe_float_to_mfcc(
fe_t * fe,
698 float32 ** input, mfcc_t ** output, int32 nframes)
/* Same array for input and output: return the value count without converting. */
703 if ((
void *) input == (
void *) output)
704 return nframes * fe->feature_dimension;
/*
 * All values are addressed through row 0 as one flat run -- assumes the rows
 * are stored contiguously (as ckd_calloc_2d presumably does; confirm).
 */
706 for (i = 0; i < nframes * fe->feature_dimension; ++i)
707 output[0][i] = FLOAT2MFCC(input[0][i]);
/*
 * Convert one log-spectrum frame to cepstra via fe_spec2cep().
 *
 * fe:      front-end object.
 * fr_spec: input frame with fe->mel_fb->num_filters values.
 * fr_cep:  output cepstral frame.
 *
 * Two variants are visible: a direct call, and one that first copies the
 * input into a temporary powspec_t buffer.  The conditional selecting between
 * them (presumably a preprocessor check on the numeric types) and the release
 * of the temporary buffer are not visible in this excerpt.
 */
713 fe_logspec_to_mfcc(
fe_t * fe,
const mfcc_t * fr_spec, mfcc_t * fr_cep)
716 fe_spec2cep(fe, fr_spec, fr_cep);
/* Variant with type conversion: widen/narrow each value into a powspec_t copy first. */
721 powspec =
ckd_malloc(fe->mel_fb->num_filters *
sizeof(powspec_t));
722 for (i = 0; i < fe->mel_fb->num_filters; ++i)
723 powspec[i] = (powspec_t) fr_spec[i];
724 fe_spec2cep(fe, powspec, fr_cep);
/*
 * Apply fe_dct2() to one log-spectrum frame.
 *
 * fe:      front-end object.
 * fr_spec: input frame with fe->mel_fb->num_filters values.
 * fr_cep:  output frame.
 *
 * fe_dct2() is always called with 0 as its last argument.  Two variants are
 * visible: a direct call, and one that first copies the input into a
 * temporary powspec_t buffer.  The conditional selecting between them and the
 * release of the temporary buffer are not visible in this excerpt.
 */
731 fe_logspec_dct2(
fe_t * fe,
const mfcc_t * fr_spec, mfcc_t * fr_cep)
734 fe_dct2(fe, fr_spec, fr_cep, 0);
/* Variant with type conversion: copy each value into a powspec_t buffer first. */
739 powspec =
ckd_malloc(fe->mel_fb->num_filters *
sizeof(powspec_t));
740 for (i = 0; i < fe->mel_fb->num_filters; ++i)
741 powspec[i] = (powspec_t) fr_spec[i];
742 fe_dct2(fe, powspec, fr_cep, 0);
/*
 * Apply fe_dct3() to one cepstral frame, producing a spectrum frame.
 *
 * fe:      front-end object.
 * fr_cep:  input cepstral frame.
 * fr_spec: output frame with fe->mel_fb->num_filters values.
 *
 * Two variants are visible: a direct call, and one that runs fe_dct3() into a
 * temporary powspec_t buffer and then converts each value to mfcc_t.  The
 * conditional selecting between them and the release of the temporary buffer
 * are not visible in this excerpt.
 */
749 fe_mfcc_dct3(
fe_t * fe,
const mfcc_t * fr_cep, mfcc_t * fr_spec)
752 fe_dct3(fe, fr_cep, fr_spec);
/* Variant with type conversion: compute into powspec_t, then cast back per element. */
757 powspec =
ckd_malloc(fe->mel_fb->num_filters *
sizeof(powspec_t));
758 fe_dct3(fe, fr_cep, powspec);
759 for (i = 0; i < fe->mel_fb->num_filters; ++i)
760 fr_spec[i] = (mfcc_t) powspec[i];
Command-line and other configuration parsing and handling.
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_retain(cmd_ln_t *cmdln)
Retain ownership of a command-line argument set.
#define E_INFO(...)
Print logging information to standard error stream.
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
#define ckd_calloc(n, sz)
Macros to simplify the use of the above functions.
#define E_ERROR(...)
Print error message to error log.
Base Struct to hold all structure for MFCC computation.
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Basic type definitions used in Sphinx.
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
#define s3_rand_seed(s)
Macros to simplify calling of random generator function.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_get(void)
Retrieve the global cmd_ln_t object used by non-re-entrant functions.
Implementation of logging routines.
Argument definition structure.
High-performance portable random generator created by Takuji Nishimura and Makoto Matsumoto...
Opaque structure used to hold the results of command-line parsing.
#define ckd_malloc(sz)
Macro for ckd_malloc
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Structure for the front-end computation.