/*
 * On Windows builds where GNUWINCE is not defined, provide srand48() and
 * lrand48() as aliases for the standard C srand() and rand().
 * NOTE(review): rand() may return a much smaller range than POSIX lrand48()
 * (RAND_MAX is only guaranteed to be at least 32767) -- confirm that no
 * caller depends on the lrand48() range.
 * The matching #endif is not visible in this excerpt.
 */
51 #if defined(_WIN32) && !defined(GNUWINCE)
52 #define srand48(x) srand(x)
53 #define lrand48() rand()
60 #include <sphinxbase/sphinxbase_export.h>
63 #include <sphinxbase/fixpoint.h>
/*
 * NATIVE_ENDIAN is a string naming the host byte order: "big" when
 * WORDS_BIGENDIAN is defined. The "little" definition is presumably the
 * #else branch; the #else/#endif lines are not visible in this excerpt.
 */
73 #ifdef WORDS_BIGENDIAN
74 #define NATIVE_ENDIAN "big"
76 #define NATIVE_ENDIAN "little"
/*
 * Compile-time default values for the front end. Most of them are turned
 * into default-value strings with ARG_STRINGIFY() inside
 * waveform_to_cepstral_command_line_macro().
 */
/** Default sampling rate (presumably in Hz -- TODO confirm units). */
80 #define DEFAULT_SAMPLING_RATE 16000
/** Default frame rate (presumably frames per second -- TODO confirm). */
82 #define DEFAULT_FRAME_RATE 100
/**
 * Default frame shift. Note that 160 equals
 * DEFAULT_SAMPLING_RATE / DEFAULT_FRAME_RATE (presumably samples per
 * frame step -- TODO confirm).
 */
85 #define DEFAULT_FRAME_SHIFT 160
/** Default Hamming window length (presumably in seconds -- TODO confirm). */
87 #define DEFAULT_WINDOW_LENGTH 0.025625
/** Default FFT size. */
89 #define DEFAULT_FFT_SIZE 512
/** Default number of cepstral coefficients. */
91 #define DEFAULT_NUM_CEPSTRA 13
/** Default number of filter banks. */
93 #define DEFAULT_NUM_FILTERS 40
/** Default for -vad_prespeech: speech frames to keep before silence to speech. */
96 #define DEFAULT_PRE_SPEECH 20
/** Default for -vad_postspeech: silence frames to keep after speech to silence. */
98 #define DEFAULT_POST_SPEECH 50
/** Default for -vad_startspeech: speech frames needed to trigger VAD from silence to speech. */
100 #define DEFAULT_START_SPEECH 10
/** Default lower edge of the filters (presumably in Hz -- TODO confirm). */
103 #define DEFAULT_LOWER_FILT_FREQ 133.33334
/** Default upper edge of the filters (presumably in Hz -- TODO confirm). */
105 #define DEFAULT_UPPER_FILT_FREQ 6855.4976
/** Default pre-emphasis parameter. */
107 #define DEFAULT_PRE_EMPHASIS_ALPHA 0.97
/** Default warping function type name (presumably the default of the warp-type option -- confirm). */
109 #define DEFAULT_WARP_TYPE "inverse_linear"
/**
 * Expands to the front end's command-line option table entries.
 *
 * As far as is visible here, each entry is a brace-enclosed initializer
 * holding an option name string, a default value (several are produced by
 * ARG_STRINGIFY() from the DEFAULT_* constants) and a help string.
 *
 * NOTE(review): this excerpt omits many lines of the macro (most option
 * names and any type fields), so the exact entry layout must be confirmed
 * against the argument definition structure (arg_t). SEED, used as the
 * default of the random-number-generator seed option, is not defined in
 * the visible excerpt.
 */
113 #define waveform_to_cepstral_command_line_macro() \
117 "Write out logspectral files instead of cepstra" }, \
122 "Write out cepstral-smoothed logspectral files" }, \
127 "Which type of transform to use to calculate cepstra (legacy, dct, or htk)" }, \
131 ARG_STRINGIFY(DEFAULT_PRE_EMPHASIS_ALPHA), \
132 "Preemphasis parameter" }, \
136 ARG_STRINGIFY(DEFAULT_SAMPLING_RATE), \
141 ARG_STRINGIFY(DEFAULT_FRAME_RATE), \
146 ARG_STRINGIFY(DEFAULT_WINDOW_LENGTH), \
147 "Hamming window length" }, \
151 ARG_STRINGIFY(DEFAULT_FFT_SIZE), \
156 ARG_STRINGIFY(DEFAULT_NUM_FILTERS), \
157 "Number of filter banks" }, \
161 ARG_STRINGIFY(DEFAULT_LOWER_FILT_FREQ), \
162 "Lower edge of filters" }, \
166 ARG_STRINGIFY(DEFAULT_UPPER_FILT_FREQ), \
167 "Upper edge of filters" }, \
172 "Normalize mel filters to unit area" }, \
174 { "-round_filters", \
177 "Round mel filter frequencies to DFT points" }, \
181 ARG_STRINGIFY(DEFAULT_NUM_CEPSTRA), \
182 "Number of cep coefficients" }, \
187 "Use double bandwidth filters (same center freq)" }, \
192 "Length of sin-curve for liftering, or 0 for no liftering." }, \
194 { "-vad_prespeech", \
196 ARG_STRINGIFY(DEFAULT_PRE_SPEECH), \
197 "Num of speech frames to keep before silence to speech." }, \
199 { "-vad_startspeech", \
201 ARG_STRINGIFY(DEFAULT_START_SPEECH), \
202 "Num of speech frames to trigger vad from silence to speech." }, \
204 { "-vad_postspeech", \
206 ARG_STRINGIFY(DEFAULT_POST_SPEECH), \
207 "Num of silence frames to keep after from speech to silence." }, \
209 { "-vad_threshold", \
212 "Threshold for decision between noise and silence frames. Log-ratio between signal level and noise level." }, \
217 "Endianness of input data, big or little, ignored if NIST or MS Wav" }, \
222 "Warping function type (or shape)" }, \
227 "Parameters defining the warping function" }, \
232 "Add 1/2-bit noise" }, \
236 ARG_STRINGIFY(SEED), \
237 "Seed for random number generator; if less than zero, pick our own" }, \
242 "Remove DC offset from each frame" }, \
247 "Remove noise with spectral subtraction in mel-energies" }, \
249 { "-remove_silence", \
252 "Enables VAD, removes silence frames from processing" }, \
257 "Show input filenames" } \
/*
 * Two alternative definitions of mfcc_t and its helper macros follow.
 * The preprocessor conditional that selects between them is not visible
 * in this excerpt (presumably a fixed-point build switch -- confirm).
 */
/*
 * Fixed-point variant: mfcc_t is fixed32 and every helper forwards to the
 * corresponding FIX* macro (FLOAT2FIX, FIX2FLOAT, FIXMUL, FIXLN_ANY).
 */
262 typedef fixed32 mfcc_t;
265 #define FLOAT2MFCC(x) FLOAT2FIX(x)
267 #define MFCC2FLOAT(x) FIX2FLOAT(x)
269 #define MFCCMUL(a,b) FIXMUL(a,b)
270 #define MFCCLN(x,in,out) FIXLN_ANY(x,in,out)
/*
 * Floating-point variant: mfcc_t is float32. The two conversion macros are
 * identity, MFCCMUL is a plain multiplication, and MFCCLN ignores its
 * in/out arguments and simply calls log(x).
 */
274 typedef float32 mfcc_t;
276 #define FLOAT2MFCC(x) (x)
278 #define MFCC2FLOAT(x) (x)
280 #define MFCCMUL(a,b) ((a)*(b))
281 #define MFCCLN(x,in,out) log(x)
/*
 * Front-end status code enumerators. The enclosing enum declaration is not
 * visible in this excerpt. Zero is the success value and every other
 * visible code is negative. No enumerator with the value -2 appears among
 * the visible lines.
 */
294 FE_OUTPUT_FILE_SUCCESS = 0,
295 FE_CONTROL_FILE_ERROR = -1,
297 FE_UNKNOWN_SINGLE_OR_BATCH = -3,
298 FE_INPUT_FILE_OPEN_ERROR = -4,
299 FE_INPUT_FILE_READ_ERROR = -5,
300 FE_MEM_ALLOC_ERROR = -6,
301 FE_OUTPUT_FILE_WRITE_ERROR = -7,
302 FE_OUTPUT_FILE_OPEN_ERROR = -8,
303 FE_ZERO_ENERGY_ERROR = -9,
304 FE_INVALID_PARAM_ERROR = -10
/**
 * Create a front-end object without any explicit arguments.
 *
 * NOTE(review): where the configuration comes from is not visible here
 * (presumably previously parsed command-line defaults -- confirm).
 *
 * @return Pointer to an fe_t (failure value not visible in this excerpt).
 */
315 fe_t* fe_init_auto(
void);
/**
 * Get read-only argument definition data.
 *
 * @return Pointer to constant arg_t data (presumably the front end's
 *         argument definition table -- confirm).
 */
325 arg_t const *fe_get_args(
void);
/**
 * Start-of-stream notification for a front-end object; returns nothing.
 *
 * @param fe Front-end object.
 */
354 void fe_start_stream(
fe_t *fe);
/**
 * Start-of-utterance call for a front-end object.
 *
 * @param fe Front-end object.
 * @return int status (presumably 0 on success, negative on error -- confirm).
 */
361 int fe_start_utt(
fe_t *fe);
/**
 * Query the output size of a front-end object.
 *
 * @param fe Front-end object.
 * @return Output size as an int (presumably the dimensionality of each
 *         output vector -- confirm).
 */
377 int fe_get_output_size(
fe_t *fe);
/**
 * Query the input framing of a front-end object.
 *
 * @param fe              Front-end object.
 * @param out_frame_shift Presumably receives the frame shift -- confirm
 *                        units and whether NULL is accepted.
 * @param out_frame_size  Presumably receives the frame size -- confirm
 *                        units and whether NULL is accepted.
 */
393 void fe_get_input_size(
fe_t *fe,
int *out_frame_shift,
394 int *out_frame_size);
/**
 * Query the voice-activity-detection state of a front-end object.
 *
 * @param fe Front-end object.
 * @return State as a uint8 (meaning of the values is not visible here).
 */
402 uint8 fe_get_vad_state(
fe_t *fe);
/**
 * End-of-utterance call for a front-end object.
 *
 * @param fe            Front-end object.
 * @param out_cepvector Presumably a buffer receiving a final output
 *                      vector -- confirm required size.
 * @param out_nframes   Presumably receives the number of frames
 *                      written -- confirm.
 * @return int status (presumably 0 on success, negative on error -- confirm).
 */
419 int fe_end_utt(
fe_t *fe, mfcc_t *out_cepvector, int32 *out_nframes);
/**
 * Release a front-end object.
 *
 * @param fe Front-end object.
 * @return int (meaning not visible here; confirm whether it is a status
 *         code or a remaining reference count).
 */
437 int fe_free(
fe_t *fe);
/**
 * Process input samples into output frames (extended variant).
 *
 * NOTE(review): the listing numbers skip inside this parameter list, so
 * one or more parameters may be missing from this excerpt -- confirm the
 * full signature before relying on it.
 *
 * @param fe                 Front-end object.
 * @param inout_spch         Pointer to the input sample pointer
 *                           (presumably advanced as samples are consumed).
 * @param inout_nsamps       Pointer to the sample count (presumably
 *                           updated to the number remaining).
 * @param inout_nframes      Pointer to the frame count (presumably
 *                           capacity in, frames produced out).
 * @param voiced_spch_nsamps Presumably receives a count of voiced speech
 *                           samples -- confirm.
 * @param out_frameidx       Presumably receives a frame index -- confirm.
 * @return int status (presumably negative on error -- confirm).
 */
453 int fe_process_frames_ext(
fe_t *fe,
454 int16
const **inout_spch,
455 size_t *inout_nsamps,
457 int32 *inout_nframes,
459 int32 *voiced_spch_nsamps,
460 int32 *out_frameidx);
/**
 * Process input samples into output frames.
 *
 * NOTE(review): the listing numbers skip inside this parameter list, so a
 * parameter may be missing from this excerpt -- confirm the full
 * signature before relying on it.
 *
 * @param fe            Front-end object.
 * @param inout_spch    Pointer to the input sample pointer (presumably
 *                      advanced as samples are consumed).
 * @param inout_nsamps  Pointer to the sample count (presumably updated to
 *                      the number remaining).
 * @param inout_nframes Pointer to the frame count (presumably capacity
 *                      in, frames produced out).
 * @param out_frameidx  Presumably receives a frame index -- confirm.
 * @return int status (presumably negative on error -- confirm).
 */
512 int fe_process_frames(
fe_t *fe,
513 int16
const **inout_spch,
514 size_t *inout_nsamps,
516 int32 *inout_nframes,
517 int32 *out_frameidx);
535 int fe_process_utt(
fe_t *fe,
/**
 * Release memory referenced by an untyped pointer; returns nothing.
 *
 * @param arr Pointer to release (presumably a 2-D array handed out by the
 *            front end -- confirm which allocator it must come from).
 */
546 void fe_free_2d(
void *arr);
552 int fe_mfcc_to_float(
fe_t *fe,
561 int fe_float_to_mfcc(
fe_t *fe,
590 int fe_logspec_to_mfcc(
fe_t *fe,
591 const mfcc_t *fr_spec,
604 int fe_logspec_dct2(
fe_t *fe,
605 const mfcc_t *fr_spec,
618 int fe_mfcc_dct3(
fe_t *fe,
619 const mfcc_t *fr_cep,
Command-line and other configuration parsing and handling.
Argument definition structure.
Opaque structure used to hold the results of command-line parsing.
Structure for the front-end computation.