SphinxBase  5prealpha
fe.h
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
38 /*
39  * fe.h
40  *
41  * $Log: fe.h,v $
42  * Revision 1.11 2005/02/05 02:15:02 egouvea
43  * Removed fe_process(), never used
44  *
45  * Revision 1.10 2004/12/10 16:48:55 rkm
46  * Added continuous density acoustic model handling
47  *
48  *
49  */
50 
/*
 * On _WIN32 builds that are not GNUWINCE, map srand48()/lrand48() onto
 * srand()/rand(). These defines sit before the _NEW_FE_H_ include guard,
 * so they are processed on every inclusion of this header.
 * NOTE(review): presumably needed because that C runtime lacks the
 * *rand48 family -- confirm.
 */
51 #if defined(_WIN32) && !defined(GNUWINCE)
52 #define srand48(x) srand(x)
53 #define lrand48() rand()
54 #endif
55 
56 #ifndef _NEW_FE_H_
57 #define _NEW_FE_H_
58 
59 /* Win32/WinCE DLL gunk */
60 #include <sphinxbase/sphinxbase_export.h>
61 
62 #include <sphinxbase/cmd_ln.h>
63 #include <sphinxbase/fixpoint.h>
64 
65 #ifdef __cplusplus
66 extern "C" {
67 #endif
68 #if 0
69 /* Fool Emacs. */
70 }
71 #endif
72 
/*
 * String naming the host byte order: "big" when WORDS_BIGENDIAN is
 * defined, "little" otherwise. Used below as the default value of the
 * "-input_endian" argument.
 */
73 #ifdef WORDS_BIGENDIAN
74 #define NATIVE_ENDIAN "big"
75 #else
76 #define NATIVE_ENDIAN "little"
77 #endif
78 
/*
 * Default values for the front-end arguments. Most of them are turned
 * into default strings with ARG_STRINGIFY() inside
 * waveform_to_cepstral_command_line_macro() below.
 * NOTE(review): units are not stated in this header (sampling rate and
 * filter edges are presumably Hz, window length presumably seconds) --
 * confirm.
 */

/* Default for "-samprate" (sampling rate). */
80 #define DEFAULT_SAMPLING_RATE 16000
81 
/* Default for "-frate" (frame rate). */
82 #define DEFAULT_FRAME_RATE 100
83 
/*
 * Not referenced elsewhere in this header. Numerically equal to
 * DEFAULT_SAMPLING_RATE / DEFAULT_FRAME_RATE (16000 / 100);
 * presumably the frame shift in samples -- confirm.
 */
85 #define DEFAULT_FRAME_SHIFT 160
86 
/* Default for "-wlen" (Hamming window length). */
87 #define DEFAULT_WINDOW_LENGTH 0.025625
88 
/* Default for "-nfft" (size of FFT). */
89 #define DEFAULT_FFT_SIZE 512
90 
/* Default for "-ncep" (number of cep coefficients). */
91 #define DEFAULT_NUM_CEPSTRA 13
92 
/* Default for "-nfilt" (number of filter banks). */
93 #define DEFAULT_NUM_FILTERS 40
94 
/* Default for "-vad_prespeech". */
96 #define DEFAULT_PRE_SPEECH 20
97 
/* Default for "-vad_postspeech". */
98 #define DEFAULT_POST_SPEECH 50
99 
/* Default for "-vad_startspeech". */
100 #define DEFAULT_START_SPEECH 10
101 
/* Default for "-lowerf" (lower edge of filters). */
103 #define DEFAULT_LOWER_FILT_FREQ 133.33334
104 
/* Default for "-upperf" (upper edge of filters). */
105 #define DEFAULT_UPPER_FILT_FREQ 6855.4976
106 
/* Default for "-alpha" (preemphasis parameter). */
107 #define DEFAULT_PRE_EMPHASIS_ALPHA 0.97
108 
/* Default for "-warp_type"; already a string, so it is used directly. */
109 #define DEFAULT_WARP_TYPE "inverse_linear"
110 
/*
 * Default for "-seed"; per that argument's help text a value less than
 * zero means "pick our own" seed.
 * NOTE(review): left unparenthesized on purpose, presumably because
 * ARG_STRINGIFY(SEED) must yield the string "-1" -- confirm in cmd_ln.h
 * before adding parentheses.
 */
111 #define SEED -1
112 
/*
 * Expands to a comma-separated list of brace-enclosed initializers, one
 * per front-end argument. Each entry has four fields: option name, an
 * ARG_* type constant, the default value as a string (NULL for
 * "-warp_params"), and a help string. The list has no enclosing braces
 * and no trailing comma, so it is meant to be placed inside an array
 * initializer.
 * NOTE(review): the entries presumably initialize arg_t elements from
 * cmd_ln.h, and users presumably append their own terminator entry --
 * confirm against cmd_ln.h.
 */
113 #define waveform_to_cepstral_command_line_macro() \
114  { "-logspec", \
115  ARG_BOOLEAN, \
116  "no", \
117  "Write out logspectral files instead of cepstra" }, \
118  \
119  { "-smoothspec", \
120  ARG_BOOLEAN, \
121  "no", \
122  "Write out cepstral-smoothed logspectral files" }, \
123  \
124  { "-transform", \
125  ARG_STRING, \
126  "legacy", \
127  "Which type of transform to use to calculate cepstra (legacy, dct, or htk)" }, \
128  \
129  { "-alpha", \
130  ARG_FLOAT32, \
131  ARG_STRINGIFY(DEFAULT_PRE_EMPHASIS_ALPHA), \
132  "Preemphasis parameter" }, \
133  \
134  { "-samprate", \
135  ARG_FLOAT32, \
136  ARG_STRINGIFY(DEFAULT_SAMPLING_RATE), \
137  "Sampling rate" }, \
138  \
139  { "-frate", \
140  ARG_INT32, \
141  ARG_STRINGIFY(DEFAULT_FRAME_RATE), \
142  "Frame rate" }, \
143  \
144  { "-wlen", \
145  ARG_FLOAT32, \
146  ARG_STRINGIFY(DEFAULT_WINDOW_LENGTH), \
147  "Hamming window length" }, \
148  \
149  { "-nfft", \
150  ARG_INT32, \
151  ARG_STRINGIFY(DEFAULT_FFT_SIZE), \
152  "Size of FFT" }, \
153  \
154  { "-nfilt", \
155  ARG_INT32, \
156  ARG_STRINGIFY(DEFAULT_NUM_FILTERS), \
157  "Number of filter banks" }, \
158  \
159  { "-lowerf", \
160  ARG_FLOAT32, \
161  ARG_STRINGIFY(DEFAULT_LOWER_FILT_FREQ), \
162  "Lower edge of filters" }, \
163  \
164  { "-upperf", \
165  ARG_FLOAT32, \
166  ARG_STRINGIFY(DEFAULT_UPPER_FILT_FREQ), \
167  "Upper edge of filters" }, \
168  \
169  { "-unit_area", \
170  ARG_BOOLEAN, \
171  "yes", \
172  "Normalize mel filters to unit area" }, \
173  \
174  { "-round_filters", \
175  ARG_BOOLEAN, \
176  "yes", \
177  "Round mel filter frequencies to DFT points" }, \
178  \
179  { "-ncep", \
180  ARG_INT32, \
181  ARG_STRINGIFY(DEFAULT_NUM_CEPSTRA), \
182  "Number of cep coefficients" }, \
183  \
184  { "-doublebw", \
185  ARG_BOOLEAN, \
186  "no", \
187  "Use double bandwidth filters (same center freq)" }, \
188  \
189  { "-lifter", \
190  ARG_INT32, \
191  "0", \
192  "Length of sin-curve for liftering, or 0 for no liftering." }, \
193  \
194  { "-vad_prespeech", \
195  ARG_INT32, \
196  ARG_STRINGIFY(DEFAULT_PRE_SPEECH), \
197  "Num of speech frames to keep before silence to speech." }, \
198  \
199  { "-vad_startspeech", \
200  ARG_INT32, \
201  ARG_STRINGIFY(DEFAULT_START_SPEECH), \
202  "Num of speech frames to trigger vad from silence to speech." }, \
203  \
204  { "-vad_postspeech", \
205  ARG_INT32, \
206  ARG_STRINGIFY(DEFAULT_POST_SPEECH), \
207  "Num of silence frames to keep after from speech to silence." }, \
208  \
209  { "-vad_threshold", \
210  ARG_FLOAT32, \
211  "3.0", \
212  "Threshold for decision between noise and silence frames. Log-ratio between signal level and noise level." }, \
213  \
214  { "-input_endian", \
215  ARG_STRING, \
216  NATIVE_ENDIAN, \
217  "Endianness of input data, big or little, ignored if NIST or MS Wav" }, \
218  \
219  { "-warp_type", \
220  ARG_STRING, \
221  DEFAULT_WARP_TYPE, \
222  "Warping function type (or shape)" }, \
223  \
224  { "-warp_params", \
225  ARG_STRING, \
226  NULL, \
227  "Parameters defining the warping function" }, \
228  \
229  { "-dither", \
230  ARG_BOOLEAN, \
231  "no", \
232  "Add 1/2-bit noise" }, \
233  \
234  { "-seed", \
235  ARG_INT32, \
236  ARG_STRINGIFY(SEED), \
237  "Seed for random number generator; if less than zero, pick our own" }, \
238  \
239  { "-remove_dc", \
240  ARG_BOOLEAN, \
241  "no", \
242  "Remove DC offset from each frame" }, \
243  \
244  { "-remove_noise", \
245  ARG_BOOLEAN, \
246  "yes", \
247  "Remove noise with spectral subtraction in mel-energies" }, \
248  \
249  { "-remove_silence", \
250  ARG_BOOLEAN, \
251  "yes", \
252  "Enables VAD, removes silence frames from processing" }, \
253  \
254  { "-verbose", \
255  ARG_BOOLEAN, \
256  "no", \
257  "Show input filenames" } \
258 
259 
/*
 * Feature value type and its arithmetic helpers, selected at compile
 * time by FIXED_POINT.
 */
260 #ifdef FIXED_POINT
261 
/* Fixed-point build: feature values are fixed32. */
262 typedef fixed32 mfcc_t;
263 
/* Convert a floating-point value to mfcc_t (forwards to FLOAT2FIX). */
265 #define FLOAT2MFCC(x) FLOAT2FIX(x)
266 
/* Convert an mfcc_t value to floating point (forwards to FIX2FLOAT). */
267 #define MFCC2FLOAT(x) FIX2FLOAT(x)
268 
/* Multiply two mfcc_t values (forwards to FIXMUL). */
269 #define MFCCMUL(a,b) FIXMUL(a,b)
/*
 * Logarithm of x (forwards to FIXLN_ANY).
 * NOTE(review): "in" and "out" are presumably the input/output
 * fixed-point formats -- confirm in fixpoint.h.
 */
270 #define MFCCLN(x,in,out) FIXLN_ANY(x,in,out)
271 #else /* !FIXED_POINT */
272 
/* Floating-point build: feature values are float32. */
274 typedef float32 mfcc_t;
/* Identity: mfcc_t is already floating point. */
276 #define FLOAT2MFCC(x) (x)
277 
/* Identity: mfcc_t is already floating point. */
278 #define MFCC2FLOAT(x) (x)
279 
/* Plain multiplication with both operands parenthesized. */
280 #define MFCCMUL(a,b) ((a)*(b))
/*
 * Calls log(x); the "in" and "out" arguments are ignored in this build.
 * NOTE(review): this header does not include <math.h> directly --
 * confirm it is provided by the sphinxbase includes or by the user.
 */
281 #define MFCCLN(x,in,out) log(x)
282 #endif /* !FIXED_POINT */
283 
/*
 * Handle type for the front-end. struct fe_s is only forward-declared
 * in this header, so callers use fe_t through pointers.
 */
287 typedef struct fe_s fe_t;
288 
/*
 * Front-end status codes: the two success codes are both 0 and every
 * error code is a distinct negative value.
 * NOTE(review): which functions in this header return these codes is
 * not visible here -- confirm against the implementation.
 */
292 enum fe_error_e {
293  FE_SUCCESS = 0,
294  FE_OUTPUT_FILE_SUCCESS = 0, /* same value as FE_SUCCESS */
295  FE_CONTROL_FILE_ERROR = -1,
296  FE_START_ERROR = -2,
297  FE_UNKNOWN_SINGLE_OR_BATCH = -3,
298  FE_INPUT_FILE_OPEN_ERROR = -4,
299  FE_INPUT_FILE_READ_ERROR = -5,
300  FE_MEM_ALLOC_ERROR = -6,
301  FE_OUTPUT_FILE_WRITE_ERROR = -7,
302  FE_OUTPUT_FILE_OPEN_ERROR = -8,
303  FE_ZERO_ENERGY_ERROR = -9,
304  FE_INVALID_PARAM_ERROR = -10
305 };
306 
/**
 * Obtain a front-end object without passing a configuration.
 *
 * NOTE(review): presumably initializes from a global command-line
 * configuration -- not visible in this header, confirm.
 *
 * @return pointer to an fe_t. Failure behaviour (e.g. NULL) is not
 *         shown here -- TODO confirm.
 */
314 SPHINXBASE_EXPORT
315 fe_t* fe_init_auto(void);
316 
/**
 * Get the front-end's argument definitions.
 *
 * @return read-only pointer to arg_t data.
 *         NOTE(review): presumably the table built from
 *         waveform_to_cepstral_command_line_macro() -- confirm.
 */
324 SPHINXBASE_EXPORT
325 arg_t const *fe_get_args(void);
326 
/**
 * Obtain a front-end object for an explicit configuration.
 *
 * @param config configuration object to use.
 *        NOTE(review): whether the front-end retains or copies it is
 *        not visible here -- confirm.
 * @return pointer to an fe_t. Failure behaviour (e.g. NULL) is not
 *         shown here -- TODO confirm.
 */
337 SPHINXBASE_EXPORT
338 fe_t *fe_init_auto_r(cmd_ln_t *config);
339 
/**
 * Get the configuration associated with a front-end object.
 *
 * @param fe front-end object.
 * @return read-only pointer to a cmd_ln_t.
 *         NOTE(review): presumably still owned by fe -- confirm.
 */
347 SPHINXBASE_EXPORT
348 const cmd_ln_t *fe_get_config(fe_t *fe);
349 
/**
 * Signal the start of a stream to the front-end. Returns nothing.
 *
 * NOTE(review): presumably resets per-stream state -- confirm.
 *
 * @param fe front-end object.
 */
353 SPHINXBASE_EXPORT
354 void fe_start_stream(fe_t *fe);
355 
/**
 * Signal the start of an utterance to the front-end.
 *
 * @param fe front-end object.
 * @return int status.
 *         NOTE(review): presumably 0 on success and a negative
 *         fe_error_e value on failure -- confirm.
 */
360 SPHINXBASE_EXPORT
361 int fe_start_utt(fe_t *fe);
362 
/**
 * Get the output size of the front-end.
 *
 * @param fe front-end object.
 * @return int size.
 *         NOTE(review): presumably the number of values per output
 *         frame -- confirm.
 */
376 SPHINXBASE_EXPORT
377 int fe_get_output_size(fe_t *fe);
378 
/**
 * Get the input framing sizes of the front-end through two output
 * pointers.
 *
 * @param fe front-end object.
 * @param out_frame_shift Output: frame shift.
 * @param out_frame_size Output: frame size.
 *        NOTE(review): both are presumably counted in samples, and
 *        whether NULL is accepted is not visible here -- confirm.
 */
392 SPHINXBASE_EXPORT
393 void fe_get_input_size(fe_t *fe, int *out_frame_shift,
394  int *out_frame_size);
395 
/**
 * Get the current VAD state of the front-end.
 *
 * @param fe front-end object.
 * @return uint8 state value.
 *         NOTE(review): presumably non-zero while speech is detected --
 *         confirm.
 */
401 SPHINXBASE_EXPORT
402 uint8 fe_get_vad_state(fe_t *fe);
403 
/**
 * Signal the end of an utterance to the front-end.
 *
 * @param fe front-end object.
 * @param out_cepvector Output: buffer of mfcc_t values.
 * @param out_nframes Output: frame count.
 *        NOTE(review): presumably any remaining frame is written to
 *        out_cepvector and counted in out_nframes -- confirm.
 * @return int status; convention not visible here -- TODO confirm.
 */
418 SPHINXBASE_EXPORT
419 int fe_end_utt(fe_t *fe, mfcc_t *out_cepvector, int32 *out_nframes);
420 
/**
 * Retain a front-end object.
 *
 * NOTE(review): presumably increments a reference count that fe_free()
 * decrements -- confirm.
 *
 * @param fe front-end object.
 * @return pointer to an fe_t, presumably the same object -- confirm.
 */
426 SPHINXBASE_EXPORT
427 fe_t *fe_retain(fe_t *fe);
428 
/**
 * Release a front-end object.
 *
 * @param fe front-end object.
 * @return int.
 *         NOTE(review): presumably the remaining reference count --
 *         confirm.
 */
436 SPHINXBASE_EXPORT
437 int fe_free(fe_t *fe);
438 
439 /*
440  * Does the same as fe_process_frames, but also returns
441  * voiced audio. Output audio is valid until the next
442  * fe_process_frames call.
443  *
444  * DO NOT MIX with fe_process_frames calls.
445  *
446  * @param voiced_spch Output: obtain voiced audio samples here
447  *
448  * @param voiced_spch_nsamps Output: shows voiced_spch length
449  *
450  * @param out_frameidx Output: index of the utterance start
451  */
452 SPHINXBASE_EXPORT
453 int fe_process_frames_ext(fe_t *fe,
454  int16 const **inout_spch,
455  size_t *inout_nsamps,
456  mfcc_t **buf_cep,
457  int32 *inout_nframes,
458  int16 *voiced_spch,
459  int32 *voiced_spch_nsamps,
460  int32 *out_frameidx);
461 
/**
 * Process a block of audio samples into feature frames.
 *
 * NOTE(review): the parameter descriptions below are inferred from the
 * names and types only -- confirm against the implementation.
 *
 * @param fe front-end object.
 * @param inout_spch Input/output: pointer to the sample pointer.
 * @param inout_nsamps Input/output: sample count.
 * @param buf_cep frame buffers of mfcc_t values.
 * @param inout_nframes Input/output: frame count.
 * @param out_frameidx Output: a frame index.
 * @return int status; convention not visible here -- TODO confirm.
 */
511 SPHINXBASE_EXPORT
512 int fe_process_frames(fe_t *fe,
513  int16 const **inout_spch,
514  size_t *inout_nsamps,
515  mfcc_t **buf_cep,
516  int32 *inout_nframes,
517  int32 *out_frameidx);
518 
/**
 * Process a block of audio samples, returning the frames through
 * cep_block.
 *
 * @param fe front-end object.
 * @param spch input samples (read-only).
 * @param nsamps number of samples in spch.
 * @param cep_block Output: receives a 2-D array of mfcc_t.
 *        NOTE(review): presumably allocated by the callee and released
 *        with fe_free_2d() -- confirm.
 * @param nframes Output: frame count.
 * @return int status; convention not visible here -- TODO confirm.
 */
534 SPHINXBASE_EXPORT
535 int fe_process_utt(fe_t *fe,
536  int16 const *spch,
537  size_t nsamps,
538  mfcc_t ***cep_block,
539  int32 *nframes
540  );
541 
/**
 * Free a 2-D array.
 *
 * @param arr array to free.
 *        NOTE(review): presumably one returned by fe_process_utt() --
 *        confirm.
 */
545 SPHINXBASE_EXPORT
546 void fe_free_2d(void *arr);
547 
/**
 * Convert frames of mfcc_t values to float32.
 *
 * @param fe front-end object.
 * @param input source frames (mfcc_t).
 * @param output destination frames (float32).
 * @param nframes number of frames.
 *        NOTE(review): values per frame and whether input and output
 *        may alias are not visible here -- confirm.
 * @return int; meaning not visible here -- TODO confirm.
 */
551 SPHINXBASE_EXPORT
552 int fe_mfcc_to_float(fe_t *fe,
553  mfcc_t **input,
554  float32 **output,
555  int32 nframes);
556 
/**
 * Convert frames of float32 values to mfcc_t.
 *
 * @param fe front-end object.
 * @param input source frames (float32).
 * @param output destination frames (mfcc_t).
 * @param nframes number of frames.
 *        NOTE(review): values per frame and whether input and output
 *        may alias are not visible here -- confirm.
 * @return int; meaning not visible here -- TODO confirm.
 */
560 SPHINXBASE_EXPORT
561 int fe_float_to_mfcc(fe_t *fe,
562  float32 **input,
563  mfcc_t **output,
564  int32 nframes);
565 
/**
 * Convert one frame of log spectrum to cepstra.
 *
 * @param fe front-end object.
 * @param fr_spec input frame (read-only).
 * @param fr_cep output frame.
 *        NOTE(review): frame lengths are not visible here; presumably
 *        they follow the "-nfilt" and "-ncep" settings -- confirm.
 * @return int status; convention not visible here -- TODO confirm.
 */
589 SPHINXBASE_EXPORT
590 int fe_logspec_to_mfcc(fe_t *fe,
591  const mfcc_t *fr_spec,
592  mfcc_t *fr_cep
593  );
594 
/**
 * Convert one frame of log spectrum to cepstra.
 *
 * NOTE(review): presumably applies a DCT-II, judging by the name --
 * confirm.
 *
 * @param fe front-end object.
 * @param fr_spec input frame (read-only).
 * @param fr_cep output frame.
 * @return int status; convention not visible here -- TODO confirm.
 */
603 SPHINXBASE_EXPORT
604 int fe_logspec_dct2(fe_t *fe,
605  const mfcc_t *fr_spec,
606  mfcc_t *fr_cep
607  );
608 
/**
 * Convert one frame of cepstra back to log spectrum.
 *
 * NOTE(review): presumably applies a DCT-III, the inverse of
 * fe_logspec_dct2(), judging by the name -- confirm.
 *
 * @param fe front-end object.
 * @param fr_cep input frame (read-only).
 * @param fr_spec output frame.
 * @return int status; convention not visible here -- TODO confirm.
 */
617 SPHINXBASE_EXPORT
618 int fe_mfcc_dct3(fe_t *fe,
619  const mfcc_t *fr_cep,
620  mfcc_t *fr_spec
621  );
622 
623 #ifdef __cplusplus
624 }
625 #endif
626 
627 
628 #endif
Command-line and other configuration parsing and handling.
Argument definition structure.
Opaque structure used to hold the results of command-line parsing.
Structure for the front-end computation.
Definition: fe_internal.h:117