SphinxBase  5prealpha
cont_seg.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2013 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 /*
38  * cont_seg.c -- Continuously listen and segment input speech into utterances.
39  *
40  * HISTORY
41  *
42  * 05-Nov-13 Created from adseg and fileseg
43  *
44  */
45 
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <assert.h>
50 #include <math.h>
51 
52 #if defined(_WIN32) && !defined(__CYGWIN__)
53 #include <windows.h>
54 #else
55 #include <sys/select.h>
56 #endif
57 
58 #include <sphinxbase/prim_type.h>
59 #include <sphinxbase/ad.h>
60 #include <sphinxbase/fe.h>
61 #include <sphinxbase/cmd_ln.h>
62 #include <sphinxbase/ckd_alloc.h>
63 #include <sphinxbase/err.h>
64 
65 #define BLOCKSIZE 1024
66 
67 static const arg_t cont_args_def[] = {
68  waveform_to_cepstral_command_line_macro(),
69  /* Argument file. */
70  {"-argfile",
71  ARG_STRING,
72  NULL,
73  "Argument file giving extra arguments."},
74  {"-adcdev",
75  ARG_STRING,
76  NULL,
77  "Name of audio device to use for input."},
78  {"-inmic",
80  "no",
81  "Transcribe audio from microphone."},
82  {"-infile",
83  ARG_STRING,
84  NULL,
85  "Name of audio file to use for input."},
86  {"-singlefile",
88  FALSE,
89  "Write a single cleaned file."},
90  {NULL, 0, NULL, NULL}
91 };
92 
93 static fe_t *fe;
94 static cmd_ln_t *config;
95 static int (*read_audio) (int16 * buf, int len);
96 static ad_rec_t *ad;
97 static const char *infile_path;
98 static FILE *infile;
99 static int32 singlefile;
100 
101 /* Sleep for specified msec */
102 static void
103 sleep_msec(int32 ms)
104 {
105 #if (defined(_WIN32) && !defined(GNUWINCE)) || defined(_WIN32_WCE)
106  Sleep(ms);
107 #else
108  /* ------------------- Unix ------------------ */
109  struct timeval tmo;
110 
111  tmo.tv_sec = 0;
112  tmo.tv_usec = ms * 1000;
113 
114  select(0, NULL, NULL, NULL, &tmo);
115 #endif
116 }
117 
118 static int
119 read_audio_file(int16 * buf, int len)
120 {
121  if (!infile) {
122  E_FATAL("Failed to read audio from file\n");
123  return -1;
124  }
125  return fread(buf, sizeof(int16), len, infile);
126 }
127 
128 static int
129 read_audio_adev(int16 * buf, int len)
130 {
131  int k;
132 
133  if (!ad) {
134  E_FATAL("Failed to read audio from mic\n");
135  return -1;
136  }
137  while ((k = ad_read(ad, buf, len)) == 0)
138  /* wait until something is read */
139  sleep_msec(50);
140 
141  return k;
142 }
143 
144 void
145 segment_audio()
146 {
147  FILE *file;
148  int16 pcm_buf[BLOCKSIZE];
149  mfcc_t **cep_buf;
150  int16 *voiced_buf = NULL;
151  int32 voiced_nsamps, out_frameidx, uttstart = 0;
152  char file_name[1024];
153  uint8 cur_vad_state, vad_state, writing;
154  int uttno, uttlen, sample_rate;
155  int32 nframes, nframes_tmp;
156  int16 frame_size, frame_shift, frame_rate;
157  size_t k;
158 
159  sample_rate = (int) cmd_ln_float32_r(config, "-samprate");
160  frame_rate = cmd_ln_int32_r(config, "-frate");
161  frame_size =
162  (int32) (cmd_ln_float32_r(config, "-wlen") * sample_rate + 0.5);
163  frame_shift =
164  (int32) (sample_rate / cmd_ln_int32_r(config, "-frate") + 0.5);
165  nframes = (BLOCKSIZE - frame_size) / frame_shift;
166  cep_buf =
167  (mfcc_t **) ckd_calloc_2d(nframes, fe_get_output_size(fe),
168  sizeof(mfcc_t));
169 
170  uttno = 0;
171  uttlen = 0;
172  cur_vad_state = 0;
173  voiced_nsamps = 0;
174  writing = 0;
175  file = NULL;
176  fe_start_stream(fe);
177  fe_start_utt(fe);
178  while ((k = read_audio(pcm_buf, BLOCKSIZE)) > 0) {
179  int16 const *pcm_buf_tmp;
180  pcm_buf_tmp = &pcm_buf[0];
181  while (k) {
182  nframes_tmp = nframes;
183  fe_process_frames_ext(fe, &pcm_buf_tmp, &k, cep_buf,
184  &nframes_tmp, voiced_buf,
185  &voiced_nsamps, &out_frameidx);
186  if (out_frameidx > 0) {
187  uttstart = out_frameidx;
188  }
189  vad_state = fe_get_vad_state(fe);
190  if (!cur_vad_state && vad_state) {
191  /* silence->speech transition, time to start new file */
192  uttno++;
193  if (!singlefile) {
194  sprintf(file_name, "%s%04d.raw", infile_path, uttno);
195  if ((file = fopen(file_name, "wb")) == NULL)
196  E_FATAL_SYSTEM("Failed to open '%s' for writing",
197  file_name);
198  } else {
199  sprintf(file_name, "%s.raw", infile_path);
200  if ((file = fopen(file_name, "ab")) == NULL)
201  E_FATAL_SYSTEM("Failed to open '%s' for writing",
202  file_name);
203  }
204  writing = 1;
205  }
206 
207  if (writing && file && voiced_nsamps > 0) {
208  fwrite(voiced_buf, sizeof(int16), voiced_nsamps, file);
209  uttlen += voiced_nsamps;
210  }
211 
212  if (cur_vad_state && !vad_state) {
213  /* speech -> silence transition, time to finish file */
214  fclose(file);
215  printf("Utterance %04d: file %s start %.1f sec length %d samples ( %.2f sec )\n",
216  uttno,
217  file_name,
218  ((double) uttstart) / frame_rate,
219  uttlen,
220  ((double) uttlen) / sample_rate);
221  fflush(stdout);
222  fe_end_utt(fe, cep_buf[0], &nframes_tmp);
223  writing = 0;
224  uttlen = 0;
225  voiced_nsamps = 0;
226  fe_start_utt(fe);
227  }
228  cur_vad_state = vad_state;
229  }
230  }
231 
232  if (writing) {
233  fclose(file);
234  printf("Utterance %04d: file %s start %.1f sec length %d samples ( %.2f sec )\n",
235  uttno,
236  file_name,
237  ((double) uttstart) / frame_rate,
238  uttlen,
239  ((double) uttlen) / sample_rate);
240  fflush(stdout);
241  }
242  fe_end_utt(fe, cep_buf[0], &nframes);
243  ckd_free_2d(cep_buf);
244 }
245 
246 int
247 main(int argc, char *argv[])
248 {
249  int i;
250  int16 buf[2048];
251 
252  config = cmd_ln_parse_r(NULL, cont_args_def, argc, argv, TRUE);
253 
254  if (config && cmd_ln_str_r(config, "-argfile"))
255  config = cmd_ln_parse_file_r(config, cont_args_def,
256  cmd_ln_str_r(config, "-argfile"), FALSE);
257 
258  if (config == NULL || (cmd_ln_str_r(config, "-infile") == NULL && cmd_ln_boolean_r(config, "-inmic") == FALSE)) {
259  E_INFO("Specify '-infile <file.wav>' to segment a file or '-inmic yes' to segment audio from microphone.\n");
260  cmd_ln_free_r(config);
261  return 1;
262  }
263 
264 
265  singlefile = cmd_ln_boolean_r(config, "-singlefile");
266  if ((infile_path = cmd_ln_str_r(config, "-infile")) != NULL) {
267  if ((infile = fopen(infile_path, "rb")) == NULL) {
268  E_FATAL_SYSTEM("Failed to read audio from '%s'", infile_path);
269  return 1;
270  }
271  read_audio = &read_audio_file;
272  /* skip wav header */
273  read_audio(buf, 44);
274  }
275  else if cmd_ln_boolean_r(config, "-inmic") {
276  if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
277  (int) cmd_ln_float32_r(config,
278  "-samprate"))) ==
279  NULL) {
280  E_FATAL("Failed to open audio device\n");
281  return 1;
282  }
283  read_audio = &read_audio_adev;
284  printf("Start recording ...\n");
285  fflush(stdout);
286  if (ad_start_rec(ad) < 0)
287  E_FATAL("Failed to start recording\n");
288 
289  /* TODO remove this thing */
290  for (i = 0; i < 5; i++) {
291  sleep_msec(200);
292  read_audio(buf, 2048);
293  }
294  printf("You may speak now\n");
295  fflush(stdout);
296  }
297 
298  fe = fe_init_auto_r(config);
299  if (fe == NULL)
300  return 1;
301 
302  segment_audio();
303 
304  if (ad)
305  ad_close(ad);
306  if (infile)
307  fclose(infile);
308 
309  fe_free(fe);
310  cmd_ln_free_r(config);
311  return 0;
312 }
Command-line and other configuration parsing and handling.
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition: ckd_alloc.h:270
Sphinx's memory allocation/deallocation routines.
Audio recording structure.
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Definition: cmd_ln.c:1046
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition: cmd_ln.c:556
#define ARG_STRING
String argument (optional).
Definition: cmd_ln.h:114
Basic type definitions used in Sphinx.
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition: cmd_ln.c:949
Implementation of logging routines.
#define E_FATAL_SYSTEM(...)
Print error text; call perror(""); exit(errno).
Definition: err.h:90
#define ARG_BOOLEAN
Boolean (true/false) argument (optional).
Definition: cmd_ln.h:118
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_file_r(cmd_ln_t *inout_cmdln, arg_t const *defn, char const *filename, int32 strict)
Parse an arguments file by delimiting on " \r\t\n" and putting each token into an argv[] for cmd_l...
Definition: cmd_ln.c:764
generic live audio interface for recording and playback
Argument definition structure.
Opaque structure used to hold the results of command-line parsing.
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition: ckd_alloc.c:255
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition: cmd_ln.h:334
#define E_FATAL(...)
Exit with non-zero status after error message.
Definition: err.h:81
Structure for the front-end computation.
Definition: fe_internal.h:117
SPHINXBASE_EXPORT ad_rec_t * ad_open_dev(const char *dev, int32 samples_per_sec)
Open a specific audio device for recording.
Definition: ad_alsa.c:187