SphinxBase  5prealpha
sphinx_pitch.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2008 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sphinxbase/cmd_ln.h>
#include <sphinxbase/yin.h>
#include <sphinxbase/ckd_alloc.h>
#include <sphinxbase/byteorder.h>
#include <sphinxbase/strfuncs.h>
#include <sphinxbase/err.h>
#include <sphinxbase/pio.h>
52 
53 static arg_t defn[] = {
54  { "-i",
55  ARG_STRING,
56  NULL,
57  "Single audio input file" },
58 
59  { "-o",
60  ARG_STRING,
61  NULL,
62  "Single text output file (standard output will be used if not given)" },
63 
64  { "-c",
65  ARG_STRING,
66  NULL,
67  "Control file for batch processing" },
68 
69  { "-nskip",
70  ARG_INT32,
71  "0",
72  "If a control file was specified, the number of utterances to skip at the head of the file" },
73 
74  { "-runlen",
75  ARG_INT32,
76  "-1",
77  "If a control file was specified, the number of utterances to process (see -nskip too)" },
78 
79  { "-di",
80  ARG_STRING,
81  NULL,
82  "Input directory, input file names are relative to this, if defined" },
83 
84  { "-ei",
85  ARG_STRING,
86  NULL,
87  "Input extension to be applied to all input files" },
88 
89  { "-do",
90  ARG_STRING,
91  NULL,
92  "Output directory, output files are relative to this" },
93 
94  { "-eo",
95  ARG_STRING,
96  NULL,
97  "Output extension to be applied to all output files" },
98 
99  { "-nist",
100  ARG_BOOLEAN,
101  "no",
102  "Defines input format as NIST sphere" },
103 
104  { "-raw",
105  ARG_BOOLEAN,
106  "no",
107  "Defines input format as raw binary data" },
108 
109  { "-mswav",
110  ARG_BOOLEAN,
111  "no",
112  "Defines input format as Microsoft Wav (RIFF)" },
113 
114  { "-samprate",
115  ARG_INT32,
116  "0",
117  "Sampling rate of audio data (will be determined automatically if 0)" },
118 
119  { "-input_endian",
120  ARG_STRING,
121  NULL,
122  "Endianness of audio data (will be determined automatically if not given)" },
123 
124  { "-fshift",
125  ARG_FLOAT32,
126  "0.01",
127  "Frame shift: number of seconds between each analysis frame." },
128 
129  { "-flen",
130  ARG_FLOAT32,
131  "0.025",
132  "Number of seconds in each analysis frame (needs to be greater than twice the longest period you wish to detect - to detect down to 80Hz you need a frame length of 2.0/80 = 0.025)." },
133 
134  { "-smooth_window",
135  ARG_INT32,
136  "2",
137  "Number of frames on either side of the current frame to use for smoothing." },
138 
139  { "-voice_thresh",
140  ARG_FLOAT32,
141  "0.1",
142  "Threshold of normalized difference under which to search for the fundamental period." },
143 
144  { "-search_range",
145  ARG_FLOAT32,
146  "0.2",
147  "Fraction of the best local estimate to use as a search range for smoothing." },
148 
149  { NULL, 0, NULL, NULL }
150 };
151 
152 static int extract_pitch(const char *in, const char *out);
153 static int run_control_file(const char *ctl);
154 
155 int
156 main(int argc, char *argv[])
157 {
158  cmd_ln_parse(defn, argc, argv, TRUE);
159 
160  /* Run a control file if requested. */
161  if (cmd_ln_str("-c")) {
162  if (run_control_file(cmd_ln_str("-c")) < 0)
163  return 1;
164  }
165  else {
166  if (extract_pitch(cmd_ln_str("-i"), cmd_ln_str("-o")) < 0)
167  return 1;
168  }
169 
170  cmd_ln_free();
171  return 0;
172 }
173 
174 static int
175 guess_file_type(char const *file, FILE *infh)
176 {
177  char header[4];
178 
179  fseek(infh, 0, SEEK_SET);
180  if (fread(header, 1, 4, infh) != 4) {
181  E_ERROR_SYSTEM("Failed to read 4 byte header");
182  return -1;
183  }
184  if (0 == memcmp(header, "RIFF", 4)) {
185  E_INFO("%s appears to be a WAV file\n", file);
186  cmd_ln_set_boolean("-mswav", TRUE);
187  cmd_ln_set_boolean("-nist", FALSE);
188  cmd_ln_set_boolean("-raw", FALSE);
189  }
190  else if (0 == memcmp(header, "NIST", 4)) {
191  E_INFO("%s appears to be a NIST SPHERE file\n", file);
192  cmd_ln_set_boolean("-mswav", FALSE);
193  cmd_ln_set_boolean("-nist", TRUE);
194  cmd_ln_set_boolean("-raw", FALSE);
195  }
196  else {
197  E_INFO("%s appears to be raw data\n", file);
198  cmd_ln_set_boolean("-mswav", FALSE);
199  cmd_ln_set_boolean("-nist", FALSE);
200  cmd_ln_set_boolean("-raw", TRUE);
201  }
202  fseek(infh, 0, SEEK_SET);
203  return 0;
204 }
205 
/*
 * Read exactly `nmemb` items of `size` bytes each from `stream` into `ptr`.
 * On a short read, log the failure and jump to the `error_out` label, which
 * the enclosing function must therefore provide.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and is safe inside unbraced if/else; every argument is
 * parenthesized so expressions may be passed.  `size` and `nmemb` are
 * evaluated more than once, so they must not have side effects.
 */
#define TRY_FREAD(ptr, size, nmemb, stream)                                 \
    do {                                                                    \
        if (fread((ptr), (size), (nmemb), (stream)) != (size_t)(nmemb)) {   \
            E_ERROR_SYSTEM("Failed to read %d bytes",                       \
                           (int)((size) * (nmemb)));                        \
            goto error_out;                                                 \
        }                                                                   \
    } while (0)
211 
212 static int
213 read_riff_header(FILE *infh)
214 {
215  char id[4];
216  int32 intval, header_len;
217  int16 shortval;
218 
219  /* RIFF files are little-endian by definition. */
220  cmd_ln_set_str("-input_endian", "little");
221 
222  /* Read in all the header chunks and etcetera. */
223  TRY_FREAD(id, 1, 4, infh);
224  /* Total file length (we don't care) */
225  TRY_FREAD(&intval, 4, 1, infh);
226  /* 'WAVE' */
227  TRY_FREAD(id, 1, 4, infh);
228  if (0 != memcmp(id, "WAVE", 4)) {
229  E_ERROR("This is not a WAVE file\n");
230  goto error_out;
231  }
232  /* 'fmt ' */
233  TRY_FREAD(id, 1, 4, infh);
234  if (0 != memcmp(id, "fmt ", 4)) {
235  E_ERROR("Format chunk missing\n");
236  goto error_out;
237  }
238  /* Length of 'fmt ' chunk */
239  TRY_FREAD(&intval, 4, 1, infh);
240  SWAP_LE_32(&intval);
241  header_len = intval;
242 
243  /* Data format. */
244  TRY_FREAD(&shortval, 2, 1, infh);
245  SWAP_LE_16(&shortval);
246  if (shortval != 1) { /* PCM */
247  E_ERROR("WAVE file is not in PCM format\n");
248  goto error_out;
249  }
250 
251  /* Number of channels. */
252  TRY_FREAD(&shortval, 2, 1, infh);
253  SWAP_LE_16(&shortval);
254  if (shortval != 1) { /* PCM */
255  E_ERROR("WAVE file is not single channel\n");
256  goto error_out;
257  }
258 
259  /* Sampling rate (finally!) */
260  TRY_FREAD(&intval, 4, 1, infh);
261  SWAP_LE_32(&intval);
262  if (cmd_ln_int32("-samprate") == 0)
263  cmd_ln_set_int32("-samprate", intval);
264  else if (cmd_ln_int32("-samprate") != intval) {
265  E_WARN("WAVE file sampling rate %d != -samprate %d\n",
266  intval, cmd_ln_int32("-samprate"));
267  }
268 
269  /* Average bytes per second (we don't care) */
270  TRY_FREAD(&intval, 4, 1, infh);
271 
272  /* Block alignment (we don't care) */
273  TRY_FREAD(&shortval, 2, 1, infh);
274 
275  /* Bits per sample (must be 16) */
276  TRY_FREAD(&shortval, 2, 1, infh);
277  SWAP_LE_16(&shortval);
278  if (shortval != 16) {
279  E_ERROR("WAVE file is not 16-bit\n");
280  goto error_out;
281  }
282 
283  /* Any extra parameters. */
284  if (header_len > 16)
285  fseek(infh, header_len - 16, SEEK_CUR);
286 
287  /* Now skip to the 'data' chunk. */
288  while (1) {
289  TRY_FREAD(id, 1, 4, infh);
290  if (0 == memcmp(id, "data", 4)) {
291  /* Total number of bytes of data (we don't care). */
292  TRY_FREAD(&intval, 4, 1, infh);
293  break;
294  }
295  else {
296  /* Some other stuff... */
297  /* Number of bytes of ... whatever */
298  TRY_FREAD(&intval, 4, 1, infh);
299  SWAP_LE_32(&intval);
300  fseek(infh, intval, SEEK_CUR);
301  }
302  }
303 
304  /* We are ready to rumble. */
305  return 0;
306 error_out:
307  return -1;
308 }
309 
310 static int
311 read_nist_header(FILE *infh)
312 {
313  char hdr[1024];
314  char *line, *c;
315 
316  TRY_FREAD(hdr, 1, 1024, infh);
317  hdr[1023] = '\0';
318 
319  /* Roughly parse it to find the sampling rate and byte order
320  * (don't bother with other stuff) */
321  if ((line = strstr(hdr, "sample_rate")) == NULL) {
322  E_ERROR("No sampling rate in NIST header!\n");
323  goto error_out;
324  }
325  c = strchr(line, '\n');
326  if (c) *c = '\0';
327  c = strrchr(line, ' ');
328  if (c == NULL) {
329  E_ERROR("Could not find sampling rate!\n");
330  goto error_out;
331  }
332  ++c;
333  if (cmd_ln_int32("-samprate") == 0)
334  cmd_ln_set_int32("-samprate", atoi(c));
335  else if (cmd_ln_int32("-samprate") != atoi(c)) {
336  E_WARN("NIST file sampling rate %d != -samprate %d\n",
337  atoi(c), cmd_ln_int32("-samprate"));
338  }
339 
340  if (line + strlen(line) < hdr + 1023)
341  line[strlen(line)] = ' ';
342  if ((line = strstr(hdr, "sample_byte_format")) == NULL) {
343  E_ERROR("No sample byte format in NIST header!\n");
344  goto error_out;
345  }
346  c = strchr(line, '\n');
347  if (c) *c = '\0';
348  c = strrchr(line, ' ');
349  if (c == NULL) {
350  E_ERROR("Could not find sample byte order!\n");
351  goto error_out;
352  }
353  ++c;
354  if (0 == memcmp(c, "01", 2)) {
355  cmd_ln_set_str("-input_endian", "little");
356  }
357  else if (0 == memcmp(c, "10", 2)) {
358  cmd_ln_set_str("-input_endian", "big");
359  }
360  else {
361  E_ERROR("Unknown byte order %s\n", c);
362  goto error_out;
363  }
364 
365  /* We are ready to rumble. */
366  return 0;
367 error_out:
368  return -1;
369 }
370 
371 static int
372 extract_pitch(const char *in, const char *out)
373 {
374  FILE *infh = NULL, *outfh = NULL;
375  size_t flen, fshift, nsamps;
376  int16 *buf = NULL;
377  yin_t *yin = NULL;
378  uint16 period, bestdiff;
379  int32 sps;
380 
381  if (out) {
382  if ((outfh = fopen(out, "w")) == NULL) {
383  E_ERROR_SYSTEM("Failed to open %s for writing", out);
384  goto error_out;
385  }
386  }
387  else {
388  outfh = stdout;
389  }
390  if ((infh = fopen(in, "rb")) == NULL) {
391  E_ERROR_SYSTEM("Failed to open %s for reading", in);
392  goto error_out;
393  }
394 
395  /* If we weren't told what the file type is, weakly try to
396  * determine it (actually it's pretty obvious) */
397  if (!(cmd_ln_boolean("-raw")
398  || cmd_ln_boolean("-mswav")
399  || cmd_ln_boolean("-nist"))) {
400  if (guess_file_type(in, infh) < 0)
401  goto error_out;
402  }
403 
404  /* Grab the sampling rate and byte order from the header and also
405  * make sure this is 16-bit linear PCM. */
406  if (cmd_ln_boolean("-mswav")) {
407  if (read_riff_header(infh) < 0)
408  goto error_out;
409  }
410  else if (cmd_ln_boolean("-nist")) {
411  if (read_nist_header(infh) < 0)
412  goto error_out;
413  }
414  else if (cmd_ln_boolean("-raw")) {
415  /* Just use some defaults for sampling rate and endian. */
416  if (cmd_ln_str("-input_endian") == NULL) {
417  cmd_ln_set_str("-input_endian", "little");
418  }
419  if (cmd_ln_int32("-samprate") == 0)
420  cmd_ln_set_int32("-samprate", 16000);
421  }
422 
423  /* Now read frames and write pitch estimates. */
424  sps = cmd_ln_int32("-samprate");
425  flen = (size_t)(0.5 + sps * cmd_ln_float32("-flen"));
426  fshift = (size_t)(0.5 + sps * cmd_ln_float32("-fshift"));
427  yin = yin_init(flen, cmd_ln_float32("-voice_thresh"),
428  cmd_ln_float32("-search_range"),
429  cmd_ln_int32("-smooth_window"));
430  if (yin == NULL) {
431  E_ERROR("Failed to initialize YIN\n");
432  goto error_out;
433  }
434  buf = ckd_calloc(flen, sizeof(*buf));
435  /* Read the first full frame of data. */
436  if (fread(buf, sizeof(*buf), flen, infh) != flen) {
437  /* Fail silently, which is probably okay. */
438  }
439  yin_start(yin);
440  nsamps = 0;
441  while (!feof(infh)) {
442  /* Process a frame of data. */
443  yin_write(yin, buf);
444  if (yin_read(yin, &period, &bestdiff)) {
445  fprintf(outfh, "%.3f %.2f %.2f\n",
446  /* Time point. */
447  (double)nsamps/sps,
448  /* "Probability" of voicing. */
449  bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768,
450  /* Pitch (possibly bogus) */
451  period == 0 ? sps : (double)sps / period);
452  nsamps += fshift;
453  }
454  /* Shift it back and get the next frame's overlap. */
455  memmove(buf, buf + fshift, (flen - fshift) * sizeof(*buf));
456  if (fread(buf + flen - fshift, sizeof(*buf), fshift, infh) != fshift) {
457  /* Fail silently (FIXME: really?) */
458  }
459  }
460  yin_end(yin);
461  /* Process trailing frames of data. */
462  while (yin_read(yin, &period, &bestdiff)) {
463  fprintf(outfh, "%.3f %.2f %.2f\n",
464  /* Time point. */
465  (double)nsamps/sps,
466  /* "Probability" of voicing. */
467  bestdiff > 32768 ? 0.0 : 1.0 - (double)bestdiff / 32768,
468  /* Pitch (possibly bogus) */
469  period == 0 ? sps : (double)sps / period);
470  }
471 
472  if (yin)
473  yin_free(yin);
474  ckd_free(buf);
475  fclose(infh);
476  if (outfh && outfh != stdout)
477  fclose(outfh);
478  return 0;
479 
480 error_out:
481  if (yin)
482  yin_free(yin);
483  ckd_free(buf);
484  if (infh) fclose(infh);
485  if (outfh && outfh != stdout)
486  fclose(outfh);
487  return -1;
488 }
489 
490 static int
491 run_control_file(const char *ctl)
492 {
493  FILE *ctlfh;
494  char *line;
495  char *di, *dout, *ei, *eio;
496  size_t len;
497  int rv, guess_type, guess_sps, guess_endian;
498  int32 skip, runlen;
499 
500  skip = cmd_ln_int32("-nskip");
501  runlen = cmd_ln_int32("-runlen");
502 
503  /* Whether to guess file types */
504  guess_type = !(cmd_ln_boolean("-raw")
505  || cmd_ln_boolean("-mswav")
506  || cmd_ln_boolean("-nist"));
507  /* Whether to guess sampling rate */
508  guess_sps = (cmd_ln_int32("-samprate") == 0);
509  /* Whether to guess endian */
510  guess_endian = (cmd_ln_str("-input_endian") == NULL);
511 
512  if ((ctlfh = fopen(ctl, "r")) == NULL) {
513  E_ERROR_SYSTEM("Failed to open control file %s", ctl);
514  return -1;
515  }
516  if (cmd_ln_str("-di"))
517  di = string_join(cmd_ln_str("-di"), "/", NULL);
518  else
519  di = ckd_salloc("");
520  if (cmd_ln_str("-do"))
521  dout = string_join(cmd_ln_str("-do"), "/", NULL);
522  else
523  dout = ckd_salloc("");
524  if (cmd_ln_str("-ei"))
525  ei = string_join(".", cmd_ln_str("-ei"), NULL);
526  else
527  ei = ckd_salloc("");
528  if (cmd_ln_str("-eo"))
529  eio = string_join(".", cmd_ln_str("-eo"), NULL);
530  else
531  eio = ckd_salloc("");
532  rv = 0;
533  while ((line = fread_line(ctlfh, &len)) != NULL) {
534  char *infile, *outfile;
535 
536  if (skip-- > 0) {
537  ckd_free(line);
538  continue;
539  }
540  if (runlen == 0) {
541  ckd_free(line);
542  break;
543  }
544  --runlen;
545 
546  if (line[len-1] == '\n')
547  line[len-1] = '\0';
548 
549  infile = string_join(di, line, ei, NULL);
550  outfile = string_join(dout, line, eio, NULL);
551 
552  /* Reset various guessed information */
553  if (guess_type) {
554  cmd_ln_set_boolean("-nist", FALSE);
555  cmd_ln_set_boolean("-mswav", FALSE);
556  cmd_ln_set_boolean("-raw", FALSE);
557  }
558  if (guess_sps)
559  cmd_ln_set_int32("-samprate", 0);
560  if (guess_endian)
561  cmd_ln_set_str("-input_endian", NULL);
562 
563  rv = extract_pitch(infile, outfile);
564 
565  ckd_free(infile);
566  ckd_free(outfile);
567  ckd_free(line);
568 
569  if (rv != 0)
570  break;
571  }
572  ckd_free(di);
573  ckd_free(dout);
574  ckd_free(ei);
575  ckd_free(eio);
576  fclose(ctlfh);
577  return rv;
578 }
SPHINXBASE_EXPORT void yin_end(yin_t *pe)
Mark the end of an utterance.
Definition: yin.c:166
#define E_ERROR_SYSTEM(...)
Print error text; call perror("").
Definition: err.h:99
Command-line and other configuration parsing and handling.
Miscellaneous useful string functions.
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
#define cmd_ln_set_int32(n, i)
Set a 32-bit integer value in the global command line.
Definition: cmd_ln.h:565
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
#define ARG_INT32
Definition: cmd_ln.h:144
Sphinx's memory allocation/deallocation routines.
#define cmd_ln_set_boolean(n, b)
Set a boolean value in the global command line.
Definition: cmd_ln.h:586
#define ARG_STRING
String argument (optional).
Definition: cmd_ln.h:114
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
SPHINXBASE_EXPORT int yin_read(yin_t *pe, uint16 *out_period, uint16 *out_bestdiff)
Read a raw estimated pitch value from the pitch estimator.
Definition: yin.c:222
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
SPHINXBASE_EXPORT void yin_free(yin_t *pe)
Free a moving-window pitch estimator.
Definition: yin.c:150
SPHINXBASE_EXPORT void yin_start(yin_t *pe)
Start processing an utterance.
Definition: yin.c:158
#define ARG_FLOAT32
Definition: cmd_ln.h:148
SPHINXBASE_EXPORT char * fread_line(FILE *stream, size_t *out_len)
Read a line of arbitrary length from a file and return it as a newly allocated string.
Definition: pio.c:377
SPHINXBASE_EXPORT yin_t * yin_init(int frame_size, float search_threshold, float search_range, int smooth_window)
Initialize moving-window pitch estimation.
Definition: yin.c:131
Implementation of logging routines.
#define ARG_BOOLEAN
Boolean (true/false) argument (optional).
Definition: cmd_ln.h:118
Argument definition structure.
#define E_WARN(...)
Print warning message to error log.
Definition: err.h:109
Implementation of pitch estimation.
SPHINXBASE_EXPORT void yin_write(yin_t *pe, int16 const *frame)
Feed a frame of data to the pitch estimator.
Definition: yin.c:195
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string...
Definition: strfuncs.c:70
Definition: yin.c:51
#define cmd_ln_boolean(name)
Retrieve a boolean from the global command line.
Definition: cmd_ln.h:550
SPHINXBASE_EXPORT int32 cmd_ln_parse(const arg_t *defn, int32 argc, char *argv[], int32 strict)
Non-reentrant version of cmd_ln_parse().
Definition: cmd_ln.c:746
SPHINXBASE_EXPORT void cmd_ln_free(void)
Free the global command line, if any exists.
Definition: cmd_ln.c:1082
#define cmd_ln_set_str(n, s)
Set a string in the global command line.
Definition: cmd_ln.h:558
#define cmd_ln_str(name)
Retrieve a string from the global command line.
Definition: cmd_ln.h:513
File I/O related operations.
#define cmd_ln_int32(name)
Retrieve a 32-bit integer from the global command line.
Definition: cmd_ln.h:529
#define cmd_ln_float32(name)
Retrieve a 32-bit float from the global command line.
Definition: cmd_ln.h:536