SphinxBase  5prealpha
sphinx_fe.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 1996-2004 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <time.h>
41 #include <assert.h>
42 
43 #ifdef HAVE_CONFIG_H
44 #include <config.h>
45 #endif
46 
47 #include <sphinxbase/fe.h>
48 #include <sphinxbase/strfuncs.h>
49 #include <sphinxbase/pio.h>
50 #include <sphinxbase/filename.h>
51 #include <sphinxbase/cmd_ln.h>
52 #include <sphinxbase/err.h>
53 #include <sphinxbase/ckd_alloc.h>
54 #include <sphinxbase/byteorder.h>
55 #include <sphinxbase/hash_table.h>
56 
57 #include "sphinx_wave2feat.h"
58 #include "cmd_ln_defn.h"
59 
60 typedef struct audio_type_s {
61  char const *name;
62  int (*detect)(sphinx_wave2feat_t *wtf);
63  int (*decode)(sphinx_wave2feat_t *wtf);
64 } audio_type_t;
65 
66 typedef struct output_type_s {
67  char const *name;
68  int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
69  int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
71 
73  int refcount;
75  fe_t *fe;
76  char *infile;
77  char *outfile;
78  FILE *infh;
79  FILE *outfh;
80  short *audio;
81  mfcc_t **feat;
82  int blocksize;
83  int featsize;
84  int veclen;
85  int in_veclen;
86  int byteswap;
87  output_type_t const *ot;
88 };
89 
91 typedef struct RIFFHeader{
92  char rifftag[4]; /* "RIFF" string */
93  int32 TotalLength; /* Total length */
94  char wavefmttag[8]; /* "WAVEfmt " string (note space after 't') */
95  int32 RemainingLength; /* Remaining length */
96  int16 data_format; /* data format tag, 1 = PCM */
97  int16 numchannels; /* Number of channels in file */
98  int32 SamplingFreq; /* Sampling frequency */
99  int32 BytesPerSec; /* Average bytes/sec */
100  int16 BlockAlign; /* Block align */
101  int16 BitsPerSample; /* 8 or 16 bit */
102  char datatag[4]; /* "data" string */
103  int32 datalength; /* Raw data length */
104 } MSWAV_hdr;
105 
111 static int
112 detect_riff(sphinx_wave2feat_t *wtf)
113 {
114  FILE *fh;
115  MSWAV_hdr hdr;
116  double samprate;
117 
118  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
119  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
120  return -1;
121  }
122  if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
123  E_ERROR("Failed to read RIFF header");
124  fclose(fh);
125  return -1;
126  }
127  /* Make sure it is actually a RIFF file. */
128  if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
129  fclose(fh);
130  return FALSE;
131  }
132  if (cmd_ln_int32_r(wtf->config, "-nchans") != hdr.numchannels) {
133  E_ERROR("Number of channels %d does not match configured value in file '%s'\n", hdr.numchannels, wtf->infile);
134  fclose(fh);
135  return -1;
136  }
137  samprate = cmd_ln_float32_r(wtf->config, "-samprate");
138  if (samprate != hdr.SamplingFreq) {
139  E_ERROR("Sample rate %d does not match configured value %.1f in file '%s'\n",
140  hdr.SamplingFreq, samprate, wtf->infile);
141  fclose(fh);
142  return -1;
143  }
144  wtf->infh = fh;
145 
146  return TRUE;
147 }
148 
149 static int
150 open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
151 {
152  char nist[7];
153  lineiter_t *li;
154  FILE *fh;
155 
156  if ((fh = fopen(infile, "rb")) == NULL) {
157  E_ERROR_SYSTEM("Failed to open %s", infile);
158  return -1;
159  }
160  if (fread(&nist, 1, 7, fh) != 7) {
161  E_ERROR_SYSTEM("Failed to read NIST header");
162  fclose(fh);
163  return -1;
164  }
165  /* Is this actually a NIST file? */
166  if (0 != strncmp(nist, "NIST_1A", 7)) {
167  fclose(fh);
168  return FALSE;
169  }
170  /* Rewind, parse lines. */
171  fseek(fh, 0, SEEK_SET);
172  for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
173  char **words;
174  int nword;
175 
176  string_trim(li->buf, STRING_BOTH);
177  if (strlen(li->buf) == 0) {
178  lineiter_free(li);
179  break;
180  }
181  nword = str2words(li->buf, NULL, 0);
182  if (nword != 3)
183  continue;
184  words = (char **)ckd_calloc(nword, sizeof(*words));
185  str2words(li->buf, words, nword);
186  if (0 == strcmp(words[0], "sample_rate")) {
187  float samprate = atof_c(words[2]);
188  if (cmd_ln_float32_r(wtf->config, "-samprate") != samprate) {
189  E_ERROR("Sample rate %.1f does not match configured value in file '%s'\n", samprate, infile);
190  lineiter_free(li);
191  fclose(fh);
192  return -1;
193  }
194  }
195  if (0 == strcmp(words[0], "channel_count")) {
196  int nchans = atoi(words[2]);
197  if (cmd_ln_int32_r(wtf->config, "-nchans") != nchans) {
198  E_ERROR("Number of channels %d does not match configured value in file '%s'\n", nchans, infile);
199  lineiter_free(li);
200  fclose(fh);
201  return -1;
202  }
203  }
204  if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
205  const char *endian = (0 == strcmp(words[2], "10")) ? "big" : "little";
206  if (0 != strcmp(cmd_ln_str_r(wtf->config, "-input_endian"), endian)) {
207  E_ERROR("Input endian %s does not match configured value in file '%s'\n", endian, infile);
208  lineiter_free(li);
209  fclose(fh);
210  return -1;
211  }
212  }
213  ckd_free(words);
214  }
215 
216  fseek(fh, 1024, SEEK_SET);
217  if (out_fh)
218  *out_fh = fh;
219  else
220  fclose(fh);
221  return TRUE;
222 }
223 
224 #ifdef HAVE_POPEN
225 static int
226 detect_sph2pipe(sphinx_wave2feat_t *wtf)
227 {
228  FILE *fh;
229  char *cmdline;
230  int rv;
231 
232  /* Determine if it's NIST file and get parameters. */
233  if ((rv = open_nist_file(wtf, wtf->infile, NULL, FALSE)) != TRUE)
234  return rv;
235 
236  /* Now popen it with sph2pipe. */
237  cmdline = string_join("sph2pipe -f raw '", wtf->infile, "'", NULL);
238  if ((fh = popen(cmdline, "r")) == NULL) {
239  E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", wtf->infile);
240  ckd_free(cmdline);
241  return -1;
242  }
243 
244  wtf->infh = fh;
245  return TRUE;
246 }
247 #else /* !HAVE_POPEN */
248 static int
249 detect_sph2pipe(sphinx_wave2feat_t *wtf)
250 {
251  E_ERROR("popen() not available, cannot run sph2pipe\n");
252  return -1;
253 }
254 #endif /* !HAVE_POPEN */
255 
261 static int
262 detect_nist(sphinx_wave2feat_t *wtf)
263 {
264  FILE *fh;
265  int rv;
266 
267  if ((rv = open_nist_file(wtf, wtf->infile, &fh, TRUE)) != TRUE)
268  return rv;
269  wtf->infh = fh;
270 
271  return TRUE;
272 }
273 
274 
281 static int
282 detect_raw(sphinx_wave2feat_t *wtf)
283 {
284  FILE *fh;
285 
286  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
287  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
288  return -1;
289  }
290  wtf->infh = fh;
291  return TRUE;
292 }
293 
300 static int
301 detect_sphinx_mfc(sphinx_wave2feat_t *wtf)
302 {
303  FILE *fh;
304  int32 len;
305  long flen;
306 
307  if ((fh = fopen(wtf->infile, "rb")) == NULL) {
308  E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
309  return -1;
310  }
311  if (fread(&len, 4, 1, fh) != 1) {
312  E_ERROR_SYSTEM("Failed to read header from %s\n", wtf->infile);
313  fclose(fh);
314  return -1;
315  }
316  fseek(fh, 0, SEEK_END);
317  flen = ftell(fh);
318 
319  /* figure out whether to byteswap */
320  flen = (flen / 4) - 1;
321  if (flen != len) {
322  /* First make sure this is an endianness problem, otherwise fail. */
323  SWAP_INT32(&len);
324  if (flen != len) {
325  SWAP_INT32(&len);
326  E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
327  len, flen);
328  return -1;
329  }
330  /* Set the input endianness to the opposite of the machine endianness... */
331  cmd_ln_set_str_r(wtf->config, "-input_endian",
332  (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
333  ? "little" : "big"));
334  }
335 
336  fseek(fh, 4, SEEK_SET);
337  wtf->infh = fh;
338  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
339  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
340  }
341  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
342  wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
343  wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
344  }
345  else {
346  /* Should not happen. */
347  E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
348  assert(FALSE);
349  }
350 
351  return TRUE;
352 }
353 
354 int
355 mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
356 {
357  int i, j;
358 
359  if (whichchan > 0) {
360  for (i = whichchan - 1; i < nsamp; i += nchans)
361  buf[i/nchans] = buf[i];
362  }
363  else {
364  for (i = 0; i < nsamp; i += nchans) {
365  float64 tmp = 0.0;
366  for (j = 0; j < nchans && i + j < nsamp; ++j) {
367  tmp += buf[i + j];
368  }
369  buf[i/nchans] = (int16)(tmp / nchans);
370  }
371  }
372  return i/nchans;
373 }
374 
379 static int
380 decode_pcm(sphinx_wave2feat_t *wtf)
381 {
382  size_t nsamp;
383  int32 n, nfr, nchans, whichchan;
384  uint32 nfloat;
385 
386  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
387  whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
388  fe_start_stream(wtf->fe);
389  fe_start_utt(wtf->fe);
390  nfloat = 0;
391  while ((nsamp = fread(wtf->audio, sizeof(int16), wtf->blocksize, wtf->infh)) != 0) {
392  size_t nvec;
393  int16 const *inspeech;
394 
395  /* Byteswap stuff here if necessary. */
396  if (wtf->byteswap) {
397  for (n = 0; n < nsamp; ++n)
398  SWAP_INT16(wtf->audio + n);
399  }
400 
401  /* Mix or pick channels. */
402  if (nchans > 1)
403  nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
404 
405  inspeech = wtf->audio;
406  nvec = wtf->featsize;
407  /* Consume all samples. */
408  while (nsamp) {
409  nfr = nvec;
410  fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr, NULL);
411  if (nfr) {
412  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
413  return -1;
414  nfloat += n;
415  }
416  }
417  inspeech = wtf->audio;
418  }
419  /* Now process any leftover audio frames. */
420  fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
421  if (nfr) {
422  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
423  return -1;
424  nfloat += n;
425  }
426 
427  if (fclose(wtf->infh) == EOF)
428  E_ERROR_SYSTEM("Failed to close input file");
429  wtf->infh = NULL;
430  return nfloat;
431 }
432 
437 static int
438 decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
439 {
440  int nfloat = 0, n;
441  int featsize = wtf->featsize;
442 
443  /* If the input vector length is less than the output length, we
444  * need to do this one frame at a time, because there's empty
445  * space at the end of each vector in wtf->feat. */
446  if (wtf->in_veclen < wtf->veclen)
447  featsize = 1;
448  while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
449  featsize * wtf->in_veclen, wtf->infh)) != 0) {
450  int i, nfr = n / wtf->in_veclen;
451  if (n % wtf->in_veclen) {
452  E_ERROR("Size of file %d not a multiple of veclen %d\n",
453  n, wtf->in_veclen);
454  return -1;
455  }
456  /* Byteswap stuff here if necessary. */
457  if (wtf->byteswap) {
458  for (i = 0; i < n; ++i)
459  SWAP_FLOAT32(wtf->feat[0] + i);
460  }
461  fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
462  for (i = 0; i < nfr; ++i) {
463  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
464  if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
465  fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
466  else
467  fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
468  }
469  else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
470  fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
471  }
472  }
473  if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
474  return -1;
475  nfloat += n;
476  }
477 
478  if (fclose(wtf->infh) == EOF)
479  E_ERROR_SYSTEM("Failed to close input file");
480  wtf->infh = NULL;
481  return nfloat;
482 }
483 
484 static const audio_type_t types[] = {
485  { "-mswav", &detect_riff, &decode_pcm },
486  { "-nist", &detect_nist, &decode_pcm },
487  { "-raw", &detect_raw, &decode_pcm },
488  { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
489 };
490 static const int ntypes = sizeof(types)/sizeof(types[0]);
491 static const audio_type_t mfcc_type = {
492  "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
493 };
494 
500 static int
501 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
502 {
503  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
504  E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
505  return -1;
506  }
507  return 0;
508 }
509 
515 static int
516 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
517 {
518  int i, nfloat = 0;
519 
520  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
521  for (i = 0; i < nfr; ++i) {
522  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
523  E_ERROR_SYSTEM("Writing %d values to %s failed",
524  wtf->veclen, wtf->outfile);
525  return -1;
526  }
527  nfloat += wtf->veclen;
528  }
529  return nfloat;
530 }
531 
/**
 * HTK parameter kinds (the base code stored in the file header).
 */
typedef enum htk_feature_kind_e {
    WAVEFORM  = 0,  /**< PCM audio (rarely used) */
    LPC       = 1,  /**< LPC filter coefficients */
    LPCREFC   = 2,  /**< LPC reflection coefficients */
    LPCEPSTRA = 3,  /**< LPC-based cepstral coefficients */
    LPCDELCEP = 4,  /**< LPCC plus deltas */
    IREFC     = 5,  /**< 16-bit integer LPC reflection coefficients */
    MFCC      = 6,  /**< MFCCs */
    FBANK     = 7,  /**< Log mel spectrum */
    MELSPEC   = 8,  /**< Linear mel spectrum */
    USER      = 9,  /**< User defined */
    DISCRETE  = 10, /**< Vector quantized data */
    PLP       = 11  /**< PLP coefficients */
} htk_feature_kind_t;
546 
/**
 * HTK parameter-kind qualifier bits (octal), OR-ed onto the base kind.
 */
typedef enum htk_feature_flag_e {
    _E = 0000100, /**< has energy */
    _N = 0000200, /**< absolute energy suppressed */
    _D = 0000400, /**< has delta coefficients */
    _A = 0001000, /**< has acceleration (delta-delta) coefficients */
    _C = 0002000, /**< is compressed */
    _Z = 0004000, /**< has zero mean static coefficients (i.e. CMN) */
    _K = 0010000, /**< has CRC checksum */
    _O = 0020000, /**< has 0th cepstral coefficient */
    _V = 0040000, /**< has VQ data */
    _T = 0100000  /**< has third differential coefficients */
} htk_feature_flag_t;
559 
563 static int
564 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
565 {
566  int32 samp_period;
567  int16 samp_size;
568  int16 param_kind;
569  int swap = FALSE;
570 
571  /* HTK files are big-endian. */
572  if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
573  swap = TRUE;
574  /* Same file size thing as in Sphinx files (I think) */
575  if (swap) SWAP_INT32(&nfloat);
576  if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
577  return -1;
578  /* Sample period in 100ns units. */
579  samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
580  if (swap) SWAP_INT32(&samp_period);
581  if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
582  return -1;
583  /* Sample size - veclen * sizeof each sample. */
584  samp_size = wtf->veclen * 4;
585  if (swap) SWAP_INT16(&samp_size);
586  if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
587  return -1;
588  /* Format and flags. */
589  if (cmd_ln_boolean_r(wtf->config, "-logspec")
590  || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
591  param_kind = FBANK; /* log mel-filter bank outputs */
592  else
593  param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
594  if (swap) SWAP_INT16(&param_kind);
595  if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
596  return -1;
597 
598  return 0;
599 }
600 
604 static int
605 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
606 {
607  int i, j, swap, htk_reorder, nfloat = 0;
608 
609  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
610  /* This is possibly inefficient, but probably not a big deal. */
611  swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
612  htk_reorder = (0 == strcmp("htk", wtf->ot->name)
613  && !(cmd_ln_boolean_r(wtf->config, "-logspec")
614  || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
615  for (i = 0; i < nfr; ++i) {
616  if (htk_reorder) {
617  mfcc_t c0 = frames[i][0];
618  memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
619  frames[i][wtf->veclen - 1] = c0;
620  }
621  if (swap)
622  for (j = 0; j < wtf->veclen; ++j)
623  SWAP_FLOAT32(frames[i] + j);
624  if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
625  E_ERROR_SYSTEM("Writing %d values to %s failed",
626  wtf->veclen, wtf->outfile);
627  return -1;
628  }
629  nfloat += wtf->veclen;
630  }
631  return nfloat;
632 }
633 
637 static int
638 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
639 {
640  int i, j, nfloat = 0;
641 
642  fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
643  for (i = 0; i < nfr; ++i) {
644  for (j = 0; j < wtf->veclen; ++j) {
645  fprintf(wtf->outfh, "%.5g", MFCC2FLOAT(frames[i][j]));
646  if (j == wtf->veclen - 1)
647  fprintf(wtf->outfh, "\n");
648  else
649  fprintf(wtf->outfh, " ");
650  }
651  nfloat += wtf->veclen;
652  }
653  return nfloat;
654 }
655 
656 static const output_type_t outtypes[] = {
657  { "sphinx", &output_header_sphinx, &output_frames_sphinx },
658  { "htk", &output_header_htk, &output_frames_htk },
659  { "text", NULL, &output_frames_text }
660 };
661 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
662 
664 sphinx_wave2feat_init(cmd_ln_t *config)
665 {
666  sphinx_wave2feat_t *wtf;
667  int i;
668 
669  wtf = (sphinx_wave2feat_t *)ckd_calloc(1, sizeof(*wtf));
670  wtf->refcount = 1;
671  wtf->config = cmd_ln_retain(config);
672  wtf->fe = fe_init_auto_r(wtf->config);
673  if (!wtf->fe) {
674  E_FATAL("Failed to create feature extraction\n");
675  }
676 
677  wtf->ot = outtypes; /* Default (sphinx) type. */
678  for (i = 0; i < nouttypes; ++i) {
679  output_type_t const *otype = &outtypes[i];
680  if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
681  wtf->ot = otype;
682  break;
683  }
684  }
685  if (i == nouttypes) {
686  E_ERROR("Unknown output type: '%s'\n",
687  cmd_ln_str_r(config, "-ofmt"));
688  sphinx_wave2feat_free(wtf);
689  return NULL;
690  }
691 
692  return wtf;
693 }
694 
695 int
696 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
697 {
698  if (wtf == NULL)
699  return 0;
700  if (--wtf->refcount > 0)
701  return wtf->refcount;
702 
703  if (wtf->audio)
704  ckd_free(wtf->audio);
705  if (wtf->feat)
706  ckd_free_2d(wtf->feat);
707  if (wtf->infile)
708  ckd_free(wtf->infile);
709  if (wtf->outfile)
710  ckd_free(wtf->outfile);
711  if (wtf->infh) {
712  if (fclose(wtf->infh) == EOF)
713  E_ERROR_SYSTEM("Failed to close input file");
714  }
715  if (wtf->outfh) {
716  if (fclose(wtf->outfh) == EOF)
717  E_ERROR_SYSTEM("Failed to close output file");
718  }
719  cmd_ln_free_r(wtf->config);
720  fe_free(wtf->fe);
721  ckd_free(wtf);
722 
723  return 0;
724 }
725 
727 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
728 {
729  ++wtf->refcount;
730  return wtf;
731 }
732 
733 static audio_type_t const *
734 detect_audio_type(sphinx_wave2feat_t *wtf)
735 {
736  audio_type_t const *atype = NULL;
737  int i;
738 
739  /* Special case audio type for Sphinx MFCC inputs. */
740  if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
741  || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
742  int rv = mfcc_type.detect(wtf);
743  if (rv == -1)
744  goto error_out;
745  return &mfcc_type;
746  }
747 
748  /* Try to use the type of infile given on the command line. */
749  for (i = 0; i < ntypes; ++i) {
750  int rv;
751  atype = &types[i];
752  if (cmd_ln_boolean_r(wtf->config, atype->name)) {
753  rv = (*atype->detect)(wtf);
754  if (rv == -1)
755  goto error_out;
756  else if (rv == TRUE)
757  break;
758  }
759  }
760  if (i == ntypes) {
761  /* Detect file type of infile and get parameters. */
762  for (i = 0; i < ntypes; ++i) {
763  int rv;
764  atype = &types[i];
765  rv = (*atype->detect)(wtf);
766  if (rv == -1)
767  goto error_out;
768  else if (rv == TRUE)
769  break;
770  }
771  if (i == ntypes)
772  goto error_out;
773  }
774  return atype;
775  error_out:
776  if (wtf->infh)
777  fclose(wtf->infh);
778  wtf->infh = NULL;
779  return NULL;
780 }
781 
782 int
783 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
784  char const *infile, char const *outfile)
785 {
786  int nchans, nfloat, veclen;
787  audio_type_t const *atype = NULL;
788  int fshift, fsize;
789 
790  E_INFO("Converting %s to %s\n", infile, outfile);
791 
792  wtf->infile = ckd_salloc(infile);
793 
794  /* Detect input file type. */
795  if ((atype = detect_audio_type(wtf)) == NULL)
796  return -1;
797 
798  /* Determine whether to byteswap input. */
799  wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
800  cmd_ln_str_r(wtf->config, "-input_endian"));
801 
802  /* Get the output frame size (if not already set). */
803  if (wtf->veclen == 0)
804  wtf->veclen = fe_get_output_size(wtf->fe);
805 
806  /* Set up the input and output buffers. */
807  fe_get_input_size(wtf->fe, &fshift, &fsize);
808  /* Want to get at least a whole frame plus shift in here. Also we
809  will either pick or mix multiple channels so we need to read
810  them all at once. */
811  nchans = cmd_ln_int32_r(wtf->config, "-nchans");
812  wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
813  if (wtf->blocksize < (fsize + fshift) * nchans) {
814  E_INFO("Block size of %d too small, increasing to %d\n",
815  wtf->blocksize,
816  (fsize + fshift) * nchans);
817  wtf->blocksize = (fsize + fshift) * nchans;
818  }
819  wtf->audio = (short *)ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
820  wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
821 
822  /* Use the maximum of the input and output frame sizes to allocate this. */
823  veclen = wtf->veclen;
824  if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
825 
826  wtf->feat = (mfcc_t**)ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
827 
828  /* Let's go! */
829  if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
830  E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
831  return -1;
832  }
833  /* Write an empty header, which we'll fill in later. */
834  if (wtf->ot->output_header &&
835  (*wtf->ot->output_header)(wtf, 0) < 0) {
836  E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
837  goto error_out;
838  }
839  wtf->outfile = ckd_salloc(outfile);
840 
841  if ((nfloat = (*atype->decode)(wtf)) < 0) {
842  E_ERROR("Failed to convert");
843  goto error_out;
844  }
845 
846  if (wtf->ot->output_header) {
847  if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
848  E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
849  goto error_out;
850  }
851  if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
852  E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
853  goto error_out;
854  }
855  }
856 
857 
858  if (wtf->audio)
859  ckd_free(wtf->audio);
860  if (wtf->feat)
861  ckd_free_2d(wtf->feat);
862  if (wtf->infile)
863  ckd_free(wtf->infile);
864  if (wtf->outfile)
865  ckd_free(wtf->outfile);
866 
867  wtf->audio = NULL;
868  wtf->infile = NULL;
869  wtf->feat = NULL;
870  wtf->outfile = NULL;
871 
872  if (wtf->outfh)
873  if (fclose(wtf->outfh) == EOF)
874  E_ERROR_SYSTEM("Failed to close output file");
875  wtf->outfh = NULL;
876 
877  return 0;
878 
879 error_out:
880 
881  if (wtf->audio)
882  ckd_free(wtf->audio);
883  if (wtf->feat)
884  ckd_free_2d(wtf->feat);
885  if (wtf->infile)
886  ckd_free(wtf->infile);
887  if (wtf->outfile)
888  ckd_free(wtf->outfile);
889 
890  wtf->audio = NULL;
891  wtf->infile = NULL;
892  wtf->feat = NULL;
893  wtf->outfile = NULL;
894 
895  if (wtf->outfh)
896  if (fclose(wtf->outfh) == EOF)
897  E_ERROR_SYSTEM("Failed to close output file");
898  wtf->outfh = NULL;
899 
900  return -1;
901 }
902 
903 void
904 build_filenames(cmd_ln_t *config, char const *basename,
905  char **out_infile, char **out_outfile)
906 {
907  char const *di, *do_, *ei, *eo;
908 
909  di = cmd_ln_str_r(config, "-di");
910  do_ = cmd_ln_str_r(config, "-do");
911  ei = cmd_ln_str_r(config, "-ei");
912  eo = cmd_ln_str_r(config, "-eo");
913 
914  *out_infile = string_join(di ? di : "",
915  di ? "/" : "",
916  basename,
917  ei ? "." : "",
918  ei ? ei : "",
919  NULL);
920  *out_outfile = string_join(do_ ? do_ : "",
921  do_ ? "/" : "",
922  basename,
923  eo ? "." : "",
924  eo ? eo : "",
925  NULL);
926  /* Build output directory structure if possible/requested (it is
927  * by default). */
928  if (cmd_ln_boolean_r(config, "-build_outdirs")) {
929  char *dirname = ckd_salloc(*out_outfile);
930  path2dirname(*out_outfile, dirname);
931  build_directory(dirname);
932  ckd_free(dirname);
933  }
934 }
935 
936 static int
937 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
938 {
939  hash_table_t *files;
940  hash_iter_t *itor;
941  lineiter_t *li;
942  FILE *ctlfh;
943  int nskip, runlen, npart;
944 
945  if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
946  E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
947  return -1;
948  }
949  nskip = cmd_ln_int32_r(wtf->config, "-nskip");
950  runlen = cmd_ln_int32_r(wtf->config, "-runlen");
951  if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
952  /* Count lines in the file. */
953  int partlen, part, nlines = 0;
954  part = cmd_ln_int32_r(wtf->config, "-part");
955  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
956  ++nlines;
957  fseek(ctlfh, 0, SEEK_SET);
958  partlen = nlines / npart;
959  nskip = partlen * (part - 1);
960  if (part == npart)
961  runlen = -1;
962  else
963  runlen = partlen;
964  }
965  if (runlen != -1){
966  E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
967  files = hash_table_new(runlen, HASH_CASE_YES);
968  }
969  else {
970  E_INFO("Processing all remaining utterances at position %d\n", nskip);
971  files = hash_table_new(1000, HASH_CASE_YES);
972  }
973  for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
974  char *c, *infile, *outfile;
975 
976  if (nskip-- > 0)
977  continue;
978  if (runlen == 0) {
979  lineiter_free(li);
980  break;
981  }
982  --runlen;
983 
984  string_trim(li->buf, STRING_BOTH);
985  /* Extract the file ID from the control line. */
986  if ((c = strchr(li->buf, ' ')) != NULL)
987  *c = '\0';
988  if (strlen(li->buf) == 0) {
989  E_WARN("Empty line %d in control file, skipping\n", li->lineno);
990  continue;
991  }
992  build_filenames(wtf->config, li->buf, &infile, &outfile);
993  if (hash_table_lookup(files, infile, NULL) == 0)
994  continue;
995  sphinx_wave2feat_convert_file(wtf, infile, outfile);
996  hash_table_enter(files, infile, outfile);
997  }
998  for (itor = hash_table_iter(files); itor;
999  itor = hash_table_iter_next(itor)) {
1000  ckd_free((void *)hash_entry_key(itor->ent));
1001  ckd_free(hash_entry_val(itor->ent));
1002  }
1003  hash_table_free(files);
1004  fclose(ctlfh);
1005 
1006  return 0;
1007 }
1008 
1009 int
1010 main(int argc, char *argv[])
1011 {
1012  sphinx_wave2feat_t *wtf;
1013  cmd_ln_t *config;
1014  int rv;
1015 
1016  config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE);
1017 
1018  if (config && cmd_ln_str_r(config, "-argfile"))
1019  config = cmd_ln_parse_file_r(config, defn,
1020  cmd_ln_str_r(config, "-argfile"), FALSE);
1021  if (config == NULL) {
1022  E_ERROR("Command line parsing failed\n");
1023  return 1;
1024  }
1025 
1026  if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
1027  E_ERROR("Failed to initialize wave2feat object\n");
1028  return 1;
1029  }
1030 
1031  /* If there's a control file run through it, otherwise we will do
1032  * a single file (which is what run_control_file will do
1033  * internally too) */
1034  if (cmd_ln_str_r(config, "-c"))
1035  rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
1036  else
1037  rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
1038  cmd_ln_str_r(config, "-o"));
1039 
1040  sphinx_wave2feat_free(wtf);
1041  cmd_ln_free_r(config);
1042  return rv;
1043 }
#define E_ERROR_SYSTEM(...)
Print error text; call perror("").
Definition: err.h:99
Command-line and other configuration parsing and handling.
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_retain(cmd_ln_t *cmdln)
Retain ownership of a command-line argument set.
Definition: cmd_ln.c:1039
Miscellaneous useful string functions.
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
hash_entry_t * ent
Current entry in that table.
Definition: hash_table.h:170
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
Definition: hash_table.c:302
int veclen
Length of each output vector.
Definition: sphinx_fe.c:84
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition: ckd_alloc.h:270
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
output_type_t const * ot
Output type object.
Definition: sphinx_fe.c:87
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int cmd_ln_free_r(cmd_ln_t *cmdln)
Release a command-line argument set and all associated strings.
Definition: cmd_ln.c:1046
File names related operation.
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
Definition: cmd_ln.c:556
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter(hash_table_t *h)
Start iterating over key-value pairs in a hash table.
Definition: hash_table.c:646
Line iterator for files.
Definition: pio.h:177
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
#define hash_entry_val(e)
Access macros.
Definition: hash_table.h:175
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
Definition: cmd_ln.c:949
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
Definition: hash_table.c:158
FILE * infh
Input file handle.
Definition: sphinx_fe.c:78
int refcount
Reference count.
Definition: sphinx_fe.c:73
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
char * outfile
Path to output file.
Definition: sphinx_fe.c:77
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
Definition: hash_table.c:688
SPHINXBASE_EXPORT int build_directory(const char *path)
Create a directory and all of its parent directories, as needed.
Definition: pio.c:621
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:55
int featsize
Size of feature buffer.
Definition: sphinx_fe.c:83
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition: pio.c:368
RIFF 44-byte header structure for MS wav files.
Definition: sphinx_fe.c:91
FILE * outfh
Output file handle.
Definition: sphinx_fe.c:79
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:347
int byteswap
Whether byteswapping is necessary.
Definition: sphinx_fe.c:86
mfcc_t ** feat
Feature buffer.
Definition: sphinx_fe.c:81
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
Definition: pio.c:264
int in_veclen
Length of each input vector (for cep<->spec).
Definition: sphinx_fe.c:85
Implementation of logging routines.
Both ends of string.
Definition: strfuncs.h:73
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition: hash_table.c:501
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_file_r(cmd_ln_t *inout_cmdln, arg_t const *defn, char const *filename, int32 strict)
Parse an arguments file by delimiting on " \r\t\n" and putting each tokens into an argv[] for cmd_l...
Definition: cmd_ln.c:764
#define E_WARN(...)
Print warning message to error log.
Definition: err.h:109
short * audio
Audio buffer.
Definition: sphinx_fe.c:80
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:123
Opaque structure used to hold the results of command-line parsing.
char * infile
Path to input file.
Definition: sphinx_fe.c:76
SPHINXBASE_EXPORT hash_iter_t * hash_table_iter_next(hash_iter_t *itor)
Get the next key-value pair in iteration.
Definition: hash_table.c:656
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string...
Definition: strfuncs.c:70
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition: ckd_alloc.c:255
int blocksize
Size of audio buffer.
Definition: sphinx_fe.c:82
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
Definition: cmd_ln.h:334
cmd_ln_t * config
Configuration parameters.
Definition: sphinx_fe.c:74
Hash table implementation.
#define E_FATAL(...)
Exit with non-zero status after error message.
Definition: err.h:81
Structure for the front-end computation.
Definition: fe_internal.h:117
SPHINXBASE_EXPORT void cmd_ln_set_str_r(cmd_ln_t *cmdln, char const *name, char const *str)
Set a string in a command-line object.
Definition: cmd_ln.c:989
SPHINXBASE_EXPORT char * string_trim(char *string, enum string_edge_e which)
Remove whitespace from a string, modifying it in-place.
Definition: strfuncs.c:97
fe_t * fe
Front end object.
Definition: sphinx_fe.c:75
SPHINXBASE_EXPORT void path2dirname(const char *path, char *dir)
Strip off filename from the given path and copy the directory name into dir Caller must have allocate...
Definition: filename.c:68
file IO related operations.