SphinxBase  5prealpha
yin.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /*
3  * Copyright (c) 2008 Beyond Access, Inc. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in
14  * the documentation and/or other materials provided with the
15  * distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY BEYOND ACCESS, INC. ``AS IS'' AND ANY
18  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL BEYOND ACCESS, INC. NOR
21  * ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  */
29 
35 /* This implements part of the YIN algorithm:
36  *
37  * "YIN, a fundamental frequency estimator for speech and music".
38  * Alain de Cheveign√© and Hideki Kawahara. Journal of the Acoustical
39  * Society of America, 111 (4), April 2002.
40  */
41 
42 #include "sphinxbase/prim_type.h"
43 #include "sphinxbase/ckd_alloc.h"
44 #include "sphinxbase/fixpoint.h"
45 
46 #include "sphinxbase/yin.h"
47 
48 #include <stdio.h>
49 #include <string.h>
50 
51 struct yin_s {
52  uint16 frame_size;
54  uint16 search_range;
55  uint16 nfr;
57  unsigned char wsize;
58  unsigned char wstart;
59  unsigned char wcur;
60  unsigned char endut;
62  fixed32 **diff_window;
63  uint16 *period_window;
64 };
65 
69 static void
70 cmn_diff(int16 const *signal, int32 *out_diff, int ndiff)
71 {
72  uint32 cum, cshift;
73  int32 t, tscale;
74 
75  out_diff[0] = 32768;
76  cum = 0;
77  cshift = 0;
78 
79  /* Determine how many bits we can scale t up by below. */
80  for (tscale = 0; tscale < 32; ++tscale)
81  if (ndiff & (1<<(31-tscale)))
82  break;
83  --tscale; /* Avoid teh overflowz. */
84  /* printf("tscale is %d (ndiff - 1) << tscale is %d\n",
85  tscale, (ndiff-1) << tscale); */
86 
87  /* Somewhat elaborate block floating point implementation.
88  * The fp implementation of this is really a lot simpler. */
89  for (t = 1; t < ndiff; ++t) {
90  uint32 dd, dshift, norm;
91  int j;
92 
93  dd = 0;
94  dshift = 0;
95  for (j = 0; j < ndiff; ++j) {
96  int diff = signal[j] - signal[t + j];
97  /* Guard against overflows. */
98  if (dd > (1UL<<tscale)) {
99  dd >>= 1;
100  ++dshift;
101  }
102  dd += (diff * diff) >> dshift;
103  }
104  /* Make sure the diffs and cum are shifted to the same
105  * scaling factor (usually dshift will be zero) */
106  if (dshift > cshift) {
107  cum += dd << (dshift-cshift);
108  }
109  else {
110  cum += dd >> (cshift-dshift);
111  }
112 
113  /* Guard against overflows and also ensure that (t<<tscale) > cum. */
114  while (cum > (1UL<<tscale)) {
115  cum >>= 1;
116  ++cshift;
117  }
118  /* Avoid divide-by-zero! */
119  if (cum == 0) cum = 1;
120  /* Calculate the normalizer in high precision. */
121  norm = (t << tscale) / cum;
122  /* Do a long multiply and shift down to Q15. */
123  out_diff[t] = (int32)(((long long)dd * norm)
124  >> (tscale - 15 + cshift - dshift));
125  /* printf("dd %d cshift %d dshift %d scaledt %d cum %d norm %d cmn %d\n",
126  dd, cshift, dshift, (t<<tscale), cum, norm, out_diff[t]); */
127  }
128 }
129 
130 yin_t *
131 yin_init(int frame_size, float search_threshold,
132  float search_range, int smooth_window)
133 {
134  yin_t *pe;
135 
136  pe = ckd_calloc(1, sizeof(*pe));
137  pe->frame_size = frame_size;
138  pe->search_threshold = (uint16)(search_threshold * 32768);
139  pe->search_range = (uint16)(search_range * 32768);
140  pe->wsize = smooth_window * 2 + 1;
141  pe->diff_window = ckd_calloc_2d(pe->wsize,
142  pe->frame_size / 2,
143  sizeof(**pe->diff_window));
144  pe->period_window = ckd_calloc(pe->wsize,
145  sizeof(*pe->period_window));
146  return pe;
147 }
148 
149 void
151 {
153  ckd_free(pe->period_window);
154  ckd_free(pe);
155 }
156 
157 void
159 {
160  /* Reset the circular window pointers. */
161  pe->wstart = pe->endut = 0;
162  pe->nfr = 0;
163 }
164 
165 void
167 {
168  pe->endut = 1;
169 }
170 
171 int
172 thresholded_search(int32 *diff_window, fixed32 threshold, int start, int end)
173 {
174  int i, min, argmin;
175 
176  min = INT_MAX;
177  argmin = 0;
178  for (i = start; i < end; ++i) {
179  int diff = diff_window[i];
180 
181  if (diff < threshold) {
182  min = diff;
183  argmin = i;
184  break;
185  }
186  if (diff < min) {
187  min = diff;
188  argmin = i;
189  }
190  }
191  return argmin;
192 }
193 
194 void
195 yin_write(yin_t *pe, int16 const *frame)
196 {
197  int outptr, difflen;
198 
199  /* Rotate the window one frame forward. */
200  ++pe->wstart;
201  /* Fill in the frame before wstart. */
202  outptr = pe->wstart - 1;
203  /* Wrap around the window pointer. */
204  if (pe->wstart == pe->wsize)
205  pe->wstart = 0;
206 
207  /* Now calculate normalized difference function. */
208  difflen = pe->frame_size / 2;
209  cmn_diff(frame, pe->diff_window[outptr], difflen);
210 
211  /* Find the first point under threshold. If not found, then
212  * use the absolute minimum. */
213  pe->period_window[outptr]
214  = thresholded_search(pe->diff_window[outptr],
215  pe->search_threshold, 0, difflen);
216 
217  /* Increment total number of frames. */
218  ++pe->nfr;
219 }
220 
221 int
222 yin_read(yin_t *pe, uint16 *out_period, uint16 *out_bestdiff)
223 {
224  int wstart, wlen, half_wsize, i;
225  int best, best_diff, search_width, low_period, high_period;
226 
227  half_wsize = (pe->wsize-1)/2;
228  /* Without any smoothing, just return the current value (don't
229  * need to do anything to the current poitner either). */
230  if (half_wsize == 0) {
231  if (pe->endut)
232  return 0;
233  *out_period = pe->period_window[0];
234  *out_bestdiff = pe->diff_window[0][pe->period_window[0]];
235  return 1;
236  }
237 
238  /* We can't do anything unless we have at least (wsize-1)/2 + 1
239  * frames, unless we're at the end of the utterance. */
240  if (pe->endut == 0 && pe->nfr < half_wsize + 1) {
241  /* Don't increment the current pointer either. */
242  return 0;
243  }
244 
245  /* Establish the smoothing window. */
246  /* End of utterance. */
247  if (pe->endut) {
248  /* We are done (no more data) when pe->wcur = pe->wstart. */
249  if (pe->wcur == pe->wstart)
250  return 0;
251  /* I.e. pe->wcur (circular minus) half_wsize. */
252  wstart = (pe->wcur + pe->wsize - half_wsize) % pe->wsize;
253  /* Number of frames from wstart up to pe->wstart. */
254  wlen = pe->wstart - wstart;
255  if (wlen < 0) wlen += pe->wsize;
256  /*printf("ENDUT! ");*/
257  }
258  /* Beginning of utterance. */
259  else if (pe->nfr < pe->wsize) {
260  wstart = 0;
261  wlen = pe->nfr;
262  }
263  /* Normal case, it is what it is. */
264  else {
265  wstart = pe->wstart;
266  wlen = pe->wsize;
267  }
268 
269  /* Now (finally) look for the best local estimate. */
270  /* printf("Searching for local estimate in %d frames around %d\n",
271  wlen, pe->nfr + 1 - wlen); */
272  best = pe->period_window[pe->wcur];
273  best_diff = pe->diff_window[pe->wcur][best];
274  for (i = 0; i < wlen; ++i) {
275  int j = wstart + i;
276  int diff;
277 
278  j %= pe->wsize;
279  diff = pe->diff_window[j][pe->period_window[j]];
280  /* printf("%.2f,%.2f ", 1.0 - (double)diff/32768,
281  pe->period_window[j] ? 8000.0/pe->period_window[j] : 8000.0); */
282  if (diff < best_diff) {
283  best_diff = diff;
284  best = pe->period_window[j];
285  }
286  }
287  /* printf("best: %.2f, %.2f\n", 1.0 - (double)best_diff/32768,
288  best ? 8000.0/best : 8000.0); */
289  /* If it's the same as the current one then return it. */
290  if (best == pe->period_window[pe->wcur]) {
291  /* Increment the current pointer. */
292  if (++pe->wcur == pe->wsize)
293  pe->wcur = 0;
294  *out_period = best;
295  *out_bestdiff = best_diff;
296  return 1;
297  }
298  /* Otherwise, redo the search inside a narrower range. */
299  search_width = best * pe->search_range / 32768;
300  /* printf("Search width = %d * %.2f = %d\n",
301  best, (double)pe->search_range/32768, search_width); */
302  if (search_width == 0) search_width = 1;
303  low_period = best - search_width;
304  high_period = best + search_width;
305  if (low_period < 0) low_period = 0;
306  if (high_period > pe->frame_size / 2) high_period = pe->frame_size / 2;
307  /* printf("Searching from %d to %d\n", low_period, high_period); */
308  best = thresholded_search(pe->diff_window[pe->wcur],
309  pe->search_threshold,
310  low_period, high_period);
311  best_diff = pe->diff_window[pe->wcur][best];
312 
313  if (out_period)
314  *out_period = (best > 32768) ? 32768 : best;
315  if (out_bestdiff)
316  *out_bestdiff = (best_diff > 32768) ? 32768 : best_diff;
317 
318  /* Increment the current pointer. */
319  if (++pe->wcur == pe->wsize)
320  pe->wcur = 0;
321  return 1;
322 }
SPHINXBASE_EXPORT void yin_end(yin_t *pe)
Mark the end of an utterance.
Definition: yin.c:166
uint16 * period_window
Window of best period estimates.
Definition: yin.c:63
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
Definition: ckd_alloc.h:270
unsigned char wstart
First frame in window.
Definition: yin.c:58
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
Sphinx&#39;s memory allocation/deallocation routines.
unsigned char endut
Hoch Hech! Are we at the utterance end?
Definition: yin.c:60
uint16 nfr
Number of frames read so far.
Definition: yin.c:55
Basic type definitions used in Sphinx.
SPHINXBASE_EXPORT int yin_read(yin_t *pe, uint16 *out_period, uint16 *out_bestdiff)
Read a raw estimated pitch value from the pitch estimator.
Definition: yin.c:222
unsigned char wsize
Size of smoothing window.
Definition: yin.c:57
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
unsigned char wcur
Current frame of analysis.
Definition: yin.c:59
SPHINXBASE_EXPORT void yin_free(yin_t *pe)
Free a moving-window pitch estimator.
Definition: yin.c:150
SPHINXBASE_EXPORT void yin_start(yin_t *pe)
Start processing an utterance.
Definition: yin.c:158
uint16 search_range
Range around best local estimate to search, in Q15.
Definition: yin.c:54
SPHINXBASE_EXPORT yin_t * yin_init(int frame_size, float search_threshold, float search_range, int smooth_window)
Initialize moving-window pitch estimation.
Definition: yin.c:131
fixed32 ** diff_window
Window of difference function outputs.
Definition: yin.c:62
Implementation of pitch estimation.
SPHINXBASE_EXPORT void yin_write(yin_t *pe, int16 const *frame)
Feed a frame of data to the pitch estimator.
Definition: yin.c:195
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
Definition: ckd_alloc.c:255
Definition: yin.c:51
uint16 search_threshold
Size of analysis frame.
Definition: yin.c:53