PocketSphinx  5prealpha
state_align_search.c
Go to the documentation of this file.
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2010 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
42 #include "state_align_search.h"
43 
44 static int
45 state_align_search_start(ps_search_t *search)
46 {
48 
49  /* Activate the initial state. */
50  hmm_enter(sas->hmms, 0, 0, 0);
51 
52  return 0;
53 }
54 
55 static void
56 renormalize_hmms(state_align_search_t *sas, int frame_idx, int32 norm)
57 {
58  int i;
59  for (i = 0; i < sas->n_phones; ++i)
60  hmm_normalize(sas->hmms + i, norm);
61 }
62 
63 static int32
64 evaluate_hmms(state_align_search_t *sas, int16 const *senscr, int frame_idx)
65 {
66  int32 bs = WORST_SCORE;
67  int i;
68 
69  hmm_context_set_senscore(sas->hmmctx, senscr);
70 
71  for (i = 0; i < sas->n_phones; ++i) {
72  hmm_t *hmm = sas->hmms + i;
73  int32 score;
74 
75  if (hmm_frame(hmm) < frame_idx)
76  continue;
77  score = hmm_vit_eval(hmm);
78  if (score BETTER_THAN bs) {
79  bs = score;
80  }
81  }
82  return bs;
83 }
84 
85 static void
86 prune_hmms(state_align_search_t *sas, int frame_idx)
87 {
88  int nf = frame_idx + 1;
89  int i;
90 
91  /* Check all phones to see if they remain active in the next frame. */
92  for (i = 0; i < sas->n_phones; ++i) {
93  hmm_t *hmm = sas->hmms + i;
94  if (hmm_frame(hmm) < frame_idx)
95  continue;
96  hmm_frame(hmm) = nf;
97  }
98 }
99 
100 static void
101 phone_transition(state_align_search_t *sas, int frame_idx)
102 {
103  int nf = frame_idx + 1;
104  int i;
105 
106  for (i = 0; i < sas->n_phones - 1; ++i) {
107  hmm_t *hmm, *nhmm;
108  int32 newphone_score;
109 
110  hmm = sas->hmms + i;
111  if (hmm_frame(hmm) != nf)
112  continue;
113 
114  newphone_score = hmm_out_score(hmm);
115  /* Transition into next phone using the usual Viterbi rule. */
116  nhmm = hmm + 1;
117  if (hmm_frame(nhmm) < frame_idx
118  || newphone_score BETTER_THAN hmm_in_score(nhmm)) {
119  hmm_enter(nhmm, newphone_score, hmm_out_history(hmm), nf);
120  }
121  }
122 }
123 
124 #define TOKEN_STEP 20
125 static void
126 extend_tokenstack(state_align_search_t *sas, int frame_idx)
127 {
128  if (frame_idx >= sas->n_fr_alloc) {
129  sas->n_fr_alloc = frame_idx + TOKEN_STEP + 1;
130  sas->tokens = ckd_realloc(sas->tokens,
131  sas->n_emit_state * sas->n_fr_alloc
132  * sizeof(*sas->tokens));
133  }
134  memset(sas->tokens + frame_idx * sas->n_emit_state, 0xff,
135  sas->n_emit_state * sizeof(*sas->tokens));
136 }
137 
138 static void
139 record_transitions(state_align_search_t *sas, int frame_idx)
140 {
141  state_align_hist_t *tokens;
142  int i;
143 
144  /* Push another frame of tokens on the stack. */
145  extend_tokenstack(sas, frame_idx);
146  tokens = sas->tokens + frame_idx * sas->n_emit_state;
147 
148  /* Scan all active HMMs */
149  for (i = 0; i < sas->n_phones; ++i) {
150  hmm_t *hmm = sas->hmms + i;
151  int j;
152 
153  if (hmm_frame(hmm) < frame_idx)
154  continue;
155  for (j = 0; j < sas->hmmctx->n_emit_state; ++j) {
156  int state_idx = i * sas->hmmctx->n_emit_state + j;
157  /* Record their backpointers on the token stack. */
158  tokens[state_idx].id = hmm_history(hmm, j);
159  tokens[state_idx].score = hmm_score(hmm, j);
160  /* Update backpointer fields with state index. */
161  hmm_history(hmm, j) = state_idx;
162  }
163  }
164 }
165 
166 static int
167 state_align_search_step(ps_search_t *search, int frame_idx)
168 {
170  acmod_t *acmod = ps_search_acmod(search);
171  int16 const *senscr;
172  int i;
173 
174  /* Calculate senone scores. */
175  for (i = 0; i < sas->n_phones; ++i)
176  acmod_activate_hmm(acmod, sas->hmms + i);
177  senscr = acmod_score(acmod, &frame_idx);
178 
179  /* Renormalize here if needed. */
180  /* FIXME: Make sure to (unit-)test this!!! */
181  if ((sas->best_score - 0x300000) WORSE_THAN WORST_SCORE) {
182  E_INFO("Renormalizing Scores at frame %d, best score %d\n",
183  frame_idx, sas->best_score);
184  renormalize_hmms(sas, frame_idx, sas->best_score);
185  }
186 
187  /* Viterbi step. */
188  sas->best_score = evaluate_hmms(sas, senscr, frame_idx);
189  prune_hmms(sas, frame_idx);
190 
191  /* Transition out of non-emitting states. */
192  phone_transition(sas, frame_idx);
193 
194  /* Generate new tokens from best path results. */
195  record_transitions(sas, frame_idx);
196 
197  /* Update frame counter */
198  sas->frame = frame_idx;
199 
200  return 0;
201 }
202 
203 static int
204 state_align_search_finish(ps_search_t *search)
205 {
207  hmm_t *final_phone = sas->hmms + sas->n_phones - 1;
208  ps_alignment_iter_t *itor;
210 
211  int last_frame, cur_frame;
212  state_align_hist_t last, cur;
213 
214  /* Best state exiting the last cur_frame. */
215  last.id = cur.id = hmm_out_history(final_phone);
216  last.score = hmm_out_score(final_phone);
217  if (last.id == 0xffff) {
218  E_ERROR("Failed to reach final state in alignment\n");
219  return -1;
220  }
221  itor = ps_alignment_states(sas->al);
222  last_frame = sas->frame + 1;
223  for (cur_frame = sas->frame - 1; cur_frame >= 0; --cur_frame) {
224  cur = sas->tokens[cur_frame * sas->n_emit_state + cur.id];
225  /* State boundary, update alignment entry for next state. */
226  if (cur.id != last.id) {
227  itor = ps_alignment_iter_goto(itor, last.id);
228  assert(itor != NULL);
229  ent = ps_alignment_iter_get(itor);
230  ent->start = cur_frame + 1;
231  ent->duration = last_frame - ent->start;
232  ent->score = last.score - cur.score;
233  E_DEBUG(1,("state %d start %d end %d\n", last.id,
234  ent->start, last_frame));
235  last = cur;
236  last_frame = cur_frame + 1;
237  }
238  }
239  /* Update alignment entry for initial state. */
240  itor = ps_alignment_iter_goto(itor, 0);
241  assert(itor != NULL);
242  ent = ps_alignment_iter_get(itor);
243  ent->start = 0;
244  ent->duration = last_frame;
245  E_DEBUG(1,("state %d start %d end %d\n", 0,
246  ent->start, last_frame));
249 
250  return 0;
251 }
252 
253 static int
254 state_align_search_reinit(ps_search_t *search, dict_t *dict, dict2pid_t *d2p)
255 {
256  /* This does nothing. */
257  return 0;
258 }
259 
260 static void
261 state_align_search_free(ps_search_t *search)
262 {
264  ps_search_base_free(search);
265  ckd_free(sas->hmms);
266  ckd_free(sas->tokens);
267  hmm_context_free(sas->hmmctx);
268  ckd_free(sas);
269 }
270 
271 static ps_searchfuncs_t state_align_search_funcs = {
272  /* start: */ state_align_search_start,
273  /* step: */ state_align_search_step,
274  /* finish: */ state_align_search_finish,
275  /* reinit: */ state_align_search_reinit,
276  /* free: */ state_align_search_free,
277  /* lattice: */ NULL,
278  /* hyp: */ NULL,
279  /* prob: */ NULL,
280  /* seg_iter: */ NULL,
281 };
282 
283 ps_search_t *
284 state_align_search_init(const char *name,
285  cmd_ln_t *config,
286  acmod_t *acmod,
287  ps_alignment_t *al)
288 {
290  ps_alignment_iter_t *itor;
291  hmm_t *hmm;
292 
293  sas = ckd_calloc(1, sizeof(*sas));
294  ps_search_init(ps_search_base(sas), &state_align_search_funcs,
295  PS_SEARCH_TYPE_STATE_ALIGN, name,
296  config, acmod, al->d2p->dict, al->d2p);
297  sas->hmmctx = hmm_context_init(bin_mdef_n_emit_state(acmod->mdef),
298  acmod->tmat->tp, NULL, acmod->mdef->sseq);
299  if (sas->hmmctx == NULL) {
300  ckd_free(sas);
301  return NULL;
302  }
303  sas->al = al;
304 
305  /* Generate HMM vector from phone level of alignment. */
306  sas->n_phones = ps_alignment_n_phones(al);
308  sas->hmms = ckd_calloc(sas->n_phones, sizeof(*sas->hmms));
309  for (hmm = sas->hmms, itor = ps_alignment_phones(al); itor;
310  ++hmm, itor = ps_alignment_iter_next(itor)) {
312  hmm_init(sas->hmmctx, hmm, FALSE,
313  ent->id.pid.ssid, ent->id.pid.tmatid);
314  }
315  return ps_search_base(sas);
316 }
int ps_alignment_n_states(ps_alignment_t *al)
Number of states.
Definition: ps_alignment.c:363
Definition: ps_alignment.h:56
int n_phones
Number of HMMs (phones).
Base structure for search module.
void hmm_init(hmm_context_t *ctx, hmm_t *hmm, int mpx, int ssid, int tmatid)
Populate a previously-allocated HMM structure, allocating internal data.
Definition: hmm.c:89
An individual HMM among the HMM search space.
uint8 *** tp
The transition matrices; kept in the same scale as acoustic scores; tp[tmatid][from-state][to-state]...
Definition: tmat.h:56
uint16 ** sseq
Unique senone sequences (2D array built at load time)
Definition: bin_mdef.h:134
void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm)
Activate senones associated with an HMM.
Definition: acmod.c:1213
History structure.
ps_alignment_iter_t * ps_alignment_iter_goto(ps_alignment_iter_t *itor, int pos)
Move alignment iterator to given index.
Definition: ps_alignment.c:424
ps_alignment_iter_t * ps_alignment_iter_next(ps_alignment_iter_t *itor)
Move an alignment iterator forward.
Definition: ps_alignment.c:437
int n_fr_alloc
Number of frames of tokens allocated.
hmm_t * hmms
Vector of HMMs corresponding to phone level.
void ps_search_init(ps_search_t *search, ps_searchfuncs_t *vt, const char *type, const char *name, cmd_ln_t *config, acmod_t *acmod, dict_t *dict, dict2pid_t *d2p)
Initialize base structure.
int32 hmm_vit_eval(hmm_t *hmm)
Viterbi evaluation of given HMM.
Definition: hmm.c:789
int ps_alignment_propagate(ps_alignment_t *al)
Propagate timing information up from state sequence.
Definition: ps_alignment.c:313
void hmm_normalize(hmm_t *h, int32 bestscr)
Renormalize the scores in this HMM based on the given best score.
Definition: hmm.c:209
hmm_context_t * hmm_context_init(int32 n_emit_state, uint8 **const *tp, int16 const *senscore, uint16 *const *sseq)
Create an HMM context.
Definition: hmm.c:56
void ps_search_base_free(ps_search_t *search)
Free search.
int frame
Current frame being processed.
#define WORST_SCORE
Large &quot;bad&quot; score.
Definition: hmm.h:84
tmat_t * tmat
Transition matrices.
Definition: acmod.h:160
state_align_hist_t * tokens
Tokens (backpointers) for state alignment.
State (and phone and word) alignment search.
void hmm_enter(hmm_t *h, int32 score, int32 histid, int frame)
Enter an HMM with the given path score and history ID.
Definition: hmm.c:201
dict_t * dict
Dictionary this table refers to.
Definition: dict2pid.h:89
#define hmm_context_set_senscore(ctx, senscr)
Change the senone score array for a context.
Definition: hmm.h:227
a structure for a dictionary.
Definition: dict.h:76
#define WORSE_THAN
Is one score worse than another?
Definition: hmm.h:100
ps_alignment_t * al
Alignment structure being operated on.
#define BETTER_THAN
Is one score better than another?
Definition: hmm.h:95
int32 best_score
Best score in current frame.
Phone loop search structure.
ps_alignment_iter_t * ps_alignment_states(ps_alignment_t *al)
Iterate over the alignment starting at the first state.
Definition: ps_alignment.c:397
void hmm_context_free(hmm_context_t *ctx)
Free an HMM context.
Definition: hmm.c:80
bin_mdef_t * mdef
Model definition.
Definition: acmod.h:159
V-table for search algorithm.
int ps_alignment_n_phones(ps_alignment_t *al)
Number of phones.
Definition: ps_alignment.c:357
int ps_alignment_iter_free(ps_alignment_iter_t *itor)
Release an iterator before completing all iterations.
Definition: ps_alignment.c:417
ps_alignment_iter_t * ps_alignment_phones(ps_alignment_t *al)
Iterate over the alignment starting at the first phone.
Definition: ps_alignment.c:383
Acoustic model structure.
Definition: acmod.h:148
Building composite triphone (as well as word internal triphones) with the dictionary.
Definition: dict2pid.h:84
int16 const * acmod_score(acmod_t *acmod, int *inout_frame_idx)
Score one frame of data.
Definition: acmod.c:1106
ps_alignment_entry_t * ps_alignment_iter_get(ps_alignment_iter_t *itor)
Get the alignment entry pointed to by an iterator.
Definition: ps_alignment.c:411
hmm_context_t * hmmctx
HMM context structure.
int n_emit_state
Number of emitting states (tokens per frame)