47 #include <sphinxbase/ckd_alloc.h>
48 #include <sphinxbase/listelem_alloc.h>
49 #include <sphinxbase/err.h>
56 #define __CHAN_DUMP__ 0
58 #define chan_v_eval(chan) hmm_dump_vit_eval(&(chan)->hmm, stderr)
60 #define chan_v_eval(chan) hmm_vit_eval(&(chan)->hmm)
73 ngs->n_expand_words = 0;
74 n_words = ps_search_n_words(ngs);
75 bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs));
76 for (i = 0; i < n_words; ++i) {
77 if (!ngram_model_set_known_wid(ngs->
lmset,
78 dict_basewid(ps_search_dict(ngs),i)))
81 ngs->expand_word_list[ngs->n_expand_words] = i;
82 bitvec_set(ngs->expand_word_flag, i);
83 ngs->n_expand_words++;
85 E_INFO(
"Utterance vocabulary contains %d words\n", ngs->n_expand_words);
86 ngs->expand_word_list[ngs->n_expand_words] = -1;
93 dict_t *dict = ps_search_dict(ngs);
94 int n_words = ps_search_n_words(ngs);
100 for (w = 0; w < n_words; w++) {
101 if (dict_is_single_phone(dict, w))
108 for (w = 0; w < n_words; w++) {
109 if (!dict_is_single_phone(dict, w))
116 bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef,
118 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef,
131 int n_words = ps_search_n_words(ngs);
133 for (i = w = 0; w < n_words; ++w) {
134 if (!dict_is_single_phone(ps_search_dict(ngs), w))
149 n_words = ps_search_n_words(ngs);
151 ngs->expand_word_flag = bitvec_alloc(n_words);
152 ngs->expand_word_list = ckd_calloc(n_words + 1,
sizeof(*ngs->expand_word_list));
154 ngs->min_ef_width = cmd_ln_int32_r(ps_search_config(ngs),
"-fwdflatefwid");
155 ngs->max_sf_win = cmd_ln_int32_r(ps_search_config(ngs),
"-fwdflatsfwin");
156 E_INFO(
"fwdflat: min_ef_width = %d, max_sf_win = %d\n",
157 ngs->min_ef_width, ngs->max_sf_win);
162 ngram_fwdflat_expand_all(ngs);
164 ngram_fwdflat_allocate_1ph(ngs);
171 double n_speech = (double)ngs->n_tot_frame
172 / cmd_ln_int32_r(ps_search_config(ngs),
"-frate");
174 E_INFO(
"TOTAL fwdflat %.2f CPU %.3f xRT\n",
175 ngs->fwdflat_perf.t_tot_cpu,
176 ngs->fwdflat_perf.t_tot_cpu / n_speech);
177 E_INFO(
"TOTAL fwdflat %.2f wall %.3f xRT\n",
178 ngs->fwdflat_perf.t_tot_elapsed,
179 ngs->fwdflat_perf.t_tot_elapsed / n_speech);
183 ngram_fwdflat_free_1ph(ngs);
186 bitvec_free(ngs->expand_word_flag);
187 ckd_free(ngs->expand_word_list);
198 ckd_free(ngs->expand_word_list);
199 bitvec_free(ngs->expand_word_flag);
200 n_words = ps_search_n_words(ngs);
202 ngs->expand_word_flag = bitvec_alloc(n_words);
203 ngs->expand_word_list = ckd_calloc(n_words + 1,
sizeof(*ngs->expand_word_list));
208 ngram_fwdflat_free_1ph(ngs);
214 ngram_fwdflat_expand_all(ngs);
216 ngram_fwdflat_allocate_1ph(ngs);
229 int32 i, f, sf, ef, wid, nwd;
241 for (i = 0, bp = ngs->bp_table; i < ngs->bpidx; i++, bp++) {
242 sf = (bp->
bp < 0) ? 0 : ngs->bp_table[bp->
bp].
frame + 1;
248 if (!ngram_model_set_known_wid(ngs->
lmset,
249 dict_basewid(ps_search_dict(ngs), wid)))
263 node->
fef = node->
lef = ef;
271 for (f = 0; f < ngs->
n_frame; f++) {
273 for (node = ngs->
frm_wordlist[f]; node; node = nextnode) {
274 nextnode = node->
next;
276 if ((node->
lef - node->
fef < ngs->min_ef_width) ||
278 ((node->
wid == ps_search_finish_wid(ngs)) && (node->
lef < ngs->
n_frame - 1))) {
282 prevnode->
next = nextnode;
292 bitvec_clear_all(ngs->
word_active, ps_search_n_words(ngs));
293 for (f = 0; f < ngs->
n_frame; f++) {
302 E_INFO(
"Utterance vocabulary contains %d words\n", nwd);
317 dict = ps_search_dict(ngs);
318 d2p = ps_search_dict2pid(ngs);
325 if (dict_is_single_phone(dict, wid))
334 rhmm->
ci2phone = dict_second_phone(dict, wid);
335 rhmm->
ciphone = dict_first_phone(dict, wid);
338 bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, rhmm->
ciphone),
339 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, rhmm->
ciphone));
343 for (p = 1; p < dict_pronlen(dict, wid) - 1; p++) {
346 hmm->info.
rc_id = (p == dict_pronlen(dict, wid) - 1) ? 0 : -1;
350 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, hmm->
ciphone));
379 ptmr_reset(&ngs->fwdflat_perf);
380 ptmr_start(&ngs->fwdflat_perf);
381 build_fwdflat_wordlist(ngs);
382 build_fwdflat_chan(ngs);
387 for (i = 0; i < ps_search_n_words(ngs); i++)
388 ngs->word_lat_idx[i] = NO_BP;
405 ngs->renormalized = FALSE;
407 for (i = 0; i < ps_search_n_words(ngs); i++)
408 ngs->last_ltrans[i].sf = -1;
413 ngs->
st.n_fwdflat_chan = 0;
414 ngs->
st.n_fwdflat_words = 0;
415 ngs->
st.n_fwdflat_word_transition = 0;
416 ngs->
st.n_senone_active_utt = 0;
432 for (i = 0; i < nw; i++) {
435 if (hmm_frame(&rhmm->
hmm) == frame_idx) {
439 for (hmm = rhmm->
next; hmm; hmm = hmm->
next) {
440 if (hmm_frame(&hmm->
hmm) == frame_idx) {
450 int32 i, w, nw, bestscore;
459 ngs->
st.n_fwdflat_words += nw;
462 for (i = 0; i < nw; i++) {
465 if (hmm_frame(&rhmm->
hmm) == frame_idx) {
466 int32 score = chan_v_eval(rhmm);
467 if ((score
BETTER_THAN bestscore) && (w != ps_search_finish_wid(ngs)))
469 ngs->
st.n_fwdflat_chan++;
472 for (hmm = rhmm->
next; hmm; hmm = hmm->
next) {
473 if (hmm_frame(&hmm->
hmm) == frame_idx) {
474 int32 score = chan_v_eval(hmm);
477 ngs->
st.n_fwdflat_chan++;
488 int32 i, nw, cf, nf, w, pip, newscore, thresh, wordthresh;
497 bitvec_clear_all(ngs->
word_active, ps_search_n_words(ngs));
500 wordthresh = ngs->
best_score + ngs->fwdflatwbeam;
502 E_DEBUG(3,(
"frame %d thresh %d wordthresh %d\n", frame_idx, thresh, wordthresh));
505 for (i = 0; i < nw; i++) {
509 if (hmm_frame(&rhmm->
hmm) == cf
511 hmm_frame(&rhmm->
hmm) = nf;
515 newscore = hmm_out_score(&rhmm->
hmm);
517 assert(!dict_is_single_phone(ps_search_dict(ngs), w));
523 if (hmm->info.
rc_id >= 0) {
524 for (; hmm; hmm = hmm->
next) {
525 if ((hmm_frame(&hmm->
hmm) < cf)
528 hmm_out_history(&rhmm->
hmm), nf);
534 if ((hmm_frame(&hmm->
hmm) < cf)
537 hmm_out_history(&rhmm->
hmm), nf);
543 assert(dict_is_single_phone(ps_search_dict(ngs), w));
551 hmm_out_history(&rhmm->
hmm), 0);
557 for (hmm = rhmm->
next; hmm; hmm = hmm->
next) {
558 if (hmm_frame(&hmm->
hmm) >= cf) {
561 hmm_frame(&hmm->
hmm) = nf;
564 newscore = hmm_out_score(&hmm->
hmm);
566 if (hmm->info.
rc_id < 0) {
571 if (nexthmm->info.
rc_id >= 0) {
572 for (; nexthmm; nexthmm = nexthmm->
next) {
573 if ((hmm_frame(&nexthmm->
hmm) < cf)
575 hmm_in_score(&nexthmm->
hmm))) {
578 hmm_out_history(&hmm->
hmm),
585 if ((hmm_frame(&nexthmm->
hmm) < cf)
587 hmm_in_score(&nexthmm->
hmm))) {
589 hmm_out_history(&hmm->
hmm), nf);
598 hmm_out_history(&hmm->
hmm),
604 else if (hmm_frame(&hmm->
hmm) != nf) {
619 ngs->
st.n_fwdflat_word_transition += ngs->n_expand_words;
630 bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs));
631 ngs->n_expand_words = 0;
633 for (f = sf; f < ef; f++) {
635 if (!bitvec_is_set(ngs->expand_word_flag, node->
wid)) {
636 ngs->expand_word_list[ngs->n_expand_words++] = node->
wid;
637 bitvec_set(ngs->expand_word_flag, node->
wid);
641 ngs->expand_word_list[ngs->n_expand_words] = -1;
642 ngs->
st.n_fwdflat_word_transition += ngs->n_expand_words;
648 int32 cf, nf, b, thresh, pip, i, nw, w, newscore;
649 int32 best_silrc_score = 0, best_silrc_bp = 0;
655 dict_t *dict = ps_search_dict(ngs);
663 lwf = ngs->fwdflat_fwdtree_lw_ratio;
667 get_expand_wordlist(ngs, cf, ngs->max_sf_win);
670 for (b = ngs->bp_table_idx[cf]; b < ngs->bpidx; b++) {
674 bp = ngs->bp_table + b;
675 ngs->word_lat_idx[bp->
wid] = NO_BP;
677 if (bp->
wid == ps_search_finish_wid(ngs))
683 rcss = ngs->bscore_stack + bp->
s_idx;
690 for (i = 0; ngs->expand_word_list[i] >= 0; i++) {
693 w = ngs->expand_word_list[i];
698 newscore = rcss[rssid->
cimap[dict_first_phone(dict, w)]];
700 newscore = bp->
score;
705 * (ngram_tg_score(ngs->
lmset,
706 dict_basewid(dict, w),
715 if ((hmm_frame(&rhmm->
hmm) < cf)
720 hmm_mpx_ssid(&rhmm->
hmm, 0) =
722 dict_last_phone(dict, bp->
wid));
723 assert(IS_S3SSID(hmm_mpx_ssid(&rhmm->
hmm, 0)));
724 E_DEBUG(6,(
"ssid %d(%d,%d) = %d\n",
726 hmm_mpx_ssid(&rhmm->
hmm, 0)));
734 silscore = rcss[rssid->
cimap[ps_search_acmod(ngs)->mdef->sil]];
736 silscore = bp->
score;
738 best_silrc_score = silscore;
744 newscore = best_silrc_score + ngs->silpen + pip;
746 w = ps_search_silence_wid(ngs);
748 if ((hmm_frame(&rhmm->
hmm) < cf)
756 newscore = best_silrc_score + ngs->fillpen + pip;
758 for (w = dict_filler_start(dict); w <= dict_filler_end(dict); w++) {
759 if (w == ps_search_silence_wid(ngs))
766 if ((hmm_frame(&rhmm->
hmm) < cf)
778 for (i = 0; i < nw; i++) {
781 if (hmm_frame(&rhmm->
hmm) == cf) {
788 fwdflat_renormalize_scores(
ngram_search_t *ngs,
int frame_idx, int32 norm)
792 int32 i, nw, cf, w, *awl;
799 for (i = 0; i < nw; i++) {
802 if (hmm_frame(&rhmm->
hmm) == cf) {
805 for (hmm = rhmm->
next; hmm; hmm = hmm->
next) {
806 if (hmm_frame(&hmm->
hmm) == cf) {
812 ngs->renormalized = TRUE;
823 if (!ps_search_acmod(ngs)->compallsen)
824 compute_fwdflat_sen_active(ngs, frame_idx);
827 senscr =
acmod_score(ps_search_acmod(ngs), &frame_idx);
828 ngs->
st.n_senone_active_utt += ps_search_acmod(ngs)->n_senone_active;
839 E_INFO(
"Renormalizing Scores at frame %d, best score %d\n",
841 fwdflat_renormalize_scores(ngs, frame_idx, ngs->
best_score);
848 fwdflat_eval_chan(ngs, frame_idx);
850 fwdflat_prune_chan(ngs, frame_idx);
852 fwdflat_word_transition(ngs, frame_idx);
859 if (bitvec_is_set(ngs->
word_active, wid) && wid < ps_search_start_wid(ngs)) {
865 for (i = ps_search_start_wid(ngs); i < ps_search_n_words(ngs); i++) {
891 for (f = 0; f < ngs->
n_frame; f++) {
911 if (dict_is_single_phone(ps_search_dict(ngs),wid))
933 destroy_fwdflat_chan(ngs);
934 destroy_fwdflat_wordlist(ngs);
935 bitvec_clear_all(ngs->
word_active, ps_search_n_words(ngs));
938 cf = ps_search_acmod(ngs)->output_frame;
942 ptmr_stop(&ngs->fwdflat_perf);
945 double n_speech = (double)(cf + 1)
946 / cmd_ln_int32_r(ps_search_config(ngs),
"-frate");
947 E_INFO(
"%8d words recognized (%d/fr)\n",
948 ngs->bpidx, (ngs->bpidx + (cf >> 1)) / (cf + 1));
949 E_INFO(
"%8d senones evaluated (%d/fr)\n", ngs->
st.n_senone_active_utt,
950 (ngs->
st.n_senone_active_utt + (cf >> 1)) / (cf + 1));
951 E_INFO(
"%8d channels searched (%d/fr)\n",
952 ngs->
st.n_fwdflat_chan, ngs->
st.n_fwdflat_chan / (cf + 1));
953 E_INFO(
"%8d words searched (%d/fr)\n",
954 ngs->
st.n_fwdflat_words, ngs->
st.n_fwdflat_words / (cf + 1));
955 E_INFO(
"%8d word transitions (%d/fr)\n",
956 ngs->
st.n_fwdflat_word_transition,
957 ngs->
st.n_fwdflat_word_transition / (cf + 1));
958 E_INFO(
"fwdflat %.2f CPU %.3f xRT\n",
959 ngs->fwdflat_perf.t_cpu,
960 ngs->fwdflat_perf.t_cpu / n_speech);
961 E_INFO(
"fwdflat %.2f wall %.3f xRT\n",
962 ngs->fwdflat_perf.t_elapsed,
963 ngs->fwdflat_perf.t_elapsed / n_speech);
hmm_t hmm
Basic HMM structure.
int32 n_frame_alloc
Number of frames allocated in bp_table_idx and friends.
void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
void hmm_init(hmm_context_t *ctx, hmm_t *hmm, int mpx, int ssid, int tmatid)
Populate a previously-allocated HMM structure, allocating internal data.
chan_t * next
first descendant of this channel
listelem_alloc_t * chan_alloc
For chan_t.
frame_idx_t frame
start or end frame
hmm_context_t * hmmctx
HMM context.
void hmm_deinit(hmm_t *hmm)
Free an HMM structure, releasing internal data (but not the HMM structure itself).
int16 last2_phone
next-to-last phone of this word
void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm)
Activate senones associated with an HMM.
int ngram_fwdflat_reinit(ngram_search_t *ngs)
Rebuild search structures for updated language models.
Word graph search implementation.
bitvec_t * word_active
array of active flags for all words.
void ngram_fwdflat_finish(ngram_search_t *ngs)
Finish fwdflat decoding for an utterance.
int ngram_fwdflat_search(ngram_search_t *ngs, int frame_idx)
Search one frame forward in an utterance.
int16 ciphone
first ciphone of this node; all words rooted at this node begin with this ciphone ...
int32 ** active_word_list
Array of active multi-phone words for current and next frame.
struct chan_s * next
first descendant of this channel; or, in the case of the last phone of a word, the next alternative r...
void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w, int32 score, int32 path, int32 rc)
Enter a word in the backpointer table.
int32 * single_phone_wid
list of single-phone word ids
int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx)
Record the current frame's index in the backpointer table.
int16 ci2phone
second ciphone of this node; one root HMM for each unique right context
int32 n_active_word[2]
Number of entries in active_word_list.
int32 rc_id
right-context id for last phone of words
#define dict2pid_rssid(d, ci, lc)
Access macros; not designed for arbitrary use.
void ngram_fwdflat_start(ngram_search_t *ngs)
Start fwdflat decoding for an utterance.
N-Gram search module structure.
void hmm_normalize(hmm_t *h, int32 bestscr)
Renormalize the scores in this HMM based on the given best score.
int32 real_wid
wid of this or latest predecessor real word
root_chan_t * rhmm_1ph
Root HMMs for single-phone words.
int32 prev_real_wid
wid of second-last real word
#define WORST_SCORE
Large "bad" score.
N-Gram based multi-pass search ("FBS")
listelem_alloc_t * latnode_alloc
For latnode_t.
ps_latnode_t ** frm_wordlist
List of active words in each frame.
Lexical tree node data type for the first phone (root) of each dynamic HMM tree structure.
void hmm_enter(hmm_t *h, int32 score, int32 histid, int frame)
Enter an HMM with the given path score and history ID.
Lexical tree node data type.
hmm_t hmm
Basic HMM structure.
void acmod_clear_active(acmod_t *acmod)
Clear set of active senones.
int32 wid
Dictionary word id.
#define hmm_context_set_senscore(ctx, senscr)
Change the senone score array for a context.
#define SENSCR_SHIFT
Shift count for senone scores.
a structure for a dictionary.
#define WORSE_THAN
Is one score worse than another?
s3ssid_t dict2pid_internal(dict2pid_t *d2p, int32 wid, int pos)
Return the senone sequence ID for the given word position.
int32 best_score
Best Viterbi path score.
void hmm_clear(hmm_t *h)
Reset the states of the HMM to the invalid condition.
Back pointer table (forward pass lattice; actually a tree)
cross word triphone model structure
void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w)
Free last phone channels for all possible right contexts for word w.
#define BETTER_THAN
Is one score better than another?
void ngram_fwdflat_deinit(ngram_search_t *ngs)
Release memory associated with fwdflat decoding.
int32 s_idx
Start of BScoreStack for various right contexts.
int32 fef
First end frame.
int32 n_frame
Number of frames actually present.
ngram_model_t * lmset
Set of language models.
int32 n_1ph_words
Number of single-phone words in dict (total)
listelem_alloc_t * root_chan_alloc
For root_chan_t.
void hmm_clear_scores(hmm_t *h)
Reset the scores of the HMM.
int32 ciphone
ciphone for this node
void ngram_fwdflat_init(ngram_search_t *ngs)
Initialize N-Gram search for fwdflat decoding.
ngram_search_stats_t st
Various statistics for profiling.
chan_t ** word_chan
Channels associated with a given word (only used for right contexts, single-phone words in fwdtree se...
int32 * fwdflat_wordlist
List of active word IDs for utterance.
struct ps_latnode_s * next
Next node in DAG (no ordering implied)
int32 score
Score (best among all right contexts)
#define dict_size(d)
Packaged macro access to dictionary members.
s3cipid_t * cimap
Index into ssid[] above for each ci phone.
#define dict_pron(d, w, p)
The CI phones of the word w at position p.
Building composite triphone (as well as word internal triphones) with the dictionary.
int16 const * acmod_score(acmod_t *acmod, int *inout_frame_idx)
Score one frame of data.
int16 last_phone
last phone of this word