SphinxBase 5prealpha
ngram_model.c
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/*
 * \file ngram_model.c N-Gram language models.
 *
 * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <string.h>
#include <assert.h>

#include "sphinxbase/ngram_model.h"
#include "sphinxbase/ckd_alloc.h"
#include "sphinxbase/filename.h"
#include "sphinxbase/pio.h"
#include "sphinxbase/err.h"
#include "sphinxbase/logmath.h"
#include "sphinxbase/strfuncs.h"
#include "sphinxbase/case.h"

#include "ngram_model_internal.h"
#include "ngram_model_trie.h"

ngram_file_type_t
ngram_file_name_to_type(const char *file_name)
{
    const char *ext;

    ext = strrchr(file_name, '.');
    if (ext == NULL) {
        return NGRAM_INVALID;
    }
    if (0 == strcmp_nocase(ext, ".gz")) {
        /* Skip a trailing .gz and look for the extension before it. */
        while (--ext >= file_name) {
            if (*ext == '.')
                break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    else if (0 == strcmp_nocase(ext, ".bz2")) {
        /* Likewise, skip a trailing .bz2. */
        while (--ext >= file_name) {
            if (*ext == '.')
                break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    /* We use strncmp because there might be a .gz on the end. */
    if (0 == strncmp_nocase(ext, ".ARPA", 5))
        return NGRAM_ARPA;
    if (0 == strncmp_nocase(ext, ".DMP", 4)
        || 0 == strncmp_nocase(ext, ".BIN", 4))
        return NGRAM_BIN;
    return NGRAM_INVALID;
}

ngram_file_type_t
ngram_str_to_type(const char *str_name)
{
    if (0 == strcmp_nocase(str_name, "arpa"))
        return NGRAM_ARPA;
    if (0 == strcmp_nocase(str_name, "dmp")
        || 0 == strcmp_nocase(str_name, "bin"))
        return NGRAM_BIN;
    return NGRAM_INVALID;
}

char const *
ngram_type_to_str(int type)
{
    switch (type) {
    case NGRAM_ARPA:
        return "arpa";
    case NGRAM_BIN:
        return "dmp/bin";
    default:
        return NULL;
    }
}
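
/* Usage sketch (illustrative, not part of the library build): the
 * detection above skips a compressed suffix, so a trailing .gz or .bz2
 * does not hide the real extension.  File names here are hypothetical. */
#ifdef NGRAM_MODEL_EXAMPLES
static void
example_detect_file_types(void)
{
    assert(ngram_file_name_to_type("model.arpa.gz") == NGRAM_ARPA);
    assert(ngram_file_name_to_type("model.DMP") == NGRAM_BIN);
    assert(ngram_file_name_to_type("model") == NGRAM_INVALID);
    assert(ngram_str_to_type("bin") == NGRAM_BIN);
}
#endif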

ngram_model_t *
ngram_model_read(cmd_ln_t * config,
                 const char *file_name,
                 ngram_file_type_t file_type, logmath_t * lmath)
{
    ngram_model_t *model = NULL;
    switch (file_type) {
    case NGRAM_AUTO:{
            if ((model =
                 ngram_model_trie_read_bin(config, file_name,
                                           lmath)) != NULL)
                break;
            if ((model =
                 ngram_model_trie_read_arpa(config, file_name,
                                            lmath)) != NULL)
                break;
            if ((model =
                 ngram_model_trie_read_dmp(config, file_name,
                                           lmath)) != NULL)
                break;
            return NULL;
        }
    case NGRAM_ARPA:
        model = ngram_model_trie_read_arpa(config, file_name, lmath);
        break;
    case NGRAM_BIN:
        if ((model =
             ngram_model_trie_read_bin(config, file_name, lmath)) != NULL)
            break;
        if ((model =
             ngram_model_trie_read_dmp(config, file_name, lmath)) != NULL)
            break;
        return NULL;
    default:
        E_ERROR("language model file type not supported\n");
        return NULL;
    }

    /* Now set weights based on config if present. */
    if (config) {
        float32 lw = 1.0;
        float32 wip = 1.0;

        if (cmd_ln_exists_r(config, "-lw"))
            lw = cmd_ln_float32_r(config, "-lw");
        if (cmd_ln_exists_r(config, "-wip"))
            wip = cmd_ln_float32_r(config, "-wip");

        ngram_model_apply_weights(model, lw, wip);
    }

    return model;
}
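
/* Usage sketch (illustrative, not part of the library build): load a
 * model with automatic type detection and release it.  The file name,
 * logmath base, and weight values are assumptions for the example, not
 * values mandated by this code. */
#ifdef NGRAM_MODEL_EXAMPLES
static void
example_read_model(void)
{
    logmath_t *lmath = logmath_init(1.0001, 0, 0);
    ngram_model_t *model =
        ngram_model_read(NULL, "model.arpa", NGRAM_AUTO, lmath);
    if (model) {
        /* A NULL config skips the -lw/-wip block above, so apply
         * weights explicitly if they are wanted. */
        ngram_model_apply_weights(model, 9.5, 0.65);
        ngram_model_free(model);
    }
    logmath_free(lmath);
}
#endif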

int
ngram_model_write(ngram_model_t * model, const char *file_name,
                  ngram_file_type_t file_type)
{
    switch (file_type) {
    case NGRAM_AUTO:{
            file_type = ngram_file_name_to_type(file_name);
            /* Default to ARPA (catches .lm and other things) */
            if (file_type == NGRAM_INVALID)
                file_type = NGRAM_ARPA;
            return ngram_model_write(model, file_name, file_type);
        }
    case NGRAM_ARPA:
        return ngram_model_trie_write_arpa(model, file_name);
    case NGRAM_BIN:
        return ngram_model_trie_write_bin(model, file_name);
    default:
        E_ERROR("language model file type not supported\n");
        return -1;
    }
}

int32
ngram_model_init(ngram_model_t * base,
                 ngram_funcs_t * funcs,
                 logmath_t * lmath, int32 n, int32 n_unigram)
{
    base->refcount = 1;
    base->funcs = funcs;
    base->n = n;
    /* Allocate the count array unless this was previously initialized. */
    if (base->n_counts == NULL)
        base->n_counts = (uint32 *) ckd_calloc(n, sizeof(*base->n_counts));
    /* Don't reset weights if the logmath object hasn't changed. */
    if (base->lmath != lmath) {
        /* Set default values for weights. */
        base->lw = 1.0;
        base->log_wip = 0;      /* i.e. 1.0 */
        base->log_zero = logmath_get_zero(lmath);
        base->lmath = lmath;
    }
    /* Allocate or reallocate space for word strings. */
    if (base->word_str) {
        /* Free all previous word strings if they were allocated. */
        if (base->writable) {
            int32 i;
            for (i = 0; i < base->n_words; ++i) {
                ckd_free(base->word_str[i]);
                base->word_str[i] = NULL;
            }
        }
        base->word_str =
            (char **) ckd_realloc(base->word_str,
                                  n_unigram * sizeof(char *));
    }
    else {
        base->word_str = (char **) ckd_calloc(n_unigram, sizeof(char *));
    }
    /* NOTE: Lookups are no longer case-insensitive, since we are allowing
     * other encodings for word strings.  Beware. */
    if (base->wid)
        hash_table_empty(base->wid);
    else
        base->wid = hash_table_new(n_unigram, FALSE);
    base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;

    return 0;
}

ngram_model_t *
ngram_model_retain(ngram_model_t * model)
{
    ++model->refcount;
    return model;
}

void
ngram_model_flush(ngram_model_t * model)
{
    if (model->funcs && model->funcs->flush)
        (*model->funcs->flush) (model);
}

int
ngram_model_free(ngram_model_t * model)
{
    int i;

    if (model == NULL)
        return 0;
    if (--model->refcount > 0)
        return model->refcount;
    if (model->funcs && model->funcs->free)
        (*model->funcs->free) (model);
    if (model->writable) {
        /* Free all words. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Free all class words. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
                }
            }
        }
    }
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}

int
ngram_model_casefold(ngram_model_t * model, int kase)
{
    int writable, i;
    hash_table_t *new_wid;

    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;

    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        char *outstr;
        if (writable) {
            outstr = model->word_str[i];
        }
        else {
            outstr = ckd_salloc(model->word_str[i]);
        }
        /* Don't case-fold <tags> or [classes] */
        if (outstr[0] != '<' && outstr[0] != '[') {
            switch (kase) {
            case NGRAM_UPPER:
                ucase(outstr);
                break;
            case NGRAM_LOWER:
                lcase(outstr);
                break;
            default:
                ;
            }
        }
        model->word_str[i] = outstr;

        /* Now update the hash table.  We might have terrible
         * collisions here, so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;
    return 0;
}
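
/* Usage sketch (illustrative, not part of the library build): after
 * folding, lookups must use the folded spelling.  The word here is
 * hypothetical. */
#ifdef NGRAM_MODEL_EXAMPLES
static int32
example_casefold_lookup(ngram_model_t *model)
{
    ngram_model_casefold(model, NGRAM_LOWER);
    return ngram_wid(model, "hello");
}
#endif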

int
ngram_model_apply_weights(ngram_model_t * model, float32 lw, float32 wip)
{
    return (*model->funcs->apply_weights) (model, lw, wip);
}

float32
ngram_model_get_weights(ngram_model_t * model, int32 * out_log_wip)
{
    if (out_log_wip)
        *out_log_wip = model->log_wip;
    return model->lw;
}

int32
ngram_ng_score(ngram_model_t * model, int32 wid, int32 * history,
               int32 n_hist, int32 * n_used)
{
    int32 score, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1)  /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID
            && NGRAM_IS_CLASSWID(history[i]))
            history[i] =
                model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score) (model, wid, history, n_hist, n_used);

    /* Multiply by unigram in-class weight. */
    return score + class_weight;
}

int32
ngram_score(ngram_model_t * model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_score(model, ngram_wid(model, word),
                          histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}
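
/* Usage sketch (illustrative, not part of the library build): the
 * NULL-terminated history runs from the most recent word backwards, as
 * the ngram_tg_score() wrapper below shows.  So for the trigram
 * P(three | one two), the history is "two" then "one".  The words are
 * hypothetical. */
#ifdef NGRAM_MODEL_EXAMPLES
static int32
example_trigram_score(ngram_model_t *model)
{
    return ngram_score(model, "three", "two", "one", NULL);
}
#endif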

int32
ngram_tg_score(ngram_model_t * model, int32 w3, int32 w2, int32 w1,
               int32 * n_used)
{
    int32 hist[2];
    hist[0] = w2;
    hist[1] = w1;
    return ngram_ng_score(model, w3, hist, 2, n_used);
}

int32
ngram_bg_score(ngram_model_t * model, int32 w2, int32 w1, int32 * n_used)
{
    return ngram_ng_score(model, w2, &w1, 1, n_used);
}

int32
ngram_ng_prob(ngram_model_t * model, int32 wid, int32 * history,
              int32 n_hist, int32 * n_used)
{
    int32 prob, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1)  /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID
            && NGRAM_IS_CLASSWID(history[i]))
            history[i] =
                model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    prob = (*model->funcs->raw_score) (model, wid, history,
                                       n_hist, n_used);
    /* Multiply by unigram in-class weight. */
    return prob + class_weight;
}

int32
ngram_probv(ngram_model_t * model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_prob(model, ngram_wid(model, word),
                         histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

int32
ngram_prob(ngram_model_t * model, const char *const *words, int32 n)
{
    int32 *ctx_id;
    int32 nused;
    int32 prob;
    int32 wid;
    uint32 i;

    ctx_id = (int32 *) ckd_calloc(n - 1, sizeof(*ctx_id));
    for (i = 1; i < (uint32) n; ++i)
        ctx_id[i - 1] = ngram_wid(model, words[i]);

    wid = ngram_wid(model, *words);
    prob = ngram_ng_prob(model, wid, ctx_id, n - 1, &nused);
    ckd_free(ctx_id);

    return prob;
}

int32
ngram_score_to_prob(ngram_model_t * base, int32 score)
{
    int32 prob;

    /* Undo insertion penalty. */
    prob = score - base->log_wip;
    /* Undo language weight. */
    prob = (int32) (prob / base->lw);

    return prob;
}
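
/* Sketch of the algebra above: ngram_score_to_prob() assumes the
 * weighting was score = lw * logprob + log_wip, so it recovers
 * logprob = (score - log_wip) / lw (with integer truncation).  The
 * same round trip written against the public accessor: */
#ifdef NGRAM_MODEL_EXAMPLES
static int32
example_unweight(ngram_model_t *model, int32 score)
{
    int32 log_wip;
    float32 lw = ngram_model_get_weights(model, &log_wip);
    return (int32) ((score - log_wip) / lw);
}
#endif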

int32
ngram_unknown_wid(ngram_model_t * model)
{
    int32 val;

    /* FIXME: This could be memoized for speed if necessary. */
    /* Look up <UNK>; if not found, return NGRAM_INVALID_WID. */
    if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
        return NGRAM_INVALID_WID;
    else
        return val;
}

int32
ngram_zero(ngram_model_t * model)
{
    return model->log_zero;
}

int32
ngram_model_get_size(ngram_model_t * model)
{
    if (model != NULL)
        return model->n;
    return 0;
}

uint32 const *
ngram_model_get_counts(ngram_model_t * model)
{
    if (model != NULL)
        return model->n_counts;
    return NULL;
}

int32
ngram_wid(ngram_model_t * model, const char *word)
{
    int32 val;

    if (hash_table_lookup_int32(model->wid, word, &val) == -1)
        return ngram_unknown_wid(model);
    else
        return val;
}

const char *
ngram_word(ngram_model_t * model, int32 wid)
{
    /* Remove any class tag */
    wid = NGRAM_BASEWID(wid);
    if (wid >= model->n_words)
        return NULL;
    return model->word_str[wid];
}

/**
 * Add a word to the word string and ID mapping.
 */
int32
ngram_add_word_internal(ngram_model_t * model,
                        const char *word, int32 classid)
{

    /* Check for hash collisions. */
    int32 wid;
    if (hash_table_lookup_int32(model->wid, word, &wid) == 0) {
        E_WARN("Omitting duplicate word '%s'\n", word);
        return wid;
    }

    /* Take the next available word ID */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }

    /* Reallocate word_str if necessary. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) *
                                      model->n_1g_alloc);
    }
    /* Add the word string in the appropriate manner. */
    /* Class words are always dynamically allocated. */
    model->word_str[model->n_words] = ckd_salloc(word);
    /* Now enter it into the hash table. */
    if (hash_table_enter_int32
        (model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR
            ("Hash insertion failed for word %s => %p (should not happen)\n",
             model->word_str[model->n_words], (void *) (long) (wid));
    }
    /* Increment number of words. */
    ++model->n_words;
    return wid;
}

int32
ngram_model_add_word(ngram_model_t * model,
                     const char *word, float32 weight)
{
    int32 wid, prob = model->log_zero;

    /* Adding a word requires writable word strings. */
    if (!model->writable) {
        E_WARN("Can't add word '%s' to read-only language model. "
               "Disable mmap with '-mmap no' to make it writable\n", word);
        return -1;
    }

    wid = ngram_add_word_internal(model, word, -1);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* Do what needs to be done to add the word to the unigram. */
    if (model->funcs && model->funcs->add_ug)
        prob =
            (*model->funcs->add_ug) (model, wid,
                                     logmath_log(model->lmath, weight));
    if (prob == 0)
        return -1;

    return wid;
}
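
/* Usage sketch (illustrative, not part of the library build): add an
 * out-of-vocabulary word with a small unigram weight.  The model must
 * have writable word strings (e.g. loaded with mmap disabled); the
 * word and weight are hypothetical. */
#ifdef NGRAM_MODEL_EXAMPLES
static int32
example_add_word(ngram_model_t *model)
{
    return ngram_model_add_word(model, "cepstral", 0.0001f);
}
#endif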

ngram_class_t *
ngram_class_new(ngram_model_t * model, int32 tag_wid, int32 start_wid,
                glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;
    /* start_wid is the wid (minus class tag) of the first word in the list. */
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    if (tprob > 1.1 || tprob < 0.9) {
        E_INFO("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;
        }
    }
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}

int32
ngram_class_add_word(ngram_class_t * lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Initialize everything in it to -1 */
        lmclass->nword_hash =
            ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff,
               NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }
    /* Simplest possible hash function.  This will work pretty well
     * when this function is called repeatedly with contiguous word
     * IDs, though... */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Good, no collision. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next;
        /* Collision... Find the end of the hash chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);
        /* Are any more buckets available? */
        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* No, so double the bucket array. */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 *
                                              sizeof(*lmclass->nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash, 0xff,
                   lmclass->n_hash * sizeof(*lmclass->nword_hash));
            /* Just use the next allocated one (easy) */
            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Look for any available bucket.  We hope this doesn't happen. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            /* This should absolutely not happen. */
            assert(next != lmclass->n_hash);
        }
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}

void
ngram_class_free(ngram_class_t * lmclass)
{
    ckd_free(lmclass->nword_hash);
    ckd_free(lmclass->prob1);
    ckd_free(lmclass);
}

int32
ngram_model_add_class_word(ngram_model_t * model,
                           const char *classname,
                           const char *word, float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* Find the class corresponding to classname.  Linear search is
     * probably okay here since there won't be very many classes, and
     * this doesn't have to be fast. */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    /* Hmm, no such class.  It's probably not a good idea to create one. */
    if (classid == model->n_classes) {
        E_ERROR
            ("Word %s is not a class tag (call ngram_model_add_class() first)\n",
             classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Add this word to the model's set of words. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* This is the fixed probability of the new word. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
    /* Now normalize everything else to fit it in.  This is
     * accomplished by simply scaling all the other probabilities
     * by (1-fprob). */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    /* Now add it to the class hash table. */
    return ngram_class_add_word(lmclass, wid,
                                logmath_log(model->lmath, fprob));
}

int32
ngram_model_add_class(ngram_model_t * model,
                      const char *classname,
                      float32 classweight,
                      char **words, const float32 * weights, int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname already exists in the model.  If not, add it. */
    if ((tag_wid =
         ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes *
                                     sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}
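
/* Usage sketch (illustrative, not part of the library build): define a
 * three-word class under a hypothetical "[city]" tag.  The weights are
 * relative in-class probabilities; ngram_class_new() renormalizes them
 * when their total drifts from 1. */
#ifdef NGRAM_MODEL_EXAMPLES
static int32
example_add_city_class(ngram_model_t *model)
{
    char *words[] = { "boston", "seattle", "pittsburgh" };
    float32 weights[] = { 0.4f, 0.4f, 0.2f };
    return ngram_model_add_class(model, "[city]", 1.0f, words, weights, 3);
}
#endif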

int32
ngram_class_prob(ngram_class_t * lmclass, int32 wid)
{
    int32 base_wid = NGRAM_BASEWID(wid);

    if (base_wid < lmclass->start_wid
        || base_wid >= lmclass->start_wid + lmclass->n_words) {
        int32 hash;

        /* Look it up in the hash table. */
        hash = wid & (lmclass->n_hash - 1);
        while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
            hash = lmclass->nword_hash[hash].next;
        if (hash == -1)
            return 1;
        return lmclass->nword_hash[hash].prob1;
    }
    else {
        return lmclass->prob1[base_wid - lmclass->start_wid];
    }
}
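
/* Sketch (illustrative, not part of the library build): words added to
 * a class after creation live in nword_hash, chained through .next
 * exactly as ngram_class_prob() walks them above.  This helper counts
 * the chain hops for a word ID, returning -1 if it is absent. */
#ifdef NGRAM_MODEL_EXAMPLES
static int32
example_class_chain_hops(ngram_class_t *lmclass, int32 wid)
{
    int32 hash, hops = 0;
    if (lmclass->n_hash == 0)
        return -1;
    hash = wid & (lmclass->n_hash - 1);
    while (hash != -1 && lmclass->nword_hash[hash].wid != wid) {
        hash = lmclass->nword_hash[hash].next;
        ++hops;
    }
    return (hash == -1) ? -1 : hops;
}
#endif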

int32
read_classdef_file(hash_table_t * classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* Look for an end of class marker. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Construct a class from the list of words collected. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords);
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }

                /* Add this class to the hash table. */
                if (hash_table_enter(classes, classname, classdef) !=
                    classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }

                /* Reset everything. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;
                classname = NULL;
            }
            else {
                float32 fprob;

                if (n_words == 2)
                    fprob = atof_c(wptr[1]);
                else
                    fprob = 1.0f;
                /* Add it to the list of words for this class. */
                classwords =
                    glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* Start a new LM class if the LMCLASS marker is seen */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                if (inclass)
                    goto error_out;
                inclass = TRUE;
                classname = ckd_salloc(wptr[1]);
            }
            /* Otherwise, just ignore whatever junk we got */
        }
    }
    rv = 0;                     /* Success. */

  error_out:
    /* Free all the stuff we might have allocated. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}
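
/* Example of the classdef syntax accepted above (hypothetical class
 * and words); the probability column is optional and defaults to 1.0:
 *
 *     LMCLASS [city]
 *     boston 0.3
 *     seattle 0.3
 *     pittsburgh 0.4
 *     END [city]
 */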

void
classdef_free(classdef_t * classdef)
{
    int32 i;
    for (i = 0; i < classdef->n_words; ++i)
        ckd_free(classdef->words[i]);
    ckd_free(classdef->words);
    ckd_free(classdef->weights);
    ckd_free(classdef);
}

int32
ngram_model_read_classdef(ngram_model_t * model, const char *file_name)
{
    hash_table_t *classes;
    glist_t hl = NULL;
    gnode_t *gn;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    if (read_classdef_file(classes, file_name) < 0) {
        hash_table_free(classes);
        return -1;
    }

    /* Create a new class in the language model for each classdef. */
    hl = hash_table_tolist(classes, NULL);
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

  error_out:
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        ckd_free((char *) he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}