SphinxBase  5prealpha
ngram_model_trie.c
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2015 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
37 
38 #include <string.h>
39 #include <assert.h>
40 
41 #include <sphinxbase/err.h>
42 #include <sphinxbase/pio.h>
43 #include <sphinxbase/strfuncs.h>
44 #include <sphinxbase/ckd_alloc.h>
45 #include <sphinxbase/byteorder.h>
46 
47 #include "ngram_model_trie.h"
48 
49 static const char trie_hdr[] = "Trie Language Model";
50 static const char dmp_hdr[] = "Darpa Trigram LM";
51 static ngram_funcs_t ngram_model_trie_funcs;
52 
53 /*
54  * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
55  */
56 static int
57 read_counts_arpa(lineiter_t ** li, uint32 * counts, int *order)
58 {
59  int32 ngram, prev_ngram;
60  uint32 ngram_cnt;
61 
62  /* skip file until past the '\data\' marker */
63  while (*li) {
64  if (strcmp((*li)->buf, "\\data\\") == 0)
65  break;
66  *li = lineiter_next(*li);
67  }
68 
69  if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
70  E_INFO("No \\data\\ mark in LM file\n");
71  return -1;
72  }
73 
74  prev_ngram = 0;
75  *order = 0;
76  while ((*li = lineiter_next(*li))) {
77  if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
78  break;
79  if (ngram != prev_ngram + 1) {
80  E_ERROR
81  ("Ngram counts in LM file is not in order. %d goes after %d\n",
82  ngram, prev_ngram);
83  return -1;
84  }
85  prev_ngram = ngram;
86  counts[*order] = ngram_cnt;
87  (*order)++;
88  }
89 
90  if (*li == NULL) {
91  E_ERROR("EOF while reading ngram counts\n");
92  return -1;
93  }
94 
95  return 0;
96 }
97 
98 static int
99 read_1grams_arpa(lineiter_t ** li, uint32 count, ngram_model_t * base,
100  unigram_t * unigrams)
101 {
102  uint32 i;
103  int n;
104  int n_parts;
105  char *wptr[3];
106 
107  while (*li && strcmp((*li)->buf, "\\1-grams:") != 0) {
108  *li = lineiter_next(*li);
109  }
110  if (*li == NULL) {
111  E_ERROR_SYSTEM("Failed to read \\1-grams: mark");
112  return -1;
113  }
114 
115  n_parts = 2;
116  for (i = 0; i < count; i++) {
117  unigram_t *unigram;
118 
119  *li = lineiter_next(*li);
120  if (*li == NULL) {
121  E_ERROR
122  ("Unexpected end of ARPA file. Failed to read %dth unigram\n",
123  i + 1);
124  return -1;
125  }
126  if ((n = str2words((*li)->buf, wptr, 3)) < n_parts) {
127  E_ERROR("Format error at line %s, Failed to read unigrams\n", (*li)->buf);
128  return -1;
129  }
130 
131  unigram = &unigrams[i];
132  unigram->prob =
133  logmath_log10_to_log_float(base->lmath, atof_c(wptr[0]));
134  if (unigram->prob > 0) {
135  E_WARN("Unigram '%s' has positive probability\n", wptr[1]);
136  unigram->prob = 0;
137  }
138  if (n == n_parts + 1) {
139  unigram->bo =
141  atof_c(wptr[2]));
142  }
143  else {
144  unigram->bo = 0.0f;
145  }
146 
147  /* TODO: classify float with fpclassify and warn if bad value occurred */
148  base->word_str[i] = ckd_salloc(wptr[1]);
149  }
150 
151  /* fill hash-table that maps unigram names to their word ids */
152  for (i = 0; i < count; i++) {
153  if ((hash_table_enter
154  (base->wid, base->word_str[i],
155  (void *) (long) i)) != (void *) (long) i) {
156  E_WARN("Duplicate word in dictionary: %s\n",
157  base->word_str[i]);
158  }
159  }
160  return 0;
161 }
162 
164 ngram_model_trie_read_arpa(cmd_ln_t * config,
165  const char *path, logmath_t * lmath)
166 {
167  FILE *fp;
168  lineiter_t *li;
169  ngram_model_trie_t *model;
170  ngram_model_t *base;
171  ngram_raw_t **raw_ngrams;
172  int32 is_pipe;
173  uint32 counts[NGRAM_MAX_ORDER];
174  int order;
175  int i;
176 
177  E_INFO("Trying to read LM in arpa format\n");
178  if ((fp = fopen_comp(path, "r", &is_pipe)) == NULL) {
179  E_ERROR("File %s not found\n", path);
180  return NULL;
181  }
182 
183  model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
184  li = lineiter_start_clean(fp);
185  /* Read n-gram counts from file */
186  if (read_counts_arpa(&li, counts, &order) == -1) {
187  ckd_free(model);
188  lineiter_free(li);
189  fclose_comp(fp, is_pipe);
190  return NULL;
191  }
192 
193  E_INFO("LM of order %d\n", order);
194  for (i = 0; i < order; i++) {
195  E_INFO("#%d-grams: %d\n", i + 1, counts[i]);
196  }
197 
198  base = &model->base;
199  ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
200  (int32) counts[0]);
201  base->writable = TRUE;
202 
203  model->trie = lm_trie_create(counts[0], order);
204  if (read_1grams_arpa(&li, counts[0], base, model->trie->unigrams) < 0) {
205  ngram_model_free(base);
206  lineiter_free(li);
207  fclose_comp(fp, is_pipe);
208  return NULL;
209  }
210 
211  if (order > 1) {
212  raw_ngrams =
213  ngrams_raw_read_arpa(&li, base->lmath, counts, order,
214  base->wid);
215  if (raw_ngrams == NULL) {
216  ngram_model_free(base);
217  lineiter_free(li);
218  fclose_comp(fp, is_pipe);
219  return NULL;
220  }
221  lm_trie_build(model->trie, raw_ngrams, counts, base->n_counts, order);
222  ngrams_raw_free(raw_ngrams, counts, order);
223  }
224 
225  lineiter_free(li);
226  fclose_comp(fp, is_pipe);
227 
228  return base;
229 }
230 
231 int
232 ngram_model_trie_write_arpa(ngram_model_t * base, const char *path)
233 {
234  int i;
235  uint32 j;
236  ngram_model_trie_t *model = (ngram_model_trie_t *) base;
237  FILE *fp = fopen(path, "w");
238  if (!fp) {
239  E_ERROR("Unable to open %s to write arpa LM from trie\n", path);
240  return -1;
241  }
242  fprintf(fp,
243  "This is an ARPA-format language model file, generated by CMU Sphinx\n");
244  /* Write N-gram counts. */
245  fprintf(fp, "\\data\\\n");
246  for (i = 0; i < base->n; ++i) {
247  fprintf(fp, "ngram %d=%d\n", i + 1, base->n_counts[i]);
248  }
249  /* Write 1-grams */
250  fprintf(fp, "\n\\1-grams:\n");
251  for (j = 0; j < base->n_counts[0]; j++) {
252  unigram_t *unigram = &model->trie->unigrams[j];
253  fprintf(fp, "%.4f\t%s",
254  logmath_log_float_to_log10(base->lmath, unigram->prob),
255  base->word_str[j]);
256  if (base->n > 1) {
257  fprintf(fp, "\t%.4f",
258  logmath_log_float_to_log10(base->lmath, unigram->bo));
259  }
260  fprintf(fp, "\n");
261  }
262  /* Write ngrams */
263  if (base->n > 1) {
264  for (i = 2; i <= base->n; ++i) {
265  ngram_raw_t *raw_ngrams =
266  (ngram_raw_t *) ckd_calloc((size_t) base->n_counts[i - 1],
267  sizeof(*raw_ngrams));
268  uint32 raw_ngram_idx;
269  uint32 j;
270  uint32 hist[NGRAM_MAX_ORDER];
271  node_range_t range;
272  raw_ngram_idx = 0;
273  range.begin = range.end = 0;
274 
275  /* we need to iterate over a trie here. recursion should do the job */
276  lm_trie_fill_raw_ngram(model->trie, raw_ngrams,
277  &raw_ngram_idx, base->n_counts, range, hist, 0,
278  i, base->n);
279  assert(raw_ngram_idx == base->n_counts[i - 1]);
280  qsort(raw_ngrams, (size_t) base->n_counts[i - 1],
281  sizeof(ngram_raw_t), &ngram_ord_comparator);
282 
283  fprintf(fp, "\n\\%d-grams:\n", i);
284  for (j = 0; j < base->n_counts[i - 1]; j++) {
285  int k;
286  fprintf(fp, "%.4f", logmath_log_float_to_log10(base->lmath, raw_ngrams[j].prob));
287  for (k = 0; k < i; k++) {
288  fprintf(fp, "\t%s",
289  base->word_str[raw_ngrams[j].words[k]]);
290  }
291  ckd_free(raw_ngrams[j].words);
292  if (i < base->n) {
293  fprintf(fp, "\t%.4f", logmath_log_float_to_log10(base->lmath, raw_ngrams[j].backoff));
294  }
295  fprintf(fp, "\n");
296  }
297  ckd_free(raw_ngrams);
298  }
299  }
300  fprintf(fp, "\n\\end\\\n");
301  return fclose(fp);
302 }
303 
304 static void
305 read_word_str(ngram_model_t * base, FILE * fp)
306 {
307  int32 k;
308  uint32 i, j;
309  char *tmp_word_str;
310  /* read ascii word strings */
311  base->writable = TRUE;
312  fread(&k, sizeof(k), 1, fp);
313  tmp_word_str = (char *) ckd_calloc((size_t) k, 1);
314  fread(tmp_word_str, 1, (size_t) k, fp);
315 
316  /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */
317  for (i = 0, j = 0; i < (uint32) k; i++)
318  if (tmp_word_str[i] == '\0')
319  j++;
320  if (j != base->n_counts[0]) {
321  E_ERROR
322  ("Error reading word strings (%d doesn't match n_unigrams %d)\n",
323  j, base->n_counts[0]);
324  }
325 
326  /* Break up string just read into words */
327  j = 0;
328  for (i = 0; i < base->n_counts[0]; i++) {
329  base->word_str[i] = ckd_salloc(tmp_word_str + j);
330  if (hash_table_enter(base->wid, base->word_str[i],
331  (void *) (long) i) != (void *) (long) i) {
332  E_WARN("Duplicate word in dictionary: %s\n",
333  base->word_str[i]);
334  }
335  j += strlen(base->word_str[i]) + 1;
336  }
337  free(tmp_word_str);
338 }
339 
341 ngram_model_trie_read_bin(cmd_ln_t * config,
342  const char *path, logmath_t * lmath)
343 {
344  int32 is_pipe;
345  FILE *fp;
346  size_t hdr_size;
347  char *hdr;
348  int cmp_res;
349  uint8 i, order;
350  uint32 counts[NGRAM_MAX_ORDER];
351  ngram_model_trie_t *model;
352  ngram_model_t *base;
353 
354  E_INFO("Trying to read LM in trie binary format\n");
355  if ((fp = fopen_comp(path, "rb", &is_pipe)) == NULL) {
356  E_ERROR("File %s not found\n", path);
357  return NULL;
358  }
359  hdr_size = strlen(trie_hdr);
360  hdr = (char *) ckd_calloc(hdr_size + 1, sizeof(*hdr));
361  fread(hdr, sizeof(*hdr), hdr_size, fp);
362  cmp_res = strcmp(hdr, trie_hdr);
363  ckd_free(hdr);
364  if (cmp_res) {
365  E_INFO("Header doesn't match\n");
366  fclose_comp(fp, is_pipe);
367  return NULL;
368  }
369  model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
370  base = &model->base;
371  fread(&order, sizeof(order), 1, fp);
372  for (i = 0; i < order; i++) {
373  fread(&counts[i], sizeof(counts[i]), 1, fp);
374  }
375  ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
376  (int32) counts[0]);
377  for (i = 0; i < order; i++) {
378  base->n_counts[i] = counts[i];
379  }
380 
381  model->trie = lm_trie_read_bin(counts, order, fp);
382  read_word_str(base, fp);
383  fclose_comp(fp, is_pipe);
384 
385  return base;
386 }
387 
388 static void
389 write_word_str(FILE * fp, ngram_model_t * model)
390 {
391  int32 k;
392  uint32 i;
393 
394  k = 0;
395  for (i = 0; i < model->n_counts[0]; i++)
396  k += strlen(model->word_str[i]) + 1;
397  fwrite(&k, sizeof(k), 1, fp);
398  for (i = 0; i < model->n_counts[0]; i++)
399  fwrite(model->word_str[i], 1, strlen(model->word_str[i]) + 1, fp);
400 }
401 
402 int
403 ngram_model_trie_write_bin(ngram_model_t * base, const char *path)
404 {
405  int i;
406  int32 is_pipe;
407  ngram_model_trie_t *model = (ngram_model_trie_t *) base;
408  FILE *fp = fopen_comp(path, "wb", &is_pipe);
409  if (!fp) {
410  E_ERROR("Unable to open %s to write binary trie LM\n", path);
411  return -1;
412  }
413 
414  fwrite(trie_hdr, sizeof(*trie_hdr), strlen(trie_hdr), fp);
415  fwrite(&model->base.n, sizeof(model->base.n), 1, fp);
416  for (i = 0; i < model->base.n; i++) {
417  fwrite(&model->base.n_counts[i], sizeof(model->base.n_counts[i]),
418  1, fp);
419  }
420  lm_trie_write_bin(model->trie, base->n_counts[0], fp);
421  write_word_str(fp, base);
422  fclose_comp(fp, is_pipe);
423  return 0;
424 }
425 
427 ngram_model_trie_read_dmp(cmd_ln_t * config,
428  const char *file_name, logmath_t * lmath)
429 {
430  uint8 do_swap;
431  int32 is_pipe;
432  int32 k;
433  uint32 j;
434  int32 vn, ts;
435  int32 count;
436  uint32 counts[3];
437  uint32 *unigram_next;
438  int order;
439  char str[1024];
440  FILE *fp;
441  ngram_model_trie_t *model;
442  ngram_model_t *base;
443  ngram_raw_t **raw_ngrams;
444 
445  E_INFO("Trying to read LM in dmp format\n");
446  if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) {
447  E_ERROR("Dump file %s not found\n", file_name);
448  return NULL;
449  }
450 
451  do_swap = FALSE;
452  fread(&k, sizeof(k), 1, fp);
453  if (k != strlen(dmp_hdr) + 1) {
454  SWAP_INT32(&k);
455  if (k != strlen(dmp_hdr) + 1) {
456  E_ERROR
457  ("Wrong magic header size number %x: %s is not a dump file\n",
458  k, file_name);
459  return NULL;
460  }
461  do_swap = 1;
462  }
463  if (fread(str, 1, k, fp) != (size_t) k) {
464  E_ERROR("Cannot read header\n");
465  return NULL;
466  }
467  if (strncmp(str, dmp_hdr, k) != 0) {
468  E_ERROR("Wrong header %s: %s is not a dump file\n", dmp_hdr);
469  return NULL;
470  }
471 
472  if (fread(&k, sizeof(k), 1, fp) != 1)
473  return NULL;
474  if (do_swap)
475  SWAP_INT32(&k);
476  if (fread(str, 1, k, fp) != (size_t) k) {
477  E_ERROR("Cannot read LM filename in header\n");
478  return NULL;
479  }
480 
481  /* read version#, if present (must be <= 0) */
482  if (fread(&vn, sizeof(vn), 1, fp) != 1)
483  return NULL;
484  if (do_swap)
485  SWAP_INT32(&vn);
486  if (vn <= 0) {
487  /* read and don't compare timestamps (we don't care) */
488  if (fread(&ts, sizeof(ts), 1, fp) != 1)
489  return NULL;
490  if (do_swap)
491  SWAP_INT32(&ts);
492 
493  /* read and skip format description */
494  for (;;) {
495  if (fread(&k, sizeof(k), 1, fp) != 1)
496  return NULL;
497  if (do_swap)
498  SWAP_INT32(&k);
499  if (k == 0)
500  break;
501  if (fread(str, 1, k, fp) != (size_t) k) {
502  E_ERROR("Failed to read word\n");
503  return NULL;
504  }
505  }
506  /* read model->ucount */
507  if (fread(&count, sizeof(count), 1, fp) != 1)
508  return NULL;
509  if (do_swap)
510  SWAP_INT32(&count);
511  counts[0] = count;
512  }
513  else {
514  counts[0] = vn;
515  }
516  /* read model->bcount, tcount */
517  if (fread(&count, sizeof(count), 1, fp) != 1)
518  return NULL;
519  if (do_swap)
520  SWAP_INT32(&count);
521  counts[1] = count;
522  if (fread(&count, sizeof(count), 1, fp) != 1)
523  return NULL;
524  if (do_swap)
525  SWAP_INT32(&count);
526  counts[2] = count;
527  E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", counts[0], counts[1], counts[2]);
528 
529  model = (ngram_model_trie_t *) ckd_calloc(1, sizeof(*model));
530  base = &model->base;
531  if (counts[2] > 0)
532  order = 3;
533  else if (counts[1] > 0)
534  order = 2;
535  else
536  order = 1;
537  ngram_model_init(base, &ngram_model_trie_funcs, lmath, order,
538  (int32) counts[0]);
539 
540  model->trie = lm_trie_create(counts[0], order);
541 
542  unigram_next =
543  (uint32 *) ckd_calloc((int32) counts[0] + 1, sizeof(unigram_next));
544  for (j = 0; j <= (int32) counts[0]; j++) {
545  int32 bigrams;
546  int32 mapid;
547  dmp_weight_t weightp;
548  dmp_weight_t weightb;
549 
550  /* Skip over the mapping ID, we don't care about it. */
551  /* Read the weights from actual unigram structure. */
552  fread(&mapid, sizeof(int32), 1, fp);
553  fread(&weightp, sizeof(weightp), 1, fp);
554  fread(&weightb, sizeof(weightb), 1, fp);
555  fread(&bigrams, sizeof(int32), 1, fp);
556  if (do_swap) {
557  SWAP_INT32(&weightp.l);
558  SWAP_INT32(&weightb.l);
559  SWAP_INT32(&bigrams);
560  }
561  model->trie->unigrams[j].prob = logmath_log10_to_log_float(lmath, weightp.f);
562  model->trie->unigrams[j].bo = logmath_log10_to_log_float(lmath, weightb.f);
563  model->trie->unigrams[j].next = bigrams;
564  unigram_next[j] = bigrams;
565  }
566 
567  if (order > 1) {
568  raw_ngrams =
569  ngrams_raw_read_dmp(fp, lmath, counts, order, unigram_next,
570  do_swap);
571  if (raw_ngrams == NULL) {
572  ngram_model_free(base);
573  ckd_free(unigram_next);
574  fclose_comp(fp, is_pipe);
575  return NULL;
576  }
577  lm_trie_build(model->trie, raw_ngrams, counts, base->n_counts, order);
578  ngrams_raw_free(raw_ngrams, counts, order);
579  }
580 
581  /* Sentinel unigram and bigrams read before */
582  ckd_free(unigram_next);
583 
584  /* read ascii word strings */
585  read_word_str(base, fp);
586 
587  fclose_comp(fp, is_pipe);
588  return base;
589 }
590 
591 static void
592 ngram_model_trie_free(ngram_model_t * base)
593 {
594  ngram_model_trie_t *model = (ngram_model_trie_t *) base;
595  lm_trie_free(model->trie);
596 }
597 
598 static int
599 trie_apply_weights(ngram_model_t * base, float32 lw, float32 wip)
600 {
601  /* just update weights that are going to be used on score calculation */
602  base->lw = lw;
603  base->log_wip = logmath_log(base->lmath, wip);
604  return 0;
605 }
606 
607 static int32
608 weight_score(ngram_model_t * base, int32 score)
609 {
610  return (int32) (score * base->lw + base->log_wip);
611 }
612 
613 static int32
614 ngram_model_trie_raw_score(ngram_model_t * base, int32 wid, int32 * hist,
615  int32 n_hist, int32 * n_used)
616 {
617  int32 i;
618  ngram_model_trie_t *model = (ngram_model_trie_t *) base;
619 
620  if (n_hist > model->base.n - 1)
621  n_hist = model->base.n - 1;
622  for (i = 0; i < n_hist; i++) {
623  if (hist[i] < 0) {
624  n_hist = i;
625  break;
626  }
627  }
628 
629  return (int32) lm_trie_score(model->trie, model->base.n, wid, hist,
630  n_hist, n_used);
631 }
632 
633 static int32
634 ngram_model_trie_score(ngram_model_t * base, int32 wid, int32 * hist,
635  int32 n_hist, int32 * n_used)
636 {
637  return weight_score(base,
638  ngram_model_trie_raw_score(base, wid, hist, n_hist,
639  n_used));
640 }
641 
642 static int32
643 lm_trie_add_ug(ngram_model_t * base, int32 wid, int32 lweight)
644 {
645  ngram_model_trie_t *model = (ngram_model_trie_t *) base;
646 
647  /* This would be very bad if this happened! */
648  assert(!NGRAM_IS_CLASSWID(wid));
649 
650  /* Reallocate unigram array. */
651  model->trie->unigrams =
652  (unigram_t *) ckd_realloc(model->trie->unigrams,
653  sizeof(*model->trie->unigrams) *
654  (base->n_1g_alloc + 1));
655  memset(model->trie->unigrams + (base->n_counts[0] + 1), 0,
656  (size_t) (base->n_1g_alloc -
657  base->n_counts[0]) * sizeof(*model->trie->unigrams));
658  ++base->n_counts[0];
659  lweight += logmath_log(base->lmath, 1.0 / base->n_counts[0]);
660  model->trie->unigrams[wid + 1].next = model->trie->unigrams[wid].next;
661  model->trie->unigrams[wid].prob = (float) lweight;
662  /* This unigram by definition doesn't participate in any bigrams,
663  * so its backoff weight is undefined and next pointer same as in finish unigram*/
664  model->trie->unigrams[wid].bo = 0;
665  /* Finally, increase the unigram count */
666  /* FIXME: Note that this can actually be quite bogus due to the
667  * presence of class words. If wid falls outside the unigram
668  * count, increase it to compensate, at the cost of no longer
669  * really knowing how many unigrams we have :( */
670  if ((uint32) wid >= base->n_counts[0])
671  base->n_counts[0] = wid + 1;
672 
673  return (int32) weight_score(base, lweight);
674 }
675 
676 static void
677 lm_trie_flush(ngram_model_t * base)
678 {
679  ngram_model_trie_t *model = (ngram_model_trie_t *) base;
680  lm_trie_t *trie = model->trie;
681  memset(trie->hist_cache, -1, sizeof(trie->hist_cache));
682  memset(trie->backoff_cache, 0, sizeof(trie->backoff_cache));
683  return;
684 }
685 
686 static ngram_funcs_t ngram_model_trie_funcs = {
687  ngram_model_trie_free, /* free */
688  trie_apply_weights, /* apply_weights */
689  ngram_model_trie_score, /* score */
690  ngram_model_trie_raw_score, /* raw_score */
691  lm_trie_add_ug, /* add_ug */
692  lm_trie_flush /* flush */
693 };
#define E_ERROR_SYSTEM(...)
Print error text; call perror("").
Definition: err.h:99
lm_trie_t * trie
Trie structure that stores ngram relations and weights.
Miscellaneous useful string functions.
#define E_INFO(...)
Print logging information to standard error stream.
Definition: err.h:114
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
Definition: ckd_alloc.h:248
#define E_ERROR(...)
Print error message to error log.
Definition: err.h:104
hash_table_t * wid
Mapping of unigram names to word IDs.
char ** word_str
Unigram names.
ngram_model_t base
Base ngram_model_t structure.
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT lineiter_t * lineiter_start_clean(FILE *fh)
Start reading lines from a file, skip comments and trim lines.
Definition: pio.c:288
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
Definition: logmath.c:447
uint8 writable
Are word strings writable?
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
Definition: ngram_model.c:263
Line iterator for files.
Definition: pio.h:177
#define ckd_salloc(ptr)
Macro for ckd_salloc
Definition: ckd_alloc.h:264
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
Definition: ckd_alloc.c:244
SPHINXBASE_EXPORT float64 logmath_log_float_to_log10(logmath_t *lmath, float log_p)
Convert float log in base B to base 10 log.
Definition: logmath.c:496
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
Definition: strfuncs.c:55
SPHINXBASE_EXPORT void lineiter_free(lineiter_t *li)
Stop reading lines from a file.
Definition: pio.c:368
uint32 * n_counts
Counts for 1, 2, 3, ...
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
Definition: pio.c:347
uint8 n
This is an n-gram model (1, 2, 3, ...).
Implementation of logging routines.
logmath_t * lmath
Log-math object.
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
Definition: hash_table.c:501
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i.e., has a .z, .Z, .gz, or .GZ extension).
Definition: pio.c:107
#define E_WARN(...)
Print warning message to error log.
Definition: err.h:109
SPHINXBASE_EXPORT float logmath_log10_to_log_float(logmath_t *lmath, float64 log_p)
Convert base 10 log (in floating point) to float log in base B.
Definition: logmath.c:480
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Definition: strfuncs.c:123
Opaque structure used to hold the results of command-line parsing.
Implementation-specific functions for operating on ngram_model_t objects.
float32 lw
Language model scaling factor.
Common implementation of ngram_model_t.
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
Definition: pio.c:184
#define ckd_realloc(ptr, sz)
Macro for ckd_realloc
Definition: ckd_alloc.h:258
file IO related operations.
int32 log_wip
Log of word insertion penalty.