SphinxBase  5prealpha
ngram_model.h
Go to the documentation of this file.
1 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 /* ====================================================================
3  * Copyright (c) 2007 Carnegie Mellon University. All rights
4  * reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  * notice, this list of conditions and the following disclaimer.
12  *
13  * 2. Redistributions in binary form must reproduce the above copyright
14  * notice, this list of conditions and the following disclaimer in
15  * the documentation and/or other materials provided with the
16  * distribution.
17  *
18  * This work was supported in part by funding from the Defense Advanced
19  * Research Projects Agency and the National Science Foundation of the
20  * United States of America, and the CMU Sphinx Speech Consortium.
21  *
22  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33  *
34  * ====================================================================
35  *
36  */
43 #ifndef __NGRAM_MODEL_H__
44 #define __NGRAM_MODEL_H__
45 
46 #include <stdarg.h>
47 
48 /* Win32/WinCE DLL gunk */
49 #include <sphinxbase/sphinxbase_export.h>
50 #include <sphinxbase/prim_type.h>
51 #include <sphinxbase/cmd_ln.h>
52 #include <sphinxbase/logmath.h>
53 #include <sphinxbase/mmio.h>
54 
55 #ifdef __cplusplus
56 extern "C" {
57 #endif
58 #if 0
59 /* Fool Emacs. */
60 }
61 #endif
62 
67 
72 
76 typedef enum ngram_file_type_e {
82 
83 #define NGRAM_INVALID_WID -1
104 SPHINXBASE_EXPORT
106  const char *file_name,
107  ngram_file_type_t file_type,
108  logmath_t *lmath);
109 
115 SPHINXBASE_EXPORT
116 int ngram_model_write(ngram_model_t *model, const char *file_name,
117  ngram_file_type_t format);
118 
124 SPHINXBASE_EXPORT
125 ngram_file_type_t ngram_file_name_to_type(const char *file_name);
126 
132 SPHINXBASE_EXPORT
133 ngram_file_type_t ngram_str_to_type(const char *str_name);
134 
141 SPHINXBASE_EXPORT
142 char const *ngram_type_to_str(int type);
143 
149 SPHINXBASE_EXPORT
151 
157 SPHINXBASE_EXPORT
158 int ngram_model_free(ngram_model_t *model);
159 
163 typedef enum ngram_case_e {
164  NGRAM_UPPER,
165  NGRAM_LOWER
166 } ngram_case_t;
167 
174 SPHINXBASE_EXPORT
175 int ngram_model_casefold(ngram_model_t *model, int kase);
176 
188 SPHINXBASE_EXPORT
190  float32 lw, float32 wip);
191 
199 SPHINXBASE_EXPORT
200 float32 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip);
201 
234 SPHINXBASE_EXPORT
235 int32 ngram_score(ngram_model_t *model, const char *word, ...);
236 
240 SPHINXBASE_EXPORT
241 int32 ngram_tg_score(ngram_model_t *model,
242  int32 w3, int32 w2, int32 w1,
243  int32 *n_used);
244 
248 SPHINXBASE_EXPORT
249 int32 ngram_bg_score(ngram_model_t *model,
250  int32 w2, int32 w1,
251  int32 *n_used);
252 
256 SPHINXBASE_EXPORT
257 int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
258  int32 n_hist, int32 *n_used);
259 
270 SPHINXBASE_EXPORT
271 int32 ngram_probv(ngram_model_t *model, const char *word, ...);
272 
283 SPHINXBASE_EXPORT
284 int32 ngram_prob(ngram_model_t *model, const char* const *words, int32 n);
285 
292 SPHINXBASE_EXPORT
293 int32 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
294  int32 n_hist, int32 *n_used);
295 
307 SPHINXBASE_EXPORT
308 int32 ngram_score_to_prob(ngram_model_t *model, int32 score);
309 
313 SPHINXBASE_EXPORT
314 int32 ngram_wid(ngram_model_t *model, const char *word);
315 
319 SPHINXBASE_EXPORT
320 const char *ngram_word(ngram_model_t *model, int32 wid);
321 
335 SPHINXBASE_EXPORT
336 int32 ngram_unknown_wid(ngram_model_t *model);
337 
341 SPHINXBASE_EXPORT
342 int32 ngram_zero(ngram_model_t *model);
343 
347 SPHINXBASE_EXPORT
348 int32 ngram_model_get_size(ngram_model_t *model);
349 
353 SPHINXBASE_EXPORT
354 uint32 const *ngram_model_get_counts(ngram_model_t *model);
355 
359 typedef struct ngram_iter_s ngram_iter_t;
360 
369 SPHINXBASE_EXPORT
371 
375 SPHINXBASE_EXPORT
376 ngram_iter_t *ngram_iter(ngram_model_t *model, const char *word, ...);
377 
381 SPHINXBASE_EXPORT
382 ngram_iter_t *ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist);
383 
392 SPHINXBASE_EXPORT
393 int32 const *ngram_iter_get(ngram_iter_t *itor,
394  int32 *out_score,
395  int32 *out_bowt);
396 
402 SPHINXBASE_EXPORT
404 
408 SPHINXBASE_EXPORT
410 
414 SPHINXBASE_EXPORT
415 void ngram_iter_free(ngram_iter_t *itor);
416 
429 SPHINXBASE_EXPORT
431  const char *word, float32 weight);
432 
446 SPHINXBASE_EXPORT
448  const char *file_name);
449 
458 SPHINXBASE_EXPORT
460  const char *classname,
461  float32 classweight,
462  char **words,
463  const float32 *weights,
464  int32 n_words);
465 
475 SPHINXBASE_EXPORT
477  const char *classname,
478  const char *word,
479  float32 weight);
480 
505 SPHINXBASE_EXPORT
507  ngram_model_t **models,
508  char **names,
509  const float32 *weights,
510  int32 n_models);
511 
542 SPHINXBASE_EXPORT
544  const char *lmctlfile,
545  logmath_t *lmath);
546 
550 SPHINXBASE_EXPORT
552 
557 
563 SPHINXBASE_EXPORT
565 
571 SPHINXBASE_EXPORT
573 
577 SPHINXBASE_EXPORT
579 
587 SPHINXBASE_EXPORT
589  char const **lmname);
590 
597 SPHINXBASE_EXPORT
599  const char *name);
600 
607 SPHINXBASE_EXPORT
609  const char *name);
610 
614 SPHINXBASE_EXPORT
615 const char *ngram_model_set_current(ngram_model_t *set);
616 
624 SPHINXBASE_EXPORT
626  const char **names,
627  const float32 *weights);
628 
641 SPHINXBASE_EXPORT
643  ngram_model_t *model,
644  const char *name,
645  float32 weight,
646  int reuse_widmap);
647 
656 SPHINXBASE_EXPORT
658  const char *name,
659  int reuse_widmap);
660 
664 SPHINXBASE_EXPORT
666  const char **words,
667  int32 n_words);
668 
676 SPHINXBASE_EXPORT
678  int32 set_wid);
679 
689 SPHINXBASE_EXPORT
690 int32 ngram_model_set_known_wid(ngram_model_t *set, int32 set_wid);
691 
695 SPHINXBASE_EXPORT
697 
698 #ifdef __cplusplus
699 }
700 #endif
701 
702 
703 #endif /* __NGRAM_MODEL_H__ */
SPHINXBASE_EXPORT int32 ngram_probv(ngram_model_t *model, const char *word,...)
Get the "raw" log-probability for a general N-Gram.
Definition: ngram_model.c:486
SPHINXBASE_EXPORT int32 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick "raw" probability lookup for a general N-Gram.
Definition: ngram_model.c:454
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_init(cmd_ln_t *config, ngram_model_t **models, char **names, const float32 *weights, int32 n_models)
Create a set of language models sharing a common space of word IDs.
Command-line and other configuration parsing and handling.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
Definition: ngram_model.c:124
Sphinx .DMP format.
Definition: ngram_model.h:80
SPHINXBASE_EXPORT int ngram_model_write(ngram_model_t *model, const char *file_name, ngram_file_type_t format)
Write an N-Gram model to disk.
Definition: ngram_model.c:178
SPHINXBASE_EXPORT int ngram_model_casefold(ngram_model_t *model, int kase)
Case-fold word strings in an N-Gram model.
Definition: ngram_model.c:308
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_lookup(ngram_model_t *set, const char *name)
Look up a language model by name from a set.
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
Definition: ngram_model.c:550
SPHINXBASE_EXPORT void ngram_iter_free(ngram_iter_t *itor)
Terminate an M-gram iterator.
SPHINXBASE_EXPORT ngram_model_set_iter_t * ngram_model_set_iter_next(ngram_model_set_iter_t *itor)
Move to the next language model in a set.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
Definition: ngram_model.c:585
SPHINXBASE_EXPORT int32 const * ngram_iter_get(ngram_iter_t *itor, int32 *out_score, int32 *out_bowt)
Get information from the current M-gram in an iterator.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_next(ngram_iter_t *itor)
Advance an M-gram iterator.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_read(cmd_ln_t *config, const char *lmctlfile, logmath_t *lmath)
Read a set of language models from a control file.
SPHINXBASE_EXPORT uint32 const * ngram_model_get_counts(ngram_model_t *model)
Get the counts of the various N-grams in the model.
Definition: ngram_model.c:577
SPHINXBASE_EXPORT int32 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
Quick trigram score lookup.
Definition: ngram_model.c:438
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
Definition: ngram_model.c:263
Basic type definitions used in Sphinx.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter_successors(ngram_iter_t *itor)
Iterate over all M-gram successors of an M-1-gram.
SPHINXBASE_EXPORT int32 ngram_model_set_count(ngram_model_t *set)
Returns the number of language models in a set.
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip)
Apply a language weight, insertion penalty, and unigram weight to a language model.
Definition: ngram_model.c:360
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_add(ngram_model_t *set, ngram_model_t *model, const char *name, float32 weight, int reuse_widmap)
Add a language model to a set.
ARPABO text format (the standard).
Definition: ngram_model.h:79
Iterator over a model set.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_remove(ngram_model_t *set, const char *name, int reuse_widmap)
Remove a language model from a set.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain(ngram_model_t *model)
Retain ownership of an N-Gram model.
Definition: ngram_model.c:249
SPHINXBASE_EXPORT int32 ngram_score(ngram_model_t *model, const char *word,...)
Get the score (scaled, interpolated log-probability) for a general N-Gram.
Definition: ngram_model.c:407
SPHINXBASE_EXPORT int32 ngram_model_get_size(ngram_model_t *model)
Get the order of the N-gram model (i.e. the "N" in "N-gram").
Definition: ngram_model.c:569
SPHINXBASE_EXPORT char const * ngram_type_to_str(int type)
Get the canonical name for an N-Gram file type.
Definition: ngram_model.c:110
SPHINXBASE_EXPORT void ngram_model_set_map_words(ngram_model_t *set, const char **words, int32 n_words)
Set the word-to-ID mapping for this model set.
SPHINXBASE_EXPORT int32 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
Quick bigram score lookup.
Definition: ngram_model.c:448
SPHINXBASE_EXPORT int32 ngram_model_add_word(ngram_model_t *model, const char *word, float32 weight)
Add a word (unigram) to the language model.
Definition: ngram_model.c:649
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_iter_model(ngram_model_set_iter_t *itor, char const **lmname)
Get language model and associated name from an iterator.
SPHINXBASE_EXPORT int32 ngram_model_add_class(ngram_model_t *model, const char *classname, float32 classweight, char **words, const float32 *weights, int32 n_words)
Add a new class to a language model.
Definition: ngram_model.c:831
SPHINXBASE_EXPORT ngram_file_type_t ngram_str_to_type(const char *str_name)
Get the N-Gram file type from a string.
Definition: ngram_model.c:99
SPHINXBASE_EXPORT ngram_iter_t * ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
Get an iterator over M-grams pointing to the specified M-gram.
SPHINXBASE_EXPORT int32 ngram_model_read_classdef(ngram_model_t *model, const char *file_name)
Read a class definition file and add classes to a language model.
Definition: ngram_model.c:1027
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_interp(ngram_model_t *set, const char **names, const float32 *weights)
Set interpolation weights for a set and enable interpolation.
SPHINXBASE_EXPORT ngram_model_set_iter_t * ngram_model_set_iter(ngram_model_t *set)
Begin iterating over language models in a set.
SPHINXBASE_EXPORT void ngram_model_flush(ngram_model_t *lm)
Flush any cached N-Gram information.
Definition: ngram_model.c:256
Opaque structure used to hold the results of command-line parsing.
enum ngram_file_type_e ngram_file_type_t
File types for N-Gram files.
SPHINXBASE_EXPORT ngram_iter_t * ngram_iter(ngram_model_t *model, const char *word,...)
Get an iterator over M-grams pointing to the specified M-gram.
ngram_file_type_e
File types for N-Gram files.
Definition: ngram_model.h:76
SPHINXBASE_EXPORT void ngram_model_set_iter_free(ngram_model_set_iter_t *itor)
Finish iteration over a language model set.
SPHINXBASE_EXPORT float32 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip)
Get the current weights from a language model.
Definition: ngram_model.c:366
SPHINXBASE_EXPORT ngram_file_type_t ngram_file_name_to_type(const char *file_name)
Guess the file type for an N-Gram model from the filename.
Definition: ngram_model.c:63
SPHINXBASE_EXPORT const char * ngram_model_set_current(ngram_model_t *set)
Get the current language model name, if any.
Not a valid file type.
Definition: ngram_model.h:77
Implementation of ngram_class_t.
ngram_case_e
Constants for case folding.
Definition: ngram_model.h:163
Common implementation of ngram_model_t.
SPHINXBASE_EXPORT int32 ngram_score_to_prob(ngram_model_t *model, int32 score)
Convert score to "raw" log-probability.
Definition: ngram_model.c:537
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
Definition: ngram_model.c:596
SPHINXBASE_EXPORT int32 ngram_zero(ngram_model_t *model)
Get the "zero" log-probability value for a language model.
Definition: ngram_model.c:563
Memory-mapped I/O wrappers for files.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_select(ngram_model_t *set, const char *name)
Select a single language model from a set for scoring.
Fast integer logarithmic addition operations.
SPHINXBASE_EXPORT int32 ngram_model_set_current_wid(ngram_model_t *set, int32 set_wid)
Query the word-ID mapping for the current language model.
struct ngram_iter_s ngram_iter_t
M-gram iterator object.
Definition: ngram_model.h:359
enum ngram_case_e ngram_case_t
Constants for case folding.
SPHINXBASE_EXPORT int32 ngram_model_add_class_word(ngram_model_t *model, const char *classname, const char *word, float32 weight)
Add a word to a class in a language model.
Definition: ngram_model.c:779
SPHINXBASE_EXPORT ngram_iter_t * ngram_model_mgrams(ngram_model_t *model, int m)
Iterate over all M-grams.
SPHINXBASE_EXPORT int32 ngram_prob(ngram_model_t *model, const char *const *words, int32 n)
Get the "raw" log-probability for a general N-Gram.
Definition: ngram_model.c:517
SPHINXBASE_EXPORT int32 ngram_model_set_known_wid(ngram_model_t *set, int32 set_wid)
Test whether a word ID corresponds to a known word in the current state of the language model set...
Determine file type automatically.
Definition: ngram_model.h:78
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
Definition: ngram_model.c:375