55 my_compare(
const void *a,
const void *b)
58 if (strcmp(*(
char *
const *) a,
"<UNK>") == 0)
60 else if (strcmp(*(
char *
const *) b,
"<UNK>") == 0)
63 return strcmp(*(
char *
const *) a, *(
char *
const *) b);
79 for (i = 0; i < set->
n_models; ++i) {
81 for (j = 0; j < models[i]->
n_words; ++j) {
91 ngram_model_init(base, &ngram_model_set_funcs, lmath, n,
92 hash_table_inuse(vocab));
96 for (gn = hlist; gn; gn = gnode_next(gn)) {
98 base->
word_str[i++] = (
char *) ent->key;
109 for (i = 0; i < base->
n_words; ++i) {
114 for (j = 0; j < set->
n_models; ++j) {
126 char **names,
const float32 * weights, int32 n_models)
138 lmath = models[0]->
lmath;
139 for (i = 1; i < n_models; ++i) {
144 (
"Log-math parameters don't match, will not create LM set\n");
158 int32 uniform =
logmath_log(lmath, 1.0 / n_models);
159 for (i = 0; i < n_models; ++i)
167 for (i = 0; i < n_models; ++i) {
173 if (models[i]->n > n)
180 build_widmap(base, lmath, n);
186 const char *lmctlfile,
logmath_t * lmath)
191 __BIGSTACKVARIABLE__
char str[1024];
199 if ((ctlfp = fopen(lmctlfile,
"r")) == NULL) {
206 if ((c = strrchr(lmctlfile,
'/')) || (c = strrchr(lmctlfile,
'\\'))) {
209 memcpy(basedir, lmctlfile, c - lmctlfile + 1);
214 E_INFO(
"Reading LM control file '%s'\n", lmctlfile);
216 E_INFO(
"Will prepend '%s' to unqualified paths\n", basedir);
218 if (fscanf(ctlfp,
"%1023s", str) == 1) {
219 if (strcmp(str,
"{") == 0) {
221 while ((fscanf(ctlfp,
"%1023s", str) == 1)
222 && (strcmp(str,
"}") != 0)) {
228 E_INFO(
"Reading classdef from '%s'\n", deffile);
229 if (read_classdef_file(classes, deffile) < 0) {
236 if (strcmp(str,
"}") != 0) {
237 E_ERROR(
"Unexpected EOF in %s\n", lmctlfile);
242 if (fscanf(ctlfp,
"%1023s", str) != 1)
250 while (str[0] !=
'\0') {
254 if (basedir && str[0] !=
'/' && str[0] !=
'\\')
258 E_INFO(
"Reading lm from '%s'\n", lmfile);
264 if (fscanf(ctlfp,
"%1023s", str) != 1) {
265 E_ERROR(
"LMname missing after LMFileName '%s'\n", lmfile);
273 if (fscanf(ctlfp,
"%1023s", str) == 1) {
274 if (strcmp(str,
"{") == 0) {
276 while ((fscanf(ctlfp,
"%1023s", str) == 1) &&
277 (strcmp(str,
"}") != 0)) {
282 E_ERROR(
"Unknown class %s in control file\n", str);
289 classdef->n_words) < 0) {
292 E_INFO(
"Added class %s containing %d words\n",
293 str, classdef->n_words);
295 if (strcmp(str,
"}") != 0) {
296 E_ERROR(
"Unexpected EOF in %s\n", lmctlfile);
299 if (fscanf(ctlfp,
"%1023s", str) != 1)
320 lm_array =
ckd_calloc(n_models,
sizeof(*lm_array));
321 name_array =
ckd_calloc(n_models,
sizeof(*name_array));
324 for (i = 0; i < n_models; ++i) {
327 lm_node = gnode_next(lm_node);
328 name_node = gnode_next(name_node);
333 for (i = 0; i < n_models; ++i) {
345 for (gn = lms; gn; gn = gnode_next(gn)) {
350 for (gn = lmnames; gn; gn = gnode_next(gn)) {
355 for (gn = hlist; gn; gn = gnode_next(gn)) {
358 classdef_free(he->
val);
380 if (set == NULL || set->
n_models == 0)
390 if (++itor->cur == itor->set->
n_models) {
408 *lmname = itor->set->
names[itor->cur];
409 return itor->set->
lms[itor->cur];
422 return set->
lms[set->
cur];
427 if (0 == strcmp(set->
names[i], name))
442 if (0 == strcmp(set->
names[i], name))
447 return set->
lms[set->
cur];
466 if (set->
cur == -1 || set_wid >= base->
n_words)
479 else if (set->
cur == -1) {
481 for (i = 0; i < set->
n_models; ++i) {
494 const char **names,
const float32 * weights)
499 if (names && weights) {
503 for (i = 0; i < set->
n_models; ++i) {
505 if (0 == strcmp(names[i], set->
names[j]))
508 E_ERROR(
"Unknown LM name %s\n", names[i]);
526 const char *name, float32 weight,
int reuse_widmap)
540 if (model->
n > base->
n) {
543 (model->
n - 1) *
sizeof(*set->
maphist));
547 fprob = weight * 1.0f / set->
n_models;
555 for (i = 0; i < set->
n_models - 1; ++i)
564 sizeof(**new_widmap));
565 for (i = 0; i < base->
n_words; ++i) {
567 memcpy(new_widmap[i], set->
widmap[i],
568 (set->
n_models - 1) *
sizeof(**new_widmap));
577 build_widmap(base, base->
lmath, base->
n);
584 const char *name,
int reuse_widmap)
588 int32 lmidx, scale, n, i;
591 for (lmidx = 0; lmidx < set->
n_models; ++lmidx)
592 if (0 == strcmp(name, set->
names[lmidx]))
596 submodel = set->
lms[lmidx];
608 set->
names[lmidx] = NULL;
609 for (i = 0; i < set->
n_models; ++i) {
611 set->
lms[i] = set->
lms[i + 1];
616 if (set->
lms[i]->
n > n)
627 for (i = 0; i < base->
n_words; ++i) {
628 memmove(set->
widmap[i] + lmidx, set->
widmap[i] + lmidx + 1,
633 build_widmap(base, base->
lmath, n);
640 const char **words, int32 n_words)
647 for (i = 0; i < base->
n_words; ++i) {
660 for (i = 0; i < n_words; ++i) {
664 for (j = 0; j < set->
n_models; ++j) {
671 ngram_model_set_apply_weights(
ngram_model_t * base, float32 lw,
685 int32 * history, int32 n_hist, int32 * n_used)
693 if (n_hist > base->
n - 1)
694 n_hist = base->
n - 1;
697 if (set->
cur == -1) {
699 for (i = 0; i < set->
n_models; ++i) {
702 mapwid = set->
widmap[wid][i];
703 for (j = 0; j < n_hist; ++j) {
720 for (j = 0; j < n_hist; ++j) {
727 mapwid, set->
maphist, n_hist, n_used);
735 int32 * history, int32 n_hist, int32 * n_used)
743 if (n_hist > base->
n - 1)
744 n_hist = base->
n - 1;
747 if (set->
cur == -1) {
749 for (i = 0; i < set->
n_models; ++i) {
752 mapwid = set->
widmap[wid][i];
753 for (j = 0; j < n_hist; ++j) {
770 for (j = 0; j < n_hist; ++j) {
777 mapwid, set->
maphist, n_hist, n_used);
784 ngram_model_set_add_ug(
ngram_model_t * base, int32 wid, int32 lweight)
795 for (i = 0; i < set->
n_models; ++i) {
799 if (set->
cur == -1 || set->
cur == i) {
819 else if (set->
cur == -1)
836 for (i = 0; i < base->
n_words; ++i)
838 memcpy(set->
widmap[wid], newwid, set->
n_models *
sizeof(*newwid));
861 ngram_model_set_free,
862 ngram_model_set_apply_weights,
863 ngram_model_set_score,
864 ngram_model_set_raw_score,
865 ngram_model_set_add_ug,
#define E_ERROR_SYSTEM(...)
Print error text, then call perror("").
SPHINXBASE_EXPORT int32 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick "raw" probability lookup for a general N-Gram.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_init(cmd_ln_t *config, ngram_model_t **models, char **names, const float32 *weights, int32 n_models)
Create a set of language models sharing a common space of word IDs.
char ** names
Names for language models.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
Miscellaneous useful string functions.
#define E_INFO(...)
Print logging information to standard error stream.
SPHINXBASE_EXPORT int32 hash_table_lookup(hash_table_t *h, const char *key, void **val)
Look up a key in a hash table and optionally return the associated value.
#define ckd_calloc_2d(d1, d2, sz)
Macro for ckd_calloc_2d
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_lookup(ngram_model_t *set, const char *name)
Look up a language model by name from a set.
#define E_ERROR(...)
Print error message to error log.
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
#define hash_table_enter_int32(h, k, v)
Add a 32-bit integer value to a hash table.
hash_table_t * wid
Mapping of unigram names to word IDs.
char ** word_str
Unigram names.
SPHINXBASE_EXPORT ngram_model_set_iter_t * ngram_model_set_iter_next(ngram_model_set_iter_t *itor)
Move to the next language model in a set.
Sphinx's memory allocation/deallocation routines.
int32 * lweights
Log interpolation weights.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
SPHINXBASE_EXPORT glist_t hash_table_tolist(hash_table_t *h, int32 *count)
Build a glist of valid hash_entry_t pointers from the given hash table.
#define NGRAM_INVALID_WID
Impossible word ID.
Operations related to file names.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_read(cmd_ln_t *config, const char *lmctlfile, logmath_t *lmath)
Read a set of language models from a control file.
int32 ** widmap
Word ID mapping for submodels.
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
A node in a generic list.
Subclass of ngram_model for grouping language models.
uint8 writable
Are word strings writable?
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
#define ckd_salloc(ptr)
Macro for ckd_salloc
int32 n_models
Number of models in this set.
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
SPHINXBASE_EXPORT void hash_table_empty(hash_table_t *h)
Delete all entries from a hash_table.
SPHINXBASE_EXPORT int32 ngram_model_set_count(ngram_model_t *set)
Returns the number of language models in a set.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
SPHINXBASE_EXPORT glist_t glist_add_ptr(glist_t g, void *ptr)
Create and prepend a new list node, with the given user-defined data, at the HEAD of the given generi...
int32 * maphist
Word ID mapping for N-Gram history.
int32 log_zero
Zero probability, cached here for quick lookup.
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by th...
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip)
Apply a language weight, insertion penalty, and unigram weight to a language model.
A note by ARCHAN at 20050510: Technically what we use is so-called "hash table with buckets" which is...
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_add(ngram_model_t *set, ngram_model_t *model, const char *name, float32 weight, int reuse_widmap)
Add a language model to a set.
SPHINXBASE_EXPORT glist_t glist_reverse(glist_t g)
Reverse the order of the given glist.
Iterator over a model set.
SPHINXBASE_EXPORT int logmath_get_shift(logmath_t *lmath)
Get the shift of the values in a log table.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_remove(ngram_model_t *set, const char *name, int reuse_widmap)
Remove a language model from a set.
ngram_model_t base
Base ngram_model_t structure.
SPHINXBASE_EXPORT void glist_free(glist_t g)
Free the given generic list; user-defined data contained within is not automatically freed...
int32 cur
Currently selected model, or -1 for none.
SPHINXBASE_EXPORT float64 logmath_get_base(logmath_t *lmath)
Get the log base.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain(ngram_model_t *model)
Retain ownership of an N-Gram model.
SPHINXBASE_EXPORT int path_is_absolute(const char *file)
Test whether a pathname is absolute for the current OS.
#define gnode_ptr(g)
Head of a list of gnodes.
SPHINXBASE_EXPORT void ngram_model_set_map_words(ngram_model_t *set, const char **words, int32 n_words)
Set the word-to-ID mapping for this model set.
SPHINXBASE_EXPORT int32 ngram_model_add_word(ngram_model_t *model, const char *word, float32 weight)
Add a word (unigram) to the language model.
uint8 n
This is an n-gram model (1, 2, 3, ...).
Implementation of logging routines.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_iter_model(ngram_model_set_iter_t *itor, char const **lmname)
Get language model and associated name from an iterator.
logmath_t * lmath
Log-math object.
ngram_model_t ** lms
Language models in this set.
SPHINXBASE_EXPORT int32 ngram_model_add_class(ngram_model_t *model, const char *classname, float32 classweight, char **words, const float32 *weights, int32 n_words)
Add a new class to a language model.
One class definition from a classdef file.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_interp(ngram_model_t *set, const char **names, const float32 *weights)
Set interpolation weights for a set and enables interpolation.
SPHINXBASE_EXPORT ngram_model_set_iter_t * ngram_model_set_iter(ngram_model_t *set)
Begin iterating over language models in a set.
Opaque structure used to hold the results of command-line parsing.
SPHINXBASE_EXPORT char * string_join(const char *base,...)
Concatenate a NULL-terminated argument list of strings, returning a newly allocated string...
Implementation-specific functions for operating on ngram_model_t objects.
SPHINXBASE_EXPORT void ngram_model_set_iter_free(ngram_model_set_iter_t *itor)
Finish iteration over a language model set.
SPHINXBASE_EXPORT void ckd_free_2d(void *ptr)
Free a 2-D array (ptr) previously allocated by ckd_calloc_2d.
SPHINXBASE_EXPORT const char * ngram_model_set_current(ngram_model_t *set)
Get the current language model name, if any.
Common implementation of ngram_model_t.
void * val
Key-length; the key string does not have to be a C-style NULL terminated string; it can have arbitrar...
SPHINXBASE_EXPORT ngram_model_t * ngram_model_set_select(ngram_model_t *set, const char *name)
Select a single language model from a set for scoring.
SPHINXBASE_EXPORT int32 ngram_model_set_current_wid(ngram_model_t *set, int32 set_wid)
Query the word-ID mapping for the current language model.
SPHINXBASE_EXPORT float64 logmath_exp(logmath_t *lmath, int logb_p)
Convert integer log in base B to linear floating point.
#define ckd_realloc(ptr, sz)
Macro for ckd_realloc
SPHINXBASE_EXPORT int32 ngram_model_set_known_wid(ngram_model_t *set, int32 set_wid)
Test whether a word ID corresponds to a known word in the current state of the language model set...
SPHINXBASE_EXPORT int32 glist_count(glist_t g)
Count the number of elements in a given linked list.
Determine file type automatically.
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
SPHINXBASE_EXPORT int logmath_add(logmath_t *lmath, int logb_p, int logb_q)
Add two values in log space (i.e., return log(exp(p) + exp(q))).
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words)...