53 static const arg_t defn[] = {
57 "Shows the usage of the tool"},
62 "Base in which all log-likelihoods calculated" },
67 "Language model file"},
72 "Probability definition file for classes in LM"},
77 "Control file listing a set of language models"},
82 "Name of language model in -lmctlfn to use for all utterances" },
87 "Transcription file to evaluate"},
91 "Text string to evaluate"},
96 "Use memory-mapped I/O for reading binary LM files"},
101 "Language model weight" },
106 "Word insertion probability" },
111 "Print details of perplexity calculation" },
114 { NULL, 0, NULL, NULL }
121 int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score)
125 int32 i, ch, nccs, noovs, unk;
134 for (i = 0; i < n; ++i)
141 ch = noovs = nccs = 0;
142 for (i = 0; i < n; ++i) {
147 if (wids[i] == startwid) {
158 wids[i], wids + i + 1,
169 printf(
") = %d\n", prob);
174 if (out_n_ccs) *out_n_ccs = nccs;
175 if (out_n_oovs) *out_n_oovs = noovs;
191 int32 nccs, noovs, nwords, lscr;
192 float64 ch, log_to_log2;;
194 if ((fh = fopen(lsnfn,
"r")) == NULL)
200 lscr = nccs = noovs = nwords = 0;
204 int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr;
208 E_FATAL(
"str2words(line, NULL, 0) = %d, should not happen\n", n);
215 if (words[n-1][0] ==
'('
216 && words[n-1][strlen(words[n-1])-1] ==
')')
219 tmp_ch = calc_entropy(lm, words, n, &tmp_nccs,
220 &tmp_noovs, &tmp_lscr);
222 ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2;
231 ch /= (nwords - nccs - noovs);
232 printf(
"cross-entropy: %f bits\n", ch);
235 printf(
"perplexity: %f\n", pow(2.0, ch));
236 printf(
"lm score: %d\n", lscr);
239 printf(
"%d words evaluated\n", nwords);
240 printf(
"%d OOVs (%.2f%%), %d context cues removed\n",
241 noovs, (
double)noovs / nwords * 100, nccs);
249 int32 n, ch, noovs, nccs, lscr;
255 E_FATAL(
"str2words(textfoo, NULL, 0) = %d, should not happen\n", n);
261 ch = calc_entropy(lm, words, n, &nccs, &noovs, &lscr);
263 printf(
"input: %s\n", text);
264 printf(
"cross-entropy: %f bits\n",
268 printf(
"perplexity: %f\n",
logmath_exp(lmath, ch));
269 printf(
"lm score: %d\n", lscr);
272 printf(
"%d words evaluated\n", n);
273 printf(
"%d OOVs, %d context cues removed\n",
281 main(
int argc,
char *argv[])
286 const char *lmfn, *probdefn, *lsnfn, *text;
288 if ((config =
cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
295 (cmd_ln_float64_r(config,
"-logbase"), 0, 0)) == NULL) {
296 E_FATAL(
"Failed to initialize log math\n");
304 E_FATAL(
"Failed to load language model from %s\n",
307 if ((probdefn =
cmd_ln_str_r(config,
"-probdef")) != NULL)
310 cmd_ln_float32_r(config,
"-lw"),
311 cmd_ln_float32_r(config,
"-wip"));
317 evaluate_file(lm, lmath, lsnfn);
320 evaluate_string(lm, lmath, text);
Command-line and other configuration parsing and handling.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
Miscellaneous useful string functions.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
#define NGRAM_INVALID_WID
Impossible word ID.
SPHINXBASE_EXPORT cmd_ln_t * cmd_ln_parse_r(cmd_ln_t *inout_cmdln, arg_t const *defn, int32 argc, char *argv[], int32 strict)
Parse a list of strings into arguments.
#define ARG_STRING
String argument (optional).
#define ckd_salloc(ptr)
Macro for ckd_salloc
SPHINXBASE_EXPORT char const * cmd_ln_str_r(cmd_ln_t *cmdln, char const *name)
Retrieve a string from a command-line object.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
SPHINXBASE_EXPORT logmath_t * logmath_init(float64 base, int shift, int use_table)
Initialize a log math computation table.
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip)
Apply a language weight, insertion penalty, and unigram weight to a language model.
SPHINXBASE_EXPORT float64 logmath_get_base(logmath_t *lmath)
Get the log base.
SPHINXBASE_EXPORT lineiter_t * lineiter_next(lineiter_t *li)
Move to the next line in the file.
SPHINXBASE_EXPORT lineiter_t * lineiter_start(FILE *fh)
Start reading lines from a file.
SPHINXBASE_EXPORT int32 ngram_model_get_size(ngram_model_t *model)
Get the order of the N-gram model (i.e. the "N" in "N-gram").
Implementation of logging routines.
#define E_FATAL_SYSTEM(...)
Print error text, call perror(""), then exit(errno).
#define ARG_BOOLEAN
Boolean (true/false) argument (optional).
Argument definition structure.
SPHINXBASE_EXPORT int32 ngram_model_read_classdef(ngram_model_t *model, const char *file_name)
Read a class definition file and add classes to a language model.
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Opaque structure used to hold the results of command-line parsing.
#define cmd_ln_boolean_r(c, n)
Retrieve a boolean value from a command-line object.
cmd_ln_t * config
Configuration parameters.
#define E_FATAL(...)
Exit with non-zero status after error message.
Common implementation of ngram_model_t.
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
Fast integer logarithmic addition operations.
SPHINXBASE_EXPORT float64 logmath_exp(logmath_t *lmath, int logb_p)
Convert integer log in base B to linear floating point.
Determine file type automatically.
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
File I/O related operations.