PocketSphinx
5.0.0
A small speech recognizer
|
Speech recognizer object. More...
#include <pocketsphinx.h>
Public Member Functions | |
POCKETSPHINX_EXPORT int | ps_activate_search (ps_decoder_t *ps, const char *name) |
const POCKETSPHINX_EXPORT char * | ps_current_search (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT int | ps_remove_search (ps_decoder_t *ps, const char *name) |
POCKETSPHINX_EXPORT ps_search_iter_t * | ps_search_iter (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT ngram_model_t * | ps_get_lm (ps_decoder_t *ps, const char *name) |
POCKETSPHINX_EXPORT int | ps_add_lm (ps_decoder_t *ps, const char *name, ngram_model_t *lm) |
POCKETSPHINX_EXPORT int | ps_add_lm_file (ps_decoder_t *ps, const char *name, const char *path) |
POCKETSPHINX_EXPORT fsg_model_t * | ps_get_fsg (ps_decoder_t *ps, const char *name) |
POCKETSPHINX_EXPORT int | ps_add_fsg (ps_decoder_t *ps, const char *name, fsg_model_t *fsg) |
POCKETSPHINX_EXPORT int | ps_add_jsgf_file (ps_decoder_t *ps, const char *name, const char *path) |
POCKETSPHINX_EXPORT int | ps_add_jsgf_string (ps_decoder_t *ps, const char *name, const char *jsgf_string) |
const POCKETSPHINX_EXPORT char * | ps_get_kws (ps_decoder_t *ps, const char *name) |
POCKETSPHINX_EXPORT int | ps_add_kws (ps_decoder_t *ps, const char *name, const char *keyfile) |
POCKETSPHINX_EXPORT int | ps_add_keyphrase (ps_decoder_t *ps, const char *name, const char *keyphrase) |
POCKETSPHINX_EXPORT int | ps_add_allphone (ps_decoder_t *ps, const char *name, ngram_model_t *lm) |
POCKETSPHINX_EXPORT int | ps_add_allphone_file (ps_decoder_t *ps, const char *name, const char *path) |
POCKETSPHINX_EXPORT int | ps_set_align_text (ps_decoder_t *ps, const char *words) |
POCKETSPHINX_EXPORT int | ps_set_alignment (ps_decoder_t *ps, ps_alignment_t *al) |
POCKETSPHINX_EXPORT ps_alignment_t * | ps_get_alignment (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT int | ps_reinit (ps_decoder_t *ps, ps_config_t *config) |
POCKETSPHINX_EXPORT int | ps_reinit_feat (ps_decoder_t *ps, ps_config_t *config) |
const POCKETSPHINX_EXPORT char * | ps_get_cmn (ps_decoder_t *ps, int update) |
POCKETSPHINX_EXPORT int | ps_set_cmn (ps_decoder_t *ps, const char *cmn) |
POCKETSPHINX_EXPORT ps_decoder_t * | ps_retain (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT int | ps_free (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT ps_config_t * | ps_get_config (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT logmath_t * | ps_get_logmath (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT ps_mllr_t * | ps_update_mllr (ps_decoder_t *ps, ps_mllr_t *mllr) |
POCKETSPHINX_EXPORT int | ps_load_dict (ps_decoder_t *ps, char const *dictfile, char const *fdictfile, char const *format) |
POCKETSPHINX_EXPORT int | ps_save_dict (ps_decoder_t *ps, char const *dictfile, char const *format) |
POCKETSPHINX_EXPORT int | ps_add_word (ps_decoder_t *ps, char const *word, char const *phones, int update) |
POCKETSPHINX_EXPORT char * | ps_lookup_word (ps_decoder_t *ps, const char *word) |
POCKETSPHINX_EXPORT long | ps_decode_raw (ps_decoder_t *ps, FILE *rawfh, long maxsamps) |
POCKETSPHINX_EXPORT int | ps_decode_senscr (ps_decoder_t *ps, FILE *senfh) |
POCKETSPHINX_EXPORT int | ps_start_stream (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT int | ps_get_in_speech (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT int | ps_start_utt (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT int | ps_process_raw (ps_decoder_t *ps, int16 const *data, size_t n_samples, int no_search, int full_utt) |
POCKETSPHINX_EXPORT int | ps_process_cep (ps_decoder_t *ps, float32 **data, int n_frames, int no_search, int full_utt) |
POCKETSPHINX_EXPORT int | ps_get_n_frames (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT int | ps_end_utt (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT const char * | ps_get_hyp (ps_decoder_t *ps, int32 *out_best_score) |
POCKETSPHINX_EXPORT int32 | ps_get_prob (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT ps_lattice_t * | ps_get_lattice (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT ps_seg_t * | ps_seg_iter (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT ps_nbest_t * | ps_nbest (ps_decoder_t *ps) |
POCKETSPHINX_EXPORT void | ps_get_utt_time (ps_decoder_t *ps, double *out_nspeech, double *out_ncpu, double *out_nwall) |
POCKETSPHINX_EXPORT void | ps_get_all_time (ps_decoder_t *ps, double *out_nspeech, double *out_ncpu, double *out_nwall) |
POCKETSPHINX_EXPORT int ps_activate_search | ( | ps_decoder_t * | ps, |
const char * | name | ||
) |
Activates search with the provided name.
name | Name of search module to activate. This must have been previously added by either ps_add_fsg(), ps_add_lm(), or ps_add_kws(). If NULL, it will re-activate the default search, which is useful when running second-pass alignment, for instance. |
const POCKETSPHINX_EXPORT char * ps_current_search | ( | ps_decoder_t * | ps | ) |
Returns name of current search in decoder
POCKETSPHINX_EXPORT int ps_remove_search | ( | ps_decoder_t * | ps, |
const char * | name | ||
) |
Removes a search module and releases its resources.
Removes a search module previously added using ps_add_fsg(), ps_add_lm(), ps_add_kws(), etc.
POCKETSPHINX_EXPORT ps_search_iter_t * ps_search_iter | ( | ps_decoder_t * | ps | ) |
Returns iterator over current searches
POCKETSPHINX_EXPORT ngram_model_t * ps_get_lm | ( | ps_decoder_t * | ps, |
const char * | name | ||
) |
Get the language model or lmset object associated with a search.
POCKETSPHINX_EXPORT int ps_add_lm | ( | ps_decoder_t * | ps, |
const char * | name, | ||
ngram_model_t * | lm | ||
) |
Adds new search based on N-gram language model.
Associates N-gram search with the provided name. The search can be activated using ps_activate_search().
POCKETSPHINX_EXPORT int ps_add_lm_file | ( | ps_decoder_t * | ps, |
const char * | name, | ||
const char * | path | ||
) |
Adds new search based on N-gram language model.
Convenience method to load N-gram model and create a search.
POCKETSPHINX_EXPORT fsg_model_t * ps_get_fsg | ( | ps_decoder_t * | ps, |
const char * | name | ||
) |
Get the finite-state grammar set object associated with a search.
Returns NULL if name does not correspond to an FSG search.
POCKETSPHINX_EXPORT int ps_add_fsg | ( | ps_decoder_t * | ps, |
const char * | name, | ||
fsg_model_t * | fsg | ||
) |
Adds new search based on finite state grammar.
Associates FSG search with the provided name. The search can be activated using ps_activate_search().
POCKETSPHINX_EXPORT int ps_add_jsgf_file | ( | ps_decoder_t * | ps, |
const char * | name, | ||
const char * | path | ||
) |
Adds new search using JSGF model.
Convenience method to load JSGF model and create a search.
POCKETSPHINX_EXPORT int ps_add_jsgf_string | ( | ps_decoder_t * | ps, |
const char * | name, | ||
const char * | jsgf_string | ||
) |
Adds new search using JSGF model.
Convenience method to parse JSGF model from string and create a search.
const POCKETSPHINX_EXPORT char * ps_get_kws | ( | ps_decoder_t * | ps, |
const char * | name | ||
) |
Get the keyphrase associated with a KWS search
Returns NULL if name does not correspond to a KWS search.
POCKETSPHINX_EXPORT int ps_add_kws | ( | ps_decoder_t * | ps, |
const char * | name, | ||
const char * | keyfile | ||
) |
Adds keyphrases from a file for spotting.
Associates KWS search with the provided name. The search can be activated using ps_activate_search().
POCKETSPHINX_EXPORT int ps_add_keyphrase | ( | ps_decoder_t * | ps, |
const char * | name, | ||
const char * | keyphrase | ||
) |
Adds new keyphrase to spot
Associates KWS search with the provided name. The search can be activated using ps_activate_search().
POCKETSPHINX_EXPORT int ps_add_allphone | ( | ps_decoder_t * | ps, |
const char * | name, | ||
ngram_model_t * | lm | ||
) |
Adds new search based on phone N-gram language model.
Associates N-gram search with the provided name. The search can be activated using ps_activate_search().
POCKETSPHINX_EXPORT int ps_add_allphone_file | ( | ps_decoder_t * | ps, |
const char * | name, | ||
const char * | path | ||
) |
Adds new search based on phone N-gram language model.
Convenience method to load N-gram model and create a search.
POCKETSPHINX_EXPORT int ps_set_align_text | ( | ps_decoder_t * | ps, |
const char * | words | ||
) |
Set up decoder to force-align a word sequence.
Unlike the ps_add_* functions, this activates the search module immediately, since force-alignment is nearly always a single shot. Currently "under the hood" this is an FSG search but you shouldn't depend on that.
Decoding proceeds as normal, though only this word sequence will be recognized, with silences and alternate pronunciations inserted. Word alignments are available with ps_seg_iter(). To obtain phoneme or state segmentations, you must subsequently call ps_set_alignment() and re-run decoding. It's tough son, but it's life.
ps | Decoder |
words | String containing whitespace-separated words for alignment. These words are assumed to exist in the current dictionary. |
POCKETSPHINX_EXPORT int ps_set_alignment | ( | ps_decoder_t * | ps, |
ps_alignment_t * | al | ||
) |
Set up decoder to run phone and state-level alignment.
Unlike the ps_add_* functions, this activates the search module immediately, since force-alignment is nearly always a single shot.
To align, run or re-run decoding as usual, then call ps_get_alignment() to get the resulting alignment. Note that if you call this function before rerunning decoding, you can obtain the phone and state sequence, but the durations will be invalid (phones and states will inherit the parent word's duration).
ps | Decoder object. |
al | Usually NULL, which means to construct an alignment from the current search hypothesis (this does not work with allphone or keyword spotting). You can also pass a ps_alignment_t here if you have one. The search will retain but not copy it, so after running decoding it will be updated with new durations. You can set starts and durations for words or phones (not states) to constrain the alignment. |
POCKETSPHINX_EXPORT ps_alignment_t * ps_get_alignment | ( | ps_decoder_t * | ps | ) |
Get the alignment associated with the current search module.
As noted above, if decoding has not been run, this will contain invalid durations, but that may still be useful if you just want to know the state sequence.
POCKETSPHINX_EXPORT int ps_reinit | ( | ps_decoder_t * | ps, |
ps_config_t * | config | ||
) |
Reinitialize the decoder with updated configuration.
This function allows you to switch the acoustic model, dictionary, or other configuration without creating an entirely new decoding object.
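Note: since the acoustic model will be reloaded, changes made to feature extraction parameters may be overridden if a feat.params file is present.
Note: the decoder retains ownership of the pointer config, so you should free it when no longer used.
ps | Decoder. |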
config | An optional new configuration to use. If this is NULL, the previous configuration will be reloaded, with any changes applied. |
POCKETSPHINX_EXPORT int ps_reinit_feat | ( | ps_decoder_t * | ps, |
ps_config_t * | config | ||
) |
Reinitialize only the feature computation with updated configuration.
This function allows you to switch the feature computation parameters without otherwise affecting the decoder configuration. For example, if you change the sample rate or the frame rate, and do not want to reconfigure the rest of the decoder.
Note that if you have set a custom cepstral mean with ps_set_cmn(), it will be overridden.
Note: the decoder retains ownership of the pointer config, so you should free it when no longer used.
ps | Decoder. |
config | An optional new configuration to use. If this is NULL, the previous configuration will be reloaded, with any changes to feature computation applied. |
const POCKETSPHINX_EXPORT char * ps_get_cmn | ( | ps_decoder_t * | ps, |
int | update | ||
) |
Get the current cepstral mean as a string.
This is the string representation of the current cepstral mean, which represents the acoustic channel conditions in live recognition. This can be used to initialize the decoder with the cmninit option, e.g.:
config = ps_config_parse_json(NULL, "cmninit: 42,-1,0");
ps | Decoder |
update | Update the cepstral mean using data processed so far. |
Returns a string representation of the cepstral mean, as ps_config_get_int(config, "ceplen") comma-separated numbers. This pointer is owned by the decoder and only valid until the next call to ps_get_cmn(), ps_set_cmn() or ps_end_utt().
POCKETSPHINX_EXPORT int ps_set_cmn | ( | ps_decoder_t * | ps, |
const char * | cmn | ||
) |
Set the current cepstral mean from a string.
This does the same thing as setting cmninit with ps_config_set_string() and running ps_reinit_feat(), but is more efficient, and can also be done in the middle of an utterance if you like.
ps | Decoder |
cmn | String representation of cepstral mean, as up to ps_config_get_int(config, "ceplen") comma-separated numbers (any missing values will be zero-filled). |
POCKETSPHINX_EXPORT ps_decoder_t * ps_retain | ( | ps_decoder_t * | ps | ) |
Retain a pointer to the decoder.
This increments the reference count on the decoder, allowing it to be shared between multiple parent objects. In general you will not need to use this function, ever. It is mainly here for the convenience of scripting language bindings.
POCKETSPHINX_EXPORT int ps_free | ( | ps_decoder_t * | ps | ) |
POCKETSPHINX_EXPORT ps_config_t * ps_get_config | ( | ps_decoder_t * | ps | ) |
Get the configuration object for this decoder.
POCKETSPHINX_EXPORT logmath_t * ps_get_logmath | ( | ps_decoder_t * | ps | ) |
Get the log-math computation object for this decoder.
POCKETSPHINX_EXPORT ps_mllr_t * ps_update_mllr | ( | ps_decoder_t * | ps, |
ps_mllr_t * | mllr | ||
) |
Adapt current acoustic model using a linear transform.
mllr | The new transform to use, or NULL to update the existing transform. The decoder retains ownership of this pointer, so you may free it if you no longer need it. |
POCKETSPHINX_EXPORT int ps_load_dict | ( | ps_decoder_t * | ps, |
char const * | dictfile, | ||
char const * | fdictfile, | ||
char const * | format | ||
) |
Reload the pronunciation dictionary from a file.
This function replaces the current pronunciation dictionary with the one stored in dictfile. This also causes the active search module(s) to be reinitialized, in the same manner as calling ps_add_word() with update=TRUE.
dictfile | Path to dictionary file to load. |
fdictfile | Path to filler dictionary to load, or NULL to keep the existing filler dictionary. |
format | Format of the dictionary file, or NULL to determine automatically (currently unused, should be NULL) |
POCKETSPHINX_EXPORT int ps_save_dict | ( | ps_decoder_t * | ps, |
char const * | dictfile, | ||
char const * | format | ||
) |
Dump the current pronunciation dictionary to a file.
This function dumps the current pronunciation dictionary to a text file.
dictfile | Path to file where dictionary will be written. |
format | Format of the dictionary file, or NULL for the default (text) format (currently unused, should be NULL) |
POCKETSPHINX_EXPORT int ps_add_word | ( | ps_decoder_t * | ps, |
char const * | word, | ||
char const * | phones, | ||
int | update | ||
) |
Add a word to the pronunciation dictionary.
This function adds a word to the pronunciation dictionary and the current language model (but, obviously, not to the current FSG if FSG mode is enabled). If the word is already present in one or the other, it does whatever is necessary to ensure that the word can be recognized.
word | Word string to add. |
phones | Whitespace-separated list of phoneme strings describing pronunciation of word . |
update | If TRUE, update the search module (whichever one is currently active) to recognize the newly added word. If adding multiple words, it is more efficient to pass FALSE here in all but the last word. |
POCKETSPHINX_EXPORT char * ps_lookup_word | ( | ps_decoder_t * | ps, |
const char * | word | ||
) |
Look up a word in the dictionary and return phone transcription for it.
ps | Pocketsphinx decoder |
word | Word to look for |
Returns the phone transcription of word, or NULL if word is not present in the dictionary. The string is allocated and must be freed by the user.
POCKETSPHINX_EXPORT long ps_decode_raw | ( | ps_decoder_t * | ps, |
FILE * | rawfh, | ||
long | maxsamps | ||
) |
Decode a raw audio stream.
No headers are recognized in this file. The configuration parameters -samprate and -input_endian are used to determine the sampling rate and endianness of the stream, respectively. Audio is always assumed to be 16-bit signed PCM.
ps | Decoder. |
rawfh | Previously opened file stream. |
maxsamps | Maximum number of samples to read from rawfh, or -1 to read until end-of-file. |
POCKETSPHINX_EXPORT int ps_decode_senscr | ( | ps_decoder_t * | ps, |
FILE * | senfh | ||
) |
Decode a senone score dump file.
ps | Decoder |
senfh | Previously opened file handle positioned at start of file. |
POCKETSPHINX_EXPORT int ps_start_stream | ( | ps_decoder_t * | ps | ) |
Start processing of the stream of speech.
POCKETSPHINX_EXPORT int ps_get_in_speech | ( | ps_decoder_t * | ps | ) |
Check in-speech status of decoder.
ps | Decoder. |
POCKETSPHINX_EXPORT int ps_start_utt | ( | ps_decoder_t * | ps | ) |
POCKETSPHINX_EXPORT int ps_process_raw | ( | ps_decoder_t * | ps, |
int16 const * | data, | ||
size_t | n_samples, | ||
int | no_search, | ||
int | full_utt | ||
) |
Decode raw audio data.
ps | Decoder. |
data | Audio data, as 16-bit linear PCM. |
n_samples | Number of samples (not bytes) in data . |
no_search | If non-zero, perform feature extraction but don't do any recognition yet. This may be necessary if your processor has trouble doing recognition in real-time. |
full_utt | If non-zero, this block of data is a full utterance worth of data. This may allow the recognizer to produce more accurate results. |
POCKETSPHINX_EXPORT int ps_process_cep | ( | ps_decoder_t * | ps, |
float32 ** | data, | ||
int | n_frames, | ||
int | no_search, | ||
int | full_utt | ||
) |
Decode acoustic feature data.
ps | Decoder. |
data | Acoustic feature data, a 2-dimensional array of 32-bit floating-point values. Note that this is not a standard 2-dimensional C array but rather an array of pointers to floats, each of which is one vector (or frame) of ps_config_get_int(config, "ceplen") values. |
n_frames | Number of vectors in data . |
no_search | If non-zero, perform feature extraction but don't do any recognition yet. This may be necessary if your processor has trouble doing recognition in real-time. |
full_utt | If non-zero, this block of data is a full utterance worth of data. This may allow the recognizer to produce more accurate results. |
POCKETSPHINX_EXPORT int ps_get_n_frames | ( | ps_decoder_t * | ps | ) |
Get the number of frames of data searched.
Note that there is a delay between this and the number of frames of audio which have been input to the system. This is due to the fact that acoustic features are computed using a sliding window of audio, and dynamic features are computed over a sliding window of acoustic features.
ps | Decoder. |
POCKETSPHINX_EXPORT int ps_end_utt | ( | ps_decoder_t * | ps | ) |
POCKETSPHINX_EXPORT const char * ps_get_hyp | ( | ps_decoder_t * | ps, |
int32 * | out_best_score | ||
) |
Get hypothesis string and path score.
ps | Decoder. |
out_best_score | Output: path score corresponding to returned string. |
POCKETSPHINX_EXPORT int32 ps_get_prob | ( | ps_decoder_t * | ps | ) |
Get posterior probability.
ps | Decoder. |
POCKETSPHINX_EXPORT ps_lattice_t * ps_get_lattice | ( | ps_decoder_t * | ps | ) |
Get word lattice.
ps | Decoder. |
POCKETSPHINX_EXPORT ps_seg_t * ps_seg_iter | ( | ps_decoder_t * | ps | ) |
Get an iterator over the word segmentation for the best hypothesis.
ps | Decoder. |
POCKETSPHINX_EXPORT ps_nbest_t * ps_nbest | ( | ps_decoder_t * | ps | ) |
Get an iterator over the best hypotheses. The function may also return a NULL which means that there is no hypothesis available for this utterance.
ps | Decoder. |
POCKETSPHINX_EXPORT void ps_get_utt_time | ( | ps_decoder_t * | ps, |
double * | out_nspeech, | ||
double * | out_ncpu, | ||
double * | out_nwall | ||
) |
Get performance information for the current utterance.
ps | Decoder. |
out_nspeech | Output: Number of seconds of speech. |
out_ncpu | Output: Number of seconds of CPU time used. |
out_nwall | Output: Number of seconds of wall time used. |
POCKETSPHINX_EXPORT void ps_get_all_time | ( | ps_decoder_t * | ps, |
double * | out_nspeech, | ||
double * | out_ncpu, | ||
double * | out_nwall | ||
) |
Get overall performance information.
ps | Decoder. |
out_nspeech | Output: Number of seconds of speech. |
out_ncpu | Output: Number of seconds of CPU time used. |
out_nwall | Output: Number of seconds of wall time used. |