PocketSphinx  5.0.0
A small speech recognizer
live.c

Speech recognition with live audio input and endpointing. This file shows how to use PocketSphinx in conjunction with sox to detect and recognize speech from the default audio input device.

This file shows how to use PocketSphinx to recognize speech from live audio input. To compile it, assuming you have built the library as in these directions, you can run:

cmake --build build --target live

Alternately, if PocketSphinx is installed system-wide, you can run:

gcc -o live live.c $(pkg-config --libs --cflags pocketsphinx)
/* Example of simple PocketSphinx speech segmentation.
*
* MIT license (c) 2022, see LICENSE for more information.
*
* Author: David Huggins-Daines <dhdaines@gmail.com>
*/
#include <pocketsphinx.h>
#include <signal.h>
/*
 * Set to 1 by the SIGINT handler to ask the main loop to stop.
 *
 * Declared volatile sig_atomic_t because that is the only object type the
 * C standard allows an asynchronous signal handler to write to; a plain
 * int may also be cached by the compiler across loop iterations.
 */
static volatile sig_atomic_t global_done = 0;

/* Signal handler: record that termination was requested. */
static void
catch_sig(int signum)
{
    (void)signum; /* The signal number itself is not needed. */
    global_done = 1;
}
/* Command template used to launch sox; %d receives the sample rate. */
#define SOXCMD "sox -q -r %d -c 1 -b 16 -e signed-integer -d -t raw -"

/*
 * Launch sox and return a stream for reading its standard output.
 *
 * The command line is built from SOXCMD with the given sample rate and
 * started with popen() in "r" mode.
 *
 * sample_rate: value substituted for the -r option of the sox command.
 *
 * Returns the stream obtained from popen(); the caller is responsible for
 * closing it with pclose().  Every failure is reported through
 * E_FATAL_SYSTEM.
 */
static FILE *
popen_sox(int sample_rate)
{
    char *soxcmd;
    int len;
    FILE *sox;

    /* First pass writes nothing; it only measures the formatted length. */
    len = snprintf(NULL, 0, SOXCMD, sample_rate);
    /* snprintf() returns a negative value on error; without this check
     * the second call would "match" and an uninitialised buffer would be
     * handed to popen(). */
    if (len < 0)
        E_FATAL_SYSTEM("snprintf() failed");
    if ((soxcmd = malloc((size_t)len + 1)) == NULL)
        E_FATAL_SYSTEM("Failed to allocate string");
    if (snprintf(soxcmd, (size_t)len + 1, SOXCMD, sample_rate) != len)
        E_FATAL_SYSTEM("snprintf() failed");
    if ((sox = popen(soxcmd, "r")) == NULL)
        E_FATAL_SYSTEM("Failed to popen(%s)", soxcmd);
    /* The command string is no longer needed once the process is started. */
    free(soxcmd);
    return sox;
}
int
main(int argc, char *argv[])
{
ps_decoder_t *decoder;
ps_config_t *config;
FILE *sox;
short *frame;
size_t frame_size;
(void)argc; (void)argv;
config = ps_config_init(NULL);
if ((decoder = ps_init(config)) == NULL)
E_FATAL("PocketSphinx decoder init failed\n");
if ((ep = ps_endpointer_init(0, 0.0, 0, 0, 0)) == NULL)
E_FATAL("PocketSphinx endpointer init failed\n");
sox = popen_sox(ps_endpointer_sample_rate(ep));
frame_size = ps_endpointer_frame_size(ep);
if ((frame = malloc(frame_size * sizeof(frame[0]))) == NULL)
E_FATAL_SYSTEM("Failed to allocate frame");
if (signal(SIGINT, catch_sig) == SIG_ERR)
E_FATAL_SYSTEM("Failed to set SIGINT handler");
while (!global_done) {
const int16 *speech;
int prev_in_speech = ps_endpointer_in_speech(ep);
size_t len, end_samples;
if ((len = fread(frame, sizeof(frame[0]),
frame_size, sox)) != frame_size) {
if (len > 0) {
speech = ps_endpointer_end_stream(ep, frame,
frame_size,
&end_samples);
}
else
break;
} else {
speech = ps_endpointer_process(ep, frame);
}
if (speech != NULL) {
const char *hyp;
if (!prev_in_speech) {
fprintf(stderr, "Speech start at %.2f\n",
ps_start_utt(decoder);
}
if (ps_process_raw(decoder, speech, frame_size, FALSE, FALSE) < 0)
E_FATAL("ps_process_raw() failed\n");
if ((hyp = ps_get_hyp(decoder, NULL)) != NULL)
fprintf(stderr, "PARTIAL RESULT: %s\n", hyp);
fprintf(stderr, "Speech end at %.2f\n",
ps_end_utt(decoder);
if ((hyp = ps_get_hyp(decoder, NULL)) != NULL)
printf("%s\n", hyp);
}
}
}
free(frame);
if (pclose(sox) < 0)
E_ERROR_SYSTEM("Failed to pclose(sox)");
ps_free(decoder);
ps_config_free(config);
return 0;
}
ps_decoder_t::ps_process_raw
POCKETSPHINX_EXPORT int ps_process_raw(ps_decoder_t *ps, int16 const *data, size_t n_samples, int no_search, int full_utt)
ps_config_t::ps_config_init
POCKETSPHINX_EXPORT ps_config_t * ps_config_init(const ps_arg_t *defn)
ps_decoder_t::ps_get_hyp
POCKETSPHINX_EXPORT const char * ps_get_hyp(ps_decoder_t *ps, int32 *out_best_score)
ps_endpointer_t::ps_endpointer_process
const POCKETSPHINX_EXPORT int16 * ps_endpointer_process(ps_endpointer_t *ep, const int16 *frame)
ps_endpointer_t::ps_endpointer_free
POCKETSPHINX_EXPORT int ps_endpointer_free(ps_endpointer_t *ep)
ps_endpointer_t::ps_endpointer_speech_start
POCKETSPHINX_EXPORT double ps_endpointer_speech_start(ps_endpointer_t *ep)
ps_config_t::ps_config_free
POCKETSPHINX_EXPORT int ps_config_free(ps_config_t *config)
E_ERROR_SYSTEM
#define E_ERROR_SYSTEM(...)
Definition: err.h:98
ps_decoder_t::ps_free
POCKETSPHINX_EXPORT int ps_free(ps_decoder_t *ps)
ps_config_t
configuration object.
E_FATAL_SYSTEM
#define E_FATAL_SYSTEM(...)
Definition: err.h:89
ps_endpointer_t::ps_endpointer_in_speech
POCKETSPHINX_EXPORT int ps_endpointer_in_speech(ps_endpointer_t *ep)
ps_endpointer_t::ps_endpointer_end_stream
const POCKETSPHINX_EXPORT int16 * ps_endpointer_end_stream(ps_endpointer_t *ep, const int16 *frame, size_t nsamp, size_t *out_nsamp)
pocketsphinx.h
ps_decoder_t::ps_start_utt
POCKETSPHINX_EXPORT int ps_start_utt(ps_decoder_t *ps)
ps_config_t::ps_init
POCKETSPHINX_EXPORT ps_decoder_t * ps_init(ps_config_t *config)
ps_endpointer_t::ps_endpointer_init
POCKETSPHINX_EXPORT ps_endpointer_t * ps_endpointer_init(double window, double ratio, ps_vad_mode_t mode, int sample_rate, double frame_length)
ps_decoder_t
Speech recognizer object.
ps_decoder_t::ps_end_utt
POCKETSPHINX_EXPORT int ps_end_utt(ps_decoder_t *ps)
ps_config_t::ps_default_search_args
POCKETSPHINX_EXPORT void ps_default_search_args(ps_config_t *config)
E_FATAL
#define E_FATAL(...)
Definition: err.h:80
ps_endpointer_t
Simple voice activity detection based endpointing.
ps_endpointer_frame_size
#define ps_endpointer_frame_size(ep)
Definition: endpointer.h:139
ps_endpointer_t::ps_endpointer_speech_end
POCKETSPHINX_EXPORT double ps_endpointer_speech_end(ps_endpointer_t *ep)
ps_endpointer_sample_rate
#define ps_endpointer_sample_rate(ep)
Definition: endpointer.h:149