Speech recognition with live audio input and endpointing.
This file shows how to use PocketSphinx with microphone input using the Win32 Waveform Audio API (the only one of many terrible audio APIs on Windows that isn't made even more terrible by requiring you to use C++ in an unmanaged environment).
To build it, you should be able to find a "live_win32" target in your favorite IDE after running CMake - in Visual Studio Code, look in the "CMake" tab.
Microphones on Windows tend to be miscalibrated with the recording level set much too high by default, so the endpointer may give a lot of false positives at first. Programs like Audacity seem to work around this somehow, but I don't really know how they do it.
#include <windows.h>
#include <mmsystem.h>

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <pocketsphinx.h>
/*
 * Set to 1 by catch_sig() to ask the capture loop in main() to stop.
 * It is written from a signal handler and polled by the main loop, so it
 * must be a volatile sig_atomic_t: that is the only object type a handler
 * may portably store to, and volatile keeps the loop from caching the value.
 */
static volatile sig_atomic_t global_done = 0;

/*
 * Signal handler: request shutdown.  It only stores to the flag, which keeps
 * it safe to run asynchronously with respect to the main loop.
 */
static void
catch_sig(int signum)
{
    (void)signum; /* the same action is taken whatever signal arrives */
    global_done = 1;
}
/*
 * Evaluate a waveIn* call and report a failure through E_FATAL.
 *
 * expr is evaluated exactly once.  A non-zero result is treated as an error:
 * its description is fetched with waveInGetErrorText() and logged together
 * with the numeric code.  The argument is parenthesized and the result is
 * kept in an unsigned MMRESULT with a macro-private name, so the expansion
 * neither depends on operator precedence in expr nor shadows a caller's
 * variable called "err", and the %08x conversion matches the argument type.
 */
#define CHECK(expr)                                                     \
    do {                                                                \
        MMRESULT check_err_ = (expr);                                   \
        if (check_err_ != 0) {                                          \
            char errbuf[MAXERRORLENGTH];                                \
            waveInGetErrorText(check_err_, errbuf, sizeof(errbuf));     \
            E_FATAL("error %08x: %s\n", check_err_, errbuf);            \
        }                                                               \
    } while (0)
int main(int argc, char *argv[])
{
size_t frame_size;
HWAVEIN wavein;
WAVEFORMATEX wavefmt;
HANDLE event;
#define NBUF 100
WAVEHDR hdrs[NBUF];
int i;
(void)argc; (void)argv;
config = ps_config_init(NULL);
ps_default_search_args(config);
if ((decoder = ps_init(config)) == NULL)
E_FATAL(
"PocketSphinx decoder init failed\n");
if ((ep = ps_endpointer_init(0, 0.0, 0,
ps_config_int(config, "samprate"),
0)) == NULL)
E_FATAL(
"PocketSphinx endpointer init failed\n");
wavefmt.wFormatTag = WAVE_FORMAT_PCM;
wavefmt.nChannels = 1;
wavefmt.wBitsPerSample = 16;
wavefmt.nBlockAlign = 2;
wavefmt.nAvgBytesPerSec = wavefmt.nSamplesPerSec * wavefmt.nBlockAlign;
wavefmt.cbSize = 0;
event = CreateEvent(NULL, TRUE, FALSE, "buffer_ready");
CHECK(waveInOpen(&wavein, WAVE_MAPPER, &wavefmt,
(DWORD_PTR)event, 0, CALLBACK_EVENT));
memset(hdrs, 0, sizeof(hdrs));
for (i = 0; i < NBUF; ++i) {
hdrs[i].lpData = malloc(frame_size * 2);
hdrs[i].dwBufferLength = (DWORD)frame_size * 2;
CHECK(waveInPrepareHeader(wavein, &hdrs[i], sizeof(hdrs[i])));
CHECK(waveInAddBuffer(wavein, &hdrs[i], sizeof(hdrs[i])));
}
CHECK(waveInStart(wavein));
i = 0;
if (signal(SIGINT, catch_sig) == SIG_ERR)
while (!global_done) {
const int16 *speech;
WaitForSingleObject(event, INFINITE);
while (hdrs[i].dwFlags & WHDR_DONE) {
int prev_in_speech = ps_endpointer_in_speech(ep);
int16 *frame = (int16 *)hdrs[i].lpData;
speech = ps_endpointer_process(ep, frame);
CHECK(waveInUnprepareHeader(wavein, &hdrs[i], sizeof(hdrs[i])));
CHECK(waveInPrepareHeader(wavein, &hdrs[i], sizeof(hdrs[i])));
CHECK(waveInAddBuffer(wavein, &hdrs[i], sizeof(hdrs[i])));
if (++i == NBUF)
i = 0;
if (speech != NULL) {
const char *hyp;
if (!prev_in_speech) {
fprintf(stderr, "Speech start at %.2f\n",
ps_endpointer_speech_start(ep));
fflush(stderr);
ps_start_utt(decoder);
}
if (ps_process_raw(decoder, speech, frame_size, FALSE, FALSE) < 0)
E_FATAL(
"ps_process_raw() failed\n");
if ((hyp = ps_get_hyp(decoder, NULL)) != NULL) {
fprintf(stderr, "PARTIAL RESULT: %s\n", hyp);
fflush(stderr);
}
if (!ps_endpointer_in_speech(ep)) {
fprintf(stderr, "Speech end at %.2f\n",
ps_endpointer_speech_end(ep));
fflush(stderr);
ps_end_utt(decoder);
if ((hyp = ps_get_hyp(decoder, NULL)) != NULL) {
printf("%s\n", hyp);
fflush(stdout);
}
}
}
}
ResetEvent(event);
}
CHECK(waveInStop(wavein));
CHECK(waveInReset(wavein));
for (i = 0; i < NBUF; ++i) {
if (hdrs[i].dwFlags & WHDR_PREPARED)
CHECK(waveInUnprepareHeader(wavein, &hdrs[i],
sizeof(hdrs[i])));
free(hdrs[i].lpData);
}
CloseHandle(event);
ps_endpointer_free(ep);
ps_free(decoder);
ps_config_free(config);
return 0;
}
#define ps_endpointer_sample_rate(ep)
Definition endpointer.h:163
#define ps_endpointer_frame_size(ep)
Definition endpointer.h:153
#define E_FATAL(...)
Definition err.h:80
#define E_FATAL_SYSTEM(...)
Definition err.h:89
Speech recognizer object.
Simple voice activity detection based endpointing.