PocketSphinx 5.1.0rc2
A small speech recognizer
Loading...
Searching...
No Matches
live_win32.c

Speech recognition with live audio input and endpointing.

Speech recognition with live audio input and endpointing.This file shows how to use PocketSphinx with microphone input using the Win32 Waveform Audio API (the only one of many terrible audio APIs on Windows that isn't made even more terrible by requiring you to use C++ in an unmanaged environment).

To build it, you should be able to find a "live_win32" target in your favorite IDE after running CMake - in Visual Studio Code, look in the "CMake" tab.

Microphones on Windows tend to be miscalibrated with the recording level set much too high by default, so the endpointer may give a lot of false positives at first. Programs like Audacity seem to work around this somehow, but I don't really know how they do it.

/* Example of simple PocketSphinx speech segmentation.
*
* MIT license (c) 2022, see LICENSE for more information.
*
* Author: David Huggins-Daines <dhdaines@gmail.com>
*/
#include <windows.h>
#include <mmsystem.h>
#include <pocketsphinx.h>
#include <signal.h>
static int global_done = 0;
static void
catch_sig(int signum)
{
(void)signum;
global_done = 1;
}
#define CHECK(expr) \
do { \
int err; \
if ((err = expr) != 0) \
{ \
char errbuf[MAXERRORLENGTH]; \
waveInGetErrorText(err, errbuf, sizeof(errbuf)); \
E_FATAL("error %08x: %s\n", err, errbuf); \
} \
} while (0)
int main(int argc, char *argv[])
{
ps_decoder_t *decoder;
ps_config_t *config;
size_t frame_size;
HWAVEIN wavein;
WAVEFORMATEX wavefmt;
HANDLE event;
/* A large but somewhat arbitrary number of buffers. */
#define NBUF 100 /* 100 * 0.03 = 3 seconds */
WAVEHDR hdrs[NBUF];
int i;
(void)argc; (void)argv;
/* Initialize decoder and endpointer */
config = ps_config_init(NULL);
ps_default_search_args(config);
if ((decoder = ps_init(config)) == NULL)
E_FATAL("PocketSphinx decoder init failed\n");
if ((ep = ps_endpointer_init(0, 0.0, 0,
ps_config_int(config, "samprate"),
0)) == NULL)
E_FATAL("PocketSphinx endpointer init failed\n");
/* Frame size in samples (not bytes) */
frame_size = ps_endpointer_frame_size(ep);
/* Tell Windows what format we want (NOTE: may not be available...) */
wavefmt.wFormatTag = WAVE_FORMAT_PCM;
wavefmt.nChannels = 1;
wavefmt.nSamplesPerSec = ps_endpointer_sample_rate(ep);
wavefmt.wBitsPerSample = 16;
wavefmt.nBlockAlign = 2;
wavefmt.nAvgBytesPerSec = wavefmt.nSamplesPerSec * wavefmt.nBlockAlign;
wavefmt.cbSize = 0;
/* Create an event to tell us when a new buffer is ready. */
event = CreateEvent(NULL, TRUE, FALSE, "buffer_ready");
/* Open the recording device. */
CHECK(waveInOpen(&wavein, WAVE_MAPPER, &wavefmt,
(DWORD_PTR)event, 0, CALLBACK_EVENT));
/* Create buffers. */
memset(hdrs, 0, sizeof(hdrs));
for (i = 0; i < NBUF; ++i) {
hdrs[i].lpData = malloc(frame_size * 2);
hdrs[i].dwBufferLength = (DWORD)frame_size * 2;
CHECK(waveInPrepareHeader(wavein, &hdrs[i], sizeof(hdrs[i])));
CHECK(waveInAddBuffer(wavein, &hdrs[i], sizeof(hdrs[i])));
}
/* Start recording. */
CHECK(waveInStart(wavein));
i = 0;
if (signal(SIGINT, catch_sig) == SIG_ERR)
E_FATAL_SYSTEM("Failed to set SIGINT handler");
while (!global_done) {
const int16 *speech;
WaitForSingleObject(event, INFINITE);
/* Get as many buffers as we can. */
while (hdrs[i].dwFlags & WHDR_DONE) {
int prev_in_speech = ps_endpointer_in_speech(ep);
int16 *frame = (int16 *)hdrs[i].lpData;
/* Process them one by one. */
speech = ps_endpointer_process(ep, frame);
CHECK(waveInUnprepareHeader(wavein, &hdrs[i], sizeof(hdrs[i])));
CHECK(waveInPrepareHeader(wavein, &hdrs[i], sizeof(hdrs[i])));
CHECK(waveInAddBuffer(wavein, &hdrs[i], sizeof(hdrs[i])));
if (++i == NBUF)
i = 0;
if (speech != NULL) {
const char *hyp;
if (!prev_in_speech) {
fprintf(stderr, "Speech start at %.2f\n",
ps_endpointer_speech_start(ep));
fflush(stderr); /* For broken MSYS2 terminal */
ps_start_utt(decoder);
}
if (ps_process_raw(decoder, speech, frame_size, FALSE, FALSE) < 0)
E_FATAL("ps_process_raw() failed\n");
if ((hyp = ps_get_hyp(decoder, NULL)) != NULL) {
fprintf(stderr, "PARTIAL RESULT: %s\n", hyp);
fflush(stderr);
}
if (!ps_endpointer_in_speech(ep)) {
fprintf(stderr, "Speech end at %.2f\n",
ps_endpointer_speech_end(ep));
fflush(stderr);
ps_end_utt(decoder);
if ((hyp = ps_get_hyp(decoder, NULL)) != NULL) {
printf("%s\n", hyp);
fflush(stdout);
}
}
}
}
/* Wait for another buffer. */
ResetEvent(event);
}
/* Stop recording, cancel all buffers, and free them. */
CHECK(waveInStop(wavein));
CHECK(waveInReset(wavein));
for (i = 0; i < NBUF; ++i) {
if (hdrs[i].dwFlags & WHDR_PREPARED)
CHECK(waveInUnprepareHeader(wavein, &hdrs[i],
sizeof(hdrs[i])));
free(hdrs[i].lpData);
}
CloseHandle(event);
ps_endpointer_free(ep);
ps_free(decoder);
ps_config_free(config);
return 0;
}
#define ps_endpointer_sample_rate(ep)
Definition endpointer.h:163
#define ps_endpointer_frame_size(ep)
Definition endpointer.h:153
#define E_FATAL(...)
Definition err.h:80
#define E_FATAL_SYSTEM(...)
Definition err.h:89
configuration object.
Speech recognizer object.
Simple voice activity detection based endpointing.