如何让 TTS 立即停止说话?

How do I make TTS stop speaking IMMEDIATELY?

本文关键字：TTS 更新时间：2023-10-16

我正在使用ISpVoice来说出输入字符串。现在，即使我在Speak方法中使用了SPF_ASYNC和SPF_PURGEBEFORESPEAK标记，tts 也不会在调用Pause时停止，而是继续，直到 tts 完成一个单词。

这是我的做法：

void speakSentence()
{
pVoice->Pause();
pVoice->Speak(L"This is a sentence.", SPF_ASYNC | SPF_PURGEBEFORESPEAK, NULL);
pVoice->Resume();
}

每当我尝试在单词"句子"的中间调用此函数时，tts 都不会暂停，而是继续说出单词直到最后。

来自微软文档：

ISpVoice：:P ause 在最近的警报边界暂停语音并关闭输出设备，允许访问来自其他语音的待处理语音请求。

我尝试通过以下方式更改警报边界：

pVoice->SetAlertBoundary(SPEI_PHONEME);

而且它不起作用。

有NVDA屏幕阅读器解决了这个问题，但我不知道他们是如何做到的。

有没有办法解决我的问题？

编辑：这是我的完整代码。我正在创建一个同时使用 UIAutomation 和 MSAA 的小型屏幕阅读器程序。在比较 UI 对象时，该程序可能有些不稳定，但大多数时候它可以工作。

Screeenreader.h：

#ifndef _SCREENREADER_H_
#define _SCREENREADER_H_
#define WIN32_LEAN_AND_MEAN
#ifndef UNICODE
#define UNICODE
#endif
#include <windows.h>
#include <memory>
#include "speechsynthesis.h"
#include "uiautomator.h"
class ScreenReader
{
public:
explicit ScreenReader(HINSTANCE hInstance, HINSTANCE hPrevInstance, PSTR pScmdline, int iCmdShow);
virtual ~ScreenReader();
LRESULT CALLBACK MessageHandler(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam);
int Exec();
private:
void InitializeWindows();
void InitRawInputDevices();
bool IsMouseMove();
private:
LPCWSTR m_applicationName;
HINSTANCE m_hInstance;
HINSTANCE m_hPrevInstance;
PSTR m_pScmdline;
int m_iCmdShow;
HWND m_hWnd;
SpeechSynthesis *m_pSpeech;
UIAutomator *m_pAutomator;
RAWINPUTDEVICE rid[2];
LONG m_prevMouseX;
LONG m_prevMouseY;
BSTR currItem;
};
static ScreenReader *application;
static LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam);
#endif

屏幕阅读器.cpp：在这一部分中，我在消息循环部分调用了ISpVoice。在ScreenReader::MessageHandler()IsMouseMove条件下起作用。

#include "screenreader.h"

ScreenReader::ScreenReader(HINSTANCE hInstance, HINSTANCE hPrevInstance, PSTR pScmdline, int iCmdShow)
{
CoInitialize(NULL);
m_pSpeech = new SpeechSynthesis;
m_pAutomator = new UIAutomator;
// Get current Cursor position.
POINT pt;
GetCursorPos(&pt);
m_prevMouseX = pt.x;
m_prevMouseY = pt.y;
// Notify user the program is loading.
m_pSpeech->Speak(L"Loading Rescan. Please wait.", SPF_DEFAULT, NULL);
m_hInstance = hInstance;
m_hPrevInstance = hPrevInstance;
m_pScmdline = pScmdline;
m_iCmdShow = iCmdShow;
application = this;
InitializeWindows();
InitRawInputDevices();
}

ScreenReader::~ScreenReader()
{
if (m_pSpeech != nullptr)
{
delete m_pSpeech;
m_pSpeech = nullptr;
}
if (m_pAutomator != nullptr)
{
delete m_pAutomator;
m_pAutomator = nullptr;
}
if (currItem != NULL)
{
SysFreeString(currItem);
currItem = NULL;
}
CoUninitialize();
}
LRESULT CALLBACK ScreenReader::MessageHandler(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
{
switch (message)
{
case WM_INPUT:
{
UINT dwSize;
GetRawInputData(
(HRAWINPUT)lParam,
RID_INPUT,
NULL,
&dwSize,
sizeof(RAWINPUTHEADER)
);
std::unique_ptr<BYTE[]> lpb(new BYTE[dwSize]);
if (!lpb)
return 0;
if (GetRawInputData(
(HRAWINPUT)lParam,
RID_INPUT,
lpb.get(),
&dwSize,
sizeof(RAWINPUTHEADER)
) != dwSize)
OutputDebugString(L"GetRawInputData does not return correct size!n");
RAWINPUT *raw = (RAWINPUT*)lpb.get();
if (raw->header.dwType == RIM_TYPEKEYBOARD)
{
UINT mess = raw->data.keyboard.Message;
UINT vKey = raw->data.keyboard.VKey;
if (mess == WM_KEYDOWN)
{
}
}
else if (raw->header.dwType == RIM_TYPEMOUSE)
{
if (IsMouseMove())
{
BSTR item;
HRESULT hr = m_pAutomator->GetUIAutomationItemNameAtMousePoint(&item);
if (item == NULL)
return 0;
if (currItem == NULL)
currItem = SysAllocString(item);
if (wcscmp(currItem, item) != 0)
{
m_pSpeech->Stop();
m_pSpeech->Speak(item);
if (currItem != NULL)
SysFreeString(currItem);
currItem = SysAllocString(item);
}
SysFreeString(item);
}
}
}
return 0;
default:
return DefWindowProc(hWnd, message, wParam, lParam);
}
}
int ScreenReader::Exec()
{
MSG msg;
ShowWindow(m_hWnd, m_iCmdShow);
// Tell the user that the program is ready.
m_pSpeech->Speak(L"Rescan ready.", SPF_PURGEBEFORESPEAK);
// The message loop
while (GetMessage(&msg, NULL, 0, 0))
{
TranslateMessage(&msg);
DispatchMessage(&msg);
}
return msg.wParam;
}
void ScreenReader::InitializeWindows()
{
// Create Window class.
WNDCLASSEX wc;
m_applicationName = L"Rescan Screen Reader";
wc.cbSize = sizeof(WNDCLASSEX);
wc.style = CS_HREDRAW | CS_VREDRAW | CS_OWNDC;
wc.lpfnWndProc = WndProc;
wc.cbClsExtra = 0;
wc.cbWndExtra = 0;
wc.hInstance = m_hInstance;
wc.hIcon = LoadIcon(NULL, IDI_APPLICATION);
wc.hCursor = LoadCursor(NULL, IDC_ARROW);
wc.hbrBackground = (HBRUSH)(COLOR_WINDOW + 1);
wc.lpszMenuName = NULL;
wc.lpszClassName = m_applicationName;
wc.hIconSm = wc.hIcon;
// Register the window class.
RegisterClassEx(&wc);
m_hWnd = CreateWindowEx(
WS_EX_OVERLAPPEDWINDOW,
m_applicationName,
L"Rescan Screen Reader",
WS_CAPTION | WS_MINIMIZEBOX | WS_OVERLAPPED | WS_SYSMENU,
(GetSystemMetrics(SM_CXSCREEN) - 500) / 2,
(GetSystemMetrics(SM_CYSCREEN) - 300) / 2,
500,
300,
NULL,
NULL,
m_hInstance,
NULL
);
}
void ScreenReader::InitRawInputDevices()
{
// Initialize Keyboard
rid[0].usUsagePage = 0x01;
rid[0].usUsage = 0x06;
rid[0].dwFlags = RIDEV_INPUTSINK;
rid[0].hwndTarget = m_hWnd;
// Initialize Mouse
rid[1].usUsagePage = 0x01;
rid[1].usUsage = 0x02;
rid[1].dwFlags = RIDEV_INPUTSINK;
rid[1].hwndTarget = m_hWnd;
// Register RIDs
RegisterRawInputDevices(rid, 2, sizeof(RAWINPUTDEVICE));
}
bool ScreenReader::IsMouseMove()
{
POINT pt;
GetCursorPos(&pt);
bool result = !(m_prevMouseX == pt.x && m_prevMouseY == pt.y);
m_prevMouseX = pt.x;
m_prevMouseY = pt.y;
return result;
}
LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
{
switch (message)
{
case WM_QUIT:
PostQuitMessage(0);
return 0;
case WM_DESTROY:
PostQuitMessage(0);
return 0;
default:
return application->MessageHandler(hWnd, message, wParam, lParam);
}
}

我把ISpVoice打包到SpeechSynthesis class中。

语音合成.h：

#ifndef _SPEECHSYNTHESIS_H_
#define _SPEECHSYNTHESIS_H_
#pragma warning(disable :  4996)
#define SPCAT_VOICES_ONECORE L"HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Speech_OneCore\Voices"
#include <sapi.h>
#include <sphelper.h>
#include <atlbase.h>
class SpeechSynthesis
{
public:
SpeechSynthesis();
~SpeechSynthesis();
HRESULT Speak(LPCWSTR pwcs, DWORD dwFlags = SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_IS_NOT_XML, ULONG *pulStreamNumber = NULL);
HRESULT Resume();
HRESULT Pause();
HRESULT Stop();
ISpVoice* getVoice();
private:
CComPtr<ISpObjectToken> cpVoiceToken;
CComPtr<IEnumSpObjectTokens> cpEnum;
ISpVoice* pVoice;
ULONG count;
};
#endif

语音合成.cpp：

#include "speechsynthesis.h"

SpeechSynthesis::SpeechSynthesis()
{
HRESULT hr = CoCreateInstance(CLSID_SpVoice, NULL, CLSCTX_ALL, IID_ISpVoice, (void **)&pVoice);
if (SUCCEEDED(hr))
hr = SpEnumTokens(SPCAT_VOICES_ONECORE, NULL, NULL, &cpEnum);
if (SUCCEEDED(hr))
hr = cpEnum->GetCount(&count);
if (SUCCEEDED(hr))
{
cpEnum->Item(1, &cpVoiceToken);
pVoice->SetPriority(SPVPRIORITY::SPVPRI_ALERT);
pVoice->SetAlertBoundary(SPEI_PHONEME);
pVoice->SetOutput(NULL, TRUE);
pVoice->SetVoice(cpVoiceToken);
}
if (FAILED(hr))
{
MessageBox(NULL, "A fatal error has occured", "Error Message", MB_ABORTRETRYIGNORE);
}
}

SpeechSynthesis::~SpeechSynthesis()
{
pVoice->Release();
}
HRESULT SpeechSynthesis::Speak(LPCWSTR pwcs, DWORD dwFlags, ULONG *pulStreamNumber)
{
return pVoice->Speak(pwcs, dwFlags, pulStreamNumber);
}
HRESULT SpeechSynthesis::Resume()
{
return pVoice->Resume();
}
HRESULT SpeechSynthesis::Pause()
{
return pVoice->Pause();
}
HRESULT SpeechSynthesis::Stop()
{
return Speak(NULL);
}
ISpVoice * SpeechSynthesis::getVoice()
{
return pVoice;
}

uiautomator.h

#ifndef _UIAUTOMATOR_H_
#define _UIAUTOMATOR_H_
#include <windows.h>
#include <oleacc.h>
#include <uiautomation.h>
#pragma comment(lib, "oleacc.lib")
class UIAutomator
{
public:
UIAutomator();
~UIAutomator();
HRESULT GetItemNameAtMousePoint(BSTR *pStr);
HRESULT GetUIAutomationItemNameAtMousePoint(BSTR *pStr);
private:
HRESULT InitUIAutomation();
private:
IUIAutomation *m_automation;
};
#endif

uiautomator.cpp

#include "uiautomator.h"

UIAutomator::UIAutomator()
{
SetProcessDPIAware();
HRESULT hr = InitUIAutomation();
}

UIAutomator::~UIAutomator()
{
}
HRESULT UIAutomator::GetItemNameAtMousePoint(BSTR * pStr)
{
POINT pt;
GetPhysicalCursorPos(&pt);
VARIANT varItem;
IAccessible *pAcc;
HRESULT hr = AccessibleObjectFromPoint(pt, &pAcc, &varItem);
if (SUCCEEDED(hr))
{
hr = pAcc->get_accName(varItem, pStr);
VariantClear(&varItem);
pAcc->Release();
}
return hr;
}
HRESULT UIAutomator::GetUIAutomationItemNameAtMousePoint(BSTR * pStr)
{
CONTROLTYPEID id;
POINT pt;
IUIAutomationElement *elem;
VARIANT val;
GetCursorPos(&pt);
HRESULT hr = m_automation->ElementFromPoint(pt, &elem);
if (SUCCEEDED(hr))
{
hr = elem->get_CurrentControlType(&id);
if (SUCCEEDED(hr))
{
if (id == UIA_PaneControlTypeId)
GetItemNameAtMousePoint(pStr);
else if (id == UIA_EditControlTypeId)
{
hr = elem->GetCurrentPropertyValue(UIA_ValueValuePropertyId, &val);
if (SUCCEEDED(hr))
{
*pStr = SysAllocString(val.bstrVal);
VariantClear(&val);
}
}
else
{
hr = elem->get_CurrentName(pStr);
}
}
elem->Release();
}
return hr;
}
HRESULT UIAutomator::InitUIAutomation()
{
HRESULT hr = CoCreateInstance(__uuidof(CUIAutomation), NULL, CLSCTX_INPROC_SERVER,
__uuidof(IUIAutomation), (void**)&m_automation);
return hr;
}

主.cpp

#include "vld.h"
#include "screenreader.h"
#include <memory>
int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, PSTR pScmdline, int iCmdShow)
{
std::unique_ptr<ScreenReader> app(new ScreenReader(hInstance, hPrevInstance, pScmdline, iCmdShow));
return app->Exec();
}

如果您没有时间编译，这里是程序。

如果启动它并将鼠标悬停在程序窗口上，则突出显示最小化和关闭按钮时会出现延迟。此外，有时当您将鼠标悬停在另一个对象上时，tts 不会立即停止。

将此与NVDA屏幕阅读器进行比较。你会注意到很大的不同。

无论是否设置SetAlertBoundary(SPEI_PHONEME)它都对我有用。

以下是我的测试代码，大家可以试一试。

HRESULT hr = ::CoInitialize(nullptr);
if (FAILED(hr))
{
return EXIT_FAILURE;
}
std::wstring text;
text = L"This is a sentence.";
CComPtr<ISpVoice> cpVoice;
// Create a SAPI voice
hr = cpVoice.CoCreateInstance(CLSID_SpVoice);
//cpVoice->SetAlertBoundary(SPEI_PHONEME);
// set the output to the default audio device
if (SUCCEEDED(hr))
{
hr = cpVoice->SetOutput(NULL, TRUE);
}
// Speak the text
if (SUCCEEDED(hr))
{
hr = cpVoice->Speak(text.c_str(), SPF_ASYNC | SPF_PURGEBEFORESPEAK, NULL);
}
text = L"The third type, a logarithm of the unsigned fold change, is undoubtedly the most tractable.";
Sleep(600);
hr = cpVoice->Pause();      
hr = cpVoice->Resume();
hr = cpVoice->Speak(text.c_str(), SPF_ASYNC | SPF_PURGEBEFORESPEAK, NULL);
Sleep(10000);
::CoUninitialize();
if (SUCCEEDED(hr))
{
return EXIT_SUCCESS;
}
return EXIT_FAILURE;

我终于明白了！

CComPtr<ISpAudio> audio;
CSpStreamFormat format;
format.AssignFormat(SPSF_11kHz8BitMono);

初始化音频

SpCreateDefaultObjectFromCategoryId(SPCAT_AUDIOOUT, &audio);

然后，设置其格式并将其设置为输出 pVoice。

audio->SetFormat(format.FormatId(), format.WaveFormatExPtr());
pVoice->SetOutput(audio, FALSE);

现在我可以访问音频流！

现在要立即停止音频，请拨打：

audio->SetState(SPAS_STOP, 0);

然后使用以下命令再次说话：

audio->SetState(SPAS_RUN, 0);
pVoice->Speak(L"This is a sentence", SPF_ASYNC | SPF_PURGEBEFORESPEAK, NULL);

没有找到相关文章