Add speech detection and conversation_id to esphome voice assistant (#93578)
* Add speech detection to esphome voice assistant
* Timeout after silence
  Ensure events are sent before finish is called
* use va_version 3 instead of ESPHome version
* Convert repeated fixtures to factory
* Add some v3 tests
* Add conversation_id
* Bump aioesphomeapi to 13.8.0
* Fix missed buffering of detected chunk
* Alter log message
* Updates
* Spelling
* Fix return type
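The new behaviour is driven by frame-level voice activity detection (webrtcvad) over the 16 kHz, 16-bit audio the device streams, which is why the tests define `_ONE_SECOND = 16000 * 2` bytes. The snippet below is only a rough, self-contained illustration of that kind of VAD gating; the 30 ms frame size and the roughly one-second silence cutoff are assumptions, not values taken from the integration.

```python
# Illustrative sketch only: frame-level VAD over 16 kHz 16-bit mono PCM.
# The 30 ms frame size and the ~1 s silence cutoff are assumptions,
# not values read from the esphome integration.
import webrtcvad

SAMPLE_RATE = 16000  # Hz; _ONE_SECOND = 16000 * 2 bytes in the tests
FRAME_MS = 30        # webrtcvad accepts 10, 20 or 30 ms frames
FRAME_BYTES = SAMPLE_RATE * 2 * FRAME_MS // 1000


def contains_speech(audio: bytes, silence_frames_to_stop: int = 33) -> bool:
    """Return True once speech is heard; stop after roughly 1 s of silence."""
    vad = webrtcvad.Vad(3)  # 3 = most aggressive filtering
    heard_speech = False
    silent_frames = 0
    for start in range(0, len(audio) - FRAME_BYTES + 1, FRAME_BYTES):
        frame = audio[start : start + FRAME_BYTES]
        if vad.is_speech(frame, SAMPLE_RATE):
            heard_speech = True
            silent_frames = 0
        elif heard_speech:
            silent_frames += 1
            if silent_frames >= silence_frames_to_stop:
                break
    return heard_speech


# All-zero audio contains no speech, so this prints False.
print(contains_speech(bytes(FRAME_BYTES * 10)))
```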
@@ -19,43 +19,47 @@ _TEST_OUTPUT_TEXT = "This is an output test"
 _TEST_OUTPUT_URL = "output.mp3"
 _TEST_MEDIA_ID = "12345"
 
+_ONE_SECOND = 16000 * 2  # 16Khz 16-bit
+
 
 @pytest.fixture
+def voice_assistant_udp_server(
+    hass: HomeAssistant,
+) -> VoiceAssistantUDPServer:
+    """Return the UDP server factory."""
+
+    def _voice_assistant_udp_server(entry):
+        entry_data = DomainData.get(hass).get_entry_data(entry)
+
+        server: VoiceAssistantUDPServer = None
+
+        def handle_finished():
+            nonlocal server
+            assert server is not None
+            server.close()
+
+        server = VoiceAssistantUDPServer(hass, entry_data, Mock(), handle_finished)
+        return server
+
+    return _voice_assistant_udp_server
+
+
+@pytest.fixture
 def voice_assistant_udp_server_v1(
     hass: HomeAssistant,
+    voice_assistant_udp_server,
     mock_voice_assistant_v1_entry,
 ) -> VoiceAssistantUDPServer:
     """Return the UDP server."""
-    entry_data = DomainData.get(hass).get_entry_data(mock_voice_assistant_v1_entry)
-
-    server: VoiceAssistantUDPServer = None
-
-    def handle_finished():
-        nonlocal server
-        assert server is not None
-        server.close()
-
-    server = VoiceAssistantUDPServer(hass, entry_data, Mock(), handle_finished)
-    return server
+    return voice_assistant_udp_server(entry=mock_voice_assistant_v1_entry)
 
 
 @pytest.fixture
 def voice_assistant_udp_server_v2(
     hass: HomeAssistant,
+    voice_assistant_udp_server,
     mock_voice_assistant_v2_entry,
 ) -> VoiceAssistantUDPServer:
     """Return the UDP server."""
-    entry_data = DomainData.get(hass).get_entry_data(mock_voice_assistant_v2_entry)
-
-    server: VoiceAssistantUDPServer = None
-
-    def handle_finished():
-        nonlocal server
-        assert server is not None
-        server.close()
-
-    server = VoiceAssistantUDPServer(hass, entry_data, Mock(), handle_finished)
-    return server
+    return voice_assistant_udp_server(entry=mock_voice_assistant_v2_entry)
 
 
 async def test_pipeline_events(
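The first hunk folds the duplicated v1/v2 fixtures into a single factory fixture: the fixture returns a callable, and the per-version fixtures simply call it with their own config entry. A minimal, self-contained sketch of that pytest pattern follows; the `FakeServer` and `make_server` names are illustrative and not part of the Home Assistant test suite.

```python
# Minimal sketch of the pytest "factory fixture" pattern used above.
# FakeServer and make_server are illustrative names, not Home Assistant code.
import pytest


class FakeServer:
    def __init__(self, entry: str) -> None:
        self.entry = entry


@pytest.fixture
def make_server():
    """Return a factory that builds a server for a given entry."""

    def _make(entry: str) -> FakeServer:
        return FakeServer(entry)

    return _make


@pytest.fixture
def server_v1(make_server):
    # Each dependent fixture reuses the factory with its own entry.
    return make_server("v1-entry")


def test_server_v1(server_v1):
    assert server_v1.entry == "v1-entry"
```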
@@ -117,7 +121,7 @@ async def test_pipeline_events(
 ):
     voice_assistant_udp_server_v1.transport = Mock()
 
-    await voice_assistant_udp_server_v1.run_pipeline()
+    await voice_assistant_udp_server_v1.run_pipeline(conversation_id=None)
 
 
 async def test_udp_server(
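The second hunk only threads the new `conversation_id` keyword through the existing pipeline test. Reusing the same id across runs is what lets the assist pipeline treat successive requests as one conversation; the sketch below merely illustrates the calling convention, and `FakeServer`, its signature defaults, and the literal id are hypothetical stand-ins rather than integration code.

```python
# Hypothetical illustration of the calling convention only; FakeServer is a
# stand-in that mirrors the keyword arguments seen in the tests above.
import asyncio


class FakeServer:
    async def run_pipeline(self, *, conversation_id, use_vad=False, pipeline_timeout=None):
        print(f"run_pipeline(conversation_id={conversation_id!r}, use_vad={use_vad})")


async def main() -> None:
    server = FakeServer()
    await server.run_pipeline(conversation_id=None)            # first turn: no prior context
    await server.run_pipeline(conversation_id="kitchen-1234")  # follow-up turn keeps context


asyncio.run(main())
```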
@@ -335,3 +339,136 @@ async def test_send_tts(
     await voice_assistant_udp_server_v2._tts_done.wait()
 
     voice_assistant_udp_server_v2.transport.sendto.assert_called()
+
+
+async def test_speech_detection(
+    hass: HomeAssistant,
+    voice_assistant_udp_server_v2: VoiceAssistantUDPServer,
+) -> None:
+    """Test the UDP server queues incoming data."""
+
+    def is_speech(self, chunk, sample_rate):
+        """Anything non-zero is speech."""
+        return sum(chunk) > 0
+
+    async def async_pipeline_from_audio_stream(*args, **kwargs):
+        stt_stream = kwargs["stt_stream"]
+        event_callback = kwargs["event_callback"]
+        async for _chunk in stt_stream:
+            pass
+
+        # Test empty data
+        event_callback(
+            PipelineEvent(
+                type=PipelineEventType.STT_END,
+                data={"stt_output": {"text": _TEST_INPUT_TEXT}},
+            )
+        )
+
+    with patch(
+        "webrtcvad.Vad.is_speech",
+        new=is_speech,
+    ), patch(
+        "homeassistant.components.esphome.voice_assistant.async_pipeline_from_audio_stream",
+        new=async_pipeline_from_audio_stream,
+    ):
+        voice_assistant_udp_server_v2.started = True
+
+        voice_assistant_udp_server_v2.queue.put_nowait(bytes(_ONE_SECOND))
+        voice_assistant_udp_server_v2.queue.put_nowait(bytes([255] * _ONE_SECOND * 2))
+        voice_assistant_udp_server_v2.queue.put_nowait(bytes([255] * _ONE_SECOND * 2))
+        voice_assistant_udp_server_v2.queue.put_nowait(bytes(_ONE_SECOND))
+
+        await voice_assistant_udp_server_v2.run_pipeline(
+            conversation_id=None, use_vad=True, pipeline_timeout=1.0
+        )
+
+
+async def test_no_speech(
+    hass: HomeAssistant,
+    voice_assistant_udp_server_v2: VoiceAssistantUDPServer,
+) -> None:
+    """Test there is no speech."""
+
+    def is_speech(self, chunk, sample_rate):
+        """Anything non-zero is speech."""
+        return sum(chunk) > 0
+
+    def handle_event(
+        event_type: esphome.VoiceAssistantEventType, data: dict[str, str] | None
+    ) -> None:
+        assert event_type == esphome.VoiceAssistantEventType.VOICE_ASSISTANT_ERROR
+        assert data is not None
+        assert data["code"] == "speech-timeout"
+
+    voice_assistant_udp_server_v2.handle_event = handle_event
+
+    with patch(
+        "webrtcvad.Vad.is_speech",
+        new=is_speech,
+    ):
+        voice_assistant_udp_server_v2.started = True
+
+        voice_assistant_udp_server_v2.queue.put_nowait(bytes(_ONE_SECOND))
+
+        await voice_assistant_udp_server_v2.run_pipeline(
+            conversation_id=None, use_vad=True, pipeline_timeout=1.0
+        )
+
+
+async def test_speech_timeout(
+    hass: HomeAssistant,
+    voice_assistant_udp_server_v2: VoiceAssistantUDPServer,
+) -> None:
+    """Test when speech was detected, but the pipeline times out."""
+
+    def is_speech(self, chunk, sample_rate):
+        """Anything non-zero is speech."""
+        return sum(chunk) > 255
+
+    async def async_pipeline_from_audio_stream(*args, **kwargs):
+        stt_stream = kwargs["stt_stream"]
+        async for _chunk in stt_stream:
+            # Stream will end when VAD detects end of "speech"
+            pass
+
+    async def segment_audio(*args, **kwargs):
+        raise asyncio.TimeoutError()
+        async for chunk in []:
+            yield chunk
+
+    with patch(
+        "webrtcvad.Vad.is_speech",
+        new=is_speech,
+    ), patch(
+        "homeassistant.components.esphome.voice_assistant.async_pipeline_from_audio_stream",
+        new=async_pipeline_from_audio_stream,
+    ), patch(
+        "homeassistant.components.esphome.voice_assistant.VoiceAssistantUDPServer._segment_audio",
+        new=segment_audio,
+    ):
+        voice_assistant_udp_server_v2.started = True
+
+        voice_assistant_udp_server_v2.queue.put_nowait(bytes([255] * (_ONE_SECOND * 2)))
+
+        await voice_assistant_udp_server_v2.run_pipeline(
+            conversation_id=None, use_vad=True, pipeline_timeout=1.0
+        )
+
+
+async def test_cancelled(
+    hass: HomeAssistant,
+    voice_assistant_udp_server_v2: VoiceAssistantUDPServer,
+) -> None:
+    """Test when the server is stopped while waiting for speech."""
+
+    voice_assistant_udp_server_v2.started = True
+
+    voice_assistant_udp_server_v2.queue.put_nowait(b"")
+
+    await voice_assistant_udp_server_v2.run_pipeline(
+        conversation_id=None, use_vad=True, pipeline_timeout=1.0
+    )
+
+    # No events should be sent if cancelled while waiting for speech
+    voice_assistant_udp_server_v2.handle_event.assert_not_called()
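test_no_speech and test_speech_timeout assert that the server reports a `speech-timeout` error code when no usable speech arrives inside the allowed window. The sketch below illustrates that general guard pattern with `asyncio.wait_for`; it is an independent example rather than the integration's implementation, and `wait_for_speech`, `send_error`, and the queue-based heuristic are assumptions.

```python
# Independent sketch of a "speech timeout" guard built with asyncio.wait_for.
# wait_for_speech() and send_error() are stand-ins, not Home Assistant APIs;
# the error code string mirrors the one asserted in test_no_speech.
import asyncio


def send_error(code: str) -> None:
    """Stand-in for sending a VOICE_ASSISTANT_ERROR event to the device."""
    print(f"VOICE_ASSISTANT_ERROR: {code}")


async def wait_for_speech(queue: asyncio.Queue) -> None:
    """Block until a chunk that looks like speech arrives (toy heuristic)."""
    while True:
        chunk = await queue.get()
        if chunk and sum(chunk) > 0:  # same trick as the tests: non-zero bytes = speech
            return


async def run_with_speech_timeout(queue: asyncio.Queue, timeout: float = 1.0) -> bool:
    """Return True if speech arrived in time, otherwise report speech-timeout."""
    try:
        await asyncio.wait_for(wait_for_speech(queue), timeout=timeout)
    except asyncio.TimeoutError:
        send_error("speech-timeout")
        return False
    return True


async def _demo() -> None:
    empty: asyncio.Queue = asyncio.Queue()
    # Nothing is ever queued, so the guard times out and reports the error.
    assert not await run_with_speech_timeout(empty, timeout=0.1)


asyncio.run(_demo())
```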