Update TurnDetector plugins to use the new Stream API

dangusev · dangusev · commit a8a115021aeb · 2026-04-20T23:50:30.000+02:00
diff --git a/agents-core/vision_agents/core/turn_detection/__init__.py b/agents-core/vision_agents/core/turn_detection/__init__.py
@@ -1,18 +1,12 @@
-from .turn_detection import (
-    TurnEvent,
-    TurnDetector,
-)
-from .events import (
-    TurnStartedEvent,
-    TurnEndedEvent,
-)
-
+from .events import TurnEndedEvent, TurnStartedEvent
+from .turn_detection import TurnDetector, TurnEnded, TurnStarted
 
 __all__ = [
     # Base classes and types
-    "TurnEvent",
     "TurnDetector",
     # Events
     "TurnStartedEvent",
     "TurnEndedEvent",
+    "TurnEnded",
+    "TurnStarted",
 ]
diff --git a/agents-core/vision_agents/core/turn_detection/turn_detection.py b/agents-core/vision_agents/core/turn_detection/turn_detection.py
@@ -1,20 +1,36 @@
-from typing import Optional
-from abc import ABC, abstractmethod
-from enum import Enum
 import uuid
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
 from getstream.video.rtc.track_util import PcmData
 from vision_agents.core.events.manager import EventManager
-from . import events
-from .events import TurnStartedEvent, TurnEndedEvent
-from ..agents.conversation import Conversation
+
 from ..edge.types import Participant
+from ..utils.stream import Stream
+from . import events
+
+if TYPE_CHECKING:
+    from vision_agents.core.agents.conversation import Conversation
 
 
-class TurnEvent(Enum):
-    """Events that can occur during turn detection (deprecated - use TurnStartedEvent/TurnEndedEvent)."""
+@dataclass
+class TurnStarted:
+    """
+    Event emitted when a speaker starts their turn.
+    """
 
-    TURN_STARTED = "turn_started"
-    TURN_ENDED = "turn_ended"
+    participant: Participant
+    confidence: float
+
+
+@dataclass
+class TurnEnded:
+    participant: Participant
+    confidence: float
+    eager: bool = False
+    trailing_silence_ms: Optional[float] = None
+    duration_ms: Optional[float] = None
 
 
 class TurnDetector(ABC):
@@ -29,44 +45,24 @@ def __init__(
         self.provider_name = provider_name or self.__class__.__name__
         self.events = EventManager()
         self.events.register_events_from_module(events, ignore_not_compatible=True)
+        self._output: Stream[TurnEnded | TurnStarted] = Stream()
 
-    def _emit_start_turn_event(self, event: TurnStartedEvent) -> None:
-        event.session_id = self.session_id
-        event.plugin_name = self.provider_name
-        self.events.send(event)
-
-    def _emit_end_turn_event(
-        self,
-        participant: Participant,
-        confidence: Optional[float] = None,
-        trailing_silence_ms: Optional[float] = None,
-        duration_ms: Optional[float] = None,
-        eager_end_of_turn: bool = False,
-    ) -> None:
-        if confidence is None:
-            confidence = 0.5
-        event = TurnEndedEvent(
-            session_id=self.session_id,
-            plugin_name=self.provider_name,
-            participant=participant,
-            confidence=confidence,
-            trailing_silence_ms=trailing_silence_ms,
-            duration_ms=duration_ms,
-            eager_end_of_turn=eager_end_of_turn,
-        )
-        self.events.send(event)
+    @property
+    def output(self) -> Stream[TurnEnded | TurnStarted]:
+        """Pipeline output stream: consumers iterate, subclasses push via send/send_nowait."""
+        return self._output
 
     @abstractmethod
     async def process_audio(
         self,
-        audio_data: PcmData,
+        data: PcmData,
         participant: Participant,
-        conversation: Optional[Conversation],
+        conversation: "Conversation | None" = None,
     ) -> None:
         """Process the audio and trigger turn start or turn end events
 
         Args:
-            audio_data: PcmData object containing audio samples from Stream
+            data: PcmData object containing audio samples from Stream
             participant: Participant that's speaking, includes user data
             conversation: Transcription/ chat history, sometimes useful for turn detection
         """
diff --git a/plugins/smart_turn/tests/test_smart_turn.py b/plugins/smart_turn/tests/test_smart_turn.py
@@ -3,7 +3,7 @@
 import pytest
 from vision_agents.core.agents.conversation import InMemoryConversation
 from vision_agents.core.edge.types import Participant
-from vision_agents.core.turn_detection import TurnEndedEvent, TurnStartedEvent
+from vision_agents.core.turn_detection import TurnEnded, TurnStarted
 from vision_agents.core.vad.silero import SileroVADSessionPool
 from vision_agents.plugins.smart_turn.smart_turn_detection import SmartTurnDetection
 
@@ -34,26 +34,21 @@ async def test_turn_detection_chunks(self, smart_turn, mia_audio_16khz):
         participant = Participant(user_id="mia", id="mia", original={})
         conversation = InMemoryConversation(instructions="be nice", messages=[])
 
-        event_order = []
-
-        # Subscribe to events
-        @smart_turn.events.subscribe
-        async def on_start(event: TurnStartedEvent):
-            logger.info(f"Smart turn turn started on {event.session_id}")
-            event_order.append("start")
-
-        @smart_turn.events.subscribe
-        async def on_stop(event: TurnEndedEvent):
-            logger.info(f"Smart turn turn ended on {event.session_id}")
-            event_order.append("stop")
-
         for pcm in mia_audio_16khz.chunks(chunk_size=304):
             await smart_turn.process_audio(pcm, participant, conversation)
 
-        # Wait for background processing to complete
         await smart_turn.wait_for_processing_complete()
 
-        assert event_order == ["start", "stop"] or event_order == [
+        items = await smart_turn.output.collect(timeout=1.0)
+        kinds = [
+            "start"
+            if isinstance(item, TurnStarted)
+            else "stop"
+            if isinstance(item, TurnEnded)
+            else None
+            for item in items
+        ]
+        assert kinds == ["start", "stop"] or kinds == [
             "start",
             "stop",
             "start",
@@ -63,39 +58,51 @@ async def on_stop(event: TurnEndedEvent):
     async def test_turn_detection(self, smart_turn, mia_audio_16khz):
         participant = Participant(user_id="mia", id="mia", original={})
         conversation = InMemoryConversation(instructions="be nice", messages=[])
-        event_order = []
-
-        # Subscribe to events
-        @smart_turn.events.subscribe
-        async def on_start(event: TurnStartedEvent):
-            logger.info(f"Smart turn turn started on {event.session_id}")
-            event_order.append("start")
-
-        @smart_turn.events.subscribe
-        async def on_stop(event: TurnEndedEvent):
-            logger.info(f"Smart turn turn ended on {event.session_id}")
-            event_order.append("stop")
 
         await smart_turn.process_audio(mia_audio_16khz, participant, conversation)
 
-        # Wait for background processing to complete
         await smart_turn.wait_for_processing_complete()
 
-        # Verify that turn detection is working - we should get at least some turn events
+        items = await smart_turn.output.collect(timeout=1.0)
+        kinds = [
+            "start"
+            if isinstance(item, TurnStarted)
+            else "stop"
+            if isinstance(item, TurnEnded)
+            else None
+            for item in items
+        ]
         # With continuous processing, we may get multiple start/stop cycles
-        assert event_order == ["start", "stop"] or event_order == [
+        assert kinds == ["start", "stop"] or kinds == [
             "start",
             "stop",
             "start",
             "stop",
         ]
 
+    async def test_silence_does_not_start_segment(self, smart_turn, silence_1s_16khz):
+        participant = Participant(user_id="mia", id="mia", original={})
+        conversation = InMemoryConversation(instructions="be nice", messages=[])
+
+        await smart_turn.process_audio(silence_1s_16khz, participant, conversation)
+        await smart_turn.wait_for_processing_complete()
+
+        items = await smart_turn.output.collect(timeout=0.5)
+        assert items == []
+
+    async def test_speech_starts_segment(self, smart_turn, mia_audio_16khz):
+        participant = Participant(user_id="mia", id="mia", original={})
+        conversation = InMemoryConversation(instructions="be nice", messages=[])
+
+        await smart_turn.process_audio(mia_audio_16khz, participant, conversation)
+        await smart_turn.wait_for_processing_complete()
+
+        items = await smart_turn.output.collect(timeout=1.0)
+        assert any(isinstance(item, TurnStarted) for item in items)
+
     """
     TODO
     - Test that the 2nd turn detect includes the audio from the first turn
     - Test that turn detection is ran after 8s of audio
     - Test that turn detection is run after speech and 2s of silence
-    - Test that silence doens't start a new segmetn
-    - Test that speaking starts a new segment
-
     """
diff --git a/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py b/plugins/smart_turn/vision_agents/plugins/smart_turn/smart_turn_detection.py
@@ -14,7 +14,8 @@
 from vision_agents.core.edge.types import Participant
 from vision_agents.core.turn_detection import (
     TurnDetector,
-    TurnStartedEvent,
+    TurnEnded,
+    TurnStarted,
 )
 from vision_agents.core.utils.utils import ensure_model
 from vision_agents.core.vad.silero import SileroVADSession, SileroVADSessionPool
@@ -162,7 +163,7 @@ async def process_audio(
         self,
         audio_data: PcmData,
         participant: Participant,
-        conversation: Optional[Conversation],
+        conversation: Conversation | None = None,
     ) -> None:
         """
         Fast, non-blocking audio packet enqueueing.
@@ -289,11 +290,13 @@ async def _process_audio_packet(
                     prediction = await self._predict_turn_completed(merged, participant)
                     turn_ended = prediction > 0.5
                     if turn_ended:
-                        self._emit_end_turn_event(
-                            participant=participant,
-                            confidence=prediction,
-                            trailing_silence_ms=trailing_silence_ms,
-                            duration_ms=self._active_segment.duration_ms,
+                        await self.output.send(
+                            TurnEnded(
+                                participant=participant,
+                                confidence=prediction,
+                                trailing_silence_ms=trailing_silence_ms,
+                                duration_ms=self._active_segment.duration_ms,
+                            )
                         )
                         self._active_segment = None
                         self._silence = Silence()
@@ -304,7 +307,12 @@ async def _process_audio_packet(
                         self._pre_speech_buffer.append(merged)
                         self._pre_speech_buffer = self._pre_speech_buffer.tail(8)
             elif is_speech and self._active_segment is None:
-                self._emit_start_turn_event(TurnStartedEvent(participant=participant))
+                await self.output.send(
+                    TurnStarted(
+                        participant=participant,
+                        confidence=speech_probability,
+                    )
+                )
                 # create a new segment
                 self._active_segment = PcmData(
                     sample_rate=RATE, channels=1, format=AudioFormat.F32
diff --git a/plugins/vogent/tests/test_vogent.py b/plugins/vogent/tests/test_vogent.py
@@ -1,10 +1,9 @@
-import asyncio
 import logging
 
 import pytest
 from vision_agents.core.agents.conversation import InMemoryConversation
 from vision_agents.core.edge.types import Participant
-from vision_agents.core.turn_detection import TurnEndedEvent, TurnStartedEvent
+from vision_agents.core.turn_detection import TurnEnded, TurnStarted
 from vision_agents.plugins.vogent.vogent_turn_detection import VogentTurnDetection
 
 logger = logging.getLogger(__name__)
@@ -30,18 +29,6 @@ async def test_turn_detection(
     ):
         participant = Participant(user_id="mia", original={}, id="mia")
         conversation = InMemoryConversation(instructions="be nice", messages=[])
-        event_order = []
-
-        # Subscribe to events
-        @vogent_turn_detection.events.subscribe
-        async def on_start(event: TurnStartedEvent):
-            logger.info(f"Vogent turn started on {event.session_id}")
-            event_order.append("start")
-
-        @vogent_turn_detection.events.subscribe
-        async def on_stop(event: TurnEndedEvent):
-            logger.info(f"Vogent turn ended on {event.session_id}")
-            event_order.append("stop")
 
         await vogent_turn_detection.process_audio(
             mia_audio_16khz, participant, conversation
@@ -50,12 +37,18 @@ async def on_stop(event: TurnEndedEvent):
             silence_2s_48khz, participant, conversation
         )
 
-        await asyncio.sleep(0.001)
+        await vogent_turn_detection.wait_for_processing_complete()
 
-        await asyncio.sleep(5)
-
-        # Verify that turn detection is working - we should get at least some turn events
-        assert event_order == ["start", "stop"] or event_order == [
+        items = await vogent_turn_detection.output.collect(timeout=1.0)
+        kinds = [
+            "start"
+            if isinstance(item, TurnStarted)
+            else "stop"
+            if isinstance(item, TurnEnded)
+            else None
+            for item in items
+        ]
+        assert kinds == ["start", "stop"] or kinds == [
             "start",
             "stop",
             "start",
diff --git a/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py b/plugins/vogent/vision_agents/plugins/vogent/vogent_turn_detection.py