STT missing span attributes (telemetry)

Open Daniel-Sedlacek opened this issue 1 month ago • 1 comments

Feature Type

Would make my life easier

Feature Description

The problem The STTMetrics, such as the STT model and provider, are not visible in Langfuse. The likely cause is that the speech-to-text is missing the span attributes (self._tts_request_span.set_attribute), which LLM and TTS do have (see the STT vs TTS examples below)

livekit/agents/stt/stt.py

async def _metrics_monitor_task(self, event_aiter: AsyncIterable[SpeechEvent]) -> None:
        """Task used to collect metrics"""

        async for ev in event_aiter:
            if ev.type == SpeechEventType.RECOGNITION_USAGE:
                assert ev.recognition_usage is not None, (
                    "recognition_usage must be provided for RECOGNITION_USAGE event"
                )

                stt_metrics = STTMetrics(
                    request_id=ev.request_id,
                    timestamp=time.time(),
                    duration=0.0,
                    label=self._stt._label,
                    audio_duration=ev.recognition_usage.audio_duration,
                    streamed=True,
                    metadata=Metadata(
                        model_name=self._stt.model, model_provider=self._stt.provider
                    ),
                )

                self._stt.emit("metrics_collected", stt_metrics)
            elif ev.type == SpeechEventType.FINAL_TRANSCRIPT:
                # reset the retry count after a successful recognition
                self._num_retries = 0

livekit/agents/tts/tts.py

async def _metrics_monitor_task(self, event_aiter: AsyncIterable[SynthesizedAudio]) -> None:
        """Task used to collect metrics"""

        start_time = time.perf_counter()
        audio_duration = 0.0
        ttfb = -1.0
        request_id = ""

        async for ev in event_aiter:
            request_id = ev.request_id
            if ttfb == -1.0:
                ttfb = time.perf_counter() - start_time

            audio_duration += ev.frame.duration

        duration = time.perf_counter() - start_time

        if self._current_attempt_has_error:
            return

        metrics = TTSMetrics(
            timestamp=time.time(),
            request_id=request_id,
            ttfb=ttfb,
            duration=duration,
            characters_count=len(self._input_text),
            audio_duration=audio_duration,
            cancelled=self._synthesize_task.cancelled(),
            label=self._tts._label,
            streamed=False,
            metadata=Metadata(model_name=self._tts.model, model_provider=self._tts.provider),
        )
        if self._tts_request_span:
            self._tts_request_span.set_attribute(
                trace_types.ATTR_TTS_METRICS, metrics.model_dump_json()
            )
        self._tts.emit("metrics_collected", metrics)

Workarounds / Alternatives

No response

Additional Context

No response

Dec 13 '25 10:12 Daniel-Sedlacek