RealtimeSTT icon indicating copy to clipboard operation
RealtimeSTT copied to clipboard

Why set up the audio input stream for recording at the highest possible sample rate?

Open sangheonEN opened this issue 1 year ago • 3 comments

    def get_highest_sample_rate(audio_interface, device_index):
        """Return the highest sample rate reported for the given input device.

        The device's ``defaultSampleRate`` is the baseline. When the device
        info also carries a non-empty ``supportedSampleRates`` entry, the
        largest value in it is returned instead.

        Args:
            audio_interface: Object exposing ``get_device_info_by_index``.
            device_index: Index of the device to query.

        Returns:
            int: The selected sample rate, or 48000 if the lookup fails for
            any reason (the failure is logged as a warning).
        """
        try:
            info = audio_interface.get_device_info_by_index(device_index)
            default_rate = int(info['defaultSampleRate'])

            # Without an explicit rate list the default is all we know.
            if 'supportedSampleRates' not in info:
                return default_rate

            candidates = [int(value) for value in info['supportedSampleRates']]
            # An empty list carries no information; keep the default.
            return max(candidates) if candidates else default_rate
        except Exception as err:
            # Best-effort lookup: never propagate, fall back to a common rate.
            logging.warning(f"Failed to get highest sample rate: {err}")
            return 48000
    def initialize_audio_stream(audio_interface, device_index, sample_rate, chunk_size):
        """Open a mono, 16-bit input stream on the given device.

        Args:
            audio_interface: Object whose ``open`` method creates the stream.
            device_index: Index of the input device to record from.
            sample_rate: Sample rate requested for the stream.
            chunk_size: Value passed as ``frames_per_buffer``.

        Returns:
            Whatever ``audio_interface.open`` returns.

        Raises:
            Exception: Any error raised while opening the stream is logged
                and then re-raised unchanged.
        """
        try:
            # Settings are built inside the try so that any failure while
            # assembling them is logged the same way as a failed open().
            stream_settings = {
                "format": pyaudio.paInt16,
                "channels": 1,
                "rate": sample_rate,
                "input": True,
                "frames_per_buffer": chunk_size,
                "input_device_index": device_index,
            }
            return audio_interface.open(**stream_settings)
        except Exception as err:
            logging.error(f"Error initializing audio stream: {err}")
            raise
    def preprocess_audio(chunk, original_sample_rate, target_sample_rate):
        """Convert an audio chunk to 16-bit PCM bytes at the target rate.

        Described by the original author as similar to the ``feed_audio``
        method.

        Args:
            chunk: Either a NumPy array of samples or a bytes-like object
                holding int16 samples. A 2-D array is averaged over axis 1
                (assumes a (frames, channels) layout -- TODO confirm with
                callers).
            original_sample_rate: Sample rate of ``chunk``.
            target_sample_rate: Sample rate of the returned audio.

        Returns:
            bytes: The processed samples as int16. ``b""`` is returned when
            the chunk is empty or too short to yield a single sample at the
            target rate.
        """
        if isinstance(chunk, np.ndarray):
            samples = chunk
            # Handle stereo to mono conversion if necessary.
            if samples.ndim == 2:
                samples = np.mean(samples, axis=1)
        else:
            # Raw bytes are interpreted as int16 samples.
            samples = np.frombuffer(chunk, dtype=np.int16)

        # Nothing to process; also avoids resampling an empty signal.
        if samples.size == 0:
            return b""

        if original_sample_rate != target_sample_rate:
            target_count = int(len(samples) * target_sample_rate / original_sample_rate)
            if target_count < 1:
                # Chunk is shorter than one sample at the target rate.
                return b""
            samples = signal.resample(samples, target_count)

        if samples.dtype != np.int16:
            # Saturate before casting: resampling can overshoot the int16
            # range, and an unclipped cast would wrap around and produce
            # loud clicks instead of a clean clip.
            limits = np.iinfo(np.int16)
            samples = np.clip(
                samples.astype(np.float64), limits.min, limits.max
            ).astype(np.int16)

        return samples.tobytes()
    # Recording state starts out unset; presumably assigned later by the
    # enclosing code, which is not visible here.
    audio_interface = stream = device_sample_rate = None
    # Increased chunk size for better performance.
    chunk_size = 1024

I'm curious why the helper functions nested inside the newly added _audio_data_worker function are needed. Why does the recording need to use the highest possible sample rate?

sangheonEN avatar Oct 02 '24 00:10 sangheonEN