RealtimeSTT
RealtimeSTT copied to clipboard
Why is the audio input stream for recording set up at the highest possible sample rate?
def get_highest_sample_rate(audio_interface, device_index):
    """Get the highest supported sample rate for the specified device.

    Args:
        audio_interface: Object exposing ``get_device_info_by_index``
            (presumably a ``pyaudio.PyAudio`` instance -- confirm with caller).
        device_index: Index of the device to query.

    Returns:
        int: The largest entry of the device's ``supportedSampleRates`` when
        that key is present and non-empty; otherwise the device's
        ``defaultSampleRate``. If the lookup fails for any reason, a warning
        is logged and 48000 is returned.
    """
    try:
        device_info = audio_interface.get_device_info_by_index(device_index)
        max_rate = int(device_info['defaultSampleRate'])

        # NOTE(review): PyAudio's device-info dicts do not appear to carry a
        # 'supportedSampleRates' key, so this branch may never run with a
        # stock PyAudio backend -- confirm against the PyAudio documentation.
        if 'supportedSampleRates' in device_info:
            supported_rates = [
                int(rate) for rate in device_info['supportedSampleRates']
            ]
            if supported_rates:
                max_rate = max(supported_rates)

        return max_rate
    except Exception as e:
        # Deliberately broad: this is a best-effort probe, and any failure
        # (bad index, missing key, backend error) degrades to the fallback.
        logging.warning("Failed to get highest sample rate: %s", e)
        return 48000  # Fallback to a common high sample rate
def initialize_audio_stream(audio_interface, device_index, sample_rate, chunk_size):
    """Initialize the audio stream with error handling.

    Opens a mono, 16-bit (``pyaudio.paInt16``) input stream on the given
    device.

    Args:
        audio_interface: Object exposing ``open`` (presumably a
            ``pyaudio.PyAudio`` instance -- confirm with caller).
        device_index: Passed to ``open`` as ``input_device_index``.
        sample_rate: Passed to ``open`` as ``rate``.
        chunk_size: Passed to ``open`` as ``frames_per_buffer``.

    Returns:
        The stream object returned by ``audio_interface.open``.

    Raises:
        Exception: Any error raised while opening the stream is logged and
            then re-raised unchanged.
    """
    try:
        stream = audio_interface.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=sample_rate,
            input=True,
            frames_per_buffer=chunk_size,
            input_device_index=device_index,
        )
        return stream
    except Exception as e:
        # Log for diagnostics, then let the caller decide how to recover.
        logging.error("Error initializing audio stream: %s", e)
        raise
def preprocess_audio(chunk, original_sample_rate, target_sample_rate):
    """Convert an audio chunk to 16-bit PCM bytes at the target sample rate.

    The original comment describes this as similar to a ``feed_audio``
    method, which is not visible in this snippet.

    Args:
        chunk: Either a NumPy array of samples or a bytes-like object that is
            interpreted as raw int16 samples. A 2-D array is averaged over
            axis 1 (assumes shape ``(frames, channels)`` -- TODO confirm).
        original_sample_rate: Sample rate of ``chunk``.
        target_sample_rate: Sample rate of the returned audio.

    Returns:
        bytes: The int16 samples as raw bytes. Empty when the chunk is too
        short to produce a single sample at the target rate.
    """
    if isinstance(chunk, np.ndarray):
        # Handle stereo to mono conversion if necessary.
        if chunk.ndim == 2:
            chunk = np.mean(chunk, axis=1)
    else:
        # If chunk is bytes, convert to numpy array.
        chunk = np.frombuffer(chunk, dtype=np.int16)

    # Resample to target_sample_rate if necessary.
    if original_sample_rate != target_sample_rate:
        num_samples = int(len(chunk) * target_sample_rate / original_sample_rate)
        if num_samples <= 0:
            # Nothing to emit; avoids asking resample for zero output samples.
            return b""
        chunk = signal.resample(chunk, num_samples)

    # FFT resampling (and channel averaging) yields floats that can overshoot
    # the int16 range; clip first so the cast saturates instead of wrapping.
    if np.issubdtype(chunk.dtype, np.floating):
        limits = np.iinfo(np.int16)
        chunk = np.clip(chunk, limits.min, limits.max)

    # Ensure data type is int16.
    return chunk.astype(np.int16).tobytes()
# Audio state starts unset; nothing in this snippet shows where it is filled in
# (presumably populated once a device has been opened -- confirm with caller).
audio_interface = stream = device_sample_rate = None

# Chunk size used for audio buffering (raised to 1024 by the author, who cites
# better performance).
chunk_size = 1024
I'm curious why the helper functions defined inside the newly added _audio_data_worker function are needed. Why do we need as high a sample rate as possible?