diff --git a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py index 1192be10606d..42f4bf3117da 100644 --- a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py +++ b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py @@ -284,17 +284,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech.to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/lasr/feature_extraction_lasr.py b/src/transformers/models/lasr/feature_extraction_lasr.py index 7cf1822ee40d..26cacd39b09a 100644 --- a/src/transformers/models/lasr/feature_extraction_lasr.py +++ b/src/transformers/models/lasr/feature_extraction_lasr.py @@ -232,17 +232,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/parakeet/feature_extraction_parakeet.py b/src/transformers/models/parakeet/feature_extraction_parakeet.py index c745d02c9629..95289cc00d99 100644 --- a/src/transformers/models/parakeet/feature_extraction_parakeet.py +++ b/src/transformers/models/parakeet/feature_extraction_parakeet.py @@ -217,17 +217,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py index 9ce98251e50e..3c3c1723a35a 100644 --- a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py @@ -145,17 +145,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech] diff --git a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py index 58355f3c0d7c..f13006f6b198 100644 --- a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py +++ b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py @@ -203,17 +203,17 @@ def __call__( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - raw_speech = raw_speech.mean(-1) + raw_speech = raw_speech.mean(1) is_batched_sequence = isinstance(raw_speech, (list, tuple)) if is_batched_sequence: - for speech in raw_speech: + for index, speech in enumerate(raw_speech): if len(speech.shape) > 1: logger.warning( f"Only mono-channel audio is supported for input to {self.__class__.__name__}. " "We will take the mean of the channels to convert to mono." ) - speech = speech.mean(-1) + raw_speech[index] = speech.mean(0) if is_batched_torch or is_batched_sequence: raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]