diff --git a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py
index 1192be10606d..42f4bf3117da 100644
--- a/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py
+++ b/src/transformers/models/cohere_asr/feature_extraction_cohere_asr.py
@@ -284,17 +284,17 @@ def __call__(
                 f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                 "We will take the mean of the channels to convert to mono."
             )
-            raw_speech = raw_speech.mean(-1)
+            raw_speech = raw_speech.mean(1)
 
         is_batched_sequence = isinstance(raw_speech, (list, tuple))
         if is_batched_sequence:
-            for speech in raw_speech:
+            for index, speech in enumerate(raw_speech):
                 if len(speech.shape) > 1:
                     logger.warning(
                         f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                         "We will take the mean of the channels to convert to mono."
                     )
-                    speech = speech.mean(-1)
+                    raw_speech[index] = speech.mean(0)
 
         if is_batched_torch or is_batched_sequence:
             raw_speech = [speech.to(torch.float32) for speech in raw_speech]
diff --git a/src/transformers/models/lasr/feature_extraction_lasr.py b/src/transformers/models/lasr/feature_extraction_lasr.py
index 7cf1822ee40d..26cacd39b09a 100644
--- a/src/transformers/models/lasr/feature_extraction_lasr.py
+++ b/src/transformers/models/lasr/feature_extraction_lasr.py
@@ -232,17 +232,17 @@ def __call__(
                 f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                 "We will take the mean of the channels to convert to mono."
             )
-            raw_speech = raw_speech.mean(-1)
+            raw_speech = raw_speech.mean(1)
 
         is_batched_sequence = isinstance(raw_speech, (list, tuple))
         if is_batched_sequence:
-            for speech in raw_speech:
+            for index, speech in enumerate(raw_speech):
                 if len(speech.shape) > 1:
                     logger.warning(
                         f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                         "We will take the mean of the channels to convert to mono."
                     )
-                    speech = speech.mean(-1)
+                    raw_speech[index] = speech.mean(0)
 
         if is_batched_torch or is_batched_sequence:
             raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
diff --git a/src/transformers/models/parakeet/feature_extraction_parakeet.py b/src/transformers/models/parakeet/feature_extraction_parakeet.py
index c745d02c9629..95289cc00d99 100644
--- a/src/transformers/models/parakeet/feature_extraction_parakeet.py
+++ b/src/transformers/models/parakeet/feature_extraction_parakeet.py
@@ -217,17 +217,17 @@ def __call__(
                 f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                 "We will take the mean of the channels to convert to mono."
             )
-            raw_speech = raw_speech.mean(-1)
+            raw_speech = raw_speech.mean(1)
 
         is_batched_sequence = isinstance(raw_speech, (list, tuple))
         if is_batched_sequence:
-            for speech in raw_speech:
+            for index, speech in enumerate(raw_speech):
                 if len(speech.shape) > 1:
                     logger.warning(
                         f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                         "We will take the mean of the channels to convert to mono."
                     )
-                    speech = speech.mean(-1)
+                    raw_speech[index] = speech.mean(0)
 
         if is_batched_torch or is_batched_sequence:
             raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
diff --git a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
index 9ce98251e50e..3c3c1723a35a 100644
--- a/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py
@@ -145,17 +145,17 @@ def __call__(
                 f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                 "We will take the mean of the channels to convert to mono."
             )
-            raw_speech = raw_speech.mean(-1)
+            raw_speech = raw_speech.mean(1)
 
         is_batched_sequence = isinstance(raw_speech, (list, tuple))
         if is_batched_sequence:
-            for speech in raw_speech:
+            for index, speech in enumerate(raw_speech):
                 if len(speech.shape) > 1:
                     logger.warning(
                         f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                         "We will take the mean of the channels to convert to mono."
                     )
-                    speech = speech.mean(-1)
+                    raw_speech[index] = speech.mean(0)
 
         if is_batched_torch or is_batched_sequence:
             raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]
diff --git a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py
index 58355f3c0d7c..f13006f6b198 100644
--- a/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py
+++ b/src/transformers/models/voxtral_realtime/feature_extraction_voxtral_realtime.py
@@ -203,17 +203,17 @@ def __call__(
                 f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                 "We will take the mean of the channels to convert to mono."
             )
-            raw_speech = raw_speech.mean(-1)
+            raw_speech = raw_speech.mean(1)
 
         is_batched_sequence = isinstance(raw_speech, (list, tuple))
         if is_batched_sequence:
-            for speech in raw_speech:
+            for index, speech in enumerate(raw_speech):
                 if len(speech.shape) > 1:
                     logger.warning(
                         f"Only mono-channel audio is supported for input to {self.__class__.__name__}. "
                         "We will take the mean of the channels to convert to mono."
                     )
-                    speech = speech.mean(-1)
+                    raw_speech[index] = speech.mean(0)
 
         if is_batched_torch or is_batched_sequence:
             raw_speech = [speech[:, None].to(torch.float32) for speech in raw_speech]