import numpy as np class FeatureExtractor: def __init__( self, feature_size=80, sampling_rate=16000, hop_length=160, chunk_length=30, n_fft=400, ): self.n_fft = n_fft self.hop_length = hop_length self.chunk_length = chunk_length self.n_samples = chunk_length * sampling_rate self.nb_max_frames = self.n_samples // hop_length self.time_per_frame = hop_length / sampling_rate self.sampling_rate = sampling_rate self.mel_filters = self.get_mel_filters( sampling_rate, n_fft, n_mels=feature_size ).astype("float32") @staticmethod def get_mel_filters(sr, n_fft, n_mels=128): # Initialize the weights n_mels = int(n_mels) # Center freqs of each FFT bin fftfreqs = np.fft.rfftfreq(n=n_fft, d=1.0 / sr) # 'Center freqs' of mel bands - uniformly spaced between limits min_mel = 0.0 max_mel = 45.245640471924965 mels = np.linspace(min_mel, max_mel, n_mels + 2) # Fill in the linear scale f_min = 0.0 f_sp = 200.0 / 3 freqs = f_min + f_sp * mels # And now the nonlinear scale min_log_hz = 1000.0 # beginning of log region (Hz) min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) logstep = np.log(6.4) / 27.0 # step size for log region # If we have vector data, vectorize log_t = mels >= min_log_mel freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel)) fdiff = np.diff(freqs) ramps = freqs.reshape(-1, 1) - fftfreqs.reshape(1, -1) lower = -ramps[:-2] / np.expand_dims(fdiff[:-1], axis=1) upper = ramps[2:] / np.expand_dims(fdiff[1:], axis=1) # Intersect them with each other and zero, vectorized across all i weights = np.maximum(np.zeros_like(lower), np.minimum(lower, upper)) # Slaney-style mel is scaled to be approx constant energy per channel enorm = 2.0 / (freqs[2 : n_mels + 2] - freqs[:n_mels]) weights *= np.expand_dims(enorm, axis=1) return weights @staticmethod def stft( input_array: np.ndarray, n_fft: int, hop_length: int = None, win_length: int = None, window: np.ndarray = None, center: bool = True, mode: str = "reflect", normalized: bool = False, onesided: bool = None, return_complex: bool = None, ): # Default initialization for hop_length and win_length hop_length = hop_length if hop_length is not None else n_fft // 4 win_length = win_length if win_length is not None else n_fft input_is_complex = np.iscomplexobj(input_array) # Determine if the output should be complex return_complex = ( return_complex if return_complex is not None else (input_is_complex or (window is not None and np.iscomplexobj(window))) ) if not return_complex and return_complex is None: raise ValueError( "stft requires the return_complex parameter for real inputs." ) # Input checks if not np.issubdtype(input_array.dtype, np.floating) and not input_is_complex: raise ValueError( "stft: expected an array of floating point or complex values," f" got {input_array.dtype}" ) if input_array.ndim > 2 or input_array.ndim < 1: raise ValueError( f"stft: expected a 1D or 2D array, but got {input_array.ndim}D array" ) # Handle 1D input if input_array.ndim == 1: input_array = np.expand_dims(input_array, axis=0) input_array_1d = True else: input_array_1d = False # Center padding if required if center: pad_amount = n_fft // 2 input_array = np.pad( input_array, ((0, 0), (pad_amount, pad_amount)), mode=mode ) batch, length = input_array.shape # Additional input checks if n_fft <= 0 or n_fft > length: raise ValueError( f"stft: expected 0 < n_fft <= {length}, but got n_fft={n_fft}" ) if hop_length <= 0: raise ValueError( f"stft: expected hop_length > 0, but got hop_length={hop_length}" ) if win_length <= 0 or win_length > n_fft: raise ValueError( f"stft: expected 0 < win_length <= n_fft, but got win_length={win_length}" ) if window is not None: if window.ndim != 1 or window.shape[0] != win_length: raise ValueError( f"stft: expected a 1D window array of size equal to win_length={win_length}, " f"but got window with size {window.shape}" ) # Handle padding of the window if necessary if win_length < n_fft: left = (n_fft - win_length) // 2 window_ = np.zeros(n_fft, dtype=window.dtype) window_[left : left + win_length] = window else: window_ = window # Calculate the number of frames n_frames = 1 + (length - n_fft) // hop_length # Time to columns input_array = np.lib.stride_tricks.as_strided( input_array, (batch, n_frames, n_fft), ( input_array.strides[0], hop_length * input_array.strides[1], input_array.strides[1], ), ) if window_ is not None: input_array = input_array * window_ # FFT and transpose complex_fft = input_is_complex onesided = onesided if onesided is not None else not complex_fft if normalized: norm = "ortho" else: norm = None if complex_fft: if onesided: raise ValueError( "Cannot have onesided output if window or input is complex" ) output = np.fft.fft(input_array, n=n_fft, axis=-1, norm=norm) else: output = np.fft.rfft(input_array, n=n_fft, axis=-1, norm=norm) output = output.transpose((0, 2, 1)) if input_array_1d: output = output.squeeze(0) return output if return_complex else np.real(output) def __call__(self, waveform: np.ndarray, padding=160, chunk_length=None): """ Compute the log-Mel spectrogram of the provided audio. """ if chunk_length is not None: self.n_samples = chunk_length * self.sampling_rate self.nb_max_frames = self.n_samples // self.hop_length if waveform.dtype is not np.float32: waveform = waveform.astype(np.float32) if padding: waveform = np.pad(waveform, (0, padding)) window = np.hanning(self.n_fft + 1)[:-1].astype("float32") stft = self.stft( waveform, self.n_fft, self.hop_length, window=window, return_complex=True, ).astype("complex64") magnitudes = np.abs(stft[..., :-1]) ** 2 mel_spec = self.mel_filters @ magnitudes log_spec = np.log10(np.clip(mel_spec, a_min=1e-10, a_max=None)) log_spec = np.maximum(log_spec, log_spec.max() - 8.0) log_spec = (log_spec + 4.0) / 4.0 return log_spec