Source code for minispec.core.spectrum

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''Utilities for spectral processing'''
import warnings

import numpy as np
import scipy.fftpack as fft
import six

from .. import util
from ..util.exceptions import ParameterError
from ..filters import get_window, window_sumsquare

__all__ = ['stft', 'istft', 'magphase',
           'power_to_db', 'db_to_power',
           'amplitude_to_db', 'db_to_amplitude']


[docs]def stft(y, n_fft=2048, hop_length=None, win_length=None, window='hann', center=True, dtype=np.complex64, pad_mode='reflect'): """Short-time Fourier transform (STFT) Returns a complex-valued matrix D such that `np.abs(D[f, t])` is the magnitude of frequency bin `f` at frame `t` `np.angle(D[f, t])` is the phase of frequency bin `f` at frame `t` Parameters ---------- y : np.ndarray [shape=(n,)], real-valued the input signal (audio time series) n_fft : int > 0 [scalar] FFT window size hop_length : int > 0 [scalar] number audio of frames between STFT columns. If unspecified, defaults `win_length / 4`. win_length : int <= n_fft [scalar] Each frame of audio is windowed by `window()`. The window will be of length `win_length` and then padded with zeros to match `n_fft`. If unspecified, defaults to ``win_length = n_fft``. window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)] - a window specification (string, tuple, or number); see `scipy.signal.get_window` - a window function, such as `scipy.signal.hanning` - a vector or array of length `n_fft` .. see also:: `filters.get_window` center : boolean - If `True`, the signal `y` is padded so that frame `D[:, t]` is centered at `y[t * hop_length]`. - If `False`, then `D[:, t]` begins at `y[t * hop_length]` dtype : numeric type Complex numeric type for `D`. Default is 64-bit complex. pad_mode : string If `center=True`, the padding mode to use at the edges of the signal. By default, STFT uses reflection padding. Returns ------- D : np.ndarray [shape=(1 + n_fft/2, t), dtype=dtype] STFT matrix See Also -------- istft : Inverse STFT ifgram : Instantaneous frequency spectrogram np.pad : array padding Examples -------- >>> y, sr = minispec.load(minispec.util.example_audio_file()) >>> D = np.abs(minispec.stft(y)) >>> D array([[2.58028018e-03, 4.32422794e-02, 6.61255598e-01, ..., 6.82710262e-04, 2.51654536e-04, 7.23036574e-05], [2.49403086e-03, 5.15930466e-02, 6.00107312e-01, ..., 3.48026224e-04, 2.35853557e-04, 7.54836728e-05], [7.82410789e-04, 1.05394892e-01, 4.37517226e-01, ..., 6.29352580e-04, 3.38571583e-04, 8.38094638e-05], ..., [9.48568513e-08, 4.74725084e-07, 1.50052492e-05, ..., 1.85637656e-08, 2.89708542e-08, 5.74304337e-09], [1.25165826e-07, 8.58259284e-07, 1.11157215e-05, ..., 3.49099771e-08, 3.11740926e-08, 5.29926236e-09], [1.70630571e-07, 8.92518756e-07, 1.23656537e-05, ..., 5.33256745e-08, 3.33264900e-08, 5.13272980e-09]], dtype=float32) Use left-aligned frames, instead of centered frames >>> D_left = np.abs(minispec.stft(y, center=False)) Use a shorter hop length >>> D_short = np.abs(minispec.stft(y, hop_length=64)) Display a spectrogram >>> import matplotlib.pyplot as plt >>> minispec.display.specshow(minispec.amplitude_to_db(D, ... ref=np.max), ... y_axis='log', x_axis='time') >>> plt.title('Power spectrogram') >>> plt.colorbar(format='%+2.0f dB') >>> plt.tight_layout() """ # By default, use the entire frame if win_length is None: win_length = n_fft # Set the default hop, if it's not already specified if hop_length is None: hop_length = int(win_length // 4) fft_window = get_window(window, win_length, fftbins=True) # Pad the window out to n_fft size fft_window = util.pad_center(fft_window, n_fft) # Reshape so that the window can be broadcast fft_window = fft_window.reshape((-1, 1)) # Check audio is valid util.valid_audio(y) # Pad the time series so that frames are centered if center: y = np.pad(y, int(n_fft // 2), mode=pad_mode) # Window the time series. y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length) # Pre-allocate the STFT matrix stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]), dtype=dtype, order='F') # how many columns can we fit within MAX_MEM_BLOCK? n_columns = int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] * stft_matrix.itemsize)) for bl_s in range(0, stft_matrix.shape[1], n_columns): bl_t = min(bl_s + n_columns, stft_matrix.shape[1]) stft_matrix[:, bl_s:bl_t] = fft.fft(fft_window * y_frames[:, bl_s:bl_t], axis=0)[:stft_matrix.shape[0]] return stft_matrix
[docs]def istft(stft_matrix, hop_length=None, win_length=None, window='hann', center=True, dtype=np.float32, length=None): """ Inverse short-time Fourier transform (ISTFT). Converts a complex-valued spectrogram `stft_matrix` to time-series `y` by minimizing the mean squared error between `stft_matrix` and STFT of `y` as described in [1]_ up to Section 2 (reconstruction from MSTFT). In general, window function, hop length and other parameters should be same as in stft, which mostly leads to perfect reconstruction of a signal from unmodified `stft_matrix`. .. [1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform," IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984. Parameters ---------- stft_matrix : np.ndarray [shape=(1 + n_fft/2, t)] STFT matrix from `stft` hop_length : int > 0 [scalar] Number of frames between STFT columns. If unspecified, defaults to `win_length / 4`. win_length : int <= n_fft = 2 * (stft_matrix.shape[0] - 1) When reconstructing the time series, each frame is windowed and each sample is normalized by the sum of squared window according to the `window` function (see below). If unspecified, defaults to `n_fft`. window : string, tuple, number, function, np.ndarray [shape=(n_fft,)] - a window specification (string, tuple, or number); see `scipy.signal.get_window` - a window function, such as `scipy.signal.hanning` - a user-specified window vector of length `n_fft` .. see also:: `filters.get_window` center : boolean - If `True`, `D` is assumed to have centered frames. - If `False`, `D` is assumed to have left-aligned frames. dtype : numeric type Real numeric type for `y`. Default is 32-bit float. length : int > 0, optional If provided, the output `y` is zero-padded or clipped to exactly `length` samples. Returns ------- y : np.ndarray [shape=(n,)] time domain signal reconstructed from `stft_matrix` See Also -------- stft : Short-time Fourier Transform Examples -------- >>> y, sr = minispec.load(minispec.util.example_audio_file()) >>> D = minispec.stft(y) >>> y_hat = minispec.istft(D) >>> y_hat array([ -4.812e-06, -4.267e-06, ..., 6.271e-06, 2.827e-07], dtype=float32) Exactly preserving length of the input signal requires explicit padding. Otherwise, a partial frame at the end of `y` will not be represented. >>> n = len(y) >>> n_fft = 2048 >>> y_pad = minispec.util.fix_length(y, n + n_fft // 2) >>> D = minispec.stft(y_pad, n_fft=n_fft) >>> y_out = minispec.istft(D, length=n) >>> np.max(np.abs(y - y_out)) 1.4901161e-07 """ n_fft = 2 * (stft_matrix.shape[0] - 1) # By default, use the entire frame if win_length is None: win_length = n_fft # Set the default hop, if it's not already specified if hop_length is None: hop_length = int(win_length // 4) ifft_window = get_window(window, win_length, fftbins=True) # Pad out to match n_fft ifft_window = util.pad_center(ifft_window, n_fft) n_frames = stft_matrix.shape[1] expected_signal_len = n_fft + hop_length * (n_frames - 1) y = np.zeros(expected_signal_len, dtype=dtype) for i in range(n_frames): sample = i * hop_length spec = stft_matrix[:, i].flatten() spec = np.concatenate((spec, spec[-2:0:-1].conj()), 0) ytmp = ifft_window * fft.ifft(spec).real y[sample:(sample + n_fft)] = y[sample:(sample + n_fft)] + ytmp # Normalize by sum of squared window ifft_window_sum = window_sumsquare(window, n_frames, win_length=win_length, n_fft=n_fft, hop_length=hop_length, dtype=dtype) approx_nonzero_indices = ifft_window_sum > util.tiny(ifft_window_sum) y[approx_nonzero_indices] /= ifft_window_sum[approx_nonzero_indices] if length is None: # If we don't need to control length, just do the usual center trimming # to eliminate padded data if center: y = y[int(n_fft // 2):-int(n_fft // 2)] else: if center: # If we're centering, crop off the first n_fft//2 samples # and then trim/pad to the target length. # We don't trim the end here, so that if the signal is zero-padded # to a longer duration, the decay is smooth by windowing start = int(n_fft // 2) else: # If we're not centering, start at 0 and trim/pad as necessary start = 0 y = util.fix_length(y[start:], length) return y
def magphase(D, power=1): """Separate a complex-valued spectrogram D into its magnitude (S) and phase (P) components, so that `D = S * P`. Parameters ---------- D : np.ndarray [shape=(d, t), dtype=complex] complex-valued spectrogram power : float > 0 Exponent for the magnitude spectrogram, e.g., 1 for energy, 2 for power, etc. Returns ------- D_mag : np.ndarray [shape=(d, t), dtype=real] magnitude of `D`, raised to `power` D_phase : np.ndarray [shape=(d, t), dtype=complex] `exp(1.j * phi)` where `phi` is the phase of `D` Examples -------- >>> y, sr = minispec.load(minispec.util.example_audio_file()) >>> D = minispec.stft(y) >>> magnitude, phase = minispec.magphase(D) >>> magnitude array([[ 2.524e-03, 4.329e-02, ..., 3.217e-04, 3.520e-05], [ 2.645e-03, 5.152e-02, ..., 3.283e-04, 3.432e-04], ..., [ 1.966e-05, 9.828e-06, ..., 3.164e-07, 9.370e-06], [ 1.966e-05, 9.830e-06, ..., 3.161e-07, 9.366e-06]], dtype=float32) >>> phase array([[ 1.000e+00 +0.000e+00j, 1.000e+00 +0.000e+00j, ..., -1.000e+00 +8.742e-08j, -1.000e+00 +8.742e-08j], [ 1.000e+00 +1.615e-16j, 9.950e-01 -1.001e-01j, ..., 9.794e-01 +2.017e-01j, 1.492e-02 -9.999e-01j], ..., [ 1.000e+00 -5.609e-15j, -5.081e-04 +1.000e+00j, ..., -9.549e-01 -2.970e-01j, 2.938e-01 -9.559e-01j], [ -1.000e+00 +8.742e-08j, -1.000e+00 +8.742e-08j, ..., -1.000e+00 +8.742e-08j, -1.000e+00 +8.742e-08j]], dtype=complex64) Or get the phase angle (in radians) >>> np.angle(phase) array([[ 0.000e+00, 0.000e+00, ..., 3.142e+00, 3.142e+00], [ 1.615e-16, -1.003e-01, ..., 2.031e-01, -1.556e+00], ..., [ -5.609e-15, 1.571e+00, ..., -2.840e+00, -1.273e+00], [ 3.142e+00, 3.142e+00, ..., 3.142e+00, 3.142e+00]], dtype=float32) """ mag = np.abs(D) mag **= power phase = np.exp(1.j * np.angle(D)) return mag, phase
[docs]def power_to_db(S, ref=1.0, amin=1e-10, top_db=80.0): """Convert a power spectrogram (amplitude squared) to decibel (dB) units This computes the scaling ``10 * log10(S / ref)`` in a numerically stable way. Parameters ---------- S : np.ndarray input power ref : scalar or callable If scalar, the amplitude `abs(S)` is scaled relative to `ref`: `10 * log10(S / ref)`. Zeros in the output correspond to positions where `S == ref`. If callable, the reference value is computed as `ref(S)`. amin : float > 0 [scalar] minimum threshold for `abs(S)` and `ref` top_db : float >= 0 [scalar] threshold the output at `top_db` below the peak: ``max(10 * log10(S)) - top_db`` Returns ------- S_db : np.ndarray ``S_db ~= 10 * log10(S) - 10 * log10(ref)`` See Also -------- perceptual_weighting db_to_power amplitude_to_db db_to_amplitude Examples -------- Get a power spectrogram from a waveform ``y`` >>> y, sr = minispec.load(minispec.util.example_audio_file()) >>> S = np.abs(minispec.stft(y)) >>> minispec.power_to_db(S**2) array([[-33.293, -27.32 , ..., -33.293, -33.293], [-33.293, -25.723, ..., -33.293, -33.293], ..., [-33.293, -33.293, ..., -33.293, -33.293], [-33.293, -33.293, ..., -33.293, -33.293]], dtype=float32) Compute dB relative to peak power >>> minispec.power_to_db(S**2, ref=np.max) array([[-80. , -74.027, ..., -80. , -80. ], [-80. , -72.431, ..., -80. , -80. ], ..., [-80. , -80. , ..., -80. , -80. ], [-80. , -80. , ..., -80. , -80. ]], dtype=float32) Or compare to median power >>> minispec.power_to_db(S**2, ref=np.median) array([[-0.189, 5.784, ..., -0.189, -0.189], [-0.189, 7.381, ..., -0.189, -0.189], ..., [-0.189, -0.189, ..., -0.189, -0.189], [-0.189, -0.189, ..., -0.189, -0.189]], dtype=float32) And plot the results >>> import matplotlib.pyplot as plt >>> plt.figure() >>> plt.subplot(2, 1, 1) >>> minispec.display.specshow(S**2, sr=sr, y_axis='log') >>> plt.colorbar() >>> plt.title('Power spectrogram') >>> plt.subplot(2, 1, 2) >>> minispec.display.specshow(minispec.power_to_db(S**2, ref=np.max), ... sr=sr, y_axis='log', x_axis='time') >>> plt.colorbar(format='%+2.0f dB') >>> plt.title('Log-Power spectrogram') >>> plt.tight_layout() """ S = np.asarray(S) if amin <= 0: raise ParameterError('amin must be strictly positive') if np.issubdtype(S.dtype, np.complexfloating): warnings.warn('power_to_db was called on complex input so phase ' 'information will be discarded. To suppress this warning, ' 'call power_to_db(np.abs(D)**2) instead.') magnitude = np.abs(S) else: magnitude = S if six.callable(ref): # User supplied a function to calculate reference power ref_value = ref(magnitude) else: ref_value = np.abs(ref) log_spec = 10.0 * np.log10(np.maximum(amin, magnitude)) log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value)) if top_db is not None: if top_db < 0: raise ParameterError('top_db must be non-negative') log_spec = np.maximum(log_spec, log_spec.max() - top_db) return log_spec
[docs]def db_to_power(S_db, ref=1.0): '''Convert a dB-scale spectrogram to a power spectrogram. This effectively inverts `power_to_db`: `db_to_power(S_db) ~= ref * 10.0**(S_db / 10)` Parameters ---------- S_db : np.ndarray dB-scaled spectrogram ref : number > 0 Reference power: output will be scaled by this value Returns ------- S : np.ndarray Power spectrogram ''' return ref * np.power(10.0, 0.1 * S_db)
[docs]def amplitude_to_db(S, ref=1.0, amin=1e-5, top_db=80.0): '''Convert an amplitude spectrogram to dB-scaled spectrogram. This is equivalent to ``power_to_db(S**2)``, but is provided for convenience. Parameters ---------- S : np.ndarray input amplitude ref : scalar or callable If scalar, the amplitude `abs(S)` is scaled relative to `ref`: `20 * log10(S / ref)`. Zeros in the output correspond to positions where `S == ref`. If callable, the reference value is computed as `ref(S)`. amin : float > 0 [scalar] minimum threshold for `S` and `ref` top_db : float >= 0 [scalar] threshold the output at `top_db` below the peak: ``max(20 * log10(S)) - top_db`` Returns ------- S_db : np.ndarray ``S`` measured in dB See Also -------- power_to_db, db_to_amplitude ''' S = np.asarray(S) if np.issubdtype(S.dtype, np.complexfloating): warnings.warn('amplitude_to_db was called on complex input so phase ' 'information will be discarded. To suppress this warning, ' 'call amplitude_to_db(np.abs(S)) instead.') magnitude = np.abs(S) if six.callable(ref): # User supplied a function to calculate reference power ref_value = ref(magnitude) else: ref_value = np.abs(ref) power = np.square(magnitude, out=magnitude) return power_to_db(power, ref=ref_value**2, amin=amin**2, top_db=top_db)
[docs]def db_to_amplitude(S_db, ref=1.0): '''Convert a dB-scaled spectrogram to an amplitude spectrogram. This effectively inverts `amplitude_to_db`: `db_to_amplitude(S_db) ~= 10.0**(0.5 * (S_db + log10(ref)/10))` Parameters ---------- S_db : np.ndarray dB-scaled spectrogram ref: number > 0 Optional reference power. Returns ------- S : np.ndarray Linear magnitude spectrogram ''' return db_to_power(S_db, ref=ref**2)**0.5
def _spectrogram(y=None, S=None, n_fft=2048, hop_length=512, power=1): '''Helper function to retrieve a magnitude spectrogram. This is primarily used in feature extraction functions that can operate on either audio time-series or spectrogram input. Parameters ---------- y : None or np.ndarray [ndim=1] If provided, an audio time series S : None or np.ndarray Spectrogram input, optional n_fft : int > 0 STFT window size hop_length : int > 0 STFT hop length power : float > 0 Exponent for the magnitude spectrogram, e.g., 1 for energy, 2 for power, etc. Returns ------- S_out : np.ndarray [dtype=np.float32] - If `S` is provided as input, then `S_out == S` - Else, `S_out = |stft(y, n_fft=n_fft, hop_length=hop_length)|**power` n_fft : int > 0 - If `S` is provided, then `n_fft` is inferred from `S` - Else, copied from input ''' if S is not None: # Infer n_fft from spectrogram shape n_fft = 2 * (S.shape[0] - 1) else: # Otherwise, compute a magnitude spectrogram from input S = np.abs(stft(y, n_fft=n_fft, hop_length=hop_length))**power return S, n_fft