#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''Utilities for spectral processing'''
import warnings
import numpy as np
import scipy.fftpack as fft
import six
from .. import util
from ..util.exceptions import ParameterError
from ..filters import get_window, window_sumsquare
__all__ = ['stft', 'istft', 'magphase',
'power_to_db', 'db_to_power',
'amplitude_to_db', 'db_to_amplitude']
[docs]def stft(y, n_fft=2048, hop_length=None, win_length=None, window='hann',
center=True, dtype=np.complex64, pad_mode='reflect'):
"""Short-time Fourier transform (STFT)
Returns a complex-valued matrix D such that
`np.abs(D[f, t])` is the magnitude of frequency bin `f`
at frame `t`
`np.angle(D[f, t])` is the phase of frequency bin `f`
at frame `t`
Parameters
----------
y : np.ndarray [shape=(n,)], real-valued
the input signal (audio time series)
n_fft : int > 0 [scalar]
FFT window size
hop_length : int > 0 [scalar]
number audio of frames between STFT columns.
If unspecified, defaults `win_length / 4`.
win_length : int <= n_fft [scalar]
Each frame of audio is windowed by `window()`.
The window will be of length `win_length` and then padded
with zeros to match `n_fft`.
If unspecified, defaults to ``win_length = n_fft``.
window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
- a window specification (string, tuple, or number);
see `scipy.signal.get_window`
- a window function, such as `scipy.signal.hanning`
- a vector or array of length `n_fft`
.. see also:: `filters.get_window`
center : boolean
- If `True`, the signal `y` is padded so that frame
`D[:, t]` is centered at `y[t * hop_length]`.
- If `False`, then `D[:, t]` begins at `y[t * hop_length]`
dtype : numeric type
Complex numeric type for `D`. Default is 64-bit complex.
pad_mode : string
If `center=True`, the padding mode to use at the edges of the signal.
By default, STFT uses reflection padding.
Returns
-------
D : np.ndarray [shape=(1 + n_fft/2, t), dtype=dtype]
STFT matrix
See Also
--------
istft : Inverse STFT
ifgram : Instantaneous frequency spectrogram
np.pad : array padding
Examples
--------
>>> y, sr = minispec.load(minispec.util.example_audio_file())
>>> D = np.abs(minispec.stft(y))
>>> D
array([[2.58028018e-03, 4.32422794e-02, 6.61255598e-01, ...,
6.82710262e-04, 2.51654536e-04, 7.23036574e-05],
[2.49403086e-03, 5.15930466e-02, 6.00107312e-01, ...,
3.48026224e-04, 2.35853557e-04, 7.54836728e-05],
[7.82410789e-04, 1.05394892e-01, 4.37517226e-01, ...,
6.29352580e-04, 3.38571583e-04, 8.38094638e-05],
...,
[9.48568513e-08, 4.74725084e-07, 1.50052492e-05, ...,
1.85637656e-08, 2.89708542e-08, 5.74304337e-09],
[1.25165826e-07, 8.58259284e-07, 1.11157215e-05, ...,
3.49099771e-08, 3.11740926e-08, 5.29926236e-09],
[1.70630571e-07, 8.92518756e-07, 1.23656537e-05, ...,
5.33256745e-08, 3.33264900e-08, 5.13272980e-09]], dtype=float32)
Use left-aligned frames, instead of centered frames
>>> D_left = np.abs(minispec.stft(y, center=False))
Use a shorter hop length
>>> D_short = np.abs(minispec.stft(y, hop_length=64))
Display a spectrogram
>>> import matplotlib.pyplot as plt
>>> minispec.display.specshow(minispec.amplitude_to_db(D,
... ref=np.max),
... y_axis='log', x_axis='time')
>>> plt.title('Power spectrogram')
>>> plt.colorbar(format='%+2.0f dB')
>>> plt.tight_layout()
"""
# By default, use the entire frame
if win_length is None:
win_length = n_fft
# Set the default hop, if it's not already specified
if hop_length is None:
hop_length = int(win_length // 4)
fft_window = get_window(window, win_length, fftbins=True)
# Pad the window out to n_fft size
fft_window = util.pad_center(fft_window, n_fft)
# Reshape so that the window can be broadcast
fft_window = fft_window.reshape((-1, 1))
# Check audio is valid
util.valid_audio(y)
# Pad the time series so that frames are centered
if center:
y = np.pad(y, int(n_fft // 2), mode=pad_mode)
# Window the time series.
y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)
# Pre-allocate the STFT matrix
stft_matrix = np.empty((int(1 + n_fft // 2), y_frames.shape[1]),
dtype=dtype,
order='F')
# how many columns can we fit within MAX_MEM_BLOCK?
n_columns = int(util.MAX_MEM_BLOCK / (stft_matrix.shape[0] *
stft_matrix.itemsize))
for bl_s in range(0, stft_matrix.shape[1], n_columns):
bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
stft_matrix[:, bl_s:bl_t] = fft.fft(fft_window *
y_frames[:, bl_s:bl_t],
axis=0)[:stft_matrix.shape[0]]
return stft_matrix
[docs]def istft(stft_matrix, hop_length=None, win_length=None, window='hann',
center=True, dtype=np.float32, length=None):
"""
Inverse short-time Fourier transform (ISTFT).
Converts a complex-valued spectrogram `stft_matrix` to time-series `y`
by minimizing the mean squared error between `stft_matrix` and STFT of
`y` as described in [1]_ up to Section 2 (reconstruction from MSTFT).
In general, window function, hop length and other parameters should be same
as in stft, which mostly leads to perfect reconstruction of a signal from
unmodified `stft_matrix`.
.. [1] D. W. Griffin and J. S. Lim,
"Signal estimation from modified short-time Fourier transform,"
IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.
Parameters
----------
stft_matrix : np.ndarray [shape=(1 + n_fft/2, t)]
STFT matrix from `stft`
hop_length : int > 0 [scalar]
Number of frames between STFT columns.
If unspecified, defaults to `win_length / 4`.
win_length : int <= n_fft = 2 * (stft_matrix.shape[0] - 1)
When reconstructing the time series, each frame is windowed
and each sample is normalized by the sum of squared window
according to the `window` function (see below).
If unspecified, defaults to `n_fft`.
window : string, tuple, number, function, np.ndarray [shape=(n_fft,)]
- a window specification (string, tuple, or number);
see `scipy.signal.get_window`
- a window function, such as `scipy.signal.hanning`
- a user-specified window vector of length `n_fft`
.. see also:: `filters.get_window`
center : boolean
- If `True`, `D` is assumed to have centered frames.
- If `False`, `D` is assumed to have left-aligned frames.
dtype : numeric type
Real numeric type for `y`. Default is 32-bit float.
length : int > 0, optional
If provided, the output `y` is zero-padded or clipped to exactly
`length` samples.
Returns
-------
y : np.ndarray [shape=(n,)]
time domain signal reconstructed from `stft_matrix`
See Also
--------
stft : Short-time Fourier Transform
Examples
--------
>>> y, sr = minispec.load(minispec.util.example_audio_file())
>>> D = minispec.stft(y)
>>> y_hat = minispec.istft(D)
>>> y_hat
array([ -4.812e-06, -4.267e-06, ..., 6.271e-06, 2.827e-07], dtype=float32)
Exactly preserving length of the input signal requires explicit padding.
Otherwise, a partial frame at the end of `y` will not be represented.
>>> n = len(y)
>>> n_fft = 2048
>>> y_pad = minispec.util.fix_length(y, n + n_fft // 2)
>>> D = minispec.stft(y_pad, n_fft=n_fft)
>>> y_out = minispec.istft(D, length=n)
>>> np.max(np.abs(y - y_out))
1.4901161e-07
"""
n_fft = 2 * (stft_matrix.shape[0] - 1)
# By default, use the entire frame
if win_length is None:
win_length = n_fft
# Set the default hop, if it's not already specified
if hop_length is None:
hop_length = int(win_length // 4)
ifft_window = get_window(window, win_length, fftbins=True)
# Pad out to match n_fft
ifft_window = util.pad_center(ifft_window, n_fft)
n_frames = stft_matrix.shape[1]
expected_signal_len = n_fft + hop_length * (n_frames - 1)
y = np.zeros(expected_signal_len, dtype=dtype)
for i in range(n_frames):
sample = i * hop_length
spec = stft_matrix[:, i].flatten()
spec = np.concatenate((spec, spec[-2:0:-1].conj()), 0)
ytmp = ifft_window * fft.ifft(spec).real
y[sample:(sample + n_fft)] = y[sample:(sample + n_fft)] + ytmp
# Normalize by sum of squared window
ifft_window_sum = window_sumsquare(window,
n_frames,
win_length=win_length,
n_fft=n_fft,
hop_length=hop_length,
dtype=dtype)
approx_nonzero_indices = ifft_window_sum > util.tiny(ifft_window_sum)
y[approx_nonzero_indices] /= ifft_window_sum[approx_nonzero_indices]
if length is None:
# If we don't need to control length, just do the usual center trimming
# to eliminate padded data
if center:
y = y[int(n_fft // 2):-int(n_fft // 2)]
else:
if center:
# If we're centering, crop off the first n_fft//2 samples
# and then trim/pad to the target length.
# We don't trim the end here, so that if the signal is zero-padded
# to a longer duration, the decay is smooth by windowing
start = int(n_fft // 2)
else:
# If we're not centering, start at 0 and trim/pad as necessary
start = 0
y = util.fix_length(y[start:], length)
return y
def magphase(D, power=1):
"""Separate a complex-valued spectrogram D into its magnitude (S)
and phase (P) components, so that `D = S * P`.
Parameters
----------
D : np.ndarray [shape=(d, t), dtype=complex]
complex-valued spectrogram
power : float > 0
Exponent for the magnitude spectrogram,
e.g., 1 for energy, 2 for power, etc.
Returns
-------
D_mag : np.ndarray [shape=(d, t), dtype=real]
magnitude of `D`, raised to `power`
D_phase : np.ndarray [shape=(d, t), dtype=complex]
`exp(1.j * phi)` where `phi` is the phase of `D`
Examples
--------
>>> y, sr = minispec.load(minispec.util.example_audio_file())
>>> D = minispec.stft(y)
>>> magnitude, phase = minispec.magphase(D)
>>> magnitude
array([[ 2.524e-03, 4.329e-02, ..., 3.217e-04, 3.520e-05],
[ 2.645e-03, 5.152e-02, ..., 3.283e-04, 3.432e-04],
...,
[ 1.966e-05, 9.828e-06, ..., 3.164e-07, 9.370e-06],
[ 1.966e-05, 9.830e-06, ..., 3.161e-07, 9.366e-06]], dtype=float32)
>>> phase
array([[ 1.000e+00 +0.000e+00j, 1.000e+00 +0.000e+00j, ...,
-1.000e+00 +8.742e-08j, -1.000e+00 +8.742e-08j],
[ 1.000e+00 +1.615e-16j, 9.950e-01 -1.001e-01j, ...,
9.794e-01 +2.017e-01j, 1.492e-02 -9.999e-01j],
...,
[ 1.000e+00 -5.609e-15j, -5.081e-04 +1.000e+00j, ...,
-9.549e-01 -2.970e-01j, 2.938e-01 -9.559e-01j],
[ -1.000e+00 +8.742e-08j, -1.000e+00 +8.742e-08j, ...,
-1.000e+00 +8.742e-08j, -1.000e+00 +8.742e-08j]], dtype=complex64)
Or get the phase angle (in radians)
>>> np.angle(phase)
array([[ 0.000e+00, 0.000e+00, ..., 3.142e+00, 3.142e+00],
[ 1.615e-16, -1.003e-01, ..., 2.031e-01, -1.556e+00],
...,
[ -5.609e-15, 1.571e+00, ..., -2.840e+00, -1.273e+00],
[ 3.142e+00, 3.142e+00, ..., 3.142e+00, 3.142e+00]], dtype=float32)
"""
mag = np.abs(D)
mag **= power
phase = np.exp(1.j * np.angle(D))
return mag, phase
[docs]def power_to_db(S, ref=1.0, amin=1e-10, top_db=80.0):
"""Convert a power spectrogram (amplitude squared) to decibel (dB) units
This computes the scaling ``10 * log10(S / ref)`` in a numerically
stable way.
Parameters
----------
S : np.ndarray
input power
ref : scalar or callable
If scalar, the amplitude `abs(S)` is scaled relative to `ref`:
`10 * log10(S / ref)`.
Zeros in the output correspond to positions where `S == ref`.
If callable, the reference value is computed as `ref(S)`.
amin : float > 0 [scalar]
minimum threshold for `abs(S)` and `ref`
top_db : float >= 0 [scalar]
threshold the output at `top_db` below the peak:
``max(10 * log10(S)) - top_db``
Returns
-------
S_db : np.ndarray
``S_db ~= 10 * log10(S) - 10 * log10(ref)``
See Also
--------
perceptual_weighting
db_to_power
amplitude_to_db
db_to_amplitude
Examples
--------
Get a power spectrogram from a waveform ``y``
>>> y, sr = minispec.load(minispec.util.example_audio_file())
>>> S = np.abs(minispec.stft(y))
>>> minispec.power_to_db(S**2)
array([[-33.293, -27.32 , ..., -33.293, -33.293],
[-33.293, -25.723, ..., -33.293, -33.293],
...,
[-33.293, -33.293, ..., -33.293, -33.293],
[-33.293, -33.293, ..., -33.293, -33.293]], dtype=float32)
Compute dB relative to peak power
>>> minispec.power_to_db(S**2, ref=np.max)
array([[-80. , -74.027, ..., -80. , -80. ],
[-80. , -72.431, ..., -80. , -80. ],
...,
[-80. , -80. , ..., -80. , -80. ],
[-80. , -80. , ..., -80. , -80. ]], dtype=float32)
Or compare to median power
>>> minispec.power_to_db(S**2, ref=np.median)
array([[-0.189, 5.784, ..., -0.189, -0.189],
[-0.189, 7.381, ..., -0.189, -0.189],
...,
[-0.189, -0.189, ..., -0.189, -0.189],
[-0.189, -0.189, ..., -0.189, -0.189]], dtype=float32)
And plot the results
>>> import matplotlib.pyplot as plt
>>> plt.figure()
>>> plt.subplot(2, 1, 1)
>>> minispec.display.specshow(S**2, sr=sr, y_axis='log')
>>> plt.colorbar()
>>> plt.title('Power spectrogram')
>>> plt.subplot(2, 1, 2)
>>> minispec.display.specshow(minispec.power_to_db(S**2, ref=np.max),
... sr=sr, y_axis='log', x_axis='time')
>>> plt.colorbar(format='%+2.0f dB')
>>> plt.title('Log-Power spectrogram')
>>> plt.tight_layout()
"""
S = np.asarray(S)
if amin <= 0:
raise ParameterError('amin must be strictly positive')
if np.issubdtype(S.dtype, np.complexfloating):
warnings.warn('power_to_db was called on complex input so phase '
'information will be discarded. To suppress this warning, '
'call power_to_db(np.abs(D)**2) instead.')
magnitude = np.abs(S)
else:
magnitude = S
if six.callable(ref):
# User supplied a function to calculate reference power
ref_value = ref(magnitude)
else:
ref_value = np.abs(ref)
log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
if top_db is not None:
if top_db < 0:
raise ParameterError('top_db must be non-negative')
log_spec = np.maximum(log_spec, log_spec.max() - top_db)
return log_spec
[docs]def db_to_power(S_db, ref=1.0):
'''Convert a dB-scale spectrogram to a power spectrogram.
This effectively inverts `power_to_db`:
`db_to_power(S_db) ~= ref * 10.0**(S_db / 10)`
Parameters
----------
S_db : np.ndarray
dB-scaled spectrogram
ref : number > 0
Reference power: output will be scaled by this value
Returns
-------
S : np.ndarray
Power spectrogram
'''
return ref * np.power(10.0, 0.1 * S_db)
[docs]def amplitude_to_db(S, ref=1.0, amin=1e-5, top_db=80.0):
'''Convert an amplitude spectrogram to dB-scaled spectrogram.
This is equivalent to ``power_to_db(S**2)``, but is provided for convenience.
Parameters
----------
S : np.ndarray
input amplitude
ref : scalar or callable
If scalar, the amplitude `abs(S)` is scaled relative to `ref`:
`20 * log10(S / ref)`.
Zeros in the output correspond to positions where `S == ref`.
If callable, the reference value is computed as `ref(S)`.
amin : float > 0 [scalar]
minimum threshold for `S` and `ref`
top_db : float >= 0 [scalar]
threshold the output at `top_db` below the peak:
``max(20 * log10(S)) - top_db``
Returns
-------
S_db : np.ndarray
``S`` measured in dB
See Also
--------
power_to_db, db_to_amplitude
'''
S = np.asarray(S)
if np.issubdtype(S.dtype, np.complexfloating):
warnings.warn('amplitude_to_db was called on complex input so phase '
'information will be discarded. To suppress this warning, '
'call amplitude_to_db(np.abs(S)) instead.')
magnitude = np.abs(S)
if six.callable(ref):
# User supplied a function to calculate reference power
ref_value = ref(magnitude)
else:
ref_value = np.abs(ref)
power = np.square(magnitude, out=magnitude)
return power_to_db(power, ref=ref_value**2, amin=amin**2,
top_db=top_db)
[docs]def db_to_amplitude(S_db, ref=1.0):
'''Convert a dB-scaled spectrogram to an amplitude spectrogram.
This effectively inverts `amplitude_to_db`:
`db_to_amplitude(S_db) ~= 10.0**(0.5 * (S_db + log10(ref)/10))`
Parameters
----------
S_db : np.ndarray
dB-scaled spectrogram
ref: number > 0
Optional reference power.
Returns
-------
S : np.ndarray
Linear magnitude spectrogram
'''
return db_to_power(S_db, ref=ref**2)**0.5
def _spectrogram(y=None, S=None, n_fft=2048, hop_length=512, power=1):
'''Helper function to retrieve a magnitude spectrogram.
This is primarily used in feature extraction functions that can operate on
either audio time-series or spectrogram input.
Parameters
----------
y : None or np.ndarray [ndim=1]
If provided, an audio time series
S : None or np.ndarray
Spectrogram input, optional
n_fft : int > 0
STFT window size
hop_length : int > 0
STFT hop length
power : float > 0
Exponent for the magnitude spectrogram,
e.g., 1 for energy, 2 for power, etc.
Returns
-------
S_out : np.ndarray [dtype=np.float32]
- If `S` is provided as input, then `S_out == S`
- Else, `S_out = |stft(y, n_fft=n_fft, hop_length=hop_length)|**power`
n_fft : int > 0
- If `S` is provided, then `n_fft` is inferred from `S`
- Else, copied from input
'''
if S is not None:
# Infer n_fft from spectrogram shape
n_fft = 2 * (S.shape[0] - 1)
else:
# Otherwise, compute a magnitude spectrogram from input
S = np.abs(stft(y, n_fft=n_fft, hop_length=hop_length))**power
return S, n_fft