Source code for libfmp.c8.c8s2_salience

"""
Module: libfmp.c8.c8s2_salience
Author: Sebastian Rosenzweig, Meinard Müller
License: The MIT license, https://opensource.org/licenses/MIT

This file is part of the FMP Notebooks (https://www.audiolabs-erlangen.de/FMP)
"""

import numpy as np
import librosa
from scipy import ndimage
from numba import jit

import libfmp.b
import libfmp.c8


[docs]@jit(nopython=True) def principal_argument(v): """Principal argument function | Notebook: C6/C6S1_NoveltyPhase.ipynb, see also | Notebook: C8/C8S2_InstantFreqEstimation.ipynb Args: v (float or np.ndarray): Value (or vector of values) Returns: w (float or np.ndarray): Principle value of v """ w = np.mod(v + 0.5, 1) - 0.5 return w
[docs]@jit(nopython=True) def compute_if(X, Fs, N, H): """Instantenous frequency (IF) estamation | Notebook: C8/C8S2_InstantFreqEstimation.ipynb, see also | Notebook: C6/C6S1_NoveltyPhase.ipynb Args: X (np.ndarray): STFT Fs (scalar): Sampling rate N (int): Window size in samples H (int): Hop size in samples Returns: F_coef_IF (np.ndarray): Matrix of IF values """ phi_1 = np.angle(X[:, 0:-1]) / (2 * np.pi) phi_2 = np.angle(X[:, 1:]) / (2 * np.pi) K = X.shape[0] index_k = np.arange(0, K).reshape(-1, 1) # Bin offset (FMP, Eq. (8.45)) kappa = (N / H) * principal_argument(phi_2 - phi_1 - index_k * H / N) # Instantaneous frequencies (FMP, Eq. (8.44)) F_coef_IF = (index_k + kappa) * Fs / N # Extend F_coef_IF by copying first column to match dimensions of X F_coef_IF = np.hstack((np.copy(F_coef_IF[:, 0]).reshape(-1, 1), F_coef_IF)) return F_coef_IF
[docs]@jit(nopython=True) def f_coef(k, Fs, N): """STFT center frequency Notebook: C8/C8S2_SalienceRepresentation.ipynb Args: k (int): Coefficient number Fs (scalar): Sampling rate in Hz N (int): Window length in samples Returns: freq (float): STFT center frequency """ return k * Fs / N
[docs]@jit(nopython=True) def frequency_to_bin_index(F, R=10.0, F_ref=55.0): """| Binning function with variable frequency resolution | Note: Indexing starts with 0 (opposed to [FMP, Eq. (8.49)]) Notebook: C8/C8S2_SalienceRepresentation.ipynb Args: F (float): Frequency in Hz R (float): Frequency resolution in cents (Default value = 10.0) F_ref (float): Reference frequency in Hz (Default value = 55.0) Returns: bin_index (int): Index for bin (starting with index 0) """ bin_index = np.floor((1200 / R) * np.log2(F / F_ref) + 0.5).astype(np.int64) return bin_index
[docs]@jit(nopython=True) def p_bin(b, freq, R=10.0, F_ref=55.0): """Computes binning mask [FMP, Eq. (8.50)] Notebook: C8/C8S2_SalienceRepresentation.ipynb Args: b (int): Bin index freq (float): Center frequency R (float): Frequency resolution in cents (Default value = 10.0) F_ref (float): Reference frequency in Hz (Default value = 55.0) Returns: mask (float): Binning mask """ mask = frequency_to_bin_index(freq, R, F_ref) == b mask = mask.reshape(-1, 1) return mask
[docs]@jit(nopython=True) def compute_y_lf_bin(Y, Fs, N, R=10.0, F_min=55.0, F_max=1760.0): """Log-frequency Spectrogram with variable frequency resolution using binning Notebook: C8/C8S2_SalienceRepresentation.ipynb Args: Y (np.ndarray): Magnitude spectrogram Fs (scalar): Sampling rate in Hz N (int): Window length in samples R (float): Frequency resolution in cents (Default value = 10.0) F_min (float): Lower frequency bound (reference frequency) (Default value = 55.0) F_max (float): Upper frequency bound (is included) (Default value = 1760.0) Returns: Y_LF_bin (np.ndarray): Binned log-frequency spectrogram F_coef_hertz (np.ndarray): Frequency axis in Hz F_coef_cents (np.ndarray): Frequency axis in cents """ # [FMP, Eq. (8.51)] B = frequency_to_bin_index(np.array([F_max]), R, F_min)[0] + 1 F_coef_hertz = 2 ** (np.arange(0, B) * R / 1200) * F_min F_coef_cents = np.arange(0, B*R, R) Y_LF_bin = np.zeros((B, Y.shape[1])) K = Y.shape[0] freq = f_coef(np.arange(0, K), Fs, N) freq_lim_idx = np.where(np.logical_and(freq >= F_min, freq <= F_max))[0] freq_lim = freq[freq_lim_idx] Y_lim = Y[freq_lim_idx, :] for b in range(B): coef_mask = p_bin(b, freq_lim, R, F_min) Y_LF_bin[b, :] = (Y_lim*coef_mask).sum(axis=0) return Y_LF_bin, F_coef_hertz, F_coef_cents
[docs]@jit(nopython=True) def p_bin_if(b, F_coef_IF, R=10.0, F_ref=55.0): """Computes binning mask for instantaneous frequency binning [FMP, Eq. (8.52)] Notebook: C8/C8S2_SalienceRepresentation.ipynb Args: b (int): Bin index F_coef_IF (float): Instantaneous frequencies R (float): Frequency resolution in cents (Default value = 10.0) F_ref (float): Reference frequency in Hz (Default value = 55.0) Returns: mask (np.ndarray): Binning mask """ mask = frequency_to_bin_index(F_coef_IF, R, F_ref) == b return mask
[docs]@jit(nopython=True) def compute_y_lf_if_bin(X, Fs, N, H, R=10, F_min=55.0, F_max=1760.0, gamma=0.0): """Binned Log-frequency Spectrogram with variable frequency resolution based on instantaneous frequency Notebook: C8/C8S2_SalienceRepresentation.ipynb Args: X (np.ndarray): Complex spectrogram Fs (scalar): Sampling rate in Hz N (int): Window length in samples H (int): Hopsize in samples R (float): Frequency resolution in cents (Default value = 10) F_min (float): Lower frequency bound (reference frequency) (Default value = 55.0) F_max (float): Upper frequency bound (Default value = 1760.0) gamma (float): Logarithmic compression factor (Default value = 0.0) Returns: Y_LF_IF_bin (np.ndarray): Binned log-frequency spectrogram using instantaneous frequency F_coef_hertz (np.ndarray): Frequency axis in Hz F_coef_cents (np.ndarray): Frequency axis in cents """ # Compute instantaneous frequencies F_coef_IF = libfmp.c8.compute_if(X, Fs, N, H) freq_lim_mask = np.logical_and(F_coef_IF >= F_min, F_coef_IF < F_max) F_coef_IF = F_coef_IF * freq_lim_mask # Initialize ouput array and compute frequency axis B = frequency_to_bin_index(np.array([F_max]), R, F_min)[0] + 1 F_coef_hertz = 2 ** (np.arange(0, B) * R / 1200) * F_min F_coef_cents = np.arange(0, B*R, R) Y_LF_IF_bin = np.zeros((B, X.shape[1])) # Magnitude binning if gamma == 0: Y = np.abs(X) ** 2 else: Y = np.log(1 + np.float32(gamma)*np.abs(X)) for b in range(B): coef_mask = p_bin_if(b, F_coef_IF, R, F_min) Y_LF_IF_bin[b, :] = (Y * coef_mask).sum(axis=0) return Y_LF_IF_bin, F_coef_hertz, F_coef_cents
[docs]@jit(nopython=True) def harmonic_summation(Y, num_harm=10, alpha=1.0): """Harmonic summation for spectrogram [FMP, Eq. (8.54)] Notebook: C8/C8S2_SalienceRepresentation.ipynb Args: Y (np.ndarray): Magnitude spectrogram num_harm (int): Number of harmonics (Default value = 10) alpha (float): Weighting parameter (Default value = 1.0) Returns: Y_HS (np.ndarray): Spectrogram after harmonic summation """ Y_HS = np.zeros(Y.shape) Y_zero_pad = np.vstack((Y, np.zeros((Y.shape[0]*num_harm, Y.shape[1])))) K = Y.shape[0] for k in range(K): harm_idx = np.arange(1, num_harm+1)*(k) weights = alpha ** (np.arange(1, num_harm+1) - 1).reshape(-1, 1) Y_HS[k, :] = (Y_zero_pad[harm_idx, :] * weights).sum(axis=0) return Y_HS
[docs]@jit(nopython=True) def harmonic_summation_lf(Y_LF_bin, R, num_harm=10, alpha=1.0): """Harmonic summation for log-frequency spectrogram [FMP, Eq. (8.55)] Notebook: C8/C8S2_SalienceRepresentation.ipynb Args: Y_LF_bin (np.ndarray): Log-frequency spectrogram R (float): Frequency resolution in cents num_harm (int): Number of harmonics (Default value = 10) alpha (float): Weighting parameter (Default value = 1.0) Returns: Y_LF_bin_HS (np.ndarray): Log-frequency spectrogram after harmonic summation """ Y_LF_bin_HS = np.zeros(Y_LF_bin.shape) pad_len = int(np.floor(np.log2(num_harm) * 1200 / R)) Y_LF_bin_zero_pad = np.vstack((Y_LF_bin, np.zeros((pad_len, Y_LF_bin.shape[1])))) B = Y_LF_bin.shape[0] for b in range(B): harmonics = np.arange(1, num_harm+1) harm_idx = b + np.floor(np.log2(harmonics) * 1200 / R).astype(np.int64) weights = alpha ** (np.arange(1, num_harm+1) - 1).reshape(-1, 1) Y_LF_bin_HS[b, :] = (Y_LF_bin_zero_pad[harm_idx, :] * weights).sum(axis=0) return Y_LF_bin_HS
[docs]def compute_salience_rep(x, Fs, N, H, R, F_min=55.0, F_max=1760.0, num_harm=10, freq_smooth_len=11, alpha=1.0, gamma=0.0): """Salience representation [FMP, Eq. (8.56)] Notebook: C8/C8S2_SalienceRepresentation.ipynb Args: x (np.ndarray): Audio signal Fs (scalar): Sampling frequency N (int): Window length in samples H (int): Hopsize in samples R (float): Frequency resolution in cents F_min (float): Lower frequency bound (reference frequency) (Default value = 55.0) F_max (float): Upper frequency bound (Default value = 1760.0) num_harm (int): Number of harmonics (Default value = 10) freq_smooth_len (int): Filter length for vertical smoothing (Default value = 11) alpha (float): Weighting parameter (Default value = 1.0) gamma (float): Logarithmic compression factor (Default value = 0.0) Returns: Z (np.ndarray): Salience representation F_coef_hertz (np.ndarray): Frequency axis in Hz F_coef_cents (np.ndarray): Frequency axis in cents """ X = librosa.stft(x, n_fft=N, hop_length=H, win_length=N, pad_mode='constant') Y_LF_IF_bin, F_coef_hertz, F_coef_cents = compute_y_lf_if_bin(X, Fs, N, H, R, F_min, F_max, gamma=gamma) # smoothing Y_LF_IF_bin = ndimage.filters.convolve1d(Y_LF_IF_bin, np.hanning(freq_smooth_len), axis=0, mode='constant') Z = harmonic_summation_lf(Y_LF_IF_bin, R=R, num_harm=num_harm, alpha=alpha) return Z, F_coef_hertz, F_coef_cents