Source code for libfmp.c8.c8s2_f0

"""
Module: libfmp.c8.c8s2_f0
Author: Sebastian Rosenzweig, Meinard Müller
License: The MIT license, https://opensource.org/licenses/MIT

This file is part of the FMP Notebooks (https://www.audiolabs-erlangen.de/FMP)
"""

import numpy as np
import librosa
from scipy import ndimage, linalg
from scipy.interpolate import interp1d
from numba import jit
import matplotlib
import matplotlib.pyplot as plt

import libfmp.b
import libfmp.c8


def hz_to_cents(F, F_ref=55.0):
    """Converts frequency in Hz to cents

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Args:
        F (float or np.ndarray): Frequency value in Hz
        F_ref (float): Reference frequency in Hz (Default value = 55.0)

    Returns:
        F_cent (float or np.ndarray): Frequency in cents
    """
    F_cent = 1200 * np.log2(F / F_ref)
    return F_cent


def cents_to_hz(F_cent, F_ref=55.0):
    """Converts frequency in cents to Hz

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Args:
        F_cent (float or np.ndarray): Frequency in cents
        F_ref (float): Reference frequency in Hz (Default value = 55.0)

    Returns:
        F (float or np.ndarray): Frequency in Hz
    """
    F = F_ref * 2 ** (F_cent / 1200)
    return F


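# Illustrative usage sketch (editor's addition, not part of the original libfmp module):
# a minimal round-trip check for the two conversion helpers above, assuming the default
# reference frequency of 55 Hz.
def _demo_cents_conversion():
    F = np.array([55.0, 110.0, 220.0, 440.0])
    F_cent = hz_to_cents(F)       # [0, 1200, 2400, 3600] cents above 55 Hz
    F_back = cents_to_hz(F_cent)  # recovers the original frequencies
    assert np.allclose(F, F_back)
    return F_cent

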
def sonify_trajectory_with_sinusoid(traj, audio_len, Fs=22050, amplitude=0.3, smooth_len=11):
    """Sonification of trajectory with sinusoid

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Args:
        traj (np.ndarray): F0 trajectory (time in seconds, frequency in Hz)
        audio_len (int): Desired audio length in samples
        Fs (scalar): Sampling rate (Default value = 22050)
        amplitude (float): Amplitude (Default value = 0.3)
        smooth_len (int): Length of amplitude smoothing filter (Default value = 11)

    Returns:
        x_soni (np.ndarray): Sonification
    """
    # unit confidence if not specified
    if traj.shape[1] < 3:
        confidence = np.zeros(traj.shape[0])
        confidence[traj[:, 1] > 0] = amplitude
    else:
        confidence = traj[:, 2]

    # initialize
    x_soni = np.zeros(audio_len)
    amplitude_mod = np.zeros(audio_len)

    # Computation of hop size
    # sine_len = int(2 ** np.round(np.log(traj[1, 0]*Fs) / np.log(2)))
    sine_len = int(traj[1, 0] * Fs)

    t = np.arange(0, sine_len) / Fs
    phase = 0

    # loop over all F0 values, ensure continuous phase
    for idx in np.arange(0, traj.shape[0]):
        cur_f = traj[idx, 1]
        cur_amp = confidence[idx]

        if cur_f == 0:
            phase = 0
            continue

        cur_soni = np.sin(2*np.pi*(cur_f*t+phase))
        diff = np.maximum(0, (idx+1)*sine_len - len(x_soni))
        if diff > 0:
            x_soni[idx * sine_len:(idx + 1) * sine_len - diff] = cur_soni[:-diff]
            amplitude_mod[idx * sine_len:(idx + 1) * sine_len - diff] = cur_amp
        else:
            x_soni[idx*sine_len:(idx+1)*sine_len-diff] = cur_soni
            amplitude_mod[idx*sine_len:(idx+1)*sine_len-diff] = cur_amp

        phase += cur_f * sine_len / Fs
        phase -= 2 * np.round(phase/2)

    # filter amplitudes to avoid transients
    amplitude_mod = np.convolve(amplitude_mod,
                                np.hanning(smooth_len)/np.sum(np.hanning(smooth_len)),
                                'same')

    x_soni = x_soni * amplitude_mod
    return x_soni


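# Illustrative usage sketch (editor's addition, not part of the original libfmp module):
# sonify a short synthetic F0 trajectory. The 10 ms hop and the glissando values are
# arbitrary choices for this example.
def _demo_sonify_trajectory(Fs=22050):
    times = np.arange(0, 1.0, 0.01)                  # time grid with 10 ms hop
    freqs = np.linspace(220.0, 440.0, len(times))    # rising glissando in Hz
    traj = np.column_stack((times, freqs))
    x_soni = sonify_trajectory_with_sinusoid(traj, audio_len=int(Fs), Fs=Fs)
    return x_soni

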
def visualize_salience_traj_constraints(Z, T_coef, F_coef_cents, F_ref=55.0, colorbar=True, cmap='gray_r',
                                        figsize=(7, 4), traj=None, constraint_region=None, ax=None):
    """Visualize salience representation with optional F0-trajectory and constraint regions

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Args:
        Z: Salience representation
        T_coef: Time axis
        F_coef_cents: Frequency axis in cents
        F_ref: Reference frequency (Default value = 55.0)
        colorbar: Show or hide colorbar (Default value = True)
        cmap: Color map (Default value = 'gray_r')
        figsize: Figure size (Default value = (7, 4))
        traj: F0 trajectory (time in seconds, frequency in Hz) (Default value = None)
        constraint_region: Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz)
            (Default value = None)
        ax: Handle to existing axis (Default value = None)

    Returns:
        fig: Handle to figure
        ax: Handle to cent axis
        ax_f: Handle to frequency axis
    """
    fig = None
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)

    sal = ax.imshow(Z, extent=[T_coef[0], T_coef[-1], F_coef_cents[0], F_coef_cents[-1]],
                    cmap=cmap, origin='lower', aspect='auto', interpolation='nearest')

    y_ticklabels_left = np.arange(F_coef_cents[0], F_coef_cents[-1]+1, 1200)
    ax.set_yticks(y_ticklabels_left)
    ax.set_yticklabels(y_ticklabels_left)
    ax.set_ylabel('Frequency (Cents)')

    if colorbar:
        plt.colorbar(sal, ax=ax, pad=0.1)

    ax_f = ax.twinx()  # instantiate a second axes that shares the same y-axis
    ax_f.set_yticks(y_ticklabels_left - F_coef_cents[0])
    y_ticklabels_right = cents_to_hz(y_ticklabels_left, F_ref).astype(int)
    ax_f.set_yticklabels(y_ticklabels_right)
    ax_f.set_ylabel('Frequency (Hz)')

    # plot contour
    if traj is not None:
        traj_plot = traj[traj[:, 1] > 0, :]
        traj_plot[:, 1] = hz_to_cents(traj_plot[:, 1], F_ref)
        ax.plot(traj_plot[:, 0], traj_plot[:, 1], color='r', markersize=4, marker='.', linestyle='')

    # plot constraint regions
    if constraint_region is not None:
        for row in constraint_region:
            t_start = row[0]  # sec
            t_end = row[1]  # sec
            f_start = row[2]  # Hz
            f_end = row[3]  # Hz
            ax.add_patch(matplotlib.patches.Rectangle(
                (t_start, hz_to_cents(f_start, F_ref)), width=t_end-t_start,
                height=hz_to_cents(f_end, F_ref)-hz_to_cents(f_start, F_ref),
                fill=False, edgecolor='k', linewidth=3, zorder=2))

    ax.set_xlabel('Time (seconds)')

    if fig is not None:
        plt.tight_layout()

    return fig, ax, ax_f


# @jit(nopython=True)
def define_transition_matrix(B, tol=0, score_low=0.01, score_high=1.0):
    """Generate transition matrix

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Args:
        B (int): Number of bins
        tol (int): Tolerance parameter for transition matrix (Default value = 0)
        score_low (float): Score (low) for transition matrix (Default value = 0.01)
        score_high (float): Score (high) for transition matrix (Default value = 1.0)

    Returns:
        T (np.ndarray): Transition matrix
    """
    col = np.ones((B,)) * score_low
    col[0:tol+1] = np.ones((tol+1, )) * score_high
    T = linalg.toeplitz(col)
    return T


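# Illustrative usage sketch (editor's addition, not part of the original libfmp module):
# a transition matrix for five frequency bins with a tolerance of one bin. Transitions of
# at most one bin receive the high score, all larger jumps the low score.
def _demo_transition_matrix():
    T = define_transition_matrix(B=5, tol=1, score_low=0.01, score_high=1.0)
    # T[b, b'] equals 1.0 for |b - b'| <= 1 and 0.01 otherwise
    return T

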
@jit(nopython=True)
def compute_trajectory_dp(Z, T):
    """Trajectory tracking using dynamic programming

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Args:
        Z: Salience representation
        T: Transition matrix

    Returns:
        eta_DP (np.ndarray): Trajectory indices
    """
    B, N = Z.shape
    eps_machine = np.finfo(np.float32).eps
    Z_log = np.log(Z + eps_machine)
    T_log = np.log(T + eps_machine)

    E = np.zeros((B, N))
    D = np.zeros((B, N))
    D[:, 0] = Z_log[:, 0]

    for n in np.arange(1, N):
        for b in np.arange(0, B):
            D[b, n] = np.max(T_log[b, :] + D[:, n-1]) + Z_log[b, n]
            E[b, n-1] = np.argmax(T_log[b, :] + D[:, n-1])

    # backtracking
    eta_DP = np.zeros(N)
    eta_DP[N-1] = int(np.argmax(D[:, N-1]))

    for n in np.arange(N-2, -1, -1):
        eta_DP[n] = E[int(eta_DP[n+1]), n]

    return eta_DP.astype(np.int64)


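# Illustrative usage sketch (editor's addition, not part of the original libfmp module):
# dynamic-programming tracking on a toy salience matrix with three bins and four frames.
def _demo_trajectory_dp():
    Z = np.array([[0.1, 0.8, 0.7, 0.1],
                  [0.8, 0.1, 0.2, 0.1],
                  [0.1, 0.1, 0.1, 0.8]])
    T = define_transition_matrix(Z.shape[0], tol=1)
    eta = compute_trajectory_dp(Z, T)  # one bin index per frame
    return eta

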
def convert_ann_to_constraint_region(ann, tol_freq_cents=300.0):
    """Convert score annotations to constraint regions

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Args:
        ann (np.ndarray): Score annotations [[start_time, end_time, MIDI_pitch], ...]
        tol_freq_cents (float): Tolerance in pitch directions specified in cents (Default value = 300.0)

    Returns:
        constraint_region (np.ndarray): Constraint regions
    """
    tol_pitch = tol_freq_cents / 100
    freq_lower = 2 ** ((ann[:, 2] - tol_pitch - 69)/12) * 440
    freq_upper = 2 ** ((ann[:, 2] + tol_pitch - 69)/12) * 440
    constraint_region = np.concatenate((ann[:, 0:2],
                                        freq_lower.reshape(-1, 1),
                                        freq_upper.reshape(-1, 1)), axis=1)
    return constraint_region


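# Illustrative usage sketch (editor's addition, not part of the original libfmp module):
# constraint regions derived from two hypothetical note annotations given as
# (start time in seconds, end time in seconds, MIDI pitch).
def _demo_constraint_regions():
    ann = np.array([[0.0, 1.0, 69],    # A4 from 0 s to 1 s
                    [1.0, 2.0, 72]])   # C5 from 1 s to 2 s
    cr = convert_ann_to_constraint_region(ann, tol_freq_cents=300.0)
    # each row: (t_start_sec, t_end_sec, f_lower_hz, f_upper_hz)
    return cr

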
# @jit(nopython=True)
def compute_trajectory_cr(Z, T_coef, F_coef_hertz, constraint_region=None,
                          tol=5, score_low=0.01, score_high=1.0):
    """Trajectory tracking with constraint regions

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Args:
        Z (np.ndarray): Salience representation
        T_coef (np.ndarray): Time axis
        F_coef_hertz (np.ndarray): Frequency axis in Hz
        constraint_region (np.ndarray): Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz)
            (Default value = None)
        tol (int): Tolerance parameter for transition matrix (Default value = 5)
        score_low (float): Score (low) for transition matrix (Default value = 0.01)
        score_high (float): Score (high) for transition matrix (Default value = 1.0)

    Returns:
        eta (np.ndarray): Trajectory indices, unvoiced frames are indicated with -1
    """
    # do tracking within every constraint region
    if constraint_region is not None:
        # initialize contour, unvoiced frames are indicated with -1
        eta = np.full(len(T_coef), -1)

        for row_idx in range(constraint_region.shape[0]):
            t_start = constraint_region[row_idx, 0]  # sec
            t_end = constraint_region[row_idx, 1]  # sec
            f_start = constraint_region[row_idx, 2]  # Hz
            f_end = constraint_region[row_idx, 3]  # Hz

            # convert start/end values to indices
            t_start_idx = np.argmin(np.abs(T_coef - t_start))
            t_end_idx = np.argmin(np.abs(T_coef - t_end))
            f_start_idx = np.argmin(np.abs(F_coef_hertz - f_start))
            f_end_idx = np.argmin(np.abs(F_coef_hertz - f_end))

            # track in salience part
            cur_Z = Z[f_start_idx:f_end_idx+1, t_start_idx:t_end_idx+1]
            T = define_transition_matrix(cur_Z.shape[0], tol=tol,
                                         score_low=score_low, score_high=score_high)
            cur_eta = compute_trajectory_dp(cur_Z, T)

            # fill contour
            eta[t_start_idx:t_end_idx+1] = f_start_idx + cur_eta
    else:
        T = define_transition_matrix(Z.shape[0], tol=tol, score_low=score_low, score_high=score_high)
        eta = compute_trajectory_dp(Z, T)

    return eta


def compute_traj_from_audio(x, Fs=22050, N=1024, H=128, R=10.0, F_min=55.0, F_max=1760.0,
                            num_harm=10, freq_smooth_len=11, alpha=0.9, gamma=0.0,
                            constraint_region=None, tol=5, score_low=0.01, score_high=1.0):
    """Compute F0 contour from audio signal

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Args:
        x (np.ndarray): Audio signal
        Fs (scalar): Sampling frequency (Default value = 22050)
        N (int): Window length in samples (Default value = 1024)
        H (int): Hopsize in samples (Default value = 128)
        R (float): Frequency resolution in cents (Default value = 10.0)
        F_min (float): Lower frequency bound (reference frequency) (Default value = 55.0)
        F_max (float): Upper frequency bound (Default value = 1760.0)
        num_harm (int): Number of harmonics (Default value = 10)
        freq_smooth_len (int): Filter length for vertical smoothing (Default value = 11)
        alpha (float): Weighting parameter for harmonics (Default value = 0.9)
        gamma (float): Logarithmic compression factor (Default value = 0.0)
        constraint_region (np.ndarray): Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz)
            (Default value = None)
        tol (int): Tolerance parameter for transition matrix (Default value = 5)
        score_low (float): Score (low) for transition matrix (Default value = 0.01)
        score_high (float): Score (high) for transition matrix (Default value = 1.0)

    Returns:
        traj (np.ndarray): F0 contour, time in seconds in 1st column, frequency in Hz in 2nd column
        Z (np.ndarray): Salience representation
        T_coef (np.ndarray): Time axis
        F_coef_hertz (np.ndarray): Frequency axis in Hz
        F_coef_cents (np.ndarray): Frequency axis in cents
    """
    Z, F_coef_hertz, F_coef_cents = libfmp.c8.compute_salience_rep(
        x, Fs, N=N, H=H, R=R, F_min=F_min, F_max=F_max, num_harm=num_harm,
        freq_smooth_len=freq_smooth_len, alpha=alpha, gamma=gamma)

    T_coef = (np.arange(Z.shape[1]) * H) / Fs
    index_CR = compute_trajectory_cr(Z, T_coef, F_coef_hertz, constraint_region,
                                     tol=tol, score_low=score_low, score_high=score_high)

    traj = np.hstack((T_coef.reshape(-1, 1), F_coef_hertz[index_CR].reshape(-1, 1)))
    traj[index_CR == -1, 1] = 0
    return traj, Z, T_coef, F_coef_hertz, F_coef_cents


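# Illustrative usage sketch (editor's addition, not part of the original libfmp module):
# end-to-end F0 tracking for an audio file. The filename 'audio.wav' is a placeholder,
# and all analysis parameters are left at the function defaults.
def _demo_compute_traj(fn_wav='audio.wav'):
    x, Fs = librosa.load(fn_wav, sr=22050)
    traj, Z, T_coef, F_coef_hertz, F_coef_cents = compute_traj_from_audio(x, Fs=Fs)
    # traj[:, 0]: time in seconds, traj[:, 1]: F0 in Hz (0 for unvoiced frames)
    return traj

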
def convert_trajectory_to_mask_bin(traj, F_coef, n_harmonics=1, tol_bin=0):
    """Computes binary mask from F0 trajectory

    Notebook: C8/C8S2_MelodyExtractSep.ipynb

    Args:
        traj (np.ndarray): F0 trajectory (time in seconds in 1st column, frequency in Hz in 2nd column)
        F_coef (np.ndarray): Frequency axis
        n_harmonics (int): Number of harmonics (Default value = 1)
        tol_bin (int): Tolerance in frequency bins (Default value = 0)

    Returns:
        mask (np.ndarray): Binary mask
    """
    # Compute STFT bin for trajectory
    traj_bin = np.argmin(np.abs(F_coef[:, None] - traj[:, 1][None, :]), axis=0)

    K = len(F_coef)
    N = traj.shape[0]
    max_idx_harm = np.max([K, np.max(traj_bin)*n_harmonics])
    mask_pad = np.zeros((max_idx_harm.astype(int)+1, N))

    # set bins of the fundamental and its harmonics (multiples 1, ..., n_harmonics)
    for h in range(n_harmonics):
        mask_pad[traj_bin*(1+h), np.arange(N)] = 1
    mask = mask_pad[1:K+1, :]

    if tol_bin > 0:
        smooth_len = 2*tol_bin + 1
        mask = ndimage.maximum_filter1d(mask, smooth_len, axis=0, mode='constant', cval=0, origin=0)

    return mask


def convert_trajectory_to_mask_cent(traj, F_coef, n_harmonics=1, tol_cent=0.0):
    """Computes binary mask from F0 trajectory

    Notebook: C8/C8S2_MelodyExtractSep.ipynb

    Args:
        traj (np.ndarray): F0 trajectory (time in seconds in 1st column, frequency in Hz in 2nd column)
        F_coef (np.ndarray): Frequency axis
        n_harmonics (int): Number of harmonics (Default value = 1)
        tol_cent (float): Tolerance in cents (Default value = 0.0)

    Returns:
        mask (np.ndarray): Binary mask
    """
    K = len(F_coef)
    N = traj.shape[0]
    mask = np.zeros((K, N))

    freq_res = F_coef[1] - F_coef[0]
    tol_factor = np.power(2, tol_cent/1200)

    F_coef_upper = F_coef * tol_factor
    F_coef_lower = F_coef / tol_factor
    F_coef_upper_bin = (np.ceil(F_coef_upper / freq_res)).astype(int)
    F_coef_upper_bin[F_coef_upper_bin > K-1] = K-1
    F_coef_lower_bin = (np.floor(F_coef_lower / freq_res)).astype(int)

    for n in range(N):
        for h in range(n_harmonics):
            freq = traj[n, 1] * (1 + h)
            freq_bin = np.round(freq / freq_res).astype(int)
            if freq_bin < K:
                idx_upper = F_coef_upper_bin[freq_bin]
                idx_lower = F_coef_lower_bin[freq_bin]
                mask[idx_lower:idx_upper+1, n] = 1
    return mask


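# Illustrative usage sketch (editor's addition, not part of the original libfmp module):
# a binary mask with a tolerance of 50 cents for a toy trajectory, computed on a linear
# frequency axis as used for an STFT with Fs = 22050 and N = 2048 (hypothetical values).
def _demo_mask_cent():
    F_coef = np.arange(1025) * 22050 / 2048          # STFT frequency axis in Hz
    traj = np.array([[0.0, 220.0], [0.1, 220.0]])    # two frames with an F0 of 220 Hz
    mask = convert_trajectory_to_mask_cent(traj, F_coef, n_harmonics=3, tol_cent=50.0)
    return mask

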
def separate_melody_accompaniment(x, Fs, N, H, traj, n_harmonics=10, tol_cent=50.0):
    """F0-based melody-accompaniment separation

    Notebook: C8/C8S2_MelodyExtractSep.ipynb

    Args:
        x (np.ndarray): Audio signal
        Fs (scalar): Sampling frequency
        N (int): Window size in samples
        H (int): Hopsize in samples
        traj (np.ndarray): F0 trajectory (time in seconds in 1st column, frequency in Hz in 2nd column)
        n_harmonics (int): Number of harmonics (Default value = 10)
        tol_cent (float): Tolerance in cents (Default value = 50.0)

    Returns:
        x_mel (np.ndarray): Reconstructed audio signal for melody
        x_acc (np.ndarray): Reconstructed audio signal for accompaniment
    """
    # Compute STFT
    X = librosa.stft(x, n_fft=N, hop_length=H, win_length=N, pad_mode='constant')
    Fs_feature = Fs / H
    T_coef = np.arange(X.shape[1]) / Fs_feature
    freq_res = Fs / N
    F_coef = np.arange(X.shape[0]) * freq_res

    # Adjust trajectory
    traj_X_values = interp1d(traj[:, 0], traj[:, 1], kind='nearest', fill_value='extrapolate')(T_coef)
    traj_X = np.hstack((T_coef[:, None], traj_X_values[:, None]))

    # Compute binary masks
    mask_mel = convert_trajectory_to_mask_cent(traj_X, F_coef, n_harmonics=n_harmonics, tol_cent=tol_cent)
    mask_acc = np.ones(mask_mel.shape) - mask_mel

    # Compute masked STFTs
    X_mel = X * mask_mel
    X_acc = X * mask_acc

    # Reconstruct signals
    x_mel = librosa.istft(X_mel, hop_length=H, win_length=N, window='hann', center=True, length=x.size)
    x_acc = librosa.istft(X_acc, hop_length=H, win_length=N, window='hann', center=True, length=x.size)

    return x_mel, x_acc


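# Illustrative usage sketch (editor's addition, not part of the original libfmp module):
# melody/accompaniment separation driven by an F0 trajectory computed with the functions
# above. The filename and the STFT parameters N and H are placeholders for this example.
def _demo_separation(fn_wav='audio.wav'):
    x, Fs = librosa.load(fn_wav, sr=22050)
    traj, _, _, _, _ = compute_traj_from_audio(x, Fs=Fs)
    x_mel, x_acc = separate_melody_accompaniment(x, Fs=Fs, N=2048, H=512, traj=traj,
                                                 n_harmonics=10, tol_cent=50.0)
    return x_mel, x_acc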