"""
Description: libtsm time-scale modification functions
Contributors: Sebastian Rosenzweig, Simon Schwär, Jonathan Driedger, Meinard Müller
License: The MIT license, https://opensource.org/licenses/MIT
This file is part of libtsm (https://www.audiolabs-erlangen.de/resources/MIR/2021-DAFX-AdaptivePitchShifting)
"""
import numpy as np
import scipy.interpolate
from .utils import win, stft, istft, cross_corr, hps, find_peaks
[docs]def pv_tsm(x, alpha, syn_hop=512, win_length=2048, win_beta=1, Fs=22050, zero_pad=0, restore_energy=False,
fft_shift=False, phase_locking=False) -> np.ndarray:
"""
Phase Vocoder Time scale modification algorithm, that rescales the time-axis of the input signal x
according to the time-stretch function s without altering the pitch of x.
Parameters
----------
x : np.ndarray [shape=(N, )], real - valued
Signal to be transformed
alpha : float or np.ndarray [shape=(S, 2)]
Time stretch function, given by a constant (float) or a set of S anchor points (int)
syn_hop : int
hop size of the synthesis window
win_length : int
length of analysis and synthesis window for STFT
win_beta : int
exponent of sin^beta window
Fs : int
Sampling rate of the input audio signal x
zero_pad : int
For FFT. Number of zeros padded to the window to increase the fft size
restore_energy : bool
For FFT. When True, rescales every windowed synthesis frame to compensate for synthesis energy leakage
fft_shift : bool
For FFT. When True, applies a circular shift to each frame of half its length, prior to computing the FFT
phase_locking : bool
when True, Applies identity phase locking
Returns
-------
y : np.ndarray [shape=(L,1)], real - valued
"""
# Pre-calculations
window = win(win_length, win_beta)
w = np.concatenate((np.zeros(int(np.floor(zero_pad / 2))), window, np.zeros(int(np.floor(zero_pad / 2)))))
win_len = len(w)
win_len_half = win_len // 2
if len(x.shape) == 1:
x = x.reshape(-1, 1)
num_of_chan = x.shape[1]
# Time-stretch function
if np.isscalar(alpha):
anchor_points = np.array([[0, 0], [int(x.shape[0]) - 1, int(np.ceil(alpha * x.shape[0])) - 1]])
else:
anchor_points = alpha.astype(int)
output_length = anchor_points[-1, 1] + 1
syn_win_pos = np.arange(0, output_length + win_len_half, syn_hop) # positions of the synthesis winLenHalf windows
# in the output
fi = scipy.interpolate.interp1d(anchor_points[:, 1], anchor_points[:, 0], kind='linear', fill_value='extrapolate')
ana_win_pos = fi(syn_win_pos)
ana_win_pos = np.round(ana_win_pos).astype(int) # positions of the analysis windows in the input
ana_hop = np.append([0], ana_win_pos[1:] - ana_win_pos[:-1]) # analysis hop sizes
# Phase Vocoder
y = np.zeros((output_length, num_of_chan)) # initialize output
for c in range(num_of_chan): # loop over channels
x_c = x[:, c]
# STFT
X, f, t = stft(x_c, ana_hop=ana_win_pos, win_length=win_length, win_beta=win_beta, Fs=Fs, num_of_frames=-1,
fft_shift=fft_shift, zero_pad=zero_pad)
# Phase adaptation
Y = np.zeros(X.shape, dtype=complex) # the spectrogram of the output
Y[:, 0] = X[:, 0] # phase initialization
N = len(w)
k = np.arange(0, N // 2 + 1) # center frequencies of the N/2+1 first bins of the spectrum in
# 'oscillations per frame'
omega = 2 * np.pi * k / N # phase advances per sample for the frequencies k
for i in range(1, X.shape[1]):
dphi = omega * ana_hop[i] # expected phase advances from the last to the current input frame
ph_curr = np.angle(X[:, i]) # phases of the current input frame
ph_last = np.angle(X[:, i - 1]) # phases of the last input frame
hpi = (ph_curr - ph_last) - dphi # heterodyned phase increments
# hpi = np.array([num - 2 * np.pi * round(num/(2*np.pi)) for num in hpi]) # reduce to the range -pi:pi,
# np.round() deviates from Matlab's round()
# matlab_round = np.vectorize(lambda v: round(v))
hpi = hpi - 2 * np.pi * np.round(hpi / (2 * np.pi)) # reduce to the range -pi:pi
ipa_sample = omega + hpi / ana_hop[i] # instantaneous phase advances per sample
ipa_hop = ipa_sample * syn_hop # instantaneous phase advances per synthesis hopsize
ph_syn = np.angle(Y[:, i - 1]) # phases of the last synthesized frame
# We now compute a phasor that rotates the phase angles of the current input frame by angles theta such
# that no phase discontinuities occur when re-synthesizing the resulting spectrogram with the synthesis
# hopsize
if not phase_locking: # standard phase vocoder: the phase continuity of every bin is preserved separately
theta = ph_syn + ipa_hop - ph_curr # phases of the last output frame Instantaneous phase advance
# Phases of the current input frame
phasor = np.exp(1j * theta)
else: # Phase vocoder with identity phase locking: the phase relationships from the input frame are
# partially preserved by 'locking' the phases of bins in the region of influence of a peak in the
# sprectrum to the phase of the peak's bin
p, irs, ire = find_peaks(X[:, i]) # Get the peaks in the spectrum together with their regions of
# influence
theta = np.zeros(Y[:, i].shape)
for n in range(0, len(p)):
theta[irs[n]:ire[n]] = ph_syn[p[n]] + ipa_hop[p[n]] - ph_curr[p[n]] # Phases of the last
# output frame, Instantaneous phase advance, Phases of the current input frame
phasor = np.exp(1j * theta)
Y[:, i] = phasor * X[:, i]
# ISTFT
y_c = istft(Y, syn_hop=syn_hop, win_length=win_length, win_beta=win_beta, zero_pad=zero_pad, num_of_iter=1,
orig_sig_len=output_length, restore_energy=restore_energy, fft_shift=fft_shift)
y[:, c] = y_c[:, 0]
return y
[docs]def wsola_tsm(x, alpha, syn_hop=512, win_length=1024, win_beta=2, tol=512) -> np.ndarray:
"""
Waveform Similarity Overlap and Add (WSOLA) algorithm that rescales the time-axis of the input signal x
according to the time-stretch function s without altering the pitch of x.
Parameters
----------
x : np.ndarray [shape=(N,num_of_chan)], real - valued
Signal to be transformed
alpha : float or np.ndarray [shape=(S,2)]
Time stretch function, given by a constant (float) or a set of S anchor points (int)
syn_hop : int
hop size of the synthesis window
win_length : int
length of the analysis and synthesis window
win_beta : int
exponent of sin^beta window
tol : int
Amount of samples the window will be shifted to avoid phase discontinuities when overlap-adding to form the
output signal
Returns
-------
y : np.ndarray [shape=(L,num_of_chan)], real - valued
The time-scale modified output signal
"""
# Pre-calculations
window = win(win_length, win_beta)
w = window
win_len = len(w)
win_len_half = np.around(win_len / 2).astype(int)
if len(x.shape) == 1:
x = x.reshape(-1, 1)
num_of_chan = x.shape[1]
# Time-stretch function
if np.isscalar(alpha):
anchor_points = np.array([[0, 0], [int(x.shape[0]) - 1, int(np.ceil(alpha * x.shape[0])) - 1]])
else:
anchor_points = alpha.astype(int)
output_length = anchor_points[-1, 1] + 1
syn_win_pos = np.arange(0, output_length + win_len_half, syn_hop) # positions of the synthesis winLenHalf
# windows in the output
fi = scipy.interpolate.interp1d(anchor_points[:, 1], anchor_points[:, 0], kind='linear',
fill_value='extrapolate')
ana_win_pos = fi(syn_win_pos)
ana_win_pos = np.round(ana_win_pos).astype(int) # positions of the analysis windows in the input
ana_hop = np.append([0], ana_win_pos[1:] - ana_win_pos[:-1]) # analysis hop sizes
# WSOLA
y = np.zeros((output_length, num_of_chan)) # initialize output
min_fac = np.min(syn_hop / ana_hop[1:]) # the minimal local stretching factor
# to avoid that we access x outside its range, we need to zero pad it appropriately
x = np.pad(x, [(win_len_half + tol, int(np.ceil(1 / min_fac)) * win_len + tol), (0, 0)])
ana_win_pos += tol # compensate for the extra 'tol' padded zeros at the beginning of x
for c in range(num_of_chan): # loop over channels
x_c = x[:, c]
y_c = np.zeros((output_length + 2 * win_len, 1)) # initialize the output signal
ow = np.zeros((output_length + 2 * win_len, 1)) # keep track of overlapping windows
delay = 0 # shift of the current analysis window position
for i in range(len(ana_win_pos) - 1):
# OLA
curr_syn_win_ran = np.arange(syn_win_pos[i], syn_win_pos[i] + win_len, dtype=int) # range of current
# synthesis window
curr_ana_win_ran = np.arange(ana_win_pos[i] + delay, ana_win_pos[i] + win_len + delay, dtype=int) # range
# of the current analysis window, shift by 'del' offset
y_c[curr_syn_win_ran, 0] += x_c[curr_ana_win_ran] * w # overlap and add
ow[curr_syn_win_ran, 0] += w # update the sum of overlapping windows
nat_prog = x_c[curr_ana_win_ran + syn_hop] # 'natural progression' of the last copied audio segment
next_ana_win_ran = np.arange(ana_win_pos[i + 1] - tol, ana_win_pos[i + 1] + win_len + tol, dtype=int) #
# range where the next analysis window could be located (including the tolerance region)
x_next_ana_win_ran = x_c[next_ana_win_ran] # corresponding segment in x
# Cross Correlation
cc = cross_corr(x_next_ana_win_ran, nat_prog, win_len) # compute the cross correlation
max_index = np.argmax(cc) # pick the optimizing index in the cross correlation
delay = tol - max_index # infer the new 'delay'
# process last frame
y_c[syn_win_pos[-1]:syn_win_pos[-1] + win_len, 0] += x_c[ana_win_pos[i] + delay:ana_win_pos[
i] + win_len + delay] * w
ow[syn_win_pos[-1]:syn_win_pos[-1] + win_len, 0] += w
# re-normalize the signal by dividing by the added windows
ow[ow < 10 ** (-3)] = 1 # avoid potential division by zero
y_c /= ow
# remove zero-padding at the beginning
y_c = y_c[win_len_half:]
# remove zero-padding at the end
y_c = y_c[0:output_length]
y[:, c] = y_c[:, 0]
return y
[docs]def hps_tsm(x, alpha, Fs=22050, hps_ana_hop=256, hps_win_length=1024, hps_win_beta=2, hps_zero_pad=0,
hps_fil_len_harm=10, hps_fil_len_perc=10, pv_syn_hop=512, pv_win_length=2048, pv_win_beta=2, pv_zero_pad=0,
pv_restore_energy=False, pv_fft_shift=False, ola_syn_hop=128, ola_win_length=256, ola_win_beta=2) \
-> np.ndarray:
"""
Time Scale Modification algorithm based on Harmonic - Percussive separation. After separation is
performed, the algorithm uses two phase vocoder TSM and WSOLA TSM algorithms for the Harmonic and percussive part
separately.
Parameters
----------
x : np.ndarray [shape=(N, )], real - valued
Signal to be transformed
alpha : float or np.ndarray [shape=(S,2)]
Time stretch function, given by a constant (float) or a set of S anchor points (int)
Fs : int
Sampling rate
hps_ana_hop : int
hop size for HPS
hps_win_length : int
window length for HPS
hps_win_beta : int
exponent of sin^beta window
hps_zero_pad : int
For FFT. Number of zeros padded to the window to increase the fft size
hps_fil_len_harm: int
Length of the median filter in time direction.
hps_fil_len_perc: int
Length of the median filter in frequency direction.
pv_syn_hop : int
hop size for synthesize windows of phase vocoder
pv_win_length : int
window length for phase vocoder
pv_win_beta : int
exponent of sin^beta window
pv_zero_pad : int
phase vocoder zero padding
pv_restore_energy : bool
restore energy of signal in phase vocoder
pv_fft_shift : bool
fft shift in phase vocoder
ola_syn_hop : int
synthesis hop size of OLA
ola_win_length : int
window length for OLA
ola_win_beta : int
exponent of sin^beta window
Returns
-------
y : np.ndarray [shape=(L,1)], real - valued
The time-scale modified output signal
"""
# Harmonic-Percussive Separation
x_harm, x_perc = hps(x, ana_hop=hps_ana_hop, win_length=hps_win_length, win_beta=hps_win_beta, Fs=Fs,
zero_pad=hps_zero_pad, fil_len_harm=hps_fil_len_harm, fil_len_perc=hps_fil_len_perc,
masking_mode='binary')
# Phase Vocoder for harmonic part
y_harm = pv_tsm(x_harm, alpha=alpha, syn_hop=pv_syn_hop, win_length=pv_win_length, win_beta=pv_win_beta, Fs=Fs,
zero_pad=pv_zero_pad, restore_energy=pv_restore_energy, fft_shift=pv_fft_shift, phase_locking=True)
# OLA for percussive part
y_perc = wsola_tsm(x_perc, alpha=alpha, syn_hop=ola_syn_hop, win_length=ola_win_length, win_beta=ola_win_beta,
tol=0)
# Synthesis
y = y_harm + y_perc
return y
[docs]def pv_int_tsm(x, alpha, syn_hop=512, win_length=2048, win_beta=2, Fs=22050, zero_pad=-1, restore_energy=False,
fft_shift=True) -> np.ndarray:
"""
Phase Vocoder Time scale modification algorithm, that rescales the time-axis of the input signal x
according to the time-stretch function s without altering the pitch of x. This algorithm is optimized for integer
values of the time stretching function.
Parameters
----------
x : np.ndarray [shape=(N,)], real - valued
Signal to be transformed
alpha : int or np.ndarray
Time stretch factor
syn_hop : int
hop size of the synthesis window
win_length : int
length of analysis and synthesis window for STFT
win_beta : int
exponent of sin^beta window
Fs : int
Sampling rate of the input audio signal x
zero_pad : int
For FFT. Number of zeros padded to the window to increase the fft size
restore_energy : bool
For FFT. When True, rescales every windowed synthesis frame to compensate for synthesis energy leakage
fft_shift: bool
For FFT. When True, applies a circular shift to each frame of half its length, prior to computing the FFT
Returns
-------
y : np.ndarray [shape=(L,1)], real - valued
The time-scale modified output signal
"""
# Pre-Calculations
window = win(win_length, win_beta)
if len(x.shape) == 1:
x = x.reshape(-1, 1)
num_of_chan = x.shape[1]
if zero_pad == -1:
zero_pad = alpha * window.shape[0] // 2
wn = np.hstack((np.zeros((int(np.floor(zero_pad / 2)))),
window,
np.zeros((int(np.floor(zero_pad / 2))))))
win_len = wn.shape[0]
win_len_half = int(np.round(win_len / 2))
if (np.isscalar(alpha)) and (np.mod(alpha, 1) == 0) and (alpha >= 1):
anchor_points = np.array([[0, 0], [int(x.shape[0]), int(np.ceil(alpha * x.shape[0]))]])
else:
raise Exception("alpha needs to be an integer >= 1 !")
while np.mod(syn_hop, alpha) != 0:
syn_hop = syn_hop + 1
output_length = int(anchor_points[-1, 1])
output_window_pos = np.arange(0, output_length + win_len_half, syn_hop) # positions of the synthesis winLenHalf
# windows in the output
input_window_pos = output_window_pos // alpha
y = np.zeros((output_length, num_of_chan))
for c in range(num_of_chan):
# stft
X, f, t = stft(x[:, c], ana_hop=input_window_pos, win_length=win_length, win_beta=win_beta, Fs=Fs,
num_of_frames=-1, fft_shift=fft_shift, zero_pad=zero_pad)
# Phase Adaption
Y = np.abs(X) * np.exp(1j * alpha * np.angle(X))
# istft
y_c = istft(Y, syn_hop=syn_hop, win_length=win_length, win_beta=win_beta, Fs=Fs, zero_pad=zero_pad,
num_of_iter=1, orig_sig_len=output_length, restore_energy=restore_energy, fft_shift=fft_shift)
y[:, c] = y_c[:, 0]
return y
[docs]def two_step_tsm(x, alpha, Fs=22050, order='exact-coarse') -> np.ndarray:
"""
Time Scale Modification algorithm, where the signal is stretched by the integer and decimal part
of the time stretch function using two different algorithms.
Parameters
----------
x : np.ndarray [shape=(N, )], real - valued
Signal to be transformed
alpha : float
Scalar time stretch factor
Fs : int
Sampling rate of the input audio signal x
order : 'exact-coarse' or 'coarse-exact'
Decides which of the two time stretching functions will be computed first, coarse corresponding to the integer
part
Returns
-------
y : np.ndarray [shape=(L,num_of_chan)], real - valued
The time-scale modified output signal
"""
if len(x.shape) == 1:
x = x.reshape(-1, 1)
alpha_rough = np.max([1, np.round(alpha)]).astype(int)
alpha_exact = alpha / alpha_rough
if order == 'exact-coarse':
y_exact = hps_tsm(x, alpha_exact, Fs=Fs)
y = pv_int_tsm(y_exact[:, 0], alpha_rough, Fs=Fs)
elif order == 'coarse-exact':
y_coarse = pv_int_tsm(x, alpha_rough, Fs=Fs)
y = hps_tsm(y_coarse[:, 0], alpha_exact, Fs=Fs)
else:
raise Exception("Invalid order!")
return y