"""This module provides functionality for pitch analysis.
References:
Nakano et al, "An Automatic Singing Skill Evaluation Method
for Unknown Melodies Using Pitch Interval Accuracy and Vibrato Features"
Proc. Interspeech 2006.
山田 et al, "HMM に基づく歌声合成のためのビブラートモデル化"
IPSJ SIG Tech. Report 2009.
Note that vibrato extraction method in this module is exerimental.
Because details of the vibrato extraction method are not described
in the above papers and not trivial to implement (in my opinion),
my implementation may not work well compared to the original author's one.
Also note that there are a lot of tunable parameters (threshold,
window size, min/max extent, cut-off frequency, etc.).
If you want to get maximum performance, you might want to tune these
parameters with your dataset.
I tested this code with kiritan_singing and nit-song070 database.
"""
import librosa
import numpy as np
import torch
from nnsvs.dsp import lowpass_filter
from scipy.signal import argrelmax, argrelmin
# C4 in equal temperament with A4 = 440 Hz (~261.63 Hz)
_c4_hz = 440 * 2 ** (3 / 12 - 1)
# C4 expressed in cents (4 octaves x 1200 cents)
_c4_cent = 4800
def hz_to_cent_based_c4(hz):
"""Convert Hz to cent based on C4
Args:
hz (np.ndarray): array of Hz
Returns:
np.ndarray: array of cent
"""
out = hz.copy()
nonzero_indices = np.where(hz > 0)[0]
out[nonzero_indices] = (
1200 * np.log(hz[nonzero_indices] / _c4_hz) / np.log(2) + _c4_cent
)
return out
def cent_to_hz_based_c4(cent):
"""Convert cent to Hz based on C4
Args:
cent (np.ndarray): array of cent
Returns:
np.ndarray: array of Hz
"""
out = cent.copy()
nonzero_indices = np.where(cent > 0)[0]
out[nonzero_indices] = (
np.exp((cent[nonzero_indices] - _c4_cent) * np.log(2) / 1200) * _c4_hz
)
return out
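# A minimal usage sketch (this helper is illustrative, not part of the
# original API): the two conversions above are inverses of each other, and
# zero (unvoiced) frames pass through both untouched.
def _example_hz_cent_roundtrip():
    f0 = np.array([0.0, 220.0, 440.0, 0.0])  # unvoiced, A3, A4, unvoiced
    cent = hz_to_cent_based_c4(f0)  # -> [0, 4500, 5700, 0]
    f0_rec = cent_to_hz_based_c4(cent)
    assert np.allclose(f0, f0_rec)
    return cent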
def nonzero_segments(f0):
"""Find nonzero segments
Args:
f0 (np.ndarray): array of f0
Returns:
list: list of (start, end)
"""
vuv = f0 > 0
started = False
s, e = 0, 0
segments = []
for idx in range(len(f0)):
if vuv[idx] > 0 and not started:
started = True
s = idx
elif started and (vuv[idx] <= 0):
e = idx
started = False
segments.append((s, e))
else:
pass
if started and vuv[-1] > 0:
segments.append((s, len(vuv) - 1))
return segments
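# A toy sketch of the segment convention (illustrative helper, not part of
# the original API): `end` is the index of the first unvoiced frame after a
# voiced run, except for a run that reaches the final frame, where `end` is
# the last index itself.
def _example_nonzero_segments():
    f0 = np.array([0.0, 100.0, 100.0, 0.0, 200.0, 200.0])
    return nonzero_segments(f0)  # -> [(1, 3), (4, 5)]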
def note_segments(lf0_score_denorm):
"""Compute note segments (start and end indices) from log-F0
Note that unvoiced frames must be set to 0 in advance.
Args:
        lf0_score_denorm (Tensor): denormalized log-F0 (or linear F0)
            sequence of shape (T,)
Returns:
list: list of note (start, end) indices
"""
segments = []
for s, e in nonzero_segments(lf0_score_denorm):
out = torch.sign(torch.abs(torch.diff(lf0_score_denorm[s : e + 1])))
transitions = torch.where(out > 0)[0]
note_start, note_end = s, -1
for pos in transitions:
note_end = int(s + pos)
segments.append((note_start, note_end))
note_start = note_end + 1
# Handle last note
while (
note_start < len(lf0_score_denorm) - 1 and lf0_score_denorm[note_start] <= 0
):
note_start += 1
note_end = note_start + 1
while note_end < len(lf0_score_denorm) - 1 and lf0_score_denorm[note_end] > 0:
note_end += 1
if note_end != note_start + 1:
segments.append((note_start, note_end))
return segments
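# A toy sketch (illustrative helper, not part of the original API): two flat
# notes produce one step change in the score F0, which becomes the boundary
# between two note segments; the second note is recovered by the
# "Handle last note" scan above.
def _example_note_segments():
    lf0 = torch.log(torch.tensor([220.0] * 10 + [330.0] * 10))
    return note_segments(lf0)  # -> [(0, 9), (10, 19)]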
def compute_f0_correction_ratio(
f0,
f0_score,
edges_to_be_excluded=50,
out_of_tune_threshold=200,
correction_threshold=100,
):
"""Compute f0 correction ratio
Args:
f0 (np.ndarray): array of f0
f0_score (np.ndarray): array of f0 score
Returns:
float: correction ratio to multiplied to F0 (i.e. f0 * ratio)
"""
segments = note_segments(torch.from_numpy(f0_score))
center_f0s = []
center_score_f0s = []
for s, e in segments:
L = e - s
if L > edges_to_be_excluded * 2:
center_f0s.append(f0[s + edges_to_be_excluded : e - edges_to_be_excluded])
center_score_f0s.append(
f0_score[s + edges_to_be_excluded : e - edges_to_be_excluded]
)
center_f0s = np.concatenate(center_f0s)
center_score_f0s = np.concatenate(center_score_f0s)
# Compute pitch ratio to be multiplied
nonzero_indices = (center_f0s > 0) & (center_score_f0s > 0)
ratio = center_score_f0s[nonzero_indices] / center_f0s[nonzero_indices]
    # Exclude frames that are too far out of tune (over 2 semitones by default)
up_threshold = np.exp(out_of_tune_threshold * np.log(2) / 1200)
low_threshold = np.exp(-out_of_tune_threshold * np.log(2) / 1200)
ratio = ratio[(ratio < up_threshold) & (ratio > low_threshold)]
global_offset = ratio.mean()
    # Avoid corrections over a semitone
    # If a correction of more than a semitone is needed, it is better to fix
    # the data by hand, or to fix the MusicXML or UST files instead.
up_threshold = np.exp(correction_threshold * np.log(2) / 1200)
low_threshold = np.exp(-correction_threshold * np.log(2) / 1200)
if global_offset > up_threshold or global_offset < low_threshold:
        print(
            f"""warn: more than {correction_threshold} cents of pitch correction is needed.
global_offset: {global_offset} (ratio), i.e. {1200 * np.log2(global_offset):.1f} cents.
It is likely that manual pitch correction is preferable."""
        )
global_offset = np.clip(global_offset, low_threshold, up_threshold)
return global_offset
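# An end-to-end sketch (illustrative helper with made-up values): the singer
# is ~40 cents flat against the score, and the returned ratio (score over
# performance, averaged over note centers) puts the performance back in tune
# when multiplied with f0.
def _example_f0_correction():
    f0 = np.full(300, 215.0)  # extracted F0, slightly flat
    f0_score = np.full(300, 220.0)  # F0 derived from the musical score
    ratio = compute_f0_correction_ratio(f0, f0_score)
    return f0 * ratio  # ~220 Hz after correction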
def extract_vibrato_parameters_impl(pitch_seg, sr):
"""Extract vibrato parameters for a single pitch segment
Nakano et al, "An Automatic Singing Skill Evaluation Method
for Unknown Melodies Using Pitch Interval Accuracy and Vibrato Features"
Proc. Interspeech 2006.
山田 et al, "HMM に基づく歌声合成のためのビブラートモデル化"
IPSJ SIG Tech. Report 2009.
Args:
pitch_seg (np.ndarray): array of pitch
sr (int): sampling rate
Returns:
tuple: (R, E, m_a, m_f)
"""
peak_high_pos = argrelmax(pitch_seg)[0]
peak_low_pos = argrelmin(pitch_seg)[0]
m_a = np.zeros(len(pitch_seg))
m_f = np.zeros(len(pitch_seg))
    if len(peak_high_pos) != len(peak_low_pos) + 1:
        # The code below assumes that the segment starts and ends around a
        # local maximum, so that maxima outnumber minima by exactly one.
        print("warn: unexpected peak configuration; skipping this segment")
        print(peak_high_pos, peak_low_pos)
        return None, None, None, None
peak_high_pos_diff = np.diff(peak_high_pos)
peak_low_pos_diff = np.diff(peak_low_pos)
R = np.zeros(len(peak_high_pos_diff) + len(peak_low_pos_diff))
R[0::2] = peak_high_pos_diff
R[1::2] = peak_low_pos_diff
m_f_ind = np.zeros(len(R), dtype=int)
m_f_ind[0::2] = peak_high_pos[:-1]
m_f_ind[1::2] = peak_low_pos[:-1]
m_f[m_f_ind] = (1 / R) * sr
peak_high_pitch = pitch_seg[peak_high_pos]
peak_low_pitch = pitch_seg[peak_low_pos]
E = np.zeros(len(R))
E[0::2] = (peak_high_pitch[1:] + peak_high_pitch[:-1]) / 2 - peak_low_pitch
E[1::2] = peak_high_pitch[1:-1] - (peak_low_pitch[1:] + peak_low_pitch[:-1]) / 2
m_a_ind = np.zeros(len(R), dtype=int)
m_a_ind[0::2] = peak_low_pos
m_a_ind[1::2] = peak_high_pos[1:-1]
m_a[m_a_ind] = 0.5 * E
rate = 1 / R.mean() * sr
extent = 0.5 * E.mean()
print(f"Rate: {rate}, Extent: {extent}")
return R, E, m_a, m_f
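# A synthetic sanity check (illustrative helper with made-up parameters):
# a 5 Hz vibrato of 100-cent amplitude, sampled at 100 frames/sec. The
# segment is laid out so that maxima outnumber minima by one, as the
# implementation above expects; the reported rate should be close to 5 Hz
# and the extent close to 100 cents.
def _example_vibrato_parameters():
    sr = 100
    t = np.arange(sr) / sr
    pitch_seg = 5700 - 100 * np.cos(2 * np.pi * 5 * t)  # pitch in cents
    return extract_vibrato_parameters_impl(pitch_seg, sr)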
def compute_extent(pitch_seg):
"""Compute extent of a pitch segment
Args:
pitch_seg (np.ndarray): array of pitch
Returns:
np.ndarray: array of extent
"""
peak_high_pos = argrelmax(pitch_seg)[0]
peak_low_pos = argrelmin(pitch_seg)[0]
if len(peak_high_pos) == 1 or len(peak_low_pos) == 1:
return np.array([-1])
    # Trim the minima so that maxima outnumber minima by exactly one, the
    # peak layout assumed by the extent computation below
    if len(peak_high_pos) < len(peak_low_pos):
peak_low_pos = peak_low_pos[:-2]
elif len(peak_high_pos) == len(peak_low_pos):
peak_low_pos = peak_low_pos[:-1]
peak_high_pitch = pitch_seg[peak_high_pos]
peak_low_pitch = pitch_seg[peak_low_pos]
peak_high_pos_diff = np.diff(peak_high_pos)
peak_low_pos_diff = np.diff(peak_low_pos)
    # TODO: this is probably a bug; the peak counts should not mismatch here
if len(peak_high_pitch) != len(peak_low_pitch) + 1:
return np.array([-1])
E = np.zeros(len(peak_high_pos_diff) + len(peak_low_pos_diff))
E[0::2] = (peak_high_pitch[1:] + peak_high_pitch[:-1]) / 2 - peak_low_pitch
E[1::2] = peak_high_pitch[1:-1] - (peak_low_pitch[1:] + peak_low_pitch[:-1]) / 2
return E
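# The same synthetic segment as above (illustrative helper): each entry of
# the returned E is a local peak-to-peak extent estimate, in the unit of the
# input (here cents), so every entry comes out near 200.
def _example_extent():
    sr = 100
    t = np.arange(sr) / sr
    pitch_seg = 5700 - 100 * np.cos(2 * np.pi * 5 * t)
    return compute_extent(pitch_seg)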
def extract_smoothed_f0(f0, sr, cutoff=8):
"""Extract smoothed f0 by low-pass filtering
Note that the low-pass filter is only applied to voiced segments.
Args:
f0 (np.ndarray): array of f0
sr (int): sampling rate
cutoff (float): cutoff frequency
Returns:
np.ndarray: array of smoothed f0
"""
segments = nonzero_segments(f0)
f0_smooth = f0.copy()
for s, e in segments:
f0_smooth[s:e] = lowpass_filter(f0[s:e], sr, cutoff=cutoff)
return f0_smooth
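# Usage sketch (illustrative helper; the 5 ms frame shift is an assumption):
# `sr` is the frame rate of the F0 sequence, not the audio sampling rate.
# With a 5 ms frame shift, sr = 200, and the 8 Hz cutoff smooths each voiced
# segment while leaving unvoiced frames untouched.
def _example_smoothed_f0():
    f0 = np.concatenate([np.zeros(20), np.full(200, 220.0), np.zeros(20)])
    return extract_smoothed_f0(f0, 200, cutoff=8)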
def extract_smoothed_continuous_f0(f0, sr, cutoff=20):
"""Extract smoothed continuous f0 by low-pass filtering
Note that the input must be continuous F0 or log-F0.
Args:
f0 (np.ndarray): array of continuous f0
sr (int): sampling rate
cutoff (float): initial cutoff frequency
Returns:
np.ndarray: array of smoothed continuous f0
"""
is_2d = len(f0.shape) == 2
f0 = f0.reshape(-1) if is_2d else f0
# Ref: https://bit.ly/3SOePFw
f0_smooth = lowpass_filter(f0, sr, cutoff=cutoff)
    # Fallback: this shouldn't happen, I believe
# NOTE: hard-coded for now
next_cutoff = 50
while (f0_smooth < 0).any():
f0_smooth = lowpass_filter(f0, sr, cutoff=next_cutoff)
next_cutoff *= 2
f0_smooth = f0_smooth.reshape(len(f0), 1) if is_2d else f0_smooth
return f0_smooth
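# Usage sketch (illustrative helper): unlike extract_smoothed_f0, this
# variant filters the whole sequence at once, so the input must already be
# continuous (no unvoiced zeros), e.g. a linearly-interpolated log-F0.
# 2D (T, 1) inputs are accepted and returned in the same shape.
def _example_smoothed_continuous_f0():
    clf0 = np.log(np.linspace(200.0, 250.0, 400)).reshape(-1, 1)
    return extract_smoothed_continuous_f0(clf0, sr=200, cutoff=20)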
def interp_vibrato(m_f):
"""Interpolate a sequence of vibrato parameter by linear interpolation
Args:
m_f (np.ndarray): array of vibrato parameter
Returns:
np.ndarray: array of vibrato parameter
"""
nonzero_indices = np.where(m_f > 0)[0]
nonzero_indices = [0] + list(nonzero_indices) + [len(m_f) - 1]
out = np.interp(np.arange(len(m_f)), nonzero_indices, m_f[nonzero_indices])
return out
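# A toy sketch (illustrative helper): sparse per-peak vibrato parameters
# (zeros elsewhere) become a dense per-frame sequence; the endpoints are
# pinned to the first and last frame values.
def _example_interp_vibrato():
    m_f = np.array([0.0, 5.0, 0.0, 0.0, 6.0, 0.0])
    return interp_vibrato(m_f)  # -> [0., 5., 5.33, 5.67, 6., 0.]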
def gen_sine_vibrato(f0, sr, m_a, m_f, scale=1.0):
"""Generate F0 with sine-based vibrato
Args:
f0 (ndarray): fundamental frequency
sr (int): sampling rate
m_a (ndarray): amplitude of vibrato
m_f (ndarray): frequency of vibrato
scale (float): scale factor
Returns:
ndarray: F0 with sine-based vibrato
"""
f0_gen = f0.copy()
voiced_end_indices = np.asarray([e for _, e in nonzero_segments(f0)])
for s, e in nonzero_segments(m_a):
# limit vibrato rate to [3, 8] Hz
m_f_seg = np.clip(m_f[s:e], 3, 8)
# limit vibrato extent to [30, 150] cent
m_a_seg = np.clip(m_a[s:e], 30, 150)
cent = scale * m_a_seg * np.sin(2 * np.pi / sr * m_f_seg * np.arange(0, e - s))
new_f0 = f0[s:e] * np.exp(cent * np.log(2) / 1200)
f0_gen[s:e] = new_f0
# NOTE: this is a hack to avoid discontinuity at the end of vibrato
voiced_ends_next_to_vibrato = voiced_end_indices[voiced_end_indices > e]
if len(voiced_ends_next_to_vibrato) > 0:
voiced_end = voiced_ends_next_to_vibrato[0]
f0_gen[s:voiced_end] = lowpass_filter(f0_gen[s:voiced_end], sr, cutoff=12)
return f0_gen
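# An end-to-end sketch (illustrative helper with made-up parameters):
# superimpose a constant 5 Hz / 50 cent vibrato on part of the voiced region
# of a flat 220 Hz contour. As above, `sr` is the frame rate of the F0
# sequence (200 corresponds to a 5 ms frame shift).
def _example_gen_sine_vibrato():
    sr = 200
    f0 = np.concatenate([np.zeros(50), np.full(400, 220.0), np.zeros(50)])
    m_a = np.zeros_like(f0)
    m_f = np.zeros_like(f0)
    m_a[100:300] = 50.0  # vibrato extent in cents
    m_f[100:300] = 5.0  # vibrato rate in Hz
    return gen_sine_vibrato(f0, sr, m_a, m_f)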