def _test_data(sr=4000, N=3000, returns_power=False, mulaw=True):
    x, _ = librosa.load(example_audio_file(), sr=sr)
    x, _ = librosa.effects.trim(x, top_db=15)

    # To save computational cost
    x = x[:N]

    # For power conditioning wavenet
    if returns_power:
        # (1 x N')
        p = librosa.feature.rmse(x, frame_length=256, hop_length=128)
        upsample_factor = x.size // p.size
        # (1 x N)
        p = np.repeat(p, upsample_factor, axis=-1)
        if p.size < x.size:
            # pad against time axis
            p = np.pad(p, [(0, 0), (0, x.size - p.size)], mode="constant", constant_values=0)

        # shape adjust
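The snippet above targets an older librosa in which librosa.feature.rmse still existed and took the signal as a positional argument; in newer releases it was renamed librosa.feature.rms with a keyword-only y=. A minimal, self-contained sketch of the same power-envelope upsampling against a current librosa (the example clip and the frame/hop sizes are assumptions, not taken from the original repository):

import librosa
import numpy as np

x, sr = librosa.load(librosa.example("trumpet"), sr=4000)
x, _ = librosa.effects.trim(x, top_db=15)

p = librosa.feature.rms(y=x, frame_length=256, hop_length=128)  # (1, N') frame-level RMS energy
p = np.repeat(p, x.size // p.size, axis=-1)                     # upsample towards sample resolution
if p.size < x.size:
    # pad the time axis so the envelope matches the waveform length exactly
    p = np.pad(p, [(0, 0), (0, x.size - p.size)], mode="constant")
assert p.shape == (1, x.size)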
hop_length = 347 * duration
fmin = 20  # min freq
fmax = sampling_rate // 2  # max freq
n_mels = 128  # number of mels
n_fft = n_mels * 20  # fft window size
padmode = 'constant'
samples = sampling_rate * duration  # number of samples
n_mfcc = 13  # number of MFCCs to use

try:
    audio, sr = librosa.load(file_path, sr=sampling_rate)

    # Trim silence
    if len(audio) > 0:
        audio, _ = librosa.effects.trim(audio)

    # Trim if audio length > samples
    if len(audio) > samples:
        audio = audio[0:0 + samples]
    # Else pad blanks if shorter
    else:
        padding = samples - len(audio)
        offset = padding // 2
        audio = np.pad(audio, (offset, samples - len(audio) - offset), padmode)

    # Get Mel spectrogram of audio
    spectrogram = librosa.feature.melspectrogram(audio,
                                                 sr=sampling_rate,
                                                 n_mels=n_mels,
                                                 hop_length=hop_length,
                                                 n_fft=n_fft,
                                                 fmin=fmin,
                                                 fmax=fmax)
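The excerpt above is cut off and depends on variables set elsewhere in the original kernel (duration, sampling_rate, file_path). A self-contained sketch of the same load, trim, fixed-length pad/crop, log-mel pipeline; the 2-second / 44.1 kHz settings and the file name are assumptions, not the kernel's actual values:

import librosa
import numpy as np

sampling_rate = 44100
duration = 2                                              # seconds (assumed)
samples = sampling_rate * duration
hop_length = 347 * duration
n_fft = 128 * 20

audio, _ = librosa.load("clip.wav", sr=sampling_rate)     # hypothetical input file
audio, _ = librosa.effects.trim(audio)                    # strip leading/trailing silence
if len(audio) > samples:                                  # crop long clips ...
    audio = audio[:samples]
else:                                                     # ... or center-pad short ones
    offset = (samples - len(audio)) // 2
    audio = np.pad(audio, (offset, samples - len(audio) - offset), 'constant')

mel = librosa.feature.melspectrogram(y=audio, sr=sampling_rate, n_mels=128,
                                     n_fft=n_fft, hop_length=hop_length,
                                     fmin=20, fmax=sampling_rate // 2)
mel_db = librosa.power_to_db(mel).astype(np.float32)      # log-mel features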
def load_audio(file_path):
    # Window length in audio samples.
    win_len = ms_to_samples(model_params.win_len, model_params.sampling_rate)

    # Window hop in audio samples.
    hop_len = ms_to_samples(model_params.win_hop, model_params.sampling_rate)

    # Load the actual audio file.
    wav, sr = load_wav(file_path.decode())

    # TODO: Determine a better silence reference level for the CMU_ARCTIC dataset (See: #9).
    # Remove silence at the beginning and end of the wav so the network does not have to learn
    # some random initial silence delay after which it is allowed to speak.
    wav, _ = librosa.effects.trim(wav)

    # Calculate the linear scale spectrogram.
    # Note the spectrogram shape is transposed to be (T_spec, 1 + n_fft // 2) so dense layers
    # for example are applied to each frame automatically.
    linear_spec = linear_scale_spectrogram(wav, model_params.n_fft, hop_len, win_len).T

    # Calculate the Mel. scale spectrogram.
    # Note the spectrogram shape is transposed to be (T_spec, n_mels) so dense layers for
    # example are applied to each frame automatically.
    mel_spec = mel_scale_spectrogram(wav, model_params.n_fft, sr, model_params.n_mels,
                                     model_params.mel_fmin, model_params.mel_fmax, hop_len,
                                     win_len, 1).T

    # Convert the linear spectrogram into decibel representation.
    linear_mag = np.abs(linear_spec)
    linear_mag_db = magnitude_to_decibel(linear_mag)
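linear_scale_spectrogram, mel_scale_spectrogram, and magnitude_to_decibel are helpers from the surrounding project and are not shown in this excerpt. A rough librosa-only approximation of the spectrogram and decibel steps (the STFT sizes, mel range, and file name here are assumptions):

import librosa
import numpy as np

wav, sr = librosa.load("speech.wav", sr=22050)             # hypothetical input
wav, _ = librosa.effects.trim(wav)

n_fft, hop_len, win_len = 1024, 256, 1024                  # assumed frame settings
# Transposed to (T_spec, 1 + n_fft // 2), matching the layout used above.
linear_spec = librosa.stft(wav, n_fft=n_fft, hop_length=hop_len, win_length=win_len).T
linear_mag_db = librosa.amplitude_to_db(np.abs(linear_spec))

# Transposed to (T_spec, n_mels); power=1 keeps a magnitude (not power) mel spectrogram.
mel_spec = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft,
                                          hop_length=hop_len, win_length=win_len,
                                          n_mels=80, fmin=0, fmax=8000, power=1).T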
filelist = list(zip(X_list, y_list))
for mix_path, inst_path in tqdm(filelist):
    basename_mix, _ = os.path.splitext(os.path.basename(mix_path))
    basename_inst, _ = os.path.splitext(os.path.basename(inst_path))
    outpath_mix = os.path.join(args.mixtures, basename_mix + suffix)
    outpath_inst = os.path.join(args.instruments, basename_inst + suffix)
    if os.path.exists(outpath_mix) and os.path.exists(outpath_inst):
        continue
    X, _ = librosa.load(
        mix_path, args.sr, False, dtype=np.float32, res_type='kaiser_fast')
    y, _ = librosa.load(
        inst_path, args.sr, False, dtype=np.float32, res_type='kaiser_fast')
    X, _ = librosa.effects.trim(X)
    y, _ = librosa.effects.trim(y)
    X, y = spec_utils.align_wave_head_and_tail(X, y, args.sr)
    v = X - y
    sf.write(input_i, y.T, args.sr)
    sf.write(input_v, v.T, args.sr)
    subprocess.call(cmd_i, stderr=subprocess.DEVNULL)
    subprocess.call(cmd_v, stderr=subprocess.DEVNULL)
    y, _ = librosa.load(
        output_i, args.sr, False, dtype=np.float32, res_type='kaiser_fast')
    v, _ = librosa.load(
        output_v, args.sr, False, dtype=np.float32, res_type='kaiser_fast')
    X = y + v
    spec = spec_utils.calc_spec(X, args.hop_length)
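spec_utils here comes from the surrounding project; align_wave_head_and_tail and calc_spec are not part of librosa. For a stereo (2, N) waveform like X above, a rough stand-in for calc_spec could compute a per-channel complex STFT:

import librosa
import numpy as np

def calc_spec(X, hop_length, n_fft=2048):
    # X: (2, N) stereo waveform -> (2, 1 + n_fft // 2, T) complex spectrogram.
    # n_fft is an assumed default, not the project's actual setting.
    return np.stack([librosa.stft(np.ascontiguousarray(X[ch]),
                                  n_fft=n_fft, hop_length=hop_length)
                     for ch in range(X.shape[0])])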
def spectrogram2wav(mag):  # Generate wave file from spectrogram
    mag = mag.T  # transpose
    mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db  # de-normalize
    mag = np.power(10.0, mag * 0.05)  # to amplitude
    wav = griffin_lim(mag)  # wav reconstruction
    wav = signal.lfilter([1], [1, -hp.preemphasis], wav)  # de-preemphasis
    wav, _ = librosa.effects.trim(wav)  # trim
    return wav.astype(np.float32)
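griffin_lim above is a helper defined elsewhere in that project; librosa itself ships a Griffin-Lim reconstruction as librosa.griffinlim (available since 0.7). A sketch with assumed STFT settings, not the project's actual parameters:

import librosa

def griffin_lim_librosa(mag, hop_length=256, win_length=2048, n_iter=60):
    # mag: linear magnitude spectrogram, shape (1 + n_fft // 2, T)
    return librosa.griffinlim(mag, n_iter=n_iter, hop_length=hop_length,
                              win_length=win_length)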
def trim_silence(wav, hparams):
    return librosa.effects.trim(wav,
                                top_db=hparams.trim_top_db,
                                frame_length=hparams.trim_fft_size,
                                hop_length=hparams.trim_hop_size)[0]
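A hypothetical call to trim_silence; the hparams values below are placeholders rather than any project's defaults:

from types import SimpleNamespace
import librosa

hparams = SimpleNamespace(trim_top_db=40, trim_fft_size=2048, trim_hop_size=512)  # assumed values
wav, sr = librosa.load("utterance.wav", sr=None)  # hypothetical input
wav = trim_silence(wav, hparams)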
def mix(hp, args, audio, num, s1_dvec, s1_target, s2, train):
    srate = hp.audio.sample_rate
    dir_ = os.path.join(args.out_dir, 'train' if train else 'test')

    d, _ = librosa.load(s1_dvec, sr=srate)
    w1, _ = librosa.load(s1_target, sr=srate)
    w2, _ = librosa.load(s2, sr=srate)
    assert len(d.shape) == len(w1.shape) == len(w2.shape) == 1, \
        'wav files must be mono, not stereo'

    d, _ = librosa.effects.trim(d, top_db=20)
    w1, _ = librosa.effects.trim(w1, top_db=20)
    w2, _ = librosa.effects.trim(w2, top_db=20)

    # if the reference for the d-vector is too short, discard it
    if d.shape[0] < 1.1 * hp.embedder.window * hp.audio.hop_length:
        return

    # The LibriSpeech dataset has many silent intervals, so let's vad-merge them.
    # The VoiceFilter paper didn't do that. To test SDR the same way, don't vad-merge.
    if args.vad == 1:
        w1, w2 = vad_merge(w1), vad_merge(w2)

    # I think a random segment length would be better, but let's follow the paper first:
    # fit audio to `hp.data.audio_len` seconds.
    # if the merged audio is shorter than `L`, discard it
    L = int(srate * hp.data.audio_len)
    if w1.shape[0] < L or w2.shape[0] < L: