start_time = 0
for i in range(len(raw_wave) // audio_config.chunk + 1):
    feature_out = wrapper.convert_next(time_length=audio_config.chunk / audio_config.in_rate)
    wrapper.voice_changer_stream.add_out_feature(start_time=start_time, feature=feature_out, frame_period=frame_period)
    start_time += audio_config.chunk / audio_config.in_rate
    print('cent', i, flush=True)
start_time = 0
for i in range(len(raw_wave) // audio_config.chunk + 1):
    wave_out = wrapper.post_convert_next(time_length=audio_config.chunk / audio_config.out_rate)
    wave_out_list.append(wave_out)
    start_time += audio_config.chunk / audio_config.out_rate
    print('post', i, flush=True)
out_wave = numpy.concatenate([w.wave for w in wave_out_list]).astype(numpy.float32)
librosa.output.write_wav(str(test_output_path), out_wave, sr=audio_config.out_rate)
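Note that librosa.output.write_wav was removed in librosa 0.8, so on current versions the final write is usually done with the soundfile package instead. A minimal sketch, assuming out_wave is a float32 numpy array and audio_config.out_rate is its sample rate:

import soundfile

# Write the converted waveform; soundfile infers the WAV format from the extension.
soundfile.write(str(test_output_path), out_wave, samplerate=audio_config.out_rate)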
def _test_data(sr=4000, N=3000, returns_power=False, mulaw=True):
    x, _ = librosa.load(example_audio_file(), sr=sr)
    x, _ = librosa.effects.trim(x, top_db=15)

    # To save computational cost
    x = x[:N]

    # For the power-conditioned wavenet
    if returns_power:
        # (1 x N')
        p = librosa.feature.rmse(x, frame_length=256, hop_length=128)
        upsample_factor = x.size // p.size
        # (1 x N)
        p = np.repeat(p, upsample_factor, axis=-1)
        if p.size < x.size:
            # pad along the time axis
            p = np.pad(p, [(0, 0), (0, x.size - p.size)], mode="constant", constant_values=0)

        # shape adjust
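librosa.feature.rmse was renamed to librosa.feature.rms in librosa 0.7, and newer versions require the y= keyword. A self-contained sketch of the same frame-level power extraction and upsampling, using a bundled example file as a stand-in input:

import librosa
import numpy as np

x, sr = librosa.load(librosa.example("trumpet"), sr=4000)        # any mono signal works
p = librosa.feature.rms(y=x, frame_length=256, hop_length=128)   # shape (1, n_frames)
p = np.repeat(p, x.size // p.size, axis=-1)                      # stretch toward sample resolution
if p.size < x.size:
    # pad the remainder along the time axis
    p = np.pad(p, [(0, 0), (0, x.size - p.size)], mode="constant")
assert p.shape[-1] == x.size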
mel_spectrograms = collections.deque()
stft_magnitudes = collections.deque()
stft_phases = collections.deque()
segment_names = collections.deque()

# Mel filterbank matrix for computing the mel spectrograms
mel_filterbank = librosa.filters.mel(config["sampling_rate"],
                                     n_fft=STFT_frame_n_samples,
                                     n_mels=config["n_Mel_filters"],
                                     fmin=config["Mel_min_freq"],
                                     fmax=config["Mel_max_freq"])
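For reference, a filterbank built this way is applied to an STFT power spectrogram by plain matrix multiplication. A minimal sketch with assumed values (sr=22050, n_fft=2048, 128 mels) in place of the config dict:

import librosa
import numpy as np

y, sr = librosa.load(librosa.example("trumpet"), sr=22050)
mel_fb = librosa.filters.mel(sr=sr, n_fft=2048, n_mels=128)       # (128, 1 + 2048 // 2)
power = np.abs(librosa.stft(y, n_fft=2048, hop_length=512)) ** 2  # (1025, n_frames)
mel_spec = mel_fb @ power                                         # (128, n_frames)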
# Loop over all the 10-second-long audio files.
for idx, (audio_file, label_file) in enumerate(zip(all_wavs_filenames, all_labels_filenames)):
    try:  # Read the audio data and parse the label file. If this fails, continue to the next file.
        audio, _ = librosa.core.load(audio_file, sr=config["sampling_rate"], mono=True)
        labels_segment = parse_label_file(label_file, classes)
        if len(labels_segment) != len(classes):
            raise ValueError(f'Length of labels_segment is {len(labels_segment)} '
                             f'while there are only {len(classes)} classes.')
    except Exception as e:
        print(e)
        print(audio_file)
        continue

    # Split the audio into segments
    n_seg_in_audio = audio.shape[0] // segment_n_samples
    audio = audio[:n_seg_in_audio * segment_n_samples]
    segments = np.split(audio, n_seg_in_audio)

    # For all segments, add white noise if needed, compute audio features and store them in queues.
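A standalone version of the segmentation step, with an assumed segment length standing in for segment_n_samples:

import numpy as np

y = np.random.randn(35000).astype(np.float32)  # stand-in for a loaded waveform
segment_n_samples = 16000                      # e.g. 1 s at 16 kHz (assumed)
n_seg = y.shape[0] // segment_n_samples        # number of full segments
segments = np.split(y[:n_seg * segment_n_samples], n_seg)
assert all(seg.size == segment_n_samples for seg in segments)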
        output_fname,
        n_fft=4096,
        n_layers=1,
        n_filters=4096,
        hop_length=256,
        alpha=0.05,
        k_w=15,
        k_h=3,
        optimizer='bfgs',
        stride=1,
        iterations=300,
        sr=22050):
    frame_size = n_fft // 2
    audio, fs = librosa.load(content_fname, sr=sr)
    content = chop(audio, hop_size=hop_length, frame_size=frame_size)
    audio, fs = librosa.load(style_fname, sr=sr)
    style = chop(audio, hop_size=hop_length, frame_size=frame_size)
    n_frames = min(content.shape[0], style.shape[0])
    n_samples = min(content.shape[1], style.shape[1])
    content = content[:n_frames, :n_samples]
    style = style[:n_frames, :n_samples]
    content_features, style_gram, kernels, freqs = compute_features(
        content=content,
        style=style,
        stride=stride,
        n_fft=n_fft,
        n_layers=n_layers,
        n_filters=n_filters,
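The chop helper above is project-specific; the same overlapping framing can be sketched with librosa's built-in librosa.util.frame (the frame length and hop below stand in for the frame_size and hop_length defaults above):

import librosa
import numpy as np

audio, sr = librosa.load(librosa.example("trumpet"), sr=22050)
frames = librosa.util.frame(audio, frame_length=2048, hop_length=256)  # (2048, n_frames)
frames = np.ascontiguousarray(frames.T)                                # (n_frames, 2048)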
    energy_output = energy.astype(np.uint16)
else:
    energy_output = np.zeros(2).astype(np.uint16)

# FFT or mel
if is_fft or is_mel:
    global sample_rate
    # down-sample by 4, with filtering; energy not scaled
    data_np = librosa.resample(data_np,
                               sample_rate,
                               sample_rate / 4,
                               res_type='kaiser_fast')
    # short-time FFT over n_fft samples
    fft_data = librosa.stft(data_np, n_fft,
                            hop_length=n_fft,
                            center=False)
    # calculate FFT or mel
    if is_fft:
        fft_data_mag = np.abs(fft_data[0:n_fft // 2]) ** 2
        fft_data_mag *= 2**3
        fft_output = get_output_fft_bins(fft_data_mag, n_out_bins)
    else:
        fft_data_mag = np.abs(fft_data)**2
        fft_data_mag *= 2**2
        mel_data = librosa.feature.melspectrogram(S=fft_data_mag, sr=sample_rate / 4, n_mels=n_mel)
        fft_output = get_output_fft_bins(mel_data, n_out_bins)

    # output as uint8_t
    fft_output = fft_output.astype(np.uint8)
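Newer librosa releases (0.10+) require keyword arguments for resample and stft parameters. A minimal stand-alone sketch of the same downsample-then-STFT pipeline, with assumed rates and sizes:

import librosa
import numpy as np

y, sr = librosa.load(librosa.example("trumpet"), sr=16000)
y_ds = librosa.resample(y, orig_sr=sr, target_sr=sr // 4, res_type='kaiser_fast')
spec = librosa.stft(y_ds, n_fft=512, hop_length=512, center=False)  # non-overlapping frames
power = np.abs(spec) ** 2                                           # (1 + 512 // 2, n_frames)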
# Trim silence
if len(audio) > 0:
    audio, _ = librosa.effects.trim(audio)

# Trim if audio length > samples
if len(audio) > samples:
    audio = audio[0:0 + samples]
# Else pad blanks if shorter
else:
    padding = samples - len(audio)
    offset = padding // 2
    audio = np.pad(audio, (offset, samples - len(audio) - offset), padmode)

# Get Mel spectrogram of audio
spectrogram = librosa.feature.melspectrogram(audio,
                                             sr=sampling_rate,
                                             n_mels=n_mels,
                                             hop_length=hop_length,
                                             n_fft=n_fft,
                                             fmin=fmin,
                                             fmax=fmax)
# Convert to log scale (dB)
spectrogram = librosa.power_to_db(spectrogram)

# Get MFCCs and second derivatives
mfcc = librosa.feature.mfcc(S=spectrogram, n_mfcc=n_mfcc)
delta2_mfcc = librosa.feature.delta(mfcc, order=2)

# Append MFCCs to spectrogram and flatten
features = np.concatenate((spectrogram, mfcc, delta2_mfcc), axis=0)
X = features.ravel()
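The same log-mel-plus-MFCC stack written against the current librosa API (keyword y= is required for melspectrogram on 0.10+); n_mfcc=13 follows the snippet, the other values are assumptions:

import librosa
import numpy as np

y, sr = librosa.load(librosa.example("trumpet"), sr=22050)
mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
log_mel = librosa.power_to_db(mel)                      # (128, n_frames)
mfcc = librosa.feature.mfcc(S=log_mel, n_mfcc=13)       # (13, n_frames)
delta2 = librosa.feature.delta(mfcc, order=2)           # (13, n_frames)
features = np.concatenate((log_mel, mfcc, delta2), axis=0).ravel()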
def _build_mel_basis(hparams):
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate,
                               hparams.n_fft,
                               n_mels=hparams.num_mels,
                               fmin=hparams.fmin,
                               fmax=hparams.fmax)
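A usage sketch with a hypothetical hparams object; the attribute names follow the function above, the values are assumptions. Note the positional sr/n_fft arguments only work on librosa versions before 0.10, which made them keyword-only:

from types import SimpleNamespace

hparams = SimpleNamespace(sample_rate=22050, n_fft=1024, num_mels=80,
                          fmin=55, fmax=7600)
mel_basis = _build_mel_basis(hparams)  # shape: (80, 1 + 1024 // 2)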
hop_length = 347 * duration
fmin = 20                           # min freq
fmax = sampling_rate // 2           # max freq
n_mels = 128                        # number of mels
n_fft = n_mels * 20                 # fft window size
padmode = 'constant'
samples = sampling_rate * duration  # number of samples
n_mfcc = 13                         # number of MFCCs to use

try:
    audio, sr = librosa.load(file_path, sr=sampling_rate)

    # Trim silence
    if len(audio) > 0:
        audio, _ = librosa.effects.trim(audio)

    # Trim if audio length > samples
    if len(audio) > samples:
        audio = audio[0:0 + samples]
    # Else pad blanks if shorter
    else:
        padding = samples - len(audio)
        offset = padding // 2
        audio = np.pad(audio, (offset, samples - len(audio) - offset), padmode)

    # Get Mel spectrogram of audio
    spectrogram = librosa.feature.melspectrogram(audio,
                                                 sr=sampling_rate,
                                                 n_mels=n_mels,
                                                 hop_length=hop_length,
                                                 n_fft=n_fft,
                                                 fmin=fmin,
                                                 fmax=fmax)
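The trim-or-pad logic recurs in both snippets above; factored into a small helper (the function name is mine, not from the source):

import numpy as np

def fix_length_centered(audio, samples, padmode='constant'):
    """Truncate to `samples`, or center-pad shorter audio up to `samples`."""
    if len(audio) > samples:
        return audio[:samples]
    padding = samples - len(audio)
    offset = padding // 2
    return np.pad(audio, (offset, padding - offset), padmode)

clip = fix_length_centered(np.ones(5, dtype=np.float32), 8)  # length 8, signal centered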
def load_audio(file_path):
    # Window length in audio samples.
    win_len = ms_to_samples(model_params.win_len, model_params.sampling_rate)

    # Window hop in audio samples.
    hop_len = ms_to_samples(model_params.win_hop, model_params.sampling_rate)

    # Load the actual audio file.
    wav, sr = load_wav(file_path.decode())

    # TODO: Determine a better silence reference level for the CMU_ARCTIC dataset (See: #9).
    # Remove silence at the beginning and end of the wav so the network does not have to learn
    # some random initial silence delay after which it is allowed to speak.
    wav, _ = librosa.effects.trim(wav)

    # Calculate the linear-scale spectrogram.
    # Note the spectrogram shape is transposed to be (T_spec, 1 + n_fft // 2) so that, for
    # example, dense layers are applied to each frame automatically.
    linear_spec = linear_scale_spectrogram(wav, model_params.n_fft, hop_len, win_len).T

    # Calculate the Mel-scale spectrogram.
    # Note the spectrogram shape is transposed to be (T_spec, n_mels) so that, for example,
    # dense layers are applied to each frame automatically.
    mel_spec = mel_scale_spectrogram(wav, model_params.n_fft, sr, model_params.n_mels,
                                     model_params.mel_fmin, model_params.mel_fmax, hop_len,
                                     win_len, 1).T

    # Convert the linear spectrogram into decibel representation.
    linear_mag = np.abs(linear_spec)
    linear_mag_db = magnitude_to_decibel(linear_mag)
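magnitude_to_decibel is project code; librosa's built-in equivalent is amplitude_to_db. A minimal sketch with assumed STFT parameters:

import librosa
import numpy as np

y, sr = librosa.load(librosa.example("trumpet"))
linear_mag = np.abs(librosa.stft(y, n_fft=1024)).T              # (T_spec, 1 + n_fft // 2)
linear_mag_db = librosa.amplitude_to_db(linear_mag, ref=np.max)  # dB relative to the peak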
def compute(self, chunk, sampling_rate, corpus=None, utterance=None):
    # Clean up the rest if it's the first frame
    if chunk.offset == 0:
        self.rest = None

    # Compute mel spectrogram
    power_spec = np.abs(spectral.stft_from_frames(chunk.data.T)) ** 2
    mel = np.abs(librosa.feature.melspectrogram(S=power_spec, n_mels=self.n_mels, sr=sampling_rate))
    mel_power = librosa.power_to_db(mel)

    # Compute onset strengths
    oenv = librosa.onset.onset_strength(S=mel_power, center=False)

    # Remove context, otherwise we get duplicate frames during online processing
    oenv = oenv[chunk.left_context:]

    if self.rest is not None:
        all_frames = np.concatenate([self.rest, oenv])
    else:
        # It's the first chunk --> pad to center tempogram windows at the beginning
        all_frames = np.pad(oenv, (self.win_length // 2, 0), mode='linear_ramp', end_values=0)

    if chunk.is_last:
        # It's the last chunk --> pad to center tempogram windows at the end
        all_frames = np.pad(all_frames, (0, self.win_length // 2), mode='linear_ramp', end_values=0)
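For an offline reference, librosa exposes the same onset-strength-to-tempogram chain directly; win_length here mirrors self.win_length above (the value 384 is librosa's default, assumed here):

import librosa

y, sr = librosa.load(librosa.example("trumpet"))
oenv = librosa.onset.onset_strength(y=y, sr=sr)
tempogram = librosa.feature.tempogram(onset_envelope=oenv, sr=sr, win_length=384)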