def test(self):
    example = corpus.examples[0]
    mel_power_spectrogram = librosa.feature.melspectrogram(
        y=example.raw_audio(), n_fft=example.fourier_window_length,
        hop_length=example.hop_length, sr=example.sample_rate)
    self.assertTrue(np.array_equal(mel_power_spectrogram,
                                   example.spectrogram(type=SpectrogramType.power,
                                                       frequency_scale=SpectrogramFrequencyScale.mel)))
import librosa
import numpy as np
import tensorflow as tf  # TensorFlow 1.x API

def speech_to_text(wav_file):
    wav, sr = librosa.load(wav_file, mono=True)
    # Shape the MFCCs as (batch, time, n_mfcc) for the network input.
    mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(y=wav, sr=sr), axis=0), [0, 2, 1])
    logit = speech_to_text_network()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('.'))
        # CTC decoding expects time-major logits: (time, batch, classes).
        decoded = tf.transpose(logit, perm=[1, 0, 2])
        decoded, _ = tf.nn.ctc_beam_search_decoder(decoded, sequence_len, merge_repeated=False)
        # Dense form of the decode, with label ids shifted by one (unused below).
        predict = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1
        output = sess.run(decoded, feed_dict={X: mfcc})
        print(output)
        # output[0] is a SparseTensorValue; field 1 holds the decoded label ids.
        msg = ''.join([words[n] for n in output[0][1]])
        print(msg)
testing_file = "../../data/speech/train/wav/train/A11/A11_0.wav"
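A hedged usage sketch for the decoder above; it assumes the graph placeholders (X, sequence_len), the vocabulary words, and a trained checkpoint in the working directory, none of which are shown in this snippet:
# Hypothetical invocation; prints the raw decode and the recovered text.
speech_to_text(testing_file)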
def ext_mfcc_feature(url_path):
    """
    Return MFCC features and one-hot speaker labels.

    Each line of the list file is expected to read "<wav_path> <speaker_index>".
    """
    mfccs = []
    labels = []
    with open(url_path, 'r') as urls:
        for url in urls:
            url, label = url.split(" ")
            index = int(label.strip())  # int() is safer than eval() here
            label = np.zeros(config.N_SPEAKER)
            label[index] = 1
            y, sr = librosa.load(url)
            mfcc_ = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            # Stack static MFCCs with first- and second-order deltas.
            mfcc_delta = librosa.feature.delta(mfcc_, width=3)
            mfcc_delta_delta = librosa.feature.delta(mfcc_delta, width=3)
            mfcc = np.vstack([mfcc_, mfcc_delta, mfcc_delta_delta])
            mfcc = slide_windows(mfcc)
            for i in mfcc:
                mfccs.append(i)
                labels.append(label)
    return mfccs, labels
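A usage sketch for the loader above; the list-file layout ("<wav_path> <speaker_index>" per line) is inferred from the parsing code, and train_list.txt is a hypothetical name:
# train_list.txt (hypothetical), one entry per line, e.g.:
#   ../../data/speech/train/wav/train/A11/A11_0.wav 0
mfccs, labels = ext_mfcc_feature('train_list.txt')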
n_fft = int(args.window * SAMPLE_RATE / 1000.0)
hop_length = int(args.step * SAMPLE_RATE / 1000.0)
if 'mfe' == args.feature_type:
    if args.backend == 'speechpy':
        log_cut = 1e-8
        spec, energy = mfe(waveform, SAMPLE_RATE, frame_length=args.window*1e-3,
                           frame_stride=args.step*1e-3, num_filters=args.n_mels,
                           fft_length=n_fft)
        if args.energy:
            acoustic_features = np.hstack((spec, energy[:, np.newaxis]))
        else:
            acoustic_features = spec  # avoid an undefined name when --energy is off
        acoustic_features = np.log(acoustic_features + log_cut)
    else:
        spec = librosa.feature.melspectrogram(y=waveform, sr=SAMPLE_RATE, n_fft=n_fft,
                                              hop_length=hop_length, n_mels=args.n_mels)
        acoustic_features = librosa.core.amplitude_to_db(spec).transpose()
        if args.energy:
            energy = librosa.feature.rms(y=waveform, frame_length=n_fft,
                                         hop_length=hop_length).transpose()
            acoustic_features = np.hstack((acoustic_features, energy))
elif 'mfcc' == args.feature_type:
    if args.backend == 'speechpy':
        acoustic_features = mfcc(waveform, SAMPLE_RATE, frame_length=args.window*1e-3,
                                 frame_stride=args.step*1e-3, num_filters=args.n_mels,
                                 fft_length=n_fft, num_cepstral=args.n_mfcc)
    else:
        acoustic_features = librosa.feature.mfcc(y=waveform, sr=SAMPLE_RATE,
                                                 n_mfcc=args.n_mfcc, n_fft=n_fft,
                                                 hop_length=hop_length,
                                                 n_mels=args.n_mels).transpose()
        if args.energy:
            energy = librosa.feature.rms(y=waveform, frame_length=n_fft,
                                         hop_length=hop_length).transpose()
            acoustic_features = np.hstack((acoustic_features, energy))
elif 'lyon' == args.feature_type:
    from lyon.calc import LyonCalc
    lyon_calc = LyonCalc()
# Signature and docstring head reconstructed (the original snippet began
# mid-docstring); this matches librosa's internal helper used by trim/split.
def _signal_to_frame_nonsilent(y, frame_length=2048, hop_length=512,
                               top_db=60, ref=np.max):
    '''Frame-wise non-silent indicator for audio input.

    Parameters
    ----------
    top_db : number > 0
        The threshold (in decibels) below reference to consider as
        silence

    ref : callable or float
        The reference power

    Returns
    -------
    non_silent : np.ndarray, shape=(m,), dtype=bool
        Indicator of non-silent frames
    '''
    # Convert to mono
    y_mono = core.to_mono(y)

    # Compute the MSE for the signal
    mse = feature.rms(y=y_mono,
                      frame_length=frame_length,
                      hop_length=hop_length)**2

    # A frame counts as non-silent when its power is within top_db of ref.
    return (core.power_to_db(mse.squeeze(),
                             ref=ref,
                             top_db=None) > -top_db)
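This helper backs librosa's public trim and split functions, so the same frame-level silence test is reachable through that API. A brief sketch (the file name is hypothetical):
import librosa

y, sr = librosa.load('speech.wav')
# Drop leading/trailing audio more than 60 dB below the peak.
y_trimmed, interval = librosa.effects.trim(y, top_db=60,
                                           frame_length=2048, hop_length=512)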
def prediction(self, filepath):
    """Predict the label of a new audio input given its filepath."""
    try:
        DTFTarray, sampling_rate = librosa.load(filepath)
    except Exception as e:
        logger.error('[/!/] Librosa loading test file failed: %s', e)
        return False
    begin_silence = classify.get_silence(DTFTarray)
    end_silence = classify.get_silence(np.flipud(DTFTarray))
    logger.info('[.] Trimming the audio ...')
    DTFTarray_trimmed = DTFTarray[begin_silence: (len(DTFTarray) - end_silence)]
    # librosa.feature.mfcc (not "mfccs") returns 20 coefficients by default.
    mfccs = librosa.feature.mfcc(y=DTFTarray_trimmed, sr=sampling_rate)
    average = np.mean(mfccs, axis=1)
    # scikit-learn expects a 2-D array of shape (n_samples, n_features).
    features = average.reshape(1, -1)
    logger.info('[.] Predicting with the audio features ...')
    predicted = self.knn_model.predict(features)[0]
    inv_dataset_names = {value: key for key, value in self.dataset_names.items()}
    label_predicted = inv_dataset_names[predicted]
    print('[*] Prediction: The audio relates to', label_predicted)
    return True
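Assuming an object that carries a fitted knn_model and the dataset_names mapping (both defined elsewhere in this project), a call would look like this (instance and path are hypothetical):
classifier.prediction('recordings/sample.wav')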
import random

import librosa
import numpy as np

# The def line below is reconstructed; the original snippet began mid-docstring,
# and the function name `extract_logmel` is an assumption.
def extract_logmel(y, sr, size=3):
    """
    :param y: the input signal (audio time series)
    :param sr: sample rate of `y`
    :param size: the length (in seconds) of the random crop, 3 seconds by default
    :return: log-mel spectrogram feature
    """
    # Peak-normalize the signal to [-1, 1].
    y = y.astype(np.float32)
    normalization_factor = 1 / np.max(np.abs(y))
    y = y * normalization_factor
    # Take a random crop of `size` seconds.
    start = random.randint(0, len(y) - size * sr)
    y = y[start: start + size * sr]
    # Extract the log-mel spectrogram.
    melspectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048,
                                                    hop_length=1024, n_mels=60)
    logmelspec = librosa.power_to_db(melspectrogram)
    return logmelspec
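A quick usage sketch (the file name is hypothetical, and the function name above is reconstructed):
y, sr = librosa.load('dog_bark.wav')
logmel = extract_logmel(y, sr, size=3)  # shape (60, ~65) for a 3 s crop at 22050 Hz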
y_copy = np.append(y_copy, pad)
# Slide a 4-second window with a 2-second hop over the padded signal.
i = 0
segments = []
while (i + (4 * sr)) <= y_copy.shape[0]:
    segments.append(y_copy[i:(i + 4 * sr)])
    i = i + (2 * sr)
ref_spectrograms_segments = []
for seg in segments:
    S = librosa.feature.melspectrogram(y=seg, sr=sr, n_fft=1024, hop_length=1024, power=2)
    S = S[:, 0:128]  # keep at most the first 128 frames
    S_db = librosa.power_to_db(S, ref=np.max)
    ref_spec = np.array([S_db]).astype('float32')
    ref_spec = normalize_spectrogram(ref_spec)
    ref_spectrograms_segments.append(ref_spec)
ref_file_names.append(f)
ref_spectrograms.append(ref_spectrograms_segments)
ref_spectrograms_segments = []
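normalize_spectrogram is not defined in this snippet; one plausible stand-in, min-max scaling each dB spectrogram to [0, 1], is sketched below (an assumption, not the project's actual helper):
import numpy as np

def normalize_spectrogram(spec):
    # Hypothetical helper: min-max scale the dB spectrogram to [0, 1].
    spec_min, spec_max = spec.min(), spec.max()
    return (spec - spec_min) / (spec_max - spec_min + 1e-8)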
import librosa
import numpy as np
import soundfile

# Wrapper reconstructed: the original snippet began inside the function body,
# so the signature and the SoundFile context manager are assumptions.
def extract_feature(file_name, mfcc, chroma, mel, contrast, tonnetz):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma or contrast:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
            result = np.hstack((result, tonnetz))
    return result
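A usage sketch for the reconstructed function (file name hypothetical). With these flags, librosa's defaults yield 40 MFCCs + 12 chroma bins + 128 mel bands = 180 values:
features = extract_feature('speech.wav', mfcc=True, chroma=True, mel=True,
                           contrast=False, tonnetz=False)
print(features.shape)  # (180,)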
def featurize(wavfile):
    # Analysis parameters.
    hop_length = 512
    n_fft = 2048
    # Load the file.
    y, sr = librosa.load(wavfile)
    # Extract MFCC coefficients and their deltas.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13)
    mfcc_delta = librosa.feature.delta(mfcc)
    # Mean, standard deviation, min, and max of each coefficient, across all
    # 13 MFCCs and their 13 deltas. The original spelled out every one of the
    # 13 * 4 * 2 = 104 entries by hand (and was cut off partway through); the
    # comprehension below is the equivalent compact form.
    stats = (np.mean, np.std, np.amin, np.amax)
    mfcc_features = np.array([stat(coeff)
                              for coeffs in (mfcc, mfcc_delta)
                              for coeff in coeffs
                              for stat in stats])
    # Return statement assumed; the source was truncated before the function end.
    return mfcc_features
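Usage might look like this (file name hypothetical); the vector packs four statistics for each of the 13 MFCCs and 13 deltas:
features = featurize('sample.wav')
print(features.shape)  # (104,)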