target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
if os.path.exists(target_filename):
    continue

# print info
print("TEDLIUM corpus preprocessing (%d / %d) - '%s' (%.2f)" % (i, len(wave_files), wave_file, offset))

# load wave file, converting from sph if the wav is missing
if not os.path.exists(wave_file):
    sph_file = wave_file.rsplit('.', 1)[0]
    if os.path.exists(sph_file):
        convert_sph(sph_file, wave_file)
    else:
        raise RuntimeError("Missing sph file from TedLium corpus at %s" % sph_file)
wave, sr = librosa.load(wave_file, mono=True, sr=None, offset=offset, duration=dur)

# get mfcc feature
mfcc = librosa.feature.mfcc(wave, sr=16000)

# save result (skip mfcc sequences shorter than the label to prevent ctc loss errors)
if len(label) < mfcc.shape[1]:
    # save meta info
    writer.writerow([fn] + label)
    # save mfcc
    np.save(target_filename, mfcc, allow_pickle=False)
# ctc beam search decoding
decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]), seq_len, merge_repeated=False)

# to dense tensor
y = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1

#
# recognize wave file
#

# command line argument for input wave file path
tf.sg_arg_def(file=(filepath, 'speech wave file to recognize.'))

# load wave file
wav, _ = librosa.load(tf.sg_arg().file, mono=True, sr=16000)

# get mfcc feature
mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, 16000), axis=0), [0, 2, 1])

# run network
with tf.Session() as sess:
    # init variables
    tf.sg_init(sess)
    # restore parameters
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('asset/train'))
    # run session
    label = sess.run(y, feed_dict={x: mfcc})
    # print label
    # data.print_index(label)
def data2array(file):
    dic = {}
    i = 0
    for filename in file:
        y, sr = sf.read(path + filename, dtype='float32')
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
        max_mfcc = np.max(mfcc, axis=1)
        mins, maxs = np.min(max_mfcc), np.max(max_mfcc)
        scaled_mfcc = (max_mfcc - mins) / (maxs - mins)
        dic[i] = scaled_mfcc
        i += 1
    array = np.array(list(dic.values()))
    return array
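
# Minimal usage sketch for data2array. It reads every file relative to the
# module-level 'path' variable, so 'path' must already be set; the directory
# shown here is an assumption.
path = './audio_train/'
array = data2array(os.listdir(path))
print(array.shape)  # (n_files, 20): one min-max scaled 20-dim vector per clip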
def extract_feature(file_name):
    """Generates feature input (mfccs, chroma, mel, contrast, tonnetz).
    -*- author: mtobeiyf https://github.com/mtobeiyf/audio-classification -*-
    """
    X, sample_rate = sf.read(file_name, dtype='float32')
    if X.ndim > 1:
        X = X[:, 0]
    X = X.T
    X = np.asfortranarray(X)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
def mfcc_features(y, sr, n_mels=128, n_mfcc=13):
    """Extract MFCCs (Mel-Frequency Cepstral Coefficients)"""
    # Analyze only the first second
    y = y[0:sr]
    # Calculate MFCCs from the log-mel spectrogram
    mel_spectrum = librosa.feature.melspectrogram(y, sr=sr, n_mels=n_mels)
    log_spectrum = librosa.amplitude_to_db(mel_spectrum, ref=np.max)
    mfcc = librosa.feature.mfcc(S=log_spectrum, sr=sr, n_mfcc=n_mfcc)
    if mfcc.shape[-1] < DELTA_WIDTH:
        raise RuntimeError('MFCC vector does not contain enough time steps')
    if not mfcc.any():
        return np.zeros(n_mfcc * 3)
    # Add first- and second-order deltas, then average each over time
    delta_mfcc = librosa.feature.delta(mfcc, width=DELTA_WIDTH)
    delta2_mfcc = librosa.feature.delta(mfcc, order=2, width=DELTA_WIDTH)
    feature_vector = np.concatenate((
        np.mean(mfcc, 1),
        np.mean(delta_mfcc, 1),
        np.mean(delta2_mfcc, 1)))
    return feature_vector
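
# Minimal usage sketch for mfcc_features. DELTA_WIDTH is a module-level
# constant the snippet does not define; 9 (librosa's default delta width,
# which must be odd) is an assumed value, and 'clip.wav' is a hypothetical file.
DELTA_WIDTH = 9

y, sr = librosa.load('clip.wav')
vec = mfcc_features(y, sr)
print(vec.shape)  # (39,): 13 MFCC means + 13 delta means + 13 delta-delta means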
def get_mfcc_vector(sample, framelength=0.025, frameoverlap=0.0125, n_mfccs=13):
    fs, data = read(sample)
    samplevector = list()
    # .items() replaces the Python 2-only .iteritems()
    for frame, samples in get_frames(framelength, frameoverlap, fs, data).items():
        samplevector.append(librosa.feature.mfcc(samples, sr=fs, n_mfcc=n_mfccs))
    return numpy.array(samplevector)
def featurize(wavfile):
    # initialize features
    hop_length = 512
    n_fft = 2048
    # load file
    y, sr = librosa.load(wavfile)
    # extract mfcc coefficients
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13)
    mfcc_delta = librosa.feature.delta(mfcc)
    # extract mean, standard deviation, min, and max of each mfcc and
    # mfcc-delta coefficient across all frames
    mfcc_features = np.array([stat(row)
                              for row in np.vstack((mfcc, mfcc_delta))
                              for stat in (np.mean, np.std, np.amin, np.amax)])
    return mfcc_features
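
# Minimal usage sketch for featurize; 'speech.wav' is a hypothetical file.
feats = featurize('speech.wav')
print(feats.shape)  # (104,): 4 statistics x (13 MFCC rows + 13 delta rows)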
pass it through the TensorFlow model to extract the *features_list*
:param audio: String pointing to where the audio is located
:param sampling_rate: Sampling rate used when loading the audio (change it for down-sampling)
:return features: Extracted features per *audio* song
"""
if feature_type == 'MFCC':
    src_zeros = np.zeros(1024)  # min length to have 3-frame mfcc's
    src, sr = librosa.load(audio, sr=sampling_rate, duration=29.)  # max len: 29s, can be shorter
    if len(src) < 1024:
        src_zeros[:len(src)] = src
        src = src_zeros

    mfcc = librosa.feature.mfcc(src, sampling_rate, n_mfcc=20)
    # first- and second-order differences along the time axis
    dmfcc = mfcc[:, 1:] - mfcc[:, :-1]
    ddmfcc = dmfcc[:, 1:] - dmfcc[:, :-1]
    return np.concatenate((np.mean(mfcc, axis=1), np.std(mfcc, axis=1),
                           np.mean(dmfcc, axis=1), np.std(dmfcc, axis=1),
                           np.mean(ddmfcc, axis=1), np.std(ddmfcc, axis=1)),
                          axis=0)
@author: kimseunghyuck
"""
import librosa
import soundfile as sf
from matplotlib import pyplot as plt
import numpy as np
import os

path = '/Users/kimseunghyuck/desktop/audio_train/'
files = os.listdir(path)

# show one sample file
filename = files[0]
y, sr = sf.read(path + filename, dtype='float32')
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
#stft = librosa.core.stft(y=y)
#stft.shape  # 1025, 161
mfcc.shape  # 20, 161

# show second sample file
filename = files[1]
y, sr = sf.read(path + filename, dtype='float32')
#mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
stft = librosa.core.stft(y=y)
stft.shape  # 1025, 109
# 1025 is the number of frequency bins, 109 is the number of time frames
# padding every clip to the max length should make a CNN workable

# show graph
plt.figure(figsize=(15, 5))
plt.plot(mfcc)
def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),
                                              sr=sample_rate).T, axis=0)
    return mfccs, chroma, mel, contrast, tonnetz
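
# The five returned arrays are commonly stacked into one fixed-length vector
# per file; a minimal sketch, where 'clip.wav' is a hypothetical input.
mfccs, chroma, mel, contrast, tonnetz = extract_feature('clip.wav')
feature_vector = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
print(feature_vector.shape)  # (193,) = 40 + 12 + 128 + 7 + 6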