# Trim leading/trailing silence
if len(audio) > 0:
    audio, _ = librosa.effects.trim(audio)
# Truncate if the audio is longer than `samples`
if len(audio) > samples:
    audio = audio[:samples]
# Otherwise pad with blanks (centered) to reach `samples`
else:
    padding = samples - len(audio)
    offset = padding // 2
    audio = np.pad(audio, (offset, padding - offset), padmode)
# Compute the mel spectrogram of the audio
spectrogram = librosa.feature.melspectrogram(y=audio,
                                             sr=sampling_rate,
                                             n_mels=n_mels,
                                             hop_length=hop_length,
                                             n_fft=n_fft,
                                             fmin=fmin,
                                             fmax=fmax)
# Convert to log scale (dB)
spectrogram = librosa.power_to_db(spectrogram)
# Compute MFCCs and their second-order deltas
mfcc = librosa.feature.mfcc(S=spectrogram, n_mfcc=n_mfcc)
delta2_mfcc = librosa.feature.delta(mfcc, order=2)
# Stack the MFCC features onto the spectrogram and flatten
features = np.concatenate((spectrogram, mfcc, delta2_mfcc), axis=0)
X = features.ravel()
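# A minimal, hypothetical setup for the fragment above; the parameter values
# below (sampling_rate, n_mels, padmode, ...) are illustrative assumptions,
# not values taken from the original source.
import librosa
import numpy as np

sampling_rate = 22050
samples = 2 * sampling_rate          # keep two seconds of audio
n_mels, n_mfcc = 128, 20
n_fft, hop_length = 2048, 512
fmin, fmax = 20, sampling_rate // 2
padmode = 'constant'
audio, _ = librosa.load('example.wav', sr=sampling_rate)  # path is a placeholder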
def extract_features_from_waveforms(waveforms):
    """
    Extract log-scaled mel-spectrograms and their corresponding
    deltas from the audio waveforms (not the filenames).
    """
    log_specgrams = []
    for s in waveforms:
        sound_clip = shape_sound_clip(s)
        melspec = librosa.feature.melspectrogram(y=sound_clip, n_mels=120, n_fft=1024)
        logspec = librosa.power_to_db(melspec, ref=np.max)
        # Flatten to a single row vector: (1, n_mels * frames)
        logspec = logspec.T.flatten()[:, np.newaxis].T
        log_specgrams.append(logspec)
        del sound_clip, melspec, logspec
    # `bands` and `frames` are module-level constants describing the target shape
    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams), bands, frames, 1)
    return log_specgrams
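# Hypothetical usage of extract_features_from_waveforms; `bands` and `frames`
# are assumed module-level constants and must match the mel-spectrogram shape
# produced inside the loop (120 mel bands; the frame count depends on what
# shape_sound_clip returns). Paths and values here are placeholders.
bands, frames = 120, 41
waveforms = [librosa.load(p, sr=22050)[0] for p in ('a.wav', 'b.wav')]
features = extract_features_from_waveforms(waveforms)
print(features.shape)  # (num_clips, bands, frames, 1)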
# Trimming
y, _ = librosa.effects.trim(y)
# Short-time Fourier transform. D: (1 + n_fft//2, T)
D = librosa.stft(y=y,
                 n_fft=hp.n_fft,
                 hop_length=hp.hop_length,
                 win_length=hp.win_length)
# Magnitude spectrogram
magnitude = np.abs(D)  # (1 + n_fft//2, T)
# Power spectrogram
power = magnitude ** 2  # (1 + n_fft//2, T)
# Mel spectrogram
S = librosa.feature.melspectrogram(S=power, n_mels=hp.n_mels)  # (n_mels, T)
return np.transpose(S.astype(np.float32)), np.transpose(magnitude.astype(np.float32))  # (T, n_mels), (T, 1 + n_fft//2)
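# A sketch of how the fragment above could be wrapped into a callable function;
# the `hp` hyperparameter object and its values are assumptions here, and the
# function name get_mel_and_mag is made up for illustration.
from types import SimpleNamespace

hp = SimpleNamespace(sr=22050, n_fft=2048, hop_length=256, win_length=1024, n_mels=80)

def get_mel_and_mag(wav_path):
    y, _ = librosa.load(wav_path, sr=hp.sr)
    y, _ = librosa.effects.trim(y)
    D = librosa.stft(y=y, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length)
    magnitude = np.abs(D)
    S = librosa.feature.melspectrogram(S=magnitude ** 2, n_mels=hp.n_mels)
    return S.astype(np.float32).T, magnitude.astype(np.float32).T  # (T, n_mels), (T, 1 + n_fft//2)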
def __call__(self, y):
    return librosa.feature.melspectrogram(y=y, **self.__dict__)
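# A hypothetical enclosing class for the __call__ above: the constructor simply
# stores librosa.feature.melspectrogram keyword arguments on the instance, so
# **self.__dict__ forwards them on every call. Default values are assumptions.
class MelSpectrogram:
    def __init__(self, sr=22050, n_fft=2048, hop_length=512, n_mels=128):
        self.sr = sr
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.n_mels = n_mels
    def __call__(self, y):
        return librosa.feature.melspectrogram(y=y, **self.__dict__)
# e.g. mel = MelSpectrogram(n_mels=80)(y)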
def audio_read(f):
    y, sr = librosa.load(f, sr=22050)
    d = librosa.get_duration(y=y, sr=sr)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128)
    # Log-compress and transpose to (T, n_mels), then add a leading batch axis
    S = np.transpose(np.log(1 + 10000 * S))
    S = np.expand_dims(S, axis=0)
    return y, S, int(d)
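# Example call of audio_read (path is a placeholder): S has shape (1, T, 128)
# and the last value is the clip duration in whole seconds.
y, S, seconds = audio_read('speech.wav')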
wave, _ = librosa.effects.trim(wave)
# First pad the audio to the maximum length; we ensure the frame count is a
# multiple of 4*r so it works with the maximum number of frames.
assert math.ceil(maximum_audio_length / hop_length) % (4 * r) == 0
if wave.shape[0] <= maximum_audio_length:
    wave = np.pad(wave,
                  (0, maximum_audio_length - wave.shape[0]), 'constant', constant_values=0)
else:
    return None, None
# Pre-emphasis: first-order high-pass filter that boosts high frequencies
pre_emphasis = 0.97
wave = np.append(wave[0], wave[1:] - pre_emphasis * wave[:-1])
stft = librosa.stft(wave, n_fft=n_fft, win_length=win_length, hop_length=hop_length)
# Project the magnitude spectrogram (not the complex STFT) onto the mel basis
mel = librosa.feature.melspectrogram(S=np.abs(stft), n_mels=80)
stft = np.log(np.abs(stft) + 1e-8)
mel = np.log(np.abs(mel) + 1e-8)
stft = reshape_frames(stft)
mel = reshape_frames(mel)
return mel, stft
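# The pre-emphasis step above, factored into a small reusable helper
# (coefficient 0.97 as in the fragment); it is a first-order high-pass filter
# applied to the waveform before spectral analysis.
def pre_emphasize(x, coeff=0.97):
    return np.append(x[0], x[1:] - coeff * x[:-1])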
def extract_melspec(in_fp, sr, win_size, hop_size, n_mels):
    sig, sr = librosa.load(in_fp, sr=sr)
    feat = librosa.feature.melspectrogram(y=sig, sr=sr,
                                          n_fft=win_size,
                                          hop_length=hop_size,
                                          n_mels=n_mels).T
    # Log compression: log(1 + 10000 * x) stays finite for silent frames
    feat = np.log(1 + 10000 * feat)
    return feat
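# Example call of extract_melspec (path and parameter values are illustrative):
feat = extract_melspec('clip.wav', sr=22050, win_size=2048, hop_size=512, n_mels=128)
print(feat.shape)  # (T, n_mels)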
def generate(y, sample_rate=SAMPLE_RATE, use_mel=True, use_stft=False):
    feature_stack = []
    numfeat = 0
    # stft ----------------------------------------------------
    stft = librosa.stft(y, n_fft=2048, win_length=1200, hop_length=256)
    if use_mel:
        # mel spectrogram ------------------------------------------
        # project the magnitude spectrogram (not the complex STFT) onto the mel basis
        mel = librosa.feature.melspectrogram(S=np.abs(stft), n_mels=80)
        mel = np.log(np.abs(mel) + 1e-8)
        feature_stack.extend(_featuring(mel))
        numfeat += 80 * 9
    if use_stft:
        stft = np.log(np.abs(stft) + 1e-8)
        feature_stack.extend(_featuring(stft))
        numfeat += 1025 * 9
    # mfcc -----------------------------------
    # compute MFCCs directly from the waveform so n_fft/hop_length take effect
    vec = librosa.feature.mfcc(y=y, sr=sample_rate, n_mfcc=20, n_fft=512, hop_length=256)
    feature_stack.extend(_featuring(vec))
    # chroma_cqt -----------------------------------
    cqt = librosa.feature.chroma_cqt(y=y, sr=sample_rate, n_chroma=12, hop_length=256)
    cqt = np.log(np.abs(cqt) + 1e-8)  # log scale
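# _featuring is not shown in this snippet; the "80 * 9" / "1025 * 9" bookkeeping
# suggests it stacks a 9-frame context window per bin. A purely hypothetical
# version of such a helper, only to illustrate that assumption:
def _featuring(spec, context=9):
    # spec: (n_bins, T) -> list of T frame-stacked vectors of length n_bins * context
    half = context // 2
    padded = np.pad(spec, ((0, 0), (half, half)), mode='edge')
    return [padded[:, t:t + context].ravel() for t in range(spec.shape[1])]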
def preprocess_audio(audio_path):
    '''
    Returns features (time_steps, nb_features) and the sequence length (scalar).
    '''
    y, sr = librosa.load(audio_path)
    # 40-band mel spectrogram with a 25 ms window and 10 ms hop
    S = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=int(1e-2 * sr), n_fft=int(25e-3 * sr), n_mels=40)
    d = librosa.feature.delta(S)
    dd = librosa.feature.delta(S, order=2)
    # Log-energy of each feature block (librosa.feature.rmse was renamed to rms)
    S_e = np.log(librosa.feature.rms(S=S))
    d_e = np.log(librosa.feature.rms(S=d))
    dd_e = np.log(librosa.feature.rms(S=dd))
    return np.vstack((S, d, dd, S_e, d_e, dd_e)).T, S.shape[1]
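# Example call of preprocess_audio (path is a placeholder); feats has shape
# (T, 123): 40 mel bands, their deltas and delta-deltas, plus three log-energy rows.
feats, seq_len = preprocess_audio('utterance.wav')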