# Shared imports for the snippets below
import sys

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sounddevice
import torch

if is_energy:  # assumed flag; the opening `if` of this branch is truncated in the original snippet
    energy_output = energy.astype(np.uint16)
else:
    energy_output = np.zeros(2).astype(np.uint16)
# fft or mel
if is_fft or is_mel:
    global sample_rate
    # down-sample by 4, with filtering; energy is not rescaled
    data_np = librosa.resample(data_np,
                               orig_sr=sample_rate,
                               target_sr=sample_rate / 4,
                               res_type='kaiser_fast')
    # short-time FFT over n_fft samples, non-overlapping frames
    fft_data = librosa.stft(data_np, n_fft=n_fft,
                            hop_length=n_fft,
                            center=False)
    # calculate FFT or mel
    if is_fft:
        fft_data_mag = np.abs(fft_data[0:n_fft // 2]) ** 2
        fft_data_mag *= 2 ** 3  # scale up before integer conversion
        fft_output = get_output_fft_bins(fft_data_mag, n_out_bins)
    else:
        fft_data_mag = np.abs(fft_data) ** 2
        fft_data_mag *= 2 ** 2  # scale up before integer conversion
        mel_data = librosa.feature.melspectrogram(S=fft_data_mag, sr=sample_rate / 4,
                                                  n_mels=n_mel)
        fft_output = get_output_fft_bins(mel_data, n_out_bins)
    # output uint8_t
    fft_output = fft_output.astype(np.uint8)
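# `get_output_fft_bins` is not defined in this snippet. A minimal sketch,
# assuming it sums groups of spectrum rows down to `n_out_bins` output bins
# (name and behavior are assumptions, not the original helper):
def get_output_fft_bins(spec, n_out_bins):
    edges = np.linspace(0, spec.shape[0], n_out_bins + 1, dtype=int)
    return np.stack([spec[s:e].sum(axis=0) for s, e in zip(edges[:-1], edges[1:])])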
def extract_features(features):
    # record `duration` seconds of mono audio; duration/sample_rate are globals
    X = sounddevice.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    sounddevice.wait()
    X = np.squeeze(X)
    stft = np.abs(librosa.stft(X))
    mfccs = np.array(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=8).T)
    chroma = np.array(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T)
    mel = np.array(librosa.feature.melspectrogram(y=X, sr=sample_rate).T)
    contrast = np.array(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T)
    tonnetz = np.array(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T)
    ext_features = np.hstack([mfccs, chroma, mel, contrast, tonnetz])
    # stack the new frames onto the matrix passed in (the original snippet
    # referenced an undefined `features`, so it is taken as a parameter here)
    features = np.vstack([features, ext_features])
    return features
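# Usage sketch (assumed values; `duration` and `sample_rate` are globals used
# by extract_features, and the column width assumes librosa's defaults of
# n_mels=128 and n_bands=6):
duration = 3
sample_rate = 22050
features = np.empty((0, 8 + 12 + 128 + 7 + 6))  # mfcc + chroma + mel + contrast + tonnetz
features = extract_features(features)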
def read_audio_spectum(filename, n_fft=2048, hop_length=512, sr=22050):
    x, sr = librosa.load(filename, sr=sr)
    S = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)
    S = np.log1p(np.abs(S)).T  # log(1 + magnitude), frames-first
    return S, sr
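# Usage sketch (hypothetical file path):
S, sr = read_audio_spectum('example.wav')
print(S.shape)  # (n_frames, 1 + n_fft // 2)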
assert sample_rate_ == sample_rate, \
    'Sample rate of %s (%d) does not match the expected sample rate (%d)' \
    % (audio_path, sample_rate_, sample_rate)
sound = sound.numpy()
if len(sound.shape) > 1:
    if sound.shape[1] == 1:
        sound = sound.squeeze()
    else:
        sound = sound.mean(axis=1)  # average multiple channels
n_fft = int(sample_rate * window_size)
win_length = n_fft
hop_length = int(sample_rate * window_stride)
# STFT
d = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length,
                 win_length=win_length, window=window)
spect, _ = librosa.magphase(d)
spect = np.log1p(spect)
spect = torch.FloatTensor(spect)
if normalize_audio:
    mean = spect.mean()
    std = spect.std()
    spect.add_(-mean)
    spect.div_(std)
return spect
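# Inverting the transform above is the mirror image: undo the normalization,
# then expm1 reverses log1p (a sketch, assuming `mean` and `std` were kept):
mag = torch.expm1(spect * std + mean)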
def stft(y):
    return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
def griffin_lim(mag, phase_angle, n_fft, hop, num_iters):
    """Iterative Griffin-Lim phase retrieval from a magnitude spectrogram.

    Args:
        mag: Magnitude spectrogram.
        phase_angle: Initial condition for phase.
        n_fft: Size of the FFT.
        hop: Stride of FFT. Defaults to n_fft/2.
        num_iters: Griffin-Lim iterations to perform.

    Returns:
        audio: 1-D array of float32 sound samples.
    """
    fft_config = dict(n_fft=n_fft, win_length=n_fft, hop_length=hop, center=True)
    ifft_config = dict(win_length=n_fft, hop_length=hop, center=True)
    complex_specgram = inv_magphase(mag, phase_angle)
    for i in range(num_iters):
        audio = librosa.istft(complex_specgram, **ifft_config)
        if i != num_iters - 1:
            # re-estimate phase from the resynthesized audio, keep the target magnitude
            complex_specgram = librosa.stft(audio, **fft_config)
            _, phase = librosa.magphase(complex_specgram)
            phase_angle = np.angle(phase)
            complex_specgram = inv_magphase(mag, phase_angle)
    return audio
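# `inv_magphase` is referenced above but not defined in this snippet. A
# minimal sketch, assuming it recombines a magnitude spectrogram and a phase
# angle into a complex spectrogram:
def inv_magphase(mag, phase_angle):
    return mag * np.exp(1.j * phase_angle)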
def spectrogram(x, n_fft, n_shift,
                win_length=None, window='hann'):
    spc = np.abs(librosa.stft(x, n_fft=n_fft, hop_length=n_shift,
                              win_length=win_length, window=window)).T
    return spc
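# Usage sketch, on librosa's bundled example clip:
y, sr = librosa.load(librosa.ex('trumpet'))
spc = spectrogram(y, n_fft=1024, n_shift=256)  # shape: (n_frames, 513)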
filename = '/Users/avin/git/vc/outputs/male.wav'
sr = 22050
n_fft = 1024
len_hop = n_fft // 4  # hop length must be an integer
plot_wav = False
plot_spec = True
# Waveform (the original used a local read() helper; librosa.load is equivalent here)
wav, _ = librosa.load(filename, sr=sr, mono=True)
# wav = np.where(wav == 0, 1000, wav)
# wav = np.zeros_like(wav)
# wav[0] = np.ones_like(wav[0])
# Spectrogram
spec = librosa.stft(wav, n_fft=n_fft, hop_length=len_hop)
# Plot waveform
if plot_wav:
    plt.figure(1)
    librosa.display.waveshow(wav, sr=sr, color='b')  # waveplot was removed in librosa 0.10
    plt.title('waveform')
    plt.tight_layout()
    plt.show()
# Plot spectrogram
if plot_spec:
    plt.figure(2)
    librosa.display.specshow(librosa.amplitude_to_db(np.abs(spec), ref=np.max),
                             sr=sr, hop_length=len_hop, y_axis='linear', x_axis='time')
    plt.title('spectrogram')
    plt.tight_layout()
    plt.show()
def _get_mfcc_log_spec_and_log_mel_spec(wav, preemphasis_coeff, n_fft, win_length, hop_length):
    '''
    Args:
        wav - waveform loaded with librosa
    Returns:
        mfccs - MFCC coefficients
        mag - log-magnitude spectrogram
        mel - log mel spectrogram
    '''
    # Pre-emphasis
    y_preem = preemphasis(wav, coeff=preemphasis_coeff)
    # Get spectrogram
    D = librosa.stft(y=y_preem, n_fft=n_fft,
                     hop_length=hop_length, win_length=win_length)
    mag = np.abs(D)
    # Get mel-spectrogram (hp.Default holds project hyperparameters)
    mel_basis = librosa.filters.mel(
        sr=hp.Default.sr, n_fft=hp.Default.n_fft, n_mels=hp.Default.n_mels)  # (n_mels, 1 + n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, t) mel spectrogram
    # Get MFCCs
    db = librosa.amplitude_to_db(mel)
    mfccs = np.dot(librosa.filters.dct(hp.Default.n_mfcc, db.shape[0]), db)
    # Log
    mag = np.log(mag + sys.float_info.epsilon)
    mel = np.log(mel + sys.float_info.epsilon)
    # Normalization (truncated in the original snippet)
    return mfccs, mag, mel
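# `preemphasis` is not defined in this snippet. A standard first-order
# pre-emphasis filter, assuming that is what the original helper does:
def preemphasis(y, coeff=0.97):
    return np.append(y[0], y[1:] - coeff * y[:-1])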
def featurize_file(audio_filename, label_filename):
    df_labels = pd.read_csv(label_filename)
    y, sr = librosa.load(audio_filename)
    spectrogram = np.abs(librosa.stft(y))
    chroma = librosa.feature.chroma_stft(S=spectrogram, sr=sr)
    file_duration = len(y) / sr
    chroma_per_second = chroma.shape[1] / file_duration
    features = []
    # each label row gives a (start, end) window in seconds; map it to chroma frames
    for idx, seconds_start, seconds_end, label in df_labels.itertuples():
        chroma_start_idx = int(np.round(seconds_start * chroma_per_second))
        chroma_end_idx = int(np.round(seconds_end * chroma_per_second))
        chroma_segment = chroma[:, chroma_start_idx:chroma_end_idx]
        features.append(featurize_audio_segment(chroma_segment))
    note_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
    feature_names = [f'chroma-{note}' for note in note_names]
    df = pd.DataFrame(features, columns=feature_names)
    df['chord'] = df_labels['chord']
    return df
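# `featurize_audio_segment` is not defined in this snippet. A minimal sketch,
# assuming it averages each chroma bin over the segment, yielding the 12
# values that match the `chroma-<note>` columns above:
def featurize_audio_segment(chroma_segment):
    return chroma_segment.mean(axis=1)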