def test_reverse(self):
    x_orig, _ = torchaudio.load(self.test_filepath)

    # apply sox's "reverse" effect through the (deprecated) SoxEffectsChain API
    E = torchaudio.sox_effects.SoxEffectsChain()
    E.set_input_file(self.test_filepath)
    E.append_effect_to_chain("reverse", "")
    x_rev, _ = E.sox_build_flow_effects()

    # the reversed signal should match the original indexed backwards in time
    rev_idx = torch.LongTensor(range(x_orig.size(1))[::-1])
    self.assertTrue(x_orig.allclose(x_rev[:, rev_idx], rtol=1e-5, atol=2e-5))
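# A sketch of the same check against torchaudio's newer sox interface: since
# torchaudio 0.7, SoxEffectsChain is replaced by apply_effects_file, which takes
# the effect chain as a list of string lists. Untested here; shown for comparison.
def test_reverse_apply_effects_file(self):
    x_orig, _ = torchaudio.load(self.test_filepath)
    x_rev, _ = torchaudio.sox_effects.apply_effects_file(
        self.test_filepath, effects=[["reverse"]]
    )
    rev_idx = torch.arange(x_orig.size(1) - 1, -1, -1)
    self.assertTrue(x_orig.allclose(x_rev[:, rev_idx], rtol=1e-5, atol=2e-5))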
def test_batch_pitch(self):
    waveform, sample_rate = torchaudio.load(self.test_filepath)

    # apply the transform to a single item, then replicate the result as a batch
    expected = F.detect_pitch_frequency(waveform, sample_rate)
    expected = expected.unsqueeze(0).repeat(3, 1, 1)

    # replicate the item into a batch first, then apply the transform once
    waveform = waveform.unsqueeze(0).repeat(3, 1, 1)
    computed = F.detect_pitch_frequency(waveform, sample_rate)

    # both orders must agree
    self.assertTrue(computed.shape == expected.shape, (computed.shape, expected.shape))
    self.assertTrue(torch.allclose(computed, expected))
    _test_torchscript_functional(F.detect_pitch_frequency, waveform, sample_rate)
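# A minimal standalone sketch of the property the test above relies on:
# F.detect_pitch_frequency accepts leading batch dimensions, so a
# (batch, channel, time) input yields a (batch, channel, n_frames) output.
# The 440 Hz tone and one-second duration are illustrative assumptions.
import math

import torch
import torchaudio.functional as F

sample_rate = 16000
t = torch.arange(0, 1, 1 / sample_rate)
tone = torch.sin(2 * math.pi * 440 * t)  # one second of a 440 Hz sine
batch = tone.repeat(3, 1, 1)             # shape (batch=3, channel=1, time)
pitch = F.detect_pitch_frequency(batch, sample_rate)
print(pitch.shape)  # (3, 1, n_frames); values should sit near 440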
import os

import torchaudio


def load_yesno_item(fileid, path, ext_audio):
    # Parse the labels from the file name, e.g. "0_1_0_0_1_0_1_0" -> [0, 1, 0, 0, 1, 0, 1, 0]
    labels = [int(c) for c in fileid.split("_")]

    # Read wav
    file_audio = os.path.join(path, fileid + ext_audio)
    waveform, sample_rate = torchaudio.load(file_audio)
    return waveform, sample_rate, labels
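# A hypothetical call: YESNO file ids are eight 0/1 digits joined by
# underscores, and "waves_yesno" is the dataset's usual extraction folder.
waveform, sample_rate, labels = load_yesno_item("0_1_0_0_1_0_1_0", "waves_yesno", ".wav")
# labels == [0, 1, 0, 0, 1, 0, 1, 0]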
def load_vctk_item(
    fileid, path, ext_audio, ext_txt, folder_audio, folder_txt, downsample=False
):
    speaker_id, utterance_id = fileid.split("_")

    # Read text
    file_txt = os.path.join(path, folder_txt, speaker_id, fileid + ext_txt)
    with open(file_txt) as file_text:
        utterance = file_text.readlines()[0]

    # Read wav
    file_audio = os.path.join(path, folder_audio, speaker_id, fileid + ext_audio)
    waveform, sample_rate = torchaudio.load(file_audio)
    if downsample:
        # TODO Remove this parameter after deprecation
        F = torchaudio.functional
        T = torchaudio.transforms

        # resample to 16 kHz and update the returned rate to match
        sample = T.Resample(sample_rate, 16000, resampling_method='sinc_interpolation')
        waveform = sample(waveform)
        sample_rate = 16000

        # dither to smooth quantization error after resampling
        waveform = F.dither(waveform, noise_shaping=True)

    return waveform, sample_rate, utterance, speaker_id, utterance_id
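# A hypothetical call, assuming VCTK's "<speaker>_<utterance>" file ids and its
# standard layout ("wav48" for audio, "txt" for transcripts); the root path is
# a placeholder.
waveform, sample_rate, utterance, speaker_id, utterance_id = load_vctk_item(
    "p225_001", "VCTK-Corpus", ".wav", ".txt", "wav48", "txt"
)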
from math import ceil

import numpy as np
import torchaudio


def load_audio(path, frame_start=0, frame_end=-1):
    sound, _ = torchaudio.load(path)
    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)  # multiple channels: average them
    if frame_end > 0 or frame_start > 0:
        assert frame_start < frame_end, "slicing does not yet support inverting audio"
        if frame_end > sound.shape[0]:
            # loop the clip until it is long enough to cover frame_end
            repeats = ceil((frame_end - sound.shape[0]) / float(sound.shape[0]))
            appendage = sound
            for _ in range(int(repeats)):
                sound = np.concatenate((sound, appendage))
        sound = sound[frame_start:frame_end]
    return sound
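# Usage sketch with a placeholder path: when the requested slice runs past the
# end of the clip, load_audio loops the signal until frame_end is covered, so
# the returned array always has frame_end - frame_start samples.
clip = load_audio("example.wav", frame_start=0, frame_end=48000)
assert clip.shape[0] == 48000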
# Augment, Spectrogram and params come from the surrounding project
transformer = Augment(resample=True, sample_rate=params.SAMPLE_RATE)
wav_file = Path("/home/jbaik/src/enf/stt/test/conan1-8k.wav")
audio = transformer(wav_file)

# test Spectrogram
if True:
    import matplotlib
    matplotlib.use('TkAgg')
    matplotlib.interactive(True)
    import matplotlib.pyplot as plt

    nperseg = int(params.SAMPLE_RATE * params.WINDOW_SIZE)
    noverlap = int(params.SAMPLE_RATE * (params.WINDOW_SIZE - params.WINDOW_SHIFT))
    wav_file = Path("../data/aspire/000/fe_03_00047-A-025005-025135.wav")
    audio, _ = torchaudio.load(wav_file)

    # reference: pyplot's own specgram
    audio = torch.squeeze(audio)
    fig = plt.figure(0)
    plt.specgram(audio, Fs=params.SAMPLE_RATE, NFFT=params.NFFT, noverlap=noverlap, cmap='plasma')

    # the project's Spectrogram transformer (scipy-stft based)
    transformer = Spectrogram(sample_rate=params.SAMPLE_RATE, window_stride=params.WINDOW_SHIFT,
                              window_size=params.WINDOW_SIZE, nfft=params.NFFT)
    data, f, t = transformer(audio)
    print(data.shape)

    # plot both channels of the transform output
    mag = data[0]
    fig = plt.figure(1)
    plt.pcolormesh(t, f, np.log10(np.expm1(data[0])), cmap='plasma')
    fig = plt.figure(2)
    plt.pcolormesh(t, f, data[1], cmap='plasma')
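# For comparison, a minimal sketch using torchaudio's built-in transform instead
# of the project's scipy-based Spectrogram; the second-based window sizes are
# converted to samples the same way as nperseg/noverlap above.
import torchaudio.transforms as T

win_length = int(params.SAMPLE_RATE * params.WINDOW_SIZE)
hop_length = int(params.SAMPLE_RATE * params.WINDOW_SHIFT)
spec = T.Spectrogram(n_fft=params.NFFT, win_length=win_length, hop_length=hop_length)
power_spec = spec(audio)  # power spectrogram of shape (freq_bins, time_frames)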
def load_torchaudio(fp):
    sig, _ = torchaudio.load(fp)
    return sig
def read_wav(fname, return_rate=False):
    '''
    Read a wav file using torchaudio.
    input:
        fname: path to the wav file
        return_rate: whether to also return the sampling rate
    output:
        src: tensor of size C x L, where
            C is the number of channels and
            L is the number of audio frames
            (singleton dimensions are squeezed out)
        sr: sample rate, returned only when return_rate is True
    '''
    src, sr = torchaudio.load(fname, channels_first=True)
    if return_rate:
        return src.squeeze(), sr
    else:
        return src.squeeze()
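# Usage sketch with a placeholder path: a mono file comes back as a 1-D tensor
# because of the squeeze; pass return_rate=True to also get the sample rate.
src, sr = read_wav("example.wav", return_rate=True)
print(src.shape, sr)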
def read_audio(fp, trim_silence=False):
    if trim_silence:
        # trim leading silence, reverse, trim the (now leading) trailing
        # silence, then reverse back
        E = torchaudio.sox_effects.SoxEffectsChain()
        E.set_input_file(fp)
        E.append_effect_to_chain("silence", [1, 100, 1])
        E.append_effect_to_chain("reverse")
        E.append_effect_to_chain("silence", [1, 100, 1])
        E.append_effect_to_chain("reverse")
        sig, sample_rate = E.sox_build_flow_effects()
    else:
        sig, sample_rate = torchaudio.load(fp)
    sig = sig.contiguous()
    return sig, sample_rate
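# A sketch of the same trim against the newer sox interface (torchaudio >= 0.7),
# where the chain is passed as lists of strings to apply_effects_file; the
# silence parameters mirror the chain above and are assumptions, not tuned values.
def read_audio_new(fp, trim_silence=False):
    if trim_silence:
        effects = [
            ["silence", "1", "100", "1"],  # trim leading silence
            ["reverse"],
            ["silence", "1", "100", "1"],  # after reversing, trim trailing silence
            ["reverse"],
        ]
        sig, sample_rate = torchaudio.sox_effects.apply_effects_file(fp, effects)
    else:
        sig, sample_rate = torchaudio.load(fp)
    return sig.contiguous(), sample_rate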