How to use the librosa.feature module in librosa

To help you get started, we’ve selected a few librosa.feature examples based on popular ways it is used in public projects.
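
If you just want a quick feel for the module before reading the project snippets, here is a minimal, hedged sketch: the file path is a placeholder, and the parameters are common defaults rather than anything taken from the projects below.

import librosa

# "audio.wav" is a placeholder path; librosa.load resamples to 22050 Hz mono by default.
y, sr = librosa.load("audio.wav")

# Mel power spectrogram, shape (n_mels, frames); power_to_db converts it to a log scale.
S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128)
log_S = librosa.power_to_db(S)

# 13 MFCCs per frame, shape (13, frames).
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
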


github JuliusKunze / speechless / test / test_labeled_example.py
def test(self):
        example = corpus.examples[0]
        mel_power_spectrogram = librosa.feature.melspectrogram(
            y=example.raw_audio(), n_fft=example.fourier_window_length, hop_length=example.hop_length,
            sr=example.sample_rate)

        self.assertTrue(np.array_equal(mel_power_spectrogram,
                                       example.spectrogram(type=SpectrogramType.power,
                                                           frequency_scale=SpectrogramFrequencyScale.mel)))
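
The test above only passes if the keyword arguments given to librosa.feature.melspectrogram match the parameters used to build the reference spectrogram. As a rough sanity check (the numbers below are assumptions, not taken from the project), the output frame count follows directly from hop_length because librosa centers and pads each frame:

import numpy as np
import librosa

sr, n_fft, hop_length = 16000, 512, 160          # hypothetical analysis parameters
y = np.zeros(sr, dtype=np.float32)               # one second of silence as a stand-in signal

S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length)
print(S.shape)                                   # (128, 1 + len(y) // hop_length) == (128, 101)
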
github hooloong / My_TensorFlow / Test15_chinessespeech / test15_speech_val.py
def speech_to_text(wav_file):
    wav, sr = librosa.load(wav_file, mono=True)
    mfcc = np.transpose(np.expand_dims(librosa.feature.mfcc(wav, sr), axis=0), [0, 2, 1])

    logit = speech_to_text_network()

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('.'))

        decoded = tf.transpose(logit, perm=[1, 0, 2])
        decoded, _ = tf.nn.ctc_beam_search_decoder(decoded, sequence_len, merge_repeated=False)
        predict = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape, decoded[0].values) + 1
        output = sess.run(decoded, feed_dict={X: mfcc})
        print(output)
        msg = ''.join([words[n] for n in output[0][1]])
        print(msg)
testing_file = "../../data/speech/train/wav/train/A11/A11_0.wav"
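
librosa.feature.mfcc returns an array of shape (n_mfcc, frames), so snippets like the one above transpose it and add a batch axis before feeding a sequence model. A minimal sketch of that reshaping, with a placeholder file path:

import numpy as np
import librosa

wav, sr = librosa.load("speech.wav", mono=True)    # placeholder path
mfcc = librosa.feature.mfcc(y=wav, sr=sr)          # (n_mfcc, frames); n_mfcc defaults to 20
batch = mfcc.T[np.newaxis, ...]                    # (1, frames, n_mfcc) for a time-major model input
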
github bjfu-ai-institute / speaker-recognition-papers / DeepSpeaker / speech_processing.py
def ext_mfcc_feature(url_path):
    """
    Return the MFCC feature
    """
    with open(url_path, 'r') as urls:
        mfccs = []
        labels = []
        for url in list(urls):
            url, label = str(url).split(" ")
            index = int(label.strip())  # label is an integer speaker index
            label = np.zeros(config.N_SPEAKER)
            label[index] = 1
            y, sr = librosa.load(url)
            mfcc_ = librosa.feature.mfcc(y, sr, n_mfcc=13)
            mfcc_delta = librosa.feature.delta(mfcc_, width=3)
            mfcc_delta_delta = librosa.feature.delta(mfcc_delta, width=3)
            mfcc = np.vstack([mfcc_, np.vstack([mfcc_delta, mfcc_delta_delta])])
            mfcc = slide_windows(mfcc)
            for i in mfcc:
                mfccs.append(i)
                labels.append(label)
        return mfccs, labels
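
A common variant of the stacking above is to let librosa compute the second-order differences directly with order=2 instead of applying delta twice. The sketch below uses a placeholder path and the same narrow width as the snippet, producing the usual 39-dimensional MFCC + delta + delta-delta frames:

import numpy as np
import librosa

y, sr = librosa.load("utterance.wav")                      # placeholder path
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
delta = librosa.feature.delta(mfcc, width=3)
delta2 = librosa.feature.delta(mfcc, width=3, order=2)     # second-order difference in one call
stacked = np.vstack([mfcc, delta, delta2])                 # shape (39, frames)
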
github sciforce / phones-las / preprocess_all.py
n_fft = int(args.window*SAMPLE_RATE/1000.0)
    hop_length = int(args.step * SAMPLE_RATE / 1000.0)
    if 'mfe' == args.feature_type:
        if args.backend=='speechpy':
            log_cut = 1e-8
            spec, energy = mfe(waveform, SAMPLE_RATE, frame_length=args.window*1e-3,
                frame_stride=args.step*1e-3, num_filters=args.n_mels, fft_length=n_fft)
            if args.energy:
                acoustic_features = np.hstack((spec, energy[:, np.newaxis]))
            acoustic_features = np.log(acoustic_features + log_cut)
        else:
            spec = librosa.feature.melspectrogram(y=waveform, sr=SAMPLE_RATE, n_fft=n_fft, 
                hop_length=hop_length, n_mels=args.n_mels)
            acoustic_features = librosa.core.amplitude_to_db(spec).transpose()
            if args.energy:
                energy = librosa.feature.rms(y=waveform, frame_length=n_fft, hop_length=hop_length).transpose()
                acoustic_features = np.hstack((acoustic_features, energy))
    elif 'mfcc' == args.feature_type:
        if args.backend=='speechpy':
            acoustic_features = mfcc(waveform, SAMPLE_RATE, frame_length=args.window*1e-3,
                frame_stride=args.step*1e-3, num_filters=args.n_mels, fft_length=n_fft,
                num_cepstral = args.n_mfcc)
        else:
            acoustic_features = librosa.feature.mfcc(y=waveform, sr=SAMPLE_RATE, n_mfcc=args.n_mfcc,
                n_fft=n_fft, hop_length=hop_length, n_mels=args.n_mels).transpose()
            if args.energy:
                energy = librosa.feature.rms(y=waveform, frame_length=n_fft, hop_length=hop_length).transpose()
                acoustic_features = np.hstack((acoustic_features, energy))
    elif 'lyon' == args.feature_type:
        from lyon.calc import LyonCalc
        lyon_calc = LyonCalc()
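
In the snippet above the window and step arrive in milliseconds, so they are first converted into STFT samples. A hedged sketch of the same conversion with made-up values; note that melspectrogram returns a power spectrogram, so power_to_db is the matching dB conversion:

import librosa

SAMPLE_RATE = 16000
window_ms, step_ms = 25.0, 10.0                            # hypothetical window and stride

n_fft = int(window_ms * SAMPLE_RATE / 1000.0)              # 400 samples
hop_length = int(step_ms * SAMPLE_RATE / 1000.0)           # 160 samples

y, _ = librosa.load("clip.wav", sr=SAMPLE_RATE)            # placeholder path
spec = librosa.feature.melspectrogram(y=y, sr=SAMPLE_RATE, n_fft=n_fft,
                                      hop_length=hop_length, n_mels=40)
acoustic_features = librosa.power_to_db(spec).T            # (frames, n_mels)
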
github librosa / librosa / librosa / effects.py
The threshold (in decibels) below reference to consider as
        silence

    ref : callable or float
        The reference power

    Returns
    -------
    non_silent : np.ndarray, shape=(m,), dtype=bool
        Indicator of non-silent frames
    '''
    # Convert to mono
    y_mono = core.to_mono(y)

    # Compute the MSE for the signal
    mse = feature.rms(y=y_mono,
                      frame_length=frame_length,
                      hop_length=hop_length)**2

    return (core.power_to_db(mse.squeeze(),
                             ref=ref,
                             top_db=None) > - top_db)
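
That helper powers librosa's public silence handling. If you only need the end result, librosa.effects.trim and librosa.effects.split expose the same RMS-based logic directly (placeholder path below):

import librosa

y, sr = librosa.load("take.wav")                         # placeholder path

y_trimmed, index = librosa.effects.trim(y, top_db=30)    # strip leading/trailing silence
intervals = librosa.effects.split(y, top_db=30)          # (start, end) sample pairs of non-silent runs
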
github Nishkarsh5 / MozartFlow / instrument_classification / model.py
def prediction(self, filepath):
        """prediction of the new audio input / filepath to that"""

        try:
            DTFTarray, sampling_rate = librosa.load(filepath)
        except Exception as e:
            logger.error('[/!/] Librosa loading test file failed.')
            return False  # bail out; DTFTarray is undefined if loading failed

        begin_silence = classify.get_silence(DTFTarray)
        end_silence = classify.get_silence(np.flipud(DTFTarray))

        logger.info('[.] Trimming the audio ...')

        DTFTarray_trimmed = DTFTarray[begin_silence: (len(DTFTarray) - end_silence)]

        mfccs = librosa.feature.mfcc(y=DTFTarray_trimmed, sr=sampling_rate)
        average = np.mean(mfccs, axis=1)
        
        features = average.reshape(20)

        logger.info('[.] Predicting with the audio features ...')
        predicted = self.knn_model.predict(features.reshape(1, -1))[0]  # use the MFCC averages computed above
        
        inv_dataset_names = {value: key for key, value in self.dataset_names.items()}

        label_predicted = inv_dataset_names[predicted]
        print('[*] Prediction: The audio relates to ', label_predicted)

        return True
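
A minimal version of the feature step above, assuming nothing about the project: averaging MFCCs over time yields one fixed-length vector per clip, which is what a k-NN (or any classical) classifier expects.

import numpy as np
import librosa

y, sr = librosa.load("sample.wav")                    # placeholder path
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)    # (20, frames)
feature_vector = mfcc.mean(axis=1)                    # (20,)
X = feature_vector.reshape(1, -1)                     # single-sample shape for scikit-learn predict()
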
github JasonZhang156 / Sound-Recognition-Tutorial / feature_extraction.py
:param y: the input signal (audio time series)
    :param sr: sample rate of 'y'
    :param size: the length (seconds) of random crop from original audio, default as 3 seconds
    :return: log-mel spectrogram feature
    """
    # normalization
    y = y.astype(np.float32)
    normalization_factor = 1 / np.max(np.abs(y))
    y = y * normalization_factor

    # random crop
    start = random.randint(0, len(y) - size * sr)
    y = y[start: start + size * sr]

    # extract log mel spectrogram
    melspectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=1024, n_mels=60)
    logmelspec = librosa.power_to_db(melspectrogram)

    return logmelspec
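
The random crop above assumes the clip is at least size * sr samples long; librosa.load raises no error for shorter files, but random.randint would. A hedged sketch that pads short clips first (placeholder path, same spectrogram settings):

import random
import numpy as np
import librosa

size = 3                                              # crop length in seconds
y, sr = librosa.load("event.wav")                     # placeholder path
if len(y) < size * sr:
    y = np.pad(y, (0, size * sr - len(y)))            # zero-pad clips shorter than the crop
start = random.randint(0, len(y) - size * sr)
crop = y[start:start + size * sr]

logmel = librosa.power_to_db(
    librosa.feature.melspectrogram(y=crop, sr=sr, n_fft=2048, hop_length=1024, n_mels=60))
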
github interactiveaudiolab / voogle / archive / prec_at_1.py
y_copy = np.append(y_copy,pad)

            # print y_copy

            i = 0
            segments = []
            while ((i + (4*sr)) <= y_copy.shape[0]):
                segments.append(y_copy[i:(i+4*sr)])
                i = i + (2*sr)


            ref_spectrograms_segments = []

            for seg in segments:

                S = librosa.feature.melspectrogram(y=seg, sr=sr, n_fft=1024, hop_length=1024, power=2)

                S = S[:, 0:128]
                S_db = librosa.power_to_db(S, ref=np.max)
                ref_spec = [S_db]
                ref_spec = np.array(ref_spec).astype('float32')
                # print ref_spec.shape
                ref_spec = normalize_spectrogram(ref_spec)

                ref_spectrograms_segments.append(ref_spec)

            ref_file_names.append(f)

            ref_sepctrograms.append(ref_spectrograms_segments)
            ref_spectrograms_segments = []

    # print ref_file_names
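
The loop above slides a 4-second window with a 2-second hop over the padded signal and turns each segment into a mel spectrogram. A compact, hedged rewrite of that segmentation (placeholder path, same STFT settings):

import numpy as np
import librosa

y, sr = librosa.load("query.wav")                     # placeholder path
win, hop = 4 * sr, 2 * sr                             # 4 s windows, 2 s hop

segments = [y[i:i + win] for i in range(0, len(y) - win + 1, hop)]
specs = [librosa.power_to_db(
             librosa.feature.melspectrogram(y=seg, sr=sr, n_fft=1024, hop_length=1024),
             ref=np.max)
         for seg in segments]
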
github x4nth055 / emotion-recognition-using-speech / utils.py
X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma or contrast:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result = np.hstack((result, mel))
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
            result = np.hstack((result, tonnetz))
    return result
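
Stripped of the option flags, the pattern above is: compute each librosa.feature matrix, average it over time, and concatenate the averages into one vector. A reduced sketch with a placeholder path and only three of the features:

import numpy as np
import librosa

y, sr = librosa.load("emotion.wav")                   # placeholder path
stft = np.abs(librosa.stft(y))                        # shared magnitude spectrogram, as above

parts = [
    librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40),
    librosa.feature.chroma_stft(S=stft, sr=sr),
    librosa.feature.spectral_contrast(S=stft, sr=sr),
]
result = np.hstack([p.mean(axis=1) for p in parts])   # one fixed-length feature vector per clip
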
github jim-schwoebel / voicebook / chapter_4_modeling / train_audiokeras.py
def featurize(wavfile):
    #initialize features 
    hop_length = 512
    n_fft=2048
    #load file 
    y, sr = librosa.load(wavfile)
    #extract mfcc coefficients 
    mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13)
    mfcc_delta = librosa.feature.delta(mfcc) 
    #extract mean, standard deviation, min, and max value in mfcc frame, do this across all mfccs
    mfcc_features=np.array([np.mean(mfcc[0]),np.std(mfcc[0]),np.amin(mfcc[0]),np.amax(mfcc[0]),
                            np.mean(mfcc[1]),np.std(mfcc[1]),np.amin(mfcc[1]),np.amax(mfcc[1]),
                            np.mean(mfcc[2]),np.std(mfcc[2]),np.amin(mfcc[2]),np.amax(mfcc[2]),
                            np.mean(mfcc[3]),np.std(mfcc[3]),np.amin(mfcc[3]),np.amax(mfcc[3]),
                            np.mean(mfcc[4]),np.std(mfcc[4]),np.amin(mfcc[4]),np.amax(mfcc[4]),
                            np.mean(mfcc[5]),np.std(mfcc[5]),np.amin(mfcc[5]),np.amax(mfcc[5]),
                            np.mean(mfcc[6]),np.std(mfcc[6]),np.amin(mfcc[6]),np.amax(mfcc[6]),
                            np.mean(mfcc[7]),np.std(mfcc[7]),np.amin(mfcc[7]),np.amax(mfcc[7]),
                            np.mean(mfcc[8]),np.std(mfcc[8]),np.amin(mfcc[8]),np.amax(mfcc[8]),
                            np.mean(mfcc[9]),np.std(mfcc[9]),np.amin(mfcc[9]),np.amax(mfcc[9]),
                            np.mean(mfcc[10]),np.std(mfcc[10]),np.amin(mfcc[10]),np.amax(mfcc[10]),
                            np.mean(mfcc[11]),np.std(mfcc[11]),np.amin(mfcc[11]),np.amax(mfcc[11]),
                            np.mean(mfcc[12]),np.std(mfcc[12]),np.amin(mfcc[12]),np.amax(mfcc[12]),
                            np.mean(mfcc_delta[0]),np.std(mfcc_delta[0]),np.amin(mfcc_delta[0]),np.amax(mfcc_delta[0]),