Skip to content

Commit

Permalink
Support librosa.filters.mel() (#37)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Jan 12, 2024
1 parent f08a90c commit 8142b24
Show file tree
Hide file tree
Showing 11 changed files with 224 additions and 2,743 deletions.
10 changes: 9 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,15 @@ cmake_minimum_required(VERSION 3.3 FATAL_ERROR)

project(kaldi-native-fbank CXX C)

set(KALDI_NATIVE_FBANK_VERSION "1.18.5")
set(KALDI_NATIVE_FBANK_VERSION "1.18.6")

# Disable warning about
#
# "The DOWNLOAD_EXTRACT_TIMESTAMP option was not given and policy CMP0135 is
# not set.
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif()

if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
Expand Down
37 changes: 0 additions & 37 deletions kaldi-native-fbank/csrc/generate-whisper-melbank.py

This file was deleted.

142 changes: 134 additions & 8 deletions kaldi-native-fbank/csrc/mel-computations.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,22 @@ float MelBanks::VtlnWarpMelFreq(

MelBanks::MelBanks(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts,
float vtln_warp_factor)
: htk_mode_(opts.htk_mode) {
float vtln_warp_factor) {
if (opts.is_librosa) {
InitLibrosaMelBanks(opts, frame_opts, vtln_warp_factor);
} else {
InitKaldiMelBanks(opts, frame_opts, vtln_warp_factor);
}
}

void MelBanks::InitKaldiMelBanks(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts,
float vtln_warp_factor) {
htk_mode_ = opts.htk_mode;
int32_t num_bins = opts.num_bins;
if (num_bins < 3) KNF_LOG(FATAL) << "Must have at least 3 mel bins";
if (num_bins < 3) {
KNF_LOG(FATAL) << "Must have at least 3 mel bins";
}

float sample_freq = frame_opts.samp_freq;
int32_t window_length_padded = frame_opts.PaddedWindowSize();
Expand All @@ -119,10 +131,11 @@ MelBanks::MelBanks(const MelBanksOptions &opts,
float nyquist = 0.5f * sample_freq;

float low_freq = opts.low_freq, high_freq;
if (opts.high_freq > 0.0f)
if (opts.high_freq > 0.0f) {
high_freq = opts.high_freq;
else
} else {
high_freq = nyquist + opts.high_freq;
}

if (low_freq < 0.0f || low_freq >= nyquist || high_freq <= 0.0f ||
high_freq > nyquist || high_freq <= low_freq) {
Expand Down Expand Up @@ -183,12 +196,15 @@ MelBanks::MelBanks(const MelBanksOptions &opts,
float mel = MelScale(freq);
if (mel > left_mel && mel < right_mel) {
float weight;
if (mel <= center_mel)
if (mel <= center_mel) {
weight = (mel - left_mel) / (center_mel - left_mel);
else
} else {
weight = (right_mel - mel) / (right_mel - center_mel);
}
this_bin[i] = weight;
if (first_index == -1) first_index = i;
if (first_index == -1) {
first_index = i;
}
last_index = i;
}
}
Expand Down Expand Up @@ -218,6 +234,116 @@ MelBanks::MelBanks(const MelBanksOptions &opts,
}
}

void MelBanks::InitLibrosaMelBanks(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts,
float vtln_warp_factor) {
int32_t num_bins = opts.num_bins;
if (num_bins < 3) {
KNF_LOG(FATAL) << "Must have at least 3 mel bins";
}

float sample_freq = frame_opts.samp_freq;
int32_t window_length_padded = frame_opts.PaddedWindowSize();
KNF_CHECK_EQ(window_length_padded % 2, 0);

int32_t num_fft_bins = window_length_padded / 2;
float nyquist = 0.5f * sample_freq;

float low_freq = opts.low_freq, high_freq;
if (opts.high_freq > 0.0f) {
high_freq = opts.high_freq;
} else {
high_freq = nyquist + opts.high_freq;
}

if (low_freq < 0.0f || low_freq >= nyquist || high_freq <= 0.0f ||
high_freq > nyquist || high_freq <= low_freq) {
KNF_LOG(FATAL) << "Bad values in options: low-freq " << low_freq
<< " and high-freq " << high_freq << " vs. nyquist "
<< nyquist;
}

float fft_bin_width = sample_freq / window_length_padded;

float mel_low_freq = MelScaleSlaney(low_freq);
float mel_high_freq = MelScaleSlaney(high_freq);

debug_ = opts.debug_mel;

// divide by num_bins+1 in next line because of end-effects where the bins
// spread out to the sides.
float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1);

bool slaney_norm = false;
if (!opts.norm.empty()) {
if (opts.norm != "slaney") {
KNF_LOG(FATAL) << "Unsupported norm: " << opts.norm;
}
slaney_norm = true;
}

bins_.resize(num_bins);
for (int32_t bin = 0; bin < num_bins; ++bin) {
float left_mel = mel_low_freq + bin * mel_freq_delta;
float center_mel = mel_low_freq + (bin + 1) * mel_freq_delta;
float right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;

float left_hz = InverseMelScaleSlaney(left_mel);
float center_hz = InverseMelScaleSlaney(center_mel);
float right_hz = InverseMelScaleSlaney(right_mel);

// this_bin will be a vector of coefficients that is only
// nonzero where this mel bin is active.
//
// It is not an error to use num_fft_bins + 1 here. It is different
// from Kaldi.
std::vector<float> this_bin(num_fft_bins + 1);

int32_t first_index = -1, last_index = -1;
for (int32_t i = 0; i < num_fft_bins + 1; ++i) {
float hz = (fft_bin_width * i); // Center frequency of this fft bin.
if (hz > left_hz && hz < right_hz) {
float weight;
if (hz <= center_hz) {
weight = (hz - left_hz) / (center_hz - left_hz);
} else {
weight = (right_hz - hz) / (right_hz - center_hz);
}

if (slaney_norm) {
weight *= 2 / (right_hz - left_hz);
}

this_bin[i] = weight;
if (first_index == -1) {
first_index = i;
}

last_index = i;
}
} // for (int32_t i = 0; i < num_fft_bins + 1; ++i)

KNF_CHECK(first_index != -1 && last_index >= first_index &&
"You may have set num_mel_bins too large.");

bins_[bin].first = first_index;
int32_t size = last_index + 1 - first_index;
bins_[bin].second.insert(bins_[bin].second.end(),
this_bin.begin() + first_index,
this_bin.begin() + first_index + size);
} // for (int32_t bin = 0; bin < num_bins; ++bin)

if (debug_) {
std::ostringstream os;
for (size_t i = 0; i < bins_.size(); i++) {
os << "bin " << i << ", offset = " << bins_[i].first << ", vec = ";
for (auto k : bins_[i].second) os << k << ", ";
os << "\n";
}
fprintf(stderr, "%s\n", os.str().c_str());
}
}

MelBanks::MelBanks(const float *weights, int32_t num_rows, int32_t num_cols)
: debug_(false), htk_mode_(false) {
bins_.resize(num_rows);
Expand Down
55 changes: 55 additions & 0 deletions kaldi-native-fbank/csrc/mel-computations.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,18 @@ struct MelBanksOptions {
// mel-energy flooring and reproduces a bug in HTK.
bool htk_mode = false;

// Note that if you set is_librosa, you probably need to set
// low_freq to 0.
// Please see
// https://librosa.org/doc/main/generated/librosa.filters.mel.html
bool is_librosa = false;

// used only when is_librosa=true
// Possible values: "", slaney. We don't support a numeric value here, but
// it can be added on demand.
// See https://librosa.org/doc/main/generated/librosa.filters.mel.html
std::string norm = "slaney";

std::string ToString() const {
std::ostringstream os;
os << "num_bins: " << num_bins << "\n";
Expand All @@ -57,6 +69,8 @@ struct MelBanksOptions {
os << "vtln_high: " << vtln_high << "\n";
os << "debug_mel: " << debug_mel << "\n";
os << "htk_mode: " << htk_mode << "\n";
os << "is_librosa: " << is_librosa << "\n";
os << "norm: " << norm << "\n";
return os.str();
}
};
Expand All @@ -65,14 +79,43 @@ std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts);

class MelBanks {
public:
// see also https://en.wikipedia.org/wiki/Mel_scale
// htk, mel to hz
// Inverse of MelScale(): hz = 700 * (exp(mel / 1127) - 1).
static inline float InverseMelScale(float mel_freq) {
  const float exponent = mel_freq / 1127.0f;
  const float growth = expf(exponent) - 1.0f;
  return 700.0f * growth;
}

// htk, hz to mel
// Inverse of InverseMelScale(): mel = 1127 * log(1 + hz / 700).
static inline float MelScale(float freq) {
  const float scaled = 1.0f + freq / 700.0f;
  return 1127.0f * logf(scaled);
}

// slaney, mel to hz
// Inverse of MelScaleSlaney(). Values up to and including 15 mel map
// linearly to Hz (200/3 Hz per mel); above that the mapping is exponential,
// starting from 1000 Hz.
static inline float InverseMelScaleSlaney(float mel_freq) {
  // log(6.4) / 27, precomputed so that no logf() call is needed here.
  const float log_step = 0.06875177742094911f;

  const bool linear_region = mel_freq <= 15;
  if (!linear_region) {
    // Equivalent to 1000 * expf((mel_freq - 15) * logf(6.4f) / 27).
    const float mels_above_break = mel_freq - 15;
    return 1000 * expf(mels_above_break * log_step);
  }

  return 200.0f / 3 * mel_freq;
}

// slaney, hz to mel
// Inverse of InverseMelScaleSlaney(). Frequencies up to and including
// 1000 Hz map linearly to mel (3 mel per 200 Hz); above that the mapping is
// logarithmic, starting from 15 mel.
static inline float MelScaleSlaney(float freq) {
  // 27 / log(6.4), precomputed.
  const float mels_per_log_unit = 14.545078505785561f;

  const bool linear_region = freq <= 1000;
  if (!linear_region) {
    // Equivalent to 15 + 27 * logf(freq / 1000) / logf(6.4f).
    const float ratio = freq / 1000;
    return 15 + mels_per_log_unit * logf(ratio);
  }

  return freq * 3 / 200.0f;
}

static float VtlnWarpFreq(
float vtln_low_cutoff,
float vtln_high_cutoff, // discontinuities in warp func
Expand Down Expand Up @@ -104,6 +147,18 @@ class MelBanks {

// Returns the number of entries in bins_.
int32_t NumBins() const { return static_cast<int32_t>(bins_.size()); }

private:
// Initializes Kaldi-compatible mel filter banks.
void InitKaldiMelBanks(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts,
float vtln_warp_factor);

// Initializes librosa-compatible mel filter banks.
// See https://librosa.org/doc/main/generated/librosa.filters.mel.html
void InitLibrosaMelBanks(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts,
float vtln_warp_factor);

private:
// the "bins_" vector is a vector, one for each bin, of a pair:
// (the first nonzero fft-bin), (the vector of weights).
Expand Down
13 changes: 9 additions & 4 deletions kaldi-native-fbank/csrc/whisper-feature.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#include <vector>

#include "kaldi-native-fbank/csrc/mel-computations.h"
#include "kaldi-native-fbank/csrc/whisper-mel-bank.h"

#ifndef M_2PI
#define M_2PI 6.283185307179586476925286766559005
Expand Down Expand Up @@ -115,8 +114,7 @@ static void fft(const std::vector<float> &in, std::vector<float> *out) {
}

WhisperFeatureComputer::WhisperFeatureComputer(
const FrameExtractionOptions & /*unused={}*/)
: mel_banks_(kWhisperMelArray, kWhisperMelRows, kWhisperMelCols) {
const FrameExtractionOptions & /*unused={}*/) {
frame_opts_.samp_freq = 16000;
frame_opts_.frame_shift_ms = 10;
frame_opts_.frame_length_ms = 25;
Expand All @@ -126,6 +124,13 @@ WhisperFeatureComputer::WhisperFeatureComputer(
frame_opts_.window_type = "hann";
frame_opts_.round_to_power_of_two = false;
frame_opts_.snip_edges = false;

MelBanksOptions mel_opts;
mel_opts.num_bins = 80;
mel_opts.low_freq = 0;
mel_opts.is_librosa = true;

mel_banks_ = std::make_unique<MelBanks>(mel_opts, frame_opts_, 1.0f);
}

void WhisperFeatureComputer::Compute(float /*signal_raw_log_energy*/,
Expand All @@ -147,7 +152,7 @@ void WhisperFeatureComputer::Compute(float /*signal_raw_log_energy*/,
}

// feature is pre-allocated by the user
mel_banks_.Compute(power.data(), feature);
mel_banks_->Compute(power.data(), feature);
}

} // namespace knf
3 changes: 2 additions & 1 deletion kaldi-native-fbank/csrc/whisper-feature.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#ifndef KALDI_NATIVE_FBANK_CSRC_WHISPER_FEATURE_H_
#define KALDI_NATIVE_FBANK_CSRC_WHISPER_FEATURE_H_

#include <memory>
#include <vector>

#include "kaldi-native-fbank/csrc/feature-window.h"
Expand All @@ -44,7 +45,7 @@ class WhisperFeatureComputer {
using Options = FrameExtractionOptions;

private:
MelBanks mel_banks_;
std::unique_ptr<MelBanks> mel_banks_;
FrameExtractionOptions frame_opts_;
};

Expand Down
Loading

0 comments on commit 8142b24

Please sign in to comment.