From 4767223325d4e7ada9a3cee924c577446556446e Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Wed, 26 Jul 2023 16:16:04 +0800 Subject: [PATCH 01/21] Fix MSVC compile error C3688 Instead of simply using 'add_compile_options(/utf-8)' to address the MSVC compile error C3688, a better approach would be to handle it in a way that prevents passing '/utf-8' to NVCC. --- CMakeLists.txt | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b5846d96bc..ae1fd1715a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -251,17 +251,20 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES else() message(STATUS "x86 detected") if (MSVC) - if(NOT WHISPER_NO_AVX2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2") - else() - if(NOT WHISPER_NO_AVX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX") - endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /utf-8") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /utf-8") + if(NOT WHISPER_NO_AVX2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2") + else() + if(NOT WHISPER_NO_AVX) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX") endif() + endif() else() if (EMSCRIPTEN) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread") From 38eaeff09c5e40c133560b8037c24dd092a9a99a Mon Sep 17 00:00:00 2001 From: bobqianic 
<129547291+bobqianic@users.noreply.github.com> Date: Wed, 2 Aug 2023 00:04:30 +0800 Subject: [PATCH 02/21] Significantly improve inference quality In the function `log_mel_spectrogram_worker_thread`, there's an array out-of-bounds issue occurring during the calculation of complex number moduli. This issue is causing disruptions in the FFT spectrum, which, in turn, is reducing the quality of inference. --- whisper.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 9923fa06002..a318bc0402a 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2396,12 +2396,10 @@ static void fft(const std::vector & in, std::vector & out) { even.reserve(N/2); odd.reserve(N/2); - for (int i = 0; i < N; i++) { - if (i % 2 == 0) { - even.push_back(in[i]); - } else { - odd.push_back(in[i]); - } + // + for (int i = 0; i < N; i+=2) { + even.push_back(in[i]); + odd.push_back(in[i + 1]); } std::vector even_fft; @@ -2442,22 +2440,27 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector if (offset + j < n_samples) { fft_in[j] = hann[j] * samples[offset + j]; } else { - fft_in[j] = 0.0; + break; } } // FFT -> mag^2 fft(fft_in, fft_out); - for (int j = 0; j < fft_size; j++) { + // Calculate modulus of complex numbers + // It should be fft_size - 1, not fft_size. + // Otherwise, it will cause array out-of-bounds, polluting the FFT spectrum + for (int j = 0; j < fft_size - 1; j++) { fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); } + + // The frequency spectrum produced by real input data is symmetrical around the Nyquist frequency. 
for (int j = 1; j < fft_size / 2; j++) { fft_out[j] += fft_out[fft_size - j]; } if (speed_up) { - // scale down in the frequency domain results in a speed up in the time domain + // scale down in the frequency domain results in a speed-up in the time domain for (int j = 0; j < n_fft; j++) { fft_out[j] = 0.5 * (fft_out[2 * j] + fft_out[2 * j + 1]); } From 4ebe45008e9ff825371cc5eaace2bebc397c1a93 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Wed, 2 Aug 2023 10:40:00 +0800 Subject: [PATCH 03/21] Significantly improve inference quality At last, I've pinpointed the actual source of the problem. Given that the frequency spectrum generated from real input data is symmetrical around the Nyquist frequency, there's a for-loop within the `log_mel_spectrogram_worker_thread` function that attempts to fold the frequency spectrum. Regrettably, a bug within this for-loop is causing a frame shift in the frequency spectrum. The previous attempt to remedy this, which involved using `fft_size - 1` when calculating the modulus, was merely a band-aid solution and did not address the underlying issue. --- whisper.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index a318bc0402a..d001e302d3e 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2448,15 +2448,14 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector fft(fft_in, fft_out); // Calculate modulus of complex numbers - // It should be fft_size - 1, not fft_size. - // Otherwise, it will cause array out-of-bounds, polluting the FFT spectrum - for (int j = 0; j < fft_size - 1; j++) { + for (int j = 0; j < fft_size; j++) { fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); } // The frequency spectrum produced by real input data is symmetrical around the Nyquist frequency. 
- for (int j = 1; j < fft_size / 2; j++) { - fft_out[j] += fft_out[fft_size - j]; + // This is where the actual issue lies + for (int j = 0; j < fft_size / 2; j++) { + fft_out[j] += fft_out[fft_size - j - 1]; } if (speed_up) { From 6f445d14731f156ce07d6b131cad04935e23b827 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Wed, 2 Aug 2023 14:22:39 +0800 Subject: [PATCH 04/21] Addressed a few minor issues Fixed the issue of `fft_out` continuously expanding. Resolved the fallback caused by using 'break' instead of `fft_in[j] = 0`. --- whisper.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index d001e302d3e..2fe1a1c82a9 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2422,17 +2422,18 @@ static void fft(const std::vector & in, std::vector & out) { out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; + } } static void log_mel_spectrogram_worker_thread(int ith, const std::vector &hann, const float *samples, int n_samples, int fft_size, int fft_step, int n_threads, const whisper_filters &filters, bool speed_up, whisper_mel &mel) { - std::vector fft_in(fft_size, 0.0); - std::vector fft_out(2 * fft_size); int n_fft = 1 + (speed_up ? 
fft_size / 4 : fft_size / 2); for (int i = ith; i < mel.n_len; i += n_threads) { + std::vector fft_in(fft_size, 0.0); + std::vector fft_out(2 * fft_size); const int offset = i * fft_step; // apply Hanning window @@ -2473,10 +2474,10 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector int k = 0; for (k = 0; k < n_fft - 3; k += 4) { sum += - fft_out[k + 0] * filters.data[j*n_fft + k + 0] + - fft_out[k + 1] * filters.data[j*n_fft + k + 1] + - fft_out[k + 2] * filters.data[j*n_fft + k + 2] + - fft_out[k + 3] * filters.data[j*n_fft + k + 3]; + fft_out[k + 0] * filters.data[j * n_fft + k + 0] + + fft_out[k + 1] * filters.data[j * n_fft + k + 1] + + fft_out[k + 2] * filters.data[j * n_fft + k + 2] + + fft_out[k + 3] * filters.data[j * n_fft + k + 3]; } // handle n_fft remainder @@ -3636,7 +3637,7 @@ static void whisper_process_logits( WHISPER_ASSERT(n_logits == ctx.vocab.n_vocab); // extract the logits for the last token - // we will be mutating and therefore we don't want to use the ctx.logits buffer directly + // we will be mutating, and therefore we don't want to use the ctx.logits buffer directly auto & probs = decoder.probs; auto & logits = decoder.logits; auto & logprobs = decoder.logprobs; From 7f690dd4886a8b07bc1e8ac09d96af23643b45d6 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Wed, 2 Aug 2023 17:22:38 +0800 Subject: [PATCH 05/21] Significantly improve inference quality Thanks for your patience everyone. It's finally sorted out. Now, the right side of the FFT spectrum is being flipped over to the left, and the amplitudes at corresponding positions on the left and right are added together (the spectrum on the left needs to be shifted by one position), then the average is calculated. FFT_OUT[0] is no longer discarded, making full use of the limited space to pack in more information. 
--- whisper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whisper.cpp b/whisper.cpp index 2fe1a1c82a9..83d4c073648 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2456,7 +2456,7 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector // The frequency spectrum produced by real input data is symmetrical around the Nyquist frequency. // This is where the actual issue lies for (int j = 0; j < fft_size / 2; j++) { - fft_out[j] += fft_out[fft_size - j - 1]; + fft_out[j] = (fft_out[fft_size - j - 1] + fft_out[j + 1]) / 2; } if (speed_up) { From f3e7774ceea0caa776e785b9c50bbbf1a33515dd Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Thu, 3 Aug 2023 20:11:34 +0800 Subject: [PATCH 06/21] Add annotation and performance improvement --- whisper.cpp | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 83d4c073648..8225845b042 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2425,32 +2425,33 @@ static void fft(const std::vector & in, std::vector & out) { } } - +static bool printed = false; static void log_mel_spectrogram_worker_thread(int ith, const std::vector &hann, const float *samples, int n_samples, int fft_size, int fft_step, int n_threads, const whisper_filters &filters, bool speed_up, whisper_mel &mel) { + std::vector fft_in(fft_size, 0.0); + std::vector fft_out(2 * fft_size); int n_fft = 1 + (speed_up ? 
fft_size / 4 : fft_size / 2); for (int i = ith; i < mel.n_len; i += n_threads) { - std::vector fft_in(fft_size, 0.0); - std::vector fft_out(2 * fft_size); const int offset = i * fft_step; - // apply Hanning window - for (int j = 0; j < fft_size; j++) { - if (offset + j < n_samples) { - fft_in[j] = hann[j] * samples[offset + j]; - } else { - break; - } + // apply Hanning window (~10% faster) + for (int j = 0; j < std::min(fft_size, n_samples - offset); j++) { + fft_in[j] = hann[j] * samples[offset + j]; + } + // Can anyone explain why (n_samples - offset) can be negative ??? + // If they are negative, fft_in would be all zero! + if (0 < n_samples - offset && n_samples - offset < fft_size) { + std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); } // FFT -> mag^2 fft(fft_in, fft_out); - // Calculate modulus of complex numbers + // Calculate modulus^2 of complex numbers for (int j = 0; j < fft_size; j++) { - fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + fft_out[j] = (pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2)); } // The frequency spectrum produced by real input data is symmetrical around the Nyquist frequency. 
@@ -2461,8 +2462,8 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector if (speed_up) { // scale down in the frequency domain results in a speed-up in the time domain - for (int j = 0; j < n_fft; j++) { - fft_out[j] = 0.5 * (fft_out[2 * j] + fft_out[2 * j + 1]); + for (int j = 0; j < n_fft - 1; j++) { + fft_out[j] = (fft_out[2 * j] + fft_out[2 * j + 1]) / 2; } } @@ -2511,6 +2512,10 @@ static bool log_mel_spectrogram( std::vector hann; hann.resize(fft_size); for (int i = 0; i < fft_size; i++) { + // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html + // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 + // So it should be hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size - 1))); + // But using fft_size - 1 causes inference quality degradation ??? hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size))); } From 95be6dc6aa0e9c9e98ffd823687f21b5d892d4e1 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Thu, 3 Aug 2023 21:24:40 +0800 Subject: [PATCH 07/21] Calculate FFT only when fft_in are not all zero --- whisper.cpp | 102 ++++++++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 46 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 8225845b042..4d5b10c5488 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2436,59 +2436,71 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector for (int i = ith; i < mel.n_len; i += n_threads) { const int offset = i * fft_step; - // apply Hanning window (~10% faster) - for (int j = 0; j < std::min(fft_size, n_samples - offset); j++) { - fft_in[j] = hann[j] * samples[offset + j]; - } - // Can anyone explain why (n_samples - offset) can be negative ??? - // If they are negative, fft_in would be all zero! 
- if (0 < n_samples - offset && n_samples - offset < fft_size) { - std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); - } - - // FFT -> mag^2 - fft(fft_in, fft_out); + // Calculate FFT only when fft_in are not all zero + if (n_samples - offset > 0) { - // Calculate modulus^2 of complex numbers - for (int j = 0; j < fft_size; j++) { - fft_out[j] = (pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2)); - } + // apply Hanning window (~10% faster) + for (int j = 0; j < std::min(fft_size, n_samples - offset); j++) { + fft_in[j] = hann[j] * samples[offset + j]; + } + // Can anyone explain why (n_samples - offset) can be negative ??? + // If they are negative, fft_in would be all zero! + if (n_samples - offset < fft_size) { + std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); + } - // The frequency spectrum produced by real input data is symmetrical around the Nyquist frequency. - // This is where the actual issue lies - for (int j = 0; j < fft_size / 2; j++) { - fft_out[j] = (fft_out[fft_size - j - 1] + fft_out[j + 1]) / 2; - } + // FFT -> mag^2 + fft(fft_in, fft_out); - if (speed_up) { - // scale down in the frequency domain results in a speed-up in the time domain - for (int j = 0; j < n_fft - 1; j++) { - fft_out[j] = (fft_out[2 * j] + fft_out[2 * j + 1]) / 2; + // Calculate modulus^2 of complex numbers + // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. 
+ for (int j = 0; j < fft_size; j++) { + fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); } - } - // mel spectrogram - for (int j = 0; j < mel.n_mel; j++) { - double sum = 0.0; - - // unroll loop (suggested by GH user @lunixbochs) - int k = 0; - for (k = 0; k < n_fft - 3; k += 4) { - sum += - fft_out[k + 0] * filters.data[j * n_fft + k + 0] + - fft_out[k + 1] * filters.data[j * n_fft + k + 1] + - fft_out[k + 2] * filters.data[j * n_fft + k + 2] + - fft_out[k + 3] * filters.data[j * n_fft + k + 3]; + // The frequency spectrum produced by real input data is symmetrical around the Nyquist frequency. + // This is where the actual issue lies + for (int j = 0; j < fft_size / 2; j++) { + fft_out[j] = (fft_out[fft_size - j - 1] + fft_out[j + 1]) / 2; } - // handle n_fft remainder - for (; k < n_fft; k++) { - sum += fft_out[k] * filters.data[j * n_fft + k]; + if (speed_up) { + // scale down in the frequency domain results in a speed-up in the time domain + for (int j = 0; j < n_fft - 1; j++) { + fft_out[j] = (fft_out[2 * j] + fft_out[2 * j + 1]) / 2; + } } - sum = log10(std::max(sum, 1e-10)); + // mel spectrogram + for (int j = 0; j < mel.n_mel; j++) { + double sum = 0.0; + + // unroll loop (suggested by GH user @lunixbochs) + int k = 0; + for (k = 0; k < n_fft - 3; k += 4) { + sum += + fft_out[k + 0] * filters.data[j * n_fft + k + 0] + + fft_out[k + 1] * filters.data[j * n_fft + k + 1] + + fft_out[k + 2] * filters.data[j * n_fft + k + 2] + + fft_out[k + 3] * filters.data[j * n_fft + k + 3]; + } + + // handle n_fft remainder + for (; k < n_fft; k++) { + sum += fft_out[k] * filters.data[j * n_fft + k]; + } + + sum = log10(std::max(sum, 1e-10)); - mel.data[j * mel.n_len + i] = sum; + mel.data[j * mel.n_len + i] = sum; + } + + } else { + // Otherwise fft_out are all zero + double sum = log10(1e-10); + for (int j = 0; j < mel.n_mel; j++) { + mel.data[j * mel.n_len + i] = sum; + } } } } @@ -2514,9 +2526,7 @@ static bool 
log_mel_spectrogram( for (int i = 0; i < fft_size; i++) { // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 - // So it should be hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size - 1))); - // But using fft_size - 1 causes inference quality degradation ??? - hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size))); + hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size - 1))); } mel.n_mel = n_mel; From bd1dbd17116e7c41153af6798dd7e6bdc9cd9813 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Fri, 4 Aug 2023 13:21:06 +0800 Subject: [PATCH 08/21] Some minor performance improvement --- whisper.cpp | 112 ++++++++++++++++++++++++++-------------------------- 1 file changed, 56 insertions(+), 56 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 4d5b10c5488..ea9f89c379a 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2425,82 +2425,82 @@ static void fft(const std::vector & in, std::vector & out) { } } -static bool printed = false; + static void log_mel_spectrogram_worker_thread(int ith, const std::vector &hann, const float *samples, int n_samples, int fft_size, int fft_step, int n_threads, const whisper_filters &filters, bool speed_up, whisper_mel &mel) { std::vector fft_in(fft_size, 0.0); std::vector fft_out(2 * fft_size); + // Is using 32-bit float to calculate log_mel appropriate? + // 32-bit float has about 7 digits of precision, but minimum value of log_mel is 1e-10. int n_fft = 1 + (speed_up ? 
fft_size / 4 : fft_size / 2); + int i = ith; - for (int i = ith; i < mel.n_len; i += n_threads) { + // Calculate FFT only when fft_in are not all zero + for (; i < std::min((n_samples / fft_step) + 1, mel.n_len); i += n_threads) { const int offset = i * fft_step; - // Calculate FFT only when fft_in are not all zero - if (n_samples - offset > 0) { + // apply Hanning window (~10% faster) + for (int j = 0; j < std::min(fft_size, n_samples - offset); j++) { + fft_in[j] = hann[j] * samples[offset + j]; + } + // fill the rest with zeros + if (n_samples - offset < fft_size) { + std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); + } - // apply Hanning window (~10% faster) - for (int j = 0; j < std::min(fft_size, n_samples - offset); j++) { - fft_in[j] = hann[j] * samples[offset + j]; - } - // Can anyone explain why (n_samples - offset) can be negative ??? - // If they are negative, fft_in would be all zero! - if (n_samples - offset < fft_size) { - std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); - } + // FFT + fft(fft_in, fft_out); - // FFT -> mag^2 - fft(fft_in, fft_out); + // Calculate modulus^2 of complex numbers + // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. + for (int j = 0; j < fft_size; j++) { + fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + } - // Calculate modulus^2 of complex numbers - // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. - for (int j = 0; j < fft_size; j++) { - fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); - } + // The frequency spectrum produced by real input data is symmetrical around the Nyquist frequency. 
+ // This is where the actual issue lies + for (int j = 0; j < fft_size / 2; j++) { + fft_out[j] = (fft_out[fft_size - j - 1] + fft_out[j + 1]) / 2; + } - // The frequency spectrum produced by real input data is symmetrical around the Nyquist frequency. - // This is where the actual issue lies - for (int j = 0; j < fft_size / 2; j++) { - fft_out[j] = (fft_out[fft_size - j - 1] + fft_out[j + 1]) / 2; + if (speed_up) { + // scale down in the frequency domain results in a speed-up in the time domain + for (int j = 0; j < n_fft - 1; j++) { + fft_out[j] = (fft_out[2 * j] + fft_out[2 * j + 1]) / 2; } + } - if (speed_up) { - // scale down in the frequency domain results in a speed-up in the time domain - for (int j = 0; j < n_fft - 1; j++) { - fft_out[j] = (fft_out[2 * j] + fft_out[2 * j + 1]) / 2; - } + // mel spectrogram + for (int j = 0; j < mel.n_mel; j++) { + double sum = 0.0; + + // unroll loop (suggested by GH user @lunixbochs) + int k = 0; + for (k = 0; k < n_fft - 3; k += 4) { + sum += + fft_out[k + 0] * filters.data[j * n_fft + k + 0] + + fft_out[k + 1] * filters.data[j * n_fft + k + 1] + + fft_out[k + 2] * filters.data[j * n_fft + k + 2] + + fft_out[k + 3] * filters.data[j * n_fft + k + 3]; } - // mel spectrogram - for (int j = 0; j < mel.n_mel; j++) { - double sum = 0.0; - - // unroll loop (suggested by GH user @lunixbochs) - int k = 0; - for (k = 0; k < n_fft - 3; k += 4) { - sum += - fft_out[k + 0] * filters.data[j * n_fft + k + 0] + - fft_out[k + 1] * filters.data[j * n_fft + k + 1] + - fft_out[k + 2] * filters.data[j * n_fft + k + 2] + - fft_out[k + 3] * filters.data[j * n_fft + k + 3]; - } - - // handle n_fft remainder - for (; k < n_fft; k++) { - sum += fft_out[k] * filters.data[j * n_fft + k]; - } + // handle n_fft remainder + for (; k < n_fft; k++) { + sum += fft_out[k] * filters.data[j * n_fft + k]; + } - sum = log10(std::max(sum, 1e-10)); + sum = log10(std::max(sum, 1e-10)); - mel.data[j * mel.n_len + i] = sum; - } + mel.data[j * mel.n_len + i] = 
sum; + } + } - } else { - // Otherwise fft_out are all zero - double sum = log10(1e-10); - for (int j = 0; j < mel.n_mel; j++) { - mel.data[j * mel.n_len + i] = sum; - } + // Otherwise fft_out are all zero + double sum = log10(1e-10); + for (; i < mel.n_len; i++) { + for (int j = 0; j < mel.n_mel; j++) { + mel.data[j * mel.n_len + i] = sum; } } } From 2c49c9b71ca66ddd096e1156ed172cd1295dfa4c Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Fri, 4 Aug 2023 14:16:18 +0800 Subject: [PATCH 09/21] Fixed a bug impacting inference quality --- whisper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/whisper.cpp b/whisper.cpp index ea9f89c379a..e741bcaaf02 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2498,7 +2498,7 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector // Otherwise fft_out are all zero double sum = log10(1e-10); - for (; i < mel.n_len; i++) { + for (; i < mel.n_len; i += n_threads) { for (int j = 0; j < mel.n_mel; j++) { mel.data[j * mel.n_len + i] = sum; } From e40ec2792461e0e3105b787e806c9ecab4682c96 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Fri, 11 Aug 2023 23:26:26 +0800 Subject: [PATCH 10/21] The first version after all the analysis is completed. 
--- whisper.cpp | 202 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 144 insertions(+), 58 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index e741bcaaf02..7f0ba8961fe 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2396,10 +2396,12 @@ static void fft(const std::vector & in, std::vector & out) { even.reserve(N/2); odd.reserve(N/2); - // - for (int i = 0; i < N; i+=2) { - even.push_back(in[i]); - odd.push_back(in[i + 1]); + for (int i = 0; i < N; i++) { + if (i % 2 == 0) { + even.push_back(in[i]); + } else { + odd.push_back(in[i]); + } } std::vector even_fft; @@ -2422,30 +2424,27 @@ static void fft(const std::vector & in, std::vector & out) { out[2*(k + N/2) + 0] = even_fft[2*k + 0] - re*re_odd + im*im_odd; out[2*(k + N/2) + 1] = even_fft[2*k + 1] - re*im_odd - im*re_odd; - } } static void log_mel_spectrogram_worker_thread(int ith, const std::vector &hann, const float *samples, - int n_samples, int fft_size, int fft_step, int n_threads, + int n_samples, int frame_size, int frame_step, int n_threads, const whisper_filters &filters, bool speed_up, whisper_mel &mel) { - std::vector fft_in(fft_size, 0.0); - std::vector fft_out(2 * fft_size); - // Is using 32-bit float to calculate log_mel appropriate? - // 32-bit float has about 7 digits of precision, but minimum value of log_mel is 1e-10. - int n_fft = 1 + (speed_up ? fft_size / 4 : fft_size / 2); + std::vector fft_in(frame_size, 0.0); + std::vector fft_out(2 * frame_step); + int n_fft = 1 + (speed_up ? 
frame_size / 4 : frame_size / 2); int i = ith; // Calculate FFT only when fft_in are not all zero - for (; i < std::min((n_samples / fft_step) + 1, mel.n_len); i += n_threads) { - const int offset = i * fft_step; + for (; i < std::min((n_samples - frame_size) / frame_step + 1, mel.n_len); i += n_threads) { + const int offset = i * frame_step; // apply Hanning window (~10% faster) - for (int j = 0; j < std::min(fft_size, n_samples - offset); j++) { + for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { fft_in[j] = hann[j] * samples[offset + j]; } // fill the rest with zeros - if (n_samples - offset < fft_size) { + if (n_samples - offset < frame_size) { std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); } @@ -2454,16 +2453,10 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector // Calculate modulus^2 of complex numbers // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. - for (int j = 0; j < fft_size; j++) { + for (int j = 0; j < frame_size; j++) { fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); } - // The frequency spectrum produced by real input data is symmetrical around the Nyquist frequency. 
- // This is where the actual issue lies - for (int j = 0; j < fft_size / 2; j++) { - fft_out[j] = (fft_out[fft_size - j - 1] + fft_out[j + 1]) / 2; - } - if (speed_up) { // scale down in the frequency domain results in a speed-up in the time domain for (int j = 0; j < n_fft - 1; j++) { @@ -2505,14 +2498,14 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector } } -// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124 +// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157 static bool log_mel_spectrogram( whisper_state & wstate, const float * samples, const int n_samples, const int /*sample_rate*/, - const int fft_size, - const int fft_step, + const int frame_size, + const int frame_step, const int n_mel, const int n_threads, const whisper_filters & filters, @@ -2520,53 +2513,149 @@ static bool log_mel_spectrogram( whisper_mel & mel) { const int64_t t_start_us = ggml_time_us(); - // Hanning window - std::vector hann; - hann.resize(fft_size); - for (int i = 0; i < fft_size; i++) { - // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html - // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 - hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size - 1))); - } - - mel.n_mel = n_mel; - mel.n_len = n_samples/fft_step; - mel.n_len_org = mel.n_len; - + // Hanning window (Hard-coded to eliminate difference) + // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html + // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 + std::vector hann = {0.0, 6.16908073425293e-05, 0.0002467334270477295, 0.0005550682544708252, + 0.000986635684967041, 0.0015413463115692139, 0.0022190213203430176, 0.0030195116996765137, + 0.003942638635635376, 0.004988163709640503, 0.006155818700790405, 0.007445335388183594, + 0.008856385946273804, 0.010388582944869995, 0.012041628360748291, 0.013815045356750488, + 0.01570841670036316, 0.01772129535675049, 
0.019853144884109497, 0.022103488445281982, + 0.02447172999382019, 0.026957333087921143, 0.029559612274169922, 0.03227800130844116, + 0.03511175513267517, 0.03806024789810181, 0.0411226749420166, 0.044298380613327026, + 0.04758647084236145, 0.05098623037338257, 0.05449673533439636, 0.058117181062698364, + 0.06184667348861694, 0.0656842589378357, 0.06962898373603821, 0.07367992401123047, + 0.0778360664844513, 0.08209633827209473, 0.08645972609519958, 0.09092515707015991, + 0.09549149870872498, 0.10015767812728882, 0.10492250323295593, 0.1097848117351532, + 0.11474338173866272, 0.11979702115058899, 0.12494447827339172, 0.13018447160720825, + 0.1355157196521759, 0.14093685150146484, 0.1464466154575348, 0.15204361081123352, + 0.1577264666557312, 0.16349375247955322, 0.16934409737586975, 0.1752760112285614, + 0.18128803372383118, 0.18737870454788208, 0.19354650378227234, 0.1997898817062378, + 0.20610737800598145, 0.21249738335609436, 0.21895831823349, 0.2254886031150818, + 0.23208662867546082, 0.23875075578689575, 0.24547931551933289, 0.2522706985473633, + 0.25912320613861084, 0.26603513956069946, 0.27300477027893066, 0.2800304591655731, + 0.2871103882789612, 0.29424285888671875, 0.30142611265182495, 0.30865830183029175, + 0.31593772768974304, 0.3232625722885132, 0.3306310474872589, 0.3380413055419922, + 0.34549152851104736, 0.352979838848114, 0.3605044484138489, 0.3680635094642639, + 0.37565508484840393, 0.38327735662460327, 0.3909284174442291, 0.39860638976097107, + 0.4063093662261963, 0.41403549909591675, 0.42178282141685486, 0.4295494258403778, + 0.43733343482017517, 0.44513291120529175, 0.45294591784477234, 0.46077051758766174, + 0.46860480308532715, 0.4764467775821686, 0.4842946231365204, 0.492146372795105, + 0.5, 0.5078536868095398, 0.515705406665802, 0.5235532522201538, + 0.5313953161239624, 0.5392295718193054, 0.5470541715621948, 0.5548672080039978, + 0.562666654586792, 0.5704506635665894, 0.5782172679901123, 0.5859646201133728, + 0.5936906933784485, 
0.6013936996459961, 0.609071671962738, 0.6167227625846863, + 0.6243450045585632, 0.6319366097450256, 0.6394955515861511, 0.6470202207565308, + 0.6545085310935974, 0.6619587540626526, 0.6693689823150635, 0.6767374277114868, + 0.6840623021125793, 0.691341757774353, 0.6985740065574646, 0.7057572603225708, + 0.7128896713256836, 0.719969630241394, 0.7269952893257141, 0.7339649796485901, + 0.7408769130706787, 0.7477294206619263, 0.7545207738876343, 0.761249303817749, + 0.7679134607315063, 0.774511456489563, 0.7810417413711548, 0.7875027060508728, + 0.7938927412033081, 0.800210177898407, 0.8064535856246948, 0.8126214146614075, + 0.8187121152877808, 0.8247240781784058, 0.8306560516357422, 0.8365063667297363, + 0.8422735929489136, 0.8479564785957336, 0.8535534143447876, 0.8590631484985352, + 0.8644843101501465, 0.8698155879974365, 0.8750555515289307, 0.8802030086517334, + 0.8852566480636597, 0.8902152180671692, 0.8950775265693665, 0.899842381477356, + 0.9045084714889526, 0.9090749025344849, 0.9135403037071228, 0.9179036617279053, + 0.9221639633178711, 0.9263200759887695, 0.9303710460662842, 0.9343158006668091, + 0.9381533861160278, 0.941882848739624, 0.945503294467926, 0.9490138292312622, + 0.9524135589599609, 0.9557017087936401, 0.9588773250579834, 0.961939811706543, + 0.9648882746696472, 0.9677220582962036, 0.9704403877258301, 0.9730427265167236, + 0.9755282998085022, 0.9778965711593628, 0.9801468849182129, 0.9822787046432495, + 0.9842916131019592, 0.9861849546432495, 0.9879584312438965, 0.9896113872528076, + 0.9911436438560486, 0.9925546646118164, 0.9938441514968872, 0.9950118064880371, + 0.996057391166687, 0.9969804883003235, 0.997780978679657, 0.9984586238861084, + 0.999013364315033, 0.9994449615478516, 0.9997532367706299, 0.9999383091926575, + 1.0, 0.9999383091926575, 0.9997532367706299, 0.9994449615478516, + 0.999013364315033, 0.9984586238861084, 0.997780978679657, 0.9969804286956787, + 0.9960573315620422, 0.9950118064880371, 0.9938441514968872, 0.9925546646118164, + 
0.9911435842514038, 0.9896113872528076, 0.9879583716392517, 0.9861849546432495, + 0.9842915534973145, 0.9822787046432495, 0.9801468253135681, 0.9778964519500732, + 0.9755282402038574, 0.9730426073074341, 0.9704403877258301, 0.9677219390869141, + 0.9648882150650024, 0.9619396924972534, 0.9588772654533386, 0.9557015895843506, + 0.9524134397506714, 0.9490137100219727, 0.9455032348632812, 0.9418827295303345, + 0.9381532669067383, 0.9343156814575195, 0.9303709268569946, 0.9263200759887695, + 0.9221639633178711, 0.9179036617279053, 0.913540244102478, 0.9090747833251953, + 0.9045084714889526, 0.8998422622680664, 0.8950774669647217, 0.8902151584625244, + 0.8852565884590149, 0.8802029490470886, 0.8750554919242859, 0.869815468788147, + 0.8644842505455017, 0.8590630888938904, 0.853553295135498, 0.8479562997817993, + 0.842273473739624, 0.836506187915802, 0.8306558728218079, 0.8247239589691162, + 0.8187118768692017, 0.8126212358474731, 0.8064534664154053, 0.8002099990844727, + 0.793892502784729, 0.7875025272369385, 0.7810416221618652, 0.7745113372802734, + 0.767913281917572, 0.7612491846084595, 0.7545205950737, 0.7477291822433472, + 0.7408767342567444, 0.7339648008346558, 0.7269951105117798, 0.7199694514274597, + 0.7128894925117493, 0.7057570219039917, 0.6985738277435303, 0.6913415789604187, + 0.684062123298645, 0.6767372488975525, 0.6693688035011292, 0.6619585752487183, + 0.6545083522796631, 0.6470199823379517, 0.6394953727722168, 0.6319363117218018, + 0.6243447661399841, 0.6167224645614624, 0.6090714335441589, 0.601393461227417, + 0.5936904549598694, 0.5859643220901489, 0.5782170295715332, 0.5704504251480103, + 0.5626664161682129, 0.5548669099807739, 0.5470539331436157, 0.5392293334007263, + 0.5313950181007385, 0.5235530138015747, 0.5157051682472229, 0.507853627204895, + 0.5, 0.4921463429927826, 0.484294593334198, 0.4764467477798462, + 0.46860471367836, 0.4607704281806946, 0.4529458284378052, 0.4451328217983246, + 0.437333345413208, 0.42954933643341064, 0.4217827320098877, 
0.4140354096889496, + 0.4063093066215515, 0.3986063003540039, 0.39092832803726196, 0.3832772672176361, + 0.37565499544143677, 0.36806342005729675, 0.3605043888092041, 0.35297977924346924, + 0.3454914391040802, 0.338041216135025, 0.33063095808029175, 0.3232625126838684, + 0.3159376382827759, 0.3086581826210022, 0.3014259934425354, 0.2942427396774292, + 0.28711026906967163, 0.2800303101539612, 0.2730046510696411, 0.2660350203514099, + 0.2591230869293213, 0.25227057933807373, 0.24547919631004333, 0.2387506067752838, + 0.23208650946617126, 0.22548848390579224, 0.21895819902420044, 0.2124972641468048, + 0.2061072587966919, 0.19978976249694824, 0.1935463547706604, 0.18737855553627014, + 0.18128788471221924, 0.17527586221694946, 0.1693439483642578, 0.16349363327026367, + 0.15772631764411926, 0.15204349160194397, 0.14644649624824524, 0.1409367322921753, + 0.13551557064056396, 0.1301843225955963, 0.12494435906410217, 0.11979690194129944, + 0.11474326252937317, 0.10978469252586365, 0.10492238402366638, 0.10015755891799927, + 0.09549137949943542, 0.09092503786087036, 0.08645960688591003, 0.08209621906280518, + 0.07783591747283936, 0.07367980480194092, 0.06962886452674866, 0.06568413972854614, + 0.06184655427932739, 0.0581170916557312, 0.0544966459274292, 0.05098611116409302, + 0.04758638143539429, 0.044298261404037476, 0.04112258553504944, 0.038060128688812256, + 0.03511166572570801, 0.03227788209915161, 0.02955952286720276, 0.02695724368095398, + 0.024471670389175415, 0.02210339903831482, 0.01985308527946472, 0.017721205949783325, + 0.015708357095718384, 0.0138150155544281, 0.012041598558425903, 0.010388582944869995, + 0.008856356143951416, 0.007445335388183594, 0.006155818700790405, 0.004988163709640503, + 0.003942638635635376, 0.0030195116996765137, 0.0022190213203430176, 0.0015413165092468262, + 0.000986635684967041, 0.0005550682544708252, 0.0002467334270477295, 6.16908073425293e-05}; + + // Calculate the length of padding + int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30; 
+ int64_t stage_2_pad = frame_size / 2; + + // Initialize a vector and copy data from C array to it. std::vector samples_padded; + samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2); + std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad); - // pad audio with at least one extra chunk of zeros - { - const int pad = (100*WHISPER_CHUNK_SIZE)/2; + // pad 30 seconds of zeros at the end of audio (48,000 samples) + std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_2_pad + stage_1_pad, 0); - if (mel.n_len % pad != 0) { - mel.n_len = (mel.n_len/pad + 1)*pad; - } - mel.n_len += pad; + // reflective pad 200 samples at the beginning of audio + std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin()); - samples_padded.resize(mel.n_len*fft_step); - memcpy(samples_padded.data(), samples, n_samples*sizeof(float)); - memset(samples_padded.data() + n_samples, 0, (mel.n_len*fft_step - n_samples)*sizeof(float)); + // reflective pad 200 samples at the end of audio + std::reverse_copy(samples + n_samples - stage_2_pad - 1, samples + n_samples - 1 , samples_padded.begin() + n_samples + stage_2_pad + stage_1_pad); - samples = samples_padded.data(); - } - mel.data.resize(mel.n_mel*mel.n_len); + mel.n_mel = n_mel; + // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936 + mel.n_len = 1 + (samples_padded.size() - frame_size) / frame_step; + // remove the last frame + mel.n_len -= 1; + mel.n_len_org = mel.n_len; + mel.data.resize(mel.n_mel * mel.n_len); - //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len); - //printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate); { std::vector workers(n_threads - 1); for (int iw = 0; iw < n_threads - 1; ++iw) { workers[iw] = std::thread( log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples, - n_samples, fft_size, fft_step, n_threads, + 
n_samples, frame_size, frame_step, n_threads, std::cref(filters), speed_up, std::ref(mel)); } // main thread - log_mel_spectrogram_worker_thread(0, hann, samples, n_samples, fft_size, fft_step, n_threads, filters, speed_up, mel); + log_mel_spectrogram_worker_thread(0, hann, samples, n_samples, frame_size, frame_step, n_threads, filters, speed_up, mel); for (int iw = 0; iw < n_threads - 1; ++iw) { workers[iw].join(); @@ -2580,7 +2669,6 @@ static bool log_mel_spectrogram( mmax = mel.data[i]; } } - //printf("%s: max = %f\n", __func__, mmax); mmax -= 8.0; @@ -2594,8 +2682,6 @@ static bool log_mel_spectrogram( wstate.t_mel_us += ggml_time_us() - t_start_us; - //printf("mel.n_len() = %d, divided by 1500: %f, n_samples / fft_step: %d\n", mel.n_len, mel.n_len / 1500.0, n_samples / fft_step); - return true; } From 715bf6180cdc016ab1882033fecda6292b334752 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Sat, 12 Aug 2023 16:17:12 +0800 Subject: [PATCH 11/21] Fix some bugs and add debug mode --- whisper.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 7f0ba8961fe..87a0962f145 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2626,21 +2626,16 @@ static bool log_mel_spectrogram( samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2); std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad); - // pad 30 seconds of zeros at the end of audio (48,000 samples) - std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_2_pad + stage_1_pad, 0); + // pad 30 seconds of zeros at the end of audio (480,000 samples) + reflective pad 200 samples at the end of audio + std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0); // reflective pad 200 samples at the beginning of audio std::reverse_copy(samples + 1, samples + 1 + 
stage_2_pad, samples_padded.begin()); - // reflective pad 200 samples at the end of audio - std::reverse_copy(samples + n_samples - stage_2_pad - 1, samples + n_samples - 1 , samples_padded.begin() + n_samples + stage_2_pad + stage_1_pad); - - mel.n_mel = n_mel; // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936 - mel.n_len = 1 + (samples_padded.size() - frame_size) / frame_step; - // remove the last frame - mel.n_len -= 1; + // Calculate number of frames + remove the last frame + mel.n_len = (samples_padded.size() - frame_size) / frame_step; mel.n_len_org = mel.n_len; mel.data.resize(mel.n_mel * mel.n_len); @@ -2682,6 +2677,15 @@ static bool log_mel_spectrogram( wstate.t_mel_us += ggml_time_us() - t_start_us; + // Debug log_mel_spectrogram + std::ofstream outFile("output.json"); + outFile << "["; + for (uint64_t i = 0; i < mel.data.size() - 1; i++) { + outFile << mel.data[i] << ", "; + } + outFile << mel.data[mel.data.size() - 1] << "]"; + outFile.close(); + return true; } From 3fe41d5e42458daf50a377f8af6e9130185df41a Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Sat, 12 Aug 2023 18:13:16 +0800 Subject: [PATCH 12/21] Fixed several bugs --- whisper.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 87a0962f145..5017b0a3795 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2427,7 +2427,7 @@ static void fft(const std::vector & in, std::vector & out) { } } -static void log_mel_spectrogram_worker_thread(int ith, const std::vector &hann, const float *samples, +static void log_mel_spectrogram_worker_thread(int ith, const std::vector &hann, const std::vector &samples, int n_samples, int frame_size, int frame_step, int n_threads, const whisper_filters &filters, bool speed_up, whisper_mel &mel) { std::vector fft_in(frame_size, 0.0); @@ -2436,7 +2436,7 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector int 
i = ith; // Calculate FFT only when fft_in are not all zero - for (; i < std::min((n_samples - frame_size) / frame_step + 1, mel.n_len); i += n_threads) { + for (; i < std::min((n_samples - frame_size) / frame_step + 3, mel.n_len); i += n_threads) { const int offset = i * frame_step; // apply Hanning window (~10% faster) @@ -2644,13 +2644,13 @@ static bool log_mel_spectrogram( std::vector workers(n_threads - 1); for (int iw = 0; iw < n_threads - 1; ++iw) { workers[iw] = std::thread( - log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples, - n_samples, frame_size, frame_step, n_threads, + log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples_padded, + n_samples + stage_2_pad, frame_size, frame_step, n_threads, std::cref(filters), speed_up, std::ref(mel)); } // main thread - log_mel_spectrogram_worker_thread(0, hann, samples, n_samples, frame_size, frame_step, n_threads, filters, speed_up, mel); + log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, speed_up, mel); for (int iw = 0; iw < n_threads - 1; ++iw) { workers[iw].join(); From 36b0df755b96d098d0aebcb385b9fed06ec42af4 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Sun, 13 Aug 2023 15:25:31 +0800 Subject: [PATCH 13/21] Temporarily disable speed-up mode and add debug mode. 
--- examples/main/main.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 4fbc3f69ad2..04c7bffc2d8 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -70,6 +70,7 @@ struct whisper_params { float logprob_thold = -1.00f; bool speed_up = false; + bool debug_mode = false; bool translate = false; bool detect_language = false; bool diarize = false; @@ -134,7 +135,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); } else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); } else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); } - else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } + // else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } + else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-di" || arg == "--diarize") { params.diarize = true; } else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; } @@ -188,7 +190,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold); fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold); fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold); - fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); + // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? 
"true" : "false"); + fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode\n", params.debug_mode ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false"); @@ -893,6 +896,7 @@ int main(int argc, char ** argv) { wparams.split_on_word = params.split_on_word; wparams.speed_up = params.speed_up; + wparams.debug_mode = params.debug_mode; wparams.tdrz_enable = params.tinydiarize; // [TDRZ] From 444b59a64724c890b250a0e3be233135fbf3e8c7 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Sun, 13 Aug 2023 15:26:06 +0800 Subject: [PATCH 14/21] Add debug mode --- whisper.h | 1 + 1 file changed, 1 insertion(+) diff --git a/whisper.h b/whisper.h index e7c1a1259f5..e6a093d76d6 100644 --- a/whisper.h +++ b/whisper.h @@ -375,6 +375,7 @@ extern "C" { // [EXPERIMENTAL] speed-up techniques // note: these can significantly reduce the quality of the output bool speed_up; // speed-up the audio by 2x using Phase Vocoder + bool debug_mode; // TODO int audio_ctx; // overwrite the audio context size (0 = use default) // [EXPERIMENTAL] [TDRZ] tinydiarize From 308f4900abc766dd5175c7c42c8e430baac82049 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Sun, 13 Aug 2023 15:28:07 +0800 Subject: [PATCH 15/21] Disable speed-up mode and add debug mode --- whisper.cpp | 75 ++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 5017b0a3795..09db20617ca 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2429,14 +2429,14 @@ static void fft(const std::vector & in, std::vector & out) { static void 
log_mel_spectrogram_worker_thread(int ith, const std::vector &hann, const std::vector &samples, int n_samples, int frame_size, int frame_step, int n_threads, - const whisper_filters &filters, bool speed_up, whisper_mel &mel) { + const whisper_filters &filters, whisper_mel &mel) { std::vector fft_in(frame_size, 0.0); std::vector fft_out(2 * frame_step); - int n_fft = 1 + (speed_up ? frame_size / 4 : frame_size / 2); + int n_fft = 1 + (frame_size / 2); int i = ith; // Calculate FFT only when fft_in are not all zero - for (; i < std::min((n_samples - frame_size) / frame_step + 3, mel.n_len); i += n_threads) { + for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) { const int offset = i * frame_step; // apply Hanning window (~10% faster) @@ -2457,13 +2457,6 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); } - if (speed_up) { - // scale down in the frequency domain results in a speed-up in the time domain - for (int j = 0; j < n_fft - 1; j++) { - fft_out[j] = (fft_out[2 * j] + fft_out[2 * j + 1]) / 2; - } - } - // mel spectrogram for (int j = 0; j < mel.n_mel; j++) { double sum = 0.0; @@ -2509,7 +2502,7 @@ static bool log_mel_spectrogram( const int n_mel, const int n_threads, const whisper_filters & filters, - const bool speed_up, + const bool debug, whisper_mel & mel) { const int64_t t_start_us = ggml_time_us(); @@ -2646,11 +2639,11 @@ static bool log_mel_spectrogram( workers[iw] = std::thread( log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, - std::cref(filters), speed_up, std::ref(mel)); + std::cref(filters), std::ref(mel)); } // main thread - log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, speed_up, mel); + log_mel_spectrogram_worker_thread(0, hann, 
samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel); for (int iw = 0; iw < n_threads - 1; ++iw) { workers[iw].join(); @@ -2677,14 +2670,16 @@ static bool log_mel_spectrogram( wstate.t_mel_us += ggml_time_us() - t_start_us; - // Debug log_mel_spectrogram - std::ofstream outFile("output.json"); - outFile << "["; - for (uint64_t i = 0; i < mel.data.size() - 1; i++) { - outFile << mel.data[i] << ", "; + // Dump log_mel_spectrogram + if (debug) { + std::ofstream outFile("log_mel_spectrogram.json"); + outFile << "["; + for (uint64_t i = 0; i < mel.data.size() - 1; i++) { + outFile << mel.data[i] << ", "; + } + outFile << mel.data[mel.data.size() - 1] << "]"; + outFile.close(); } - outFile << mel.data[mel.data.size() - 1] << "]"; - outFile.close(); return true; } @@ -3102,8 +3097,8 @@ void whisper_free_params(struct whisper_full_params * params) { } } -int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) { - if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) { +int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads, bool debug) { + if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, debug, state->mel)) { log("%s: failed to compute mel spectrogram\n", __func__); return -1; } @@ -3111,13 +3106,13 @@ int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_s return 0; } -int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) { - return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads); +int whisper_pcm_to_mel(struct whisper_context * 
ctx, const float * samples, int n_samples, int n_threads, bool debug) { + return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads, debug); } -// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 -int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) { - if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, state->mel)) { +// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (Phase Vocoder without phase lock is not good) +int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads, bool debug) { + if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, debug, state->mel)) { log("%s: failed to compute mel spectrogram\n", __func__); return -1; } @@ -3125,11 +3120,20 @@ int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, st return 0; } -// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 -int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) { - return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads); +// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (Phase Vocoder without phase lock is not good) +int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads, bool debug) { + return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads, debug); } +// same as 
whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2 +// TODO + +// same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2 +// TODO + +// same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2 +// TODO + int whisper_set_mel_with_state( struct whisper_context * /*ctx*/, struct whisper_state * state, @@ -3581,6 +3585,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str /*.max_tokens =*/ 0, /*.speed_up =*/ false, + /*.debug_mode =*/ false, /*.audio_ctx =*/ 0, /*.tdrz_enable =*/ false, @@ -4145,12 +4150,12 @@ int whisper_full_with_state( // compute log mel spectrogram if (params.speed_up) { - if (whisper_pcm_to_mel_phase_vocoder_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) { - log("%s: failed to compute log mel spectrogram\n", __func__); - return -1; - } + // Temporarily disable speed_up mode + // TODO: Replace PV with more advanced algorithm + log("%s: failed to compute log mel spectrogram\n", __func__); + return -1; } else { - if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) { + if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads, params.debug_mode) != 0) { log("%s: failed to compute log mel spectrogram\n", __func__); return -2; } From 252f80768a3a87187ad7ccd0477368e6d315ec01 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Sun, 13 Aug 2023 16:33:39 +0800 Subject: [PATCH 16/21] Fix CI error (#1) * Fix error * Fix error --- examples/main/main.cpp | 2 +- whisper.cpp | 2 +- whisper.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 04c7bffc2d8..39ad3540bf9 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -191,7 +191,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] 
entropy threshold for decoder fail\n", params.entropy_thold); fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold); // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); - fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode\n", params.debug_mode ? "true" : "false"); + fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. Dump log_mel)\n", params.debug_mode ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false"); diff --git a/whisper.cpp b/whisper.cpp index 09db20617ca..cef652eb145 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -3120,7 +3120,7 @@ int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, st return 0; } -// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (Phase Vocoder without phase lock is not good) +// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good) int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads, bool debug) { return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads, debug); } diff --git a/whisper.h b/whisper.h index e6a093d76d6..3b1b2431041 100644 --- a/whisper.h +++ b/whisper.h @@ -375,7 +375,7 @@ extern "C" { // [EXPERIMENTAL] speed-up techniques // note: these can significantly reduce the quality of the output bool speed_up; // speed-up the audio by 2x using Phase Vocoder - bool debug_mode; // TODO + bool debug_mode; // enable 
debug_mode provides extra info (eg. Dump log_mel) int audio_ctx; // overwrite the audio context size (0 = use default) // [EXPERIMENTAL] [TDRZ] tinydiarize From 0a5f4355203b9a6d933822c85fb92c48d58bcb89 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Sun, 13 Aug 2023 20:51:28 +0800 Subject: [PATCH 17/21] Fixed several bugs including [BLANK_AUDIO] problem --- whisper.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index cef652eb145..42dc3405438 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2629,7 +2629,8 @@ static bool log_mel_spectrogram( // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936 // Calculate number of frames + remove the last frame mel.n_len = (samples_padded.size() - frame_size) / frame_step; - mel.n_len_org = mel.n_len; + // Calculate semi-padded sample length to ensure compatibility + mel.n_len_org = 1 + (n_samples + stage_2_pad) / frame_step; mel.data.resize(mel.n_mel * mel.n_len); @@ -3106,8 +3107,8 @@ int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_s return 0; } -int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads, bool debug) { - return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads, debug); +int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) { + return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads, false); } // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (Phase Vocoder without phase lock is not good) @@ -4189,8 +4190,8 @@ int whisper_full_with_state( const int seek_start = params.offset_ms/10; const int seek_end = params.duration_ms == 0 ? 
whisper_n_len_from_state(state) : seek_start + params.duration_ms/10; - // if length of spectrogram is less than 1s (100 samples), then return - // basically don't process anything that is less than 1s + // if length of spectrogram is less than 1.0s (100 frames), then return + // basically don't process anything that is less than 1.0s // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39 if (seek_end < seek_start + (params.speed_up ? 50 : 100)) { return 0; From 65fd0e1c72b627d6cbded6438c920906b87c6f84 Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Sun, 13 Aug 2023 21:44:33 +0800 Subject: [PATCH 18/21] Remove Hard-coded hann window --- whisper.cpp | 118 +++++----------------------------------------------- 1 file changed, 11 insertions(+), 107 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 42dc3405438..e72da2dc70a 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2361,8 +2361,8 @@ static void dft(const std::vector & in, std::vector & out) { for (int n = 0; n < N; n++) { float angle = 2*M_PI*k*n/N; - re += in[n]*cos(angle); - im -= in[n]*sin(angle); + re += in[n]*cosf(angle); + im -= in[n]*sinf(angle); } out[k*2 + 0] = re; @@ -2413,8 +2413,8 @@ static void fft(const std::vector & in, std::vector & out) { for (int k = 0; k < N/2; k++) { float theta = 2*M_PI*k/N; - float re = cos(theta); - float im = -sin(theta); + float re = cosf(theta); + float im = -sinf(theta); float re_odd = odd_fft[2*k + 0]; float im_odd = odd_fft[2*k + 1]; @@ -2506,109 +2506,13 @@ static bool log_mel_spectrogram( whisper_mel & mel) { const int64_t t_start_us = ggml_time_us(); - // Hanning window (Hard-coded to eliminate difference) + // Hanning window (Use cosf to eliminate difference) // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 - std::vector hann = {0.0, 6.16908073425293e-05, 0.0002467334270477295, 0.0005550682544708252, - 
0.000986635684967041, 0.0015413463115692139, 0.0022190213203430176, 0.0030195116996765137, - 0.003942638635635376, 0.004988163709640503, 0.006155818700790405, 0.007445335388183594, - 0.008856385946273804, 0.010388582944869995, 0.012041628360748291, 0.013815045356750488, - 0.01570841670036316, 0.01772129535675049, 0.019853144884109497, 0.022103488445281982, - 0.02447172999382019, 0.026957333087921143, 0.029559612274169922, 0.03227800130844116, - 0.03511175513267517, 0.03806024789810181, 0.0411226749420166, 0.044298380613327026, - 0.04758647084236145, 0.05098623037338257, 0.05449673533439636, 0.058117181062698364, - 0.06184667348861694, 0.0656842589378357, 0.06962898373603821, 0.07367992401123047, - 0.0778360664844513, 0.08209633827209473, 0.08645972609519958, 0.09092515707015991, - 0.09549149870872498, 0.10015767812728882, 0.10492250323295593, 0.1097848117351532, - 0.11474338173866272, 0.11979702115058899, 0.12494447827339172, 0.13018447160720825, - 0.1355157196521759, 0.14093685150146484, 0.1464466154575348, 0.15204361081123352, - 0.1577264666557312, 0.16349375247955322, 0.16934409737586975, 0.1752760112285614, - 0.18128803372383118, 0.18737870454788208, 0.19354650378227234, 0.1997898817062378, - 0.20610737800598145, 0.21249738335609436, 0.21895831823349, 0.2254886031150818, - 0.23208662867546082, 0.23875075578689575, 0.24547931551933289, 0.2522706985473633, - 0.25912320613861084, 0.26603513956069946, 0.27300477027893066, 0.2800304591655731, - 0.2871103882789612, 0.29424285888671875, 0.30142611265182495, 0.30865830183029175, - 0.31593772768974304, 0.3232625722885132, 0.3306310474872589, 0.3380413055419922, - 0.34549152851104736, 0.352979838848114, 0.3605044484138489, 0.3680635094642639, - 0.37565508484840393, 0.38327735662460327, 0.3909284174442291, 0.39860638976097107, - 0.4063093662261963, 0.41403549909591675, 0.42178282141685486, 0.4295494258403778, - 0.43733343482017517, 0.44513291120529175, 0.45294591784477234, 0.46077051758766174, - 0.46860480308532715, 
0.4764467775821686, 0.4842946231365204, 0.492146372795105, - 0.5, 0.5078536868095398, 0.515705406665802, 0.5235532522201538, - 0.5313953161239624, 0.5392295718193054, 0.5470541715621948, 0.5548672080039978, - 0.562666654586792, 0.5704506635665894, 0.5782172679901123, 0.5859646201133728, - 0.5936906933784485, 0.6013936996459961, 0.609071671962738, 0.6167227625846863, - 0.6243450045585632, 0.6319366097450256, 0.6394955515861511, 0.6470202207565308, - 0.6545085310935974, 0.6619587540626526, 0.6693689823150635, 0.6767374277114868, - 0.6840623021125793, 0.691341757774353, 0.6985740065574646, 0.7057572603225708, - 0.7128896713256836, 0.719969630241394, 0.7269952893257141, 0.7339649796485901, - 0.7408769130706787, 0.7477294206619263, 0.7545207738876343, 0.761249303817749, - 0.7679134607315063, 0.774511456489563, 0.7810417413711548, 0.7875027060508728, - 0.7938927412033081, 0.800210177898407, 0.8064535856246948, 0.8126214146614075, - 0.8187121152877808, 0.8247240781784058, 0.8306560516357422, 0.8365063667297363, - 0.8422735929489136, 0.8479564785957336, 0.8535534143447876, 0.8590631484985352, - 0.8644843101501465, 0.8698155879974365, 0.8750555515289307, 0.8802030086517334, - 0.8852566480636597, 0.8902152180671692, 0.8950775265693665, 0.899842381477356, - 0.9045084714889526, 0.9090749025344849, 0.9135403037071228, 0.9179036617279053, - 0.9221639633178711, 0.9263200759887695, 0.9303710460662842, 0.9343158006668091, - 0.9381533861160278, 0.941882848739624, 0.945503294467926, 0.9490138292312622, - 0.9524135589599609, 0.9557017087936401, 0.9588773250579834, 0.961939811706543, - 0.9648882746696472, 0.9677220582962036, 0.9704403877258301, 0.9730427265167236, - 0.9755282998085022, 0.9778965711593628, 0.9801468849182129, 0.9822787046432495, - 0.9842916131019592, 0.9861849546432495, 0.9879584312438965, 0.9896113872528076, - 0.9911436438560486, 0.9925546646118164, 0.9938441514968872, 0.9950118064880371, - 0.996057391166687, 0.9969804883003235, 0.997780978679657, 0.9984586238861084, - 
0.999013364315033, 0.9994449615478516, 0.9997532367706299, 0.9999383091926575, - 1.0, 0.9999383091926575, 0.9997532367706299, 0.9994449615478516, - 0.999013364315033, 0.9984586238861084, 0.997780978679657, 0.9969804286956787, - 0.9960573315620422, 0.9950118064880371, 0.9938441514968872, 0.9925546646118164, - 0.9911435842514038, 0.9896113872528076, 0.9879583716392517, 0.9861849546432495, - 0.9842915534973145, 0.9822787046432495, 0.9801468253135681, 0.9778964519500732, - 0.9755282402038574, 0.9730426073074341, 0.9704403877258301, 0.9677219390869141, - 0.9648882150650024, 0.9619396924972534, 0.9588772654533386, 0.9557015895843506, - 0.9524134397506714, 0.9490137100219727, 0.9455032348632812, 0.9418827295303345, - 0.9381532669067383, 0.9343156814575195, 0.9303709268569946, 0.9263200759887695, - 0.9221639633178711, 0.9179036617279053, 0.913540244102478, 0.9090747833251953, - 0.9045084714889526, 0.8998422622680664, 0.8950774669647217, 0.8902151584625244, - 0.8852565884590149, 0.8802029490470886, 0.8750554919242859, 0.869815468788147, - 0.8644842505455017, 0.8590630888938904, 0.853553295135498, 0.8479562997817993, - 0.842273473739624, 0.836506187915802, 0.8306558728218079, 0.8247239589691162, - 0.8187118768692017, 0.8126212358474731, 0.8064534664154053, 0.8002099990844727, - 0.793892502784729, 0.7875025272369385, 0.7810416221618652, 0.7745113372802734, - 0.767913281917572, 0.7612491846084595, 0.7545205950737, 0.7477291822433472, - 0.7408767342567444, 0.7339648008346558, 0.7269951105117798, 0.7199694514274597, - 0.7128894925117493, 0.7057570219039917, 0.6985738277435303, 0.6913415789604187, - 0.684062123298645, 0.6767372488975525, 0.6693688035011292, 0.6619585752487183, - 0.6545083522796631, 0.6470199823379517, 0.6394953727722168, 0.6319363117218018, - 0.6243447661399841, 0.6167224645614624, 0.6090714335441589, 0.601393461227417, - 0.5936904549598694, 0.5859643220901489, 0.5782170295715332, 0.5704504251480103, - 0.5626664161682129, 0.5548669099807739, 0.5470539331436157, 
0.5392293334007263, - 0.5313950181007385, 0.5235530138015747, 0.5157051682472229, 0.507853627204895, - 0.5, 0.4921463429927826, 0.484294593334198, 0.4764467477798462, - 0.46860471367836, 0.4607704281806946, 0.4529458284378052, 0.4451328217983246, - 0.437333345413208, 0.42954933643341064, 0.4217827320098877, 0.4140354096889496, - 0.4063093066215515, 0.3986063003540039, 0.39092832803726196, 0.3832772672176361, - 0.37565499544143677, 0.36806342005729675, 0.3605043888092041, 0.35297977924346924, - 0.3454914391040802, 0.338041216135025, 0.33063095808029175, 0.3232625126838684, - 0.3159376382827759, 0.3086581826210022, 0.3014259934425354, 0.2942427396774292, - 0.28711026906967163, 0.2800303101539612, 0.2730046510696411, 0.2660350203514099, - 0.2591230869293213, 0.25227057933807373, 0.24547919631004333, 0.2387506067752838, - 0.23208650946617126, 0.22548848390579224, 0.21895819902420044, 0.2124972641468048, - 0.2061072587966919, 0.19978976249694824, 0.1935463547706604, 0.18737855553627014, - 0.18128788471221924, 0.17527586221694946, 0.1693439483642578, 0.16349363327026367, - 0.15772631764411926, 0.15204349160194397, 0.14644649624824524, 0.1409367322921753, - 0.13551557064056396, 0.1301843225955963, 0.12494435906410217, 0.11979690194129944, - 0.11474326252937317, 0.10978469252586365, 0.10492238402366638, 0.10015755891799927, - 0.09549137949943542, 0.09092503786087036, 0.08645960688591003, 0.08209621906280518, - 0.07783591747283936, 0.07367980480194092, 0.06962886452674866, 0.06568413972854614, - 0.06184655427932739, 0.0581170916557312, 0.0544966459274292, 0.05098611116409302, - 0.04758638143539429, 0.044298261404037476, 0.04112258553504944, 0.038060128688812256, - 0.03511166572570801, 0.03227788209915161, 0.02955952286720276, 0.02695724368095398, - 0.024471670389175415, 0.02210339903831482, 0.01985308527946472, 0.017721205949783325, - 0.015708357095718384, 0.0138150155544281, 0.012041598558425903, 0.010388582944869995, - 0.008856356143951416, 0.007445335388183594, 
0.006155818700790405, 0.004988163709640503, - 0.003942638635635376, 0.0030195116996765137, 0.0022190213203430176, 0.0015413165092468262, - 0.000986635684967041, 0.0005550682544708252, 0.0002467334270477295, 6.16908073425293e-05}; + std::vector hann(frame_size); + for (int i = 0; i < frame_size; i++) { + hann[i] = 0.5*(1.0 - cosf((2.0*M_PI*i)/(frame_size))); + } // Calculate the length of padding int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30; @@ -3122,8 +3026,8 @@ int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, st } // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good) -int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads, bool debug) { - return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads, debug); +int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) { + return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads, false); } // same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2 From 386ef32184647aadeae582a46e4cf6a33d74b01f Mon Sep 17 00:00:00 2001 From: bobqianic <129547291+bobqianic@users.noreply.github.com> Date: Mon, 14 Aug 2023 16:58:10 +0800 Subject: [PATCH 19/21] Some Final Fix (#2) * Fix error * Fix error * Probably the last commit * Probably the last commit --- whisper.cpp | 38 ++++++++++++++++++++++++++------------ whisper.h | 6 ++++-- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index e72da2dc70a..65af7056074 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2427,11 +2427,27 @@ static void fft(const std::vector & in, std::vector & out) { } } +static bool hann_window(int length, bool periodic, std::vector &output) { + if (output.size() < length) { + output.resize(length); + } + int offset 
= -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + output[i] = 0.5*(1.0 - cosf((2.0*M_PI*i)/(length + offset))); + } + + return true; +} + static void log_mel_spectrogram_worker_thread(int ith, const std::vector &hann, const std::vector &samples, int n_samples, int frame_size, int frame_step, int n_threads, const whisper_filters &filters, whisper_mel &mel) { std::vector fft_in(frame_size, 0.0); std::vector fft_out(2 * frame_step); + // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist int n_fft = 1 + (frame_size / 2); int i = ith; @@ -2493,26 +2509,25 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157 static bool log_mel_spectrogram( - whisper_state & wstate, - const float * samples, + whisper_state & wstate, + const float * samples, const int n_samples, const int /*sample_rate*/, const int frame_size, const int frame_step, const int n_mel, const int n_threads, - const whisper_filters & filters, - const bool debug, - whisper_mel & mel) { + const whisper_filters & filters, + const bool debug, + whisper_mel & mel) { const int64_t t_start_us = ggml_time_us(); // Hanning window (Use cosf to eliminate difference) // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147 - std::vector hann(frame_size); - for (int i = 0; i < frame_size; i++) { - hann[i] = 0.5*(1.0 - cosf((2.0*M_PI*i)/(frame_size))); - } + std::vector hann; + hann_window(frame_size, true, hann); + // Calculate the length of padding int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30; @@ -2534,7 +2549,7 @@ static bool log_mel_spectrogram( // Calculate number of frames + remove the last frame mel.n_len = (samples_padded.size() - frame_size) / frame_step; // Calculate semi-padded sample length to ensure compatibility - mel.n_len_org = 1 + (n_samples + stage_2_pad) / frame_step; + 
mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step; mel.data.resize(mel.n_mel * mel.n_len); @@ -3015,7 +3030,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads, false); } -// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (Phase Vocoder without phase lock is not good) +// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good) int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads, bool debug) { if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, debug, state->mel)) { log("%s: failed to compute mel spectrogram\n", __func__); @@ -4055,7 +4070,6 @@ int whisper_full_with_state( // compute log mel spectrogram if (params.speed_up) { - // Temporarily disable speed_up mode // TODO: Replace PV with more advanced algorithm log("%s: failed to compute log mel spectrogram\n", __func__); return -1; diff --git a/whisper.h b/whisper.h index 3b1b2431041..b1ee6a981e5 100644 --- a/whisper.h +++ b/whisper.h @@ -147,7 +147,8 @@ extern "C" { struct whisper_state * state, const float * samples, int n_samples, - int n_threads); + int n_threads, + bool debug); // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. // The resulting spectrogram is stored inside the default state of the provided whisper context. @@ -163,7 +164,8 @@ extern "C" { struct whisper_state * state, const float * samples, int n_samples, - int n_threads); + int n_threads, + bool debug); // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context. 
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram. From 22d348cedbf074baf1dc67ae5bb9190e966544af Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 27 Aug 2023 19:20:20 +0300 Subject: [PATCH 20/21] whisper : minor coding style changes --- examples/main/main.cpp | 2 +- whisper.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 39ad3540bf9..993f9775e35 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -191,7 +191,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold); fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold); // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); - fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. Dump log_mel)\n", params.debug_mode ? "true" : "false"); + fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? 
"true" : "false"); diff --git a/whisper.cpp b/whisper.cpp index 65391c2e9c9..7f99431f302 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2445,7 +2445,7 @@ static void fft(const std::vector & in, std::vector & out) { } } -static bool hann_window(int length, bool periodic, std::vector &output) { +static bool hann_window(int length, bool periodic, std::vector & output) { if (output.size() < length) { output.resize(length); } @@ -2460,16 +2460,16 @@ static bool hann_window(int length, bool periodic, std::vector &output) { return true; } -static void log_mel_spectrogram_worker_thread(int ith, const std::vector &hann, const std::vector &samples, +static void log_mel_spectrogram_worker_thread(int ith, const std::vector & hann, const std::vector & samples, int n_samples, int frame_size, int frame_step, int n_threads, - const whisper_filters &filters, whisper_mel &mel) { + const whisper_filters & filters, whisper_mel & mel) { std::vector fft_in(frame_size, 0.0); std::vector fft_out(2 * frame_step); // make sure n_fft == 1 + (WHISPER_N_FFT / 2), bin_0 to bin_nyquist int n_fft = 1 + (frame_size / 2); int i = ith; - // Calculate FFT only when fft_in are not all zero + // calculate FFT only when fft_in are not all zero for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) { const int offset = i * frame_step; From 590a12e6ad75671da3d8b57f8fc312c82ed6f39a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 27 Aug 2023 19:49:34 +0300 Subject: [PATCH 21/21] whisper : remove debug from public API --- whisper.cpp | 14 +++++++------- whisper.h | 6 ++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/whisper.cpp b/whisper.cpp index 7f99431f302..cc436f94ed6 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -3036,8 +3036,8 @@ void whisper_free_params(struct whisper_full_params * params) { } } -int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads, bool 
debug) { - if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, debug, state->mel)) { +int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) { + if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) { log("%s: failed to compute mel spectrogram\n", __func__); return -1; } @@ -3046,12 +3046,12 @@ int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_s } int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) { - return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads, false); + return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads); } // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good) -int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads, bool debug) { - if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, debug, state->mel)) { +int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) { + if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) { log("%s: failed to compute mel spectrogram\n", __func__); return -1; } @@ -3061,7 +3061,7 @@ int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, st // 
same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good) int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) { - return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads, false); + return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads); } // same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2 @@ -4093,7 +4093,7 @@ int whisper_full_with_state( log("%s: failed to compute log mel spectrogram\n", __func__); return -1; } else { - if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads, params.debug_mode) != 0) { + if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) { log("%s: failed to compute log mel spectrogram\n", __func__); return -2; } diff --git a/whisper.h b/whisper.h index b1ee6a981e5..3b1b2431041 100644 --- a/whisper.h +++ b/whisper.h @@ -147,8 +147,7 @@ extern "C" { struct whisper_state * state, const float * samples, int n_samples, - int n_threads, - bool debug); + int n_threads); // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. // The resulting spectrogram is stored inside the default state of the provided whisper context. @@ -164,8 +163,7 @@ extern "C" { struct whisper_state * state, const float * samples, int n_samples, - int n_threads, - bool debug); + int n_threads); // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context. // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.