From ab9b8bf76a5b2206825cc04aa05358c3f06d59e4 Mon Sep 17 00:00:00 2001 From: tarepan Date: Fri, 15 Dec 2023 04:14:48 +0000 Subject: [PATCH 1/3] =?UTF-8?q?Refactor:=20decode=E5=89=8D=E5=87=A6?= =?UTF-8?q?=E7=90=86/=E5=BE=8C=E5=87=A6=E7=90=86=E3=81=AE=E9=96=A2?= =?UTF-8?q?=E6=95=B0=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_synthesis_engine.py | 23 +++------ voicevox_engine/tts_pipeline/tts_engine.py | 57 ++++++++++++++++------ 2 files changed, 48 insertions(+), 32 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 00730bb5d..65ee57b35 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -26,6 +26,7 @@ calc_frame_pitch, mora_phoneme_list, pre_process, + query_to_decoder_feature, split_mora, to_flatten_moras, to_flatten_phonemes, @@ -446,8 +447,8 @@ def test_calc_frame_phoneme(): assert numpy.array_equal(frame_phoneme, true_frame_phoneme) -def test_feat_to_framescale(): - """Test Mora/Phonemefeature-to-framescaleFeature pipeline.""" +def test_query_to_decoder_feature(): + """Test `query_to_decoder_feature`.""" # Inputs accent_phrases = [ AccentPhrase( @@ -485,9 +486,9 @@ def test_feat_to_framescale(): # phoneme # Pr k o o N N pau h i i h h O Pt Pt Pt frame_phoneme_idxs = [0, 23, 30, 30, 4, 4, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0] - true_frame_phoneme = numpy.zeros([n_frame, TRUE_NUM_PHONEME], dtype=numpy.float32) + true_phoneme = numpy.zeros([n_frame, TRUE_NUM_PHONEME], dtype=numpy.float32) for frame_idx, phoneme_idx in enumerate(frame_phoneme_idxs): - true_frame_phoneme[frame_idx, phoneme_idx] = 1.0 + true_phoneme[frame_idx, phoneme_idx] = 1.0 # Pitch # Pre ko N pau hi hO Pst true_f0 = [0.0, 200.0, 200.0, 0.0, 500.0, 0.0, 0.0] # mean 300 @@ -503,19 +504,9 @@ def test_feat_to_framescale(): true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32) # Outputs - flatten_moras = to_flatten_moras(query.accent_phrases) - flatten_moras = apply_prepost_silence(flatten_moras, query) - flatten_moras = apply_speed_scale(flatten_moras, query) - flatten_moras = apply_pitch_scale(flatten_moras, query) - flatten_moras = apply_intonation_scale(flatten_moras, query) - - phoneme_data_list = to_flatten_phonemes(flatten_moras) + phoneme, f0 = query_to_decoder_feature(query) - frame_per_phoneme = calc_frame_per_phoneme(flatten_moras) - f0 = calc_frame_pitch(flatten_moras) - frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) - - assert numpy.array_equal(frame_phoneme, true_frame_phoneme) + assert numpy.array_equal(phoneme, true_phoneme) assert numpy.array_equal(f0, true_f0) diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 372900c6f..5cf93d660 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -359,6 +359,43 @@ def apply_output_stereo(wave: ndarray, query: AudioQuery) -> ndarray: return wave +def query_to_decoder_feature(query: AudioQuery) -> tuple[ndarray, ndarray]: + """ + 音声合成用のクエリをデコーダー用特徴量へ変換する。 + Parameters + ---------- + query : AudioQuery + 音声合成クエリ + Returns + ------- + phoneme : ndarray + フレームごとの音素、shape=(Frame,) + f0 : ndarray + フレームごとの基本周波数、shape=(Frame,) + """ + flatten_moras = to_flatten_moras(query.accent_phrases) + flatten_moras = apply_prepost_silence(flatten_moras, query) + flatten_moras = apply_speed_scale(flatten_moras, query) + flatten_moras = apply_pitch_scale(flatten_moras, query) + flatten_moras = 
apply_intonation_scale(flatten_moras, query) + + phoneme_data_list = to_flatten_phonemes(flatten_moras) + + frame_per_phoneme = calc_frame_per_phoneme(flatten_moras) + f0 = calc_frame_pitch(flatten_moras) + phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) + + return phoneme, f0 + + +def wave_to_output(query: AudioQuery, wave: ndarray, sr_wave: int) -> ndarray: + """音声波形に音声合成用のクエリを適用して出力を生成する""" + wave = apply_volume_scale(wave, query) + wave = apply_output_sampling_rate(wave, sr_wave, query) + wave = apply_output_stereo(wave, query) + return wave + + class SynthesisEngine(SynthesisEngineBase): """音声合成器(core)の管理/実行/プロキシと音声合成フロー""" @@ -608,23 +645,13 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): スタイルID Returns ------- - wave : numpy.ndarray + output : numpy.ndarray 音声合成結果 """ # モデルがロードされていない場合はロードする self.initialize_style_id_synthesis(style_id, skip_reinit=True) - flatten_moras = to_flatten_moras(query.accent_phrases) - flatten_moras = apply_prepost_silence(flatten_moras, query) - flatten_moras = apply_speed_scale(flatten_moras, query) - flatten_moras = apply_pitch_scale(flatten_moras, query) - flatten_moras = apply_intonation_scale(flatten_moras, query) - - phoneme_data_list = to_flatten_phonemes(flatten_moras) - - frame_per_phoneme = calc_frame_per_phoneme(flatten_moras) - f0 = calc_frame_pitch(flatten_moras) - phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) + phoneme, f0 = query_to_decoder_feature(query) # 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する with self.mutex: @@ -637,8 +664,6 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): ) sr_wave = self.default_sampling_rate - wave = apply_volume_scale(wave, query) - wave = apply_output_sampling_rate(wave, sr_wave, query) - wave = apply_output_stereo(wave, query) + output = wave_to_output(query, wave, sr_wave) - return wave + return output From 34917fa0dd71b32064ab04b5df4a05d408fe0eaa Mon Sep 17 00:00:00 2001 From: tarepan Date: Fri, 15 Dec 2023 04:22:22 +0000 Subject: [PATCH 2/3] =?UTF-8?q?Fix:=20=E5=88=86=E5=89=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- voicevox_engine/tts_pipeline/tts_engine.py | 1 + 1 file changed, 1 insertion(+) diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 5cf93d660..7c830b4f2 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -374,6 +374,7 @@ def query_to_decoder_feature(query: AudioQuery) -> tuple[ndarray, ndarray]: フレームごとの基本周波数、shape=(Frame,) """ flatten_moras = to_flatten_moras(query.accent_phrases) + flatten_moras = apply_prepost_silence(flatten_moras, query) flatten_moras = apply_speed_scale(flatten_moras, query) flatten_moras = apply_pitch_scale(flatten_moras, query) From aa4a922dd373ece7175e4592257b6d3974c25199 Mon Sep 17 00:00:00 2001 From: tarepan Date: Sun, 17 Dec 2023 00:18:02 +0000 Subject: [PATCH 3/3] =?UTF-8?q?Fix:=20=E6=B3=A2=E5=BD=A2=E5=A4=89=E6=95=B0?= =?UTF-8?q?=E5=90=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- voicevox_engine/tts_pipeline/tts_engine.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 7c830b4f2..3b1227706 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -389,8 +389,8 @@ def query_to_decoder_feature(query: 
AudioQuery) -> tuple[ndarray, ndarray]: return phoneme, f0 -def wave_to_output(query: AudioQuery, wave: ndarray, sr_wave: int) -> ndarray: - """音声波形に音声合成用のクエリを適用して出力を生成する""" +def raw_wave_to_output_wave(query: AudioQuery, wave: ndarray, sr_wave: int) -> ndarray: + """生音声波形に音声合成用のクエリを適用して出力音声波形を生成する""" wave = apply_volume_scale(wave, query) wave = apply_output_sampling_rate(wave, sr_wave, query) wave = apply_output_stereo(wave, query) @@ -646,7 +646,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): スタイルID Returns ------- - output : numpy.ndarray + wave : numpy.ndarray 音声合成結果 """ # モデルがロードされていない場合はロードする @@ -656,15 +656,15 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): # 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する with self.mutex: - wave = self.core.decode_forward( + raw_wave = self.core.decode_forward( length=phoneme.shape[0], phoneme_size=phoneme.shape[1], f0=f0[:, numpy.newaxis], phoneme=phoneme, style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1), ) - sr_wave = self.default_sampling_rate + sr_raw_wave = self.default_sampling_rate - output = wave_to_output(query, wave, sr_wave) + wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave) - return output + return wave
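
For reference, after PATCH 3/3 the synthesis path reduces to two helper calls around the core inference step. The sketch below is a simplified restatement of `_synthesis_impl` as it reads at the end of this series, not additional code from the patches: `query`, `core`, and `style_id` are assumed to be supplied by the caller, and the engine object, mutex, and model (re)loading shown in the diff are omitted. The imports follow the module path used in the diff (`voicevox_engine/tts_pipeline/tts_engine.py`).

    # Simplified sketch of the post-refactor synthesis flow (mirrors _synthesis_impl
    # after PATCH 3/3). `query`, `core`, and `style_id` are assumed to be provided
    # by the caller; error handling, the mutex, and model loading are omitted.
    import numpy

    from voicevox_engine.tts_pipeline.tts_engine import (
        query_to_decoder_feature,
        raw_wave_to_output_wave,
    )


    def synthesize(query, core, style_id: int, default_sampling_rate: int):
        # AudioQuery -> frame-level decoder features (one-hot phonemes and F0)
        phoneme, f0 = query_to_decoder_feature(query)

        # Core inference: frame-level features -> raw waveform
        raw_wave = core.decode_forward(
            length=phoneme.shape[0],
            phoneme_size=phoneme.shape[1],
            f0=f0[:, numpy.newaxis],
            phoneme=phoneme,
            style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1),
        )

        # Apply the query's volume / output sampling rate / stereo settings
        return raw_wave_to_output_wave(query, raw_wave, default_sampling_rate)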