diff --git a/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_output.json b/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_output.json index ac6687c51..a375651ad 100644 --- a/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_output.json +++ b/test/unit/tts_pipeline/__snapshots__/test_tts_engine/test_mocked_synthesize_wave_output.json @@ -1,8 +1,8 @@ { - "hash": "MD5:0972d97e926609c90a9117490fe9813e", + "hash": "MD5:fe14492fa48cd7c1f14a755b527a4bdd", "shape": [ 1, - 14080, + 15872, 2 ] } diff --git a/test/unit/tts_pipeline/test_wave_synthesizer.py b/test/unit/tts_pipeline/test_wave_synthesizer.py index aca333936..b8ecb4f36 100644 --- a/test/unit/tts_pipeline/test_wave_synthesizer.py +++ b/test/unit/tts_pipeline/test_wave_synthesizer.py @@ -258,23 +258,26 @@ def test_query_to_decoder_feature() -> None: # Expects # frame_per_phoneme # Pre k o N pau h i h O Pst - true_frame_per_phoneme = [1, 1, 2, 2, 2, 1, 2, 2, 1, 3] + true_frame_per_phoneme = [2, 1, 2, 2, 2, 1, 2, 2, 1, 6] n_frame = sum(true_frame_per_phoneme) # phoneme - # Pr k o o N N pau pau h i i h h O Pt Pt Pt - frame_phoneme_idxs = [0, 23, 30, 30, 4, 4, 0, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0] + # Pr Pr k o o N N pau pau + frame_phoneme_idxs = [0, 0, 23, 30, 30, 4, 4, 0, 0] + # h i i h h O Pt Pt Pt Pt Pt Pt + frame_phoneme_idxs += [19, 21, 21, 19, 19, 5, 0, 0, 0, 0, 0, 0] + true_phoneme = np.zeros([n_frame, TRUE_NUM_PHONEME], dtype=np.float32) for frame_idx, phoneme_idx in enumerate(frame_phoneme_idxs): true_phoneme[frame_idx, phoneme_idx] = 1.0 # Pitch # paw ko N pau hi hO paw - # frame_per_vowel = [1, 3, 2, 1, 3, 3, 3] - # pau ko ko ko N N - true1_f0 = [0.0, 22.0, 22.0, 22.0, 22.0, 22.0] - # pau pau hi hi hi + # frame_per_vowel = [2, 3, 2, 2, 3, 3, 6] + # pau pau ko ko ko N N + true1_f0 = [0.0, 0.0, 22.0, 22.0, 22.0, 22.0, 22.0] + # pau pau hi hi hi true2_f0 = [0.0, 0.0, 28.0, 28.0, 28.0] - # hO hO hO paw paw paw - 
true3_f0 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] + # hO hO hO paw paw paw paw paw paw + true3_f0 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] true_f0 = np.array(true1_f0 + true2_f0 + true3_f0, dtype=np.float32) # Outputs diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py index 664e11b95..b374d5ad3 100644 --- a/voicevox_engine/tts_pipeline/tts_engine.py +++ b/voicevox_engine/tts_pipeline/tts_engine.py @@ -228,10 +228,10 @@ def _query_to_decoder_feature( moras = to_flatten_moras(query.accent_phrases) # 設定を適用する - moras = _apply_prepost_silence(moras, query) moras = _apply_pause_length(moras, query) moras = _apply_pause_length_scale(moras, query) moras = _apply_speed_scale(moras, query) + moras = _apply_prepost_silence(moras, query) moras = _apply_pitch_scale(moras, query) moras = _apply_intonation_scale(moras, query)