diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py
index 00730bb5d..65ee57b35 100644
--- a/test/test_synthesis_engine.py
+++ b/test/test_synthesis_engine.py
@@ -26,6 +26,7 @@
     calc_frame_pitch,
     mora_phoneme_list,
     pre_process,
+    query_to_decoder_feature,
     split_mora,
     to_flatten_moras,
     to_flatten_phonemes,
@@ -446,8 +447,8 @@ def test_calc_frame_phoneme():
     assert numpy.array_equal(frame_phoneme, true_frame_phoneme)
 
 
-def test_feat_to_framescale():
-    """Test Mora/Phonemefeature-to-framescaleFeature pipeline."""
+def test_query_to_decoder_feature():
+    """Test `query_to_decoder_feature`."""
     # Inputs
     accent_phrases = [
         AccentPhrase(
@@ -485,9 +486,9 @@ def test_feat_to_framescale():
     # phoneme
     # Pr k o o N N pau h i i h h O Pt Pt Pt
     frame_phoneme_idxs = [0, 23, 30, 30, 4, 4, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0]
-    true_frame_phoneme = numpy.zeros([n_frame, TRUE_NUM_PHONEME], dtype=numpy.float32)
+    true_phoneme = numpy.zeros([n_frame, TRUE_NUM_PHONEME], dtype=numpy.float32)
     for frame_idx, phoneme_idx in enumerate(frame_phoneme_idxs):
-        true_frame_phoneme[frame_idx, phoneme_idx] = 1.0
+        true_phoneme[frame_idx, phoneme_idx] = 1.0
     # Pitch
     # Pre ko N pau hi hO Pst
     true_f0 = [0.0, 200.0, 200.0, 0.0, 500.0, 0.0, 0.0]  # mean 300
@@ -503,19 +504,9 @@ def test_feat_to_framescale():
     true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32)
 
     # Outputs
-    flatten_moras = to_flatten_moras(query.accent_phrases)
-    flatten_moras = apply_prepost_silence(flatten_moras, query)
-    flatten_moras = apply_speed_scale(flatten_moras, query)
-    flatten_moras = apply_pitch_scale(flatten_moras, query)
-    flatten_moras = apply_intonation_scale(flatten_moras, query)
-
-    phoneme_data_list = to_flatten_phonemes(flatten_moras)
+    phoneme, f0 = query_to_decoder_feature(query)
 
-    frame_per_phoneme = calc_frame_per_phoneme(flatten_moras)
-    f0 = calc_frame_pitch(flatten_moras)
-    frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)
-
-    assert numpy.array_equal(frame_phoneme, true_frame_phoneme)
+    assert numpy.array_equal(phoneme, true_phoneme)
     assert numpy.array_equal(f0, true_f0)
 
 
diff --git a/voicevox_engine/tts_pipeline/tts_engine.py b/voicevox_engine/tts_pipeline/tts_engine.py
index 372900c6f..3b1227706 100644
--- a/voicevox_engine/tts_pipeline/tts_engine.py
+++ b/voicevox_engine/tts_pipeline/tts_engine.py
@@ -359,6 +359,44 @@ def apply_output_stereo(wave: ndarray, query: AudioQuery) -> ndarray:
     return wave
 
 
+def query_to_decoder_feature(query: AudioQuery) -> tuple[ndarray, ndarray]:
+    """
+    Convert an audio synthesis query into the features fed to the decoder.
+    Parameters
+    ----------
+    query : AudioQuery
+        Audio synthesis query
+    Returns
+    -------
+    phoneme : ndarray
+        Per-frame phoneme features, shape=(Frame, Phoneme)
+    f0 : ndarray
+        Per-frame fundamental frequency, shape=(Frame,)
+    """
+    flatten_moras = to_flatten_moras(query.accent_phrases)
+
+    flatten_moras = apply_prepost_silence(flatten_moras, query)
+    flatten_moras = apply_speed_scale(flatten_moras, query)
+    flatten_moras = apply_pitch_scale(flatten_moras, query)
+    flatten_moras = apply_intonation_scale(flatten_moras, query)
+
+    phoneme_data_list = to_flatten_phonemes(flatten_moras)
+
+    frame_per_phoneme = calc_frame_per_phoneme(flatten_moras)
+    f0 = calc_frame_pitch(flatten_moras)
+    phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)
+
+    return phoneme, f0
+
+
+def raw_wave_to_output_wave(query: AudioQuery, wave: ndarray, sr_wave: int) -> ndarray:
+    """Apply the audio synthesis query to a raw waveform to produce the output waveform."""
+    wave = apply_volume_scale(wave, query)
+    wave = apply_output_sampling_rate(wave, sr_wave, query)
+    wave = apply_output_stereo(wave, query)
+    return wave
+
+
 class SynthesisEngine(SynthesisEngineBase):
     """音声合成器(core)の管理/実行/プロキシと音声合成フロー"""
 
@@ -614,31 +652,19 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
         # モデルがロードされていない場合はロードする
         self.initialize_style_id_synthesis(style_id, skip_reinit=True)
 
-        flatten_moras = to_flatten_moras(query.accent_phrases)
-        flatten_moras = apply_prepost_silence(flatten_moras, query)
-        flatten_moras = apply_speed_scale(flatten_moras, query)
-        flatten_moras = apply_pitch_scale(flatten_moras, query)
-        flatten_moras = apply_intonation_scale(flatten_moras, query)
-
-        phoneme_data_list = to_flatten_phonemes(flatten_moras)
-
-        frame_per_phoneme = calc_frame_per_phoneme(flatten_moras)
-        f0 = calc_frame_pitch(flatten_moras)
-        phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)
+        phoneme, f0 = query_to_decoder_feature(query)
 
         # 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する
         with self.mutex:
-            wave = self.core.decode_forward(
+            raw_wave = self.core.decode_forward(
                 length=phoneme.shape[0],
                 phoneme_size=phoneme.shape[1],
                 f0=f0[:, numpy.newaxis],
                 phoneme=phoneme,
                 style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1),
             )
-        sr_wave = self.default_sampling_rate
+        sr_raw_wave = self.default_sampling_rate
 
-        wave = apply_volume_scale(wave, query)
-        wave = apply_output_sampling_rate(wave, sr_wave, query)
-        wave = apply_output_stereo(wave, query)
+        wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave)
 
         return wave