Skip to content

Commit

Permalink
整理: _synthesis_impl 前処理/後処理の関数化 (#873)
Browse files Browse the repository at this point in the history
  • Loading branch information
tarepan authored Dec 17, 2023
1 parent d0b8fff commit b6a0477
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 32 deletions.
23 changes: 7 additions & 16 deletions test/test_synthesis_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
calc_frame_pitch,
mora_phoneme_list,
pre_process,
query_to_decoder_feature,
split_mora,
to_flatten_moras,
to_flatten_phonemes,
Expand Down Expand Up @@ -446,8 +447,8 @@ def test_calc_frame_phoneme():
assert numpy.array_equal(frame_phoneme, true_frame_phoneme)


def test_feat_to_framescale():
"""Test Mora/Phonemefeature-to-framescaleFeature pipeline."""
def test_query_to_decoder_feature():
"""Test `query_to_decoder_feature`."""
# Inputs
accent_phrases = [
AccentPhrase(
Expand Down Expand Up @@ -484,9 +485,9 @@ def test_feat_to_framescale():
# phoneme
# Pr k o o N N pau h i i h h O Pt Pt Pt
frame_phoneme_idxs = [0, 23, 30, 30, 4, 4, 0, 19, 21, 21, 19, 19, 5, 0, 0, 0]
true_frame_phoneme = numpy.zeros([n_frame, TRUE_NUM_PHONEME], dtype=numpy.float32)
true_phoneme = numpy.zeros([n_frame, TRUE_NUM_PHONEME], dtype=numpy.float32)
for frame_idx, phoneme_idx in enumerate(frame_phoneme_idxs):
true_frame_phoneme[frame_idx, phoneme_idx] = 1.0
true_phoneme[frame_idx, phoneme_idx] = 1.0
# Pitch
# paw ko N pau hi hO paw
# frame_per_vowel = [1, 3, 2, 1, 3, 3, 3]
Expand All @@ -499,19 +500,9 @@ def test_feat_to_framescale():
true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32)

# Outputs
flatten_moras = to_flatten_moras(query.accent_phrases)
flatten_moras = apply_prepost_silence(flatten_moras, query)
flatten_moras = apply_speed_scale(flatten_moras, query)
flatten_moras = apply_pitch_scale(flatten_moras, query)
flatten_moras = apply_intonation_scale(flatten_moras, query)

phoneme_data_list = to_flatten_phonemes(flatten_moras)
phoneme, f0 = query_to_decoder_feature(query)

frame_per_phoneme = calc_frame_per_phoneme(flatten_moras)
f0 = calc_frame_pitch(flatten_moras)
frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)

assert numpy.array_equal(frame_phoneme, true_frame_phoneme)
assert numpy.array_equal(phoneme, true_phoneme)
assert numpy.array_equal(f0, true_f0)


Expand Down
58 changes: 42 additions & 16 deletions voicevox_engine/tts_pipeline/tts_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,44 @@ def apply_output_stereo(wave: ndarray, query: AudioQuery) -> ndarray:
return wave


def query_to_decoder_feature(query: AudioQuery) -> tuple[ndarray, ndarray]:
    """
    Convert a speech-synthesis query into the features fed to the decoder.

    Parameters
    ----------
    query : AudioQuery
        Speech-synthesis query.

    Returns
    -------
    phoneme : ndarray
        Frame-wise phoneme feature, one row per frame.
        NOTE(review): presumably 2-D, shape=(Frame, Phoneme), since
        consumers read both ``shape[0]`` and ``shape[1]`` - confirm
        against ``calc_frame_phoneme``.
    f0 : ndarray
        Frame-wise fundamental frequency, shape=(Frame,)
    """
    # Flatten the accent phrases into moras, then apply every query-driven
    # adjustment. The order of the steps is kept exactly as listed.
    moras = to_flatten_moras(query.accent_phrases)
    for adjust in (
        apply_prepost_silence,
        apply_speed_scale,
        apply_pitch_scale,
        apply_intonation_scale,
    ):
        moras = adjust(moras, query)

    phonemes = to_flatten_phonemes(moras)

    # Expand the mora/phoneme-level values to frame-level arrays.
    frames_per_phoneme = calc_frame_per_phoneme(moras)
    f0 = calc_frame_pitch(moras)
    phoneme = calc_frame_phoneme(phonemes, frames_per_phoneme)

    return phoneme, f0


def raw_wave_to_output_wave(query: AudioQuery, wave: ndarray, sr_wave: int) -> ndarray:
    """
    Apply the speech-synthesis query to a raw waveform to get the output waveform.

    Parameters
    ----------
    query : AudioQuery
        Speech-synthesis query passed to every post-processing step.
    wave : ndarray
        Raw waveform to post-process.
    sr_wave : int
        Sampling rate of ``wave``, forwarded to the sampling-rate step.

    Returns
    -------
    ndarray
        Waveform after the volume-scale, output-sampling-rate and
        output-stereo steps have been applied, in that order.
    """
    scaled = apply_volume_scale(wave, query)
    resampled = apply_output_sampling_rate(scaled, sr_wave, query)
    return apply_output_stereo(resampled, query)


class SynthesisEngine(SynthesisEngineBase):
"""音声合成器(core)の管理/実行/プロキシと音声合成フロー"""

Expand Down Expand Up @@ -614,31 +652,19 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
# モデルがロードされていない場合はロードする
self.initialize_style_id_synthesis(style_id, skip_reinit=True)

flatten_moras = to_flatten_moras(query.accent_phrases)
flatten_moras = apply_prepost_silence(flatten_moras, query)
flatten_moras = apply_speed_scale(flatten_moras, query)
flatten_moras = apply_pitch_scale(flatten_moras, query)
flatten_moras = apply_intonation_scale(flatten_moras, query)

phoneme_data_list = to_flatten_phonemes(flatten_moras)

frame_per_phoneme = calc_frame_per_phoneme(flatten_moras)
f0 = calc_frame_pitch(flatten_moras)
phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)
phoneme, f0 = query_to_decoder_feature(query)

# 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する
with self.mutex:
wave = self.core.decode_forward(
raw_wave = self.core.decode_forward(
length=phoneme.shape[0],
phoneme_size=phoneme.shape[1],
f0=f0[:, numpy.newaxis],
phoneme=phoneme,
style_id=numpy.array(style_id, dtype=numpy.int64).reshape(-1),
)
sr_wave = self.default_sampling_rate
sr_raw_wave = self.default_sampling_rate

wave = apply_volume_scale(wave, query)
wave = apply_output_sampling_rate(wave, sr_wave, query)
wave = apply_output_stereo(wave, query)
wave = raw_wave_to_output_wave(query, raw_wave, sr_raw_wave)

return wave

0 comments on commit b6a0477

Please sign in to comment.