From 408d5d470494a09658ad437631c429a416027b7e Mon Sep 17 00:00:00 2001 From: tarepan Date: Sat, 9 Dec 2023 19:21:12 +0000 Subject: [PATCH] =?UTF-8?q?Refactor:=20`frame=5Fper=5Fmora`=20=E3=81=AB?= =?UTF-8?q?=E3=82=88=E3=82=8B=E7=BD=AE=E3=81=8D=E6=8F=9B=E3=81=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/test_synthesis_engine.py | 48 +++++++++++++------ .../synthesis_engine/synthesis_engine.py | 43 ++++++++++------- 2 files changed, 60 insertions(+), 31 deletions(-) diff --git a/test/test_synthesis_engine.py b/test/test_synthesis_engine.py index 9ff7fb563..eee0cae66 100644 --- a/test/test_synthesis_engine.py +++ b/test/test_synthesis_engine.py @@ -20,6 +20,7 @@ apply_prepost_silence, apply_speed_scale, apply_volume_scale, + calc_frame_per_mora, calc_frame_per_phoneme, calc_frame_phoneme, calc_frame_pitch, @@ -353,24 +354,43 @@ def test_calc_frame_per_phoneme(): assert numpy.array_equal(frame_per_phoneme, true_frame_per_phoneme) +def test_calc_frame_per_mora(): + """Test `calc_frame_per_mora`.""" + # Inputs + moras = [ + _gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame] + _gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0), + _gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0), + _gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0), + _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0), + _gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0), + _gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0), + ] + + # Expects + # Pre ko N pau hi hO Pst + true_frame_per_mora = [2, 6, 4, 2, 6, 6, 6] + true_frame_per_mora = numpy.array(true_frame_per_mora, dtype=numpy.int32) + + # Outputs + frame_per_phoneme = numpy.array(list(map(calc_frame_per_mora, moras))) + + assert numpy.array_equal(frame_per_phoneme, true_frame_per_mora) + + def test_calc_frame_pitch(): """Test `test_calc_frame_pitch`.""" # Inputs query = _gen_query(pitchScale=2.0, intonationScale=0.5) moras = [ - _gen_mora(" ", None, None, " ", 0.0, 0.0), - _gen_mora("コ", "k", 0.0, "o", 0.0, 50.0), - _gen_mora("ン", None, None, "N", 0.0, 50.0), - _gen_mora("、", None, None, "pau", 0.0, 0.0), - _gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0), - _gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0), - _gen_mora(" ", None, None, " ", 0.0, 0.0), + _gen_mora(" ", None, None, " ", 1 * 0.01067, 0.0), + _gen_mora("コ", "k", 1 * 0.01067, "o", 2 * 0.01067, 50.0), + _gen_mora("ン", None, None, "N", 2 * 0.01067, 50.0), + _gen_mora("、", None, None, "pau", 1 * 0.01067, 0.0), + _gen_mora("ヒ", "h", 1 * 0.01067, "i", 2 * 0.01067, 125.0), + _gen_mora("ホ", "h", 2 * 0.01067, "O", 1 * 0.01067, 0.0), + _gen_mora(" ", None, None, " ", 3 * 0.01067, 0.0), ] - phoneme_str = "pau k o N pau h i h O pau" - phonemes = [OjtPhoneme(p) for p in phoneme_str.split()] - # Pre k o N pau h i h O Pst - frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3] - frame_per_phoneme = numpy.array(frame_per_phoneme, dtype=numpy.int32) # Expects - x4 value scaled -> mean=300 var x0.5 intonation scaling # pau ko ko ko N N @@ -382,7 +402,7 @@ def test_calc_frame_pitch(): true_f0 = numpy.array(true1_f0 + true2_f0 + true3_f0, dtype=numpy.float32) # Outputs - f0 = calc_frame_pitch(query, moras, phonemes, frame_per_phoneme) + f0 = calc_frame_pitch(query, moras) assert numpy.array_equal(f0, true_f0) @@ -461,7 +481,7 @@ def test_feat_to_framescale(): # Outputs flatten_moras = apply_prepost_silence(flatten_moras, query) frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras) - f0 = calc_frame_pitch(query, flatten_moras, phoneme_data_list, frame_per_phoneme) + f0 = calc_frame_pitch(query, flatten_moras) frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) assert numpy.array_equal(frame_phoneme, true_frame_phoneme) diff --git a/voicevox_engine/synthesis_engine/synthesis_engine.py b/voicevox_engine/synthesis_engine/synthesis_engine.py index 9fa12d3a5..d36d9a407 100644 --- a/voicevox_engine/synthesis_engine/synthesis_engine.py +++ b/voicevox_engine/synthesis_engine/synthesis_engine.py @@ -191,6 +191,29 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]): return frame_per_phoneme +def _to_frame(sec: float) -> ndarray: + FRAMERATE = 93.75 # 24000 / 256 [frame/sec] + return numpy.round(sec * FRAMERATE).astype(numpy.int32) + + +def calc_frame_per_mora(mora: Mora) -> ndarray: + """ + モーラあたりのフレーム長を算出 + Parameters + ---------- + mora : Mora + モーラ + Returns + ------- + frame_per_mora : NDArray[] + モーラあたりのフレーム長。端数丸め。 + """ + # 音素ごとにフレーム長を算出し、和をモーラのフレーム長とする + vowel_frames = _to_frame(mora.vowel_length) + consonant_frames = _to_frame(mora.consonant_length) if mora.consonant else 0 + return vowel_frames + consonant_frames + + def apply_pitch_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: """ 音高スケール(`pitchScale`)の適用 @@ -233,12 +256,7 @@ def apply_intonation_scale(moras: list[Mora], query: AudioQuery) -> list[Mora]: return moras -def calc_frame_pitch( - query: AudioQuery, - moras: List[Mora], - phonemes: List[OjtPhoneme], - frame_per_phoneme: numpy.ndarray, -): +def calc_frame_pitch(query: AudioQuery, moras: list[Mora]) -> ndarray: """ フレームごとのピッチの生成 Parameters @@ -247,10 +265,6 @@ def calc_frame_pitch( 音声合成クエリ moras : List[Mora] モーラ列 - phonemes : List[OjtPhoneme] - 音素列 - frame_per_phoneme: NDArray - 音素あたりのフレーム長。端数丸め。 Returns ------- frame_f0 : NDArray[] @@ -265,10 +279,7 @@ def calc_frame_pitch( # Rescale: 時間スケールの変更(モーラ -> フレーム) # 母音インデックスに基づき "音素あたりのフレーム長" を "モーラあたりのフレーム長" に集約 - vowel_indexes = numpy.array(split_mora(phonemes)[2]) - frame_per_mora = [ - a.sum() for a in numpy.split(frame_per_phoneme, vowel_indexes[:-1] + 1) - ] + frame_per_mora = numpy.array(list(map(calc_frame_per_mora, moras))) frame_f0 = numpy.repeat(f0, frame_per_mora) return frame_f0 @@ -619,9 +630,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int): flatten_moras = apply_prepost_silence(flatten_moras, query) frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras) - f0 = calc_frame_pitch( - query, flatten_moras, phoneme_data_list, frame_per_phoneme - ) + f0 = calc_frame_pitch(query, flatten_moras) phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme) # 今まで生成された情報をdecode_forwardにかけ、推論器によって音声波形を生成する