Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

整理: 前後空白挿入の統合 #814

Merged
merged 8 commits into from
Dec 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 30 additions & 7 deletions test/test_synthesis_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
calc_frame_phoneme,
calc_frame_pitch,
mora_phoneme_list,
pad_with_silence,
pre_process,
split_mora,
to_flatten_moras,
Expand Down Expand Up @@ -168,24 +169,43 @@ def _gen_mora(
)


def test_pad_with_silence():
    """Check that `pad_with_silence` brackets a mora list with silence moras.

    The query's `prePhonemeLength` / `postPhonemeLength` are expected to show
    up as the vowel lengths of the leading / trailing "sil" moras.
    """
    # Inputs
    pre_length = 2 * 0.01067
    post_length = 6 * 0.01067
    query = _gen_query(prePhonemeLength=pre_length, postPhonemeLength=post_length)
    input_moras = [_gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0)]

    # Expects: silence mora + the untouched input mora + silence mora
    leading_silence = _gen_mora(" ", None, None, "sil", pre_length, 0.0)
    body_mora = _gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 100.0)
    trailing_silence = _gen_mora(" ", None, None, "sil", post_length, 0.0)
    expected_moras = [leading_silence, body_mora, trailing_silence]

    # Outputs
    padded_moras = pad_with_silence(input_moras, query)

    assert padded_moras == expected_moras


def test_calc_frame_per_phoneme():
"""Test `calc_frame_per_phoneme`."""
# Inputs
query = _gen_query(
speedScale=2.0,
prePhonemeLength=2 * 0.01067, # 0.01067 [sec/frame]
postPhonemeLength=6 * 0.01067,
)
query = _gen_query(speedScale=2.0)
moras = [
_gen_mora(" ", None, None, " ", 2 * 0.01067, 0.0), # 0.01067 [sec/frame]
_gen_mora("コ", "k", 2 * 0.01067, "o", 4 * 0.01067, 0.0),
_gen_mora("ン", None, None, "N", 4 * 0.01067, 0.0),
_gen_mora("、", None, None, "pau", 2 * 0.01067, 0.0),
_gen_mora("ヒ", "h", 2 * 0.01067, "i", 4 * 0.01067, 0.0),
_gen_mora("ホ", "h", 4 * 0.01067, "O", 2 * 0.01067, 0.0),
_gen_mora(" ", None, None, " ", 6 * 0.01067, 0.0),
]

# Expects
# Pre k o N pau h i h O Pst
# Pre k o N pau h i h O Pst
true_frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3]
true_frame_per_phoneme = numpy.array(true_frame_per_phoneme, dtype=numpy.int32)

Expand All @@ -200,15 +220,17 @@ def test_calc_frame_pitch():
# Inputs
query = _gen_query(pitchScale=2.0, intonationScale=0.5)
moras = [
_gen_mora(" ", None, None, " ", 0.0, 0.0),
_gen_mora("コ", "k", 0.0, "o", 0.0, 50.0),
_gen_mora("ン", None, None, "N", 0.0, 50.0),
_gen_mora("、", None, None, "pau", 0.0, 0.0),
_gen_mora("ヒ", "h", 0.0, "i", 0.0, 125.0),
_gen_mora("ホ", "h", 0.0, "O", 0.0, 0.0),
_gen_mora(" ", None, None, " ", 0.0, 0.0),
]
phoneme_str = "pau k o N pau h i h O pau"
phonemes = [OjtPhoneme(p, 0, 0) for p in phoneme_str.split()]
# Pre k o N pau h i h O Pst
# Pre k o N pau h i h O Pst
frame_per_phoneme = [1, 1, 2, 2, 1, 1, 2, 2, 1, 3]
frame_per_phoneme = numpy.array(frame_per_phoneme, dtype=numpy.int32)

Expand Down Expand Up @@ -299,6 +321,7 @@ def test_feat_to_framescale():
assert true_frame_per_phoneme.shape[0] == len(phoneme_data_list), "Prerequisites"

# Outputs
flatten_moras = pad_with_silence(flatten_moras, query)
frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
f0 = calc_frame_pitch(query, flatten_moras, phoneme_data_list, frame_per_phoneme)
frame_phoneme = calc_frame_phoneme(phoneme_data_list, frame_per_phoneme)
Expand Down
47 changes: 35 additions & 12 deletions voicevox_engine/synthesis_engine/synthesis_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ def pre_process(
Returns
-------
flatten_moras : List[Mora]
AccentPhraseモデルのリスト内に含まれるすべてのMoraをリスト化したものを返す
モーラ列(前後の無音含まない)
phoneme_data_list : List[OjtPhoneme]
flatten_morasから取り出したすべてのPhonemeをOjtPhonemeに変換したものを返す
音素列(前後の無音含む)
"""
flatten_moras = to_flatten_moras(accent_phrases)

Expand All @@ -110,6 +110,30 @@ def pre_process(
return flatten_moras, phoneme_data_list


def generate_silence_mora(length: float) -> Mora:
    """Create a silence mora.

    Parameters
    ----------
    length : float
        Value stored as the mora's `vowel_length`.

    Returns
    -------
    Mora
        A mora with text " ", vowel "sil", the given vowel length and pitch 0.0.
    """
    silence = Mora(
        text=" ",
        vowel="sil",
        vowel_length=length,
        pitch=0.0,
    )
    return silence


def pad_with_silence(moras: list[Mora], query: AudioQuery) -> list[Mora]:
    """Add query-based silence moras to the head and tail of a mora sequence.

    Parameters
    ----------
    moras : List[Mora]
        Mora time series.
    query : AudioQuery
        Speech synthesis query; its `prePhonemeLength` and `postPhonemeLength`
        give the lengths of the leading and trailing silence moras.

    Returns
    -------
    moras : List[Mora]
        A new mora time series with one silence mora prepended and one
        appended. The input list itself is not modified.
    """
    leading = generate_silence_mora(query.prePhonemeLength)
    trailing = generate_silence_mora(query.postPhonemeLength)
    # List concatenation yields a fresh list, leaving the caller's list intact.
    return [leading] + moras + [trailing]


def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
"""
音素あたりのフレーム長を算出
Expand All @@ -122,27 +146,25 @@ def calc_frame_per_phoneme(query: AudioQuery, moras: List[Mora]):
Returns
-------
frame_per_phoneme : NDArray[]
音素(前後の無音含む)あたりのフレーム長。端数丸め。
音素あたりのフレーム長。端数丸め。
"""
# 音素(前後の無音含む)あたりの継続長
# 音素あたりの継続長
sec_per_phoneme = numpy.array(
[query.prePhonemeLength]
+ [
[
length
for mora in moras
for length in (
[mora.consonant_length] if mora.consonant is not None else []
)
+ [mora.vowel_length]
]
+ [query.postPhonemeLength],
],
dtype=numpy.float32,
)

# 話速による継続長の補正
sec_per_phoneme /= query.speedScale

# 音素(前後の無音含む)あたりのフレーム長。端数丸め。
# 音素あたりのフレーム長。端数丸め。
framerate = 24000 / 256 # framerate 93.75 [frame/sec]
frame_per_phoneme = numpy.round(sec_per_phoneme * framerate).astype(numpy.int32)

Expand All @@ -166,15 +188,15 @@ def calc_frame_pitch(
phonemes : List[OjtPhoneme]
音素列
frame_per_phoneme: NDArray
音素(前後の無音含む)あたりのフレーム長。端数丸め。
音素あたりのフレーム長。端数丸め。
Returns
-------
frame_f0 : NDArray[]
フレームごとの基本周波数系列
"""
# TODO: Better function name (c.f. VOICEVOX/voicevox_engine#790)
# モーラ(前後の無音含む)ごとの基本周波数
f0 = numpy.array([0] + [mora.pitch for mora in moras] + [0], dtype=numpy.float32)
# モーラごとの基本周波数
f0 = numpy.array([mora.pitch for mora in moras], dtype=numpy.float32)

# 音高スケールによる補正
f0 *= 2**query.pitchScale
Expand Down Expand Up @@ -482,6 +504,7 @@ def _synthesis_impl(self, query: AudioQuery, style_id: int):
# AccentPhraseをすべてMoraおよびOjtPhonemeの形に分解し、処理可能な形にする
flatten_moras, phoneme_data_list = pre_process(query.accent_phrases)

flatten_moras = pad_with_silence(flatten_moras, query)
frame_per_phoneme = calc_frame_per_phoneme(query, flatten_moras)
f0 = calc_frame_pitch(
query, flatten_moras, phoneme_data_list, frame_per_phoneme
Expand Down