-
Notifications
You must be signed in to change notification settings - Fork 117
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
styleIdとsession.runに渡す数値が異なっているVVMでも音声合成できるようにする #551
The head ref may contain hidden characters: "styleId\u3068session.run\u306B\u6E21\u3059\u6570\u5024\u304C\u7570\u306A\u3063\u3066\u3044\u308BVVM\u3067\u3082\u97F3\u58F0\u5408\u6210\u3067\u304D\u308B\u3088\u3046\u306B\u3059\u308B"
Changes from 7 commits
2fc2ad9
b7072c5
88ac130
c13debb
6f67812
73756c0
5d8e33f
85dcd1b
000e9f1
c765ba2
e7896bb
d731bbc
1704b73
41e8b5b
e8383e0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,8 +21,8 @@ pub enum VoicevoxResultCode { | |
VOICEVOX_RESULT_LOAD_METAS_ERROR = 5, | ||
/// 無効なstyle_idが指定された | ||
VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR = 6, | ||
/// 無効なmodel_indexが指定された | ||
VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR = 7, | ||
/// 無効なvoice_model_idが指定された | ||
VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR = 7, | ||
/// 推論に失敗した | ||
VOICEVOX_RESULT_INFERENCE_ERROR = 8, | ||
/// コンテキストラベル出力に失敗した | ||
|
@@ -74,7 +74,7 @@ pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'stati | |
|
||
VOICEVOX_RESULT_OK => "エラーが発生しませんでした\0", | ||
VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR => "無効なspeaker_idです\0", | ||
VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR => "無効なmodel_indexです\0", | ||
VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR => "無効なmodel_indexです\0", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Model IndexではなくModel Idになったけど、このエラーメッセージが1回も使われてなかったのでスルーされてました |
||
VOICEVOX_RESULT_INFERENCE_ERROR => "推論に失敗しました\0", | ||
VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR => { | ||
"入力テキストからのフルコンテキストラベル抽出に失敗しました\0" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,7 +23,7 @@ pub struct Status { | |
merged_metas: VoiceModelMeta, | ||
light_session_options: SessionOptions, // 軽いモデルはこちらを使う | ||
heavy_session_options: SessionOptions, // 重いモデルはこちらを使う | ||
id_relations: BTreeMap<StyleId, VoiceModelId>, | ||
pub id_relations: BTreeMap<StyleId, (VoiceModelId, ModelInnerId)>, // FIXME: pubはやめたい | ||
} | ||
|
||
struct StatusModels { | ||
|
@@ -113,7 +113,13 @@ impl Status { | |
|
||
for speaker in model.metas().iter() { | ||
for style in speaker.styles().iter() { | ||
self.id_relations.insert(*style.id(), model.id().clone()); | ||
self.id_relations.insert( | ||
*style.id(), | ||
( | ||
model.id().clone(), | ||
model.style_id_to_model_inner_id(*style.id()), | ||
), | ||
); | ||
} | ||
} | ||
self.set_metas(); | ||
|
@@ -141,7 +147,7 @@ impl Status { | |
let remove_style_ids = self | ||
.id_relations | ||
.iter() | ||
.filter(|&(_, loaded_model_id)| loaded_model_id == voice_model_id) | ||
.filter(|&(_, (loaded_model_id, _))| loaded_model_id == voice_model_id) | ||
.map(|(&style_id, _)| style_id) | ||
.collect::<Vec<_>>(); | ||
|
||
|
@@ -228,61 +234,55 @@ impl Status { | |
|
||
pub fn predict_duration_session_run( | ||
&self, | ||
style_id: StyleId, | ||
model_id: &VoiceModelId, | ||
inputs: Vec<&mut dyn AnyArray>, | ||
) -> Result<Vec<f32>> { | ||
Comment on lines
232
to
236
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ここはスタイルIDではなくモデルIDを渡す設計の方が正しそうだったのでそうしました |
||
if let Some(model_id) = self.id_relations.get(&style_id) { | ||
if let Some(model) = self.models.predict_duration.get(model_id) { | ||
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) { | ||
Ok(output_tensors[0].as_slice().unwrap().to_owned()) | ||
} else { | ||
Err(Error::InferenceFailed) | ||
} | ||
if let Some(model) = self.models.predict_duration.get(model_id) { | ||
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) { | ||
Ok(output_tensors[0].as_slice().unwrap().to_owned()) | ||
} else { | ||
Err(Error::InvalidStyleId { style_id }) | ||
Err(Error::InferenceFailed) | ||
} | ||
} else { | ||
Err(Error::InvalidStyleId { style_id }) | ||
Err(Error::InvalidModelId { | ||
model_id: model_id.clone(), | ||
}) | ||
} | ||
} | ||
|
||
pub fn predict_intonation_session_run( | ||
&self, | ||
style_id: StyleId, | ||
model_id: &VoiceModelId, | ||
inputs: Vec<&mut dyn AnyArray>, | ||
) -> Result<Vec<f32>> { | ||
if let Some(model_id) = self.id_relations.get(&style_id) { | ||
if let Some(model) = self.models.predict_intonation.get(model_id) { | ||
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) { | ||
Ok(output_tensors[0].as_slice().unwrap().to_owned()) | ||
} else { | ||
Err(Error::InferenceFailed) | ||
} | ||
if let Some(model) = self.models.predict_intonation.get(model_id) { | ||
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) { | ||
Ok(output_tensors[0].as_slice().unwrap().to_owned()) | ||
} else { | ||
Err(Error::InvalidStyleId { style_id }) | ||
Err(Error::InferenceFailed) | ||
} | ||
} else { | ||
Err(Error::InvalidStyleId { style_id }) | ||
Err(Error::InvalidModelId { | ||
model_id: model_id.clone(), | ||
}) | ||
} | ||
} | ||
|
||
pub fn decode_session_run( | ||
&self, | ||
style_id: StyleId, | ||
model_id: &VoiceModelId, | ||
inputs: Vec<&mut dyn AnyArray>, | ||
) -> Result<Vec<f32>> { | ||
if let Some(model_id) = self.id_relations.get(&style_id) { | ||
if let Some(model) = self.models.decode.get(model_id) { | ||
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) { | ||
Ok(output_tensors[0].as_slice().unwrap().to_owned()) | ||
} else { | ||
Err(Error::InferenceFailed) | ||
} | ||
if let Some(model) = self.models.decode.get(model_id) { | ||
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) { | ||
Ok(output_tensors[0].as_slice().unwrap().to_owned()) | ||
} else { | ||
Err(Error::InvalidStyleId { style_id }) | ||
Err(Error::InferenceFailed) | ||
} | ||
} else { | ||
Err(Error::InvalidStyleId { style_id }) | ||
Err(Error::InvalidModelId { | ||
model_id: model_id.clone(), | ||
}) | ||
} | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
今言うのもなんですが、"model_inner_id"という名前に欠点があるとすれば、声(voice)を指すことが少しだけわかりづらくなっているというのがあるかなと思いました。
"speaker_id"という表現はもう
compatible_engine
以外で使っていないので、"true_speaker_id"とかにするというのもアリなんじゃないかと思いました。There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
確かに、言われてみるとおっしゃる通り、なかなか分かりにくくなっているなと思いました。
大事なのは
model_inner
の部分だと思っていて、例えば同じ声を作れるものが、別のモデルでは別のIDになっていることもなくはない感じです。speaker
は声なのか話者なのか一意に定まらないので、やるとしたらmodel_inner_voice_id
あたりなのかな~と思いました。とりあえずこの値はここでしか使われていないから、一旦このままでも通じるのかなと思いました!
ただ分かりにくいのはおっしゃる通りだと思うので、いつか変更する場合はたぶん賛成できます。
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ああ言われてみればモデルごとに違ってもいいんですね。
他にあるとすれば...
local_voice_id
とか...?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
確かに
model
含めなくてもいいかもですね!inner_voice_id
とかもありかもです。