Enable speech synthesis even for VVMs where the styleId differs from the value passed to session.run #551

Merged
15 changes: 9 additions & 6 deletions crates/voicevox_core/src/error.rs
@@ -44,11 +44,14 @@ pub enum Error {
source: anyhow::Error,
},

#[error("{},{filename}", base_error_message(VOICEVOX_VVM_MODEL_READ_ERROR))]
#[error(
"{}({path}):{source}",
base_error_message(VOICEVOX_VVM_MODEL_READ_ERROR)
)]
VvmRead {
filename: String,
path: PathBuf,
#[source]
source: Option<anyhow::Error>,
source: anyhow::Error,
},

#[error("{},{0}", base_error_message(VOICEVOX_RESULT_LOAD_METAS_ERROR))]
@@ -67,10 +70,10 @@
InvalidStyleId { style_id: StyleId },

#[error(
"{}: {model_index}",
base_error_message(VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR)
"{}: {model_id:?}",
base_error_message(VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR)
)]
InvalidModelIndex { model_index: usize },
InvalidModelId { model_id: VoiceModelId },

#[error("{}", base_error_message(VOICEVOX_RESULT_INFERENCE_ERROR))]
InferenceFailed,
30 changes: 24 additions & 6 deletions crates/voicevox_core/src/inference_core.rs
@@ -71,15 +71,21 @@ impl InferenceCore {
return Err(Error::InvalidStyleId { style_id });
}

let (model_id, model_inner_id) = self
.status
.id_relations
.get(&style_id)
.ok_or(Error::InvalidStyleId { style_id })?;

let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));

let input_tensors: Vec<&mut dyn AnyArray> =
vec![&mut phoneme_vector_array, &mut speaker_id_array];

let mut output = self
.status
.predict_duration_session_run(style_id, input_tensors)?;
.predict_duration_session_run(model_id, input_tensors)?;

for output_item in output.iter_mut() {
if *output_item < PHONEME_LENGTH_MINIMAL {
@@ -106,6 +112,12 @@ impl InferenceCore {
return Err(Error::InvalidStyleId { style_id });
}

let (model_id, model_inner_id) = self
.status
.id_relations
.get(&style_id)
.ok_or(Error::InvalidStyleId { style_id })?;

let mut length_array = NdArray::new(ndarray::arr0(length as i64));
let mut vowel_phoneme_vector_array = NdArray::new(ndarray::arr1(vowel_phoneme_vector));
let mut consonant_phoneme_vector_array =
@@ -116,7 +128,7 @@
NdArray::new(ndarray::arr1(start_accent_phrase_vector));
let mut end_accent_phrase_vector_array =
NdArray::new(ndarray::arr1(end_accent_phrase_vector));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));

let input_tensors: Vec<&mut dyn AnyArray> = vec![
&mut length_array,
@@ -130,7 +142,7 @@
];

self.status
.predict_intonation_session_run(style_id, input_tensors)
.predict_intonation_session_run(model_id, input_tensors)
}

pub async fn decode(
@@ -145,6 +157,12 @@
return Err(Error::InvalidStyleId { style_id });
}

let (model_id, model_inner_id) = self
.status
.id_relations
.get(&style_id)
.ok_or(Error::InvalidStyleId { style_id })?;

// 音が途切れてしまうのを避けるworkaround処理が入っている
// TODO: 改善したらここのpadding処理を取り除く
const PADDING_SIZE: f64 = 0.4;
@@ -171,13 +189,13 @@
.into_shape([length_with_padding, phoneme_size])
.unwrap(),
);
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));

let input_tensors: Vec<&mut dyn AnyArray> =
vec![&mut f0_array, &mut phoneme_array, &mut speaker_id_array];

self.status
.decode_session_run(style_id, input_tensors)
.decode_session_run(model_id, input_tensors)
.map(|output| Self::trim_padding_from_output(output, padding_size))
}

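To make the pattern shared by the three methods above easier to follow, here is a minimal, self-contained sketch of the new resolution step, not the crate's actual API: the newtypes (`StyleId`, `VoiceModelId`, `ModelInnerId`) and the error type are flattened to primitives, and the model name and ID values are illustrative. Each method now resolves the public style ID once, then uses the model ID to pick the session and the model-inner ID as the value fed to the `speaker_id` tensor.

```rust
use std::collections::BTreeMap;

// Simplified stand-ins for the crate's newtypes (illustrative only).
type StyleId = u32;
type VoiceModelId = String;
type ModelInnerId = u32;

/// Resolve a public style ID to (which session to run, which ID to feed it),
/// mirroring the lookup added to predict_duration / predict_intonation / decode.
fn resolve(
    id_relations: &BTreeMap<StyleId, (VoiceModelId, ModelInnerId)>,
    style_id: StyleId,
) -> Result<(&VoiceModelId, ModelInnerId), String> {
    id_relations
        .get(&style_id)
        .map(|(model_id, inner_id)| (model_id, *inner_id))
        .ok_or_else(|| format!("invalid style_id: {style_id}"))
}

fn main() {
    let mut id_relations = BTreeMap::new();
    // Public style 302 lives in "sample.vvm" and is known inside that model as 2.
    id_relations.insert(302, ("sample.vvm".to_owned(), 2));

    let (model_id, model_inner_id) = resolve(&id_relations, 302).unwrap();
    // The model ID selects the ONNX session; the inner ID becomes the speaker_id tensor.
    let speaker_id_tensor = [i64::from(model_inner_id)];
    println!("run session of {model_id} with speaker_id {speaker_id_tensor:?}");
}
```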
26 changes: 24 additions & 2 deletions crates/voicevox_core/src/manifest.rs
@@ -1,8 +1,10 @@
use std::fmt::Display;
use std::{collections::BTreeMap, fmt::Display};

use derive_getters::Getters;
use derive_new::new;
use serde::Deserialize;
use serde::{Deserialize, Serialize};

use super::*;

pub type RawManifestVersion = String;
#[derive(Deserialize, Clone, Debug, PartialEq, new)]
@@ -20,11 +22,31 @@ impl Display for ManifestVersion {
}
}

/// モデル内IDの実体
pub type RawModelInnerId = u32;
/// モデル内ID
#[derive(PartialEq, Eq, Clone, Copy, Ord, PartialOrd, Deserialize, Serialize, new, Debug)]
pub struct ModelInnerId(RawModelInnerId);

impl ModelInnerId {
pub fn raw_id(self) -> RawModelInnerId {
self.0
}
}

impl Display for ModelInnerId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.raw_id())
}
}

#[derive(Deserialize, Getters, Clone)]
pub struct Manifest {
manifest_version: ManifestVersion,
metas_filename: String,
decode_filename: String,
predict_duration_filename: String,
predict_intonation_filename: String,
#[serde(default)]
style_id_to_model_inner_id: BTreeMap<StyleId, ModelInnerId>,
}
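As a rough illustration of what the new field buys, here is a sketch of the deserialization, assuming `serde`/`serde_json` with the derive feature and with `StyleId`/`ModelInnerId` flattened to plain `u32`; the struct is trimmed and the field values are made up for the example. Because of `#[serde(default)]`, an older manifest.json that lacks `style_id_to_model_inner_id` still parses and simply yields an empty map.

```rust
use std::collections::BTreeMap;

use serde::Deserialize;

// Simplified: the real Manifest uses StyleId / ModelInnerId newtypes and more fields.
#[derive(Deserialize, Debug)]
struct Manifest {
    metas_filename: String,
    decode_filename: String,
    predict_duration_filename: String,
    predict_intonation_filename: String,
    // Absent in older manifests, so fall back to an empty map.
    #[serde(default)]
    style_id_to_model_inner_id: BTreeMap<u32, u32>,
}

fn main() -> serde_json::Result<()> {
    // A manifest carrying the new mapping (same shape as the test fixture further down).
    let with_map = r#"{
        "metas_filename": "metas.json",
        "decode_filename": "decode.onnx",
        "predict_duration_filename": "predict_duration.onnx",
        "predict_intonation_filename": "predict_intonation.onnx",
        "style_id_to_model_inner_id": { "302": 2, "303": 3 }
    }"#;
    // An older manifest without the field still deserializes.
    let without_map = r#"{
        "metas_filename": "metas.json",
        "decode_filename": "decode.onnx",
        "predict_duration_filename": "predict_duration.onnx",
        "predict_intonation_filename": "predict_intonation.onnx"
    }"#;

    let a: Manifest = serde_json::from_str(with_map)?;
    let b: Manifest = serde_json::from_str(without_map)?;
    assert_eq!(a.style_id_to_model_inner_id.get(&302), Some(&2));
    assert!(b.style_id_to_model_inner_id.is_empty());
    Ok(())
}
```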
@qryxip (Member) commented on Aug 2, 2023:

I know it's a bit late to bring this up, but if the name "model_inner_id" has a drawback, I think it's that it has become slightly harder to tell that it refers to a voice.

"speaker_id" is no longer used anywhere other than compatible_engine, so I thought something like "true_speaker_id" could also be an option.

Member Author:

You're right; now that you point it out, I agree it has become fairly hard to understand.

I think the important part is the "model_inner" bit: it's not impossible, for example, that something producing the same voice ends up with a different ID in a different model.
"speaker" doesn't make it clear whether it means the voice or the person speaking, so if we did rename it, something like model_inner_voice_id would probably be the candidate.

For now this value is only used here, so I think it's understandable as-is for the time being! But I agree it's confusing, so if it's ever renamed I'd probably be in favor.

Member:

Ah, now that you mention it, it's fine for this to differ from model to model.
If there's another candidate... maybe local_voice_id...?

Member Author:

True, we might not even need to include "model"! inner_voice_id could also work.

6 changes: 3 additions & 3 deletions crates/voicevox_core/src/result_code.rs
@@ -21,8 +21,8 @@ pub enum VoicevoxResultCode {
VOICEVOX_RESULT_LOAD_METAS_ERROR = 5,
/// 無効なstyle_idが指定された
VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR = 6,
/// 無効なmodel_indexが指定された
VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR = 7,
/// 無効なmodel_idが指定された
VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR = 7,
/// 推論に失敗した
VOICEVOX_RESULT_INFERENCE_ERROR = 8,
/// コンテキストラベル出力に失敗した
@@ -74,7 +74,7 @@ pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'stati

VOICEVOX_RESULT_OK => "エラーが発生しませんでした\0",
VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR => "無効なspeaker_idです\0",
VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR => "無効なmodel_indexです\0",
VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR => "無効なmodel_idです\0",
VOICEVOX_RESULT_INFERENCE_ERROR => "推論に失敗しました\0",
VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR => {
"入力テキストからのフルコンテキストラベル抽出に失敗しました\0"
63 changes: 30 additions & 33 deletions crates/voicevox_core/src/status.rs
@@ -23,7 +23,7 @@ pub struct Status {
merged_metas: VoiceModelMeta,
light_session_options: SessionOptions, // 軽いモデルはこちらを使う
heavy_session_options: SessionOptions, // 重いモデルはこちらを使う
id_relations: BTreeMap<StyleId, VoiceModelId>,
pub id_relations: BTreeMap<StyleId, (VoiceModelId, ModelInnerId)>, // FIXME: pubはやめたい
}

struct StatusModels {
@@ -113,7 +113,10 @@ impl Status {

for speaker in model.metas().iter() {
for style in speaker.styles().iter() {
self.id_relations.insert(*style.id(), model.id().clone());
self.id_relations.insert(
*style.id(),
(model.id().clone(), model.model_inner_id_for(*style.id())),
);
}
}
self.set_metas();
@@ -141,7 +144,7 @@ impl Status {
let remove_style_ids = self
.id_relations
.iter()
.filter(|&(_, loaded_model_id)| loaded_model_id == voice_model_id)
.filter(|&(_, (loaded_model_id, _))| loaded_model_id == voice_model_id)
.map(|(&style_id, _)| style_id)
.collect::<Vec<_>>();

@@ -228,61 +231,55 @@

pub fn predict_duration_session_run(
&self,
style_id: StyleId,
model_id: &VoiceModelId,
inputs: Vec<&mut dyn AnyArray>,
) -> Result<Vec<f32>> {
Member Author commented on lines 232 to 236:

Passing a model ID here rather than a style ID seemed like the more correct design, so I changed it accordingly.

if let Some(model_id) = self.id_relations.get(&style_id) {
if let Some(model) = self.models.predict_duration.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InferenceFailed)
}
if let Some(model) = self.models.predict_duration.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InferenceFailed)
}
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InvalidModelId {
model_id: model_id.clone(),
})
}
}

pub fn predict_intonation_session_run(
&self,
style_id: StyleId,
model_id: &VoiceModelId,
inputs: Vec<&mut dyn AnyArray>,
) -> Result<Vec<f32>> {
if let Some(model_id) = self.id_relations.get(&style_id) {
if let Some(model) = self.models.predict_intonation.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InferenceFailed)
}
if let Some(model) = self.models.predict_intonation.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InferenceFailed)
}
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InvalidModelId {
model_id: model_id.clone(),
})
}
}

pub fn decode_session_run(
&self,
style_id: StyleId,
model_id: &VoiceModelId,
inputs: Vec<&mut dyn AnyArray>,
) -> Result<Vec<f32>> {
if let Some(model_id) = self.id_relations.get(&style_id) {
if let Some(model) = self.models.decode.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InferenceFailed)
}
if let Some(model) = self.models.decode.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InferenceFailed)
}
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InvalidModelId {
model_id: model_id.clone(),
})
}
}
}
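To summarize the control-flow change in the three `*_session_run` helpers above, here is a minimal sketch, not the real implementation (the actual code holds onnxruntime sessions behind `Mutex`es and returns the crate's `Result`): the helpers are now keyed by model ID, a missing entry is reported as `InvalidModelId` instead of `InvalidStyleId` (the style-ID check already happened in `InferenceCore`), and a failing run still maps to `InferenceFailed`.

```rust
use std::collections::BTreeMap;

type VoiceModelId = String;

#[derive(Debug)]
enum Error {
    InvalidModelId { model_id: VoiceModelId },
    InferenceFailed,
}

// Stand-in for a loaded ONNX session: a callable that may fail.
type Session = fn() -> Option<Vec<f32>>;

/// Shape of predict_duration/predict_intonation/decode_session_run after the change.
fn session_run(
    sessions: &BTreeMap<VoiceModelId, Session>,
    model_id: &VoiceModelId,
) -> Result<Vec<f32>, Error> {
    let run = sessions.get(model_id).ok_or_else(|| Error::InvalidModelId {
        model_id: model_id.clone(),
    })?;
    run().ok_or(Error::InferenceFailed)
}

fn main() {
    let mut sessions: BTreeMap<VoiceModelId, Session> = BTreeMap::new();
    sessions.insert("sample.vvm".to_owned(), || Some(vec![0.0_f32; 4]));

    assert!(session_run(&sessions, &"sample.vvm".to_owned()).is_ok());
    assert!(matches!(
        session_run(&sessions, &"unloaded.vvm".to_owned()),
        Err(Error::InvalidModelId { .. })
    ));
    println!("ok");
}
```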
@@ -3,5 +3,9 @@
"metas_filename": "metas.json",
"decode_filename": "decode.onnx",
"predict_duration_filename": "predict_duration.onnx",
"predict_intonation_filename": "predict_intonation.onnx"
"predict_intonation_filename": "predict_intonation.onnx",
"style_id_to_model_inner_id": {
"302": 2,
"303": 3
}
}
@@ -26,11 +26,11 @@
"styles": [
{
"name": "style3-1",
"id": 2
"id": 302
},
{
"name": "style3-2",
"id": 3
"id": 303
}
],
"speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3",
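Putting the two test fixtures together (the VVM's manifest.json and metas.json above), a tiny sketch of the end-to-end intent, with the crate's newtypes simplified to `u32`: the public style IDs 302/303 exposed in metas.json no longer need to match what the ONNX model expects, because manifest.json records the translation to the model-inner IDs 2/3 that are actually passed to session.run.

```rust
use std::collections::BTreeMap;

fn main() {
    // Style IDs exposed to the API by metas.json...
    let public_style_ids = [302_u32, 303];
    // ...and the mapping recorded in manifest.json.
    let style_id_to_model_inner_id = BTreeMap::from([(302_u32, 2_u32), (303, 3)]);

    for style_id in public_style_ids {
        let inner_id = style_id_to_model_inner_id[&style_id];
        // The speaker_id tensor handed to session.run carries the inner ID, not 302/303.
        println!("style {style_id} -> speaker_id tensor [{inner_id}]");
    }
}
```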