Skip to content

Commit

Permalink
styleIdとsession.runに渡す数値が異なっているVVMでも音声合成できるようにする (#551)
Browse files Browse the repository at this point in the history
Co-authored-by: Ryo Yamashita <[email protected]>
  • Loading branch information
Hiroshiba and qryxip authored Aug 3, 2023
1 parent f2b66ec commit e0d32a5
Show file tree
Hide file tree
Showing 13 changed files with 157 additions and 86 deletions.
15 changes: 9 additions & 6 deletions crates/voicevox_core/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,14 @@ pub enum Error {
source: anyhow::Error,
},

#[error("{},{filename}", base_error_message(VOICEVOX_VVM_MODEL_READ_ERROR))]
#[error(
"{}({path}):{source}",
base_error_message(VOICEVOX_VVM_MODEL_READ_ERROR)
)]
VvmRead {
filename: String,
path: PathBuf,
#[source]
source: Option<anyhow::Error>,
source: anyhow::Error,
},

#[error("{},{0}", base_error_message(VOICEVOX_RESULT_LOAD_METAS_ERROR))]
Expand All @@ -63,10 +66,10 @@ pub enum Error {
InvalidStyleId { style_id: StyleId },

#[error(
"{}: {model_index}",
base_error_message(VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR)
"{}: {model_id:?}",
base_error_message(VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR)
)]
InvalidModelIndex { model_index: usize },
InvalidModelId { model_id: VoiceModelId },

#[error("{}", base_error_message(VOICEVOX_RESULT_INFERENCE_ERROR))]
InferenceFailed,
Expand Down
30 changes: 24 additions & 6 deletions crates/voicevox_core/src/inference_core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,21 @@ impl InferenceCore {
return Err(Error::InvalidStyleId { style_id });
}

let (model_id, model_inner_id) = self
.status
.id_relations
.get(&style_id)
.ok_or(Error::InvalidStyleId { style_id })?;

let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));

let input_tensors: Vec<&mut dyn AnyArray> =
vec![&mut phoneme_vector_array, &mut speaker_id_array];

let mut output = self
.status
.predict_duration_session_run(style_id, input_tensors)?;
.predict_duration_session_run(model_id, input_tensors)?;

for output_item in output.iter_mut() {
if *output_item < PHONEME_LENGTH_MINIMAL {
Expand All @@ -106,6 +112,12 @@ impl InferenceCore {
return Err(Error::InvalidStyleId { style_id });
}

let (model_id, model_inner_id) = self
.status
.id_relations
.get(&style_id)
.ok_or(Error::InvalidStyleId { style_id })?;

let mut length_array = NdArray::new(ndarray::arr0(length as i64));
let mut vowel_phoneme_vector_array = NdArray::new(ndarray::arr1(vowel_phoneme_vector));
let mut consonant_phoneme_vector_array =
Expand All @@ -116,7 +128,7 @@ impl InferenceCore {
NdArray::new(ndarray::arr1(start_accent_phrase_vector));
let mut end_accent_phrase_vector_array =
NdArray::new(ndarray::arr1(end_accent_phrase_vector));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));

let input_tensors: Vec<&mut dyn AnyArray> = vec![
&mut length_array,
Expand All @@ -130,7 +142,7 @@ impl InferenceCore {
];

self.status
.predict_intonation_session_run(style_id, input_tensors)
.predict_intonation_session_run(model_id, input_tensors)
}

pub async fn decode(
Expand All @@ -145,6 +157,12 @@ impl InferenceCore {
return Err(Error::InvalidStyleId { style_id });
}

let (model_id, model_inner_id) = self
.status
.id_relations
.get(&style_id)
.ok_or(Error::InvalidStyleId { style_id })?;

// 音が途切れてしまうのを避けるworkaround処理が入っている
// TODO: 改善したらここのpadding処理を取り除く
const PADDING_SIZE: f64 = 0.4;
Expand All @@ -171,13 +189,13 @@ impl InferenceCore {
.into_shape([length_with_padding, phoneme_size])
.unwrap(),
);
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));

let input_tensors: Vec<&mut dyn AnyArray> =
vec![&mut f0_array, &mut phoneme_array, &mut speaker_id_array];

self.status
.decode_session_run(style_id, input_tensors)
.decode_session_run(model_id, input_tensors)
.map(|output| Self::trim_padding_from_output(output, padding_size))
}

Expand Down
26 changes: 24 additions & 2 deletions crates/voicevox_core/src/manifest.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
use std::fmt::Display;
use std::{collections::BTreeMap, fmt::Display};

use derive_getters::Getters;
use derive_new::new;
use serde::Deserialize;
use serde::{Deserialize, Serialize};

use super::*;

pub type RawManifestVersion = String;
#[derive(Deserialize, Clone, Debug, PartialEq, new)]
Expand All @@ -20,11 +22,31 @@ impl Display for ManifestVersion {
}
}

/// Underlying primitive representation of a model-inner ID.
pub type RawModelInnerId = u32;
/// Model-inner ID.
///
/// A newtype over [`RawModelInnerId`]. It is the value type of the manifest's
/// `style_id_to_model_inner_id` table, i.e. the ID a `StyleId` is translated to.
// NOTE(review): presumably this is the speaker index the model itself expects
// at inference time, as opposed to the public `StyleId` — confirm against callers.
#[derive(PartialEq, Eq, Clone, Copy, Ord, PartialOrd, Deserialize, Serialize, new, Debug)]
pub struct ModelInnerId(RawModelInnerId);

impl ModelInnerId {
pub fn raw_id(self) -> RawModelInnerId {
self.0
}
}

impl Display for ModelInnerId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.raw_id())
}
}

/// Deserialized contents of a voice model's manifest.
///
/// Accessors for every field are generated by the `Getters` derive.
#[derive(Deserialize, Getters, Clone)]
pub struct Manifest {
    /// Version string declared by the manifest.
    manifest_version: ManifestVersion,
    /// Name of the metas file (the test fixture uses `metas.json`).
    // NOTE(review): presumably resolved relative to the model archive root — confirm.
    metas_filename: String,
    /// Name of the decode model file.
    decode_filename: String,
    /// Name of the duration-prediction model file.
    predict_duration_filename: String,
    /// Name of the intonation-prediction model file.
    predict_intonation_filename: String,
    /// Table translating a public `StyleId` to the `ModelInnerId` used for it.
    ///
    /// Optional in the manifest: because of `#[serde(default)]`, a missing key
    /// deserializes to an empty map.
    // NOTE(review): how a style ID absent from this map is handled is not
    // visible here — presumably it falls back to the style ID's own value; verify.
    #[serde(default)]
    style_id_to_model_inner_id: BTreeMap<StyleId, ModelInnerId>,
}
6 changes: 3 additions & 3 deletions crates/voicevox_core/src/result_code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ pub enum VoicevoxResultCode {
VOICEVOX_RESULT_LOAD_METAS_ERROR = 5,
/// 無効なstyle_idが指定された
VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR = 6,
/// 無効なmodel_indexが指定された
VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR = 7,
/// 無効なmodel_idが指定された
VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR = 7,
/// 推論に失敗した
VOICEVOX_RESULT_INFERENCE_ERROR = 8,
/// コンテキストラベル出力に失敗した
Expand Down Expand Up @@ -74,7 +74,7 @@ pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'stati

VOICEVOX_RESULT_OK => "エラーが発生しませんでした\0",
VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR => "無効なspeaker_idです\0",
VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR => "無効なmodel_indexです\0",
VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR => "無効なmodel_idです\0",
VOICEVOX_RESULT_INFERENCE_ERROR => "推論に失敗しました\0",
VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR => {
"入力テキストからのフルコンテキストラベル抽出に失敗しました\0"
Expand Down
63 changes: 30 additions & 33 deletions crates/voicevox_core/src/status.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pub struct Status {
merged_metas: VoiceModelMeta,
light_session_options: SessionOptions, // 軽いモデルはこちらを使う
heavy_session_options: SessionOptions, // 重いモデルはこちらを使う
id_relations: BTreeMap<StyleId, VoiceModelId>,
pub id_relations: BTreeMap<StyleId, (VoiceModelId, ModelInnerId)>, // FIXME: pubはやめたい
}

struct StatusModels {
Expand Down Expand Up @@ -113,7 +113,10 @@ impl Status {

for speaker in model.metas().iter() {
for style in speaker.styles().iter() {
self.id_relations.insert(*style.id(), model.id().clone());
self.id_relations.insert(
*style.id(),
(model.id().clone(), model.model_inner_id_for(*style.id())),
);
}
}
self.set_metas();
Expand Down Expand Up @@ -141,7 +144,7 @@ impl Status {
let remove_style_ids = self
.id_relations
.iter()
.filter(|&(_, loaded_model_id)| loaded_model_id == voice_model_id)
.filter(|&(_, (loaded_model_id, _))| loaded_model_id == voice_model_id)
.map(|(&style_id, _)| style_id)
.collect::<Vec<_>>();

Expand Down Expand Up @@ -228,61 +231,55 @@ impl Status {

pub fn predict_duration_session_run(
&self,
style_id: StyleId,
model_id: &VoiceModelId,
inputs: Vec<&mut dyn AnyArray>,
) -> Result<Vec<f32>> {
if let Some(model_id) = self.id_relations.get(&style_id) {
if let Some(model) = self.models.predict_duration.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InferenceFailed)
}
if let Some(model) = self.models.predict_duration.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InferenceFailed)
}
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InvalidModelId {
model_id: model_id.clone(),
})
}
}

pub fn predict_intonation_session_run(
&self,
style_id: StyleId,
model_id: &VoiceModelId,
inputs: Vec<&mut dyn AnyArray>,
) -> Result<Vec<f32>> {
if let Some(model_id) = self.id_relations.get(&style_id) {
if let Some(model) = self.models.predict_intonation.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InferenceFailed)
}
if let Some(model) = self.models.predict_intonation.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InferenceFailed)
}
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InvalidModelId {
model_id: model_id.clone(),
})
}
}

pub fn decode_session_run(
&self,
style_id: StyleId,
model_id: &VoiceModelId,
inputs: Vec<&mut dyn AnyArray>,
) -> Result<Vec<f32>> {
if let Some(model_id) = self.id_relations.get(&style_id) {
if let Some(model) = self.models.decode.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InferenceFailed)
}
if let Some(model) = self.models.decode.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InferenceFailed)
}
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InvalidModelId {
model_id: model_id.clone(),
})
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,9 @@
"metas_filename": "metas.json",
"decode_filename": "decode.onnx",
"predict_duration_filename": "predict_duration.onnx",
"predict_intonation_filename": "predict_intonation.onnx"
"predict_intonation_filename": "predict_intonation.onnx",
"style_id_to_model_inner_id": {
"302": 2,
"303": 3
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@
"styles": [
{
"name": "style3-1",
"id": 2
"id": 302
},
{
"name": "style3-2",
"id": 3
"id": 303
}
],
"speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3",
Expand Down
Loading

0 comments on commit e0d32a5

Please sign in to comment.