Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

styleIdとsession.runに渡す数値が異なっているVVMでも音声合成できるようにする #551

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions crates/voicevox_core/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,10 @@ pub enum Error {
InvalidStyleId { style_id: StyleId },

#[error(
"{}: {model_index}",
base_error_message(VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR)
"{}: {model_id:?}",
base_error_message(VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR)
)]
InvalidModelIndex { model_index: usize },
InvalidModelId { model_id: VoiceModelId },

#[error("{}", base_error_message(VOICEVOX_RESULT_INFERENCE_ERROR))]
InferenceFailed,
Expand Down
30 changes: 24 additions & 6 deletions crates/voicevox_core/src/inference_core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,21 @@ impl InferenceCore {
return Err(Error::InvalidStyleId { style_id });
}

let (model_id, model_inner_id) = self
.status
.id_relations
.get(&style_id)
.ok_or(Error::InvalidStyleId { style_id })?;

let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));

let input_tensors: Vec<&mut dyn AnyArray> =
vec![&mut phoneme_vector_array, &mut speaker_id_array];

let mut output = self
.status
.predict_duration_session_run(style_id, input_tensors)?;
.predict_duration_session_run(model_id, input_tensors)?;

for output_item in output.iter_mut() {
if *output_item < PHONEME_LENGTH_MINIMAL {
Expand All @@ -106,6 +112,12 @@ impl InferenceCore {
return Err(Error::InvalidStyleId { style_id });
}

let (model_id, model_inner_id) = self
.status
.id_relations
.get(&style_id)
.ok_or(Error::InvalidStyleId { style_id })?;

let mut length_array = NdArray::new(ndarray::arr0(length as i64));
let mut vowel_phoneme_vector_array = NdArray::new(ndarray::arr1(vowel_phoneme_vector));
let mut consonant_phoneme_vector_array =
Expand All @@ -116,7 +128,7 @@ impl InferenceCore {
NdArray::new(ndarray::arr1(start_accent_phrase_vector));
let mut end_accent_phrase_vector_array =
NdArray::new(ndarray::arr1(end_accent_phrase_vector));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));

let input_tensors: Vec<&mut dyn AnyArray> = vec![
&mut length_array,
Expand All @@ -130,7 +142,7 @@ impl InferenceCore {
];

self.status
.predict_intonation_session_run(style_id, input_tensors)
.predict_intonation_session_run(model_id, input_tensors)
}

pub async fn decode(
Expand All @@ -145,6 +157,12 @@ impl InferenceCore {
return Err(Error::InvalidStyleId { style_id });
}

let (model_id, model_inner_id) = self
.status
.id_relations
.get(&style_id)
.ok_or(Error::InvalidStyleId { style_id })?;

// 音が途切れてしまうのを避けるworkaround処理が入っている
// TODO: 改善したらここのpadding処理を取り除く
const PADDING_SIZE: f64 = 0.4;
Expand All @@ -171,13 +189,13 @@ impl InferenceCore {
.into_shape([length_with_padding, phoneme_size])
.unwrap(),
);
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));

let input_tensors: Vec<&mut dyn AnyArray> =
vec![&mut f0_array, &mut phoneme_array, &mut speaker_id_array];

self.status
.decode_session_run(style_id, input_tensors)
.decode_session_run(model_id, input_tensors)
.map(|output| Self::trim_padding_from_output(output, padding_size))
}

Expand Down
5 changes: 4 additions & 1 deletion crates/voicevox_core/src/manifest.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
use std::fmt::Display;
use std::{collections::BTreeMap, fmt::Display};

use derive_getters::Getters;
use derive_new::new;
use serde::Deserialize;

use super::*;

pub type RawManifestVersion = String;
#[derive(Deserialize, Clone, Debug, PartialEq, new)]
pub struct ManifestVersion(RawManifestVersion);
Expand All @@ -27,4 +29,5 @@ pub struct Manifest {
decode_filename: String,
predict_duration_filename: String,
predict_intonation_filename: String,
style_id_to_model_inner_id: Option<BTreeMap<StyleId, ModelInnerId>>,
}
Hiroshiba marked this conversation as resolved.
Show resolved Hide resolved
6 changes: 3 additions & 3 deletions crates/voicevox_core/src/result_code.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ pub enum VoicevoxResultCode {
VOICEVOX_RESULT_LOAD_METAS_ERROR = 5,
/// 無効なstyle_idが指定された
VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR = 6,
/// 無効なmodel_indexが指定された
VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR = 7,
/// 無効なvoice_model_idが指定された
VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR = 7,
/// 推論に失敗した
VOICEVOX_RESULT_INFERENCE_ERROR = 8,
/// コンテキストラベル出力に失敗した
Expand Down Expand Up @@ -74,7 +74,7 @@ pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'stati

VOICEVOX_RESULT_OK => "エラーが発生しませんでした\0",
VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR => "無効なspeaker_idです\0",
VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR => "無効なmodel_indexです\0",
VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR => "無効なmodel_indexです\0",
Copy link
Member Author

@Hiroshiba Hiroshiba Jul 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Model IndexではなくModel Idになったけど、このエラーメッセージが1回も使われてなかったのでスルーされてました

VOICEVOX_RESULT_INFERENCE_ERROR => "推論に失敗しました\0",
VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR => {
"入力テキストからのフルコンテキストラベル抽出に失敗しました\0"
Expand Down
66 changes: 33 additions & 33 deletions crates/voicevox_core/src/status.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pub struct Status {
merged_metas: VoiceModelMeta,
light_session_options: SessionOptions, // 軽いモデルはこちらを使う
heavy_session_options: SessionOptions, // 重いモデルはこちらを使う
id_relations: BTreeMap<StyleId, VoiceModelId>,
pub id_relations: BTreeMap<StyleId, (VoiceModelId, ModelInnerId)>, // FIXME: pubはやめたい
}

struct StatusModels {
Expand Down Expand Up @@ -113,7 +113,13 @@ impl Status {

for speaker in model.metas().iter() {
for style in speaker.styles().iter() {
self.id_relations.insert(*style.id(), model.id().clone());
self.id_relations.insert(
*style.id(),
(
model.id().clone(),
model.style_id_to_model_inner_id(*style.id()),
),
);
}
}
self.set_metas();
Expand Down Expand Up @@ -141,7 +147,7 @@ impl Status {
let remove_style_ids = self
.id_relations
.iter()
.filter(|&(_, loaded_model_id)| loaded_model_id == voice_model_id)
.filter(|&(_, (loaded_model_id, _))| loaded_model_id == voice_model_id)
.map(|(&style_id, _)| style_id)
.collect::<Vec<_>>();

Expand Down Expand Up @@ -228,61 +234,55 @@ impl Status {

pub fn predict_duration_session_run(
&self,
style_id: StyleId,
model_id: &VoiceModelId,
inputs: Vec<&mut dyn AnyArray>,
) -> Result<Vec<f32>> {
Comment on lines 232 to 236
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ここはスタイルIDではなくモデルIDを渡す設計の方が正しそうだったのでそうしました

if let Some(model_id) = self.id_relations.get(&style_id) {
if let Some(model) = self.models.predict_duration.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InferenceFailed)
}
if let Some(model) = self.models.predict_duration.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InferenceFailed)
}
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InvalidModelId {
model_id: model_id.clone(),
})
}
}

pub fn predict_intonation_session_run(
&self,
style_id: StyleId,
model_id: &VoiceModelId,
inputs: Vec<&mut dyn AnyArray>,
) -> Result<Vec<f32>> {
if let Some(model_id) = self.id_relations.get(&style_id) {
if let Some(model) = self.models.predict_intonation.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InferenceFailed)
}
if let Some(model) = self.models.predict_intonation.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InferenceFailed)
}
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InvalidModelId {
model_id: model_id.clone(),
})
}
}

pub fn decode_session_run(
&self,
style_id: StyleId,
model_id: &VoiceModelId,
inputs: Vec<&mut dyn AnyArray>,
) -> Result<Vec<f32>> {
if let Some(model_id) = self.id_relations.get(&style_id) {
if let Some(model) = self.models.decode.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InferenceFailed)
}
if let Some(model) = self.models.decode.get(model_id) {
if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
Ok(output_tensors[0].as_slice().unwrap().to_owned())
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InferenceFailed)
}
} else {
Err(Error::InvalidStyleId { style_id })
Err(Error::InvalidModelId {
model_id: model_id.clone(),
})
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,9 @@
"metas_filename": "metas.json",
"decode_filename": "decode.onnx",
"predict_duration_filename": "predict_duration.onnx",
"predict_intonation_filename": "predict_intonation.onnx"
"predict_intonation_filename": "predict_intonation.onnx",
"style_id_to_model_inner_id": {
"302": 2,
"303": 3
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@
"styles": [
{
"name": "style3-1",
"id": 2
"id": 302
},
{
"name": "style3-2",
"id": 3
"id": 303
}
],
"speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3",
Expand Down
37 changes: 33 additions & 4 deletions crates/voicevox_core/src/voice_model.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use async_zip::{read::fs::ZipFileReader, ZipEntry};
use futures::future::{join3, join_all};
use serde::{de::DeserializeOwned, Deserialize};
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::fmt::Display;

use super::*;
use std::{
Expand All @@ -18,6 +19,24 @@ pub struct VoiceModelId {
raw_voice_model_id: RawVoiceModelId,
}

/// Underlying integer representation wrapped by [`ModelInnerId`].
pub type RawModelInnerId = u32;

/// In-model ID: a newtype wrapper around [`RawModelInnerId`].
#[derive(PartialEq, Eq, Clone, Copy, Ord, PartialOrd, Deserialize, Serialize, new, Debug)]
pub struct ModelInnerId(RawModelInnerId);

impl ModelInnerId {
    /// Returns the wrapped raw value.
    pub fn raw_id(self) -> RawModelInnerId {
        let Self(raw) = self;
        raw
    }
}

impl Display for ModelInnerId {
    /// Writes the ID as its raw numeric value.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(raw) = self;
        write!(f, "{}", raw)
    }
}

Hiroshiba marked this conversation as resolved.
Show resolved Hide resolved
/// 音声モデル
#[derive(Getters, Clone)]
pub struct VoiceModel {
Expand Down Expand Up @@ -99,6 +118,16 @@ impl VoiceModel {
join_all(vvm_paths).await.into_iter().collect()
}
const ROOT_DIR_ENV_NAME: &str = "VV_MODELS_ROOT_DIR";

/// Looks up the in-model ID that corresponds to the given style ID.
///
/// When the manifest carries no mapping table, or the table has no entry for
/// `style_id`, the raw value of the style ID itself is used as the in-model ID.
pub(crate) fn style_id_to_model_inner_id(&self, style_id: StyleId) -> ModelInnerId {
Hiroshiba marked this conversation as resolved.
Show resolved Hide resolved
self.manifest
.style_id_to_model_inner_id()
.as_ref()
.and_then(|id_map| id_map.get(&style_id).cloned())
.unwrap_or_else(|| ModelInnerId::new(style_id.raw_id()))
}
}

struct VvmEntry {
Expand Down Expand Up @@ -149,23 +178,23 @@ impl VvmEntryReader {
async fn read_vvm_entry(&self, filename: &str) -> Result<Vec<u8>> {
let me = self.entry_map.get(filename).ok_or(Error::VvmRead {
filename: filename.into(),
source: None,
source: None, // FIXME: ちゃんとエラーを返す
Hiroshiba marked this conversation as resolved.
Show resolved Hide resolved
})?;
let mut manifest_reader =
self.reader
.entry(me.index)
.await
.map_err(|_| Error::VvmRead {
filename: filename.into(),
source: None,
source: None, // FIXME: ちゃんとエラーを返す
})?;
let mut buf = Vec::with_capacity(me.entry.uncompressed_size() as usize);
manifest_reader
.read_to_end_checked(&mut buf, &me.entry)
.await
.map_err(|_| Error::VvmRead {
filename: filename.into(),
source: None,
source: None, // FIXME: ちゃんとエラーを返す
})?;
Ok(buf)
}
Expand Down
2 changes: 1 addition & 1 deletion crates/voicevox_core_c_api/include/voicevox_core.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion crates/voicevox_core_c_api/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ pub(crate) fn into_result_code_with_error(result: CApiResult<()>) -> VoicevoxRes
Err(RustApi(LoadMetas(_))) => VOICEVOX_RESULT_LOAD_METAS_ERROR,
Err(RustApi(GetSupportedDevices(_))) => VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR,
Err(RustApi(InvalidStyleId { .. })) => VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR,
Err(RustApi(InvalidModelIndex { .. })) => VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR,
Err(RustApi(InvalidModelId { .. })) => VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR,
Err(RustApi(InferenceFailed)) => VOICEVOX_RESULT_INFERENCE_ERROR,
Err(RustApi(ExtractFullContextLabel(_))) => {
VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR
Expand Down
9 changes: 9 additions & 0 deletions docs/vvm.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# VVM ファイル

音声合成するために必要な onnx モデルファイルなどがまとめられた zip 形式のファイル。
root パスに必ず`manifest.json`を持つ。

## マニフェストファイル

ファイルの構成や、onnx モデルなどを読み込む・利用するのに必要な情報を記述した json ファイル。
root パスに`manifest.json`として配置する。
Binary file modified model/sample.vvm
Binary file not shown.
Loading