VOICEVOX · Hiroshiba · Aug 3, 2023 · Jul 27, 2023 · Jul 28, 2023 · Jul 28, 2023
diff --git a/crates/voicevox_core/src/error.rs b/crates/voicevox_core/src/error.rs
@@ -67,10 +67,10 @@ pub enum Error {
     InvalidStyleId { style_id: StyleId },
 
     #[error(
-        "{}: {model_index}",
-        base_error_message(VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR)
+        "{}: {model_id:?}",
+        base_error_message(VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR)
     )]
-    InvalidModelIndex { model_index: usize },
+    InvalidModelId { model_id: VoiceModelId },
 
     #[error("{}", base_error_message(VOICEVOX_RESULT_INFERENCE_ERROR))]
     InferenceFailed,

diff --git a/crates/voicevox_core/src/inference_core.rs b/crates/voicevox_core/src/inference_core.rs
@@ -71,15 +71,21 @@ impl InferenceCore {
             return Err(Error::InvalidStyleId { style_id });
         }
 
+        let (model_id, model_inner_id) = self
+            .status
+            .id_relations
+            .get(&style_id)
+            .ok_or(Error::InvalidStyleId { style_id })?;
+
         let mut phoneme_vector_array = NdArray::new(ndarray::arr1(phoneme_vector));
-        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));
 
         let input_tensors: Vec<&mut dyn AnyArray> =
             vec![&mut phoneme_vector_array, &mut speaker_id_array];
 
         let mut output = self
             .status
-            .predict_duration_session_run(style_id, input_tensors)?;
+            .predict_duration_session_run(model_id, input_tensors)?;
 
         for output_item in output.iter_mut() {
             if *output_item < PHONEME_LENGTH_MINIMAL {
@@ -106,6 +112,12 @@ impl InferenceCore {
             return Err(Error::InvalidStyleId { style_id });
         }
 
+        let (model_id, model_inner_id) = self
+            .status
+            .id_relations
+            .get(&style_id)
+            .ok_or(Error::InvalidStyleId { style_id })?;
+
         let mut length_array = NdArray::new(ndarray::arr0(length as i64));
         let mut vowel_phoneme_vector_array = NdArray::new(ndarray::arr1(vowel_phoneme_vector));
         let mut consonant_phoneme_vector_array =
@@ -116,7 +128,7 @@ impl InferenceCore {
             NdArray::new(ndarray::arr1(start_accent_phrase_vector));
         let mut end_accent_phrase_vector_array =
             NdArray::new(ndarray::arr1(end_accent_phrase_vector));
-        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));
 
         let input_tensors: Vec<&mut dyn AnyArray> = vec![
             &mut length_array,
@@ -130,7 +142,7 @@ impl InferenceCore {
         ];
 
         self.status
-            .predict_intonation_session_run(style_id, input_tensors)
+            .predict_intonation_session_run(model_id, input_tensors)
     }
 
     pub async fn decode(
@@ -145,6 +157,12 @@ impl InferenceCore {
             return Err(Error::InvalidStyleId { style_id });
         }
 
+        let (model_id, model_inner_id) = self
+            .status
+            .id_relations
+            .get(&style_id)
+            .ok_or(Error::InvalidStyleId { style_id })?;
+
         // 音が途切れてしまうのを避けるworkaround処理が入っている
         // TODO: 改善したらここのpadding処理を取り除く
         const PADDING_SIZE: f64 = 0.4;
@@ -171,13 +189,13 @@ impl InferenceCore {
                 .into_shape([length_with_padding, phoneme_size])
                 .unwrap(),
         );
-        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[style_id.raw_id() as i64]));
+        let mut speaker_id_array = NdArray::new(ndarray::arr1(&[model_inner_id.raw_id() as i64]));
 
         let input_tensors: Vec<&mut dyn AnyArray> =
             vec![&mut f0_array, &mut phoneme_array, &mut speaker_id_array];
 
         self.status
-            .decode_session_run(style_id, input_tensors)
+            .decode_session_run(model_id, input_tensors)
             .map(|output| Self::trim_padding_from_output(output, padding_size))
     }
 

diff --git a/crates/voicevox_core/src/manifest.rs b/crates/voicevox_core/src/manifest.rs
@@ -1,9 +1,11 @@
-use std::fmt::Display;
+use std::{collections::BTreeMap, fmt::Display};
 
 use derive_getters::Getters;
 use derive_new::new;
 use serde::Deserialize;
 
+use super::*;
+
 pub type RawManifestVersion = String;
 #[derive(Deserialize, Clone, Debug, PartialEq, new)]
 pub struct ManifestVersion(RawManifestVersion);
@@ -27,4 +29,5 @@ pub struct Manifest {
     decode_filename: String,
     predict_duration_filename: String,
     predict_intonation_filename: String,
+    style_id_to_model_inner_id: Option<BTreeMap<StyleId, ModelInnerId>>,
 }
diff --git a/crates/voicevox_core/src/result_code.rs b/crates/voicevox_core/src/result_code.rs
@@ -21,8 +21,8 @@ pub enum VoicevoxResultCode {
     VOICEVOX_RESULT_LOAD_METAS_ERROR = 5,
     /// 無効なstyle_idが指定された
     VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR = 6,
-    /// 無効なmodel_indexが指定された
-    VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR = 7,
+    /// 無効なvoice_model_idが指定された
+    VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR = 7,
     /// 推論に失敗した
     VOICEVOX_RESULT_INFERENCE_ERROR = 8,
     /// コンテキストラベル出力に失敗した
@@ -74,7 +74,7 @@ pub const fn error_result_to_message(result_code: VoicevoxResultCode) -> &'stati
 
         VOICEVOX_RESULT_OK => "エラーが発生しませんでした\0",
         VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR => "無効なspeaker_idです\0",
-        VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR => "無効なmodel_indexです\0",
+        VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR => "無効なvoice_model_idです\0",
         VOICEVOX_RESULT_INFERENCE_ERROR => "推論に失敗しました\0",
         VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR => {
             "入力テキストからのフルコンテキストラベル抽出に失敗しました\0"

diff --git a/crates/voicevox_core/src/status.rs b/crates/voicevox_core/src/status.rs
@@ -23,7 +23,7 @@ pub struct Status {
     merged_metas: VoiceModelMeta,
     light_session_options: SessionOptions, // 軽いモデルはこちらを使う
     heavy_session_options: SessionOptions, // 重いモデルはこちらを使う
-    id_relations: BTreeMap<StyleId, VoiceModelId>,
+    pub id_relations: BTreeMap<StyleId, (VoiceModelId, ModelInnerId)>, // FIXME: pubはやめたい
 }
 
 struct StatusModels {
@@ -113,7 +113,13 @@ impl Status {
 
         for speaker in model.metas().iter() {
             for style in speaker.styles().iter() {
-                self.id_relations.insert(*style.id(), model.id().clone());
+                self.id_relations.insert(
+                    *style.id(),
+                    (
+                        model.id().clone(),
+                        model.style_id_to_model_inner_id(*style.id()),
+                    ),
+                );
             }
         }
         self.set_metas();
@@ -141,7 +147,7 @@ impl Status {
             let remove_style_ids = self
                 .id_relations
                 .iter()
-                .filter(|&(_, loaded_model_id)| loaded_model_id == voice_model_id)
+                .filter(|&(_, (loaded_model_id, _))| loaded_model_id == voice_model_id)
                 .map(|(&style_id, _)| style_id)
                 .collect::<Vec<_>>();
 
@@ -228,61 +234,55 @@ impl Status {
 
     pub fn predict_duration_session_run(
         &self,
-        style_id: StyleId,
+        model_id: &VoiceModelId, // key used to look up the session in models.predict_duration
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model_id) = self.id_relations.get(&style_id) {
-            if let Some(model) = self.models.predict_duration.get(model_id) {
-                if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
-                    Ok(output_tensors[0].as_slice().unwrap().to_owned())
-                } else {
-                    Err(Error::InferenceFailed)
-                }
+        if let Some(model) = self.models.predict_duration.get(model_id) {
+            if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned()) // copy of the first output only
             } else {
-                Err(Error::InvalidStyleId { style_id })
+                Err(Error::InferenceFailed) // the error value from run() is discarded
             }
         } else {
-            Err(Error::InvalidStyleId { style_id })
+            Err(Error::InvalidModelId { // no entry for model_id in models.predict_duration
+                model_id: model_id.clone(),
+            })
         }
     }
 
     pub fn predict_intonation_session_run(
         &self,
-        style_id: StyleId,
+        model_id: &VoiceModelId, // key used to look up the session in models.predict_intonation
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model_id) = self.id_relations.get(&style_id) {
-            if let Some(model) = self.models.predict_intonation.get(model_id) {
-                if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
-                    Ok(output_tensors[0].as_slice().unwrap().to_owned())
-                } else {
-                    Err(Error::InferenceFailed)
-                }
+        if let Some(model) = self.models.predict_intonation.get(model_id) {
+            if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned()) // copy of the first output only
             } else {
-                Err(Error::InvalidStyleId { style_id })
+                Err(Error::InferenceFailed) // the error value from run() is discarded
             }
         } else {
-            Err(Error::InvalidStyleId { style_id })
+            Err(Error::InvalidModelId { // no entry for model_id in models.predict_intonation
+                model_id: model_id.clone(),
+            })
         }
     }
 
     pub fn decode_session_run(
         &self,
-        style_id: StyleId,
+        model_id: &VoiceModelId, // key used to look up the session in models.decode
         inputs: Vec<&mut dyn AnyArray>,
     ) -> Result<Vec<f32>> {
-        if let Some(model_id) = self.id_relations.get(&style_id) {
-            if let Some(model) = self.models.decode.get(model_id) {
-                if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
-                    Ok(output_tensors[0].as_slice().unwrap().to_owned())
-                } else {
-                    Err(Error::InferenceFailed)
-                }
+        if let Some(model) = self.models.decode.get(model_id) {
+            if let Ok(output_tensors) = model.lock().unwrap().run(inputs) {
+                Ok(output_tensors[0].as_slice().unwrap().to_owned()) // copy of the first output only
             } else {
-                Err(Error::InvalidStyleId { style_id })
+                Err(Error::InferenceFailed) // the error value from run() is discarded
             }
         } else {
-            Err(Error::InvalidStyleId { style_id })
+            Err(Error::InvalidModelId { // no entry for model_id in models.decode
+                model_id: model_id.clone(),
+            })
         }
     }
 }

diff --git a/crates/voicevox_core/src/test_data/model_sources/load_model_works1/manifest.json b/crates/voicevox_core/src/test_data/model_sources/load_model_works1/manifest.json
@@ -3,5 +3,9 @@
   "metas_filename": "metas.json",
   "decode_filename": "decode.onnx",
   "predict_duration_filename": "predict_duration.onnx",
-  "predict_intonation_filename": "predict_intonation.onnx"
+  "predict_intonation_filename": "predict_intonation.onnx",
+  "style_id_to_model_inner_id": {
+    "302": 2,
+    "303": 3
+  }
 }
diff --git a/crates/voicevox_core/src/test_data/model_sources/load_model_works1/metas.json b/crates/voicevox_core/src/test_data/model_sources/load_model_works1/metas.json
@@ -26,11 +26,11 @@
     "styles": [
       {
         "name": "style3-1",
-        "id": 2
+        "id": 302
       },
       {
         "name": "style3-2",
-        "id": 3
+        "id": 303
       }
     ],
     "speaker_uuid": "5d3d9aa9-88e5-4a96-8ef7-f13a3cad1cb3",

diff --git a/crates/voicevox_core/src/voice_model.rs b/crates/voicevox_core/src/voice_model.rs
@@ -1,6 +1,7 @@
 use async_zip::{read::fs::ZipFileReader, ZipEntry};
 use futures::future::{join3, join_all};
-use serde::{de::DeserializeOwned, Deserialize};
+use serde::{de::DeserializeOwned, Deserialize, Serialize};
+use std::fmt::Display;
 
 use super::*;
 use std::{
@@ -18,6 +19,24 @@ pub struct VoiceModelId {
     raw_voice_model_id: RawVoiceModelId,
 }
 
+/// Raw integer type that backs a model-internal ID.
+pub type RawModelInnerId = u32;
+/// Model-internal ID: a newtype wrapper around [`RawModelInnerId`].
+#[derive(PartialEq, Eq, Clone, Copy, Ord, PartialOrd, Deserialize, Serialize, new, Debug)]
+pub struct ModelInnerId(RawModelInnerId);
+
+impl ModelInnerId {
+    pub fn raw_id(self) -> RawModelInnerId {
+        self.0 // hand back the wrapped raw value
+    }
+}
+
+impl Display for ModelInnerId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.raw_id()) // rendered as the bare raw ID
+    }
+}
+
 /// 音声モデル
 #[derive(Getters, Clone)]
 pub struct VoiceModel {
@@ -99,6 +118,16 @@ impl VoiceModel {
         join_all(vvm_paths).await.into_iter().collect()
     }
     const ROOT_DIR_ENV_NAME: &str = "VV_MODELS_ROOT_DIR";
+
+    /// Returns the model-internal ID that corresponds to the given style ID.
+    /// When the manifest has no mapping for it, the style ID's raw value is used as-is.
+    pub(crate) fn style_id_to_model_inner_id(&self, style_id: StyleId) -> ModelInnerId {
+        self.manifest
+            .style_id_to_model_inner_id()
+            .as_ref()
+            .and_then(|id_map| id_map.get(&style_id).cloned())
+            .unwrap_or_else(|| ModelInnerId::new(style_id.raw_id()))
+    }
 }
 
 struct VvmEntry {
@@ -149,23 +178,23 @@ impl VvmEntryReader {
     async fn read_vvm_entry(&self, filename: &str) -> Result<Vec<u8>> {
         let me = self.entry_map.get(filename).ok_or(Error::VvmRead {
             filename: filename.into(),
-            source: None,
+            source: None, // FIXME: return a proper error
         })?;
         let mut manifest_reader =
             self.reader
                 .entry(me.index)
                 .await
                 .map_err(|_| Error::VvmRead {
                     filename: filename.into(),
-                    source: None,
+                    source: None, // FIXME: return a proper error (the underlying one is dropped by `|_|`)
                 })?;
         let mut buf = Vec::with_capacity(me.entry.uncompressed_size() as usize);
         manifest_reader
             .read_to_end_checked(&mut buf, &me.entry)
             .await
             .map_err(|_| Error::VvmRead {
                 filename: filename.into(),
-                source: None,
+                source: None, // FIXME: return a proper error (the underlying one is dropped by `|_|`)
             })?;
         Ok(buf)
     }

diff --git a/crates/voicevox_core_c_api/include/voicevox_core.h b/crates/voicevox_core_c_api/include/voicevox_core.h
diff --git a/crates/voicevox_core_c_api/src/helpers.rs b/crates/voicevox_core_c_api/src/helpers.rs
@@ -30,7 +30,7 @@ pub(crate) fn into_result_code_with_error(result: CApiResult<()>) -> VoicevoxRes
             Err(RustApi(LoadMetas(_))) => VOICEVOX_RESULT_LOAD_METAS_ERROR,
             Err(RustApi(GetSupportedDevices(_))) => VOICEVOX_RESULT_GET_SUPPORTED_DEVICES_ERROR,
             Err(RustApi(InvalidStyleId { .. })) => VOICEVOX_RESULT_INVALID_STYLE_ID_ERROR,
-            Err(RustApi(InvalidModelIndex { .. })) => VOICEVOX_RESULT_INVALID_MODEL_INDEX_ERROR,
+            Err(RustApi(InvalidModelId { .. })) => VOICEVOX_RESULT_INVALID_MODEL_ID_ERROR,
             Err(RustApi(InferenceFailed)) => VOICEVOX_RESULT_INFERENCE_ERROR,
             Err(RustApi(ExtractFullContextLabel(_))) => {
                 VOICEVOX_RESULT_EXTRACT_FULL_CONTEXT_LABEL_ERROR

diff --git a/docs/vvm.md b/docs/vvm.md
@@ -0,0 +1,9 @@
+# VVM ファイル
+
+音声合成するために必要な onnx モデルファイルなどがまとめられた zip 形式のファイル。
+root パスに必ず`manifest.json`を持つ。
+
+## マニフェストファイル
+
+ファイルの構成や、onnx モデルなどを読み込む・利用するのに必要な情報を記述した json ファイル。
+root パスに`manifest.json`として配置する。
diff --git a/model/sample.vvm b/model/sample.vvm