improve output formats support (#39)

- improve support for output formats - add command line flag to list supported formats - streamline command line parsing for voices and formats
pviotti · Apr 6, 2020 · da2f50f · da2f50f
1 parent 87ed773
commit da2f50f
Show file tree

Hide file tree

Showing 7 changed files with 137 additions and 66 deletions.
diff --git a/README.md b/README.md
@@ -21,43 +21,52 @@ and as CLR binary artifact.
 
 ## Usage
 
-```bash
+```
 $ ./sayit --help
-USAGE: sayit [--help] [--setup] [--version] [--listvoices] [--voice <de|en|es|fr|hi|it|ja|pt|ru|zh>] [--output <output>] <input>
+USAGE: sayit [--help] [--version] [--setup] [--list-voices] [--list-formats]
+             [--voice <voice>] [--format <format>] [--output <output>] <input>
 
 INPUT:
 
     <input>               the text to be pronounced
 
 OPTIONS:
 
-    --setup               setup the configuration file
     --version             print sayit version
-    --listvoices, -lv     list available voice shorthands, with their corresponding voice ids
-    --voice, -v <de|en|es|fr|hi|it|ja|pt|ru|zh>
-                          the voice shorthand, which maps to one of the available voice ids (see https://aka.ms/speech/tts-languages)
+    --setup               setup the configuration file
+    --list-voices, -lv    list available voice shorthands, with their
+                          corresponding voice ids
+    --list-formats, -lf   list available output format shorthands, with their
+                          corresponding output format ids
+    --voice, -v <voice>   the voice shorthand, which maps to one of the
+                          available voice ids (see `sayit -lv` for details)
+    --format, -f <format> the format shorthand of the audio output, which maps
+                          to one of the available format ids (see `sayit -lf`
+                          for details)
     --output, -o <output> the path of the output file
     --help                display this list of options.
 ```
 At the first use you're required to run the setup wizard (`./sayit --setup`)
 and enter the configuration parameters of your Azure Cognitive Services resource,
-which are the subscription key (which you can find in the Azure portal) 
+such as the subscription key (which you can find in the Azure portal) 
 and the region identifier (see [here][region-ids]).
 SayIt will store these parameters in the configuration folder of the current
 user (e.g. `~/.config/` in Linux) as an [App Setting XML file][appsetting].
 
-Currently, SayIt supports these settings:
- - languages (*voices*): English, Italian, French, German, Spanish, Hindi, Portuguese, Russian, Japanese and Chinese (Mandarin).
+SayIt supports these settings:
+ - [languages][voices]: English, Italian, French, German, Spanish, Hindi, Portuguese, Russian, Japanese and Chinese (Mandarin).
+ - [output formats][output-formats]:
+`audio-16khz-32kbitrate-mono-mp3`, `audio-16khz-64kbitrate-mono-mp3`, `audio-16khz-128kbitrate-mono-mp3`, `audio-24khz-96kbitrate-mono-mp3`, `audio-24khz-160kbitrate-mono-mp3`, `audio-24khz-48kbitrate-mono-mp3`, `riff-8khz-16bit-mono-pcm`, `riff-16khz-16bit-mono-pcm`, `riff-24khz-16bit-mono-pcm`.
 
-    > NB: some neural voices might not be supported by your Azure Cognitive Services resource,
+> NB: some languages and output formats might not be supported by your Azure Cognitive Services resource,
 depending on its region (see [here][region-voices]).
 
- - audio export formats: MP3 16Khz 32KB/s mono
-
  [az-sub]: https://azure.microsoft.com/en-us/free/
  [az-cs]: https://azure.microsoft.com/en-us/services/cognitive-services/speech-services/
  [az-cs-price]: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/speech-services/
  [release]: https://github.com/pviotti/sayit/releases
  [appsetting]: https://docs.microsoft.com/en-us/dotnet/framework/configure-apps/file-schema/appsettings/
  [region-ids]: https://aka.ms/speech/sdkregion#speech-to-text-text-to-speech-and-translation
  [region-voices]: https://aka.ms/speech/sdkregion#text-to-speech
+ [voices]: https://aka.ms/speech/tts-languages
+ [output-formats]: https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs
diff --git a/SayIt/Config.fs b/SayIt/Config.fs
@@ -7,6 +7,7 @@ open System.Reflection
 open Argu
 
 open SayIt.Voices
+open SayIt.Formats
 
 let VERSION = Assembly.GetExecutingAssembly().GetName().Version.ToString()
 let PROGRAM_NAME = "sayit"
@@ -15,21 +16,25 @@ let CONFIG_FILE = "sayit.config"
 type Env = Environment
 
 type Args =
-    | [<NoAppSettings>] Setup
     | [<NoAppSettings>] Version
-    | [<AltCommandLine("-lv"); NoAppSettings>] ListVoices
-    | [<AltCommandLine("-v")>] Voice of VoiceType
+    | [<NoAppSettings>] Setup    
+    | [<AltCommandLine("-lv"); NoAppSettings>] List_Voices
+    | [<AltCommandLine("-lf"); NoAppSettings>] List_Formats
+    | [<AltCommandLine("-v")>] Voice of voice: string
+    | [<AltCommandLine("-f")>] Format of format: string
     | [<AltCommandLine("-o"); NoAppSettings>] Output of output: string
     | [<NoCommandLine; Mandatory>] Key of key: string
     | [<NoCommandLine; Mandatory>] Region of region: string
     | [<MainCommand; Mandatory; NoAppSettings>] Input of input: string
     interface IArgParserTemplate with
         member s.Usage =
             match s with
-            | Setup _ -> "setup the configuration file"
             | Version _ -> "print sayit version"
-            | ListVoices _ -> "list available voice shorthands, with their corresponding voice ids"
-            | Voice _ -> "the voice shorthand, which maps to one of the available voice ids (see https://aka.ms/speech/tts-languages)"
+            | Setup _ -> "setup the configuration file"            
+            | List_Voices _ -> "list available voice shorthands, with their corresponding voice ids"
+            | List_Formats _ -> "list available output format shorthands, with their corresponding output format ids"
+            | Voice _ -> "the voice shorthand, which maps to one of the available voice ids (see `sayit -lv` for details)"
+            | Format _ -> "the format shorthand of the audio output, which maps to one fo the available format ids (see `sayit -lf` for details)"
             | Output _ -> "the path of the output file"
             | Input _ -> "the text to be pronounced"
             | Key _ -> "the subscription key of your Azure Cognitive Services resource"
@@ -45,14 +50,15 @@ let getConfigFilePath() =
     Env.GetFolderPath(Env.SpecialFolder.ApplicationData, Env.SpecialFolderOption.Create)
     + string Path.DirectorySeparatorChar + CONFIG_FILE
 
-let writeConfig (key: string, region: string, voice: VoiceType) =
+let writeConfig (key: string, region: string, voice: VoiceType, format: FormatType) =
     let parser = ArgumentParser.Create<Args>()
 
     let xml =
         parser.PrintAppSettingsArguments
             [ Key key
               Region region
-              Voice voice ]
+              Voice (voice.ToString()) 
+              Format (format.ToString()) ]
     File.WriteAllText(getConfigFilePath(), xml, Text.Encoding.UTF8)
 
 let configWizard() =
@@ -65,11 +71,22 @@ let configWizard() =
     let subReg = ask "Region identifier: "
 
     let voice =
-        match VoiceType.FromString(ask "Default voice [en]: ") with
-        | Some x -> x
-        | None -> En
-
-    writeConfig (subId, subReg, voice)
+        try
+            VoiceType.FromString(ask "Default voice [en]: ") 
+        with
+            | Failure _ -> 
+                Console.WriteLine "Voice defaulted to \"en\"."
+                En
+
+    let format =
+        try
+            FormatType.FromString(ask "Default output format [mp324khz96kbps]: ")  
+        with
+            | Failure _ -> 
+                Console.WriteLine "Output format defaulted to \"mp324khz96kbps\"."
+                Mp324khz96kbps
+
+    writeConfig (subId, subReg, voice, format)
     ("The configuration has been written to " + getConfigFilePath()) |> Console.WriteLine
 
 let getConfiguration argv =
@@ -82,9 +99,12 @@ let getConfiguration argv =
     elif config.Contains Setup then
         configWizard()
         ReturnVal 0
-    elif config.Contains ListVoices then
+    elif config.Contains List_Voices then
         listVoices()
         ReturnVal 0
+    elif config.Contains List_Formats then
+        listFormats()
+        ReturnVal 0
     elif File.Exists(getConfigFilePath()) then
         let confReader = ConfigurationReader.FromAppSettingsFile(getConfigFilePath())
         Config (parser.Parse(argv, confReader, ignoreMissing = true))

diff --git a/SayIt/Formats.fs b/SayIt/Formats.fs
@@ -0,0 +1,37 @@
+module SayIt.Formats
+
+open Microsoft.CognitiveServices.Speech
+open Microsoft.FSharp.Reflection
+open SayIt.Utils
+
+/// Shorthands for the audio output formats a user can select.
+/// getFormatId translates each case into the matching
+/// SpeechSynthesisOutputFormat value.
+type FormatType =
+    // MP3 variants
+    | Mp316khz32kbps
+    | Mp316khz64kbps
+    | Mp316khz128kbps
+    | Mp324khz48kbps
+    | Mp324khz96kbps
+    | Mp324khz160kbps
+    // PCM (RIFF) variants
+    | Pcm8khz16b
+    | Pcm16khz16b
+    | Pcm24khz16b
+
+    /// Shorthand string of this case, delegated to Utils.toString.
+    override fmt.ToString() = toString fmt
+
+    /// Parses a shorthand string into a FormatType, delegated to
+    /// Utils.fromString (which fails on unrecognized input).
+    static member FromString(s: string) = fromString<FormatType> s
+
+/// Translates a format shorthand into the SpeechSynthesisOutputFormat
+/// value that is handed to the speech configuration.
+let getFormatId (format: FormatType) =
+    match format with
+    // MP3, 16 kHz
+    | Mp316khz32kbps  -> SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
+    | Mp316khz64kbps  -> SpeechSynthesisOutputFormat.Audio16Khz64KBitRateMonoMp3
+    | Mp316khz128kbps -> SpeechSynthesisOutputFormat.Audio16Khz128KBitRateMonoMp3
+    // MP3, 24 kHz
+    | Mp324khz48kbps  -> SpeechSynthesisOutputFormat.Audio24Khz48KBitRateMonoMp3
+    | Mp324khz96kbps  -> SpeechSynthesisOutputFormat.Audio24Khz96KBitRateMonoMp3
+    | Mp324khz160kbps -> SpeechSynthesisOutputFormat.Audio24Khz160KBitRateMonoMp3
+    // PCM in a RIFF container
+    | Pcm8khz16b      -> SpeechSynthesisOutputFormat.Riff8Khz16BitMonoPcm
+    | Pcm16khz16b     -> SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm
+    | Pcm24khz16b     -> SpeechSynthesisOutputFormat.Riff24Khz16BitMonoPcm
+
+/// Prints a header line followed by one "shorthand -> id" line for every
+/// case of FormatType, in declaration order.
+let listFormats() =
+    let cases = FSharpType.GetUnionCases typeof<FormatType>
+    printfn "Shorthand -> Id pairs for supported output formats (see https://bit.ly/2UOjVpg):"
+    cases
+    |> Array.iter (fun case ->
+        // The shorthand is the lower-cased case name; it is parsed back
+        // through FromString to obtain the value passed to getFormatId.
+        let shorthand = case.Name.ToLower()
+        let id = getFormatId (FormatType.FromString(shorthand))
+        printfn "%s -> %A" shorthand id)
diff --git a/SayIt/Program.fs b/SayIt/Program.fs
@@ -2,6 +2,7 @@ module SayIt.Program
 
 open SayIt.Config
 open SayIt.Voices
+open SayIt.Formats
 
 open System.Threading.Tasks
 open Microsoft.CognitiveServices.Speech
@@ -27,7 +28,8 @@ let handleSynthesisResult (task: Task<SpeechSynthesisResult>) =
 let performSpeechSynthesis (config: Argu.ParseResults<Args>, speechConfig: SpeechConfig) =
     if config.Contains Output then
         let output = config.GetResult Output
-        speechConfig.SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3)
+        let outputFormat = getFormatId( config.PostProcessResult (Format, FormatType.FromString))
+        speechConfig.SetSpeechSynthesisOutputFormat(outputFormat)
         use fileOutput = AudioConfig.FromWavFileOutput(output)
         use synthetizer = new SpeechSynthesizer(speechConfig, fileOutput)
         handleSynthesisResult (synthetizer.SpeakTextAsync(config.GetResult Input))
@@ -37,14 +39,14 @@ let performSpeechSynthesis (config: Argu.ParseResults<Args>, speechConfig: Speec
 
 [<EntryPoint>]
 let main argv =
-    match Config.getConfiguration (argv) with
+    match getConfiguration (argv) with
     | Config config ->
-        let key = config.GetResult Key
-        let region = config.GetResult Region
-        let voice = getVoiceId (config.GetResult Voice)
+         let key = config.GetResult Key
+         let region = config.GetResult Region
+         let voice = getVoiceId (config.PostProcessResult (Voice, VoiceType.FromString))
 
-        let speechConfig = SpeechConfig.FromSubscription(key, region)
-        speechConfig.SpeechSynthesisVoiceName <- voice
+         let speechConfig = SpeechConfig.FromSubscription(key, region)
+         speechConfig.SpeechSynthesisVoiceName <- voice
 
-        performSpeechSynthesis (config, speechConfig)
+         performSpeechSynthesis (config, speechConfig)
     | ReturnVal ret -> ret
diff --git a/SayIt/SayIt.fsproj b/SayIt/SayIt.fsproj
@@ -1,20 +1,18 @@
-<Project Sdk="Microsoft.NET.Sdk">
-
-    <PropertyGroup>
-        <OutputType>Exe</OutputType>
-        <TargetFramework>netcoreapp3.1</TargetFramework>
-        <Version>0.3.0</Version>
-    </PropertyGroup>
-
-    <ItemGroup>
-        <Compile Include="Voices.fs" />
-        <Compile Include="Config.fs" />
-        <Compile Include="Program.fs" />
-    </ItemGroup>
-
-    <ItemGroup>
-        <PackageReference Include="Argu" Version="6.0.0" />
-        <PackageReference Include="Microsoft.CognitiveServices.Speech" Version="1.11.0" />
-    </ItemGroup>
-
-</Project>
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>netcoreapp3.1</TargetFramework>
+    <Version>0.3.0</Version>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="Utils.fs" />
+    <Compile Include="Formats.fs" />
+    <Compile Include="Voices.fs" />
+    <Compile Include="Config.fs" />
+    <Compile Include="Program.fs" />
+  </ItemGroup>
+  <ItemGroup>
+    <PackageReference Include="Argu" Version="6.0.0" />
+    <PackageReference Include="Microsoft.CognitiveServices.Speech" Version="1.11.0" />
+  </ItemGroup>
+</Project>
diff --git a/SayIt/Utils.fs b/SayIt/Utils.fs
@@ -0,0 +1,13 @@
+module SayIt.Utils
+
+open Microsoft.FSharp.Reflection
+
+// Convert discriminated unions to and from strings - http://fssnip.net/9l
+
+/// Returns the lower-cased name of the union case of `x`.
+/// Lower-casing is culture-invariant so the resulting shorthand does not
+/// depend on the current locale (e.g. "It" must not become "ıt" under tr-TR).
+let toString (x: 'a) =
+    let case, _ = FSharpValue.GetUnionFields(x, typeof<'a>)
+    case.Name.ToLowerInvariant()
+
+/// Builds the union case of 'a whose name matches `s`; the case is created
+/// without field values. Matching ignores case using ordinal rules, so
+/// "en", "EN" and "En" all select the case `En`.
+/// Raises Failure when `s` does not identify exactly one case.
+let fromString<'a> (s: string) =
+    // Besides the ordinal match, keep accepting the culture-sensitive
+    // lower-casing used by the previous implementation, so strings produced
+    // by it are still recognized.
+    let matches (case: UnionCaseInfo) =
+        System.String.Equals(case.Name, s, System.StringComparison.OrdinalIgnoreCase)
+        || case.Name.ToLower() = s
+    match FSharpType.GetUnionCases typeof<'a> |> Array.filter matches with
+    | [| case |] -> FSharpValue.MakeUnion(case, [||]) :?> 'a
+    | _ -> failwith (s + " not recognized as a valid parameter.")
diff --git a/SayIt/Voices.fs b/SayIt/Voices.fs
@@ -1,16 +1,7 @@
 module SayIt.Voices
 
 open Microsoft.FSharp.Reflection
-
-// Create discriminated unions from string - http://fssnip.net/9l
-let toString (x: 'a) =
-    match FSharpValue.GetUnionFields(x, typeof<'a>) with
-    | case, _ -> case.Name
-
-let fromString<'a> (s: string) =
-    match FSharpType.GetUnionCases typeof<'a> |> Array.filter (fun case -> case.Name = s) with
-    | [| case |] -> Some(FSharpValue.MakeUnion(case, [||]) :?> 'a)
-    | _ -> None
+open SayIt.Utils
 
 type VoiceType =
     | De
@@ -41,6 +32,7 @@ let getVoiceId (voice: VoiceType) =
 
 let listVoices() =
     let types = FSharpType.GetUnionCases typeof<VoiceType>
+    printfn "Shorthand -> Id pairs for supported voices (see https://aka.ms/speech/tts-languages):"
     for t in types do
-        let id = getVoiceId (VoiceType.FromString(t.Name).Value)
-        printfn "%s - %s" (t.Name.ToLower()) id
+        let id = getVoiceId (VoiceType.FromString(t.Name.ToLower()))
+        printfn "%s -> %s" (t.Name.ToLower()) id