From 6c688948f315a1cf645e039d2a94a3ee4e4f7ceb Mon Sep 17 00:00:00 2001 From: uezo Date: Thu, 12 Dec 2024 21:41:42 +0900 Subject: [PATCH] Add support for NijiVoice as a speech synthesizer - Added support for NijiVoice as a speech synthesis service. - To use NijiVoice, add the `NijiVoiceSpeechSynthesizer` component to the AIAvatar object and configure the following in the inspector: - Set the API key and Voice Actor Id. - Enable the `IsEnabled` option. - When using NijiVoice, set the `Voice Prefetch Mode` to `Sequential`. This can be configured in the ModelController inspector. --- README.ja.md | 4 +- README.md | 4 +- .../NijiVoiceSpeechSynthesizer.cs | 193 ++++++++++++++++++ .../NijiVoiceSpeechSynthesizer.cs.meta | 11 + 4 files changed, 208 insertions(+), 4 deletions(-) create mode 100644 Scripts/SpeechSynthesizer/NijiVoiceSpeechSynthesizer.cs create mode 100644 Scripts/SpeechSynthesizer/NijiVoiceSpeechSynthesizer.cs.meta diff --git a/README.ja.md b/README.ja.md index 499c78c..e0d97b7 100644 --- a/README.ja.md +++ b/README.ja.md @@ -10,7 +10,7 @@ ChatdollKitは、お好みの3Dモデルを使って音声対話可能なチャ - **生成AI対応**: ChatGPT、Anthropic Claude、Google Gemini Pro、Difyなど、複数のLLMをサポートし、ファンクションコーリング(ChatGPT/Gemini)やマルチモーダル機能にも対応 - **3Dモデル表現**: 発話とモーションの同期、表情やアニメーションの自律制御、瞬きや口の動きの同期をサポート -- **対話制御**: 音声認識と音声合成(OpenAI、Azure、Google、Watson、VOICEVOX、VOICEROIDなど)の統合、対話状態(コンテキスト)の管理、意図抽出とトピックのルーティング、ウェイクワード検出をサポート +- **対話制御**: 音声認識と音声合成(OpenAI、Azure、Google、Watson、VOICEVOX / AivisSpeech、Style-Bert-VITS2、にじボイスなど)の統合、対話状態(コンテキスト)の管理、意図抽出とトピックのルーティング、ウェイクワード検出をサポート - **マルチプラットフォーム**: Windows、Mac、Linux、iOS、Android、およびその他のUnityサポートプラットフォーム(VR、AR、WebGLを含む)に対応 @@ -414,7 +414,7 @@ ChatdollKitはこのCoTの手法に、` ~ `の中身を読 ## 🗣️ Speech Synthesizer (Text-to-Speech) -音声合成サービスとしてクラウドサービスとして提供されるGoogle、Azure、OpenAI、Watsonをサポートするほか、キャラクターとしてより魅力的な音声を提供するVOICEVOX、VOICEROID、Style-Bert-VITS2をサポートします。 +音声合成サービスとしてクラウドサービスとして提供されるGoogle、Azure、OpenAI、Watsonをサポートするほか、キャラクターとしてより魅力的な音声を提供するVOICEVOX / 
AivisSpeech、VOICEROID、Style-Bert-VITS2、にじボイスをサポートします。 音声合成サービスを使用するには、`ChatdollKit/Scripts/SpeechSynthesizer`の各サービス名が含まれる`SpeechSynthesizer`をAIAvatarオブジェクトにアタッチして、`IsEnabled`にチェックを入れてください。すでに他のSpeechSynthesizerがアタッチされている場合、使用しないSpeechSynthesizerの`IsEnabled`はチェックを外す必要がある点に注意してください。 アタッチしたSpeechSynthesizerには、APIキーやエンドポイントなどのパラメーターをインスペクター上で設定することができます。これらのパラメーターの意味や設定すべき値等については各TTSサービス・製品のAPIリファレンスを参照してください。 diff --git a/README.md b/README.md index 9be3b59..7b70180 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ - **Generative AI Native**: Supports multiple LLMs like ChatGPT, Anthropic Claude, Google Gemini Pro, Dify, and others, with function calling (ChatGPT/Gemini) and multimodal capabilities. - **3D model expression**: Synchronizes speech and motion, controls facial expressions and animations autonomously, supports blinking and lip-sync. -- **Dialog control**: Integrates Speech-to-Text and Text-to-Speech (OpenAI, Azure, Google, Watson, VOICEVOX, VOICEROID, etc.), manages dialog state (context), extracts intents and routes topics, supports wakeword detection. +- **Dialog control**: Integrates Speech-to-Text and Text-to-Speech (OpenAI, Azure, Google, Watson, VOICEVOX / AivisSpeech, Style-Bert-VITS2, NijiVoice, etc.), manages dialog state (context), extracts intents and routes topics, supports wakeword detection. - **Multi platforms**: Compatible with Windows, Mac, Linux, iOS, Android, and other Unity-supported platforms, including VR, AR, and WebGL. @@ -420,7 +420,7 @@ You can customize the tag by setting a preferred word (e.g., "reason") as the `T ## 🗣️ Speech Synthesizer (Text-to-Speech) -We support cloud-based speech synthesis services such as Google, Azure, OpenAI, and Watson, in addition to VOICEVOX, VOICEROID, and Style-Bert-VITS2 for more characterful and engaging voices. To use a speech synthesis service, attach `SpeechSynthesizer` from `ChatdollKit/Scripts/SpeechListener` to the AIAvatar object and check the `IsEnabled` box. 
If other `SpeechSynthesizer` components are attached, make sure to uncheck the `IsEnabled` box for those not in use. +We support cloud-based speech synthesis services such as Google, Azure, OpenAI, and Watson, in addition to VOICEVOX / AivisSpeech, VOICEROID, Style-Bert-VITS2, and NijiVoice for more characterful and engaging voices. To use a speech synthesis service, attach `SpeechSynthesizer` from `ChatdollKit/Scripts/SpeechListener` to the AIAvatar object and check the `IsEnabled` box. If other `SpeechSynthesizer` components are attached, make sure to uncheck the `IsEnabled` box for those not in use. You can configure parameters like API keys and endpoints on the attached `SpeechSynthesizer` in the inspector. For more details of these parameters, refer to the API references of TTS services. diff --git a/Scripts/SpeechSynthesizer/NijiVoiceSpeechSynthesizer.cs b/Scripts/SpeechSynthesizer/NijiVoiceSpeechSynthesizer.cs new file mode 100644 index 0000000..f5650ae --- /dev/null +++ b/Scripts/SpeechSynthesizer/NijiVoiceSpeechSynthesizer.cs @@ -0,0 +1,193 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using UnityEngine; +using UnityEngine.Networking; +using Cysharp.Threading.Tasks; +using ChatdollKit.IO; +using ChatdollKit.Network; + +namespace ChatdollKit.SpeechSynthesizer +{ + public class NijiVoiceSpeechSynthesizer : SpeechSynthesizerBase + { + public bool _IsEnabled = true; + public override bool IsEnabled + { + get + { + return _IsEnabled; + } + set + { + _IsEnabled = value; + } + } + + public string EndpointUrl; + public string ApiKey; + + [Header("Voice Settings")] + public string VoiceActorId = "dba2fa0e-f750-43ad-b9f6-d5aeaea7dc16"; + public float Speed = 1.0f; + [SerializeField] + private AudioType audioType = AudioType.WAV; + + public List VoiceModelSpeeds; + + [SerializeField] + protected bool printSupportedSpeakers; + + private ChatdollHttp client; + + private void Start() + { + client = new 
ChatdollHttp(Timeout); + if (printSupportedSpeakers) + { + _ = ListSpeakersAsync(CancellationToken.None); + } + } + + // Get audio clip from NijiVoice API + protected override async UniTask DownloadAudioClipAsync(string text, Dictionary parameters, CancellationToken token) + { + if (token.IsCancellationRequested) { return null; }; + + var textToSpeech = text.Replace(" ", "").Replace("\n", "").Trim(); + if (string.IsNullOrEmpty(textToSpeech) || textToSpeech == "」") return null; + + // Generate audio data on NijiVoice server + var url = (string.IsNullOrEmpty(EndpointUrl) ? "https://api.nijivoice.com" : EndpointUrl) + $"/api/platform/v1/voice-actors/{VoiceActorId}/generate-voice"; + var speed = Speed > 0 ? Speed : VoiceModelSpeeds.FirstOrDefault(v => v.id == VoiceActorId)?.speed ?? 1.0f; + var data = new Dictionary() { + { "script", text }, + { "speed", speed.ToString() }, + { "format", audioType == AudioType.MPEG ? "mp3" : "wav" }, + }; + var headers = new Dictionary() { { "Content-Type", "application/json" }, { "x-api-key", ApiKey } }; + var generatedVoiceResponse = await client.PostJsonAsync(url, data, headers, cancellationToken: token); + +#if UNITY_WEBGL && !UNITY_EDITOR + return await DownloadAudioClipWebGLAsync(generatedVoiceResponse.generatedvoice.audioFileUrl, token); +#else + return await DownloadAudioClipNativeAsync(generatedVoiceResponse.generatedvoice.audioFileUrl, token); +#endif + } + + protected async UniTask DownloadAudioClipNativeAsync(string url, CancellationToken token) + { + using (var www = UnityWebRequestMultimedia.GetAudioClip(url, audioType)) + { + www.timeout = Timeout; + www.method = "GET"; + + // Send request + try + { + await www.SendWebRequest().ToUniTask(cancellationToken: token); + } + catch (Exception ex) + { + Debug.LogError($"Error occured while processing NijiVoice text-to-speech: {ex}"); + return null; + } + + return DownloadHandlerAudioClip.GetContent(www); + } + } + + protected async UniTask DownloadAudioClipWebGLAsync(string 
url, CancellationToken token) + { + var audioResp = await client.GetAsync(url, cancellationToken: token); + return AudioConverter.PCMToAudioClip(audioResp.Data); + } + + public async UniTask ListSpeakersAsync(CancellationToken token) + { + if (printSupportedSpeakers) + { + Debug.Log("==== Supported speakers ===="); + } + + VoiceModelSpeeds.Clear(); + foreach (var s in await GetSpearkersAsync(token)) + { + if (printSupportedSpeakers) + { + Debug.Log($"{s.Key}: {s.Value.name} ({s.Value.recommendedVoiceSpeed})"); + } + VoiceModelSpeeds.Add(new VoiceModelSpeed(){ id = s.Key, speed = s.Value.recommendedVoiceSpeed }); + } + } + + private async UniTask> GetSpearkersAsync(CancellationToken token) + { + var speakers = new Dictionary(); + + var speakerResponse = await client.GetJsonAsync( + (string.IsNullOrEmpty(EndpointUrl) ? "https://api.nijivoice.com" : EndpointUrl) + "/api/platform/v1/voice-actors", + headers: new Dictionary(){ + { "x-api-key", ApiKey } + }, + cancellationToken: token); + + foreach (var va in speakerResponse.voiceActors) + { + speakers.Add(va.id, va); + } + + return speakers; + } + + private class SpeakersResponse + { + public List voiceActors; + } + + [Serializable] + public class VoiceStyle + { + public int id; + public string style; + } + + [Serializable] + public class VoiceActorData + { + public string id; + public string name; + public string nameReading; + public int age; + public string gender; + public int birthMonth; + public int birthDay; + public string smallImageUrl; + public string mediumImageUrl; + public string largeImageUrl; + public string sampleVoiceUrl; + public string sampleScript; + public float recommendedVoiceSpeed; + public List voiceStyles; + } + + [Serializable] + public class VoiceModelSpeed + { + public string id; + public float speed; + } + + private class GeneratedVoiceResponse + { + public GeneratedVoice generatedvoice { get; set; } + } + + private class GeneratedVoice + { + public string audioFileUrl { get; set; } + 
public int duration { get; set; } + } + } +} diff --git a/Scripts/SpeechSynthesizer/NijiVoiceSpeechSynthesizer.cs.meta b/Scripts/SpeechSynthesizer/NijiVoiceSpeechSynthesizer.cs.meta new file mode 100644 index 0000000..ae294b7 --- /dev/null +++ b/Scripts/SpeechSynthesizer/NijiVoiceSpeechSynthesizer.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 5282c47e3a2e0478cafd3898ac8ca763 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: