From d5af48df75a51b92c2fa5fa426dc2f0bf3ccba5b Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Wed, 6 Sep 2023 09:00:15 +0200 Subject: [PATCH 01/15] Initial commit for in-memory tts --- libs/langchain/langchain/tools/audio_utils.py | 12 +++++++ .../azure_cognitive_services/text2speech.py | 36 ++++++++++--------- .../tools/eleven_labs/text2speech.py | 16 ++++----- 3 files changed, 38 insertions(+), 26 deletions(-) create mode 100644 libs/langchain/langchain/tools/audio_utils.py diff --git a/libs/langchain/langchain/tools/audio_utils.py b/libs/langchain/langchain/tools/audio_utils.py new file mode 100644 index 0000000000000..7b59ef7ce8324 --- /dev/null +++ b/libs/langchain/langchain/tools/audio_utils.py @@ -0,0 +1,12 @@ +import tempfile + +def save_audio(audio) -> str: + with tempfile.NamedTemporaryFile(mode="bx", suffix=".wav", delete=False) as f: + f.write(audio) + return f.name + + +def load_audio(audio_file: str) -> bytes: + with open(audio_file, mode="rb") as f: + audio = f.read() + return audio \ No newline at end of file diff --git a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py index fee35591f38e3..9d4d8824cfceb 100644 --- a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py +++ b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py @@ -1,14 +1,21 @@ from __future__ import annotations import logging -import tempfile -from typing import Any, Dict, Optional +from IPython import display +from typing import Any, Dict, Optional, Union from langchain.callbacks.manager import CallbackManagerForToolRun from langchain.pydantic_v1 import root_validator from langchain.tools.base import BaseTool from langchain.utils import get_from_dict_or_env +try: + import azure.cognitiveservices.speech as speechsdk +except ImportError: + raise ImportError( + "azure.cognitiveservices.speech is not installed. 
" "Run `pip install azure-cognitiveservices-speech` to install." + ) + logger = logging.getLogger(__name__) @@ -55,11 +62,7 @@ def validate_environment(cls, values: Dict) -> Dict: return values - def _text2speech(self, text: str, speech_language: str) -> str: - try: - import azure.cognitiveservices.speech as speechsdk - except ImportError: - pass + def _text2speech(self, text: str, speech_language: str) -> Union[speechsdk.AudioDataStream, str]: self.speech_config.speech_synthesis_language = speech_language speech_synthesizer = speechsdk.SpeechSynthesizer( @@ -69,12 +72,7 @@ def _text2speech(self, text: str, speech_language: str) -> str: if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: stream = speechsdk.AudioDataStream(result) - with tempfile.NamedTemporaryFile( - mode="wb", suffix=".wav", delete=False - ) as f: - stream.save_to_wav_file(f.name) - - return f.name + return stream elif result.reason == speechsdk.ResultReason.Canceled: cancellation_details = result.cancellation_details @@ -93,10 +91,16 @@ def _run( self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None, - ) -> str: + ) -> Union[speechsdk.AudioDataStream, str]: """Use the tool.""" try: - speech_file = self._text2speech(query, self.speech_language) - return speech_file + speech = self._text2speech(query, self.speech_language) + self.play(speech) + return speech except Exception as e: raise RuntimeError(f"Error while running AzureCogsText2SpeechTool: {e}") + + def play(self, speech): + + audio = display.Audio(speech) + display.display(audio) diff --git a/libs/langchain/langchain/tools/eleven_labs/text2speech.py b/libs/langchain/langchain/tools/eleven_labs/text2speech.py index 5c6edb00b9c1d..8a268c5ac74c6 100644 --- a/libs/langchain/langchain/tools/eleven_labs/text2speech.py +++ b/libs/langchain/langchain/tools/eleven_labs/text2speech.py @@ -39,25 +39,21 @@ def validate_environment(cls, values: Dict) -> Dict: return values - def _text2speech(self, text: str) -> 
str: + def _text2speech(self, text: str) -> bytes: speech = elevenlabs.generate(text=text, model=self.model) - with tempfile.NamedTemporaryFile(mode="bx", suffix=".wav", delete=False) as f: - f.write(speech) - return f.name + return speech def _run(self, query: str) -> str: """Use the tool.""" try: - speech_file = self._text2speech(query) - return speech_file + speech = self._text2speech(query) + self.play(speech) + return "Speech has been generated" except Exception as e: raise RuntimeError(f"Error while running ElevenLabsText2SpeechTool: {e}") - def play(self, speech_file: str) -> None: + def play(self, speech) -> None: """Play the text as speech.""" - with open(speech_file, mode="rb") as f: - speech = f.read() - elevenlabs.play(speech) def stream(self, query: str) -> None: From ffc6e8d8b17c808efd0e64d42b233de73cd34c7a Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Wed, 6 Sep 2023 09:54:54 +0200 Subject: [PATCH 02/15] Update docstring --- .../langchain/tools/azure_cognitive_services/text2speech.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py index 9d4d8824cfceb..c915f3edf3683 100644 --- a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py +++ b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py @@ -91,16 +91,16 @@ def _run( self, query: str, run_manager: Optional[CallbackManagerForToolRun] = None, - ) -> Union[speechsdk.AudioDataStream, str]: + ) -> str: """Use the tool.""" try: speech = self._text2speech(query, self.speech_language) self.play(speech) - return speech + return "Speech has been generated" except Exception as e: raise RuntimeError(f"Error while running AzureCogsText2SpeechTool: {e}") def play(self, speech): - + """Play the speech.""" audio = display.Audio(speech) display.display(audio) From ed8e86af156c079d80a2956e29a536ab35ba6e4e 
Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Wed, 6 Sep 2023 09:57:44 +0200 Subject: [PATCH 03/15] Saving and loading utils --- libs/langchain/langchain/tools/audio_utils.py | 5 ++-- .../azure_cognitive_services/text2speech.py | 25 +++++++++++++++---- .../tools/eleven_labs/text2speech.py | 13 +++++++++- 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/libs/langchain/langchain/tools/audio_utils.py b/libs/langchain/langchain/tools/audio_utils.py index 7b59ef7ce8324..40cf917d6f994 100644 --- a/libs/langchain/langchain/tools/audio_utils.py +++ b/libs/langchain/langchain/tools/audio_utils.py @@ -1,5 +1,6 @@ import tempfile - + + def save_audio(audio) -> str: with tempfile.NamedTemporaryFile(mode="bx", suffix=".wav", delete=False) as f: f.write(audio) @@ -9,4 +10,4 @@ def save_audio(audio) -> str: def load_audio(audio_file: str) -> bytes: with open(audio_file, mode="rb") as f: audio = f.read() - return audio \ No newline at end of file + return audio diff --git a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py index c915f3edf3683..e5a7d1b962c85 100644 --- a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py +++ b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py @@ -1,11 +1,13 @@ from __future__ import annotations import logging -from IPython import display from typing import Any, Dict, Optional, Union +from IPython import display + from langchain.callbacks.manager import CallbackManagerForToolRun from langchain.pydantic_v1 import root_validator +from langchain.tools.audio_utils import save_audio, load_audio from langchain.tools.base import BaseTool from langchain.utils import get_from_dict_or_env @@ -13,7 +15,8 @@ import azure.cognitiveservices.speech as speechsdk except ImportError: raise ImportError( - "azure.cognitiveservices.speech is not installed. 
" "Run `pip install azure-cognitiveservices-speech` to install." + "azure.cognitiveservices.speech is not installed. " + "Run `pip install azure-cognitiveservices-speech` to install." ) logger = logging.getLogger(__name__) @@ -62,8 +65,9 @@ def validate_environment(cls, values: Dict) -> Dict: return values - def _text2speech(self, text: str, speech_language: str) -> Union[speechsdk.AudioDataStream, str]: - + def _text2speech( + self, text: str, speech_language: str + ) -> Union[speechsdk.AudioDataStream, str]: self.speech_config.speech_synthesis_language = speech_language speech_synthesizer = speechsdk.SpeechSynthesizer( speech_config=self.speech_config, audio_config=None @@ -99,8 +103,19 @@ def _run( return "Speech has been generated" except Exception as e: raise RuntimeError(f"Error while running AzureCogsText2SpeechTool: {e}") - + def play(self, speech): """Play the speech.""" audio = display.Audio(speech) display.display(audio) + + def generate_and_save(self, query: str) -> str: + """Save the text as speech to a temporary file.""" + speech = self._text2speech(query) + path = save_audio(speech) + return path + + def load_and_play(self, path: str) -> None: + """Load the text as speech from a temporary file.""" + speech = load_audio(path) + self.play(speech) \ No newline at end of file diff --git a/libs/langchain/langchain/tools/eleven_labs/text2speech.py b/libs/langchain/langchain/tools/eleven_labs/text2speech.py index 8a268c5ac74c6..7a6bbfef3989c 100644 --- a/libs/langchain/langchain/tools/eleven_labs/text2speech.py +++ b/libs/langchain/langchain/tools/eleven_labs/text2speech.py @@ -1,7 +1,7 @@ -import tempfile from typing import Dict, Union from langchain.pydantic_v1 import root_validator +from langchain.tools.audio_utils import save_audio, load_audio from langchain.tools.base import BaseTool from langchain.tools.eleven_labs.models import ElevenLabsModel from langchain.utils import get_from_dict_or_env @@ -61,3 +61,14 @@ def stream(self, query: str) -> None: 
Play the text in your speakers.""" speech_stream = elevenlabs.generate(text=query, model=self.model, stream=True) elevenlabs.stream(speech_stream) + + def generate_and_save(self, query: str) -> str: + """Save the text as speech to a temporary file.""" + speech = self._text2speech(query) + path = save_audio(speech) + return path + + def load_and_play(self, path: str) -> None: + """Load the text as speech from a temporary file.""" + speech = load_audio(path) + self.play(speech) \ No newline at end of file From dcb2a2033a6b20d6c5f19522272e994879d9ebd9 Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Wed, 6 Sep 2023 10:13:40 +0200 Subject: [PATCH 04/15] Fix linters and update notebook --- .../integrations/tools/eleven_labs_tts.ipynb | 98 +++++++++++++++---- libs/langchain/langchain/agents/load_tools.py | 4 +- libs/langchain/langchain/tools/audio_utils.py | 2 +- .../azure_cognitive_services/text2speech.py | 31 +++--- .../langchain/tools/eleven_labs/__init__.py | 2 +- .../tools/eleven_labs/text2speech.py | 10 +- 6 files changed, 97 insertions(+), 50 deletions(-) diff --git a/docs/extras/integrations/tools/eleven_labs_tts.ipynb b/docs/extras/integrations/tools/eleven_labs_tts.ipynb index 7178bf8704203..733e34940c33d 100644 --- a/docs/extras/integrations/tools/eleven_labs_tts.ipynb +++ b/docs/extras/integrations/tools/eleven_labs_tts.ipynb @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "2f57a647-9214-4562-a8cf-f263a15d1f40", "metadata": {}, "outputs": [ @@ -60,7 +60,7 @@ "'eleven_labs_text2speech'" ] }, - "execution_count": 6, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -79,18 +79,28 @@ "id": "d4613fed-66f0-47c6-be50-7e7670654427", "metadata": {}, "source": [ - "We can generate audio, save it to the temporary file and then play it." + "We can generate audio and play it." 
] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "id": "f1984844-aa75-4f83-9d42-1c8052d87cc0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'Speech has been generated'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "speech_file = tts.run(text_to_speak)\n", - "tts.play(speech_file)" + "tts.run(text_to_speak)" ] }, { @@ -103,12 +113,60 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "id": "d72822f8-3223-47e2-8d2e-6ff46b8c8645", "metadata": {}, "outputs": [], "source": [ - "tts.stream(text_to_speak)" + "tts.stream_speech(text_to_speak)" + ] + }, + { + "cell_type": "markdown", + "id": "33b7c8b5-c0a7-4d4f-8256-475aac248322", + "metadata": {}, + "source": [ + "Speech can also be generated and saved to a temporary file." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3265a6db-773c-4043-905b-48e40fc1adc4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/tmp/tmpfiy4n39j.wav'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path = tts.generate_and_save(text_to_speak)\n", + "path" + ] + }, + { + "cell_type": "markdown", + "id": "6a25d66b-ade4-4f88-ae86-57c67ad066c4", + "metadata": {}, + "source": [ + "And played from the path." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a95326ba-ce70-4950-a03f-612e3705dfdd", + "metadata": {}, + "outputs": [], + "source": [ + "tts.load_and_play(path)" ] }, { @@ -121,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "id": "37626aea-0cf0-4849-9c00-c0f40515ffe0", "metadata": {}, "outputs": [], @@ -132,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "id": "c168f28e-d5b7-4c93-bed8-0ab317b4a44b", "metadata": {}, "outputs": [], @@ -149,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "id": "336bf95a-3ccb-4963-aac3-638a4df2ed78", "metadata": {}, "outputs": [ @@ -171,13 +229,13 @@ "```\n", "\n", "\u001b[0m\n", - "Observation: \u001b[36;1m\u001b[1;3m/tmp/tmpsfg783f1.wav\u001b[0m\n", - "Thought:\u001b[32;1m\u001b[1;3m I have the audio file ready to be sent to the human\n", + "Observation: \u001b[36;1m\u001b[1;3mSpeech has been generated\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3m I need to provide the final answer\n", "Action:\n", "```\n", "{\n", " \"action\": \"Final Answer\",\n", - " \"action_input\": \"/tmp/tmpsfg783f1.wav\"\n", + " \"action_input\": \"Why did the chicken cross the playground? 
To get to the other slide!\"\n", "}\n", "```\n", "\n", @@ -188,18 +246,16 @@ } ], "source": [ - "audio_file = agent.run(\"Tell me a joke and read it out for me.\")" + "output = agent.run(\"Tell me a joke and read it out for me.\")" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "f0aa7aa9-4682-4599-8cae-59347d9e5210", + "execution_count": null, + "id": "5747b523-b0c5-43aa-acf9-cf4687de1517", "metadata": {}, "outputs": [], - "source": [ - "tts.play(audio_file)" - ] + "source": [] } ], "metadata": { diff --git a/libs/langchain/langchain/agents/load_tools.py b/libs/langchain/langchain/agents/load_tools.py index 714abf0f3bfd0..28a3b48350376 100644 --- a/libs/langchain/langchain/agents/load_tools.py +++ b/libs/langchain/langchain/agents/load_tools.py @@ -286,8 +286,8 @@ def _get_dataforseo_api_search_json(**kwargs: Any) -> BaseTool: return DataForSeoAPISearchResults(api_wrapper=DataForSeoAPIWrapper(**kwargs)) -def _get_eleven_labs_text2speech() -> BaseTool: - return ElevenLabsText2SpeechTool() +def _get_eleven_labs_text2speech(**kwargs: Any) -> BaseTool: + return ElevenLabsText2SpeechTool(**kwargs) _EXTRA_LLM_TOOLS: Dict[ diff --git a/libs/langchain/langchain/tools/audio_utils.py b/libs/langchain/langchain/tools/audio_utils.py index 40cf917d6f994..b49a8efdd43c9 100644 --- a/libs/langchain/langchain/tools/audio_utils.py +++ b/libs/langchain/langchain/tools/audio_utils.py @@ -1,7 +1,7 @@ import tempfile -def save_audio(audio) -> str: +def save_audio(audio: bytes) -> str: with tempfile.NamedTemporaryFile(mode="bx", suffix=".wav", delete=False) as f: f.write(audio) return f.name diff --git a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py index e5a7d1b962c85..8790d18d1ee43 100644 --- a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py +++ b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py @@ -1,13 +1,13 @@ from __future__ 
import annotations import logging -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, Optional from IPython import display from langchain.callbacks.manager import CallbackManagerForToolRun from langchain.pydantic_v1 import root_validator -from langchain.tools.audio_utils import save_audio, load_audio +from langchain.tools.audio_utils import load_audio, save_audio from langchain.tools.base import BaseTool from langchain.utils import get_from_dict_or_env @@ -51,23 +51,14 @@ def validate_environment(cls, values: Dict) -> Dict: values, "azure_cogs_region", "AZURE_COGS_REGION" ) - try: - import azure.cognitiveservices.speech as speechsdk - - values["speech_config"] = speechsdk.SpeechConfig( - subscription=azure_cogs_key, region=azure_cogs_region - ) - except ImportError: - raise ImportError( - "azure-cognitiveservices-speech is not installed. " - "Run `pip install azure-cognitiveservices-speech` to install." - ) - + values["speech_config"] = speechsdk.SpeechConfig( + subscription=azure_cogs_key, region=azure_cogs_region + ) return values def _text2speech( self, text: str, speech_language: str - ) -> Union[speechsdk.AudioDataStream, str]: + ) -> speechsdk.AudioDataStream: self.speech_config.speech_synthesis_language = speech_language speech_synthesizer = speechsdk.SpeechSynthesizer( speech_config=self.speech_config, audio_config=None @@ -86,10 +77,10 @@ def _text2speech( f"Speech synthesis error: {cancellation_details.error_details}" ) - return "Speech synthesis canceled." 
+ raise RuntimeError("Speech synthesis canceled.") else: - return f"Speech synthesis failed: {result.reason}" + raise RuntimeError(f"Speech synthesis failed: {result.reason}") def _run( self, @@ -104,18 +95,18 @@ def _run( except Exception as e: raise RuntimeError(f"Error while running AzureCogsText2SpeechTool: {e}") - def play(self, speech): + def play(self, speech: speechsdk.AudioDataStream) -> None: """Play the speech.""" audio = display.Audio(speech) display.display(audio) def generate_and_save(self, query: str) -> str: """Save the text as speech to a temporary file.""" - speech = self._text2speech(query) + speech = self._text2speech(query, self.speech_language) path = save_audio(speech) return path def load_and_play(self, path: str) -> None: """Load the text as speech from a temporary file.""" speech = load_audio(path) - self.play(speech) \ No newline at end of file + self.play(speech) diff --git a/libs/langchain/langchain/tools/eleven_labs/__init__.py b/libs/langchain/langchain/tools/eleven_labs/__init__.py index 077acb1e4e641..86ccba0804acb 100644 --- a/libs/langchain/langchain/tools/eleven_labs/__init__.py +++ b/libs/langchain/langchain/tools/eleven_labs/__init__.py @@ -2,4 +2,4 @@ from langchain.tools.eleven_labs.text2speech import ElevenLabsText2SpeechTool -__all__ = [ElevenLabsText2SpeechTool] +__all__ = ["ElevenLabsText2SpeechTool"] diff --git a/libs/langchain/langchain/tools/eleven_labs/text2speech.py b/libs/langchain/langchain/tools/eleven_labs/text2speech.py index 7a6bbfef3989c..1f9f12667c04d 100644 --- a/libs/langchain/langchain/tools/eleven_labs/text2speech.py +++ b/libs/langchain/langchain/tools/eleven_labs/text2speech.py @@ -1,7 +1,7 @@ from typing import Dict, Union from langchain.pydantic_v1 import root_validator -from langchain.tools.audio_utils import save_audio, load_audio +from langchain.tools.audio_utils import load_audio, save_audio from langchain.tools.base import BaseTool from langchain.tools.eleven_labs.models import ElevenLabsModel 
from langchain.utils import get_from_dict_or_env @@ -52,16 +52,16 @@ def _run(self, query: str) -> str: except Exception as e: raise RuntimeError(f"Error while running ElevenLabsText2SpeechTool: {e}") - def play(self, speech) -> None: + def play(self, speech: bytes) -> None: """Play the text as speech.""" elevenlabs.play(speech) - def stream(self, query: str) -> None: + def stream_speech(self, query: str) -> None: """Stream the text as speech as it is generated. Play the text in your speakers.""" speech_stream = elevenlabs.generate(text=query, model=self.model, stream=True) elevenlabs.stream(speech_stream) - + def generate_and_save(self, query: str) -> str: """Save the text as speech to a temporary file.""" speech = self._text2speech(query) @@ -71,4 +71,4 @@ def generate_and_save(self, query: str) -> str: def load_and_play(self, path: str) -> None: """Load the text as speech from a temporary file.""" speech = load_audio(path) - self.play(speech) \ No newline at end of file + self.play(speech) From eb24bfaf0f7b03c8b086e871b58b0883acaa74be Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Wed, 6 Sep 2023 10:18:10 +0200 Subject: [PATCH 05/15] Fix unit test --- .../langchain/langchain/tools/eleven_labs/text2speech.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/tools/eleven_labs/text2speech.py b/libs/langchain/langchain/tools/eleven_labs/text2speech.py index 1f9f12667c04d..cfe73994d4e41 100644 --- a/libs/langchain/langchain/tools/eleven_labs/text2speech.py +++ b/libs/langchain/langchain/tools/eleven_labs/text2speech.py @@ -1,5 +1,6 @@ -from typing import Dict, Union +from typing import Dict, Optional, Union +from langchain.callbacks.manager import CallbackManagerForToolRun from langchain.pydantic_v1 import root_validator from langchain.tools.audio_utils import load_audio, save_audio from langchain.tools.base import BaseTool @@ -43,7 +44,11 @@ def _text2speech(self, text: str) -> bytes: speech = 
elevenlabs.generate(text=text, model=self.model) return speech - def _run(self, query: str) -> str: + def _run( + self, + query: str, + run_manager: Optional[CallbackManagerForToolRun] = None, + ) -> str: """Use the tool.""" try: speech = self._text2speech(query) From dc441009eb64600f7e11dbcc0647a65366853fca Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Thu, 7 Sep 2023 09:07:24 +0200 Subject: [PATCH 06/15] Working version for AzureCogServices --- .../toolkits/azure_cognitive_services.ipynb | 101 +++++++++--------- .../azure_cognitive_services/text2speech.py | 9 +- .../tools/eleven_labs/text2speech.py | 3 +- 3 files changed, 52 insertions(+), 61 deletions(-) diff --git a/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb b/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb index 609cc2e4e498e..a59cc78ad4fc7 100644 --- a/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb +++ b/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -71,19 +71,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['Azure Cognitive Services Image Analysis',\n", - " 'Azure Cognitive Services Form Recognizer',\n", - " 'Azure Cognitive Services Speech2Text',\n", - " 'Azure Cognitive Services Text2Speech']" + "['azure_cognitive_services_form_recognizer',\n", + " 'azure_cognitive_services_speech2text',\n", + " 'azure_cognitive_services_text2speech',\n", + " 'azure_cognitive_services_image_analysis']" ] }, - "execution_count": null, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -101,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -111,7 +111,7 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -126,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -140,11 +140,10 @@ "Action:\n", "```\n", "{\n", - " \"action\": \"Azure Cognitive Services Image Analysis\",\n", + " \"action\": \"azure_cognitive_services_image_analysis\",\n", " \"action_input\": \"https://images.openai.com/blob/9ad5a2ab-041f-475f-ad6a-b51899c50182/ingredients.png\"\n", "}\n", "```\n", - "\n", "\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3mCaption: a group of eggs and flour in bowls\n", "Objects: Egg, Egg, Food\n", @@ -154,7 +153,7 @@ "```\n", "{\n", " \"action\": \"Final Answer\",\n", - " \"action_input\": \"You can make pancakes, omelettes, or quiches with these ingredients!\"\n", + " \"action_input\": \"Based on the objects and tags in the image, you could make a cake, pancakes, omelette, or quiche.\"\n", "}\n", "```\u001b[0m\n", "\n", @@ -164,10 +163,10 @@ { "data": { "text/plain": [ - "'You can make pancakes, omelettes, or quiches with these ingredients!'" + "'Based on the objects and tags in the image, you could make a cake, pancakes, omelette, or quiche.'" ] }, - "execution_count": null, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -181,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -194,58 +193,54 @@ "\u001b[32;1m\u001b[1;3mAction:\n", "```\n", "{\n", - " \"action\": \"Azure Cognitive Services Text2Speech\",\n", - " \"action_input\": \"Why did the chicken cross the playground? To get to the other slide!\"\n", + " \"action\": \"azure_cognitive_services_text2speech\",\n", + " \"action_input\": \"Why did the chicken cross the road? 
To get to the other side!\"\n", "}\n", "```\n", "\n", - "\u001b[0m\n", - "Observation: \u001b[31;1m\u001b[1;3m/tmp/tmpa3uu_j6b.wav\u001b[0m\n", - "Thought:\u001b[32;1m\u001b[1;3m I have the audio file of the joke\n", - "Action:\n", - "```\n", - "{\n", - " \"action\": \"Final Answer\",\n", - " \"action_input\": \"/tmp/tmpa3uu_j6b.wav\"\n", - "}\n", - "```\u001b[0m\n", - "\n", - "\u001b[1m> Finished chain.\u001b[0m\n" + "\u001b[0m" ] }, { "data": { + "text/html": [ + "\n", + " \n", + " " + ], "text/plain": [ - "'/tmp/tmpa3uu_j6b.wav'" + "" ] }, - "execution_count": null, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Observation: \u001b[38;5;200m\u001b[1;3mSpeech has been generated\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3m I have the response to the human\n", + "Action:\n", + "```\n", + "{\n", + " \"action\": \"Final Answer\",\n", + " \"action_input\": \"Why did the chicken cross the road? 
To get to the other side!\"\n", + "}\n", + "```\n", + "\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] } ], "source": [ "audio_file = agent.run(\"Tell me a joke and read it out for me.\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from IPython import display\n", - "\n", - "audio = display.Audio(audio_file)\n", - "display.display(audio)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -264,7 +259,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py index 8790d18d1ee43..70889cb4a0e27 100644 --- a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py +++ b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py @@ -56,9 +56,7 @@ def validate_environment(cls, values: Dict) -> Dict: ) return values - def _text2speech( - self, text: str, speech_language: str - ) -> speechsdk.AudioDataStream: + def _text2speech(self, text: str, speech_language: str) -> bytes: self.speech_config.speech_synthesis_language = speech_language speech_synthesizer = speechsdk.SpeechSynthesizer( speech_config=self.speech_config, audio_config=None @@ -66,8 +64,7 @@ def _text2speech( result = speech_synthesizer.speak_text(text) if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: - stream = speechsdk.AudioDataStream(result) - return stream + return result.audio_data elif result.reason == speechsdk.ResultReason.Canceled: cancellation_details = result.cancellation_details @@ -95,7 +92,7 @@ def _run( except Exception as e: raise RuntimeError(f"Error while running AzureCogsText2SpeechTool: {e}") - def play(self, speech: 
speechsdk.AudioDataStream) -> None: + def play(self, speech: bytes) -> None: """Play the speech.""" audio = display.Audio(speech) display.display(audio) diff --git a/libs/langchain/langchain/tools/eleven_labs/text2speech.py b/libs/langchain/langchain/tools/eleven_labs/text2speech.py index cfe73994d4e41..68409b7c81ef3 100644 --- a/libs/langchain/langchain/tools/eleven_labs/text2speech.py +++ b/libs/langchain/langchain/tools/eleven_labs/text2speech.py @@ -9,7 +9,6 @@ try: import elevenlabs - except ImportError: raise ImportError( "elevenlabs is not installed. " "Run `pip install elevenlabs` to install." @@ -74,6 +73,6 @@ def generate_and_save(self, query: str) -> str: return path def load_and_play(self, path: str) -> None: - """Load the text as speech from a temporary file.""" + """Load the text as speech from a temporary file and play it.""" speech = load_audio(path) self.play(speech) From 2bc6e3838f9fdd957bb05905f12d271d13172b1d Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Thu, 7 Sep 2023 09:10:44 +0200 Subject: [PATCH 07/15] Add docstring --- libs/langchain/langchain/tools/audio_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/libs/langchain/langchain/tools/audio_utils.py b/libs/langchain/langchain/tools/audio_utils.py index b49a8efdd43c9..a8c03a94025bf 100644 --- a/libs/langchain/langchain/tools/audio_utils.py +++ b/libs/langchain/langchain/tools/audio_utils.py @@ -1,13 +1,18 @@ import tempfile +from pathlib import Path def save_audio(audio: bytes) -> str: + """Save audio to a temporary file and return the path.""" with tempfile.NamedTemporaryFile(mode="bx", suffix=".wav", delete=False) as f: f.write(audio) return f.name -def load_audio(audio_file: str) -> bytes: - with open(audio_file, mode="rb") as f: - audio = f.read() - return audio +def load_audio(audio_file_path: str) -> bytes: + """Load audio from a file into bytes.""" + if Path(audio_file_path).exists(): + with open(audio_file_path, mode="rb") as f: + 
audio = f.read() + return audio + raise FileNotFoundError(f"File {audio_file_path} not found.") From c79e39998cc150b935cd846ca4d69f39964abff4 Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Thu, 7 Sep 2023 09:14:48 +0200 Subject: [PATCH 08/15] Modify docstring in tts class --- .../langchain/tools/azure_cognitive_services/text2speech.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py index 70889cb4a0e27..2423e580b38c7 100644 --- a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py +++ b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py @@ -104,6 +104,6 @@ def generate_and_save(self, query: str) -> str: return path def load_and_play(self, path: str) -> None: - """Load the text as speech from a temporary file.""" + """Load the text as speech from a temporary file and play it.""" speech = load_audio(path) self.play(speech) From 34e1b7b9821b369ac8d3ccd348bac1cfd455c7da Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Wed, 13 Sep 2023 11:56:12 +0200 Subject: [PATCH 09/15] Some changes after rebasing --- .../toolkits/azure_cognitive_services.ipynb | 31 ++++++++++++++++++- .../integrations/tools/eleven_labs_tts.ipynb | 26 ++++++---------- .../tools/eleven_labs/text2speech.py | 3 +- 3 files changed, 41 insertions(+), 19 deletions(-) diff --git a/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb b/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb index a59cc78ad4fc7..5fef860ba0d84 100644 --- a/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb +++ b/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb @@ -224,7 +224,7 @@ "text": [ "\n", "Observation: \u001b[38;5;200m\u001b[1;3mSpeech has been generated\u001b[0m\n", - "Thought:\u001b[32;1m\u001b[1;3m I have the response to the human\n", + 
"Thought:\u001b[32;1m\u001b[1;3m I need to read out the joke\n", "Action:\n", "```\n", "{\n", @@ -241,6 +241,35 @@ "source": [ "audio_file = agent.run(\"Tell me a joke and read it out for me.\")" ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tts = toolkit.get_tools()[-2]\n", + "path = tts.generate_and_save(\"Hello world\")\n", + "tts.load_and_play(path)" + ] } ], "metadata": { diff --git a/docs/extras/integrations/tools/eleven_labs_tts.ipynb b/docs/extras/integrations/tools/eleven_labs_tts.ipynb index 733e34940c33d..598714d5323ba 100644 --- a/docs/extras/integrations/tools/eleven_labs_tts.ipynb +++ b/docs/extras/integrations/tools/eleven_labs_tts.ipynb @@ -113,7 +113,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "d72822f8-3223-47e2-8d2e-6ff46b8c8645", "metadata": {}, "outputs": [], @@ -131,17 +131,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "3265a6db-773c-4043-905b-48e40fc1adc4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'/tmp/tmpfiy4n39j.wav'" + "'/tmp/tmp03j5opkw.wav'" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -161,7 +161,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "a95326ba-ce70-4950-a03f-612e3705dfdd", "metadata": {}, "outputs": [], @@ -179,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "37626aea-0cf0-4849-9c00-c0f40515ffe0", "metadata": {}, "outputs": [], @@ -190,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "c168f28e-d5b7-4c93-bed8-0ab317b4a44b", "metadata": {}, "outputs": [], @@ -207,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 10, + 
"execution_count": 9, "id": "336bf95a-3ccb-4963-aac3-638a4df2ed78", "metadata": {}, "outputs": [ @@ -230,7 +230,7 @@ "\n", "\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3mSpeech has been generated\u001b[0m\n", - "Thought:\u001b[32;1m\u001b[1;3m I need to provide the final answer\n", + "Thought:\u001b[32;1m\u001b[1;3m I have the response\n", "Action:\n", "```\n", "{\n", @@ -248,14 +248,6 @@ "source": [ "output = agent.run(\"Tell me a joke and read it out for me.\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5747b523-b0c5-43aa-acf9-cf4687de1517", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/libs/langchain/langchain/tools/eleven_labs/text2speech.py b/libs/langchain/langchain/tools/eleven_labs/text2speech.py index f0d6cfa65e370..50cf4722d9501 100644 --- a/libs/langchain/langchain/tools/eleven_labs/text2speech.py +++ b/libs/langchain/langchain/tools/eleven_labs/text2speech.py @@ -76,7 +76,8 @@ def stream_speech(self, query: str) -> None: def generate_and_save(self, query: str) -> str: """Save the text as speech to a temporary file.""" - speech = self._text2speech(query) + elevenlabs = _import_elevenlabs() + speech = elevenlabs.generate(text=query, model=self.model) path = save_audio(speech) return path From 18bd6e979262880f7fc72dfcaf976c242dd5f690 Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Wed, 13 Sep 2023 12:02:21 +0200 Subject: [PATCH 10/15] Fix linter error --- libs/langchain/langchain/tools/eleven_labs/text2speech.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/langchain/langchain/tools/eleven_labs/text2speech.py b/libs/langchain/langchain/tools/eleven_labs/text2speech.py index 50cf4722d9501..d15e8558bdfb5 100644 --- a/libs/langchain/langchain/tools/eleven_labs/text2speech.py +++ b/libs/langchain/langchain/tools/eleven_labs/text2speech.py @@ -1,4 +1,3 @@ -import tempfile from enum import Enum from typing import Any, Dict, Optional, Union From 
069f4c2e44b4da3f5b727336ac80468dfa4190f1 Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Wed, 13 Sep 2023 12:54:56 +0200 Subject: [PATCH 11/15] Self-review --- .../azure_cognitive_services/text2speech.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py index 2423e580b38c7..793a45cfcaee5 100644 --- a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py +++ b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py @@ -11,13 +11,16 @@ from langchain.tools.base import BaseTool from langchain.utils import get_from_dict_or_env -try: - import azure.cognitiveservices.speech as speechsdk -except ImportError: - raise ImportError( - "azure.cognitiveservices.speech is not installed. " - "Run `pip install azure-cognitiveservices-speech` to install." - ) + +def _import_azure_speech() -> Any: + try: + import azure.cognitiveservices.speech as speechsdk + except ImportError: + raise ImportError( + "azure.cognitiveservices.speech is not installed. " + "Run `pip install azure-cognitiveservices-speech` to install." 
+ ) + return speechsdk logger = logging.getLogger(__name__) @@ -43,6 +46,7 @@ class AzureCogsText2SpeechTool(BaseTool): @root_validator(pre=True) def validate_environment(cls, values: Dict) -> Dict: """Validate that api key and endpoint exists in environment.""" + speechsdk = _import_azure_speech() azure_cogs_key = get_from_dict_or_env( values, "azure_cogs_key", "AZURE_COGS_KEY" ) @@ -57,6 +61,7 @@ def validate_environment(cls, values: Dict) -> Dict: return values def _text2speech(self, text: str, speech_language: str) -> bytes: + speechsdk = _import_azure_speech() self.speech_config.speech_synthesis_language = speech_language speech_synthesizer = speechsdk.SpeechSynthesizer( speech_config=self.speech_config, audio_config=None From 6ec068955f150fe5061a2848e289b202126f058b Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Wed, 13 Sep 2023 12:55:42 +0200 Subject: [PATCH 12/15] Fix linter --- .../langchain/tools/azure_cognitive_services/text2speech.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py index 793a45cfcaee5..28346a6e26327 100644 --- a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py +++ b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py @@ -22,6 +22,7 @@ def _import_azure_speech() -> Any: ) return speechsdk + logger = logging.getLogger(__name__) From befb02dbe59543baae1910711cefa73db63a6eee Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Thu, 14 Sep 2023 13:05:31 +0200 Subject: [PATCH 13/15] CR comment --- libs/langchain/langchain/tools/audio_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/langchain/tools/audio_utils.py b/libs/langchain/langchain/tools/audio_utils.py index a8c03a94025bf..a008af8ea9661 100644 --- a/libs/langchain/langchain/tools/audio_utils.py +++ b/libs/langchain/langchain/tools/audio_utils.py @@ 
-11,7 +11,7 @@ def save_audio(audio: bytes) -> str: def load_audio(audio_file_path: str) -> bytes: """Load audio from a file into bytes.""" - if Path(audio_file_path).exists(): + if Path(audio_file_path).is_file(): with open(audio_file_path, mode="rb") as f: audio = f.read() return audio From 8bfcb84ebd52955d59df06450e2450e64769c1e9 Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Thu, 14 Sep 2023 14:55:09 +0200 Subject: [PATCH 14/15] CR comments --- .../toolkits/azure_cognitive_services.ipynb | 29 ---------- .../integrations/tools/eleven_labs_tts.ipynb | 56 ++----------------- libs/langchain/langchain/tools/audio_utils.py | 18 ------ .../azure_cognitive_services/text2speech.py | 12 ---- .../tools/eleven_labs/text2speech.py | 13 ----- 5 files changed, 4 insertions(+), 124 deletions(-) delete mode 100644 libs/langchain/langchain/tools/audio_utils.py diff --git a/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb b/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb index 5fef860ba0d84..f9c08093a1c3d 100644 --- a/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb +++ b/docs/extras/integrations/toolkits/azure_cognitive_services.ipynb @@ -241,35 +241,6 @@ "source": [ "audio_file = agent.run(\"Tell me a joke and read it out for me.\")" ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tts = toolkit.get_tools()[-2]\n", - "path = tts.generate_and_save(\"Hello world\")\n", - "tts.load_and_play(path)" - ] } ], "metadata": { diff --git a/docs/extras/integrations/tools/eleven_labs_tts.ipynb b/docs/extras/integrations/tools/eleven_labs_tts.ipynb index 598714d5323ba..5d73f4575297a 100644 --- a/docs/extras/integrations/tools/eleven_labs_tts.ipynb +++ b/docs/extras/integrations/tools/eleven_labs_tts.ipynb @@ -121,54 +121,6 @@ 
"tts.stream_speech(text_to_speak)" ] }, - { - "cell_type": "markdown", - "id": "33b7c8b5-c0a7-4d4f-8256-475aac248322", - "metadata": {}, - "source": [ - "Speech can also be generated and save to temporary file." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "3265a6db-773c-4043-905b-48e40fc1adc4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'/tmp/tmp03j5opkw.wav'" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "path = tts.generate_and_save(text_to_speak)\n", - "path" - ] - }, - { - "cell_type": "markdown", - "id": "6a25d66b-ade4-4f88-ae86-57c67ad066c4", - "metadata": {}, - "source": [ - "And played from the path." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "a95326ba-ce70-4950-a03f-612e3705dfdd", - "metadata": {}, - "outputs": [], - "source": [ - "tts.load_and_play(path)" - ] - }, { "cell_type": "markdown", "id": "a152766d-5f06-48b1-ac89-b4e8d88d3c9f", @@ -179,7 +131,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "id": "37626aea-0cf0-4849-9c00-c0f40515ffe0", "metadata": {}, "outputs": [], @@ -190,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "id": "c168f28e-d5b7-4c93-bed8-0ab317b4a44b", "metadata": {}, "outputs": [], @@ -207,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "id": "336bf95a-3ccb-4963-aac3-638a4df2ed78", "metadata": {}, "outputs": [ @@ -230,7 +182,7 @@ "\n", "\u001b[0m\n", "Observation: \u001b[36;1m\u001b[1;3mSpeech has been generated\u001b[0m\n", - "Thought:\u001b[32;1m\u001b[1;3m I have the response\n", + "Thought:\u001b[32;1m\u001b[1;3m I need to provide the final answer\n", "Action:\n", "```\n", "{\n", diff --git a/libs/langchain/langchain/tools/audio_utils.py b/libs/langchain/langchain/tools/audio_utils.py deleted file mode 100644 index a008af8ea9661..0000000000000 --- 
a/libs/langchain/langchain/tools/audio_utils.py +++ /dev/null @@ -1,18 +0,0 @@ -import tempfile -from pathlib import Path - - -def save_audio(audio: bytes) -> str: - """Save audio to a temporary file and return the path.""" - with tempfile.NamedTemporaryFile(mode="bx", suffix=".wav", delete=False) as f: - f.write(audio) - return f.name - - -def load_audio(audio_file_path: str) -> bytes: - """Load audio from a file into bytes.""" - if Path(audio_file_path).is_file(): - with open(audio_file_path, mode="rb") as f: - audio = f.read() - return audio - raise FileNotFoundError(f"File {audio_file_path} not found.") diff --git a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py index 28346a6e26327..b35e384c17b67 100644 --- a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py +++ b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py @@ -7,7 +7,6 @@ from langchain.callbacks.manager import CallbackManagerForToolRun from langchain.pydantic_v1 import root_validator -from langchain.tools.audio_utils import load_audio, save_audio from langchain.tools.base import BaseTool from langchain.utils import get_from_dict_or_env @@ -102,14 +101,3 @@ def play(self, speech: bytes) -> None: """Play the speech.""" audio = display.Audio(speech) display.display(audio) - - def generate_and_save(self, query: str) -> str: - """Save the text as speech to a temporary file.""" - speech = self._text2speech(query, self.speech_language) - path = save_audio(speech) - return path - - def load_and_play(self, path: str) -> None: - """Load the text as speech from a temporary file and play it.""" - speech = load_audio(path) - self.play(speech) diff --git a/libs/langchain/langchain/tools/eleven_labs/text2speech.py b/libs/langchain/langchain/tools/eleven_labs/text2speech.py index d15e8558bdfb5..1916b350fed98 100644 --- a/libs/langchain/langchain/tools/eleven_labs/text2speech.py +++ 
b/libs/langchain/langchain/tools/eleven_labs/text2speech.py @@ -3,7 +3,6 @@ from langchain.callbacks.manager import CallbackManagerForToolRun from langchain.pydantic_v1 import root_validator -from langchain.tools.audio_utils import load_audio, save_audio from langchain.tools.base import BaseTool from langchain.utils import get_from_dict_or_env @@ -72,15 +71,3 @@ def stream_speech(self, query: str) -> None: elevenlabs = _import_elevenlabs() speech_stream = elevenlabs.generate(text=query, model=self.model, stream=True) elevenlabs.stream(speech_stream) - - def generate_and_save(self, query: str) -> str: - """Save the text as speech to a temporary file.""" - elevenlabs = _import_elevenlabs() - speech = elevenlabs.generate(text=query, model=self.model) - path = save_audio(speech) - return path - - def load_and_play(self, path: str) -> None: - """Load the text as speech from a temporary file and play it.""" - speech = load_audio(path) - self.play(speech) From b52584408c8ec47c9d639730e3a3ae4b6d282b13 Mon Sep 17 00:00:00 2001 From: "mateusz.wosinski" Date: Thu, 14 Sep 2023 18:56:33 +0200 Subject: [PATCH 15/15] Fix import --- .../tools/azure_cognitive_services/text2speech.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py index b35e384c17b67..4e4568d78f003 100644 --- a/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py +++ b/libs/langchain/langchain/tools/azure_cognitive_services/text2speech.py @@ -3,8 +3,6 @@ import logging from typing import Any, Dict, Optional -from IPython import display - from langchain.callbacks.manager import CallbackManagerForToolRun from langchain.pydantic_v1 import root_validator from langchain.tools.base import BaseTool @@ -99,5 +97,11 @@ def _run( def play(self, speech: bytes) -> None: """Play the speech.""" + try: + from IPython import display + except 
ImportError: + raise ImportError( + "IPython is not installed. " "Run `pip install ipython` to install." + ) audio = display.Audio(speech) display.display(audio)