Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor Text2Speech: Keeping speech in memory #11

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
101 changes: 48 additions & 53 deletions docs/extras/integrations/toolkits/azure_cognitive_services.ipynb

Large diffs are not rendered by default.

50 changes: 25 additions & 25 deletions docs/extras/integrations/tools/eleven_labs_tts.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 2,
"id": "2f57a647-9214-4562-a8cf-f263a15d1f40",
"metadata": {},
"outputs": [
Expand All @@ -60,7 +60,7 @@
"'eleven_labs_text2speech'"
]
},
"execution_count": 6,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -79,18 +79,28 @@
"id": "d4613fed-66f0-47c6-be50-7e7670654427",
"metadata": {},
"source": [
"We can generate audio, save it to the temporary file and then play it."
"We can generate audio and play it."
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 3,
"id": "f1984844-aa75-4f83-9d42-1c8052d87cc0",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"'Speech has been generated'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"speech_file = tts.run(text_to_speak)\n",
"tts.play(speech_file)"
"tts.run(text_to_speak)"
]
},
{
Expand All @@ -103,7 +113,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 4,
"id": "d72822f8-3223-47e2-8d2e-6ff46b8c8645",
"metadata": {},
"outputs": [],
Expand All @@ -121,7 +131,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 5,
"id": "37626aea-0cf0-4849-9c00-c0f40515ffe0",
"metadata": {},
"outputs": [],
Expand All @@ -132,7 +142,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 6,
"id": "c168f28e-d5b7-4c93-bed8-0ab317b4a44b",
"metadata": {},
"outputs": [],
Expand All @@ -149,7 +159,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 7,
"id": "336bf95a-3ccb-4963-aac3-638a4df2ed78",
"metadata": {},
"outputs": [
Expand All @@ -171,13 +181,13 @@
"```\n",
"\n",
"\u001b[0m\n",
"Observation: \u001b[36;1m\u001b[1;3m/tmp/tmpsfg783f1.wav\u001b[0m\n",
"Thought:\u001b[32;1m\u001b[1;3m I have the audio file ready to be sent to the human\n",
"Observation: \u001b[36;1m\u001b[1;3mSpeech has been generated\u001b[0m\n",
"Thought:\u001b[32;1m\u001b[1;3m I need to provide the final answer\n",
"Action:\n",
"```\n",
"{\n",
" \"action\": \"Final Answer\",\n",
" \"action_input\": \"/tmp/tmpsfg783f1.wav\"\n",
" \"action_input\": \"Why did the chicken cross the playground? To get to the other slide!\"\n",
"}\n",
"```\n",
"\n",
Expand All @@ -188,17 +198,7 @@
}
],
"source": [
"audio_file = agent.run(\"Tell me a joke and read it out for me.\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "f0aa7aa9-4682-4599-8cae-59347d9e5210",
"metadata": {},
"outputs": [],
"source": [
"tts.play(audio_file)"
"output = agent.run(\"Tell me a joke and read it out for me.\")"
]
}
],
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,25 @@
from __future__ import annotations

import logging
import tempfile
from typing import Any, Dict, Optional

from langchain.callbacks.manager import CallbackManagerForToolRun
from langchain.pydantic_v1 import root_validator
from langchain.tools.base import BaseTool
from langchain.utils import get_from_dict_or_env


def _import_azure_speech() -> Any:
try:
import azure.cognitiveservices.speech as speechsdk
except ImportError:
raise ImportError(
"azure.cognitiveservices.speech is not installed. "
"Run `pip install azure-cognitiveservices-speech` to install."
)
return speechsdk


logger = logging.getLogger(__name__)


Expand All @@ -33,6 +44,7 @@ class AzureCogsText2SpeechTool(BaseTool):
@root_validator(pre=True)
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that api key and endpoint exists in environment."""
speechsdk = _import_azure_speech()
azure_cogs_key = get_from_dict_or_env(
values, "azure_cogs_key", "AZURE_COGS_KEY"
)
Expand All @@ -41,40 +53,21 @@ def validate_environment(cls, values: Dict) -> Dict:
values, "azure_cogs_region", "AZURE_COGS_REGION"
)

try:
import azure.cognitiveservices.speech as speechsdk

values["speech_config"] = speechsdk.SpeechConfig(
subscription=azure_cogs_key, region=azure_cogs_region
)
except ImportError:
raise ImportError(
"azure-cognitiveservices-speech is not installed. "
"Run `pip install azure-cognitiveservices-speech` to install."
)

values["speech_config"] = speechsdk.SpeechConfig(
subscription=azure_cogs_key, region=azure_cogs_region
)
return values

def _text2speech(self, text: str, speech_language: str) -> str:
try:
import azure.cognitiveservices.speech as speechsdk
except ImportError:
pass

def _text2speech(self, text: str, speech_language: str) -> bytes:
speechsdk = _import_azure_speech()
self.speech_config.speech_synthesis_language = speech_language
speech_synthesizer = speechsdk.SpeechSynthesizer(
speech_config=self.speech_config, audio_config=None
)
result = speech_synthesizer.speak_text(text)

if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
stream = speechsdk.AudioDataStream(result)
with tempfile.NamedTemporaryFile(
mode="wb", suffix=".wav", delete=False
) as f:
stream.save_to_wav_file(f.name)

return f.name
return result.audio_data

elif result.reason == speechsdk.ResultReason.Canceled:
cancellation_details = result.cancellation_details
Expand All @@ -84,10 +77,10 @@ def _text2speech(self, text: str, speech_language: str) -> str:
f"Speech synthesis error: {cancellation_details.error_details}"
)

return "Speech synthesis canceled."
raise RuntimeError("Speech synthesis canceled.")

else:
return f"Speech synthesis failed: {result.reason}"
raise RuntimeError(f"Speech synthesis failed: {result.reason}")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aren't you breaking agents with these changes?


def _run(
self,
Expand All @@ -96,7 +89,19 @@ def _run(
) -> str:
"""Use the tool."""
try:
speech_file = self._text2speech(query, self.speech_language)
return speech_file
speech = self._text2speech(query, self.speech_language)
self.play(speech)
return "Speech has been generated"
except Exception as e:
raise RuntimeError(f"Error while running AzureCogsText2SpeechTool: {e}")

def play(self, speech: bytes) -> None:
    """Play the speech using IPython's display utilities.

    Args:
        speech: Audio data passed to ``IPython.display.Audio``.

    Raises:
        ImportError: If IPython cannot be imported. The original import
            failure is attached as ``__cause__``.
    """
    try:
        from IPython import display
    except ImportError as e:
        # Chain explicitly so the root import failure is kept as the cause.
        raise ImportError(
            "IPython is not installed. Run `pip install ipython` to install."
        ) from e
    audio = display.Audio(speech)
    display.display(audio)
13 changes: 3 additions & 10 deletions libs/langchain/langchain/tools/eleven_labs/text2speech.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import tempfile
from enum import Enum
from typing import Any, Dict, Optional, Union

Expand Down Expand Up @@ -56,20 +55,14 @@ def _run(
elevenlabs = _import_elevenlabs()
try:
speech = elevenlabs.generate(text=query, model=self.model)
with tempfile.NamedTemporaryFile(
mode="bx", suffix=".wav", delete=False
) as f:
f.write(speech)
return f.name
self.play(speech)
return "Speech has been generated"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it should just return None, or just the speech as bytes.
After reading the code, I see it was created as a tool, which means it was designed to be used directly within agents etc., so I removed my other comments.

I think the best approach would be to have a single tool that can be given different implementations, similar to PythonREPLTool.
Then you could have utilities that use a more convenient API to work with ElevenLabs/Azure. Just an idea.

except Exception as e:
raise RuntimeError(f"Error while running ElevenLabsText2SpeechTool: {e}")

def play(self, speech_file: str) -> None:
def play(self, speech: bytes) -> None:
"""Play the text as speech."""
elevenlabs = _import_elevenlabs()
with open(speech_file, mode="rb") as f:
speech = f.read()

elevenlabs.play(speech)

def stream_speech(self, query: str) -> None:
Expand Down