feat: add tts-streaming config and future (#5492)

This commit is contained in:
chenxu9741 2024-07-09 11:33:58 +08:00 committed by GitHub
parent b29a36f461
commit 6ef401a9f0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
44 changed files with 1280 additions and 358 deletions

View File

@ -0,0 +1,4 @@
TTS_AUTO_PLAY_TIMEOUT = 5
# sleep 20 ms (40 ms => 1280-byte audio file, 20 ms => 640-byte audio file)
TTS_AUTO_PLAY_YIELD_CPU_TIME = 0.02
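
A minimal sketch of how these two constants are consumed by the streaming task pipelines later in this commit: poll the TTS publisher until a "finish" chunk arrives or the timeout elapses, sleeping between empty polls. `drain_audio` and `publisher` are illustrative names, not part of the commit.

import time

from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME


def drain_audio(publisher):
    # Poll until a "finish" chunk arrives or TTS_AUTO_PLAY_TIMEOUT seconds pass,
    # sleeping TTS_AUTO_PLAY_YIELD_CPU_TIME (20 ms) between empty polls.
    start = time.time()
    while (time.time() - start) < TTS_AUTO_PLAY_TIMEOUT:
        chunk = publisher.checkAndGetAudio()
        if chunk is None:
            time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)  # release the CPU
            continue
        if chunk.status == "finish":
            break
        yield chunk.audio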

View File

@ -81,15 +81,36 @@ class ChatMessageTextApi(Resource):
@account_initialization_required @account_initialization_required
@get_app_model @get_app_model
def post(self, app_model): def post(self, app_model):
from werkzeug.exceptions import InternalServerError
try: try:
parser = reqparse.RequestParser()
parser.add_argument('message_id', type=str, location='json')
parser.add_argument('text', type=str, location='json')
parser.add_argument('voice', type=str, location='json')
parser.add_argument('streaming', type=bool, location='json')
args = parser.parse_args()
message_id = args.get('message_id', None)
text = args.get('text', None)
if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
and app_model.workflow
and app_model.workflow.features_dict):
text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
else:
try:
voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get(
'voice')
except Exception:
voice = None
response = AudioService.transcript_tts( response = AudioService.transcript_tts(
app_model=app_model, app_model=app_model,
text=request.form['text'], text=text,
voice=request.form['voice'], message_id=message_id,
streaming=False voice=voice
) )
return response
return {'data': response.data.decode('latin1')}
except services.errors.app_model_config.AppModelConfigBrokenError: except services.errors.app_model_config.AppModelConfigBrokenError:
logging.exception("App model config broken.") logging.exception("App model config broken.")
raise AppUnavailableError() raise AppUnavailableError()
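
The console handler above now reads a JSON body (message_id / text / voice / streaming) instead of form fields and returns the AudioService response directly. A hedged example of the request a console client might send; the route path and host are assumptions, since they are not shown in this hunk.

import requests

payload = {
    "message_id": None,            # or the id of an existing message to synthesize
    "text": "Hello from Dify",     # raw text to convert when no message_id is given
    "voice": "alloy",              # optional; falls back to the app's configured voice
    "streaming": True,
}
resp = requests.post(
    "<console-host>/console/api/apps/<app_id>/text-to-audio",  # assumed route
    json=payload,
    headers={"Authorization": "Bearer <console-token>"},
    stream=True,
)
for chunk in resp.iter_content(chunk_size=1024):
    ...  # audio bytes as streamed back by AudioService.transcript_tts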

View File

@ -19,6 +19,7 @@ from controllers.console.app.error import (
from controllers.console.explore.wraps import InstalledAppResource from controllers.console.explore.wraps import InstalledAppResource
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
from core.model_runtime.errors.invoke import InvokeError from core.model_runtime.errors.invoke import InvokeError
from models.model import AppMode
from services.audio_service import AudioService from services.audio_service import AudioService
from services.errors.audio import ( from services.errors.audio import (
AudioTooLargeServiceError, AudioTooLargeServiceError,
@ -70,16 +71,33 @@ class ChatAudioApi(InstalledAppResource):
class ChatTextApi(InstalledAppResource): class ChatTextApi(InstalledAppResource):
def post(self, installed_app): def post(self, installed_app):
app_model = installed_app.app from flask_restful import reqparse
app_model = installed_app.app
try: try:
parser = reqparse.RequestParser()
parser.add_argument('message_id', type=str, required=False, location='json')
parser.add_argument('voice', type=str, location='json')
parser.add_argument('streaming', type=bool, location='json')
args = parser.parse_args()
message_id = args.get('message_id')
if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
and app_model.workflow
and app_model.workflow.features_dict):
text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
else:
try:
voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get('voice')
except Exception:
voice = None
response = AudioService.transcript_tts( response = AudioService.transcript_tts(
app_model=app_model, app_model=app_model,
text=request.form['text'], message_id=message_id,
voice=request.form['voice'] if request.form.get('voice') else app_model.app_model_config.text_to_speech_dict.get('voice'), voice=voice
streaming=False
) )
return {'data': response.data.decode('latin1')} return response
except services.errors.app_model_config.AppModelConfigBrokenError: except services.errors.app_model_config.AppModelConfigBrokenError:
logging.exception("App model config broken.") logging.exception("App model config broken.")
raise AppUnavailableError() raise AppUnavailableError()
@ -108,3 +126,5 @@ class ChatTextApi(InstalledAppResource):
api.add_resource(ChatAudioApi, '/installed-apps/<uuid:installed_app_id>/audio-to-text', endpoint='installed_app_audio') api.add_resource(ChatAudioApi, '/installed-apps/<uuid:installed_app_id>/audio-to-text', endpoint='installed_app_audio')
api.add_resource(ChatTextApi, '/installed-apps/<uuid:installed_app_id>/text-to-audio', endpoint='installed_app_text') api.add_resource(ChatTextApi, '/installed-apps/<uuid:installed_app_id>/text-to-audio', endpoint='installed_app_text')
# api.add_resource(ChatTextApiWithMessageId, '/installed-apps/<uuid:installed_app_id>/text-to-audio/message-id',
# endpoint='installed_app_text_with_message_id')

View File

@ -20,7 +20,7 @@ from controllers.service_api.app.error import (
from controllers.service_api.wraps import FetchUserArg, WhereisUserArg, validate_app_token from controllers.service_api.wraps import FetchUserArg, WhereisUserArg, validate_app_token
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
from core.model_runtime.errors.invoke import InvokeError from core.model_runtime.errors.invoke import InvokeError
from models.model import App, EndUser from models.model import App, AppMode, EndUser
from services.audio_service import AudioService from services.audio_service import AudioService
from services.errors.audio import ( from services.errors.audio import (
AudioTooLargeServiceError, AudioTooLargeServiceError,
@ -72,19 +72,30 @@ class AudioApi(Resource):
class TextApi(Resource): class TextApi(Resource):
@validate_app_token(fetch_user_arg=FetchUserArg(fetch_from=WhereisUserArg.JSON)) @validate_app_token(fetch_user_arg=FetchUserArg(fetch_from=WhereisUserArg.JSON))
def post(self, app_model: App, end_user: EndUser): def post(self, app_model: App, end_user: EndUser):
parser = reqparse.RequestParser()
parser.add_argument('text', type=str, required=True, nullable=False, location='json')
parser.add_argument('voice', type=str, location='json')
parser.add_argument('streaming', type=bool, required=False, nullable=False, location='json')
args = parser.parse_args()
try: try:
parser = reqparse.RequestParser()
parser.add_argument('message_id', type=str, required=False, location='json')
parser.add_argument('voice', type=str, location='json')
parser.add_argument('streaming', type=bool, location='json')
args = parser.parse_args()
message_id = args.get('message_id')
if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
and app_model.workflow
and app_model.workflow.features_dict):
text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
else:
try:
voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get(
'voice')
except Exception:
voice = None
response = AudioService.transcript_tts( response = AudioService.transcript_tts(
app_model=app_model, app_model=app_model,
text=args['text'], message_id=message_id,
end_user=end_user, end_user=end_user.external_user_id,
voice=args.get('voice'), voice=voice
streaming=args['streaming']
) )
return response return response
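
The public service API endpoint gets the same treatment: `text` gives way to `message_id`, the voice falls back to the app's TTS settings, and `end_user` is now passed as its external id. A hedged client sketch follows; the `/v1/text-to-audio` path, the `user` field, and the MP3 output are assumptions based on Dify's public API conventions and the OpenAI-family providers changed below.

import requests

resp = requests.post(
    "https://api.dify.ai/v1/text-to-audio",                 # assumed public route
    json={
        "message_id": "<message-uuid>",                     # message to synthesize
        "user": "<end-user-id>",                            # end user identifier (from JSON per the wrapper)
        "streaming": True,
    },
    headers={"Authorization": "Bearer <app-api-key>"},
    stream=True,
)
with open("reply.mp3", "wb") as f:                          # MP3 assumed; see providers below
    for chunk in resp.iter_content(chunk_size=1024):
        f.write(chunk)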

View File

@ -19,7 +19,7 @@ from controllers.web.error import (
from controllers.web.wraps import WebApiResource from controllers.web.wraps import WebApiResource
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
from core.model_runtime.errors.invoke import InvokeError from core.model_runtime.errors.invoke import InvokeError
from models.model import App from models.model import App, AppMode
from services.audio_service import AudioService from services.audio_service import AudioService
from services.errors.audio import ( from services.errors.audio import (
AudioTooLargeServiceError, AudioTooLargeServiceError,
@ -69,16 +69,35 @@ class AudioApi(WebApiResource):
class TextApi(WebApiResource): class TextApi(WebApiResource):
def post(self, app_model: App, end_user): def post(self, app_model: App, end_user):
from flask_restful import reqparse
try: try:
parser = reqparse.RequestParser()
parser.add_argument('message_id', type=str, required=False, location='json')
parser.add_argument('voice', type=str, location='json')
parser.add_argument('streaming', type=bool, location='json')
args = parser.parse_args()
message_id = args.get('message_id')
if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
and app_model.workflow
and app_model.workflow.features_dict):
text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
else:
try:
voice = args.get('voice') if args.get(
'voice') else app_model.app_model_config.text_to_speech_dict.get('voice')
except Exception:
voice = None
response = AudioService.transcript_tts( response = AudioService.transcript_tts(
app_model=app_model, app_model=app_model,
text=request.form['text'], message_id=message_id,
end_user=end_user.external_user_id, end_user=end_user.external_user_id,
voice=request.form['voice'] if request.form.get('voice') else None, voice=voice
streaming=False
) )
return {'data': response.data.decode('latin1')} return response
except services.errors.app_model_config.AppModelConfigBrokenError: except services.errors.app_model_config.AppModelConfigBrokenError:
logging.exception("App model config broken.") logging.exception("App model config broken.")
raise AppUnavailableError() raise AppUnavailableError()

View File

@ -0,0 +1,135 @@
import base64
import concurrent.futures
import logging
import queue
import re
import threading
from core.app.entities.queue_entities import QueueAgentMessageEvent, QueueLLMChunkEvent, QueueTextChunkEvent
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
class AudioTrunk:
def __init__(self, status: str, audio):
self.audio = audio
self.status = status
def _invoiceTTS(text_content: str, model_instance, tenant_id: str, voice: str):
if not text_content or text_content.isspace():
return
return model_instance.invoke_tts(
content_text=text_content.strip(),
user="responding_tts",
tenant_id=tenant_id,
voice=voice
)
def _process_future(future_queue, audio_queue):
while True:
try:
future = future_queue.get()
if future is None:
break
for audio in future.result():
audio_base64 = base64.b64encode(bytes(audio))
audio_queue.put(AudioTrunk("responding", audio=audio_base64))
except Exception as e:
logging.getLogger(__name__).warning(e)
break
audio_queue.put(AudioTrunk("finish", b''))
class AppGeneratorTTSPublisher:
def __init__(self, tenant_id: str, voice: str):
self.logger = logging.getLogger(__name__)
self.tenant_id = tenant_id
self.msg_text = ''
self._audio_queue = queue.Queue()
self._msg_queue = queue.Queue()
self.match = re.compile(r'[。.!?]')
self.model_manager = ModelManager()
self.model_instance = self.model_manager.get_default_model_instance(
tenant_id=self.tenant_id,
model_type=ModelType.TTS
)
self.voices = self.model_instance.get_tts_voices()
values = [voice.get('value') for voice in self.voices]
self.voice = voice
if not voice or voice not in values:
self.voice = self.voices[0].get('value')
self.MAX_SENTENCE = 2
self._last_audio_event = None
self._runtime_thread = threading.Thread(target=self._runtime).start()
self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
def publish(self, message):
try:
self._msg_queue.put(message)
except Exception as e:
self.logger.warning(e)
def _runtime(self):
future_queue = queue.Queue()
threading.Thread(target=_process_future, args=(future_queue, self._audio_queue)).start()
while True:
try:
message = self._msg_queue.get()
if message is None:
if self.msg_text and len(self.msg_text.strip()) > 0:
futures_result = self.executor.submit(_invoiceTTS, self.msg_text,
self.model_instance, self.tenant_id, self.voice)
future_queue.put(futures_result)
break
elif isinstance(message.event, QueueAgentMessageEvent | QueueLLMChunkEvent):
self.msg_text += message.event.chunk.delta.message.content
elif isinstance(message.event, QueueTextChunkEvent):
self.msg_text += message.event.text
self.last_message = message
sentence_arr, text_tmp = self._extract_sentence(self.msg_text)
if len(sentence_arr) >= min(self.MAX_SENTENCE, 7):
self.MAX_SENTENCE += 1
text_content = ''.join(sentence_arr)
futures_result = self.executor.submit(_invoiceTTS, text_content,
self.model_instance,
self.tenant_id,
self.voice)
future_queue.put(futures_result)
if text_tmp:
self.msg_text = text_tmp
else:
self.msg_text = ''
except Exception as e:
self.logger.warning(e)
break
future_queue.put(None)
def checkAndGetAudio(self) -> AudioTrunk | None:
try:
if self._last_audio_event and self._last_audio_event.status == "finish":
if self.executor:
self.executor.shutdown(wait=False)
return self.last_message
audio = self._audio_queue.get_nowait()
if audio and audio.status == "finish":
self.executor.shutdown(wait=False)
self._runtime_thread = None
if audio:
self._last_audio_event = audio
return audio
except queue.Empty:
return None
def _extract_sentence(self, org_text):
tx = self.match.finditer(org_text)
start = 0
result = []
for i in tx:
end = i.regs[0][1]
result.append(org_text[start:end])
start = end
return result, org_text[start:]
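
A hedged sketch of the publish/poll contract this class exposes to the task pipelines below; `queue_manager` and `send_to_client` are illustrative stand-ins, not commit code.

from core.app.apps.advanced_chat.app_generator_tts_publisher import AppGeneratorTTSPublisher

publisher = AppGeneratorTTSPublisher(tenant_id="<tenant-id>", voice="alloy")

for message in queue_manager.listen():
    publisher.publish(message)               # feed LLM / text chunk events to the TTS worker
    audio = publisher.checkAndGetAudio()     # non-blocking poll for finished audio
    if audio and audio.status != "finish":
        send_to_client(audio.audio)          # base64-encoded audio bytes

publisher.publish(None)                      # end of text: remaining sentences are flushed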

View File

@ -4,6 +4,8 @@ import time
from collections.abc import Generator from collections.abc import Generator
from typing import Any, Optional, Union, cast from typing import Any, Optional, Union, cast
from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME
from core.app.apps.advanced_chat.app_generator_tts_publisher import AppGeneratorTTSPublisher, AudioTrunk
from core.app.apps.base_app_queue_manager import AppQueueManager, PublishFrom from core.app.apps.base_app_queue_manager import AppQueueManager, PublishFrom
from core.app.entities.app_invoke_entities import ( from core.app.entities.app_invoke_entities import (
AdvancedChatAppGenerateEntity, AdvancedChatAppGenerateEntity,
@ -33,6 +35,8 @@ from core.app.entities.task_entities import (
ChatbotAppStreamResponse, ChatbotAppStreamResponse,
ChatflowStreamGenerateRoute, ChatflowStreamGenerateRoute,
ErrorStreamResponse, ErrorStreamResponse,
MessageAudioEndStreamResponse,
MessageAudioStreamResponse,
MessageEndStreamResponse, MessageEndStreamResponse,
StreamResponse, StreamResponse,
) )
@ -71,13 +75,13 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
_iteration_nested_relations: dict[str, list[str]] _iteration_nested_relations: dict[str, list[str]]
def __init__( def __init__(
self, application_generate_entity: AdvancedChatAppGenerateEntity, self, application_generate_entity: AdvancedChatAppGenerateEntity,
workflow: Workflow, workflow: Workflow,
queue_manager: AppQueueManager, queue_manager: AppQueueManager,
conversation: Conversation, conversation: Conversation,
message: Message, message: Message,
user: Union[Account, EndUser], user: Union[Account, EndUser],
stream: bool stream: bool
) -> None: ) -> None:
""" """
Initialize AdvancedChatAppGenerateTaskPipeline. Initialize AdvancedChatAppGenerateTaskPipeline.
@ -129,7 +133,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
self._application_generate_entity.query self._application_generate_entity.query
) )
generator = self._process_stream_response( generator = self._wrapper_process_stream_response(
trace_manager=self._application_generate_entity.trace_manager trace_manager=self._application_generate_entity.trace_manager
) )
if self._stream: if self._stream:
@ -138,7 +142,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
return self._to_blocking_response(generator) return self._to_blocking_response(generator)
def _to_blocking_response(self, generator: Generator[StreamResponse, None, None]) \ def _to_blocking_response(self, generator: Generator[StreamResponse, None, None]) \
-> ChatbotAppBlockingResponse: -> ChatbotAppBlockingResponse:
""" """
Process blocking response. Process blocking response.
:return: :return:
@ -169,7 +173,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
raise Exception('Queue listening stopped unexpectedly.') raise Exception('Queue listening stopped unexpectedly.')
def _to_stream_response(self, generator: Generator[StreamResponse, None, None]) \ def _to_stream_response(self, generator: Generator[StreamResponse, None, None]) \
-> Generator[ChatbotAppStreamResponse, None, None]: -> Generator[ChatbotAppStreamResponse, None, None]:
""" """
To stream response. To stream response.
:return: :return:
@ -182,14 +186,68 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
stream_response=stream_response stream_response=stream_response
) )
def _listenAudioMsg(self, publisher, task_id: str):
if not publisher:
return None
audio_msg: AudioTrunk = publisher.checkAndGetAudio()
if audio_msg and audio_msg.status != "finish":
return MessageAudioStreamResponse(audio=audio_msg.audio, task_id=task_id)
return None
def _wrapper_process_stream_response(self, trace_manager: Optional[TraceQueueManager] = None) -> \
Generator[StreamResponse, None, None]:
publisher = None
task_id = self._application_generate_entity.task_id
tenant_id = self._application_generate_entity.app_config.tenant_id
features_dict = self._workflow.features_dict
if features_dict.get('text_to_speech') and features_dict['text_to_speech'].get('enabled') and features_dict[
'text_to_speech'].get('autoPlay') == 'enabled':
publisher = AppGeneratorTTSPublisher(tenant_id, features_dict['text_to_speech'].get('voice'))
for response in self._process_stream_response(publisher=publisher, trace_manager=trace_manager):
while True:
audio_response = self._listenAudioMsg(publisher, task_id=task_id)
if audio_response:
yield audio_response
else:
break
yield response
start_listener_time = time.time()
# timeout
while (time.time() - start_listener_time) < TTS_AUTO_PLAY_TIMEOUT:
try:
if not publisher:
break
audio_trunk = publisher.checkAndGetAudio()
if audio_trunk is None:
# release cpu
# sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)
continue
if audio_trunk.status == "finish":
break
else:
start_listener_time = time.time()
yield MessageAudioStreamResponse(audio=audio_trunk.audio, task_id=task_id)
except Exception as e:
logger.error(e)
break
yield MessageAudioEndStreamResponse(audio='', task_id=task_id)
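
For reference, the feature gate above only constructs a publisher when the workflow's text_to_speech block looks roughly like the following (a hedged sketch; values are illustrative):

features_dict = {
    "text_to_speech": {
        "enabled": True,
        "autoPlay": "enabled",   # compared against the string 'enabled', not a boolean
        "voice": "alloy",        # optional; the publisher falls back to the provider's first voice
    }
}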
def _process_stream_response( def _process_stream_response(
self, trace_manager: Optional[TraceQueueManager] = None self,
publisher: AppGeneratorTTSPublisher,
trace_manager: Optional[TraceQueueManager] = None
) -> Generator[StreamResponse, None, None]: ) -> Generator[StreamResponse, None, None]:
""" """
Process stream response. Process stream response.
:return: :return:
""" """
for message in self._queue_manager.listen(): for message in self._queue_manager.listen():
if publisher:
publisher.publish(message=message)
event = message.event event = message.event
if isinstance(event, QueueErrorEvent): if isinstance(event, QueueErrorEvent):
@ -301,7 +359,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
continue continue
if not self._is_stream_out_support( if not self._is_stream_out_support(
event=event event=event
): ):
continue continue
@ -318,7 +376,8 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
yield self._ping_stream_response() yield self._ping_stream_response()
else: else:
continue continue
if publisher:
publisher.publish(None)
if self._conversation_name_generate_thread: if self._conversation_name_generate_thread:
self._conversation_name_generate_thread.join() self._conversation_name_generate_thread.join()
@ -402,7 +461,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
return stream_generate_routes return stream_generate_routes
def _get_answer_start_at_node_ids(self, graph: dict, target_node_id: str) \ def _get_answer_start_at_node_ids(self, graph: dict, target_node_id: str) \
-> list[str]: -> list[str]:
""" """
Get answer start at node id. Get answer start at node id.
:param graph: graph :param graph: graph
@ -457,7 +516,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
start_node_id = target_node_id start_node_id = target_node_id
start_node_ids.append(start_node_id) start_node_ids.append(start_node_id)
elif node_type == NodeType.START.value or \ elif node_type == NodeType.START.value or \
node_iteration_id is not None and iteration_start_node_id == source_node.get('id'): node_iteration_id is not None and iteration_start_node_id == source_node.get('id'):
start_node_id = source_node_id start_node_id = source_node_id
start_node_ids.append(start_node_id) start_node_ids.append(start_node_id)
else: else:
@ -515,7 +574,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
# all route chunks are generated # all route chunks are generated
if self._task_state.current_stream_generate_state.current_route_position == len( if self._task_state.current_stream_generate_state.current_route_position == len(
self._task_state.current_stream_generate_state.generate_route self._task_state.current_stream_generate_state.generate_route
): ):
self._task_state.current_stream_generate_state = None self._task_state.current_stream_generate_state = None
@ -525,7 +584,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
:return: :return:
""" """
if not self._task_state.current_stream_generate_state: if not self._task_state.current_stream_generate_state:
return None return
route_chunks = self._task_state.current_stream_generate_state.generate_route[ route_chunks = self._task_state.current_stream_generate_state.generate_route[
self._task_state.current_stream_generate_state.current_route_position:] self._task_state.current_stream_generate_state.current_route_position:]
@ -573,7 +632,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
# get route chunk node execution info # get route chunk node execution info
route_chunk_node_execution_info = self._task_state.ran_node_execution_infos[route_chunk_node_id] route_chunk_node_execution_info = self._task_state.ran_node_execution_infos[route_chunk_node_id]
if (route_chunk_node_execution_info.node_type == NodeType.LLM if (route_chunk_node_execution_info.node_type == NodeType.LLM
and latest_node_execution_info.node_type == NodeType.LLM): and latest_node_execution_info.node_type == NodeType.LLM):
# only LLM support chunk stream output # only LLM support chunk stream output
self._task_state.current_stream_generate_state.current_route_position += 1 self._task_state.current_stream_generate_state.current_route_position += 1
continue continue
@ -643,7 +702,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
# all route chunks are generated # all route chunks are generated
if self._task_state.current_stream_generate_state.current_route_position == len( if self._task_state.current_stream_generate_state.current_route_position == len(
self._task_state.current_stream_generate_state.generate_route self._task_state.current_stream_generate_state.generate_route
): ):
self._task_state.current_stream_generate_state = None self._task_state.current_stream_generate_state = None

View File

@ -51,7 +51,6 @@ class AppQueueManager:
listen_timeout = current_app.config.get("APP_MAX_EXECUTION_TIME") listen_timeout = current_app.config.get("APP_MAX_EXECUTION_TIME")
start_time = time.time() start_time = time.time()
last_ping_time = 0 last_ping_time = 0
while True: while True:
try: try:
message = self._q.get(timeout=1) message = self._q.get(timeout=1)

View File

@ -1,7 +1,10 @@
import logging import logging
import time
from collections.abc import Generator from collections.abc import Generator
from typing import Any, Optional, Union from typing import Any, Optional, Union
from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME
from core.app.apps.advanced_chat.app_generator_tts_publisher import AppGeneratorTTSPublisher, AudioTrunk
from core.app.apps.base_app_queue_manager import AppQueueManager from core.app.apps.base_app_queue_manager import AppQueueManager
from core.app.entities.app_invoke_entities import ( from core.app.entities.app_invoke_entities import (
InvokeFrom, InvokeFrom,
@ -25,6 +28,8 @@ from core.app.entities.queue_entities import (
) )
from core.app.entities.task_entities import ( from core.app.entities.task_entities import (
ErrorStreamResponse, ErrorStreamResponse,
MessageAudioEndStreamResponse,
MessageAudioStreamResponse,
StreamResponse, StreamResponse,
TextChunkStreamResponse, TextChunkStreamResponse,
TextReplaceStreamResponse, TextReplaceStreamResponse,
@ -105,7 +110,7 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
db.session.refresh(self._user) db.session.refresh(self._user)
db.session.close() db.session.close()
generator = self._process_stream_response( generator = self._wrapper_process_stream_response(
trace_manager=self._application_generate_entity.trace_manager trace_manager=self._application_generate_entity.trace_manager
) )
if self._stream: if self._stream:
@ -161,8 +166,58 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
stream_response=stream_response stream_response=stream_response
) )
def _listenAudioMsg(self, publisher, task_id: str):
if not publisher:
return None
audio_msg: AudioTrunk = publisher.checkAndGetAudio()
if audio_msg and audio_msg.status != "finish":
return MessageAudioStreamResponse(audio=audio_msg.audio, task_id=task_id)
return None
def _wrapper_process_stream_response(self, trace_manager: Optional[TraceQueueManager] = None) -> \
Generator[StreamResponse, None, None]:
publisher = None
task_id = self._application_generate_entity.task_id
tenant_id = self._application_generate_entity.app_config.tenant_id
features_dict = self._workflow.features_dict
if features_dict.get('text_to_speech') and features_dict['text_to_speech'].get('enabled') and features_dict[
'text_to_speech'].get('autoPlay') == 'enabled':
publisher = AppGeneratorTTSPublisher(tenant_id, features_dict['text_to_speech'].get('voice'))
for response in self._process_stream_response(publisher=publisher, trace_manager=trace_manager):
while True:
audio_response = self._listenAudioMsg(publisher, task_id=task_id)
if audio_response:
yield audio_response
else:
break
yield response
start_listener_time = time.time()
while (time.time() - start_listener_time) < TTS_AUTO_PLAY_TIMEOUT:
try:
if not publisher:
break
audio_trunk = publisher.checkAndGetAudio()
if audio_trunk is None:
# release cpu
# sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)
continue
if audio_trunk.status == "finish":
break
else:
yield MessageAudioStreamResponse(audio=audio_trunk.audio, task_id=task_id)
except Exception as e:
logger.error(e)
break
yield MessageAudioEndStreamResponse(audio='', task_id=task_id)
def _process_stream_response( def _process_stream_response(
self, self,
publisher: AppGeneratorTTSPublisher,
trace_manager: Optional[TraceQueueManager] = None trace_manager: Optional[TraceQueueManager] = None
) -> Generator[StreamResponse, None, None]: ) -> Generator[StreamResponse, None, None]:
""" """
@ -170,6 +225,8 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
:return: :return:
""" """
for message in self._queue_manager.listen(): for message in self._queue_manager.listen():
if publisher:
publisher.publish(message=message)
event = message.event event = message.event
if isinstance(event, QueueErrorEvent): if isinstance(event, QueueErrorEvent):
@ -251,6 +308,10 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
else: else:
continue continue
if publisher:
publisher.publish(None)
def _save_workflow_app_log(self, workflow_run: WorkflowRun) -> None: def _save_workflow_app_log(self, workflow_run: WorkflowRun) -> None:
""" """
Save workflow app log. Save workflow app log.

View File

@ -69,6 +69,7 @@ class WorkflowTaskState(TaskState):
iteration_nested_node_ids: list[str] = None iteration_nested_node_ids: list[str] = None
class AdvancedChatTaskState(WorkflowTaskState): class AdvancedChatTaskState(WorkflowTaskState):
""" """
AdvancedChatTaskState entity AdvancedChatTaskState entity
@ -86,6 +87,8 @@ class StreamEvent(Enum):
ERROR = "error" ERROR = "error"
MESSAGE = "message" MESSAGE = "message"
MESSAGE_END = "message_end" MESSAGE_END = "message_end"
TTS_MESSAGE = "tts_message"
TTS_MESSAGE_END = "tts_message_end"
MESSAGE_FILE = "message_file" MESSAGE_FILE = "message_file"
MESSAGE_REPLACE = "message_replace" MESSAGE_REPLACE = "message_replace"
AGENT_THOUGHT = "agent_thought" AGENT_THOUGHT = "agent_thought"
@ -130,6 +133,22 @@ class MessageStreamResponse(StreamResponse):
answer: str answer: str
class MessageAudioStreamResponse(StreamResponse):
"""
MessageAudioStreamResponse entity
"""
event: StreamEvent = StreamEvent.TTS_MESSAGE
audio: str
class MessageAudioEndStreamResponse(StreamResponse):
"""
MessageAudioEndStreamResponse entity
"""
event: StreamEvent = StreamEvent.TTS_MESSAGE_END
audio: str
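
A hedged sketch of how a client could reassemble the audio from these two new events: each tts_message carries a base64-encoded audio chunk (base64 per the publisher added earlier) and tts_message_end closes the stream. The SSE framing and the MP3 container are assumptions, not shown in this commit.

import base64
import json

audio = bytearray()
for line in sse_lines:                               # hypothetical iterator over "data: {...}" lines
    event = json.loads(line.removeprefix("data: "))
    if event.get("event") == "tts_message":
        audio.extend(base64.b64decode(event["audio"]))
    elif event.get("event") == "tts_message_end":
        break

with open("reply.mp3", "wb") as f:
    f.write(bytes(audio))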
class MessageEndStreamResponse(StreamResponse): class MessageEndStreamResponse(StreamResponse):
""" """
MessageEndStreamResponse entity MessageEndStreamResponse entity
@ -186,6 +205,7 @@ class WorkflowStartStreamResponse(StreamResponse):
""" """
WorkflowStartStreamResponse entity WorkflowStartStreamResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -205,6 +225,7 @@ class WorkflowFinishStreamResponse(StreamResponse):
""" """
WorkflowFinishStreamResponse entity WorkflowFinishStreamResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -232,6 +253,7 @@ class NodeStartStreamResponse(StreamResponse):
""" """
NodeStartStreamResponse entity NodeStartStreamResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -273,6 +295,7 @@ class NodeFinishStreamResponse(StreamResponse):
""" """
NodeFinishStreamResponse entity NodeFinishStreamResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -323,10 +346,12 @@ class NodeFinishStreamResponse(StreamResponse):
} }
} }
class IterationNodeStartStreamResponse(StreamResponse): class IterationNodeStartStreamResponse(StreamResponse):
""" """
NodeStartStreamResponse entity NodeStartStreamResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -344,10 +369,12 @@ class IterationNodeStartStreamResponse(StreamResponse):
workflow_run_id: str workflow_run_id: str
data: Data data: Data
class IterationNodeNextStreamResponse(StreamResponse): class IterationNodeNextStreamResponse(StreamResponse):
""" """
NodeStartStreamResponse entity NodeStartStreamResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -365,10 +392,12 @@ class IterationNodeNextStreamResponse(StreamResponse):
workflow_run_id: str workflow_run_id: str
data: Data data: Data
class IterationNodeCompletedStreamResponse(StreamResponse): class IterationNodeCompletedStreamResponse(StreamResponse):
""" """
NodeCompletedStreamResponse entity NodeCompletedStreamResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -393,10 +422,12 @@ class IterationNodeCompletedStreamResponse(StreamResponse):
workflow_run_id: str workflow_run_id: str
data: Data data: Data
class TextChunkStreamResponse(StreamResponse): class TextChunkStreamResponse(StreamResponse):
""" """
TextChunkStreamResponse entity TextChunkStreamResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -411,6 +442,7 @@ class TextReplaceStreamResponse(StreamResponse):
""" """
TextReplaceStreamResponse entity TextReplaceStreamResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -473,6 +505,7 @@ class ChatbotAppBlockingResponse(AppBlockingResponse):
""" """
ChatbotAppBlockingResponse entity ChatbotAppBlockingResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -492,6 +525,7 @@ class CompletionAppBlockingResponse(AppBlockingResponse):
""" """
CompletionAppBlockingResponse entity CompletionAppBlockingResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -510,6 +544,7 @@ class WorkflowAppBlockingResponse(AppBlockingResponse):
""" """
WorkflowAppBlockingResponse entity WorkflowAppBlockingResponse entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity
@ -528,10 +563,12 @@ class WorkflowAppBlockingResponse(AppBlockingResponse):
workflow_run_id: str workflow_run_id: str
data: Data data: Data
class WorkflowIterationState(BaseModel): class WorkflowIterationState(BaseModel):
""" """
WorkflowIterationState entity WorkflowIterationState entity
""" """
class Data(BaseModel): class Data(BaseModel):
""" """
Data entity Data entity

View File

@ -4,6 +4,8 @@ import time
from collections.abc import Generator from collections.abc import Generator
from typing import Optional, Union, cast from typing import Optional, Union, cast
from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME
from core.app.apps.advanced_chat.app_generator_tts_publisher import AppGeneratorTTSPublisher, AudioTrunk
from core.app.apps.base_app_queue_manager import AppQueueManager, PublishFrom from core.app.apps.base_app_queue_manager import AppQueueManager, PublishFrom
from core.app.entities.app_invoke_entities import ( from core.app.entities.app_invoke_entities import (
AgentChatAppGenerateEntity, AgentChatAppGenerateEntity,
@ -32,6 +34,8 @@ from core.app.entities.task_entities import (
CompletionAppStreamResponse, CompletionAppStreamResponse,
EasyUITaskState, EasyUITaskState,
ErrorStreamResponse, ErrorStreamResponse,
MessageAudioEndStreamResponse,
MessageAudioStreamResponse,
MessageEndStreamResponse, MessageEndStreamResponse,
StreamResponse, StreamResponse,
) )
@ -87,6 +91,7 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
""" """
super().__init__(application_generate_entity, queue_manager, user, stream) super().__init__(application_generate_entity, queue_manager, user, stream)
self._model_config = application_generate_entity.model_conf self._model_config = application_generate_entity.model_conf
self._app_config = application_generate_entity.app_config
self._conversation = conversation self._conversation = conversation
self._message = message self._message = message
@ -102,7 +107,7 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
self._conversation_name_generate_thread = None self._conversation_name_generate_thread = None
def process( def process(
self, self,
) -> Union[ ) -> Union[
ChatbotAppBlockingResponse, ChatbotAppBlockingResponse,
CompletionAppBlockingResponse, CompletionAppBlockingResponse,
@ -123,7 +128,7 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
self._application_generate_entity.query self._application_generate_entity.query
) )
generator = self._process_stream_response( generator = self._wrapper_process_stream_response(
trace_manager=self._application_generate_entity.trace_manager trace_manager=self._application_generate_entity.trace_manager
) )
if self._stream: if self._stream:
@ -202,14 +207,64 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
stream_response=stream_response stream_response=stream_response
) )
def _listenAudioMsg(self, publisher, task_id: str):
if publisher is None:
return None
audio_msg: AudioTrunk = publisher.checkAndGetAudio()
if audio_msg and audio_msg.status != "finish":
# audio_str = audio_msg.audio.decode('utf-8', errors='ignore')
return MessageAudioStreamResponse(audio=audio_msg.audio, task_id=task_id)
return None
def _wrapper_process_stream_response(self, trace_manager: Optional[TraceQueueManager] = None) -> \
Generator[StreamResponse, None, None]:
tenant_id = self._application_generate_entity.app_config.tenant_id
task_id = self._application_generate_entity.task_id
publisher = None
text_to_speech_dict = self._app_config.app_model_config_dict.get('text_to_speech')
if text_to_speech_dict and text_to_speech_dict.get('autoPlay') == 'enabled' and text_to_speech_dict.get('enabled'):
publisher = AppGeneratorTTSPublisher(tenant_id, text_to_speech_dict.get('voice', None))
for response in self._process_stream_response(publisher=publisher, trace_manager=trace_manager):
while True:
audio_response = self._listenAudioMsg(publisher, task_id)
if audio_response:
yield audio_response
else:
break
yield response
start_listener_time = time.time()
# timeout
while (time.time() - start_listener_time) < TTS_AUTO_PLAY_TIMEOUT:
if publisher is None:
break
audio = publisher.checkAndGetAudio()
if audio is None:
# release cpu
# sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)
continue
if audio.status == "finish":
break
else:
start_listener_time = time.time()
yield MessageAudioStreamResponse(audio=audio.audio,
task_id=task_id)
yield MessageAudioEndStreamResponse(audio='', task_id=task_id)
def _process_stream_response( def _process_stream_response(
self, trace_manager: Optional[TraceQueueManager] = None self,
publisher: AppGeneratorTTSPublisher,
trace_manager: Optional[TraceQueueManager] = None
) -> Generator[StreamResponse, None, None]: ) -> Generator[StreamResponse, None, None]:
""" """
Process stream response. Process stream response.
:return: :return:
""" """
for message in self._queue_manager.listen(): for message in self._queue_manager.listen():
if publisher:
publisher.publish(message)
event = message.event event = message.event
if isinstance(event, QueueErrorEvent): if isinstance(event, QueueErrorEvent):
@ -272,12 +327,13 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
yield self._ping_stream_response() yield self._ping_stream_response()
else: else:
continue continue
if publisher:
publisher.publish(None)
if self._conversation_name_generate_thread: if self._conversation_name_generate_thread:
self._conversation_name_generate_thread.join() self._conversation_name_generate_thread.join()
def _save_message( def _save_message(
self, trace_manager: Optional[TraceQueueManager] = None self, trace_manager: Optional[TraceQueueManager] = None
) -> None: ) -> None:
""" """
Save message. Save message.

View File

@ -264,7 +264,7 @@ class ModelInstance:
user=user user=user
) )
def invoke_tts(self, content_text: str, tenant_id: str, voice: str, streaming: bool, user: Optional[str] = None) \ def invoke_tts(self, content_text: str, tenant_id: str, voice: str, user: Optional[str] = None) \
-> str: -> str:
""" """
Invoke large language tts model Invoke large language tts model
@ -287,8 +287,7 @@ class ModelInstance:
content_text=content_text, content_text=content_text,
user=user, user=user,
tenant_id=tenant_id, tenant_id=tenant_id,
voice=voice, voice=voice
streaming=streaming
) )
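
With `streaming` removed from the signature, a call site now looks roughly like this (a hedged sketch; the default-TTS-model lookup mirrors AppGeneratorTTSPublisher, and the result is iterated because the provider implementations below yield audio bytes):

from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType

model_instance = ModelManager().get_default_model_instance(
    tenant_id="<tenant-id>", model_type=ModelType.TTS
)
for chunk in model_instance.invoke_tts(
    content_text="Hello!",
    tenant_id="<tenant-id>",
    voice="alloy",
    user="<end-user-id>",
):
    ...  # raw audio bytes, yielded as the provider streams them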
def _round_robin_invoke(self, function: Callable, *args, **kwargs): def _round_robin_invoke(self, function: Callable, *args, **kwargs):

View File

@ -1,4 +1,6 @@
import hashlib import hashlib
import logging
import re
import subprocess import subprocess
import uuid import uuid
from abc import abstractmethod from abc import abstractmethod
@ -10,7 +12,7 @@ from core.model_runtime.entities.model_entities import ModelPropertyKey, ModelTy
from core.model_runtime.errors.invoke import InvokeBadRequestError from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.model_providers.__base.ai_model import AIModel from core.model_runtime.model_providers.__base.ai_model import AIModel
logger = logging.getLogger(__name__)
class TTSModel(AIModel): class TTSModel(AIModel):
""" """
Model class for ttstext model. Model class for ttstext model.
@ -20,7 +22,7 @@ class TTSModel(AIModel):
# pydantic configs # pydantic configs
model_config = ConfigDict(protected_namespaces=()) model_config = ConfigDict(protected_namespaces=())
def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool, def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
user: Optional[str] = None): user: Optional[str] = None):
""" """
Invoke large language model Invoke large language model
@ -35,14 +37,15 @@ class TTSModel(AIModel):
:return: translated audio file :return: translated audio file
""" """
try: try:
logger.info(f"Invoke TTS model: {model} , invoke content : {content_text}")
self._is_ffmpeg_installed() self._is_ffmpeg_installed()
return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, return self._invoke(model=model, credentials=credentials, user=user,
content_text=content_text, voice=voice, tenant_id=tenant_id) content_text=content_text, voice=voice, tenant_id=tenant_id)
except Exception as e: except Exception as e:
raise self._transform_invoke_error(e) raise self._transform_invoke_error(e)
@abstractmethod @abstractmethod
def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool, def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
user: Optional[str] = None): user: Optional[str] = None):
""" """
Invoke large language model Invoke large language model
@ -123,26 +126,26 @@ class TTSModel(AIModel):
return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS] return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
@staticmethod @staticmethod
Removed:
    def _split_text_into_sentences(text: str, limit: int, delimiters=None):
        if delimiters is None:
            delimiters = set('。!?;\n')

        buf = []
        word_count = 0
        for char in text:
            buf.append(char)
            if char in delimiters:
                if word_count >= limit:
                    yield ''.join(buf)
                    buf = []
                    word_count = 0
                else:
                    word_count += 1
            else:
                word_count += 1

        if buf:
            yield ''.join(buf)

Added:
    def _split_text_into_sentences(org_text, max_length=2000, pattern=r'[。.!?]'):
        match = re.compile(pattern)
        tx = match.finditer(org_text)
        start = 0
        result = []
        one_sentence = ''
        for i in tx:
            end = i.regs[0][1]
            tmp = org_text[start:end]
            if len(one_sentence + tmp) > max_length:
                result.append(one_sentence)
                one_sentence = ''
            one_sentence += tmp
            start = end
        last_sens = org_text[start:]
        if last_sens:
            one_sentence += last_sens
        if one_sentence != '':
            result.append(one_sentence)
        return result
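
A quick, hedged illustration of the new splitter's behavior (import path as in this file; the output below was traced by hand):

from core.model_runtime.model_providers.__base.tts_model import TTSModel

chunks = TTSModel._split_text_into_sentences(
    "第一句。第二句!Third sentence? Trailing text", max_length=10
)
# -> ['第一句。第二句!', 'Third sentence? Trailing text']
# Text is accumulated on 。.!? boundaries and a new chunk starts once adding the
# next sentence would exceed max_length; a single over-long sentence is still
# emitted whole rather than truncated.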
@staticmethod @staticmethod
def _is_ffmpeg_installed(): def _is_ffmpeg_installed():

View File

@ -4,7 +4,7 @@ from functools import reduce
from io import BytesIO from io import BytesIO
from typing import Optional from typing import Optional
from flask import Response, stream_with_context from flask import Response
from openai import AzureOpenAI from openai import AzureOpenAI
from pydub import AudioSegment from pydub import AudioSegment
@ -14,7 +14,6 @@ from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.tts_model import TTSModel from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.azure_openai._common import _CommonAzureOpenAI from core.model_runtime.model_providers.azure_openai._common import _CommonAzureOpenAI
from core.model_runtime.model_providers.azure_openai._constant import TTS_BASE_MODELS, AzureBaseModel from core.model_runtime.model_providers.azure_openai._constant import TTS_BASE_MODELS, AzureBaseModel
from extensions.ext_storage import storage
class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel): class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
@ -23,7 +22,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
""" """
def _invoke(self, model: str, tenant_id: str, credentials: dict, def _invoke(self, model: str, tenant_id: str, credentials: dict,
content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any: content_text: str, voice: str, user: Optional[str] = None) -> any:
""" """
_invoke text2speech model _invoke text2speech model
@ -32,30 +31,23 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
:param credentials: model credentials :param credentials: model credentials
:param content_text: text content to be translated :param content_text: text content to be translated
:param voice: model timbre :param voice: model timbre
:param streaming: output is streaming
:param user: unique user id :param user: unique user id
:return: text translated to audio file :return: text translated to audio file
""" """
audio_type = self._get_model_audio_type(model, credentials)
if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]: if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
voice = self._get_model_default_voice(model, credentials) voice = self._get_model_default_voice(model, credentials)
if streaming:
return Response(stream_with_context(self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
tenant_id=tenant_id,
voice=voice)),
status=200, mimetype=f'audio/{audio_type}')
else:
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None: return self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
voice=voice)
def validate_credentials(self, model: str, credentials: dict) -> None:
""" """
validate credentials text2speech model validate credentials text2speech model
:param model: model name :param model: model name
:param credentials: model credentials :param credentials: model credentials
:param user: unique user id
:return: text translated to audio file :return: text translated to audio file
""" """
try: try:
@ -82,7 +74,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
word_limit = self._get_model_word_limit(model, credentials) word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials) max_workers = self._get_model_workers_limit(model, credentials)
try: try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = [] audio_bytes_list = []
# Create a thread pool and map the function to the list of sentences # Create a thread pool and map the function to the list of sentences
@ -107,34 +99,37 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
except Exception as ex: except Exception as ex:
raise InvokeBadRequestError(str(ex)) raise InvokeBadRequestError(str(ex))
# Todo: To improve the streaming function def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
voice: str) -> any: voice: str) -> any:
""" """
_tts_invoke_streaming text2speech model _tts_invoke_streaming text2speech model
:param model: model name :param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials :param credentials: model credentials
:param content_text: text content to be translated :param content_text: text content to be translated
:param voice: model timbre :param voice: model timbre
:return: text translated to audio file :return: text translated to audio file
""" """
# transform credentials to kwargs for model instance
credentials_kwargs = self._to_credential_kwargs(credentials)
if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
voice = self._get_model_default_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
audio_type = self._get_model_audio_type(model, credentials)
tts_file_id = self._get_file_name(content_text)
file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
try: try:
# doc: https://platform.openai.com/docs/guides/text-to-speech
credentials_kwargs = self._to_credential_kwargs(credentials)
client = AzureOpenAI(**credentials_kwargs) client = AzureOpenAI(**credentials_kwargs)
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) # the TTS input limit is 4096 characters, so cap each request at 3500
for sentence in sentences: max_length = 3500
response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip()) if len(content_text) > max_length:
# response.stream_to_file(file_path) sentences = self._split_text_into_sentences(content_text, max_length=max_length)
storage.save(file_path, response.read()) executor = concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sentences)))
futures = [executor.submit(client.audio.speech.with_streaming_response.create, model=model,
response_format="mp3",
input=sentences[i], voice=voice) for i in range(len(sentences))]
for index, future in enumerate(futures):
yield from future.result().__enter__().iter_bytes(1024)
else:
response = client.audio.speech.with_streaming_response.create(model=model, voice=voice,
response_format="mp3",
input=content_text.strip())
yield from response.__enter__().iter_bytes(1024)
except Exception as ex: except Exception as ex:
raise InvokeBadRequestError(str(ex)) raise InvokeBadRequestError(str(ex))
@ -162,7 +157,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
@staticmethod @staticmethod
def _get_ai_model_entity(base_model_name: str, model: str) -> AzureBaseModel: def _get_ai_model_entity(base_model_name: str, model: str) -> AzureBaseModel | None:
for ai_model_entity in TTS_BASE_MODELS: for ai_model_entity in TTS_BASE_MODELS:
if ai_model_entity.base_model_name == base_model_name: if ai_model_entity.base_model_name == base_model_name:
ai_model_entity_copy = copy.deepcopy(ai_model_entity) ai_model_entity_copy = copy.deepcopy(ai_model_entity)
@ -170,5 +165,4 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
ai_model_entity_copy.entity.label.en_US = model ai_model_entity_copy.entity.label.en_US = model
ai_model_entity_copy.entity.label.zh_Hans = model ai_model_entity_copy.entity.label.zh_Hans = model
return ai_model_entity_copy return ai_model_entity_copy
return None return None

View File

@ -21,7 +21,7 @@ model_properties:
- mode: 'shimmer' - mode: 'shimmer'
name: 'Shimmer' name: 'Shimmer'
language: [ 'zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID' ] language: [ 'zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID' ]
word_limit: 120 word_limit: 3500
audio_type: 'mp3' audio_type: 'mp3'
max_workers: 5 max_workers: 5
pricing: pricing:

View File

@ -21,7 +21,7 @@ model_properties:
- mode: 'shimmer' - mode: 'shimmer'
name: 'Shimmer' name: 'Shimmer'
language: ['zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID'] language: ['zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID']
word_limit: 120 word_limit: 3500
audio_type: 'mp3' audio_type: 'mp3'
max_workers: 5 max_workers: 5
pricing: pricing:

View File

@ -3,7 +3,7 @@ from functools import reduce
from io import BytesIO from io import BytesIO
from typing import Optional from typing import Optional
from flask import Response, stream_with_context from flask import Response
from openai import OpenAI from openai import OpenAI
from pydub import AudioSegment from pydub import AudioSegment
@ -11,7 +11,6 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.tts_model import TTSModel from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.openai._common import _CommonOpenAI from core.model_runtime.model_providers.openai._common import _CommonOpenAI
from extensions.ext_storage import storage
class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel): class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
@ -20,7 +19,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
""" """
def _invoke(self, model: str, tenant_id: str, credentials: dict, def _invoke(self, model: str, tenant_id: str, credentials: dict,
content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any: content_text: str, voice: str, user: Optional[str] = None) -> any:
""" """
_invoke text2speech model _invoke text2speech model
@ -29,22 +28,17 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
:param credentials: model credentials :param credentials: model credentials
:param content_text: text content to be translated :param content_text: text content to be translated
:param voice: model timbre :param voice: model timbre
:param streaming: output is streaming
:param user: unique user id :param user: unique user id
:return: text translated to audio file :return: text translated to audio file
""" """
audio_type = self._get_model_audio_type(model, credentials)
if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]: if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
voice = self._get_model_default_voice(model, credentials) voice = self._get_model_default_voice(model, credentials)
if streaming: # if streaming:
return Response(stream_with_context(self._tts_invoke_streaming(model=model, return self._tts_invoke_streaming(model=model,
credentials=credentials, credentials=credentials,
content_text=content_text, content_text=content_text,
tenant_id=tenant_id, voice=voice)
voice=voice)),
status=200, mimetype=f'audio/{audio_type}')
else:
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None: def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
""" """
@ -79,7 +73,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
word_limit = self._get_model_word_limit(model, credentials) word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials) max_workers = self._get_model_workers_limit(model, credentials)
try: try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = [] audio_bytes_list = []
# Create a thread pool and map the function to the list of sentences # Create a thread pool and map the function to the list of sentences
@ -104,34 +98,40 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
except Exception as ex: except Exception as ex:
raise InvokeBadRequestError(str(ex)) raise InvokeBadRequestError(str(ex))
# Todo: To improve the streaming function
def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str, def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any: voice: str) -> any:
""" """
_tts_invoke_streaming text2speech model _tts_invoke_streaming text2speech model
:param model: model name :param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials :param credentials: model credentials
:param content_text: text content to be translated :param content_text: text content to be translated
:param voice: model timbre :param voice: model timbre
:return: text translated to audio file :return: text translated to audio file
""" """
# transform credentials to kwargs for model instance
credentials_kwargs = self._to_credential_kwargs(credentials)
if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
voice = self._get_model_default_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
audio_type = self._get_model_audio_type(model, credentials)
tts_file_id = self._get_file_name(content_text)
file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
try: try:
# doc: https://platform.openai.com/docs/guides/text-to-speech
credentials_kwargs = self._to_credential_kwargs(credentials)
client = OpenAI(**credentials_kwargs) client = OpenAI(**credentials_kwargs)
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
for sentence in sentences: voice = self._get_model_default_voice(model, credentials)
response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip()) word_limit = self._get_model_word_limit(model, credentials)
# response.stream_to_file(file_path) if len(content_text) > word_limit:
storage.save(file_path, response.read()) sentences = self._split_text_into_sentences(content_text, max_length=word_limit)
executor = concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sentences)))
futures = [executor.submit(client.audio.speech.with_streaming_response.create, model=model,
response_format="mp3",
input=sentences[i], voice=voice) for i in range(len(sentences))]
for index, future in enumerate(futures):
yield from future.result().__enter__().iter_bytes(1024)
else:
response = client.audio.speech.with_streaming_response.create(model=model, voice=voice,
response_format="mp3",
input=content_text.strip())
yield from response.__enter__().iter_bytes(1024)
except Exception as ex: except Exception as ex:
raise InvokeBadRequestError(str(ex)) raise InvokeBadRequestError(str(ex))

View File

@ -129,7 +129,7 @@ model_properties:
- mode: "sambert-waan-v1" - mode: "sambert-waan-v1"
name: "Waan泰语女声" name: "Waan泰语女声"
language: [ "th-TH" ] language: [ "th-TH" ]
word_limit: 120 word_limit: 7000
audio_type: 'mp3' audio_type: 'mp3'
max_workers: 5 max_workers: 5
pricing: pricing:

View File

@ -1,17 +1,21 @@
import concurrent.futures import concurrent.futures
import threading
from functools import reduce from functools import reduce
from io import BytesIO from io import BytesIO
from queue import Queue
from typing import Optional from typing import Optional
import dashscope import dashscope
from flask import Response, stream_with_context from dashscope import SpeechSynthesizer
from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
from flask import Response
from pydub import AudioSegment from pydub import AudioSegment
from core.model_runtime.errors.invoke import InvokeBadRequestError from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.tts_model import TTSModel from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.tongyi._common import _CommonTongyi from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
from extensions.ext_storage import storage
class TongyiText2SpeechModel(_CommonTongyi, TTSModel): class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
@ -19,7 +23,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
Model class for Tongyi Speech to text model. Model class for Tongyi Speech to text model.
""" """
def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool, def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
user: Optional[str] = None) -> any: user: Optional[str] = None) -> any:
""" """
_invoke text2speech model _invoke text2speech model
@ -29,22 +33,17 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
:param credentials: model credentials :param credentials: model credentials
:param voice: model timbre :param voice: model timbre
:param content_text: text content to be translated :param content_text: text content to be translated
:param streaming: output is streaming
:param user: unique user id :param user: unique user id
:return: text translated to audio file :return: text translated to audio file
""" """
audio_type = self._get_model_audio_type(model, credentials) if not voice or voice not in [d['value'] for d in
if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]: self.get_tts_model_voices(model=model, credentials=credentials)]:
voice = self._get_model_default_voice(model, credentials) voice = self._get_model_default_voice(model, credentials)
if streaming:
return Response(stream_with_context(self._tts_invoke_streaming(model=model, return self._tts_invoke_streaming(model=model,
credentials=credentials, credentials=credentials,
content_text=content_text, content_text=content_text,
voice=voice, voice=voice)
tenant_id=tenant_id)),
status=200, mimetype=f'audio/{audio_type}')
else:
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None: def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
""" """
@ -79,7 +78,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
word_limit = self._get_model_word_limit(model, credentials) word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials) max_workers = self._get_model_workers_limit(model, credentials)
try: try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = [] audio_bytes_list = []
# Create a thread pool and map the function to the list of sentences # Create a thread pool and map the function to the list of sentences
@ -105,14 +104,12 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
except Exception as ex: except Exception as ex:
raise InvokeBadRequestError(str(ex)) raise InvokeBadRequestError(str(ex))
# Todo: To improve the streaming function def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
voice: str) -> any: voice: str) -> any:
""" """
_tts_invoke_streaming text2speech model _tts_invoke_streaming text2speech model
:param model: model name :param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials :param credentials: model credentials
:param voice: model timbre :param voice: model timbre
:param content_text: text content to be translated :param content_text: text content to be translated
@ -120,18 +117,32 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
""" """
word_limit = self._get_model_word_limit(model, credentials) word_limit = self._get_model_word_limit(model, credentials)
audio_type = self._get_model_audio_type(model, credentials) audio_type = self._get_model_audio_type(model, credentials)
tts_file_id = self._get_file_name(content_text)
file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
try: try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) audio_queue: Queue = Queue()
for sentence in sentences: callback = Callback(queue=audio_queue)
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
api_key=credentials.get('dashscope_api_key'), def invoke_remote(content, v, api_key, cb, at, wl):
text=sentence.strip(), if len(content) < word_limit:
format=audio_type, word_timestamp_enabled=True, sentences = [content]
phoneme_timestamp_enabled=True) else:
if isinstance(response.get_audio_data(), bytes): sentences = list(self._split_text_into_sentences(org_text=content, max_length=wl))
storage.save(file_path, response.get_audio_data()) for sentence in sentences:
SpeechSynthesizer.call(model=v, sample_rate=16000,
api_key=api_key,
text=sentence.strip(),
callback=cb,
format=at, word_timestamp_enabled=True,
phoneme_timestamp_enabled=True)
threading.Thread(target=invoke_remote, args=(
content_text, voice, credentials.get('dashscope_api_key'), callback, audio_type, word_limit)).start()
while True:
audio = audio_queue.get()
if audio is None:
break
yield audio
except Exception as ex: except Exception as ex:
raise InvokeBadRequestError(str(ex)) raise InvokeBadRequestError(str(ex))
@ -152,3 +163,29 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
format=audio_type) format=audio_type)
if isinstance(response.get_audio_data(), bytes): if isinstance(response.get_audio_data(), bytes):
return response.get_audio_data() return response.get_audio_data()
class Callback(ResultCallback):
def __init__(self, queue: Queue):
self._queue = queue
def on_open(self):
pass
def on_complete(self):
self._queue.put(None)
self._queue.task_done()
def on_error(self, response: SpeechSynthesisResponse):
self._queue.put(None)
self._queue.task_done()
def on_close(self):
self._queue.put(None)
self._queue.task_done()
def on_event(self, result: SpeechSynthesisResult):
ad = result.get_audio_frame()
if ad:
self._queue.put(ad)

View File

@ -49,7 +49,7 @@ ignore = [
"B006", # mutable-argument-default "B006", # mutable-argument-default
"B007", # unused-loop-control-variable "B007", # unused-loop-control-variable
"B026", # star-arg-unpacking-after-keyword-arg "B026", # star-arg-unpacking-after-keyword-arg
"B901", # return-in-generator # "B901", # return-in-generator
"B904", # raise-without-from-inside-except "B904", # raise-without-from-inside-except
"B905", # zip-without-explicit-strict "B905", # zip-without-explicit-strict
] ]

View File

@ -123,6 +123,8 @@ class AppService:
app.icon = args['icon'] app.icon = args['icon']
app.icon_background = args['icon_background'] app.icon_background = args['icon_background']
app.tenant_id = tenant_id app.tenant_id = tenant_id
app.api_rph = args.get('api_rph', 0)
app.api_rpm = args.get('api_rpm', 0)
db.session.add(app) db.session.add(app)
db.session.flush() db.session.flush()

View File

@ -1,11 +1,12 @@
import io import io
import logging
from typing import Optional from typing import Optional
from werkzeug.datastructures import FileStorage from werkzeug.datastructures import FileStorage
from core.model_manager import ModelManager from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType from core.model_runtime.entities.model_entities import ModelType
from models.model import App, AppMode, AppModelConfig from models.model import App, AppMode, AppModelConfig, Message
from services.errors.audio import ( from services.errors.audio import (
AudioTooLargeServiceError, AudioTooLargeServiceError,
NoAudioUploadedServiceError, NoAudioUploadedServiceError,
@ -18,6 +19,8 @@ FILE_SIZE = 30
FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024 FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr'] ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr']
logger = logging.getLogger(__name__)
class AudioService: class AudioService:
@classmethod @classmethod
@ -64,51 +67,74 @@ class AudioService:
return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)} return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)}
@classmethod @classmethod
def transcript_tts(cls, app_model: App, text: str, streaming: bool, def transcript_tts(cls, app_model: App, text: Optional[str] = None,
voice: Optional[str] = None, end_user: Optional[str] = None): voice: Optional[str] = None, end_user: Optional[str] = None, message_id: Optional[str] = None):
if app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]: from collections.abc import Generator
workflow = app_model.workflow
if workflow is None:
raise ValueError("TTS is not enabled")
features_dict = workflow.features_dict from flask import Response, stream_with_context
if 'text_to_speech' not in features_dict or not features_dict['text_to_speech'].get('enabled'):
raise ValueError("TTS is not enabled")
voice = features_dict['text_to_speech'].get('voice') if voice is None else voice from app import app
else: from extensions.ext_database import db
text_to_speech_dict = app_model.app_model_config.text_to_speech_dict
if not text_to_speech_dict.get('enabled'): def invoke_tts(text_content: str, app_model, voice: Optional[str] = None):
raise ValueError("TTS is not enabled") with app.app_context():
if app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]:
workflow = app_model.workflow
if workflow is None:
raise ValueError("TTS is not enabled")
voice = text_to_speech_dict.get('voice') if voice is None else voice features_dict = workflow.features_dict
if 'text_to_speech' not in features_dict or not features_dict['text_to_speech'].get('enabled'):
raise ValueError("TTS is not enabled")
model_manager = ModelManager() voice = features_dict['text_to_speech'].get('voice') if voice is None else voice
model_instance = model_manager.get_default_model_instance(
tenant_id=app_model.tenant_id,
model_type=ModelType.TTS
)
if model_instance is None:
raise ProviderNotSupportTextToSpeechServiceError()
try:
if not voice:
voices = model_instance.get_tts_voices()
if voices:
voice = voices[0].get('value')
else: else:
raise ValueError("Sorry, no voice available.") text_to_speech_dict = app_model.app_model_config.text_to_speech_dict
return model_instance.invoke_tts( if not text_to_speech_dict.get('enabled'):
content_text=text.strip(), raise ValueError("TTS is not enabled")
user=end_user,
streaming=streaming, voice = text_to_speech_dict.get('voice') if voice is None else voice
tenant_id=app_model.tenant_id,
voice=voice model_manager = ModelManager()
) model_instance = model_manager.get_default_model_instance(
except Exception as e: tenant_id=app_model.tenant_id,
raise e model_type=ModelType.TTS
)
try:
if not voice:
voices = model_instance.get_tts_voices()
if voices:
voice = voices[0].get('value')
else:
raise ValueError("Sorry, no voice available.")
return model_instance.invoke_tts(
content_text=text_content.strip(),
user=end_user,
tenant_id=app_model.tenant_id,
voice=voice
)
except Exception as e:
raise e
if message_id:
message = db.session.query(Message).filter(
Message.id == message_id
).first()
if message.answer == '' and message.status == 'normal':
return None
else:
response = invoke_tts(message.answer, app_model=app_model, voice=voice)
if isinstance(response, Generator):
return Response(stream_with_context(response), content_type='audio/mpeg')
return response
else:
response = invoke_tts(text, app_model, voice)
if isinstance(response, Generator):
return Response(stream_with_context(response), content_type='audio/mpeg')
return response
@classmethod @classmethod
def transcript_tts_voices(cls, tenant_id: str, language: str): def transcript_tts_voices(cls, tenant_id: str, language: str):
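transcript_tts now resolves the text either from the request or from a stored Message, runs TTS inside an app context, and wraps generator results in a streamed audio/mpeg Response. From a client's perspective the endpoint answers with a chunked byte stream rather than JSON; a minimal reading sketch follows (the '/text-to-audio' path and body fields mirror the frontend code later in this diff and should be treated as assumptions):

// Sketch: read the streamed audio/mpeg body chunk by chunk
async function readTtsStream(messageId: string, voice?: string): Promise<Uint8Array[]> {
  const res = await fetch('/text-to-audio', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ message_id: messageId, voice, streaming: true }),
  })
  if (!res.ok || !res.body)
    throw new Error(`TTS request failed with status ${res.status}`)

  const reader = res.body.getReader()
  const chunks: Uint8Array[] = []
  while (true) {
    const { value, done } = await reader.read()
    if (done)
      break
    if (value)
      chunks.push(value) // each value is a slice of the mp3 stream
  }
  return chunks
}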

View File

@ -11,11 +11,13 @@ import { usePathname } from 'next/navigation'
import { useTranslation } from 'react-i18next' import { useTranslation } from 'react-i18next'
import { Listbox, Transition } from '@headlessui/react' import { Listbox, Transition } from '@headlessui/react'
import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid' import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid'
import RadioGroup from '@/app/components/app/configuration/config-vision/radio-group'
import type { Item } from '@/app/components/base/select' import type { Item } from '@/app/components/base/select'
import ConfigContext from '@/context/debug-configuration' import ConfigContext from '@/context/debug-configuration'
import { fetchAppVoices } from '@/service/apps' import { fetchAppVoices } from '@/service/apps'
import Tooltip from '@/app/components/base/tooltip' import Tooltip from '@/app/components/base/tooltip'
import { languages } from '@/i18n/language' import { languages } from '@/i18n/language'
import { TtsAutoPlay } from '@/types/app'
const VoiceParamConfig: FC = () => { const VoiceParamConfig: FC = () => {
const { t } = useTranslation() const { t } = useTranslation()
const pathname = usePathname() const pathname = usePathname()
@ -27,12 +29,16 @@ const VoiceParamConfig: FC = () => {
setTextToSpeechConfig, setTextToSpeechConfig,
} = useContext(ConfigContext) } = useContext(ConfigContext)
const languageItem = languages.find(item => item.value === textToSpeechConfig.language) let languageItem = languages.find(item => item.value === textToSpeechConfig.language)
const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select') const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')
if (languages && !languageItem)
languageItem = languages[0]
const language = languageItem?.value const language = languageItem?.value
const voiceItems = useSWR({ appId, language }, fetchAppVoices).data const voiceItems = useSWR({ appId, language }, fetchAppVoices).data
const voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice) let voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice)
if (voiceItems && !voiceItem)
voiceItem = voiceItems[0]
const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select') const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')
return ( return (
@ -42,8 +48,9 @@ const VoiceParamConfig: FC = () => {
<div className='pt-3 space-y-6'> <div className='pt-3 space-y-6'>
<div> <div>
<div className='mb-2 flex items-center space-x-1'> <div className='mb-2 flex items-center space-x-1'>
<div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div> <div
<Tooltip htmlContent={<div className='w-[180px]' > className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
<Tooltip htmlContent={<div className='w-[180px]'>
{t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => ( {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
<div key={item}>{item}</div> <div key={item}>{item}</div>
))} ))}
@ -61,7 +68,8 @@ const VoiceParamConfig: FC = () => {
}} }}
> >
<div className={'relative h-9'}> <div className={'relative h-9'}>
<Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}> <Listbox.Button
className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
<span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}> <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>
{languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder} {languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder}
</span> </span>
@ -79,7 +87,8 @@ const VoiceParamConfig: FC = () => {
leaveTo="opacity-0" leaveTo="opacity-0"
> >
<Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm"> <Listbox.Options
className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
{languages.map((item: Item) => ( {languages.map((item: Item) => (
<Listbox.Option <Listbox.Option
key={item.value} key={item.value}
@ -100,7 +109,7 @@ const VoiceParamConfig: FC = () => {
'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700', 'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
)} )}
> >
<CheckIcon className="h-5 w-5" aria-hidden="true" /> <CheckIcon className="h-5 w-5" aria-hidden="true"/>
</span> </span>
)} )}
</> </>
@ -112,9 +121,9 @@ const VoiceParamConfig: FC = () => {
</div> </div>
</Listbox> </Listbox>
</div> </div>
<div> <div>
<div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div> <div
className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
<Listbox <Listbox
value={voiceItem} value={voiceItem}
disabled={!languageItem} disabled={!languageItem}
@ -126,8 +135,10 @@ const VoiceParamConfig: FC = () => {
}} }}
> >
<div className={'relative h-9'}> <div className={'relative h-9'}>
<Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}> <Listbox.Button
<span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span> className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
<span
className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
<span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2"> <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
<ChevronDownIcon <ChevronDownIcon
className="h-5 w-5 text-gray-400" className="h-5 w-5 text-gray-400"
@ -142,7 +153,8 @@ const VoiceParamConfig: FC = () => {
leaveTo="opacity-0" leaveTo="opacity-0"
> >
<Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm"> <Listbox.Options
className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
{voiceItems?.map((item: Item) => ( {voiceItems?.map((item: Item) => (
<Listbox.Option <Listbox.Option
key={item.value} key={item.value}
@ -162,7 +174,7 @@ const VoiceParamConfig: FC = () => {
'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700', 'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
)} )}
> >
<CheckIcon className="h-5 w-5" aria-hidden="true" /> <CheckIcon className="h-5 w-5" aria-hidden="true"/>
</span> </span>
)} )}
</> </>
@ -174,6 +186,30 @@ const VoiceParamConfig: FC = () => {
</div> </div>
</Listbox> </Listbox>
</div> </div>
<div>
<div
className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.autoPlay')}</div>
<RadioGroup
className='space-x-3'
options={[
{
label: t('appDebug.voice.voiceSettings.autoPlayEnabled'),
value: TtsAutoPlay.enabled,
},
{
label: t('appDebug.voice.voiceSettings.autoPlayDisabled'),
value: TtsAutoPlay.disabled,
},
]}
value={textToSpeechConfig.autoPlay ? textToSpeechConfig.autoPlay : TtsAutoPlay.disabled}
onChange={(value: TtsAutoPlay) => {
setTextToSpeechConfig({
...textToSpeechConfig,
autoPlay: value,
})
}}
/>
</div>
</div> </div>
</div> </div>
</div> </div>

View File

@ -40,7 +40,6 @@ const TextToSpeech: FC = () => {
{ languageInfo?.example && ( { languageInfo?.example && (
<AudioBtn <AudioBtn
value={languageInfo?.example} value={languageInfo?.example}
voice={voiceItem?.value}
isAudition isAudition
noCache noCache
/> />

View File

@ -428,8 +428,7 @@ const GenerationItem: FC<IGenerationItemProps> = ({
<> <>
<div className='ml-2 mr-2 h-[14px] w-[1px] bg-gray-200'></div> <div className='ml-2 mr-2 h-[14px] w-[1px] bg-gray-200'></div>
<AudioBtn <AudioBtn
value={content} id={messageId!}
noCache={false}
className={'mr-1'} className={'mr-1'}
/> />
</> </>

View File

@ -0,0 +1,53 @@
import AudioPlayer from '@/app/components/base/audio-btn/audio'
declare global {
// eslint-disable-next-line @typescript-eslint/consistent-type-definitions
interface AudioPlayerManager {
instance: AudioPlayerManager
}
}
export class AudioPlayerManager {
private static instance: AudioPlayerManager
private audioPlayers: AudioPlayer | null = null
private msgId: string | undefined
private constructor() {
}
public static getInstance(): AudioPlayerManager {
if (!AudioPlayerManager.instance) {
AudioPlayerManager.instance = new AudioPlayerManager()
this.instance = AudioPlayerManager.instance
}
return AudioPlayerManager.instance
}
public getAudioPlayer(url: string, isPublic: boolean, id: string | undefined, msgContent: string | null | undefined, voice: string | undefined, callback: ((event: string) => {}) | null): AudioPlayer {
if (this.msgId && this.msgId === id && this.audioPlayers) {
this.audioPlayers.setCallback(callback)
return this.audioPlayers
}
else {
if (this.audioPlayers) {
try {
this.audioPlayers.pauseAudio()
this.audioPlayers.cacheBuffers = []
this.audioPlayers.sourceBuffer?.abort()
}
catch (e) {
}
}
this.msgId = id
this.audioPlayers = new AudioPlayer(url, isPublic, id, msgContent, callback)
return this.audioPlayers
}
}
public resetMsgId(msgId: string) {
this.msgId = msgId
this.audioPlayers?.resetMsgId(msgId)
}
}
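AudioPlayerManager keeps a single shared AudioPlayer and reuses it while the message id is unchanged; asking for a different id pauses and discards the previous player before creating a new one. A usage sketch (endpoint, ids and text are placeholders) mirroring the updated AudioBtn further down:

import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

// Usage sketch: the url, ids and text are placeholders; real call sites derive them
// from route params and chat state.
const onAudioEvent = (event: string): any => {
  // events surfaced by AudioPlayer: 'play', 'paused', 'ended', 'error', ...
  console.log('audio event:', event)
}

const player = AudioPlayerManager.getInstance().getAudioPlayer(
  '/text-to-audio',       // streaming endpoint
  true,                   // isPublic: use the shared web app API
  'message-id-123',       // same id => the existing player instance is reused
  'Text to synthesize',   // content sent when no stored answer exists for the id
  undefined,              // voice: fall back to the app's TTS configuration
  onAudioEvent,
)
player.playAudio()  // first call loads the stream, later calls resume playback
player.pauseAudio()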

View File

@ -0,0 +1,263 @@
import Toast from '@/app/components/base/toast'
import { textToAudioStream } from '@/service/share'
declare global {
// eslint-disable-next-line @typescript-eslint/consistent-type-definitions
interface Window {
ManagedMediaSource: any
}
}
export default class AudioPlayer {
mediaSource: MediaSource | null
audio: HTMLAudioElement
audioContext: AudioContext
sourceBuffer?: SourceBuffer
cacheBuffers: ArrayBuffer[] = []
pauseTimer: number | null = null
msgId: string | undefined
msgContent: string | null | undefined = null
voice: string | undefined = undefined
isLoadData = false
url: string
isPublic: boolean
callback: ((event: string) => {}) | null
constructor(streamUrl: string, isPublic: boolean, msgId: string | undefined, msgContent: string | null | undefined, callback: ((event: string) => {}) | null) {
this.audioContext = new AudioContext()
this.msgId = msgId
this.msgContent = msgContent
this.url = streamUrl
this.isPublic = isPublic
this.callback = callback
// Compatible with iPhone iOS 17 ManagedMediaSource
const MediaSource = window.MediaSource || window.ManagedMediaSource
if (!MediaSource) {
Toast.notify({
message: 'Your browser does not support audio streaming. If you are using an iPhone, please update to iOS 17.1 or later.',
type: 'error',
})
}
this.mediaSource = MediaSource ? new MediaSource() : null
this.audio = new Audio()
this.setCallback(callback)
this.audio.src = this.mediaSource ? URL.createObjectURL(this.mediaSource) : ''
this.audio.autoplay = true
const source = this.audioContext.createMediaElementSource(this.audio)
source.connect(this.audioContext.destination)
this.listenMediaSource('audio/mpeg')
}
public resetMsgId(msgId: string) {
this.msgId = msgId
}
private listenMediaSource(contentType: string) {
this.mediaSource?.addEventListener('sourceopen', () => {
if (this.sourceBuffer)
return
this.sourceBuffer = this.mediaSource?.addSourceBuffer(contentType)
// this.sourceBuffer?.addEventListener('update', () => {
// if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
// const cacheBuffer = this.cacheBuffers.shift()!
// this.sourceBuffer?.appendBuffer(cacheBuffer)
// }
// // this.pauseAudio()
// })
//
// this.sourceBuffer?.addEventListener('updateend', () => {
// if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
// const cacheBuffer = this.cacheBuffers.shift()!
// this.sourceBuffer?.appendBuffer(cacheBuffer)
// }
// // this.pauseAudio()
// })
})
}
public setCallback(callback: ((event: string) => {}) | null) {
this.callback = callback
if (callback) {
this.audio.addEventListener('ended', () => {
callback('ended')
}, false)
this.audio.addEventListener('paused', () => {
callback('paused')
}, true)
this.audio.addEventListener('loaded', () => {
callback('loaded')
}, true)
this.audio.addEventListener('play', () => {
callback('play')
}, true)
this.audio.addEventListener('timeupdate', () => {
callback('timeupdate')
}, true)
this.audio.addEventListener('loadeddata', () => {
callback('loadeddata')
}, true)
this.audio.addEventListener('canplay', () => {
callback('canplay')
}, true)
this.audio.addEventListener('error', () => {
callback('error')
}, true)
}
}
private async loadAudio() {
try {
const audioResponse: any = await textToAudioStream(this.url, this.isPublic, { content_type: 'audio/mpeg' }, {
message_id: this.msgId,
streaming: true,
voice: this.voice,
text: this.msgContent,
})
if (audioResponse.status !== 200) {
this.isLoadData = false
if (this.callback)
this.callback('error')
}
const reader = audioResponse.body.getReader()
while (true) {
const { value, done } = await reader.read()
if (done) {
this.receiveAudioData(value)
break
}
this.receiveAudioData(value)
}
}
catch (error) {
this.isLoadData = false
this.callback && this.callback('error')
}
}
// play audio
public playAudio() {
if (this.isLoadData) {
if (this.audioContext.state === 'suspended') {
this.audioContext.resume().then((_) => {
this.audio.play()
this.callback && this.callback('play')
})
}
else if (this.audio.ended) {
this.audio.play()
this.callback && this.callback('play')
}
if (this.callback)
this.callback('play')
}
else {
this.isLoadData = true
this.loadAudio()
}
}
private theEndOfStream() {
const endTimer = setInterval(() => {
if (!this.sourceBuffer?.updating) {
this.mediaSource?.endOfStream()
clearInterval(endTimer)
}
console.log('finishStream endOfStream endTimer')
}, 10)
}
private finishStream() {
const timer = setInterval(() => {
if (!this.cacheBuffers.length) {
this.theEndOfStream()
clearInterval(timer)
}
if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
const arrayBuffer = this.cacheBuffers.shift()!
this.sourceBuffer?.appendBuffer(arrayBuffer)
}
console.log('finishStream timer')
}, 10)
}
public async playAudioWithAudio(audio: string, play = true) {
if (!audio || !audio.length) {
this.finishStream()
return
}
const audioContent = Buffer.from(audio, 'base64')
this.receiveAudioData(new Uint8Array(audioContent))
if (play) {
this.isLoadData = true
if (this.audio.paused) {
this.audioContext.resume().then((_) => {
this.audio.play()
this.callback && this.callback('play')
})
}
else if (this.audio.ended) {
this.audio.play()
this.callback && this.callback('play')
}
else if (this.audio.played) { /* empty */ }
else {
this.audio.play()
this.callback && this.callback('play')
}
}
}
public pauseAudio() {
this.callback && this.callback('paused')
this.audio.pause()
this.audioContext.suspend()
}
private cancer() {
}
private receiveAudioData(unit8Array: Uint8Array) {
if (!unit8Array) {
this.finishStream()
return
}
const audioData = this.byteArrayToArrayBuffer(unit8Array)
if (!audioData.byteLength) {
if (this.mediaSource?.readyState === 'open')
this.finishStream()
return
}
if (this.sourceBuffer?.updating) {
this.cacheBuffers.push(audioData)
}
else {
if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
this.cacheBuffers.push(audioData)
const cacheBuffer = this.cacheBuffers.shift()!
this.sourceBuffer?.appendBuffer(cacheBuffer)
}
else {
this.sourceBuffer?.appendBuffer(audioData)
}
}
}
private byteArrayToArrayBuffer(byteArray: Uint8Array): ArrayBuffer {
const arrayBuffer = new ArrayBuffer(byteArray.length)
const uint8Array = new Uint8Array(arrayBuffer)
uint8Array.set(byteArray)
return arrayBuffer
}
}
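AudioPlayer appends incoming mp3 bytes to a MediaSource SourceBuffer; since appendBuffer is rejected while the buffer is still updating, chunks are parked in cacheBuffers and drained later, and endOfStream is only signalled once the buffer goes idle. The same pattern reduced to a standalone sketch (names are illustrative; the class above keeps the equivalent state on instance fields):

// Sketch of the append-or-queue pattern implemented by receiveAudioData/finishStream above
function createMpegAppender(sourceBuffer: SourceBuffer, mediaSource: MediaSource) {
  const pending: ArrayBuffer[] = []

  // When the SourceBuffer finishes an append, drain the next queued chunk
  sourceBuffer.addEventListener('updateend', () => {
    if (pending.length && !sourceBuffer.updating)
      sourceBuffer.appendBuffer(pending.shift()!)
  })

  return {
    append(chunk: ArrayBuffer) {
      if (sourceBuffer.updating || pending.length)
        pending.push(chunk) // busy: queue until the next updateend
      else
        sourceBuffer.appendBuffer(chunk)
    },
    finish() {
      // End the stream only after every queued chunk has been appended
      const timer = window.setInterval(() => {
        if (!pending.length && !sourceBuffer.updating) {
          if (mediaSource.readyState === 'open')
            mediaSource.endOfStream()
          window.clearInterval(timer)
        }
      }, 10)
    },
  }
}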

View File

@ -1,124 +1,78 @@
'use client' 'use client'
import { useEffect, useRef, useState } from 'react' import { useRef, useState } from 'react'
import { t } from 'i18next' import { t } from 'i18next'
import { useParams, usePathname } from 'next/navigation' import { useParams, usePathname } from 'next/navigation'
import s from './style.module.css' import s from './style.module.css'
import Tooltip from '@/app/components/base/tooltip' import Tooltip from '@/app/components/base/tooltip'
import { randomString } from '@/utils' import { randomString } from '@/utils'
import { textToAudio } from '@/service/share'
import Loading from '@/app/components/base/loading' import Loading from '@/app/components/base/loading'
import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'
type AudioBtnProps = { type AudioBtnProps = {
value: string id?: string
voice?: string voice?: string
value?: string
className?: string className?: string
isAudition?: boolean isAudition?: boolean
noCache: boolean noCache?: boolean
} }
type AudioState = 'initial' | 'loading' | 'playing' | 'paused' | 'ended' type AudioState = 'initial' | 'loading' | 'playing' | 'paused' | 'ended'
const AudioBtn = ({ const AudioBtn = ({
value, id,
voice, voice,
value,
className, className,
isAudition, isAudition,
noCache,
}: AudioBtnProps) => { }: AudioBtnProps) => {
const audioRef = useRef<HTMLAudioElement | null>(null)
const [audioState, setAudioState] = useState<AudioState>('initial') const [audioState, setAudioState] = useState<AudioState>('initial')
const selector = useRef(`play-tooltip-${randomString(4)}`) const selector = useRef(`play-tooltip-${randomString(4)}`)
const params = useParams() const params = useParams()
const pathname = usePathname() const pathname = usePathname()
const removeCodeBlocks = (inputText: any) => { const audio_finished_call = (event: string): any => {
const codeBlockRegex = /```[\s\S]*?```/g switch (event) {
if (inputText) case 'ended':
return inputText.replace(codeBlockRegex, '') setAudioState('ended')
return '' break
} case 'paused':
setAudioState('ended')
const loadAudio = async () => { break
const formData = new FormData() case 'loaded':
formData.append('text', removeCodeBlocks(value)) setAudioState('loading')
formData.append('voice', removeCodeBlocks(voice)) break
case 'play':
if (value !== '') {
setAudioState('loading')
let url = ''
let isPublic = false
if (params.token) {
url = '/text-to-audio'
isPublic = true
}
else if (params.appId) {
if (pathname.search('explore/installed') > -1)
url = `/installed-apps/${params.appId}/text-to-audio`
else
url = `/apps/${params.appId}/text-to-audio`
}
try {
const audioResponse = await textToAudio(url, isPublic, formData)
const blob_bytes = Buffer.from(audioResponse.data, 'latin1')
const blob = new Blob([blob_bytes], { type: 'audio/wav' })
const audioUrl = URL.createObjectURL(blob)
audioRef.current!.src = audioUrl
}
catch (error) {
setAudioState('initial')
console.error('Error playing audio:', error)
}
}
}
const handleToggle = async () => {
if (audioState === 'initial' || noCache) {
await loadAudio()
}
else if (audioRef.current) {
if (audioState === 'playing') {
audioRef.current.pause()
setAudioState('paused')
}
else {
audioRef.current.play()
setAudioState('playing') setAudioState('playing')
} break
case 'error':
setAudioState('ended')
break
} }
} }
let url = ''
let isPublic = false
useEffect(() => { if (params.token) {
const currentAudio = audioRef.current url = '/text-to-audio'
isPublic = true
const handleLoading = () => { }
else if (params.appId) {
if (pathname.search('explore/installed') > -1)
url = `/installed-apps/${params.appId}/text-to-audio`
else
url = `/apps/${params.appId}/text-to-audio`
}
const handleToggle = async () => {
if (audioState === 'playing' || audioState === 'loading') {
setAudioState('paused')
AudioPlayerManager.getInstance().getAudioPlayer(url, isPublic, id, value, voice, audio_finished_call).pauseAudio()
}
else {
setAudioState('loading') setAudioState('loading')
AudioPlayerManager.getInstance().getAudioPlayer(url, isPublic, id, value, voice, audio_finished_call).playAudio()
} }
}
const handlePlay = () => {
currentAudio?.play()
setAudioState('playing')
}
const handleEnded = () => {
setAudioState('ended')
}
currentAudio?.addEventListener('progress', handleLoading)
currentAudio?.addEventListener('canplaythrough', handlePlay)
currentAudio?.addEventListener('ended', handleEnded)
return () => {
currentAudio?.removeEventListener('progress', handleLoading)
currentAudio?.removeEventListener('canplaythrough', handlePlay)
currentAudio?.removeEventListener('ended', handleEnded)
URL.revokeObjectURL(currentAudio?.src || '')
currentAudio?.pause()
currentAudio?.setAttribute('src', '')
}
}, [])
const tooltipContent = { const tooltipContent = {
initial: t('appApi.play'), initial: t('appApi.play'),
@ -151,7 +105,6 @@ const AudioBtn = ({
)} )}
</button> </button>
</Tooltip> </Tooltip>
<audio ref={audioRef} src='' className='hidden' />
</div> </div>
) )
} }
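AudioBtn no longer fetches and caches a complete audio blob; it forwards a message id (or raw text) to the shared AudioPlayerManager and drives its icon from the player's state callback. Typical usage, with placeholder props, might look like this:

import AudioBtn from '@/app/components/base/audio-btn' // import path assumed from the component above

// Stream a stored answer by message id (the common chat case)
const AnswerAudio = ({ messageId, content }: { messageId: string; content: string }) => (
  <AudioBtn id={messageId} value={content} className='mr-1' />
)

// Audition a voice with ad-hoc text: no message id, no caching
const VoiceAudition = ({ example, voice }: { example: string; voice?: string }) => (
  <AudioBtn value={example} voice={voice} isAudition noCache />
)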

View File

@ -8,6 +8,7 @@ import type {
ChatConfig, ChatConfig,
ChatItem, ChatItem,
} from '../../types' } from '../../types'
import { useChatContext } from '../context'
import Operation from './operation' import Operation from './operation'
import AgentContent from './agent-content' import AgentContent from './agent-content'
import BasicContent from './basic-content' import BasicContent from './basic-content'
@ -59,23 +60,25 @@ const Answer: FC<AnswerProps> = ({
} = item } = item
const hasAgentThoughts = !!agent_thoughts?.length const hasAgentThoughts = !!agent_thoughts?.length
const [containerWidth, setContainerWidth] = useState(0) const [containerWidth] = useState(0)
const [contentWidth, setContentWidth] = useState(0) const [contentWidth, setContentWidth] = useState(0)
const containerRef = useRef<HTMLDivElement>(null) const containerRef = useRef<HTMLDivElement>(null)
const contentRef = useRef<HTMLDivElement>(null) const contentRef = useRef<HTMLDivElement>(null)
const getContainerWidth = () => { const {
if (containerRef.current) config: chatContextConfig,
setContainerWidth(containerRef.current?.clientWidth + 16) } = useChatContext()
}
const voiceRef = useRef(chatContextConfig?.text_to_speech?.voice)
const getContentWidth = () => { const getContentWidth = () => {
if (contentRef.current) if (contentRef.current)
setContentWidth(contentRef.current?.clientWidth) setContentWidth(contentRef.current?.clientWidth)
} }
useEffect(() => { useEffect(() => {
getContainerWidth() voiceRef.current = chatContextConfig?.text_to_speech?.voice
}, []) }
, [chatContextConfig?.text_to_speech?.voice])
useEffect(() => { useEffect(() => {
if (!responding) if (!responding)

View File

@ -119,9 +119,9 @@ const Operation: FC<OperationProps> = ({
<> <>
<div className='mx-1 w-[1px] h-[14px] bg-gray-200'/> <div className='mx-1 w-[1px] h-[14px] bg-gray-200'/>
<AudioBtn <AudioBtn
id={id}
value={content} value={content}
noCache={false} noCache={false}
voice={config?.text_to_speech?.voice}
className='hidden group-hover:block' className='hidden group-hover:block'
/> />
</> </>

View File

@ -6,6 +6,8 @@ import {
} from 'react' } from 'react'
import { useTranslation } from 'react-i18next' import { useTranslation } from 'react-i18next'
import { produce, setAutoFreeze } from 'immer' import { produce, setAutoFreeze } from 'immer'
import { useParams, usePathname } from 'next/navigation'
import { v4 as uuidV4 } from 'uuid'
import type { import type {
ChatConfig, ChatConfig,
ChatItem, ChatItem,
@ -20,6 +22,7 @@ import { replaceStringWithValues } from '@/app/components/app/configuration/prom
import type { Annotation } from '@/models/log' import type { Annotation } from '@/models/log'
import { WorkflowRunningStatus } from '@/app/components/workflow/types' import { WorkflowRunningStatus } from '@/app/components/workflow/types'
import useTimestamp from '@/hooks/use-timestamp' import useTimestamp from '@/hooks/use-timestamp'
import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'
type GetAbortController = (abortController: AbortController) => void type GetAbortController = (abortController: AbortController) => void
type SendCallback = { type SendCallback = {
@ -91,7 +94,8 @@ export const useChat = (
const conversationMessagesAbortControllerRef = useRef<AbortController | null>(null) const conversationMessagesAbortControllerRef = useRef<AbortController | null>(null)
const suggestedQuestionsAbortControllerRef = useRef<AbortController | null>(null) const suggestedQuestionsAbortControllerRef = useRef<AbortController | null>(null)
const checkPromptVariables = useCheckPromptVariables() const checkPromptVariables = useCheckPromptVariables()
const params = useParams()
const pathname = usePathname()
useEffect(() => { useEffect(() => {
setAutoFreeze(false) setAutoFreeze(false)
return () => { return () => {
@ -262,6 +266,19 @@ export const useChat = (
let isAgentMode = false let isAgentMode = false
let hasSetResponseId = false let hasSetResponseId = false
let ttsUrl = ''
let ttsIsPublic = false
if (params.token) {
ttsUrl = '/text-to-audio'
ttsIsPublic = true
}
else if (params.appId) {
if (pathname.search('explore/installed') > -1)
ttsUrl = `/installed-apps/${params.appId}/text-to-audio`
else
ttsUrl = `/apps/${params.appId}/text-to-audio`
}
const player = AudioPlayerManager.getInstance().getAudioPlayer(ttsUrl, ttsIsPublic, uuidV4(), 'none', 'none', (_: any): any => {})
ssePost( ssePost(
url, url,
{ {
@ -530,6 +547,15 @@ export const useChat = (
} }
})) }))
}, },
onTTSChunk: (messageId: string, audio: string) => {
if (!audio || audio === '')
return
player.playAudioWithAudio(audio, true)
AudioPlayerManager.getInstance().resetMsgId(messageId)
},
onTTSEnd: (messageId: string, audio: string) => {
player.playAudioWithAudio(audio, false)
},
}) })
return true return true
}, [ }, [

View File

@ -19,6 +19,8 @@ import type { Item } from '@/app/components/base/select'
import { fetchAppVoices } from '@/service/apps' import { fetchAppVoices } from '@/service/apps'
import Tooltip from '@/app/components/base/tooltip' import Tooltip from '@/app/components/base/tooltip'
import { languages } from '@/i18n/language' import { languages } from '@/i18n/language'
import RadioGroup from '@/app/components/app/configuration/config-vision/radio-group'
import { TtsAutoPlay } from '@/types/app'
type VoiceParamConfigProps = { type VoiceParamConfigProps = {
onChange?: OnFeaturesChange onChange?: OnFeaturesChange
@ -33,12 +35,16 @@ const VoiceParamConfig = ({
const text2speech = useFeatures(state => state.features.text2speech) const text2speech = useFeatures(state => state.features.text2speech)
const featuresStore = useFeaturesStore() const featuresStore = useFeaturesStore()
const languageItem = languages.find(item => item.value === text2speech.language) let languageItem = languages.find(item => item.value === text2speech?.language)
if (languages && !languageItem)
languageItem = languages[0]
const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select') const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')
const language = languageItem?.value const language = languageItem?.value
const voiceItems = useSWR({ appId, language }, fetchAppVoices).data const voiceItems = useSWR({ appId, language }, fetchAppVoices).data
const voiceItem = voiceItems?.find(item => item.value === text2speech.voice) let voiceItem = voiceItems?.find(item => item.value === text2speech?.voice)
if (voiceItems && !voiceItem)
voiceItem = voiceItems[0]
const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select') const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')
const handleChange = (value: Record<string, string>) => { const handleChange = (value: Record<string, string>) => {
@ -66,13 +72,14 @@ const VoiceParamConfig = ({
<div className='pt-3 space-y-6'> <div className='pt-3 space-y-6'>
<div> <div>
<div className='mb-2 flex items-center space-x-1'> <div className='mb-2 flex items-center space-x-1'>
<div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div> <div
<Tooltip htmlContent={<div className='w-[180px]' > className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
<Tooltip htmlContent={<div className='w-[180px]'>
{t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => ( {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
<div key={item}>{item}</div> <div key={item}>{item}</div>
))} ))}
</div>} selector='config-resolution-tooltip'> </div>} selector='config-resolution-tooltip'>
<RiQuestionLine className='w-[14px] h-[14px] text-gray-400' /> <RiQuestionLine className='w-[14px] h-[14px] text-gray-400'/>
</Tooltip> </Tooltip>
</div> </div>
<Listbox <Listbox
@ -84,7 +91,8 @@ const VoiceParamConfig = ({
}} }}
> >
<div className={'relative h-9'}> <div className={'relative h-9'}>
<Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}> <Listbox.Button
className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
<span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}> <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>
{languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder} {languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder}
</span> </span>
@ -102,7 +110,8 @@ const VoiceParamConfig = ({
leaveTo="opacity-0" leaveTo="opacity-0"
> >
<Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm"> <Listbox.Options
className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
{languages.map((item: Item) => ( {languages.map((item: Item) => (
<Listbox.Option <Listbox.Option
key={item.value} key={item.value}
@ -117,13 +126,13 @@ const VoiceParamConfig = ({
<> <>
<span <span
className={classNames('block', selected && 'font-normal')}>{t(`common.voice.language.${(item.value).toString().replace('-', '')}`)}</span> className={classNames('block', selected && 'font-normal')}>{t(`common.voice.language.${(item.value).toString().replace('-', '')}`)}</span>
{(selected || item.value === text2speech.language) && ( {(selected || item.value === text2speech?.language) && (
<span <span
className={classNames( className={classNames(
'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700', 'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
)} )}
> >
<CheckIcon className="h-5 w-5" aria-hidden="true" /> <CheckIcon className="h-5 w-5" aria-hidden="true"/>
</span> </span>
)} )}
</> </>
@ -137,7 +146,8 @@ const VoiceParamConfig = ({
</div> </div>
<div> <div>
<div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div> <div
className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
<Listbox <Listbox
value={voiceItem} value={voiceItem}
disabled={!languageItem} disabled={!languageItem}
@ -148,8 +158,10 @@ const VoiceParamConfig = ({
}} }}
> >
<div className={'relative h-9'}> <div className={'relative h-9'}>
<Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}> <Listbox.Button
<span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span> className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
<span
className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
<span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2"> <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
<ChevronDownIcon <ChevronDownIcon
className="h-5 w-5 text-gray-400" className="h-5 w-5 text-gray-400"
@ -164,7 +176,8 @@ const VoiceParamConfig = ({
leaveTo="opacity-0" leaveTo="opacity-0"
> >
<Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm"> <Listbox.Options
className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
{voiceItems?.map((item: Item) => ( {voiceItems?.map((item: Item) => (
<Listbox.Option <Listbox.Option
key={item.value} key={item.value}
@ -178,13 +191,13 @@ const VoiceParamConfig = ({
{({ /* active, */ selected }) => ( {({ /* active, */ selected }) => (
<> <>
<span className={classNames('block', selected && 'font-normal')}>{item.name}</span> <span className={classNames('block', selected && 'font-normal')}>{item.name}</span>
{(selected || item.value === text2speech.voice) && ( {(selected || item.value === text2speech?.voice) && (
<span <span
className={classNames( className={classNames(
'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700', 'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
)} )}
> >
<CheckIcon className="h-5 w-5" aria-hidden="true" /> <CheckIcon className="h-5 w-5" aria-hidden="true"/>
</span> </span>
)} )}
</> </>
@ -196,6 +209,29 @@ const VoiceParamConfig = ({
</div> </div>
</Listbox> </Listbox>
</div> </div>
<div>
<div
className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.autoPlay')}</div>
<RadioGroup
className='space-x-3'
options={[
{
label: t('appDebug.voice.voiceSettings.autoPlayEnabled'),
value: TtsAutoPlay.enabled,
},
{
label: t('appDebug.voice.voiceSettings.autoPlayDisabled'),
value: TtsAutoPlay.disabled,
},
]}
value={text2speech?.autoPlay ? text2speech?.autoPlay : TtsAutoPlay.disabled}
onChange={(value: TtsAutoPlay) => {
handleChange({
autoPlay: value,
})
}}
/>
</div>
</div> </div>
</div> </div>
</div> </div>

View File

@ -1,4 +1,4 @@
import type { TransferMethod } from '@/types/app' import type { TransferMethod, TtsAutoPlay } from '@/types/app'
export type EnabledOrDisabled = { export type EnabledOrDisabled = {
enabled?: boolean enabled?: boolean
@ -14,6 +14,7 @@ export type SuggestedQuestionsAfterAnswer = EnabledOrDisabled
export type TextToSpeech = EnabledOrDisabled & { export type TextToSpeech = EnabledOrDisabled & {
language?: string language?: string
voice?: string voice?: string
autoPlay?: TtsAutoPlay
} }
export type SpeechToText = EnabledOrDisabled export type SpeechToText = EnabledOrDisabled
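Both feature types gain an optional autoPlay field typed as TtsAutoPlay; its declaration is not part of this excerpt, but the configuration panels compare it against TtsAutoPlay.enabled and TtsAutoPlay.disabled, which implies a string enum roughly like:

// Assumed shape of the enum referenced by TextToSpeech.autoPlay (not the verbatim declaration)
export enum TtsAutoPlay {
  enabled = 'enabled',
  disabled = 'disabled',
}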

View File

@ -4,6 +4,8 @@ import {
useStoreApi, useStoreApi,
} from 'reactflow' } from 'reactflow'
import produce from 'immer' import produce from 'immer'
import { v4 as uuidV4 } from 'uuid'
import { usePathname } from 'next/navigation'
import { useWorkflowStore } from '../store' import { useWorkflowStore } from '../store'
import { useNodesSyncDraft } from '../hooks' import { useNodesSyncDraft } from '../hooks'
import { import {
@ -19,6 +21,7 @@ import {
stopWorkflowRun, stopWorkflowRun,
} from '@/service/workflow' } from '@/service/workflow'
import { useFeaturesStore } from '@/app/components/base/features/hooks' import { useFeaturesStore } from '@/app/components/base/features/hooks'
import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'
export const useWorkflowRun = () => { export const useWorkflowRun = () => {
const store = useStoreApi() const store = useStoreApi()
@ -27,6 +30,7 @@ export const useWorkflowRun = () => {
const featuresStore = useFeaturesStore() const featuresStore = useFeaturesStore()
const { doSyncWorkflowDraft } = useNodesSyncDraft() const { doSyncWorkflowDraft } = useNodesSyncDraft()
const { handleUpdateWorkflowCanvas } = useWorkflowUpdate() const { handleUpdateWorkflowCanvas } = useWorkflowUpdate()
const pathname = usePathname()
const handleBackupDraft = useCallback(() => { const handleBackupDraft = useCallback(() => {
const { const {
@ -134,6 +138,20 @@ export const useWorkflowRun = () => {
let isInIteration = false let isInIteration = false
let iterationLength = 0 let iterationLength = 0
let ttsUrl = ''
let ttsIsPublic = false
if (params.token) {
ttsUrl = '/text-to-audio'
ttsIsPublic = true
}
else if (params.appId) {
if (pathname.search('explore/installed') > -1)
ttsUrl = `/installed-apps/${params.appId}/text-to-audio`
else
ttsUrl = `/apps/${params.appId}/text-to-audio`
}
const player = AudioPlayerManager.getInstance().getAudioPlayer(ttsUrl, ttsIsPublic, uuidV4(), 'none', 'none', (_: any): any => {})
ssePost( ssePost(
url, url,
{ {
@ -468,6 +486,15 @@ export const useWorkflowRun = () => {
draft.resultText = text draft.resultText = text
})) }))
}, },
onTTSChunk: (messageId: string, audio: string, audioType?: string) => {
if (!audio || audio === '')
return
player.playAudioWithAudio(audio, true)
AudioPlayerManager.getInstance().resetMsgId(messageId)
},
onTTSEnd: (messageId: string, audio: string, audioType?: string) => {
player.playAudioWithAudio(audio, false)
},
...restCallback, ...restCallback,
}, },
) )

View File

@ -323,6 +323,9 @@ const translation = {
language: 'Language', language: 'Language',
resolutionTooltip: 'Text-to-speech voice support language。', resolutionTooltip: 'Text-to-speech voice support language。',
voice: 'Voice', voice: 'Voice',
autoPlay: 'Auto Play',
autoPlayEnabled: 'Turn On',
autoPlayDisabled: 'Turn Off',
}, },
}, },
openingStatement: { openingStatement: {

View File

@ -319,6 +319,9 @@ const translation = {
language: '言語', language: '言語',
resolutionTooltip: 'テキスト読み上げの音声言語をサポートします。', resolutionTooltip: 'テキスト読み上げの音声言語をサポートします。',
voice: '音声', voice: '音声',
autoPlay: '自動再生',
autoPlayEnabled: 'オン',
autoPlayDisabled: 'オフ',
}, },
}, },
openingStatement: { openingStatement: {

View File

@ -319,6 +319,9 @@ const translation = {
language: '语言', language: '语言',
resolutionTooltip: '文本转语音音色支持语言。', resolutionTooltip: '文本转语音音色支持语言。',
voice: '音色', voice: '音色',
autoPlay: '自动播放',
autoPlayEnabled: '开启',
autoPlayDisabled: '关闭',
}, },
}, },
openingStatement: { openingStatement: {

View File

@ -318,6 +318,9 @@ const translation = {
language: '語言', language: '語言',
resolutionTooltip: '文字轉語音音色支援語言。', resolutionTooltip: '文字轉語音音色支援語言。',
voice: '音色', voice: '音色',
autoPlay: '自動播放',
autoPlayEnabled: '開啟',
autoPlayDisabled: '關閉',
}, },
}, },
openingStatement: { openingStatement: {

View File

@ -1,4 +1,4 @@
import type { AgentStrategy, ModelModeType, RETRIEVE_TYPE, ToolItem } from '@/types/app' import type { AgentStrategy, ModelModeType, RETRIEVE_TYPE, ToolItem, TtsAutoPlay } from '@/types/app'
export type Inputs = Record<string, string | number | object> export type Inputs = Record<string, string | number | object>
export enum PromptMode { export enum PromptMode {
@ -79,6 +79,7 @@ export type TextToSpeechConfig = {
enabled: boolean enabled: boolean
voice?: string voice?: string
language?: string language?: string
autoPlay?: TtsAutoPlay
} }
export type CitationConfig = MoreLikeThisConfig export type CitationConfig = MoreLikeThisConfig

View File

@ -34,6 +34,7 @@ const nextConfig = {
// https://nextjs.org/docs/api-reference/next.config.js/ignoring-typescript-errors // https://nextjs.org/docs/api-reference/next.config.js/ignoring-typescript-errors
ignoreBuildErrors: true, ignoreBuildErrors: true,
}, },
reactStrictMode: true,
async redirects() { async redirects() {
return [ return [
{ {

View File

@@ -120,6 +120,7 @@ export const generationIntroduction: Fetcher<GenerationIntroductionResponse, { u
 }
 export const fetchAppVoices: Fetcher<AppVoicesListResponse, { appId: string; language?: string }> = ({ appId, language }) => {
+  language = language || 'en-US'
   return get<AppVoicesListResponse>(`apps/${appId}/text-to-audio/voices?language=${language}`)
 }
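The fallback keeps language optional for callers. A typical direct call; the service module path is assumed:

import { fetchAppVoices } from '@/service/apps' // module path assumed

async function loadVoices(appId: string) {
  // No language argument, so the request falls back to en-US voices.
  return fetchAppVoices({ appId })
}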
View File
@@ -19,6 +19,7 @@ const TIME_OUT = 100000
 const ContentType = {
   json: 'application/json',
   stream: 'text/event-stream',
+  audio: 'audio/mpeg',
   form: 'application/x-www-form-urlencoded; charset=UTF-8',
   download: 'application/octet-stream', // for download
   upload: 'multipart/form-data', // for upload
@@ -59,6 +60,8 @@ export type IOnIterationStarted = (workflowStarted: IterationStartedResponse) =>
 export type IOnIterationNexted = (workflowStarted: IterationNextedResponse) => void
 export type IOnIterationFinished = (workflowFinished: IterationFinishedResponse) => void
 export type IOnTextChunk = (textChunk: TextChunkResponse) => void
+export type IOnTTSChunk = (messageId: string, audioStr: string, audioType?: string) => void
+export type IOnTTSEnd = (messageId: string, audioStr: string, audioType?: string) => void
 export type IOnTextReplace = (textReplace: TextReplaceResponse) => void
 export type IOtherOptions = {
@@ -84,6 +87,8 @@ export type IOtherOptions = {
   onIterationNext?: IOnIterationNexted
   onIterationFinish?: IOnIterationFinished
   onTextChunk?: IOnTextChunk
+  onTTSChunk?: IOnTTSChunk
+  onTTSEnd?: IOnTTSEnd
   onTextReplace?: IOnTextReplace
 }
@@ -135,6 +140,8 @@
   onIterationNext?: IOnIterationNexted,
   onIterationFinish?: IOnIterationFinished,
   onTextChunk?: IOnTextChunk,
+  onTTSChunk?: IOnTTSChunk,
+  onTTSEnd?: IOnTTSEnd,
   onTextReplace?: IOnTextReplace,
 ) => {
   if (!response.ok)
@@ -227,6 +234,12 @@
         else if (bufferObj.event === 'text_replace') {
           onTextReplace?.(bufferObj as TextReplaceResponse)
         }
+        else if (bufferObj.event === 'tts_message') {
+          onTTSChunk?.(bufferObj.message_id, bufferObj.audio, bufferObj.audio_type)
+        }
+        else if (bufferObj.event === 'tts_message_end') {
+          onTTSEnd?.(bufferObj.message_id, bufferObj.audio)
+        }
       }
     })
     buffer = lines[lines.length - 1]
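For reference, the fields read in these two branches imply a payload of roughly the following shape; the server may send more fields than the handler consumes:

// Shape inferred from the handler above, not a complete server-side schema.
type TtsSseEvent = {
  event: 'tts_message' | 'tts_message_end'
  message_id: string
  audio: string // base64-encoded audio chunk, typically empty on tts_message_end
  audio_type?: string // e.g. 'audio/mpeg'
}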
@@ -390,9 +403,10 @@
         }
         // return data
-        const data: Promise<T> = options.headers.get('Content-type') === ContentType.download ? res.blob() : res.json()
-        resolve(needAllResponseContent ? resClone : data)
+        if (options.headers.get('Content-type') === ContentType.download || options.headers.get('Content-type') === ContentType.audio)
+          resolve(needAllResponseContent ? resClone : res.blob())
+        else resolve(needAllResponseContent ? resClone : res.json())
       })
       .catch((err) => {
         if (!silent)
@@ -475,6 +489,8 @@ export const ssePost = (
   onIterationNext,
   onIterationFinish,
   onTextChunk,
+  onTTSChunk,
+  onTTSEnd,
   onTextReplace,
   onError,
   getAbortController,
@@ -527,7 +543,7 @@
           return
         }
         onData?.(str, isFirstMessage, moreInfo)
-      }, onCompleted, onThought, onMessageEnd, onMessageReplace, onFile, onWorkflowStarted, onWorkflowFinished, onNodeStarted, onNodeFinished, onIterationStart, onIterationNext, onIterationFinish, onTextChunk, onTextReplace)
+      }, onCompleted, onThought, onMessageEnd, onMessageReplace, onFile, onWorkflowStarted, onWorkflowFinished, onNodeStarted, onNodeFinished, onIterationStart, onIterationNext, onIterationFinish, onTextChunk, onTTSChunk, onTTSEnd, onTextReplace)
     }).catch((e) => {
       if (e.toString() !== 'AbortError: The user aborted a request.')
         Toast.notify({ type: 'error', message: e })
View File
@@ -1,4 +1,4 @@
-import type { IOnCompleted, IOnData, IOnError, IOnFile, IOnIterationFinished, IOnIterationNexted, IOnIterationStarted, IOnMessageEnd, IOnMessageReplace, IOnNodeFinished, IOnNodeStarted, IOnTextChunk, IOnTextReplace, IOnThought, IOnWorkflowFinished, IOnWorkflowStarted } from './base'
+import type { IOnCompleted, IOnData, IOnError, IOnFile, IOnIterationFinished, IOnIterationNexted, IOnIterationStarted, IOnMessageEnd, IOnMessageReplace, IOnNodeFinished, IOnNodeStarted, IOnTTSChunk, IOnTTSEnd, IOnTextChunk, IOnTextReplace, IOnThought, IOnWorkflowFinished, IOnWorkflowStarted } from './base'
 import {
   del as consoleDel, get as consoleGet, patch as consolePatch, post as consolePost,
   delPublic as del, getPublic as get, patchPublic as patch, postPublic as post, ssePost,
@@ -30,7 +30,7 @@ export function getUrl(url: string, isInstalledApp: boolean, installedAppId: str
   return isInstalledApp ? `installed-apps/${installedAppId}/${url.startsWith('/') ? url.slice(1) : url}` : url
 }
-export const sendChatMessage = async (body: Record<string, any>, { onData, onCompleted, onThought, onFile, onError, getAbortController, onMessageEnd, onMessageReplace }: {
+export const sendChatMessage = async (body: Record<string, any>, { onData, onCompleted, onThought, onFile, onError, getAbortController, onMessageEnd, onMessageReplace, onTTSChunk, onTTSEnd }: {
   onData: IOnData
   onCompleted: IOnCompleted
   onFile: IOnFile
@@ -39,13 +39,15 @@ export const sendChatMessage = async (body: Record<string, any>, { onData, onCom
   onMessageEnd?: IOnMessageEnd
   onMessageReplace?: IOnMessageReplace
   getAbortController?: (abortController: AbortController) => void
+  onTTSChunk?: IOnTTSChunk
+  onTTSEnd?: IOnTTSEnd
 }, isInstalledApp: boolean, installedAppId = '') => {
   return ssePost(getUrl('chat-messages', isInstalledApp, installedAppId), {
     body: {
       ...body,
       response_mode: 'streaming',
     },
-  }, { onData, onCompleted, onThought, onFile, isPublicAPI: !isInstalledApp, onError, getAbortController, onMessageEnd, onMessageReplace })
+  }, { onData, onCompleted, onThought, onFile, isPublicAPI: !isInstalledApp, onError, getAbortController, onMessageEnd, onMessageReplace, onTTSChunk, onTTSEnd })
 }
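Chat callers can now forward the TTS events next to the usual callbacks. A sketch; player stands in for the AudioPlayerManager-backed player used in the workflow hook above, and the body fields are illustrative:

declare const player: { playAudioWithAudio: (audio: string, play: boolean) => void }

async function ask(query: string) {
  await sendChatMessage({ query, inputs: {}, conversation_id: '' }, {
    onData: () => {},
    onCompleted: () => {},
    onThought: () => {},
    onFile: () => {},
    onError: () => {},
    // Stream each audio chunk into the player; the end event flushes it.
    onTTSChunk: (_messageId, audio) => player.playAudioWithAudio(audio, true),
    onTTSEnd: (_messageId, audio) => player.playAudioWithAudio(audio, false),
  }, false)
}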
 export const stopChatMessageResponding = async (appId: string, taskId: string, isInstalledApp: boolean, installedAppId = '') => {
@@ -214,6 +216,10 @@ export const textToAudio = (url: string, isPublicAPI: boolean, body: FormData) =
   return (getAction('post', !isPublicAPI))(url, { body }, { bodyStringify: false, deleteContentType: true }) as Promise<{ data: string }>
 }
+export const textToAudioStream = (url: string, isPublicAPI: boolean, header: { content_type: string }, body: { streaming: boolean; voice?: string; message_id?: string; text?: string | null | undefined }) => {
+  return (getAction('post', !isPublicAPI))(url, { body, header }, { needAllResponseContent: true })
+}
+
 export const fetchAccessToken = async (appCode: string) => {
   const headers = new Headers()
   headers.append('X-App-Code', appCode)
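Because textToAudioStream sets needAllResponseContent, the caller receives the raw Response and can read the synthesized speech as a Blob. A hypothetical non-streaming call; the URL shape, voice and text are placeholders:

async function speak(appId: string, text: string) {
  const res = await textToAudioStream(
    `apps/${appId}/text-to-audio`,
    false,
    { content_type: 'audio/mpeg' },
    { streaming: false, text, voice: 'sample-voice' },
  )
  const blob = await (res as Response).blob() // raw Response because needAllResponseContent is set
  new Audio(URL.createObjectURL(blob)).play()
}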
View File
@@ -160,6 +160,7 @@ export type ModelConfig = {
     enabled: boolean
     voice?: string
     language?: string
+    autoPlay?: TtsAutoPlay
   }
   retriever_resource: {
     enabled: boolean
@@ -349,6 +350,11 @@ export enum TransferMethod {
   remote_url = 'remote_url',
 }
+export enum TtsAutoPlay {
+  enabled = 'enabled',
+  disabled = 'disabled',
+}
+
 export const ALLOW_FILE_EXTENSIONS = ['png', 'jpg', 'jpeg', 'webp', 'gif']
 export type VisionSettings = {
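Storing the switch as a two-value enum keeps the persisted setting explicit rather than a bare boolean. A consumer could gate playback on it as below; the text_to_speech property name is inferred from context, since the hunk above only shows its inner fields, so treat it as an assumption:

import { TtsAutoPlay } from '@/types/app'
import type { ModelConfig } from '@/types/app'

// True only when TTS is enabled and auto-play is switched on.
const shouldAutoPlay = (config: ModelConfig): boolean =>
  Boolean(config.text_to_speech?.enabled && config.text_to_speech?.autoPlay === TtsAutoPlay.enabled)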