diff --git a/api/core/model_runtime/entities/message_entities.py b/api/core/model_runtime/entities/message_entities.py
index fc37227bc9..d4d56a42a4 100644
--- a/api/core/model_runtime/entities/message_entities.py
+++ b/api/core/model_runtime/entities/message_entities.py
@@ -58,6 +58,7 @@ class PromptMessageContentType(Enum):
     IMAGE = "image"
     AUDIO = "audio"
     VIDEO = "video"
+    DOCUMENT = "document"
 
 
 class PromptMessageContent(BaseModel):
diff --git a/api/core/model_runtime/entities/model_entities.py b/api/core/model_runtime/entities/model_entities.py
index 52ea787c3a..4e1ce17533 100644
--- a/api/core/model_runtime/entities/model_entities.py
+++ b/api/core/model_runtime/entities/model_entities.py
@@ -87,6 +87,9 @@ class ModelFeature(Enum):
     AGENT_THOUGHT = "agent-thought"
     VISION = "vision"
     STREAM_TOOL_CALL = "stream-tool-call"
+    DOCUMENT = "document"
+    VIDEO = "video"
+    AUDIO = "audio"
 
 
 class DefaultParameterName(str, Enum):
diff --git a/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml b/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml
index 256e87edbe..5a14bfc47f 100644
--- a/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml
+++ b/api/core/model_runtime/model_providers/openai/llm/gpt-4o-audio-preview.yaml
@@ -8,6 +8,7 @@ features:
   - agent-thought
   - stream-tool-call
   - vision
+  - audio
 model_properties:
   mode: chat
   context_size: 128000
diff --git a/api/core/workflow/nodes/llm/node.py b/api/core/workflow/nodes/llm/node.py
index f0b8830eb5..a5620dbc01 100644
--- a/api/core/workflow/nodes/llm/node.py
+++ b/api/core/workflow/nodes/llm/node.py
@@ -193,6 +193,17 @@ class LLMNode(BaseNode[LLMNodeData]):
                 )
             )
             return
+        except Exception as e:
+            logger.exception(f"Node {self.node_id} failed to run: {e}")
+            yield RunCompletedEvent(
+                run_result=NodeRunResult(
+                    status=WorkflowNodeExecutionStatus.FAILED,
+                    error=str(e),
+                    inputs=node_inputs,
+                    process_data=process_data,
+                )
+            )
+            return
 
         outputs = {"text": result_text, "usage": jsonable_encoder(usage), "finish_reason": finish_reason}
 
@@ -607,11 +618,31 @@ class LLMNode(BaseNode[LLMNodeData]):
             if isinstance(prompt_message.content, list):
                 prompt_message_content = []
                 for content_item in prompt_message.content:
-                    # Skip image if vision is disabled or model doesn't support vision
-                    if content_item.type == PromptMessageContentType.IMAGE and (
-                        not vision_enabled
-                        or not model_config.model_schema.features
-                        or ModelFeature.VISION not in model_config.model_schema.features
+                    # Skip content if features are not defined
+                    if not model_config.model_schema.features:
+                        if content_item.type != PromptMessageContentType.TEXT:
+                            continue
+                        prompt_message_content.append(content_item)
+                        continue
+
+                    # Skip content if corresponding feature is not supported
+                    if (
+                        (
+                            content_item.type == PromptMessageContentType.IMAGE
+                            and (not vision_enabled or ModelFeature.VISION not in model_config.model_schema.features)
+                        )
+                        or (
+                            content_item.type == PromptMessageContentType.DOCUMENT
+                            and ModelFeature.DOCUMENT not in model_config.model_schema.features
+                        )
+                        or (
+                            content_item.type == PromptMessageContentType.VIDEO
+                            and ModelFeature.VIDEO not in model_config.model_schema.features
+                        )
+                        or (
+                            content_item.type == PromptMessageContentType.AUDIO
+                            and ModelFeature.AUDIO not in model_config.model_schema.features
+                        )
                     ):
                         continue
                     prompt_message_content.append(content_item)
@@ -854,22 +885,22 @@ class LLMNode(BaseNode[LLMNodeData]):
             )
 
             # Process segments for images
-            image_contents = []
+            file_contents = []
             for segment in segment_group.value:
                 if isinstance(segment, ArrayFileSegment):
                     for file in segment.value:
-                        if file.type == FileType.IMAGE:
-                            image_content = file_manager.to_prompt_message_content(
+                        if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
+                            file_content = file_manager.to_prompt_message_content(
                                 file, image_detail_config=self.node_data.vision.configs.detail
                             )
-                            image_contents.append(image_content)
+                            file_contents.append(file_content)
                 if isinstance(segment, FileSegment):
                     file = segment.value
-                    if file.type == FileType.IMAGE:
-                        image_content = file_manager.to_prompt_message_content(
+                    if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
+                        file_content = file_manager.to_prompt_message_content(
                             file, image_detail_config=self.node_data.vision.configs.detail
                         )
-                        image_contents.append(image_content)
+                        file_contents.append(file_content)
 
             # Create message with text from all segments
             plain_text = segment_group.text
@@ -877,9 +908,9 @@
                 prompt_message = _combine_text_message_with_role(text=plain_text, role=message.role)
                 prompt_messages.append(prompt_message)
 
-            if image_contents:
+            if file_contents:
                 # Create message with image contents
-                prompt_message = UserPromptMessage(content=image_contents)
+                prompt_message = UserPromptMessage(content=file_contents)
                 prompt_messages.append(prompt_message)
 
         return prompt_messages
diff --git a/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py b/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
index da21710832..6ec219aa8d 100644
--- a/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
+++ b/api/tests/unit_tests/core/workflow/nodes/llm/test_node.py
@@ -423,6 +423,32 @@ def test_fetch_prompt_messages__basic(faker, llm_node, model_config):
                 )
             },
         ),
+        LLMNodeTestScenario(
+            description="Prompt template with variable selector of File with video file and vision feature",
+            user_query=fake_query,
+            user_files=[],
+            vision_enabled=True,
+            vision_detail=fake_vision_detail,
+            features=[ModelFeature.VISION],
+            window_size=fake_window_size,
+            prompt_template=[
+                LLMNodeChatModelMessage(
+                    text="{{#input.image#}}",
+                    role=PromptMessageRole.USER,
+                    edition_type="basic",
+                ),
+            ],
+            expected_messages=mock_history[fake_window_size * -2 :] + [UserPromptMessage(content=fake_query)],
+            file_variables={
+                "input.image": File(
+                    tenant_id="test",
+                    type=FileType.VIDEO,
+                    filename="test1.jpg",
+                    transfer_method=FileTransferMethod.REMOTE_URL,
+                    remote_url=fake_remote_url,
+                )
+            },
+        ),
     ]
 
     for scenario in test_scenarios: