feat: add support for document, video, and audio content

Expanded the system to handle document types across different modules and introduced video and audio content handling in model features. Adjusted the prompt message logic to conditionally process content based on available features, enhancing flexibility in media processing. Added comprehensive error handling in `LLMNode` for better runtime resilience. Updated YAML configuration and unit tests to reflect these changes.
This commit is contained in:
-LAN- 2024-11-14 21:22:19 +08:00
parent fb94d0b7cf
commit 94794d892e
5 changed files with 76 additions and 14 deletions

View File

@ -58,6 +58,7 @@ class PromptMessageContentType(Enum):
IMAGE = "image"
AUDIO = "audio"
VIDEO = "video"
DOCUMENT = "document"
class PromptMessageContent(BaseModel):

View File

@ -87,6 +87,9 @@ class ModelFeature(Enum):
AGENT_THOUGHT = "agent-thought"
VISION = "vision"
STREAM_TOOL_CALL = "stream-tool-call"
DOCUMENT = "document"
VIDEO = "video"
AUDIO = "audio"
class DefaultParameterName(str, Enum):

View File

@ -8,6 +8,7 @@ features:
- agent-thought
- stream-tool-call
- vision
- audio
model_properties:
mode: chat
context_size: 128000

View File

@ -193,6 +193,17 @@ class LLMNode(BaseNode[LLMNodeData]):
)
)
return
except Exception as e:
logger.exception(f"Node {self.node_id} failed to run: {e}")
yield RunCompletedEvent(
run_result=NodeRunResult(
status=WorkflowNodeExecutionStatus.FAILED,
error=str(e),
inputs=node_inputs,
process_data=process_data,
)
)
return
outputs = {"text": result_text, "usage": jsonable_encoder(usage), "finish_reason": finish_reason}
@ -607,11 +618,31 @@ class LLMNode(BaseNode[LLMNodeData]):
if isinstance(prompt_message.content, list):
prompt_message_content = []
for content_item in prompt_message.content:
# Skip image if vision is disabled or model doesn't support vision
if content_item.type == PromptMessageContentType.IMAGE and (
not vision_enabled
or not model_config.model_schema.features
or ModelFeature.VISION not in model_config.model_schema.features
# Skip content if features are not defined
if not model_config.model_schema.features:
if content_item.type != PromptMessageContentType.TEXT:
continue
prompt_message_content.append(content_item)
continue
# Skip content if corresponding feature is not supported
if (
(
content_item.type == PromptMessageContentType.IMAGE
and (not vision_enabled or ModelFeature.VISION not in model_config.model_schema.features)
)
or (
content_item.type == PromptMessageContentType.DOCUMENT
and ModelFeature.DOCUMENT not in model_config.model_schema.features
)
or (
content_item.type == PromptMessageContentType.VIDEO
and ModelFeature.VIDEO not in model_config.model_schema.features
)
or (
content_item.type == PromptMessageContentType.AUDIO
and ModelFeature.AUDIO not in model_config.model_schema.features
)
):
continue
prompt_message_content.append(content_item)
@ -854,22 +885,22 @@ class LLMNode(BaseNode[LLMNodeData]):
)
# Process segments for images
image_contents = []
file_contents = []
for segment in segment_group.value:
if isinstance(segment, ArrayFileSegment):
for file in segment.value:
if file.type == FileType.IMAGE:
image_content = file_manager.to_prompt_message_content(
if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
file_content = file_manager.to_prompt_message_content(
file, image_detail_config=self.node_data.vision.configs.detail
)
image_contents.append(image_content)
file_contents.append(file_content)
if isinstance(segment, FileSegment):
file = segment.value
if file.type == FileType.IMAGE:
image_content = file_manager.to_prompt_message_content(
if file.type in {FileType.IMAGE, FileType.VIDEO, FileType.AUDIO}:
file_content = file_manager.to_prompt_message_content(
file, image_detail_config=self.node_data.vision.configs.detail
)
image_contents.append(image_content)
file_contents.append(file_content)
# Create message with text from all segments
plain_text = segment_group.text
@ -877,9 +908,9 @@ class LLMNode(BaseNode[LLMNodeData]):
prompt_message = _combine_text_message_with_role(text=plain_text, role=message.role)
prompt_messages.append(prompt_message)
if image_contents:
if file_contents:
# Create message with image contents
prompt_message = UserPromptMessage(content=image_contents)
prompt_message = UserPromptMessage(content=file_contents)
prompt_messages.append(prompt_message)
return prompt_messages

View File

@ -423,6 +423,32 @@ def test_fetch_prompt_messages__basic(faker, llm_node, model_config):
)
},
),
LLMNodeTestScenario(
description="Prompt template with variable selector of File with video file and vision feature",
user_query=fake_query,
user_files=[],
vision_enabled=True,
vision_detail=fake_vision_detail,
features=[ModelFeature.VISION],
window_size=fake_window_size,
prompt_template=[
LLMNodeChatModelMessage(
text="{{#input.image#}}",
role=PromptMessageRole.USER,
edition_type="basic",
),
],
expected_messages=mock_history[fake_window_size * -2 :] + [UserPromptMessage(content=fake_query)],
file_variables={
"input.image": File(
tenant_id="test",
type=FileType.VIDEO,
filename="test1.jpg",
transfer_method=FileTransferMethod.REMOTE_URL,
remote_url=fake_remote_url,
)
},
),
]
for scenario in test_scenarios: