Hello,
I am using a combination of agents that need to work together to build a state machine representing a system:
def vision_agent(self) -> Agent:
    """Create the agent responsible for reading the HMI screen.

    Every call constructs a new ``Agent`` (and a new ``MilvusMemory``)
    wired to ``self.llm`` and to the two read-only scan tools,
    ``get_icons_tool`` and ``get_text_elements_tool``.

    Returns:
        Agent: the freshly configured "HMI Analyser" agent.
    """
    # Collected as keywords first so the configuration reads as one table;
    # the values are evaluated in the same order as a direct call would.
    settings = dict(
        role="HMI Analyser",
        goal="Command a robot to observe and interact with HMI in a strict sequence",
        backstory="Specialized in procedural HMI analysis with strict operation ordering",
        llm=self.llm,
        tools=[get_icons_tool, get_text_elements_tool],
        memory=MilvusMemory(),
        verbose=True,
    )
    return Agent(**settings)
def control_agent(self) -> Agent:
    """Create the agent that drives the robot through the HMI.

    Every call constructs a new ``Agent`` (and a new ``MilvusMemory``)
    wired to ``self.llm`` and to the three actuation tools: ``click``,
    ``double_click`` and ``swipe``.

    Returns:
        Agent: the freshly configured tester/controller agent.
    """
    tester = Agent(
        role="You are a Senior HMI Tester who Excels at taking decisions to explore HMI System Functionnalities",
        goal="Your Goal is to Command a Robot through tools to explore the HMI system you are testing",
        backstory="Specialized in procedural HMI analysis with strict operation ordering",
        llm=self.llm,
        tools=[click, double_click, swipe],
        memory=MilvusMemory(),
        verbose=True,
    )
    return tester
And these are the tasks:
def scan_interface_task(self) -> Task:
    """Create the task that snapshots the current HMI screen.

    The prompt instructs the agent to call ``get_icons_tool`` and
    ``get_text_elements_tool`` and to return a single JSON object with the
    keys ``id``, ``Current_Icons``, ``Current_text``, ``ui_state`` and
    ``text_state``. The task is given ``ScanResult`` as ``output_json`` and
    ``outputs/scan_output.json`` as ``output_file``, and runs on a new agent
    obtained from ``self.vision_agent()``.

    Returns:
        Task: the configured scan task.
    """
    # Prompt text is kept flush-left so the string handed to the framework
    # carries no extra leading whitespace.
    scan_prompt = """
<description>
<TOOLS_DESCRIPTIONS>
<tool>
<name>get_icons_tool</name>
<description>Triggered when we need to get the icons currently displayed in the HMI system. This tool does not require parameters.</description>
</tool>
<tool>
<name>get_text_elements_tool</name>
<description>Triggered when we need to get the text elements currently displayed in the HMI system. This tool does not require parameters.</description>
</tool>
</TOOLS_DESCRIPTIONS>
<RULES>
<rule>1. RETURN ONLY the completed JSON object. No extra explanation or output is allowed.</rule>
</RULES>
<INSTRUCTIONS>
<instruction> 0. Make sure to start with the <think> tag </instruction>
<instruction>1. Use the Tool: get_icons_tool to get the Current Icons. Fail the Task if the tool is not used.</instruction>
<instruction>2. Use the Tool: get_text_elements_tool to get the Current Text elements. Fail the Task if the tool is not used.</instruction>
<instruction>3. DO NOT hardcode or simulate results — always call the tools to fetch real-time data.</instruction>
<instruction>4. Populate the results into a JSON object with the following structure:
<json_structure>
{
"id": "",
"Current_Icons": [/* list of icon names */],
"Current_text": [/* list of text elements */],
"ui_state": {
/* for each icon: { "interactable": true/false, "interaction_type": ["click/swipe/double_click"], "position": [] } */
},
"text_state": {
/* for each text element: { "interactable": true/false, "interaction_type": ["click/swipe/double_click"], "position": [] } */
}
}
</json_structure>
</instruction>
<instruction>5. The JSON keys and structure are fixed and must be followed exactly.</instruction>
<instruction>6. Interaction metadata (interactable, interaction_type, position) should be initialized to default values as shown in the example.</instruction>
<instruction>7. If either tool fails or returns no data, fail the task accordingly.</instruction>
</INSTRUCTIONS>
</description>
"""
    # NOTE(review): this is a plain string, so "{node_id}" and
    # "{use_tool_to_get_it}" reach the framework as literal text; presumably
    # they are meant to be filled from run inputs - confirm.
    output_spec = """
<expected_output>
<description>A JSON object representing the current UI state with detected icons and text.</description>
<example_format>
{
"id": "{node_id}",
"Current_Icons": [{use_tool_to_get_it}],
"Current_text": [{use_tool_to_get_it}],
"ui_state": {
/* for each icon: { "interactable": true, "interaction_type": ["click / swipe/ doubleclick"], "position": [] } */
},
"text_state": {
/* for each text element: { "interactable": true, "interaction_type": ["click / swipe/ doubleclick"], "position": [] } */
}
}
</example_format>
</expected_output>
"""
    return Task(
        description=scan_prompt,
        expected_output=output_spec,
        output_file="outputs/scan_output.json",
        output_json=ScanResult,
        agent=self.vision_agent(),
        tools=[get_icons_tool, get_text_elements_tool],
    )
def execute_action_task(self, context) -> Task:
    """Create the task that picks and performs one HMI interaction.

    The prompt lists the three actuation tools (``click``, ``double_click``,
    ``swipe``), tells the agent to analyse the supplied context, call one of
    the tools, and then emit a JSON transition node that follows
    ``JSON_TEMPLATE``.

    Args:
        context: The upstream task whose output this task consumes; it is
            forwarded to ``Task`` as the single element of ``context``.

    Returns:
        Task: the configured action task, run by a new agent obtained from
        ``self.control_agent()``.
    """
    # Prompt text is kept flush-left so the string handed to the framework
    # carries no extra leading whitespace.
    tools_description = """
<TOOLS_DESCRIPTION>
<tool_requirement>
<rule>You MUST use one of these tools for every action. Never bypass them.</rule>
<tool>
<name>click</name>
<description>Single press on a UI element. Parameters: {"element_id": "string"}</description>
<purpose>Select buttons/icons/text.</purpose>
</tool>
<tool>
<name>double_click</name>
<description>Two rapid presses. Parameters: {"element_id": "string", "interval_ms": 300}</description>
<purpose>Zoom/shortcuts/advanced menus.</purpose>
</tool>
<tool>
<name>swipe</name>
<description>Directional movement. Parameters: {"start_x": int, "start_y": int, "end_x": int, "end_y": int}</description>
<purpose>Scroll/swipe between screens.</purpose>
</tool>
</tool_requirement>
</TOOLS_DESCRIPTION>
"""
    return Task(
        # f-string: {tools_description} and {JSON_TEMPLATE} are substituted
        # here, before the framework ever sees the text.
        description=f"""
<task_description>
{tools_description}
<INSTRUCTIONS>
<instruction> 0. Make sure to start with the <think> tag </instruction>
<instruction>1. Carefully analyze the provided context including:
<subpoint>- Previous interface state</subpoint>
</instruction>
<instruction>2. Examine the current scanned interface data (icons, text elements)</instruction>
<instruction>3. Determine the most appropriate tool (click, double_click, or swipe) based on:
<subpoint>- The current interface state</subpoint>
<subpoint>- The historical context</subpoint>
</instruction>
<instruction>4. Make sure to Call the Tool Api provided to Command the Robot otherwise fail the Task</instruction>
<instruction>5. Construct a JSON transition node using the Template Provided:
<subpoint>- DO NOT COPY THE TEMPLATE AS IS</subpoint>
<subpoint>- MAKE SURE TO REPLACE ALL FIELDS WITH ACTUAL DATA</subpoint>
</instruction>
<RULES>
<rule>- Must thoroughly analyze context before selecting action</rule>
<rule>- Must wait for action execution to finish before post-action scan</rule>
<rule>- Final output must conform exactly to this schema:
<schema>{JSON_TEMPLATE}</schema>
</rule>
</RULES>
</INSTRUCTIONS>
</task_description>
""",
        expected_output="<expected_output>A structured JSON representation of the HMI state transition with context analysis.</expected_output>",
        output_file="outputs/output.txt",
        # Use the same structured-output keyword as scan_interface_task
        # (`output_json`). The previous `output_json_schema` keyword is not a
        # Task field, so GraphModel was never applied to the final answer.
        output_json=GraphModel,
        agent=self.control_agent(),
        tools=[click, double_click, swipe],
        # NOTE(review): max_retries appears to count guardrail retries only,
        # and no guardrail is configured on this task - confirm whether a
        # guardrail (or a Flow step that calls the tool directly) is intended.
        max_retries=5,
        context=[context],
    )
The problem is that I can’t guarantee 100% tool use on every iteration. Building the graph can take a very long time and many iterations, so a single missed tool call can mess things up. Should I use a Flow instead, or should I fix my prompts?
Your insights are appreciated.