Skip to content

Harness Engine

HarnessEngine

Central harness engine orchestrating safe code execution.

All agent operations pass through the HarnessEngine, which ensures: - Policy compliance via PolicyEvaluator - Code safety via static AST analysis - Isolated execution via SandboxRuntime - Full audit trail via AuditRecorder

Example

engine = HarnessEngine( policies=[CodePolicy(denied_modules=["os"]), DataPolicy(deny_ddl=True)], ) result = await engine.execute( intent_raw="Show order count", intent_action="read", intent_domain="order", generated_code="result = db.query('SELECT COUNT(*) FROM orders')", endpoint_name="orders", )

Source code in src/agenticapi/harness/engine.py
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
class HarnessEngine:
    """Central harness engine orchestrating safe code execution.

    All agent operations pass through the HarnessEngine, which ensures:
    - Policy compliance via PolicyEvaluator
    - Code safety via static AST analysis
    - Isolated execution via SandboxRuntime
    - Full audit trail via AuditRecorder

    Example:
        engine = HarnessEngine(
            policies=[CodePolicy(denied_modules=["os"]), DataPolicy(deny_ddl=True)],
        )
        result = await engine.execute(
            intent_raw="Show order count",
            intent_action="read",
            intent_domain="order",
            generated_code="result = db.query('SELECT COUNT(*) FROM orders')",
            endpoint_name="orders",
        )
    """

    def __init__(
        self,
        *,
        policies: list[Policy] | None = None,
        sandbox: ProcessSandbox | None = None,
        audit_recorder: AuditRecorder | None = None,
        approval_workflow: ApprovalWorkflow | None = None,
        monitors: list[ExecutionMonitor] | None = None,
        validators: list[ResultValidator] | None = None,
    ) -> None:
        """Initialize the harness engine.

        Args:
            policies: List of policies to enforce. If None, no policy checks.
            sandbox: Sandbox runtime for code execution. If None, a
                ProcessSandbox is created with default limits.
            audit_recorder: Recorder for audit traces. If None, a default
                in-memory recorder is created.
            approval_workflow: Optional approval workflow for human-in-the-loop
                control. If provided, operations matching approval rules will
                raise ApprovalRequired.
            monitors: Optional list of execution monitors to run after sandbox
                execution. Monitors check resource usage and output size.
            validators: Optional list of result validators to run after
                monitors. Validators check output correctness.
        """
        self._evaluator = PolicyEvaluator(policies=policies)
        self._sandbox = sandbox or ProcessSandbox()
        self._audit_recorder = audit_recorder or AuditRecorder()
        self._approval: ApprovalWorkflow | None = approval_workflow
        self._monitors: list[ExecutionMonitor] = monitors or []
        self._validators: list[ResultValidator] = validators or []

    @property
    def audit_recorder(self) -> AuditRecorder:
        """Access the audit recorder for retrieving traces."""
        return self._audit_recorder

    @property
    def evaluator(self) -> PolicyEvaluator:
        """Access the policy evaluator."""
        return self._evaluator

    async def execute(
        self,
        *,
        intent_raw: str,
        intent_action: str,
        intent_domain: str,
        generated_code: str,
        reasoning: str | None = None,
        endpoint_name: str = "",
        context: AgentContext | None = None,
        tools: ToolRegistry | None = None,
        sandbox_data: dict[str, object] | None = None,
    ) -> ExecutionResult:
        """Execute generated code through the full harness pipeline.

        Runs policy evaluation, static analysis, sandbox execution,
        and audit recording in sequence.

        Args:
            intent_raw: The original natural language request.
            intent_action: The classified action type.
            intent_domain: The domain of the request.
            generated_code: The Python code to evaluate and execute.
            reasoning: Optional LLM reasoning for the code.
            endpoint_name: Name of the agent endpoint.
            context: Optional agent execution context.
            tools: Optional tool registry for sandbox execution.

        Returns:
            ExecutionResult with the output and audit trace.

        Raises:
            PolicyViolation: If any policy denies the code or static
                analysis finds safety violations.
            CodeExecutionError: If sandbox execution fails.
            SandboxViolation: If sandbox detects a security violation.
        """
        trace_id = uuid.uuid4().hex
        start_time = time.monotonic()
        timestamp = datetime.now(tz=UTC)

        trace = ExecutionTrace(
            trace_id=trace_id,
            endpoint_name=endpoint_name,
            timestamp=timestamp,
            intent_raw=intent_raw,
            intent_action=intent_action,
            generated_code=generated_code,
            reasoning=reasoning,
        )

        logger.info(
            "harness_execute_start",
            trace_id=trace_id,
            endpoint_name=endpoint_name,
            intent_action=intent_action,
            code_lines=generated_code.count("\n") + 1,
        )

        from agenticapi.observability import (
            AgenticAPIAttributes,
            SpanNames,
            get_tracer,
        )

        tracer = get_tracer()

        try:
            # Step 1: Policy evaluation
            with tracer.start_as_current_span(SpanNames.POLICY_EVALUATE.value) as policy_span:
                evaluation = self._evaluator.evaluate(
                    code=generated_code,
                    intent_action=intent_action,
                    intent_domain=intent_domain,
                )
                policy_span.set_attribute(AgenticAPIAttributes.POLICY_ALLOWED.value, evaluation.allowed)
                if evaluation.violations:
                    policy_span.set_attribute(
                        AgenticAPIAttributes.POLICY_VIOLATIONS.value,
                        "; ".join(evaluation.violations)[:500],
                    )
            trace.policy_evaluations = [
                {
                    "policy_name": r.policy_name,
                    "allowed": r.allowed,
                    "violations": r.violations,
                    "warnings": r.warnings,
                }
                for r in evaluation.results
            ]

            # Step 2: Static analysis
            with tracer.start_as_current_span(SpanNames.STATIC_ANALYSIS.value) as static_span:
                denied_modules: list[str] = []
                for policy in self._evaluator.policies:
                    from agenticapi.harness.policy.code_policy import CodePolicy

                    if isinstance(policy, CodePolicy):
                        denied_modules = policy.denied_modules
                        break

                safety_result = check_code_safety(
                    generated_code,
                    denied_modules=denied_modules or None,
                    deny_eval_exec=True,
                    deny_dynamic_import=True,
                )

                if not safety_result.safe:
                    violation_descriptions = [v.description for v in safety_result.violations if v.severity == "error"]
                    violation_summary = "; ".join(violation_descriptions)
                    static_span.set_attribute(AgenticAPIAttributes.POLICY_VIOLATIONS.value, violation_summary[:500])
                    logger.warning(
                        "harness_static_analysis_failed",
                        trace_id=trace_id,
                        violations=violation_descriptions,
                    )
                    raise PolicyViolation(
                        policy="static_analysis",
                        violation=violation_summary,
                        generated_code=generated_code,
                    )

            # Step 3: Approval check
            if self._approval is not None:
                rule = self._approval.check_approval_required(
                    intent_action=intent_action,
                    intent_domain=intent_domain,
                )
                if rule is not None:
                    try:
                        with tracer.start_as_current_span(SpanNames.APPROVAL_WAIT.value) as approval_span:
                            approval_span.set_attribute(AgenticAPIAttributes.APPROVAL_REQUIRED.value, True)
                            await self._approval.create_request(
                                rule=rule,
                                trace_id=trace_id,
                                intent_raw=intent_raw,
                                intent_action=intent_action,
                                intent_domain=intent_domain,
                                generated_code=generated_code,
                            )
                    except ApprovalRequired as exc:
                        trace.approval_request_id = exc.request_id
                        raise

            # Step 4: Sandbox execution (with pre-fetched data injected)
            sandbox_result: SandboxResult | None = None
            with tracer.start_as_current_span(SpanNames.SANDBOX_EXECUTE.value) as sandbox_span:
                sandbox_span.set_attribute(
                    AgenticAPIAttributes.SANDBOX_BACKEND.value,
                    type(self._sandbox).__name__,
                )
                sandbox_start = time.monotonic()
                async with self._sandbox as sandbox:
                    sandbox_result = await sandbox.execute(
                        code=generated_code,
                        tools=tools,
                        resource_limits=ResourceLimits(),
                        sandbox_data=sandbox_data,
                    )
                sandbox_span.set_attribute(
                    AgenticAPIAttributes.SANDBOX_DURATION_MS.value,
                    (time.monotonic() - sandbox_start) * 1000,
                )

            # Step 5: Post-execution monitors
            for monitor in self._monitors:
                monitor_result = await monitor.on_execution_complete(
                    sandbox_result,
                    code=generated_code,
                )
                if not monitor_result.passed:
                    violation_msg = "; ".join(monitor_result.violations)
                    raise SandboxViolation(f"Monitor violation: {violation_msg}")

            # Step 6: Post-execution validators
            for validator in self._validators:
                validation = await validator.validate(
                    sandbox_result,
                    code=generated_code,
                    intent_action=intent_action,
                )
                if not validation.valid:
                    error_msg = "; ".join(validation.errors)
                    raise SandboxViolation(f"Validation failed: {error_msg}")

            trace.execution_result = sandbox_result.return_value if sandbox_result else None
            duration_ms = (time.monotonic() - start_time) * 1000
            trace.execution_duration_ms = duration_ms

            logger.info(
                "harness_execute_complete",
                trace_id=trace_id,
                duration_ms=duration_ms,
            )

            result = ExecutionResult(
                output=sandbox_result.return_value if sandbox_result else None,
                generated_code=generated_code,
                reasoning=reasoning,
                trace=trace,
                sandbox_result=sandbox_result,
            )

        except Exception as exc:
            duration_ms = (time.monotonic() - start_time) * 1000
            trace.execution_duration_ms = duration_ms
            trace.error = str(exc)

            logger.error(
                "harness_execute_failed",
                trace_id=trace_id,
                error=str(exc),
                error_type=type(exc).__name__,
                duration_ms=duration_ms,
            )

            # Record the failed trace before re-raising.  Wrap in
            # try/except so a recorder failure does not mask the
            # original exception.
            try:
                await self._audit_recorder.record(trace)
            except Exception:
                logger.exception("audit_record_failed_on_error_path", trace_id=trace_id)
            raise

        # Step 7: Record audit trace
        await self._audit_recorder.record(trace)

        return result

    def evaluate_intent_text(
        self,
        *,
        intent_text: str,
        intent_action: str = "",
        intent_domain: str = "",
    ) -> None:
        """Run input-scanning policies on raw intent text before the LLM fires.

        Called by the framework at the top of ``_execute_intent`` — the
        earliest point after the intent string is extracted from the
        request body and before any LLM call, code generation, or
        handler execution. This ensures ``PromptInjectionPolicy`` and
        ``PIIPolicy`` block unsafe input at the LLM boundary.

        Raises :class:`~agenticapi.exceptions.PolicyViolation` if any
        policy denies the text. On success, returns silently.

        Policies that don't override
        :meth:`~agenticapi.harness.policy.base.Policy.evaluate_intent_text`
        default to allow, so adding this call has zero effect on policies
        like ``CodePolicy`` or ``DataPolicy`` whose domain is generated
        code, not user text.
        """
        self._evaluator.evaluate_intent_text(
            intent_text=intent_text,
            intent_action=intent_action,
            intent_domain=intent_domain,
        )

    async def call_tool(
        self,
        *,
        tool: Any,
        arguments: dict[str, Any],
        intent_raw: str = "",
        intent_action: str = "",
        intent_domain: str = "",
        endpoint_name: str = "",
        context: AgentContext | None = None,
    ) -> ExecutionResult:
        """Invoke a registered tool directly (Phase E4 tool-first path).

        Skips code generation and sandbox execution entirely. The
        only policy pass that runs is
        :meth:`PolicyEvaluator.evaluate_tool_call` — AST checks and
        sandbox safety nets are irrelevant when we're calling a
        *pre-registered* Python function whose implementation we
        control. In exchange the caller gets roughly an order of
        magnitude lower latency and removes a whole class of
        code-gen failures from the hot path.

        Args:
            tool: The :class:`~agenticapi.runtime.tools.base.Tool` to
                invoke. Typically obtained from the app's
                :class:`ToolRegistry` via the name the LLM emitted.
            arguments: Keyword arguments for the tool call, usually
                coming straight from
                :attr:`agenticapi.runtime.llm.base.ToolCall.arguments`.
            intent_raw: The original intent string for the audit
                trace.
            intent_action: Classified action for policy evaluation
                (``read``/``write``/etc.).
            intent_domain: Classified domain.
            endpoint_name: Endpoint whose request triggered this
                call, for the audit record.
            context: Optional :class:`AgentContext` — currently
                unused by the tool-first path but accepted for
                symmetry with :meth:`execute`.

        Returns:
            :class:`ExecutionResult` with the tool's return value,
            no generated code, and a fully populated audit trace.

        Raises:
            PolicyViolation: If any policy's
                :meth:`evaluate_tool_call` hook denies the call.
            ToolError: If the tool's own invocation fails.
        """
        del context
        trace_id = uuid.uuid4().hex
        start = time.monotonic()
        timestamp = datetime.now(tz=UTC)
        tool_name = tool.definition.name

        trace = ExecutionTrace(
            trace_id=trace_id,
            endpoint_name=endpoint_name,
            timestamp=timestamp,
            intent_raw=intent_raw,
            intent_action=intent_action,
            generated_code=f"# tool-first call: {tool_name}({arguments})",
        )

        logger.info(
            "harness_tool_call_start",
            trace_id=trace_id,
            tool=tool_name,
            intent_action=intent_action,
        )

        try:
            evaluation = self._evaluator.evaluate_tool_call(
                tool_name=tool_name,
                arguments=arguments,
                intent_action=intent_action,
                intent_domain=intent_domain,
            )
            trace.policy_evaluations = [
                {
                    "policy_name": r.policy_name,
                    "allowed": r.allowed,
                    "violations": r.violations,
                    "warnings": r.warnings,
                }
                for r in evaluation.results
            ]

            invocation_start = time.monotonic()
            output = await tool.invoke(**arguments)
            tool_duration_ms = (time.monotonic() - invocation_start) * 1000
            trace.execution_result = output
            trace.execution_duration_ms = (time.monotonic() - start) * 1000

            logger.info(
                "harness_tool_call_complete",
                trace_id=trace_id,
                tool=tool_name,
                duration_ms=trace.execution_duration_ms,
                tool_duration_ms=tool_duration_ms,
            )

        except PolicyViolation:
            trace.execution_duration_ms = (time.monotonic() - start) * 1000
            trace.error = f"Policy denied tool call {tool_name}"
            await self._audit_recorder.record(trace)
            raise
        except Exception as exc:
            trace.execution_duration_ms = (time.monotonic() - start) * 1000
            trace.error = f"{type(exc).__name__}: {exc}"
            logger.error(
                "harness_tool_call_failed",
                trace_id=trace_id,
                tool=tool_name,
                error=str(exc),
            )
            await self._audit_recorder.record(trace)
            raise

        await self._audit_recorder.record(trace)
        return ExecutionResult(
            output=output,
            generated_code=trace.generated_code,
            reasoning=None,
            trace=trace,
            sandbox_result=None,
        )

audit_recorder property

audit_recorder: AuditRecorder

Access the audit recorder for retrieving traces.

evaluator property

evaluator: PolicyEvaluator

Access the policy evaluator.

__init__

__init__(
    *,
    policies: list[Policy] | None = None,
    sandbox: ProcessSandbox | None = None,
    audit_recorder: AuditRecorder | None = None,
    approval_workflow: ApprovalWorkflow | None = None,
    monitors: list[ExecutionMonitor] | None = None,
    validators: list[ResultValidator] | None = None,
) -> None

Initialize the harness engine.

Parameters:

Name Type Description Default
policies list[Policy] | None

List of policies to enforce. If None, no policy checks.

None
sandbox ProcessSandbox | None

Sandbox runtime for code execution. If None, a ProcessSandbox is created with default limits.

None
audit_recorder AuditRecorder | None

Recorder for audit traces. If None, a default in-memory recorder is created.

None
approval_workflow ApprovalWorkflow | None

Optional approval workflow for human-in-the-loop control. If provided, operations matching approval rules will raise ApprovalRequired.

None
monitors list[ExecutionMonitor] | None

Optional list of execution monitors to run after sandbox execution. Monitors check resource usage and output size.

None
validators list[ResultValidator] | None

Optional list of result validators to run after monitors. Validators check output correctness.

None
Source code in src/agenticapi/harness/engine.py
def __init__(
    self,
    *,
    policies: list[Policy] | None = None,
    sandbox: ProcessSandbox | None = None,
    audit_recorder: AuditRecorder | None = None,
    approval_workflow: ApprovalWorkflow | None = None,
    monitors: list[ExecutionMonitor] | None = None,
    validators: list[ResultValidator] | None = None,
) -> None:
    """Initialize the harness engine.

    Args:
        policies: List of policies to enforce. If None, no policy checks.
        sandbox: Sandbox runtime for code execution. If None, a
            ProcessSandbox is created with default limits.
        audit_recorder: Recorder for audit traces. If None, a default
            in-memory recorder is created.
        approval_workflow: Optional approval workflow for human-in-the-loop
            control. If provided, operations matching approval rules will
            raise ApprovalRequired.
        monitors: Optional list of execution monitors to run after sandbox
            execution. Monitors check resource usage and output size.
        validators: Optional list of result validators to run after
            monitors. Validators check output correctness.
    """
    self._evaluator = PolicyEvaluator(policies=policies)
    self._sandbox = sandbox or ProcessSandbox()
    self._audit_recorder = audit_recorder or AuditRecorder()
    self._approval: ApprovalWorkflow | None = approval_workflow
    self._monitors: list[ExecutionMonitor] = monitors or []
    self._validators: list[ResultValidator] = validators or []

execute async

execute(
    *,
    intent_raw: str,
    intent_action: str,
    intent_domain: str,
    generated_code: str,
    reasoning: str | None = None,
    endpoint_name: str = "",
    context: AgentContext | None = None,
    tools: ToolRegistry | None = None,
    sandbox_data: dict[str, object] | None = None,
) -> ExecutionResult

Execute generated code through the full harness pipeline.

Runs policy evaluation, static analysis, sandbox execution, and audit recording in sequence.

Parameters:

Name Type Description Default
intent_raw str

The original natural language request.

required
intent_action str

The classified action type.

required
intent_domain str

The domain of the request.

required
generated_code str

The Python code to evaluate and execute.

required
reasoning str | None

Optional LLM reasoning for the code.

None
endpoint_name str

Name of the agent endpoint.

''
context AgentContext | None

Optional agent execution context.

None
tools ToolRegistry | None

Optional tool registry for sandbox execution.

None

Returns:

Type Description
ExecutionResult

ExecutionResult with the output and audit trace.

Raises:

Type Description
PolicyViolation

If any policy denies the code or static analysis finds safety violations.

CodeExecutionError

If sandbox execution fails.

SandboxViolation

If sandbox detects a security violation.

Source code in src/agenticapi/harness/engine.py
async def execute(
    self,
    *,
    intent_raw: str,
    intent_action: str,
    intent_domain: str,
    generated_code: str,
    reasoning: str | None = None,
    endpoint_name: str = "",
    context: AgentContext | None = None,
    tools: ToolRegistry | None = None,
    sandbox_data: dict[str, object] | None = None,
) -> ExecutionResult:
    """Execute generated code through the full harness pipeline.

    Runs policy evaluation, static analysis, sandbox execution,
    and audit recording in sequence.

    Args:
        intent_raw: The original natural language request.
        intent_action: The classified action type.
        intent_domain: The domain of the request.
        generated_code: The Python code to evaluate and execute.
        reasoning: Optional LLM reasoning for the code.
        endpoint_name: Name of the agent endpoint.
        context: Optional agent execution context.
        tools: Optional tool registry for sandbox execution.

    Returns:
        ExecutionResult with the output and audit trace.

    Raises:
        PolicyViolation: If any policy denies the code or static
            analysis finds safety violations.
        CodeExecutionError: If sandbox execution fails.
        SandboxViolation: If sandbox detects a security violation.
    """
    trace_id = uuid.uuid4().hex
    start_time = time.monotonic()
    timestamp = datetime.now(tz=UTC)

    trace = ExecutionTrace(
        trace_id=trace_id,
        endpoint_name=endpoint_name,
        timestamp=timestamp,
        intent_raw=intent_raw,
        intent_action=intent_action,
        generated_code=generated_code,
        reasoning=reasoning,
    )

    logger.info(
        "harness_execute_start",
        trace_id=trace_id,
        endpoint_name=endpoint_name,
        intent_action=intent_action,
        code_lines=generated_code.count("\n") + 1,
    )

    from agenticapi.observability import (
        AgenticAPIAttributes,
        SpanNames,
        get_tracer,
    )

    tracer = get_tracer()

    try:
        # Step 1: Policy evaluation
        with tracer.start_as_current_span(SpanNames.POLICY_EVALUATE.value) as policy_span:
            evaluation = self._evaluator.evaluate(
                code=generated_code,
                intent_action=intent_action,
                intent_domain=intent_domain,
            )
            policy_span.set_attribute(AgenticAPIAttributes.POLICY_ALLOWED.value, evaluation.allowed)
            if evaluation.violations:
                policy_span.set_attribute(
                    AgenticAPIAttributes.POLICY_VIOLATIONS.value,
                    "; ".join(evaluation.violations)[:500],
                )
        trace.policy_evaluations = [
            {
                "policy_name": r.policy_name,
                "allowed": r.allowed,
                "violations": r.violations,
                "warnings": r.warnings,
            }
            for r in evaluation.results
        ]

        # Step 2: Static analysis
        with tracer.start_as_current_span(SpanNames.STATIC_ANALYSIS.value) as static_span:
            denied_modules: list[str] = []
            for policy in self._evaluator.policies:
                from agenticapi.harness.policy.code_policy import CodePolicy

                if isinstance(policy, CodePolicy):
                    denied_modules = policy.denied_modules
                    break

            safety_result = check_code_safety(
                generated_code,
                denied_modules=denied_modules or None,
                deny_eval_exec=True,
                deny_dynamic_import=True,
            )

            if not safety_result.safe:
                violation_descriptions = [v.description for v in safety_result.violations if v.severity == "error"]
                violation_summary = "; ".join(violation_descriptions)
                static_span.set_attribute(AgenticAPIAttributes.POLICY_VIOLATIONS.value, violation_summary[:500])
                logger.warning(
                    "harness_static_analysis_failed",
                    trace_id=trace_id,
                    violations=violation_descriptions,
                )
                raise PolicyViolation(
                    policy="static_analysis",
                    violation=violation_summary,
                    generated_code=generated_code,
                )

        # Step 3: Approval check
        if self._approval is not None:
            rule = self._approval.check_approval_required(
                intent_action=intent_action,
                intent_domain=intent_domain,
            )
            if rule is not None:
                try:
                    with tracer.start_as_current_span(SpanNames.APPROVAL_WAIT.value) as approval_span:
                        approval_span.set_attribute(AgenticAPIAttributes.APPROVAL_REQUIRED.value, True)
                        await self._approval.create_request(
                            rule=rule,
                            trace_id=trace_id,
                            intent_raw=intent_raw,
                            intent_action=intent_action,
                            intent_domain=intent_domain,
                            generated_code=generated_code,
                        )
                except ApprovalRequired as exc:
                    trace.approval_request_id = exc.request_id
                    raise

        # Step 4: Sandbox execution (with pre-fetched data injected)
        sandbox_result: SandboxResult | None = None
        with tracer.start_as_current_span(SpanNames.SANDBOX_EXECUTE.value) as sandbox_span:
            sandbox_span.set_attribute(
                AgenticAPIAttributes.SANDBOX_BACKEND.value,
                type(self._sandbox).__name__,
            )
            sandbox_start = time.monotonic()
            async with self._sandbox as sandbox:
                sandbox_result = await sandbox.execute(
                    code=generated_code,
                    tools=tools,
                    resource_limits=ResourceLimits(),
                    sandbox_data=sandbox_data,
                )
            sandbox_span.set_attribute(
                AgenticAPIAttributes.SANDBOX_DURATION_MS.value,
                (time.monotonic() - sandbox_start) * 1000,
            )

        # Step 5: Post-execution monitors
        for monitor in self._monitors:
            monitor_result = await monitor.on_execution_complete(
                sandbox_result,
                code=generated_code,
            )
            if not monitor_result.passed:
                violation_msg = "; ".join(monitor_result.violations)
                raise SandboxViolation(f"Monitor violation: {violation_msg}")

        # Step 6: Post-execution validators
        for validator in self._validators:
            validation = await validator.validate(
                sandbox_result,
                code=generated_code,
                intent_action=intent_action,
            )
            if not validation.valid:
                error_msg = "; ".join(validation.errors)
                raise SandboxViolation(f"Validation failed: {error_msg}")

        trace.execution_result = sandbox_result.return_value if sandbox_result else None
        duration_ms = (time.monotonic() - start_time) * 1000
        trace.execution_duration_ms = duration_ms

        logger.info(
            "harness_execute_complete",
            trace_id=trace_id,
            duration_ms=duration_ms,
        )

        result = ExecutionResult(
            output=sandbox_result.return_value if sandbox_result else None,
            generated_code=generated_code,
            reasoning=reasoning,
            trace=trace,
            sandbox_result=sandbox_result,
        )

    except Exception as exc:
        duration_ms = (time.monotonic() - start_time) * 1000
        trace.execution_duration_ms = duration_ms
        trace.error = str(exc)

        logger.error(
            "harness_execute_failed",
            trace_id=trace_id,
            error=str(exc),
            error_type=type(exc).__name__,
            duration_ms=duration_ms,
        )

        # Record the failed trace before re-raising.  Wrap in
        # try/except so a recorder failure does not mask the
        # original exception.
        try:
            await self._audit_recorder.record(trace)
        except Exception:
            logger.exception("audit_record_failed_on_error_path", trace_id=trace_id)
        raise

    # Step 7: Record audit trace
    await self._audit_recorder.record(trace)

    return result

evaluate_intent_text

evaluate_intent_text(
    *,
    intent_text: str,
    intent_action: str = "",
    intent_domain: str = "",
) -> None

Run input-scanning policies on raw intent text before the LLM fires.

Called by the framework at the top of _execute_intent — the earliest point after the intent string is extracted from the request body and before any LLM call, code generation, or handler execution. This ensures PromptInjectionPolicy and PIIPolicy block unsafe input at the LLM boundary.

Raises :class:~agenticapi.exceptions.PolicyViolation if any policy denies the text. On success, returns silently.

Policies that don't override :meth:~agenticapi.harness.policy.base.Policy.evaluate_intent_text default to allow, so adding this call has zero effect on policies like CodePolicy or DataPolicy whose domain is generated code, not user text.

Source code in src/agenticapi/harness/engine.py
def evaluate_intent_text(
    self,
    *,
    intent_text: str,
    intent_action: str = "",
    intent_domain: str = "",
) -> None:
    """Run input-scanning policies on raw intent text before the LLM fires.

    Called by the framework at the top of ``_execute_intent`` — the
    earliest point after the intent string is extracted from the
    request body and before any LLM call, code generation, or
    handler execution. This ensures ``PromptInjectionPolicy`` and
    ``PIIPolicy`` block unsafe input at the LLM boundary.

    Raises :class:`~agenticapi.exceptions.PolicyViolation` if any
    policy denies the text. On success, returns silently.

    Policies that don't override
    :meth:`~agenticapi.harness.policy.base.Policy.evaluate_intent_text`
    default to allow, so adding this call has zero effect on policies
    like ``CodePolicy`` or ``DataPolicy`` whose domain is generated
    code, not user text.
    """
    self._evaluator.evaluate_intent_text(
        intent_text=intent_text,
        intent_action=intent_action,
        intent_domain=intent_domain,
    )

call_tool async

call_tool(
    *,
    tool: Any,
    arguments: dict[str, Any],
    intent_raw: str = "",
    intent_action: str = "",
    intent_domain: str = "",
    endpoint_name: str = "",
    context: AgentContext | None = None,
) -> ExecutionResult

Invoke a registered tool directly (Phase E4 tool-first path).

Skips code generation and sandbox execution entirely. The only policy pass that runs is :meth:PolicyEvaluator.evaluate_tool_call — AST checks and sandbox safety nets are irrelevant when we're calling a pre-registered Python function whose implementation we control. In exchange the caller gets roughly an order of magnitude lower latency and removes a whole class of code-gen failures from the hot path.

Parameters:

Name Type Description Default
tool Any

The :class:~agenticapi.runtime.tools.base.Tool to invoke. Typically obtained from the app's :class:ToolRegistry via the name the LLM emitted.

required
arguments dict[str, Any]

Keyword arguments for the tool call, usually coming straight from :attr:agenticapi.runtime.llm.base.ToolCall.arguments.

required
intent_raw str

The original intent string for the audit trace.

''
intent_action str

Classified action for policy evaluation (read/write/etc.).

''
intent_domain str

Classified domain.

''
endpoint_name str

Endpoint whose request triggered this call, for the audit record.

''
context AgentContext | None

Optional :class:AgentContext — currently unused by the tool-first path but accepted for symmetry with :meth:execute.

None

Returns:

Type Description
ExecutionResult

class:ExecutionResult with the tool's return value,

ExecutionResult

no generated code, and a fully populated audit trace.

Raises:

Type Description
PolicyViolation

If any policy's :meth:evaluate_tool_call hook denies the call.

ToolError

If the tool's own invocation fails.

Source code in src/agenticapi/harness/engine.py
async def call_tool(
    self,
    *,
    tool: Any,
    arguments: dict[str, Any],
    intent_raw: str = "",
    intent_action: str = "",
    intent_domain: str = "",
    endpoint_name: str = "",
    context: AgentContext | None = None,
) -> ExecutionResult:
    """Invoke a registered tool directly (Phase E4 tool-first path).

    Skips code generation and sandbox execution entirely. The
    only policy pass that runs is
    :meth:`PolicyEvaluator.evaluate_tool_call` — AST checks and
    sandbox safety nets are irrelevant when we're calling a
    *pre-registered* Python function whose implementation we
    control. In exchange the caller gets roughly an order of
    magnitude lower latency and removes a whole class of
    code-gen failures from the hot path.

    Args:
        tool: The :class:`~agenticapi.runtime.tools.base.Tool` to
            invoke. Typically obtained from the app's
            :class:`ToolRegistry` via the name the LLM emitted.
        arguments: Keyword arguments for the tool call, usually
            coming straight from
            :attr:`agenticapi.runtime.llm.base.ToolCall.arguments`.
        intent_raw: The original intent string for the audit
            trace.
        intent_action: Classified action for policy evaluation
            (``read``/``write``/etc.).
        intent_domain: Classified domain.
        endpoint_name: Endpoint whose request triggered this
            call, for the audit record.
        context: Optional :class:`AgentContext` — currently
            unused by the tool-first path but accepted for
            symmetry with :meth:`execute`.

    Returns:
        :class:`ExecutionResult` with the tool's return value,
        no generated code, and a fully populated audit trace.

    Raises:
        PolicyViolation: If any policy's
            :meth:`evaluate_tool_call` hook denies the call.
        ToolError: If the tool's own invocation fails.
    """
    del context
    trace_id = uuid.uuid4().hex
    start = time.monotonic()
    timestamp = datetime.now(tz=UTC)
    tool_name = tool.definition.name

    trace = ExecutionTrace(
        trace_id=trace_id,
        endpoint_name=endpoint_name,
        timestamp=timestamp,
        intent_raw=intent_raw,
        intent_action=intent_action,
        generated_code=f"# tool-first call: {tool_name}({arguments})",
    )

    logger.info(
        "harness_tool_call_start",
        trace_id=trace_id,
        tool=tool_name,
        intent_action=intent_action,
    )

    try:
        evaluation = self._evaluator.evaluate_tool_call(
            tool_name=tool_name,
            arguments=arguments,
            intent_action=intent_action,
            intent_domain=intent_domain,
        )
        trace.policy_evaluations = [
            {
                "policy_name": r.policy_name,
                "allowed": r.allowed,
                "violations": r.violations,
                "warnings": r.warnings,
            }
            for r in evaluation.results
        ]

        invocation_start = time.monotonic()
        output = await tool.invoke(**arguments)
        tool_duration_ms = (time.monotonic() - invocation_start) * 1000
        trace.execution_result = output
        trace.execution_duration_ms = (time.monotonic() - start) * 1000

        logger.info(
            "harness_tool_call_complete",
            trace_id=trace_id,
            tool=tool_name,
            duration_ms=trace.execution_duration_ms,
            tool_duration_ms=tool_duration_ms,
        )

    except PolicyViolation:
        trace.execution_duration_ms = (time.monotonic() - start) * 1000
        trace.error = f"Policy denied tool call {tool_name}"
        await self._audit_recorder.record(trace)
        raise
    except Exception as exc:
        trace.execution_duration_ms = (time.monotonic() - start) * 1000
        trace.error = f"{type(exc).__name__}: {exc}"
        logger.error(
            "harness_tool_call_failed",
            trace_id=trace_id,
            tool=tool_name,
            error=str(exc),
        )
        await self._audit_recorder.record(trace)
        raise

    await self._audit_recorder.record(trace)
    return ExecutionResult(
        output=output,
        generated_code=trace.generated_code,
        reasoning=None,
        trace=trace,
        sandbox_result=None,
    )

ExecutionResult

ExecutionResult dataclass

Result of harness-controlled code execution.

Attributes:

Name Type Description
output Any

The primary output from execution.

generated_code str

The code that was executed.

reasoning str | None

Optional LLM reasoning for the generated code.

trace ExecutionTrace | None

The audit trace for this execution.

sandbox_result SandboxResult | None

The raw sandbox execution result.

Source code in src/agenticapi/harness/engine.py
@dataclass(frozen=True, slots=True)
class ExecutionResult:
    """Result of harness-controlled code execution.

    Attributes:
        output: The primary output from execution.
        generated_code: The code that was executed.
        reasoning: Optional LLM reasoning for the generated code.
        trace: The audit trace for this execution.
        sandbox_result: The raw sandbox execution result.
    """

    output: Any
    generated_code: str
    reasoning: str | None = None
    trace: ExecutionTrace | None = None
    sandbox_result: SandboxResult | None = None

PolicyEvaluator

PolicyEvaluator

Evaluates generated code against a collection of policies.

Runs all registered policies and aggregates results. If any policy returns allowed=False, the overall result is not allowed and a PolicyViolation is raised.

Example

evaluator = PolicyEvaluator(policies=[ CodePolicy(denied_modules=["os"]), DataPolicy(deny_ddl=True), ]) result = evaluator.evaluate(code="SELECT 1", intent_action="read")

Source code in src/agenticapi/harness/policy/evaluator.py
class PolicyEvaluator:
    """Evaluates generated code against a collection of policies.

    Runs all registered policies and aggregates results. If any policy
    returns allowed=False, the overall result is not allowed and a
    PolicyViolation is raised.

    Example:
        evaluator = PolicyEvaluator(policies=[
            CodePolicy(denied_modules=["os"]),
            DataPolicy(deny_ddl=True),
        ])
        result = evaluator.evaluate(code="SELECT 1", intent_action="read")
    """

    def __init__(self, policies: list[Policy] | None = None) -> None:
        """Initialize the evaluator with optional policies.

        Args:
            policies: Initial list of policies to evaluate against.
        """
        self._policies: list[Policy] = list(policies) if policies else []

    def add_policy(self, policy: Policy) -> None:
        """Add a policy to the evaluator.

        Args:
            policy: The policy to add.
        """
        self._policies.append(policy)
        logger.info("policy_added", policy_type=type(policy).__name__)

    def evaluate(
        self,
        *,
        code: str,
        intent_action: str = "",
        intent_domain: str = "",
        **kwargs: Any,
    ) -> EvaluationResult:
        """Evaluate code against all registered policies.

        Runs every policy and aggregates results. If any policy denies
        the code, raises PolicyViolation with all violations.

        Args:
            code: The generated Python source code to evaluate.
            intent_action: The classified action type.
            intent_domain: The domain of the request.
            **kwargs: Additional context passed to each policy.

        Returns:
            EvaluationResult with aggregated results.

        Raises:
            PolicyViolation: If any policy denies the code.
        """
        results: list[PolicyResult] = []
        all_violations: list[str] = []
        all_warnings: list[str] = []
        overall_allowed = True

        for policy in self._policies:
            policy_name = type(policy).__name__
            logger.debug("policy_evaluation_start", policy=policy_name)

            result = policy.evaluate(
                code=code,
                intent_action=intent_action,
                intent_domain=intent_domain,
                **kwargs,
            )
            results.append(result)

            if not result.allowed:
                overall_allowed = False
                all_violations.extend(result.violations)
                logger.warning(
                    "policy_denied",
                    policy=policy_name,
                    violations=result.violations,
                )
            else:
                logger.debug("policy_allowed", policy=policy_name)

            all_warnings.extend(result.warnings)

        if all_warnings:
            logger.info("policy_warnings", warnings=all_warnings)

        evaluation = EvaluationResult(
            allowed=overall_allowed,
            results=results,
            violations=all_violations,
            warnings=all_warnings,
        )

        if not overall_allowed:
            violation_summary = "; ".join(all_violations)
            policy_names = ", ".join(r.policy_name for r in results if not r.allowed)
            raise PolicyViolation(
                policy=policy_names,
                violation=violation_summary,
                generated_code=code,
            )

        return evaluation

    def evaluate_intent_text(
        self,
        *,
        intent_text: str,
        intent_action: str = "",
        intent_domain: str = "",
        **kwargs: Any,
    ) -> EvaluationResult:
        """Evaluate raw intent text against every registered policy.

        Called by the framework **before** the LLM fires. Fans out to
        each policy's :meth:`~Policy.evaluate_intent_text` hook and
        aggregates results identically to :meth:`evaluate`. Raises
        :class:`PolicyViolation` on denial so the request pipeline
        can abort before the LLM ever sees the text.

        This is the **input-scanning** counterpart to :meth:`evaluate`
        (post-code-gen) and :meth:`evaluate_tool_call` (tool-first).
        Policies that don't override the hook default to allow.
        """
        results: list[PolicyResult] = []
        all_violations: list[str] = []
        all_warnings: list[str] = []
        overall_allowed = True

        for policy in self._policies:
            policy_name = type(policy).__name__
            logger.debug("policy_intent_text_evaluation_start", policy=policy_name)
            result = policy.evaluate_intent_text(
                intent_text=intent_text,
                intent_action=intent_action,
                intent_domain=intent_domain,
                **kwargs,
            )
            results.append(result)
            if not result.allowed:
                overall_allowed = False
                all_violations.extend(result.violations)
                logger.warning(
                    "policy_intent_text_denied",
                    policy=policy_name,
                    violations=result.violations,
                )
            all_warnings.extend(result.warnings)

        evaluation = EvaluationResult(
            allowed=overall_allowed,
            results=results,
            violations=all_violations,
            warnings=all_warnings,
        )
        if not overall_allowed:
            violation_summary = "; ".join(all_violations)
            policy_names = ", ".join(r.policy_name for r in results if not r.allowed)
            raise PolicyViolation(
                policy=policy_names,
                violation=f"Intent text denied: {violation_summary}",
            )
        return evaluation

    def evaluate_tool_call(
        self,
        *,
        tool_name: str,
        arguments: dict[str, Any],
        intent_action: str = "",
        intent_domain: str = "",
        **kwargs: Any,
    ) -> EvaluationResult:
        """Evaluate a tool call against every registered policy (Phase E4).

        Mirror of :meth:`evaluate` that fans out to each policy's
        :meth:`~agenticapi.harness.policy.base.Policy.evaluate_tool_call`
        hook instead of the code-oriented ``evaluate``. Aggregates
        results identically and raises :class:`PolicyViolation` on
        denial so the tool-first execution path in
        :class:`~agenticapi.harness.engine.HarnessEngine` can share
        the existing exception flow.
        """
        results: list[PolicyResult] = []
        all_violations: list[str] = []
        all_warnings: list[str] = []
        overall_allowed = True

        for policy in self._policies:
            policy_name = type(policy).__name__
            logger.debug("policy_tool_call_evaluation_start", policy=policy_name, tool=tool_name)
            result = policy.evaluate_tool_call(
                tool_name=tool_name,
                arguments=arguments,
                intent_action=intent_action,
                intent_domain=intent_domain,
                **kwargs,
            )
            results.append(result)
            if not result.allowed:
                overall_allowed = False
                all_violations.extend(result.violations)
                logger.warning(
                    "policy_tool_call_denied",
                    policy=policy_name,
                    tool=tool_name,
                    violations=result.violations,
                )
            all_warnings.extend(result.warnings)

        evaluation = EvaluationResult(
            allowed=overall_allowed,
            results=results,
            violations=all_violations,
            warnings=all_warnings,
        )
        if not overall_allowed:
            violation_summary = "; ".join(all_violations)
            policy_names = ", ".join(r.policy_name for r in results if not r.allowed)
            raise PolicyViolation(
                policy=policy_names,
                violation=f"Tool call {tool_name!r} denied: {violation_summary}",
            )
        return evaluation

    @property
    def policies(self) -> list[Policy]:
        """Return a copy of the registered policies."""
        return list(self._policies)

policies property

policies: list[Policy]

Return a copy of the registered policies.

__init__

__init__(policies: list[Policy] | None = None) -> None

Initialize the evaluator with optional policies.

Parameters:

Name Type Description Default
policies list[Policy] | None

Initial list of policies to evaluate against.

None
Source code in src/agenticapi/harness/policy/evaluator.py
def __init__(self, policies: list[Policy] | None = None) -> None:
    """Initialize the evaluator with optional policies.

    Args:
        policies: Initial list of policies to evaluate against.
    """
    self._policies: list[Policy] = list(policies) if policies else []

add_policy

add_policy(policy: Policy) -> None

Add a policy to the evaluator.

Parameters:

Name Type Description Default
policy Policy

The policy to add.

required
Source code in src/agenticapi/harness/policy/evaluator.py
def add_policy(self, policy: Policy) -> None:
    """Add a policy to the evaluator.

    Args:
        policy: The policy to add.
    """
    self._policies.append(policy)
    logger.info("policy_added", policy_type=type(policy).__name__)

evaluate

evaluate(
    *,
    code: str,
    intent_action: str = "",
    intent_domain: str = "",
    **kwargs: Any,
) -> EvaluationResult

Evaluate code against all registered policies.

Runs every policy and aggregates results. If any policy denies the code, raises PolicyViolation with all violations.

Parameters:

Name Type Description Default
code str

The generated Python source code to evaluate.

required
intent_action str

The classified action type.

''
intent_domain str

The domain of the request.

''
**kwargs Any

Additional context passed to each policy.

{}

Returns:

Type Description
EvaluationResult

EvaluationResult with aggregated results.

Raises:

Type Description
PolicyViolation

If any policy denies the code.

Source code in src/agenticapi/harness/policy/evaluator.py
def evaluate(
    self,
    *,
    code: str,
    intent_action: str = "",
    intent_domain: str = "",
    **kwargs: Any,
) -> EvaluationResult:
    """Evaluate code against all registered policies.

    Runs every policy and aggregates results. If any policy denies
    the code, raises PolicyViolation with all violations.

    Args:
        code: The generated Python source code to evaluate.
        intent_action: The classified action type.
        intent_domain: The domain of the request.
        **kwargs: Additional context passed to each policy.

    Returns:
        EvaluationResult with aggregated results.

    Raises:
        PolicyViolation: If any policy denies the code.
    """
    results: list[PolicyResult] = []
    all_violations: list[str] = []
    all_warnings: list[str] = []
    overall_allowed = True

    for policy in self._policies:
        policy_name = type(policy).__name__
        logger.debug("policy_evaluation_start", policy=policy_name)

        result = policy.evaluate(
            code=code,
            intent_action=intent_action,
            intent_domain=intent_domain,
            **kwargs,
        )
        results.append(result)

        if not result.allowed:
            overall_allowed = False
            all_violations.extend(result.violations)
            logger.warning(
                "policy_denied",
                policy=policy_name,
                violations=result.violations,
            )
        else:
            logger.debug("policy_allowed", policy=policy_name)

        all_warnings.extend(result.warnings)

    if all_warnings:
        logger.info("policy_warnings", warnings=all_warnings)

    evaluation = EvaluationResult(
        allowed=overall_allowed,
        results=results,
        violations=all_violations,
        warnings=all_warnings,
    )

    if not overall_allowed:
        violation_summary = "; ".join(all_violations)
        policy_names = ", ".join(r.policy_name for r in results if not r.allowed)
        raise PolicyViolation(
            policy=policy_names,
            violation=violation_summary,
            generated_code=code,
        )

    return evaluation

evaluate_intent_text

evaluate_intent_text(
    *,
    intent_text: str,
    intent_action: str = "",
    intent_domain: str = "",
    **kwargs: Any,
) -> EvaluationResult

Evaluate raw intent text against every registered policy.

Called by the framework before the LLM fires. Fans out to each policy's :meth:~Policy.evaluate_intent_text hook and aggregates results identically to :meth:evaluate. Raises :class:PolicyViolation on denial so the request pipeline can abort before the LLM ever sees the text.

This is the input-scanning counterpart to :meth:evaluate (post-code-gen) and :meth:evaluate_tool_call (tool-first). Policies that don't override the hook default to allow.

Source code in src/agenticapi/harness/policy/evaluator.py
def evaluate_intent_text(
    self,
    *,
    intent_text: str,
    intent_action: str = "",
    intent_domain: str = "",
    **kwargs: Any,
) -> EvaluationResult:
    """Evaluate raw intent text against every registered policy.

    Called by the framework **before** the LLM fires. Fans out to
    each policy's :meth:`~Policy.evaluate_intent_text` hook and
    aggregates results identically to :meth:`evaluate`. Raises
    :class:`PolicyViolation` on denial so the request pipeline
    can abort before the LLM ever sees the text.

    This is the **input-scanning** counterpart to :meth:`evaluate`
    (post-code-gen) and :meth:`evaluate_tool_call` (tool-first).
    Policies that don't override the hook default to allow.
    """
    results: list[PolicyResult] = []
    all_violations: list[str] = []
    all_warnings: list[str] = []
    overall_allowed = True

    for policy in self._policies:
        policy_name = type(policy).__name__
        logger.debug("policy_intent_text_evaluation_start", policy=policy_name)
        result = policy.evaluate_intent_text(
            intent_text=intent_text,
            intent_action=intent_action,
            intent_domain=intent_domain,
            **kwargs,
        )
        results.append(result)
        if not result.allowed:
            overall_allowed = False
            all_violations.extend(result.violations)
            logger.warning(
                "policy_intent_text_denied",
                policy=policy_name,
                violations=result.violations,
            )
        all_warnings.extend(result.warnings)

    evaluation = EvaluationResult(
        allowed=overall_allowed,
        results=results,
        violations=all_violations,
        warnings=all_warnings,
    )
    if not overall_allowed:
        violation_summary = "; ".join(all_violations)
        policy_names = ", ".join(r.policy_name for r in results if not r.allowed)
        raise PolicyViolation(
            policy=policy_names,
            violation=f"Intent text denied: {violation_summary}",
        )
    return evaluation

evaluate_tool_call

evaluate_tool_call(
    *,
    tool_name: str,
    arguments: dict[str, Any],
    intent_action: str = "",
    intent_domain: str = "",
    **kwargs: Any,
) -> EvaluationResult

Evaluate a tool call against every registered policy (Phase E4).

Mirror of :meth:evaluate that fans out to each policy's :meth:~agenticapi.harness.policy.base.Policy.evaluate_tool_call hook instead of the code-oriented evaluate. Aggregates results identically and raises :class:PolicyViolation on denial so the tool-first execution path in :class:~agenticapi.harness.engine.HarnessEngine can share the existing exception flow.

Source code in src/agenticapi/harness/policy/evaluator.py
def evaluate_tool_call(
    self,
    *,
    tool_name: str,
    arguments: dict[str, Any],
    intent_action: str = "",
    intent_domain: str = "",
    **kwargs: Any,
) -> EvaluationResult:
    """Evaluate a tool call against every registered policy (Phase E4).

    Mirror of :meth:`evaluate` that fans out to each policy's
    :meth:`~agenticapi.harness.policy.base.Policy.evaluate_tool_call`
    hook instead of the code-oriented ``evaluate``. Aggregates
    results identically and raises :class:`PolicyViolation` on
    denial so the tool-first execution path in
    :class:`~agenticapi.harness.engine.HarnessEngine` can share
    the existing exception flow.
    """
    results: list[PolicyResult] = []
    all_violations: list[str] = []
    all_warnings: list[str] = []
    overall_allowed = True

    for policy in self._policies:
        policy_name = type(policy).__name__
        logger.debug("policy_tool_call_evaluation_start", policy=policy_name, tool=tool_name)
        result = policy.evaluate_tool_call(
            tool_name=tool_name,
            arguments=arguments,
            intent_action=intent_action,
            intent_domain=intent_domain,
            **kwargs,
        )
        results.append(result)
        if not result.allowed:
            overall_allowed = False
            all_violations.extend(result.violations)
            logger.warning(
                "policy_tool_call_denied",
                policy=policy_name,
                tool=tool_name,
                violations=result.violations,
            )
        all_warnings.extend(result.warnings)

    evaluation = EvaluationResult(
        allowed=overall_allowed,
        results=results,
        violations=all_violations,
        warnings=all_warnings,
    )
    if not overall_allowed:
        violation_summary = "; ".join(all_violations)
        policy_names = ", ".join(r.policy_name for r in results if not r.allowed)
        raise PolicyViolation(
            policy=policy_names,
            violation=f"Tool call {tool_name!r} denied: {violation_summary}",
        )
    return evaluation