Testing Utilities

mock_llm

mock_llm

mock_llm(responses: list[str]) -> Generator[MockBackend]

Context manager that provides a MockBackend with predefined responses.

Yields a MockBackend configured with the given responses. Responses are consumed in FIFO order as generate() is called.

Parameters:

responses (list[str], required): List of response strings to return in order.

Yields:

Generator[MockBackend]: A configured MockBackend instance.

Example

with mock_llm(responses=["SELECT COUNT(*) FROM orders"]) as backend:
    response = await backend.generate(prompt)
    assert response.content == "SELECT COUNT(*) FROM orders"

Source code in src/agenticapi/testing/mocks.py
@contextmanager
def mock_llm(responses: list[str]) -> Generator[MockBackend]:
    """Context manager that provides a MockBackend with predefined responses.

    Yields a MockBackend configured with the given responses. Responses
    are consumed in FIFO order as generate() is called.

    Args:
        responses: List of response strings to return in order.

    Yields:
        A configured MockBackend instance.

    Example:
        with mock_llm(responses=["SELECT COUNT(*) FROM orders"]) as backend:
            response = await backend.generate(prompt)
            assert response.content == "SELECT COUNT(*) FROM orders"
    """
    backend = MockBackend(responses=responses)
    yield backend
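
The FIFO behaviour is easiest to see with more than one queued response. A minimal sketch, assuming pytest with an asyncio plugin and that generate() accepts a plain string prompt (the prompt type is not documented here):

import pytest

from agenticapi.testing.mocks import mock_llm


@pytest.mark.asyncio
async def test_responses_consumed_in_order() -> None:
    with mock_llm(responses=["first answer", "second answer"]) as backend:
        first = await backend.generate("any prompt")  # string prompt is an assumption
        second = await backend.generate("any prompt")
        assert first.content == "first answer"
        assert second.content == "second answer"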

MockSandbox

MockSandbox

Bases: SandboxRuntime

Mock sandbox for unit testing.

Returns predefined results based on pattern matching against the executed code. Raises SandboxViolation if the code contains any of the denied operations.

Parameters:

allowed_results (dict[str, Any] | None, default None): Mapping of code substrings to return values. If a key is found in the code, its value is used as the sandbox output.

denied_operations (list[str] | None, default None): List of code substrings that trigger a SandboxViolation when found in the code.

Example

sandbox = MockSandbox(
    allowed_results={"SELECT COUNT(*)": [{"count": 42}]},
    denied_operations=["DROP TABLE"],
)
async with sandbox as sb:
    result = await sb.execute("SELECT COUNT(*) FROM orders")
    assert result.return_value == [{"count": 42}]

Source code in src/agenticapi/testing/mocks.py
class MockSandbox(SandboxRuntime):
    """Mock sandbox for unit testing.

    Returns predefined results based on pattern matching against the
    executed code. Raises SandboxViolation if the code contains any
    of the denied operations.

    Args:
        allowed_results: Mapping of code substrings to return values.
            If a key is found in the code, its value is used as the
            sandbox output.
        denied_operations: List of code substrings that trigger a
            SandboxViolation when found in the code.

    Example:
        sandbox = MockSandbox(
            allowed_results={"SELECT COUNT(*)": [{"count": 42}]},
            denied_operations=["DROP TABLE"],
        )
        async with sandbox as sb:
            result = await sb.execute("SELECT COUNT(*) FROM orders")
            assert result.return_value == [{"count": 42}]
    """

    def __init__(
        self,
        *,
        allowed_results: dict[str, Any] | None = None,
        denied_operations: list[str] | None = None,
    ) -> None:
        """Initialize the mock sandbox.

        Args:
            allowed_results: Mapping of code substrings to return values.
            denied_operations: List of code substrings that trigger violations.
        """
        self._allowed_results: dict[str, Any] = allowed_results or {}
        self._denied_operations: list[str] = denied_operations or []
        self._execution_count: int = 0

    @property
    def execution_count(self) -> int:
        """Number of execute calls made."""
        return self._execution_count

    async def execute(
        self,
        code: str,
        tools: Any = None,
        resource_limits: ResourceLimits | None = None,
    ) -> SandboxResult:
        """Execute code against mock rules.

        Checks denied operations first, then matches allowed results.
        Returns a default SandboxResult if no match is found.

        Args:
            code: The Python source code to "execute".
            tools: Ignored in mock implementation.
            resource_limits: Ignored in mock implementation.

        Returns:
            SandboxResult with matched or default output.

        Raises:
            SandboxViolation: If the code contains a denied operation.
        """
        self._execution_count += 1

        # Check denied operations
        for denied in self._denied_operations:
            if denied in code:
                raise SandboxViolation(f"Denied operation detected in code: {denied}")

        # Check allowed results for matching key
        for pattern, result_value in self._allowed_results.items():
            if pattern in code:
                return SandboxResult(
                    output=result_value,
                    return_value=result_value,
                    metrics=ResourceMetrics(
                        cpu_time_ms=1.0,
                        memory_peak_mb=10.0,
                        wall_time_ms=1.0,
                    ),
                )

        # Default result
        return SandboxResult(
            output=None,
            return_value=None,
            metrics=ResourceMetrics(
                cpu_time_ms=0.5,
                memory_peak_mb=5.0,
                wall_time_ms=0.5,
            ),
        )

    async def __aenter__(self) -> MockSandbox:
        """Enter the mock sandbox context."""
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Exit the mock sandbox context (no-op)."""

execution_count property

execution_count: int

Number of execute calls made.

__init__

__init__(
    *,
    allowed_results: dict[str, Any] | None = None,
    denied_operations: list[str] | None = None,
) -> None

Initialize the mock sandbox.

Parameters:

allowed_results (dict[str, Any] | None, default None): Mapping of code substrings to return values.

denied_operations (list[str] | None, default None): List of code substrings that trigger violations.

Source code in src/agenticapi/testing/mocks.py
def __init__(
    self,
    *,
    allowed_results: dict[str, Any] | None = None,
    denied_operations: list[str] | None = None,
) -> None:
    """Initialize the mock sandbox.

    Args:
        allowed_results: Mapping of code substrings to return values.
        denied_operations: List of code substrings that trigger violations.
    """
    self._allowed_results: dict[str, Any] = allowed_results or {}
    self._denied_operations: list[str] = denied_operations or []
    self._execution_count: int = 0

execute async

execute(
    code: str,
    tools: Any = None,
    resource_limits: ResourceLimits | None = None,
) -> SandboxResult

Execute code against mock rules.

Checks denied operations first, then matches allowed results. Returns a default SandboxResult if no match is found.

Parameters:

code (str, required): The Python source code to "execute".

tools (Any, default None): Ignored in mock implementation.

resource_limits (ResourceLimits | None, default None): Ignored in mock implementation.

Returns:

SandboxResult with matched or default output.

Raises:

SandboxViolation: If the code contains a denied operation.

Source code in src/agenticapi/testing/mocks.py
async def execute(
    self,
    code: str,
    tools: Any = None,
    resource_limits: ResourceLimits | None = None,
) -> SandboxResult:
    """Execute code against mock rules.

    Checks denied operations first, then matches allowed results.
    Returns a default SandboxResult if no match is found.

    Args:
        code: The Python source code to "execute".
        tools: Ignored in mock implementation.
        resource_limits: Ignored in mock implementation.

    Returns:
        SandboxResult with matched or default output.

    Raises:
        SandboxViolation: If the code contains a denied operation.
    """
    self._execution_count += 1

    # Check denied operations
    for denied in self._denied_operations:
        if denied in code:
            raise SandboxViolation(f"Denied operation detected in code: {denied}")

    # Check allowed results for matching key
    for pattern, result_value in self._allowed_results.items():
        if pattern in code:
            return SandboxResult(
                output=result_value,
                return_value=result_value,
                metrics=ResourceMetrics(
                    cpu_time_ms=1.0,
                    memory_peak_mb=10.0,
                    wall_time_ms=1.0,
                ),
            )

    # Default result
    return SandboxResult(
        output=None,
        return_value=None,
        metrics=ResourceMetrics(
            cpu_time_ms=0.5,
            memory_peak_mb=5.0,
            wall_time_ms=0.5,
        ),
    )
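
When no denied operation and no allowed pattern matches, the default result is returned. A sketch under the same assumptions as above (pytest with an asyncio plugin, metrics exposed as attributes):

import pytest

from agenticapi.testing.mocks import MockSandbox


@pytest.mark.asyncio
async def test_unmatched_code_returns_default_result() -> None:
    async with MockSandbox(allowed_results={"SELECT": [1]}) as sb:
        result = await sb.execute("print('hello')")  # matches no configured pattern
        assert result.output is None
        assert result.return_value is None
        assert result.metrics.cpu_time_ms == 0.5  # the mock's default metrics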

__aenter__ async

__aenter__() -> MockSandbox

Enter the mock sandbox context.

Source code in src/agenticapi/testing/mocks.py
async def __aenter__(self) -> MockSandbox:
    """Enter the mock sandbox context."""
    return self

__aexit__ async

__aexit__(exc_type: Any, exc_val: Any, exc_tb: Any) -> None

Exit the mock sandbox context (no-op).

Source code in src/agenticapi/testing/mocks.py
async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
    """Exit the mock sandbox context (no-op)."""

Assertions

assert_code_safe

assert_code_safe(
    code: str, *, denied_modules: list[str] | None = None
) -> None

Assert that code passes static safety analysis.

Runs AST-based static analysis on the provided code and raises AssertionError if any safety violations with severity "error" are found.

Parameters:

code (str, required): Python source code to check.

denied_modules (list[str] | None, default None): Optional list of denied module names.

Raises:

AssertionError: If the code has safety violations.

Source code in src/agenticapi/testing/assertions.py
def assert_code_safe(
    code: str,
    *,
    denied_modules: list[str] | None = None,
) -> None:
    """Assert that code passes static safety analysis.

    Runs AST-based static analysis on the provided code and raises
    AssertionError if any safety violations with severity "error"
    are found.

    Args:
        code: Python source code to check.
        denied_modules: Optional list of denied module names.

    Raises:
        AssertionError: If the code has safety violations.
    """
    result = check_code_safety(
        code,
        denied_modules=denied_modules,
        deny_eval_exec=True,
        deny_dynamic_import=True,
    )

    if not result.safe:
        violation_details = "; ".join(
            f"[{v.rule}] {v.description} (line {v.line})" for v in result.violations if v.severity == "error"
        )
        msg = f"Code safety check failed: {violation_details}"
        raise AssertionError(msg)
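
A minimal usage sketch, assuming that an eval() call is reported at error severity by the underlying checker (deny_eval_exec is always enabled here):

import pytest

from agenticapi.testing.assertions import assert_code_safe


def test_assert_code_safe() -> None:
    assert_code_safe("total = sum(range(10))")  # no flagged constructs

    with pytest.raises(AssertionError):
        assert_code_safe("eval('2 + 2')")  # assumed to be an error-severity violation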

assert_policy_enforced

assert_policy_enforced(
    code: str, policies: list[Policy]
) -> None

Assert that all policies allow the code.

Evaluates the code against each policy. If any policy denies the code, raises AssertionError with violation details.

Parameters:

code (str, required): Python source code to evaluate.

policies (list[Policy], required): List of policies to check against.

Raises:

AssertionError: If any policy denies the code.

Source code in src/agenticapi/testing/assertions.py
def assert_policy_enforced(
    code: str,
    policies: list[Policy],
) -> None:
    """Assert that all policies allow the code.

    Evaluates the code against each policy. If any policy denies the
    code, raises AssertionError with violation details.

    Args:
        code: Python source code to evaluate.
        policies: List of policies to check against.

    Raises:
        AssertionError: If any policy denies the code.
    """
    evaluator = PolicyEvaluator(policies=policies)

    try:
        evaluator.evaluate(code=code)
    except PolicyViolation as exc:
        msg = f"Policy enforcement failed: {exc}"
        raise AssertionError(msg) from exc
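
A usage sketch; the CodePolicy import path is an assumption (CodePolicy itself appears in the fixtures example below), as is the rule that importing a denied module is rejected:

import pytest

from agenticapi.policies import CodePolicy  # assumed module path
from agenticapi.testing.assertions import assert_policy_enforced


def test_policy_enforced() -> None:
    policies = [CodePolicy(denied_modules=["os"])]

    assert_policy_enforced("x = 1 + 1", policies)  # nothing denied, no error

    with pytest.raises(AssertionError):
        assert_policy_enforced("import os", policies)  # denied module, assumed to be rejected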

assert_intent_parsed

assert_intent_parsed(
    raw: str, expected_action: IntentAction
) -> None

Assert that a raw intent string parses to the expected action.

Uses the keyword-based parser (no LLM) for deterministic testing.

Parameters:

raw (str, required): The raw natural language intent string.

expected_action (IntentAction, required): The expected IntentAction after parsing.

Raises:

AssertionError: If the parsed action does not match expected.

Source code in src/agenticapi/testing/assertions.py
def assert_intent_parsed(
    raw: str,
    expected_action: IntentAction,
) -> None:
    """Assert that a raw intent string parses to the expected action.

    Uses the keyword-based parser (no LLM) for deterministic testing.

    Args:
        raw: The raw natural language intent string.
        expected_action: The expected IntentAction after parsing.

    Raises:
        AssertionError: If the parsed action does not match expected.
    """
    parser = IntentParser()
    # Use the sync keyword parser directly for deterministic assertion
    intent = parser._parse_with_keywords(raw, {})

    if intent.action != expected_action:
        msg = f"Intent action mismatch: expected {expected_action!r}, got {intent.action!r} for input {raw!r}"
        raise AssertionError(msg)
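
A usage sketch; IntentAction.QUERY and the import paths are hypothetical placeholders, since the enum's members are not documented here:

from agenticapi.intents import IntentAction  # assumed module path
from agenticapi.testing.assertions import assert_intent_parsed


def test_intent_parsed() -> None:
    # Hypothetical member: substitute an action the keyword parser actually emits.
    assert_intent_parsed("show me all orders", expected_action=IntentAction.QUERY)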

Fixtures

create_test_app

create_test_app(
    *,
    policies: list[Policy] | None = None,
    llm_responses: list[str] | None = None,
    title: str = "TestApp",
) -> AgenticApp

Create an AgenticApp configured for testing.

Builds an app with optional mock LLM backend and harness engine. Useful for integration tests that need a fully wired application.

Parameters:

policies (list[Policy] | None, default None): Optional list of policies for the harness engine.

llm_responses (list[str] | None, default None): Optional list of LLM response strings. If provided, a MockBackend is created and a HarnessEngine is configured.

title (str, default 'TestApp'): Application title.

Returns:

AgenticApp: A configured AgenticApp ready for testing.

Example

app = create_test_app(
    policies=[CodePolicy(denied_modules=["os"])],
    llm_responses=["SELECT COUNT(*) FROM orders"],
)

@app.agent_endpoint(name="test")
async def test_agent(intent, context):
    return {"result": "ok"}

response = await app.process_intent("show orders")

Source code in src/agenticapi/testing/fixtures.py
def create_test_app(
    *,
    policies: list[Policy] | None = None,
    llm_responses: list[str] | None = None,
    title: str = "TestApp",
) -> AgenticApp:
    """Create an AgenticApp configured for testing.

    Builds an app with optional mock LLM backend and harness engine.
    Useful for integration tests that need a fully wired application.

    Args:
        policies: Optional list of policies for the harness engine.
        llm_responses: Optional list of LLM response strings. If provided,
            a MockBackend is created and a HarnessEngine is configured.
        title: Application title (default "TestApp").

    Returns:
        A configured AgenticApp ready for testing.

    Example:
        app = create_test_app(
            policies=[CodePolicy(denied_modules=["os"])],
            llm_responses=["SELECT COUNT(*) FROM orders"],
        )

        @app.agent_endpoint(name="test")
        async def test_agent(intent, context):
            return {"result": "ok"}

        response = await app.process_intent("show orders")
    """
    llm: MockBackend | None = None
    harness: HarnessEngine | None = None

    if llm_responses is not None:
        llm = MockBackend(responses=llm_responses)

    if policies is not None or llm is not None:
        harness = HarnessEngine(policies=policies)

    return AgenticApp(
        title=title,
        harness=harness,
        llm=llm,  # type: ignore[arg-type]  # MockBackend satisfies LLMBackend Protocol
    )
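
An integration-style sketch, assuming pytest with an asyncio plugin; the CodePolicy import path and the shape of the returned response object are assumptions:

import pytest

from agenticapi.policies import CodePolicy  # assumed module path
from agenticapi.testing.fixtures import create_test_app


@pytest.mark.asyncio
async def test_app_processes_intent() -> None:
    app = create_test_app(
        policies=[CodePolicy(denied_modules=["os"])],
        llm_responses=["SELECT COUNT(*) FROM orders"],
    )

    @app.agent_endpoint(name="orders")
    async def orders_agent(intent, context):
        return {"result": "ok"}

    response = await app.process_intent("show orders")
    assert response is not None  # the response shape is not documented here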

Benchmarks

BenchmarkRunner

Lightweight benchmark runner for performance measurement.

Measures execution time of synchronous functions and stores results for subsequent assertion against targets.

Example

runner = BenchmarkRunner()
result = runner.run("intent_parse", fn=parser.parse, iterations=100)
runner.assert_within_target("intent_parse", target_ms=50.0)

Source code in src/agenticapi/testing/benchmark.py
class BenchmarkRunner:
    """Lightweight benchmark runner for performance measurement.

    Measures execution time of synchronous functions and stores
    results for subsequent assertion against targets.

    Example:
        runner = BenchmarkRunner()
        result = runner.run("intent_parse", fn=parser.parse, iterations=100)
        runner.assert_within_target("intent_parse", target_ms=50.0)
    """

    def __init__(self) -> None:
        """Initialize the benchmark runner."""
        self._results: dict[str, BenchmarkResult] = {}

    @property
    def results(self) -> dict[str, BenchmarkResult]:
        """All stored benchmark results."""
        return dict(self._results)

    def run(
        self,
        name: str,
        fn: Callable[..., Any],
        *,
        args: tuple[Any, ...] = (),
        kwargs: dict[str, Any] | None = None,
        iterations: int = 100,
        warmup: int = 5,
    ) -> BenchmarkResult:
        """Run a synchronous benchmark.

        Args:
            name: Name for this benchmark.
            fn: The function to benchmark.
            args: Positional arguments for fn.
            kwargs: Keyword arguments for fn.
            iterations: Number of iterations to measure.
            warmup: Number of warmup iterations (not measured).

        Returns:
            BenchmarkResult with timing statistics.
        """
        kw = kwargs or {}

        # Warmup
        for _ in range(warmup):
            fn(*args, **kw)

        # Measure
        times_ms: list[float] = []
        total_start = time.perf_counter_ns()

        for _ in range(iterations):
            start = time.perf_counter_ns()
            fn(*args, **kw)
            elapsed_ns = time.perf_counter_ns() - start
            times_ms.append(elapsed_ns / 1_000_000)

        total_elapsed_ns = time.perf_counter_ns() - total_start
        total_ms = total_elapsed_ns / 1_000_000

        result = BenchmarkResult(
            name=name,
            iterations=iterations,
            total_ms=total_ms,
            mean_ms=sum(times_ms) / len(times_ms),
            min_ms=min(times_ms),
            max_ms=max(times_ms),
        )

        self._results[name] = result

        logger.info(
            "benchmark_complete",
            name=name,
            iterations=iterations,
            mean_ms=f"{result.mean_ms:.3f}",
            min_ms=f"{result.min_ms:.3f}",
            max_ms=f"{result.max_ms:.3f}",
        )

        return result

    def assert_within_target(self, name: str, *, target_ms: float) -> None:
        """Assert that a benchmark's mean time is within target.

        Args:
            name: The benchmark name to check.
            target_ms: Maximum allowed mean time in milliseconds.

        Raises:
            AssertionError: If the mean time exceeds the target.
            KeyError: If no result exists for the given name.
        """
        result = self._results.get(name)
        if result is None:
            raise KeyError(f"No benchmark result for '{name}'. Run the benchmark first.")

        assert result.mean_ms <= target_ms, (
            f"Benchmark '{name}' mean {result.mean_ms:.3f}ms exceeds target {target_ms:.1f}ms"
        )
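
An end-to-end sketch of the documented flow; the function under test and the 5 ms target are arbitrary examples:

from agenticapi.testing.benchmark import BenchmarkRunner


def parse_numbers() -> list[int]:
    return [int(s) for s in "1,2,3,4,5".split(",")]


runner = BenchmarkRunner()
runner.run("parse_numbers", parse_numbers, iterations=200, warmup=10)

print(runner.results["parse_numbers"].mean_ms)
runner.assert_within_target("parse_numbers", target_ms=5.0)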

results property

results: dict[str, BenchmarkResult]

All stored benchmark results.

__init__

__init__() -> None

Initialize the benchmark runner.

Source code in src/agenticapi/testing/benchmark.py
def __init__(self) -> None:
    """Initialize the benchmark runner."""
    self._results: dict[str, BenchmarkResult] = {}

run

run(
    name: str,
    fn: Callable[..., Any],
    *,
    args: tuple[Any, ...] = (),
    kwargs: dict[str, Any] | None = None,
    iterations: int = 100,
    warmup: int = 5,
) -> BenchmarkResult

Run a synchronous benchmark.

Parameters:

name (str, required): Name for this benchmark.

fn (Callable[..., Any], required): The function to benchmark.

args (tuple[Any, ...], default ()): Positional arguments for fn.

kwargs (dict[str, Any] | None, default None): Keyword arguments for fn.

iterations (int, default 100): Number of iterations to measure.

warmup (int, default 5): Number of warmup iterations (not measured).

Returns:

BenchmarkResult with timing statistics.

Source code in src/agenticapi/testing/benchmark.py
def run(
    self,
    name: str,
    fn: Callable[..., Any],
    *,
    args: tuple[Any, ...] = (),
    kwargs: dict[str, Any] | None = None,
    iterations: int = 100,
    warmup: int = 5,
) -> BenchmarkResult:
    """Run a synchronous benchmark.

    Args:
        name: Name for this benchmark.
        fn: The function to benchmark.
        args: Positional arguments for fn.
        kwargs: Keyword arguments for fn.
        iterations: Number of iterations to measure.
        warmup: Number of warmup iterations (not measured).

    Returns:
        BenchmarkResult with timing statistics.
    """
    kw = kwargs or {}

    # Warmup
    for _ in range(warmup):
        fn(*args, **kw)

    # Measure
    times_ms: list[float] = []
    total_start = time.perf_counter_ns()

    for _ in range(iterations):
        start = time.perf_counter_ns()
        fn(*args, **kw)
        elapsed_ns = time.perf_counter_ns() - start
        times_ms.append(elapsed_ns / 1_000_000)

    total_elapsed_ns = time.perf_counter_ns() - total_start
    total_ms = total_elapsed_ns / 1_000_000

    result = BenchmarkResult(
        name=name,
        iterations=iterations,
        total_ms=total_ms,
        mean_ms=sum(times_ms) / len(times_ms),
        min_ms=min(times_ms),
        max_ms=max(times_ms),
    )

    self._results[name] = result

    logger.info(
        "benchmark_complete",
        name=name,
        iterations=iterations,
        mean_ms=f"{result.mean_ms:.3f}",
        min_ms=f"{result.min_ms:.3f}",
        max_ms=f"{result.max_ms:.3f}",
    )

    return result

assert_within_target

assert_within_target(
    name: str, *, target_ms: float
) -> None

Assert that a benchmark's mean time is within target.

Parameters:

name (str, required): The benchmark name to check.

target_ms (float, required): Maximum allowed mean time in milliseconds.

Raises:

AssertionError: If the mean time exceeds the target.

KeyError: If no result exists for the given name.

Source code in src/agenticapi/testing/benchmark.py
def assert_within_target(self, name: str, *, target_ms: float) -> None:
    """Assert that a benchmark's mean time is within target.

    Args:
        name: The benchmark name to check.
        target_ms: Maximum allowed mean time in milliseconds.

    Raises:
        AssertionError: If the mean time exceeds the target.
        KeyError: If no result exists for the given name.
    """
    result = self._results.get(name)
    if result is None:
        raise KeyError(f"No benchmark result for '{name}'. Run the benchmark first.")

    assert result.mean_ms <= target_ms, (
        f"Benchmark '{name}' mean {result.mean_ms:.3f}ms exceeds target {target_ms:.1f}ms"
    )

BenchmarkResult dataclass

Result of a benchmark run.

Attributes:

name (str): Name of the benchmark.

iterations (int): Number of iterations run.

total_ms (float): Total time in milliseconds.

mean_ms (float): Mean time per iteration in milliseconds.

min_ms (float): Minimum time in milliseconds.

max_ms (float): Maximum time in milliseconds.

Source code in src/agenticapi/testing/benchmark.py
@dataclass(frozen=True, slots=True)
class BenchmarkResult:
    """Result of a benchmark run.

    Attributes:
        name: Name of the benchmark.
        iterations: Number of iterations run.
        total_ms: Total time in milliseconds.
        mean_ms: Mean time per iteration in milliseconds.
        min_ms: Minimum time in milliseconds.
        max_ms: Maximum time in milliseconds.
    """

    name: str
    iterations: int
    total_ms: float
    mean_ms: float
    min_ms: float
    max_ms: float
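
Because the dataclass is frozen, instances can also be built directly when stubbing benchmark data in tests. A sketch with invented numbers:

from agenticapi.testing.benchmark import BenchmarkResult

result = BenchmarkResult(
    name="intent_parse",
    iterations=100,
    total_ms=120.0,
    mean_ms=1.2,  # roughly total_ms / iterations
    min_ms=0.9,
    max_ms=3.4,
)
assert result.min_ms <= result.mean_ms <= result.max_ms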