Scorers¶
Evalcraft provides a set of assertion functions ("scorers") for evaluating cassettes and agent runs. Each function returns an AssertionResult rather than raising immediately, so you can collect all failures before reporting.
All scorers accept either a Cassette or an AgentRun as their first argument.
from evalcraft import (
assert_tool_called,
assert_tool_order,
assert_no_tool_called,
assert_output_contains,
assert_output_matches,
assert_cost_under,
assert_latency_under,
assert_token_count_under,
)
from evalcraft.eval.scorers import Evaluator
AssertionResult¶
Every scorer returns an AssertionResult:
result = assert_tool_called(cassette, "web_search")
print(result.passed) # True or False
print(result.name) # "assert_tool_called(web_search)"
print(result.expected) # "web_search"
print(result.actual) # ["web_search", "summarize"]
print(result.message) # "" if passed, error description if failed
In pytest, use assert result.passed, result.message to get descriptive failures:
result = assert_tool_called(run, "web_search")
assert result.passed, result.message
# If failed: "Tool 'web_search' was never called. Called tools: ['search', 'analyze']"
Tool assertions¶
assert_tool_called(cassette, tool_name, ...)¶
Assert that a specific tool was called.
from evalcraft import assert_tool_called, replay
run = replay("tests/cassettes/agent.json")
# Basic: tool was called at least once
result = assert_tool_called(run, "web_search")
assert result.passed
# Called exactly N times
result = assert_tool_called(run, "web_search", times=2)
# Called with specific arguments
result = assert_tool_called(run, "web_search", with_args={"query": "Paris weather"})
# Called before another tool
result = assert_tool_called(run, "web_search", before="summarize")
# Called after another tool
result = assert_tool_called(run, "summarize", after="web_search")
| Parameter | Type | Description |
|---|---|---|
| `cassette` | `Cassette \| AgentRun` | The run to check |
| `tool_name` | `str` | Name of the tool |
| `times` | `int \| None` | Exact number of calls expected |
| `with_args` | `dict \| None` | Arguments that must have been passed |
| `before` | `str \| None` | Tool that should come AFTER this one |
| `after` | `str \| None` | Tool that should come BEFORE this one |
assert_tool_order(cassette, expected_order, strict=False)¶
Assert tools were called in a specific order.
# Non-strict: the tools must appear in order, but other tools can be in between
result = assert_tool_order(run, ["web_search", "summarize", "send_email"])
assert result.passed
# Strict: the sequence must match exactly
result = assert_tool_order(
run,
["web_search", "summarize", "send_email"],
strict=True,
)
| Parameter | Type | Description |
|---|---|---|
| `cassette` | `Cassette \| AgentRun` | The run to check |
| `expected_order` | `list[str]` | Expected tool names in order |
| `strict` | `bool` | If True, exact match required. If False, subsequence match. |
assert_no_tool_called(cassette, tool_name)¶
Assert a specific tool was NOT called.
result = assert_no_tool_called(run, "send_email")
assert result.passed, result.message
# If failed: "Tool 'send_email' was called 1 times, expected 0"
Output assertions¶
assert_output_contains(cassette, substring, case_sensitive=True)¶
Assert the agent's output contains a substring.
result = assert_output_contains(run, "Paris")
assert result.passed
# Case-insensitive
result = assert_output_contains(run, "paris", case_sensitive=False)
assert result.passed
assert_output_matches(cassette, pattern)¶
Assert the agent's output matches a regex pattern.
result = assert_output_matches(run, r"\d+°C")
assert result.passed, result.message
# If failed: "Output does not match pattern '\\d+°C'"
result = assert_output_matches(run, r"(sunny|cloudy|rainy)")
assert result.passed
Cost and performance assertions¶
assert_cost_under(cassette, max_usd)¶
Assert the total estimated cost is under a threshold.
result = assert_cost_under(run, max_usd=0.05)
assert result.passed
# If failed: "Cost $0.0823 exceeds limit $0.0500"
Note
Cost is only available if recorded during capture (e.g., via record_llm_call(cost_usd=...) or an adapter that estimates cost).
assert_latency_under(cassette, max_ms)¶
Assert total wall-clock time is under a threshold (in milliseconds).
result = assert_latency_under(run, max_ms=5000)
assert result.passed
# If failed: "Latency 6234.0ms exceeds limit 5000.0ms"
assert_token_count_under(cassette, max_tokens)¶
Assert total token count is under a threshold.
result = assert_token_count_under(run, max_tokens=2000)
assert result.passed
# If failed: "Token count 2341 exceeds limit 2000"
Evaluator¶
The Evaluator class lets you compose multiple assertions into a single evaluation.
from evalcraft.eval.scorers import Evaluator
from evalcraft import (
assert_tool_called,
assert_tool_order,
assert_cost_under,
assert_token_count_under,
assert_output_contains,
replay,
)
run = replay("tests/cassettes/agent.json")
evaluator = Evaluator()
evaluator.add(assert_tool_called, run, "web_search")
evaluator.add(assert_tool_order, run, ["web_search", "summarize"])
evaluator.add(assert_cost_under, run, max_usd=0.05)
evaluator.add(assert_token_count_under, run, max_tokens=2000)
evaluator.add(assert_output_contains, run, "summary")
result = evaluator.run()
print(result.passed) # True if all pass
print(result.score) # e.g. 0.8 = 4/5 passed
print(len(result.assertions)) # 5
for assertion in result.assertions:
status = "PASS" if assertion.passed else "FAIL"
print(f"[{status}] {assertion.name}")
if not assertion.passed:
print(f" {assertion.message}")
Evaluator.add(assertion_fn, *args, **kwargs)¶
Add an assertion to run.
Returns self for chaining.
Evaluator.run()¶
Run all assertions and return an EvalResult.
EvalResult fields¶
| Field | Type | Description |
|---|---|---|
| `passed` | `bool` | True if all assertions passed |
| `score` | `float` | Fraction of assertions that passed (0.0–1.0) |
| `assertions` | `list[AssertionResult]` | Individual assertion results |
| `failed_assertions` | `list[AssertionResult]` | Only failed assertions |
result = evaluator.run()
if not result.passed:
for a in result.failed_assertions:
print(f"FAIL: {a.name} — {a.message}")
Complete example¶
import pytest
from evalcraft import replay
from evalcraft import (
assert_tool_called,
assert_tool_order,
assert_no_tool_called,
assert_output_contains,
assert_cost_under,
assert_token_count_under,
assert_latency_under,
)
from evalcraft.eval.scorers import Evaluator
@pytest.fixture
def run():
return replay("tests/cassettes/research_agent.json")
def test_searches_before_summarizing(run):
result = assert_tool_called(run, "web_search", before="summarize")
assert result.passed, result.message
def test_no_email_sent(run):
result = assert_no_tool_called(run, "send_email")
assert result.passed, result.message
def test_output_is_informative(run):
result = assert_output_contains(run, "summary", case_sensitive=False)
assert result.passed, result.message
def test_budget(run):
evaluator = Evaluator()
evaluator.add(assert_cost_under, run, max_usd=0.10)
evaluator.add(assert_token_count_under, run, max_tokens=3000)
evaluator.add(assert_latency_under, run, max_ms=10_000)
result = evaluator.run()
assert result.passed, "\n".join(
f"{a.name}: {a.message}" for a in result.failed_assertions
)