39 lines
1.4 KiB
Python
39 lines
1.4 KiB
Python
from __future__ import annotations
|
|
|
|
from memabra.benchmarks import BenchmarkSuite, BenchmarkTask, save_benchmark_suite, load_benchmark_suite, default_benchmark_suite
|
|
|
|
|
|
def test_benchmark_suite_roundtrip(tmp_path):
    """Save a two-task suite to disk, load it back, and compare the fields.

    The second task is built without a ``user_id``; after the round trip
    it is expected to read back as ``None``.
    """
    target = tmp_path / "suite.json"
    original = BenchmarkSuite(
        name="test-suite",
        tasks=[
            BenchmarkTask(user_input="Hello", channel="local", user_id="u1"),
            BenchmarkTask(user_input="World", channel="telegram"),
        ],
    )

    save_benchmark_suite(original, target)
    loaded = load_benchmark_suite(target)

    # Suite-level fields.
    assert loaded.name == "test-suite"
    assert len(loaded.tasks) == 2

    # First task: every field was set explicitly.
    first = loaded.tasks[0]
    assert first.user_input == "Hello"
    assert first.channel == "local"
    assert first.user_id == "u1"

    # Second task: user_id was never supplied.
    second = loaded.tasks[1]
    assert second.user_input == "World"
    assert second.channel == "telegram"
    assert second.user_id is None
|
|
|
|
|
|
def test_default_benchmark_suite_covers_expected_categories():
    """Check the default suite's name, minimum size, and keyword coverage.

    For each keyword pair below, at least one task's lower-cased
    ``user_input`` must contain one of the two keywords.
    """
    suite = default_benchmark_suite()

    assert suite.name == "default"
    assert len(suite.tasks) >= 4

    lowered = [task.user_input.lower() for task in suite.tasks]
    keyword_pairs = (
        ("memory", "preference"),
        ("skill", "deploy"),
        ("tool", "status"),
        ("composite", "multiple"),
    )
    for primary, alternate in keyword_pairs:
        assert any(primary in text or alternate in text for text in lowered)
|