Files
memabra/tests/test_evaluator.py
2026-04-15 11:06:05 +08:00

55 lines
2.1 KiB
Python

from memabra.app import build_demo_app
from memabra.evaluator import BenchmarkTask, Evaluator
def test_evaluator_runs_benchmark_and_reports_metrics(tmp_path):
    """Run a two-task benchmark on the demo app and check the reported metrics.

    The demo app writes under ``tmp_path / "demo-artifacts"``. The result must
    report two tasks, a non-negative average reward, both the
    ``inject_memory`` and ``call_tool`` decisions in its distribution, and an
    error rate of zero.
    """
    prompts = (
        "Use my telegram preference.",
        "Check the current system status.",
    )
    artifacts_dir = tmp_path / "demo-artifacts"
    runner = Evaluator(build_demo_app(base_dir=artifacts_dir))

    report = runner.run([BenchmarkTask(user_input=text) for text in prompts])

    assert report.task_count == 2
    assert report.avg_reward >= 0.0
    # Both decision kinds must show up somewhere across the two tasks.
    for decision in ("inject_memory", "call_tool"):
        assert decision in report.decision_distribution
    assert report.error_rate == 0.0
def test_evaluator_ab_compares_two_routers(tmp_path):
    """Run the same benchmark under two routers and check the comparison dict.

    Both arms are given a fresh ``RuleBasedRouter``, so this test only checks
    the shape of what ``Evaluator.compare`` returns (a ``winner`` label plus
    the ``avg_reward_delta`` and ``error_rate_delta`` keys); it does not check
    that either router outperforms the other.
    """
    # Only RuleBasedRouter is needed here; the unused TaskContext import was dropped.
    from memabra.router import RuleBasedRouter

    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    evaluator = Evaluator(app)
    tasks = [
        BenchmarkTask(user_input="Use my telegram preference."),
        BenchmarkTask(user_input="Check the current system status."),
    ]

    baseline = evaluator.run(tasks, router=RuleBasedRouter())
    # Same router type for both arms in this test; a real A/B comparison
    # would pass two different routers.
    challenger = evaluator.run(tasks, router=RuleBasedRouter())

    comparison = evaluator.compare(baseline, challenger)
    assert comparison["winner"] in ("baseline", "challenger", "tie")
    assert "avg_reward_delta" in comparison
    assert "error_rate_delta" in comparison
def test_app_trains_learning_router_from_artifact_index(tmp_path):
    """Run two tasks, train a learning router via the app, then run a task again.

    Checks that ``train_learning_router`` returns a ``SimpleLearningRouter``
    and that a task run afterwards yields a non-negative total reward.
    """
    from memabra.router import SimpleLearningRouter

    demo = build_demo_app(base_dir=tmp_path / "demo-artifacts")

    # Run a couple of tasks first so there is something to train on.
    warmup_prompts = (
        "Use my telegram preference.",
        "Check the current system status.",
    )
    for prompt in warmup_prompts:
        demo.run_task(prompt, channel="local")

    trained = demo.train_learning_router()
    assert isinstance(trained, SimpleLearningRouter)

    # Intent: after training, the router should make a real prediction for a
    # known pattern rather than falling back to clarify.
    # NOTE(review): only the reward total is asserted below; the routing
    # decision itself is not checked.
    outcome = demo.run_task("Use my telegram preference.", channel="local")
    total_reward = outcome["reward"]["total"]
    assert total_reward >= 0.0