55 lines
2.1 KiB
Python
55 lines
2.1 KiB
Python
from memabra.app import build_demo_app
|
|
from memabra.evaluator import BenchmarkTask, Evaluator
|
|
|
|
|
|
def test_evaluator_runs_benchmark_and_reports_metrics(tmp_path):
    """Run a two-task benchmark through ``Evaluator`` and check its metrics.

    The demo app writes under a per-test ``demo-artifacts`` directory inside
    ``tmp_path``.
    """
    demo_app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    runner = Evaluator(demo_app)
    prompts = (
        "Use my telegram preference.",
        "Check the current system status.",
    )
    benchmark = [BenchmarkTask(user_input=prompt) for prompt in prompts]

    report = runner.run(benchmark)

    # One result per submitted task, with a non-negative average reward.
    assert report.task_count == 2
    assert report.avg_reward >= 0.0
    # NOTE(review): presumably the first prompt routes to memory injection and
    # the second to a tool call -- confirm against the demo router rules.
    for decision in ("inject_memory", "call_tool"):
        assert decision in report.decision_distribution
    assert report.error_rate == 0.0
|
|
|
|
|
|
def test_evaluator_ab_compares_two_routers(tmp_path):
    """Run one benchmark under two router arms and check the comparison result.

    Both arms use a fresh ``RuleBasedRouter``, so this only verifies that
    ``Evaluator.compare`` yields a ``winner`` label and the reward and
    error-rate delta keys; it does not check that one router beats the other.
    """
    from memabra.router import RuleBasedRouter

    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    evaluator = Evaluator(app)
    tasks = [
        BenchmarkTask(user_input="Use my telegram preference."),
        BenchmarkTask(user_input="Check the current system status."),
    ]

    baseline = evaluator.run(tasks, router=RuleBasedRouter())
    # Using same router for both arms in this test; real tests would compare
    # different routers.
    challenger = evaluator.run(tasks, router=RuleBasedRouter())
    comparison = evaluator.compare(baseline, challenger)

    assert comparison["winner"] in ("baseline", "challenger", "tie")
    assert "avg_reward_delta" in comparison
    assert "error_rate_delta" in comparison
|
|
|
|
|
|
def test_app_trains_learning_router_from_artifact_index(tmp_path):
    """Train a learning router on the demo app after two seed runs.

    Checks that ``train_learning_router`` returns a ``SimpleLearningRouter``
    and that a follow-up task still produces a non-negative total reward.
    """
    from memabra.router import SimpleLearningRouter

    demo_app = build_demo_app(base_dir=tmp_path / "demo-artifacts")

    # Seed runs to generate training data.
    # NOTE(review): presumably run_task records the artifacts that
    # train_learning_router later reads -- confirm against the app.
    seed_prompts = (
        "Use my telegram preference.",
        "Check the current system status.",
    )
    for prompt in seed_prompts:
        demo_app.run_task(prompt, channel="local")

    trained = demo_app.train_learning_router()
    assert isinstance(trained, SimpleLearningRouter)

    # After training, the router should be able to make predictions (not fall
    # back to clarify for known patterns); only the reward floor is asserted.
    followup = demo_app.run_task("Use my telegram preference.", channel="local")
    total_reward = followup["reward"]["total"]
    assert total_reward >= 0.0
|