"""Tests for the memabra ``Evaluator`` and for learning-router training on the demo app."""

from memabra.app import build_demo_app
from memabra.evaluator import BenchmarkTask, Evaluator

# Prompts shared by every test in this module.
_PREFERENCE_PROMPT = "Use my telegram preference."
_STATUS_PROMPT = "Check the current system status."


def _demo_tasks():
    """Return a fresh list holding one ``BenchmarkTask`` per shared prompt.

    A new list is built on every call so that one test cannot leak task
    state into another.
    """
    return [
        BenchmarkTask(user_input=_PREFERENCE_PROMPT),
        BenchmarkTask(user_input=_STATUS_PROMPT),
    ]


def test_evaluator_runs_benchmark_and_reports_metrics(tmp_path):
    """Run the two demo tasks and check the aggregate metrics on the result."""
    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    evaluator = Evaluator(app)

    result = evaluator.run(_demo_tasks())

    assert result.task_count == 2
    assert result.avg_reward >= 0.0
    assert "inject_memory" in result.decision_distribution
    assert "call_tool" in result.decision_distribution
    assert result.error_rate == 0.0


def test_evaluator_ab_compares_two_routers(tmp_path):
    """Compare two evaluation runs and check the shape of the comparison dict."""
    from memabra.router import RuleBasedRouter

    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    evaluator = Evaluator(app)
    tasks = _demo_tasks()

    baseline = evaluator.run(tasks, router=RuleBasedRouter())
    # Using same router for both arms in this test; real tests would compare
    # different routers.
    challenger = evaluator.run(tasks, router=RuleBasedRouter())
    comparison = evaluator.compare(baseline, challenger)

    assert comparison["winner"] in ("baseline", "challenger", "tie")
    assert "avg_reward_delta" in comparison
    assert "error_rate_delta" in comparison


def test_app_trains_learning_router_from_artifact_index(tmp_path):
    """Train a learning router from previously recorded runs and re-run a task."""
    from memabra.router import SimpleLearningRouter

    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")

    # Generate some training data.
    app.run_task(_PREFERENCE_PROMPT, channel="local")
    app.run_task(_STATUS_PROMPT, channel="local")

    router = app.train_learning_router()
    assert isinstance(router, SimpleLearningRouter)

    # After training, re-running a known prompt must still yield a
    # non-negative total reward. NOTE(review): the intent is that the router
    # no longer falls back to "clarify" for known patterns, but that is not
    # asserted here -- only the reward is checked.
    trajectory = app.run_task(_PREFERENCE_PROMPT, channel="local")
    assert trajectory["reward"]["total"] >= 0.0