memabra/tests/test_evaluator.py

from memabra.app import build_demo_app
from memabra.evaluator import BenchmarkTask, Evaluator


def test_evaluator_runs_benchmark_and_reports_metrics(tmp_path):
    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    evaluator = Evaluator(app)
    tasks = [
        BenchmarkTask(user_input="Use my telegram preference."),
        BenchmarkTask(user_input="Check the current system status."),
    ]
    result = evaluator.run(tasks)

    assert result.task_count == 2
    assert result.avg_reward >= 0.0
    assert "inject_memory" in result.decision_distribution
    assert "call_tool" in result.decision_distribution
    assert result.error_rate == 0.0


def test_evaluator_ab_compares_two_routers(tmp_path):
    from memabra.router import RuleBasedRouter, TaskContext

    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    evaluator = Evaluator(app)
    tasks = [
        BenchmarkTask(user_input="Use my telegram preference."),
        BenchmarkTask(user_input="Check the current system status."),
    ]

    baseline = evaluator.run(tasks, router=RuleBasedRouter())
    # Using same router for both arms in this test; real tests would compare different routers
    challenger = evaluator.run(tasks, router=RuleBasedRouter())
    comparison = evaluator.compare(baseline, challenger)

    assert comparison["winner"] in ("baseline", "challenger", "tie")
    assert "avg_reward_delta" in comparison
    assert "error_rate_delta" in comparison


def test_app_trains_learning_router_from_artifact_index(tmp_path):
    from memabra.router import SimpleLearningRouter

    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    # Generate some training data
    app.run_task("Use my telegram preference.", channel="local")
    app.run_task("Check the current system status.", channel="local")

    router = app.train_learning_router()

    assert isinstance(router, SimpleLearningRouter)
    # After training, the router should be able to make predictions (not fallback to clarify for known patterns)
    trajectory = app.run_task("Use my telegram preference.", channel="local")
    assert trajectory["reward"]["total"] >= 0.0