Initial standalone memabra release
This commit is contained in:
54
tests/test_evaluator.py
Normal file
54
tests/test_evaluator.py
Normal file
@@ -0,0 +1,54 @@
|
||||
from memabra.app import build_demo_app
|
||||
from memabra.evaluator import BenchmarkTask, Evaluator
|
||||
|
||||
|
||||
def test_evaluator_runs_benchmark_and_reports_metrics(tmp_path):
    """Run a two-task benchmark on the demo app and check the reported metrics."""
    prompts = (
        "Use my telegram preference.",
        "Check the current system status.",
    )
    demo_app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    runner = Evaluator(demo_app)
    benchmark = [BenchmarkTask(user_input=text) for text in prompts]

    report = runner.run(benchmark)

    assert report.task_count == 2
    assert report.avg_reward >= 0.0
    # Both decision labels must be present in the reported distribution.
    for decision in ("inject_memory", "call_tool"):
        assert decision in report.decision_distribution
    assert report.error_rate == 0.0
|
||||
|
||||
|
||||
def test_evaluator_ab_compares_two_routers(tmp_path):
    """Run the same benchmark under two router arms and check the comparison.

    Both arms use a fresh ``RuleBasedRouter``, so this only verifies that
    ``Evaluator.compare`` returns a result with the expected keys and a
    valid ``winner`` label, not that one router beats the other.
    """
    # Only RuleBasedRouter is needed here; the previously imported
    # TaskContext was never used in this test.
    from memabra.router import RuleBasedRouter

    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    evaluator = Evaluator(app)
    tasks = [
        BenchmarkTask(user_input="Use my telegram preference."),
        BenchmarkTask(user_input="Check the current system status."),
    ]

    baseline = evaluator.run(tasks, router=RuleBasedRouter())
    # Using same router for both arms in this test; real tests would compare different routers
    challenger = evaluator.run(tasks, router=RuleBasedRouter())
    comparison = evaluator.compare(baseline, challenger)

    assert comparison["winner"] in ("baseline", "challenger", "tie")
    assert "avg_reward_delta" in comparison
    assert "error_rate_delta" in comparison
|
||||
|
||||
|
||||
def test_app_trains_learning_router_from_artifact_index(tmp_path):
    """Seed the demo app with two tasks, train a router, then run a task again."""
    from memabra.router import SimpleLearningRouter

    demo_app = build_demo_app(base_dir=tmp_path / "demo-artifacts")

    # Run a couple of tasks first so there is data to train on.
    seed_prompts = (
        "Use my telegram preference.",
        "Check the current system status.",
    )
    for prompt in seed_prompts:
        demo_app.run_task(prompt, channel="local")

    trained = demo_app.train_learning_router()
    assert isinstance(trained, SimpleLearningRouter)

    # Re-run a prompt that was part of the seed data after training.
    # NOTE(review): only the reward total is asserted here; the routing
    # decision itself (e.g. not falling back to clarify) is not checked.
    outcome = demo_app.run_task("Use my telegram preference.", channel="local")
    assert outcome["reward"]["total"] >= 0.0
|
||||
Reference in New Issue
Block a user