Initial standalone memabra release

2026-04-15 11:06:05 +08:00
commit 58f9f221b1
464 changed files with 30256 additions and 0 deletions
--- a/tests/test_evaluator.py
+++ b/tests/test_evaluator.py
@@ -0,0 +1,54 @@
+from memabra.app import build_demo_app
+from memabra.evaluator import BenchmarkTask, Evaluator
+
+
+def test_evaluator_runs_benchmark_and_reports_metrics(tmp_path):
+    """Run a two-task benchmark on the demo app and check the reported metrics."""
+    demo = build_demo_app(base_dir=tmp_path / "demo-artifacts")
+    prompts = ("Use my telegram preference.", "Check the current system status.")
+    benchmark = [BenchmarkTask(user_input=text) for text in prompts]
+    report = Evaluator(demo).run(benchmark)
+
+    # Two tasks went in, so two must be counted; the mean reward must not be negative.
+    assert report.task_count == 2
+    assert report.avg_reward >= 0.0
+    # Both entries must be present in the decision distribution.
+    assert "inject_memory" in report.decision_distribution
+    assert "call_tool" in report.decision_distribution
+    assert report.error_rate == 0.0
+
+
+def test_evaluator_ab_compares_two_routers(tmp_path):
+    """Evaluate two arms on the same tasks and check the comparison's fields."""
+    from memabra.router import RuleBasedRouter, TaskContext
+
+    demo = build_demo_app(base_dir=tmp_path / "demo-artifacts")
+    harness = Evaluator(demo)
+    prompts = ("Use my telegram preference.", "Check the current system status.")
+    benchmark = [BenchmarkTask(user_input=text) for text in prompts]
+
+    # Both arms get a fresh RuleBasedRouter; a real A/B test would use different routers.
+    control = harness.run(benchmark, router=RuleBasedRouter())
+    candidate = harness.run(benchmark, router=RuleBasedRouter())
+    verdict = harness.compare(control, candidate)
+
+    assert verdict["winner"] in ("baseline", "challenger", "tie")
+    # The comparison must expose both delta entries.
+    assert "avg_reward_delta" in verdict
+    assert "error_rate_delta" in verdict
+
+
+def test_app_trains_learning_router_from_artifact_index(tmp_path):
+    """Train a learning router after two demo runs, then run one more task."""
+    from memabra.router import SimpleLearningRouter
+
+    demo = build_demo_app(base_dir=tmp_path / "demo-artifacts")
+    # Run two tasks up front; presumably these produce the data training reads.
+    for prompt in ("Use my telegram preference.", "Check the current system status."):
+        demo.run_task(prompt, channel="local")
+
+    trained = demo.train_learning_router()
+    assert isinstance(trained, SimpleLearningRouter)
+    # Only verifies that a post-training run reports a non-negative total reward.
+    followup = demo.run_task("Use my telegram preference.", channel="local")
+    assert followup["reward"]["total"] >= 0.0