198 lines
7.2 KiB
Python
198 lines
7.2 KiB
Python
from pathlib import Path
|
|
|
|
from memabra.app import MemabraApp, build_app_with_skills, build_demo_app
|
|
|
|
|
|
def test_build_demo_app_runs_task_and_produces_summary(tmp_path: Path):
    """Run one task through the demo app and check what it records.

    Checks the returned trajectory id prefix, the replay summary count, the
    presence of a ``memory_injected`` event, and that exactly one trajectory
    JSON file was written under the artifacts directory.
    """
    artifacts_root = tmp_path / "demo-artifacts"
    app = build_demo_app(base_dir=artifacts_root)

    trajectory = app.run_task(
        "Use my telegram preference for this answer.",
        channel="telegram",
        user_id="oza",
    )
    summary = app.replay_summary()

    assert trajectory["trajectory_id"].startswith("traj-")
    assert summary.trajectories == 1

    event_types = [event["event_type"] for event in trajectory["events"]]
    assert "memory_injected" in event_types

    persisted_files = list((artifacts_root / "trajectories").glob("*.json"))
    assert len(persisted_files) == 1
|
|
|
|
|
|
def test_app_can_run_tool_task_with_demo_backend(tmp_path: Path):
    """Run a status-check prompt and expect a successful tool call.

    The first decision must be ``call_tool``, a ``tool_result`` event must be
    present, and the outcome status must be ``success``.
    """
    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")

    result = app.run_task("Check the current system status.")

    first_decision = result["decisions"][0]
    assert first_decision["decision_type"] == "call_tool"

    event_types = [event["event_type"] for event in result["events"]]
    assert "tool_result" in event_types

    outcome = result["outcome"]
    assert outcome["status"] == "success"
|
|
|
|
|
|
def test_build_app_with_skills_loads_real_skill_from_filesystem(tmp_path: Path):
    """Build an app over an on-disk skill directory and load the skill back.

    A ``SKILL.md`` with front matter is written under
    ``tmp_path/skills/github-auth`` and the app is built with
    ``tmp_path/skills`` as its only skill search path. The test then runs a
    memory task and loads ``github-auth`` directly through the skill
    executor's backend.
    """
    skill_dir = tmp_path / "skills" / "github-auth"
    skill_dir.mkdir(parents=True)
    # Explicit encoding keeps the fixture file independent of the host locale.
    (skill_dir / "SKILL.md").write_text(
        "---\n"
        "name: github-auth\n"
        "description: Authenticate with GitHub.\n"
        "---\n\n"
        "# GitHub Auth\n\n"
        "Use git or gh.\n",
        encoding="utf-8",
    )

    app = build_app_with_skills(
        base_dir=tmp_path / "artifacts",
        skill_search_paths=[tmp_path / "skills"],
    )

    # github-auth is not in the candidate set by default, so router won't trigger it.
    # We test that the app builds and a memory task still works.
    trajectory = app.run_task(
        "Use my telegram preference for this answer.",
        channel="telegram",
        user_id="oza",
    )
    assert trajectory["decisions"][0]["decision_type"] == "inject_memory"

    # Now verify the skill backend is actually wired by loading directly
    backend = app.runner.execution_engine.skill_executor.backend
    payload = backend.load_skill("github-auth")
    assert payload["name"] == "github-auth"
    assert "Use git or gh." in payload["content"]
|
|
|
|
|
|
def test_app_artifact_index_queries_persisted_trajectories(tmp_path: Path):
    """Query the artifact index by channel and by decision type after two runs.

    One task is run on the ``telegram`` channel and one on ``local``. Each
    query must return exactly the matching trajectory, and slicing the dataset
    by the ``local`` channel must return one entry.
    """
    telegram_prompt = "Use my telegram preference for this answer."
    status_prompt = "Check the current system status."

    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    app.run_task(telegram_prompt, channel="telegram", user_id="u1")
    app.run_task(status_prompt, channel="local", user_id="u2")

    index = app.artifact_index()
    by_channel = index.query(channel="telegram")
    by_decision = index.query(decision_type="call_tool")

    assert len(by_channel) == 1
    assert by_channel[0]["task"]["input"] == telegram_prompt
    assert len(by_decision) == 1
    assert by_decision[0]["task"]["input"] == status_prompt

    local_slice = index.slice_dataset(channel="local")
    assert len(local_slice) == 1
|
|
|
|
|
|
def test_app_run_online_learning_cycle_returns_report(tmp_path: Path):
    """Run one online learning cycle and check the keys of its result.

    The result must contain ``skipped`` and ``report_id``, and must either
    contain ``promoted`` or have ``skipped`` set to ``True``.
    """
    from memabra.benchmarks import BenchmarkTask
    from memabra.promotion import PromotionPolicy

    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")

    # Seed trajectories
    for task_no in range(10):
        app.run_task(f"Task {task_no}")

    policy = PromotionPolicy(
        min_reward_delta=-1.0,
        max_error_rate_increase=1.0,
        max_latency_increase_ms=10000.0,
        required_task_count=1,
    )
    benchmark_tasks = [BenchmarkTask(user_input="Task 0")]

    result = app.run_online_learning_cycle(
        policy=policy,
        benchmark_tasks=benchmark_tasks,
        min_new_trajectories=1,
    )

    assert "skipped" in result
    has_promotion_verdict = "promoted" in result
    assert has_promotion_verdict or result["skipped"] is True
    assert "report_id" in result
|
|
|
|
|
|
def test_app_run_online_learning_cycle_uses_baseline_version(tmp_path: Path):
    """Run the learning cycle with a baseline router version from a store.

    A router with hand-set weights is saved as ``v-baseline`` under
    ``tmp_path/versions``, the app's router is replaced, and the cycle is
    pointed at that store and version id. The result must not be skipped and
    must contain both baseline and challenger metrics.
    """
    from memabra.benchmarks import BenchmarkTask
    from memabra.promotion import PromotionPolicy
    from memabra.router import SimpleLearningRouter
    from memabra.router_versioning import RouterVersionStore

    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    for task_no in range(10):
        app.run_task(f"Task {task_no}")

    # Save a baseline version
    baseline = SimpleLearningRouter()
    baseline._weights = {"call_tool": {"input_length": 0.99}}
    baseline._feature_keys = ["input_length"]
    versions_root = tmp_path / "versions"
    version_store = RouterVersionStore(base_dir=versions_root)
    version_store.save(baseline, version_id="v-baseline")

    # Change current router
    app.set_router(SimpleLearningRouter())

    policy = PromotionPolicy(
        min_reward_delta=-1.0,
        max_error_rate_increase=1.0,
        max_latency_increase_ms=10000.0,
        required_task_count=1,
    )
    benchmark_tasks = [BenchmarkTask(user_input="Task 0")]

    result = app.run_online_learning_cycle(
        policy=policy,
        benchmark_tasks=benchmark_tasks,
        min_new_trajectories=1,
        version_store_base_dir=versions_root,
        baseline_version_id="v-baseline",
    )

    assert result["skipped"] is False
    for metrics_key in ("baseline_metrics", "challenger_metrics"):
        assert metrics_key in result
|
|
|
|
|
|
def test_app_run_online_learning_cycle_rebuilds_case_index(tmp_path: Path):
    """Run the learning cycle with a case-index path and check the file.

    After the cycle, the result must not be skipped, the file at
    ``case_index_path`` must exist, and the index loaded from it must return
    a non-``None`` best entry for ``"Task 0"``.
    """
    from memabra.benchmarks import BenchmarkTask
    from memabra.case_index import CaseIndex
    from memabra.promotion import PromotionPolicy

    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    for task_no in range(10):
        app.run_task(f"Task {task_no}")

    case_index_path = tmp_path / "case-index.json"
    policy = PromotionPolicy(
        min_reward_delta=-1.0,
        max_error_rate_increase=1.0,
        max_latency_increase_ms=10000.0,
        required_task_count=1,
    )
    benchmark_tasks = [BenchmarkTask(user_input="Task 0")]

    result = app.run_online_learning_cycle(
        policy=policy,
        benchmark_tasks=benchmark_tasks,
        min_new_trajectories=1,
        case_index_path=case_index_path,
    )

    assert result["skipped"] is False
    assert case_index_path.exists()

    reloaded_index = CaseIndex.load(case_index_path)
    assert reloaded_index.best("Task 0") is not None
|
|
|
|
|
|
def test_app_build_case_index_from_trajectories(tmp_path: Path):
    """Build a case index after two runs of the same prompt.

    The same input is run for two different user ids; the built index must
    return a non-``None`` best entry for that input.
    """
    prompt = "Hello world"
    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")

    for user_id in ("u1", "u2"):
        app.run_task(prompt, channel="local", user_id=user_id)

    built_index = app.build_case_index()

    assert built_index.best(prompt) is not None
|
|
|
|
|
|
def test_app_save_and_load_case_index(tmp_path: Path):
    """Save a case index from one app and load it into a second app.

    The second app is built over the same ``base_dir``. After loading, its
    ``case_index`` must be set and must return a non-``None`` best entry for
    the prompt that was run.
    """
    artifacts_root = tmp_path / "demo-artifacts"
    index_file = tmp_path / "case-index.json"
    prompt = "Persist this case"

    writer_app = build_demo_app(base_dir=artifacts_root)
    writer_app.run_task(prompt, channel="local", user_id="u1")
    writer_app.build_case_index()
    writer_app.save_case_index(index_file)

    reader_app = build_demo_app(base_dir=artifacts_root)
    reader_app.load_case_index(index_file)

    assert reader_app.case_index is not None
    assert reader_app.case_index.best(prompt) is not None
|
|
|
|
|
|
def test_app_best_trajectory_for_input(tmp_path: Path):
    """Check that the best trajectory for an input is the one just run.

    After a single run and a case-index build, ``best_trajectory_for`` must
    return the ``trajectory_id`` of that run.
    """
    prompt = "Find the best trajectory"
    app = build_demo_app(base_dir=tmp_path / "demo-artifacts")

    recorded = app.run_task(prompt, channel="local", user_id="u1")
    app.build_case_index()

    best_id = app.best_trajectory_for(prompt)
    expected_id = recorded["trajectory_id"]

    assert best_id == expected_id
|