# memabra/tests/test_app.py

from pathlib import Path

from memabra.app import MemabraApp, build_app_with_skills, build_demo_app


def test_build_demo_app_runs_task_and_produces_summary(tmp_path: Path):
    """Run one task on the demo app and check the trajectory, summary and persisted file.

    The test asserts that the returned trajectory id starts with ``traj-``,
    that the replay summary counts exactly one trajectory, that a
    ``memory_injected`` event was recorded, and that exactly one JSON file
    exists under ``<base_dir>/trajectories``.
    """
    artifacts_dir = tmp_path / "demo-artifacts"
    demo_app = build_demo_app(base_dir=artifacts_dir)

    run = demo_app.run_task(
        "Use my telegram preference for this answer.",
        channel="telegram",
        user_id="oza",
    )
    replay = demo_app.replay_summary()

    assert run["trajectory_id"].startswith("traj-")
    assert replay.trajectories == 1
    assert any(evt["event_type"] == "memory_injected" for evt in run["events"])

    # Count the persisted trajectory files without keeping the list around.
    persisted_count = sum(1 for _ in (artifacts_dir / "trajectories").glob("*.json"))
    assert persisted_count == 1


def test_app_can_run_tool_task_with_demo_backend(tmp_path: Path):
    """Check that a status prompt on the demo app produces a successful tool call.

    The first decision must be ``call_tool``, a ``tool_result`` event must be
    present, and the outcome status must be ``success``.
    """
    demo_app = build_demo_app(base_dir=tmp_path / "demo-artifacts")

    run = demo_app.run_task("Check the current system status.")

    first_decision = run["decisions"][0]
    assert first_decision["decision_type"] == "call_tool"
    assert any(evt["event_type"] == "tool_result" for evt in run["events"])
    outcome = run["outcome"]
    assert outcome["status"] == "success"


def test_build_app_with_skills_loads_real_skill_from_filesystem(tmp_path: Path):
    """Build an app over an on-disk skill directory and load the skill through its backend.

    A minimal ``SKILL.md`` (front matter plus body) is written to
    ``tmp_path/skills/github-auth``. The app is built with ``tmp_path/skills``
    as its only skill search path. The test then checks that a memory task
    still routes to ``inject_memory`` and that the skill backend returns the
    skill's name and content.
    """
    skills_root = tmp_path / "skills"
    skill_dir = skills_root / "github-auth"
    skill_dir.mkdir(parents=True)
    # Pin the encoding so the fixture bytes do not depend on the host locale.
    (skill_dir / "SKILL.md").write_text(
        "---\n"
        "name: github-auth\n"
        "description: Authenticate with GitHub.\n"
        "---\n\n"
        "# GitHub Auth\n\n"
        "Use git or gh.\n",
        encoding="utf-8",
    )

    app = build_app_with_skills(base_dir=tmp_path / "artifacts", skill_search_paths=[skills_root])

    # github-auth is not in the candidate set by default, so router won't trigger it.
    # We test that the app builds and a memory task still works.
    trajectory = app.run_task("Use my telegram preference for this answer.", channel="telegram", user_id="oza")
    assert trajectory["decisions"][0]["decision_type"] == "inject_memory"

    # Now verify the skill backend is actually wired by loading directly
    backend = app.runner.execution_engine.skill_executor.backend
    payload = backend.load_skill("github-auth")
    assert payload["name"] == "github-auth"
    assert "Use git or gh." in payload["content"]


def test_app_artifact_index_queries_persisted_trajectories(tmp_path: Path):
    """Run two tasks, then query the artifact index by channel and by decision type.

    Each query must return exactly one trajectory whose task input matches
    the prompt that was run, and slicing the dataset on the ``local`` channel
    must return exactly one entry.
    """
    memory_prompt = "Use my telegram preference for this answer."
    status_prompt = "Check the current system status."

    demo_app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    demo_app.run_task(memory_prompt, channel="telegram", user_id="u1")
    demo_app.run_task(status_prompt, channel="local", user_id="u2")

    artifact_index = demo_app.artifact_index()
    by_channel = artifact_index.query(channel="telegram")
    by_decision = artifact_index.query(decision_type="call_tool")

    assert len(by_channel) == 1
    assert by_channel[0]["task"]["input"] == memory_prompt
    assert len(by_decision) == 1
    assert by_decision[0]["task"]["input"] == status_prompt

    local_slice = artifact_index.slice_dataset(channel="local")
    assert len(local_slice) == 1


def test_app_run_online_learning_cycle_returns_report(tmp_path: Path):
    """Run an online learning cycle on a seeded demo app and check the result keys.

    The result must contain ``skipped`` and ``report_id``. It must also
    contain ``promoted`` unless the cycle reports itself as skipped.
    """
    from memabra.benchmarks import BenchmarkTask
    from memabra.promotion import PromotionPolicy

    demo_app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    # Seed ten trajectories ("Task 0" .. "Task 9") before running the cycle.
    for task_number in range(10):
        demo_app.run_task(f"Task {task_number}")

    policy = PromotionPolicy(
        min_reward_delta=-1.0,
        max_error_rate_increase=1.0,
        max_latency_increase_ms=10000.0,
        required_task_count=1,
    )
    benchmark_tasks = [BenchmarkTask(user_input="Task 0")]
    cycle_result = demo_app.run_online_learning_cycle(
        policy=policy,
        benchmark_tasks=benchmark_tasks,
        min_new_trajectories=1,
    )

    assert "skipped" in cycle_result
    assert "promoted" in cycle_result or cycle_result["skipped"] is True
    assert "report_id" in cycle_result


def test_app_run_online_learning_cycle_uses_baseline_version(tmp_path: Path):
    """Run a learning cycle against a saved baseline router version.

    A ``SimpleLearningRouter`` with hand-set weights is saved as
    ``v-baseline`` in a version store under ``tmp_path/versions``. The app's
    router is then replaced with a fresh one and the cycle is pointed at the
    stored baseline. The cycle must not be skipped and must report both
    ``baseline_metrics`` and ``challenger_metrics``.
    """
    from memabra.benchmarks import BenchmarkTask
    from memabra.promotion import PromotionPolicy
    from memabra.router import SimpleLearningRouter
    from memabra.router_versioning import RouterVersionStore

    demo_app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    for task_number in range(10):
        demo_app.run_task(f"Task {task_number}")

    # Persist a baseline router whose internal state is set by hand.
    versions_root = tmp_path / "versions"
    stored_router = SimpleLearningRouter()
    stored_router._weights = {"call_tool": {"input_length": 0.99}}
    stored_router._feature_keys = ["input_length"]
    version_store = RouterVersionStore(base_dir=versions_root)
    version_store.save(stored_router, version_id="v-baseline")

    # Swap the app's current router for a fresh instance.
    demo_app.set_router(SimpleLearningRouter())

    policy = PromotionPolicy(
        min_reward_delta=-1.0,
        max_error_rate_increase=1.0,
        max_latency_increase_ms=10000.0,
        required_task_count=1,
    )
    benchmark_tasks = [BenchmarkTask(user_input="Task 0")]
    cycle_result = demo_app.run_online_learning_cycle(
        policy=policy,
        benchmark_tasks=benchmark_tasks,
        min_new_trajectories=1,
        version_store_base_dir=versions_root,
        baseline_version_id="v-baseline",
    )

    assert cycle_result["skipped"] is False
    assert "baseline_metrics" in cycle_result
    assert "challenger_metrics" in cycle_result


def test_app_run_online_learning_cycle_rebuilds_case_index(tmp_path: Path):
    """Run a learning cycle with ``case_index_path`` set and check the index file.

    The cycle must not be skipped, the file at ``case_index_path`` must
    exist afterwards, and the index loaded from it must return a non-``None``
    best entry for ``"Task 0"``.
    """
    from memabra.benchmarks import BenchmarkTask
    from memabra.case_index import CaseIndex
    from memabra.promotion import PromotionPolicy

    demo_app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    for task_number in range(10):
        demo_app.run_task(f"Task {task_number}")

    index_file = tmp_path / "case-index.json"
    policy = PromotionPolicy(
        min_reward_delta=-1.0,
        max_error_rate_increase=1.0,
        max_latency_increase_ms=10000.0,
        required_task_count=1,
    )
    benchmark_tasks = [BenchmarkTask(user_input="Task 0")]
    cycle_result = demo_app.run_online_learning_cycle(
        policy=policy,
        benchmark_tasks=benchmark_tasks,
        min_new_trajectories=1,
        case_index_path=index_file,
    )

    assert cycle_result["skipped"] is False
    assert index_file.exists()

    reloaded_index = CaseIndex.load(index_file)
    assert reloaded_index.best("Task 0") is not None


def test_app_build_case_index_from_trajectories(tmp_path: Path):
    """Build a case index after running the same prompt for two users.

    The index must return a non-``None`` best entry for that prompt.
    """
    prompt = "Hello world"
    demo_app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    for user in ("u1", "u2"):
        demo_app.run_task(prompt, channel="local", user_id=user)

    built_index = demo_app.build_case_index()

    best_entry = built_index.best(prompt)
    assert best_entry is not None


def test_app_save_and_load_case_index(tmp_path: Path):
    """Save a case index from one app and load it into a second app.

    Both apps are built over the same ``base_dir``. After loading, the
    second app's ``case_index`` must be set and must return a non-``None``
    best entry for the prompt the first app ran.
    """
    prompt = "Persist this case"
    artifacts_dir = tmp_path / "demo-artifacts"
    index_file = tmp_path / "case-index.json"

    writer_app = build_demo_app(base_dir=artifacts_dir)
    writer_app.run_task(prompt, channel="local", user_id="u1")
    writer_app.build_case_index()
    writer_app.save_case_index(index_file)

    reader_app = build_demo_app(base_dir=artifacts_dir)
    reader_app.load_case_index(index_file)

    assert reader_app.case_index is not None
    assert reader_app.case_index.best(prompt) is not None


def test_app_best_trajectory_for_input(tmp_path: Path):
    """Check that ``best_trajectory_for`` returns the id of the only matching run.

    One task is run and the case index is built. The lookup for the same
    prompt must equal that run's ``trajectory_id``.
    """
    prompt = "Find the best trajectory"
    demo_app = build_demo_app(base_dir=tmp_path / "demo-artifacts")
    run = demo_app.run_task(prompt, channel="local", user_id="u1")

    demo_app.build_case_index()
    looked_up_id = demo_app.best_trajectory_for(prompt)

    expected_id = run["trajectory_id"]
    assert looked_up_id == expected_id