Skip to content

Vivaria architecture

How Vivaria runs agents on tasks

  1. A user defines a METR Task Standard task family
  2. The user picks out a task from the task family, e.g. count_odds/main
  3. The user makes an agent with a main.py file that calls hooks.getInstructions(), hooks.submit(answer), etc.
  4. The user runs viv run (see here for more details)
  5. The Vivaria server builds a Docker image based on the task family's and agent's code
  6. The server creates a Docker container from the image, again based on the task family's code
  7. The server runs a command in the container that starts the agent
  8. The agent logs trace entries, gets completions, and eventually submits an answer, all from/to the server via pyhooks
  9. Vivaria runs TaskFamily#score inside the Docker container, passing it the agent's submission

C4 diagrams

See here for details.

System context

C4Context
    System_Ext(llmapi, "LLM API providers")
    System_Boundary(b1, "METR") {
        Person(poke, "Researcher (poke)")
        System(vivaria, "Vivaria")
        Person(agentauthor, "Agent Author")
        Person(taskauthor, "Task Author")
    }

    Rel(vivaria, llmapi, "Calls out to")
    Rel(poke, vivaria, "Runs tasks")
    Rel(agentauthor, vivaria, "Writes agents")
    Rel(taskauthor, vivaria, "Writes tasks")

    UpdateRelStyle(poke, vivaria, $offsetX="-30", $offsetY="-20")
    UpdateRelStyle(agentauthor, vivaria, $offsetX="-30", $offsetY="-30")
    UpdateRelStyle(vivaria, llmapi, $offsetX="-30", $offsetY="-40")

    UpdateLayoutConfig($c4ShapeInRow="3", $c4BoundaryInRow="1")

Container

C4Container
    Person(agentauthor, "Agent Author")
    Person(taskauthor, "Task Author")
    System_Ext(llmapi, "LLM API Providers")
    Container_Ext(auth0, "Auth0")
    ContainerDb_Ext(github, "GitHub")
    ContainerDb_Ext(airtable, "Airtable", "", "For misc analysies & search")

    Boundary(b1, "METR", "") {
        Boundary(b2, "Server-Side", "") {
            Container(middleman, "Middleman", "Python", "In separate repo")
            Container(api, "API Server", "TypeScript", "Orchestrates everything")
            ContainerDb(db, "DB", "Postgres; scripts/schema.sql", "Stores runs, trace entries, etc.")
            Container(agents, "Agents", "Python", "Run tasks, records output <br>via pyhooks compatibility library")
        }
        Boundary(b3, "Users & Their Machines", "") {
            Container(ui, "Web UI", "TypeScript, React, Vite")
            Container(cli, "viv CLI", "Python")
            Person(poke, "User (poke)")
        }

    }
    Rel(middleman, auth0, "Checks auth", "HTTPS")
    UpdateRelStyle(middleman, auth0, $offsetX="+30", $offsetY="+80")
    Rel(ui, auth0, "Mints auth tokens", "HTTPS")
    UpdateRelStyle(ui, auth0, $offsetX="-60", $offsetY="+360")
    Rel(api, auth0, "Checks auth", "HTTPS")
    UpdateRelStyle(api, auth0, $offsetX="+45", $offsetY="+80")
    Rel(cli, github, "Commits & pushes<br> agents/tasks to", "HTTPS")
    UpdateRelStyle(cli, github, $offsetX="-80", $offsetY="+260")
    Rel(middleman, llmapi, "Calls out to", "HTTPS")
    UpdateRelStyle(middleman, llmapi, $offsetX="-205", $offsetY="+205")
    Rel(api, middleman, "Forwards <br>model calls", "HTTPS")
    UpdateRelStyle(api, middleman, $offsetX="-30", $offsetY="-30")
    Rel(cli, api, "Starts runs", "tRPC/HTTPS")
    UpdateRelStyle(cli, api, $offsetX="+10", $offsetY="+100")
    Rel(api, github, "Fetches agents<br> and tasks", "HTTPS")
    UpdateRelStyle(api, github, $offsetX="+0", $offsetY="-70")
    Rel(api, agents, "Starts and runs tasks on", "docker commands")
    UpdateRelStyle(api, agents, $offsetX="-50", $offsetY="+60")
    Rel(agents, api, "Calls models and<br>saves trace events", "pyhooks tRPC/HTTP")
    UpdateRelStyle(agents, api, $offsetX="-160", $offsetY="-10")
    Rel(api, db, "Reads/writes <br>traces, runs, etc.", "SQL/TCP")
    UpdateRelStyle(api, db, $offsetX="-40", $offsetY="-40")
    Rel(ui, api, "Gets traces", "tRPC/HTTPS")
    UpdateRelStyle(ui, api, $offsetX="-150", $offsetY="+70")
    Rel(poke, ui, "Views traces")
    UpdateRelStyle(poke, ui, $offsetX="-0", $offsetY="-0")
    Rel(poke, cli, "Runs tasks")
    UpdateRelStyle(poke, cli, $offsetX="-0", $offsetY="-0")
    Rel(taskauthor, github, "Writes tasks")
    UpdateRelStyle(taskauthor, github, $offsetX="-0", $offsetY="-0")
    Rel(agentauthor, github, "Writes agents")
    UpdateRelStyle(agentauthor, github, $offsetX="-0", $offsetY="-0")
    Rel(api, airtable, "Writes run info", "HTTPS")
    UpdateRelStyle(api, airtable, $offsetX="-0", $offsetY="-0")

    UpdateLayoutConfig($c4ShapeInRow="3", $c4BoundaryInRow="1")