Vivaria architecture
How Vivaria runs agents on tasks
- A user defines a METR Task Standard task family
- The user picks out a task from the task family, e.g. `count_odds/main`
- The user makes an agent with a `main.py` file that calls `hooks.getInstructions()`, `hooks.submit(answer)`, etc.
- The user runs `viv run` (see here for more details)
- The Vivaria server builds a Docker image based on the task family's and agent's code
- The server creates a Docker container from the image, again based on the task family's code
- The server runs a command in the container that starts the agent
- The agent logs trace entries, gets completions, and eventually submits an answer, all from/to the server via pyhooks
- Vivaria runs `TaskFamily#score` inside the Docker container, passing it the agent's submission
C4 diagrams
See here for details.
System context
C4Context
System_Ext(llmapi, "LLM API providers")
System_Boundary(b1, "METR") {
Person(poke, "Researcher (poke)")
System(vivaria, "Vivaria")
Person(agentauthor, "Agent Author")
Person(taskauthor, "Task Author")
}
Rel(vivaria, llmapi, "Calls out to")
Rel(poke, vivaria, "Runs tasks")
Rel(agentauthor, vivaria, "Writes agents")
Rel(taskauthor, vivaria, "Writes tasks")
UpdateRelStyle(poke, vivaria, $offsetX="-30", $offsetY="-20")
UpdateRelStyle(agentauthor, vivaria, $offsetX="-30", $offsetY="-30")
UpdateRelStyle(vivaria, llmapi, $offsetX="-30", $offsetY="-40")
UpdateLayoutConfig($c4ShapeInRow="3", $c4BoundaryInRow="1")
Container
C4Container
Person(agentauthor, "Agent Author")
Person(taskauthor, "Task Author")
System_Ext(llmapi, "LLM API Providers")
Container_Ext(auth0, "Auth0")
ContainerDb_Ext(github, "GitHub")
Boundary(b1, "METR", "") {
Boundary(b2, "Server-Side", "") {
Container(middleman, "Middleman", "Python", "In separate repo")
Container(api, "API Server", "TypeScript", "Orchestrates everything")
ContainerDb(db, "DB", "Postgres; scripts/schema.sql", "Stores runs, trace entries, etc.")
Container(agents, "Agents", "Python", "Run tasks, record output <br>via pyhooks compatibility library")
}
Boundary(b3, "Users & Their Machines", "") {
Container(ui, "Web UI", "TypeScript, React, Vite")
Container(cli, "viv CLI", "Python")
Person(poke, "User (poke)")
}
}
Rel(middleman, auth0, "Checks auth", "HTTPS")
UpdateRelStyle(middleman, auth0, $offsetX="+30", $offsetY="+80")
Rel(ui, auth0, "Mints auth tokens", "HTTPS")
UpdateRelStyle(ui, auth0, $offsetX="-60", $offsetY="+360")
Rel(api, auth0, "Checks auth", "HTTPS")
UpdateRelStyle(api, auth0, $offsetX="+45", $offsetY="+80")
Rel(cli, github, "Commits & pushes<br> agents/tasks to", "HTTPS")
UpdateRelStyle(cli, github, $offsetX="-80", $offsetY="+260")
Rel(middleman, llmapi, "Calls out to", "HTTPS")
UpdateRelStyle(middleman, llmapi, $offsetX="-205", $offsetY="+205")
Rel(api, middleman, "Forwards <br>model calls", "HTTPS")
UpdateRelStyle(api, middleman, $offsetX="-30", $offsetY="-30")
Rel(cli, api, "Starts runs", "tRPC/HTTPS")
UpdateRelStyle(cli, api, $offsetX="+10", $offsetY="+100")
Rel(api, github, "Fetches agents<br> and tasks", "HTTPS")
UpdateRelStyle(api, github, $offsetX="+0", $offsetY="-70")
Rel(api, agents, "Starts and runs tasks on", "docker commands")
UpdateRelStyle(api, agents, $offsetX="-50", $offsetY="+60")
Rel(agents, api, "Calls models and<br>saves trace events", "pyhooks tRPC/HTTP")
UpdateRelStyle(agents, api, $offsetX="-160", $offsetY="-10")
Rel(api, db, "Reads/writes <br>traces, runs, etc.", "SQL/TCP")
UpdateRelStyle(api, db, $offsetX="-40", $offsetY="-40")
Rel(ui, api, "Gets traces", "tRPC/HTTPS")
UpdateRelStyle(ui, api, $offsetX="-150", $offsetY="+70")
Rel(poke, ui, "Views traces")
UpdateRelStyle(poke, ui, $offsetX="-0", $offsetY="-0")
Rel(poke, cli, "Runs tasks")
UpdateRelStyle(poke, cli, $offsetX="-0", $offsetY="-0")
Rel(taskauthor, github, "Writes tasks")
UpdateRelStyle(taskauthor, github, $offsetX="-0", $offsetY="-0")
Rel(agentauthor, github, "Writes agents")
UpdateRelStyle(agentauthor, github, $offsetX="-0", $offsetY="-0")
UpdateLayoutConfig($c4ShapeInRow="3", $c4BoundaryInRow="1")