{"leaderboard":[],"runs":[],"suites":[{"id":"swe-bench-verified","name":"SWE-bench Verified","category":"software","description":"Real GitHub issue resolution tasks with deterministic patch scoring.","sourceUrl":"https://www.swebench.com/","primaryMetric":"resolved","licenseNote":"Store task metadata and source references; import full problem assets during operator setup.","higherIsBetter":true,"defaultTaskLimit":3,"tasks":[{"id":"swe-bench-verified-task-001","suiteId":"swe-bench-verified","title":"SWE-bench Verified seed task 1","prompt":"Run SWE-bench Verified task 1 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"swe-bench-verified:1","requiresOperatorImport":true},{"id":"swe-bench-verified-task-002","suiteId":"swe-bench-verified","title":"SWE-bench Verified seed task 2","prompt":"Run SWE-bench Verified task 2 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"swe-bench-verified:2","requiresOperatorImport":true},{"id":"swe-bench-verified-task-003","suiteId":"swe-bench-verified","title":"SWE-bench Verified seed task 3","prompt":"Run SWE-bench Verified task 3 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"swe-bench-verified:3","requiresOperatorImport":true}]},{"id":"swe-bench-pro","name":"SWE-bench Pro","category":"software","description":"Harder software maintenance tasks for long-horizon coding agents.","sourceUrl":"https://www.swebench.com/","primaryMetric":"resolved","licenseNote":"Store task metadata and source references; import full problem assets during operator setup.","higherIsBetter":true,"defaultTaskLimit":3,"tasks":[{"id":"swe-bench-pro-task-001","suiteId":"swe-bench-pro","title":"SWE-bench Pro seed task 1","prompt":"Run SWE-bench Pro task 1 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"swe-bench-pro:1","requiresOperatorImport":true},{"id":"swe-bench-pro-task-002","suiteId":"swe-bench-pro","title":"SWE-bench Pro seed task 2","prompt":"Run SWE-bench Pro task 2 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"swe-bench-pro:2","requiresOperatorImport":true},{"id":"swe-bench-pro-task-003","suiteId":"swe-bench-pro","title":"SWE-bench Pro seed task 3","prompt":"Run SWE-bench Pro task 3 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"swe-bench-pro:3","requiresOperatorImport":true}]},{"id":"terminal-bench-2.1","name":"Terminal-Bench 2.1","category":"terminal","description":"Terminal-native engineering and systems tasks executed in a shell.","sourceUrl":"https://www.tbench.ai/","primaryMetric":"pass_rate","licenseNote":"Use suite IDs and task references by default; import benchmark assets during setup.","higherIsBetter":true,"defaultTaskLimit":3,"tasks":[{"id":"terminal-bench-2.1-task-001","suiteId":"terminal-bench-2.1","title":"Terminal-Bench 2.1 seed task 1","prompt":"Run Terminal-Bench 2.1 task 1 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"terminal-bench-2.1:1","requiresOperatorImport":true},{"id":"terminal-bench-2.1-task-002","suiteId":"terminal-bench-2.1","title":"Terminal-Bench 2.1 seed task 2","prompt":"Run Terminal-Bench 2.1 task 2 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"terminal-bench-2.1:2","requiresOperatorImport":true},{"id":"terminal-bench-2.1-task-003","suiteId":"terminal-bench-2.1","title":"Terminal-Bench 2.1 seed task 3","prompt":"Run Terminal-Bench 2.1 task 3 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"terminal-bench-2.1:3","requiresOperatorImport":true}]},{"id":"osworld-verified","name":"OSWorld Verified","category":"os","description":"Computer-use tasks across desktop and web applications.","sourceUrl":"https://os-world.github.io/","primaryMetric":"success_rate","licenseNote":"Store metadata and require operator-provided VM/app assets for full execution.","higherIsBetter":true,"defaultTaskLimit":3,"tasks":[{"id":"osworld-verified-task-001","suiteId":"osworld-verified","title":"OSWorld Verified seed task 1","prompt":"Run OSWorld Verified task 1 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"osworld-verified:1","requiresOperatorImport":true},{"id":"osworld-verified-task-002","suiteId":"osworld-verified","title":"OSWorld Verified seed task 2","prompt":"Run OSWorld Verified task 2 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"osworld-verified:2","requiresOperatorImport":true},{"id":"osworld-verified-task-003","suiteId":"osworld-verified","title":"OSWorld Verified seed task 3","prompt":"Run OSWorld Verified task 3 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"osworld-verified:3","requiresOperatorImport":true}]},{"id":"browsecomp","name":"BrowseComp","category":"browser","description":"Hard browsing tasks that require finding difficult web information.","sourceUrl":"https://openai.com/index/browsecomp/","primaryMetric":"accuracy","licenseNote":"Store task references only unless the corpus is explicitly imported by the operator.","higherIsBetter":true,"defaultTaskLimit":3,"tasks":[{"id":"browsecomp-task-001","suiteId":"browsecomp","title":"BrowseComp seed task 1","prompt":"Run BrowseComp task 1 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"browsecomp:1","requiresOperatorImport":true},{"id":"browsecomp-task-002","suiteId":"browsecomp","title":"BrowseComp seed task 2","prompt":"Run BrowseComp task 2 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"browsecomp:2","requiresOperatorImport":true},{"id":"browsecomp-task-003","suiteId":"browsecomp","title":"BrowseComp seed task 3","prompt":"Run BrowseComp task 3 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"browsecomp:3","requiresOperatorImport":true}]},{"id":"mcp-atlas","name":"MCP Atlas","category":"tool-use","description":"Tool-use tasks across MCP-style connected service environments.","sourceUrl":"https://www.vals.ai/benchmarks","primaryMetric":"accuracy","licenseNote":"Store task metadata and adapter hooks; import suite assets during operator setup.","higherIsBetter":true,"defaultTaskLimit":3,"tasks":[{"id":"mcp-atlas-task-001","suiteId":"mcp-atlas","title":"MCP Atlas seed task 1","prompt":"Run MCP Atlas task 1 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"mcp-atlas:1","requiresOperatorImport":true},{"id":"mcp-atlas-task-002","suiteId":"mcp-atlas","title":"MCP Atlas seed task 2","prompt":"Run MCP Atlas task 2 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"mcp-atlas:2","requiresOperatorImport":true},{"id":"mcp-atlas-task-003","suiteId":"mcp-atlas","title":"MCP Atlas seed task 3","prompt":"Run MCP Atlas task 3 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"mcp-atlas:3","requiresOperatorImport":true}]},{"id":"tau2-telecom","name":"Tau2-bench Telecom","category":"tool-use","description":"Conversational API and policy-adherence tasks in a telecom domain.","sourceUrl":"https://github.com/sierra-research/tau2-bench","primaryMetric":"success_rate","licenseNote":"Store task references and replay configuration; import suite assets during setup.","higherIsBetter":true,"defaultTaskLimit":3,"tasks":[{"id":"tau2-telecom-task-001","suiteId":"tau2-telecom","title":"Tau2-bench Telecom seed task 1","prompt":"Run Tau2-bench Telecom task 1 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"tau2-telecom:1","requiresOperatorImport":true},{"id":"tau2-telecom-task-002","suiteId":"tau2-telecom","title":"Tau2-bench Telecom seed task 2","prompt":"Run Tau2-bench Telecom task 2 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"tau2-telecom:2","requiresOperatorImport":true},{"id":"tau2-telecom-task-003","suiteId":"tau2-telecom","title":"Tau2-bench Telecom seed task 3","prompt":"Run Tau2-bench Telecom task 3 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"tau2-telecom:3","requiresOperatorImport":true}]},{"id":"finance-agent-v2","name":"Finance Agent v2","category":"finance","description":"Finance-oriented agent workflows requiring tool use and evidence handling.","sourceUrl":"https://www.vals.ai/benchmarks","primaryMetric":"accuracy","licenseNote":"Store task metadata and require operator-provided benchmark definitions.","higherIsBetter":true,"defaultTaskLimit":3,"tasks":[{"id":"finance-agent-v2-task-001","suiteId":"finance-agent-v2","title":"Finance Agent v2 seed task 1","prompt":"Run Finance Agent v2 task 1 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"finance-agent-v2:1","requiresOperatorImport":true},{"id":"finance-agent-v2-task-002","suiteId":"finance-agent-v2","title":"Finance Agent v2 seed task 2","prompt":"Run Finance Agent v2 task 2 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"finance-agent-v2:2","requiresOperatorImport":true},{"id":"finance-agent-v2-task-003","suiteId":"finance-agent-v2","title":"Finance Agent v2 seed task 3","prompt":"Run Finance Agent v2 task 3 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"finance-agent-v2:3","requiresOperatorImport":true}]},{"id":"vibe-code-bench-1.1","name":"Vibe Code Bench 1.1","category":"software","description":"Product-building coding tasks scored by project-level outcomes.","sourceUrl":"https://www.vals.ai/benchmarks/vibe-code","primaryMetric":"score","licenseNote":"Store task references and import benchmark assets during operator setup.","higherIsBetter":true,"defaultTaskLimit":3,"tasks":[{"id":"vibe-code-bench-1.1-task-001","suiteId":"vibe-code-bench-1.1","title":"Vibe Code Bench 1.1 seed task 1","prompt":"Run Vibe Code Bench 1.1 task 1 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"vibe-code-bench-1.1:1","requiresOperatorImport":true},{"id":"vibe-code-bench-1.1-task-002","suiteId":"vibe-code-bench-1.1","title":"Vibe Code Bench 1.1 seed task 2","prompt":"Run Vibe Code Bench 1.1 task 2 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"vibe-code-bench-1.1:2","requiresOperatorImport":true},{"id":"vibe-code-bench-1.1-task-003","suiteId":"vibe-code-bench-1.1","title":"Vibe Code Bench 1.1 seed task 3","prompt":"Run Vibe Code Bench 1.1 task 3 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"vibe-code-bench-1.1:3","requiresOperatorImport":true}]},{"id":"skillsbench","name":"SkillsBench","category":"skills","description":"Skill acquisition and instruction-following tasks for agentic harnesses.","sourceUrl":"https://www.vals.ai/benchmarks","primaryMetric":"score","licenseNote":"Store task metadata and require operator-provided benchmark definitions.","higherIsBetter":true,"defaultTaskLimit":3,"tasks":[{"id":"skillsbench-task-001","suiteId":"skillsbench","title":"SkillsBench seed task 1","prompt":"Run SkillsBench task 1 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"skillsbench:1","requiresOperatorImport":true},{"id":"skillsbench-task-002","suiteId":"skillsbench","title":"SkillsBench seed task 2","prompt":"Run SkillsBench task 2 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"skillsbench:2","requiresOperatorImport":true},{"id":"skillsbench-task-003","suiteId":"skillsbench","title":"SkillsBench seed task 3","prompt":"Run SkillsBench task 3 using the suite-compatible runner.\nProduce a machine-readable result.json with score, pass/fail, token usage, artifacts, and raw harness output.\nIf the full benchmark corpus is unavailable, return a failed infrastructure result explaining the missing import instead of fabricating a score.","expectedArtifacts":["result.json","harness.log"],"sourceRef":"skillsbench:3","requiresOperatorImport":true}]}],"harnesses":[{"id":"claude-code","label":"Claude Code","providerPolicy":"Anthropic models only","sdkPackage":"@anthropic-ai/claude-agent-sdk","summary":"Claude Agent SDK query runner with bash/read/edit tools enabled."},{"id":"codex","label":"Codex","providerPolicy":"OpenAI models, Responses-compatible endpoints, or Chat-compatible endpoints","sdkPackage":"@openai/codex-sdk","summary":"Codex SDK thread runner with explicit OpenAI, Responses-compatible, or Chat-compatible provider modes."},{"id":"opencode","label":"OpenCode","providerPolicy":"OpenCode provider/model identifiers","sdkPackage":"@opencode-ai/sdk","summary":"OpenCode server/client runner with harness-native model strings."},{"id":"eve","label":"Eve","providerPolicy":"Eve project model identifiers","sdkPackage":null,"summary":"Eve filesystem-first project template running inside Vercel Sandbox."},{"id":"mastra","label":"Mastra","providerPolicy":"Mastra model identifiers","sdkPackage":"@mastra/core","summary":"Mastra Agent/AgentController-style runner with normalized result output."}]}