# Scenario Definition Schema

Defines the YAML format for agent capability test scenarios.

## Schema

```yaml
# ============================================================================
# METADATA
# ============================================================================
id: string                  # Unique identifier (kebab-case)
title: string               # Human-readable name
difficulty: easy | medium | hard
tags: [string]              # For filtering (e.g., ["python", "refactor", "testing"])

# ============================================================================
# FIXTURE
# ============================================================================
fixture:
  source: string            # Path to fixture directory (relative to fixtures/)
  # OR
  git: string               # Git repo URL to clone
  ref: string               # Branch/tag/commit (optional, default: main)

setup:                      # Commands to run after fixture setup (optional)
  - string

# ============================================================================
# TASK
# ============================================================================
task:
  description: |            # The prompt given to the agent
    Multi-line description of what to accomplish.
    This is what the agent sees.

  context:                  # Additional context to provide (optional)
    - path: string          # File path relative to fixture
      hint: string          # Why this file is relevant

  entry_point: string       # Where agent should start (optional, e.g., "src/main.py")

# ============================================================================
# EXECUTION
# ============================================================================
execution:
  mode: scripted | live | both   # Which execution modes to support
  timeout: duration              # e.g., "5m", "30s"

  # For scripted mode: deterministic actions that simulate agent work
  scripted:
    actions:
      - type: shell
        run: string              # Shell command
      - type: write
        path: string
        content: string
      - type: edit
        path: string
        old: string
        new: string
      - type: worker             # Worker CLI command
        command: string          # e.g., "start", "done"
        args: {key: value}

  # For live mode: real agent execution
  live:
    model: string                # Model to use (optional, default from config)
    system_prompt: string        # Override system prompt (optional)
    tools: [string]              # Tool restrictions (optional)
    max_turns: int               # Max agent turns (optional)

# ============================================================================
# VERIFICATION
# ============================================================================
verify:
  # Property-based checks (fast, deterministic)
  properties:
    - type: file_exists
      path: string

    - type: file_not_exists
      path: string

    - type: file_contains
      path: string
      pattern: string            # Regex or literal

    - type: function_defined
      path: string
      name: string
      language: python | nim | typescript | ...

    - type: tests_pass
      command: string            # e.g., "pytest", "npm test"

    - type: compiles
      command: string            # e.g., "nim c src/main.nim"

    - type: lint_clean
      command: string            # e.g., "ruff check ."

    - type: git_state
      branch_merged: string      # Branch name that should be merged
      worktree_removed: string   # Worktree path that should not exist
      db_state:                  # Worker DB assertions
        task_id: string
        state: string

    - type: custom
      command: string            # Exit 0 = pass, non-zero = fail

  # LLM-as-judge evaluation
  llm_judge:
    enabled: bool                # Default: true
    model: string                # Default: haiku (cheap) or sonnet (quality)
    rubric:
      - criterion: string        # What to evaluate
        weight: float            # 0.0-1.0, default 1.0
    threshold: float             # Min score to pass (0.0-1.0)

  # Golden file comparison
  golden:
    - path: string               # File to compare
      golden: string             # Path to golden file (relative to scenario dir)
      mode: exact | normalized | semantic

  # Human review
  human:
    required: bool               # If true, scenario never auto-passes
    queue: string                # Queue name for review (optional)
    rubric: [string]             # Checklist for human reviewer

# ============================================================================
# BENCHMARKING (optional)
# ============================================================================
benchmark:
  enabled: bool                  # Include in benchmark runs
  runs: int                      # Number of runs per config (for variance)
  dimensions:                    # What to vary
    models: [string]
    prompts: [string]            # Named prompt variants
    # Add more as needed
  metrics:                       # What to capture (beyond defaults)
    - name: string
      command: string            # Custom metric extraction
```

## Examples

See `tests/scenarios/` for example scenarios at each difficulty level.
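
For quick orientation, the sketch below shows what an easy scenario might look like when written against the schema above. The id, fixture name, file paths, and commands are illustrative placeholders, not scenarios that actually exist in `tests/scenarios/`.

```yaml
# Hypothetical easy scenario -- all names and paths are illustrative.
id: add-greeting-function
title: Add a greeting function
difficulty: easy
tags: [python, testing]

fixture:
  source: python-hello            # assumed fixture directory under fixtures/

task:
  description: |
    Add a greet(name) function to src/hello.py that returns "Hello, <name>!"
    and make the existing tests pass.
  entry_point: src/hello.py

execution:
  mode: both
  timeout: 5m
  scripted:
    actions:
      - type: write
        path: src/hello.py
        content: |
          def greet(name):
              return f"Hello, {name}!"

verify:
  properties:
    - type: function_defined
      path: src/hello.py
      name: greet
      language: python
    - type: tests_pass
      command: pytest
  llm_judge:
    enabled: false
```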

## Property Types Reference

| Type | Description | Required Fields |
|------|-------------|-----------------|
| `file_exists` | File exists | `path` |
| `file_not_exists` | File does not exist | `path` |
| `file_contains` | File contains pattern | `path`, `pattern` |
| `function_defined` | Function/method exists | `path`, `name`, `language` |
| `tests_pass` | Test command succeeds | `command` |
| `compiles` | Compile command succeeds | `command` |
| `lint_clean` | Linter passes | `command` |
| `git_state` | Git/worker state assertions | various |
| `custom` | Custom check script | `command` |
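
The `git_state` row lists its fields as "various" because it bundles several assertions. The sketch below shows how it and `custom` might be combined in a `verify` block; the branch name, worktree path, task id, state value, and check script are purely hypothetical.

```yaml
verify:
  properties:
    - type: git_state
      branch_merged: task/fix-login            # hypothetical branch name
      worktree_removed: .worktrees/fix-login   # hypothetical worktree path
      db_state:
        task_id: fix-login
        state: done                            # assumed worker state name
    - type: custom
      command: ./checks/verify_endpoints.sh    # hypothetical script; exit 0 = pass
```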

## Duration Format

Durations use Go-style format: `30s`, `5m`, `1h`, `1h30m`
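
For instance, a scenario timeout could be written with any of these (arbitrary) values:

```yaml
execution:
  timeout: 30s      # 30 seconds
  # timeout: 5m     # 5 minutes
  # timeout: 1h30m  # units can be combined
```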

## Difficulty Guidelines

- **easy**: Clear spec, single file, <5min for competent agent
- **medium**: Requires reading existing code, multiple files, some ambiguity
- **hard**: Debugging, architectural decisions, edge cases, significant ambiguity