# Scenario Definition Schema

Defines the YAML format for agent capability test scenarios.

## Schema

```yaml
# ============================================================================
# METADATA
# ============================================================================
id: string                        # Unique identifier (kebab-case)
title: string                     # Human-readable name
difficulty: easy | medium | hard
tags: [string]                    # For filtering (e.g., ["python", "refactor", "testing"])

# ============================================================================
# FIXTURE
# ============================================================================
fixture:
  source: string                  # Path to fixture directory (relative to fixtures/)
  # OR
  git: string                     # Git repo URL to clone
  ref: string                     # Branch/tag/commit (optional, default: main)
  setup:                          # Commands to run after fixture setup (optional)
    - string

# ============================================================================
# TASK
# ============================================================================
task:
  description: |                  # The prompt given to the agent
    Multi-line description of what to accomplish.
    This is what the agent sees.
  context:                        # Additional context to provide (optional)
    - path: string                # File path relative to fixture
      hint: string                # Why this file is relevant
  entry_point: string             # Where agent should start (optional, e.g., "src/main.py")

# ============================================================================
# EXECUTION
# ============================================================================
execution:
  mode: scripted | live | both    # Which execution modes to support
  timeout: duration               # e.g., "5m", "30s"

  # For scripted mode: deterministic actions that simulate agent work
  scripted:
    actions:
      - type: shell
        run: string               # Shell command
      - type: write
        path: string
        content: string
      - type: edit
        path: string
        old: string
        new: string
      - type: worker              # Worker CLI command
        command: string           # e.g., "start", "done"
        args: {key: value}

  # For live mode: real agent execution
  live:
    model: string                 # Model to use (optional, default from config)
    system_prompt: string         # Override system prompt (optional)
    tools: [string]               # Tool restrictions (optional)
    max_turns: int                # Max agent turns (optional)

# ============================================================================
# VERIFICATION
# ============================================================================
verify:
  # Property-based checks (fast, deterministic)
  properties:
    - type: file_exists
      path: string
    - type: file_not_exists
      path: string
    - type: file_contains
      path: string
      pattern: string             # Regex or literal
    - type: function_defined
      path: string
      name: string
      language: python | nim | typescript | ...
    - type: tests_pass
      command: string             # e.g., "pytest", "npm test"
    - type: compiles
      command: string             # e.g., "nim c src/main.nim"
    - type: lint_clean
      command: string             # e.g., "ruff check ."
    - type: git_state
      branch_merged: string       # Branch name that should be merged
      worktree_removed: string    # Worktree path that should not exist
      db_state:                   # Worker DB assertions
        task_id: string
        state: string
    - type: custom
      command: string             # Exit 0 = pass, non-zero = fail

  # LLM-as-judge evaluation
  llm_judge:
    enabled: bool                 # Default: true
    model: string                 # Default: haiku (cheap) or sonnet (quality)
    rubric:
      - criterion: string         # What to evaluate
        weight: float             # 0.0-1.0, default 1.0
    threshold: float              # Min score to pass (0.0-1.0)

  # Golden file comparison
  golden:
    - path: string                # File to compare
      golden: string              # Path to golden file (relative to scenario dir)
      mode: exact | normalized | semantic

  # Human review
  human:
    required: bool                # If true, scenario never auto-passes
    queue: string                 # Queue name for review (optional)
    rubric: [string]              # Checklist for human reviewer

# ============================================================================
# BENCHMARKING (optional)
# ============================================================================
benchmark:
  enabled: bool                   # Include in benchmark runs
  runs: int                       # Number of runs per config (for variance)
  dimensions:                     # What to vary
    models: [string]
    prompts: [string]             # Named prompt variants
    # Add more as needed
  metrics:                        # What to capture (beyond defaults)
    - name: string
      command: string             # Custom metric extraction
```

## Examples

See `tests/scenarios/` for example scenarios at each difficulty level.

## Property Types Reference

| Type | Description | Required Fields |
|------|-------------|-----------------|
| `file_exists` | File exists | `path` |
| `file_not_exists` | File does not exist | `path` |
| `file_contains` | File contains pattern | `path`, `pattern` |
| `function_defined` | Function/method exists | `path`, `name`, `language` |
| `tests_pass` | Test command succeeds | `command` |
| `compiles` | Compile command succeeds | `command` |
| `lint_clean` | Linter passes | `command` |
| `git_state` | Git/worker state assertions | various |
| `custom` | Custom check script | `command` |

## Duration Format

Durations use Go-style format: `30s`, `5m`, `1h`, `1h30m`

## Difficulty Guidelines

- **easy**: Clear spec, single file, <5min for competent agent
- **medium**: Requires reading existing code, multiple files, some ambiguity
- **hard**: Debugging, architectural decisions, edge cases, significant ambiguity
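For orientation, here is a sketch of what an **easy** scenario might look like when the schema above is filled in. The id, fixture name, paths, and commands are hypothetical illustrations for this document, not files that exist under `tests/scenarios/` or `fixtures/`:

```yaml
# Hypothetical easy scenario -- all names below are illustrative.
id: add-greet-function
title: Add a greeting helper
difficulty: easy
tags: ["python", "small-change"]

fixture:
  source: python-hello            # assumes a fixture at fixtures/python-hello

task:
  description: |
    Add a function greet(name) to src/hello.py that returns "Hello, {name}!".
  entry_point: "src/hello.py"

execution:
  mode: both
  timeout: 5m
  scripted:
    actions:
      - type: write               # scripted stand-in for the agent's edit
        path: src/hello.py
        content: |
          def greet(name):
              return f"Hello, {name}!"

verify:
  properties:
    - type: function_defined
      path: src/hello.py
      name: greet
      language: python
    - type: file_contains
      path: src/hello.py
      pattern: "Hello, "
  llm_judge:
    enabled: false                # property checks are enough for this one
```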
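The evaluation-heavy sections can be sketched the same way. The fragment below (not a complete scenario) shows one plausible way to fill in `verify.llm_judge` and `benchmark` for a harder scenario; the check script path, model names, weights, and metric command are assumptions, not harness defaults:

```yaml
# Fragment only: verify and benchmark blocks for a hypothetical hard scenario.
verify:
  properties:
    - type: tests_pass
      command: pytest -q
    - type: custom
      command: ./checks/no_debug_prints.sh    # hypothetical script; exit 0 = pass
  llm_judge:
    enabled: true
    rubric:
      - criterion: "Fix addresses the root cause, not just the symptom"
        weight: 0.6
      - criterion: "Changes are minimal and well-scoped"
        weight: 0.4
    threshold: 0.7

benchmark:
  enabled: true
  runs: 3                                     # repeat each config to measure variance
  dimensions:
    models: ["model-a", "model-b"]            # placeholder model names
    prompts: ["default", "terse"]
  metrics:
    - name: files_touched
      command: git diff --name-only | wc -l   # counts files changed in the work tree
```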