docs: add scenario schema and example test fixtures
Scaffold for agent capability benchmark harness (skills-qng9): - docs/specs/scenario-schema.md: YAML schema for test scenarios - tests/scenarios/: Easy, medium, hard example scenarios - tests/fixtures/: Python fixtures for testing Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
7c7733bc64
commit
4da1890fc3
174
docs/specs/scenario-schema.md
Normal file
174
docs/specs/scenario-schema.md
Normal file
|
|
@ -0,0 +1,174 @@
|
|||
# Scenario Definition Schema
|
||||
|
||||
Defines the YAML format for agent capability test scenarios.
|
||||
|
||||
## Schema
|
||||
|
||||
```yaml
|
||||
# ============================================================================
|
||||
# METADATA
|
||||
# ============================================================================
|
||||
id: string # Unique identifier (kebab-case)
|
||||
title: string # Human-readable name
|
||||
difficulty: easy | medium | hard
|
||||
tags: [string] # For filtering (e.g., ["python", "refactor", "testing"])
|
||||
|
||||
# ============================================================================
|
||||
# FIXTURE
|
||||
# ============================================================================
|
||||
fixture:
|
||||
source: string # Path to fixture directory (relative to fixtures/)
|
||||
# OR
|
||||
git: string # Git repo URL to clone
|
||||
ref: string # Branch/tag/commit (optional, default: main)
|
||||
|
||||
setup: # Commands to run after fixture setup (optional)
|
||||
- string
|
||||
|
||||
# ============================================================================
|
||||
# TASK
|
||||
# ============================================================================
|
||||
task:
|
||||
description: | # The prompt given to the agent
|
||||
Multi-line description of what to accomplish.
|
||||
This is what the agent sees.
|
||||
|
||||
context: # Additional context to provide (optional)
|
||||
- path: string # File path relative to fixture
|
||||
hint: string # Why this file is relevant
|
||||
|
||||
entry_point: string # Where agent should start (optional, e.g., "src/main.py")
|
||||
|
||||
# ============================================================================
|
||||
# EXECUTION
|
||||
# ============================================================================
|
||||
execution:
|
||||
mode: scripted | live | both # Which execution modes to support
|
||||
timeout: duration # e.g., "5m", "30s"
|
||||
|
||||
# For scripted mode: deterministic actions that simulate agent work
|
||||
scripted:
|
||||
actions:
|
||||
- type: shell
|
||||
run: string # Shell command
|
||||
- type: write
|
||||
path: string
|
||||
content: string
|
||||
- type: edit
|
||||
path: string
|
||||
old: string
|
||||
new: string
|
||||
- type: worker # Worker CLI command
|
||||
command: string # e.g., "start", "done"
|
||||
args: {key: value}
|
||||
|
||||
# For live mode: real agent execution
|
||||
live:
|
||||
model: string # Model to use (optional, default from config)
|
||||
system_prompt: string # Override system prompt (optional)
|
||||
tools: [string] # Tool restrictions (optional)
|
||||
max_turns: int # Max agent turns (optional)
|
||||
|
||||
# ============================================================================
|
||||
# VERIFICATION
|
||||
# ============================================================================
|
||||
verify:
|
||||
# Property-based checks (fast, deterministic)
|
||||
properties:
|
||||
- type: file_exists
|
||||
path: string
|
||||
|
||||
- type: file_not_exists
|
||||
path: string
|
||||
|
||||
- type: file_contains
|
||||
path: string
|
||||
pattern: string # Regex or literal
|
||||
|
||||
- type: function_defined
|
||||
path: string
|
||||
name: string
|
||||
language: python | nim | typescript | ...
|
||||
|
||||
- type: tests_pass
|
||||
command: string # e.g., "pytest", "npm test"
|
||||
|
||||
- type: compiles
|
||||
command: string # e.g., "nim c src/main.nim"
|
||||
|
||||
- type: lint_clean
|
||||
command: string # e.g., "ruff check ."
|
||||
|
||||
- type: git_state
|
||||
branch_merged: string # Branch name that should be merged
|
||||
worktree_removed: string # Worktree path that should not exist
|
||||
db_state: # Worker DB assertions
|
||||
task_id: string
|
||||
state: string
|
||||
|
||||
- type: custom
|
||||
command: string # Exit 0 = pass, non-zero = fail
|
||||
|
||||
# LLM-as-judge evaluation
|
||||
llm_judge:
|
||||
enabled: bool # Default: true
|
||||
model: string # Default: haiku (cheap) or sonnet (quality)
|
||||
rubric:
|
||||
- criterion: string # What to evaluate
|
||||
weight: float # 0.0-1.0, default 1.0
|
||||
threshold: float # Min score to pass (0.0-1.0)
|
||||
|
||||
# Golden file comparison
|
||||
golden:
|
||||
- path: string # File to compare
|
||||
golden: string # Path to golden file (relative to scenario dir)
|
||||
mode: exact | normalized | semantic
|
||||
|
||||
# Human review
|
||||
human:
|
||||
required: bool # If true, scenario never auto-passes
|
||||
queue: string # Queue name for review (optional)
|
||||
rubric: [string] # Checklist for human reviewer
|
||||
|
||||
# ============================================================================
|
||||
# BENCHMARKING (optional)
|
||||
# ============================================================================
|
||||
benchmark:
|
||||
enabled: bool # Include in benchmark runs
|
||||
runs: int # Number of runs per config (for variance)
|
||||
dimensions: # What to vary
|
||||
models: [string]
|
||||
prompts: [string] # Named prompt variants
|
||||
# Add more as needed
|
||||
metrics: # What to capture (beyond defaults)
|
||||
- name: string
|
||||
command: string # Custom metric extraction
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
See `tests/scenarios/` for example scenarios at each difficulty level.
|
||||
|
||||
## Property Types Reference
|
||||
|
||||
| Type | Description | Required Fields |
|
||||
|------|-------------|-----------------|
|
||||
| `file_exists` | File exists | `path` |
|
||||
| `file_not_exists` | File does not exist | `path` |
|
||||
| `file_contains` | File contains pattern | `path`, `pattern` |
|
||||
| `function_defined` | Function/method exists | `path`, `name`, `language` |
|
||||
| `tests_pass` | Test command succeeds | `command` |
|
||||
| `compiles` | Compile command succeeds | `command` |
|
||||
| `lint_clean` | Linter passes | `command` |
|
||||
| `git_state` | Git/worker state assertions | various |
|
||||
| `custom` | Custom check script | `command` |
|
||||
|
||||
## Duration Format
|
||||
|
||||
Durations use Go-style format: `30s`, `5m`, `1h`, `1h30m`
|
||||
|
||||
## Difficulty Guidelines
|
||||
|
||||
- **easy**: Clear spec, single file, <5min for competent agent
|
||||
- **medium**: Requires reading existing code, multiple files, some ambiguity
|
||||
- **hard**: Debugging, architectural decisions, edge cases, significant ambiguity
|
||||
27
tests/fixtures/flask-user-api/README.md
vendored
Normal file
27
tests/fixtures/flask-user-api/README.md
vendored
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
# Flask User API Fixture
|
||||
|
||||
Flask application with user management for testing agent capability on medium/hard tasks.
|
||||
|
||||
## Structure
|
||||
|
||||
```
|
||||
src/
|
||||
__init__.py # App factory
|
||||
cache.py # Cache utilities
|
||||
routes/
|
||||
users.py # User endpoints
|
||||
models/
|
||||
user.py # User model
|
||||
tests/
|
||||
test_users.py # User endpoint tests
|
||||
stress_test_cache.py # Concurrency stress test
|
||||
```
|
||||
|
||||
## Scenarios Using This Fixture
|
||||
|
||||
- `medium/add-caching-to-api.yaml` - Add caching to user lookup
|
||||
- `hard/fix-race-condition.yaml` - Debug cache race condition
|
||||
|
||||
## Notes
|
||||
|
||||
This fixture intentionally contains a subtle race condition for the hard scenario.
|
||||
23
tests/fixtures/python-math-lib/README.md
vendored
Normal file
23
tests/fixtures/python-math-lib/README.md
vendored
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
# Python Math Library Fixture
|
||||
|
||||
Simple Python project for testing agent capability on basic tasks.
|
||||
|
||||
## Structure
|
||||
|
||||
```
|
||||
src/
|
||||
math_utils.py # Math utility functions
|
||||
tests/
|
||||
test_math_utils.py
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Run tests
|
||||
pytest tests/ -v
|
||||
```
|
||||
|
||||
## Scenarios Using This Fixture
|
||||
|
||||
- `easy/add-factorial.yaml` - Add factorial function
|
||||
10
tests/fixtures/python-math-lib/src/math_utils.py
vendored
Normal file
10
tests/fixtures/python-math-lib/src/math_utils.py
vendored
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
# Math utilities


def add(a: int, b: int) -> int:
    """Return the sum of the two integers a and b."""
    total = a + b
    return total


def multiply(a: int, b: int) -> int:
    """Return the product of the two integers a and b."""
    product = a * b
    return product
|
||||
32
tests/fixtures/python-math-lib/tests/test_math_utils.py
vendored
Normal file
32
tests/fixtures/python-math-lib/tests/test_math_utils.py
vendored
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
"""Tests for math utilities."""
|
||||
import pytest
|
||||
from src.math_utils import add, multiply
|
||||
|
||||
|
||||
def test_add():
|
||||
assert add(2, 3) == 5
|
||||
assert add(0, 0) == 0
|
||||
assert add(-1, 1) == 0
|
||||
|
||||
|
||||
def test_multiply():
|
||||
assert multiply(2, 3) == 6
|
||||
assert multiply(0, 5) == 0
|
||||
assert multiply(-2, 3) == -6
|
||||
|
||||
|
||||
# Tests for factorial will be added when the function exists
|
||||
def test_factorial():
|
||||
"""Test factorial function once implemented."""
|
||||
try:
|
||||
from src.math_utils import factorial
|
||||
except ImportError:
|
||||
pytest.skip("factorial not implemented yet")
|
||||
|
||||
assert factorial(0) == 1
|
||||
assert factorial(1) == 1
|
||||
assert factorial(5) == 120
|
||||
assert factorial(10) == 3628800
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
factorial(-1)
|
||||
95
tests/scenarios/easy/add-factorial.yaml
Normal file
95
tests/scenarios/easy/add-factorial.yaml
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
# Easy scenario: Add a simple function to existing codebase
|
||||
id: add-factorial
|
||||
title: Add factorial function
|
||||
difficulty: easy
|
||||
tags: [python, new-feature, single-file]
|
||||
|
||||
fixture:
|
||||
source: python-math-lib
|
||||
|
||||
task:
|
||||
description: |
|
||||
Add a function `factorial(n)` to `src/math_utils.py` that computes
|
||||
the factorial of a non-negative integer.
|
||||
|
||||
Requirements:
|
||||
- factorial(0) should return 1
|
||||
- factorial(5) should return 120
|
||||
- Should raise ValueError for negative inputs
|
||||
|
||||
entry_point: src/math_utils.py
|
||||
|
||||
execution:
|
||||
mode: both
|
||||
timeout: 5m
|
||||
|
||||
scripted:
|
||||
actions:
|
||||
- type: worker
|
||||
command: start
|
||||
- type: edit
|
||||
path: src/math_utils.py
|
||||
old: |
|
||||
# Math utilities
|
||||
new: |
|
||||
# Math utilities
|
||||
|
||||
def factorial(n: int) -> int:
|
||||
"""Compute factorial of non-negative integer."""
|
||||
if n < 0:
|
||||
raise ValueError("factorial not defined for negative numbers")
|
||||
if n <= 1:
|
||||
return 1
|
||||
return n * factorial(n - 1)
|
||||
- type: shell
|
||||
run: git add -A && git commit -m "Add factorial function"
|
||||
- type: worker
|
||||
command: done
|
||||
|
||||
verify:
|
||||
properties:
|
||||
- type: file_contains
|
||||
path: src/math_utils.py
|
||||
pattern: "def factorial"
|
||||
|
||||
- type: function_defined
|
||||
path: src/math_utils.py
|
||||
name: factorial
|
||||
language: python
|
||||
|
||||
- type: tests_pass
|
||||
command: pytest tests/ -v
|
||||
|
||||
- type: custom
|
||||
command: |
|
||||
python -c "
|
||||
from src.math_utils import factorial
|
||||
assert factorial(0) == 1
|
||||
assert factorial(5) == 120
|
||||
try:
|
||||
factorial(-1)
|
||||
exit(1) # Should have raised
|
||||
except ValueError:
|
||||
pass
|
||||
"
|
||||
|
||||
llm_judge:
|
||||
enabled: true
|
||||
model: haiku
|
||||
rubric:
|
||||
- criterion: Function correctly computes factorial for typical inputs
|
||||
weight: 1.0
|
||||
- criterion: Handles edge case n=0 correctly
|
||||
weight: 0.5
|
||||
- criterion: Handles negative input with appropriate error
|
||||
weight: 0.5
|
||||
- criterion: Code is idiomatic Python with type hints
|
||||
weight: 0.3
|
||||
threshold: 0.7
|
||||
|
||||
human:
|
||||
required: false
|
||||
|
||||
benchmark:
|
||||
enabled: true
|
||||
runs: 5
|
||||
79
tests/scenarios/hard/fix-race-condition.yaml
Normal file
79
tests/scenarios/hard/fix-race-condition.yaml
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
# Hard scenario: Debug and fix a race condition
|
||||
id: fix-race-condition
|
||||
title: Fix race condition in cache invalidation
|
||||
difficulty: hard
|
||||
tags: [python, concurrency, debugging, race-condition]
|
||||
|
||||
fixture:
|
||||
source: flask-user-api # Same fixture, but with a bug
|
||||
|
||||
task:
|
||||
description: |
|
||||
Users are reporting stale data after updates. Investigation shows there's
|
||||
a race condition in the cache invalidation logic.
|
||||
|
||||
Bug report:
|
||||
- User updates their profile
|
||||
- Immediately after, they see old data
|
||||
- Refreshing a few seconds later shows correct data
|
||||
|
||||
The issue is intermittent and happens under load.
|
||||
|
||||
Find and fix the race condition.
|
||||
|
||||
context:
|
||||
- path: src/routes/users.py
|
||||
hint: The endpoint with the bug
|
||||
- path: src/cache.py
|
||||
hint: Cache implementation
|
||||
- path: tests/test_users.py
|
||||
hint: Existing tests (don't catch this bug)
|
||||
|
||||
execution:
|
||||
mode: live # Too complex for scripted mode
|
||||
timeout: 15m
|
||||
|
||||
live:
|
||||
max_turns: 50
|
||||
# No scripted version - agent must debug
|
||||
|
||||
verify:
|
||||
properties:
|
||||
- type: tests_pass
|
||||
command: pytest tests/ -v
|
||||
|
||||
- type: custom
|
||||
command: |
|
||||
# Stress test for race condition
|
||||
python tests/stress_test_cache.py --iterations=100 --concurrent=10
|
||||
|
||||
llm_judge:
|
||||
enabled: true
|
||||
model: opus # Use best model for complex evaluation
|
||||
rubric:
|
||||
- criterion: Correctly identified the race condition
|
||||
weight: 1.0
|
||||
- criterion: Fix actually resolves the race (not just hiding it)
|
||||
weight: 1.5
|
||||
- criterion: Fix doesn't introduce new bugs or performance issues
|
||||
weight: 1.0
|
||||
- criterion: Explanation of the bug is accurate
|
||||
weight: 0.5
|
||||
- criterion: Added test that would catch this regression
|
||||
weight: 0.8
|
||||
threshold: 0.8
|
||||
|
||||
human:
|
||||
required: true # Hard scenarios need human verification
|
||||
queue: hard-reviews
|
||||
rubric:
|
||||
- Race condition is correctly identified
|
||||
- Fix is correct and complete
|
||||
- No new concurrency issues introduced
|
||||
- Performance impact is acceptable
|
||||
|
||||
benchmark:
|
||||
enabled: true
|
||||
runs: 3
|
||||
dimensions:
|
||||
models: [sonnet, opus] # Compare models on hard tasks
|
||||
154
tests/scenarios/medium/add-caching-to-api.yaml
Normal file
154
tests/scenarios/medium/add-caching-to-api.yaml
Normal file
|
|
@ -0,0 +1,154 @@
|
|||
# Medium scenario: Add caching to existing API endpoint
|
||||
id: add-caching-to-api
|
||||
title: Add caching to user lookup endpoint
|
||||
difficulty: medium
|
||||
tags: [python, flask, caching, refactor, multi-file]
|
||||
|
||||
fixture:
|
||||
source: flask-user-api
|
||||
|
||||
task:
|
||||
description: |
|
||||
The `/api/users/<id>` endpoint is slow because it queries the database
|
||||
on every request. Add caching to improve performance.
|
||||
|
||||
Requirements:
|
||||
- Cache user lookups for 5 minutes
|
||||
- Use the existing `cache` module (see src/cache.py)
|
||||
- Cache key should include user ID
|
||||
- Cache should be invalidated when user is updated (PUT /api/users/<id>)
|
||||
- Add a header `X-Cache: HIT` or `X-Cache: MISS` to responses
|
||||
|
||||
context:
|
||||
- path: src/routes/users.py
|
||||
hint: The endpoint to modify
|
||||
- path: src/cache.py
|
||||
hint: Existing cache utilities to use
|
||||
- path: src/models/user.py
|
||||
hint: User model for reference
|
||||
|
||||
execution:
|
||||
mode: both
|
||||
timeout: 10m
|
||||
|
||||
scripted:
|
||||
actions:
|
||||
- type: worker
|
||||
command: start
|
||||
- type: edit
|
||||
path: src/routes/users.py
|
||||
old: |
|
||||
@bp.route('/api/users/<int:user_id>')
|
||||
def get_user(user_id):
|
||||
user = User.query.get_or_404(user_id)
|
||||
return jsonify(user.to_dict())
|
||||
new: |
|
||||
@bp.route('/api/users/<int:user_id>')
|
||||
def get_user(user_id):
|
||||
cache_key = f"user:{user_id}"
|
||||
cached = cache.get(cache_key)
|
||||
if cached:
|
||||
response = jsonify(cached)
|
||||
response.headers['X-Cache'] = 'HIT'
|
||||
return response
|
||||
|
||||
user = User.query.get_or_404(user_id)
|
||||
user_dict = user.to_dict()
|
||||
cache.set(cache_key, user_dict, ttl=300)
|
||||
response = jsonify(user_dict)
|
||||
response.headers['X-Cache'] = 'MISS'
|
||||
return response
|
||||
- type: edit
|
||||
path: src/routes/users.py
|
||||
old: |
|
||||
@bp.route('/api/users/<int:user_id>', methods=['PUT'])
|
||||
def update_user(user_id):
|
||||
user = User.query.get_or_404(user_id)
|
||||
# ... update logic ...
|
||||
db.session.commit()
|
||||
return jsonify(user.to_dict())
|
||||
new: |
|
||||
@bp.route('/api/users/<int:user_id>', methods=['PUT'])
|
||||
def update_user(user_id):
|
||||
user = User.query.get_or_404(user_id)
|
||||
# ... update logic ...
|
||||
db.session.commit()
|
||||
cache.delete(f"user:{user_id}") # Invalidate cache
|
||||
return jsonify(user.to_dict())
|
||||
- type: edit
|
||||
path: src/routes/users.py
|
||||
old: |
|
||||
from flask import Blueprint, jsonify
|
||||
new: |
|
||||
from flask import Blueprint, jsonify
|
||||
from src import cache
|
||||
- type: shell
|
||||
run: git add -A && git commit -m "Add caching to user lookup endpoint"
|
||||
- type: worker
|
||||
command: done
|
||||
|
||||
verify:
|
||||
properties:
|
||||
- type: file_contains
|
||||
path: src/routes/users.py
|
||||
pattern: "cache\\.get"
|
||||
|
||||
- type: file_contains
|
||||
path: src/routes/users.py
|
||||
pattern: "cache\\.set"
|
||||
|
||||
- type: file_contains
|
||||
path: src/routes/users.py
|
||||
pattern: "X-Cache"
|
||||
|
||||
- type: file_contains
|
||||
path: src/routes/users.py
|
||||
pattern: "cache\\.delete"
|
||||
|
||||
- type: tests_pass
|
||||
command: pytest tests/ -v
|
||||
|
||||
- type: custom
|
||||
command: |
|
||||
# Functional test: verify caching behavior
|
||||
python -c "
|
||||
from src import create_app
|
||||
app = create_app('testing')
|
||||
with app.test_client() as client:
|
||||
# First request should be MISS
|
||||
r1 = client.get('/api/users/1')
|
||||
assert r1.headers.get('X-Cache') == 'MISS', 'First request should be MISS'
|
||||
|
||||
# Second request should be HIT
|
||||
r2 = client.get('/api/users/1')
|
||||
assert r2.headers.get('X-Cache') == 'HIT', 'Second request should be HIT'
|
||||
"
|
||||
|
||||
llm_judge:
|
||||
enabled: true
|
||||
model: sonnet # Use better model for nuanced evaluation
|
||||
rubric:
|
||||
- criterion: Caching is implemented correctly with appropriate TTL
|
||||
weight: 1.0
|
||||
- criterion: Cache invalidation on update is implemented
|
||||
weight: 1.0
|
||||
- criterion: X-Cache header correctly indicates HIT/MISS
|
||||
weight: 0.8
|
||||
- criterion: Existing cache module is used (not reinvented)
|
||||
weight: 0.5
|
||||
- criterion: Code follows existing patterns in the codebase
|
||||
weight: 0.5
|
||||
- criterion: No obvious bugs or edge cases missed
|
||||
weight: 0.7
|
||||
threshold: 0.75
|
||||
|
||||
human:
|
||||
required: false
|
||||
rubric:
|
||||
- Cache logic is correct and efficient
|
||||
- Invalidation covers all update paths
|
||||
- No security issues with cached data
|
||||
|
||||
benchmark:
|
||||
enabled: true
|
||||
runs: 3
|
||||
Loading…
Reference in a new issue