diff --git a/docs/specs/scenario-schema.md b/docs/specs/scenario-schema.md
new file mode 100644
index 0000000..2ccd4e7
--- /dev/null
+++ b/docs/specs/scenario-schema.md
@@ -0,0 +1,174 @@
+# Scenario Definition Schema
+
+Defines the YAML format for agent capability test scenarios.
+
+## Schema
+
+```yaml
+# ============================================================================
+# METADATA
+# ============================================================================
+id: string                       # Unique identifier (kebab-case)
+title: string                    # Human-readable name
+difficulty: easy | medium | hard
+tags: [string]                   # For filtering (e.g., ["python", "refactor", "testing"])
+
+# ============================================================================
+# FIXTURE
+# ============================================================================
+fixture:
+  source: string                 # Path to fixture directory (relative to fixtures/)
+  # OR
+  git: string                    # Git repo URL to clone
+  ref: string                    # Branch/tag/commit (optional, default: main)
+
+setup:                           # Commands to run after the fixture is set up (optional)
+  - string
+
+# ============================================================================
+# TASK
+# ============================================================================
+task:
+  description: |                 # The prompt given to the agent
+    Multi-line description of what to accomplish.
+    This is what the agent sees.
+
+  context:                       # Additional context to provide (optional)
+    - path: string               # File path relative to fixture
+      hint: string               # Why this file is relevant
+
+  entry_point: string            # Where the agent should start (optional, e.g., "src/main.py")
+
+# ============================================================================
+# EXECUTION
+# ============================================================================
+execution:
+  mode: scripted | live | both   # Which execution modes to support
+  timeout: duration              # e.g., "5m", "30s"
+
+  # For scripted mode: deterministic actions that simulate agent work
+  scripted:
+    actions:
+      - type: shell
+        run: string              # Shell command
+      - type: write
+        path: string
+        content: string
+      - type: edit
+        path: string
+        old: string
+        new: string
+      - type: worker             # Worker CLI command
+        command: string          # e.g., "start", "done"
+        args: {key: value}
+
+  # For live mode: real agent execution
+  live:
+    model: string                # Model to use (optional, default from config)
+    system_prompt: string        # Override the system prompt (optional)
+    tools: [string]              # Tool restrictions (optional)
+    max_turns: int               # Max agent turns (optional)
+
+# ============================================================================
+# VERIFICATION
+# ============================================================================
+verify:
+  # Property-based checks (fast, deterministic)
+  properties:
+    - type: file_exists
+      path: string
+
+    - type: file_not_exists
+      path: string
+
+    - type: file_contains
+      path: string
+      pattern: string            # Regex or literal
+
+    - type: function_defined
+      path: string
+      name: string
+      language: python | nim | typescript | ...
+
+    - type: tests_pass
+      command: string            # e.g., "pytest", "npm test"
+
+    - type: compiles
+      command: string            # e.g., "nim c src/main.nim"
+
+    - type: lint_clean
+      command: string            # e.g., "ruff check ."
+
+    - type: git_state
+      branch_merged: string      # Branch name that should be merged
+      worktree_removed: string   # Worktree path that should not exist
+      db_state:                  # Worker DB assertions
+        task_id: string
+        state: string
+
+    - type: custom
+      command: string            # Exit 0 = pass, non-zero = fail
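+
+  # Illustrative example (not part of the schema): a concrete properties
+  # list composed from the check types above; the paths and commands are
+  # sample values only.
+  #
+  #   properties:
+  #     - type: file_contains
+  #       path: src/math_utils.py
+  #       pattern: "def factorial"
+  #     - type: tests_pass
+  #       command: pytest tests/ -v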
+
+  # LLM-as-judge evaluation
+  llm_judge:
+    enabled: bool                # Default: true
+    model: string                # Default: haiku (cheap) or sonnet (quality)
+    rubric:
+      - criterion: string        # What to evaluate
+        weight: float            # Relative weight (default 1.0)
+    threshold: float             # Min weighted score to pass (0.0-1.0)
+
+  # Golden file comparison
+  golden:
+    - path: string               # File to compare
+      golden: string             # Path to golden file (relative to scenario dir)
+      mode: exact | normalized | semantic
+
+  # Human review
+  human:
+    required: bool               # If true, the scenario never auto-passes
+    queue: string                # Queue name for review (optional)
+    rubric: [string]             # Checklist for the human reviewer
+
+# ============================================================================
+# BENCHMARKING (optional)
+# ============================================================================
+benchmark:
+  enabled: bool                  # Include in benchmark runs
+  runs: int                      # Number of runs per config (for variance)
+  dimensions:                    # What to vary
+    models: [string]
+    prompts: [string]            # Named prompt variants
+    # Add more as needed
+  metrics:                       # What to capture (beyond defaults)
+    - name: string
+      command: string            # Custom metric extraction
+```
+
+## Examples
+
+See `tests/scenarios/` for example scenarios at each difficulty level.
+
+## Property Types Reference
+
+| Type | Description | Required Fields |
+|------|-------------|-----------------|
+| `file_exists` | File exists | `path` |
+| `file_not_exists` | File does not exist | `path` |
+| `file_contains` | File contains pattern | `path`, `pattern` |
+| `function_defined` | Function/method exists | `path`, `name`, `language` |
+| `tests_pass` | Test command succeeds | `command` |
+| `compiles` | Compile command succeeds | `command` |
+| `lint_clean` | Linter passes | `command` |
+| `git_state` | Git/worker state assertions | various |
+| `custom` | Custom check script | `command` |
+
+## Duration Format
+
+Durations use the Go-style format: `30s`, `5m`, `1h`, `1h30m`.
+
+## Difficulty Guidelines
+
+- **easy**: Clear spec, single file, under 5 minutes for a competent agent
+- **medium**: Requires reading existing code, multiple files, some ambiguity
+- **hard**: Debugging, architectural decisions, edge cases, significant ambiguity
diff --git a/tests/fixtures/flask-user-api/README.md b/tests/fixtures/flask-user-api/README.md
new file mode 100644
index 0000000..b2ceac3
--- /dev/null
+++ b/tests/fixtures/flask-user-api/README.md
@@ -0,0 +1,27 @@
+# Flask User API Fixture
+
+A Flask application with user management, used to test agent capability on medium and hard tasks.
+
+## Structure
+
+```
+src/
+  __init__.py            # App factory
+  cache.py               # Cache utilities
+  routes/
+    users.py             # User endpoints
+  models/
+    user.py              # User model
+tests/
+  test_users.py          # User endpoint tests
+  stress_test_cache.py   # Concurrency stress test
+```
+
+## Scenarios Using This Fixture
+
+- `medium/add-caching-to-api.yaml` - Add caching to user lookup
+- `hard/fix-race-condition.yaml` - Debug cache race condition
+
+## Notes
+
+This fixture intentionally contains a subtle race condition for the hard scenario.
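+
+The buggy code itself lives in the fixture sources. As a rough sketch of the
+class of bug this fixture targets (illustrative only; the names, timing, and
+dict-backed cache below are invented for the demo and are not the fixture's
+actual code), consider a cache-aside reader racing a write-then-invalidate
+updater:
+
+```python
+import threading
+import time
+
+db = {"1": "old"}
+cache = {}  # toy stand-in for src/cache.py (illustrative only)
+
+def get_user(user_id):
+    """Cache-aside read: check the cache, fall back to the DB, repopulate."""
+    cached = cache.get(user_id)
+    if cached is not None:
+        return cached
+    value = db[user_id]     # reads the old row if the writer hasn't committed yet
+    time.sleep(0.05)        # widen the race window for demonstration
+    cache[user_id] = value  # can land AFTER the writer's invalidation
+    return value
+
+def update_user(user_id, new_value):
+    """Write-then-invalidate: the ordering that loses the race."""
+    db[user_id] = new_value
+    cache.pop(user_id, None)  # a stale value may be re-cached right after this
+
+reader = threading.Thread(target=get_user, args=("1",))
+reader.start()
+time.sleep(0.01)  # let the reader get past its DB read first
+update_user("1", "new")
+reader.join()
+print(cache.get("1"))  # usually "old": stale data is served until eviction
+```
+
+A reader that loaded the old row before the write can repopulate the cache
+after the writer's invalidation, which matches the bug report in
+`hard/fix-race-condition.yaml`: stale data right after an update, correct data
+once the entry is evicted or rewritten.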
diff --git a/tests/fixtures/python-math-lib/README.md b/tests/fixtures/python-math-lib/README.md
new file mode 100644
index 0000000..c3aa174
--- /dev/null
+++ b/tests/fixtures/python-math-lib/README.md
@@ -0,0 +1,23 @@
+# Python Math Library Fixture
+
+A simple Python project for testing agent capability on basic tasks.
+
+## Structure
+
+```
+src/
+  math_utils.py   # Math utility functions
+tests/
+  test_math_utils.py
+```
+
+## Usage
+
+```bash
+# Run tests
+pytest tests/ -v
+```
+
+## Scenarios Using This Fixture
+
+- `easy/add-factorial.yaml` - Add factorial function
diff --git a/tests/fixtures/python-math-lib/src/math_utils.py b/tests/fixtures/python-math-lib/src/math_utils.py
new file mode 100644
index 0000000..c336246
--- /dev/null
+++ b/tests/fixtures/python-math-lib/src/math_utils.py
@@ -0,0 +1,10 @@
+# Math utilities
+
+def add(a: int, b: int) -> int:
+    """Add two integers."""
+    return a + b
+
+
+def multiply(a: int, b: int) -> int:
+    """Multiply two integers."""
+    return a * b
diff --git a/tests/fixtures/python-math-lib/tests/test_math_utils.py b/tests/fixtures/python-math-lib/tests/test_math_utils.py
new file mode 100644
index 0000000..5282887
--- /dev/null
+++ b/tests/fixtures/python-math-lib/tests/test_math_utils.py
@@ -0,0 +1,32 @@
+"""Tests for math utilities."""
+import pytest
+from src.math_utils import add, multiply
+
+
+def test_add():
+    assert add(2, 3) == 5
+    assert add(0, 0) == 0
+    assert add(-1, 1) == 0
+
+
+def test_multiply():
+    assert multiply(2, 3) == 6
+    assert multiply(0, 5) == 0
+    assert multiply(-2, 3) == -6
+
+
+# The factorial test is skipped until the function is implemented
+def test_factorial():
+    """Test the factorial function once it is implemented."""
+    try:
+        from src.math_utils import factorial
+    except ImportError:
+        pytest.skip("factorial not implemented yet")
+
+    assert factorial(0) == 1
+    assert factorial(1) == 1
+    assert factorial(5) == 120
+    assert factorial(10) == 3628800
+
+    with pytest.raises(ValueError):
+        factorial(-1)
diff --git a/tests/scenarios/easy/add-factorial.yaml b/tests/scenarios/easy/add-factorial.yaml
new file mode 100644
index 0000000..2e185b1
--- /dev/null
+++ b/tests/scenarios/easy/add-factorial.yaml
@@ -0,0 +1,95 @@
+# Easy scenario: Add a simple function to an existing codebase
+id: add-factorial
+title: Add factorial function
+difficulty: easy
+tags: [python, new-feature, single-file]
+
+fixture:
+  source: python-math-lib
+
+task:
+  description: |
+    Add a function `factorial(n)` to `src/math_utils.py` that computes
+    the factorial of a non-negative integer.
+
+    Requirements:
+    - factorial(0) should return 1
+    - factorial(5) should return 120
+    - Should raise ValueError for negative inputs
+
+  entry_point: src/math_utils.py
+
+execution:
+  mode: both
+  timeout: 5m
+
+  scripted:
+    actions:
+      - type: worker
+        command: start
+      - type: edit
+        path: src/math_utils.py
+        old: |
+          # Math utilities
+        new: |
+          # Math utilities
+
+          def factorial(n: int) -> int:
+              """Compute the factorial of a non-negative integer."""
+              if n < 0:
+                  raise ValueError("factorial not defined for negative numbers")
+              if n <= 1:
+                  return 1
+              return n * factorial(n - 1)
+      - type: shell
+        run: git add -A && git commit -m "Add factorial function"
+      - type: worker
+        command: done
+
+verify:
+  properties:
+    - type: file_contains
+      path: src/math_utils.py
+      pattern: "def factorial"
+
+    - type: function_defined
+      path: src/math_utils.py
+      name: factorial
+      language: python
+
+    - type: tests_pass
+      command: pytest tests/ -v
+
+    - type: custom
+      command: |
+        python -c "
+        from src.math_utils import factorial
+        assert factorial(0) == 1
+        assert factorial(5) == 120
+        try:
+            factorial(-1)
+            exit(1)  # should have raised ValueError
+        except ValueError:
+            pass
+        "
+
+  llm_judge:
+    enabled: true
+    model: haiku
+    rubric:
+      - criterion: Function correctly computes factorial for typical inputs
+        weight: 1.0
+      - criterion: Handles the edge case n=0 correctly
+        weight: 0.5
+      - criterion: Handles negative input with an appropriate error
+        weight: 0.5
+      - criterion: Code is idiomatic Python with type hints
+        weight: 0.3
+    threshold: 0.7
+
+  human:
+    required: false
+
+benchmark:
+  enabled: true
+  runs: 5
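+
+# Illustrative only (not part of this scenario): the schema's optional
+# benchmark dimensions could be used here to vary models or named prompt
+# variants; the values below are hypothetical.
+#
+#   benchmark:
+#     enabled: true
+#     runs: 5
+#     dimensions:
+#       models: [haiku, sonnet]
+#       prompts: [default, terse]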
diff --git a/tests/scenarios/hard/fix-race-condition.yaml b/tests/scenarios/hard/fix-race-condition.yaml
new file mode 100644
index 0000000..c0588e1
--- /dev/null
+++ b/tests/scenarios/hard/fix-race-condition.yaml
@@ -0,0 +1,79 @@
+# Hard scenario: Debug and fix a race condition
+id: fix-race-condition
+title: Fix race condition in cache invalidation
+difficulty: hard
+tags: [python, concurrency, debugging, race-condition]
+
+fixture:
+  source: flask-user-api  # Same fixture; it ships with the bug
+
+task:
+  description: |
+    Users are reporting stale data after updates. Investigation shows there's
+    a race condition in the cache invalidation logic.
+
+    Bug report:
+    - User updates their profile
+    - Immediately after, they see old data
+    - Refreshing a few seconds later shows correct data
+
+    The issue is intermittent and happens under load.
+
+    Find and fix the race condition.
+
+  context:
+    - path: src/routes/users.py
+      hint: The endpoint with the bug
+    - path: src/cache.py
+      hint: Cache implementation
+    - path: tests/test_users.py
+      hint: Existing tests (they don't catch this bug)
+
+execution:
+  mode: live  # Too complex for scripted mode
+  timeout: 15m
+
+  live:
+    max_turns: 50
+    # No scripted version - the agent must debug
+
+verify:
+  properties:
+    - type: tests_pass
+      command: pytest tests/ -v
+
+    - type: custom
+      command: |
+        # Stress test for the race condition
+        python tests/stress_test_cache.py --iterations=100 --concurrent=10
+
+  llm_judge:
+    enabled: true
+    model: opus  # Use the best model for complex evaluation
+    rubric:
+      - criterion: Correctly identified the race condition
+        weight: 1.0
+      - criterion: Fix actually resolves the race (not just hiding it)
+        weight: 1.5
+      - criterion: Fix doesn't introduce new bugs or performance issues
+        weight: 1.0
+      - criterion: Explanation of the bug is accurate
+        weight: 0.5
+      - criterion: Added a test that would catch this regression
+        weight: 0.8
+    threshold: 0.8
+
+  human:
+    required: true  # Hard scenarios need human verification
+    queue: hard-reviews
+    rubric:
+      - Race condition is correctly identified
+      - Fix is correct and complete
+      - No new concurrency issues introduced
+      - Performance impact is acceptable
+
+benchmark:
+  enabled: true
+  runs: 3
+  dimensions:
+    models: [sonnet, opus]  # Compare models on hard tasks
diff --git a/tests/scenarios/medium/add-caching-to-api.yaml b/tests/scenarios/medium/add-caching-to-api.yaml
new file mode 100644
index 0000000..a87c2d4
--- /dev/null
+++ b/tests/scenarios/medium/add-caching-to-api.yaml
@@ -0,0 +1,154 @@
+# Medium scenario: Add caching to an existing API endpoint
+id: add-caching-to-api
+title: Add caching to user lookup endpoint
+difficulty: medium
+tags: [python, flask, caching, refactor, multi-file]
+
+fixture:
+  source: flask-user-api
+
+task:
+  description: |
+    The `/api/users/<user_id>` endpoint is slow because it queries the database
+    on every request. Add caching to improve performance.
+
+    Requirements:
+    - Cache user lookups for 5 minutes
+    - Use the existing `cache` module (see src/cache.py)
+    - The cache key should include the user ID
+    - The cache should be invalidated when a user is updated (PUT /api/users/<user_id>)
+    - Add a header `X-Cache: HIT` or `X-Cache: MISS` to responses
+
+  context:
+    - path: src/routes/users.py
+      hint: The endpoint to modify
+    - path: src/cache.py
+      hint: Existing cache utilities to use
+    - path: src/models/user.py
+      hint: User model for reference
+
+execution:
+  mode: both
+  timeout: 10m
+
+  scripted:
+    actions:
+      - type: worker
+        command: start
+      - type: edit
+        path: src/routes/users.py
+        old: |
+          @bp.route('/api/users/<int:user_id>')
+          def get_user(user_id):
+              user = User.query.get_or_404(user_id)
+              return jsonify(user.to_dict())
+        new: |
+          @bp.route('/api/users/<int:user_id>')
+          def get_user(user_id):
+              cache_key = f"user:{user_id}"
+              cached = cache.get(cache_key)
+              if cached:
+                  response = jsonify(cached)
+                  response.headers['X-Cache'] = 'HIT'
+                  return response
+
+              user = User.query.get_or_404(user_id)
+              user_dict = user.to_dict()
+              cache.set(cache_key, user_dict, ttl=300)
+              response = jsonify(user_dict)
+              response.headers['X-Cache'] = 'MISS'
+              return response
+      - type: edit
+        path: src/routes/users.py
+        old: |
+          @bp.route('/api/users/<int:user_id>', methods=['PUT'])
+          def update_user(user_id):
+              user = User.query.get_or_404(user_id)
+              # ... update logic ...
+              db.session.commit()
+              return jsonify(user.to_dict())
+        new: |
+          @bp.route('/api/users/<int:user_id>', methods=['PUT'])
+          def update_user(user_id):
+              user = User.query.get_or_404(user_id)
+              # ... update logic ...
+              db.session.commit()
+              cache.delete(f"user:{user_id}")  # Invalidate the cached entry
+              return jsonify(user.to_dict())
+      - type: edit
+        path: src/routes/users.py
+        old: |
+          from flask import Blueprint, jsonify
+        new: |
+          from flask import Blueprint, jsonify
+          from src import cache
+      - type: shell
+        run: git add -A && git commit -m "Add caching to user lookup endpoint"
+      - type: worker
+        command: done
+
+verify:
+  properties:
+    - type: file_contains
+      path: src/routes/users.py
+      pattern: "cache\\.get"
+
+    - type: file_contains
+      path: src/routes/users.py
+      pattern: "cache\\.set"
+
+    - type: file_contains
+      path: src/routes/users.py
+      pattern: "X-Cache"
+
+    - type: file_contains
+      path: src/routes/users.py
+      pattern: "cache\\.delete"
+
+    - type: tests_pass
+      command: pytest tests/ -v
+
+    - type: custom
+      command: |
+        # Functional test: verify caching behavior
+        python -c "
+        from src import create_app
+        app = create_app('testing')
+        with app.test_client() as client:
+            # First request should be a MISS
+            r1 = client.get('/api/users/1')
+            assert r1.headers.get('X-Cache') == 'MISS', 'First request should be MISS'
+
+            # Second request should be a HIT
+            r2 = client.get('/api/users/1')
+            assert r2.headers.get('X-Cache') == 'HIT', 'Second request should be HIT'
+        "
+
+  llm_judge:
+    enabled: true
+    model: sonnet  # Use a stronger model for nuanced evaluation
+    rubric:
+      - criterion: Caching is implemented correctly with an appropriate TTL
+        weight: 1.0
+      - criterion: Cache invalidation on update is implemented
+        weight: 1.0
+      - criterion: X-Cache header correctly indicates HIT/MISS
+        weight: 0.8
+      - criterion: The existing cache module is used (not reinvented)
+        weight: 0.5
+      - criterion: Code follows existing patterns in the codebase
+        weight: 0.5
+      - criterion: No obvious bugs or missed edge cases
+        weight: 0.7
+    threshold: 0.75
+
+  human:
+    required: false
+    rubric:
+      - Cache logic is correct and efficient
+      - Invalidation covers all update paths
+      - No security issues with cached data
+
+benchmark:
+  enabled: true
+  runs: 3
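+
+# Illustrative only (not part of this scenario): the schema also supports
+# golden-file comparison under verify. A hypothetical check for this task
+# could diff the modified route file against a stored golden copy:
+#
+#   verify:
+#     golden:
+#       - path: src/routes/users.py
+#         golden: golden/users.py   # hypothetical path, relative to the scenario dir
+#         mode: normalized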