docs: add scenario schema and example test fixtures

Scaffold for agent capability benchmark harness (skills-qng9):
- docs/specs/scenario-schema.md: YAML schema for test scenarios
- tests/scenarios/: Easy, medium, hard example scenarios
- tests/fixtures/: Python fixtures for testing

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
dan 2026-01-11 21:24:28 -08:00
parent 7c7733bc64
commit 4da1890fc3
8 changed files with 594 additions and 0 deletions

docs/specs/scenario-schema.md
@@ -0,0 +1,174 @@
# Scenario Definition Schema
Defines the YAML format for agent capability test scenarios.
## Schema
```yaml
# ============================================================================
# METADATA
# ============================================================================
id: string            # Unique identifier (kebab-case)
title: string         # Human-readable name
difficulty: easy | medium | hard
tags: [string]        # For filtering (e.g., ["python", "refactor", "testing"])

# ============================================================================
# FIXTURE
# ============================================================================
fixture:
  source: string      # Path to fixture directory (relative to fixtures/)
  # OR
  git: string         # Git repo URL to clone
  ref: string         # Branch/tag/commit (optional, default: main)
  setup:              # Commands to run after fixture setup (optional)
    - string

# ============================================================================
# TASK
# ============================================================================
task:
  description: |      # The prompt given to the agent
    Multi-line description of what to accomplish.
    This is what the agent sees.
  context:            # Additional context to provide (optional)
    - path: string    # File path relative to fixture
      hint: string    # Why this file is relevant
  entry_point: string # Where agent should start (optional, e.g., "src/main.py")

# ============================================================================
# EXECUTION
# ============================================================================
execution:
  mode: scripted | live | both  # Which execution modes to support
  timeout: duration             # e.g., "5m", "30s"

  # For scripted mode: deterministic actions that simulate agent work
  scripted:
    actions:
      - type: shell
        run: string         # Shell command
      - type: write
        path: string
        content: string
      - type: edit
        path: string
        old: string
        new: string
      - type: worker        # Worker CLI command
        command: string     # e.g., "start", "done"
        args: {key: value}

  # For live mode: real agent execution
  live:
    model: string           # Model to use (optional, default from config)
    system_prompt: string   # Override system prompt (optional)
    tools: [string]         # Tool restrictions (optional)
    max_turns: int          # Max agent turns (optional)

# ============================================================================
# VERIFICATION
# ============================================================================
verify:
  # Property-based checks (fast, deterministic)
  properties:
    - type: file_exists
      path: string
    - type: file_not_exists
      path: string
    - type: file_contains
      path: string
      pattern: string       # Regex or literal
    - type: function_defined
      path: string
      name: string
      language: python | nim | typescript | ...
    - type: tests_pass
      command: string       # e.g., "pytest", "npm test"
    - type: compiles
      command: string       # e.g., "nim c src/main.nim"
    - type: lint_clean
      command: string       # e.g., "ruff check ."
    - type: git_state
      branch_merged: string     # Branch name that should be merged
      worktree_removed: string  # Worktree path that should not exist
      db_state:                 # Worker DB assertions
        task_id: string
        state: string
    - type: custom
      command: string       # Exit 0 = pass, non-zero = fail

  # LLM-as-judge evaluation
  llm_judge:
    enabled: bool           # Default: true
    model: string           # Default: haiku (cheap) or sonnet (quality)
    rubric:
      - criterion: string   # What to evaluate
        weight: float       # Relative weight (default: 1.0)
    threshold: float        # Min score to pass (0.0-1.0)

  # Golden file comparison
  golden:
    - path: string          # File to compare
      golden: string        # Path to golden file (relative to scenario dir)
      mode: exact | normalized | semantic

  # Human review
  human:
    required: bool          # If true, scenario never auto-passes
    queue: string           # Queue name for review (optional)
    rubric: [string]        # Checklist for human reviewer

# ============================================================================
# BENCHMARKING (optional)
# ============================================================================
benchmark:
  enabled: bool             # Include in benchmark runs
  runs: int                 # Number of runs per config (for variance)
  dimensions:               # What to vary
    models: [string]
    prompts: [string]       # Named prompt variants
    # Add more as needed
  metrics:                  # What to capture (beyond defaults)
    - name: string
      command: string       # Custom metric extraction
```
## Examples
See `tests/scenarios/` for example scenarios at each difficulty level.
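As a quick illustration of how these files might be consumed, the sketch below loads a scenario and checks its top-level shape. This is a hypothetical harness helper, not code from this commit; it assumes PyYAML, and the set of required keys is an assumption based on the schema above.
```python
# Hypothetical loader sketch, not part of this commit.
from pathlib import Path

import yaml  # assumes PyYAML is installed

REQUIRED_KEYS = {"id", "title", "difficulty", "fixture", "task", "execution", "verify"}
DIFFICULTIES = {"easy", "medium", "hard"}


def load_scenario(path: str) -> dict:
    """Load a scenario YAML file and sanity-check its top-level shape."""
    data = yaml.safe_load(Path(path).read_text())
    missing = REQUIRED_KEYS - data.keys()
    if missing:
        raise ValueError(f"{path}: missing keys {sorted(missing)}")
    if data["difficulty"] not in DIFFICULTIES:
        raise ValueError(f"{path}: unknown difficulty {data['difficulty']!r}")
    return data
```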
## Property Types Reference
| Type | Description | Required Fields |
|------|-------------|-----------------|
| `file_exists` | File exists | `path` |
| `file_not_exists` | File does not exist | `path` |
| `file_contains` | File contains pattern | `path`, `pattern` |
| `function_defined` | Function/method exists | `path`, `name`, `language` |
| `tests_pass` | Test command succeeds | `command` |
| `compiles` | Compile command succeeds | `command` |
| `lint_clean` | Linter passes | `command` |
| `git_state` | Git/worker state assertions | various |
| `custom` | Custom check script | `command` |
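Most of these checks reduce to a file lookup or a subprocess call. The dispatcher below is an illustrative sketch only (the function name and its handling of each type are assumptions, not harness code from this commit); command-based checks such as `tests_pass` treat exit code 0 as a pass, matching the `custom` convention above.
```python
# Illustrative property-check dispatcher, not part of this commit.
import re
import subprocess
from pathlib import Path


def check_property(prop: dict, workdir: Path) -> bool:
    """Evaluate one verify.properties entry against a fixture checkout."""
    kind = prop["type"]
    if kind == "file_exists":
        return (workdir / prop["path"]).exists()
    if kind == "file_not_exists":
        return not (workdir / prop["path"]).exists()
    if kind == "file_contains":
        text = (workdir / prop["path"]).read_text()
        return re.search(prop["pattern"], text) is not None
    if kind in ("tests_pass", "compiles", "lint_clean", "custom"):
        result = subprocess.run(prop["command"], shell=True, cwd=workdir)
        return result.returncode == 0
    raise ValueError(f"unsupported property type: {kind}")  # e.g. git_state, function_defined
```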
## Duration Format
Durations use Go-style format: `30s`, `5m`, `1h`, `1h30m`
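A Python harness would need these values as seconds; one possible conversion (the helper name is an assumption, not part of this commit):
```python
# Possible Go-style duration parser, illustrative only.
import re

_UNIT_SECONDS = {"h": 3600, "m": 60, "s": 1}


def parse_duration(text: str) -> int:
    """Convert strings like '30s', '5m', or '1h30m' into seconds."""
    parts = re.findall(r"(\d+)([hms])", text)
    if not parts or "".join(num + unit for num, unit in parts) != text:
        raise ValueError(f"invalid duration: {text!r}")
    return sum(int(num) * _UNIT_SECONDS[unit] for num, unit in parts)
```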
## Difficulty Guidelines
- **easy**: Clear spec, single file, <5min for competent agent
- **medium**: Requires reading existing code, multiple files, some ambiguity
- **hard**: Debugging, architectural decisions, edge cases, significant ambiguity

tests/fixtures/flask-user-api/README.md
@@ -0,0 +1,27 @@
# Flask User API Fixture
Flask application with user management for testing agent capability on medium/hard tasks.
## Structure
```
src/
  __init__.py            # App factory
  cache.py               # Cache utilities
  routes/
    users.py             # User endpoints
  models/
    user.py              # User model
tests/
  test_users.py          # User endpoint tests
  stress_test_cache.py   # Concurrency stress test
```
## Scenarios Using This Fixture
- `medium/add-caching-to-api.yaml` - Add caching to user lookup
- `hard/fix-race-condition.yaml` - Debug cache race condition
## Notes
This fixture intentionally contains a subtle race condition for the hard scenario.
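The fixture's cache and route code are not reproduced in this README. As a rough, self-contained illustration of the failure mode the hard scenario targets (a cache refill based on a pre-update read overwriting the invalidation), consider the following; this shows the general bug class only and is not the fixture's actual code:
```python
# Generic illustration of a stale cache refill race; NOT the fixture's code.
import threading
import time

db = {"user:1": "old"}
cache = {}


def reader():
    value = db["user:1"]       # cache miss: read the current row ("old")
    time.sleep(0.05)           # window during which an update can land
    cache["user:1"] = value    # stale value written back after the update


def writer():
    db["user:1"] = "new"       # commit the update
    cache.pop("user:1", None)  # invalidate the key


t_read = threading.Thread(target=reader)
t_write = threading.Thread(target=writer)
t_read.start()
time.sleep(0.01)
t_write.start()
t_read.join()
t_write.join()
print(cache)  # {'user:1': 'old'}: readers keep seeing stale data until the TTL expires
```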


@@ -0,0 +1,23 @@
# Python Math Library Fixture
Simple Python project for testing agent capability on basic tasks.
## Structure
```
src/
  math_utils.py          # Math utility functions
tests/
  test_math_utils.py
```
## Usage
```bash
# Run tests
pytest tests/ -v
```
## Scenarios Using This Fixture
- `easy/add-factorial.yaml` - Add factorial function


@@ -0,0 +1,10 @@
# Math utilities

def add(a: int, b: int) -> int:
    """Add two integers."""
    return a + b


def multiply(a: int, b: int) -> int:
    """Multiply two integers."""
    return a * b


@@ -0,0 +1,32 @@
"""Tests for math utilities."""
import pytest
from src.math_utils import add, multiply
def test_add():
assert add(2, 3) == 5
assert add(0, 0) == 0
assert add(-1, 1) == 0
def test_multiply():
assert multiply(2, 3) == 6
assert multiply(0, 5) == 0
assert multiply(-2, 3) == -6
# Tests for factorial will be added when the function exists
def test_factorial():
"""Test factorial function once implemented."""
try:
from src.math_utils import factorial
except ImportError:
pytest.skip("factorial not implemented yet")
assert factorial(0) == 1
assert factorial(1) == 1
assert factorial(5) == 120
assert factorial(10) == 3628800
with pytest.raises(ValueError):
factorial(-1)


@@ -0,0 +1,95 @@
# Easy scenario: Add a simple function to existing codebase
id: add-factorial
title: Add factorial function
difficulty: easy
tags: [python, new-feature, single-file]

fixture:
  source: python-math-lib

task:
  description: |
    Add a function `factorial(n)` to `src/math_utils.py` that computes
    the factorial of a non-negative integer.

    Requirements:
    - factorial(0) should return 1
    - factorial(5) should return 120
    - Should raise ValueError for negative inputs
  entry_point: src/math_utils.py

execution:
  mode: both
  timeout: 5m
  scripted:
    actions:
      - type: worker
        command: start
      - type: edit
        path: src/math_utils.py
        old: |
          # Math utilities
        new: |
          # Math utilities

          def factorial(n: int) -> int:
              """Compute factorial of non-negative integer."""
              if n < 0:
                  raise ValueError("factorial not defined for negative numbers")
              if n <= 1:
                  return 1
              return n * factorial(n - 1)
      - type: shell
        run: git add -A && git commit -m "Add factorial function"
      - type: worker
        command: done

verify:
  properties:
    - type: file_contains
      path: src/math_utils.py
      pattern: "def factorial"
    - type: function_defined
      path: src/math_utils.py
      name: factorial
      language: python
    - type: tests_pass
      command: pytest tests/ -v
    - type: custom
      command: |
        python -c "
        from src.math_utils import factorial
        assert factorial(0) == 1
        assert factorial(5) == 120
        try:
            factorial(-1)
            exit(1)  # Should have raised
        except ValueError:
            pass
        "
  llm_judge:
    enabled: true
    model: haiku
    rubric:
      - criterion: Function correctly computes factorial for typical inputs
        weight: 1.0
      - criterion: Handles edge case n=0 correctly
        weight: 0.5
      - criterion: Handles negative input with appropriate error
        weight: 0.5
      - criterion: Code is idiomatic Python with type hints
        weight: 0.3
    threshold: 0.7
  human:
    required: false

benchmark:
  enabled: true
  runs: 5


@@ -0,0 +1,79 @@
# Hard scenario: Debug and fix a race condition
id: fix-race-condition
title: Fix race condition in cache invalidation
difficulty: hard
tags: [python, concurrency, debugging, race-condition]

fixture:
  source: flask-user-api  # Same fixture, but with a bug

task:
  description: |
    Users are reporting stale data after updates. Investigation shows there's
    a race condition in the cache invalidation logic.

    Bug report:
    - User updates their profile
    - Immediately after, they see old data
    - Refreshing a few seconds later shows correct data

    The issue is intermittent and happens under load.

    Find and fix the race condition.
  context:
    - path: src/routes/users.py
      hint: The endpoint with the bug
    - path: src/cache.py
      hint: Cache implementation
    - path: tests/test_users.py
      hint: Existing tests (don't catch this bug)

execution:
  mode: live  # Too complex for scripted mode
  timeout: 15m
  live:
    max_turns: 50
  # No scripted version - agent must debug

verify:
  properties:
    - type: tests_pass
      command: pytest tests/ -v
    - type: custom
      command: |
        # Stress test for race condition
        python tests/stress_test_cache.py --iterations=100 --concurrent=10
  llm_judge:
    enabled: true
    model: opus  # Use best model for complex evaluation
    rubric:
      - criterion: Correctly identified the race condition
        weight: 1.0
      - criterion: Fix actually resolves the race (not just hiding it)
        weight: 1.5
      - criterion: Fix doesn't introduce new bugs or performance issues
        weight: 1.0
      - criterion: Explanation of the bug is accurate
        weight: 0.5
      - criterion: Added test that would catch this regression
        weight: 0.8
    threshold: 0.8
  human:
    required: true  # Hard scenarios need human verification
    queue: hard-reviews
    rubric:
      - Race condition is correctly identified
      - Fix is correct and complete
      - No new concurrency issues introduced
      - Performance impact is acceptable

benchmark:
  enabled: true
  runs: 3
  dimensions:
    models: [sonnet, opus]  # Compare models on hard tasks


@@ -0,0 +1,154 @@
# Medium scenario: Add caching to existing API endpoint
id: add-caching-to-api
title: Add caching to user lookup endpoint
difficulty: medium
tags: [python, flask, caching, refactor, multi-file]

fixture:
  source: flask-user-api

task:
  description: |
    The `/api/users/<id>` endpoint is slow because it queries the database
    on every request. Add caching to improve performance.

    Requirements:
    - Cache user lookups for 5 minutes
    - Use the existing `cache` module (see src/cache.py)
    - Cache key should include user ID
    - Cache should be invalidated when user is updated (PUT /api/users/<id>)
    - Add a header `X-Cache: HIT` or `X-Cache: MISS` to responses
  context:
    - path: src/routes/users.py
      hint: The endpoint to modify
    - path: src/cache.py
      hint: Existing cache utilities to use
    - path: src/models/user.py
      hint: User model for reference

execution:
  mode: both
  timeout: 10m
  scripted:
    actions:
      - type: worker
        command: start
      - type: edit
        path: src/routes/users.py
        old: |
          @bp.route('/api/users/<int:user_id>')
          def get_user(user_id):
              user = User.query.get_or_404(user_id)
              return jsonify(user.to_dict())
        new: |
          @bp.route('/api/users/<int:user_id>')
          def get_user(user_id):
              cache_key = f"user:{user_id}"
              cached = cache.get(cache_key)
              if cached:
                  response = jsonify(cached)
                  response.headers['X-Cache'] = 'HIT'
                  return response
              user = User.query.get_or_404(user_id)
              user_dict = user.to_dict()
              cache.set(cache_key, user_dict, ttl=300)
              response = jsonify(user_dict)
              response.headers['X-Cache'] = 'MISS'
              return response
      - type: edit
        path: src/routes/users.py
        old: |
          @bp.route('/api/users/<int:user_id>', methods=['PUT'])
          def update_user(user_id):
              user = User.query.get_or_404(user_id)
              # ... update logic ...
              db.session.commit()
              return jsonify(user.to_dict())
        new: |
          @bp.route('/api/users/<int:user_id>', methods=['PUT'])
          def update_user(user_id):
              user = User.query.get_or_404(user_id)
              # ... update logic ...
              db.session.commit()
              cache.delete(f"user:{user_id}")  # Invalidate cache
              return jsonify(user.to_dict())
      - type: edit
        path: src/routes/users.py
        old: |
          from flask import Blueprint, jsonify
        new: |
          from flask import Blueprint, jsonify
          from src import cache
      - type: shell
        run: git add -A && git commit -m "Add caching to user lookup endpoint"
      - type: worker
        command: done

verify:
  properties:
    - type: file_contains
      path: src/routes/users.py
      pattern: "cache\\.get"
    - type: file_contains
      path: src/routes/users.py
      pattern: "cache\\.set"
    - type: file_contains
      path: src/routes/users.py
      pattern: "X-Cache"
    - type: file_contains
      path: src/routes/users.py
      pattern: "cache\\.delete"
    - type: tests_pass
      command: pytest tests/ -v
    - type: custom
      command: |
        # Functional test: verify caching behavior
        python -c "
        from src import create_app
        app = create_app('testing')
        with app.test_client() as client:
            # First request should be MISS
            r1 = client.get('/api/users/1')
            assert r1.headers.get('X-Cache') == 'MISS', 'First request should be MISS'
            # Second request should be HIT
            r2 = client.get('/api/users/1')
            assert r2.headers.get('X-Cache') == 'HIT', 'Second request should be HIT'
        "
  llm_judge:
    enabled: true
    model: sonnet  # Use better model for nuanced evaluation
    rubric:
      - criterion: Caching is implemented correctly with appropriate TTL
        weight: 1.0
      - criterion: Cache invalidation on update is implemented
        weight: 1.0
      - criterion: X-Cache header correctly indicates HIT/MISS
        weight: 0.8
      - criterion: Existing cache module is used (not reinvented)
        weight: 0.5
      - criterion: Code follows existing patterns in the codebase
        weight: 0.5
      - criterion: No obvious bugs or edge cases missed
        weight: 0.7
    threshold: 0.75
  human:
    required: false
    rubric:
      - Cache logic is correct and efficient
      - Invalidation covers all update paths
      - No security issues with cached data

benchmark:
  enabled: true
  runs: 3