Scaffold for agent capability benchmark harness (skills-qng9):
- docs/specs/scenario-schema.md: YAML schema for test scenarios
- tests/scenarios/: Easy, medium, hard example scenarios
- tests/fixtures/: Python fixtures for testing

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
96 lines
2.2 KiB
YAML
96 lines
2.2 KiB
YAML
# Easy scenario: Add a simple function to an existing codebase
|
|
id: add-factorial
|
|
title: Add factorial function
|
|
difficulty: easy
|
|
tags: [python, new-feature, single-file]
|
|
|
|
fixture:
|
|
source: python-math-lib
|
|
|
|
task:
|
|
description: |
|
|
Add a function `factorial(n)` to `src/math_utils.py` that computes
|
|
the factorial of a non-negative integer.
|
|
|
|
Requirements:
|
|
- factorial(0) should return 1
|
|
- factorial(5) should return 120
|
|
- Should raise ValueError for negative inputs
|
|
|
|
entry_point: src/math_utils.py
|
|
|
|
execution:
|
|
mode: both
|
|
timeout: 5m
|
|
|
|
scripted:
|
|
actions:
|
|
- type: worker
|
|
command: start
|
|
- type: edit
|
|
path: src/math_utils.py
|
|
old: |
|
|
# Math utilities
|
|
new: |
|
|
# Math utilities
|
|
|
|
def factorial(n: int) -> int:
    """Return the factorial of a non-negative integer.

    The product is accumulated iteratively, so large inputs are not limited
    by the interpreter's recursion depth.

    Args:
        n: The non-negative integer whose factorial is wanted.

    Returns:
        The product 1 * 2 * ... * n, which is 1 when n is 0 or 1.

    Raises:
        ValueError: If n is negative.
    """
    if n < 0:
        raise ValueError("factorial not defined for negative numbers")
    result = 1
    # range(2, n + 1) is empty for n in (0, 1), leaving the result at 1.
    for factor in range(2, n + 1):
        result *= factor
    return result
|
|
- type: shell
|
|
run: git add -A && git commit -m "Add factorial function"
|
|
- type: worker
|
|
command: done
|
|
|
|
verify:
|
|
properties:
|
|
- type: file_contains
|
|
path: src/math_utils.py
|
|
pattern: "def factorial"
|
|
|
|
- type: function_defined
|
|
path: src/math_utils.py
|
|
name: factorial
|
|
language: python
|
|
|
|
- type: tests_pass
|
|
command: pytest tests/ -v
|
|
|
|
- type: custom
|
|
command: |
|
|
python -c "
# Check the stated requirements: factorial(0), factorial(5), negative input.
from src.math_utils import factorial
assert factorial(0) == 1
assert factorial(5) == 120
try:
    factorial(-1)
except ValueError:
    pass  # Expected: negative input is rejected
else:
    # No exception was raised, so fail the check with a non-zero status.
    # SystemExit is raised directly because the exit() helper is injected
    # by the site module and is not guaranteed to exist in every run mode.
    raise SystemExit(1)
"
|
|
|
|
llm_judge:
|
|
enabled: true
|
|
model: haiku
|
|
rubric:
|
|
- criterion: Function correctly computes factorial for typical inputs
|
|
weight: 1.0
|
|
- criterion: Handles edge case n=0 correctly
|
|
weight: 0.5
|
|
- criterion: Handles negative input with appropriate error
|
|
weight: 0.5
|
|
- criterion: Code is idiomatic Python with type hints
|
|
weight: 0.3
|
|
threshold: 0.7
|
|
|
|
human:
|
|
required: false
|
|
|
|
benchmark:
|
|
enabled: true
|
|
runs: 5
|