# skills/tests/scenarios/hard/fix-race-condition.yaml

# Scenario metadata (hard tier): debug and fix a race condition.
id: fix-race-condition
title: Fix race condition in cache invalidation
difficulty: hard
tags:
  - python
  - concurrency
  - debugging
  - race-condition

# Fixture used by this scenario.
fixture:
  source: flask-user-api  # Same fixture, but with a bug

# The task: a bug-report style description plus a list of relevant files.
task:
  # Literal block scalar (|): line breaks in the text below are preserved
  # exactly as written.
  description: |
    Users are reporting stale data after updates. Investigation shows there's
    a race condition in the cache invalidation logic.

    Bug report:
    - User updates their profile
    - Immediately after, they see old data
    - Refreshing a few seconds later shows correct data

    The issue is intermittent and happens under load.

    Find and fix the race condition.

  # File paths, each paired with a one-line hint about its role.
  context:
    - path: src/routes/users.py
      hint: The endpoint with the bug
    - path: src/cache.py
      hint: Cache implementation
    - path: tests/test_users.py
      hint: Existing tests (don't catch this bug)

# How the scenario is run: live mode, with a timeout of 15m.
execution:
  mode: live  # Too complex for scripted mode
  timeout: 15m

  # Settings specific to live mode.
  live:
    max_turns: 50
    # No scripted version - agent must debug

# Verification: command-based properties, an LLM judge, and human review.
verify:
  # Checks expressed as shell commands.
  properties:
    # Runs pytest verbosely over the tests/ directory.
    - type: tests_pass
      command: pytest tests/ -v

    # Runs the stress-test script with 100 iterations and 10 concurrent.
    # The leading "#" line below is inside the block scalar, so it is part
    # of the command string (a shell comment), not a YAML comment.
    - type: custom
      command: |
        # Stress test for race condition
        python tests/stress_test_cache.py --iterations=100 --concurrent=10

  # LLM-graded rubric. The five weights below sum to 4.8.
  llm_judge:
    enabled: true
    model: opus  # Use best model for complex evaluation
    rubric:
      - criterion: Correctly identified the race condition
        weight: 1.0
      - criterion: Fix actually resolves the race (not just hiding it)
        weight: 1.5
      - criterion: Fix doesn't introduce new bugs or performance issues
        weight: 1.0
      # NOTE(review): task.description asks only to find and fix the race;
      # it does not request an explanation or a new regression test, yet the
      # two criteria below grade both - confirm this is intentional.
      - criterion: Explanation of the bug is accurate
        weight: 0.5
      - criterion: Added test that would catch this regression
        weight: 0.8
    # NOTE(review): presumably the minimum normalized score needed to pass;
    # confirm how the judge combines the weights with this threshold.
    threshold: 0.8

  # Human review step: mandatory here, routed to the hard-reviews queue.
  human:
    required: true  # Hard scenarios need human verification
    queue: hard-reviews
    # Checklist for the reviewer (plain strings, no weights).
    rubric:
      - Race condition is correctly identified
      - Fix is correct and complete
      - No new concurrency issues introduced
      - Performance impact is acceptable

# Benchmark settings: enabled, 3 runs, with models as the only dimension.
benchmark:
  enabled: true
  runs: 3
  dimensions:
    # Compare models on hard tasks
    models:
      - sonnet
      - opus