Add cgroups limits and CPU watchdog
- User slice: MemoryMax 80%, TasksMax 500, CPUWeight 100 - CPU watchdog: detects sustained abuse (>180% for 5 min), kills user - Fixed scripts for NixOS (shebang, PATH) - Closes ops-jrz1-8m7, ops-jrz1-1bk 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
9c3ebaf9f9
commit
89f2987f1e
|
|
@ -21,7 +21,7 @@
|
|||
{"id":"ops-jrz1-6of","title":"AI cost/rate limiting per user","description":"One user could drain API credits with runaway script. Need rate limiting per user, either via proxy middleware or opencode config. Track usage.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-05T15:32:30.772304538-08:00","updated_at":"2025-12-05T17:42:42.773613559-08:00","closed_at":"2025-12-05T17:42:42.773613559-08:00","dependencies":[{"issue_id":"ops-jrz1-6of","depends_on_id":"ops-jrz1-3so","type":"parent-child","created_at":"2025-12-05T17:05:47.206816868-08:00","created_by":"daemon","metadata":"{}"},{"issue_id":"ops-jrz1-6of","depends_on_id":"ops-jrz1-wj2","type":"blocks","created_at":"2025-12-05T17:17:38.658742196-08:00","created_by":"daemon","metadata":"{}"}]}
|
||||
{"id":"ops-jrz1-7j4","title":"Git credential strategy for non-programmers","description":"Non-programmers can't manage SSH keys. Pre-configure git-credential-store or provide simple PAT workflow with docs. Store in persistent home with 600 perms.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-05T15:32:19.673999683-08:00","updated_at":"2025-12-05T17:38:54.788694408-08:00","closed_at":"2025-12-05T17:38:54.788694408-08:00","dependencies":[{"issue_id":"ops-jrz1-7j4","depends_on_id":"ops-jrz1-3so","type":"parent-child","created_at":"2025-12-05T17:05:47.139749437-08:00","created_by":"daemon","metadata":"{}"}]}
|
||||
{"id":"ops-jrz1-88o","title":"Implement backup strategy for VPS","description":"No backups configured. Critical data: Matrix DB (622M), PostgreSQL (161M), Forgejo (2.5M), maubot (320K). No recovery path if disk fails. Need automated backups with off-site storage.","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-04T22:55:25.546850172-08:00","updated_at":"2025-12-05T00:56:27.720623612-08:00","closed_at":"2025-12-05T00:56:27.720623612-08:00"}
|
||||
{"id":"ops-jrz1-8m7","title":"Add cgroups limits for user slices","description":"Add soft resource limits to prevent one user/agent from crashing server.\n\n## Config\n```nix\nsystemd.slices.\"user\".sliceConfig = {\n MemoryMax = \"80%\";\n TasksMax = 500;\n CPUWeight = 100; # Fair sharing, no hard quota\n};\n```\n\n## Behavior\n- Memory: Users collectively can't exceed 80% RAM\n- Tasks: Max 500 processes per user (prevents fork bombs)\n- CPU: Fair sharing when contended, bursts allowed\n\n## Testing\n- Verify with `systemctl show user-1001.slice`\n- Test fork bomb doesn't crash server","status":"open","priority":2,"issue_type":"task","created_at":"2026-01-02T20:16:22.600133044-08:00","created_by":"dan","updated_at":"2026-01-02T20:16:22.600133044-08:00"}
|
||||
{"id":"ops-jrz1-8m7","title":"Add cgroups limits for user slices","description":"Add soft resource limits to prevent one user/agent from crashing server.\n\n## Config\n```nix\nsystemd.slices.\"user\".sliceConfig = {\n MemoryMax = \"80%\";\n TasksMax = 500;\n CPUWeight = 100; # Fair sharing, no hard quota\n};\n```\n\n## Behavior\n- Memory: Users collectively can't exceed 80% RAM\n- Tasks: Max 500 processes per user (prevents fork bombs)\n- CPU: Fair sharing when contended, bursts allowed\n\n## Testing\n- Verify with `systemctl show user-1001.slice`\n- Test fork bomb doesn't crash server","status":"in_progress","priority":2,"issue_type":"task","created_at":"2026-01-02T20:16:22.600133044-08:00","created_by":"dan","updated_at":"2026-01-02T20:54:10.907683845-08:00"}
|
||||
{"id":"ops-jrz1-9gd","title":"Upgrade VPS RAM for dev environments","description":"Current: 2GB. Need 4-8GB for multiple code-server containers. Coordinate with Vultr, plan maintenance window.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-05T17:16:54.267689439-08:00","updated_at":"2025-12-28T00:08:06.748175273-05:00","closed_at":"2025-12-28T00:08:06.748175273-05:00","close_reason":"Browser-based dev environment cancelled","dependencies":[{"issue_id":"ops-jrz1-9gd","depends_on_id":"ops-jrz1-3so","type":"parent-child","created_at":"2025-12-05T17:17:36.331146543-08:00","created_by":"daemon","metadata":"{}"}]}
|
||||
{"id":"ops-jrz1-9pe","title":"Research: System packages for learner accounts","description":"How do dev users get access to toolchains (Go, Node, Rust, etc.)?\n\n## Findings\n\n**Users CAN self-install packages:**\n```bash\nnix profile install nixpkgs#go\nnix profile install nixpkgs#nodejs\nnix profile install nixpkgs#rustc\n```\n\nPackages go to `~/.nix-profile/bin`, already in PATH. Works today.\n\n**Devshells work too:**\n```bash\n# In project with flake.nix\nnix develop\n```\n\n## Options\n\n| Option | Pros | Cons |\n|--------|------|------|\n| **Self-service only** | Minimal config, user learns nix | Cold start friction |\n| **Global defaults** | Zero friction for common tools | Bloats system, version conflicts |\n| **Starter script** | One command setup, customizable | Another thing to maintain |\n| **direnv + devshells** | Per-project envs, reproducible | Needs direnv installed globally |\n\n## Current State\n- `nix profile install` works for users ✅\n- `nix develop` works ✅\n- direnv NOT installed globally\n- Only python3, uv in system packages\n\n## Recommendation\n1. Add `direnv` to global packages (enables per-project devshells)\n2. Document `nix profile install` for quick one-offs\n3. Provide example flake.nix templates for Go, Node, Rust projects\n4. Keep system packages minimal (python3, uv, direnv, git, vim)\n\n## Test Results\n```\n$ nix profile install nixpkgs#go\n$ go version\ngo version go1.22.8 linux/amd64\n```","status":"closed","priority":2,"issue_type":"task","created_at":"2026-01-02T12:27:32.894163417-08:00","created_by":"dan","updated_at":"2026-01-02T12:32:32.502649201-08:00","closed_at":"2026-01-02T12:32:32.502649201-08:00","close_reason":"Users can self-install via nix profile. Added direnv globally for devshells."}
|
||||
{"id":"ops-jrz1-9x8","title":"Claude CLI update mechanism","description":"Claude Code CLI is manually installed to /usr/local/bin/claude.\n\n## Current state\n- Installed via: curl -fsSL https://claude.ai/install.sh | bash\n- Copied to /usr/local/bin/claude\n- No automatic updates\n\n## Options\n1. Periodic manual update (run install script again)\n2. Systemd timer to check for updates\n3. Package via nix (would need custom derivation)\n\n## Acceptance criteria\nDocument the update process at minimum.","status":"open","priority":3,"issue_type":"task","created_at":"2026-01-02T16:46:03.908575951-08:00","created_by":"dan","updated_at":"2026-01-02T16:46:03.908575951-08:00"}
|
||||
|
|
|
|||
|
|
@ -78,6 +78,31 @@
|
|||
"olm-3.2.16"
|
||||
];
|
||||
|
||||
# Resource limits for user slices (prevent one user from crashing server)
|
||||
systemd.slices."user".sliceConfig = {
|
||||
MemoryMax = "80%"; # Users collectively can't exceed 80% RAM
|
||||
TasksMax = 500; # Max 500 processes per user (prevents fork bombs)
|
||||
CPUWeight = 100; # Fair sharing when contended, bursts allowed
|
||||
};
|
||||
|
||||
# CPU watchdog - detects sustained abuse, kills offending user
|
||||
systemd.services.cpu-watchdog = {
|
||||
description = "CPU abuse watchdog";
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
ExecStart = "/usr/local/bin/cpu-watchdog";
|
||||
};
|
||||
};
|
||||
|
||||
systemd.timers.cpu-watchdog = {
|
||||
description = "Run CPU watchdog every minute";
|
||||
wantedBy = [ "timers.target" ];
|
||||
timerConfig = {
|
||||
OnBootSec = "1min";
|
||||
OnUnitActiveSec = "1min";
|
||||
};
|
||||
};
|
||||
|
||||
# This value determines the NixOS release compatibility
|
||||
system.stateVersion = "24.05";
|
||||
}
|
||||
|
|
|
|||
43
scripts/cpu-watchdog
Executable file
43
scripts/cpu-watchdog
Executable file
|
|
@ -0,0 +1,43 @@
|
|||
#!/run/current-system/sw/bin/bash
|
||||
# cpu-watchdog - Detect sustained CPU abuse, kill after 5 consecutive violations
|
||||
# Runs every minute via systemd timer
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# NixOS paths
|
||||
PATH="/run/current-system/sw/bin:$PATH"
|
||||
|
||||
THRESHOLD=180 # 180% CPU (almost 2 cores)
|
||||
MAX_STRIKES=5
|
||||
COUNTDIR="/var/lib/cpu-watchdog"
|
||||
|
||||
mkdir -p "$COUNTDIR"
|
||||
|
||||
for homedir in /home/*; do
|
||||
user=$(basename "$homedir")
|
||||
|
||||
# Skip if not a real user
|
||||
id "$user" &>/dev/null || continue
|
||||
|
||||
# Get total CPU usage for user
|
||||
pct=$(ps -u "$user" -o %cpu= 2>/dev/null | awk '{s+=$1}END{print int(s)}' | tr -d '[:space:]' || echo 0)
|
||||
pct=${pct:-0}
|
||||
[[ "$pct" =~ ^[0-9]+$ ]] || pct=0
|
||||
|
||||
if [ "$pct" -gt "$THRESHOLD" ]; then
|
||||
# Increment strike counter
|
||||
count=$(cat "$COUNTDIR/$user" 2>/dev/null || echo 0)
|
||||
count=$((count + 1))
|
||||
echo "$count" > "$COUNTDIR/$user"
|
||||
|
||||
logger -t cpu-watchdog "User $user at ${pct}% CPU (strike $count/$MAX_STRIKES)"
|
||||
|
||||
if [ "$count" -ge "$MAX_STRIKES" ]; then
|
||||
/usr/local/bin/killswitch "$user" "sustained CPU abuse (${pct}%)"
|
||||
rm -f "$COUNTDIR/$user"
|
||||
fi
|
||||
else
|
||||
# Reset counter if below threshold
|
||||
rm -f "$COUNTDIR/$user"
|
||||
fi
|
||||
done
|
||||
|
|
@ -1,9 +1,12 @@
|
|||
#!/usr/bin/env bash
|
||||
#!/run/current-system/sw/bin/bash
|
||||
# killswitch - Immediately terminate all processes for a user
|
||||
# Usage: killswitch <username> [reason]
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# NixOS paths
|
||||
PATH="/run/current-system/sw/bin:$PATH"
|
||||
|
||||
if [ $# -lt 1 ]; then
|
||||
echo "Usage: killswitch <username> [reason]" >&2
|
||||
exit 1
|
||||
|
|
|
|||
Loading…
Reference in a new issue