Add failure notification and resilience to backup services

- Add backup-b2-failed oneshot for OnFailure notification
- Add onFailure handler to both backup-b2 and backup-b2-check
- Add network-online.target dependency to backup-b2-check
- Add TimeoutStartSec (2h for backup, 1h for check)

Found via ops-review lenses.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Dan 2026-01-10 18:56:33 -08:00
parent d581d7bac4
commit b1d2674629

View file

@ -65,11 +65,26 @@ in
}; };
}; };
# Backup failure notification service.
# Shared OnFailure= handler: both backup-b2 and backup-b2-check trigger it,
# so the message must identify which unit actually failed.
systemd.services.backup-b2-failed = {
  description = "Handle backup failure notification";
  serviceConfig = {
    Type = "oneshot";
    User = "root";
  };
  script = ''
    # systemd (v251+) exports MONITOR_UNIT to services started via OnFailure=.
    # It may be unset (older systemd, manual start, or several units
    # triggering this handler at once), so default to empty and fall back
    # to a hint that covers both backup units.
    failed_unit="''${MONITOR_UNIT:-}"

    if [ -n "$failed_unit" ]; then
      echo "BACKUP FAILED ($failed_unit) at $(date)" | tee -a /var/log/backup-failures.log
      echo "Check: journalctl -u $failed_unit -n 50"
    else
      echo "BACKUP FAILED at $(date)" | tee -a /var/log/backup-failures.log
      echo "Check: journalctl -u backup-b2 -u backup-b2-check -n 50"
    fi
    # TODO: Add Matrix notification or healthchecks.io ping
  '';
};
# Backup service # Backup service
systemd.services.backup-b2 = { systemd.services.backup-b2 = {
description = "Restic backup to Backblaze B2"; description = "Restic backup to Backblaze B2";
after = [ "network-online.target" "postgresql.service" ]; after = [ "network-online.target" "postgresql.service" ];
wants = [ "network-online.target" ]; wants = [ "network-online.target" ];
onFailure = [ "backup-b2-failed.service" ];
# Don't require postgres - backup should still run even if DB is down # Don't require postgres - backup should still run even if DB is down
# (will just skip the dump files if they don't exist) # (will just skip the dump files if they don't exist)
@ -79,6 +94,8 @@ in
# Low priority # Low priority
IOSchedulingClass = "idle"; IOSchedulingClass = "idle";
Nice = 19; Nice = 19;
# Timeout after 2 hours to prevent hung backups
TimeoutStartSec = "2h";
}; };
path = [ pkgs.restic ]; path = [ pkgs.restic ];
@ -126,12 +143,17 @@ in
# Weekly integrity check service # Weekly integrity check service
systemd.services.backup-b2-check = { systemd.services.backup-b2-check = {
description = "Verify B2 backup integrity"; description = "Verify B2 backup integrity";
after = [ "network-online.target" ];
wants = [ "network-online.target" ];
onFailure = [ "backup-b2-failed.service" ];
serviceConfig = { serviceConfig = {
Type = "oneshot"; Type = "oneshot";
User = "root"; User = "root";
IOSchedulingClass = "idle"; IOSchedulingClass = "idle";
Nice = 19; Nice = 19;
# Timeout after 1 hour for integrity check
TimeoutStartSec = "1h";
}; };
path = [ pkgs.restic ]; path = [ pkgs.restic ];