Add failure notification and resilience to backup services
- Add backup-b2-failed oneshot for OnFailure notification
- Add onFailure handler to both backup-b2 and backup-b2-check
- Add network-online.target dependency to backup-b2-check
- Add TimeoutStartSec (2h for backup, 1h for check)

Found via ops-review lenses.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
d581d7bac4
commit
b1d2674629
|
|
@ -65,11 +65,26 @@ in
|
|||
};
|
||||
};
|
||||
|
||||
# Backup failure notification service
|
||||
# Failure handler unit: it is listed in the onFailure attribute of both
# backup-b2 and backup-b2-check. It only records the failure locally; no
# external alert is sent yet (see the TODO inside the script).
systemd.services.backup-b2-failed = {
|
||||
description = "Handle backup failure notification";
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
User = "root";
|
||||
};
|
||||
# Appends a timestamped "BACKUP FAILED" line to /var/log/backup-failures.log
# (tee -a also echoes it to stdout), then prints a journalctl hint.
# NOTE(review): the hint always names backup-b2, although this handler is also
# the onFailure target of backup-b2-check — confirm whether that is intended.
# NOTE(review): presumably stdout ends up in this unit's journal — verify.
script = ''
|
||||
echo "BACKUP FAILED at $(date)" | tee -a /var/log/backup-failures.log
|
||||
echo "Check: journalctl -u backup-b2 -n 50"
|
||||
# TODO: Add Matrix notification or healthchecks.io ping
|
||||
'';
|
||||
};
|
||||
|
||||
# Backup service
|
||||
systemd.services.backup-b2 = {
|
||||
description = "Restic backup to Backblaze B2";
|
||||
after = [ "network-online.target" "postgresql.service" ];
|
||||
wants = [ "network-online.target" ];
|
||||
onFailure = [ "backup-b2-failed.service" ];
|
||||
# Don't require postgres - backup should still run even if DB is down
|
||||
# (will just skip the dump files if they don't exist)
|
||||
|
||||
|
|
@ -79,6 +94,8 @@ in
|
|||
# Low priority
|
||||
IOSchedulingClass = "idle";
|
||||
Nice = 19;
|
||||
# Timeout after 2 hours to prevent hung backups
|
||||
TimeoutStartSec = "2h";
|
||||
};
|
||||
|
||||
path = [ pkgs.restic ];
|
||||
|
|
@ -126,12 +143,17 @@ in
|
|||
# Weekly integrity check service
|
||||
systemd.services.backup-b2-check = {
|
||||
description = "Verify B2 backup integrity";
|
||||
after = [ "network-online.target" ];
|
||||
wants = [ "network-online.target" ];
|
||||
onFailure = [ "backup-b2-failed.service" ];
|
||||
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
User = "root";
|
||||
IOSchedulingClass = "idle";
|
||||
Nice = 19;
|
||||
# Timeout after 1 hour for integrity check
|
||||
TimeoutStartSec = "1h";
|
||||
};
|
||||
|
||||
path = [ pkgs.restic ];
|
||||
|
|
|
|||
Loading…
Reference in a new issue