Add failure notification and resilience to backup services
- Add backup-b2-failed oneshot for OnFailure notification
- Add onFailure handler to both backup-b2 and backup-b2-check
- Add network-online.target dependency to backup-b2-check
- Add TimeoutStartSec (2h for backup, 1h for check)

Found via ops-review lenses.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
d581d7bac4
commit
b1d2674629
|
|
@ -65,11 +65,26 @@ in
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
# Backup failure notification service
|
||||||
|
systemd.services.backup-b2-failed = {
|
||||||
|
description = "Handle backup failure notification";
|
||||||
|
serviceConfig = {
|
||||||
|
Type = "oneshot";
|
||||||
|
User = "root";
|
||||||
|
};
|
||||||
|
script = ''
|
||||||
|
echo "BACKUP FAILED at $(date)" | tee -a /var/log/backup-failures.log
|
||||||
|
echo "Check: journalctl -u backup-b2 -n 50"
|
||||||
|
# TODO: Add Matrix notification or healthchecks.io ping
|
||||||
|
'';
|
||||||
|
};
|
||||||
|
|
||||||
# Backup service
|
# Backup service
|
||||||
systemd.services.backup-b2 = {
|
systemd.services.backup-b2 = {
|
||||||
description = "Restic backup to Backblaze B2";
|
description = "Restic backup to Backblaze B2";
|
||||||
after = [ "network-online.target" "postgresql.service" ];
|
after = [ "network-online.target" "postgresql.service" ];
|
||||||
wants = [ "network-online.target" ];
|
wants = [ "network-online.target" ];
|
||||||
|
onFailure = [ "backup-b2-failed.service" ];
|
||||||
# Don't require postgres - backup should still run even if DB is down
|
# Don't require postgres - backup should still run even if DB is down
|
||||||
# (will just skip the dump files if they don't exist)
|
# (will just skip the dump files if they don't exist)
|
||||||
|
|
||||||
|
|
@ -79,6 +94,8 @@ in
|
||||||
# Low priority
|
# Low priority
|
||||||
IOSchedulingClass = "idle";
|
IOSchedulingClass = "idle";
|
||||||
Nice = 19;
|
Nice = 19;
|
||||||
|
# Timeout after 2 hours to prevent hung backups
|
||||||
|
TimeoutStartSec = "2h";
|
||||||
};
|
};
|
||||||
|
|
||||||
path = [ pkgs.restic ];
|
path = [ pkgs.restic ];
|
||||||
|
|
@ -126,12 +143,17 @@ in
|
||||||
# Weekly integrity check service
|
# Weekly integrity check service
|
||||||
systemd.services.backup-b2-check = {
|
systemd.services.backup-b2-check = {
|
||||||
description = "Verify B2 backup integrity";
|
description = "Verify B2 backup integrity";
|
||||||
|
after = [ "network-online.target" ];
|
||||||
|
wants = [ "network-online.target" ];
|
||||||
|
onFailure = [ "backup-b2-failed.service" ];
|
||||||
|
|
||||||
serviceConfig = {
|
serviceConfig = {
|
||||||
Type = "oneshot";
|
Type = "oneshot";
|
||||||
User = "root";
|
User = "root";
|
||||||
IOSchedulingClass = "idle";
|
IOSchedulingClass = "idle";
|
||||||
Nice = 19;
|
Nice = 19;
|
||||||
|
# Timeout after 1 hour for integrity check
|
||||||
|
TimeoutStartSec = "1h";
|
||||||
};
|
};
|
||||||
|
|
||||||
path = [ pkgs.restic ];
|
path = [ pkgs.restic ];
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue