diff --git a/doc/source/sysadmin.rst b/doc/source/sysadmin.rst index 2694e34cfa..6735caa132 100644 --- a/doc/source/sysadmin.rst +++ b/doc/source/sysadmin.rst @@ -262,6 +262,19 @@ then monthly backups for 1 year and yearly backups for each archive. The backup servers will send a warning when backup volume usage is high, at which point this can be run manually. +Retiring hosts +-------------- + +When a host that runs backups is retired, it should have its user +added to the `borg_retire_users` list (this list can differ per-backup +server, depending on storage requirements). Retired users will have +their backup accounts disabled and we will only keep the latest backup +revision. + +When we are ready to completely remove the backups, the user can be +moved to the `borg_purge_users` list, which will purge all borg +backups on the next ansible run. + .. _force-merging-a-change: Force-Merging a Change diff --git a/playbooks/roles/borg-backup-server/README.rst b/playbooks/roles/borg-backup-server/README.rst index c78c557dec..f16bfa704a 100644 --- a/playbooks/roles/borg-backup-server/README.rst +++ b/playbooks/roles/borg-backup-server/README.rst @@ -13,3 +13,18 @@ Their ``authorized_keys`` file is configured with the public key to allow the remote host to log in and only run ``borg`` in server mode. **Role Variables** + +.. zuul:rolevar:: borg_retire_users + :default: [] + + A list of backup user names that are in a "retired" state. The + host should not be in the inventory or active. The backup user + will be disabled and when running a prune, we will only keep the + latest backup to save space. + +.. zuul:rolevar:: borg_purge_users + :default: [] + + A list of backup user names whose data should be purged. This list + represents backups for hosts that have been retired and we now + agree we do not want to retain any of their data. 
diff --git a/playbooks/roles/borg-backup-server/defaults/main.yaml b/playbooks/roles/borg-backup-server/defaults/main.yaml index 9ba22faee0..c3d0fc7839 100644 --- a/playbooks/roles/borg-backup-server/defaults/main.yaml +++ b/playbooks/roles/borg-backup-server/defaults/main.yaml @@ -1 +1,3 @@ borg_users: [] +borg_retire_users: [] +borg_purge_users: [] diff --git a/playbooks/roles/borg-backup-server/files/prune-borg-backups.sh b/playbooks/roles/borg-backup-server/files/prune-borg-backups.sh index af6b684dc6..7669047f63 100644 --- a/playbooks/roles/borg-backup-server/files/prune-borg-backups.sh +++ b/playbooks/roles/borg-backup-server/files/prune-borg-backups.sh @@ -10,10 +10,12 @@ if [[ ${borg_op} == 'noop' ]]; then BORG_OP='--dry-run' elif [[ ${borg_op} == 'prune' ]]; then BORG_OP='' - LOG_FILE="/opt/backups/prune-$(date '+%Y-%m-%d-%H-%M-%S').log" - echo "*** Logging output to ${LOG_FILE}" - exec 1>${LOG_FILE} - exec 2>&1 + if [ -z ${NO_LOG_FILE+x} ]; then + LOG_FILE="/opt/backups/prune-$(date '+%Y-%m-%d-%H-%M-%S').log" + echo "*** Logging output to ${LOG_FILE}" + exec 1>${LOG_FILE} + exec 2>&1 + fi else echo "*** Invalid input" exit 1 @@ -22,25 +24,39 @@ fi pushd /opt/backups for u in borg-*; do - BORG_REPO=/opt/backups/$u/backup + BORG_BASE=/opt/backups/$u + BORG_REPO=${BORG_BASE}/backup - sudo BORG_OP=${BORG_OP} BORG_RELOCATED_REPO_ACCESS_IS_OK=y BORG_REPO=${BORG_REPO} -u ${u} -s <<'EOF' + _prune_flags='--keep-daily 7 --keep-weekly 4 --keep-monthly 12' + _retired='' + if [[ -f ${BORG_BASE}/.retired ]]; then + _prune_flags='--keep-daily 1' + _retired=' (retired)' + fi + + sudo BORG_OP=${BORG_OP} BORG_UNKNOWN_UNENCRYPTED_REPO_ACCESS_IS_OK=yes BORG_REPO=${BORG_REPO} _retired="${_retired}" _prune_flags="${_prune_flags}" -u ${u} -s <<'EOF' # Look at all archives and strip the timestamp, leaving just the archive names # We limit the prune by --prefix so each archive is considered separately # Long-running aborted backups might leave a ".checkpoint" archive around; 
ignore # these as prune will remove them automatically + # + # Note we are assuming the archives are in the format made by our backup scripts, + # which include -YYYY-MM-DDTHH:MM:SS on the end. archives=$(/opt/borg/bin/borg list ${BORG_REPO} | awk '$1 !~ /\.checkpoint$/ { print substr($1, 0, length($1)-20) }' | sort | uniq) + echo "+------" + echo "| $(date) Pruning ${BORG_REPO}${_retired}" + for prefix in ${archives}; do - echo - echo - echo "+------" - echo "| $(date) Pruning ${BORG_REPO} archive ${prefix}" - echo "+------" - /opt/borg/bin/borg prune --prefix ${prefix} ${BORG_OP} --verbose --list --show-rc --keep-daily 7 --keep-weekly 4 --keep-monthly 12 + echo "| $(date) - archive ${prefix}" + /opt/borg/bin/borg prune --prefix ${prefix} ${BORG_OP} --verbose --list --show-rc ${_prune_flags} done + echo "| $(date) done!" + echo "+------" + echo + EOF done diff --git a/playbooks/roles/borg-backup-server/tasks/main.yaml b/playbooks/roles/borg-backup-server/tasks/main.yaml index 5ea555008d..67ea53e813 100644 --- a/playbooks/roles/borg-backup-server/tasks/main.yaml +++ b/playbooks/roles/borg-backup-server/tasks/main.yaml @@ -64,3 +64,15 @@ loop: '{{ borg_users }}' loop_control: loop_var: borg_user + +- name: Remove purged user's backup dirs + file: + name: '/opt/backups/{{ item }}/backup' + state: absent + loop: '{{ borg_purge_users }}' + +- name: Disable retired users + include_tasks: retire.yaml + loop: '{{ borg_retire_users }}' + loop_control: + loop_var: borg_user diff --git a/playbooks/roles/borg-backup-server/tasks/retire.yaml b/playbooks/roles/borg-backup-server/tasks/retire.yaml new file mode 100644 index 0000000000..0208fac0b7 --- /dev/null +++ b/playbooks/roles/borg-backup-server/tasks/retire.yaml @@ -0,0 +1,17 @@ +- name: Disable backup user login + user: + name: '{{ borg_user }}' + shell: /bin/nologin + +- name: Remove ssh key + file: + name: '/opt/backups/{{ borg_user }}/.ssh' + state: absent + +- name: Mark as retired + file: + name: '/opt/backups/{{ 
borg_user }}/.retired' + state: touch + owner: '{{ borg_user }}' + group: '{{ borg_user }}' + mode: 0644 diff --git a/playbooks/test-borg-backup-pre.yaml b/playbooks/test-borg-backup-pre.yaml new file mode 100644 index 0000000000..bfff0fbf07 --- /dev/null +++ b/playbooks/test-borg-backup-pre.yaml @@ -0,0 +1,31 @@ +- hosts: "borg-backup-server" + tasks: + - name: Setup backup area + file: + name: '/opt/backups/' + state: directory + mode: 0755 + owner: root + group: root + + # We put this in borg_retire_users and check it gets + # marked as retired + - name: Setup fake retired user + user: + name: borg-retired + home: '/opt/backups/borg-retired' + + # We put "borg-purge" in borg_purge_users and check its + # backup directory gets removed. + - name: Setup fake purge user + user: + name: borg-purge + home: '/opt/backups/borg-purge' + + - name: Setup fake purge user backup directory + file: + name: '/opt/backups/borg-purge/backup' + state: directory + mode: 0755 + owner: borg-purge + group: borg-purge diff --git a/playbooks/zuul/run-base.yaml b/playbooks/zuul/run-base.yaml index 554365399a..9c363adfe8 100644 --- a/playbooks/zuul/run-base.yaml +++ b/playbooks/zuul/run-base.yaml @@ -140,6 +140,7 @@ - group_vars/zuul-merger.yaml - group_vars/zuul-scheduler.yaml - group_vars/zuul-web.yaml + - host_vars/borg-backup01.region.provider.opendev.org.yaml - host_vars/codesearch01.opendev.org.yaml - host_vars/etherpad99.opendev.org.yaml - host_vars/letsencrypt01.opendev.org.yaml diff --git a/playbooks/zuul/templates/host_vars/borg-backup01.region.provider.opendev.org.yaml.j2 b/playbooks/zuul/templates/host_vars/borg-backup01.region.provider.opendev.org.yaml.j2 new file mode 100644 index 0000000000..7de413411b --- /dev/null +++ b/playbooks/zuul/templates/host_vars/borg-backup01.region.provider.opendev.org.yaml.j2 @@ -0,0 +1,4 @@ +borg_retire_users: + - borg-retired +borg_purge_users: + - borg-purge diff --git a/testinfra/test_borg_backups.py b/testinfra/test_borg_backups.py index 
9975fc0459..17b37602a1 100644 --- a/testinfra/test_borg_backups.py +++ b/testinfra/test_borg_backups.py @@ -54,6 +54,14 @@ def test_borg_server_users(host): f = host.file(borg_repo) assert f.exists + # test retired stamp is made for host in retired group + f = host.file('/opt/backups/borg-retired/.retired') + assert f.exists + + # test purge for host in purge group + f = host.file('/opt/backups/borg-purge/backup') + assert not f.exists + def test_borg_backup_host_config(host): hostname = host.backend.get_hostname() if hostname == 'borg-backup01.region.provider.opendev.org': @@ -97,7 +105,19 @@ def test_borg_server_prune(host): if hostname != 'borg-backup01.region.provider.opendev.org': pytest.skip() - cmd = host.run('echo "prune" | /usr/local/bin/prune-borg-backups &> /var/log/prune-borg-backups.log') + # bit of a hack; instead of making a host, backing it up, and then + # retiring it -- which would require testing multiple runs of the + # backup process -- simulate the retired user being active by just + # making a small archive. This ensures the prune script works on + # a user flagged as retired. 
+ cmd = host.run('dd if=/dev/urandom of=/tmp/borg-backup.random bs=1M count=5') + assert cmd.succeeded + cmd = host.run('sudo -u borg-retired /opt/borg/bin/borg init --encryption=none /opt/backups/borg-retired/backup') + assert cmd.succeeded + cmd = host.run('sudo -u borg-retired /opt/borg/bin/borg create /opt/backups/borg-retired/backup::test-9999-12-12T00:00:00 /tmp/borg-backup.random') + assert cmd.succeeded + + cmd = host.run('echo "prune" | NO_LOG_FILE=1 /usr/local/bin/prune-borg-backups &> /var/log/prune-borg-backups.log') assert cmd.succeeded def test_borg_server_verify(host): diff --git a/zuul.d/system-config-run.yaml b/zuul.d/system-config-run.yaml index 653d42b0ab..f83010d1af 100644 --- a/zuul.d/system-config-run.yaml +++ b/zuul.d/system-config-run.yaml @@ -427,6 +427,7 @@ - <<: *bastion_group vars: run_playbooks: + - playbooks/test-borg-backup-pre.yaml - playbooks/service-borg-backup.yaml run_test_playbook: playbooks/test-borg-backup.yaml files: