From f87fed345c6c151197e32937e3c60100b14e9501 Mon Sep 17 00:00:00 2001 From: DmitriiAn Date: Mon, 22 Jun 2026 13:43:34 +0200 Subject: [PATCH] add emergency-stop.sh to halt a running migration --- pgcopydb-helpers/AGENTS.md | 17 +++ pgcopydb-helpers/README.md | 10 ++ pgcopydb-helpers/emergency-stop.sh | 182 +++++++++++++++++++++++++++++ 3 files changed, 209 insertions(+) create mode 100755 pgcopydb-helpers/emergency-stop.sh diff --git a/pgcopydb-helpers/AGENTS.md b/pgcopydb-helpers/AGENTS.md index 14d5f88..1d4c6d3 100644 --- a/pgcopydb-helpers/AGENTS.md +++ b/pgcopydb-helpers/AGENTS.md @@ -207,6 +207,23 @@ Wrapper that runs `run-migration.sh` inside a detached `screen` session named "m - `Ctrl-A D` — detach from screen (migration keeps running) - `~/check-migration-status.sh` — check progress without attaching +#### `emergency-stop.sh` + +Immediately terminates a running migration and all of its subprocesses (clone/COPY/index/follow workers). Finds the supervisor PID from `$MIGRATION_DIR/pgcopydb.pid` (with a `pgrep` fallback), prints what will be stopped plus the consequences, and — after a single `[y/N]` confirmation — sends `SIGTERM` to the whole process group, then escalates automatically to repeated `SIGKILL` (process group plus each surviving PID). If anything is still alive after that, it warns loudly with the leftover PIDs and exits non-zero rather than failing silently. Also quits the detached `migration` screen session. Reads the SQLite catalog directly to report copy progress and recommend the right resume path. + +```bash +~/emergency-stop.sh +MIGRATION_DIR=~/migration_YYYYMMDD-HHMMSS ~/emergency-stop.sh +``` + +**When to use:** An emergency — e.g. the source database is overloaded during the initial copy, or the migration must be halted at once. + +**Stop-only and resumable:** it does NOT drop the replication slot, snapshot, or target data. The migration dir, SQLite catalog, and source slot are preserved, so afterwards you can resume with `resume-migration.sh` (or `resume-cdc.sh` if the initial COPY had finished). To instead abandon and start over, run `drop-replication-slots.sh` → `target-clean.sh` → `start-migration-screen.sh`. Leaving the slot in place keeps WAL accumulating on the source until you resume or drop it. + +**Requires:** a running migration. **No `~/.env` needed** — it never reads DB credentials (and never prints process command lines, which would leak PGURI passwords). + +**Exit code:** `0` when processes were stopped or nothing was running; `1` if pgcopydb processes survived (it lists the leftover PIDs). + --- ### Monitoring diff --git a/pgcopydb-helpers/README.md b/pgcopydb-helpers/README.md index dff4286..5432831 100644 --- a/pgcopydb-helpers/README.md +++ b/pgcopydb-helpers/README.md @@ -229,6 +229,15 @@ This drops the replication slot on the source, the replication origin on the tar ## Recovery +To halt a running migration immediately — for example when the source database is overloaded during the initial copy — use the emergency stop: + +```bash +~/emergency-stop.sh # uses most recent migration dir +MIGRATION_DIR=~/migration_YYYYMMDD-HHMMSS ~/emergency-stop.sh # or specify explicitly +``` + +This terminates `pgcopydb` and all of its workers at once (SIGTERM, escalating to SIGKILL on its own if needed) after prompting for confirmation and printing the consequences. It is **stop-only**: it does not drop the replication slot or touch the target, so the migration stays resumable with `resume-migration.sh` / `resume-cdc.sh` below. + If pgcopydb crashes, the instance reboots, or the migration is interrupted: ```bash @@ -414,6 +423,7 @@ sqlite3 ~/migration_*/schema/filter.db "SELECT COUNT(*) FROM s_depend;" | `check-migration-status.sh` | Monitor | Migration progress dashboard | | `check-cdc-status.sh` | Monitor | CDC replication progress and health | | `slack-migration-alerts.sh` | Monitor | Slack alerts | +| `emergency-stop.sh` | Recovery | Immediately stop a running migration and all subprocesses | | `resume-migration.sh` | Recovery | Resume an interrupted migration (full clone + CDC) | | `resume-cdc.sh` | Recovery | Resume only the CDC phase (skips clone) | | `target-clean.sh` | Recovery | Wipe target database for re-migration (prompts for confirmation) | diff --git a/pgcopydb-helpers/emergency-stop.sh b/pgcopydb-helpers/emergency-stop.sh new file mode 100755 index 0000000..f987c66 --- /dev/null +++ b/pgcopydb-helpers/emergency-stop.sh @@ -0,0 +1,182 @@ +#!/bin/bash +# +# Usage: ~/emergency-stop.sh +# Example: MIGRATION_DIR=~/migration_YYYYMMDD-HHMMSS ~/emergency-stop.sh +# +# EMERGENCY STOP: immediately terminates a running pgcopydb migration and +# ALL of its subprocesses (clone/COPY/index/follow workers). Use when the source +# database is overloaded or the migration must be halted at once. Prompts for +# confirmation and spells out the consequences first, then sends SIGTERM to the +# whole process group and escalates to SIGKILL on its own if anything survives. +# +# Stop-only by design: it does NOT drop the replication slot, snapshot, or +# target data, so the migration stays resumable via resume-migration.sh / +# resume-cdc.sh. +# +set -eo pipefail + +# List live pgcopydb PIDs by exact process name. Every pgcopydb process — +# supervisor and workers — is named "pgcopydb". +# If none are found, returns an empty string (not an error). +pgcopydb_pids() { pgrep -x pgcopydb 2>/dev/null || true; } + +# Wait up to $1 seconds for every pgcopydb process to disappear (lets the OS reap +# children after a kill). Returns 0 once none remain, non-zero on timeout. +wait_until_gone() { + local secs="$1" + while [ "$secs" -gt 0 ]; do + [ -z "$(pgcopydb_pids)" ] && return 0 + sleep 1 + secs=$((secs - 1)) + done + [ -z "$(pgcopydb_pids)" ] +} + +# Find the most recent migration directory, or set MIGRATION_DIR explicitly +MIGRATION_DIR="${MIGRATION_DIR:-$(ls -dt ~/migration_*/ 2>/dev/null | head -1 || true)}" + +# --- Locate the running pgcopydb supervisor --- +# pgcopydb writes its supervisor PID to /pgcopydb.pid at startup. Fall back +# to pgrep so the script still works if the pidfile is missing or stale. +MAIN_PID="" +PIDFILE="$MIGRATION_DIR/pgcopydb.pid" +if [ -n "$MIGRATION_DIR" ] && [ -f "$PIDFILE" ]; then + MAIN_PID=$(head -1 "$PIDFILE" 2>/dev/null | tr -d '[:space:]') + # Drop a stale pid that no longer points at a live process + if [ -n "$MAIN_PID" ] && ! kill -0 "$MAIN_PID" 2>/dev/null; then + MAIN_PID="" + fi +fi + +PGCOPYDB_PIDS=$(pgcopydb_pids) + +if [ -z "$MAIN_PID" ] && [ -z "$PGCOPYDB_PIDS" ]; then + echo "No running pgcopydb migration found. Nothing to stop." + exit 0 +fi + +PROC_COUNT=$(printf '%s\n' "$PGCOPYDB_PIDS" | grep -c '[0-9]' || true) +PGID="" +if [ -n "$MAIN_PID" ]; then + PGID=$(ps -o pgid= -p "$MAIN_PID" 2>/dev/null | tr -d ' ' || true) +fi + +# Safety net: only signal pgcopydb's process group when it is genuinely its own +# group (pgcopydb calls setpgrp at startup, so it always is in production). +OWN_PGID=$(ps -o pgid= -p $$ 2>/dev/null | tr -d ' ' || true) +USE_PGID=0 +if [ -n "$PGID" ] && [ "$PGID" != "$OWN_PGID" ]; then + USE_PGID=1 +fi + +# --- Report what will be stopped (query pgcopydb's catalog directly) --- +COPY_DONE=0 +TABLES_DONE="" +TABLES_TOTAL="" +CDC_STARTED="no" +SOURCE_DB="$MIGRATION_DIR/schema/source.db" +if [ -n "$MIGRATION_DIR" ] && [ -f "$SOURCE_DB" ]; then + TABLES_TOTAL=$(sqlite3 "$SOURCE_DB" "SELECT COUNT(*) FROM s_table;" 2>/dev/null || true) + TABLES_DONE=$(sqlite3 "$SOURCE_DB" \ + "SELECT COUNT(*) FROM summary WHERE tableoid IS NOT NULL AND done_time_epoch IS NOT NULL;" 2>/dev/null || true) + WRITE_LSN=$(sqlite3 "$SOURCE_DB" "SELECT write_lsn FROM sentinel LIMIT 1;" 2>/dev/null || true) + if [ -n "$WRITE_LSN" ]; then CDC_STARTED="yes"; fi + if [ -n "$TABLES_TOTAL" ] && [ "$TABLES_TOTAL" -gt 0 ] 2>/dev/null && [ "$TABLES_DONE" = "$TABLES_TOTAL" ]; then + COPY_DONE=1 + fi +fi + +echo "==========================================" +echo "EMERGENCY STOP: pgcopydb migration" +echo "==========================================" +echo "Migration dir: ${MIGRATION_DIR:-(unknown)}" +echo "Supervisor PID: ${MAIN_PID:-(no pidfile — using pgrep)}" +echo "Process group: ${PGID:-(n/a)}" +echo "pgcopydb procs: ${PROC_COUNT:-0}" +if [ -n "$TABLES_TOTAL" ]; then + echo "Initial COPY: ${TABLES_DONE:-0}/${TABLES_TOTAL} tables done" +fi +echo "CDC started: $CDC_STARTED" +echo "" +echo "Consequences of stopping NOW:" +echo " - pgcopydb and ALL its workers (COPY/index/follow) are terminated at once." +echo " - If the initial copy is still running, target tables may be left" +echo " PARTIALLY COPIED and are not consistent until the migration resumes." +echo " - The source replication slot stays ACTIVE (kept on purpose so you can" +echo " resume) — it holds WAL on the source until you resume or drop it." +echo " - This is RECOVERABLE: the migration dir, SQLite catalog, and slot are" +echo " preserved, so the migration can be resumed." +echo "" +read -r -p "Stop the migration NOW? [y/N] " CONFIRM || CONFIRM="" +if [[ ! "$CONFIRM" =~ ^[Yy]$ ]]; then + echo "Aborted." + exit 0 +fi + +# --- Stop: SIGTERM the whole process group, then escalate automatically --- +echo "" +echo "Sending SIGTERM..." +if [ "$USE_PGID" -eq 1 ]; then + kill -TERM -- "-$PGID" 2>/dev/null || true +else + pkill -TERM -x pgcopydb 2>/dev/null || true +fi + +# Give pgcopydb up to ~10s to wind down cleanly. If it doesn't, escalate to +# SIGKILL automatically and keep trying — by process group AND +# by each surviving PID — for several rounds. +if ! wait_until_gone 10; then + echo "Still running after SIGTERM — escalating to SIGKILL..." + tries=5 + while [ "$tries" -gt 0 ] && [ -n "$(pgcopydb_pids)" ]; do + [ "$USE_PGID" -eq 1 ] && kill -KILL -- "-$PGID" 2>/dev/null || true + pkill -KILL -x pgcopydb 2>/dev/null || true + for p in $(pgcopydb_pids); do kill -KILL "$p" 2>/dev/null || true; done + wait_until_gone 2 && break + tries=$((tries - 1)) + done +fi + +# Tear down a lingering detached screen session from start-migration-screen.sh +screen -S migration -X quit >/dev/null 2>&1 || true + +# --- Verify: only after SIGTERM + repeated SIGKILL do we ask the user to act --- +REMAINING="$(pgcopydb_pids | tr '\n' ' ')" +if [ -n "${REMAINING// /}" ]; then + echo "" + echo "############################################################" + echo "WARNING: pgcopydb is STILL RUNNING after SIGTERM and repeated SIGKILL." + echo "Surviving PIDs: $REMAINING" + echo "Inspect and kill them manually, then re-run this script:" + echo " ps -o pid,stat,comm -p $REMAINING" + echo " sudo kill -9 $REMAINING" + echo " ~/emergency-stop.sh" + echo "############################################################" + exit 1 +fi + +echo "" +echo "All pgcopydb processes stopped." +echo "" +echo "==========================================" +echo "Next steps" +echo "==========================================" +echo "To RESUME where it stopped (slot + catalog are intact):" +if [ "$COPY_DONE" -eq 1 ]; then + echo " Initial COPY had finished — resume CDC only:" + echo " ~/resume-cdc.sh" + echo " (or, to re-run the full clone + CDC: ~/resume-migration.sh)" +else + echo " Initial COPY was not finished — resume the full clone + CDC:" + echo " ~/resume-migration.sh" + echo " (or, if the copy was actually complete: ~/resume-cdc.sh)" +fi +echo "" +echo "To ABANDON and start fresh:" +echo " ~/drop-replication-slots.sh # remove slot/origin" +echo " ~/target-clean.sh # wipe the target" +echo " ~/start-migration-screen.sh # start over" +echo "" +echo "If you will NOT resume soon and need to relieve the source, run" +echo "~/drop-replication-slots.sh to free the WAL the slot is holding." +echo "WARNING: dropping the slot makes resume impossible (full re-clone required)."