pglogical icon indicating copy to clipboard operation
pglogical copied to clipboard

Pglogical_create_subscriber losing records when converting streaming slave to logical

Open glk1001 opened this issue 3 years ago • 5 comments

We have a test harness that checks no records are lost when using 'pglogical_create_subscriber' to convert a streaming slave to a logical slave. This was working fine in Postgres 10 but we're just moving to Postgres 13 and this test harness is failing. Basically, the test has a separate process updating records on the master at the same time as 'pglogical_create_subscriber' is converting the streaming, and a comparison of the records at the end. Not sure where to look to solve this but I can see that when the conversion is not successful, the Postgres log on the master instance has entries with 'app' = '..._snap' including a line with 'SELECT pg_drop_replication_slot($1)'. These are not there when the test passes - if that's a clue. I can dig deeper if you can give me some suggestions or I could put more time into a simpler test.

glk1001 avatar Oct 20 '21 07:10 glk1001

Please provide a test case.

eulerto avatar Oct 20 '21 15:10 eulerto

=====test_pglogical_forever.sh
#!/bin/bash -x

set -u


# This test fails after a few retries on a Centos 7 Vagrant box.
#
# Vagrant:    2.2.7
# VirtualBox: 6.1
# Postgres:   13.4
# Pglogical:  postgresql13-pglogical.x86_64 2.4.0-1.el7
# Centos:     7.7.1908 (Core)

declare -r THIS_TEST_START_TIME=$(date +"%Y-%m-%d_%H-%M-%S")

for retry_num in $(seq 100); do
    bash test_pglogical_setup_slave.sh "${THIS_TEST_START_TIME}" "${retry_num}"
    if [[ $? != 0 ]]; then
        echo "ERROR: Master/Slave setup failure."
        exit 1
    fi    
    bash test_pglogical.sh
    if [[ $? != 0 ]]; then
        echo "ERROR: Test failed after ${retry_num} tries."
        exit 1
    fi    
done
=====
=====test_pglogical.sh
#!/bin/bash -x

set -u
set -e

declare -r POSTGRES_VERSION=13

declare -r SRCE_PORT=5340
declare -r DEST_PORT=5341
declare -r DEST_DATA_DIR="/var/lib/pgsql/${POSTGRES_VERSION}/slave"
declare -r DBNAME="test_db"
declare -r TABLE="test_table"
declare -r TOTAL_TIME_SECS=30
declare -r TRIGGER_FILE="/tmp/stop_inserting"

function create_test_database()
{
    rm -f "${TRIGGER_FILE}"

    psql -p ${SRCE_PORT} -c "DROP DATABASE IF EXISTS ${DBNAME}"
    psql -p ${SRCE_PORT} -c "CREATE DATABASE ${DBNAME}"
    psql -p ${SRCE_PORT} -d "${DBNAME}" -c "CREATE TABLE ${TABLE} (f1 int primary key, insert_time timestamp without time zone)"
    psql -p ${SRCE_PORT} -d "${DBNAME}" -c "CREATE TABLE table1 (f1 int primary key, insert_time timestamp without time zone)"
}

function add_records_to_test_table()
{
    local -r sleep_secs=0

    rm -f "${TRIGGER_FILE}"

    psql -p ${SRCE_PORT} -d "${DBNAME}" -c "DELETE FROM ${TABLE}"
    
    echo "***START LOOP TO INSERT RECORDS INTO \"${DBNAME}\"..."
    echo
    
    local -r temp_sql_file="/tmp/test_pglogical.sql"
    local -r start_time=$(date +%s)
    local f1_val_done=0
    local f1_val=1
    while true; do
        echo "" > "${temp_sql_file}"
        for i in $(seq 2); do
            f1_val_done=${f1_val}
            echo "INSERT INTO ${TABLE} VALUES($f1_val, CURRENT_TIMESTAMP);" >> "${temp_sql_file}"
            f1_val=$((f1_val + 1))
        done
        psql -p ${SRCE_PORT} -d "${DBNAME}" -f "${temp_sql_file}" &> /dev/null
        if [[ -f "${TRIGGER_FILE}" ]]; then
            echo
            echo "*** FOUND TRIGGER FILE \"${TRIGGER_FILE}\". STOPPING NOW..."
            break
        fi    
        sleep ${sleep_secs}
        time_now=$(date +%s)
        if (( $((time_now - start_time)) > TOTAL_TIME_SECS )); then
            break
        fi    
    done
    
    echo "*** STOPPED LOOP (PID ${BASHPID}) INSERTING RECORDS INTO \"${DBNAME}\". LAST RECORD WAS \"${f1_val_done}\"."
    echo
    
    return 0
}

function setup_master_node()
{
    psql -p ${SRCE_PORT} -d ${DBNAME} -c "CREATE EXTENSION IF NOT EXISTS pglogical"
    psql -p ${SRCE_PORT} -d ${DBNAME} -c "select pglogical.create_node(node_name := 'provider', dsn := 'host=localhost port=${SRCE_PORT} dbname=${DBNAME}')"
    psql -p ${SRCE_PORT} -d ${DBNAME} -c "select pglogical.create_replication_set('replication_set')"
    psql -p ${SRCE_PORT} -d ${DBNAME} -c "select pglogical.replication_set_add_table(set_name := 'replication_set', relation := '${TABLE}')"

    echo
    echo "*** Successfully setup pglogical master on port ${SRCE_PORT}."
    echo
}

function setup_subscriber_node()
{
    echo
    echo "*** Stopping slave, then running pglogical_create_subscriber..."
    echo

    /usr/pgsql-${POSTGRES_VERSION}/bin/pg_ctl stop -m fast -D ${DEST_DATA_DIR}

    /usr/pgsql-${POSTGRES_VERSION}/bin/pglogical_create_subscriber -D ${DEST_DATA_DIR} \
    --subscriber-name="subscription" \
    --subscriber-dsn="host=localhost port=${DEST_PORT} dbname=${DBNAME}" \
    --provider-dsn="host=localhost port=${SRCE_PORT} dbname=${DBNAME}" \
    --replication-sets="ddl_sql,replication_set" \
    -v --drop-slot-if-exists

    echo
    echo "*** Successfully started pglogical slave on port ${DEST_PORT}."
    echo
}

create_test_database
add_records_to_test_table &

sleep 2

setup_master_node
setup_subscriber_node

touch "${TRIGGER_FILE}"

sleep 5

NUM_SRCE_RECS=$(psql -qt -p ${SRCE_PORT} -d "${DBNAME}" -c "SELECT COUNT(f1) FROM ${TABLE}") 
NUM_DEST_RECS=$(psql -qt -p ${DEST_PORT} -d "${DBNAME}" -c "SELECT COUNT(f1) FROM ${TABLE}") 

echo "NUM_SRCE_RECS = ${NUM_SRCE_RECS}"
echo "NUM_DEST_RECS = ${NUM_DEST_RECS}"

if (( NUM_SRCE_RECS != NUM_DEST_RECS )); then
    echo "ERROR: srce/dest records mismatch"
    exit 1
fi
======
=====test_pglogical_setup_slave.sh
#!/bin/bash -x

set -u

declare -r THIS_TEST_START_TIME=${1}
declare -r TEST_NUM=${2}

declare -r POSTGRES_VERSION=13

declare -r MASTER_PORT=5340
declare -r SLAVE_PORT=5341
declare -r MASTER_DIR="/var/lib/pgsql/${POSTGRES_VERSION}/master"
declare -r SLAVE_DIR="/var/lib/pgsql/${POSTGRES_VERSION}/slave"

/usr/pgsql-${POSTGRES_VERSION}/bin/pg_ctl stop -m immediate -D ${SLAVE_DIR}
if pgrep -af "postgres .*slave" ; then
    echo "ERROR: Could not stop slave."
    exit 1
fi    

rm -rf ${SLAVE_DIR}
if [[ -d ${SLAVE_DIR} ]]; then
    echo "ERROR: Could not delete slave directory \"${SLAVE_DIR}\"."
    exit 1
fi    

/usr/pgsql-${POSTGRES_VERSION}/bin/pg_ctl stop -m immediate -D ${MASTER_DIR}
if pgrep -af "postgres .*master" ; then
    echo "ERROR: Could not stop master."
    exit 1
fi    

rm -rf ${MASTER_DIR}
if [[ -d ${MASTER_DIR} ]]; then
    echo "ERROR: Could not delete master directory \"${MASTER_DIR}\"."
    exit 1
fi    

/usr/pgsql-${POSTGRES_VERSION}/bin/initdb -D ${MASTER_DIR}  -E UTF8 --lc-collate=C --lc-ctype=C
if [[ $? != 0 ]]; then 
    echo "ERROR: Could not create master."
    exit 1
fi    

echo "port = ${MASTER_PORT}" >> ${MASTER_DIR}/postgresql.auto.conf
echo "shared_preload_libraries = 'pglogical'" >> ${MASTER_DIR}/postgresql.auto.conf
echo "wal_level = logical" >> ${MASTER_DIR}/postgresql.auto.conf
echo "max_worker_processes = 10" >> ${MASTER_DIR}/postgresql.auto.conf
echo "max_replication_slots = 10" >> ${MASTER_DIR}/postgresql.auto.conf
echo "max_wal_senders = 10" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_directory = '${PWD}/log'" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_filename = 'postgresql-${THIS_TEST_START_TIME}-test-${TEST_NUM}.log'" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_checkpoints = on" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_connections = on" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_disconnections = on" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_replication_commands = on" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_duration = off" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_min_duration_statement = 0" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_statement = all" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_replication_commands = on" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_line_prefix = '%m [%p] port=${MASTER_PORT} %q%u@%d/%a '" >> ${MASTER_DIR}/postgresql.auto.conf

echo "host   replication      postgres     127.0.0.1/32            trust" >> ${MASTER_DIR}/pg_hba.conf
echo "host       test_db      postgres     127.0.0.1/32            trust" >> ${MASTER_DIR}/pg_hba.conf
echo "host       test_db      postgres     127.0.0.1/32            trust" >> ${MASTER_DIR}/pg_hba.conf

/usr/pgsql-${POSTGRES_VERSION}/bin/pg_ctl start -D ${MASTER_DIR}
if [[ $? != 0 ]]; then 
    echo "ERROR: Could not start master."
    exit 1
fi    

pg_basebackup -h localhost -U postgres -p ${MASTER_PORT} -D ${SLAVE_DIR} -P -Xs -R
if [[ $? != 0 ]]; then 
    echo "ERROR: Could not create slave."
    exit 1
fi    

echo "port = ${SLAVE_PORT}" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "archive_mode = off" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "hot_standby = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "shared_preload_libraries = 'pglogical'" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_directory = '${PWD}/log'" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_filename = 'postgresql-${THIS_TEST_START_TIME}-test-${TEST_NUM}.log'" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_checkpoints = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_connections = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_disconnections = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_replication_commands = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_duration = off" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_min_duration_statement = 0" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_statement = all" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_replication_commands = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_line_prefix = '%m [%p] port=${SLAVE_PORT} %q%u@%d/%a '" >> ${SLAVE_DIR}/postgresql.auto.conf

/usr/pgsql-${POSTGRES_VERSION}/bin/pg_ctl start -D ${SLAVE_DIR}
if [[ $? != 0 ]]; then 
    echo "ERROR: Could not start slave."
    exit 1
fi    
=====

glk1001 avatar Oct 27 '21 06:10 glk1001

I did get a failure on a physical machine after about 3000 retries.

glk1001 avatar Oct 27 '21 12:10 glk1001

Hi, I've looked into this a bit more and it's failing because of this code sequence:

In

void pglogical_apply_main(Datum main_arg)

there is a call to

pglogical_sync_subscription(MySubscription);

Further in this gets called:

PGLogicalSyncStatus * get_subscription_sync_status(Oid subid, bool missing_ok)
...

	rv = makeRangeVar(EXTENSION_NAME, CATALOG_LOCAL_SYNC_STATUS, -1);
	rel = table_openrv(rv, RowExclusiveLock);
	tupDesc = RelationGetDescr(rel);

The problem is here - sometimes the return for sync status is 'i' instead of 'r'. This happens even though the 'pglogical_create_subscriber' program sets sync_status = 'r' just after setting up a subscription. I can make the test script pass by forcing sync status = 'r' to be returned from 'get_subscription_sync_status'. Not sure why 'table_openrv' is not always getting 'r'.

glk1001 avatar Nov 09 '21 06:11 glk1001

@glk1001 we also met the same issue, could you share how you fix it? Or workaround?

JinlongWukong avatar Dec 09 '23 14:12 JinlongWukong