commit a594ca905a125dc3165fbee0ad6e224600f23ee8 (HEAD -> knielsen_mdev28776)
Author: Kristian Nielsen <knielsen@knielsen-hq.org>
Date:   Mon Jul 17 00:26:41 2023 +0200

    MDEV-28776: rpl.rpl_mark_optimize_tbl_ddl fails with timeout on sync_with_master
    
    Hacky test case to reproduce the problem locally.
    Inject a handful of sleeps that make the problem trigger in most runs of the test (as compared to one in >100000 iterations on buildbot machines).
    
    Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>

diff --git a/mysql-test/suite/rpl/t/rpl_parallel_mdev28776_try2.test b/mysql-test/suite/rpl/t/rpl_parallel_mdev28776_try2.test
new file mode 100644
index 00000000000..67a664a2fad
--- /dev/null
+++ b/mysql-test/suite/rpl/t/rpl_parallel_mdev28776_try2.test
@@ -0,0 +1,66 @@
+--source include/master-slave.inc
+--source include/have_innodb.inc
+--source include/have_debug.inc
+--source include/have_binlog_format_statement.inc
+# Extra connections to server number 2. NOTE(review): none of them is used below.
+--let $rpl_connection_name= slave2
+--let $rpl_server_number= 2
+--source include/rpl_connect.inc
+
+--let $rpl_connection_name= slave3
+--let $rpl_server_number= 2
+--source include/rpl_connect.inc
+
+--let $rpl_connection_name= slave4
+--let $rpl_server_number= 2
+--source include/rpl_connect.inc
+
+--connection master
+ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
+CREATE TABLE t1(a INT) ENGINE=INNODB;
+INSERT INTO t1 VALUES(1);
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+--let $save_transaction_retries= `SELECT @@global.slave_transaction_retries`
+--let $save_slave_parallel_threads= `SELECT @@global.slave_parallel_threads`
+--let $save_slave_parallel_mode= `SELECT @@global.slave_parallel_mode`
+set @@global.slave_parallel_threads= 2;
+set @@global.slave_parallel_mode= OPTIMISTIC;
+set @@global.slave_transaction_retries= 5;
+
+# T1 and T2 are binlogged while the slave is stopped, then replicated after START SLAVE below.
+--connection master
+# T1
+INSERT INTO t1  SELECT 1+a FROM t1;
+# T2
+INSERT INTO t1  SELECT 2+a FROM t1;
+
+SELECT * FROM t1 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection slave
+--let $retry1= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1)
+--source include/start_slave.inc
+--let $slave_timeout= 15
+--source include/sync_with_master_gtid.inc
+# No debug_dbug restore here: this test never changes it, and $save_dbug was never set.
+--let $retry2= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1)
+SELECT * FROM t1 ORDER BY a;
+--let $retries= `SELECT $retry2 - $retry1`
+eval SELECT $retries AS `Number of retries`;
+
+# Cleanup.
+--connection slave
+--source include/stop_slave.inc
+eval SET @@global.slave_parallel_threads= $save_slave_parallel_threads;
+eval SET @@global.slave_parallel_mode= $save_slave_parallel_mode;
+eval SET @@global.slave_transaction_retries= $save_transaction_retries;
+--source include/start_slave.inc
+
+--connection master
+DROP TABLE t1;
+
+--source include/rpl_end.inc
diff --git a/sql/rpl_parallel.cc b/sql/rpl_parallel.cc
index 1aeb1257c4a..84dcc5ba059 100644
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -740,6 +740,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
   Format_description_log_event *description_event= NULL;
 
 do_retry:
+  fprintf(stderr, "HULU1: retry_event_group(GTID %u-%u-%lu)\n", rgi->current_gtid.domain_id, rgi->current_gtid.server_id, (ulong)rgi->current_gtid.seq_no);
   event_count= 0;
   err= 0;
   errmsg= NULL;
@@ -1207,6 +1208,9 @@ handle_rpl_parallel_thread(void *arg)
         bool did_enter_cond= false;
         PSI_stage_info old_stage;
 
+        // A small sleep to make T1 be the youngest trx and be chosen as deadlock victim.
+        if (rgi->current_gtid.seq_no == 4)
+          my_sleep(10000);
 #ifdef ENABLED_DEBUG_SYNC
         DBUG_EXECUTE_IF("hold_worker_on_schedule", {
             if (rgi->current_gtid.domain_id == 0 &&
diff --git a/sql/slave.cc b/sql/slave.cc
index f43240a8866..ca249245455 100644
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -502,6 +502,9 @@ static void bg_rpl_load_gtid_slave_state(void *)
 static void bg_slave_kill(void *victim)
 {
   THD *to_kill= (THD *)victim;
+  fprintf(stderr, "HULU3: deadlock kill GTID %u-%u-%lu\n", ((to_kill && to_kill->rgi_slave) ? to_kill->rgi_slave->current_gtid.domain_id : 0), ((to_kill && to_kill->rgi_slave) ? to_kill->rgi_slave->current_gtid.server_id : 0), ((to_kill && to_kill->rgi_slave) ? (ulong)to_kill->rgi_slave->current_gtid.seq_no : (ulong)0));
+  my_sleep(5000000);
+  fprintf(stderr, "HULU3:     (delayed to now)\n");
   to_kill->awake(KILL_CONNECTION);
   mysql_mutex_lock(&to_kill->LOCK_wakeup_ready);
   to_kill->rgi_slave->killed_for_retry= rpl_group_info::RETRY_KILL_KILLED;
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index e6ed7ca1cc4..695b5ddb5e0 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -5166,6 +5166,21 @@ thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd)
   return 1;
 }
 
+extern "C" void  // C linkage; lock0lock.cc in this commit re-declares the same prototype by hand.
+hulu_print_thd_gtid(const char * pref, const THD *victim_thd, const THD *other_thd)  // Debug trace to stderr: pref, then victim_thd's GTID, then other_thd's GTID in parentheses.
+{
+  fprintf(stderr, "%s GTID %u-%u-%lu (%u-%u-%lu)\n", pref, ((victim_thd && victim_thd->rgi_slave) ? victim_thd->rgi_slave->current_gtid.domain_id : 0), ((victim_thd && victim_thd->rgi_slave) ? victim_thd->rgi_slave->current_gtid.server_id : 0), ((victim_thd && victim_thd->rgi_slave) ? (ulong)victim_thd->rgi_slave->current_gtid.seq_no : (ulong)0), ((other_thd && other_thd->rgi_slave) ? other_thd->rgi_slave->current_gtid.domain_id : 0), ((other_thd && other_thd->rgi_slave) ? other_thd->rgi_slave->current_gtid.server_id : 0), ((other_thd && other_thd->rgi_slave) ? (ulong)other_thd->rgi_slave->current_gtid.seq_no : (ulong)0));  // Each GTID prints as domain_id-server_id-seq_no; a NULL THD or one with no rgi_slave prints as 0-0-0.
+}
+
+void hulu_wait_the_T2(const THD *thd)  // Test-only delay hook: sleeps when thd->rgi_slave->current_gtid.seq_no is 5; this commit calls it right after os_event_wait() in lock0wait.cc.
+{
+  if (!thd || !thd->rgi_slave)  // No THD, or a THD without rgi_slave: nothing to inspect, so no delay.
+    return;
+  const rpl_gtid *g= &thd->rgi_slave->current_gtid;
+  if (g->seq_no == 5)  // Only seq_no is compared (domain_id and server_id are ignored); the other injected sleeps in this commit treat seq_no 4 as T1 and 5 as T2.
+    my_sleep(20000);  // NOTE(review): presumably microseconds, i.e. about 20 ms -- confirm against my_sleep().
+}
+
 /*
   This function is called from InnoDB to check if the commit order of
   two transactions has already been decided by the upper layer. This happens
diff --git a/sql/sql_insert.cc b/sql/sql_insert.cc
index 0f1b66f7610..e0faad366fe 100644
--- a/sql/sql_insert.cc
+++ b/sql/sql_insert.cc
@@ -78,6 +78,7 @@
 #include "sql_audit.h"
 #include "sql_derived.h"                        // mysql_handle_derived
 #include "sql_prepare.h"
+#include "rpl_rli.h"
 #include <my_bit.h>
 
 #include "debug_sync.h"
@@ -1753,6 +1754,9 @@ int write_record(THD *thd, TABLE *table,COPY_INFO *info)
   save_read_set=  table->read_set;
   save_write_set= table->write_set;
 
+  // Small sleep to make both T1 and T2 hold the S lock before either takes the X lock.
+  if (thd->rgi_slave && (thd->rgi_slave->current_gtid.seq_no == 4 || thd->rgi_slave->current_gtid.seq_no == 5))
+    my_sleep(20000);
   if (info->handle_duplicates == DUP_REPLACE ||
       info->handle_duplicates == DUP_UPDATE)
   {
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index 469d03eaa06..d0c427a99eb 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -6783,6 +6783,8 @@ DeadlockChecker::notify(const lock_t* lock) const
 	DBUG_PRINT("ib_lock", ("deadlock detected"));
 }
 
+extern "C" void hulu_print_thd_gtid(const char * pref, const THD *victim_thd, const THD *other_thd);
+
 /** Select the victim transaction that should be rolledback.
 @return victim transaction */
 const trx_t*
@@ -6800,6 +6802,7 @@ DeadlockChecker::select_victim() const
 			return(m_wait_lock->trx);
 		}
 #endif /* WITH_WSREP */
+		hulu_print_thd_gtid("HULU2: A: victim", m_start->mysql_thd, m_wait_lock->trx->mysql_thd);
 		return(m_start);
 	}
 
@@ -6809,6 +6812,7 @@ DeadlockChecker::select_victim() const
 	}
 #endif /* WITH_WSREP */
 
+	hulu_print_thd_gtid("HULU2: B: victim", m_wait_lock->trx->mysql_thd, m_start->mysql_thd);
 	return(m_wait_lock->trx);
 }
 
diff --git a/storage/innobase/lock/lock0wait.cc b/storage/innobase/lock/lock0wait.cc
index 5eb03f668b3..9961a3acc3b 100644
--- a/storage/innobase/lock/lock0wait.cc
+++ b/storage/innobase/lock/lock0wait.cc
@@ -336,6 +336,8 @@ lock_wait_suspend_thread(
 	}
 
 	os_event_wait(slot->event);
+	extern void hulu_wait_the_T2(const THD *thd);
+	hulu_wait_the_T2(trx->mysql_thd);
 
 	thd_wait_end(trx->mysql_thd);
 
