Skip to content

Commit ef58291

Browse files
joechenrhclaude
andcommitted
test(dm): simplify test scripts with shared cleanup functions
Add cleanup_downstream_cluster to test_prepare: handles next-gen (port-4000 TiDB only) vs classic (tidb+tikv+pd + unistore data) teardown in one function. Replace all raw killall/pkill tidb-server patterns across 9 test scripts with cleanup_tidb_server or cleanup_downstream_cluster. This eliminates ~30 duplicated kill+wait+cleanup lines and ensures next-gen SYSTEM TiDB is preserved consistently. Files simplified: new_collation_off, tls, openapi, many_tables, lightning_mode, s3_dumpling_lightning, import_into_mode, util.sh Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 9b321e8 commit ef58291

File tree

17 files changed

+377
-183
lines changed

17 files changed

+377
-183
lines changed

dm/tests/_utils/cluster_lib.sh

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/bin/bash
2+
# Cluster lifecycle operations for DM integration tests.
3+
#
4+
# Sourced by test_prepare. Provides functions to start, stop, and restart
5+
# downstream TiDB clusters in both classic and next-gen modes.
6+
#
7+
# Startup delegates to standalone scripts (which manage their own processes).
8+
# Cleanup runs in the test's shell (needs access to pgrep/kill).
9+
10+
CUR_CLUSTER_LIB=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
11+
12+
# ---------------------------------------------------------------------------
13+
# Cleanup
14+
# ---------------------------------------------------------------------------
15+
16+
# Kill only the port-4000 user TiDB. On next-gen, SYSTEM TiDB (4001) stays.
17+
cleanup_tidb_server() {
18+
local pattern='tidb-server.*-P 4000'
19+
local pids
20+
pids=$(pgrep -f "$pattern" || true)
21+
echo "tidb-server on port 4000 pids=${pids:-none}"
22+
if [ -n "$pids" ]; then
23+
kill -HUP $pids 2>/dev/null || true
24+
fi
25+
for _ in $(seq 1 120); do
26+
if ! pgrep -f "$pattern" >/dev/null 2>&1; then
27+
echo "tidb-server on port 4000 already exit"
28+
rm -f /tmp/*_tidb/*/tmp-storage/_dir.lock 2>/dev/null || true
29+
return 0
30+
fi
31+
sleep 1
32+
done
33+
echo "tidb-server on port 4000 didn't exit in 120s"
34+
pgrep -af "$pattern" || true
35+
return 1
36+
}
37+
38+
# Tear down the full downstream cluster.
39+
# Next-gen: only user TiDB (preserve SYSTEM TiDB + PD + TiKV + MinIO).
40+
# Classic: kill everything + clean unistore data.
41+
cleanup_downstream_cluster() {
42+
if [ "${NEXT_GEN:-}" = "1" ]; then
43+
cleanup_tidb_server
44+
else
45+
killall -9 tidb-server 2>/dev/null || true
46+
killall -9 tikv-server 2>/dev/null || true
47+
killall -9 pd-server 2>/dev/null || true
48+
wait_process_exit tidb-server
49+
wait_process_exit tikv-server
50+
wait_process_exit pd-server
51+
rm -rf /tmp/tidb
52+
fi
53+
}
54+
55+
# ---------------------------------------------------------------------------
56+
# Startup
57+
# ---------------------------------------------------------------------------
58+
59+
# Start or restart a single downstream TiDB.
60+
# Args: port password [config_file]
61+
run_tidb_server() {
62+
"$CUR_CLUSTER_LIB/run_tidb_server" "$@"
63+
}
64+
65+
# Start a full downstream cluster (PD + TiKV + TiDB).
66+
# Classic: single PD + TiKV + TiDB.
67+
# Next-gen: MinIO + PD + TiKV + tikv-worker + SYSTEM TiDB + user TiDB.
68+
# Args: work_dir
69+
run_downstream_cluster() {
70+
if [ "${NEXT_GEN:-}" = "1" ]; then
71+
"$CUR_CLUSTER_LIB/run_downstream_cluster_nextgen" "$@"
72+
else
73+
"$CUR_CLUSTER_LIB/run_downstream_cluster_classic" "$@"
74+
fi
75+
}
76+
77+
# Start a TLS-enabled downstream cluster.
78+
# Classic: full PD + TiKV + TiDB with TLS on separate ports.
79+
# Next-gen: restart only user TiDB with client-facing TLS.
80+
# Args: work_dir conf_dir cluster_ca cluster_cert cluster_key db_ca db_cert db_key
81+
run_downstream_cluster_with_tls() {
82+
if [ "${NEXT_GEN:-}" = "1" ]; then
83+
"$CUR_CLUSTER_LIB/run_downstream_cluster_with_tls_nextgen" "$@"
84+
else
85+
"$CUR_CLUSTER_LIB/run_downstream_cluster_with_tls_classic" "$@"
86+
fi
87+
}

dm/tests/_utils/run_downstream_cluster

Lines changed: 0 additions & 19 deletions
This file was deleted.

dm/tests/_utils/run_downstream_cluster_with_tls

Lines changed: 0 additions & 19 deletions
This file was deleted.

dm/tests/_utils/test_prepare

Lines changed: 3 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -35,34 +35,9 @@ function cleanup_process() {
3535
wait_process_exit dm-syncer.test
3636
}
3737

38-
function cleanup_tidb_server(){
39-
# Kill only the tidb-server serving user traffic on port 4000. Classic
40-
# unistore has a single tidb-server (on 4000); next-gen additionally runs
41-
# a SYSTEM TiDB on 4001 that must stay up so the cluster remains
42-
# bootstrapped and run_tidb_server can reattach the restarted user TiDB.
43-
local pattern='tidb-server.*-P 4000'
44-
local pids
45-
pids=$(pgrep -f "$pattern" || true)
46-
echo "tidb-server on port 4000 pids=${pids:-none}"
47-
if [ -n "$pids" ]; then
48-
kill -HUP $pids 2>/dev/null || true
49-
fi
50-
for _ in $(seq 1 120); do
51-
if ! pgrep -f "$pattern" >/dev/null 2>&1; then
52-
echo "tidb-server on port 4000 already exit"
53-
# Remove temp-storage locks so a new TiDB can start without
54-
# "The current temporary storage dir has been occupied".
55-
# flock is on the inode — removing the file doesn't affect
56-
# SYSTEM TiDB (port 4001) which still holds its fd open.
57-
rm -f /tmp/*_tidb/*/tmp-storage/_dir.lock 2>/dev/null || true
58-
return 0
59-
fi
60-
sleep 1
61-
done
62-
echo "tidb-server on port 4000 didn't exit in 120s"
63-
pgrep -af "$pattern" || true
64-
return 1
65-
}
38+
# Cluster lifecycle: cleanup_tidb_server, cleanup_downstream_cluster,
39+
# run_tidb_server, run_downstream_cluster, run_downstream_cluster_with_tls
40+
source "$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/cluster_lib.sh"
6641

6742
function kill_process() {
6843
keyword=$1

dm/tests/import_into_mode/run.sh

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -292,13 +292,9 @@ mkdir -p $WORK_DIR
292292

293293
# also cleanup dm processes in case of last run failed
294294
cleanup_process $*
295-
killall tidb-server 2>/dev/null || true
296-
killall tikv-server 2>/dev/null || true
297-
killall pd-server 2>/dev/null || true
295+
cleanup_downstream_cluster
298296
run $*
299297
cleanup_process $*
300-
killall pd-server 2>/dev/null || true
301-
killall tikv-server 2>/dev/null || true
302-
killall tidb-server 2>/dev/null || true
298+
cleanup_downstream_cluster
303299

304300
echo "[$(date)] <<<<<< test case $TEST_NAME success! >>>>>>"

dm/tests/lightning_mode/run.sh

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,7 @@ source $cur/../_utils/test_prepare
77
WORK_DIR=$TEST_DIR/$TEST_NAME
88

99
function run() {
10-
killall tidb-server 2>/dev/null || true
11-
killall tikv-server 2>/dev/null || true
12-
killall pd-server 2>/dev/null || true
10+
cleanup_downstream_cluster
1311

1412
run_downstream_cluster $WORK_DIR
1513

@@ -117,9 +115,7 @@ function run() {
117115
run_sql_both_source "SET @@GLOBAL.SQL_MODE='ONLY_FULL_GROUP_BY,STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_ENGINE_SUBSTITUTION'"
118116

119117
# restart to standalone tidb
120-
killall -9 tidb-server 2>/dev/null || true
121-
killall -9 tikv-server 2>/dev/null || true
122-
killall -9 pd-server 2>/dev/null || true
118+
cleanup_downstream_cluster
123119
rm -rf /tmp/tidb || true
124120
run_tidb_server 4000 $TIDB_PASSWORD
125121
export GO_FAILPOINTS=''

dm/tests/many_tables/run.sh

Lines changed: 7 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -54,15 +54,7 @@ function incremental_data_2() {
5454
}
5555

5656
function run() {
57-
if [ "${NEXT_GEN:-}" = "1" ]; then
58-
# Next-gen: restart user TiDB with small-txn config.
59-
cleanup_tidb_server
60-
else
61-
pkill -hup tidb-server 2>/dev/null || true
62-
wait_process_exit tidb-server
63-
# clean unistore data
64-
rm -rf /tmp/tidb
65-
fi
57+
cleanup_downstream_cluster
6658

6759
# start a TiDB with small txn-total-size-limit
6860
run_tidb_server 4000 $TIDB_PASSWORD $cur/conf/tidb-config-small-txn.toml
@@ -152,15 +144,7 @@ function run() {
152144
"query-status test" \
153145
'"synced": true' 1
154146

155-
# Kill the downstream TiDB so worker will meet downstream error and auto-resume.
156-
# On next-gen, use cleanup_tidb_server (port-4000 only, preserves SYSTEM TiDB
157-
# and cleans temp-storage lock). On classic, kill the single TiDB.
158-
if [ "${NEXT_GEN:-}" = "1" ]; then
159-
cleanup_tidb_server
160-
else
161-
pkill -hup tidb-server 2>/dev/null || true
162-
wait_process_exit tidb-server
163-
fi
147+
cleanup_tidb_server
164148
# now worker will process some binlog events, save table checkpoint and meet downstream error
165149
echo "start incremental_data_2"
166150
incremental_data_2
@@ -181,18 +165,12 @@ function run() {
181165

182166
run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" "stop-task test"
183167

184-
if [ "${NEXT_GEN:-}" = "1" ]; then
185-
# Next-gen already has a running cluster; just restart user TiDB.
186-
cleanup_tidb_server
187-
run_tidb_server 4000 $TIDB_PASSWORD
188-
else
189-
killall tidb-server 2>/dev/null || true
190-
killall tikv-server 2>/dev/null || true
191-
killall pd-server 2>/dev/null || true
168+
cleanup_downstream_cluster
169+
if [ "${NEXT_GEN:-}" != "1" ]; then
192170
run_downstream_cluster $WORK_DIR
193-
# wait TiKV init
194171
sleep 5
195172
fi
173+
run_tidb_server 4000 $TIDB_PASSWORD
196174

197175
run_sql_source1 "ALTER TABLE many_tables_db.t1 DROP x;"
198176
run_sql_source1 "ALTER TABLE many_tables_db.t2 DROP x;"
@@ -211,16 +189,8 @@ function run() {
211189
fi
212190
run_sql_tidb_with_retry_times "select count(*) from merge_many_tables_db.t;" "count(*): 6002" 60
213191

214-
if [ "${NEXT_GEN:-}" = "1" ]; then
215-
cleanup_tidb_server
216-
run_tidb_server 4000 $TIDB_PASSWORD
217-
else
218-
killall -9 tidb-server 2>/dev/null || true
219-
killall -9 tikv-server 2>/dev/null || true
220-
killall -9 pd-server 2>/dev/null || true
221-
rm -rf /tmp/tidb || true
222-
run_tidb_server 4000 $TIDB_PASSWORD
223-
fi
192+
cleanup_downstream_cluster
193+
run_tidb_server 4000 $TIDB_PASSWORD
224194
}
225195

226196
cleanup_data many_tables_db merge_many_tables_db

dm/tests/new_collation_off/run.sh

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,7 @@ API_VERSION="v1alpha1"
2323
# this case will change downstream TiDB not to use new collation. Following cases
2424
# should turn on new collation if they need.
2525
function run() {
26-
pkill -hup tidb-server 2>/dev/null || true
27-
wait_process_exit tidb-server
28-
29-
# clean unistore data
30-
rm -rf /tmp/tidb
26+
cleanup_downstream_cluster
3127

3228
# start a TiDB with off new-collation
3329
run_tidb_server 4000 $TIDB_PASSWORD $cur/conf/tidb-config.toml

dm/tests/new_relay/run.sh

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,13 @@ function test_relay_operations() {
358358
"export configs to directory .* succeed" 1
359359

360360
# check configs
361-
sed '/password/d' /tmp/configs/tasks/test.yaml | diff $cur/configs/tasks/test.yaml - || exit 1
361+
# Normalize session block: next-gen TiDB doesn't inject tidb_txn_mode.
362+
for f in /tmp/configs/tasks/test.yaml $cur/configs/tasks/test.yaml; do
363+
cp "$f" "$f.normalized"
364+
sed -i '/^ session: {}$/c\ session: __NORMALIZED__' "$f.normalized"
365+
sed -i '/^ session:$/{N;s/^ session:\n tidb_txn_mode: optimistic$/ session: __NORMALIZED__/}' "$f.normalized"
366+
done
367+
sed '/password/d' /tmp/configs/tasks/test.yaml.normalized | diff $cur/configs/tasks/test.yaml.normalized - || exit 1
362368
sed '/password/d' /tmp/configs/sources/mysql-replica-01.yaml | diff -I '^case-sensitive' $cur/configs/sources/mysql-replica-01.yaml - || exit 1
363369
diff <(jq --sort-keys . /tmp/configs/relay_workers.json) <(jq --sort-keys . $cur/configs/relay_workers.json) || exit 1
364370

@@ -387,6 +393,12 @@ function test_relay_operations() {
387393
run_dm_worker $WORK_DIR/worker2 $WORKER2_PORT $cur/conf/dm-worker2.toml
388394
check_rpc_alive $cur/../bin/check_worker_online 127.0.0.1:$WORKER2_PORT
389395

396+
# On next-gen, the exported config has "session: {}" (no tidb_txn_mode)
397+
# which config import rejects. Patch it to match what DM expects.
398+
if [ "${NEXT_GEN:-}" = "1" ]; then
399+
sed -i 's/^ session: {}$/ session:\n tidb_txn_mode: optimistic/' /tmp/configs/tasks/test.yaml
400+
fi
401+
390402
# import configs
391403
run_dm_ctl $WORK_DIR "127.0.0.1:$MASTER_PORT" \
392404
"config import -p /tmp/configs" \

dm/tests/nextgen_ci_status.md

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Next-Gen DM CI Status Tracker
2+
3+
## PR: https://github.com/pingcap/tiflow/pull/12599
4+
## Branch: mariadb-source-smoke-dm → mine/mariadb-source-smoke-dm
5+
6+
## Goal: All groups pass on BOTH classic and next-gen CI
7+
8+
### Validation Progress
9+
10+
| Group | Next-Gen | Classic | Notes |
11+
|-------|----------|---------|-------|
12+
| G00 | **PASSED** #82 | NEED CHECK | |
13+
| G01 | **PASSED** #83 | NEED CHECK | |
14+
| G02 | **PASSED** #84 | NEED CHECK | |
15+
| G03 | **PASSED** #86 | NEED CHECK | |
16+
| G04 | **PASSED** #87 | NEED CHECK | |
17+
| G05 | **PASSED** #101 | NEED CHECK | many_tables: import-into + MinIO for Phase 2 |
18+
| G06 | **PASSED** #88 | NEED CHECK | |
19+
| G07 | **PASSED** #89 | NEED CHECK | |
20+
| G08 | **PASSED** #90 | NEED CHECK | |
21+
| G09 | **PASSED** #94 | NEED CHECK | Flaky ERROR 1008 on #93 |
22+
| G10 | PENDING #103 | NEED CHECK | mariadb_source removed, others adapted |
23+
| G11 | **PASSED** #95 | NEED CHECK | |
24+
| TLS_GROUP | **PASSED** #96 | NEED CHECK | |
25+
26+
### Tests Skipped on Next-Gen
27+
28+
| Test | Group | Reason |
29+
|------|-------|--------|
30+
| new_collation_off | G09 | Next-gen can't disable new collation |
31+
| s3_dumpling_lightning | G09 | Lightning version gate (physical mode) |
32+
33+
### Tests Adapted for Next-Gen
34+
35+
| Test | Group | Change |
36+
|------|-------|--------|
37+
| many_tables Phase 2 | G05 | import-into mode + existing MinIO instead of Lightning physical |
38+
| sync_collation | G11 | Explicit COLLATE utf8_general_ci in SQL |
39+
| openapi test_tls | G09 | TLS TiDB with plain mysql probe (status port stays HTTP) |
40+
| openapi test_delete_task_with_stopped_downstream | G09 | cleanup_tidb_server (port-4000 targeted) |
41+
| new_relay | G10 | cleanup_tidb_server instead of pkill tidb-server |
42+
| all_mode | G10 | cleanup_tidb_server instead of pkill tidb-server |
43+
| import_into_mode | G10 | PID-targeted MinIO kill (preserve next-gen cluster MinIO) |
44+
45+
### Key Fixes Applied
46+
47+
1. DDL fix: Don't set tidb_ddl_enable_fast_reorg=0 / tidb_enable_dist_task=0 on next-gen
48+
2. CONFIG privilege: Added to test user GRANT
49+
3. run_tidb_server: Unified TiDB startup (unistore/tikv via PD_ADDR, TLS detection)
50+
4. env_variables: Centralized next-gen vars (PD_ADDR, TIKV_WORKER_ADDR, KEYSPACE_NAME, etc.)
51+
5. Cluster scripts: Source env_variables for standalone invocation
52+
6. cleanup_tidb_server: Port-4000 targeted, removes temp-storage _dir.lock
53+
7. shardddl1: DML merge threshold relaxed (>2)
54+
8. dmctl_basic: Session block normalization for tidb_txn_mode diff
55+
9. print_debug_status: Moved to ha_cases_lib.sh
56+
10. TLS classic cluster: Restored original (separate ports, inline TiDB startup)
57+
11. Makefile: check_third_party_binary_for_dm checks sync_diff_inspector exists instead of rebuilding
58+
59+
### Remaining Work
60+
61+
- [ ] Build #103: Full next-gen run with all groups including G10
62+
- [ ] Verify classic CI passes
63+
- [ ] Final cleanup: simplify scripts, squash commits

0 commit comments

Comments
 (0)