From 19d65d32e0994f3229ece8a919fdce47cf1de163 Mon Sep 17 00:00:00 2001 From: hillium Date: Tue, 30 Apr 2024 15:23:57 +0800 Subject: [PATCH 1/3] retry when encountering a DNS error Signed-off-by: hillium --- br/pkg/lightning/common/retry.go | 4 +++ br/pkg/pdutil/pd.go | 7 ++++- tests/realtikvtest/brietest/BUILD.bazel | 2 ++ tests/realtikvtest/brietest/pdutil_test.go | 33 ++++++++++++++++++++++ 4 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 tests/realtikvtest/brietest/pdutil_test.go diff --git a/br/pkg/lightning/common/retry.go b/br/pkg/lightning/common/retry.go index 046ebebe1d43e..1b66f5b96022d 100644 --- a/br/pkg/lightning/common/retry.go +++ b/br/pkg/lightning/common/retry.go @@ -108,6 +108,10 @@ func isSingleRetryableError(err error) bool { switch nerr := err.(type) { case net.Error: + var dErr *net.DNSError + if goerrors.As(nerr, &dErr) { + return true + } if nerr.Timeout() { return true } diff --git a/br/pkg/pdutil/pd.go b/br/pkg/pdutil/pd.go index 9f257c33dd61b..94a203ef2ddc3 100644 --- a/br/pkg/pdutil/pd.go +++ b/br/pkg/pdutil/pd.go @@ -52,7 +52,7 @@ const ( pauseTimeout = 5 * time.Minute // pd request retry time when connection fail - pdRequestRetryTime = 10 + pdRequestRetryTime = 120 // set max-pending-peer-count to a large value to avoid scatter region failed. 
maxPendingPeerUnlimited uint64 = math.MaxInt32 @@ -174,6 +174,11 @@ func pdRequestWithCode( if err != nil { return 0, nil, errors.Trace(err) } + failpoint.Inject("DNSError", func() { + req.Host = "nosuchhost" + req.URL.Host = "nosuchhost" + fmt.Println(req.URL.String()) + }) resp, err = cli.Do(req) //nolint:bodyclose count++ failpoint.Inject("InjectClosed", func(v failpoint.Value) { diff --git a/tests/realtikvtest/brietest/BUILD.bazel b/tests/realtikvtest/brietest/BUILD.bazel index 683dd0ca91486..b7de33ba8799f 100644 --- a/tests/realtikvtest/brietest/BUILD.bazel +++ b/tests/realtikvtest/brietest/BUILD.bazel @@ -9,10 +9,12 @@ go_test( "flashback_test.go", "main_test.go", "operator_test.go", + "pdutil_test.go", ], flaky = True, race = "on", deps = [ + "//br/pkg/pdutil", "//br/pkg/task", "//br/pkg/task/operator", "//config", diff --git a/tests/realtikvtest/brietest/pdutil_test.go b/tests/realtikvtest/brietest/pdutil_test.go new file mode 100644 index 0000000000000..298de04afa6f7 --- /dev/null +++ b/tests/realtikvtest/brietest/pdutil_test.go @@ -0,0 +1,33 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package brietest + +import ( + "context" + "testing" + + "github.com/pingcap/failpoint" + "github.com/pingcap/tidb/br/pkg/pdutil" + "github.com/stretchr/testify/require" + pd "github.com/tikv/pd/client" +) + +func TestCreateClient(t *testing.T) { + require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/br/pkg/pdutil/DNSError", "119*return")) + require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/br/pkg/pdutil/FastRetry", "return(true)")) + ctl, err := pdutil.NewPdController(context.Background(), "127.0.0.1:2379", nil, pd.SecurityOption{}) + require.NoError(t, err) + ctl.Close() +} From fc24450c3015254ff722210640dbf7c7c252b301 Mon Sep 17 00:00:00 2001 From: hillium Date: Tue, 30 Apr 2024 16:05:22 +0800 Subject: [PATCH 2/3] remove useless debug code Signed-off-by: hillium --- br/pkg/pdutil/pd.go | 1 - 1 file changed, 1 deletion(-) diff --git a/br/pkg/pdutil/pd.go b/br/pkg/pdutil/pd.go index 94a203ef2ddc3..1c7e325747ebc 100644 --- a/br/pkg/pdutil/pd.go +++ b/br/pkg/pdutil/pd.go @@ -177,7 +177,6 @@ func pdRequestWithCode( failpoint.Inject("DNSError", func() { req.Host = "nosuchhost" req.URL.Host = "nosuchhost" - fmt.Println(req.URL.String()) }) resp, err = cli.Do(req) //nolint:bodyclose count++ From 329197d82e8a57bac0a7b1cfa3380e61c8379590 Mon Sep 17 00:00:00 2001 From: hillium Date: Tue, 30 Apr 2024 17:06:47 +0800 Subject: [PATCH 3/3] fix a mistakenly broken test Signed-off-by: hillium --- br/pkg/lightning/common/retry_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/br/pkg/lightning/common/retry_test.go b/br/pkg/lightning/common/retry_test.go index 397a028c765ab..b86685136ef17 100644 --- a/br/pkg/lightning/common/retry_test.go +++ b/br/pkg/lightning/common/retry_test.go @@ -39,7 +39,7 @@ func TestIsRetryableError(t *testing.T) { require.True(t, IsRetryableError(ErrWriteTooSlow)) require.False(t, IsRetryableError(io.EOF)) require.False(t, IsRetryableError(&net.AddrError{})) - require.False(t, IsRetryableError(&net.DNSError{})) 
+ require.True(t, IsRetryableError(&net.DNSError{})) require.True(t, IsRetryableError(&net.DNSError{IsTimeout: true})) // kv errors