From 88b964fe18d37af5b53cea00e8d785159fb61f59 Mon Sep 17 00:00:00 2001 From: Madhu Rajanna Date: Tue, 1 Oct 2024 17:34:09 +0200 Subject: [PATCH 1/4] rbd: consider ErrPermissionDenied for vol Incase of RDR with restricted access the ceph user will not have access to all the objects or all the pools where mapping exists This commits add a check to continue to get the volume if there is a permission error Signed-off-by: Madhu Rajanna --- internal/rbd/rbd_util.go | 36 +++++++++++++++++++---- internal/rbd/rbd_util_test.go | 54 +++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 6 deletions(-) diff --git a/internal/rbd/rbd_util.go b/internal/rbd/rbd_util.go index 1beb4f5318f..3b07d235dcd 100644 --- a/internal/rbd/rbd_util.go +++ b/internal/rbd/rbd_util.go @@ -1214,8 +1214,7 @@ func GenVolFromVolID( } vol, err = generateVolumeFromVolumeID(ctx, volumeID, vi, cr, secrets) - if !errors.Is(err, util.ErrKeyNotFound) && !errors.Is(err, util.ErrPoolNotFound) && - !errors.Is(err, ErrImageNotFound) { + if !shouldRetryVolumeGeneration(err) { return vol, err } @@ -1226,8 +1225,7 @@ func GenVolFromVolID( } if mapping != nil { rbdVol, vErr := generateVolumeFromMapping(ctx, mapping, volumeID, vi, cr, secrets) - if !errors.Is(vErr, util.ErrKeyNotFound) && !errors.Is(vErr, util.ErrPoolNotFound) && - !errors.Is(vErr, ErrImageNotFound) { + if !shouldRetryVolumeGeneration(vErr) { return rbdVol, vErr } } @@ -1280,8 +1278,7 @@ func generateVolumeFromMapping( // Add mapping poolID to Identifier nvi.LocationID = pID vol, err = generateVolumeFromVolumeID(ctx, volumeID, nvi, cr, secrets) - if !errors.Is(err, util.ErrKeyNotFound) && !errors.Is(err, util.ErrPoolNotFound) && - !errors.Is(err, ErrImageNotFound) { + if !shouldRetryVolumeGeneration(err) { return vol, err } } @@ -1292,6 +1289,33 @@ func generateVolumeFromMapping( return vol, util.ErrPoolNotFound } +// shouldRetryVolumeGeneration determines whether the process of finding or generating +// volumes should continue based on the type of error encountered. +// +// It checks if the given error matches any of the following known errors: +// - util.ErrKeyNotFound: The key required to locate the volume is missing in Rados omap. +// - util.ErrPoolNotFound: The rbd pool where the volume/omap is expected doesn't exist. +// - ErrImageNotFound: The image doesn't exist in the rbd pool. +// - rados.ErrPermissionDenied: Permissions to access the pool is denied. +// +// If any of these errors are encountered, the function returns `true`, indicating +// that the volume search should continue because of known error. Otherwise, it +// returns `false`, meaning the search should stop. +// +// This helper function is used in scenarios where multiple attempts may be made +// to retrieve or generate volume information, and we want to gracefully handle +// specific failure cases while retrying for others. +func shouldRetryVolumeGeneration(err error) bool { + if err == nil { + return false // No error, do not retry + } + // Continue searching for specific known errors + return (errors.Is(err, util.ErrKeyNotFound) || + errors.Is(err, util.ErrPoolNotFound) || + errors.Is(err, ErrImageNotFound) || + errors.Is(err, rados.ErrPermissionDenied)) +} + func genVolFromVolumeOptions( ctx context.Context, volOptions map[string]string, diff --git a/internal/rbd/rbd_util_test.go b/internal/rbd/rbd_util_test.go index 905e977079c..5d14ed844a7 100644 --- a/internal/rbd/rbd_util_test.go +++ b/internal/rbd/rbd_util_test.go @@ -23,8 +23,11 @@ import ( "strings" "testing" + "github.com/ceph/go-ceph/rados" librbd "github.com/ceph/go-ceph/rbd" "github.com/stretchr/testify/require" + + "github.com/ceph/ceph-csi/internal/util" ) func TestHasSnapshotFeature(t *testing.T) { @@ -387,3 +390,54 @@ func Test_checkValidImageFeatures(t *testing.T) { }) } } + +func Test_shouldRetryVolumeGeneration(t *testing.T) { + t.Parallel() + type args struct { + err error + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "No error (stop searching)", + args: args{err: nil}, + want: false, // No error, stop searching + }, + { + name: "ErrKeyNotFound (continue searching)", + args: args{err: util.ErrKeyNotFound}, + want: true, // Known error, continue searching + }, + { + name: "ErrPoolNotFound (continue searching)", + args: args{err: util.ErrPoolNotFound}, + want: true, // Known error, continue searching + }, + { + name: "ErrImageNotFound (continue searching)", + args: args{err: ErrImageNotFound}, + want: true, // Known error, continue searching + }, + { + name: "ErrPermissionDenied (continue searching)", + args: args{err: rados.ErrPermissionDenied}, + want: true, // Known error, continue searching + }, + { + name: "Different error (stop searching)", + args: args{err: errors.New("unknown error")}, + want: false, // Unknown error, stop searching + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + if got := shouldRetryVolumeGeneration(tt.args.err); got != tt.want { + t.Errorf("shouldRetryVolumeGeneration() = %v, want %v", got, tt.want) + } + }) + } +} From 10076ca11f72a5bede7c3ae90d5955c1dac9b047 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Wed, 2 Oct 2024 17:11:11 +0200 Subject: [PATCH 2/4] rbd: use the new go-ceph rbd.ErrExist for checking rbd.GroupCreate() The go-ceph rbd.GroupCreate() now returns ErrExist in case the group that is created, already exists. The previous check only ever matched the string comparison, which is prone to errors in case the contents is modified by go-ceph. Signed-off-by: Niels de Vos --- internal/rbd/group/volume_group.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/internal/rbd/group/volume_group.go b/internal/rbd/group/volume_group.go index 7de46131a3a..d23b26d99b3 100644 --- a/internal/rbd/group/volume_group.go +++ b/internal/rbd/group/volume_group.go @@ -20,7 +20,6 @@ import ( "context" "errors" "fmt" - "strings" "github.com/ceph/go-ceph/rados" librbd "github.com/ceph/go-ceph/rbd" @@ -164,7 +163,7 @@ func (vg *volumeGroup) Create(ctx context.Context) error { err = librbd.GroupCreate(ioctx, name) if err != nil { - if !errors.Is(rados.ErrObjectExists, err) && !strings.Contains(err.Error(), "rbd: ret=-17, File exists") { + if !errors.Is(err, librbd.ErrExist) { return fmt.Errorf("failed to create volume group %q: %w", name, err) } From 9267210da408bd39c3614ae8a7c7f52917946c9e Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Thu, 3 Oct 2024 16:14:26 +0200 Subject: [PATCH 3/4] rebase: use the go-ceph master branch The main change that is useful, is the new rbd.ErrExist error. Signed-off-by: Niels de Vos --- go.mod | 2 +- go.sum | 8 ++++---- vendor/github.com/ceph/go-ceph/internal/dlsym/dlsym.go | 6 ++---- vendor/github.com/ceph/go-ceph/rados/snapshot.go | 4 ++-- vendor/github.com/ceph/go-ceph/rados/watcher.go | 6 +++--- vendor/github.com/ceph/go-ceph/rbd/errors.go | 2 ++ vendor/github.com/ceph/go-ceph/rbd/snapshot_octopus.go | 4 ++-- vendor/modules.txt | 2 +- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/go.mod b/go.mod index 5d62a3ebbd9..e1e7a3c3dd5 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/aws/aws-sdk-go v1.55.5 github.com/aws/aws-sdk-go-v2/service/sts v1.31.3 github.com/ceph/ceph-csi/api v0.0.0-00010101000000-000000000000 - github.com/ceph/go-ceph v0.29.0 + github.com/ceph/go-ceph v0.29.1-0.20240925141413-065319c78733 github.com/container-storage-interface/spec v1.10.0 github.com/csi-addons/spec v0.2.1-0.20240730084235-3958a5b17d24 github.com/gemalto/kmip-go v0.0.10 diff --git a/go.sum b/go.sum index 66757015b9c..a214bae6b35 100644 --- a/go.sum +++ b/go.sum @@ -1440,8 +1440,8 @@ github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyY github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.3.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.4.1/go.mod h1:4T9NM4+4Vw91VeyqjLS6ao50K5bOcLKN6Q42XnYaRYw= -github.com/ceph/go-ceph v0.29.0 h1:pJQY+++PyY2FMP0ffVaE7FbIdivemBPCu4MWr4S8CtI= -github.com/ceph/go-ceph v0.29.0/go.mod h1:U/l216/AzIWrFe7ny+9cJ5Yjzyb5otrEGfdrU5VtCME= +github.com/ceph/go-ceph v0.29.1-0.20240925141413-065319c78733 h1:vX8mfbKwE24CbO5t1Xc02u53pjWAsyvjgAMJk7o7nsQ= +github.com/ceph/go-ceph v0.29.1-0.20240925141413-065319c78733/go.mod h1:OJFju/Xmtb7ihHo/aXOayw6RhVOUGNke5EwTipwaf6A= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= @@ -1647,8 +1647,8 @@ github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MG github.com/goccy/go-yaml v1.9.8/go.mod h1:JubOolP3gh0HpiBc4BLRD4YmjEjHAmIIB2aaXKkTfoE= github.com/goccy/go-yaml v1.11.0/go.mod h1:H+mJrWtjPTJAHvRbV09MCK9xYwODM+wRTVFFTWckfng= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/gofrs/uuid v4.4.0+incompatible h1:3qXRTX8/NbyulANqlc0lchS1gqAVxRgsuW1YrTJupqA= -github.com/gofrs/uuid v4.4.0+incompatible/go.mod h1:b2aQJv3Z4Fp6yNu3cdSllBxTCLRxnplIgP/c0N/04lM= +github.com/gofrs/uuid/v5 v5.3.0 h1:m0mUMr+oVYUdxpMLgSYCZiXe7PuVPnI94+OMeVBNedk= +github.com/gofrs/uuid/v5 v5.3.0/go.mod h1:CDOjlDMVAtN56jqyRUZh58JT31Tiw7/oQyEXZV+9bD8= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4= github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= diff --git a/vendor/github.com/ceph/go-ceph/internal/dlsym/dlsym.go b/vendor/github.com/ceph/go-ceph/internal/dlsym/dlsym.go index 9e5782c4d50..0c1ef1ef7bc 100644 --- a/vendor/github.com/ceph/go-ceph/internal/dlsym/dlsym.go +++ b/vendor/github.com/ceph/go-ceph/internal/dlsym/dlsym.go @@ -2,12 +2,10 @@ package dlsym // #cgo LDFLAGS: -ldl // +// #define _GNU_SOURCE +// // #include // #include -// -// #ifndef RTLD_DEFAULT /* from dlfcn.h */ -// #define RTLD_DEFAULT ((void *) 0) -// #endif import "C" import ( diff --git a/vendor/github.com/ceph/go-ceph/rados/snapshot.go b/vendor/github.com/ceph/go-ceph/rados/snapshot.go index 183a119d2cc..46b87aa6522 100644 --- a/vendor/github.com/ceph/go-ceph/rados/snapshot.go +++ b/vendor/github.com/ceph/go-ceph/rados/snapshot.go @@ -85,8 +85,8 @@ func (ioctx *IOContext) GetSnapName(snapID SnapID) (string, error) { err error ) // range from 1k to 64KiB - retry.WithSizes(1024, 1<<16, func(len int) retry.Hint { - cLen := C.int(len) + retry.WithSizes(1024, 1<<16, func(length int) retry.Hint { + cLen := C.int(length) buf = make([]byte, cLen) ret := C.rados_ioctx_snap_get_name( ioctx.ioctx, diff --git a/vendor/github.com/ceph/go-ceph/rados/watcher.go b/vendor/github.com/ceph/go-ceph/rados/watcher.go index 7cd7e90f3d8..a99c77bc00a 100644 --- a/vendor/github.com/ceph/go-ceph/rados/watcher.go +++ b/vendor/github.com/ceph/go-ceph/rados/watcher.go @@ -298,11 +298,11 @@ func (c *Conn) WatcherFlush() error { // // NOTE: starting with pacific this is implemented as a C function and this can // be replaced later -func decodeNotifyResponse(response *C.char, len C.size_t) ([]NotifyAck, []NotifyTimeout) { - if len == 0 || response == nil { +func decodeNotifyResponse(response *C.char, length C.size_t) ([]NotifyAck, []NotifyTimeout) { + if length == 0 || response == nil { return nil, nil } - b := (*[math.MaxInt32]byte)(unsafe.Pointer(response))[:len:len] + b := (*[math.MaxInt32]byte)(unsafe.Pointer(response))[:length:length] pos := 0 num := binary.LittleEndian.Uint32(b[pos:]) diff --git a/vendor/github.com/ceph/go-ceph/rbd/errors.go b/vendor/github.com/ceph/go-ceph/rbd/errors.go index 4d98cce7c4e..e26e9d1356a 100644 --- a/vendor/github.com/ceph/go-ceph/rbd/errors.go +++ b/vendor/github.com/ceph/go-ceph/rbd/errors.go @@ -73,6 +73,8 @@ var ( // Public general error const ( + // ErrExist indicates a non-specific already existing resource. + ErrExist = rbdError(-C.EEXIST) // ErrNotExist indicates a non-specific missing resource. ErrNotExist = rbdError(-C.ENOENT) // ErrNotImplemented indicates a function is not implemented in by librbd. diff --git a/vendor/github.com/ceph/go-ceph/rbd/snapshot_octopus.go b/vendor/github.com/ceph/go-ceph/rbd/snapshot_octopus.go index 86b8e77d3f3..4bd008a7405 100644 --- a/vendor/github.com/ceph/go-ceph/rbd/snapshot_octopus.go +++ b/vendor/github.com/ceph/go-ceph/rbd/snapshot_octopus.go @@ -50,8 +50,8 @@ func (image *Image) GetSnapByID(snapID uint64) (string, error) { err error ) // range from 1k to 64KiB - retry.WithSizes(1024, 1<<16, func(len int) retry.Hint { - cLen := C.size_t(len) + retry.WithSizes(1024, 1<<16, func(length int) retry.Hint { + cLen := C.size_t(length) buf = make([]byte, cLen) ret := C.rbd_snap_get_name( image.image, diff --git a/vendor/modules.txt b/vendor/modules.txt index f2ca75b2e13..6a24b1fb5e5 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -205,7 +205,7 @@ github.com/ceph/ceph-csi/api/deploy/kubernetes/cephfs github.com/ceph/ceph-csi/api/deploy/kubernetes/nfs github.com/ceph/ceph-csi/api/deploy/kubernetes/rbd github.com/ceph/ceph-csi/api/deploy/ocp -# github.com/ceph/go-ceph v0.29.0 +# github.com/ceph/go-ceph v0.29.1-0.20240925141413-065319c78733 ## explicit; go 1.19 github.com/ceph/go-ceph/cephfs github.com/ceph/go-ceph/cephfs/admin From d33e6b14fe5b82f9b10af4b7faa3f169ffcdda3a Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Tue, 1 Oct 2024 15:10:27 +0200 Subject: [PATCH 4/4] rbd: validate IOContext before getting the list of trashed images `ensureImageCleanup()` can cause a panic when an image was deleted, but the journal still contained a reference. By opening the IOContext before using, an error may be returned instead of a panic when using a `nil` or freed IOContext. Signed-off-by: Niels de Vos --- internal/rbd/controllerserver.go | 11 +++-------- internal/rbd/rbd_util.go | 5 +++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index f0ac0a47956..1745dbc3793 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -881,9 +881,9 @@ func (cs *ControllerServer) checkErrAndUndoReserve( } if errors.Is(err, ErrImageNotFound) { - err = rbdVol.ensureImageCleanup(ctx) - if err != nil { - return nil, status.Error(codes.Internal, err.Error()) + notFoundErr := rbdVol.ensureImageCleanup(ctx) + if notFoundErr != nil { + return nil, status.Errorf(codes.Internal, "failed to cleanup image %q: %v", rbdVol, notFoundErr) } } else { // All errors other than ErrImageNotFound should return an error back to the caller @@ -1538,11 +1538,6 @@ func cleanUpImageAndSnapReservation(ctx context.Context, rbdSnap *rbdSnapshot, c } defer rbdVol.Destroy(ctx) - err = rbdVol.openIoctx() - if err != nil { - return status.Error(codes.Internal, err.Error()) - } - // cleanup the image from trash if the error is image not found. err = rbdVol.ensureImageCleanup(ctx) if err != nil { diff --git a/internal/rbd/rbd_util.go b/internal/rbd/rbd_util.go index 3b07d235dcd..3b72b7c4c86 100644 --- a/internal/rbd/rbd_util.go +++ b/internal/rbd/rbd_util.go @@ -621,6 +621,11 @@ func isCephMgrSupported(ctx context.Context, clusterID string, err error) bool { // ensureImageCleanup finds image in trash and if found removes it // from trash. func (ri *rbdImage) ensureImageCleanup(ctx context.Context) error { + err := ri.openIoctx() + if err != nil { + return err + } + trashInfoList, err := librbd.GetTrashList(ri.ioctx) if err != nil { log.ErrorLog(ctx, "failed to list images in trash: %v", err)