Skip to content

Commit

Permalink
Merge pull request #965 from ArangoGutierrez/cnt/103
Browse files Browse the repository at this point in the history
Enable labels for IMEX Domain and Clique
  • Loading branch information
ArangoGutierrez authored Oct 10, 2024
2 parents 8fd4b8f + 0719b2a commit 966d5b8
Show file tree
Hide file tree
Showing 14 changed files with 307 additions and 2 deletions.
6 changes: 6 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ type GFDCommandLineFlags struct {
SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"`
OutputFile *string `json:"outputFile" yaml:"outputFile"`
MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"`
// ImexNodesConfigFile is the path to a file containing the IP addresses of nodes
// that are part of the IMEX domain.
// Note that this is the absolute path to the file in the device plugin container.
ImexNodesConfigFile *string `json:"imexNodesConfigFile" yaml:"imexNodesConfigFile"`
}

// UpdateFromCLIFlags updates Flags from settings in the cli Flags if they are set.
Expand Down Expand Up @@ -162,6 +166,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.GFD.Oneshot, c, n)
case "output-file":
updateFromCLIFlag(&f.GFD.OutputFile, c, n)
case "imex-nodes-config-file":
updateFromCLIFlag(&f.GFD.ImexNodesConfigFile, c, n)
case "sleep-interval":
updateFromCLIFlag(&f.GFD.SleepInterval, c, n)
case "no-timestamp":
Expand Down
6 changes: 4 additions & 2 deletions api/config/v1/flags_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,8 @@ func TestMarshalFlags(t *testing.T) {
"noTimestamp": null,
"outputFile": null,
"sleepInterval": "0s",
"machineTypeFile": null
"machineTypeFile": null,
"imexNodesConfigFile": null
}
}`,
},
Expand All @@ -210,7 +211,8 @@ func TestMarshalFlags(t *testing.T) {
"noTimestamp": null,
"outputFile": null,
"sleepInterval": "5ns",
"machineTypeFile": null
"machineTypeFile": null,
"imexNodesConfigFile": null
}
}`,
},
Expand Down
6 changes: 6 additions & 0 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ func main() {
Value: "/etc/kubernetes/node-feature-discovery/features.d/gfd",
EnvVars: []string{"GFD_OUTPUT_FILE"},
},
&cli.StringFlag{
Name: "imex-nodes-config-file",
Usage: "Path to the IMEX nodes config file. This file contains a list of IP addresses of the nodes in the IMEX domain.",
Value: "/etc/nvidia-imex/nodes_config.cfg",
EnvVars: []string{"GFD_IMEX_NODES_CONFIG_FILE"},
},
&cli.StringFlag{
Name: "machine-type-file",
Value: "/sys/class/dmi/id/product_name",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ spec:
mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
- name: host-sys
mountPath: "/sys"
- name: nvidia-imex-dir
mountPath: "/etc/nvidia-imex"
{{- if $options.hasConfigMap }}
- name: available-configs
mountPath: /available-configs
Expand All @@ -199,6 +201,10 @@ spec:
- name: host-sys
hostPath:
path: "/sys"
- name: nvidia-imex-dir
  hostPath:
    path: {{ clean ( join "/" ( list "/" .Values.nvidiaDriverRoot "/etc/nvidia-imex" ) ) | quote }}
    # type is a field of hostPath (not of the volume entry itself).
    type: DirectoryOrCreate
{{- if $options.hasConfigMap }}
- name: available-configs
configMap:
Expand Down
2 changes: 2 additions & 0 deletions docs/gpu-feature-discovery/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ their meaning:
| nvidia.com/gpu.memory | Integer | Memory of the GPU in Mb | 2048 |
| nvidia.com/gpu.product | String | Model of the GPU | GeForce-GT-710 |
| nvidia.com/gpu.mode | String | Display or Compute Mode of the GPU. Details of the GPU modes can be found [here](https://docs.nvidia.com/grid/13.0/grid-gpumodeswitch-user-guide/index.html#compute-and-graphics-mode) | compute |
| nvidia.com/gpu.clique | String | GPUFabric ClusterUUID + CliqueID | 7b968a6d-c8aa-45e1-9e07-e1e51be99c31.1 |
| nvidia.com/gpu.imex-domain | String | Hash of the IMEX domain IP list + CliqueID | 79b326e7-d566-3483-c2a3-9b38fa5cb1c8.1 |

Depending on the MIG strategy used, the following set of labels may also be
available (or override the default values for some of the labels listed above):
Expand Down
159 changes: 159 additions & 0 deletions internal/lm/fabric.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/**
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package lm

import (
"bufio"
"fmt"
"io"
"math/rand" // nolint:gosec
"net"
"os"
"sort"
"strings"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"

"github.com/google/uuid"
"k8s.io/klog/v2"
)

// newImexLabeler creates a Labeler that publishes the nvidia.com/gpu.clique
// and nvidia.com/gpu.imex-domain labels.
//
// An empty labeler (and no error) is returned when:
//   - no IMEX nodes config file is configured,
//   - the configured file does not exist,
//   - getFabricIDs yields an empty cluster UUID or clique ID, or
//   - the config file yields an empty IMEX domain ID.
//
// Any other failure to open the file, to query the devices, or to parse the
// file is returned as an error.
func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, error) {
	configPath := config.Flags.GFD.ImexNodesConfigFile
	if configPath == nil || *configPath == "" {
		// No imex config file, return empty labels
		return empty{}, nil
	}

	imexConfigFile, err := os.Open(*configPath)
	if os.IsNotExist(err) {
		// No imex config file, return empty labels
		return empty{}, nil
	}
	if err != nil {
		// Wrap with %w so the underlying error stays inspectable.
		return nil, fmt.Errorf("failed to open imex config file: %w", err)
	}
	defer imexConfigFile.Close()

	clusterUUID, cliqueID, err := getFabricIDs(devices)
	if err != nil {
		return nil, err
	}
	if clusterUUID == "" || cliqueID == "" {
		return empty{}, nil
	}

	imexDomainID, err := getImexDomainID(imexConfigFile)
	if err != nil {
		return nil, err
	}
	if imexDomainID == "" {
		return empty{}, nil
	}

	// Both labels are suffixed with the clique ID.
	labels := Labels{
		"nvidia.com/gpu.clique":      strings.Join([]string{clusterUUID, cliqueID}, "."),
		"nvidia.com/gpu.imex-domain": strings.Join([]string{imexDomainID, cliqueID}, "."),
	}

	return labels, nil
}

// getFabricIDs returns the cluster UUID and clique ID reported by the
// fabric-attached devices in devices.
//
// Devices for which IsFabricAttached reports false are skipped. If the
// remaining devices report more than one distinct cluster UUID or more than one
// distinct clique ID, a warning is logged and empty strings are returned. Empty
// strings are also returned when no device is fabric-attached. An error is
// returned only if querying a device fails.
func getFabricIDs(devices []resource.Device) (string, string, error) {
	// Map each distinct ID to the indices of the devices reporting it, so
	// that the warning below can show which devices disagree.
	uniqueClusterUUIDs := make(map[string][]int)
	uniqueCliqueIDs := make(map[string][]int)
	for i, device := range devices {
		isFabricAttached, err := device.IsFabricAttached()
		if err != nil {
			return "", "", fmt.Errorf("error checking imex capability: %w", err)
		}
		if !isFabricAttached {
			continue
		}

		clusterUUID, cliqueID, err := device.GetFabricIDs()
		if err != nil {
			return "", "", fmt.Errorf("error getting fabric IDs: %w", err)
		}

		uniqueClusterUUIDs[clusterUUID] = append(uniqueClusterUUIDs[clusterUUID], i)
		uniqueCliqueIDs[cliqueID] = append(uniqueCliqueIDs[cliqueID], i)
	}

	if len(uniqueClusterUUIDs) > 1 {
		klog.Warningf("Cluster UUIDs are non-unique: %v", uniqueClusterUUIDs)
		return "", "", nil
	}

	if len(uniqueCliqueIDs) > 1 {
		klog.Warningf("Clique IDs are non-unique: %v", uniqueCliqueIDs)
		return "", "", nil
	}

	// At this point each map holds at most one key; ranging is simply the
	// way to extract it. With empty maps the loops do not execute.
	for clusterUUID := range uniqueClusterUUIDs {
		for cliqueID := range uniqueCliqueIDs {
			return clusterUUID, cliqueID, nil
		}
	}
	return "", "", nil
}

// getImexDomainID reads the imex config from r and returns a unique identifier
// based on the sorted list of IP addresses it contains.
//
// Each non-blank line must hold exactly one IP address (surrounding whitespace
// is ignored). Blank lines are skipped. Any other line that does not parse as
// an IP address results in an error. If r contains no IP addresses, an empty
// string and a nil error are returned.
func getImexDomainID(r io.Reader) (string, error) {
	// Read the file line by line
	var ips []string
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		ip := strings.TrimSpace(scanner.Text())
		if ip == "" {
			// Tolerate empty lines (e.g. a trailing newline at the end of
			// the file) instead of rejecting the whole file.
			continue
		}
		if net.ParseIP(ip) == nil {
			return "", fmt.Errorf("invalid IP address in imex config file: %s", ip)
		}
		ips = append(ips, ip)
	}

	if err := scanner.Err(); err != nil {
		return "", fmt.Errorf("failed to read imex config file: %w", err)
	}

	if len(ips) == 0 {
		// No IPs in the file, return empty labels
		return "", nil
	}

	// Sort so that the identifier does not depend on the order in which the
	// addresses are listed.
	sort.Strings(ips)

	return generateContentUUID(strings.Join(ips, "\n")), nil
}

// generateContentUUID derives a UUID-formatted string from seed: the same seed
// always produces the same string. The 16 UUID bytes are drawn from a
// math/rand generator seeded with hash(seed), so the result is an identifier
// only and must not be used where unpredictability matters.
func generateContentUUID(seed string) string {
	// nolint:gosec
	rng := rand.New(rand.NewSource(hash(seed)))
	raw := make([]byte, 16)
	rng.Read(raw)
	// The error is deliberately ignored; raw is always exactly 16 bytes long.
	id, _ := uuid.FromBytes(raw)
	return id.String()
}

// hash folds the runes of s into an int64 using the polynomial
// acc = 31*acc + rune, starting from zero. The empty string hashes to 0.
func hash(s string) int64 {
	var acc int64
	for _, r := range []rune(s) {
		acc = acc*31 + int64(r)
	}
	return acc
}
6 changes: 6 additions & 0 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,19 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return nil, fmt.Errorf("error creating resource labeler: %v", err)
}

imexLabeler, err := newImexLabeler(config, devices)
if err != nil {
return nil, fmt.Errorf("error creating IMEX labeler: %v", err)
}

l := Merge(
machineTypeLabeler,
versionLabeler,
migCapabilityLabeler,
sharingLabeler,
resourceLabeler,
gpuModeLabeler,
imexLabeler,
)

return l, nil
Expand Down
8 changes: 8 additions & 0 deletions internal/resource/cuda-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,11 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) {
func (d *cudaDevice) GetPCIClass() (uint32, error) {
return 0, nil
}

// IsFabricAttached always reports false (with a nil error) for CUDA devices.
func (d *cudaDevice) IsFabricAttached() (bool, error) {
return false, nil
}

// GetFabricIDs is not supported for CUDA devices: it always returns two empty
// strings and a non-nil error.
func (d *cudaDevice) GetFabricIDs() (string, string, error) {
return "", "", fmt.Errorf("GetFabricIDs is not supported for CUDA devices")
}
Loading

0 comments on commit 966d5b8

Please sign in to comment.