Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add nvidia container toolkit #1705

Draft
wants to merge 10 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion build_library/sysext_prod_builder
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ rm -rf "${sysext_workdir}" "${sysext_output_dir}"
mkdir "${sysext_workdir}" "${sysext_output_dir}"

info "creating temporary base OS squashfs"
sudo mksquashfs "${root_fs_dir}" "${sysext_base}" -noappend
sudo mksquashfs "${root_fs_dir}" "${sysext_base}" -noappend -xattrs-exclude '^btrfs.'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea to silence this 👍


# Build sysexts on top of root fs and mount sysexts' squashfs + pkginfo squashfs
# for combined overlay later.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
DIST nvidia-container-toolkit-1.14.5.tar.gz 2364638 BLAKE2B 0d578e35adb6c10ee2ecc0058efd321291f4280a8866e200e23858ae850eec5bd0b5a4767a853204e29d3d489f5c4f521154c24f4931b053824072fd0b2fcdc3 SHA512 828b69578894be96b6629f5e404e71589700267c9c24593caea7cb8c6c0d668d5393510a4608cb6ce377d4f28fe7a442bf9e08495f142a22616ca2115cb5eb61
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#accept-nvidia-visible-devices-as-volume-mounts = false
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
disable-require = false
supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,video"
#swarm-resource = "DOCKER_RESOURCE_GPU"

[nvidia-container-cli]
#debug = "/var/log/nvidia-container-toolkit.log"
environment = []
#ldcache = "/etc/ld.so.cache"
ldconfig = "@/sbin/ldconfig"
load-kmods = true
#no-cgroups = false
#path = "/usr/bin/nvidia-container-cli"
#root = "/run/nvidia/driver"
#user = "root:video"

[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"
log-level = "info"
mode = "auto"
runtimes = ["docker-runc", "runc", "crun"]

[nvidia-container-runtime.modes]

[nvidia-container-runtime.modes.cdi]
annotation-prefixes = ["cdi.k8s.io/"]
default-kind = "nvidia.com/gpu"
spec-dirs = ["/etc/cdi", "/var/run/cdi"]

[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime-hook]
path = "nvidia-container-runtime-hook"
skip-mode-detection = false

[nvidia-ctk]
path = "nvidia-ctk"
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2024 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

EGO_PN="github.com/NVIDIA/${PN}"

inherit coreos-go-depend

DESCRIPTION="NVIDIA container runtime toolkit"
HOMEPAGE="https://github.com/NVIDIA/nvidia-container-toolkit"
SRC_URI="https://github.com/NVIDIA/${PN}/archive/v${PV/_rc/-rc.}.tar.gz -> ${P}.tar.gz"

LICENSE="Apache-2.0"
SLOT="0"
KEYWORDS="amd64 arm64"

DEPEND=""
RDEPEND="${DEPEND}
sys-libs/libnvidia-container:=
"
BDEPEND=""

# Compile phase: call go_export (not defined in this ebuild; presumably provided
# via the inherited coreos-go-depend eclass to set up the Go build environment),
# then build the upstream Makefile's "binaries" target.
src_compile() {
	local -a make_targets=( binaries )

	go_export
	emake "${make_targets[@]}"
}

# Install phase: install the runtime binaries with dobin and ship the default
# config.toml from FILESDIR under /etc/nvidia-container-runtime/.
src_install() {
	# Explicit list, in the same order the brace expansion would produce.
	local -a runtime_bins=(
		nvidia-container-runtime-hook
		nvidia-container-runtime.cdi
		nvidia-container-runtime
		nvidia-ctk
	)
	local config_dir="/etc/nvidia-container-runtime/"

	dobin "${runtime_bins[@]}"

	insinto "${config_dir}"
	doins "${FILESDIR}/config.toml"
}
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ RDEPEND="${RDEPEND}
app-editors/vim
app-emulation/actool
app-containers/cri-tools
app-containers/nvidia-container-toolkit
app-misc/ca-certificates
app-misc/jq
app-misc/pax-utils
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ After=network.target

[Service]
Delegate=yes
# /opt/bin needs to be in path for nvidia-container-runtime
Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/opt/bin
Environment=CONTAINERD_CONFIG=/usr/share/containerd/config.toml
ExecStartPre=mkdir -p /run/docker/libcontainerd
ExecStartPre=ln -fs /run/containerd/containerd.sock /run/docker/libcontainerd/docker-containerd.sock
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,10 @@ enable_selinux = true
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
SystemdCgroup = true

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
runtime_type = "io.containerd.runc.v2"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/bin/nvidia-container-runtime"
SystemdCgroup = true
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@ Requires=containerd.service docker.socket
Type=notify
EnvironmentFile=-/run/flannel/flannel_docker_opts.env
Environment=DOCKER_SELINUX=--selinux-enabled=true
Environment=DOCKER_NVIDIA=--add-runtime=nvidia=nvidia-container-runtime
# /opt/bin needs to be in path for the nvidia-runtime to pick it up
Environment=PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/opt/bin

# the default is not to use systemd for cgroups because the delegate issue still
# exists and systemd currently does not support the cgroup feature set required
# for containers run by docker
ExecStart=/usr/bin/dockerd --host=fd:// --containerd=/var/run/docker/libcontainerd/docker-containerd.sock $DOCKER_SELINUX $DOCKER_OPTS $DOCKER_CGROUPS $DOCKER_OPT_BIP $DOCKER_OPT_MTU $DOCKER_OPT_IPMASQ
ExecStart=/usr/bin/dockerd --host=fd:// --containerd=/var/run/docker/libcontainerd/docker-containerd.sock $DOCKER_NVIDIA $DOCKER_SELINUX $DOCKER_OPTS $DOCKER_CGROUPS $DOCKER_OPT_BIP $DOCKER_OPT_MTU $DOCKER_OPT_IPMASQ
ExecReload=/bin/kill -s HUP $MAINPID
LimitNOFILE=1048576
# Having non-zero Limit*s causes performance problems due to accounting overhead
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
DIST libnvidia-container-1.14.5.tar.gz 1549093 BLAKE2B ce3769c2589af12bfc802272ae3a6bdff9d2c8d25c669755eb6e9eba3a1a9fc4e89b55b73ed24098777e2eaf680ee686afce818206f7e94fe7f713cfd7cf69bd SHA512 0d50c584af5f222d9e54f8b6b094ddd9b625c965ed519e1b8f74e7b8d26d811084e1c37b3d7fb1a2473890b7b7ef263c0893c15e6bc4586d5155c03f31ab4662
DIST libtirpc-1.3.2.tar.bz2 513151 BLAKE2B 375b7bb046244f4666522c4f148428349fe1867b095dc5e268d037ba26982f88df70b0ad21fbe2b99150f644806a46651b524c3b9f7fe2499469806ea50b0331 SHA512 8664d5c4f842ee5acf83b9c1cadb7871f17b8157a7c4500e2236dcfb3a25768cab39f7c5123758dcd7381e30eb028ddfa26a28f458283f2dcea3426c9878c255
DIST nvidia-modprobe-495.44.tar.gz 43488 BLAKE2B 5af5a07158088fd131e85baf14477281e395fad981507bf6c0ea861abfc32be51e5a7030d412ca1ba3f7cca68d22964c36553b7c4186db7ee2320926acbd4c5b SHA512 67486ed1b17c8962786e13880910bb2b1938206a0fd76b360ddef7faf80ee0c941a2e3fbc73fa92a92009e2c54130dce17a466c8079537a981a2fed09c07e4c9
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Copyright 2024 Gentoo Authors
# Distributed under the terms of the GNU General Public License v2

EAPI=8

inherit coreos-go-depend toolchain-funcs

DESCRIPTION="NVIDIA container runtime library"
HOMEPAGE="https://github.com/NVIDIA/libnvidia-container"

NVIDIA_MODPROBE_VERSION=495.44
TIRPC_VERSION=1.3.2
SRC_URI="
https://github.com/NVIDIA/${PN}/archive/v${PV/_rc/-rc.}.tar.gz -> ${P}.tar.gz
https://github.com/NVIDIA/nvidia-modprobe/archive/${NVIDIA_MODPROBE_VERSION}.tar.gz -> nvidia-modprobe-${NVIDIA_MODPROBE_VERSION}.tar.gz
https://downloads.sourceforge.net/project/libtirpc/libtirpc/${TIRPC_VERSION}/libtirpc-${TIRPC_VERSION}.tar.bz2
"
# The unpacked directory name uses the same "_rc" -> "-rc." rewrite as the
# tarball URL in SRC_URI.
S="${WORKDIR}/${PN}-${PV/_rc/-rc.}"

LICENSE="Apache-2.0"
SLOT="0"
KEYWORDS="amd64 arm64"

DEPEND="
sys-libs/libcap
sys-libs/libseccomp
virtual/libelf
"
RDEPEND="${DEPEND}"
BDEPEND="
net-libs/rpcsvc-proto
virtual/pkgconfig
"


# Prepare phase: link the separately unpacked nvidia-modprobe and libtirpc
# source trees into ${S}/deps/src/, apply upstream's nvidia-modprobe patch, and
# create a .download_stamp file in each tree (presumably so upstream's makefiles
# treat the sources as already fetched -- verify against mk/*.mk).
#
# Globals read: S, WORKDIR, NVIDIA_MODPROBE_VERSION, TIRPC_VERSION
# Dies if a pinned version is not mentioned in the matching mk/*.mk file or if
# any staging step fails.
src_prepare() {
	local deps_src="${S}/deps/src"
	local nvmoddir="nvidia-modprobe-${NVIDIA_MODPROBE_VERSION}"
	local tirpcdir="libtirpc-${TIRPC_VERSION}"

	# Sanity checks: the versions pinned in this ebuild must appear in upstream's
	# makefiles. -F matches the version as a fixed string so the dots are not
	# treated as regex wildcards (e.g. "495x44" must not pass for "495.44").
	grep -qF -- "${NVIDIA_MODPROBE_VERSION}" mk/nvidia-modprobe.mk \
		|| die "mk/nvidia-modprobe.mk does not reference nvidia-modprobe ${NVIDIA_MODPROBE_VERSION}"
	grep -qF -- "${TIRPC_VERSION}" mk/libtirpc.mk \
		|| die "mk/libtirpc.mk does not reference libtirpc ${TIRPC_VERSION}"

	mkdir -p "${deps_src}/" || die

	# nvidia-modprobe: link, patch with upstream's own patch, then stamp.
	ln -s "${WORKDIR}/${nvmoddir}" "${deps_src}/" || die
	patch -d "${deps_src}/${nvmoddir}" -p1 <"${S}/mk/nvidia-modprobe.patch" || die
	touch "${deps_src}/${nvmoddir}/.download_stamp" || die

	# libtirpc: link and stamp; no patch is applied here.
	ln -s "${WORKDIR}/${tirpcdir}" "${deps_src}/" || die
	touch "${deps_src}/${tirpcdir}/.download_stamp" || die

	default
}

# Compile phase: export the Go and C toolchain variables, then run upstream's
# makefile with the install layout and feature switches below.
#
# Globals read:    PV, EPREFIX, ROOT, OBJCOPY (set by tc-export)
# Globals written: MAKE_ARGS -- deliberately not local, src_install passes the
#                  same arguments to "emake install".
src_compile() {
	go_export
	tc-export CC OBJCOPY LD AR STRIP PKG_CONFIG
	MAKE_ARGS=(
		LIB_VERSION="${PV/v/}"
		prefix="${EPREFIX}/usr"
		libdir="${EPREFIX}/usr/$(get_libdir)"
		REVISION="${PV}"
		WITH_LIBELF=yes
		WITH_SECCOMP=yes
		WITH_TIRPC=yes
		# Presumably makes any download attempt by the makefiles fail instead of
		# reaching the network -- verify against upstream's mk/ files.
		CURL=die
		# NOTE(review): passed as OBJCPY (no second "O"); presumably the name
		# upstream's makefiles use -- confirm.
		OBJCPY="${OBJCOPY}"
		# Quoted so a ROOT containing whitespace stays a single make argument.
		LDCONFIG="${ROOT}/usr/sbin/ldconfig"
	)
	# No "|| die": with EAPI 8 emake aborts the build on failure by itself.
	emake "${MAKE_ARGS[@]}"
}

# Install phase: run upstream's "install" target with the MAKE_ARGS array that
# src_compile populated.
#
# DESTDIR is ${D}, not ${ED}: MAKE_ARGS already sets prefix/libdir to paths
# under ${EPREFIX}, so using ${ED} would apply the prefix twice.
src_install() {
	# No "|| die": with EAPI 8 emake aborts the build on failure by itself.
	emake DESTDIR="${D}" "${MAKE_ARGS[@]}" install
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ set -euo pipefail
NVIDIA_DOWNLOAD_BASEURL="https://us.download.nvidia.com/${NVIDIA_PRODUCT_TYPE}/"
NVIDIA_DRIVER_BASENAME="NVIDIA-Linux-x86_64-${NVIDIA_DRIVER_VERSION}"
NVIDIA_WORKDIR='nvidia-workdir'
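# Persistence mode defaults to "on"; set NVIDIA_PERSISTENCE_MODE to any other value to skip starting nvidia-persistenced.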
: ${NVIDIA_PERSISTENCE_MODE:=on}

FLATCAR_DEVELOPER_CONTAINER="flatcar_developer_container-${FLATCAR_RELEASE_VERSION}.bin"
FLATCAR_DEVELOPER_CONTAINER_URL=""
Expand Down Expand Up @@ -128,6 +130,12 @@ function install_and_load() {
D=`grep nvidia-uvm /proc/devices | awk '{print $1}'`
mknod -m 666 /dev/nvidia-uvm c $D 0
fi

nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml

if [ "${NVIDIA_PERSISTENCE_MODE}" = "on" ]; then
nvidia-persistenced
fi
}

function verify_installation() {
Expand Down
Loading