From c4a424c2f41e4ca9d36b0543004104e99631a5b1 Mon Sep 17 00:00:00 2001 From: killianmuldoon Date: Mon, 21 Oct 2024 10:45:36 +0100 Subject: [PATCH 1/3] Remove outdated helm values docs Signed-off-by: killianmuldoon --- docs/customizations/helm.rst | 623 ++--------------------------------- templates/helm.rst.gotmpl | 366 +------------------- 2 files changed, 23 insertions(+), 966 deletions(-) diff --git a/docs/customizations/helm.rst b/docs/customizations/helm.rst index ff77e66..3fecdc9 100644 --- a/docs/customizations/helm.rst +++ b/docs/customizations/helm.rst @@ -42,10 +42,6 @@ General Parameters - Type - Default - Description - * - deployCR - - bool - - `false` - - Deploy ``NicClusterPolicy`` custom resource according to the provided parameters. * - imagePullSecrets - list - `[]` @@ -109,6 +105,22 @@ General Parameters - object - `{}` - Configure node selector settings for the operator. + * - operator.ofedDriver.initContainer.enable + - bool + - `true` + - Deploy init container. + * - operator.ofedDriver.initContainer.image + - string + - `"network-operator-init-container"` + - Init container image name. + * - operator.ofedDriver.initContainer.repository + - string + - `"ghcr.io/mellanox"` + - Init container image repository. + * - operator.ofedDriver.initContainer.version + - string + - `"v0.0.2"` + - Init container image version. * - operator.repository - string - `"nvcr.io/nvstaging/mellanox"` @@ -277,7 +289,7 @@ SR-IOV Network Operator Helm chart customization options can be found `here `_ can be configured for each container of the sub-resources deployed by the Network Operator by setting the parameter ``containerResources``. - -For example: - -.. code-block:: yaml - - containerResources: - - name: "mofed-container" - requests: - cpu: "200m" - memory: "150Mi" - limits: - cpu: "300m" - memory: "300Mi" - -=================== - NVIDIA DOCA Driver -=================== - -.. 
list-table:: - :header-rows: 1 - - * - Name - - Type - - Default - - Description - * - ofedDriver.certConfig.name - - string - - `""` - - Custom TLS key/certificate configuration configMap name. - * - ofedDriver.deploy - - bool - - `false` - - Deploy the NVIDIA DOCA Driver driver container. - * - ofedDriver.forcePrecompiled - - bool - - `false` - - Fail Mellanox OFED deployment if precompiled OFED driver container image does not exists. - * - ofedDriver.image - - string - - `"doca-driver"` - - NVIDIA DOCA Driver image name. - * - ofedDriver.initContainer.enable - - bool - - `true` - - Deploy init container. - * - ofedDriver.initContainer.image - - string - - `"network-operator-init-container"` - - Init container image name. - * - ofedDriver.initContainer.repository - - string - - `"ghcr.io/mellanox"` - - Init container image repository. - * - ofedDriver.initContainer.version - - string - - `"v0.0.2"` - - Init container image version. - * - ofedDriver.livenessProbe.initialDelaySeconds - - int - - `30` - - NVIDIA DOCA Driver liveness probe initial delay. - * - ofedDriver.livenessProbe.periodSeconds - - int - - `30` - - NVIDIA DOCA Driver liveness probe interval. - * - ofedDriver.readinessProbe.initialDelaySeconds - - int - - `10` - - NVIDIA DOCA Driver readiness probe initial delay. - * - ofedDriver.readinessProbe.periodSeconds - - int - - `30` - - NVIDIA DOCA Driver readiness probe interval. - * - ofedDriver.repoConfig - - yaml - - .. code-block:: yaml - - name: "" - - - Private mirror repository configuration. - * - ofedDriver.repository - - string - - `"nvcr.io/nvstaging/mellanox"` - - NVIDIA DOCA Driver image repository. - * - ofedDriver.startupProbe.initialDelaySeconds - - int - - `10` - - NVIDIA DOCA Driver startup probe initial delay. - * - ofedDriver.startupProbe.periodSeconds - - int - - `20` - - NVIDIA DOCA Driver startup probe interval. 
- * - ofedDriver.terminationGracePeriodSeconds - - int - - `300` - - The grace period before the driver containeris forcibly removed. - * - ofedDriver.upgradePolicy.autoUpgrade - - bool - - `true` - - Global switch for automatic upgrade feature, if set to false all other options are ignored. - * - ofedDriver.upgradePolicy.drain - - yaml - - .. code-block:: yaml - - # -- Options for node drain (``kubectl drain``) before driver reload, if - # auto upgrade is enabled. - enable: true - # -- Use force drain of pods. - force: true - # -- Pod selector to specify which pods will be drained from the node. - # An empty selector means all pods. - podSelector: "" - # -- It's recommended to set a timeout to avoid infinite drain in case - # non-fatal error keeps happening on retries. - timeoutSeconds: 300 - # -- Delete pods local storage. - deleteEmptyDir: true - - - Options for node drain (`kubectl drain`) before the driver reload. If auto upgrade is enabled but drain.enable is false, then driver POD will be reloaded immediately without removing PODs from the node. - * - ofedDriver.upgradePolicy.drain.deleteEmptyDir - - bool - - `true` - - Delete pods local storage. - * - ofedDriver.upgradePolicy.drain.enable - - bool - - `true` - - Options for node drain (``kubectl drain``) before driver reload, if auto upgrade is enabled. - * - ofedDriver.upgradePolicy.drain.force - - bool - - `true` - - Use force drain of pods. - * - ofedDriver.upgradePolicy.drain.podSelector - - string - - `""` - - Pod selector to specify which pods will be drained from the node. An empty selector means all pods. - * - ofedDriver.upgradePolicy.drain.timeoutSeconds - - int - - `300` - - It's recommended to set a timeout to avoid infinite drain in case non-fatal error keeps happening on retries. - * - ofedDriver.upgradePolicy.maxParallelUpgrades - - int - - `1` - - Number of nodes that can be upgraded in parallel (default: 1). 0 means no limit, all nodes will be upgraded in parallel. 
- * - ofedDriver.upgradePolicy.safeLoad - - bool - - `false` - - Cordon and drain (if enabled) a node before loading the driver on it. - * - ofedDriver.upgradePolicy.waitForCompletion - - string - - `nil` - - - * - ofedDriver.version - - string - - `"24.10-0.4.6.0-0"` - - NVIDIA DOCA Driver version. - -=============================================== -NVIDIA DOCA Driver Driver Environment Variables -=============================================== - -The following are special environment variables supported by the NVIDIA DOCA Driver container to configure its behavior: - -.. list-table:: - :header-rows: 1 - - * - Name - - Default - - Description - * - CREATE_IFNAMES_UDEV - - | * "true” for Ubuntu 20.04, RHEL v8.x and OCP <= v4.13. - | * "false" for newer OS. - - Create an udev rule to preserve "old-style" path based netdev names e.g enp3s0f0 - * - UNLOAD_STORAGE_MODULES - - "false" - - | Unload host storage modules prior to loading NVIDIA DOCA Driver modules: - | * ib_isert - | * nvme_rdma - | * nvmet_rdma - | * rpcrdma - | * xprtrdma - | * ib_srpt - * - ENABLE_NFSRDMA - - "false" - - Enable loading of NFS & NVME related storage modules from a NVIDIA DOCA Driver container - * - RESTORE_DRIVER_ON_POD_TERMINATION - - "true" - - Restore host drivers when a container - -In addition, it is possible to specify any environment variables to be exposed to the NVIDIA DOCA Driver container, such as the standard "HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY". - -.. warning:: - CREATE_IFNAMES_UDEV is set automatically by the Network Operator, depending on the Operating System of the worker nodes in the cluster (the cluster is assumed to be homogenous). - -.. warning:: - When ENABLE_NFSRDMA is set to `true`, it is not possible to load NVME related storage modules from NVIDIA DOCA Driver container when they are in use by the system - (e.g the system has NVMe SSD drives in use). 
User should ensure the modules are not in use and blacklist them prior to the use of NVIDIA DOCA Driver container. - -To set these variables, change them into Helm values. For example: - -.. code-block:: yaml - - ofedDriver: - env: - - name: RESTORE_DRIVER_ON_POD_TERMINATION - value: "true" - - name: UNLOAD_STORAGE_MODULES - value: "true" - - name: CREATE_IFNAMES_UDEV - value: "true" - -The variables can also be configured directly via the NicClusterPolicy CRD. - -========================= -RDMA Shared Device Plugin -========================= - -.. list-table:: - :header-rows: 1 - - * - Name - - Type - - Default - - Description - * - rdmaSharedDevicePlugin.deploy - - bool - - `true` - - Deploy RDMA shared device plugin. - * - rdmaSharedDevicePlugin.image - - string - - `"k8s-rdma-shared-dev-plugin"` - - RDMA shared device plugin image name. - * - rdmaSharedDevicePlugin.repository - - string - - `"ghcr.io/mellanox"` - - RDMA shared device plugin image repository. - * - rdmaSharedDevicePlugin.resources - - yaml - - .. code-block:: yaml - - - name: rdma_shared_device_a - vendors: [15b3] - rdmaHcaMax: 63 - - - The following defines the RDMA resources in the cluster. It must be provided by the user when deploying the chart. Each entry in the resources element will create a resource with the provided and list of devices. - * - rdmaSharedDevicePlugin.useCdi - - bool - - `false` - - Enable Container Device Interface (CDI) mode. **NOTE**: NVIDIA Network Operator does not configure container runtime to enable CDI. - * - rdmaSharedDevicePlugin.version - - string - - `"sha-4f3eb2224b8b5f97be3f17441ddee8d41753b7d5"` - - RDMA shared device plugin version. - -========================================== -RDMA Device Plugin Resource Configurations -========================================== - -These configurations consist of a list of RDMA resources, each with a name and a selector of RDMA capable network devices to be associated with the resource. 
Refer to `RDMA Shared Device Plugin Selectors `_ for supported selectors. - -.. code-block:: yaml - - resources: - - name: rdma_shared_device_a - vendors: [15b3] - deviceIDs: [1017] - ifNames: [enp5s0f0] - rdmaHcaMax: 63 - - name: rdma_shared_device_b - vendors: [15b3] - deviceIDs: [1017] - ifNames: [ib0, ib1] - rdmaHcaMax: 63 - -============================ -SR-IOV Network Device Plugin -============================ - -.. list-table:: - :header-rows: 1 - - * - Name - - Type - - Default - - Description - * - sriovDevicePlugin.deploy - - bool - - `false` - - Deploy SR-IOV Network device plugin. - * - sriovDevicePlugin.image - - string - - `"sriov-network-device-plugin"` - - SR-IOV Network device plugin image name. - * - sriovDevicePlugin.repository - - string - - `"ghcr.io/k8snetworkplumbingwg"` - - SR-IOV Network device plugin image repository. - * - sriovDevicePlugin.resources[0].name - - string - - `"hostdev"` + - `"v0.1.4"` - - * - sriovDevicePlugin.resources[0].vendors[0] - - string - - `"15b3"` - - - * - sriovDevicePlugin.useCdi - - bool - - `false` - - Enable Container Device Interface (CDI) mode. **NOTE**: NVIDIA Network Operator does not configure container runtime to enable CD. - * - sriovDevicePlugin.version - - string - - `"v3.7.0"` - - SR-IOV Network device plugin version - -=================================================== -SR-IOV Network Device Plugin Resource Configuration -=================================================== - -Consists of a list of RDMA resources, each with a name and a selector of RDMA capable network devices to be associated with the resource. Refer to `SR-IOV Network Device Plugin Selectors `_ for supported selectors. - -.. 
code-block:: yaml - - resources: - - name: hostdev - vendors: [15b3] - - name: ethernet_rdma - vendors: [15b3] - linkTypes: [ether] - - name: sriov_rdma - vendors: [15b3] - devices: [1018] - drivers: [mlx5_ib] - -============= -IB Kubernetes -============= - -ib-kubernetes provides a daemon that works in conjunction with the `SR-IOV Network Device Plugin `_. It acts on Kubernetes pod object changes (Create/Update/Delete), reading the pod's network annotation, fetching its corresponding network CRD and reading the PKey. This is done in order to add the newly generated GUID or the predefined GUID in the GUID field of the CRD cni-args to that PKey for pods with ``mellanox.infiniband.app`` annotation. - -.. list-table:: - :header-rows: 1 - - * - Name - - Type - - Default - - Description - * - ibKubernetes.deploy - - bool - - `false` - - Deploy IB Kubernetes. - * - ibKubernetes.image - - string - - `"ib-kubernetes"` - - IB Kubernetes image name. - * - ibKubernetes.pKeyGUIDPoolRangeEnd - - string - - `"02:FF:FF:FF:FF:FF:FF:FF"` - - Maximal available GUID value to be allocated for the pod. - * - ibKubernetes.pKeyGUIDPoolRangeStart - - string - - `"02:00:00:00:00:00:00:00"` - - Minimal available GUID value to be allocated for the pod. - * - ibKubernetes.periodicUpdateSeconds - - int - - `5` - - Interval of periodic update in seconds. - * - ibKubernetes.repository - - string - - `"ghcr.io/mellanox"` - - IB Kubernetes image repository. - * - ibKubernetes.ufmSecret - - string - - `""` - - Name of the Secret with the NVIDIA UFM access credentials, deployed in advance. - * - ibKubernetes.version - - string - - `"v1.1.0"` - - IB Kubernetes version. - -========== -UFM Secret -========== - -IB Kubernetes must access `NVIDIA UFM `_ in order to manage pods' GUIDs. To provide its credentials, the secret of the following format should be deployed in advance: - -.. 
code-block:: yaml - - apiVersion: v1 - kind: Secret - metadata: - name: ib-kubernetes-ufm-secret - namespace: nvidia-network-operator - stringData: - UFM_USERNAME: "admin" - UFM_PASSWORD: "123456" - UFM_ADDRESS: "ufm-hostname" - UFM_HTTP_SCHEMA: "" - UFM_PORT: "" - data: - UFM_CERTIFICATE: "" - -.. warning:: - The InfiniBand Fabric manages a single pool of GUIDs. In order to use IB Kubernetes in different clusters, different GUID ranges must be specified to avoid collisions. - -================== -NVIDIA IPAM Plugin -================== - -`NVIDIA IPAM Plugin `_ is recommended to be used on large-scale deployments of the NVIDIA Network Operator. - -.. list-table:: - :header-rows: 1 - - * - Name - - Type - - Default - - Description - * - nvIpam.deploy - - bool - - `true` - - Deploy NVIDIA IPAM Plugin. - * - nvIpam.enableWebhook - - bool - - `false` - - Enable deployment of the validataion webhook for IPPool CRD. - * - nvIpam.image - - string - - `"nvidia-k8s-ipam"` - - NVIDIA IPAM Plugin image name. - * - nvIpam.repository - - string - - `"ghcr.io/mellanox"` - - NVIDIA IPAM Plugin image repository. - * - nvIpam.version - - string - - `"v0.2.0"` - - NVIDIA IPAM Plugin image version. - -.. warning:: - Supported X.509 certificate management system should be available in the cluster to enable the validation webhook. Currently, the supported systems are `certmanager `_ and `Openshift certificate management `_. - -================= -Secondary Network -================= - -Specifies components to deploy in order to facilitate a secondary network in Kubernetes. It consists of the following optionally deployed components: - -* `Multus-CNI `_: Delegate CNI plugin to support secondary networks in Kubernetes -* CNI plugins: Currently only `containernetworking-plugins `_ is supported -* IPAM CNI: Currently only `Whereabout IPAM CNI `_ is supported as a part of the secondaryNetwork section. NVIDIA-IPAM is configured separately. 
-* `IPoIB CNI `_: Allows the user to create IPoIB child link and move it to the pod - -.. list-table:: - :header-rows: 1 - - * - Name - - Type - - Default - - Description - * - secondaryNetwork.cniPlugins.deploy - - bool - - `true` - - Deploy CNI Plugins Secondary Network. - * - secondaryNetwork.cniPlugins.image - - string - - `"plugins"` - - CNI Plugins image name. - * - secondaryNetwork.cniPlugins.repository - - string - - `"ghcr.io/k8snetworkplumbingwg"` - - CNI Plugins image repository. - * - secondaryNetwork.cniPlugins.version - - string - - `"v1.5.0"` - - CNI Plugins image version. - * - secondaryNetwork.deploy - - bool - - `true` - - Deploy Secondary Network. - * - secondaryNetwork.ipamPlugin.deploy - - bool - - `false` - - Deploy IPAM CNI Plugin Secondary Network. - * - secondaryNetwork.ipamPlugin.image - - string - - `"whereabouts"` - - IPAM CNI Plugin image name. - * - secondaryNetwork.ipamPlugin.repository - - string - - `"ghcr.io/k8snetworkplumbingwg"` - - IPAM CNI Plugin image repository. - * - secondaryNetwork.ipamPlugin.version - - string - - `"v0.7.0"` - - IPAM CNI Plugin image version. - * - secondaryNetwork.ipoib.deploy - - bool - - `false` - - Deploy IPoIB CNI. - * - secondaryNetwork.ipoib.image - - string - - `"ipoib-cni"` - - IPoIB CNI image name. - * - secondaryNetwork.ipoib.repository - - string - - `"ghcr.io/mellanox"` - - IPoIB CNI image repository. - * - secondaryNetwork.ipoib.version - - string - - `"v1.2.0"` - - IPoIB CNI image version. - * - secondaryNetwork.multus.deploy - - bool - - `true` - - Deploy Multus Secondary Network. - * - secondaryNetwork.multus.image - - string - - `"multus-cni"` - - Multus image name. - * - secondaryNetwork.multus.repository - - string - - `"ghcr.io/k8snetworkplumbingwg"` - - Multus image repository. - * - secondaryNetwork.multus.version - - string - - `"v4.1.0"` - - Multus image version. 
- -============================ -NVIDIA NIC Feature Discovery -============================ - -`NVIDIA NIC Feature Discovery `_ leverages `Node Feature Discovery `_ to advertise NIC specific labels on K8s Node objects. - -.. list-table:: - :header-rows: 1 - - * - Name - - Type - - Default - - Description - * - nicFeatureDiscovery.deploy - - bool - - `false` - - Deploy NVIDIA NIC Feature Discovery. - * - nicFeatureDiscovery.image - - string - - `"nic-feature-discovery"` - - NVIDIA NIC Feature Discovery image name. - * - nicFeatureDiscovery.repository - - string - - `"ghcr.io/mellanox"` - - NVIDIA NIC Feature Discovery repository. - * - nicFeatureDiscovery.version - - string - - `"v0.0.1"` - - NVIDIA NIC Feature Discovery image version. - -====================== -DOCA Telemetry Service -====================== -`DOCA Telemetry Service `_ exports metrics from NVIDIA NICs on K8s Nodes. - -.. list-table:: - :header-rows: 1 - - * - Name - - Type - - Default - - Description - * - docaTelemetryService.deploy - - bool - - `false` - - Deploy DOCA Telemetry Service. - * - docaTelemetryService.image - - string - - `"doca_telemetry"` - - DOCA Telemetry Service image name. - * - docaTelemetryService.repository - - string - - `"nvcr.io/nvidia/doca"` - - DOCA Telemetry Service image repository. - * - docaTelemetryService.version - - string - - `"1.16.5-doca2.6.0-host"` - - DOCA Telemetry Service image version. ======================= Helm customization file ======================= .. warning:: - Since several parameters should be provided when creating custom resources during operator deployment, it is recommended to use a configuration file. While it is possible to override the parameters via CLI, we recommend to avoid the use of CLI arguments in favor of a configuration file. + It is recommended to use a configuration file. While it is possible to override the parameters via CLI, we recommend to avoid the use of CLI arguments in favor of a configuration file. .. 
code-block:: bash diff --git a/templates/helm.rst.gotmpl b/templates/helm.rst.gotmpl index ccc7a5d..535d6ef 100644 --- a/templates/helm.rst.gotmpl +++ b/templates/helm.rst.gotmpl @@ -172,376 +172,12 @@ NIC Configuration Operator Helm chart customization options can be found `here < {{ template "chart.nicConfigurationValuesTable" . }} -=================== -Container Resources -=================== - -Optional `requests and limits `_ can be configured for each container of the sub-resources deployed by the Network Operator by setting the parameter ``containerResources``. - -For example: - -.. code-block:: yaml - - containerResources: - - name: "mofed-container" - requests: - cpu: "200m" - memory: "150Mi" - limits: - cpu: "300m" - memory: "300Mi" - -=================== - NVIDIA DOCA Driver -=================== - -.. list-table:: - :header-rows: 1 - - {{- define "chart.ofedValuesTable" }} - * - Name - - Type - - Default - - Description - {{- range .Values }} - {{- if hasPrefix "ofedDriver" .Key}} - * - {{ .Key }} - - {{ .Type }} - - {{ if eq .Type "yaml" }}.. code-block:: yaml - -{{ .Default | indent 10}}{{ else }}{{ .Default }}{{ end }} - - {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} - {{- end }} - {{- end }} - {{- end }} - -{{ template "chart.ofedValuesTable" . }} - -=============================================== -NVIDIA DOCA Driver Driver Environment Variables -=============================================== - -The following are special environment variables supported by the NVIDIA DOCA Driver container to configure its behavior: - -.. list-table:: - :header-rows: 1 - - * - Name - - Default - - Description - * - CREATE_IFNAMES_UDEV - - | * "true” for Ubuntu 20.04, RHEL v8.x and OCP <= v4.13. - | * "false" for newer OS. 
- - Create an udev rule to preserve "old-style" path based netdev names e.g enp3s0f0 - * - UNLOAD_STORAGE_MODULES - - "false" - - | Unload host storage modules prior to loading NVIDIA DOCA Driver modules: - | * ib_isert - | * nvme_rdma - | * nvmet_rdma - | * rpcrdma - | * xprtrdma - | * ib_srpt - * - ENABLE_NFSRDMA - - "false" - - Enable loading of NFS & NVME related storage modules from a NVIDIA DOCA Driver container - * - RESTORE_DRIVER_ON_POD_TERMINATION - - "true" - - Restore host drivers when a container - -In addition, it is possible to specify any environment variables to be exposed to the NVIDIA DOCA Driver container, such as the standard "HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY". - -.. warning:: - CREATE_IFNAMES_UDEV is set automatically by the Network Operator, depending on the Operating System of the worker nodes in the cluster (the cluster is assumed to be homogenous). - -.. warning:: - When ENABLE_NFSRDMA is set to `true`, it is not possible to load NVME related storage modules from NVIDIA DOCA Driver container when they are in use by the system - (e.g the system has NVMe SSD drives in use). User should ensure the modules are not in use and blacklist them prior to the use of NVIDIA DOCA Driver container. - -To set these variables, change them into Helm values. For example: - -.. code-block:: yaml - - ofedDriver: - env: - - name: RESTORE_DRIVER_ON_POD_TERMINATION - value: "true" - - name: UNLOAD_STORAGE_MODULES - value: "true" - - name: CREATE_IFNAMES_UDEV - value: "true" - -The variables can also be configured directly via the NicClusterPolicy CRD. - -========================= -RDMA Shared Device Plugin -========================= - -.. list-table:: - :header-rows: 1 - - {{- define "chart.rdmaDpValuesTable" }} - * - Name - - Type - - Default - - Description - {{- range .Values }} - {{- if hasPrefix "rdmaSharedDevicePlugin" .Key}} - * - {{ .Key }} - - {{ .Type }} - - {{ if eq .Type "yaml" }}.. 
code-block:: yaml - -{{ .Default | indent 10}}{{ else }}{{ .Default }}{{ end }} - - {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} - {{- end }} - {{- end }} - {{- end }} - -{{ template "chart.rdmaDpValuesTable" . }} - -========================================== -RDMA Device Plugin Resource Configurations -========================================== - -These configurations consist of a list of RDMA resources, each with a name and a selector of RDMA capable network devices to be associated with the resource. Refer to `RDMA Shared Device Plugin Selectors `_ for supported selectors. - -.. code-block:: yaml - - resources: - - name: rdma_shared_device_a - vendors: [15b3] - deviceIDs: [1017] - ifNames: [enp5s0f0] - rdmaHcaMax: 63 - - name: rdma_shared_device_b - vendors: [15b3] - deviceIDs: [1017] - ifNames: [ib0, ib1] - rdmaHcaMax: 63 - -============================ -SR-IOV Network Device Plugin -============================ - -.. list-table:: - :header-rows: 1 - - {{- define "chart.sriovDpValuesTable" }} - * - Name - - Type - - Default - - Description - {{- range .Values }} - {{- if hasPrefix "sriovDevicePlugin" .Key}} - * - {{ .Key }} - - {{ .Type }} - - {{ if eq .Type "yaml" }}.. code-block:: yaml - -{{ .Default | indent 10}}{{ else }}{{ .Default }}{{ end }} - - {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} - {{- end }} - {{- end }} - {{- end }} - -{{ template "chart.sriovDpValuesTable" . }} - -=================================================== -SR-IOV Network Device Plugin Resource Configuration -=================================================== - -Consists of a list of RDMA resources, each with a name and a selector of RDMA capable network devices to be associated with the resource. Refer to `SR-IOV Network Device Plugin Selectors `_ for supported selectors. - -.. 
code-block:: yaml - - resources: - - name: hostdev - vendors: [15b3] - - name: ethernet_rdma - vendors: [15b3] - linkTypes: [ether] - - name: sriov_rdma - vendors: [15b3] - devices: [1018] - drivers: [mlx5_ib] - -============= -IB Kubernetes -============= - -ib-kubernetes provides a daemon that works in conjunction with the `SR-IOV Network Device Plugin `_. It acts on Kubernetes pod object changes (Create/Update/Delete), reading the pod's network annotation, fetching its corresponding network CRD and reading the PKey. This is done in order to add the newly generated GUID or the predefined GUID in the GUID field of the CRD cni-args to that PKey for pods with ``mellanox.infiniband.app`` annotation. - -.. list-table:: - :header-rows: 1 - - {{- define "chart.ibValuesTable" }} - * - Name - - Type - - Default - - Description - {{- range .Values }} - {{- if hasPrefix "ibKubernetes" .Key}} - * - {{ .Key }} - - {{ .Type }} - - {{ if eq .Type "yaml" }}.. code-block:: yaml - -{{ .Default | indent 10}}{{ else }}{{ .Default }}{{ end }} - - {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} - {{- end }} - {{- end }} - {{- end }} - -{{ template "chart.ibValuesTable" . }} - -========== -UFM Secret -========== - -IB Kubernetes must access `NVIDIA UFM `_ in order to manage pods' GUIDs. To provide its credentials, the secret of the following format should be deployed in advance: - -.. code-block:: yaml - - apiVersion: v1 - kind: Secret - metadata: - name: ib-kubernetes-ufm-secret - namespace: nvidia-network-operator - stringData: - UFM_USERNAME: "admin" - UFM_PASSWORD: "123456" - UFM_ADDRESS: "ufm-hostname" - UFM_HTTP_SCHEMA: "" - UFM_PORT: "" - data: - UFM_CERTIFICATE: "" - -.. warning:: - The InfiniBand Fabric manages a single pool of GUIDs. In order to use IB Kubernetes in different clusters, different GUID ranges must be specified to avoid collisions. 
- -================== -NVIDIA IPAM Plugin -================== - -`NVIDIA IPAM Plugin `_ is recommended to be used on large-scale deployments of the NVIDIA Network Operator. - -.. list-table:: - :header-rows: 1 - - {{- define "chart.nvipamValuesTable" }} - * - Name - - Type - - Default - - Description - {{- range .Values }} - {{- if hasPrefix "nvIpam" .Key}} - * - {{ .Key }} - - {{ .Type }} - - {{ if eq .Type "yaml" }}.. code-block:: yaml - -{{ .Default | indent 10}}{{ else }}{{ .Default }}{{ end }} - - {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} - {{- end }} - {{- end }} - {{- end }} - -{{ template "chart.nvipamValuesTable" . }} - -.. warning:: - Supported X.509 certificate management system should be available in the cluster to enable the validation webhook. Currently, the supported systems are `certmanager `_ and `Openshift certificate management `_. - -================= -Secondary Network -================= - -Specifies components to deploy in order to facilitate a secondary network in Kubernetes. It consists of the following optionally deployed components: - -* `Multus-CNI `_: Delegate CNI plugin to support secondary networks in Kubernetes -* CNI plugins: Currently only `containernetworking-plugins `_ is supported -* IPAM CNI: Currently only `Whereabout IPAM CNI `_ is supported as a part of the secondaryNetwork section. NVIDIA-IPAM is configured separately. -* `IPoIB CNI `_: Allows the user to create IPoIB child link and move it to the pod - -.. list-table:: - :header-rows: 1 - - {{- define "chart.secNetworkValuesTable" }} - * - Name - - Type - - Default - - Description - {{- range .Values }} - {{- if hasPrefix "secondaryNetwork" .Key}} - * - {{ .Key }} - - {{ .Type }} - - {{ if eq .Type "yaml" }}.. 
code-block:: yaml - -{{ .Default | indent 10}}{{ else }}{{ .Default }}{{ end }} - - {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} - {{- end }} - {{- end }} - {{- end }} - -{{ template "chart.secNetworkValuesTable" . }} - -============================ -NVIDIA NIC Feature Discovery -============================ - -`NVIDIA NIC Feature Discovery `_ leverages `Node Feature Discovery `_ to advertise NIC specific labels on K8s Node objects. - -.. list-table:: - :header-rows: 1 - - {{- define "chart.nfdValuesTable" }} - * - Name - - Type - - Default - - Description - {{- range .Values }} - {{- if hasPrefix "nicFeatureDiscovery" .Key}} - * - {{ .Key }} - - {{ .Type }} - - {{ if eq .Type "yaml" }}.. code-block:: yaml - -{{ .Default | indent 10}}{{ else }}{{ .Default }}{{ end }} - - {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} - {{- end }} - {{- end }} - {{- end }} - -{{ template "chart.nfdValuesTable" . }} - -====================== -DOCA Telemetry Service -====================== -`DOCA Telemetry Service `_ exports metrics from NVIDIA NICs on K8s Nodes. - -.. list-table:: - :header-rows: 1 - - {{- define "chart.docaValuesTable" }} - * - Name - - Type - - Default - - Description - {{- range .Values }} - {{- if hasPrefix "docaTelemetryService" .Key}} - * - {{ .Key }} - - {{ .Type }} - - {{ if eq .Type "yaml" }}.. code-block:: yaml - -{{ .Default | indent 10}}{{ else }}{{ .Default }}{{ end }} - - {{ if .Description }}{{ .Description }}{{ else }}{{ .AutoDescription }}{{ end }} - {{- end }} - {{- end }} - {{- end }} - -{{ template "chart.docaValuesTable" . }} - ======================= Helm customization file ======================= .. warning:: - Since several parameters should be provided when creating custom resources during operator deployment, it is recommended to use a configuration file. 
While it is possible to override the parameters via CLI, we recommend to avoid the use of CLI arguments in favor of a configuration file. + It is recommended to use a configuration file. While it is possible to override the parameters via CLI, we recommend to avoid the use of CLI arguments in favor of a configuration file. .. code-block:: bash From afb491d7dcc0873cd915c0a1901dc49face14895 Mon Sep 17 00:00:00 2001 From: killianmuldoon Date: Thu, 24 Oct 2024 12:53:19 +0100 Subject: [PATCH 2/3] Add additional information to the advanced configuration Signed-off-by: killianmuldoon --- docs/advanced-configurations.rst | 89 ++++++++++++++++++++++++++++- docs/getting-started-kubernetes.rst | 5 ++ 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/docs/advanced-configurations.rst b/docs/advanced-configurations.rst index 5608971..099a0bf 100644 --- a/docs/advanced-configurations.rst +++ b/docs/advanced-configurations.rst @@ -477,4 +477,91 @@ To build Ubuntu-based image please use provided `Ubuntu Dockerfile `_ can be configured for each component of the sub-resources deployed by the Network Operator by setting the parameter ``containerResources``. + +For example, for the SR-IOV Device Plugin: + +.. code-block:: yaml + kind: NicClusterPolicy + metadata: + name: nic-cluster-policy + spec: + sriovDevicePlugin: + containerResources: + - name: "mofed-container" + requests: + cpu: "200m" + memory: "150Mi" + limits: + cpu: "300m" + memory: "300Mi" + + +=============================================== +NVIDIA DOCA Driver Driver Environment Variables +=============================================== + +The following are special environment variables supported by the NVIDIA DOCA Driver container to configure its behavior: + +.. list-table:: + :header-rows: 1 + + * - Name + - Default + - Description + * - CREATE_IFNAMES_UDEV + - | * "true" for Ubuntu 20.04, RHEL v8.x and OCP <= v4.13. + | * "false" for newer OS.
+ - Create a udev rule to preserve "old-style" path based netdev names e.g. enp3s0f0 + * - UNLOAD_STORAGE_MODULES + - "false" + - | Unload host storage modules prior to loading NVIDIA DOCA Driver modules: + | * ib_isert + | * nvme_rdma + | * nvmet_rdma + | * rpcrdma + | * xprtrdma + | * ib_srpt + * - ENABLE_NFSRDMA + - "false" + - Enable loading of NFS & NVME related storage modules from an NVIDIA DOCA Driver container + * - RESTORE_DRIVER_ON_POD_TERMINATION + - "true" + - Restore host drivers when a container is gracefully stopped + +In addition, it is possible to specify any environment variables to be exposed to the NVIDIA DOCA Driver container, such as the standard "HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY". + +.. warning:: + CREATE_IFNAMES_UDEV is set automatically by the Network Operator, depending on the Operating System of the worker nodes in the cluster (the cluster is assumed to be homogeneous). + +.. warning:: + When ENABLE_NFSRDMA is set to `true`, it is not possible to load NVME related storage modules from the NVIDIA DOCA Driver container when they are in use by the system + (e.g. the system has NVMe SSD drives in use). Users should ensure the modules are not in use and blacklist them prior to the use of the NVIDIA DOCA Driver container. + +These variables can be set in the NicClusterPolicy. For example: + +..
code-block:: yaml + kind: NicClusterPolicy + metadata: + name: nic-cluster-policy + spec: + ofedDriver: + env: + - name: RESTORE_DRIVER_ON_POD_TERMINATION + value: "true" + - name: UNLOAD_STORAGE_MODULES + value: "true" + - name: CREATE_IFNAMES_UDEV + value: "true" + +=============================================== +Container Device Interface +=============================================== + diff --git a/docs/getting-started-kubernetes.rst b/docs/getting-started-kubernetes.rst index dbee5db..264b6e9 100644 --- a/docs/getting-started-kubernetes.rst +++ b/docs/getting-started-kubernetes.rst @@ -1537,13 +1537,18 @@ Network Operator deployment with InfiniBand network requires the following: * InfiniBand device – Both the host device and the switch ports must be enabled in InfiniBand mode. * rdma-core package should be installed when an inbox driver is used. + Current limitations: * Only a single PKey can be configured per workload pod. * When a single instance of NVIDIA UFM is used with several K8s clusters, different PKey GUID pools should be configured for each cluster. +.. note:: ib-kubernetes provides a daemon that works in conjunction with the `SR-IOV Network Device Plugin `_. It acts on Kubernetes pod object changes (Create/Update/Delete), reading the pod's network annotation, fetching its corresponding network CRD and reading the PKey. This is done in order to add the newly generated GUID or the predefined GUID in the GUID field of the CRD cni-args to that PKey for pods with ``mellanox.infiniband.app`` annotation. + .. warning:: `ib-kubernetes-ufm-secret` should be created before NicClusterPolicy. +IB Kubernetes must access `NVIDIA UFM `_ in order to manage pods' GUIDs. To provide its credentials, the secret of the following format should be deployed in advance: + ``ufm-secret.yaml`` .. 
code-block:: yaml From eebe4fc7c3fb641f8afedb0b329f4b0180cdcc9c Mon Sep 17 00:00:00 2001 From: killianmuldoon Date: Thu, 31 Oct 2024 08:11:19 +0000 Subject: [PATCH 3/3] Fix review comments Signed-off-by: killianmuldoon --- docs/advanced-configurations.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/advanced-configurations.rst b/docs/advanced-configurations.rst index 099a0bf..2838022 100644 --- a/docs/advanced-configurations.rst +++ b/docs/advanced-configurations.rst @@ -489,13 +489,14 @@ Optional `requests and limits