From 7e432852787e17678e8a33be51f7cb783d582e03 Mon Sep 17 00:00:00 2001 From: UGA Innovation Factory Date: Mon, 16 Feb 2026 15:51:19 -0500 Subject: [PATCH] feat: Add nvidia compatibility option for LXC devices --- docs/INVENTORY.md | 28 ++++++++++++++ docs/NAMESPACE.md | 64 +++++++++++++++++++++++++++++++ hw/nix-lxc.nix | 98 +++++++++++++++++++++++++++++++++++++++++++++++ inventory.nix | 19 +++++++++ users.nix | 10 +++++ 5 files changed, 219 insertions(+) diff --git a/docs/INVENTORY.md b/docs/INVENTORY.md index 20eaacd..ff3663c 100644 --- a/docs/INVENTORY.md +++ b/docs/INVENTORY.md @@ -343,6 +343,34 @@ nix-lxc = { }; ``` +### Proxmox LXC with NVIDIA (Dual P40 Example) + +```nix +nix-lxc = { + devices = { + "gpu-builder" = { + athenix.host.useHostPrefix = false; + athenix.sw.type = "headless"; + + # Optional NVIDIA containerization support + athenix.hw.nix-lxc.cuda = { + enable = true; + # Expose both GPUs (index-based) + visibleDevices = [ "0" "1" ]; + # Select a compatible driver branch for older cards when needed + driver.channel = "legacy_470"; + driverCapabilities = [ "compute" "utility" ]; + }; + }; + }; +}; +``` + +Notes: +- The Proxmox host must provide `/dev/nvidia*` devices to the container; the container only carries the user-space driver, so choose a driver channel (or `driver.package`) whose version matches the NVIDIA kernel driver running on the host. +- Use `driver.channel = "dc_550"` (default) for newer datacenter cards when supported. +- Use `driver.package` only when you need an explicit package override.
+ ### WSL Instances ```nix diff --git a/docs/NAMESPACE.md b/docs/NAMESPACE.md index 6c55c7d..1b91003 100644 --- a/docs/NAMESPACE.md +++ b/docs/NAMESPACE.md @@ -5,6 +5,7 @@ All UGA Innovation Factory-specific options are in the `athenix` namespace to av ## Table of Contents - [Host Configuration (`athenix.host`)](#host-configuration-athenixhost) +- [Hardware Type Configuration (`athenix.hw`)](#hardware-type-configuration-athenixhw) - [Software Configuration (`athenix.sw`)](#software-configuration-athenixsw) - [User Management (`athenix.users`)](#user-management-athenixusers) - [Convenience Options](#convenience-options) @@ -90,6 +91,69 @@ Default WSL user account (only for `nix-wsl` type). athenix.host.wsl.user = "myusername"; ``` +## Hardware Type Configuration (`athenix.hw`) + +Hardware-type-specific options. These are usually set in per-device config or fleet overrides. + +### `athenix.hw.nix-lxc.cuda.enable` + +Enable NVIDIA CUDA support inside Proxmox LXC containers (requires an NVIDIA GPU and drivers on the Proxmox host). + +**Type:** Boolean + +**Default:** `false` + +### `athenix.hw.nix-lxc.cuda.visibleDevices` + +Selects which NVIDIA GPUs are exposed to containerized workloads; the list is comma-joined into `NVIDIA_VISIBLE_DEVICES`. + +**Type:** List of strings + +**Default:** `[ "all" ]` + +**Examples:** +- `[ "all" ]` +- `[ "0" "1" ]` (for dual-GPU systems) +- `[ "GPU-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" ]` + +### `athenix.hw.nix-lxc.cuda.driverCapabilities` + +Sets `NVIDIA_DRIVER_CAPABILITIES` for container runtimes. + +**Type:** List of strings + +**Default:** `[ "compute" "utility" ]` + +**Example:** +```nix +athenix.hw.nix-lxc.cuda.driverCapabilities = [ "compute" "utility" "video" ]; +``` + +### `athenix.hw.nix-lxc.cuda.driver.channel` + +Driver package channel selected from `boot.kernelPackages.nvidiaPackages`.
+ +**Type:** String + +**Default:** `"dc_550"` + +**Common values:** `"stable"`, `"latest"`, `"beta"`, `"dc_550"`, `"legacy_470"`, `"legacy_390"` + +**Aliases:** +- `"production"` → `"stable"` +- `"datacenter"` → `"dc_550"` + +### `athenix.hw.nix-lxc.cuda.driver.package` + +Explicit package override for NVIDIA driver selection. + +**Type:** Package or null + +**Default:** `null` + +**Description:** +When set, this takes precedence over `athenix.hw.nix-lxc.cuda.driver.channel`. + ## Software Configuration (`athenix.sw`) System type, packages, and application configuration. diff --git a/hw/nix-lxc.nix b/hw/nix-lxc.nix index 3dcc45e..1ece1d0 100644 --- a/hw/nix-lxc.nix +++ b/hw/nix-lxc.nix @@ -7,6 +7,7 @@ { config, lib, + pkgs, ... }: @@ -24,6 +25,56 @@ in default = false; description = "Enable Proxmox LXC container hardware configuration."; }; + cuda.enable = mkOption { + type = types.bool; + default = false; + description = "Enable CUDA support in LXC containers (requires NVIDIA GPU and drivers on host)."; + }; + cuda.visibleDevices = mkOption { + type = types.listOf types.str; + default = [ "all" ]; + example = [ + "0" + "1" + ]; + description = '' + NVIDIA devices exposed to containerized workloads. + Use indexes (e.g. "0", "1"), UUIDs, or "all". + ''; + }; + cuda.driverCapabilities = mkOption { + type = types.listOf types.str; + default = [ + "compute" + "utility" + ]; + example = [ + "compute" + "utility" + "video" + ]; + description = '' + Value used for NVIDIA_DRIVER_CAPABILITIES for container runtimes. + ''; + }; + cuda.driver.channel = mkOption { + type = types.str; + default = "dc_550"; + example = "legacy_470"; + description = '' + NVIDIA driver package channel from boot.kernelPackages.nvidiaPackages. + Common values include: stable, latest, beta, dc_550, legacy_470, legacy_390. + Alias values: production -> stable, datacenter -> dc_550. 
+ ''; + }; + cuda.driver.package = mkOption { + type = types.nullOr types.package; + default = null; + description = '' + Explicit NVIDIA driver package override. When set, this takes precedence + over cuda.driver.channel. + ''; + }; }; }; default = { }; @@ -50,6 +101,53 @@ in # Set timezone to fix /etc/localtime for Docker containers time.timeZone = lib.mkDefault "America/New_York"; + # NVIDIA Container Toolkit for CUDA support (optional) + hardware.nvidia = lib.mkIf cfg.cuda.enable { + package = + let + nvidiaPackages = config.boot.kernelPackages.nvidiaPackages; + driverAliasMap = { + production = "stable"; + datacenter = "dc_550"; + }; + driverChannel = driverAliasMap.${cfg.cuda.driver.channel} or cfg.cuda.driver.channel; + in + if cfg.cuda.driver.package != null then + cfg.cuda.driver.package + else if builtins.hasAttr driverChannel nvidiaPackages then + builtins.getAttr driverChannel nvidiaPackages + else + throw "athenix.hw.nix-lxc.cuda.driver.channel '${driverChannel}' not found in boot.kernelPackages.nvidiaPackages"; + }; + hardware.nvidia-container-toolkit.enable = lib.mkIf cfg.cuda.enable true; + hardware.nvidia-container-toolkit.suppressNvidiaDriverAssertion = lib.mkIf cfg.cuda.enable true; + environment.systemPackages = lib.mkIf cfg.cuda.enable [ + (pkgs.writeShellScriptBin "nvidia-smi" '' + primary="${config.hardware.nvidia.package}/bin/nvidia-smi" + secondary="${ + if builtins.hasAttr "bin" config.hardware.nvidia.package then + config.hardware.nvidia.package.bin + else + config.hardware.nvidia.package + }/bin/nvidia-smi" + + if [ -x "$primary" ]; then + exec "$primary" "$@" + fi + + if [ -x "$secondary" ]; then + exec "$secondary" "$@" + fi + + echo "nvidia-smi binary not found in configured NVIDIA package: ${config.hardware.nvidia.package}" >&2 + exit 127 + '') + ]; + environment.variables = lib.mkIf cfg.cuda.enable { + NVIDIA_VISIBLE_DEVICES = lib.concatStringsSep "," cfg.cuda.visibleDevices; + NVIDIA_DRIVER_CAPABILITIES = lib.concatStringsSep "," 
cfg.cuda.driverCapabilities; + }; + # Allow getty to work in containers systemd.services."getty@".unitConfig.ConditionPathExists = [ "" diff --git a/inventory.nix b/inventory.nix index 16cb492..954afbc 100644 --- a/inventory.nix +++ b/inventory.nix @@ -127,6 +127,25 @@ }; }; }; + "nix-big-cuda" = { + athenix.sw.headless.enable = true; + nixpkgs.config = { + allowUnfree = true; + nvidia.acceptLicense = true; + }; + athenix.hw.nix-lxc.cuda = { + enable = true; + visibleDevices = [ + "0" + "1" + ]; + driver.channel = "legacy_535"; + driverCapabilities = [ + "compute" + "utility" + ]; + }; + }; "usda-dash".external = { url = "https://git.factory.uga.edu/MODEL/usda-dash-config.git"; rev = "ce2700b0196e106f7c013bbcee851a5f96b146a3"; diff --git a/users.nix b/users.nix index 32aef53..0343543 100644 --- a/users.nix +++ b/users.nix @@ -48,6 +48,7 @@ enable = true; # Default user, enabled everywhere }; hdh20267 = { + description = "Hunter Halloran"; external = { url = "https://git.factory.uga.edu/hdh20267/hdh20267-nix"; rev = "dbdf65c7bd59e646719f724a3acd2330e0c922ec"; @@ -67,5 +68,14 @@ shell = "zsh"; # enable = false by default, set to true per-system }; + dj69594 = { + description = "David Joy"; + extraGroups = [ + "networkmanager" + "wheel" + ]; + shell = "zsh"; + # enable = false by default, set to true per-system + }; }; }