GPU硬件直通
标签搜索
侧边栏壁纸
  • 累计撰写 16 篇文章
  • 累计收到 0 条评论

GPU硬件直通

ranyuan
2024-09-02 / 0 评论 / 189 阅读 / 正在检测是否收录...

显卡直通


第一:主板BIOS需启用 IOMMU / VT-d、Above 4G Decoding、SR-IOV。
--------------------------------------------------

● IOMMU:是一种地址映射技术,而 VT-d 是 Intel 对该技术的别称
● Above 4G Decoding:关系到 PCI-E 设备 RAM 的 64 位寻址能力,
通常用于需要让 CPU 访问全部显存的场景,使用 vGPU 时推荐开启
● SR-IOV:允许一个 PCI-E 设备被多个虚拟机使用,常用于网卡等设备共享。

第二:确认CPU硬件是否支持虚拟化、关闭selinux
---------------------------

# 核查是否支持cpu虚拟化
egrep -o '(vmx|svm)' /proc/cpuinfo
# 关闭selinux
sed -i 's/SELINUX=enforcing/SELINUX=disabled/g' /etc/selinux/config

第三:CPU启用iommu
-------------

# intel_iommu=on iommu=pt
# Intel添加: rd.driver.pre=vfio-pci intel_iommu=on video=efifb:off,vesafb:off
# AMD添加:   rd.driver.pre=vfio-pci amd_iommu=on video=efifb:off,vesafb:off
[root@sv-gpu-node-001 ~]# cat /etc/default/grub
GRUB_TIMEOUT=5
GRUB_DISTRIBUTOR="$(sed 's, release .*$,,g' /etc/system-release)"
GRUB_DEFAULT=saved
GRUB_DISABLE_SUBMENU=true
GRUB_TERMINAL_OUTPUT="console"
GRUB_CMDLINE_LINUX="crashkernel=auto resume=/dev/mapper/cs-swap 
rd.lvm.lv=cs00/root rd.lvm.lv=cs/swap rhgb quiet 
rd.driver.pre=vfio-pci intel_iommu=on video=efifb:off,vesafb:off"
GRUB_DISABLE_RECOVERY="true"
GRUB_ENABLE_BLSCFG=true

第四:更新grub
---------

sudo grub2-mkconfig -o /boot/grub2/grub.cfg

#    参数说明:
#        vfio-pci 显卡直通虚拟化需要的驱动
#        intel_iommu=on / amd_iommu=on 开启IOMMU,设备按IOMMU分组后才能直通
#        efifb:off 禁用efi启动的显示设备
#        vesafb:off 禁用legacy启动的显示设备

第五:加载显卡直通所需的驱动模块
----------------
cat > /etc/modules-load.d/vfio.conf << EOF
vfio
vfio_iommu_type1
vfio_pci
vfio_virqfd
EOF

第六:禁用原本的英伟达显卡驱动和开源驱动nouveau,避免直通出错
----------------------------------
cat > /etc/modprobe.d/blacklist.conf << EOF
blacklist nouveau
blacklist nvidia
options nouveau modeset=0
EOF

第七:重建initramfs并重启
-----
# 先备份
    mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r)-nouveau.img
# 重建
    dracut /boot/initramfs-$(uname -r).img $(uname -r)
# 重启
    reboot

第八:验证
-----
# 验证IOMMU是否开启
[root@sv-gpu-node-001 ~]# dmesg | grep -e DMAR -e IOMMU
[    0.000000] ACPI: DMAR 0x000000007DF6D650 000160 (v01 A M I  OEMDMAR  00000001 INTL 00000001)
[    0.000000] ACPI: Reserving DMAR table memory at [mem 0x7df6d650-0x7df6d7af]
[    0.000000] DMAR: IOMMU enabled

# 这是未屏蔽开源显卡驱动的情况
[root@localhost ~]# lsmod | grep nouveau
nouveau              2355200  4
video                  53248  1 nouveau
mxm_wmi                16384  1 nouveau
wmi                    32768  2 mxm_wmi,nouveau
drm_display_helper    151552  1 nouveau
i2c_algo_bit           16384  2 ast,nouveau
drm_kms_helper        167936  5 drm_vram_helper,ast,drm_display_helper,nouveau
drm_ttm_helper         16384  3 drm_vram_helper,ast,nouveau
ttm                    81920  3 drm_vram_helper,drm_ttm_helper,nouveau
drm                   577536  13 drm_kms_helper,drm_vram_helper,ast,drm_display_helper,drm_ttm_helper,ttm,nouveau

# 这是已经成功屏蔽开源显卡驱动的情况(这是期待的结果-表示成功)
[root@localhost ~]# lsmod | grep nouveau
[root@localhost ~]# 

# 如下分别是显卡 未直通给虚拟机(仍由宿主机nvidia驱动占用) 和 已直通给虚拟机(由vfio-pci接管) 的情况
[root@localhost ~]# lspci -v -s  85:00.0
85:00.0 VGA compatible controller: NVIDIA Corporation GM107GL [Tesla M10] (rev a2) (prog-if 00 [VGA controller])
    Subsystem: NVIDIA Corporation Tesla M10
    Flags: bus master, fast devsel, latency 0, IRQ 304, NUMA node 1, IOMMU group 78
    Memory at f8000000 (32-bit, non-prefetchable) [size=16M]
    Memory at 39ffa0000000 (64-bit, prefetchable) [size=256M]
    Memory at 39ffb0000000 (64-bit, prefetchable) [size=32M]
    I/O ports at c000 [size=128]
    Capabilities: [60] Power Management version 3
    Capabilities: [68] MSI: Enable+ Count=1/1 Maskable- 64bit+
    Capabilities: [78] Express Endpoint, MSI 00
    Capabilities: [100] Virtual Channel
    Capabilities: [258] L1 PM Substates
    Capabilities: [128] Power Budgeting <?>
    Capabilities: [420] Advanced Error Reporting
    Capabilities: [600] Vendor Specific Information: ID=0001 Rev=1 Len=024 <?>
    Capabilities: [900] Secondary PCI Express
    Kernel driver in use: nvidia    # 这是还未直通给虚拟机用时
    Kernel modules: nouveau, nvidia_drm, nvidia

[root@localhost ~]# lspci -v -s  85:00.0 
85:00.0 VGA compatible controller: NVIDIA Corporation GM107GL [Tesla M10] (rev a2) (prog-if 00 [VGA controller])
    Subsystem: NVIDIA Corporation Tesla M10
    Flags: fast devsel, IRQ 38, NUMA node 1, IOMMU group 76
    Memory at fa000000 (32-bit, non-prefetchable) [size=16M]
    Memory at 39ffe0000000 (64-bit, prefetchable) [size=256M]
    Memory at 39fff0000000 (64-bit, prefetchable) [size=32M]
    I/O ports at e000 [size=128]
    Capabilities: [60] Power Management version 3
    Capabilities: [68] MSI: Enable- Count=1/1 Maskable- 64bit+
    Capabilities: [78] Express Endpoint, MSI 00
    Capabilities: [100] Virtual Channel
    Capabilities: [250] Latency Tolerance Reporting
    Capabilities: [258] L1 PM Substates
    Capabilities: [128] Power Budgeting <?>
    Capabilities: [420] Advanced Error Reporting
    Capabilities: [600] Vendor Specific Information: ID=0001 Rev=1 Len=024 <?>
    Capabilities: [900] Secondary PCI Express
    Kernel driver in use: vfio-pci    # 这是已经直通给虚拟机用后
    Kernel modules: nouveau, nvidia_drm, nvidia
##将pcie组分配给虚拟机
[root@localhost ~]# lspci | grep NVIDIA
03:00.0 VGA compatible controller: NVIDIA Corporation GA102 [GeForce RTX 3080] (rev a1)
03:00.1 Audio device: NVIDIA Corporation GA102 High Definition Audio Controller (rev a1)
0

评论 (0)

取消