1029 lines
38 KiB
Diff
1029 lines
38 KiB
Diff
From 5dc7b745eb04e799b95e7e8d17868970a65621df Mon Sep 17 00:00:00 2001
|
|
From: David Gibson <dgibson@redhat.com>
|
|
Date: Thu, 30 May 2019 04:37:28 +0100
|
|
Subject: [PATCH 7/8] spapr: Support NVIDIA V100 GPU with NVLink2
|
|
|
|
RH-Author: David Gibson <dgibson@redhat.com>
|
|
Message-id: <20190530043728.32575-7-dgibson@redhat.com>
|
|
Patchwork-id: 88423
|
|
O-Subject: [RHEL-8.1 qemu-kvm PATCH 6/6] spapr: Support NVIDIA V100 GPU with NVLink2
|
|
Bugzilla: 1710662
|
|
RH-Acked-by: Laurent Vivier <lvivier@redhat.com>
|
|
RH-Acked-by: Auger Eric <eric.auger@redhat.com>
|
|
RH-Acked-by: Cornelia Huck <cohuck@redhat.com>
|
|
|
|
From: Alexey Kardashevskiy <aik@ozlabs.ru>
|
|
|
|
NVIDIA V100 GPUs have on-board RAM which is mapped into the host memory
|
|
space and accessible as normal RAM via an NVLink bus. The VFIO-PCI driver
|
|
implements special regions for such GPUs and emulates an NVLink bridge.
|
|
NVLink2-enabled POWER9 CPUs also provide address translation services
|
|
which include an ATS shootdown (ATSD) register exported via the NVLink
|
|
bridge device.
|
|
|
|
This adds a quirk to VFIO to map the GPU memory and create an MR;
|
|
the new MR is stored in a PCI device as a QOM link. The sPAPR PCI uses
|
|
this to get the MR and map it to the system address space.
|
|
Another quirk does the same for ATSD.
|
|
|
|
This adds additional steps to sPAPR PHB setup:
|
|
|
|
1. Search for specific GPUs and NPUs, collect findings in
|
|
sPAPRPHBState::nvgpus, manage system address space mappings;
|
|
|
|
2. Add device-specific properties such as "ibm,npu", "ibm,gpu",
|
|
"memory-block", "link-speed" to advertise the NVLink2 function to
|
|
the guest;
|
|
|
|
3. Add "mmio-atsd" to vPHB to advertise the ATSD capability;
|
|
|
|
4. Add new memory blocks (with extra "linux,usable-memory" to prevent
|
|
the guest OS from accessing the new memory until it is onlined) and
|
|
npuphb# nodes representing an NPU unit for every vPHB as the GPU driver
|
|
uses it for link discovery.
|
|
|
|
This allocates space for GPU RAM and ATSD like we do for MMIOs by
|
|
adding 2 new parameters to the phb_placement() hook. Older machine types
|
|
set these to zero.
|
|
|
|
This puts new memory nodes in a separate NUMA node as the GPU RAM
|
|
needs to be configured equally distant from any other node in the system.
|
|
Unlike the host setup which assigns numa ids from 255 downwards, this
|
|
adds new NUMA nodes after the user configures nodes or from 1 if none
|
|
were configured.
|
|
|
|
This adds a requirement similar to EEH - one IOMMU group per vPHB.
|
|
The reason for this is that ATSD registers belong to a physical NPU
|
|
so they cannot invalidate translations on GPUs attached to another NPU.
|
|
It is guaranteed by the host platform as it does not mix NVLink bridges
|
|
or GPUs from different NPUs in the same IOMMU group. If more than one
|
|
IOMMU group is detected on a vPHB, this disables ATSD support for that
|
|
vPHB and prints a warning.
|
|
|
|
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
|
|
[aw: for vfio portions]
|
|
Acked-by: Alex Williamson <alex.williamson@redhat.com>
|
|
Message-Id: <20190312082103.130561-1-aik@ozlabs.ru>
|
|
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
|
|
(cherry picked from commit ec132efaa81f09861a3bd6afad94827e74543b3f)
|
|
|
|
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
|
|
|
|
Conflicts:
|
|
hw/ppc/spapr.c
|
|
hw/ppc/spapr_pci.c
|
|
hw/vfio/trace-events
|
|
include/hw/pci-host/spapr.h
|
|
include/hw/ppc/spapr.h
|
|
|
|
Conflicts come for several reasons:
|
|
1) Some contextual conflicts
|
|
2) Downstream tree does not have PHB hotplug, so upstream changes to
|
|
that code need to be dropped, we also need to adapt some hunks to
|
|
apply to the code as it existed before PHB hotplug was added
|
|
3) Upstream had a mass renaming of spapr types to give more
|
|
consistent CamelCasing. We don't have that change downstream, so
|
|
we need to adjust accordingly.
|
|
4) We add an explicit include of qemu/units.h, since it's not indirectly
|
|
included downstream (and it's messy to backport the patch which adds
|
|
that)
|
|
|
|
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1710662
|
|
|
|
Signed-off-by: David Gibson <dgibson@redhat.com>
|
|
Signed-off-by: Danilo C. L. de Paula <ddepaula@redhat.com>
|
|
---
|
|
hw/ppc/Makefile.objs | 2 +-
|
|
hw/ppc/spapr.c | 31 ++-
|
|
hw/ppc/spapr_pci.c | 21 ++-
|
|
hw/ppc/spapr_pci_nvlink2.c | 450 ++++++++++++++++++++++++++++++++++++++++++++
|
|
hw/vfio/pci-quirks.c | 131 +++++++++++++
|
|
hw/vfio/pci.c | 14 ++
|
|
hw/vfio/pci.h | 2 +
|
|
hw/vfio/trace-events | 4 +
|
|
include/hw/pci-host/spapr.h | 46 +++++
|
|
include/hw/ppc/spapr.h | 5 +-
|
|
10 files changed, 697 insertions(+), 9 deletions(-)
|
|
create mode 100644 hw/ppc/spapr_pci_nvlink2.c
|
|
|
|
diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
|
|
index a46a989..d07e999 100644
|
|
--- a/hw/ppc/Makefile.objs
|
|
+++ b/hw/ppc/Makefile.objs
|
|
@@ -8,7 +8,7 @@ obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o
|
|
# IBM PowerNV
|
|
obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o
|
|
ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
|
|
-obj-y += spapr_pci_vfio.o
|
|
+obj-y += spapr_pci_vfio.o spapr_pci_nvlink2.o
|
|
endif
|
|
obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
|
|
# PowerPC 4xx boards
|
|
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
|
|
index b57c0be..c72aad1 100644
|
|
--- a/hw/ppc/spapr.c
|
|
+++ b/hw/ppc/spapr.c
|
|
@@ -910,12 +910,13 @@ static void spapr_dt_rtas(sPAPRMachineState *spapr, void *fdt)
|
|
0, cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE),
|
|
cpu_to_be32(max_cpus / smp_threads),
|
|
};
|
|
+ uint32_t maxdomain = cpu_to_be32(spapr->gpu_numa_id > 1 ? 1 : 0);
|
|
uint32_t maxdomains[] = {
|
|
cpu_to_be32(4),
|
|
- cpu_to_be32(0),
|
|
- cpu_to_be32(0),
|
|
- cpu_to_be32(0),
|
|
- cpu_to_be32(nb_numa_nodes ? nb_numa_nodes : 1),
|
|
+ maxdomain,
|
|
+ maxdomain,
|
|
+ maxdomain,
|
|
+ cpu_to_be32(spapr->gpu_numa_id),
|
|
};
|
|
|
|
_FDT(rtas = fdt_add_subnode(fdt, 0, "rtas"));
|
|
@@ -1515,6 +1516,16 @@ static void spapr_machine_reset(void)
|
|
ppc_set_compat(first_ppc_cpu, spapr->max_compat_pvr, &error_fatal);
|
|
}
|
|
|
|
+ /*
|
|
+ * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node.
|
|
+ * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is
|
|
+ * called from vPHB reset handler so we initialize the counter here.
|
|
+ * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM
|
|
+ * must be equally distant from any other node.
|
|
+ * The final value of spapr->gpu_numa_id is going to be written to
|
|
+ * max-associativity-domains in spapr_build_fdt().
|
|
+ */
|
|
+ spapr->gpu_numa_id = MAX(1, nb_numa_nodes);
|
|
qemu_devices_reset();
|
|
|
|
/* DRC reset may cause a device to be unplugged. This will cause troubles
|
|
@@ -3601,7 +3612,8 @@ static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
|
|
static void spapr_phb_placement(sPAPRMachineState *spapr, uint32_t index,
|
|
uint64_t *buid, hwaddr *pio,
|
|
hwaddr *mmio32, hwaddr *mmio64,
|
|
- unsigned n_dma, uint32_t *liobns, Error **errp)
|
|
+ unsigned n_dma, uint32_t *liobns,
|
|
+ hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
|
|
{
|
|
/*
|
|
* New-style PHB window placement.
|
|
@@ -3648,6 +3660,9 @@ static void spapr_phb_placement(sPAPRMachineState *spapr, uint32_t index,
|
|
*pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
|
|
*mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
|
|
*mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
|
|
+
|
|
+ *nv2gpa = SPAPR_PCI_NV2RAM64_WIN_BASE + index * SPAPR_PCI_NV2RAM64_WIN_SIZE;
|
|
+ *nv2atsd = SPAPR_PCI_NV2ATSD_WIN_BASE + index * SPAPR_PCI_NV2ATSD_WIN_SIZE;
|
|
}
|
|
|
|
static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
|
|
@@ -4133,7 +4148,8 @@ DEFINE_SPAPR_MACHINE(2_8, "2.8", false);
|
|
static void phb_placement_2_7(sPAPRMachineState *spapr, uint32_t index,
|
|
uint64_t *buid, hwaddr *pio,
|
|
hwaddr *mmio32, hwaddr *mmio64,
|
|
- unsigned n_dma, uint32_t *liobns, Error **errp)
|
|
+ unsigned n_dma, uint32_t *liobns,
|
|
+ hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
|
|
{
|
|
/* Legacy PHB placement for pseries-2.7 and earlier machine types */
|
|
const uint64_t base_buid = 0x800000020000000ULL;
|
|
@@ -4177,6 +4193,9 @@ static void phb_placement_2_7(sPAPRMachineState *spapr, uint32_t index,
|
|
* fallback behaviour of automatically splitting a large "32-bit"
|
|
* window into contiguous 32-bit and 64-bit windows
|
|
*/
|
|
+
|
|
+ *nv2gpa = 0;
|
|
+ *nv2atsd = 0;
|
|
}
|
|
|
|
#if 0 /* Disabled for Red Hat Enterprise Linux */
|
|
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
|
|
index f936ce6..d82f957 100644
|
|
--- a/hw/ppc/spapr_pci.c
|
|
+++ b/hw/ppc/spapr_pci.c
|
|
@@ -1326,6 +1326,8 @@ static void spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
|
|
if (sphb->pcie_ecs && pci_is_express(dev)) {
|
|
_FDT(fdt_setprop_cell(fdt, offset, "ibm,pci-config-space-type", 0x1));
|
|
}
|
|
+
|
|
+ spapr_phb_nvgpu_populate_pcidev_dt(dev, fdt, offset, sphb);
|
|
}
|
|
|
|
/* create OF node for pci device and required OF DT properties */
|
|
@@ -1559,7 +1561,9 @@ static void spapr_phb_realize(DeviceState *dev, Error **errp)
|
|
smc->phb_placement(spapr, sphb->index,
|
|
&sphb->buid, &sphb->io_win_addr,
|
|
&sphb->mem_win_addr, &sphb->mem64_win_addr,
|
|
- windows_supported, sphb->dma_liobn, &local_err);
|
|
+ windows_supported, sphb->dma_liobn,
|
|
+ &sphb->nv2_gpa_win_addr, &sphb->nv2_atsd_win_addr,
|
|
+ &local_err);
|
|
if (local_err) {
|
|
error_propagate(errp, local_err);
|
|
return;
|
|
@@ -1764,8 +1768,14 @@ void spapr_phb_dma_reset(sPAPRPHBState *sphb)
|
|
static void spapr_phb_reset(DeviceState *qdev)
|
|
{
|
|
sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(qdev);
|
|
+ Error *errp = NULL;
|
|
|
|
spapr_phb_dma_reset(sphb);
|
|
+ spapr_phb_nvgpu_free(sphb);
|
|
+ spapr_phb_nvgpu_setup(sphb, &errp);
|
|
+ if (errp) {
|
|
+ error_report_err(errp);
|
|
+ }
|
|
|
|
/* Reset the IOMMU state */
|
|
object_child_foreach(OBJECT(qdev), spapr_phb_children_reset, NULL);
|
|
@@ -1798,6 +1808,8 @@ static Property spapr_phb_properties[] = {
|
|
pre_2_8_migration, false),
|
|
DEFINE_PROP_BOOL("pcie-extended-configuration-space", sPAPRPHBState,
|
|
pcie_ecs, true),
|
|
+ DEFINE_PROP_UINT64("gpa", sPAPRPHBState, nv2_gpa_win_addr, 0),
|
|
+ DEFINE_PROP_UINT64("atsd", sPAPRPHBState, nv2_atsd_win_addr, 0),
|
|
DEFINE_PROP_END_OF_LIST(),
|
|
};
|
|
|
|
@@ -2089,6 +2101,7 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
|
|
sPAPRTCETable *tcet;
|
|
PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus;
|
|
sPAPRFDT s_fdt;
|
|
+ Error *errp = NULL;
|
|
|
|
/* Start populating the FDT */
|
|
nodename = g_strdup_printf("pci@%" PRIx64, phb->buid);
|
|
@@ -2170,6 +2183,12 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb,
|
|
return ret;
|
|
}
|
|
|
|
+ spapr_phb_nvgpu_populate_dt(phb, fdt, bus_off, &errp);
|
|
+ if (errp) {
|
|
+ error_report_err(errp);
|
|
+ }
|
|
+ spapr_phb_nvgpu_ram_populate_dt(phb, fdt);
|
|
+
|
|
return 0;
|
|
}
|
|
|
|
diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c
|
|
new file mode 100644
|
|
index 0000000..60b14d8
|
|
--- /dev/null
|
|
+++ b/hw/ppc/spapr_pci_nvlink2.c
|
|
@@ -0,0 +1,450 @@
|
|
+/*
|
|
+ * QEMU sPAPR PCI for NVLink2 pass through
|
|
+ *
|
|
+ * Copyright (c) 2019 Alexey Kardashevskiy, IBM Corporation.
|
|
+ *
|
|
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
+ * of this software and associated documentation files (the "Software"), to deal
|
|
+ * in the Software without restriction, including without limitation the rights
|
|
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
+ * copies of the Software, and to permit persons to whom the Software is
|
|
+ * furnished to do so, subject to the following conditions:
|
|
+ *
|
|
+ * The above copyright notice and this permission notice shall be included in
|
|
+ * all copies or substantial portions of the Software.
|
|
+ *
|
|
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
+ * THE SOFTWARE.
|
|
+ */
|
|
+#include "qemu/osdep.h"
|
|
+#include "qapi/error.h"
|
|
+#include "qemu-common.h"
|
|
+#include "hw/pci/pci.h"
|
|
+#include "hw/pci-host/spapr.h"
|
|
+#include "qemu/error-report.h"
|
|
+#include "hw/ppc/fdt.h"
|
|
+#include "hw/pci/pci_bridge.h"
|
|
+
|
|
+#define PHANDLE_PCIDEV(phb, pdev) (0x12000000 | \
|
|
+ (((phb)->index) << 16) | ((pdev)->devfn))
|
|
+#define PHANDLE_GPURAM(phb, n) (0x110000FF | ((n) << 8) | \
|
|
+ (((phb)->index) << 16))
|
|
+#define PHANDLE_NVLINK(phb, gn, nn) (0x00130000 | (((phb)->index) << 8) | \
|
|
+ ((gn) << 4) | (nn))
|
|
+
|
|
+#define SPAPR_GPU_NUMA_ID (cpu_to_be32(1))
|
|
+
|
|
+struct spapr_phb_pci_nvgpu_config {
|
|
+ uint64_t nv2_ram_current;
|
|
+ uint64_t nv2_atsd_current;
|
|
+ int num; /* number of non empty (i.e. tgt!=0) entries in slots[] */
|
|
+ struct spapr_phb_pci_nvgpu_slot {
|
|
+ uint64_t tgt;
|
|
+ uint64_t gpa;
|
|
+ unsigned numa_id;
|
|
+ PCIDevice *gpdev;
|
|
+ int linknum;
|
|
+ struct {
|
|
+ uint64_t atsd_gpa;
|
|
+ PCIDevice *npdev;
|
|
+ uint32_t link_speed;
|
|
+ } links[NVGPU_MAX_LINKS];
|
|
+ } slots[NVGPU_MAX_NUM];
|
|
+ Error *errp;
|
|
+};
|
|
+
|
|
+static struct spapr_phb_pci_nvgpu_slot *
|
|
+spapr_nvgpu_get_slot(struct spapr_phb_pci_nvgpu_config *nvgpus, uint64_t tgt)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ /* Search for partially collected "slot" */
|
|
+ for (i = 0; i < nvgpus->num; ++i) {
|
|
+ if (nvgpus->slots[i].tgt == tgt) {
|
|
+ return &nvgpus->slots[i];
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (nvgpus->num == ARRAY_SIZE(nvgpus->slots)) {
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ i = nvgpus->num;
|
|
+ nvgpus->slots[i].tgt = tgt;
|
|
+ ++nvgpus->num;
|
|
+
|
|
+ return &nvgpus->slots[i];
|
|
+}
|
|
+
|
|
+static void spapr_pci_collect_nvgpu(struct spapr_phb_pci_nvgpu_config *nvgpus,
|
|
+ PCIDevice *pdev, uint64_t tgt,
|
|
+ MemoryRegion *mr, Error **errp)
|
|
+{
|
|
+ MachineState *machine = MACHINE(qdev_get_machine());
|
|
+ sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
|
|
+ struct spapr_phb_pci_nvgpu_slot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);
|
|
+
|
|
+ if (!nvslot) {
|
|
+ error_setg(errp, "Found too many GPUs per vPHB");
|
|
+ return;
|
|
+ }
|
|
+ g_assert(!nvslot->gpdev);
|
|
+ nvslot->gpdev = pdev;
|
|
+
|
|
+ nvslot->gpa = nvgpus->nv2_ram_current;
|
|
+ nvgpus->nv2_ram_current += memory_region_size(mr);
|
|
+ nvslot->numa_id = spapr->gpu_numa_id;
|
|
+ ++spapr->gpu_numa_id;
|
|
+}
|
|
+
|
|
+static void spapr_pci_collect_nvnpu(struct spapr_phb_pci_nvgpu_config *nvgpus,
|
|
+ PCIDevice *pdev, uint64_t tgt,
|
|
+ MemoryRegion *mr, Error **errp)
|
|
+{
|
|
+ struct spapr_phb_pci_nvgpu_slot *nvslot = spapr_nvgpu_get_slot(nvgpus, tgt);
|
|
+ int j;
|
|
+
|
|
+ if (!nvslot) {
|
|
+ error_setg(errp, "Found too many NVLink bridges per vPHB");
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ j = nvslot->linknum;
|
|
+ if (j == ARRAY_SIZE(nvslot->links)) {
|
|
+ error_setg(errp, "Found too many NVLink bridges per GPU");
|
|
+ return;
|
|
+ }
|
|
+ ++nvslot->linknum;
|
|
+
|
|
+ g_assert(!nvslot->links[j].npdev);
|
|
+ nvslot->links[j].npdev = pdev;
|
|
+ nvslot->links[j].atsd_gpa = nvgpus->nv2_atsd_current;
|
|
+ nvgpus->nv2_atsd_current += memory_region_size(mr);
|
|
+ nvslot->links[j].link_speed =
|
|
+ object_property_get_uint(OBJECT(pdev), "nvlink2-link-speed", NULL);
|
|
+}
|
|
+
|
|
+static void spapr_phb_pci_collect_nvgpu(PCIBus *bus, PCIDevice *pdev,
|
|
+ void *opaque)
|
|
+{
|
|
+ PCIBus *sec_bus;
|
|
+ Object *po = OBJECT(pdev);
|
|
+ uint64_t tgt = object_property_get_uint(po, "nvlink2-tgt", NULL);
|
|
+
|
|
+ if (tgt) {
|
|
+ Error *local_err = NULL;
|
|
+ struct spapr_phb_pci_nvgpu_config *nvgpus = opaque;
|
|
+ Object *mr_gpu = object_property_get_link(po, "nvlink2-mr[0]", NULL);
|
|
+ Object *mr_npu = object_property_get_link(po, "nvlink2-atsd-mr[0]",
|
|
+ NULL);
|
|
+
|
|
+ g_assert(mr_gpu || mr_npu);
|
|
+ if (mr_gpu) {
|
|
+ spapr_pci_collect_nvgpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_gpu),
|
|
+ &local_err);
|
|
+ } else {
|
|
+ spapr_pci_collect_nvnpu(nvgpus, pdev, tgt, MEMORY_REGION(mr_npu),
|
|
+ &local_err);
|
|
+ }
|
|
+ error_propagate(&nvgpus->errp, local_err);
|
|
+ }
|
|
+ if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) !=
|
|
+ PCI_HEADER_TYPE_BRIDGE)) {
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev));
|
|
+ if (!sec_bus) {
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
|
|
+ spapr_phb_pci_collect_nvgpu, opaque);
|
|
+}
|
|
+
|
|
+void spapr_phb_nvgpu_setup(sPAPRPHBState *sphb, Error **errp)
|
|
+{
|
|
+ int i, j, valid_gpu_num;
|
|
+ PCIBus *bus;
|
|
+
|
|
+ /* Search for GPUs and NPUs */
|
|
+ if (!sphb->nv2_gpa_win_addr || !sphb->nv2_atsd_win_addr) {
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ sphb->nvgpus = g_new0(struct spapr_phb_pci_nvgpu_config, 1);
|
|
+ sphb->nvgpus->nv2_ram_current = sphb->nv2_gpa_win_addr;
|
|
+ sphb->nvgpus->nv2_atsd_current = sphb->nv2_atsd_win_addr;
|
|
+
|
|
+ bus = PCI_HOST_BRIDGE(sphb)->bus;
|
|
+ pci_for_each_device(bus, pci_bus_num(bus),
|
|
+ spapr_phb_pci_collect_nvgpu, sphb->nvgpus);
|
|
+
|
|
+ if (sphb->nvgpus->errp) {
|
|
+ error_propagate(errp, sphb->nvgpus->errp);
|
|
+ sphb->nvgpus->errp = NULL;
|
|
+ goto cleanup_exit;
|
|
+ }
|
|
+
|
|
+ /* Add found GPU RAM and ATSD MRs if found */
|
|
+ for (i = 0, valid_gpu_num = 0; i < sphb->nvgpus->num; ++i) {
|
|
+ Object *nvmrobj;
|
|
+ struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
|
|
+
|
|
+ if (!nvslot->gpdev) {
|
|
+ continue;
|
|
+ }
|
|
+ nvmrobj = object_property_get_link(OBJECT(nvslot->gpdev),
|
|
+ "nvlink2-mr[0]", NULL);
|
|
+ /* ATSD is pointless without GPU RAM MR so skip those */
|
|
+ if (!nvmrobj) {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ ++valid_gpu_num;
|
|
+ memory_region_add_subregion(get_system_memory(), nvslot->gpa,
|
|
+ MEMORY_REGION(nvmrobj));
|
|
+
|
|
+ for (j = 0; j < nvslot->linknum; ++j) {
|
|
+ Object *atsdmrobj;
|
|
+
|
|
+ atsdmrobj = object_property_get_link(OBJECT(nvslot->links[j].npdev),
|
|
+ "nvlink2-atsd-mr[0]", NULL);
|
|
+ if (!atsdmrobj) {
|
|
+ continue;
|
|
+ }
|
|
+ memory_region_add_subregion(get_system_memory(),
|
|
+ nvslot->links[j].atsd_gpa,
|
|
+ MEMORY_REGION(atsdmrobj));
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (valid_gpu_num) {
|
|
+ return;
|
|
+ }
|
|
+ /* We did not find any interesting GPU */
|
|
+cleanup_exit:
|
|
+ g_free(sphb->nvgpus);
|
|
+ sphb->nvgpus = NULL;
|
|
+}
|
|
+
|
|
+void spapr_phb_nvgpu_free(sPAPRPHBState *sphb)
|
|
+{
|
|
+ int i, j;
|
|
+
|
|
+ if (!sphb->nvgpus) {
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < sphb->nvgpus->num; ++i) {
|
|
+ struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
|
|
+ Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
|
|
+ "nvlink2-mr[0]", NULL);
|
|
+
|
|
+ if (nv_mrobj) {
|
|
+ memory_region_del_subregion(get_system_memory(),
|
|
+ MEMORY_REGION(nv_mrobj));
|
|
+ }
|
|
+ for (j = 0; j < nvslot->linknum; ++j) {
|
|
+ PCIDevice *npdev = nvslot->links[j].npdev;
|
|
+ Object *atsd_mrobj;
|
|
+ atsd_mrobj = object_property_get_link(OBJECT(npdev),
|
|
+ "nvlink2-atsd-mr[0]", NULL);
|
|
+ if (atsd_mrobj) {
|
|
+ memory_region_del_subregion(get_system_memory(),
|
|
+ MEMORY_REGION(atsd_mrobj));
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ g_free(sphb->nvgpus);
|
|
+ sphb->nvgpus = NULL;
|
|
+}
|
|
+
|
|
+void spapr_phb_nvgpu_populate_dt(sPAPRPHBState *sphb, void *fdt, int bus_off,
|
|
+ Error **errp)
|
|
+{
|
|
+ int i, j, atsdnum = 0;
|
|
+ uint64_t atsd[8]; /* The existing limitation of known guests */
|
|
+
|
|
+ if (!sphb->nvgpus) {
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ for (i = 0; (i < sphb->nvgpus->num) && (atsdnum < ARRAY_SIZE(atsd)); ++i) {
|
|
+ struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
|
|
+
|
|
+ if (!nvslot->gpdev) {
|
|
+ continue;
|
|
+ }
|
|
+ for (j = 0; j < nvslot->linknum; ++j) {
|
|
+ if (!nvslot->links[j].atsd_gpa) {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (atsdnum == ARRAY_SIZE(atsd)) {
|
|
+ error_report("Only %"PRIuPTR" ATSD registers supported",
|
|
+ ARRAY_SIZE(atsd));
|
|
+ break;
|
|
+ }
|
|
+ atsd[atsdnum] = cpu_to_be64(nvslot->links[j].atsd_gpa);
|
|
+ ++atsdnum;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!atsdnum) {
|
|
+ error_setg(errp, "No ATSD registers found");
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (!spapr_phb_eeh_available(sphb)) {
|
|
+ /*
|
|
+ * ibm,mmio-atsd contains ATSD registers; these belong to an NPU PHB
|
|
+ * which we do not emulate as a separate device. Instead we put
|
|
+ * ibm,mmio-atsd to the vPHB with GPU and make sure that we do not
|
|
+ * put GPUs from different IOMMU groups to the same vPHB to ensure
|
|
+ * that the guest will use ATSDs from the corresponding NPU.
|
|
+ */
|
|
+ error_setg(errp, "ATSD requires separate vPHB per GPU IOMMU group");
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ _FDT((fdt_setprop(fdt, bus_off, "ibm,mmio-atsd", atsd,
|
|
+ atsdnum * sizeof(atsd[0]))));
|
|
+}
|
|
+
|
|
+void spapr_phb_nvgpu_ram_populate_dt(sPAPRPHBState *sphb, void *fdt)
|
|
+{
|
|
+ int i, j, linkidx, npuoff;
|
|
+ char *npuname;
|
|
+
|
|
+ if (!sphb->nvgpus) {
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ npuname = g_strdup_printf("npuphb%d", sphb->index);
|
|
+ npuoff = fdt_add_subnode(fdt, 0, npuname);
|
|
+ _FDT(npuoff);
|
|
+ _FDT(fdt_setprop_cell(fdt, npuoff, "#address-cells", 1));
|
|
+ _FDT(fdt_setprop_cell(fdt, npuoff, "#size-cells", 0));
|
|
+ /* Advertise NPU as POWER9 so the guest can enable NPU2 contexts */
|
|
+ _FDT((fdt_setprop_string(fdt, npuoff, "compatible", "ibm,power9-npu")));
|
|
+ g_free(npuname);
|
|
+
|
|
+ for (i = 0, linkidx = 0; i < sphb->nvgpus->num; ++i) {
|
|
+ for (j = 0; j < sphb->nvgpus->slots[i].linknum; ++j) {
|
|
+ char *linkname = g_strdup_printf("link@%d", linkidx);
|
|
+ int off = fdt_add_subnode(fdt, npuoff, linkname);
|
|
+
|
|
+ _FDT(off);
|
|
+ /* _FDT((fdt_setprop_cell(fdt, off, "reg", linkidx))); */
|
|
+ _FDT((fdt_setprop_string(fdt, off, "compatible",
|
|
+ "ibm,npu-link")));
|
|
+ _FDT((fdt_setprop_cell(fdt, off, "phandle",
|
|
+ PHANDLE_NVLINK(sphb, i, j))));
|
|
+ _FDT((fdt_setprop_cell(fdt, off, "ibm,npu-link-index", linkidx)));
|
|
+ g_free(linkname);
|
|
+ ++linkidx;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Add memory nodes for GPU RAM and mark them unusable */
|
|
+ for (i = 0; i < sphb->nvgpus->num; ++i) {
|
|
+ struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
|
|
+ Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
|
|
+ "nvlink2-mr[0]", NULL);
|
|
+ uint32_t associativity[] = {
|
|
+ cpu_to_be32(0x4),
|
|
+ SPAPR_GPU_NUMA_ID,
|
|
+ SPAPR_GPU_NUMA_ID,
|
|
+ SPAPR_GPU_NUMA_ID,
|
|
+ cpu_to_be32(nvslot->numa_id)
|
|
+ };
|
|
+ uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL);
|
|
+ uint64_t mem_reg[2] = { cpu_to_be64(nvslot->gpa), cpu_to_be64(size) };
|
|
+ char *mem_name = g_strdup_printf("memory@%"PRIx64, nvslot->gpa);
|
|
+ int off = fdt_add_subnode(fdt, 0, mem_name);
|
|
+
|
|
+ _FDT(off);
|
|
+ _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
|
|
+ _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));
|
|
+ _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
|
|
+ sizeof(associativity))));
|
|
+
|
|
+ _FDT((fdt_setprop_string(fdt, off, "compatible",
|
|
+ "ibm,coherent-device-memory")));
|
|
+
|
|
+ mem_reg[1] = cpu_to_be64(0);
|
|
+ _FDT((fdt_setprop(fdt, off, "linux,usable-memory", mem_reg,
|
|
+ sizeof(mem_reg))));
|
|
+ _FDT((fdt_setprop_cell(fdt, off, "phandle",
|
|
+ PHANDLE_GPURAM(sphb, i))));
|
|
+ g_free(mem_name);
|
|
+ }
|
|
+
|
|
+}
|
|
+
|
|
+void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
|
|
+ sPAPRPHBState *sphb)
|
|
+{
|
|
+ int i, j;
|
|
+
|
|
+ if (!sphb->nvgpus) {
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < sphb->nvgpus->num; ++i) {
|
|
+ struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
|
|
+
|
|
+ /* Skip "slot" without attached GPU */
|
|
+ if (!nvslot->gpdev) {
|
|
+ continue;
|
|
+ }
|
|
+ if (dev == nvslot->gpdev) {
|
|
+ uint32_t npus[nvslot->linknum];
|
|
+
|
|
+ for (j = 0; j < nvslot->linknum; ++j) {
|
|
+ PCIDevice *npdev = nvslot->links[j].npdev;
|
|
+
|
|
+ npus[j] = cpu_to_be32(PHANDLE_PCIDEV(sphb, npdev));
|
|
+ }
|
|
+ _FDT(fdt_setprop(fdt, offset, "ibm,npu", npus,
|
|
+ j * sizeof(npus[0])));
|
|
+ _FDT((fdt_setprop_cell(fdt, offset, "phandle",
|
|
+ PHANDLE_PCIDEV(sphb, dev))));
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ for (j = 0; j < nvslot->linknum; ++j) {
|
|
+ if (dev != nvslot->links[j].npdev) {
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ _FDT((fdt_setprop_cell(fdt, offset, "phandle",
|
|
+ PHANDLE_PCIDEV(sphb, dev))));
|
|
+ _FDT(fdt_setprop_cell(fdt, offset, "ibm,gpu",
|
|
+ PHANDLE_PCIDEV(sphb, nvslot->gpdev)));
|
|
+ _FDT((fdt_setprop_cell(fdt, offset, "ibm,nvlink",
|
|
+ PHANDLE_NVLINK(sphb, i, j))));
|
|
+ /*
|
|
+ * If we ever want to emulate GPU RAM at the same location as on
|
|
+ * the host - here is the encoding GPA->TGT:
|
|
+ *
|
|
+ * gta = ((sphb->nv2_gpa >> 42) & 0x1) << 42;
|
|
+ * gta |= ((sphb->nv2_gpa >> 45) & 0x3) << 43;
|
|
+ * gta |= ((sphb->nv2_gpa >> 49) & 0x3) << 45;
|
|
+ * gta |= sphb->nv2_gpa & ((1UL << 43) - 1);
|
|
+ */
|
|
+ _FDT(fdt_setprop_cell(fdt, offset, "memory-region",
|
|
+ PHANDLE_GPURAM(sphb, i)));
|
|
+ _FDT(fdt_setprop_u64(fdt, offset, "ibm,device-tgt-addr",
|
|
+ nvslot->tgt));
|
|
+ _FDT(fdt_setprop_cell(fdt, offset, "ibm,nvlink-speed",
|
|
+ nvslot->links[j].link_speed));
|
|
+ }
|
|
+ }
|
|
+}
|
|
diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
|
|
index 92457ed..1beedca 100644
|
|
--- a/hw/vfio/pci-quirks.c
|
|
+++ b/hw/vfio/pci-quirks.c
|
|
@@ -1968,3 +1968,134 @@ int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
|
|
|
|
return 0;
|
|
}
|
|
+
|
|
+static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v,
|
|
+ const char *name,
|
|
+ void *opaque, Error **errp)
|
|
+{
|
|
+ uint64_t tgt = (uintptr_t) opaque;
|
|
+ visit_type_uint64(v, name, &tgt, errp);
|
|
+}
|
|
+
|
|
+static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v,
|
|
+ const char *name,
|
|
+ void *opaque, Error **errp)
|
|
+{
|
|
+ uint32_t link_speed = (uint32_t)(uintptr_t) opaque;
|
|
+ visit_type_uint32(v, name, &link_speed, errp);
|
|
+}
|
|
+
|
|
+int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp)
|
|
+{
|
|
+ int ret;
|
|
+ void *p;
|
|
+ struct vfio_region_info *nv2reg = NULL;
|
|
+ struct vfio_info_cap_header *hdr;
|
|
+ struct vfio_region_info_cap_nvlink2_ssatgt *cap;
|
|
+ VFIOQuirk *quirk;
|
|
+
|
|
+ ret = vfio_get_dev_region_info(&vdev->vbasedev,
|
|
+ VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
|
|
+ PCI_VENDOR_ID_NVIDIA,
|
|
+ VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
|
|
+ &nv2reg);
|
|
+ if (ret) {
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ hdr = vfio_get_region_info_cap(nv2reg, VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
|
|
+ if (!hdr) {
|
|
+ ret = -ENODEV;
|
|
+ goto free_exit;
|
|
+ }
|
|
+ cap = (void *) hdr;
|
|
+
|
|
+ p = mmap(NULL, nv2reg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
|
|
+ MAP_SHARED, vdev->vbasedev.fd, nv2reg->offset);
|
|
+ if (p == MAP_FAILED) {
|
|
+ ret = -errno;
|
|
+ goto free_exit;
|
|
+ }
|
|
+
|
|
+ quirk = vfio_quirk_alloc(1);
|
|
+ memory_region_init_ram_ptr(&quirk->mem[0], OBJECT(vdev), "nvlink2-mr",
|
|
+ nv2reg->size, p);
|
|
+ QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
|
|
+
|
|
+ object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
|
|
+ vfio_pci_nvlink2_get_tgt, NULL, NULL,
|
|
+ (void *) (uintptr_t) cap->tgt, NULL);
|
|
+ trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt,
|
|
+ nv2reg->size);
|
|
+free_exit:
|
|
+ g_free(nv2reg);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp)
|
|
+{
|
|
+ int ret;
|
|
+ void *p;
|
|
+ struct vfio_region_info *atsdreg = NULL;
|
|
+ struct vfio_info_cap_header *hdr;
|
|
+ struct vfio_region_info_cap_nvlink2_ssatgt *captgt;
|
|
+ struct vfio_region_info_cap_nvlink2_lnkspd *capspeed;
|
|
+ VFIOQuirk *quirk;
|
|
+
|
|
+ ret = vfio_get_dev_region_info(&vdev->vbasedev,
|
|
+ VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
|
|
+ PCI_VENDOR_ID_IBM,
|
|
+ VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
|
|
+ &atsdreg);
|
|
+ if (ret) {
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ hdr = vfio_get_region_info_cap(atsdreg,
|
|
+ VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
|
|
+ if (!hdr) {
|
|
+ ret = -ENODEV;
|
|
+ goto free_exit;
|
|
+ }
|
|
+ captgt = (void *) hdr;
|
|
+
|
|
+ hdr = vfio_get_region_info_cap(atsdreg,
|
|
+ VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD);
|
|
+ if (!hdr) {
|
|
+ ret = -ENODEV;
|
|
+ goto free_exit;
|
|
+ }
|
|
+ capspeed = (void *) hdr;
|
|
+
|
|
+ /* Some NVLink bridges may not have assigned ATSD */
|
|
+ if (atsdreg->size) {
|
|
+ p = mmap(NULL, atsdreg->size, PROT_READ | PROT_WRITE | PROT_EXEC,
|
|
+ MAP_SHARED, vdev->vbasedev.fd, atsdreg->offset);
|
|
+ if (p == MAP_FAILED) {
|
|
+ ret = -errno;
|
|
+ goto free_exit;
|
|
+ }
|
|
+
|
|
+ quirk = vfio_quirk_alloc(1);
|
|
+ memory_region_init_ram_device_ptr(&quirk->mem[0], OBJECT(vdev),
|
|
+ "nvlink2-atsd-mr", atsdreg->size, p);
|
|
+ QLIST_INSERT_HEAD(&vdev->bars[0].quirks, quirk, next);
|
|
+ }
|
|
+
|
|
+ object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
|
|
+ vfio_pci_nvlink2_get_tgt, NULL, NULL,
|
|
+ (void *) (uintptr_t) captgt->tgt, NULL);
|
|
+ trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, captgt->tgt,
|
|
+ atsdreg->size);
|
|
+
|
|
+ object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32",
|
|
+ vfio_pci_nvlink2_get_link_speed, NULL, NULL,
|
|
+ (void *) (uintptr_t) capspeed->link_speed, NULL);
|
|
+ trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name,
|
|
+ capspeed->link_speed);
|
|
+free_exit:
|
|
+ g_free(atsdreg);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
|
|
index ba3a393..735dcae 100644
|
|
--- a/hw/vfio/pci.c
|
|
+++ b/hw/vfio/pci.c
|
|
@@ -3078,6 +3078,20 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
|
|
}
|
|
}
|
|
|
|
+ if (vdev->vendor_id == PCI_VENDOR_ID_NVIDIA) {
|
|
+ ret = vfio_pci_nvidia_v100_ram_init(vdev, errp);
|
|
+ if (ret && ret != -ENODEV) {
|
|
+ error_report("Failed to setup NVIDIA V100 GPU RAM");
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (vdev->vendor_id == PCI_VENDOR_ID_IBM) {
|
|
+ ret = vfio_pci_nvlink2_init(vdev, errp);
|
|
+ if (ret && ret != -ENODEV) {
|
|
+ error_report("Failed to setup NVlink2 bridge");
|
|
+ }
|
|
+ }
|
|
+
|
|
vfio_register_err_notifier(vdev);
|
|
vfio_register_req_notifier(vdev);
|
|
vfio_setup_resetfn_quirk(vdev);
|
|
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
|
|
index 629c875..bf07b43 100644
|
|
--- a/hw/vfio/pci.h
|
|
+++ b/hw/vfio/pci.h
|
|
@@ -175,6 +175,8 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp);
|
|
int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
|
|
struct vfio_region_info *info,
|
|
Error **errp);
|
|
+int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp);
|
|
+int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp);
|
|
|
|
int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp);
|
|
void vfio_display_finalize(VFIOPCIDevice *vdev);
|
|
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
|
|
index 9487887..c9a9c14 100644
|
|
--- a/hw/vfio/trace-events
|
|
+++ b/hw/vfio/trace-events
|
|
@@ -84,6 +84,10 @@ vfio_pci_igd_opregion_enabled(const char *name) "%s"
|
|
vfio_pci_igd_host_bridge_enabled(const char *name) "%s"
|
|
vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s"
|
|
|
|
+vfio_pci_nvidia_gpu_setup_quirk(const char *name, uint64_t tgt, uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64
|
|
+vfio_pci_nvlink2_setup_quirk_ssatgt(const char *name, uint64_t tgt, uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64
|
|
+vfio_pci_nvlink2_setup_quirk_lnkspd(const char *name, uint32_t link_speed) "%s link_speed=0x%x"
|
|
+
|
|
# hw/vfio/common.c
|
|
vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
|
|
vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
|
|
diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
|
|
index 0fae4fc..cd29c59 100644
|
|
--- a/include/hw/pci-host/spapr.h
|
|
+++ b/include/hw/pci-host/spapr.h
|
|
@@ -24,6 +24,7 @@
|
|
#include "hw/pci/pci.h"
|
|
#include "hw/pci/pci_host.h"
|
|
#include "hw/ppc/xics.h"
|
|
+#include "qemu/units.h"
|
|
|
|
#define TYPE_SPAPR_PCI_HOST_BRIDGE "spapr-pci-host-bridge"
|
|
|
|
@@ -87,6 +88,9 @@ struct sPAPRPHBState {
|
|
uint32_t mig_liobn;
|
|
hwaddr mig_mem_win_addr, mig_mem_win_size;
|
|
hwaddr mig_io_win_addr, mig_io_win_size;
|
|
+ hwaddr nv2_gpa_win_addr;
|
|
+ hwaddr nv2_atsd_win_addr;
|
|
+ struct spapr_phb_pci_nvgpu_config *nvgpus;
|
|
};
|
|
|
|
#define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
|
|
@@ -104,6 +108,22 @@ struct sPAPRPHBState {
|
|
|
|
#define SPAPR_PCI_MSI_WINDOW 0x40000000000ULL
|
|
|
|
+#define SPAPR_PCI_NV2RAM64_WIN_BASE SPAPR_PCI_LIMIT
|
|
+#define SPAPR_PCI_NV2RAM64_WIN_SIZE (2 * TiB) /* For up to 6 GPUs 256GB each */
|
|
+
|
|
+/* Max number of these GPUs per physical box */
|
|
+#define NVGPU_MAX_NUM 6
|
|
+/* Max number of NVLinks per GPU in any physical box */
|
|
+#define NVGPU_MAX_LINKS 3
|
|
+
|
|
+/*
|
|
+ * GPU RAM starts at 64TiB so huge DMA window to cover it all ends at 128TiB
|
|
+ * which is enough. We do not need DMA for ATSD so we put them at 128TiB.
|
|
+ */
|
|
+#define SPAPR_PCI_NV2ATSD_WIN_BASE (128 * TiB)
|
|
+#define SPAPR_PCI_NV2ATSD_WIN_SIZE (NVGPU_MAX_NUM * NVGPU_MAX_LINKS * \
|
|
+ 64 * KiB)
|
|
+
|
|
static inline qemu_irq spapr_phb_lsi_qirq(struct sPAPRPHBState *phb, int pin)
|
|
{
|
|
sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
|
|
@@ -135,6 +155,13 @@ int spapr_phb_vfio_eeh_get_state(sPAPRPHBState *sphb, int *state);
|
|
int spapr_phb_vfio_eeh_reset(sPAPRPHBState *sphb, int option);
|
|
int spapr_phb_vfio_eeh_configure(sPAPRPHBState *sphb);
|
|
void spapr_phb_vfio_reset(DeviceState *qdev);
|
|
+void spapr_phb_nvgpu_setup(sPAPRPHBState *sphb, Error **errp);
|
|
+void spapr_phb_nvgpu_free(sPAPRPHBState *sphb);
|
|
+void spapr_phb_nvgpu_populate_dt(sPAPRPHBState *sphb, void *fdt, int bus_off,
|
|
+ Error **errp);
|
|
+void spapr_phb_nvgpu_ram_populate_dt(sPAPRPHBState *sphb, void *fdt);
|
|
+void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
|
|
+ sPAPRPHBState *sphb);
|
|
#else
|
|
static inline bool spapr_phb_eeh_available(sPAPRPHBState *sphb)
|
|
{
|
|
@@ -161,6 +188,25 @@ static inline int spapr_phb_vfio_eeh_configure(sPAPRPHBState *sphb)
|
|
static inline void spapr_phb_vfio_reset(DeviceState *qdev)
|
|
{
|
|
}
|
|
+static inline void spapr_phb_nvgpu_setup(sPAPRPHBState *sphb, Error **errp)
|
|
+{
|
|
+}
|
|
+static inline void spapr_phb_nvgpu_free(sPAPRPHBState *sphb)
|
|
+{
|
|
+}
|
|
+static inline void spapr_phb_nvgpu_populate_dt(sPAPRPHBState *sphb, void *fdt,
|
|
+ int bus_off, Error **errp)
|
|
+{
|
|
+}
|
|
+static inline void spapr_phb_nvgpu_ram_populate_dt(sPAPRPHBState *sphb,
|
|
+ void *fdt)
|
|
+{
|
|
+}
|
|
+static inline void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt,
|
|
+ int offset,
|
|
+ sPAPRPHBState *sphb)
|
|
+{
|
|
+}
|
|
#endif
|
|
|
|
void spapr_phb_dma_reset(sPAPRPHBState *sphb);
|
|
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
|
|
index beb42bc..72cfa49 100644
|
|
--- a/include/hw/ppc/spapr.h
|
|
+++ b/include/hw/ppc/spapr.h
|
|
@@ -104,7 +104,8 @@ struct sPAPRMachineClass {
|
|
void (*phb_placement)(sPAPRMachineState *spapr, uint32_t index,
|
|
uint64_t *buid, hwaddr *pio,
|
|
hwaddr *mmio32, hwaddr *mmio64,
|
|
- unsigned n_dma, uint32_t *liobns, Error **errp);
|
|
+ unsigned n_dma, uint32_t *liobns, hwaddr *nv2gpa,
|
|
+ hwaddr *nv2atsd, Error **errp);
|
|
sPAPRResizeHPT resize_hpt_default;
|
|
sPAPRCapabilities default_caps;
|
|
};
|
|
@@ -171,6 +172,8 @@ struct sPAPRMachineState {
|
|
|
|
bool cmd_line_caps[SPAPR_CAP_NUM];
|
|
sPAPRCapabilities def, eff, mig;
|
|
+
|
|
+ unsigned gpu_numa_id;
|
|
};
|
|
|
|
#define H_SUCCESS 0
|
|
--
|
|
1.8.3.1
|
|
|