- kvm-hw-pci-Rename-has_power-to-enabled.patch [RHEL-7301]
- kvm-hw-pci-Basic-support-for-PCI-power-management.patch [RHEL-7301]
- kvm-pci-Use-PCI-PM-capability-initializer.patch [RHEL-7301]
- kvm-vfio-pci-Delete-local-pm_cap.patch [RHEL-7301]
- kvm-pcie-virtio-Remove-redundant-pm_cap.patch [RHEL-7301]
- kvm-hw-vfio-pci-Re-order-pre-reset.patch [RHEL-7301]
- kvm-Also-recommend-systemtap-devel-from-qemu-tools.patch [RHEL-47340]
- Resolves: RHEL-7301 ([intel iommu] VFIO_MAP_DMA failed: Bad address on system_powerdown)
- Resolves: RHEL-47340 ([Qemu RHEL-9] qemu-trace-stap should handle lack of stap more gracefully)
243 lines
9.3 KiB
Diff
243 lines
9.3 KiB
Diff
From 98b0cd83c09d35a3da0ae142c09038174355e87e Mon Sep 17 00:00:00 2001
|
|
From: Alex Williamson <alex.williamson@redhat.com>
|
|
Date: Tue, 25 Feb 2025 14:52:25 -0700
|
|
Subject: [PATCH 2/7] hw/pci: Basic support for PCI power management
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
RH-Author: Eric Auger <eric.auger@redhat.com>
|
|
RH-MergeRequest: 348: PCI: Implement basic PCI PM capability backing
|
|
RH-Jira: RHEL-7301
|
|
RH-Acked-by: Cédric Le Goater <clg@redhat.com>
|
|
RH-Acked-by: Alex Williamson <None>
|
|
RH-Acked-by: Jon Maloy <jmaloy@redhat.com>
|
|
RH-Commit: [2/6] 5faff6382c124711887704fff4f857e8f85e7be5 (eauger1/centos-qemu-kvm)
|
|
|
|
Conflicts: contextual conflict in include/hw/pci/pci.h
|
|
We don't have 449dca6ac93a ("pcie: enable Extended tag field support")
|
|
downstream, so the x-pcie-ext-tag definition is absent from the context.
|
|
|
|
The memory and IO BARs for devices are only accessible in the D0 power
|
|
state. In other power states the PCI spec defines that the device
|
|
responds to TLPs and messages with an Unsupported Request response.
|
|
|
|
To approximate this behavior, consider the BARs as unmapped when the
|
|
device is not in the D0 power state. This makes the BARs inaccessible
|
|
and has the additional bonus for vfio-pci that we don't attempt to DMA
|
|
map BARs for devices in a non-D0 power state.
|
|
|
|
To support this, an interface is added for devices to register the PM
|
|
capability, which allows central tracking to enforce valid transitions
|
|
and unmap BARs in non-D0 states.
|
|
|
|
NB. We currently have device models (eepro100 and pcie_pci_bridge)
|
|
that register a PM capability but do not set wmask to enable writes to
|
|
the power state field. In order to maintain migration compatibility,
|
|
this new helper does not manage the wmask to enable guest writes to
|
|
initiate a power state change. The contents and write access of the
|
|
PM capability are still managed by the caller.
|
|
|
|
Cc: Michael S. Tsirkin <mst@redhat.com>
|
|
Cc: Marcel Apfelbaum <marcel.apfelbaum@gmail.com>
|
|
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
|
|
Reviewed-by: Eric Auger <eric.auger@redhat.com>
|
|
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
|
|
Link: https://lore.kernel.org/qemu-devel/20250225215237.3314011-2-alex.williamson@redhat.com
|
|
Signed-off-by: Cédric Le Goater <clg@redhat.com>
|
|
(cherry picked from commit 9461afd2008b0820fc45a6a7bc675df1b6791e4f)
|
|
Signed-off-by: Eric Auger <eric.auger@redhat.com>
|
|
---
|
|
hw/pci/pci.c | 93 ++++++++++++++++++++++++++++++++++++-
|
|
hw/pci/trace-events | 2 +
|
|
include/hw/pci/pci.h | 3 ++
|
|
include/hw/pci/pci_device.h | 3 ++
|
|
4 files changed, 99 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
|
|
index 83c9d5b9ea..d774ae47d2 100644
|
|
--- a/hw/pci/pci.c
|
|
+++ b/hw/pci/pci.c
|
|
@@ -365,6 +365,84 @@ static void pci_msi_trigger(PCIDevice *dev, MSIMessage msg)
|
|
attrs, NULL);
|
|
}
|
|
|
|
+/*
|
|
+ * Register and track a PM capability. If wmask is also enabled for the power
|
|
+ * state field of the pmcsr register, guest writes may change the device PM
|
|
+ * state. BAR access is only enabled while the device is in the D0 state.
|
|
+ * Return the capability offset or negative error code.
|
|
+ */
|
|
+int pci_pm_init(PCIDevice *d, uint8_t offset, Error **errp)
|
|
+{
|
|
+ int cap = pci_add_capability(d, PCI_CAP_ID_PM, offset, PCI_PM_SIZEOF, errp);
|
|
+
|
|
+ if (cap < 0) {
|
|
+ return cap;
|
|
+ }
|
|
+
|
|
+ d->pm_cap = cap;
|
|
+ d->cap_present |= QEMU_PCI_CAP_PM;
|
|
+
|
|
+ return cap;
|
|
+}
|
|
+
|
|
+static uint8_t pci_pm_state(PCIDevice *d)
|
|
+{
|
|
+ uint16_t pmcsr;
|
|
+
|
|
+ if (!(d->cap_present & QEMU_PCI_CAP_PM)) {
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ pmcsr = pci_get_word(d->config + d->pm_cap + PCI_PM_CTRL);
|
|
+
|
|
+ return pmcsr & PCI_PM_CTRL_STATE_MASK;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Update the PM capability state based on the new value stored in config
|
|
+ * space relative to the old, pre-write state provided. If the new value
|
|
+ * is rejected (unsupported or invalid transition) restore the old value.
|
|
+ * Return the resulting PM state.
|
|
+ */
|
|
+static uint8_t pci_pm_update(PCIDevice *d, uint32_t addr, int l, uint8_t old)
|
|
+{
|
|
+ uint16_t pmc;
|
|
+ uint8_t new;
|
|
+
|
|
+ if (!(d->cap_present & QEMU_PCI_CAP_PM) ||
|
|
+ !range_covers_byte(addr, l, d->pm_cap + PCI_PM_CTRL)) {
|
|
+ return old;
|
|
+ }
|
|
+
|
|
+ new = pci_pm_state(d);
|
|
+ if (new == old) {
|
|
+ return old;
|
|
+ }
|
|
+
|
|
+ pmc = pci_get_word(d->config + d->pm_cap + PCI_PM_PMC);
|
|
+
|
|
+ /*
|
|
+ * Transitions to D1 & D2 are only allowed if supported. Devices may
|
|
+ * only transition to higher D-states or to D0.
|
|
+ */
|
|
+ if ((!(pmc & PCI_PM_CAP_D1) && new == 1) ||
|
|
+ (!(pmc & PCI_PM_CAP_D2) && new == 2) ||
|
|
+ (old && new && new < old)) {
|
|
+ pci_word_test_and_clear_mask(d->config + d->pm_cap + PCI_PM_CTRL,
|
|
+ PCI_PM_CTRL_STATE_MASK);
|
|
+ pci_word_test_and_set_mask(d->config + d->pm_cap + PCI_PM_CTRL,
|
|
+ old);
|
|
+ trace_pci_pm_bad_transition(d->name, pci_dev_bus_num(d),
|
|
+ PCI_SLOT(d->devfn), PCI_FUNC(d->devfn),
|
|
+ old, new);
|
|
+ return old;
|
|
+ }
|
|
+
|
|
+ trace_pci_pm_transition(d->name, pci_dev_bus_num(d), PCI_SLOT(d->devfn),
|
|
+ PCI_FUNC(d->devfn), old, new);
|
|
+ return new;
|
|
+}
|
|
+
|
|
static void pci_reset_regions(PCIDevice *dev)
|
|
{
|
|
int r;
|
|
@@ -404,6 +482,11 @@ static void pci_do_device_reset(PCIDevice *dev)
|
|
pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) |
|
|
pci_get_word(dev->w1cmask + PCI_INTERRUPT_LINE));
|
|
dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
|
|
+ /* Default PM state is D0 */
|
|
+ if (dev->cap_present & QEMU_PCI_CAP_PM) {
|
|
+ pci_word_test_and_clear_mask(dev->config + dev->pm_cap + PCI_PM_CTRL,
|
|
+ PCI_PM_CTRL_STATE_MASK);
|
|
+ }
|
|
pci_reset_regions(dev);
|
|
pci_update_mappings(dev);
|
|
|
|
@@ -1525,7 +1608,7 @@ static void pci_update_mappings(PCIDevice *d)
|
|
continue;
|
|
|
|
new_addr = pci_bar_address(d, i, r->type, r->size);
|
|
- if (!d->enabled) {
|
|
+ if (!d->enabled || pci_pm_state(d)) {
|
|
new_addr = PCI_BAR_UNMAPPED;
|
|
}
|
|
|
|
@@ -1591,6 +1674,7 @@ uint32_t pci_default_read_config(PCIDevice *d,
|
|
|
|
void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int l)
|
|
{
|
|
+ uint8_t new_pm_state, old_pm_state = pci_pm_state(d);
|
|
int i, was_irq_disabled = pci_irq_disabled(d);
|
|
uint32_t val = val_in;
|
|
|
|
@@ -1603,11 +1687,16 @@ void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val_in, int
|
|
d->config[addr + i] = (d->config[addr + i] & ~wmask) | (val & wmask);
|
|
d->config[addr + i] &= ~(val & w1cmask); /* W1C: Write 1 to Clear */
|
|
}
|
|
+
|
|
+ new_pm_state = pci_pm_update(d, addr, l, old_pm_state);
|
|
+
|
|
if (ranges_overlap(addr, l, PCI_BASE_ADDRESS_0, 24) ||
|
|
ranges_overlap(addr, l, PCI_ROM_ADDRESS, 4) ||
|
|
ranges_overlap(addr, l, PCI_ROM_ADDRESS1, 4) ||
|
|
- range_covers_byte(addr, l, PCI_COMMAND))
|
|
+ range_covers_byte(addr, l, PCI_COMMAND) ||
|
|
+ !!new_pm_state != !!old_pm_state) {
|
|
pci_update_mappings(d);
|
|
+ }
|
|
|
|
if (ranges_overlap(addr, l, PCI_COMMAND, 2)) {
|
|
pci_update_irq_disabled(d, was_irq_disabled);
|
|
diff --git a/hw/pci/trace-events b/hw/pci/trace-events
|
|
index 19643aa8c6..c82a87ffdd 100644
|
|
--- a/hw/pci/trace-events
|
|
+++ b/hw/pci/trace-events
|
|
@@ -1,6 +1,8 @@
|
|
# See docs/devel/tracing.rst for syntax documentation.
|
|
|
|
# pci.c
|
|
+pci_pm_bad_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x REJECTED PM transition D%d->D%d"
|
|
+pci_pm_transition(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, uint8_t old, uint8_t new) "%s %02x:%02x.%x PM transition D%d->D%d"
|
|
pci_update_mappings_del(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64
|
|
pci_update_mappings_add(const char *dev, uint32_t bus, uint32_t slot, uint32_t func, int bar, uint64_t addr, uint64_t size) "%s %02x:%02x.%x %d,0x%"PRIx64"+0x%"PRIx64
|
|
pci_route_irq(int dev_irq, const char *dev_path, int parent_irq, const char *parent_path) "IRQ %d @%s -> IRQ %d @%s"
|
|
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
|
|
index 45365ae085..afeb5a2263 100644
|
|
--- a/include/hw/pci/pci.h
|
|
+++ b/include/hw/pci/pci.h
|
|
@@ -213,6 +213,8 @@ enum {
|
|
QEMU_PCIE_ERR_UNC_MASK = (1 << QEMU_PCIE_ERR_UNC_MASK_BITNR),
|
|
#define QEMU_PCIE_ARI_NEXTFN_1_BITNR 12
|
|
QEMU_PCIE_ARI_NEXTFN_1 = (1 << QEMU_PCIE_ARI_NEXTFN_1_BITNR),
|
|
+#define QEMU_PCI_CAP_PM_BITNR 14
|
|
+ QEMU_PCI_CAP_PM = (1 << QEMU_PCI_CAP_PM_BITNR),
|
|
};
|
|
|
|
typedef struct PCIINTxRoute {
|
|
@@ -680,5 +682,6 @@ static inline void pci_irq_pulse(PCIDevice *pci_dev)
|
|
MSIMessage pci_get_msi_message(PCIDevice *dev, int vector);
|
|
void pci_set_enabled(PCIDevice *pci_dev, bool state);
|
|
void pci_set_power(PCIDevice *pci_dev, bool state);
|
|
+int pci_pm_init(PCIDevice *pci_dev, uint8_t offset, Error **errp);
|
|
|
|
#endif
|
|
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
|
|
index f38fb31119..325d7bcaf7 100644
|
|
--- a/include/hw/pci/pci_device.h
|
|
+++ b/include/hw/pci/pci_device.h
|
|
@@ -105,6 +105,9 @@ struct PCIDevice {
|
|
/* Capability bits */
|
|
uint32_t cap_present;
|
|
|
|
+ /* Offset of PM capability in config space */
|
|
+ uint8_t pm_cap;
|
|
+
|
|
/* Offset of MSI-X capability in config space */
|
|
uint8_t msix_cap;
|
|
|
|
--
|
|
2.48.1
|
|
|