From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Tue, 25 Jul 2023 13:23:10 -0400
Subject: [PATCH] kern/ieee1275/init: ppc64: Restrict high memory in presence
 of fadump

When a kernel dump is present then restrict the high memory regions to
avoid allocating memory where the kernel dump resides. Use the
ibm,kernel-dump node under /rtas to determine whether a kernel dump exists
and up to which limit grub can use available memory. Set the
upper_mem_limit to the size of the kernel dump section of type
'REAL_MODE_REGION' and therefore only allow grub's memory usage for high
addresses from RMO_ADDR_MAX to 'upper_mem_limit'. This means that grub can
use high memory in the range of RMO_ADDR_MAX (768MB) to upper_mem_limit and
the kernel-dump memory regions above 'upper_mem_limit' remain untouched.
This change has no effect on memory allocations below 'linux_rmo_save'
(typically at 640MB).

Also, fall back to allocating below rmo_linux_save in case the chunk of
memory there would be larger than the chunk of memory above RMO_ADDR_MAX.
This can for example occur if a free memory area is found starting at 300MB
extending up to 1GB but a kernel dump is located at 768MB and therefore
does not allow the allocation of the high memory area but requiring to use
the chunk starting at 300MB to avoid an unnecessary out-of-memory
condition.

Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Hari Bathini <hbathini@linux.ibm.com>
Cc: Pavithra Prakash <pavrampu@in.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Carolyn Scherrer <cpscherr@us.ibm.com>
Cc: Mahesh Salgaonkar <mahesh@linux.ibm.com>
Cc: Sourabh Jain <sourabhjain@linux.ibm.com>
---
 grub-core/kern/ieee1275/init.c | 144 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 142 insertions(+), 2 deletions(-)

diff --git a/grub-core/kern/ieee1275/init.c b/grub-core/kern/ieee1275/init.c
index 3d4ad9d1f162..8e7f742fad46 100644
--- a/grub-core/kern/ieee1275/init.c
+++ b/grub-core/kern/ieee1275/init.c
@@ -17,6 +17,8 @@
  *  along with GRUB.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <stddef.h> /* offsetof() */
+
 #include <grub/kernel.h>
 #include <grub/dl.h>
 #include <grub/disk.h>
@@ -198,6 +200,96 @@ grub_claim_heap (void)
 #else
 /* Helpers for mm on powerpc. */
 
+/* ibm,kernel-dump data structures */
+struct kd_section
+{
+  grub_uint32_t flags;
+  grub_uint16_t src_datatype;
+#define KD_SRC_DATATYPE_REAL_MODE_REGION  0x0011
+  grub_uint16_t error_flags;
+  grub_uint64_t src_address;
+  grub_uint64_t num_bytes;
+  grub_uint64_t act_bytes;
+  grub_uint64_t dst_address;
+} GRUB_PACKED;
+
+#define MAX_KD_SECTIONS 10
+
+struct kernel_dump
+{
+  grub_uint32_t format;
+  grub_uint16_t num_sections;
+  grub_uint16_t status_flags;
+  grub_uint32_t offset_1st_section;
+  grub_uint32_t num_blocks;
+  grub_uint64_t start_block;
+  grub_uint64_t num_blocks_avail;
+  grub_uint32_t offet_path_string;
+  grub_uint32_t max_time_allowed;
+  struct kd_section kds[MAX_KD_SECTIONS]; /* offset_1st_section should point to kds[0] */
+} GRUB_PACKED;
+
+/*
+ * Determine if a kernel dump exists and if it does, then determine the highest
+ * address that grub can use for memory allocations.
+ * The caller must have initialized *highest to rmo_top. *highest will not
+ * be modified if no kernel dump is found.
+ */
+static void
+check_kernel_dump (grub_uint64_t *highest)
+{
+  struct kernel_dump kernel_dump;
+  grub_ssize_t kernel_dump_size;
+  grub_ieee1275_phandle_t rtas;
+  struct kd_section *kds;
+  grub_size_t i;
+
+  /* If there's a kernel-dump it must have at least one section */
+  if (grub_ieee1275_finddevice ("/rtas", &rtas) ||
+      grub_ieee1275_get_property (rtas, "ibm,kernel-dump", &kernel_dump,
+                                  sizeof (kernel_dump), &kernel_dump_size) ||
+      kernel_dump_size <= (grub_ssize_t) offsetof (struct kernel_dump, kds[1]))
+    return;
+
+  kernel_dump_size = grub_min (kernel_dump_size, (grub_ssize_t) sizeof (kernel_dump));
+
+  if (grub_be_to_cpu32 (kernel_dump.format) != 1)
+    {
+      grub_printf (_("Error: ibm,kernel-dump has an unexpected format version '%u'\n"),
+                   grub_be_to_cpu32 (kernel_dump.format));
+      return;
+    }
+
+  if (grub_be_to_cpu16 (kernel_dump.num_sections) > MAX_KD_SECTIONS)
+    {
+      grub_printf (_("Error: Too many kernel dump sections: %d\n"),
+                   grub_be_to_cpu32 (kernel_dump.num_sections));
+      return;
+    }
+
+  for (i = 0; i < grub_be_to_cpu16 (kernel_dump.num_sections); i++)
+    {
+      kds = (struct kd_section *) ((grub_addr_t) &kernel_dump +
+                                   grub_be_to_cpu32 (kernel_dump.offset_1st_section) +
+                                   i * sizeof (struct kd_section));
+      /* sanity check the address is within the 'kernel_dump' struct */
+      if ((grub_addr_t) kds > (grub_addr_t) &kernel_dump + kernel_dump_size + sizeof (*kds))
+        {
+          grub_printf (_("Error: 'kds' address beyond last available section\n"));
+          return;
+        }
+
+      if ((grub_be_to_cpu16 (kds->src_datatype) == KD_SRC_DATATYPE_REAL_MODE_REGION) &&
+          (grub_be_to_cpu64 (kds->src_address) == 0))
+        {
+          *highest = grub_min (*highest, grub_be_to_cpu64 (kds->num_bytes));
+          break;
+        }
+    }
+
+  return;
+}
+
 /*
  * How much memory does OF believe exists in total?
  *
@@ -277,10 +369,31 @@ regions_claim (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
    *
    * Finally, we also want to make sure that when grub loads the kernel,
    * it isn't going to use up all the memory we're trying to reserve! So
-   * enforce our entire RUNTIME_MIN_SPACE here:
+   * enforce our entire RUNTIME_MIN_SPACE here (no fadump):
+   *
+   * | Top of memory == upper_mem_limit -|
+   * |                                   |
+   * |             available             |
+   * |                                   |
+   * |----------     768 MB    ----------|
+   * |                                   |
+   * |              reserved             |
+   * |                                   |
+   * |--- 768 MB - runtime min space  ---|
+   * |                                   |
+   * |             available             |
+   * |                                   |
+   * |----------      0 MB     ----------|
+   *
+   * In case fadump is used, we allow the following:
    *
    * |---------- Top of memory ----------|
    * |                                   |
+   * |             unavailable           |
+   * |         (kernel dump area)        |
+   * |                                   |
+   * |--------- upper_mem_limit ---------|
+   * |                                   |
    * |             available             |
    * |                                   |
    * |----------     768 MB    ----------|
@@ -335,17 +448,44 @@ regions_claim (grub_uint64_t addr, grub_uint64_t len, grub_memory_type_t type,
         }
       else
         {
+          grub_uint64_t upper_mem_limit = rmo_top;
+          grub_uint64_t orig_addr = addr;
+
+          check_kernel_dump (&upper_mem_limit);
+
           /*
            * we order these cases to prefer higher addresses and avoid some
            * splitting issues
+           * The following shows the order of variables:
+           *  no   kernel dump: linux_rmo_save < RMO_ADDR_MAX <= upper_mem_limit == rmo_top
+           *  with kernel dump: liuxx_rmo_save < RMO_ADDR_MAX <= upper_mem_limit <= rmo_top
            */
-          if (addr < RMO_ADDR_MAX && (addr + len) > RMO_ADDR_MAX)
+          if (addr < RMO_ADDR_MAX && (addr + len) > RMO_ADDR_MAX && upper_mem_limit >= RMO_ADDR_MAX)
             {
               grub_dprintf ("ieee1275",
                             "adjusting region for RUNTIME_MIN_SPACE: (%llx -> %llx) -> (%llx -> %llx)\n",
                             addr, addr + len, RMO_ADDR_MAX, addr + len);
               len = (addr + len) - RMO_ADDR_MAX;
               addr = RMO_ADDR_MAX;
+
+              /* We must not exceed the upper_mem_limit (assuming it's >= RMO_ADDR_MAX) */
+              if (addr + len > upper_mem_limit)
+                {
+                  /* take the bigger chunk from either below linux_rmo_save or above upper_mem_limit */
+                  len = upper_mem_limit - addr;
+                  if (orig_addr < linux_rmo_save && linux_rmo_save - orig_addr > len)
+                    {
+                      /* lower part is bigger */
+                      addr = orig_addr;
+                      len = linux_rmo_save - addr;
+                    }
+
+                  grub_dprintf ("ieee1275", "re-adjusted region to: (%llx -> %llx)\n",
+                                addr, addr + len);
+
+                  if (len == 0)
+                    return 0;
+                }
             }
           else if ((addr < linux_rmo_save) && ((addr + len) > linux_rmo_save))
             {