From f963e1f65292c3ece5adc5201706600ad7817a31 Mon Sep 17 00:00:00 2001 From: "Frank Ch. Eigler" Date: Mon, 14 Aug 2023 14:07:32 -0400 Subject: [PATCH] Resolves: rhbz2231632 Resolves: rhbz2231635 --- pr29108.patch | 1845 ++++++++++++++++++++++++++++++++++++++++++++++++ pr30749.patch | 99 +++ systemtap.spec | 10 +- 3 files changed, 1953 insertions(+), 1 deletion(-) create mode 100644 pr29108.patch create mode 100644 pr30749.patch diff --git a/pr29108.patch b/pr29108.patch new file mode 100644 index 0000000..43f5170 --- /dev/null +++ b/pr29108.patch @@ -0,0 +1,1845 @@ +commit bf95ad72c984c9e68d12707c4d34dbe6bc1f89f2 +gpg: Signature made Sat 12 Aug 2023 02:49:06 PM EDT +gpg: using RSA key 5D38116FA4D3A7CC77E378D37E83610126DCC2E8 +gpg: Good signature from "Frank Ch. Eigler " [full] +Author: Aliaksandr Valialkin +Date: Thu Jul 27 18:52:37 2023 -0400 + + runtime/staprun: import gheap routines + + BSD-2-Clause gift from the Aliaksandr Valialkin: + https://github.com/valyala/gheap + +diff --git a/staprun/gheap.h b/staprun/gheap.h +new file mode 100644 +index 000000000..4af4b29ed +--- /dev/null ++++ b/staprun/gheap.h +@@ -0,0 +1,561 @@ ++#ifndef GHEAP_H ++#define GHEAP_H ++ ++/* ++ * Generalized heap implementation for C99. ++ * ++ * Don't forget passing -DNDEBUG option to the compiler when creating optimized ++ * builds. This significantly speeds up gheap code by removing debug assertions. ++ * ++ * Author: Aliaksandr Valialkin . ++ */ ++/* ++Copyright (c) 2011 Aliaksandr Valialkin ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions ++are met: ++1. Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ ++THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE ++FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++SUCH DAMAGE. ++*/ ++ ++ ++ ++/******************************************************************************* ++ * Interface. ++ ******************************************************************************/ ++ ++#include /* for size_t */ ++#include /* for SIZE_MAX */ ++ ++/* ++ * Less comparer must return non-zero value if a < b. ++ * ctx is the gheap_ctx->less_comparer_ctx. ++ * Otherwise it must return 0. ++ */ ++typedef int (*gheap_less_comparer_t)(const void *ctx, const void *a, ++ const void *b); ++ ++/* ++ * Moves the item from src to dst. ++ */ ++typedef void (*gheap_item_mover_t)(void *dst, const void *src); ++ ++/* ++ * Gheap context. ++ * This context must be passed to every gheap function. 
++ */ ++struct gheap_ctx ++{ ++ /* ++ * How much children each heap item can have. ++ */ ++ size_t fanout; ++ ++ /* ++ * A chunk is a tuple containing fanout items arranged sequentially in memory. ++ * A page is a subheap containing page_chunks chunks arranged sequentially ++ * in memory. ++ * The number of chunks in a page is an arbitrary integer greater than 0. ++ */ ++ size_t page_chunks; ++ ++ /* ++ * The size of each item in bytes. ++ */ ++ size_t item_size; ++ ++ gheap_less_comparer_t less_comparer; ++ const void *less_comparer_ctx; ++ ++ gheap_item_mover_t item_mover; ++}; ++ ++/* ++ * Returns parent index for the given child index. ++ * Child index must be greater than 0. ++ * Returns 0 if the parent is root. ++ */ ++static inline size_t gheap_get_parent_index(const struct gheap_ctx *ctx, ++ size_t u); ++ ++/* ++ * Returns the index of the first child for the given parent index. ++ * Parent index must be less than SIZE_MAX. ++ * Returns SIZE_MAX if the index of the first child for the given parent ++ * cannot fit size_t. ++ */ ++static inline size_t gheap_get_child_index(const struct gheap_ctx *ctx, ++ size_t u); ++ ++/* ++ * Returns a pointer to the first non-heap item using less_comparer ++ * for items' comparison. ++ * Returns the index of the first non-heap item. ++ * Returns heap_size if base points to valid max heap with the given size. ++ */ ++static inline size_t gheap_is_heap_until(const struct gheap_ctx *ctx, ++ const void *base, size_t heap_size); ++ ++/* ++ * Returns non-zero if base points to valid max heap. Returns zero otherwise. ++ * Uses less_comparer for items' comparison. ++ */ ++static inline int gheap_is_heap(const struct gheap_ctx *ctx, ++ const void *base, size_t heap_size); ++ ++/* ++ * Makes max heap from items base[0] ... base[heap_size-1]. ++ * Uses less_comparer for items' comparison. ++ */ ++static inline void gheap_make_heap(const struct gheap_ctx *ctx, ++ void *base, size_t heap_size); ++ ++/* ++ * Pushes the item base[heap_size-1] into max heap base[0] ... base[heap_size-2] ++ * Uses less_comparer for items' comparison. ++ */ ++static inline void gheap_push_heap(const struct gheap_ctx *ctx, ++ void *base, size_t heap_size); ++ ++/* ++ * Pops the maximum item from max heap base[0] ... base[heap_size-1] into ++ * base[heap_size-1]. ++ * Uses less_comparer for items' comparison. ++ */ ++static inline void gheap_pop_heap(const struct gheap_ctx *ctx, ++ void *base, size_t heap_size); ++ ++/* ++ * Sorts items in place of max heap in ascending order. ++ * Uses less_comparer for items' comparison. ++ */ ++static inline void gheap_sort_heap(const struct gheap_ctx *ctx, ++ void *base, size_t heap_size); ++ ++/* ++ * Swaps the item outside the heap with the maximum item inside ++ * the heap and restores heap invariant. ++ */ ++static inline void gheap_swap_max_item(const struct gheap_ctx *const ctx, ++ void *const base, const size_t heap_size, void *item); ++ ++/* ++ * Restores max heap invariant after item's value has been increased, ++ * i.e. less_comparer(old_item, new_item) != 0. ++ */ ++static inline void gheap_restore_heap_after_item_increase( ++ const struct gheap_ctx *ctx, ++ void *base, size_t heap_size, size_t modified_item_index); ++ ++/* ++ * Restores max heap invariant after item's value has been decreased, ++ * i.e. less_comparer(new_item, old_item) != 0. 
++ */ ++static inline void gheap_restore_heap_after_item_decrease( ++ const struct gheap_ctx *ctx, ++ void *base, size_t heap_size, size_t modified_item_index); ++ ++/* ++ * Removes the given item from the heap and puts it into base[heap_size-1]. ++ * Uses less_comparer for items' comparison. ++ */ ++static inline void gheap_remove_from_heap(const struct gheap_ctx *ctx, ++ void *base, size_t heap_size, size_t item_index); ++ ++/******************************************************************************* ++ * Implementation. ++ * ++ * Define all functions inline, so compiler will be able optimizing out common ++ * args (fanout, page_chunks, item_size, less_comparer and item_mover), ++ * which are usually constants, using contant folding optimization ++ * ( http://en.wikipedia.org/wiki/Constant_folding ). ++ *****************************************************************************/ ++ ++#include /* for assert */ ++#include /* for size_t */ ++#include /* for uintptr_t, SIZE_MAX and UINTPTR_MAX */ ++ ++static inline size_t gheap_get_parent_index(const struct gheap_ctx *const ctx, ++ size_t u) ++{ ++ assert(u > 0); ++ ++ const size_t fanout = ctx->fanout; ++ const size_t page_chunks = ctx->page_chunks; ++ ++ --u; ++ if (page_chunks == 1) { ++ return u / fanout; ++ } ++ ++ if (u < fanout) { ++ /* Parent is root. */ ++ return 0; ++ } ++ ++ assert(page_chunks <= SIZE_MAX / fanout); ++ const size_t page_size = fanout * page_chunks; ++ size_t v = u % page_size; ++ if (v >= fanout) { ++ /* Fast path. Parent is on the same page as the child. */ ++ return u - v + v / fanout; ++ } ++ ++ /* Slow path. Parent is on another page. */ ++ v = u / page_size - 1; ++ const size_t page_leaves = (fanout - 1) * page_chunks + 1; ++ u = v / page_leaves + 1; ++ return u * page_size + v % page_leaves - page_leaves + 1; ++} ++ ++static inline size_t gheap_get_child_index(const struct gheap_ctx *const ctx, ++ size_t u) ++{ ++ assert(u < SIZE_MAX); ++ ++ const size_t fanout = ctx->fanout; ++ const size_t page_chunks = ctx->page_chunks; ++ ++ if (page_chunks == 1) { ++ if (u > (SIZE_MAX - 1) / fanout) { ++ /* Child overflow. */ ++ return SIZE_MAX; ++ } ++ return u * fanout + 1; ++ } ++ ++ if (u == 0) { ++ /* Root's child is always 1. */ ++ return 1; ++ } ++ ++ assert(page_chunks <= SIZE_MAX / fanout); ++ const size_t page_size = fanout * page_chunks; ++ --u; ++ size_t v = u % page_size + 1; ++ if (v < page_size / fanout) { ++ /* Fast path. Child is on the same page as the parent. */ ++ v *= fanout - 1; ++ if (u > SIZE_MAX - 2 - v) { ++ /* Child overflow. */ ++ return SIZE_MAX; ++ } ++ return u + v + 2; ++ } ++ ++ /* Slow path. Child is on another page. */ ++ const size_t page_leaves = (fanout - 1) * page_chunks + 1; ++ v += (u / page_size + 1) * page_leaves - page_size; ++ if (v > (SIZE_MAX - 1) / page_size) { ++ /* Child overflow. */ ++ return SIZE_MAX; ++ } ++ return v * page_size + 1; ++} ++ ++/* Returns a pointer to base[index]. */ ++static inline void *_gheap_get_item_ptr(const struct gheap_ctx *const ctx, ++ const void *const base, const size_t index) ++{ ++ const size_t item_size = ctx->item_size; ++ ++ assert(index <= SIZE_MAX / item_size); ++ ++ const size_t offset = item_size * index; ++ assert((uintptr_t)base <= UINTPTR_MAX - offset); ++ ++ return ((char *)base) + offset; ++} ++ ++/* ++ * Sifts the item up in the given sub-heap with the given root_index ++ * starting from the hole_index. 
++ */ ++static inline void _gheap_sift_up(const struct gheap_ctx *const ctx, ++ void *const base, const size_t root_index, size_t hole_index, ++ const void *const item) ++{ ++ assert(hole_index >= root_index); ++ ++ const gheap_less_comparer_t less_comparer = ctx->less_comparer; ++ const void *const less_comparer_ctx = ctx->less_comparer_ctx; ++ const gheap_item_mover_t item_mover = ctx->item_mover; ++ ++ while (hole_index > root_index) { ++ const size_t parent_index = gheap_get_parent_index(ctx, hole_index); ++ assert(parent_index >= root_index); ++ const void *const parent = _gheap_get_item_ptr(ctx, base, parent_index); ++ if (!less_comparer(less_comparer_ctx, parent, item)) { ++ break; ++ } ++ item_mover(_gheap_get_item_ptr(ctx, base, hole_index), ++ parent); ++ hole_index = parent_index; ++ } ++ ++ item_mover(_gheap_get_item_ptr(ctx, base, hole_index), item); ++} ++ ++/* ++ * Moves the max child into the given hole and returns index ++ * of the new hole. ++ */ ++static inline size_t _gheap_move_up_max_child(const struct gheap_ctx *const ctx, ++ void *const base, const size_t children_count, ++ const size_t hole_index, const size_t child_index) ++{ ++ assert(children_count > 0); ++ assert(children_count <= ctx->fanout); ++ assert(child_index == gheap_get_child_index(ctx, hole_index)); ++ ++ const gheap_less_comparer_t less_comparer = ctx->less_comparer; ++ const void *const less_comparer_ctx = ctx->less_comparer_ctx; ++ const gheap_item_mover_t item_mover = ctx->item_mover; ++ ++ size_t max_child_index = child_index; ++ for (size_t i = 1; i < children_count; ++i) { ++ if (!less_comparer(less_comparer_ctx, ++ _gheap_get_item_ptr(ctx, base, child_index + i), ++ _gheap_get_item_ptr(ctx, base, max_child_index))) { ++ max_child_index = child_index + i; ++ } ++ } ++ item_mover(_gheap_get_item_ptr(ctx, base, hole_index), ++ _gheap_get_item_ptr(ctx, base, max_child_index)); ++ return max_child_index; ++} ++ ++/* ++ * Sifts the given item down in the heap of the given size starting ++ * from the hole_index. ++ */ ++static inline void _gheap_sift_down(const struct gheap_ctx *const ctx, ++ void *const base, const size_t heap_size, size_t hole_index, ++ const void *const item) ++{ ++ assert(heap_size > 0); ++ assert(hole_index < heap_size); ++ ++ const size_t fanout = ctx->fanout; ++ ++ const size_t root_index = hole_index; ++ const size_t last_full_index = heap_size - (heap_size - 1) % fanout; ++ while (1) { ++ const size_t child_index = gheap_get_child_index(ctx, hole_index); ++ if (child_index >= last_full_index) { ++ if (child_index < heap_size) { ++ assert(child_index == last_full_index); ++ hole_index = _gheap_move_up_max_child(ctx, base, ++ heap_size - child_index, hole_index, child_index); ++ } ++ break; ++ } ++ assert(heap_size - child_index >= fanout); ++ hole_index = _gheap_move_up_max_child(ctx, base, fanout, hole_index, ++ child_index); ++ } ++ _gheap_sift_up(ctx, base, root_index, hole_index, item); ++} ++ ++/* ++ * Pops the maximum item from the heap [base[0] ... base[heap_size-1]] ++ * into base[heap_size]. 
++ */ ++static inline void _gheap_pop_max_item(const struct gheap_ctx *const ctx, ++ void *const base, const size_t heap_size) ++{ ++ void *const hole = _gheap_get_item_ptr(ctx, base, heap_size); ++ gheap_swap_max_item(ctx, base, heap_size, hole); ++} ++ ++static inline size_t gheap_is_heap_until(const struct gheap_ctx *const ctx, ++ const void *const base, const size_t heap_size) ++{ ++ const gheap_less_comparer_t less_comparer = ctx->less_comparer; ++ const void *const less_comparer_ctx = ctx->less_comparer_ctx; ++ ++ for (size_t u = 1; u < heap_size; ++u) { ++ const size_t v = gheap_get_parent_index(ctx, u); ++ const void *const a = _gheap_get_item_ptr(ctx, base, v); ++ const void *const b = _gheap_get_item_ptr(ctx, base, u); ++ if (less_comparer(less_comparer_ctx, a, b)) { ++ return u; ++ } ++ } ++ return heap_size; ++} ++ ++static inline int gheap_is_heap(const struct gheap_ctx *const ctx, ++ const void *const base, const size_t heap_size) ++{ ++ return (gheap_is_heap_until(ctx, base, heap_size) == heap_size); ++} ++ ++static inline void gheap_make_heap(const struct gheap_ctx *const ctx, ++ void *const base, const size_t heap_size) ++{ ++ const size_t fanout = ctx->fanout; ++ const size_t page_chunks = ctx->page_chunks; ++ const size_t item_size = ctx->item_size; ++ const gheap_item_mover_t item_mover = ctx->item_mover; ++ ++ if (heap_size > 1) { ++ /* Skip leaf nodes without children. This is easy to do for non-paged heap, ++ * i.e. when page_chunks = 1, but it is difficult for paged heaps. ++ * So leaf nodes in paged heaps are visited anyway. ++ */ ++ size_t i = (page_chunks == 1) ? ((heap_size - 2) / fanout) : ++ (heap_size - 2); ++ do { ++ char tmp[item_size]; ++ item_mover(tmp, _gheap_get_item_ptr(ctx, base, i)); ++ _gheap_sift_down(ctx, base, heap_size, i, tmp); ++ } while (i-- > 0); ++ } ++ ++ assert(gheap_is_heap(ctx, base, heap_size)); ++} ++ ++static inline void gheap_push_heap(const struct gheap_ctx *const ctx, ++ void *const base, const size_t heap_size) ++{ ++ assert(heap_size > 0); ++ assert(gheap_is_heap(ctx, base, heap_size - 1)); ++ ++ const size_t item_size = ctx->item_size; ++ const gheap_item_mover_t item_mover = ctx->item_mover; ++ ++ if (heap_size > 1) { ++ const size_t u = heap_size - 1; ++ char tmp[item_size]; ++ item_mover(tmp, _gheap_get_item_ptr(ctx, base, u)); ++ _gheap_sift_up(ctx, base, 0, u, tmp); ++ } ++ ++ assert(gheap_is_heap(ctx, base, heap_size)); ++} ++ ++static inline void gheap_pop_heap(const struct gheap_ctx *const ctx, ++ void *const base, const size_t heap_size) ++{ ++ assert(heap_size > 0); ++ assert(gheap_is_heap(ctx, base, heap_size)); ++ ++ if (heap_size > 1) { ++ _gheap_pop_max_item(ctx, base, heap_size - 1); ++ } ++ ++ assert(gheap_is_heap(ctx, base, heap_size - 1)); ++} ++ ++static inline void gheap_sort_heap(const struct gheap_ctx *const ctx, ++ void *const base, const size_t heap_size) ++{ ++ for (size_t i = heap_size; i > 1; --i) { ++ _gheap_pop_max_item(ctx, base, i - 1); ++ } ++} ++ ++static inline void gheap_swap_max_item(const struct gheap_ctx *const ctx, ++ void *const base, const size_t heap_size, void *item) ++{ ++ assert(heap_size > 0); ++ assert(gheap_is_heap(ctx, base, heap_size)); ++ ++ const size_t item_size = ctx->item_size; ++ const gheap_item_mover_t item_mover = ctx->item_mover; ++ ++ char tmp[item_size]; ++ item_mover(tmp, item); ++ item_mover(item, base); ++ _gheap_sift_down(ctx, base, heap_size, 0, tmp); ++ ++ assert(gheap_is_heap(ctx, base, heap_size)); ++} ++ ++static inline void 
gheap_restore_heap_after_item_increase( ++ const struct gheap_ctx *const ctx, ++ void *const base, const size_t heap_size, size_t modified_item_index) ++{ ++ assert(heap_size > 0); ++ assert(modified_item_index < heap_size); ++ assert(gheap_is_heap(ctx, base, modified_item_index)); ++ ++ const size_t item_size = ctx->item_size; ++ const gheap_item_mover_t item_mover = ctx->item_mover; ++ ++ if (modified_item_index > 0) { ++ char tmp[item_size]; ++ item_mover(tmp, _gheap_get_item_ptr(ctx, base, modified_item_index)); ++ _gheap_sift_up(ctx, base, 0, modified_item_index, tmp); ++ } ++ ++ assert(gheap_is_heap(ctx, base, heap_size)); ++ (void)heap_size; ++} ++ ++static inline void gheap_restore_heap_after_item_decrease( ++ const struct gheap_ctx *const ctx, ++ void *const base, const size_t heap_size, size_t modified_item_index) ++{ ++ assert(heap_size > 0); ++ assert(modified_item_index < heap_size); ++ assert(gheap_is_heap(ctx, base, modified_item_index)); ++ ++ const size_t item_size = ctx->item_size; ++ const gheap_item_mover_t item_mover = ctx->item_mover; ++ ++ char tmp[item_size]; ++ item_mover(tmp, _gheap_get_item_ptr(ctx, base, modified_item_index)); ++ _gheap_sift_down(ctx, base, heap_size, modified_item_index, tmp); ++ ++ assert(gheap_is_heap(ctx, base, heap_size)); ++} ++ ++static inline void gheap_remove_from_heap(const struct gheap_ctx *const ctx, ++ void *const base, const size_t heap_size, size_t item_index) ++{ ++ assert(heap_size > 0); ++ assert(item_index < heap_size); ++ assert(gheap_is_heap(ctx, base, heap_size)); ++ ++ const size_t item_size = ctx->item_size; ++ const gheap_less_comparer_t less_comparer = ctx->less_comparer; ++ const void *const less_comparer_ctx = ctx->less_comparer_ctx; ++ const gheap_item_mover_t item_mover = ctx->item_mover; ++ ++ const size_t new_heap_size = heap_size - 1; ++ if (item_index < new_heap_size) { ++ char tmp[item_size]; ++ void *const hole = _gheap_get_item_ptr(ctx, base, new_heap_size); ++ item_mover(tmp, hole); ++ item_mover(hole, _gheap_get_item_ptr(ctx, base, item_index)); ++ if (less_comparer(less_comparer_ctx, tmp, hole)) { ++ _gheap_sift_down(ctx, base, new_heap_size, item_index, tmp); ++ } ++ else { ++ _gheap_sift_up(ctx, base, 0, item_index, tmp); ++ } ++ } ++ ++ assert(gheap_is_heap(ctx, base, new_heap_size)); ++} ++ ++#endif + +commit 5b39471380a238469c8fc18136f12600e5e9aec7 +gpg: Signature made Sat 12 Aug 2023 02:49:21 PM EDT +gpg: using RSA key 5D38116FA4D3A7CC77E378D37E83610126DCC2E8 +gpg: Good signature from "Frank Ch. Eigler " [full] +Author: Frank Ch. Eigler +Date: Mon Jul 31 14:06:57 2023 -0400 + + PR29108 / BZ2095359: rewrite staprun serializer logic + + Logic in commit cd48874296e00 (2021, PR28449) fixed broken cross-cpu + message ordering that followed previous transport concurrency fixes, + but imposed a lot of userspace synchronization delays upon the threads + who were supposed to drain messages from the kernel relayfs streams as + fast as possible. This has led to unnecessarily lossy output overall. + + New code uses a new many-writers single-reader data structure, a mutex + protected heap. All the per-cpu readers copy & pump messages into + that heap as rapidly as possible, sorted by the generally monotonic + sequence number. The reader is signalled via a condition variable and + time to print & release messages in sequence number order. It also + handles lost messages (jumps in the sequence numbers) by waiting a while + to let the stragglers come in. 
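+
+    To make the new flow concrete, here is a minimal sketch of that
+    many-writers/single-reader pattern (illustrative only, not code from
+    this patch: a flat array stands in for the gheap-managed heap, and
+    enqueue/drain_pass are made-up names):
+
+        #include <pthread.h>
+        #include <stdio.h>
+        #include <time.h>
+
+        struct msg { unsigned seq; time_t received; const char *text; };
+
+        static struct msg pool[64];   /* toy stand-in for the heap */
+        static unsigned pool_size;
+        static unsigned next_seq = 1; /* last_sequence_number + 1, in effect */
+        static unsigned long lost;
+        static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
+        static pthread_cond_t c = PTHREAD_COND_INITIALIZER;
+
+        /* per-cpu reader side: enqueue one decoded message, wake drainer */
+        void enqueue(unsigned seq, const char *text)
+        {
+            pthread_mutex_lock(&m);
+            pool[pool_size++] = (struct msg){ seq, time(NULL), text };
+            pthread_mutex_unlock(&m);
+            pthread_cond_broadcast(&c);
+        }
+
+        /* serializer side: one pass; print every message that is either
+           next in sequence or has waited ~2s behind a presumed-lost gap */
+        void drain_pass(void)
+        {
+            struct timespec ts = { .tv_sec = time(NULL) + 1, .tv_nsec = 0 };
+            pthread_mutex_lock(&m);
+            pthread_cond_timedwait(&c, &m, &ts); /* bounded nap */
+            while (pool_size > 0) {
+                unsigned i, min = 0;
+                for (i = 1; i < pool_size; i++)  /* linear min; gheap is O(log n) */
+                    if (pool[i].seq < pool[min].seq)
+                        min = i;
+                if (pool[min].seq != next_seq &&
+                    pool[min].received + 2 > time(NULL))
+                    break;                       /* stragglers may still come */
+                if (pool[min].seq > next_seq)
+                    lost += pool[min].seq - next_seq;
+                printf("%u: %s\n", pool[min].seq, pool[min].text);
+                next_seq = pool[min].seq + 1;
+                pool[min] = pool[--pool_size];   /* compact */
+            }
+            pthread_mutex_unlock(&m);
+        }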
+ + The kernel-user messages now also include a framing sequence to allow + the per-cpu readers to resynchronize to the message boundaries, in + case some sort of buffer overflow or something else occurs. It + reports how many bytes and/or messages were skipped in order to + resynchronize. It does so in a lot less lossy way than previous code, + which just tried to flush everything then-currently available, hoping + that it'd match message boundaries. + + Unfortunately, this means that the user-kernel message ABI has + changed! Previous-version staprun instances won't work with the new + modules, nor will current-version staprun with old modules. This flag + day is enforced by changing the numbers of the various ctl message + numbers, so old/new kernel/user combinations will generate errors + rather than quasi-successful staprun startup. + + New code also dramatically simplifies the use of signals in staprun + (or rather stapio). Gone is the signal thread, a lot of the + masking/blocking/waiting. Instead a single basic signal handler just + increments globals when signals of various kinds arrive, and all the + per-cpu etc. threads poll those globals periodically. This includes + logic needed for -S (output file rotation on SIGUSR2) as well as + flight recorder (-L / -A) modes. + + The reader_timeout_ms value (-T) in both bulk/serialized mode for all + ppoll timeouts, to prevent those threads from sleeping indefinitely, + now that they won't be bothered by signals. + +diff --git a/configure b/configure +index 974cc2c81..1ff5580b4 100755 +--- a/configure ++++ b/configure +@@ -12694,6 +12694,14 @@ printf "%s\n" "$as_me: WARNING: cannot find librpmio" >&2;} + fi + fi + ++ac_fn_c_check_header_compile "$LINENO" "stdatomic.h" "ac_cv_header_stdatomic_h" "$ac_includes_default" ++if test "x$ac_cv_header_stdatomic_h" = xyes ++then : ++ printf "%s\n" "#define HAVE_STDATOMIC_H 1" >>confdefs.h ++ ++fi ++ ++ + for ac_header in rpm/rpmcrypto.h + do : + ac_fn_c_check_header_compile "$LINENO" "rpm/rpmcrypto.h" "ac_cv_header_rpm_rpmcrypto_h" "$ac_includes_default" +diff --git a/configure.ac b/configure.ac +index 3f184f862..e9176b725 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -490,6 +490,8 @@ if test "$with_rpm" != "no"; then + fi + fi + ++AC_CHECK_HEADERS([stdatomic.h]) ++ + dnl Look for rpmcrypto.h + AC_CHECK_HEADERS([rpm/rpmcrypto.h], [ + AC_DEFINE([HAVE_RPMCRYPTO_H],[1],[have rpmcrypto_h]) +diff --git a/man/stap.1.in b/man/stap.1.in +index 4e1f0a537..c1a81fef3 100644 +--- a/man/stap.1.in ++++ b/man/stap.1.in +@@ -388,7 +388,7 @@ With \-o option, run staprun in background as a daemon and show its pid. + Sets the maximum size of output file and the maximum number of output files. + If the size of output file will exceed + .B size +-, systemtap switches output file to the next file. And if the number of ++megabytes, systemtap switches output file to the next file. And if the number of + output files exceed + .B N + , systemtap removes the oldest output file. You can omit the second argument. 
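+
+    The framing recovery described in the commit message reduces to a
+    byte-shift scan for the 4-byte magic, along these lines (illustrative
+    sketch only; stream_read is a made-up stand-in for read(2) on the
+    relay fd):
+
+        #include <stdint.h>
+        #include <string.h>
+
+        #define STAP_TRACE_MAGIC "\xF0\x9F\xA9\xBA"
+
+        struct _stp_trace {          /* as in transport_msgs.h below */
+            char magic[4];
+            uint32_t sequence;
+            uint32_t pdu_len;
+        };
+
+        union hdr {
+            struct _stp_trace h;
+            char raw[sizeof(struct _stp_trace)];
+        };
+
+        extern int stream_read(void *buf, int len);  /* stand-in */
+        extern unsigned long lost_byte_count;
+
+        /* On a magic mismatch, drop one byte, slide the header down,
+           and pull in one more byte until the magic lines up again.
+           Returns 0 on resync, -1 on stream end. */
+        int resync_header(union hdr *u)
+        {
+            while (memcmp(u->h.magic, STAP_TRACE_MAGIC, 4) != 0) {
+                lost_byte_count++;
+                memmove(&u->raw[0], &u->raw[1], sizeof(u->raw) - 1);
+                if (stream_read(&u->raw[sizeof(u->raw) - 1], 1) <= 0)
+                    return -1;
+            }
+            return 0;
+        }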
+diff --git a/runtime/print_flush.c b/runtime/print_flush.c +index 35677b225..4141f95b9 100644 +--- a/runtime/print_flush.c ++++ b/runtime/print_flush.c +@@ -43,6 +43,7 @@ static void __stp_print_flush(struct _stp_log *log) + if (likely(entry && bytes_reserved > hlen)) { + /* copy new _stp_trace_ header */ + struct _stp_trace t = { ++ .magic = STAP_TRACE_MAGIC, + .sequence = _stp_seq_inc(), + .pdu_len = len + }; +diff --git a/runtime/transport/control.c b/runtime/transport/control.c +index 3d7333403..d0a8bdf53 100644 +--- a/runtime/transport/control.c ++++ b/runtime/transport/control.c +@@ -57,7 +57,7 @@ static ssize_t _stp_ctl_write_cmd(struct file *file, const char __user *buf, siz + + #if defined(DEBUG_TRANS) && (DEBUG_TRANS >= 2) + if (type < STP_MAX_CMD) +- dbug_trans2("Got %s. euid=%ld, len=%d\n", _stp_command_name[type], ++ dbug_trans2("Got %s. euid=%ld, len=%d\n", _stp_command_name[min(type,STP_MAX_CMD)] ?: "?", + (long)euid, (int)count); + #endif + +@@ -211,7 +211,9 @@ out: + + #if defined(DEBUG_TRANS) && (DEBUG_TRANS >= 2) + if (type < STP_MAX_CMD) +- dbug_trans2("Completed %s (rc=%d)\n", _stp_command_name[type], rc); ++ dbug_trans2("Completed %s (rc=%d)\n", ++ _stp_command_name[min(type,STP_MAX_CMD)] ?: "?", ++ rc); + #endif + return rc; + } +diff --git a/runtime/transport/transport_msgs.h b/runtime/transport/transport_msgs.h +index 9e0081c80..e3aa995b1 100644 +--- a/runtime/transport/transport_msgs.h ++++ b/runtime/transport/transport_msgs.h +@@ -1,7 +1,7 @@ + /* -*- linux-c -*- + * transport_msgs.h - messages exchanged between module and userspace + * +- * Copyright (C) Red Hat Inc, 2006-2011 ++ * Copyright (C) Red Hat Inc, 2006-2023 + * + * This file is part of systemtap, and is free software. You can + * redistribute it and/or modify it under the terms of the GNU General +@@ -19,7 +19,9 @@ + #define STP_TZ_NAME_LEN 64 + #define STP_REMOTE_URI_LEN 128 + ++#define STAP_TRACE_MAGIC "\xF0\x9F\xA9\xBA" /* unicode stethoscope 🩺 in UTF-8 */ + struct _stp_trace { ++ char magic[4]; /* framing helper */ + uint32_t sequence; /* event number */ + uint32_t pdu_len; /* length of data after this trace */ + }; +@@ -30,7 +32,7 @@ enum + /** stapio sends a STP_START after recieving a STP_TRANSPORT from + the module. The module sends STP_START back with result of call + systemtap_module_init() which will install all initial probes. */ +- STP_START, ++ STP_START = 0x50, // renumbered in version 5.0 to force incompatibility + /** stapio sends STP_EXIT to signal it wants to stop the module + itself or in response to receiving a STP_REQUEST_EXIT. + The module sends STP_EXIT once _stp_clean_and_exit has been +@@ -87,16 +89,21 @@ enum + /** Send by staprun to notify module of remote identity, if any. + Only send once at startup. */ + STP_REMOTE_ID, ++ /** Placeholder, it was mistakenly labeled STP_MAX_CMD */ ++ STP_MAX_CMD_PLACEHOLDER, ++ /** Sent by stapio after having recevied STP_TRANSPORT. Notifies ++ the module of the target namespaces pid.*/ ++ STP_NAMESPACES_PID, ++ ++ /** INSERT NEW MESSAGE TYPES HERE */ ++ + /** Max number of message types, sanity check only. */ + STP_MAX_CMD, +- /** Sent by stapio after having recevied STP_TRANSPORT. 
Notifies +- the module of the target namespaces pid.*/ +- STP_NAMESPACES_PID + }; + + #ifdef DEBUG_TRANS +-static const char *_stp_command_name[] = { +- "STP_START", ++static const char *_stp_command_name[STP_MAX_CMD] = { ++ [STP_START]="STP_START", + "STP_EXIT", + "STP_OOB_DATA", + "STP_SYSTEM", +@@ -113,7 +120,9 @@ static const char *_stp_command_name[] = { + "STP_TZINFO", + "STP_PRIVILEGE_CREDENTIALS", + "STP_REMOTE_ID", +- "STP_NAMESPACES_PID", ++ "STP_MAX_CMD_PLACEHOLDER", ++ "STP_NAMESPACE_PID", ++ [STP_MAX_CMD]="?" /* in control.c, STP_MAX_CMD represents unknown message numbers/names */ + }; + #endif /* DEBUG_TRANS */ + +diff --git a/staprun/common.c b/staprun/common.c +index 3d23d7319..f8d618e24 100644 +--- a/staprun/common.c ++++ b/staprun/common.c +@@ -115,7 +115,7 @@ void parse_args(int argc, char **argv) + target_pid = 0; + target_namespaces_pid = 0; + buffer_size = 0; +- reader_timeout_ms = 0; ++ reader_timeout_ms = 200; + target_cmd = NULL; + outfile_name = NULL; + rename_mod = 0; +diff --git a/staprun/mainloop.c b/staprun/mainloop.c +index 4af21e950..c507fc069 100644 +--- a/staprun/mainloop.c ++++ b/staprun/mainloop.c +@@ -7,7 +7,7 @@ + * Public License (GPL); either version 2, or (at your option) any + * later version. + * +- * Copyright (C) 2005-2021 Red Hat Inc. ++ * Copyright (C) 2005-2023 Red Hat Inc. + */ + + #include "staprun.h" +@@ -23,31 +23,9 @@ + + /* globals */ + int ncpus; +-static int pending_interrupts = 0; ++static volatile sig_atomic_t pending_interrupts = 0; // tells stp_main_loop to trigger STP_EXIT message to kernel + static int target_pid_failed_p = 0; + +-/* Setup by setup_main_signals, used by signal_thread to notify the +- main thread of interruptable events. */ +-static pthread_t main_thread; +- +-static void set_nonblocking_std_fds(void) +-{ +- int fd; +- for (fd = 1; fd < 3; fd++) { +- /* NB: writing to stderr/stdout blockingly in signal handler is +- * dangerous since it may prevent the stap process from quitting +- * gracefully on receiving SIGTERM/etc signals when the stderr/stdout +- * write buffer is full. PR23891 */ +- int flags = fcntl(fd, F_GETFL); +- if (flags == -1) +- continue; +- +- if (flags & O_NONBLOCK) +- continue; +- +- (void) fcntl(fd, F_SETFL, flags | O_NONBLOCK); +- } +-} + + static void set_blocking_std_fds(void) + { +@@ -77,43 +55,16 @@ static void my_exit(int rc) + _exit(rc); + } + +-static void *signal_thread(void *arg) +-{ +- sigset_t *s = (sigset_t *) arg; +- int signum = 0; + +- while (1) { +- if (sigwait(s, &signum) < 0) { +- _perr("sigwait"); +- continue; +- } ++ ++static void interrupt_handler(int signum) ++{ + if (signum == SIGQUIT) { + load_only = 1; /* flag for stp_main_loop */ +- pending_interrupts ++; +- } else if (signum == SIGINT || signum == SIGHUP || signum == SIGTERM +- || signum == SIGPIPE) +- { +- pending_interrupts ++; + } +- if (pending_interrupts > 2) { +- set_nonblocking_std_fds(); +- pthread_kill (main_thread, SIGURG); +- } +- dbug(2, "sigproc %d (%s)\n", signum, strsignal(signum)); +- } +- /* Notify main thread (interrupts select). */ +- pthread_kill (main_thread, SIGURG); +- return NULL; ++ pending_interrupts ++; + } + +-static void urg_proc(int signum) +-{ +- /* This handler is just notified from the signal_thread +- whenever an interruptable condition is detected. The +- handler itself doesn't do anything. But this will +- result select to detect an EINTR event. 
*/ +- dbug(2, "urg_proc %d (%s)\n", signum, strsignal(signum)); +-} + + static void chld_proc(int signum) + { +@@ -143,9 +94,9 @@ static void chld_proc(int signum) + (void) rc; /* XXX: notused */ + } + ++ + static void setup_main_signals(void) + { +- pthread_t tid; + struct sigaction sa; + sigset_t *s = malloc(sizeof(*s)); + if (!s) { +@@ -153,25 +104,11 @@ static void setup_main_signals(void) + exit(1); + } + +- /* The main thread will only handle SIGCHLD and SIGURG. +- SIGURG is send from the signal thread in case the interrupt +- flag is set. This will then interrupt any select call. */ +- main_thread = pthread_self(); +- sigfillset(s); +- pthread_sigmask(SIG_SETMASK, s, NULL); +- + memset(&sa, 0, sizeof(sa)); + /* select will report EINTR even when SA_RESTART is set. */ + sa.sa_flags = SA_RESTART; + sigfillset(&sa.sa_mask); + +- /* Ignore all these events on the main thread. */ +- sa.sa_handler = SIG_IGN; +- sigaction(SIGINT, &sa, NULL); +- sigaction(SIGTERM, &sa, NULL); +- sigaction(SIGHUP, &sa, NULL); +- sigaction(SIGQUIT, &sa, NULL); +- + /* This is to notify when our child process (-c) ends. */ + sa.sa_handler = chld_proc; + sigaction(SIGCHLD, &sa, NULL); +@@ -182,26 +119,21 @@ static void setup_main_signals(void) + sigaction(SIGWINCH, &sa, NULL); + } + +- /* This signal handler is notified from the signal_thread +- whenever a interruptable event is detected. It will +- result in an EINTR event for select or sleep. */ +- sa.sa_handler = urg_proc; +- sigaction(SIGURG, &sa, NULL); +- +- /* Everything else is handled on a special signal_thread. */ +- sigemptyset(s); +- sigaddset(s, SIGINT); +- sigaddset(s, SIGTERM); +- sigaddset(s, SIGHUP); +- sigaddset(s, SIGQUIT); +- sigaddset(s, SIGPIPE); +- pthread_sigmask(SIG_SETMASK, s, NULL); +- if (pthread_create(&tid, NULL, signal_thread, s) < 0) { +- _perr(_("failed to create thread")); +- exit(1); +- } ++ // listen to these signals via general interrupt handler in whatever thread ++ memset(&sa, 0, sizeof(sa)); ++ sa.sa_flags = SA_RESTART; ++ sigfillset(&sa.sa_mask); ++ ++ sa.sa_handler = interrupt_handler; ++ sigaction(SIGINT, &sa, NULL); ++ sigaction(SIGTERM, &sa, NULL); ++ sigaction(SIGHUP, &sa, NULL); ++ sigaction(SIGQUIT, &sa, NULL); ++ ++ // Formerly, we had a signal catching thread. + } + ++ + /** + * system_cmd() executes system commands in response + * to an STP_SYSTEM message from the module. These +diff --git a/staprun/relay.c b/staprun/relay.c +index dea1d5ae9..08850b246 100644 +--- a/staprun/relay.c ++++ b/staprun/relay.c +@@ -7,30 +7,32 @@ + * Public License (GPL); either version 2, or (at your option) any + * later version. + * +- * Copyright (C) 2007-2013 Red Hat Inc. ++ * Copyright (C) 2007-2023 Red Hat Inc. 
+ */ + + #include "staprun.h" ++#include ++#ifdef HAVE_STDATOMIC_H ++#include ++#endif ++#define NDEBUG ++#include "gheap.h" ++ + + int out_fd[MAX_NR_CPUS]; + int monitor_end = 0; + static pthread_t reader[MAX_NR_CPUS]; +-static int relay_fd[MAX_NR_CPUS]; ++static int relay_fd[MAX_NR_CPUS]; // fd to kernel per-cpu relayfs + static int avail_cpus[MAX_NR_CPUS]; +-static int switch_file[MAX_NR_CPUS]; +-static pthread_mutex_t mutex[MAX_NR_CPUS]; ++static volatile sig_atomic_t sigusr2_count; // number of SIGUSR2's received by process ++static int sigusr2_processed[MAX_NR_CPUS]; // each thread's count of processed SIGUSR2's + static int bulkmode = 0; +-static volatile int stop_threads = 0; ++static volatile int stop_threads = 0; // set during relayfs_close to signal threads to die + static time_t *time_backlog[MAX_NR_CPUS]; + static int backlog_order=0; + #define BACKLOG_MASK ((1 << backlog_order) - 1) + #define MONITORLINELENGTH 4096 + +-/* tracking message sequence #s for cross-cpu merging */ +-static uint32_t last_sequence_number; +-static pthread_mutex_t last_sequence_number_mutex = PTHREAD_MUTEX_INITIALIZER; +-static pthread_cond_t last_sequence_number_changed = PTHREAD_COND_INITIALIZER; +- + #ifdef NEED_PPOLL + int ppoll(struct pollfd *fds, nfds_t nfds, + const struct timespec *timeout, const sigset_t *sigmask) +@@ -123,18 +125,375 @@ static int switch_outfile(int cpu, int *fnum) + return 0; + } + ++ ++ ++/* In serialized (non-bulk) output mode, ndividual messages that have ++ been received from the kernel per-cpu relays are stored in an central ++ serializing data structure - in this case, a heap. They are ordered ++ by message sequence number. An additional thread (serializer_thread) ++ scans & sequences the output. */ ++struct serialized_message { ++ union { ++ struct _stp_trace bufhdr; ++ char bufhdr_raw[sizeof(struct _stp_trace)]; ++ }; ++ time_t received; // timestamp when this message was enqueued ++ char *buf; // malloc()'d size >= rounded(bufhdr.pdu_len) ++}; ++static struct serialized_message* buffer_heap = NULL; // the heap ++ ++// NB: we control memory via realloc(), gheap just manipulates entries in place ++static unsigned buffer_heap_size = 0; // used number of entries ++static unsigned buffer_heap_alloc = 0; // allocation length, always >= buffer_heap_size ++static unsigned last_sequence_number = 0; // last processed sequential message number ++ ++#ifdef HAVE_STDATOMIC_H ++static atomic_ulong lost_message_count = 0; // how many sequence numbers we know we missed ++static atomic_ulong lost_byte_count = 0; // how many bytes were skipped during resync ++#else ++static unsigned long lost_message_count = 0; // how many sequence numbers we know we missed ++static unsigned long lost_byte_count = 0; // how many bytes were skipped during resync ++#endif ++ ++// concurrency control for the buffer_heap ++static pthread_cond_t buffer_heap_cond = PTHREAD_COND_INITIALIZER; ++static pthread_mutex_t buffer_heap_mutex = PTHREAD_MUTEX_INITIALIZER; ++static pthread_t serializer_thread; // ! bulkmode only ++ ++ ++static void buffer_heap_mover (void *const dest, const void *const src) ++{ ++ memmove (dest, src, sizeof(struct serialized_message)); ++} ++ ++// NB: since we want to sort messages into an INCREASING heap sequence, ++// we reverse the normal comparison operator. gheap_pop_heap() should ++// therefore return the SMALLEST element. 
++static int buffer_heap_comparer (const void *const ctx, const void *a, const void *b) ++{ ++ (void) ctx; ++ uint32_t aa = ((struct serialized_message *)a)->bufhdr.sequence; ++ uint32_t bb = ((struct serialized_message *)b)->bufhdr.sequence; ++ return (aa > bb); ++} ++ ++static const struct gheap_ctx buffer_heap_ctx = { ++ .item_size = sizeof(struct serialized_message), ++ .less_comparer = buffer_heap_comparer, ++ .item_mover = buffer_heap_mover, ++ .page_chunks = 16, // arbitrary ++ .fanout = 2 // standard binary heap ++}; ++ ++ ++#define MAX_MESSAGE_LENGTH (128*1024) /* maximum likely length of a single pdu */ ++ ++ ++ ++/* Thread that reads per-cpu messages, and stuffs complete ones into ++ dynamically allocated serialized_message nodes in a binary tree. */ ++static void* reader_thread_serialmode (void *data) ++{ ++ int rc, cpu = (int)(long)data; ++ struct pollfd pollfd; ++ sigset_t sigs; ++ cpu_set_t cpu_mask; ++ ++ sigemptyset(&sigs); ++ sigaddset(&sigs,SIGUSR2); ++ pthread_sigmask(SIG_BLOCK, &sigs, NULL); ++ ++ sigfillset(&sigs); ++ sigdelset(&sigs,SIGUSR2); ++ ++ CPU_ZERO(&cpu_mask); ++ CPU_SET(cpu, &cpu_mask); ++ if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 ) ++ _perr("sched_setaffinity"); ++ ++ pollfd.fd = relay_fd[cpu]; ++ pollfd.events = POLLIN; ++ ++ while (! stop_threads) { ++ // read a message header ++ struct serialized_message message; ++ ++ /* 200ms, close to human level of "instant" */ ++ struct timespec tim, *timeout = &tim; ++ timeout->tv_sec = reader_timeout_ms / 1000; ++ timeout->tv_nsec = (reader_timeout_ms - timeout->tv_sec * 1000) * 1000000; ++ ++ rc = ppoll(&pollfd, 1, timeout, &sigs); ++ if (rc < 0) { ++ dbug(3, "cpu=%d poll=%d errno=%d\n", cpu, rc, errno); ++ if (errno == EINTR) { ++ if (stop_threads) ++ break; ++ } else { ++ _perr("poll error"); ++ goto error_out; ++ } ++ } ++ ++ // set the timestamp ++ message.received = time(NULL); ++ ++ /* Read the header. */ ++ rc = read(relay_fd[cpu], &message.bufhdr, sizeof(message.bufhdr)); ++ if (rc <= 0) /* seen during normal shutdown or error */ ++ continue; ++ if (rc != sizeof(message.bufhdr)) { ++ lost_byte_count += rc; ++ continue; ++ } ++ ++ /* Validate the magic value. In case of mismatch, ++ keep on reading & shifting the header, one byte at ++ a time, until we get a match. */ ++ while (! stop_threads && memcmp(message.bufhdr.magic, STAP_TRACE_MAGIC, 4)) { ++ lost_byte_count ++; ++ memmove(& message.bufhdr_raw[0], ++ & message.bufhdr_raw[1], ++ sizeof(message.bufhdr_raw)-1); ++ rc = read(relay_fd[cpu], ++ &message.bufhdr_raw[sizeof(message.bufhdr_raw)-1], ++ 1); ++ if (rc <= 0) /* seen during normal shutdown or error */ ++ break; ++ } ++ ++ /* Validate it slightly. Because of lost messages, we might be getting ++ not a proper _stp_trace struct but the interior of some piece of ++ trace text message. XXX: validate bufhdr.sequence a little bit too? */ ++ if (message.bufhdr.pdu_len == 0 || ++ message.bufhdr.pdu_len > MAX_MESSAGE_LENGTH) { ++ lost_byte_count += sizeof(message.bufhdr); ++ continue; ++ } ++ ++ // Allocate the pdu body ++ message.buf = malloc(message.bufhdr.pdu_len); ++ if (message.buf == NULL) ++ { ++ lost_byte_count += message.bufhdr.pdu_len; ++ continue; ++ } ++ ++ /* Read the message, perhaps in pieces (such as if crossing ++ * relayfs subbuf boundaries). 
*/ ++ size_t bufread = 0; ++ while (bufread < message.bufhdr.pdu_len) { ++ rc = read(relay_fd[cpu], message.buf+bufread, message.bufhdr.pdu_len-bufread); ++ if (rc <= 0) { ++ lost_byte_count += message.bufhdr.pdu_len-bufread; ++ break; /* still process it; hope we can resync at next packet. */ ++ } ++ bufread += rc; ++ } ++ ++ // plop the message into the buffer_heap ++ pthread_mutex_lock(& buffer_heap_mutex); ++ if (message.bufhdr.sequence < last_sequence_number) { ++ // whoa! is this some old message that we've assumed lost? ++ // or are we wrapping around the uint_32 sequence numbers? ++ _perr("unexpected sequence=%u", message.bufhdr.sequence); ++ } ++ ++ // is it large enough? if not, realloc ++ if (buffer_heap_alloc - buffer_heap_size == 0) { // full ++ unsigned new_buffer_heap_alloc = (buffer_heap_alloc + 1) * 1.5; ++ struct serialized_message *new_buffer_heap = ++ realloc(buffer_heap, ++ new_buffer_heap_alloc * sizeof(struct serialized_message)); ++ if (new_buffer_heap == NULL) { ++ _perr("out of memory while enlarging buffer heap"); ++ free (message.buf); ++ lost_message_count ++; ++ pthread_mutex_unlock(& buffer_heap_mutex); ++ continue; ++ } ++ buffer_heap = new_buffer_heap; ++ buffer_heap_alloc = new_buffer_heap_alloc; ++ } ++ // plop copy of message struct into slot at end of heap ++ buffer_heap[buffer_heap_size++] = message; ++ // push it into heap ++ gheap_push_heap(&buffer_heap_ctx, ++ buffer_heap, ++ buffer_heap_size); ++ // and c'est tout ++ pthread_mutex_unlock(& buffer_heap_mutex); ++ pthread_cond_broadcast (& buffer_heap_cond); ++ dbug(3, "thread %d received seq=%u\n", cpu, message.bufhdr.sequence); ++ } ++ ++ dbug(3, "exiting thread for cpu %d\n", cpu); ++ return NULL; ++ ++error_out: ++ /* Signal the main thread that we need to quit */ ++ kill(getpid(), SIGTERM); ++ dbug(2, "exiting thread for cpu %d after error\n", cpu); ++ ++ return NULL; ++} ++ ++ ++// Print and free buffer of given serialized message. ++static void print_serialized_message (struct serialized_message *msg) ++{ ++ // check if file switching is necessary, as per staprun -S ++ ++ // NB: unlike reader_thread_bulkmode(), we don't need to use ++ // mutexes to protect switch_file[] or such, because we're the ++ // ONLY thread doing output. ++ unsigned cpu = 0; // arbitrary ++ static ssize_t wsize = 0; // how many bytes we've written into the serialized file so far ++ static int fnum = 0; // which file number we're using ++ ++ if ((fsize_max && (wsize > fsize_max)) || ++ (sigusr2_count > sigusr2_processed[cpu])) { ++ dbug(2, "switching output file wsize=%ld fsize_max=%ld sigusr2 %d > %d\n", ++ wsize, fsize_max, sigusr2_count, sigusr2_processed[cpu]); ++ sigusr2_processed[cpu] = sigusr2_count; ++ if (switch_outfile(cpu, &fnum) < 0) { ++ perr("unable to switch output file"); ++ // but continue ++ } ++ wsize = 0; ++ } ++ ++ ++ // write loop ... could block if e.g. the output disk is slow ++ // or the user hits a ^S (XOFF) on the tty ++ ssize_t sent = 0; ++ do { ++ ssize_t ret = write (out_fd[avail_cpus[0]], ++ msg->buf+sent, msg->bufhdr.pdu_len-sent); ++ if (ret <= 0) { ++ perr("error writing output"); ++ break; ++ } ++ sent += ret; ++ } while ((unsigned)sent < msg->bufhdr.pdu_len); ++ wsize += sent; ++ ++ // free the associated buffer ++ free (msg->buf); ++ msg->buf = NULL; ++} ++ ++ ++/* Thread that checks on the heap of messages, and pumps them out to ++ the designated output fd in sequence. It waits, but only a little ++ while, if it has only fresher messages than it's expecting. 
It ++ exits upon a global stop_threads indication. ++*/ ++static void* reader_thread_serializer (void *data) { ++ (void) data; ++ while (! stop_threads) { ++ /* timeout 0-1 seconds; this is the maximum extra time that ++ stapio will be waiting after a ^C */ ++ struct timespec ts = {.tv_sec=time(NULL)+1, .tv_nsec=0}; ++ int rc; ++ pthread_mutex_lock(& buffer_heap_mutex); ++ rc = pthread_cond_timedwait (& buffer_heap_cond, ++ & buffer_heap_mutex, ++ & ts); ++ ++ dbug(3, "serializer cond wait rc=%d heapsize=%u\n", rc, buffer_heap_size); ++ time_t now = time(NULL); ++ unsigned processed = 0; ++ while (buffer_heap_size > 0) { // consume as much as possible ++ // check out the sequence# of the first element ++ uint32_t buf_min_seq = buffer_heap[0].bufhdr.sequence; ++ ++ dbug(3, "serializer last=%u seq=%u\n", last_sequence_number, buf_min_seq); ++ ++ if ((buf_min_seq == last_sequence_number + 1) || // expected seq# ++ (buffer_heap[0].received + 2 <= now)) { // message too old ++ // "we've got one!" -- or waited too long for one ++ // get it off the head of the heap ++ gheap_pop_heap(&buffer_heap_ctx, ++ buffer_heap, ++ buffer_heap_size); ++ buffer_heap_size --; // becomes index where the head was moved ++ processed ++; ++ ++ // take a copy of the whole message ++ struct serialized_message msg = buffer_heap[buffer_heap_size]; ++ ++ // paranoid clear this field of the now-unused slot ++ buffer_heap[buffer_heap_size].buf = NULL; ++ // update statistics ++ if (attach_mod == 1 && last_sequence_number == 0) // first message after staprun -A ++ ; // do not penalize it with lost messages ++ else ++ lost_message_count += (buf_min_seq - last_sequence_number - 1); ++ last_sequence_number = buf_min_seq; ++ ++ // unlock the mutex, permitting ++ // reader_thread_serialmode threads to ++ // resume piling messages into the ++ // heap while we print stuff ++ pthread_mutex_unlock(& buffer_heap_mutex); ++ ++ print_serialized_message (& msg); ++ ++ // must re-take lock for next iteration of the while loop ++ pthread_mutex_lock(& buffer_heap_mutex); ++ } else { ++ // processed as much of the heap as we ++ // could this time; wait for the ++ // condition again ++ break; ++ } ++ } ++ pthread_mutex_unlock(& buffer_heap_mutex); ++ if (processed > 0) ++ dbug(2, "serializer processed n=%u\n", processed); ++ } ++ return NULL; ++} ++ ++ ++ ++// At the end of the program main loop, flush out any the remaining ++// messages and free up all that heapy data. 
++static void reader_serialized_flush() ++{ ++ dbug(3, "serializer flushing messages=%u\n", buffer_heap_size); ++ while (buffer_heap_size > 0) { // consume it all ++ // check out the sequence# of the first element ++ uint32_t buf_min_seq = buffer_heap[0].bufhdr.sequence; ++ dbug(3, "serializer seq=%u\n", buf_min_seq); ++ gheap_pop_heap(&buffer_heap_ctx, ++ buffer_heap, ++ buffer_heap_size); ++ buffer_heap_size --; // also index where the head was moved ++ ++ // NB: no need for mutex manipulations, this is super single threaded ++ print_serialized_message (& buffer_heap[buffer_heap_size]); ++ ++ lost_message_count += (buf_min_seq - last_sequence_number - 1); ++ last_sequence_number = buf_min_seq; ++ } ++ free (buffer_heap); ++ buffer_heap = NULL; ++} ++ ++ ++ + /** +- * reader_thread - per-cpu channel buffer reader ++ * reader_thread - per-cpu channel buffer reader, bulkmode (one output file per cpu input file) + */ +-static void *reader_thread(void *data) ++static void *reader_thread_bulkmode (void *data) + { +- char buf[128*1024]; // NB: maximum possible output amount from a single probe hit's print_flush ++ char buf[MAX_MESSAGE_LENGTH]; + struct _stp_trace bufhdr; + + int rc, cpu = (int)(long)data; + struct pollfd pollfd; +- /* 200ms, close to human level of "instant" */ +- struct timespec tim = {.tv_sec=0, .tv_nsec=200000000}, *timeout = &tim; + sigset_t sigs; + off_t wsize = 0; + int fnum = 0; +@@ -151,44 +510,30 @@ static void *reader_thread(void *data) + CPU_SET(cpu, &cpu_mask); + if( sched_setaffinity( 0, sizeof(cpu_mask), &cpu_mask ) < 0 ) + _perr("sched_setaffinity"); +-#ifdef NEED_PPOLL +- /* Without a real ppoll, there is a small race condition that could */ +- /* block ppoll(). So use a timeout to prevent that. */ +- timeout->tv_sec = 10; +- timeout->tv_nsec = 0; +-#else +- timeout = NULL; +-#endif +- +- if (reader_timeout_ms && timeout) { +- timeout->tv_sec = reader_timeout_ms / 1000; +- timeout->tv_nsec = (reader_timeout_ms - timeout->tv_sec * 1000) * 1000000; +- } + + pollfd.fd = relay_fd[cpu]; + pollfd.events = POLLIN; + + do { +- dbug(3, "thread %d start ppoll\n", cpu); ++ /* 200ms, close to human level of "instant" */ ++ struct timespec tim, *timeout = &tim; ++ timeout->tv_sec = reader_timeout_ms / 1000; ++ timeout->tv_nsec = (reader_timeout_ms - timeout->tv_sec * 1000) * 1000000; ++ + rc = ppoll(&pollfd, 1, timeout, &sigs); +- dbug(3, "thread %d end ppoll:%d\n", cpu, rc); + if (rc < 0) { + dbug(3, "cpu=%d poll=%d errno=%d\n", cpu, rc, errno); + if (errno == EINTR) { + if (stop_threads) + break; + +- pthread_mutex_lock(&mutex[cpu]); +- if (switch_file[cpu]) { +- if (switch_outfile(cpu, &fnum) < 0) { +- switch_file[cpu] = 0; +- pthread_mutex_unlock(&mutex[cpu]); ++ if (sigusr2_count > sigusr2_processed[cpu]) { ++ sigusr2_processed[cpu] = sigusr2_count; ++ if (switch_outfile(cpu, &fnum) < 0) { + goto error_out; +- } +- switch_file[cpu] = 0; +- wsize = 0; ++ } ++ wsize = 0; + } +- pthread_mutex_unlock(&mutex[cpu]); + } else { + _perr("poll error"); + goto error_out; +@@ -197,7 +542,7 @@ static void *reader_thread(void *data) + + /* Read the header. */ + rc = read(relay_fd[cpu], &bufhdr, sizeof(bufhdr)); +- if (rc == 0) /* seen during normal shutdown */ ++ if (rc <= 0) /* seen during normal shutdown */ + continue; + if (rc != sizeof(bufhdr)) { + _perr("bufhdr read error, attempting resync"); +@@ -228,41 +573,20 @@ static void *reader_thread(void *data) + bufread += rc; + } + +- if (! bulkmode) { +- /* Wait until the bufhdr.sequence number indicates it's OUR TURN to go ahead. 
*/ +- struct timespec ts = {.tv_sec=time(NULL)+2, .tv_nsec=0}; /* wait 1-2 seconds */ +- pthread_mutex_lock(& last_sequence_number_mutex); +- while ((last_sequence_number+1 != bufhdr.sequence) && /* normal case */ +- (last_sequence_number < bufhdr.sequence)) { /* we're late!!! */ +- int rc = pthread_cond_timedwait (& last_sequence_number_changed, +- & last_sequence_number_mutex, +- & ts); +- if (rc == ETIMEDOUT) { +- /* _perr("message sequencing timeout"); */ +- break; +- } +- } +- pthread_mutex_unlock(& last_sequence_number_mutex); +- } +- + int wbytes = rc; + char *wbuf = buf; + + dbug(3, "cpu %d: read %d bytes of data\n", cpu, rc); + + /* Switching file */ +- pthread_mutex_lock(&mutex[cpu]); + if ((fsize_max && ((wsize + rc) > fsize_max)) || +- switch_file[cpu]) { ++ (sigusr2_count > sigusr2_processed[cpu])) { ++ sigusr2_processed[cpu] = sigusr2_count; + if (switch_outfile(cpu, &fnum) < 0) { +- switch_file[cpu] = 0; +- pthread_mutex_unlock(&mutex[cpu]); + goto error_out; + } +- switch_file[cpu] = 0; + wsize = 0; + } +- pthread_mutex_unlock(&mutex[cpu]); + + /* Copy loop. Must repeat write(2) in case of a pipe overflow + or other transient fullness. */ +@@ -291,13 +615,8 @@ static void *reader_thread(void *data) + int fd; + /* Only bulkmode and fsize_max use per-cpu output files. Otherwise, + there's just a single output fd stored at out_fd[avail_cpus[0]]. */ +- if (bulkmode || fsize_max) +- fd = out_fd[cpu]; +- else +- fd = out_fd[avail_cpus[0]]; +- rc = 0; +- if (bulkmode) +- rc = write(fd, &bufhdr, sizeof(bufhdr)); // write header ++ fd = out_fd[cpu]; ++ rc = write(fd, &bufhdr, sizeof(bufhdr)); // write header + rc |= write(fd, wbuf, wbytes); // write payload + if (rc <= 0) { + perr("Couldn't write to output %d for cpu %d, exiting.", +@@ -310,14 +629,6 @@ static void *reader_thread(void *data) + } + } + +- /* update the sequence number & let other cpus go ahead */ +- pthread_mutex_lock(& last_sequence_number_mutex); +- if (last_sequence_number < bufhdr.sequence) { /* not if someone leapfrogged us */ +- last_sequence_number = bufhdr.sequence; +- pthread_cond_broadcast (& last_sequence_number_changed); +- } +- pthread_mutex_unlock(& last_sequence_number_mutex); +- + } while (!stop_threads); + dbug(3, "exiting thread for cpu %d\n", cpu); + return(NULL); +@@ -329,41 +640,16 @@ error_out: + return(NULL); + } + ++ + static void switchfile_handler(int sig) + { +- int i; ++ (void) sig; + if (stop_threads || !outfile_name) + return; +- +- for (i = 0; i < ncpus; i++) { +- pthread_mutex_lock(&mutex[avail_cpus[i]]); +- if (reader[avail_cpus[i]] && switch_file[avail_cpus[i]]) { +- pthread_mutex_unlock(&mutex[avail_cpus[i]]); +- dbug(2, "file switching is progressing, signal ignored.\n", sig); +- return; +- } +- pthread_mutex_unlock(&mutex[avail_cpus[i]]); +- } +- for (i = 0; i < ncpus; i++) { +- pthread_mutex_lock(&mutex[avail_cpus[i]]); +- if (reader[avail_cpus[i]]) { +- switch_file[avail_cpus[i]] = 1; +- pthread_mutex_unlock(&mutex[avail_cpus[i]]); +- +- // Make sure we don't send the USR2 signal to +- // ourselves. 
+- if (pthread_equal(pthread_self(), +- reader[avail_cpus[i]])) +- break; +- pthread_kill(reader[avail_cpus[i]], SIGUSR2); +- } +- else { +- pthread_mutex_unlock(&mutex[avail_cpus[i]]); +- break; +- } +- } ++ sigusr2_count ++; + } + ++ + /** + * init_relayfs - create files and threads for relayfs processing + * +@@ -507,19 +793,20 @@ int init_relayfs(void) + sigaction(SIGUSR2, &sa, NULL); + + dbug(2, "starting threads\n"); +- for (i = 0; i < ncpus; i++) { +- if (pthread_mutex_init(&mutex[avail_cpus[i]], NULL) < 0) { +- _perr("failed to create mutex"); +- return -1; +- } +- } + for (i = 0; i < ncpus; i++) { +- if (pthread_create(&reader[avail_cpus[i]], NULL, reader_thread, ++ if (pthread_create(&reader[avail_cpus[i]], NULL, ++ bulkmode ? reader_thread_bulkmode : reader_thread_serialmode, + (void *)(long)avail_cpus[i]) < 0) { + _perr("failed to create thread"); + return -1; + } + } ++ if (! bulkmode) ++ if (pthread_create(&serializer_thread, NULL, ++ reader_thread_serializer, NULL) < 0) { ++ _perr("failed to create thread"); ++ return -1; ++ } + + return 0; + } +@@ -529,27 +816,31 @@ void close_relayfs(void) + int i; + stop_threads = 1; + dbug(2, "closing\n"); +- for (i = 0; i < ncpus; i++) { +- if (reader[avail_cpus[i]]) +- pthread_kill(reader[avail_cpus[i]], SIGUSR2); +- else +- break; +- } ++ + for (i = 0; i < ncpus; i++) { + if (reader[avail_cpus[i]]) + pthread_join(reader[avail_cpus[i]], NULL); + else + break; + } ++ if (! bulkmode) { ++ if (serializer_thread) // =0 on load_only! ++ pthread_join(serializer_thread, NULL); ++ // at this point, we know all reader and writer ++ // threads for the buffer_heap are dead. ++ reader_serialized_flush(); ++ ++ if (lost_message_count > 0 || lost_byte_count > 0) ++ eprintf("WARNING: There were %u lost messages and %u lost bytes.\n", ++ lost_message_count, lost_byte_count); ++ } ++ + for (i = 0; i < ncpus; i++) { + if (relay_fd[avail_cpus[i]] >= 0) + close(relay_fd[avail_cpus[i]]); + else + break; + } +- for (i = 0; i < ncpus; i++) { +- pthread_mutex_destroy(&mutex[avail_cpus[i]]); +- } + dbug(2, "done\n"); + } + +@@ -558,12 +849,6 @@ void kill_relayfs(void) + int i; + stop_threads = 1; + dbug(2, "killing\n"); +- for (i = 0; i < ncpus; i++) { +- if (reader[avail_cpus[i]]) +- pthread_kill(reader[avail_cpus[i]], SIGUSR2); +- else +- break; +- } + for (i = 0; i < ncpus; i++) { + if (reader[avail_cpus[i]]) + pthread_cancel(reader[avail_cpus[i]]); /* no wait */ +@@ -576,8 +861,5 @@ void kill_relayfs(void) + else + break; + } +- for (i = 0; i < ncpus; i++) { +- pthread_mutex_destroy(&mutex[avail_cpus[i]]); +- } + dbug(2, "done\n"); + } +diff --git a/staprun/stap_merge.c b/staprun/stap_merge.c +index 87de7d465..b210db663 100644 +--- a/staprun/stap_merge.c ++++ b/staprun/stap_merge.c +@@ -76,6 +76,7 @@ int main (int argc, char *argv[]) + fprintf(stderr, "error opening file %s.\n", argv[optind - 1]); + return -1; + } ++ (void) fread(buf, 4, 1, fp[i]); // read & ignore magic word + if (fread (buf, TIMESTAMP_SIZE, 1, fp[i])) + num[i] = *((int *)buf); + else +@@ -133,6 +134,7 @@ int main (int argc, char *argv[]) + count = min; + } + ++ (void) fread(buf, 4, 1, fp[i]); // read & ignore magic word + if (fread (buf, TIMESTAMP_SIZE, 1, fp[j])) + num[j] = *((int *)buf); + else +diff --git a/staprun/stap_merge.tcl b/staprun/stap_merge.tcl +deleted file mode 100755 +index 0c7d7b694..000000000 +--- a/staprun/stap_merge.tcl ++++ /dev/null +@@ -1,101 +0,0 @@ +-#!/usr/bin/env tclsh +-# +-# stap_merge.tcl - systemtap merge program +-# +-# This program is free software; you 
can redistribute it and/or modify +-# it under the terms of the GNU General Public License as published by +-# the Free Software Foundation; either version 2 of the License, or +-# (at your option) any later version. +-# +-# This program is distributed in the hope that it will be useful, +-# but WITHOUT ANY WARRANTY; without even the implied warranty of +-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +-# GNU General Public License for more details. +-# +-# You should have received a copy of the GNU General Public License +-# along with this program. If not, see . +-# +-# Copyright (C) Red Hat Inc, 2007 +-# +-# +- +-proc usage {} { +- puts stderr "$::argv0 \[-v\] \[-o output_filename\] input_files ...\n" +- exit 1 +-} +- +-set outfile "stdout" +-set verbose 0 +-set index 0 +-while {[string match -* [lindex $argv $index]]} { +- switch -glob -- [lindex $argv $index] { +- -v {set verbose 1} +- -o {incr index; set outfile [lindex $argv $index]} +- default {usage} +- } +- incr index +-} +- +-if {$tcl_platform(byteOrder) == "littleEndian"} { +- set int_format i +-} else { +- set int_format I +-} +- +-set files [lrange $argv $index end] +- +-set n 0 +-foreach file $files { +- if {[catch {open $file} fd($n)]} { +- puts stderr $fd($n) +- exit 1 +- } +- fconfigure $fd($n) -translation binary +- if {![binary scan [read $fd($n) 4] $int_format timestamp($n)]} { +- continue +- } +- set timestamp($n) [expr $timestamp($n) & 0xFFFFFFFF] +- incr n +-} +-set ncpus $n +- +-if {$outfile != "stdout"} { +- if {[catch {open $outfile w} outfile]} { +- puts stderr $outfile +- exit 1 +- } +-} +-fconfigure $outfile -translation binary +- +-while {1} { +- set mincpu -1 +- for {set n 0} {$n < $ncpus} {incr n} { +- if {[info exists fd($n)] && (![info exists min] || $timestamp($n) <= $min)} { +- set min $timestamp($n) +- set mincpu $n +- } +- } +- +- if {![info exists min]} {break} +- +- if {![binary scan [read $fd($mincpu) 4] $int_format len]} { +- puts stderr "Error reading length from channel $mincpu" +- exit 1 +- } +- +- if {$verbose == 1} { +- puts stderr "\[CPU:$mincpu, seq=$min, length=$len\]" +- } +- +- set data [read $fd($mincpu) $len] +- puts -nonewline $outfile $data +- +- set data [read $fd($mincpu) 4] +- if {$data == ""} { +- unset fd($mincpu) +- } else { +- binary scan $data $int_format timestamp($mincpu) +- set timestamp($mincpu) [expr $timestamp($mincpu) & 0xFFFFFFFF] +- } +- unset min +-} +diff --git a/staprun/staprun.8 b/staprun/staprun.8 +index 3bc16ab95..4e1ca9af6 100644 +--- a/staprun/staprun.8 ++++ b/staprun/staprun.8 +@@ -120,7 +120,7 @@ remote_id() and remote_uri(). + Sets the maximum size of output file and the maximum number of output files. + If the size of output file will exceed + .B size +-, systemtap switches output file to the next file. And if the number of ++megabytes, systemtap switches output file to the next file. And if the number of + output files exceed + .B N + , systemtap removes the oldest output file. You can omit the second argument. +commit 2442beb99eeab3144c2622cae1fc98b999f72108 +gpg: Signature made Mon 14 Aug 2023 01:55:27 PM EDT +gpg: using RSA key 5D38116FA4D3A7CC77E378D37E83610126DCC2E8 +gpg: Good signature from "Frank Ch. Eigler " [full] +Author: Frank Ch. Eigler +Date: Mon Aug 14 13:54:50 2023 -0400 + + PR29108 / BZ2095359 tweak: stap_merge magic handling + + We don't bother do much error checking in this infrequently used + tool, but gcc warnings require us to do some. 
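+
+    For reference, each record in a bulkmode per-cpu output file now
+    carries the struct _stp_trace header ahead of its payload, so a
+    consumer reads it roughly like this (illustrative sketch only;
+    read_record is a made-up helper and error handling is minimal):
+
+        #include <stdint.h>
+        #include <stdio.h>
+        #include <stdlib.h>
+        #include <string.h>
+
+        struct record_header {       /* mirrors struct _stp_trace */
+            char magic[4];           /* "\xF0\x9F\xA9\xBA" */
+            uint32_t sequence;       /* cross-cpu ordering key */
+            uint32_t pdu_len;        /* payload bytes that follow */
+        };
+
+        /* Read one record; returns malloc'd payload (caller frees),
+           or NULL at EOF / loss of sync.  A merge tool then emits
+           whichever open file has the lowest sequence number next. */
+        char *read_record(FILE *fp, uint32_t *sequence, uint32_t *pdu_len)
+        {
+            struct record_header hdr;
+            char *payload;
+            if (fread(&hdr, sizeof(hdr), 1, fp) != 1)
+                return NULL;
+            if (memcmp(hdr.magic, "\xF0\x9F\xA9\xBA", 4) != 0)
+                return NULL;
+            payload = malloc(hdr.pdu_len);
+            if (payload && fread(payload, 1, hdr.pdu_len, fp) != hdr.pdu_len) {
+                free(payload);
+                return NULL;
+            }
+            *sequence = hdr.sequence;
+            *pdu_len = hdr.pdu_len;
+            return payload;
+        }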
+
+diff --git a/staprun/stap_merge.c b/staprun/stap_merge.c
+index b210db663..388b14938 100644
+--- a/staprun/stap_merge.c
++++ b/staprun/stap_merge.c
+@@ -76,7 +76,8 @@ int main (int argc, char *argv[])
+ fprintf(stderr, "error opening file %s.\n", argv[optind - 1]);
+ return -1;
+ }
+- (void) fread(buf, 4, 1, fp[i]); // read & ignore magic word
++ if (fread(buf, 4, 1, fp[i]) != 1) // read magic word
++ fprintf(stderr, "warning: error reading magic word\n");
+ if (fread (buf, TIMESTAMP_SIZE, 1, fp[i]))
+ num[i] = *((int *)buf);
+ else
+@@ -134,7 +135,8 @@ int main (int argc, char *argv[])
+ count = min;
+ }
+
+- (void) fread(buf, 4, 1, fp[j]); // read & ignore magic word
++ if (fread(buf, 4, 1, fp[j]) != 1) // read magic word
++ fprintf(stderr, "warning: error reading magic word\n");
+ if (fread (buf, TIMESTAMP_SIZE, 1, fp[j]))
+ num[j] = *((int *)buf);
+ else
diff --git a/pr30749.patch b/pr30749.patch
new file mode 100644
index 0000000..108a642
--- /dev/null
+++ b/pr30749.patch
@@ -0,0 +1,99 @@
+commit 9839db5514a29cf4f58b3de8cc6155088be6d061
+gpg: Signature made Sat 12 Aug 2023 02:49:26 PM EDT
+gpg: using RSA key 5D38116FA4D3A7CC77E378D37E83610126DCC2E8
+gpg: Good signature from "Frank Ch. Eigler " [full]
+Author: Frank Ch. Eigler 
+Date: Sat Aug 12 14:28:44 2023 -0400
+
+ PR30749: correct stap --sign-module timing
+
+ Previous code signed the temp directory copy, after it had already
+ been copied into the cache -- so the signature never made it to a
+ permanent artifact.
+
+ If the module was being fetched from the cache from a previous build
+ run, a sign (re)attempt will still be done. This may not be
+ necessary, but shouldn't be harmful.
+
+ Reported-By: Renaud Métrich 
+
+diff --git a/main.cxx b/main.cxx
+index 06adb66ad..9f695cbd8 100644
+--- a/main.cxx
++++ b/main.cxx
+@@ -1190,8 +1190,10 @@ passes_0_4 (systemtap_session &s)
+ s.mok_fingerprints.clear();
+ s.mok_fingerprints.push_back(mok_fingerprint);
+ }
+- rc =
+- sign_module (s.tmpdir, s.module_filename(), s.mok_fingerprints, mok_path, s.kernel_build_tree);
++ if (s.verbose)
++ clog << _F("Signing %s with mok key %s", s.module_filename().c_str(), mok_path.c_str())
++ << endl;
++ rc = sign_module (s.tmpdir, s.module_filename(), s.mok_fingerprints, mok_path, s.kernel_build_tree);
+ }
+ #endif
+
+@@ -1310,8 +1312,30 @@ passes_0_4 (systemtap_session &s)
+ if (! s.use_script_cache && s.last_pass <= 4)
+ s.save_module = true;
+
++#if HAVE_NSS
++ // PR30749
++ if (!rc && s.module_sign_given)
++ {
++ // when run on client as --sign-module, mok fingerprints are result of mokutil -l
++ // when run from server as --sign-module=PATH, mok fingerprint is given by PATH
++ string mok_path;
++ if (!s.module_sign_mok_path.empty())
++ {
++ string mok_fingerprint;
++ split_path (s.module_sign_mok_path, mok_path, mok_fingerprint);
++ s.mok_fingerprints.clear();
++ s.mok_fingerprints.push_back(mok_fingerprint);
++ }
++
++ if (s.verbose)
++ clog << _F("Signing %s with mok key %s", s.module_filename().c_str(), mok_path.c_str())
++ << endl;
++ rc = sign_module (s.tmpdir, s.module_filename(), s.mok_fingerprints, mok_path, s.kernel_build_tree);
++ }
++#endif
++
+ // Copy module to the current directory.
+- if (s.save_module && !pending_interrupts)
++ if (!rc && s.save_module && !pending_interrupts)
+ {
+ string module_src_path = s.tmpdir + "/" + s.module_filename();
+ string module_dest_path = s.module_filename();
+@@ -1327,29 +1351,11 @@ passes_0_4 (systemtap_session &s)
+ }
+ }
+
+-#if HAVE_NSS
+- if (s.module_sign_given)
+- {
+- // when run on client as --sign-module, mok fingerprints are result of mokutil -l
+- // when run from server as --sign-module=PATH, mok fingerprint is given by PATH
+- string mok_path;
+- if (!s.module_sign_mok_path.empty())
+- {
+- string mok_fingerprint;
+- split_path (s.module_sign_mok_path, mok_path, mok_fingerprint);
+- s.mok_fingerprints.clear();
+- s.mok_fingerprints.push_back(mok_fingerprint);
+- }
+-
+- rc = sign_module (s.tmpdir, s.module_filename(), s.mok_fingerprints, mok_path, s.kernel_build_tree);
+- }
+-#endif
+-
+ PROBE1(stap, pass4__end, &s);
+
+ return rc;
+ }
+-
++
+ int
+ pass_5 (systemtap_session &s, vector<remote*> targets)
+ {
diff --git a/systemtap.spec b/systemtap.spec
index 717c8b0..9857a0c 100644
--- a/systemtap.spec
+++ b/systemtap.spec
@@ -123,7 +123,7 @@ m stapdev stapdev
 Name: systemtap
 # PRERELEASE
 Version: 4.9
-Release: 2%{?release_override}%{?dist}
+Release: 3%{?release_override}%{?dist}
 # for version, see also configure.ac
@@ -162,6 +162,8 @@ Source: ftp://sourceware.org/pub/systemtap/releases/systemtap-%{version}.tar.gz
 Patch1: rhbz2223733.patch
 Patch2: rhbz2223735.patch
+Patch3: pr29108.patch
+Patch4: pr30749.patch
 # Build*
@@ -594,6 +596,8 @@ or within a container.
 %setup -q
 %patch -P1 -p1
 %patch -P2 -p1
+%patch -P3 -p1
+%patch -P4 -p1
 %build
@@ -1313,6 +1317,10 @@ exit 0
 # PRERELEASE
 %changelog
+* Mon Aug 14 2023 Frank Ch. Eigler - 4.9-3
+- rhbz2231632
+- rhbz2231635
+
 * Tue Jul 18 2023 Frank Ch. Eigler - 4.9-2
 - rhbz2223733
 - rhbz2223735
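
For reviewers: the pr29108 changes to stap_merge.c above prepend a 4-byte magic
word to every record in the per-cpu trace files. The following is a minimal,
standalone sketch of a reader for that framing. It is illustrative only, not
part of the patches: the layout past the timestamp (4-byte sequence number,
4-byte length, then payload, all in native byte order) is inferred from the
removed stap_merge.tcl, and TIMESTAMP_SIZE is assumed to be 4 here; treat the
field sizes and ordering as assumptions, not a spec.

    /* percpu_dump.c - illustrative reader, assumptions as noted above */
    #include <stdio.h>
    #include <stdint.h>
    #include <inttypes.h>

    #define TIMESTAMP_SIZE 4   /* assumption; mirrors the tcl reader */

    int main(int argc, char *argv[])
    {
        if (argc != 2) {
            fprintf(stderr, "usage: %s percpu_file\n", argv[0]);
            return 1;
        }
        FILE *fp = fopen(argv[1], "rb");
        if (fp == NULL) {
            perror("fopen");
            return 1;
        }
        char magic[4];
        uint32_t seq, len;
        while (fread(magic, 4, 1, fp) == 1) {                /* read & ignore magic word */
            if (fread(&seq, TIMESTAMP_SIZE, 1, fp) != 1)     /* sequence number */
                break;
            if (fread(&len, 4, 1, fp) != 1)                  /* payload length */
                break;
            if (fseek(fp, (long)len, SEEK_CUR) != 0)         /* skip the payload */
                break;
            printf("seq=%" PRIu32 " len=%" PRIu32 "\n", seq, len);
        }
        fclose(fp);
        return 0;
    }

The same sequence number is what the merge step orders records by across
per-cpu files, always emitting the smallest outstanding one next.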
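The PR30749 reordering in pr30749.patch boils down to: sign the module while it
still lives in the temp directory, before any copy is made, so that every
downstream copy (cache, current directory) inherits the signature. A compilable
toy showing just that ordering; all names here (make_module, sign_in_place,
copy_out) are hypothetical stand-ins, not the real pass-4 helpers:

    /* sign_order.c - toy illustration of the sign-before-copy ordering */
    #include <stdio.h>

    static int make_module(const char *tmpdir)
    { printf("build in %s\n", tmpdir); return 0; }

    static int sign_in_place(const char *tmpdir)
    { printf("sign in %s\n", tmpdir); return 0; }

    static int copy_out(const char *tmpdir, const char *dest)
    { printf("copy %s -> %s\n", tmpdir, dest); return 0; }

    int main(void)
    {
        const char *tmpdir = "/tmp/stapXXXXXX";   /* stand-in for s.tmpdir */
        int rc = make_module(tmpdir);
        if (!rc)
            rc = sign_in_place(tmpdir);           /* sign FIRST (the PR30749 fix)... */
        if (!rc)
            rc = copy_out(tmpdir, ".");           /* ...so every copy carries the signature */
        return rc;
    }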