75 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			75 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| #ifndef _TOOLS_LINUX_RING_BUFFER_H_
 | |
| #define _TOOLS_LINUX_RING_BUFFER_H_
 | |
| 
 | |
| #include <asm/barrier.h>
 | |
| #include <linux/perf_event.h>
 | |
| 
 | |
| /*
 | |
|  * Contract with kernel for walking the perf ring buffer from
 | |
|  * user space requires the following barrier pairing (quote
 | |
|  * from kernel/events/ring_buffer.c):
 | |
|  *
 | |
|  *   Since the mmap() consumer (userspace) can run on a
 | |
|  *   different CPU:
 | |
|  *
 | |
|  *   kernel                             user
 | |
|  *
 | |
|  *   if (LOAD ->data_tail) {            LOAD ->data_head
 | |
|  *                      (A)             smp_rmb()       (C)
 | |
|  *      STORE $data                     LOAD $data
 | |
|  *      smp_wmb()       (B)             smp_mb()        (D)
 | |
|  *      STORE ->data_head               STORE ->data_tail
 | |
|  *   }
 | |
|  *
 | |
|  *   Where A pairs with D, and B pairs with C.
 | |
|  *
 | |
|  *   In our case A is a control dependency that separates the
 | |
|  *   load of the ->data_tail and the stores of $data. In case
 | |
|  *   ->data_tail indicates there is no room in the buffer to
 | |
|  *   store $data we do not.
 | |
|  *
 | |
|  *   D needs to be a full barrier since it separates the data
 | |
|  *   READ from the tail WRITE.
 | |
|  *
 | |
|  *   For B a WMB is sufficient since it separates two WRITEs,
 | |
|  *   and for C an RMB is sufficient since it separates two READs.
 | |
|  *
 | |
|  * Note, instead of B, C, D we could also use smp_store_release()
 | |
|  * in B and D as well as smp_load_acquire() in C.
 | |
|  *
 | |
|  * However, this optimization does not make sense for all kernel
 | |
|  * supported architectures since for a fair number it would
 | |
|  * resolve into READ_ONCE() + smp_mb() pair for smp_load_acquire(),
 | |
|  * and smp_mb() + WRITE_ONCE() pair for smp_store_release().
 | |
|  *
 | |
|  * Thus for those smp_wmb() in B and smp_rmb() in C would still
 | |
|  * be less expensive. For the case of D this has either the same
 | |
|  * cost or is less expensive, for example, due to TSO x86 can
 | |
|  * avoid the CPU barrier entirely.
 | |
|  */
 | |
| 
 | |
| static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base)
 | |
| {
 | |
| /*
 | |
|  * Architectures where smp_load_acquire() does not fallback to
 | |
|  * READ_ONCE() + smp_mb() pair.
 | |
|  */
 | |
| #if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \
 | |
|     defined(__ia64__) || defined(__sparc__) && defined(__arch64__) || defined(__riscv)
 | |
| 	return smp_load_acquire(&base->data_head);
 | |
| #else
 | |
| 	u64 head = READ_ONCE(base->data_head);
 | |
| 
 | |
| 	smp_rmb();
 | |
| 	return head;
 | |
| #endif
 | |
| }
 | |
| 
 | |
| static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base,
 | |
| 					  u64 tail)
 | |
| {
 | |
| 	smp_store_release(&base->data_tail, tail);
 | |
| }
 | |
| 
 | |
| #endif /* _TOOLS_LINUX_RING_BUFFER_H_ */
 |