10061 lines
304 KiB
Diff
10061 lines
304 KiB
Diff
--- libgomp/loop.c.jj 2018-04-25 09:40:31.870655561 +0200
|
|
+++ libgomp/loop.c 2019-05-07 18:46:36.526109736 +0200
|
|
@@ -27,9 +27,13 @@
|
|
|
|
#include <limits.h>
|
|
#include <stdlib.h>
|
|
+#include <string.h>
|
|
#include "libgomp.h"
|
|
|
|
|
|
+ialias (GOMP_loop_runtime_next)
|
|
+ialias_redirect (GOMP_taskgroup_reduction_register)
|
|
+
|
|
/* Initialize the given work share construct from the given arguments. */
|
|
|
|
static inline void
|
|
@@ -79,12 +83,12 @@ gomp_loop_init (struct gomp_work_share *
|
|
}
|
|
|
|
/* The *_start routines are called when first encountering a loop construct
|
|
- that is not bound directly to a parallel construct. The first thread
|
|
+ that is not bound directly to a parallel construct. The first thread
|
|
that arrives will create the work-share construct; subsequent threads
|
|
will see the construct exists and allocate work from it.
|
|
|
|
START, END, INCR are the bounds of the loop; due to the restrictions of
|
|
- OpenMP, these values must be the same in every thread. This is not
|
|
+ OpenMP, these values must be the same in every thread. This is not
|
|
verified (nor is it entirely verifiable, since START is not necessarily
|
|
retained intact in the work-share data structure). CHUNK_SIZE is the
|
|
scheduling parameter; again this must be identical in all threads.
|
|
@@ -101,7 +105,7 @@ gomp_loop_static_start (long start, long
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
|
|
thr->ts.static_trip = 0;
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_init (thr->ts.work_share, start, end, incr,
|
|
GFS_STATIC, chunk_size);
|
|
@@ -123,7 +127,7 @@ gomp_loop_dynamic_start (long start, lon
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_init (thr->ts.work_share, start, end, incr,
|
|
GFS_DYNAMIC, chunk_size);
|
|
@@ -151,7 +155,7 @@ gomp_loop_guided_start (long start, long
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_init (thr->ts.work_share, start, end, incr,
|
|
GFS_GUIDED, chunk_size);
|
|
@@ -174,7 +178,7 @@ GOMP_loop_runtime_start (long start, lon
|
|
long *istart, long *iend)
|
|
{
|
|
struct gomp_task_icv *icv = gomp_icv (false);
|
|
- switch (icv->run_sched_var)
|
|
+ switch (icv->run_sched_var & ~GFS_MONOTONIC)
|
|
{
|
|
case GFS_STATIC:
|
|
return gomp_loop_static_start (start, end, incr,
|
|
@@ -197,6 +201,100 @@ GOMP_loop_runtime_start (long start, lon
|
|
}
|
|
}
|
|
|
|
+static long
|
|
+gomp_adjust_sched (long sched, long *chunk_size)
|
|
+{
|
|
+ sched &= ~GFS_MONOTONIC;
|
|
+ switch (sched)
|
|
+ {
|
|
+ case GFS_STATIC:
|
|
+ case GFS_DYNAMIC:
|
|
+ case GFS_GUIDED:
|
|
+ return sched;
|
|
+ /* GFS_RUNTIME is used for runtime schedule without monotonic
|
|
+ or nonmonotonic modifiers on the clause.
|
|
+ GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic
|
|
+ modifier. */
|
|
+ case GFS_RUNTIME:
|
|
+ /* GFS_AUTO is used for runtime schedule with nonmonotonic
|
|
+ modifier. */
|
|
+ case GFS_AUTO:
|
|
+ {
|
|
+ struct gomp_task_icv *icv = gomp_icv (false);
|
|
+ sched = icv->run_sched_var & ~GFS_MONOTONIC;
|
|
+ switch (sched)
|
|
+ {
|
|
+ case GFS_STATIC:
|
|
+ case GFS_DYNAMIC:
|
|
+ case GFS_GUIDED:
|
|
+ *chunk_size = icv->run_sched_chunk_size;
|
|
+ break;
|
|
+ case GFS_AUTO:
|
|
+ sched = GFS_STATIC;
|
|
+ *chunk_size = 0;
|
|
+ break;
|
|
+ default:
|
|
+ abort ();
|
|
+ }
|
|
+ return sched;
|
|
+ }
|
|
+ default:
|
|
+ abort ();
|
|
+ }
|
|
+}
|
|
+
|
|
+bool
|
|
+GOMP_loop_start (long start, long end, long incr, long sched,
|
|
+ long chunk_size, long *istart, long *iend,
|
|
+ uintptr_t *reductions, void **mem)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+
|
|
+ thr->ts.static_trip = 0;
|
|
+ if (reductions)
|
|
+ gomp_workshare_taskgroup_start ();
|
|
+ if (gomp_work_share_start (0))
|
|
+ {
|
|
+ sched = gomp_adjust_sched (sched, &chunk_size);
|
|
+ gomp_loop_init (thr->ts.work_share, start, end, incr,
|
|
+ sched, chunk_size);
|
|
+ if (reductions)
|
|
+ {
|
|
+ GOMP_taskgroup_reduction_register (reductions);
|
|
+ thr->task->taskgroup->workshare = true;
|
|
+ thr->ts.work_share->task_reductions = reductions;
|
|
+ }
|
|
+ if (mem)
|
|
+ {
|
|
+ uintptr_t size = (uintptr_t) *mem;
|
|
+ if (size > (sizeof (struct gomp_work_share)
|
|
+ - offsetof (struct gomp_work_share,
|
|
+ inline_ordered_team_ids)))
|
|
+ thr->ts.work_share->ordered_team_ids
|
|
+ = gomp_malloc_cleared (size);
|
|
+ else
|
|
+ memset (thr->ts.work_share->ordered_team_ids, '\0', size);
|
|
+ *mem = (void *) thr->ts.work_share->ordered_team_ids;
|
|
+ }
|
|
+ gomp_work_share_init_done ();
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (reductions)
|
|
+ {
|
|
+ uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
|
|
+ gomp_workshare_task_reduction_register (reductions,
|
|
+ first_reductions);
|
|
+ }
|
|
+ if (mem)
|
|
+ *mem = (void *) thr->ts.work_share->ordered_team_ids;
|
|
+ }
|
|
+
|
|
+ if (!istart)
|
|
+ return true;
|
|
+ return ialias_call (GOMP_loop_runtime_next) (istart, iend);
|
|
+}
|
|
+
|
|
/* The *_ordered_*_start routines are similar. The only difference is that
|
|
this work-share construct is initialized to expect an ORDERED section. */
|
|
|
|
@@ -207,7 +305,7 @@ gomp_loop_ordered_static_start (long sta
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
|
|
thr->ts.static_trip = 0;
|
|
- if (gomp_work_share_start (true))
|
|
+ if (gomp_work_share_start (1))
|
|
{
|
|
gomp_loop_init (thr->ts.work_share, start, end, incr,
|
|
GFS_STATIC, chunk_size);
|
|
@@ -225,7 +323,7 @@ gomp_loop_ordered_dynamic_start (long st
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (true))
|
|
+ if (gomp_work_share_start (1))
|
|
{
|
|
gomp_loop_init (thr->ts.work_share, start, end, incr,
|
|
GFS_DYNAMIC, chunk_size);
|
|
@@ -250,7 +348,7 @@ gomp_loop_ordered_guided_start (long sta
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (true))
|
|
+ if (gomp_work_share_start (1))
|
|
{
|
|
gomp_loop_init (thr->ts.work_share, start, end, incr,
|
|
GFS_GUIDED, chunk_size);
|
|
@@ -273,7 +371,7 @@ GOMP_loop_ordered_runtime_start (long st
|
|
long *istart, long *iend)
|
|
{
|
|
struct gomp_task_icv *icv = gomp_icv (false);
|
|
- switch (icv->run_sched_var)
|
|
+ switch (icv->run_sched_var & ~GFS_MONOTONIC)
|
|
{
|
|
case GFS_STATIC:
|
|
return gomp_loop_ordered_static_start (start, end, incr,
|
|
@@ -297,6 +395,81 @@ GOMP_loop_ordered_runtime_start (long st
|
|
}
|
|
}
|
|
|
|
+bool
|
|
+GOMP_loop_ordered_start (long start, long end, long incr, long sched,
|
|
+ long chunk_size, long *istart, long *iend,
|
|
+ uintptr_t *reductions, void **mem)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ size_t ordered = 1;
|
|
+ bool ret;
|
|
+
|
|
+ thr->ts.static_trip = 0;
|
|
+ if (reductions)
|
|
+ gomp_workshare_taskgroup_start ();
|
|
+ if (mem)
|
|
+ ordered += (uintptr_t) *mem;
|
|
+ if (gomp_work_share_start (ordered))
|
|
+ {
|
|
+ sched = gomp_adjust_sched (sched, &chunk_size);
|
|
+ gomp_loop_init (thr->ts.work_share, start, end, incr,
|
|
+ sched, chunk_size);
|
|
+ if (reductions)
|
|
+ {
|
|
+ GOMP_taskgroup_reduction_register (reductions);
|
|
+ thr->task->taskgroup->workshare = true;
|
|
+ thr->ts.work_share->task_reductions = reductions;
|
|
+ }
|
|
+ if (sched == GFS_STATIC)
|
|
+ gomp_ordered_static_init ();
|
|
+ else
|
|
+ gomp_mutex_lock (&thr->ts.work_share->lock);
|
|
+ gomp_work_share_init_done ();
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (reductions)
|
|
+ {
|
|
+ uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
|
|
+ gomp_workshare_task_reduction_register (reductions,
|
|
+ first_reductions);
|
|
+ }
|
|
+ sched = thr->ts.work_share->sched;
|
|
+ if (sched != GFS_STATIC)
|
|
+ gomp_mutex_lock (&thr->ts.work_share->lock);
|
|
+ }
|
|
+
|
|
+ if (mem)
|
|
+ {
|
|
+ uintptr_t p
|
|
+ = (uintptr_t) (thr->ts.work_share->ordered_team_ids
|
|
+ + (thr->ts.team ? thr->ts.team->nthreads : 1));
|
|
+ p += __alignof__ (long long) - 1;
|
|
+ p &= ~(__alignof__ (long long) - 1);
|
|
+ *mem = (void *) p;
|
|
+ }
|
|
+
|
|
+ switch (sched)
|
|
+ {
|
|
+ case GFS_STATIC:
|
|
+ case GFS_AUTO:
|
|
+ return !gomp_iter_static_next (istart, iend);
|
|
+ case GFS_DYNAMIC:
|
|
+ ret = gomp_iter_dynamic_next_locked (istart, iend);
|
|
+ break;
|
|
+ case GFS_GUIDED:
|
|
+ ret = gomp_iter_guided_next_locked (istart, iend);
|
|
+ break;
|
|
+ default:
|
|
+ abort ();
|
|
+ }
|
|
+
|
|
+ if (ret)
|
|
+ gomp_ordered_first ();
|
|
+ gomp_mutex_unlock (&thr->ts.work_share->lock);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
/* The *_doacross_*_start routines are similar. The only difference is that
|
|
this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
|
|
section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1
|
|
@@ -310,11 +483,11 @@ gomp_loop_doacross_static_start (unsigne
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
|
|
thr->ts.static_trip = 0;
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
|
|
GFS_STATIC, chunk_size);
|
|
- gomp_doacross_init (ncounts, counts, chunk_size);
|
|
+ gomp_doacross_init (ncounts, counts, chunk_size, 0);
|
|
gomp_work_share_init_done ();
|
|
}
|
|
|
|
@@ -328,11 +501,11 @@ gomp_loop_doacross_dynamic_start (unsign
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
|
|
GFS_DYNAMIC, chunk_size);
|
|
- gomp_doacross_init (ncounts, counts, chunk_size);
|
|
+ gomp_doacross_init (ncounts, counts, chunk_size, 0);
|
|
gomp_work_share_init_done ();
|
|
}
|
|
|
|
@@ -354,11 +527,11 @@ gomp_loop_doacross_guided_start (unsigne
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
|
|
GFS_GUIDED, chunk_size);
|
|
- gomp_doacross_init (ncounts, counts, chunk_size);
|
|
+ gomp_doacross_init (ncounts, counts, chunk_size, 0);
|
|
gomp_work_share_init_done ();
|
|
}
|
|
|
|
@@ -378,7 +551,7 @@ GOMP_loop_doacross_runtime_start (unsign
|
|
long *istart, long *iend)
|
|
{
|
|
struct gomp_task_icv *icv = gomp_icv (false);
|
|
- switch (icv->run_sched_var)
|
|
+ switch (icv->run_sched_var & ~GFS_MONOTONIC)
|
|
{
|
|
case GFS_STATIC:
|
|
return gomp_loop_doacross_static_start (ncounts, counts,
|
|
@@ -402,8 +575,52 @@ GOMP_loop_doacross_runtime_start (unsign
|
|
}
|
|
}
|
|
|
|
-/* The *_next routines are called when the thread completes processing of
|
|
- the iteration block currently assigned to it. If the work-share
|
|
+bool
|
|
+GOMP_loop_doacross_start (unsigned ncounts, long *counts, long sched,
|
|
+ long chunk_size, long *istart, long *iend,
|
|
+ uintptr_t *reductions, void **mem)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+
|
|
+ thr->ts.static_trip = 0;
|
|
+ if (reductions)
|
|
+ gomp_workshare_taskgroup_start ();
|
|
+ if (gomp_work_share_start (0))
|
|
+ {
|
|
+ size_t extra = 0;
|
|
+ if (mem)
|
|
+ extra = (uintptr_t) *mem;
|
|
+ sched = gomp_adjust_sched (sched, &chunk_size);
|
|
+ gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
|
|
+ sched, chunk_size);
|
|
+ gomp_doacross_init (ncounts, counts, chunk_size, extra);
|
|
+ if (reductions)
|
|
+ {
|
|
+ GOMP_taskgroup_reduction_register (reductions);
|
|
+ thr->task->taskgroup->workshare = true;
|
|
+ thr->ts.work_share->task_reductions = reductions;
|
|
+ }
|
|
+ gomp_work_share_init_done ();
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (reductions)
|
|
+ {
|
|
+ uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
|
|
+ gomp_workshare_task_reduction_register (reductions,
|
|
+ first_reductions);
|
|
+ }
|
|
+ sched = thr->ts.work_share->sched;
|
|
+ }
|
|
+
|
|
+ if (mem)
|
|
+ *mem = thr->ts.work_share->doacross->extra;
|
|
+
|
|
+ return ialias_call (GOMP_loop_runtime_next) (istart, iend);
|
|
+}
|
|
+
|
|
+/* The *_next routines are called when the thread completes processing of
|
|
+ the iteration block currently assigned to it. If the work-share
|
|
construct is bound directly to a parallel construct, then the iteration
|
|
bounds may have been set up before the parallel. In which case, this
|
|
may be the first iteration for the thread.
|
|
@@ -456,7 +673,7 @@ bool
|
|
GOMP_loop_runtime_next (long *istart, long *iend)
|
|
{
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
-
|
|
+
|
|
switch (thr->ts.work_share->sched)
|
|
{
|
|
case GFS_STATIC:
|
|
@@ -534,7 +751,7 @@ bool
|
|
GOMP_loop_ordered_runtime_next (long *istart, long *iend)
|
|
{
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
-
|
|
+
|
|
switch (thr->ts.work_share->sched)
|
|
{
|
|
case GFS_STATIC:
|
|
@@ -563,7 +780,7 @@ gomp_parallel_loop_start (void (*fn) (vo
|
|
num_threads = gomp_resolve_num_threads (num_threads, 0);
|
|
team = gomp_new_team (num_threads);
|
|
gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size);
|
|
- gomp_team_start (fn, data, num_threads, flags, team);
|
|
+ gomp_team_start (fn, data, num_threads, flags, team, NULL);
|
|
}
|
|
|
|
void
|
|
@@ -600,7 +817,8 @@ GOMP_parallel_loop_runtime_start (void (
|
|
{
|
|
struct gomp_task_icv *icv = gomp_icv (false);
|
|
gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
|
|
- icv->run_sched_var, icv->run_sched_chunk_size, 0);
|
|
+ icv->run_sched_var & ~GFS_MONOTONIC,
|
|
+ icv->run_sched_chunk_size, 0);
|
|
}
|
|
|
|
ialias_redirect (GOMP_parallel_end)
|
|
@@ -638,11 +856,28 @@ GOMP_parallel_loop_guided (void (*fn) (v
|
|
GOMP_parallel_end ();
|
|
}
|
|
|
|
+void
|
|
+GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
|
|
+ unsigned num_threads, long start, long end,
|
|
+ long incr, unsigned flags)
|
|
+{
|
|
+ struct gomp_task_icv *icv = gomp_icv (false);
|
|
+ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
|
|
+ icv->run_sched_var & ~GFS_MONOTONIC,
|
|
+ icv->run_sched_chunk_size, flags);
|
|
+ fn (data);
|
|
+ GOMP_parallel_end ();
|
|
+}
|
|
+
|
|
#ifdef HAVE_ATTRIBUTE_ALIAS
|
|
extern __typeof(GOMP_parallel_loop_dynamic) GOMP_parallel_loop_nonmonotonic_dynamic
|
|
__attribute__((alias ("GOMP_parallel_loop_dynamic")));
|
|
extern __typeof(GOMP_parallel_loop_guided) GOMP_parallel_loop_nonmonotonic_guided
|
|
__attribute__((alias ("GOMP_parallel_loop_guided")));
|
|
+extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_nonmonotonic_runtime
|
|
+ __attribute__((alias ("GOMP_parallel_loop_runtime")));
|
|
+extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_maybe_nonmonotonic_runtime
|
|
+ __attribute__((alias ("GOMP_parallel_loop_runtime")));
|
|
#else
|
|
void
|
|
GOMP_parallel_loop_nonmonotonic_dynamic (void (*fn) (void *), void *data,
|
|
@@ -667,21 +902,35 @@ GOMP_parallel_loop_nonmonotonic_guided (
|
|
fn (data);
|
|
GOMP_parallel_end ();
|
|
}
|
|
-#endif
|
|
|
|
void
|
|
-GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
|
|
- unsigned num_threads, long start, long end,
|
|
- long incr, unsigned flags)
|
|
+GOMP_parallel_loop_nonmonotonic_runtime (void (*fn) (void *), void *data,
|
|
+ unsigned num_threads, long start,
|
|
+ long end, long incr, unsigned flags)
|
|
{
|
|
struct gomp_task_icv *icv = gomp_icv (false);
|
|
gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
|
|
- icv->run_sched_var, icv->run_sched_chunk_size,
|
|
- flags);
|
|
+ icv->run_sched_var & ~GFS_MONOTONIC,
|
|
+ icv->run_sched_chunk_size, flags);
|
|
fn (data);
|
|
GOMP_parallel_end ();
|
|
}
|
|
|
|
+void
|
|
+GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*fn) (void *), void *data,
|
|
+ unsigned num_threads, long start,
|
|
+ long end, long incr,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct gomp_task_icv *icv = gomp_icv (false);
|
|
+ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
|
|
+ icv->run_sched_var & ~GFS_MONOTONIC,
|
|
+ icv->run_sched_chunk_size, flags);
|
|
+ fn (data);
|
|
+ GOMP_parallel_end ();
|
|
+}
|
|
+#endif
|
|
+
|
|
/* The GOMP_loop_end* routines are called after the thread is told that
|
|
all loop iterations are complete. The first two versions synchronize
|
|
all threads; the nowait version does not. */
|
|
@@ -721,6 +970,10 @@ extern __typeof(gomp_loop_dynamic_start)
|
|
__attribute__((alias ("gomp_loop_dynamic_start")));
|
|
extern __typeof(gomp_loop_guided_start) GOMP_loop_nonmonotonic_guided_start
|
|
__attribute__((alias ("gomp_loop_guided_start")));
|
|
+extern __typeof(GOMP_loop_runtime_start) GOMP_loop_nonmonotonic_runtime_start
|
|
+ __attribute__((alias ("GOMP_loop_runtime_start")));
|
|
+extern __typeof(GOMP_loop_runtime_start) GOMP_loop_maybe_nonmonotonic_runtime_start
|
|
+ __attribute__((alias ("GOMP_loop_runtime_start")));
|
|
|
|
extern __typeof(gomp_loop_ordered_static_start) GOMP_loop_ordered_static_start
|
|
__attribute__((alias ("gomp_loop_ordered_static_start")));
|
|
@@ -746,6 +999,10 @@ extern __typeof(gomp_loop_dynamic_next)
|
|
__attribute__((alias ("gomp_loop_dynamic_next")));
|
|
extern __typeof(gomp_loop_guided_next) GOMP_loop_nonmonotonic_guided_next
|
|
__attribute__((alias ("gomp_loop_guided_next")));
|
|
+extern __typeof(GOMP_loop_runtime_next) GOMP_loop_nonmonotonic_runtime_next
|
|
+ __attribute__((alias ("GOMP_loop_runtime_next")));
|
|
+extern __typeof(GOMP_loop_runtime_next) GOMP_loop_maybe_nonmonotonic_runtime_next
|
|
+ __attribute__((alias ("GOMP_loop_runtime_next")));
|
|
|
|
extern __typeof(gomp_loop_ordered_static_next) GOMP_loop_ordered_static_next
|
|
__attribute__((alias ("gomp_loop_ordered_static_next")));
|
|
@@ -791,6 +1048,20 @@ GOMP_loop_nonmonotonic_guided_start (lon
|
|
}
|
|
|
|
bool
|
|
+GOMP_loop_nonmonotonic_runtime_start (long start, long end, long incr,
|
|
+ long *istart, long *iend)
|
|
+{
|
|
+ return GOMP_loop_runtime_start (start, end, incr, istart, iend);
|
|
+}
|
|
+
|
|
+bool
|
|
+GOMP_loop_maybe_nonmonotonic_runtime_start (long start, long end, long incr,
|
|
+ long *istart, long *iend)
|
|
+{
|
|
+ return GOMP_loop_runtime_start (start, end, incr, istart, iend);
|
|
+}
|
|
+
|
|
+bool
|
|
GOMP_loop_ordered_static_start (long start, long end, long incr,
|
|
long chunk_size, long *istart, long *iend)
|
|
{
|
|
@@ -869,6 +1140,18 @@ GOMP_loop_nonmonotonic_guided_next (long
|
|
}
|
|
|
|
bool
|
|
+GOMP_loop_nonmonotonic_runtime_next (long *istart, long *iend)
|
|
+{
|
|
+ return GOMP_loop_runtime_next (istart, iend);
|
|
+}
|
|
+
|
|
+bool
|
|
+GOMP_loop_maybe_nonmonotonic_runtime_next (long *istart, long *iend)
|
|
+{
|
|
+ return GOMP_loop_runtime_next (istart, iend);
|
|
+}
|
|
+
|
|
+bool
|
|
GOMP_loop_ordered_static_next (long *istart, long *iend)
|
|
{
|
|
return gomp_loop_ordered_static_next (istart, iend);
|
|
--- libgomp/oacc-plugin.c.jj 2018-04-25 09:40:31.322655307 +0200
|
|
+++ libgomp/oacc-plugin.c 2019-05-07 18:46:36.531109656 +0200
|
|
@@ -49,3 +49,14 @@ GOMP_PLUGIN_acc_thread (void)
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
return thr ? thr->target_tls : NULL;
|
|
}
|
|
+
|
|
+int
|
|
+GOMP_PLUGIN_acc_default_dim (unsigned int i)
|
|
+{
|
|
+ if (i >= GOMP_DIM_MAX)
|
|
+ {
|
|
+ gomp_fatal ("invalid dimension argument: %d", i);
|
|
+ return -1;
|
|
+ }
|
|
+ return goacc_default_dims[i];
|
|
+}
|
|
--- libgomp/libgomp_g.h.jj 2018-04-25 09:40:31.320655306 +0200
|
|
+++ libgomp/libgomp_g.h 2019-05-07 18:46:36.513109943 +0200
|
|
@@ -1,4 +1,4 @@
|
|
-/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
|
|
+/* Copyright (C) 2005-2019 Free Software Foundation, Inc.
|
|
Contributed by Richard Henderson <rth@redhat.com>.
|
|
|
|
This file is part of the GNU Offloading and Multi Processing Library
|
|
@@ -31,6 +31,7 @@
|
|
|
|
#include <stdbool.h>
|
|
#include <stddef.h>
|
|
+#include "gstdint.h"
|
|
|
|
/* barrier.c */
|
|
|
|
@@ -56,6 +57,12 @@ extern bool GOMP_loop_nonmonotonic_dynam
|
|
long *, long *);
|
|
extern bool GOMP_loop_nonmonotonic_guided_start (long, long, long, long,
|
|
long *, long *);
|
|
+extern bool GOMP_loop_nonmonotonic_runtime_start (long, long, long,
|
|
+ long *, long *);
|
|
+extern bool GOMP_loop_maybe_nonmonotonic_runtime_start (long, long, long,
|
|
+ long *, long *);
|
|
+extern bool GOMP_loop_start (long, long, long, long, long, long *, long *,
|
|
+ uintptr_t *, void **);
|
|
|
|
extern bool GOMP_loop_ordered_static_start (long, long, long, long,
|
|
long *, long *);
|
|
@@ -64,6 +71,8 @@ extern bool GOMP_loop_ordered_dynamic_st
|
|
extern bool GOMP_loop_ordered_guided_start (long, long, long, long,
|
|
long *, long *);
|
|
extern bool GOMP_loop_ordered_runtime_start (long, long, long, long *, long *);
|
|
+extern bool GOMP_loop_ordered_start (long, long, long, long, long, long *,
|
|
+ long *, uintptr_t *, void **);
|
|
|
|
extern bool GOMP_loop_static_next (long *, long *);
|
|
extern bool GOMP_loop_dynamic_next (long *, long *);
|
|
@@ -71,6 +80,8 @@ extern bool GOMP_loop_guided_next (long
|
|
extern bool GOMP_loop_runtime_next (long *, long *);
|
|
extern bool GOMP_loop_nonmonotonic_dynamic_next (long *, long *);
|
|
extern bool GOMP_loop_nonmonotonic_guided_next (long *, long *);
|
|
+extern bool GOMP_loop_nonmonotonic_runtime_next (long *, long *);
|
|
+extern bool GOMP_loop_maybe_nonmonotonic_runtime_next (long *, long *);
|
|
|
|
extern bool GOMP_loop_ordered_static_next (long *, long *);
|
|
extern bool GOMP_loop_ordered_dynamic_next (long *, long *);
|
|
@@ -85,6 +96,8 @@ extern bool GOMP_loop_doacross_guided_st
|
|
long *);
|
|
extern bool GOMP_loop_doacross_runtime_start (unsigned, long *, long *,
|
|
long *);
|
|
+extern bool GOMP_loop_doacross_start (unsigned, long *, long, long, long *,
|
|
+ long *, uintptr_t *, void **);
|
|
|
|
extern void GOMP_parallel_loop_static_start (void (*)(void *), void *,
|
|
unsigned, long, long, long, long);
|
|
@@ -112,6 +125,13 @@ extern void GOMP_parallel_loop_nonmonoto
|
|
extern void GOMP_parallel_loop_nonmonotonic_guided (void (*)(void *), void *,
|
|
unsigned, long, long,
|
|
long, long, unsigned);
|
|
+extern void GOMP_parallel_loop_nonmonotonic_runtime (void (*)(void *), void *,
|
|
+ unsigned, long, long,
|
|
+ long, unsigned);
|
|
+extern void GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*)(void *),
|
|
+ void *, unsigned,
|
|
+ long, long,
|
|
+ long, unsigned);
|
|
|
|
extern void GOMP_loop_end (void);
|
|
extern void GOMP_loop_end_nowait (void);
|
|
@@ -154,6 +174,21 @@ extern bool GOMP_loop_ull_nonmonotonic_g
|
|
unsigned long long,
|
|
unsigned long long *,
|
|
unsigned long long *);
|
|
+extern bool GOMP_loop_ull_nonmonotonic_runtime_start (bool, unsigned long long,
|
|
+ unsigned long long,
|
|
+ unsigned long long,
|
|
+ unsigned long long *,
|
|
+ unsigned long long *);
|
|
+extern bool GOMP_loop_ull_maybe_nonmonotonic_runtime_start (bool,
|
|
+ unsigned long long,
|
|
+ unsigned long long,
|
|
+ unsigned long long,
|
|
+ unsigned long long *,
|
|
+ unsigned long long *);
|
|
+extern bool GOMP_loop_ull_start (bool, unsigned long long, unsigned long long,
|
|
+ unsigned long long, long, unsigned long long,
|
|
+ unsigned long long *, unsigned long long *,
|
|
+ uintptr_t *, void **);
|
|
|
|
extern bool GOMP_loop_ull_ordered_static_start (bool, unsigned long long,
|
|
unsigned long long,
|
|
@@ -178,6 +213,13 @@ extern bool GOMP_loop_ull_ordered_runtim
|
|
unsigned long long,
|
|
unsigned long long *,
|
|
unsigned long long *);
|
|
+extern bool GOMP_loop_ull_ordered_start (bool, unsigned long long,
|
|
+ unsigned long long,
|
|
+ unsigned long long, long,
|
|
+ unsigned long long,
|
|
+ unsigned long long *,
|
|
+ unsigned long long *,
|
|
+ uintptr_t *, void **);
|
|
|
|
extern bool GOMP_loop_ull_static_next (unsigned long long *,
|
|
unsigned long long *);
|
|
@@ -191,6 +233,10 @@ extern bool GOMP_loop_ull_nonmonotonic_d
|
|
unsigned long long *);
|
|
extern bool GOMP_loop_ull_nonmonotonic_guided_next (unsigned long long *,
|
|
unsigned long long *);
|
|
+extern bool GOMP_loop_ull_nonmonotonic_runtime_next (unsigned long long *,
|
|
+ unsigned long long *);
|
|
+extern bool GOMP_loop_ull_maybe_nonmonotonic_runtime_next (unsigned long long *,
|
|
+ unsigned long long *);
|
|
|
|
extern bool GOMP_loop_ull_ordered_static_next (unsigned long long *,
|
|
unsigned long long *);
|
|
@@ -220,6 +266,11 @@ extern bool GOMP_loop_ull_doacross_runti
|
|
unsigned long long *,
|
|
unsigned long long *,
|
|
unsigned long long *);
|
|
+extern bool GOMP_loop_ull_doacross_start (unsigned, unsigned long long *,
|
|
+ long, unsigned long long,
|
|
+ unsigned long long *,
|
|
+ unsigned long long *,
|
|
+ uintptr_t *, void **);
|
|
|
|
/* ordered.c */
|
|
|
|
@@ -235,6 +286,8 @@ extern void GOMP_doacross_ull_wait (unsi
|
|
extern void GOMP_parallel_start (void (*) (void *), void *, unsigned);
|
|
extern void GOMP_parallel_end (void);
|
|
extern void GOMP_parallel (void (*) (void *), void *, unsigned, unsigned);
|
|
+extern unsigned GOMP_parallel_reductions (void (*) (void *), void *, unsigned,
|
|
+ unsigned);
|
|
extern bool GOMP_cancel (int, bool);
|
|
extern bool GOMP_cancellation_point (int);
|
|
|
|
@@ -251,13 +304,19 @@ extern void GOMP_taskloop_ull (void (*)
|
|
unsigned long long, unsigned long long,
|
|
unsigned long long);
|
|
extern void GOMP_taskwait (void);
|
|
+extern void GOMP_taskwait_depend (void **);
|
|
extern void GOMP_taskyield (void);
|
|
extern void GOMP_taskgroup_start (void);
|
|
extern void GOMP_taskgroup_end (void);
|
|
+extern void GOMP_taskgroup_reduction_register (uintptr_t *);
|
|
+extern void GOMP_taskgroup_reduction_unregister (uintptr_t *);
|
|
+extern void GOMP_task_reduction_remap (size_t, size_t, void **);
|
|
+extern void GOMP_workshare_task_reduction_unregister (bool);
|
|
|
|
/* sections.c */
|
|
|
|
extern unsigned GOMP_sections_start (unsigned);
|
|
+extern unsigned GOMP_sections2_start (unsigned, uintptr_t *, void **);
|
|
extern unsigned GOMP_sections_next (void);
|
|
extern void GOMP_parallel_sections_start (void (*) (void *), void *,
|
|
unsigned, unsigned);
|
|
@@ -293,6 +352,11 @@ extern void GOMP_target_enter_exit_data
|
|
void **);
|
|
extern void GOMP_teams (unsigned int, unsigned int);
|
|
|
|
+/* teams.c */
|
|
+
|
|
+extern void GOMP_teams_reg (void (*) (void *), void *, unsigned, unsigned,
|
|
+ unsigned);
|
|
+
|
|
/* oacc-parallel.c */
|
|
|
|
extern void GOACC_parallel_keyed (int, void (*) (void *), size_t,
|
|
--- libgomp/affinity.c.jj 2018-04-25 09:40:31.913655581 +0200
|
|
+++ libgomp/affinity.c 2019-05-07 18:46:36.254114081 +0200
|
|
@@ -26,6 +26,8 @@
|
|
/* This is a generic stub implementation of a CPU affinity setting. */
|
|
|
|
#include "libgomp.h"
|
|
+#include <string.h>
|
|
+#include <stdio.h>
|
|
|
|
void
|
|
gomp_init_affinity (void)
|
|
@@ -138,5 +140,17 @@ gomp_get_place_proc_ids_8 (int place_num
|
|
(void) ids;
|
|
}
|
|
|
|
+void
|
|
+gomp_display_affinity_place (char *buffer, size_t size, size_t *ret,
|
|
+ int place)
|
|
+{
|
|
+ char buf[sizeof (long) * 3 + 4];
|
|
+ if (gomp_available_cpus > 1)
|
|
+ sprintf (buf, "0-%lu", gomp_available_cpus - 1);
|
|
+ else
|
|
+ strcpy (buf, "0");
|
|
+ gomp_display_string (buffer, size, ret, buf, strlen (buf));
|
|
+}
|
|
+
|
|
ialias(omp_get_place_num_procs)
|
|
ialias(omp_get_place_proc_ids)
|
|
--- libgomp/sections.c.jj 2018-04-25 09:40:31.924655586 +0200
|
|
+++ libgomp/sections.c 2019-05-07 18:46:36.535109592 +0200
|
|
@@ -26,8 +26,11 @@
|
|
/* This file handles the SECTIONS construct. */
|
|
|
|
#include "libgomp.h"
|
|
+#include <string.h>
|
|
|
|
|
|
+ialias_redirect (GOMP_taskgroup_reduction_register)
|
|
+
|
|
/* Initialize the given work share construct from the given arguments. */
|
|
|
|
static inline void
|
|
@@ -72,7 +75,7 @@ GOMP_sections_start (unsigned count)
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
long s, e, ret;
|
|
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_sections_init (thr->ts.work_share, count);
|
|
gomp_work_share_init_done ();
|
|
@@ -95,6 +98,66 @@ GOMP_sections_start (unsigned count)
|
|
return ret;
|
|
}
|
|
|
|
+unsigned
|
|
+GOMP_sections2_start (unsigned count, uintptr_t *reductions, void **mem)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ long s, e, ret;
|
|
+
|
|
+ if (reductions)
|
|
+ gomp_workshare_taskgroup_start ();
|
|
+ if (gomp_work_share_start (0))
|
|
+ {
|
|
+ gomp_sections_init (thr->ts.work_share, count);
|
|
+ if (reductions)
|
|
+ {
|
|
+ GOMP_taskgroup_reduction_register (reductions);
|
|
+ thr->task->taskgroup->workshare = true;
|
|
+ thr->ts.work_share->task_reductions = reductions;
|
|
+ }
|
|
+ if (mem)
|
|
+ {
|
|
+ uintptr_t size = (uintptr_t) *mem;
|
|
+ if (size > (sizeof (struct gomp_work_share)
|
|
+ - offsetof (struct gomp_work_share,
|
|
+ inline_ordered_team_ids)))
|
|
+ thr->ts.work_share->ordered_team_ids
|
|
+ = gomp_malloc_cleared (size);
|
|
+ else
|
|
+ memset (thr->ts.work_share->ordered_team_ids, '\0', size);
|
|
+ *mem = (void *) thr->ts.work_share->ordered_team_ids;
|
|
+ }
|
|
+ gomp_work_share_init_done ();
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (reductions)
|
|
+ {
|
|
+ uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
|
|
+ gomp_workshare_task_reduction_register (reductions,
|
|
+ first_reductions);
|
|
+ }
|
|
+ if (mem)
|
|
+ *mem = (void *) thr->ts.work_share->ordered_team_ids;
|
|
+ }
|
|
+
|
|
+#ifdef HAVE_SYNC_BUILTINS
|
|
+ if (gomp_iter_dynamic_next (&s, &e))
|
|
+ ret = s;
|
|
+ else
|
|
+ ret = 0;
|
|
+#else
|
|
+ gomp_mutex_lock (&thr->ts.work_share->lock);
|
|
+ if (gomp_iter_dynamic_next_locked (&s, &e))
|
|
+ ret = s;
|
|
+ else
|
|
+ ret = 0;
|
|
+ gomp_mutex_unlock (&thr->ts.work_share->lock);
|
|
+#endif
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
/* This routine is called when the thread completes processing of the
|
|
section currently assigned to it. If the work-share construct is
|
|
bound directly to a parallel construct, then the construct may have
|
|
@@ -140,7 +203,7 @@ GOMP_parallel_sections_start (void (*fn)
|
|
num_threads = gomp_resolve_num_threads (num_threads, count);
|
|
team = gomp_new_team (num_threads);
|
|
gomp_sections_init (&team->work_shares[0], count);
|
|
- gomp_team_start (fn, data, num_threads, 0, team);
|
|
+ gomp_team_start (fn, data, num_threads, 0, team, NULL);
|
|
}
|
|
|
|
ialias_redirect (GOMP_parallel_end)
|
|
@@ -154,7 +217,7 @@ GOMP_parallel_sections (void (*fn) (void
|
|
num_threads = gomp_resolve_num_threads (num_threads, count);
|
|
team = gomp_new_team (num_threads);
|
|
gomp_sections_init (&team->work_shares[0], count);
|
|
- gomp_team_start (fn, data, num_threads, flags, team);
|
|
+ gomp_team_start (fn, data, num_threads, flags, team, NULL);
|
|
fn (data);
|
|
GOMP_parallel_end ();
|
|
}
|
|
--- libgomp/config/linux/affinity.c.jj 2018-04-25 09:40:31.875655563 +0200
|
|
+++ libgomp/config/linux/affinity.c 2019-05-07 18:46:36.344112642 +0200
|
|
@@ -396,6 +396,56 @@ gomp_get_place_proc_ids_8 (int place_num
|
|
*ids++ = i;
|
|
}
|
|
|
|
+void
|
|
+gomp_display_affinity_place (char *buffer, size_t size, size_t *ret,
|
|
+ int place)
|
|
+{ /* Emit the CPU list of PLACE as comma-separated ranges, e.g. "0-3,8". */
|
|
+ cpu_set_t *cpusetp; /* BUFFER, SIZE and RET are only forwarded to gomp_display_string. */
|
|
+ char buf[sizeof (long) * 3 + 4]; /* Holds one piece at a time: "0-N", "0", "N", ",N" or "-N". */
|
|
+ if (place >= 0 && place < gomp_places_list_len) /* Valid place index: use that place's CPU set. */
|
|
+ cpusetp = (cpu_set_t *) gomp_places_list[place];
|
|
+ else if (gomp_cpusetp) /* Otherwise fall back to gomp_cpusetp when it is non-NULL. */
|
|
+ cpusetp = gomp_cpusetp;
|
|
+ else /* No CPU set at all: describe CPUs 0 .. gomp_available_cpus - 1 and stop. */
|
|
+ {
|
|
+ if (gomp_available_cpus > 1)
|
|
+ sprintf (buf, "0-%lu", gomp_available_cpus - 1);
|
|
+ else
|
|
+ strcpy (buf, "0");
|
|
+ gomp_display_string (buffer, size, ret, buf, strlen (buf));
|
|
+ return;
|
|
+ }
|
|
+ /* Walk every bit of the set and emit one piece per range boundary. */
|
|
+ unsigned long i, max = 8 * gomp_cpuset_size, start; /* MAX is the bit count (presumably gomp_cpuset_size is in bytes). */
|
|
+ bool prev_set = false;
|
|
+ start = max; /* start == max means no range has been emitted yet. */
|
|
+ for (i = 0; i <= max; i++) /* The extra i == max pass closes a range that reaches the last bit. */
|
|
+ {
|
|
+ bool this_set;
|
|
+ if (i == max)
|
|
+ this_set = false;
|
|
+ else
|
|
+ this_set = CPU_ISSET_S (i, gomp_cpuset_size, cpusetp);
|
|
+ if (this_set != prev_set) /* Only transitions between set and clear produce output. */
|
|
+ {
|
|
+ prev_set = this_set;
|
|
+ if (this_set) /* A range begins at I. */
|
|
+ {
|
|
+ char *p = buf;
|
|
+ if (start != max) /* Not the first range: separate it from the previous one. */
|
|
+ *p++ = ',';
|
|
+ sprintf (p, "%lu", i);
|
|
+ start = i;
|
|
+ }
|
|
+ else if (i == start + 1) /* Single-CPU range: its number is already printed. */
|
|
+ continue;
|
|
+ else /* Longer range: append "-LAST", where LAST is the final set bit. */
|
|
+ sprintf (buf, "-%lu", i - 1);
|
|
+ gomp_display_string (buffer, size, ret, buf, strlen (buf));
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
ialias(omp_get_place_num_procs)
|
|
ialias(omp_get_place_proc_ids)
|
|
|
|
--- libgomp/config/linux/ia64/futex.h.jj 2018-04-25 09:40:31.877655564 +0200
|
|
+++ libgomp/config/linux/ia64/futex.h 2019-05-07 18:46:36.344112642 +0200
|
|
@@ -45,8 +45,8 @@ sys_futex0(int *addr, int op, int val)
|
|
"=r"(r8), "=r"(r10)
|
|
: "r"(r15), "r"(out0), "r"(out1), "r"(out2), "r"(out3)
|
|
: "memory", "out4", "out5", "out6", "out7",
|
|
- /* Non-stacked integer registers, minus r8, r10, r15. */
|
|
- "r2", "r3", "r9", "r11", "r12", "r13", "r14", "r16", "r17", "r18",
|
|
+ /* Non-stacked integer registers, minus r8, r10, r12, r15. */
|
|
+ "r2", "r3", "r9", "r11", "r13", "r14", "r16", "r17", "r18",
|
|
"r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27",
|
|
"r28", "r29", "r30", "r31",
|
|
/* Predicate registers. */
|
|
--- libgomp/config/nvptx/teams.c.jj 2019-05-07 18:46:36.459110805 +0200
|
|
+++ libgomp/config/nvptx/teams.c 2019-05-07 18:46:36.459110805 +0200
|
|
@@ -0,0 +1,57 @@
|
|
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
|
|
+ Contributed by Alexander Monakov <amonakov@ispras.ru>
|
|
+
|
|
+ This file is part of the GNU Offloading and Multi Processing Library
|
|
+ (libgomp).
|
|
+
|
|
+ Libgomp is free software; you can redistribute it and/or modify it
|
|
+ under the terms of the GNU General Public License as published by
|
|
+ the Free Software Foundation; either version 3, or (at your option)
|
|
+ any later version.
|
|
+
|
|
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
+ more details.
|
|
+
|
|
+ Under Section 7 of GPL version 3, you are granted additional
|
|
+ permissions described in the GCC Runtime Library Exception, version
|
|
+ 3.1, as published by the Free Software Foundation.
|
|
+
|
|
+ You should have received a copy of the GNU General Public License and
|
|
+ a copy of the GCC Runtime Library Exception along with this program;
|
|
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* This file defines OpenMP API entry points that accelerator targets are
|
|
+ expected to replace. */
|
|
+
|
|
+#include "libgomp.h"
|
|
+
|
|
+void
|
|
+GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams,
|
|
+ unsigned int thread_limit, unsigned int flags)
|
|
+{ /* Stub: every argument is discarded and FN is never called. */
|
|
+ (void) fn; /* The casts to void have no effect beyond marking the parameters as unused. */
|
|
+ (void) data;
|
|
+ (void) flags;
|
|
+ (void) num_teams;
|
|
+ (void) thread_limit;
|
|
+}
|
|
+
|
|
+int
|
|
+omp_get_num_teams (void)
|
|
+{ /* Returns the team count derived from gomp_num_teams_var. */
|
|
+ return gomp_num_teams_var + 1; /* The variable holds the count minus one (GOMP_teams in nvptx/target.c stores num_teams - 1). */
|
|
+}
|
|
+
|
|
+int
|
|
+omp_get_team_num (void)
|
|
+{ /* Returns the value of the PTX special register %ctaid.x. */
|
|
+ int ctaid; /* Output operand of the asm statement below. */
|
|
+ asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid)); /* "%%" is an escaped '%' in extended asm; %0 is CTAID. */
|
|
+ return ctaid;
|
|
+}
|
|
+
|
|
+ialias (omp_get_num_teams)
|
|
+ialias (omp_get_team_num)
|
|
--- libgomp/config/nvptx/team.c.jj 2018-04-25 09:40:31.890655570 +0200
|
|
+++ libgomp/config/nvptx/team.c 2019-05-07 18:46:36.459110805 +0200
|
|
@@ -116,7 +116,8 @@ gomp_thread_start (struct gomp_thread_po
|
|
|
|
void
|
|
gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
|
|
- unsigned flags, struct gomp_team *team)
|
|
+ unsigned flags, struct gomp_team *team,
|
|
+ struct gomp_taskgroup *taskgroup)
|
|
{
|
|
struct gomp_thread *thr, *nthr;
|
|
struct gomp_task *task;
|
|
@@ -147,6 +148,7 @@ gomp_team_start (void (*fn) (void *), vo
|
|
nthreads_var = icv->nthreads_var;
|
|
gomp_init_task (thr->task, task, icv);
|
|
team->implicit_task[0].icv.nthreads_var = nthreads_var;
|
|
+ team->implicit_task[0].taskgroup = taskgroup;
|
|
|
|
if (nthreads == 1)
|
|
return;
|
|
@@ -166,6 +168,7 @@ gomp_team_start (void (*fn) (void *), vo
|
|
nthr->task = &team->implicit_task[i];
|
|
gomp_init_task (nthr->task, task, icv);
|
|
team->implicit_task[i].icv.nthreads_var = nthreads_var;
|
|
+ team->implicit_task[i].taskgroup = taskgroup;
|
|
nthr->fn = fn;
|
|
nthr->data = data;
|
|
team->ordered_release[i] = &nthr->release;
|
|
@@ -174,5 +177,11 @@ gomp_team_start (void (*fn) (void *), vo
|
|
gomp_simple_barrier_wait (&pool->threads_dock);
|
|
}
|
|
|
|
+int
|
|
+gomp_pause_host (void)
|
|
+{ /* nvptx variant: performs no work. */
|
|
+ return -1; /* Unconditional; presumably tells callers that pausing is unsupported here -- confirm against callers. */
|
|
+}
|
|
+
|
|
#include "../../team.c"
|
|
#endif
|
|
--- libgomp/config/nvptx/oacc-parallel.c.jj 2018-04-25 09:40:31.887655569 +0200
|
|
+++ libgomp/config/nvptx/oacc-parallel.c 2019-05-07 18:46:36.453110901 +0200
|
|
@@ -1,358 +0,0 @@
|
|
-/* OpenACC constructs
|
|
-
|
|
- Copyright (C) 2014-2018 Free Software Foundation, Inc.
|
|
-
|
|
- Contributed by Mentor Embedded.
|
|
-
|
|
- This file is part of the GNU Offloading and Multi Processing Library
|
|
- (libgomp).
|
|
-
|
|
- Libgomp is free software; you can redistribute it and/or modify it
|
|
- under the terms of the GNU General Public License as published by
|
|
- the Free Software Foundation; either version 3, or (at your option)
|
|
- any later version.
|
|
-
|
|
- Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
- FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
- more details.
|
|
-
|
|
- Under Section 7 of GPL version 3, you are granted additional
|
|
- permissions described in the GCC Runtime Library Exception, version
|
|
- 3.1, as published by the Free Software Foundation.
|
|
-
|
|
- You should have received a copy of the GNU General Public License and
|
|
- a copy of the GCC Runtime Library Exception along with this program;
|
|
- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
- <http://www.gnu.org/licenses/>. */
|
|
-
|
|
-#include "libgomp_g.h"
|
|
-
|
|
-__asm__ (".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1);\n"
|
|
- ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1);\n"
|
|
- ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1);\n"
|
|
- ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1);\n"
|
|
- "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_num_threads\n"
|
|
- ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n"
|
|
- "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_thread_num\n"
|
|
- ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n"
|
|
- "// BEGIN GLOBAL FUNCTION DECL: abort\n"
|
|
- ".extern .func abort;\n"
|
|
- ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1)\n"
|
|
- "{\n"
|
|
- ".reg .u32 %ar1;\n"
|
|
- ".reg .u32 %retval;\n"
|
|
- ".reg .u64 %hr10;\n"
|
|
- ".reg .u32 %r22;\n"
|
|
- ".reg .u32 %r23;\n"
|
|
- ".reg .u32 %r24;\n"
|
|
- ".reg .u32 %r25;\n"
|
|
- ".reg .u32 %r26;\n"
|
|
- ".reg .u32 %r27;\n"
|
|
- ".reg .u32 %r28;\n"
|
|
- ".reg .u32 %r29;\n"
|
|
- ".reg .pred %r30;\n"
|
|
- ".reg .u32 %r31;\n"
|
|
- ".reg .pred %r32;\n"
|
|
- ".reg .u32 %r33;\n"
|
|
- ".reg .pred %r34;\n"
|
|
- ".local .align 8 .b8 %frame[4];\n"
|
|
- "ld.param.u32 %ar1,[%in_ar1];\n"
|
|
- "mov.u32 %r27,%ar1;\n"
|
|
- "st.local.u32 [%frame],%r27;\n"
|
|
- "ld.local.u32 %r28,[%frame];\n"
|
|
- "mov.u32 %r29,1;\n"
|
|
- "setp.eq.u32 %r30,%r28,%r29;\n"
|
|
- "@%r30 bra $L4;\n"
|
|
- "mov.u32 %r31,2;\n"
|
|
- "setp.eq.u32 %r32,%r28,%r31;\n"
|
|
- "@%r32 bra $L5;\n"
|
|
- "mov.u32 %r33,0;\n"
|
|
- "setp.eq.u32 %r34,%r28,%r33;\n"
|
|
- "@!%r34 bra $L8;\n"
|
|
- "mov.u32 %r23,%tid.x;\n"
|
|
- "mov.u32 %r22,%r23;\n"
|
|
- "bra $L7;\n"
|
|
- "$L4:\n"
|
|
- "mov.u32 %r24,%tid.y;\n"
|
|
- "mov.u32 %r22,%r24;\n"
|
|
- "bra $L7;\n"
|
|
- "$L5:\n"
|
|
- "mov.u32 %r25,%tid.z;\n"
|
|
- "mov.u32 %r22,%r25;\n"
|
|
- "bra $L7;\n"
|
|
- "$L8:\n"
|
|
- "{\n"
|
|
- "{\n"
|
|
- "call abort;\n"
|
|
- "}\n"
|
|
- "}\n"
|
|
- "$L7:\n"
|
|
- "mov.u32 %r26,%r22;\n"
|
|
- "mov.u32 %retval,%r26;\n"
|
|
- "st.param.u32 [%out_retval],%retval;\n"
|
|
- "ret;\n"
|
|
- "}\n"
|
|
- ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1)\n"
|
|
- "{\n"
|
|
- ".reg .u32 %ar1;\n"
|
|
- ".reg .u32 %retval;\n"
|
|
- ".reg .u64 %hr10;\n"
|
|
- ".reg .u32 %r22;\n"
|
|
- ".reg .u32 %r23;\n"
|
|
- ".reg .u32 %r24;\n"
|
|
- ".reg .u32 %r25;\n"
|
|
- ".reg .u32 %r26;\n"
|
|
- ".reg .u32 %r27;\n"
|
|
- ".reg .u32 %r28;\n"
|
|
- ".reg .u32 %r29;\n"
|
|
- ".reg .pred %r30;\n"
|
|
- ".reg .u32 %r31;\n"
|
|
- ".reg .pred %r32;\n"
|
|
- ".reg .u32 %r33;\n"
|
|
- ".reg .pred %r34;\n"
|
|
- ".local .align 8 .b8 %frame[4];\n"
|
|
- "ld.param.u32 %ar1,[%in_ar1];\n"
|
|
- "mov.u32 %r27,%ar1;\n"
|
|
- "st.local.u32 [%frame],%r27;\n"
|
|
- "ld.local.u32 %r28,[%frame];\n"
|
|
- "mov.u32 %r29,1;\n"
|
|
- "setp.eq.u32 %r30,%r28,%r29;\n"
|
|
- "@%r30 bra $L11;\n"
|
|
- "mov.u32 %r31,2;\n"
|
|
- "setp.eq.u32 %r32,%r28,%r31;\n"
|
|
- "@%r32 bra $L12;\n"
|
|
- "mov.u32 %r33,0;\n"
|
|
- "setp.eq.u32 %r34,%r28,%r33;\n"
|
|
- "@!%r34 bra $L15;\n"
|
|
- "mov.u32 %r23,%ntid.x;\n"
|
|
- "mov.u32 %r22,%r23;\n"
|
|
- "bra $L14;\n"
|
|
- "$L11:\n"
|
|
- "mov.u32 %r24,%ntid.y;\n"
|
|
- "mov.u32 %r22,%r24;\n"
|
|
- "bra $L14;\n"
|
|
- "$L12:\n"
|
|
- "mov.u32 %r25,%ntid.z;\n"
|
|
- "mov.u32 %r22,%r25;\n"
|
|
- "bra $L14;\n"
|
|
- "$L15:\n"
|
|
- "{\n"
|
|
- "{\n"
|
|
- "call abort;\n"
|
|
- "}\n"
|
|
- "}\n"
|
|
- "$L14:\n"
|
|
- "mov.u32 %r26,%r22;\n"
|
|
- "mov.u32 %retval,%r26;\n"
|
|
- "st.param.u32 [%out_retval],%retval;\n"
|
|
- "ret;\n"
|
|
- "}\n"
|
|
- ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1)\n"
|
|
- "{\n"
|
|
- ".reg .u32 %ar1;\n"
|
|
- ".reg .u32 %retval;\n"
|
|
- ".reg .u64 %hr10;\n"
|
|
- ".reg .u32 %r22;\n"
|
|
- ".reg .u32 %r23;\n"
|
|
- ".reg .u32 %r24;\n"
|
|
- ".reg .u32 %r25;\n"
|
|
- ".reg .u32 %r26;\n"
|
|
- ".reg .u32 %r27;\n"
|
|
- ".reg .u32 %r28;\n"
|
|
- ".reg .u32 %r29;\n"
|
|
- ".reg .pred %r30;\n"
|
|
- ".reg .u32 %r31;\n"
|
|
- ".reg .pred %r32;\n"
|
|
- ".reg .u32 %r33;\n"
|
|
- ".reg .pred %r34;\n"
|
|
- ".local .align 8 .b8 %frame[4];\n"
|
|
- "ld.param.u32 %ar1,[%in_ar1];\n"
|
|
- "mov.u32 %r27,%ar1;\n"
|
|
- "st.local.u32 [%frame],%r27;\n"
|
|
- "ld.local.u32 %r28,[%frame];\n"
|
|
- "mov.u32 %r29,1;\n"
|
|
- "setp.eq.u32 %r30,%r28,%r29;\n"
|
|
- "@%r30 bra $L18;\n"
|
|
- "mov.u32 %r31,2;\n"
|
|
- "setp.eq.u32 %r32,%r28,%r31;\n"
|
|
- "@%r32 bra $L19;\n"
|
|
- "mov.u32 %r33,0;\n"
|
|
- "setp.eq.u32 %r34,%r28,%r33;\n"
|
|
- "@!%r34 bra $L22;\n"
|
|
- "mov.u32 %r23,%ctaid.x;\n"
|
|
- "mov.u32 %r22,%r23;\n"
|
|
- "bra $L21;\n"
|
|
- "$L18:\n"
|
|
- "mov.u32 %r24,%ctaid.y;\n"
|
|
- "mov.u32 %r22,%r24;\n"
|
|
- "bra $L21;\n"
|
|
- "$L19:\n"
|
|
- "mov.u32 %r25,%ctaid.z;\n"
|
|
- "mov.u32 %r22,%r25;\n"
|
|
- "bra $L21;\n"
|
|
- "$L22:\n"
|
|
- "{\n"
|
|
- "{\n"
|
|
- "call abort;\n"
|
|
- "}\n"
|
|
- "}\n"
|
|
- "$L21:\n"
|
|
- "mov.u32 %r26,%r22;\n"
|
|
- "mov.u32 %retval,%r26;\n"
|
|
- "st.param.u32 [%out_retval],%retval;\n"
|
|
- "ret;\n"
|
|
- "}\n"
|
|
- ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1)\n"
|
|
- "{\n"
|
|
- ".reg .u32 %ar1;\n"
|
|
- ".reg .u32 %retval;\n"
|
|
- ".reg .u64 %hr10;\n"
|
|
- ".reg .u32 %r22;\n"
|
|
- ".reg .u32 %r23;\n"
|
|
- ".reg .u32 %r24;\n"
|
|
- ".reg .u32 %r25;\n"
|
|
- ".reg .u32 %r26;\n"
|
|
- ".reg .u32 %r27;\n"
|
|
- ".reg .u32 %r28;\n"
|
|
- ".reg .u32 %r29;\n"
|
|
- ".reg .pred %r30;\n"
|
|
- ".reg .u32 %r31;\n"
|
|
- ".reg .pred %r32;\n"
|
|
- ".reg .u32 %r33;\n"
|
|
- ".reg .pred %r34;\n"
|
|
- ".local .align 8 .b8 %frame[4];\n"
|
|
- "ld.param.u32 %ar1,[%in_ar1];\n"
|
|
- "mov.u32 %r27,%ar1;\n"
|
|
- "st.local.u32 [%frame],%r27;\n"
|
|
- "ld.local.u32 %r28,[%frame];\n"
|
|
- "mov.u32 %r29,1;\n"
|
|
- "setp.eq.u32 %r30,%r28,%r29;\n"
|
|
- "@%r30 bra $L25;\n"
|
|
- "mov.u32 %r31,2;\n"
|
|
- "setp.eq.u32 %r32,%r28,%r31;\n"
|
|
- "@%r32 bra $L26;\n"
|
|
- "mov.u32 %r33,0;\n"
|
|
- "setp.eq.u32 %r34,%r28,%r33;\n"
|
|
- "@!%r34 bra $L29;\n"
|
|
- "mov.u32 %r23,%nctaid.x;\n"
|
|
- "mov.u32 %r22,%r23;\n"
|
|
- "bra $L28;\n"
|
|
- "$L25:\n"
|
|
- "mov.u32 %r24,%nctaid.y;\n"
|
|
- "mov.u32 %r22,%r24;\n"
|
|
- "bra $L28;\n"
|
|
- "$L26:\n"
|
|
- "mov.u32 %r25,%nctaid.z;\n"
|
|
- "mov.u32 %r22,%r25;\n"
|
|
- "bra $L28;\n"
|
|
- "$L29:\n"
|
|
- "{\n"
|
|
- "{\n"
|
|
- "call abort;\n"
|
|
- "}\n"
|
|
- "}\n"
|
|
- "$L28:\n"
|
|
- "mov.u32 %r26,%r22;\n"
|
|
- "mov.u32 %retval,%r26;\n"
|
|
- "st.param.u32 [%out_retval],%retval;\n"
|
|
- "ret;\n"
|
|
- "}\n"
|
|
- "// BEGIN GLOBAL FUNCTION DEF: GOACC_get_num_threads\n"
|
|
- ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads\n"
|
|
- "{\n"
|
|
- ".reg .u32 %retval;\n"
|
|
- ".reg .u64 %hr10;\n"
|
|
- ".reg .u32 %r22;\n"
|
|
- ".reg .u32 %r23;\n"
|
|
- ".reg .u32 %r24;\n"
|
|
- ".reg .u32 %r25;\n"
|
|
- ".reg .u32 %r26;\n"
|
|
- ".reg .u32 %r27;\n"
|
|
- ".reg .u32 %r28;\n"
|
|
- ".reg .u32 %r29;\n"
|
|
- "mov.u32 %r26,0;\n"
|
|
- "{\n"
|
|
- ".param .u32 %retval_in;\n"
|
|
- "{\n"
|
|
- ".param .u32 %out_arg0;\n"
|
|
- "st.param.u32 [%out_arg0],%r26;\n"
|
|
- "call (%retval_in),GOACC_ntid,(%out_arg0);\n"
|
|
- "}\n"
|
|
- "ld.param.u32 %r27,[%retval_in];\n"
|
|
- "}\n"
|
|
- "mov.u32 %r22,%r27;\n"
|
|
- "mov.u32 %r28,0;\n"
|
|
- "{\n"
|
|
- ".param .u32 %retval_in;\n"
|
|
- "{\n"
|
|
- ".param .u32 %out_arg0;\n"
|
|
- "st.param.u32 [%out_arg0],%r28;\n"
|
|
- "call (%retval_in),GOACC_nctaid,(%out_arg0);\n"
|
|
- "}\n"
|
|
- "ld.param.u32 %r29,[%retval_in];\n"
|
|
- "}\n"
|
|
- "mov.u32 %r23,%r29;\n"
|
|
- "mul.lo.u32 %r24,%r22,%r23;\n"
|
|
- "mov.u32 %r25,%r24;\n"
|
|
- "mov.u32 %retval,%r25;\n"
|
|
- "st.param.u32 [%out_retval],%retval;\n"
|
|
- "ret;\n"
|
|
- "}\n"
|
|
- "// BEGIN GLOBAL FUNCTION DEF: GOACC_get_thread_num\n"
|
|
- ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num\n"
|
|
- "{\n"
|
|
- ".reg .u32 %retval;\n"
|
|
- ".reg .u64 %hr10;\n"
|
|
- ".reg .u32 %r22;\n"
|
|
- ".reg .u32 %r23;\n"
|
|
- ".reg .u32 %r24;\n"
|
|
- ".reg .u32 %r25;\n"
|
|
- ".reg .u32 %r26;\n"
|
|
- ".reg .u32 %r27;\n"
|
|
- ".reg .u32 %r28;\n"
|
|
- ".reg .u32 %r29;\n"
|
|
- ".reg .u32 %r30;\n"
|
|
- ".reg .u32 %r31;\n"
|
|
- ".reg .u32 %r32;\n"
|
|
- ".reg .u32 %r33;\n"
|
|
- "mov.u32 %r28,0;\n"
|
|
- "{\n"
|
|
- ".param .u32 %retval_in;\n"
|
|
- "{\n"
|
|
- ".param .u32 %out_arg0;\n"
|
|
- "st.param.u32 [%out_arg0],%r28;\n"
|
|
- "call (%retval_in),GOACC_ntid,(%out_arg0);\n"
|
|
- "}\n"
|
|
- "ld.param.u32 %r29,[%retval_in];\n"
|
|
- "}\n"
|
|
- "mov.u32 %r22,%r29;\n"
|
|
- "mov.u32 %r30,0;\n"
|
|
- "{\n"
|
|
- ".param .u32 %retval_in;\n"
|
|
- "{\n"
|
|
- ".param .u32 %out_arg0;\n"
|
|
- "st.param.u32 [%out_arg0],%r30;\n"
|
|
- "call (%retval_in),GOACC_ctaid,(%out_arg0);\n"
|
|
- "}\n"
|
|
- "ld.param.u32 %r31,[%retval_in];\n"
|
|
- "}\n"
|
|
- "mov.u32 %r23,%r31;\n"
|
|
- "mul.lo.u32 %r24,%r22,%r23;\n"
|
|
- "mov.u32 %r32,0;\n"
|
|
- "{\n"
|
|
- ".param .u32 %retval_in;\n"
|
|
- "{\n"
|
|
- ".param .u32 %out_arg0;\n"
|
|
- "st.param.u32 [%out_arg0],%r32;\n"
|
|
- "call (%retval_in),GOACC_tid,(%out_arg0);\n"
|
|
- "}\n"
|
|
- "ld.param.u32 %r33,[%retval_in];\n"
|
|
- "}\n"
|
|
- "mov.u32 %r25,%r33;\n"
|
|
- "add.u32 %r26,%r24,%r25;\n"
|
|
- "mov.u32 %r27,%r26;\n"
|
|
- "mov.u32 %retval,%r27;\n"
|
|
- "st.param.u32 [%out_retval],%retval;\n"
|
|
- "ret;\n"
|
|
- "}\n");
|
|
--- libgomp/config/nvptx/target.c.jj 2018-04-25 09:40:31.890655570 +0200
|
|
+++ libgomp/config/nvptx/target.c 2019-05-07 18:46:36.453110901 +0200
|
|
@@ -47,3 +47,21 @@ GOMP_teams (unsigned int num_teams, unsi
|
|
}
|
|
gomp_num_teams_var = num_teams - 1;
|
|
}
|
|
+
|
|
+int
|
|
+omp_pause_resource (omp_pause_resource_t kind, int device_num)
|
|
+{ /* nvptx variant: both arguments are ignored and nothing is paused. */
|
|
+ (void) kind;
|
|
+ (void) device_num;
|
|
+ return -1; /* Always nonzero; presumably the OpenMP "request not fulfilled" result -- confirm against the spec. */
|
|
+}
|
|
+
|
|
+int
|
|
+omp_pause_resource_all (omp_pause_resource_t kind)
|
|
+{ /* nvptx variant: KIND is ignored and nothing is paused. */
|
|
+ (void) kind;
|
|
+ return -1; /* Always nonzero; presumably the OpenMP "request not fulfilled" result -- confirm against the spec. */
|
|
+}
|
|
+
|
|
+ialias (omp_pause_resource)
|
|
+ialias (omp_pause_resource_all)
|
|
--- libgomp/config/nvptx/icv-device.c.jj 2018-04-25 09:40:31.889655570 +0200
|
|
+++ libgomp/config/nvptx/icv-device.c 2019-05-07 18:46:36.453110901 +0200
|
|
@@ -46,20 +46,6 @@ omp_get_num_devices (void)
|
|
}
|
|
|
|
int
|
|
-omp_get_num_teams (void)
|
|
-{
|
|
- return gomp_num_teams_var + 1;
|
|
-}
|
|
-
|
|
-int
|
|
-omp_get_team_num (void)
|
|
-{
|
|
- int ctaid;
|
|
- asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid));
|
|
- return ctaid;
|
|
-}
|
|
-
|
|
-int
|
|
omp_is_initial_device (void)
|
|
{
|
|
/* NVPTX is an accelerator-only target. */
|
|
@@ -69,6 +55,4 @@ omp_is_initial_device (void)
|
|
ialias (omp_set_default_device)
|
|
ialias (omp_get_default_device)
|
|
ialias (omp_get_num_devices)
|
|
-ialias (omp_get_num_teams)
|
|
-ialias (omp_get_team_num)
|
|
ialias (omp_is_initial_device)
|
|
--- libgomp/config/nvptx/affinity-fmt.c.jj 2019-05-07 18:46:36.358112419 +0200
|
|
+++ libgomp/config/nvptx/affinity-fmt.c 2019-05-07 18:46:36.358112419 +0200
|
|
@@ -0,0 +1,51 @@
|
|
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
|
|
+
|
|
+ This file is part of the GNU Offloading and Multi Processing Library
|
|
+ (libgomp).
|
|
+
|
|
+ Libgomp is free software; you can redistribute it and/or modify it
|
|
+ under the terms of the GNU General Public License as published by
|
|
+ the Free Software Foundation; either version 3, or (at your option)
|
|
+ any later version.
|
|
+
|
|
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
+ more details.
|
|
+
|
|
+ Under Section 7 of GPL version 3, you are granted additional
|
|
+ permissions described in the GCC Runtime Library Exception, version
|
|
+ 3.1, as published by the Free Software Foundation.
|
|
+
|
|
+ You should have received a copy of the GNU General Public License and
|
|
+ a copy of the GCC Runtime Library Exception along with this program;
|
|
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+#include "libgomp.h"
|
|
+#include <string.h>
|
|
+#include <stdio.h>
|
|
+#include <stdlib.h>
|
|
+#ifdef HAVE_UNISTD_H
|
|
+#include <unistd.h>
|
|
+#endif
|
|
+#ifdef HAVE_INTTYPES_H
|
|
+# include <inttypes.h> /* For PRIx64. */
|
|
+#endif
|
|
+#ifdef HAVE_UNAME
|
|
+#include <sys/utsname.h>
|
|
+#endif
|
|
+
|
|
+/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for nvptx,
|
|
+ while the nvptx newlib implementation does not support those functions.
|
|
+ Override the configure test results here. */
|
|
+#undef HAVE_GETPID
|
|
+#undef HAVE_GETHOSTNAME
|
|
+
|
|
+/* The nvptx newlib implementation does not support fwrite, but it does support
|
|
+ write. Map fwrite to write. */
|
|
+#undef fwrite
|
|
+#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size))
|
|
+
|
|
+#include "../../affinity-fmt.c"
|
|
+
|
|
--- libgomp/config/mingw32/affinity-fmt.c.jj 2019-05-07 18:46:36.344112642 +0200
|
|
+++ libgomp/config/mingw32/affinity-fmt.c 2019-05-07 18:46:36.344112642 +0200
|
|
@@ -0,0 +1,68 @@
|
|
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
|
|
+ Contributed by Jakub Jelinek <jakub@redhat.com>.
|
|
+
|
|
+ This file is part of the GNU Offloading and Multi Processing Library
|
|
+ (libgomp).
|
|
+
|
|
+ Libgomp is free software; you can redistribute it and/or modify it
|
|
+ under the terms of the GNU General Public License as published by
|
|
+ the Free Software Foundation; either version 3, or (at your option)
|
|
+ any later version.
|
|
+
|
|
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
+ more details.
|
|
+
|
|
+ Under Section 7 of GPL version 3, you are granted additional
|
|
+ permissions described in the GCC Runtime Library Exception, version
|
|
+ 3.1, as published by the Free Software Foundation.
|
|
+
|
|
+ You should have received a copy of the GNU General Public License and
|
|
+ a copy of the GCC Runtime Library Exception along with this program;
|
|
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+#include "libgomp.h"
|
|
+#include <string.h>
|
|
+#include <stdio.h>
|
|
+#include <stdlib.h>
|
|
+#ifdef HAVE_UNISTD_H
|
|
+#include <unistd.h>
|
|
+#endif
|
|
+#ifdef HAVE_INTTYPES_H
|
|
+# include <inttypes.h> /* For PRIx64. */
|
|
+#endif
|
|
+#define WIN32_LEAN_AND_MEAN
|
|
+#include <windows.h>
|
|
+#include <errno.h>
|
|
+
|
|
+static int
|
|
+gomp_gethostname (char *name, size_t len)
|
|
+{ /* gethostname substitute built on GetComputerName; writes at most LEN bytes to NAME. */
|
|
+ /* On Win9x GetComputerName fails if the input size is less
|
|
+ than MAX_COMPUTERNAME_LENGTH + 1. */
|
|
+ char buffer[MAX_COMPUTERNAME_LENGTH + 1];
|
|
+ DWORD size = sizeof (buffer);
|
|
+ int ret = 0; /* 0 on success; becomes -1 when the name has to be truncated. */
|
|
+
|
|
+ if (!GetComputerName (buffer, &size))
|
|
+ return -1; /* Lookup failed; NAME is left untouched. */
|
|
+
|
|
+ if ((size = strlen (buffer) + 1) > len) /* SIZE now counts the name plus its terminating NUL. */
|
|
+ {
|
|
+ errno = EINVAL;
|
|
+ /* Truncate as per POSIX spec. We do not NUL-terminate. */
|
|
+ size = len;
|
|
+ ret = -1;
|
|
+ }
|
|
+ memcpy (name, buffer, (size_t) size); /* Copies the NUL as well unless the name was truncated above. */
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+#undef gethostname
|
|
+#define gethostname gomp_gethostname
|
|
+#define HAVE_GETHOSTNAME 1
|
|
+
|
|
+#include "../../affinity-fmt.c"
|
|
--- libgomp/config/rtems/bar.c.jj 2018-04-25 09:40:31.902655576 +0200
|
|
+++ libgomp/config/rtems/bar.c 2019-05-07 18:46:36.460110789 +0200
|
|
@@ -72,184 +72,5 @@ do_wait (int *addr, int val)
|
|
futex_wait (addr, val);
|
|
}
|
|
|
|
-/* Everything below this point should be identical to the Linux
|
|
- implementation. */
|
|
-
|
|
-void
|
|
-gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
|
|
-{
|
|
- if (__builtin_expect (state & BAR_WAS_LAST, 0))
|
|
- {
|
|
- /* Next time we'll be awaiting TOTAL threads again. */
|
|
- bar->awaited = bar->total;
|
|
- __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
|
|
- MEMMODEL_RELEASE);
|
|
- futex_wake ((int *) &bar->generation, INT_MAX);
|
|
- }
|
|
- else
|
|
- {
|
|
- do
|
|
- do_wait ((int *) &bar->generation, state);
|
|
- while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) == state);
|
|
- }
|
|
-}
|
|
-
|
|
-void
|
|
-gomp_barrier_wait (gomp_barrier_t *bar)
|
|
-{
|
|
- gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
|
|
-}
|
|
-
|
|
-/* Like gomp_barrier_wait, except that if the encountering thread
|
|
- is not the last one to hit the barrier, it returns immediately.
|
|
- The intended usage is that a thread which intends to gomp_barrier_destroy
|
|
- this barrier calls gomp_barrier_wait, while all other threads
|
|
- call gomp_barrier_wait_last. When gomp_barrier_wait returns,
|
|
- the barrier can be safely destroyed. */
|
|
-
|
|
-void
|
|
-gomp_barrier_wait_last (gomp_barrier_t *bar)
|
|
-{
|
|
- gomp_barrier_state_t state = gomp_barrier_wait_start (bar);
|
|
- if (state & BAR_WAS_LAST)
|
|
- gomp_barrier_wait_end (bar, state);
|
|
-}
|
|
-
|
|
-void
|
|
-gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
|
|
-{
|
|
- futex_wake ((int *) &bar->generation, count == 0 ? INT_MAX : count);
|
|
-}
|
|
-
|
|
-void
|
|
-gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
|
|
-{
|
|
- unsigned int generation, gen;
|
|
-
|
|
- if (__builtin_expect (state & BAR_WAS_LAST, 0))
|
|
- {
|
|
- /* Next time we'll be awaiting TOTAL threads again. */
|
|
- struct gomp_thread *thr = gomp_thread ();
|
|
- struct gomp_team *team = thr->ts.team;
|
|
-
|
|
- bar->awaited = bar->total;
|
|
- team->work_share_cancelled = 0;
|
|
- if (__builtin_expect (team->task_count, 0))
|
|
- {
|
|
- gomp_barrier_handle_tasks (state);
|
|
- state &= ~BAR_WAS_LAST;
|
|
- }
|
|
- else
|
|
- {
|
|
- state &= ~BAR_CANCELLED;
|
|
- state += BAR_INCR - BAR_WAS_LAST;
|
|
- __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
|
|
- futex_wake ((int *) &bar->generation, INT_MAX);
|
|
- return;
|
|
- }
|
|
- }
|
|
-
|
|
- generation = state;
|
|
- state &= ~BAR_CANCELLED;
|
|
- do
|
|
- {
|
|
- do_wait ((int *) &bar->generation, generation);
|
|
- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
|
|
- if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
|
|
- {
|
|
- gomp_barrier_handle_tasks (state);
|
|
- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
|
|
- }
|
|
- generation |= gen & BAR_WAITING_FOR_TASK;
|
|
- }
|
|
- while (gen != state + BAR_INCR);
|
|
-}
|
|
-
|
|
-void
|
|
-gomp_team_barrier_wait (gomp_barrier_t *bar)
|
|
-{
|
|
- gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
|
|
-}
|
|
-
|
|
-void
|
|
-gomp_team_barrier_wait_final (gomp_barrier_t *bar)
|
|
-{
|
|
- gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
|
|
- if (__builtin_expect (state & BAR_WAS_LAST, 0))
|
|
- bar->awaited_final = bar->total;
|
|
- gomp_team_barrier_wait_end (bar, state);
|
|
-}
|
|
-
|
|
-bool
|
|
-gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
|
|
- gomp_barrier_state_t state)
|
|
-{
|
|
- unsigned int generation, gen;
|
|
-
|
|
- if (__builtin_expect (state & BAR_WAS_LAST, 0))
|
|
- {
|
|
- /* Next time we'll be awaiting TOTAL threads again. */
|
|
- /* BAR_CANCELLED should never be set in state here, because
|
|
- cancellation means that at least one of the threads has been
|
|
- cancelled, thus on a cancellable barrier we should never see
|
|
- all threads to arrive. */
|
|
- struct gomp_thread *thr = gomp_thread ();
|
|
- struct gomp_team *team = thr->ts.team;
|
|
-
|
|
- bar->awaited = bar->total;
|
|
- team->work_share_cancelled = 0;
|
|
- if (__builtin_expect (team->task_count, 0))
|
|
- {
|
|
- gomp_barrier_handle_tasks (state);
|
|
- state &= ~BAR_WAS_LAST;
|
|
- }
|
|
- else
|
|
- {
|
|
- state += BAR_INCR - BAR_WAS_LAST;
|
|
- __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
|
|
- futex_wake ((int *) &bar->generation, INT_MAX);
|
|
- return false;
|
|
- }
|
|
- }
|
|
-
|
|
- if (__builtin_expect (state & BAR_CANCELLED, 0))
|
|
- return true;
|
|
-
|
|
- generation = state;
|
|
- do
|
|
- {
|
|
- do_wait ((int *) &bar->generation, generation);
|
|
- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
|
|
- if (__builtin_expect (gen & BAR_CANCELLED, 0))
|
|
- return true;
|
|
- if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
|
|
- {
|
|
- gomp_barrier_handle_tasks (state);
|
|
- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
|
|
- }
|
|
- generation |= gen & BAR_WAITING_FOR_TASK;
|
|
- }
|
|
- while (gen != state + BAR_INCR);
|
|
-
|
|
- return false;
|
|
-}
|
|
-
|
|
-bool
|
|
-gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
|
|
-{
|
|
- return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
|
|
-}
|
|
-
|
|
-void
|
|
-gomp_team_barrier_cancel (struct gomp_team *team)
|
|
-{
|
|
- gomp_mutex_lock (&team->task_lock);
|
|
- if (team->barrier.generation & BAR_CANCELLED)
|
|
- {
|
|
- gomp_mutex_unlock (&team->task_lock);
|
|
- return;
|
|
- }
|
|
- team->barrier.generation |= BAR_CANCELLED;
|
|
- gomp_mutex_unlock (&team->task_lock);
|
|
- futex_wake ((int *) &team->barrier.generation, INT_MAX);
|
|
-}
|
|
+#define GOMP_WAIT_H 1
|
|
+#include "../linux/bar.c"
|
|
--- libgomp/config/rtems/affinity-fmt.c.jj 2019-05-07 18:46:36.459110805 +0200
|
|
+++ libgomp/config/rtems/affinity-fmt.c 2019-05-07 18:46:36.459110805 +0200
|
|
@@ -0,0 +1,49 @@
|
|
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
|
|
+
|
|
+ This file is part of the GNU Offloading and Multi Processing Library
|
|
+ (libgomp).
|
|
+
|
|
+ Libgomp is free software; you can redistribute it and/or modify it
|
|
+ under the terms of the GNU General Public License as published by
|
|
+ the Free Software Foundation; either version 3, or (at your option)
|
|
+ any later version.
|
|
+
|
|
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
+ more details.
|
|
+
|
|
+ Under Section 7 of GPL version 3, you are granted additional
|
|
+ permissions described in the GCC Runtime Library Exception, version
|
|
+ 3.1, as published by the Free Software Foundation.
|
|
+
|
|
+ You should have received a copy of the GNU General Public License and
|
|
+ a copy of the GCC Runtime Library Exception along with this program;
|
|
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+#include "libgomp.h"
|
|
+#include <string.h>
|
|
+#include <stdio.h>
|
|
+#include <stdlib.h>
|
|
+#ifdef HAVE_UNISTD_H
|
|
+#include <unistd.h>
|
|
+#endif
|
|
+#ifdef HAVE_INTTYPES_H
|
|
+# include <inttypes.h> /* For PRIx64. */
|
|
+#endif
|
|
+#ifdef HAVE_UNAME
|
|
+#include <sys/utsname.h>
|
|
+#endif
|
|
+
|
|
+/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for RTEMS,
|
|
+ but the extra information they give is of little value to the user.
|
|
+ Override the configure test results here. */
|
|
+#undef HAVE_GETPID
|
|
+#undef HAVE_GETHOSTNAME
|
|
+
|
|
+/* Avoid the complex fwrite() in favour of the simple write(). */
|
|
+#undef fwrite
|
|
+#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size))
|
|
+
|
|
+#include "../../affinity-fmt.c"
|
|
--- libgomp/config.h.in.jj 2018-04-25 09:40:31.870655561 +0200
|
|
+++ libgomp/config.h.in 2019-05-07 18:46:36.465110710 +0200
|
|
@@ -1,5 +1,8 @@
|
|
/* config.h.in. Generated from configure.ac by autoheader. */
|
|
|
|
+/* Define to 1 if you have the `aligned_alloc' function. */
|
|
+#undef HAVE_ALIGNED_ALLOC
|
|
+
|
|
/* Define to 1 if the target assembler supports .symver directive. */
|
|
#undef HAVE_AS_SYMVER_DIRECTIVE
|
|
|
|
@@ -33,9 +36,15 @@
|
|
/* Define to 1 if you have the `getgid' function. */
|
|
#undef HAVE_GETGID
|
|
|
|
+/* Define if gethostname is supported. */
|
|
+#undef HAVE_GETHOSTNAME
|
|
+
|
|
/* Define to 1 if you have the `getloadavg' function. */
|
|
#undef HAVE_GETLOADAVG
|
|
|
|
+/* Define if getpid is supported. */
|
|
+#undef HAVE_GETPID
|
|
+
|
|
/* Define to 1 if you have the `getuid' function. */
|
|
#undef HAVE_GETUID
|
|
|
|
@@ -45,9 +54,15 @@
|
|
/* Define to 1 if you have the `dl' library (-ldl). */
|
|
#undef HAVE_LIBDL
|
|
|
|
+/* Define to 1 if you have the `memalign' function. */
|
|
+#undef HAVE_MEMALIGN
|
|
+
|
|
/* Define to 1 if you have the <memory.h> header file. */
|
|
#undef HAVE_MEMORY_H
|
|
|
|
+/* Define to 1 if you have the `posix_memalign' function. */
|
|
+#undef HAVE_POSIX_MEMALIGN
|
|
+
|
|
/* Define if pthread_{,attr_}{g,s}etaffinity_np is supported. */
|
|
#undef HAVE_PTHREAD_AFFINITY_NP
|
|
|
|
@@ -103,9 +118,15 @@
|
|
/* Define to 1 if the target supports thread-local storage. */
|
|
#undef HAVE_TLS
|
|
|
|
+/* Define if uname is supported and struct utsname has nodename field. */
|
|
+#undef HAVE_UNAME
|
|
+
|
|
/* Define to 1 if you have the <unistd.h> header file. */
|
|
#undef HAVE_UNISTD_H
|
|
|
|
+/* Define to 1 if you have the `_aligned_malloc' function. */
|
|
+#undef HAVE__ALIGNED_MALLOC
|
|
+
|
|
/* Define to 1 if you have the `__secure_getenv' function. */
|
|
#undef HAVE___SECURE_GETENV
|
|
|
|
@@ -125,8 +146,8 @@
|
|
*/
|
|
#undef LT_OBJDIR
|
|
|
|
-/* Define to offload targets, separated by commas. */
|
|
-#undef OFFLOAD_TARGETS
|
|
+/* Define to offload plugins, separated by commas. */
|
|
+#undef OFFLOAD_PLUGINS
|
|
|
|
/* Name of package */
|
|
#undef PACKAGE
|
|
--- libgomp/teams.c.jj 2019-05-07 18:46:36.548109384 +0200
|
|
+++ libgomp/teams.c 2019-05-07 18:46:36.548109384 +0200
|
|
@@ -0,0 +1,74 @@
|
|
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
|
|
+ Contributed by Jakub Jelinek <jakub@redhat.com>.
|
|
+
|
|
+ This file is part of the GNU Offloading and Multi Processing Library
|
|
+ (libgomp).
|
|
+
|
|
+ Libgomp is free software; you can redistribute it and/or modify it
|
|
+ under the terms of the GNU General Public License as published by
|
|
+ the Free Software Foundation; either version 3, or (at your option)
|
|
+ any later version.
|
|
+
|
|
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
+ more details.
|
|
+
|
|
+ Under Section 7 of GPL version 3, you are granted additional
|
|
+ permissions described in the GCC Runtime Library Exception, version
|
|
+ 3.1, as published by the Free Software Foundation.
|
|
+
|
|
+ You should have received a copy of the GNU General Public License and
|
|
+ a copy of the GCC Runtime Library Exception along with this program;
|
|
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+/* This file handles the host TEAMS construct. */
|
|
+
|
|
+#include "libgomp.h"
|
|
+#include <limits.h>
|
|
+
|
|
+static unsigned gomp_num_teams = 1, gomp_team_num = 0;
|
|
+
|
|
+void
|
|
+GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams,
|
|
+ unsigned int thread_limit, unsigned int flags)
|
|
+{
|
|
+ (void) flags;
|
|
+ (void) num_teams;
|
|
+ unsigned old_thread_limit_var = 0;
|
|
+ if (thread_limit)
|
|
+ {
|
|
+ struct gomp_task_icv *icv = gomp_icv (true);
|
|
+ old_thread_limit_var = icv->thread_limit_var;
|
|
+ icv->thread_limit_var
|
|
+ = thread_limit > INT_MAX ? UINT_MAX : thread_limit;
|
|
+ }
|
|
+ if (num_teams == 0)
|
|
+ num_teams = 3;
|
|
+ gomp_num_teams = num_teams;
|
|
+ for (gomp_team_num = 0; gomp_team_num < num_teams; gomp_team_num++)
|
|
+ fn (data);
|
|
+ gomp_num_teams = 1;
|
|
+ gomp_team_num = 0;
|
|
+ if (thread_limit)
|
|
+ {
|
|
+ struct gomp_task_icv *icv = gomp_icv (true);
|
|
+ icv->thread_limit_var = old_thread_limit_var;
|
|
+ }
|
|
+}
|
|
+
|
|
+int
|
|
+omp_get_num_teams (void)
|
|
+{
|
|
+ return gomp_num_teams;
|
|
+}
|
|
+
|
|
+int
|
|
+omp_get_team_num (void)
|
|
+{
|
|
+ return gomp_team_num;
|
|
+}
|
|
+
|
|
+ialias (omp_get_num_teams)
|
|
+ialias (omp_get_team_num)
|
|
--- libgomp/libgomp.map.jj 2018-04-25 09:40:31.321655307 +0200
|
|
+++ libgomp/libgomp.map 2019-05-07 18:46:36.525109751 +0200
|
|
@@ -164,6 +164,22 @@ OMP_4.5 {
|
|
omp_target_disassociate_ptr;
|
|
} OMP_4.0;
|
|
|
|
+OMP_5.0 {
|
|
+ global:
|
|
+ omp_capture_affinity;
|
|
+ omp_capture_affinity_;
|
|
+ omp_display_affinity;
|
|
+ omp_display_affinity_;
|
|
+ omp_get_affinity_format;
|
|
+ omp_get_affinity_format_;
|
|
+ omp_set_affinity_format;
|
|
+ omp_set_affinity_format_;
|
|
+ omp_pause_resource;
|
|
+ omp_pause_resource_;
|
|
+ omp_pause_resource_all;
|
|
+ omp_pause_resource_all_;
|
|
+} OMP_4.5;
|
|
+
|
|
GOMP_1.0 {
|
|
global:
|
|
GOMP_atomic_end;
|
|
@@ -298,6 +314,34 @@ GOMP_4.5 {
|
|
GOMP_parallel_loop_nonmonotonic_guided;
|
|
} GOMP_4.0.1;
|
|
|
|
+GOMP_5.0 {
|
|
+ global:
|
|
+ GOMP_loop_doacross_start;
|
|
+ GOMP_loop_maybe_nonmonotonic_runtime_next;
|
|
+ GOMP_loop_maybe_nonmonotonic_runtime_start;
|
|
+ GOMP_loop_nonmonotonic_runtime_next;
|
|
+ GOMP_loop_nonmonotonic_runtime_start;
|
|
+ GOMP_loop_ordered_start;
|
|
+ GOMP_loop_start;
|
|
+ GOMP_loop_ull_doacross_start;
|
|
+ GOMP_loop_ull_maybe_nonmonotonic_runtime_next;
|
|
+ GOMP_loop_ull_maybe_nonmonotonic_runtime_start;
|
|
+ GOMP_loop_ull_nonmonotonic_runtime_next;
|
|
+ GOMP_loop_ull_nonmonotonic_runtime_start;
|
|
+ GOMP_loop_ull_ordered_start;
|
|
+ GOMP_loop_ull_start;
|
|
+ GOMP_parallel_loop_maybe_nonmonotonic_runtime;
|
|
+ GOMP_parallel_loop_nonmonotonic_runtime;
|
|
+ GOMP_parallel_reductions;
|
|
+ GOMP_sections2_start;
|
|
+ GOMP_taskgroup_reduction_register;
|
|
+ GOMP_taskgroup_reduction_unregister;
|
|
+ GOMP_task_reduction_remap;
|
|
+ GOMP_taskwait_depend;
|
|
+ GOMP_teams_reg;
|
|
+ GOMP_workshare_task_reduction_unregister;
|
|
+} GOMP_4.5;
|
|
+
|
|
OACC_2.0 {
|
|
global:
|
|
acc_get_num_devices;
|
|
@@ -386,6 +430,52 @@ OACC_2.0.1 {
|
|
acc_pcreate;
|
|
} OACC_2.0;
|
|
|
|
+OACC_2.5 {
|
|
+ global:
|
|
+ acc_copyin_async;
|
|
+ acc_copyin_async_32_h_;
|
|
+ acc_copyin_async_64_h_;
|
|
+ acc_copyin_async_array_h_;
|
|
+ acc_copyout_async;
|
|
+ acc_copyout_async_32_h_;
|
|
+ acc_copyout_async_64_h_;
|
|
+ acc_copyout_async_array_h_;
|
|
+ acc_copyout_finalize;
|
|
+ acc_copyout_finalize_32_h_;
|
|
+ acc_copyout_finalize_64_h_;
|
|
+ acc_copyout_finalize_array_h_;
|
|
+ acc_copyout_finalize_async;
|
|
+ acc_copyout_finalize_async_32_h_;
|
|
+ acc_copyout_finalize_async_64_h_;
|
|
+ acc_copyout_finalize_async_array_h_;
|
|
+ acc_create_async;
|
|
+ acc_create_async_32_h_;
|
|
+ acc_create_async_64_h_;
|
|
+ acc_create_async_array_h_;
|
|
+ acc_delete_async;
|
|
+ acc_delete_async_32_h_;
|
|
+ acc_delete_async_64_h_;
|
|
+ acc_delete_async_array_h_;
|
|
+ acc_delete_finalize;
|
|
+ acc_delete_finalize_32_h_;
|
|
+ acc_delete_finalize_64_h_;
|
|
+ acc_delete_finalize_array_h_;
|
|
+ acc_delete_finalize_async;
|
|
+ acc_delete_finalize_async_32_h_;
|
|
+ acc_delete_finalize_async_64_h_;
|
|
+ acc_delete_finalize_async_array_h_;
|
|
+ acc_memcpy_from_device_async;
|
|
+ acc_memcpy_to_device_async;
|
|
+ acc_update_device_async;
|
|
+ acc_update_device_async_32_h_;
|
|
+ acc_update_device_async_64_h_;
|
|
+ acc_update_device_async_array_h_;
|
|
+ acc_update_self_async;
|
|
+ acc_update_self_async_32_h_;
|
|
+ acc_update_self_async_64_h_;
|
|
+ acc_update_self_async_array_h_;
|
|
+} OACC_2.0.1;
|
|
+
|
|
GOACC_2.0 {
|
|
global:
|
|
GOACC_data_end;
|
|
@@ -420,3 +510,8 @@ GOMP_PLUGIN_1.1 {
|
|
global:
|
|
GOMP_PLUGIN_target_task_completion;
|
|
} GOMP_PLUGIN_1.0;
|
|
+
|
|
+GOMP_PLUGIN_1.2 {
|
|
+ global:
|
|
+ GOMP_PLUGIN_acc_default_dim;
|
|
+} GOMP_PLUGIN_1.1;
|
|
--- libgomp/oacc-async.c.jj 2018-04-25 09:40:31.925655587 +0200
|
|
+++ libgomp/oacc-async.c 2019-05-07 18:46:36.528109704 +0200
|
|
@@ -34,7 +34,7 @@
|
|
int
|
|
acc_async_test (int async)
|
|
{
|
|
- if (async < acc_async_sync)
|
|
+ if (!async_valid_p (async))
|
|
gomp_fatal ("invalid async argument: %d", async);
|
|
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
@@ -59,7 +59,7 @@ acc_async_test_all (void)
|
|
void
|
|
acc_wait (int async)
|
|
{
|
|
- if (async < acc_async_sync)
|
|
+ if (!async_valid_p (async))
|
|
gomp_fatal ("invalid async argument: %d", async);
|
|
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
@@ -117,7 +117,7 @@ acc_async_wait_all (void)
|
|
void
|
|
acc_wait_all_async (int async)
|
|
{
|
|
- if (async < acc_async_sync)
|
|
+ if (!async_valid_p (async))
|
|
gomp_fatal ("invalid async argument: %d", async);
|
|
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
--- libgomp/loop_ull.c.jj 2018-04-25 09:40:31.912655580 +0200
|
|
+++ libgomp/loop_ull.c 2019-05-07 18:46:36.527109719 +0200
|
|
@@ -27,8 +27,12 @@
|
|
|
|
#include <limits.h>
|
|
#include <stdlib.h>
|
|
+#include <string.h>
|
|
#include "libgomp.h"
|
|
|
|
+ialias (GOMP_loop_ull_runtime_next)
|
|
+ialias_redirect (GOMP_taskgroup_reduction_register)
|
|
+
|
|
typedef unsigned long long gomp_ull;
|
|
|
|
/* Initialize the given work share construct from the given arguments. */
|
|
@@ -104,7 +108,7 @@ gomp_loop_ull_static_start (bool up, gom
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
|
|
thr->ts.static_trip = 0;
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
|
|
GFS_STATIC, chunk_size);
|
|
@@ -122,7 +126,7 @@ gomp_loop_ull_dynamic_start (bool up, go
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
|
|
GFS_DYNAMIC, chunk_size);
|
|
@@ -148,7 +152,7 @@ gomp_loop_ull_guided_start (bool up, gom
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
|
|
GFS_GUIDED, chunk_size);
|
|
@@ -171,7 +175,7 @@ GOMP_loop_ull_runtime_start (bool up, go
|
|
gomp_ull incr, gomp_ull *istart, gomp_ull *iend)
|
|
{
|
|
struct gomp_task_icv *icv = gomp_icv (false);
|
|
- switch (icv->run_sched_var)
|
|
+ switch (icv->run_sched_var & ~GFS_MONOTONIC)
|
|
{
|
|
case GFS_STATIC:
|
|
return gomp_loop_ull_static_start (up, start, end, incr,
|
|
@@ -195,6 +199,99 @@ GOMP_loop_ull_runtime_start (bool up, go
|
|
}
|
|
}
|
|
|
|
+static long
|
|
+gomp_adjust_sched (long sched, gomp_ull *chunk_size)
|
|
+{
|
|
+ sched &= ~GFS_MONOTONIC;
|
|
+ switch (sched)
|
|
+ {
|
|
+ case GFS_STATIC:
|
|
+ case GFS_DYNAMIC:
|
|
+ case GFS_GUIDED:
|
|
+ return sched;
|
|
+ /* GFS_RUNTIME is used for runtime schedule without monotonic
|
|
+ or nonmonotonic modifiers on the clause.
|
|
+ GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic
|
|
+ modifier. */
|
|
+ case GFS_RUNTIME:
|
|
+ /* GFS_AUTO is used for runtime schedule with nonmonotonic
|
|
+ modifier. */
|
|
+ case GFS_AUTO:
|
|
+ {
|
|
+ struct gomp_task_icv *icv = gomp_icv (false);
|
|
+ sched = icv->run_sched_var & ~GFS_MONOTONIC;
|
|
+ switch (sched)
|
|
+ {
|
|
+ case GFS_STATIC:
|
|
+ case GFS_DYNAMIC:
|
|
+ case GFS_GUIDED:
|
|
+ *chunk_size = icv->run_sched_chunk_size;
|
|
+ break;
|
|
+ case GFS_AUTO:
|
|
+ sched = GFS_STATIC;
|
|
+ *chunk_size = 0;
|
|
+ break;
|
|
+ default:
|
|
+ abort ();
|
|
+ }
|
|
+ return sched;
|
|
+ }
|
|
+ default:
|
|
+ abort ();
|
|
+ }
|
|
+}
|
|
+
|
|
+bool
|
|
+GOMP_loop_ull_start (bool up, gomp_ull start, gomp_ull end,
|
|
+ gomp_ull incr, long sched, gomp_ull chunk_size,
|
|
+ gomp_ull *istart, gomp_ull *iend,
|
|
+ uintptr_t *reductions, void **mem)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+
|
|
+ thr->ts.static_trip = 0;
|
|
+ if (reductions)
|
|
+ gomp_workshare_taskgroup_start ();
|
|
+ if (gomp_work_share_start (0))
|
|
+ {
|
|
+ sched = gomp_adjust_sched (sched, &chunk_size);
|
|
+ gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
|
|
+ sched, chunk_size);
|
|
+ if (reductions)
|
|
+ {
|
|
+ GOMP_taskgroup_reduction_register (reductions);
|
|
+ thr->task->taskgroup->workshare = true;
|
|
+ thr->ts.work_share->task_reductions = reductions;
|
|
+ }
|
|
+ if (mem)
|
|
+ {
|
|
+ uintptr_t size = (uintptr_t) *mem;
|
|
+ if (size > (sizeof (struct gomp_work_share)
|
|
+ - offsetof (struct gomp_work_share,
|
|
+ inline_ordered_team_ids)))
|
|
+ thr->ts.work_share->ordered_team_ids
|
|
+ = gomp_malloc_cleared (size);
|
|
+ else
|
|
+ memset (thr->ts.work_share->ordered_team_ids, '\0', size);
|
|
+ *mem = (void *) thr->ts.work_share->ordered_team_ids;
|
|
+ }
|
|
+ gomp_work_share_init_done ();
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (reductions)
|
|
+ {
|
|
+ uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
|
|
+ gomp_workshare_task_reduction_register (reductions,
|
|
+ first_reductions);
|
|
+ }
|
|
+ if (mem)
|
|
+ *mem = (void *) thr->ts.work_share->ordered_team_ids;
|
|
+ }
|
|
+
|
|
+ return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend);
|
|
+}
|
|
+
|
|
/* The *_ordered_*_start routines are similar. The only difference is that
|
|
this work-share construct is initialized to expect an ORDERED section. */
|
|
|
|
@@ -206,7 +303,7 @@ gomp_loop_ull_ordered_static_start (bool
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
|
|
thr->ts.static_trip = 0;
|
|
- if (gomp_work_share_start (true))
|
|
+ if (gomp_work_share_start (1))
|
|
{
|
|
gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
|
|
GFS_STATIC, chunk_size);
|
|
@@ -225,7 +322,7 @@ gomp_loop_ull_ordered_dynamic_start (boo
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (true))
|
|
+ if (gomp_work_share_start (1))
|
|
{
|
|
gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
|
|
GFS_DYNAMIC, chunk_size);
|
|
@@ -251,7 +348,7 @@ gomp_loop_ull_ordered_guided_start (bool
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (true))
|
|
+ if (gomp_work_share_start (1))
|
|
{
|
|
gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
|
|
GFS_GUIDED, chunk_size);
|
|
@@ -275,7 +372,7 @@ GOMP_loop_ull_ordered_runtime_start (boo
|
|
gomp_ull *iend)
|
|
{
|
|
struct gomp_task_icv *icv = gomp_icv (false);
|
|
- switch (icv->run_sched_var)
|
|
+ switch (icv->run_sched_var & ~GFS_MONOTONIC)
|
|
{
|
|
case GFS_STATIC:
|
|
return gomp_loop_ull_ordered_static_start (up, start, end, incr,
|
|
@@ -299,6 +396,82 @@ GOMP_loop_ull_ordered_runtime_start (boo
|
|
}
|
|
}
|
|
|
|
+bool
|
|
+GOMP_loop_ull_ordered_start (bool up, gomp_ull start, gomp_ull end,
|
|
+ gomp_ull incr, long sched, gomp_ull chunk_size,
|
|
+ gomp_ull *istart, gomp_ull *iend,
|
|
+ uintptr_t *reductions, void **mem)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ size_t ordered = 1;
|
|
+ bool ret;
|
|
+
|
|
+ thr->ts.static_trip = 0;
|
|
+ if (reductions)
|
|
+ gomp_workshare_taskgroup_start ();
|
|
+ if (mem)
|
|
+ ordered += (uintptr_t) *mem;
|
|
+ if (gomp_work_share_start (ordered))
|
|
+ {
|
|
+ sched = gomp_adjust_sched (sched, &chunk_size);
|
|
+ gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
|
|
+ sched, chunk_size);
|
|
+ if (reductions)
|
|
+ {
|
|
+ GOMP_taskgroup_reduction_register (reductions);
|
|
+ thr->task->taskgroup->workshare = true;
|
|
+ thr->ts.work_share->task_reductions = reductions;
|
|
+ }
|
|
+ if (sched == GFS_STATIC)
|
|
+ gomp_ordered_static_init ();
|
|
+ else
|
|
+ gomp_mutex_lock (&thr->ts.work_share->lock);
|
|
+ gomp_work_share_init_done ();
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (reductions)
|
|
+ {
|
|
+ uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
|
|
+ gomp_workshare_task_reduction_register (reductions,
|
|
+ first_reductions);
|
|
+ }
|
|
+ sched = thr->ts.work_share->sched;
|
|
+ if (sched != GFS_STATIC)
|
|
+ gomp_mutex_lock (&thr->ts.work_share->lock);
|
|
+ }
|
|
+
|
|
+ if (mem)
|
|
+ {
|
|
+ uintptr_t p
|
|
+ = (uintptr_t) (thr->ts.work_share->ordered_team_ids
|
|
+ + (thr->ts.team ? thr->ts.team->nthreads : 1));
|
|
+ p += __alignof__ (long long) - 1;
|
|
+ p &= ~(__alignof__ (long long) - 1);
|
|
+ *mem = (void *) p;
|
|
+ }
|
|
+
|
|
+ switch (sched)
|
|
+ {
|
|
+ case GFS_STATIC:
|
|
+ case GFS_AUTO:
|
|
+ return !gomp_iter_ull_static_next (istart, iend);
|
|
+ case GFS_DYNAMIC:
|
|
+ ret = gomp_iter_ull_dynamic_next_locked (istart, iend);
|
|
+ break;
|
|
+ case GFS_GUIDED:
|
|
+ ret = gomp_iter_ull_guided_next_locked (istart, iend);
|
|
+ break;
|
|
+ default:
|
|
+ abort ();
|
|
+ }
|
|
+
|
|
+ if (ret)
|
|
+ gomp_ordered_first ();
|
|
+ gomp_mutex_unlock (&thr->ts.work_share->lock);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
/* The *_doacross_*_start routines are similar. The only difference is that
|
|
this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
|
|
section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1
|
|
@@ -313,11 +486,11 @@ gomp_loop_ull_doacross_static_start (uns
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
|
|
thr->ts.static_trip = 0;
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
|
|
GFS_STATIC, chunk_size);
|
|
- gomp_doacross_ull_init (ncounts, counts, chunk_size);
|
|
+ gomp_doacross_ull_init (ncounts, counts, chunk_size, 0);
|
|
gomp_work_share_init_done ();
|
|
}
|
|
|
|
@@ -332,11 +505,11 @@ gomp_loop_ull_doacross_dynamic_start (un
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
|
|
GFS_DYNAMIC, chunk_size);
|
|
- gomp_doacross_ull_init (ncounts, counts, chunk_size);
|
|
+ gomp_doacross_ull_init (ncounts, counts, chunk_size, 0);
|
|
gomp_work_share_init_done ();
|
|
}
|
|
|
|
@@ -359,11 +532,11 @@ gomp_loop_ull_doacross_guided_start (uns
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
bool ret;
|
|
|
|
- if (gomp_work_share_start (false))
|
|
+ if (gomp_work_share_start (0))
|
|
{
|
|
gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
|
|
GFS_GUIDED, chunk_size);
|
|
- gomp_doacross_ull_init (ncounts, counts, chunk_size);
|
|
+ gomp_doacross_ull_init (ncounts, counts, chunk_size, 0);
|
|
gomp_work_share_init_done ();
|
|
}
|
|
|
|
@@ -383,7 +556,7 @@ GOMP_loop_ull_doacross_runtime_start (un
|
|
gomp_ull *istart, gomp_ull *iend)
|
|
{
|
|
struct gomp_task_icv *icv = gomp_icv (false);
|
|
- switch (icv->run_sched_var)
|
|
+ switch (icv->run_sched_var & ~GFS_MONOTONIC)
|
|
{
|
|
case GFS_STATIC:
|
|
return gomp_loop_ull_doacross_static_start (ncounts, counts,
|
|
@@ -407,6 +580,51 @@ GOMP_loop_ull_doacross_runtime_start (un
|
|
}
|
|
}
|
|
|
|
+bool
|
|
+GOMP_loop_ull_doacross_start (unsigned ncounts, gomp_ull *counts,
|
|
+ long sched, gomp_ull chunk_size,
|
|
+ gomp_ull *istart, gomp_ull *iend,
|
|
+ uintptr_t *reductions, void **mem)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+
|
|
+ thr->ts.static_trip = 0;
|
|
+ if (reductions)
|
|
+ gomp_workshare_taskgroup_start ();
|
|
+ if (gomp_work_share_start (0))
|
|
+ {
|
|
+ size_t extra = 0;
|
|
+ if (mem)
|
|
+ extra = (uintptr_t) *mem;
|
|
+ sched = gomp_adjust_sched (sched, &chunk_size);
|
|
+ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
|
|
+ sched, chunk_size);
|
|
+ gomp_doacross_ull_init (ncounts, counts, chunk_size, extra);
|
|
+ if (reductions)
|
|
+ {
|
|
+ GOMP_taskgroup_reduction_register (reductions);
|
|
+ thr->task->taskgroup->workshare = true;
|
|
+ thr->ts.work_share->task_reductions = reductions;
|
|
+ }
|
|
+ gomp_work_share_init_done ();
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ if (reductions)
|
|
+ {
|
|
+ uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
|
|
+ gomp_workshare_task_reduction_register (reductions,
|
|
+ first_reductions);
|
|
+ }
|
|
+ sched = thr->ts.work_share->sched;
|
|
+ }
|
|
+
|
|
+ if (mem)
|
|
+ *mem = thr->ts.work_share->doacross->extra;
|
|
+
|
|
+ return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend);
|
|
+}
|
|
+
|
|
/* The *_next routines are called when the thread completes processing of
|
|
the iteration block currently assigned to it. If the work-share
|
|
construct is bound directly to a parallel construct, then the iteration
|
|
@@ -570,6 +788,10 @@ extern __typeof(gomp_loop_ull_dynamic_st
|
|
__attribute__((alias ("gomp_loop_ull_dynamic_start")));
|
|
extern __typeof(gomp_loop_ull_guided_start) GOMP_loop_ull_nonmonotonic_guided_start
|
|
__attribute__((alias ("gomp_loop_ull_guided_start")));
|
|
+extern __typeof(GOMP_loop_ull_runtime_start) GOMP_loop_ull_nonmonotonic_runtime_start
|
|
+ __attribute__((alias ("GOMP_loop_ull_runtime_start")));
|
|
+extern __typeof(GOMP_loop_ull_runtime_start) GOMP_loop_ull_maybe_nonmonotonic_runtime_start
|
|
+ __attribute__((alias ("GOMP_loop_ull_runtime_start")));
|
|
|
|
extern __typeof(gomp_loop_ull_ordered_static_start) GOMP_loop_ull_ordered_static_start
|
|
__attribute__((alias ("gomp_loop_ull_ordered_static_start")));
|
|
@@ -595,6 +817,10 @@ extern __typeof(gomp_loop_ull_dynamic_ne
|
|
__attribute__((alias ("gomp_loop_ull_dynamic_next")));
|
|
extern __typeof(gomp_loop_ull_guided_next) GOMP_loop_ull_nonmonotonic_guided_next
|
|
__attribute__((alias ("gomp_loop_ull_guided_next")));
|
|
+extern __typeof(GOMP_loop_ull_runtime_next) GOMP_loop_ull_nonmonotonic_runtime_next
|
|
+ __attribute__((alias ("GOMP_loop_ull_runtime_next")));
|
|
+extern __typeof(GOMP_loop_ull_runtime_next) GOMP_loop_ull_maybe_nonmonotonic_runtime_next
|
|
+ __attribute__((alias ("GOMP_loop_ull_runtime_next")));
|
|
|
|
extern __typeof(gomp_loop_ull_ordered_static_next) GOMP_loop_ull_ordered_static_next
|
|
__attribute__((alias ("gomp_loop_ull_ordered_static_next")));
|
|
@@ -650,6 +876,23 @@ GOMP_loop_ull_nonmonotonic_guided_start
|
|
}
|
|
|
|
bool
|
|
+GOMP_loop_ull_nonmonotonic_runtime_start (bool up, gomp_ull start,
|
|
+ gomp_ull end, gomp_ull incr,
|
|
+ gomp_ull *istart, gomp_ull *iend)
|
|
+{
|
|
+ return GOMP_loop_ull_runtime_start (up, start, end, incr, istart, iend);
|
|
+}
|
|
+
|
|
+bool
|
|
+GOMP_loop_ull_maybe_nonmonotonic_runtime_start (bool up, gomp_ull start,
|
|
+ gomp_ull end, gomp_ull incr,
|
|
+ gomp_ull *istart,
|
|
+ gomp_ull *iend)
|
|
+{
|
|
+ return GOMP_loop_ull_runtime_start (up, start, end, incr, istart, iend);
|
|
+}
|
|
+
|
|
+bool
|
|
GOMP_loop_ull_ordered_static_start (bool up, gomp_ull start, gomp_ull end,
|
|
gomp_ull incr, gomp_ull chunk_size,
|
|
gomp_ull *istart, gomp_ull *iend)
|
|
@@ -734,6 +977,19 @@ GOMP_loop_ull_nonmonotonic_guided_next (
|
|
}
|
|
|
|
bool
|
|
+GOMP_loop_ull_nonmonotonic_runtime_next (gomp_ull *istart, gomp_ull *iend)
|
|
+{
|
|
+ return GOMP_loop_ull_runtime_next (istart, iend);
|
|
+}
|
|
+
|
|
+bool
|
|
+GOMP_loop_ull_maybe_nonmonotonic_runtime_next (gomp_ull *istart,
|
|
+ gomp_ull *iend)
|
|
+{
|
|
+ return GOMP_loop_ull_runtime_next (istart, iend);
|
|
+}
|
|
+
|
|
+bool
|
|
GOMP_loop_ull_ordered_static_next (gomp_ull *istart, gomp_ull *iend)
|
|
{
|
|
return gomp_loop_ull_ordered_static_next (istart, iend);
|
|
--- libgomp/oacc-int.h.jj 2018-04-25 09:40:31.320655306 +0200
|
|
+++ libgomp/oacc-int.h 2019-05-07 18:46:36.529109688 +0200
|
|
@@ -99,6 +99,28 @@ void goacc_restore_bind (void);
|
|
void goacc_lazy_initialize (void);
|
|
void goacc_host_init (void);
|
|
|
|
+static inline bool
|
|
+async_valid_stream_id_p (int async)
|
|
+{
|
|
+ return async >= 0;
|
|
+}
|
|
+
|
|
+static inline bool
|
|
+async_valid_p (int async)
|
|
+{
|
|
+ return (async == acc_async_noval || async == acc_async_sync
|
|
+ || async_valid_stream_id_p (async));
|
|
+}
|
|
+
|
|
+static inline bool
|
|
+async_synchronous_p (int async)
|
|
+{
|
|
+ if (!async_valid_p (async))
|
|
+ return true;
|
|
+
|
|
+ return async == acc_async_sync;
|
|
+}
|
|
+
|
|
#ifdef HAVE_ATTRIBUTE_VISIBILITY
|
|
# pragma GCC visibility pop
|
|
#endif
|
|
--- libgomp/testsuite/Makefile.in.jj 2018-04-25 09:40:31.452655368 +0200
|
|
+++ libgomp/testsuite/Makefile.in 2019-05-07 18:51:35.754330084 +0200
|
|
@@ -223,6 +223,7 @@ mkdir_p = @mkdir_p@
|
|
multi_basedir = @multi_basedir@
|
|
offload_additional_lib_paths = @offload_additional_lib_paths@
|
|
offload_additional_options = @offload_additional_options@
|
|
+offload_plugins = @offload_plugins@
|
|
offload_targets = @offload_targets@
|
|
oldincludedir = @oldincludedir@
|
|
pdfdir = @pdfdir@
|
|
--- libgomp/task.c.jj 2018-04-25 09:40:31.925655587 +0200
|
|
+++ libgomp/task.c 2019-05-07 18:46:36.547109400 +0200
|
|
@@ -166,21 +166,72 @@ gomp_task_handle_depend (struct gomp_tas
|
|
void **depend)
|
|
{
|
|
size_t ndepend = (uintptr_t) depend[0];
|
|
- size_t nout = (uintptr_t) depend[1];
|
|
size_t i;
|
|
hash_entry_type ent;
|
|
|
|
+ if (ndepend)
|
|
+ {
|
|
+ /* depend[0] is total # */
|
|
+ size_t nout = (uintptr_t) depend[1]; /* # of out: and inout: */
|
|
+ /* ndepend - nout is # of in: */
|
|
+ for (i = 0; i < ndepend; i++)
|
|
+ {
|
|
+ task->depend[i].addr = depend[2 + i];
|
|
+ task->depend[i].is_in = i >= nout;
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ ndepend = (uintptr_t) depend[1]; /* total # */
|
|
+ size_t nout = (uintptr_t) depend[2]; /* # of out: and inout: */
|
|
+ size_t nmutexinoutset = (uintptr_t) depend[3]; /* # of mutexinoutset: */
|
|
+ /* For now we treat mutexinoutset like out, which is compliant, but
|
|
+ inefficient. */
|
|
+ size_t nin = (uintptr_t) depend[4]; /* # of in: */
|
|
+ /* ndepend - nout - nmutexinoutset - nin is # of depobjs */
|
|
+ size_t normal = nout + nmutexinoutset + nin;
|
|
+ size_t n = 0;
|
|
+ for (i = normal; i < ndepend; i++)
|
|
+ {
|
|
+ void **d = (void **) (uintptr_t) depend[5 + i];
|
|
+ switch ((uintptr_t) d[1])
|
|
+ {
|
|
+ case GOMP_DEPEND_OUT:
|
|
+ case GOMP_DEPEND_INOUT:
|
|
+ case GOMP_DEPEND_MUTEXINOUTSET:
|
|
+ break;
|
|
+ case GOMP_DEPEND_IN:
|
|
+ continue;
|
|
+ default:
|
|
+ gomp_fatal ("unknown omp_depend_t dependence type %d",
|
|
+ (int) (uintptr_t) d[1]);
|
|
+ }
|
|
+ task->depend[n].addr = d[0];
|
|
+ task->depend[n++].is_in = 0;
|
|
+ }
|
|
+ for (i = 0; i < normal; i++)
|
|
+ {
|
|
+ task->depend[n].addr = depend[5 + i];
|
|
+ task->depend[n++].is_in = i >= nout + nmutexinoutset;
|
|
+ }
|
|
+ for (i = normal; i < ndepend; i++)
|
|
+ {
|
|
+ void **d = (void **) (uintptr_t) depend[5 + i];
|
|
+ if ((uintptr_t) d[1] != GOMP_DEPEND_IN)
|
|
+ continue;
|
|
+ task->depend[n].addr = d[0];
|
|
+ task->depend[n++].is_in = 1;
|
|
+ }
|
|
+ }
|
|
task->depend_count = ndepend;
|
|
task->num_dependees = 0;
|
|
if (parent->depend_hash == NULL)
|
|
parent->depend_hash = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
|
|
for (i = 0; i < ndepend; i++)
|
|
{
|
|
- task->depend[i].addr = depend[2 + i];
|
|
task->depend[i].next = NULL;
|
|
task->depend[i].prev = NULL;
|
|
task->depend[i].task = task;
|
|
- task->depend[i].is_in = i >= nout;
|
|
task->depend[i].redundant = false;
|
|
task->depend[i].redundant_out = false;
|
|
|
|
@@ -205,7 +256,7 @@ gomp_task_handle_depend (struct gomp_tas
|
|
last = ent;
|
|
|
|
/* depend(in:...) doesn't depend on earlier depend(in:...). */
|
|
- if (i >= nout && ent->is_in)
|
|
+ if (task->depend[i].is_in && ent->is_in)
|
|
continue;
|
|
|
|
if (!ent->is_in)
|
|
@@ -280,9 +331,18 @@ gomp_task_handle_depend (struct gomp_tas
|
|
then the task may be executed by any member of the team.
|
|
|
|
DEPEND is an array containing:
|
|
+ if depend[0] is non-zero, then:
|
|
depend[0]: number of depend elements.
|
|
- depend[1]: number of depend elements of type "out".
|
|
- depend[2..N+1]: address of [1..N]th depend element. */
|
|
+ depend[1]: number of depend elements of type "out/inout".
|
|
+ depend[2..N+1]: address of [1..N]th depend element.
|
|
+ otherwise, when depend[0] is zero, then:
|
|
+ depend[1]: number of depend elements.
|
|
+ depend[2]: number of depend elements of type "out/inout".
|
|
+ depend[3]: number of depend elements of type "mutexinoutset".
|
|
+ depend[4]: number of depend elements of type "in".
|
|
+ depend[5..4+depend[2]+depend[3]+depend[4]]: address of depend elements
|
|
+ depend[5+depend[2]+depend[3]+depend[4]..4+depend[1]]: address of
|
|
+ omp_depend_t objects. */
|
|
|
|
void
|
|
GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
|
|
@@ -303,10 +363,20 @@ GOMP_task (void (*fn) (void *), void *da
|
|
#endif
|
|
|
|
/* If parallel or taskgroup has been cancelled, don't start new tasks. */
|
|
- if (team
|
|
- && (gomp_team_barrier_cancelled (&team->barrier)
|
|
- || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
|
|
- return;
|
|
+ if (__builtin_expect (gomp_cancel_var, 0) && team)
|
|
+ {
|
|
+ if (gomp_team_barrier_cancelled (&team->barrier))
|
|
+ return;
|
|
+ if (thr->task->taskgroup)
|
|
+ {
|
|
+ if (thr->task->taskgroup->cancelled)
|
|
+ return;
|
|
+ if (thr->task->taskgroup->workshare
|
|
+ && thr->task->taskgroup->prev
|
|
+ && thr->task->taskgroup->prev->cancelled)
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
|
|
if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0)
|
|
priority = 0;
|
|
@@ -377,7 +447,7 @@ GOMP_task (void (*fn) (void *), void *da
|
|
size_t depend_size = 0;
|
|
|
|
if (flags & GOMP_TASK_FLAG_DEPEND)
|
|
- depend_size = ((uintptr_t) depend[0]
|
|
+ depend_size = ((uintptr_t) (depend[0] ? depend[0] : depend[1])
|
|
* sizeof (struct gomp_task_depend_entry));
|
|
task = gomp_malloc (sizeof (*task) + depend_size
|
|
+ arg_size + arg_align - 1);
|
|
@@ -404,14 +474,26 @@ GOMP_task (void (*fn) (void *), void *da
|
|
gomp_mutex_lock (&team->task_lock);
|
|
/* If parallel or taskgroup has been cancelled, don't start new
|
|
tasks. */
|
|
- if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
|
|
- || (taskgroup && taskgroup->cancelled))
|
|
- && !task->copy_ctors_done, 0))
|
|
+ if (__builtin_expect (gomp_cancel_var, 0)
|
|
+ && !task->copy_ctors_done)
|
|
{
|
|
- gomp_mutex_unlock (&team->task_lock);
|
|
- gomp_finish_task (task);
|
|
- free (task);
|
|
- return;
|
|
+ if (gomp_team_barrier_cancelled (&team->barrier))
|
|
+ {
|
|
+ do_cancel:
|
|
+ gomp_mutex_unlock (&team->task_lock);
|
|
+ gomp_finish_task (task);
|
|
+ free (task);
|
|
+ return;
|
|
+ }
|
|
+ if (taskgroup)
|
|
+ {
|
|
+ if (taskgroup->cancelled)
|
|
+ goto do_cancel;
|
|
+ if (taskgroup->workshare
|
|
+ && taskgroup->prev
|
|
+ && taskgroup->prev->cancelled)
|
|
+ goto do_cancel;
|
|
+ }
|
|
}
|
|
if (taskgroup)
|
|
taskgroup->num_children++;
|
|
@@ -463,6 +545,7 @@ GOMP_task (void (*fn) (void *), void *da
|
|
|
|
ialias (GOMP_taskgroup_start)
|
|
ialias (GOMP_taskgroup_end)
|
|
+ialias (GOMP_taskgroup_reduction_register)
|
|
|
|
#define TYPE long
|
|
#define UTYPE unsigned long
|
|
@@ -601,10 +684,20 @@ gomp_create_target_task (struct gomp_dev
|
|
struct gomp_team *team = thr->ts.team;
|
|
|
|
/* If parallel or taskgroup has been cancelled, don't start new tasks. */
|
|
- if (team
|
|
- && (gomp_team_barrier_cancelled (&team->barrier)
|
|
- || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
|
|
- return true;
|
|
+ if (__builtin_expect (gomp_cancel_var, 0) && team)
|
|
+ {
|
|
+ if (gomp_team_barrier_cancelled (&team->barrier))
|
|
+ return true;
|
|
+ if (thr->task->taskgroup)
|
|
+ {
|
|
+ if (thr->task->taskgroup->cancelled)
|
|
+ return true;
|
|
+ if (thr->task->taskgroup->workshare
|
|
+ && thr->task->taskgroup->prev
|
|
+ && thr->task->taskgroup->prev->cancelled)
|
|
+ return true;
|
|
+ }
|
|
+ }
|
|
|
|
struct gomp_target_task *ttask;
|
|
struct gomp_task *task;
|
|
@@ -617,7 +710,7 @@ gomp_create_target_task (struct gomp_dev
|
|
|
|
if (depend != NULL)
|
|
{
|
|
- depend_cnt = (uintptr_t) depend[0];
|
|
+ depend_cnt = (uintptr_t) (depend[0] ? depend[0] : depend[1]);
|
|
depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry);
|
|
}
|
|
if (fn)
|
|
@@ -687,13 +780,25 @@ gomp_create_target_task (struct gomp_dev
|
|
task->final_task = 0;
|
|
gomp_mutex_lock (&team->task_lock);
|
|
/* If parallel or taskgroup has been cancelled, don't start new tasks. */
|
|
- if (__builtin_expect (gomp_team_barrier_cancelled (&team->barrier)
|
|
- || (taskgroup && taskgroup->cancelled), 0))
|
|
+ if (__builtin_expect (gomp_cancel_var, 0))
|
|
{
|
|
- gomp_mutex_unlock (&team->task_lock);
|
|
- gomp_finish_task (task);
|
|
- free (task);
|
|
- return true;
|
|
+ if (gomp_team_barrier_cancelled (&team->barrier))
|
|
+ {
|
|
+ do_cancel:
|
|
+ gomp_mutex_unlock (&team->task_lock);
|
|
+ gomp_finish_task (task);
|
|
+ free (task);
|
|
+ return true;
|
|
+ }
|
|
+ if (taskgroup)
|
|
+ {
|
|
+ if (taskgroup->cancelled)
|
|
+ goto do_cancel;
|
|
+ if (taskgroup->workshare
|
|
+ && taskgroup->prev
|
|
+ && taskgroup->prev->cancelled)
|
|
+ goto do_cancel;
|
|
+ }
|
|
}
|
|
if (depend_size)
|
|
{
|
|
@@ -986,10 +1091,21 @@ gomp_task_run_pre (struct gomp_task *chi
|
|
|
|
if (--team->task_queued_count == 0)
|
|
gomp_team_barrier_clear_task_pending (&team->barrier);
|
|
- if ((gomp_team_barrier_cancelled (&team->barrier)
|
|
- || (taskgroup && taskgroup->cancelled))
|
|
+ if (__builtin_expect (gomp_cancel_var, 0)
|
|
&& !child_task->copy_ctors_done)
|
|
- return true;
|
|
+ {
|
|
+ if (gomp_team_barrier_cancelled (&team->barrier))
|
|
+ return true;
|
|
+ if (taskgroup)
|
|
+ {
|
|
+ if (taskgroup->cancelled)
|
|
+ return true;
|
|
+ if (taskgroup->workshare
|
|
+ && taskgroup->prev
|
|
+ && taskgroup->prev->cancelled)
|
|
+ return true;
|
|
+ }
|
|
+ }
|
|
return false;
|
|
}
|
|
|
|
@@ -1456,6 +1572,35 @@ GOMP_taskwait (void)
|
|
}
|
|
}
|
|
|
|
+/* Called when encountering a taskwait directive with depend clause(s).
|
|
+ Wait as if it was a mergeable included task construct with empty body. */
|
|
+
|
|
+void
|
|
+GOMP_taskwait_depend (void **depend)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ struct gomp_team *team = thr->ts.team;
|
|
+
|
|
+ /* If parallel or taskgroup has been cancelled, return early. */
|
|
+ if (__builtin_expect (gomp_cancel_var, 0) && team)
|
|
+ {
|
|
+ if (gomp_team_barrier_cancelled (&team->barrier))
|
|
+ return;
|
|
+ if (thr->task->taskgroup)
|
|
+ {
|
|
+ if (thr->task->taskgroup->cancelled)
|
|
+ return;
|
|
+ if (thr->task->taskgroup->workshare
|
|
+ && thr->task->taskgroup->prev
|
|
+ && thr->task->taskgroup->prev->cancelled)
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (thr->task && thr->task->depend_hash)
|
|
+ gomp_task_maybe_wait_for_dependencies (depend);
|
|
+}
|
|
+
|
|
/* An undeferred task is about to run. Wait for all tasks that this
|
|
undeferred task depends on.
|
|
|
|
@@ -1464,7 +1609,7 @@ GOMP_taskwait (void)
|
|
the scheduling queues. Then we iterate through these imminently
|
|
ready tasks (and possibly other high priority tasks), and run them.
|
|
If we run out of ready dependencies to execute, we either wait for
|
|
- the reamining dependencies to finish, or wait for them to get
|
|
+ the remaining dependencies to finish, or wait for them to get
|
|
scheduled so we can run them.
|
|
|
|
DEPEND is as in GOMP_task. */
|
|
@@ -1477,21 +1622,50 @@ gomp_task_maybe_wait_for_dependencies (v
|
|
struct gomp_team *team = thr->ts.team;
|
|
struct gomp_task_depend_entry elem, *ent = NULL;
|
|
struct gomp_taskwait taskwait;
|
|
- size_t ndepend = (uintptr_t) depend[0];
|
|
+ size_t orig_ndepend = (uintptr_t) depend[0];
|
|
size_t nout = (uintptr_t) depend[1];
|
|
+ size_t ndepend = orig_ndepend;
|
|
+ size_t normal = ndepend;
|
|
+ size_t n = 2;
|
|
size_t i;
|
|
size_t num_awaited = 0;
|
|
struct gomp_task *child_task = NULL;
|
|
struct gomp_task *to_free = NULL;
|
|
int do_wake = 0;
|
|
|
|
+ if (ndepend == 0)
|
|
+ {
|
|
+ ndepend = nout;
|
|
+ nout = (uintptr_t) depend[2] + (uintptr_t) depend[3];
|
|
+ normal = nout + (uintptr_t) depend[4];
|
|
+ n = 5;
|
|
+ }
|
|
gomp_mutex_lock (&team->task_lock);
|
|
for (i = 0; i < ndepend; i++)
|
|
{
|
|
- elem.addr = depend[i + 2];
|
|
+ elem.addr = depend[i + n];
|
|
+ elem.is_in = i >= nout;
|
|
+ if (__builtin_expect (i >= normal, 0))
|
|
+ {
|
|
+ void **d = (void **) elem.addr;
|
|
+ switch ((uintptr_t) d[1])
|
|
+ {
|
|
+ case GOMP_DEPEND_IN:
|
|
+ break;
|
|
+ case GOMP_DEPEND_OUT:
|
|
+ case GOMP_DEPEND_INOUT:
|
|
+ case GOMP_DEPEND_MUTEXINOUTSET:
|
|
+ elem.is_in = 0;
|
|
+ break;
|
|
+ default:
|
|
+ gomp_fatal ("unknown omp_depend_t dependence type %d",
|
|
+ (int) (uintptr_t) d[1]);
|
|
+ }
|
|
+ elem.addr = d[0];
|
|
+ }
|
|
ent = htab_find (task->depend_hash, &elem);
|
|
for (; ent; ent = ent->next)
|
|
- if (i >= nout && ent->is_in)
|
|
+ if (elem.is_in && ent->is_in)
|
|
continue;
|
|
else
|
|
{
|
|
@@ -1654,13 +1828,28 @@ GOMP_taskyield (void)
|
|
/* Nothing at the moment. */
|
|
}
|
|
|
|
+static inline struct gomp_taskgroup *
|
|
+gomp_taskgroup_init (struct gomp_taskgroup *prev)
|
|
+{
|
|
+ struct gomp_taskgroup *taskgroup
|
|
+ = gomp_malloc (sizeof (struct gomp_taskgroup));
|
|
+ taskgroup->prev = prev;
|
|
+ priority_queue_init (&taskgroup->taskgroup_queue);
|
|
+ taskgroup->reductions = prev ? prev->reductions : NULL;
|
|
+ taskgroup->in_taskgroup_wait = false;
|
|
+ taskgroup->cancelled = false;
|
|
+ taskgroup->workshare = false;
|
|
+ taskgroup->num_children = 0;
|
|
+ gomp_sem_init (&taskgroup->taskgroup_sem, 0);
|
|
+ return taskgroup;
|
|
+}
|
|
+
|
|
void
|
|
GOMP_taskgroup_start (void)
|
|
{
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
struct gomp_team *team = thr->ts.team;
|
|
struct gomp_task *task = thr->task;
|
|
- struct gomp_taskgroup *taskgroup;
|
|
|
|
/* If team is NULL, all tasks are executed as
|
|
GOMP_TASK_UNDEFERRED tasks and thus all children tasks of
|
|
@@ -1668,14 +1857,7 @@ GOMP_taskgroup_start (void)
|
|
by the time GOMP_taskgroup_end is called. */
|
|
if (team == NULL)
|
|
return;
|
|
- taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup));
|
|
- taskgroup->prev = task->taskgroup;
|
|
- priority_queue_init (&taskgroup->taskgroup_queue);
|
|
- taskgroup->in_taskgroup_wait = false;
|
|
- taskgroup->cancelled = false;
|
|
- taskgroup->num_children = 0;
|
|
- gomp_sem_init (&taskgroup->taskgroup_sem, 0);
|
|
- task->taskgroup = taskgroup;
|
|
+ task->taskgroup = gomp_taskgroup_init (task->taskgroup);
|
|
}
|
|
|
|
void
|
|
@@ -1840,6 +2022,302 @@ GOMP_taskgroup_end (void)
|
|
free (taskgroup);
|
|
}
|
|
|
|
+static inline __attribute__((always_inline)) void
|
|
+gomp_reduction_register (uintptr_t *data, uintptr_t *old, uintptr_t *orig,
|
|
+ unsigned nthreads)
|
|
+{
|
|
+ size_t total_cnt = 0;
|
|
+ uintptr_t *d = data;
|
|
+ struct htab *old_htab = NULL, *new_htab;
|
|
+ do
|
|
+ {
|
|
+ if (__builtin_expect (orig != NULL, 0))
|
|
+ {
|
|
+ /* For worksharing task reductions, memory has been allocated
|
|
+ already by some other thread that encountered the construct
|
|
+ earlier. */
|
|
+ d[2] = orig[2];
|
|
+ d[6] = orig[6];
|
|
+ orig = (uintptr_t *) orig[4];
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ size_t sz = d[1] * nthreads;
|
|
+ /* Should use omp_alloc if d[3] is not -1. */
|
|
+ void *ptr = gomp_aligned_alloc (d[2], sz);
|
|
+ memset (ptr, '\0', sz);
|
|
+ d[2] = (uintptr_t) ptr;
|
|
+ d[6] = d[2] + sz;
|
|
+ }
|
|
+ d[5] = 0;
|
|
+ total_cnt += d[0];
|
|
+ if (d[4] == 0)
|
|
+ {
|
|
+ d[4] = (uintptr_t) old;
|
|
+ break;
|
|
+ }
|
|
+ else
|
|
+ d = (uintptr_t *) d[4];
|
|
+ }
|
|
+ while (1);
|
|
+ if (old && old[5])
|
|
+ {
|
|
+ old_htab = (struct htab *) old[5];
|
|
+ total_cnt += htab_elements (old_htab);
|
|
+ }
|
|
+ new_htab = htab_create (total_cnt);
|
|
+ if (old_htab)
|
|
+ {
|
|
+ /* Copy old hash table, like in htab_expand. */
|
|
+ hash_entry_type *p, *olimit;
|
|
+ new_htab->n_elements = htab_elements (old_htab);
|
|
+ olimit = old_htab->entries + old_htab->size;
|
|
+ p = old_htab->entries;
|
|
+ do
|
|
+ {
|
|
+ hash_entry_type x = *p;
|
|
+ if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY)
|
|
+ *find_empty_slot_for_expand (new_htab, htab_hash (x)) = x;
|
|
+ p++;
|
|
+ }
|
|
+ while (p < olimit);
|
|
+ }
|
|
+ d = data;
|
|
+ do
|
|
+ {
|
|
+ size_t j;
|
|
+ for (j = 0; j < d[0]; ++j)
|
|
+ {
|
|
+ uintptr_t *p = d + 7 + j * 3;
|
|
+ p[2] = (uintptr_t) d;
|
|
+ /* Ugly hack, hash_entry_type is defined for the task dependencies,
|
|
+ which hash on the first element which is a pointer. We need
|
|
+ to hash also on the first sizeof (uintptr_t) bytes which contain
|
|
+ a pointer. Hide the cast from the compiler. */
|
|
+ hash_entry_type n;
|
|
+ __asm ("" : "=g" (n) : "0" (p));
|
|
+ *htab_find_slot (&new_htab, n, INSERT) = n;
|
|
+ }
|
|
+ if (d[4] == (uintptr_t) old)
|
|
+ break;
|
|
+ else
|
|
+ d = (uintptr_t *) d[4];
|
|
+ }
|
|
+ while (1);
|
|
+ d[5] = (uintptr_t) new_htab;
|
|
+}
|
|
+
|
|
+static void
|
|
+gomp_create_artificial_team (void)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ struct gomp_task_icv *icv;
|
|
+ struct gomp_team *team = gomp_new_team (1);
|
|
+ struct gomp_task *task = thr->task;
|
|
+ icv = task ? &task->icv : &gomp_global_icv;
|
|
+ team->prev_ts = thr->ts;
|
|
+ thr->ts.team = team;
|
|
+ thr->ts.team_id = 0;
|
|
+ thr->ts.work_share = &team->work_shares[0];
|
|
+ thr->ts.last_work_share = NULL;
|
|
+#ifdef HAVE_SYNC_BUILTINS
|
|
+ thr->ts.single_count = 0;
|
|
+#endif
|
|
+ thr->ts.static_trip = 0;
|
|
+ thr->task = &team->implicit_task[0];
|
|
+ gomp_init_task (thr->task, NULL, icv);
|
|
+ if (task)
|
|
+ {
|
|
+ thr->task = task;
|
|
+ gomp_end_task ();
|
|
+ free (task);
|
|
+ thr->task = &team->implicit_task[0];
|
|
+ }
|
|
+#ifdef LIBGOMP_USE_PTHREADS
|
|
+ else
|
|
+ pthread_setspecific (gomp_thread_destructor, thr);
|
|
+#endif
|
|
+}
|
|
+
|
|
+/* The format of data is:
|
|
+ data[0] cnt
|
|
+ data[1] size
|
|
+ data[2] alignment (on output array pointer)
|
|
+ data[3] allocator (-1 if malloc allocator)
|
|
+ data[4] next pointer
|
|
+ data[5] used internally (htab pointer)
|
|
+ data[6] used internally (end of array)
|
|
+ cnt times
|
|
+ ent[0] address
|
|
+ ent[1] offset
|
|
+ ent[2] used internally (pointer to data[0])
|
|
+ The entries are sorted by increasing offset, so that a binary
|
|
+ search can be performed. Normally, data[8] is 0, exception is
|
|
+ for worksharing construct task reductions in cancellable parallel,
|
|
+ where at offset 0 there should be space for a pointer and an integer
|
|
+ which are used internally. */
|
|
+
|
|
+void
|
|
+GOMP_taskgroup_reduction_register (uintptr_t *data)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ struct gomp_team *team = thr->ts.team;
|
|
+ struct gomp_task *task;
|
|
+ unsigned nthreads;
|
|
+ if (__builtin_expect (team == NULL, 0))
|
|
+ {
|
|
+ /* The task reduction code needs a team and task, so for
|
|
+ orphaned taskgroups just create the implicit team. */
|
|
+ gomp_create_artificial_team ();
|
|
+ ialias_call (GOMP_taskgroup_start) ();
|
|
+ team = thr->ts.team;
|
|
+ }
|
|
+ nthreads = team->nthreads;
|
|
+ task = thr->task;
|
|
+ gomp_reduction_register (data, task->taskgroup->reductions, NULL, nthreads);
|
|
+ task->taskgroup->reductions = data;
|
|
+}
|
|
+
|
|
+void
|
|
+GOMP_taskgroup_reduction_unregister (uintptr_t *data)
|
|
+{
|
|
+ uintptr_t *d = data;
|
|
+ htab_free ((struct htab *) data[5]);
|
|
+ do
|
|
+ {
|
|
+ gomp_aligned_free ((void *) d[2]);
|
|
+ d = (uintptr_t *) d[4];
|
|
+ }
|
|
+ while (d && !d[5]);
|
|
+}
|
|
+ialias (GOMP_taskgroup_reduction_unregister)
|
|
+
|
|
+/* For i = 0 to cnt-1, remap ptrs[i] which is either address of the
|
|
+ original list item or address of previously remapped original list
|
|
+ item to address of the private copy, store that to ptrs[i].
|
|
+ For i < cntorig, additionally set ptrs[cnt+i] to the address of
|
|
+ the original list item. */
|
|
+
|
|
+void
|
|
+GOMP_task_reduction_remap (size_t cnt, size_t cntorig, void **ptrs)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ struct gomp_task *task = thr->task;
|
|
+ unsigned id = thr->ts.team_id;
|
|
+ uintptr_t *data = task->taskgroup->reductions;
|
|
+ uintptr_t *d;
|
|
+ struct htab *reduction_htab = (struct htab *) data[5];
|
|
+ size_t i;
|
|
+ for (i = 0; i < cnt; ++i)
|
|
+ {
|
|
+ hash_entry_type ent, n;
|
|
+ __asm ("" : "=g" (ent) : "0" (ptrs + i));
|
|
+ n = htab_find (reduction_htab, ent);
|
|
+ if (n)
|
|
+ {
|
|
+ uintptr_t *p;
|
|
+ __asm ("" : "=g" (p) : "0" (n));
|
|
+ /* At this point, p[0] should be equal to (uintptr_t) ptrs[i],
|
|
+ p[1] is the offset within the allocated chunk for each
|
|
+ thread, p[2] is the array registered with
|
|
+ GOMP_taskgroup_reduction_register, d[2] is the base of the
|
|
+ allocated memory and d[1] is the size of the allocated chunk
|
|
+ for one thread. */
|
|
+ d = (uintptr_t *) p[2];
|
|
+ ptrs[i] = (void *) (d[2] + id * d[1] + p[1]);
|
|
+ if (__builtin_expect (i < cntorig, 0))
|
|
+ ptrs[cnt + i] = (void *) p[0];
|
|
+ continue;
|
|
+ }
|
|
+ d = data;
|
|
+ while (d != NULL)
|
|
+ {
|
|
+ if ((uintptr_t) ptrs[i] >= d[2] && (uintptr_t) ptrs[i] < d[6])
|
|
+ break;
|
|
+ d = (uintptr_t *) d[4];
|
|
+ }
|
|
+ if (d == NULL)
|
|
+ gomp_fatal ("couldn't find matching task_reduction or reduction with "
|
|
+ "task modifier for %p", ptrs[i]);
|
|
+ uintptr_t off = ((uintptr_t) ptrs[i] - d[2]) % d[1];
|
|
+ ptrs[i] = (void *) (d[2] + id * d[1] + off);
|
|
+ if (__builtin_expect (i < cntorig, 0))
|
|
+ {
|
|
+ size_t lo = 0, hi = d[0] - 1;
|
|
+ while (lo <= hi)
|
|
+ {
|
|
+ size_t m = (lo + hi) / 2;
|
|
+ if (d[7 + 3 * m + 1] < off)
|
|
+ lo = m + 1;
|
|
+ else if (d[7 + 3 * m + 1] == off)
|
|
+ {
|
|
+ ptrs[cnt + i] = (void *) d[7 + 3 * m];
|
|
+ break;
|
|
+ }
|
|
+ else
|
|
+ hi = m - 1;
|
|
+ }
|
|
+ if (lo > hi)
|
|
+ gomp_fatal ("couldn't find matching task_reduction or reduction "
|
|
+ "with task modifier for %p", ptrs[i]);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+struct gomp_taskgroup *
|
|
+gomp_parallel_reduction_register (uintptr_t *data, unsigned nthreads)
|
|
+{
|
|
+ struct gomp_taskgroup *taskgroup = gomp_taskgroup_init (NULL);
|
|
+ gomp_reduction_register (data, NULL, NULL, nthreads);
|
|
+ taskgroup->reductions = data;
|
|
+ return taskgroup;
|
|
+}
|
|
+
|
|
+void
|
|
+gomp_workshare_task_reduction_register (uintptr_t *data, uintptr_t *orig)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ struct gomp_team *team = thr->ts.team;
|
|
+ struct gomp_task *task = thr->task;
|
|
+ unsigned nthreads = team->nthreads;
|
|
+ gomp_reduction_register (data, task->taskgroup->reductions, orig, nthreads);
|
|
+ task->taskgroup->reductions = data;
|
|
+}
|
|
+
|
|
+void
|
|
+gomp_workshare_taskgroup_start (void)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ struct gomp_team *team = thr->ts.team;
|
|
+ struct gomp_task *task;
|
|
+
|
|
+ if (team == NULL)
|
|
+ {
|
|
+ gomp_create_artificial_team ();
|
|
+ team = thr->ts.team;
|
|
+ }
|
|
+ task = thr->task;
|
|
+ task->taskgroup = gomp_taskgroup_init (task->taskgroup);
|
|
+ task->taskgroup->workshare = true;
|
|
+}
|
|
+
|
|
+void
|
|
+GOMP_workshare_task_reduction_unregister (bool cancelled)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ struct gomp_task *task = thr->task;
|
|
+ struct gomp_team *team = thr->ts.team;
|
|
+ uintptr_t *data = task->taskgroup->reductions;
|
|
+ ialias_call (GOMP_taskgroup_end) ();
|
|
+ if (thr->ts.team_id == 0)
|
|
+ ialias_call (GOMP_taskgroup_reduction_unregister) (data);
|
|
+ else
|
|
+ htab_free ((struct htab *) data[5]);
|
|
+
|
|
+ if (!cancelled)
|
|
+ gomp_team_barrier_wait (&team->barrier);
|
|
+}
|
|
+
|
|
int
|
|
omp_in_final (void)
|
|
{
|
|
--- libgomp/team.c.jj 2018-04-25 09:40:31.322655307 +0200
|
|
+++ libgomp/team.c 2019-05-07 18:46:36.548109384 +0200
|
|
@@ -32,7 +32,6 @@
|
|
#include <string.h>
|
|
|
|
#ifdef LIBGOMP_USE_PTHREADS
|
|
-/* This attribute contains PTHREAD_CREATE_DETACHED. */
|
|
pthread_attr_t gomp_thread_attr;
|
|
|
|
/* This key is for the thread destructor. */
|
|
@@ -58,6 +57,7 @@ struct gomp_thread_start_data
|
|
struct gomp_thread_pool *thread_pool;
|
|
unsigned int place;
|
|
bool nested;
|
|
+ pthread_t handle;
|
|
};
|
|
|
|
|
|
@@ -89,6 +89,9 @@ gomp_thread_start (void *xdata)
|
|
thr->ts = data->ts;
|
|
thr->task = data->task;
|
|
thr->place = data->place;
|
|
+#ifdef GOMP_NEEDS_THREAD_HANDLE
|
|
+ thr->handle = data->handle;
|
|
+#endif
|
|
|
|
thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
|
|
|
|
@@ -131,6 +134,7 @@ gomp_thread_start (void *xdata)
|
|
}
|
|
|
|
gomp_sem_destroy (&thr->release);
|
|
+ pthread_detach (pthread_self ());
|
|
thr->thread_pool = NULL;
|
|
thr->task = NULL;
|
|
return NULL;
|
|
@@ -183,7 +187,7 @@ gomp_new_team (unsigned nthreads)
|
|
team->single_count = 0;
|
|
#endif
|
|
team->work_shares_to_free = &team->work_shares[0];
|
|
- gomp_init_work_share (&team->work_shares[0], false, nthreads);
|
|
+ gomp_init_work_share (&team->work_shares[0], 0, nthreads);
|
|
team->work_shares[0].next_alloc = NULL;
|
|
team->work_share_list_free = NULL;
|
|
team->work_share_list_alloc = &team->work_shares[1];
|
|
@@ -231,6 +235,7 @@ gomp_free_pool_helper (void *thread_pool
|
|
thr->thread_pool = NULL;
|
|
thr->task = NULL;
|
|
#ifdef LIBGOMP_USE_PTHREADS
|
|
+ pthread_detach (pthread_self ());
|
|
pthread_exit (NULL);
|
|
#elif defined(__nvptx__)
|
|
asm ("exit;");
|
|
@@ -297,7 +302,8 @@ gomp_free_thread (void *arg __attribute_
|
|
#ifdef LIBGOMP_USE_PTHREADS
|
|
void
|
|
gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
|
|
- unsigned flags, struct gomp_team *team)
|
|
+ unsigned flags, struct gomp_team *team,
|
|
+ struct gomp_taskgroup *taskgroup)
|
|
{
|
|
struct gomp_thread_start_data *start_data;
|
|
struct gomp_thread *thr, *nthr;
|
|
@@ -312,6 +318,7 @@ gomp_team_start (void (*fn) (void *), vo
|
|
unsigned int s = 0, rest = 0, p = 0, k = 0;
|
|
unsigned int affinity_count = 0;
|
|
struct gomp_thread **affinity_thr = NULL;
|
|
+ bool force_display = false;
|
|
|
|
thr = gomp_thread ();
|
|
nested = thr->ts.level;
|
|
@@ -319,7 +326,12 @@ gomp_team_start (void (*fn) (void *), vo
|
|
task = thr->task;
|
|
icv = task ? &task->icv : &gomp_global_icv;
|
|
if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
|
|
- gomp_init_affinity ();
|
|
+ {
|
|
+ gomp_init_affinity ();
|
|
+ if (__builtin_expect (gomp_display_affinity_var, 0) && nthreads == 1)
|
|
+ gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
|
|
+ thr->place);
|
|
+ }
|
|
|
|
/* Always save the previous state, even if this isn't a nested team.
|
|
In particular, we should save any work share state from an outer
|
|
@@ -338,6 +350,9 @@ gomp_team_start (void (*fn) (void *), vo
|
|
#endif
|
|
thr->ts.static_trip = 0;
|
|
thr->task = &team->implicit_task[0];
|
|
+#ifdef GOMP_NEEDS_THREAD_HANDLE
|
|
+ thr->handle = pthread_self ();
|
|
+#endif
|
|
nthreads_var = icv->nthreads_var;
|
|
if (__builtin_expect (gomp_nthreads_var_list != NULL, 0)
|
|
&& thr->ts.level < gomp_nthreads_var_list_len)
|
|
@@ -350,6 +365,7 @@ gomp_team_start (void (*fn) (void *), vo
|
|
&& thr->ts.level < gomp_bind_var_list_len)
|
|
bind_var = gomp_bind_var_list[thr->ts.level];
|
|
gomp_init_task (thr->task, task, icv);
|
|
+ thr->task->taskgroup = taskgroup;
|
|
team->implicit_task[0].icv.nthreads_var = nthreads_var;
|
|
team->implicit_task[0].icv.bind_var = bind_var;
|
|
|
|
@@ -465,7 +481,9 @@ gomp_team_start (void (*fn) (void *), vo
|
|
pool->threads
|
|
= gomp_realloc (pool->threads,
|
|
pool->threads_size
|
|
- * sizeof (struct gomp_thread_data *));
|
|
+ * sizeof (struct gomp_thread *));
|
|
+ /* Add current (master) thread to threads[]. */
|
|
+ pool->threads[0] = thr;
|
|
}
|
|
|
|
/* Release existing idle threads. */
|
|
@@ -540,6 +558,7 @@ gomp_team_start (void (*fn) (void *), vo
|
|
+ place_partition_len))
|
|
{
|
|
unsigned int l;
|
|
+ force_display = true;
|
|
if (affinity_thr == NULL)
|
|
{
|
|
unsigned int j;
|
|
@@ -623,6 +642,7 @@ gomp_team_start (void (*fn) (void *), vo
|
|
gomp_init_task (nthr->task, task, icv);
|
|
team->implicit_task[i].icv.nthreads_var = nthreads_var;
|
|
team->implicit_task[i].icv.bind_var = bind_var;
|
|
+ nthr->task->taskgroup = taskgroup;
|
|
nthr->fn = fn;
|
|
nthr->data = data;
|
|
team->ordered_release[i] = &nthr->release;
|
|
@@ -712,19 +732,17 @@ gomp_team_start (void (*fn) (void *), vo
|
|
{
|
|
size_t stacksize;
|
|
pthread_attr_init (&thread_attr);
|
|
- pthread_attr_setdetachstate (&thread_attr, PTHREAD_CREATE_DETACHED);
|
|
if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize))
|
|
pthread_attr_setstacksize (&thread_attr, stacksize);
|
|
attr = &thread_attr;
|
|
}
|
|
|
|
start_data = gomp_alloca (sizeof (struct gomp_thread_start_data)
|
|
- * (nthreads-i));
|
|
+ * (nthreads - i));
|
|
|
|
/* Launch new threads. */
|
|
for (; i < nthreads; ++i)
|
|
{
|
|
- pthread_t pt;
|
|
int err;
|
|
|
|
start_data->ts.place_partition_off = thr->ts.place_partition_off;
|
|
@@ -810,11 +828,14 @@ gomp_team_start (void (*fn) (void *), vo
|
|
gomp_init_task (start_data->task, task, icv);
|
|
team->implicit_task[i].icv.nthreads_var = nthreads_var;
|
|
team->implicit_task[i].icv.bind_var = bind_var;
|
|
+ start_data->task->taskgroup = taskgroup;
|
|
start_data->thread_pool = pool;
|
|
start_data->nested = nested;
|
|
|
|
attr = gomp_adjust_thread_attr (attr, &thread_attr);
|
|
- err = pthread_create (&pt, attr, gomp_thread_start, start_data++);
|
|
+ err = pthread_create (&start_data->handle, attr, gomp_thread_start,
|
|
+ start_data);
|
|
+ start_data++;
|
|
if (err != 0)
|
|
gomp_fatal ("Thread creation failed: %s", strerror (err));
|
|
}
|
|
@@ -854,6 +875,42 @@ gomp_team_start (void (*fn) (void *), vo
|
|
gomp_mutex_unlock (&gomp_managed_threads_lock);
|
|
#endif
|
|
}
|
|
+ if (__builtin_expect (gomp_display_affinity_var, 0))
|
|
+ {
|
|
+ if (nested
|
|
+ || nthreads != old_threads_used
|
|
+ || force_display)
|
|
+ {
|
|
+ gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
|
|
+ thr->place);
|
|
+ if (nested)
|
|
+ {
|
|
+ start_data -= nthreads - 1;
|
|
+ for (i = 1; i < nthreads; ++i)
|
|
+ {
|
|
+ gomp_display_affinity_thread (
|
|
+#ifdef LIBGOMP_USE_PTHREADS
|
|
+ start_data->handle,
|
|
+#else
|
|
+ gomp_thread_self (),
|
|
+#endif
|
|
+ &start_data->ts,
|
|
+ start_data->place);
|
|
+ start_data++;
|
|
+ }
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ for (i = 1; i < nthreads; ++i)
|
|
+ {
|
|
+ gomp_thread_handle handle
|
|
+ = gomp_thread_to_pthread_t (pool->threads[i]);
|
|
+ gomp_display_affinity_thread (handle, &pool->threads[i]->ts,
|
|
+ pool->threads[i]->place);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
if (__builtin_expect (affinity_thr != NULL, 0)
|
|
&& team->prev_ts.place_partition_len > 64)
|
|
free (affinity_thr);
|
|
@@ -894,7 +951,7 @@ gomp_team_end (void)
|
|
gomp_end_task ();
|
|
thr->ts = team->prev_ts;
|
|
|
|
- if (__builtin_expect (thr->ts.team != NULL, 0))
|
|
+ if (__builtin_expect (thr->ts.level != 0, 0))
|
|
{
|
|
#ifdef HAVE_SYNC_BUILTINS
|
|
__sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
|
|
@@ -959,6 +1016,76 @@ team_destructor (void)
|
|
crashes. */
|
|
pthread_key_delete (gomp_thread_destructor);
|
|
}
|
|
+
|
|
+/* Similar to gomp_free_pool_helper, but don't detach itself,
|
|
+ gomp_pause_host will pthread_join those threads. */
|
|
+
|
|
+static void
|
|
+gomp_pause_pool_helper (void *thread_pool)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ struct gomp_thread_pool *pool
|
|
+ = (struct gomp_thread_pool *) thread_pool;
|
|
+ gomp_simple_barrier_wait_last (&pool->threads_dock);
|
|
+ gomp_sem_destroy (&thr->release);
|
|
+ thr->thread_pool = NULL;
|
|
+ thr->task = NULL;
|
|
+ pthread_exit (NULL);
|
|
+}
|
|
+
|
|
+/* Free a thread pool and release its threads. Return non-zero on
|
|
+ failure. */
|
|
+
|
|
+int
|
|
+gomp_pause_host (void)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ struct gomp_thread_pool *pool = thr->thread_pool;
|
|
+ if (thr->ts.level)
|
|
+ return -1;
|
|
+ if (pool)
|
|
+ {
|
|
+ if (pool->threads_used > 0)
|
|
+ {
|
|
+ int i;
|
|
+ pthread_t *thrs
|
|
+ = gomp_alloca (sizeof (pthread_t) * pool->threads_used);
|
|
+ for (i = 1; i < pool->threads_used; i++)
|
|
+ {
|
|
+ struct gomp_thread *nthr = pool->threads[i];
|
|
+ nthr->fn = gomp_pause_pool_helper;
|
|
+ nthr->data = pool;
|
|
+ thrs[i] = gomp_thread_to_pthread_t (nthr);
|
|
+ }
|
|
+ /* This barrier undocks threads docked on pool->threads_dock. */
|
|
+ gomp_simple_barrier_wait (&pool->threads_dock);
|
|
+ /* And this waits till all threads have called
|
|
+ gomp_simple_barrier_wait_last in gomp_pause_pool_helper. */
|
|
+ gomp_simple_barrier_wait (&pool->threads_dock);
|
|
+ /* Now it is safe to destroy the barrier and free the pool. */
|
|
+ gomp_simple_barrier_destroy (&pool->threads_dock);
|
|
+
|
|
+#ifdef HAVE_SYNC_BUILTINS
|
|
+ __sync_fetch_and_add (&gomp_managed_threads,
|
|
+ 1L - pool->threads_used);
|
|
+#else
|
|
+ gomp_mutex_lock (&gomp_managed_threads_lock);
|
|
+ gomp_managed_threads -= pool->threads_used - 1L;
|
|
+ gomp_mutex_unlock (&gomp_managed_threads_lock);
|
|
+#endif
|
|
+ for (i = 1; i < pool->threads_used; i++)
|
|
+ pthread_join (thrs[i], NULL);
|
|
+ }
|
|
+ if (pool->last_team)
|
|
+ free_team (pool->last_team);
|
|
+#ifndef __nvptx__
|
|
+ free (pool->threads);
|
|
+ free (pool);
|
|
+#endif
|
|
+ thr->thread_pool = NULL;
|
|
+ }
|
|
+ return 0;
|
|
+}
|
|
#endif
|
|
|
|
struct gomp_task_icv *
|
|
--- libgomp/libgomp.h.jj 2018-04-25 09:40:31.925655587 +0200
|
|
+++ libgomp/libgomp.h 2019-05-07 19:01:51.285535999 +0200
|
|
@@ -44,6 +44,7 @@
|
|
#include "config.h"
|
|
#include "gstdint.h"
|
|
#include "libgomp-plugin.h"
|
|
+#include "gomp-constants.h"
|
|
|
|
#ifdef HAVE_PTHREAD_H
|
|
#include <pthread.h>
|
|
@@ -85,9 +86,21 @@ enum memmodel
|
|
|
|
/* alloc.c */
|
|
|
|
+#if defined(HAVE_ALIGNED_ALLOC) \
|
|
+ || defined(HAVE__ALIGNED_MALLOC) \
|
|
+ || defined(HAVE_POSIX_MEMALIGN) \
|
|
+ || defined(HAVE_MEMALIGN)
|
|
+/* Defined if gomp_aligned_alloc doesn't use fallback version
|
|
+ and free can be used instead of gomp_aligned_free. */
|
|
+#define GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC 1
|
|
+#endif
|
|
+
|
|
extern void *gomp_malloc (size_t) __attribute__((malloc));
|
|
extern void *gomp_malloc_cleared (size_t) __attribute__((malloc));
|
|
extern void *gomp_realloc (void *, size_t);
|
|
+extern void *gomp_aligned_alloc (size_t, size_t)
|
|
+ __attribute__((malloc, alloc_size (2)));
|
|
+extern void gomp_aligned_free (void *);
|
|
|
|
/* Avoid conflicting prototypes of alloca() in system headers by using
|
|
GCC's builtin alloca(). */
|
|
@@ -137,7 +150,8 @@ enum gomp_schedule_type
|
|
GFS_STATIC,
|
|
GFS_DYNAMIC,
|
|
GFS_GUIDED,
|
|
- GFS_AUTO
|
|
+ GFS_AUTO,
|
|
+ GFS_MONOTONIC = 0x80000000U
|
|
};
|
|
|
|
struct gomp_doacross_work_share
|
|
@@ -174,6 +188,8 @@ struct gomp_doacross_work_share
|
|
/* Likewise, but for the ull implementation. */
|
|
unsigned long long boundary_ull;
|
|
};
|
|
+ /* Pointer to extra memory if needed for lastprivate(conditional). */
|
|
+ void *extra;
|
|
/* Array of shift counts for each dimension if they can be flattened. */
|
|
unsigned int shift_counts[];
|
|
};
|
|
@@ -275,6 +291,9 @@ struct gomp_work_share
|
|
struct gomp_work_share *next_free;
|
|
};
|
|
|
|
+ /* Task reductions for this work-sharing construct. */
|
|
+ uintptr_t *task_reductions;
|
|
+
|
|
/* If only few threads are in the team, ordered_team_ids can point
|
|
to this array which fills the padding at the end of this struct. */
|
|
unsigned inline_ordered_team_ids[0];
|
|
@@ -365,8 +384,12 @@ extern void **gomp_places_list;
|
|
extern unsigned long gomp_places_list_len;
|
|
extern unsigned int gomp_num_teams_var;
|
|
extern int gomp_debug_var;
|
|
+extern bool gomp_display_affinity_var;
|
|
+extern char *gomp_affinity_format_var;
|
|
+extern size_t gomp_affinity_format_len;
|
|
extern int goacc_device_num;
|
|
extern char *goacc_device_type;
|
|
+extern int goacc_default_dims[GOMP_DIM_MAX];
|
|
|
|
enum gomp_task_kind
|
|
{
|
|
@@ -469,8 +492,10 @@ struct gomp_taskgroup
|
|
struct gomp_taskgroup *prev;
|
|
/* Queue of tasks that belong in this taskgroup. */
|
|
struct priority_queue taskgroup_queue;
|
|
+ uintptr_t *reductions;
|
|
bool in_taskgroup_wait;
|
|
bool cancelled;
|
|
+ bool workshare;
|
|
gomp_sem_t taskgroup_sem;
|
|
size_t num_children;
|
|
};
|
|
@@ -613,6 +638,19 @@ struct gomp_thread
|
|
|
|
/* User pthread thread pool */
|
|
struct gomp_thread_pool *thread_pool;
|
|
+
|
|
+#if defined(LIBGOMP_USE_PTHREADS) \
|
|
+ && (!defined(HAVE_TLS) \
|
|
+ || !defined(__GLIBC__) \
|
|
+ || !defined(USING_INITIAL_EXEC_TLS))
|
|
+ /* pthread_t of the thread containing this gomp_thread.
|
|
+ On Linux when using initial-exec TLS,
|
|
+ (typeof (pthread_t)) gomp_thread () - pthread_self ()
|
|
+ is constant in all threads, so we can optimize and not
|
|
+ store it. */
|
|
+#define GOMP_NEEDS_THREAD_HANDLE 1
|
|
+ pthread_t handle;
|
|
+#endif
|
|
};
|
|
|
|
|
|
@@ -709,6 +747,25 @@ extern bool gomp_affinity_finalize_place
|
|
extern bool gomp_affinity_init_level (int, unsigned long, bool);
|
|
extern void gomp_affinity_print_place (void *);
|
|
extern void gomp_get_place_proc_ids_8 (int, int64_t *);
|
|
+extern void gomp_display_affinity_place (char *, size_t, size_t *, int);
|
|
+
|
|
+/* affinity-fmt.c */
|
|
+
|
|
+extern void gomp_print_string (const char *str, size_t len);
|
|
+extern void gomp_set_affinity_format (const char *, size_t);
|
|
+extern void gomp_display_string (char *, size_t, size_t *, const char *,
|
|
+ size_t);
|
|
+#ifdef LIBGOMP_USE_PTHREADS
|
|
+typedef pthread_t gomp_thread_handle;
|
|
+#else
|
|
+typedef struct {} gomp_thread_handle;
|
|
+#endif
|
|
+extern size_t gomp_display_affinity (char *, size_t, const char *,
|
|
+ gomp_thread_handle,
|
|
+ struct gomp_team_state *, unsigned int);
|
|
+extern void gomp_display_affinity_thread (gomp_thread_handle,
|
|
+ struct gomp_team_state *,
|
|
+ unsigned int) __attribute__((cold));
|
|
|
|
/* iter.c */
|
|
|
|
@@ -745,9 +802,9 @@ extern void gomp_ordered_next (void);
|
|
extern void gomp_ordered_static_init (void);
|
|
extern void gomp_ordered_static_next (void);
|
|
extern void gomp_ordered_sync (void);
|
|
-extern void gomp_doacross_init (unsigned, long *, long);
|
|
+extern void gomp_doacross_init (unsigned, long *, long, size_t);
|
|
extern void gomp_doacross_ull_init (unsigned, unsigned long long *,
|
|
- unsigned long long);
|
|
+ unsigned long long, size_t);
|
|
|
|
/* parallel.c */
|
|
|
|
@@ -770,6 +827,10 @@ extern bool gomp_create_target_task (str
|
|
size_t *, unsigned short *, unsigned int,
|
|
void **, void **,
|
|
enum gomp_target_task_state);
|
|
+extern struct gomp_taskgroup *gomp_parallel_reduction_register (uintptr_t *,
|
|
+ unsigned);
|
|
+extern void gomp_workshare_taskgroup_start (void);
|
|
+extern void gomp_workshare_task_reduction_register (uintptr_t *, uintptr_t *);
|
|
|
|
static void inline
|
|
gomp_finish_task (struct gomp_task *task)
|
|
@@ -782,9 +843,11 @@ gomp_finish_task (struct gomp_task *task
|
|
|
|
extern struct gomp_team *gomp_new_team (unsigned);
|
|
extern void gomp_team_start (void (*) (void *), void *, unsigned,
|
|
- unsigned, struct gomp_team *);
|
|
+ unsigned, struct gomp_team *,
|
|
+ struct gomp_taskgroup *);
|
|
extern void gomp_team_end (void);
|
|
extern void gomp_free_thread (void *);
|
|
+extern int gomp_pause_host (void);
|
|
|
|
/* target.c */
|
|
|
|
@@ -851,6 +914,8 @@ struct splay_tree_key_s {
|
|
uintptr_t tgt_offset;
|
|
/* Reference count. */
|
|
uintptr_t refcount;
|
|
+ /* Dynamic reference count. */
|
|
+ uintptr_t dynamic_refcount;
|
|
/* Pointer to the original mapping of "omp declare target link" object. */
|
|
splay_tree_key link_key;
|
|
};
|
|
@@ -989,7 +1054,9 @@ enum gomp_map_vars_kind
|
|
};
|
|
|
|
extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
|
|
-extern void gomp_acc_remove_pointer (void *, bool, int, int);
|
|
+extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int);
|
|
+extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
|
|
+ unsigned short *);
|
|
|
|
extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
|
|
size_t, void **, void **,
|
|
@@ -999,12 +1066,13 @@ extern void gomp_unmap_vars (struct targ
|
|
extern void gomp_init_device (struct gomp_device_descr *);
|
|
extern void gomp_free_memmap (struct splay_tree_s *);
|
|
extern void gomp_unload_device (struct gomp_device_descr *);
|
|
+extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key);
|
|
|
|
/* work.c */
|
|
|
|
-extern void gomp_init_work_share (struct gomp_work_share *, bool, unsigned);
|
|
+extern void gomp_init_work_share (struct gomp_work_share *, size_t, unsigned);
|
|
extern void gomp_fini_work_share (struct gomp_work_share *);
|
|
-extern bool gomp_work_share_start (bool);
|
|
+extern bool gomp_work_share_start (size_t);
|
|
extern void gomp_work_share_end (void);
|
|
extern bool gomp_work_share_end_cancel (void);
|
|
extern void gomp_work_share_end_nowait (void);
|
|
@@ -1028,6 +1096,14 @@ gomp_work_share_init_done (void)
|
|
#include "omp-lock.h"
|
|
#define _LIBGOMP_OMP_LOCK_DEFINED 1
|
|
#include "omp.h.in"
|
|
+#define omp_sched_monotonic 0x80000000U
|
|
+typedef enum omp_pause_resource_t
|
|
+{
|
|
+ omp_pause_soft = 1,
|
|
+ omp_pause_hard = 2
|
|
+} omp_pause_resource_t;
|
|
+extern int omp_pause_resource (omp_pause_resource_t, int) __GOMP_NOTHROW;
|
|
+extern int omp_pause_resource_all (omp_pause_resource_t) __GOMP_NOTHROW;
|
|
|
|
#if !defined (HAVE_ATTRIBUTE_VISIBILITY) \
|
|
|| !defined (HAVE_ATTRIBUTE_ALIAS) \
|
|
@@ -1082,16 +1158,26 @@ extern int gomp_test_nest_lock_25 (omp_n
|
|
# define attribute_hidden
|
|
#endif
|
|
|
|
+#if __GNUC__ >= 9
|
|
+# define HAVE_ATTRIBUTE_COPY
|
|
+#endif
|
|
+
|
|
+#ifdef HAVE_ATTRIBUTE_COPY
|
|
+# define attribute_copy(arg) __attribute__ ((copy (arg)))
|
|
+#else
|
|
+# define attribute_copy(arg)
|
|
+#endif
|
|
+
|
|
#ifdef HAVE_ATTRIBUTE_ALIAS
|
|
# define strong_alias(fn, al) \
|
|
- extern __typeof (fn) al __attribute__ ((alias (#fn)));
|
|
+ extern __typeof (fn) al __attribute__ ((alias (#fn))) attribute_copy (fn);
|
|
|
|
# define ialias_ulp ialias_str1(__USER_LABEL_PREFIX__)
|
|
# define ialias_str1(x) ialias_str2(x)
|
|
# define ialias_str2(x) #x
|
|
# define ialias(fn) \
|
|
extern __typeof (fn) gomp_ialias_##fn \
|
|
- __attribute__ ((alias (#fn))) attribute_hidden;
|
|
+ __attribute__ ((alias (#fn))) attribute_hidden attribute_copy (fn);
|
|
# define ialias_redirect(fn) \
|
|
extern __typeof (fn) fn __asm__ (ialias_ulp "gomp_ialias_" #fn) attribute_hidden;
|
|
# define ialias_call(fn) gomp_ialias_ ## fn
|
|
@@ -1131,4 +1217,42 @@ task_to_priority_node (enum priority_que
|
|
return (struct priority_node *) ((char *) task
|
|
+ priority_queue_offset (type));
|
|
}
|
|
+
|
|
+#ifdef LIBGOMP_USE_PTHREADS
|
|
+static inline gomp_thread_handle
|
|
+gomp_thread_self (void)
|
|
+{
|
|
+ return pthread_self ();
|
|
+}
|
|
+
|
|
+static inline gomp_thread_handle
|
|
+gomp_thread_to_pthread_t (struct gomp_thread *thr)
|
|
+{
|
|
+ struct gomp_thread *this_thr = gomp_thread ();
|
|
+ if (thr == this_thr)
|
|
+ return pthread_self ();
|
|
+#ifdef GOMP_NEEDS_THREAD_HANDLE
|
|
+ return thr->handle;
|
|
+#else
|
|
+ /* On Linux with initial-exec TLS, the pthread_t of the thread containing
|
|
+ thr can be computed from thr, this_thr and pthread_self (),
|
|
+ as the distance between this_thr and pthread_self () is constant. */
|
|
+ return pthread_self () + ((uintptr_t) thr - (uintptr_t) this_thr);
|
|
+#endif
|
|
+}
|
|
+#else
|
|
+static inline gomp_thread_handle
|
|
+gomp_thread_self (void)
|
|
+{
|
|
+ return (gomp_thread_handle) {};
|
|
+}
|
|
+
|
|
+static inline gomp_thread_handle
|
|
+gomp_thread_to_pthread_t (struct gomp_thread *thr)
|
|
+{
|
|
+ (void) thr;
|
|
+ return gomp_thread_self ();
|
|
+}
|
|
+#endif
|
|
+
|
|
#endif /* LIBGOMP_H */
|
|
--- libgomp/oacc-parallel.c.jj 2018-04-25 09:40:31.319655306 +0200
|
|
+++ libgomp/oacc-parallel.c 2019-05-07 19:09:47.010991153 +0200
|
|
@@ -27,6 +27,8 @@
|
|
/* This file handles OpenACC constructs. */
|
|
|
|
#include "openacc.h"
|
|
+void acc_copyout_finalize (void *, size_t) __GOACC_NOTHROW;
|
|
+void acc_delete_finalize (void *, size_t) __GOACC_NOTHROW;
|
|
#include "libgomp.h"
|
|
#include "libgomp_g.h"
|
|
#include "gomp-constants.h"
|
|
@@ -38,31 +40,95 @@
|
|
#include <stdarg.h>
|
|
#include <assert.h>
|
|
|
|
+
|
|
+/* In the ABI, the GOACC_FLAGs are encoded as an inverted bitmask, so that we
|
|
+ continue to support the following two legacy values. */
|
|
+_Static_assert (GOACC_FLAGS_UNMARSHAL (GOMP_DEVICE_ICV) == 0,
|
|
+ "legacy GOMP_DEVICE_ICV broken");
|
|
+_Static_assert (GOACC_FLAGS_UNMARSHAL (GOMP_DEVICE_HOST_FALLBACK)
|
|
+ == GOACC_FLAG_HOST_FALLBACK,
|
|
+ "legacy GOMP_DEVICE_HOST_FALLBACK broken");
|
|
+
|
|
+
|
|
+/* Returns the number of mappings associated with the pointer or PSET. A PSET
|
|
+ has three mappings, whereas a pointer has two. */
|
|
+
|
|
static int
|
|
-find_pset (int pos, size_t mapnum, unsigned short *kinds)
|
|
+find_pointer (int pos, size_t mapnum, unsigned short *kinds)
|
|
{
|
|
if (pos + 1 >= mapnum)
|
|
return 0;
|
|
|
|
unsigned char kind = kinds[pos+1] & 0xff;
|
|
|
|
- return kind == GOMP_MAP_TO_PSET;
|
|
+ if (kind == GOMP_MAP_TO_PSET)
|
|
+ return 3;
|
|
+ else if (kind == GOMP_MAP_POINTER)
|
|
+ return 2;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Handle the mapping pairs that are presented when a
|
|
+ deviceptr clause is used with Fortran. */
|
|
+
|
|
+static void
|
|
+handle_ftn_pointers (size_t mapnum, void **hostaddrs, size_t *sizes,
|
|
+ unsigned short *kinds)
|
|
+{
|
|
+ int i;
|
|
+
|
|
+ for (i = 0; i < mapnum; i++)
|
|
+ {
|
|
+ unsigned short kind1 = kinds[i] & 0xff;
|
|
+
|
|
+ /* Handle Fortran deviceptr clause. */
|
|
+ if (kind1 == GOMP_MAP_FORCE_DEVICEPTR)
|
|
+ {
|
|
+ unsigned short kind2;
|
|
+
|
|
+ if (i < (signed)mapnum - 1)
|
|
+ kind2 = kinds[i + 1] & 0xff;
|
|
+ else
|
|
+ kind2 = 0xffff;
|
|
+
|
|
+ if (sizes[i] == sizeof (void *))
|
|
+ continue;
|
|
+
|
|
+ /* At this point, we're dealing with a Fortran deviceptr.
|
|
+ If the next element is not what we're expecting, then
|
|
+ this is an instance of where the deviceptr variable was
|
|
+ not used within the region and the pointer was removed
|
|
+ by the gimplifier. */
|
|
+ if (kind2 == GOMP_MAP_POINTER
|
|
+ && sizes[i + 1] == 0
|
|
+ && hostaddrs[i] == *(void **)hostaddrs[i + 1])
|
|
+ {
|
|
+ kinds[i+1] = kinds[i];
|
|
+ sizes[i+1] = sizeof (void *);
|
|
+ }
|
|
+
|
|
+ /* Invalidate the entry. */
|
|
+ hostaddrs[i] = NULL;
|
|
+ }
|
|
+ }
|
|
}
|
|
|
|
static void goacc_wait (int async, int num_waits, va_list *ap);
|
|
|
|
|
|
-/* Launch a possibly offloaded function on DEVICE. FN is the host fn
|
|
+/* Launch a possibly offloaded function with FLAGS. FN is the host fn
|
|
address. MAPNUM, HOSTADDRS, SIZES & KINDS describe the memory
|
|
blocks to be copied to/from the device. Varadic arguments are
|
|
keyed optional parameters terminated with a zero. */
|
|
|
|
void
|
|
-GOACC_parallel_keyed (int device, void (*fn) (void *),
|
|
+GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
|
|
size_t mapnum, void **hostaddrs, size_t *sizes,
|
|
unsigned short *kinds, ...)
|
|
{
|
|
- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
|
|
+ int flags = GOACC_FLAGS_UNMARSHAL (flags_m);
|
|
+
|
|
va_list ap;
|
|
struct goacc_thread *thr;
|
|
struct gomp_device_descr *acc_dev;
|
|
@@ -88,9 +154,11 @@ GOACC_parallel_keyed (int device, void (
|
|
thr = goacc_thread ();
|
|
acc_dev = thr->dev;
|
|
|
|
+ handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
|
|
+
|
|
/* Host fallback if "if" clause is false or if the current device is set to
|
|
the host. */
|
|
- if (host_fallback)
|
|
+ if (flags & GOACC_FLAG_HOST_FALLBACK)
|
|
{
|
|
goacc_save_and_set_bind (acc_device_host);
|
|
fn (hostaddrs);
|
|
@@ -140,9 +208,7 @@ GOACC_parallel_keyed (int device, void (
|
|
case GOMP_LAUNCH_WAIT:
|
|
{
|
|
unsigned num_waits = GOMP_LAUNCH_OP (tag);
|
|
-
|
|
- if (num_waits)
|
|
- goacc_wait (async, num_waits, &ap);
|
|
+ goacc_wait (async, num_waits, &ap);
|
|
break;
|
|
}
|
|
|
|
@@ -177,16 +243,36 @@ GOACC_parallel_keyed (int device, void (
|
|
devaddrs = gomp_alloca (sizeof (void *) * mapnum);
|
|
for (i = 0; i < mapnum; i++)
|
|
devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
|
|
- + tgt->list[i].key->tgt_offset);
|
|
+ + tgt->list[i].key->tgt_offset
|
|
+ + tgt->list[i].offset);
|
|
|
|
acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
|
|
async, dims, tgt);
|
|
|
|
/* If running synchronously, unmap immediately. */
|
|
- if (async < acc_async_noval)
|
|
+ bool copyfrom = true;
|
|
+ if (async_synchronous_p (async))
|
|
gomp_unmap_vars (tgt, true);
|
|
else
|
|
- tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
|
|
+ {
|
|
+ bool async_unmap = false;
|
|
+ for (size_t i = 0; i < tgt->list_count; i++)
|
|
+ {
|
|
+ splay_tree_key k = tgt->list[i].key;
|
|
+ if (k && k->refcount == 1)
|
|
+ {
|
|
+ async_unmap = true;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ if (async_unmap)
|
|
+ tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
|
|
+ else
|
|
+ {
|
|
+ copyfrom = false;
|
|
+ gomp_unmap_vars (tgt, copyfrom);
|
|
+ }
|
|
+ }
|
|
|
|
acc_dev->openacc.async_set_async_func (acc_async_sync);
|
|
}
|
|
@@ -194,7 +280,7 @@ GOACC_parallel_keyed (int device, void (
|
|
/* Legacy entry point, only provide host execution. */
|
|
|
|
void
|
|
-GOACC_parallel (int device, void (*fn) (void *),
|
|
+GOACC_parallel (int flags_m, void (*fn) (void *),
|
|
size_t mapnum, void **hostaddrs, size_t *sizes,
|
|
unsigned short *kinds,
|
|
int num_gangs, int num_workers, int vector_length,
|
|
@@ -206,10 +292,11 @@ GOACC_parallel (int device, void (*fn) (
|
|
}
|
|
|
|
void
|
|
-GOACC_data_start (int device, size_t mapnum,
|
|
+GOACC_data_start (int flags_m, size_t mapnum,
|
|
void **hostaddrs, size_t *sizes, unsigned short *kinds)
|
|
{
|
|
- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
|
|
+ int flags = GOACC_FLAGS_UNMARSHAL (flags_m);
|
|
+
|
|
struct target_mem_desc *tgt;
|
|
|
|
#ifdef HAVE_INTTYPES_H
|
|
@@ -227,7 +314,7 @@ GOACC_data_start (int device, size_t map
|
|
|
|
/* Host fallback or 'do nothing'. */
|
|
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
|
- || host_fallback)
|
|
+ || (flags & GOACC_FLAG_HOST_FALLBACK))
|
|
{
|
|
tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true,
|
|
GOMP_MAP_VARS_OPENACC);
|
|
@@ -258,13 +345,14 @@ GOACC_data_end (void)
|
|
}
|
|
|
|
void
|
|
-GOACC_enter_exit_data (int device, size_t mapnum,
|
|
+GOACC_enter_exit_data (int flags_m, size_t mapnum,
|
|
void **hostaddrs, size_t *sizes, unsigned short *kinds,
|
|
int async, int num_waits, ...)
|
|
{
|
|
+ int flags = GOACC_FLAGS_UNMARSHAL (flags_m);
|
|
+
|
|
struct goacc_thread *thr;
|
|
struct gomp_device_descr *acc_dev;
|
|
- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
|
|
bool data_enter = false;
|
|
size_t i;
|
|
|
|
@@ -274,7 +362,7 @@ GOACC_enter_exit_data (int device, size_
|
|
acc_dev = thr->dev;
|
|
|
|
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
|
- || host_fallback)
|
|
+ || (flags & GOACC_FLAG_HOST_FALLBACK))
|
|
return;
|
|
|
|
if (num_waits)
|
|
@@ -286,6 +374,17 @@ GOACC_enter_exit_data (int device, size_
|
|
va_end (ap);
|
|
}
|
|
|
|
+ /* Determine whether "finalize" semantics apply to all mappings of this
|
|
+ OpenACC directive. */
|
|
+ bool finalize = false;
|
|
+ if (mapnum > 0)
|
|
+ {
|
|
+ unsigned char kind = kinds[0] & 0xff;
|
|
+ if (kind == GOMP_MAP_DELETE
|
|
+ || kind == GOMP_MAP_FORCE_FROM)
|
|
+ finalize = true;
|
|
+ }
|
|
+
|
|
acc_dev->openacc.async_set_async_func (async);
|
|
|
|
/* Determine if this is an "acc enter data". */
|
|
@@ -298,13 +397,17 @@ GOACC_enter_exit_data (int device, size_
|
|
|
|
if (kind == GOMP_MAP_FORCE_ALLOC
|
|
|| kind == GOMP_MAP_FORCE_PRESENT
|
|
- || kind == GOMP_MAP_FORCE_TO)
|
|
+ || kind == GOMP_MAP_FORCE_TO
|
|
+ || kind == GOMP_MAP_TO
|
|
+ || kind == GOMP_MAP_ALLOC)
|
|
{
|
|
data_enter = true;
|
|
break;
|
|
}
|
|
|
|
- if (kind == GOMP_MAP_DELETE
|
|
+ if (kind == GOMP_MAP_RELEASE
|
|
+ || kind == GOMP_MAP_DELETE
|
|
+ || kind == GOMP_MAP_FROM
|
|
|| kind == GOMP_MAP_FORCE_FROM)
|
|
break;
|
|
|
|
@@ -312,31 +415,35 @@ GOACC_enter_exit_data (int device, size_
|
|
kind);
|
|
}
|
|
|
|
+ /* In C, non-pointers and arrays are represented by a single data clause.
|
|
+ Dynamically allocated arrays and subarrays are represented by a data
|
|
+ clause followed by an internal GOMP_MAP_POINTER.
|
|
+
|
|
+ In Fortran, scalars and not allocated arrays are represented by a
|
|
+ single data clause. Allocated arrays and subarrays have three mappings:
|
|
+ 1) the original data clause, 2) a PSET 3) a pointer to the array data.
|
|
+ */
|
|
+
|
|
if (data_enter)
|
|
{
|
|
for (i = 0; i < mapnum; i++)
|
|
{
|
|
unsigned char kind = kinds[i] & 0xff;
|
|
|
|
- /* Scan for PSETs. */
|
|
- int psets = find_pset (i, mapnum, kinds);
|
|
+ /* Scan for pointers and PSETs. */
|
|
+ int pointer = find_pointer (i, mapnum, kinds);
|
|
|
|
- if (!psets)
|
|
+ if (!pointer)
|
|
{
|
|
switch (kind)
|
|
{
|
|
- case GOMP_MAP_POINTER:
|
|
- gomp_acc_insert_pointer (1, &hostaddrs[i], &sizes[i],
|
|
- &kinds[i]);
|
|
- break;
|
|
+ case GOMP_MAP_ALLOC:
|
|
case GOMP_MAP_FORCE_ALLOC:
|
|
acc_create (hostaddrs[i], sizes[i]);
|
|
break;
|
|
- case GOMP_MAP_FORCE_PRESENT:
|
|
- acc_present_or_copyin (hostaddrs[i], sizes[i]);
|
|
- break;
|
|
+ case GOMP_MAP_TO:
|
|
case GOMP_MAP_FORCE_TO:
|
|
- acc_present_or_copyin (hostaddrs[i], sizes[i]);
|
|
+ acc_copyin (hostaddrs[i], sizes[i]);
|
|
break;
|
|
default:
|
|
gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
|
|
@@ -346,12 +453,13 @@ GOACC_enter_exit_data (int device, size_
|
|
}
|
|
else
|
|
{
|
|
- gomp_acc_insert_pointer (3, &hostaddrs[i], &sizes[i], &kinds[i]);
|
|
+ gomp_acc_insert_pointer (pointer, &hostaddrs[i],
|
|
+ &sizes[i], &kinds[i]);
|
|
/* Increment 'i' by two because OpenACC requires fortran
|
|
arrays to be contiguous, so each PSET is associated with
|
|
one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
|
|
one MAP_POINTER. */
|
|
- i += 2;
|
|
+ i += pointer - 1;
|
|
}
|
|
}
|
|
}
|
|
@@ -360,22 +468,28 @@ GOACC_enter_exit_data (int device, size_
|
|
{
|
|
unsigned char kind = kinds[i] & 0xff;
|
|
|
|
- int psets = find_pset (i, mapnum, kinds);
|
|
+ int pointer = find_pointer (i, mapnum, kinds);
|
|
|
|
- if (!psets)
|
|
+ if (!pointer)
|
|
{
|
|
switch (kind)
|
|
{
|
|
- case GOMP_MAP_POINTER:
|
|
- gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff)
|
|
- == GOMP_MAP_FORCE_FROM,
|
|
- async, 1);
|
|
- break;
|
|
+ case GOMP_MAP_RELEASE:
|
|
case GOMP_MAP_DELETE:
|
|
- acc_delete (hostaddrs[i], sizes[i]);
|
|
+ if (acc_is_present (hostaddrs[i], sizes[i]))
|
|
+ {
|
|
+ if (finalize)
|
|
+ acc_delete_finalize (hostaddrs[i], sizes[i]);
|
|
+ else
|
|
+ acc_delete (hostaddrs[i], sizes[i]);
|
|
+ }
|
|
break;
|
|
+ case GOMP_MAP_FROM:
|
|
case GOMP_MAP_FORCE_FROM:
|
|
- acc_copyout (hostaddrs[i], sizes[i]);
|
|
+ if (finalize)
|
|
+ acc_copyout_finalize (hostaddrs[i], sizes[i]);
|
|
+ else
|
|
+ acc_copyout (hostaddrs[i], sizes[i]);
|
|
break;
|
|
default:
|
|
gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
|
|
@@ -385,10 +499,12 @@ GOACC_enter_exit_data (int device, size_
|
|
}
|
|
else
|
|
{
|
|
- gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff)
|
|
- == GOMP_MAP_FORCE_FROM, async, 3);
|
|
+ bool copyfrom = (kind == GOMP_MAP_FORCE_FROM
|
|
+ || kind == GOMP_MAP_FROM);
|
|
+ gomp_acc_remove_pointer (hostaddrs[i], sizes[i], copyfrom, async,
|
|
+ finalize, pointer);
|
|
/* See the above comment. */
|
|
- i += 2;
|
|
+ i += pointer - 1;
|
|
}
|
|
}
|
|
|
|
@@ -398,13 +514,20 @@ GOACC_enter_exit_data (int device, size_
|
|
static void
|
|
goacc_wait (int async, int num_waits, va_list *ap)
|
|
{
|
|
- struct goacc_thread *thr = goacc_thread ();
|
|
- struct gomp_device_descr *acc_dev = thr->dev;
|
|
-
|
|
while (num_waits--)
|
|
{
|
|
int qid = va_arg (*ap, int);
|
|
-
|
|
+
|
|
+ /* Waiting on ACC_ASYNC_NOVAL maps to 'wait all'. */
|
|
+ if (qid == acc_async_noval)
|
|
+ {
|
|
+ if (async == acc_async_sync)
|
|
+ acc_wait_all ();
|
|
+ else
|
|
+ acc_wait_all_async (async);
|
|
+ break;
|
|
+ }
|
|
+
|
|
if (acc_async_test (qid))
|
|
continue;
|
|
|
|
@@ -415,16 +538,17 @@ goacc_wait (int async, int num_waits, va
|
|
launching on, the queue itself will order work as
|
|
required, so there's no need to wait explicitly. */
|
|
else
|
|
- acc_dev->openacc.async_wait_async_func (qid, async);
|
|
+ acc_wait_async (qid, async);
|
|
}
|
|
}
|
|
|
|
void
|
|
-GOACC_update (int device, size_t mapnum,
|
|
+GOACC_update (int flags_m, size_t mapnum,
|
|
void **hostaddrs, size_t *sizes, unsigned short *kinds,
|
|
int async, int num_waits, ...)
|
|
{
|
|
- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
|
|
+ int flags = GOACC_FLAGS_UNMARSHAL (flags_m);
|
|
+
|
|
size_t i;
|
|
|
|
goacc_lazy_initialize ();
|
|
@@ -433,7 +557,7 @@ GOACC_update (int device, size_t mapnum,
|
|
struct gomp_device_descr *acc_dev = thr->dev;
|
|
|
|
if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
|
- || host_fallback)
|
|
+ || (flags & GOACC_FLAG_HOST_FALLBACK))
|
|
return;
|
|
|
|
if (num_waits)
|
|
@@ -447,6 +571,7 @@ GOACC_update (int device, size_t mapnum,
|
|
|
|
acc_dev->openacc.async_set_async_func (async);
|
|
|
|
+ bool update_device = false;
|
|
for (i = 0; i < mapnum; ++i)
|
|
{
|
|
unsigned char kind = kinds[i] & 0xff;
|
|
@@ -457,11 +582,46 @@ GOACC_update (int device, size_t mapnum,
|
|
case GOMP_MAP_TO_PSET:
|
|
break;
|
|
|
|
+ case GOMP_MAP_ALWAYS_POINTER:
|
|
+ if (update_device)
|
|
+ {
|
|
+ /* Save the contents of the host pointer. */
|
|
+ void *dptr = acc_deviceptr (hostaddrs[i-1]);
|
|
+ uintptr_t t = *(uintptr_t *) hostaddrs[i];
|
|
+
|
|
+ /* Update the contents of the host pointer to reflect
|
|
+ the value of the allocated device memory in the
|
|
+ previous pointer. */
|
|
+ *(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr;
|
|
+ acc_update_device (hostaddrs[i], sizeof (uintptr_t));
|
|
+
|
|
+ /* Restore the host pointer. */
|
|
+ *(uintptr_t *) hostaddrs[i] = t;
|
|
+ update_device = false;
|
|
+ }
|
|
+ break;
|
|
+
|
|
+ case GOMP_MAP_TO:
|
|
+ if (!acc_is_present (hostaddrs[i], sizes[i]))
|
|
+ {
|
|
+ update_device = false;
|
|
+ break;
|
|
+ }
|
|
+ /* Fallthru */
|
|
case GOMP_MAP_FORCE_TO:
|
|
+ update_device = true;
|
|
acc_update_device (hostaddrs[i], sizes[i]);
|
|
break;
|
|
|
|
+ case GOMP_MAP_FROM:
|
|
+ if (!acc_is_present (hostaddrs[i], sizes[i]))
|
|
+ {
|
|
+ update_device = false;
|
|
+ break;
|
|
+ }
|
|
+ /* Fallthru */
|
|
case GOMP_MAP_FORCE_FROM:
|
|
+ update_device = false;
|
|
acc_update_self (hostaddrs[i], sizes[i]);
|
|
break;
|
|
|
|
@@ -487,8 +647,8 @@ GOACC_wait (int async, int num_waits, ..
|
|
}
|
|
else if (async == acc_async_sync)
|
|
acc_wait_all ();
|
|
- else if (async == acc_async_noval)
|
|
- goacc_thread ()->dev->openacc.async_wait_all_async_func (acc_async_noval);
|
|
+ else
|
|
+ acc_wait_all_async (async);
|
|
}
|
|
|
|
int
|
|
@@ -504,7 +664,7 @@ GOACC_get_thread_num (void)
|
|
}
|
|
|
|
void
|
|
-GOACC_declare (int device, size_t mapnum,
|
|
+GOACC_declare (int flags_m, size_t mapnum,
|
|
void **hostaddrs, size_t *sizes, unsigned short *kinds)
|
|
{
|
|
int i;
|
|
@@ -522,9 +682,10 @@ GOACC_declare (int device, size_t mapnum
|
|
case GOMP_MAP_FORCE_FROM:
|
|
case GOMP_MAP_FORCE_TO:
|
|
case GOMP_MAP_POINTER:
|
|
+ case GOMP_MAP_RELEASE:
|
|
case GOMP_MAP_DELETE:
|
|
- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
|
|
- &kinds[i], 0, 0);
|
|
+ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i],
|
|
+ &kinds[i], GOMP_ASYNC_SYNC, 0);
|
|
break;
|
|
|
|
case GOMP_MAP_FORCE_DEVICEPTR:
|
|
@@ -532,20 +693,19 @@ GOACC_declare (int device, size_t mapnum
|
|
|
|
case GOMP_MAP_ALLOC:
|
|
if (!acc_is_present (hostaddrs[i], sizes[i]))
|
|
- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
|
|
- &kinds[i], 0, 0);
|
|
+ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i],
|
|
+ &kinds[i], GOMP_ASYNC_SYNC, 0);
|
|
break;
|
|
|
|
case GOMP_MAP_TO:
|
|
- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
|
|
- &kinds[i], 0, 0);
|
|
+ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i],
|
|
+ &kinds[i], GOMP_ASYNC_SYNC, 0);
|
|
|
|
break;
|
|
|
|
case GOMP_MAP_FROM:
|
|
- kinds[i] = GOMP_MAP_FORCE_FROM;
|
|
- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
|
|
- &kinds[i], 0, 0);
|
|
+ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i],
|
|
+ &kinds[i], GOMP_ASYNC_SYNC, 0);
|
|
break;
|
|
|
|
case GOMP_MAP_FORCE_PRESENT:
|
|
--- libgomp/openacc2.f90.jj 2019-05-07 19:54:18.828514375 +0200
|
|
+++ libgomp/openacc2.f90 2019-05-07 19:56:38.454296347 +0200
|
|
@@ -0,0 +1,1502 @@
|
|
+! OpenACC Runtime Library Definitions.
|
|
+
|
|
+! Copyright (C) 2014-2019 Free Software Foundation, Inc.
|
|
+
|
|
+! Contributed by Tobias Burnus <burnus@net-b.de>
|
|
+! and Mentor Embedded.
|
|
+
|
|
+! This file is part of the GNU Offloading and Multi Processing Library
|
|
+! (libgomp).
|
|
+
|
|
+! Libgomp is free software; you can redistribute it and/or modify it
|
|
+! under the terms of the GNU General Public License as published by
|
|
+! the Free Software Foundation; either version 3, or (at your option)
|
|
+! any later version.
|
|
+
|
|
+! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
+! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
+! FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
+! more details.
|
|
+
|
|
+! Under Section 7 of GPL version 3, you are granted additional
|
|
+! permissions described in the GCC Runtime Library Exception, version
|
|
+! 3.1, as published by the Free Software Foundation.
|
|
+
|
|
+! You should have received a copy of the GNU General Public License and
|
|
+! a copy of the GCC Runtime Library Exception along with this program;
|
|
+! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
+! <http://www.gnu.org/licenses/>.
|
|
+
|
|
+module openacc_kinds2
|
|
+ use iso_fortran_env, only: int32
|
|
+ implicit none
|
|
+
|
|
+ private :: int32
|
|
+ public :: acc_device_kind
|
|
+
|
|
+ integer, parameter :: acc_device_kind = int32
|
|
+
|
|
+ public :: acc_device_none, acc_device_default, acc_device_host
|
|
+ public :: acc_device_not_host, acc_device_nvidia
|
|
+
|
|
+ ! Keep in sync with include/gomp-constants.h.
|
|
+ integer (acc_device_kind), parameter :: acc_device_none = 0
|
|
+ integer (acc_device_kind), parameter :: acc_device_default = 1
|
|
+ integer (acc_device_kind), parameter :: acc_device_host = 2
|
|
+ ! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed.
|
|
+ integer (acc_device_kind), parameter :: acc_device_not_host = 4
|
|
+ integer (acc_device_kind), parameter :: acc_device_nvidia = 5
|
|
+
|
|
+ public :: acc_handle_kind
|
|
+
|
|
+ integer, parameter :: acc_handle_kind = int32
|
|
+
|
|
+ public :: acc_async_noval, acc_async_sync
|
|
+
|
|
+ ! Keep in sync with include/gomp-constants.h.
|
|
+ integer (acc_handle_kind), parameter :: acc_async_noval = -1
|
|
+ integer (acc_handle_kind), parameter :: acc_async_sync = -2
|
|
+
|
|
+end module
|
|
+
|
|
+module openacc_internal2
|
|
+ use openacc_kinds2
|
|
+ implicit none
|
|
+
|
|
+ interface
|
|
+ function acc_get_num_devices_h (d)
|
|
+ import
|
|
+ integer acc_get_num_devices_h
|
|
+ integer (acc_device_kind) d
|
|
+ end function
|
|
+
|
|
+ subroutine acc_set_device_type_h (d)
|
|
+ import
|
|
+ integer (acc_device_kind) d
|
|
+ end subroutine
|
|
+
|
|
+ function acc_get_device_type_h ()
|
|
+ import
|
|
+ integer (acc_device_kind) acc_get_device_type_h
|
|
+ end function
|
|
+
|
|
+ subroutine acc_set_device_num_h (n, d)
|
|
+ import
|
|
+ integer n
|
|
+ integer (acc_device_kind) d
|
|
+ end subroutine
|
|
+
|
|
+ function acc_get_device_num_h (d)
|
|
+ import
|
|
+ integer acc_get_device_num_h
|
|
+ integer (acc_device_kind) d
|
|
+ end function
|
|
+
|
|
+ function acc_async_test_h (a)
|
|
+ logical acc_async_test_h
|
|
+ integer a
|
|
+ end function
|
|
+
|
|
+ function acc_async_test_all_h ()
|
|
+ logical acc_async_test_all_h
|
|
+ end function
|
|
+
|
|
+ subroutine acc_wait_h (a)
|
|
+ integer a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_wait_async_h (a1, a2)
|
|
+ integer a1, a2
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_wait_all_h ()
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_wait_all_async_h (a)
|
|
+ integer a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_init_h (d)
|
|
+ import
|
|
+ integer (acc_device_kind) d
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_shutdown_h (d)
|
|
+ import
|
|
+ integer (acc_device_kind) d
|
|
+ end subroutine
|
|
+
|
|
+ function acc_on_device_h (d)
|
|
+ import
|
|
+ integer (acc_device_kind) d
|
|
+ logical acc_on_device_h
|
|
+ end function
|
|
+
|
|
+ subroutine acc_copyin_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyin_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyin_array_h (a)
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_present_or_copyin_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_present_or_copyin_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_present_or_copyin_array_h (a)
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_create_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_create_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_create_array_h (a)
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_present_or_create_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_present_or_create_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_present_or_create_array_h (a)
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_array_h (a)
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_finalize_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_finalize_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_finalize_array_h (a)
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_array_h (a)
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_finalize_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_finalize_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_finalize_array_h (a)
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_device_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_device_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_device_array_h (a)
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_self_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_self_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_self_array_h (a)
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ end subroutine
|
|
+
|
|
+ function acc_is_present_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ logical acc_is_present_32_h
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ end function
|
|
+
|
|
+ function acc_is_present_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ logical acc_is_present_64_h
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ end function
|
|
+
|
|
+ function acc_is_present_array_h (a)
|
|
+ logical acc_is_present_array_h
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ end function
|
|
+
|
|
+ subroutine acc_copyin_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyin_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyin_async_array_h (a, async)
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_create_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_create_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_create_async_array_h (a, async)
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_async_array_h (a, async)
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_async_array_h (a, async)
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_device_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_device_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_device_async_array_h (a, async)
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_self_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_self_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_self_async_array_h (a, async)
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ end subroutine
|
|
+ end interface
|
|
+
|
|
+ interface
|
|
+ function acc_get_num_devices_l (d) &
|
|
+ bind (C, name = "acc_get_num_devices")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int) :: acc_get_num_devices_l
|
|
+ integer (c_int), value :: d
|
|
+ end function
|
|
+
|
|
+ subroutine acc_set_device_type_l (d) &
|
|
+ bind (C, name = "acc_set_device_type")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int), value :: d
|
|
+ end subroutine
|
|
+
|
|
+ function acc_get_device_type_l () &
|
|
+ bind (C, name = "acc_get_device_type")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int) :: acc_get_device_type_l
|
|
+ end function
|
|
+
|
|
+ subroutine acc_set_device_num_l (n, d) &
|
|
+ bind (C, name = "acc_set_device_num")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int), value :: n, d
|
|
+ end subroutine
|
|
+
|
|
+ function acc_get_device_num_l (d) &
|
|
+ bind (C, name = "acc_get_device_num")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int) :: acc_get_device_num_l
|
|
+ integer (c_int), value :: d
|
|
+ end function
|
|
+
|
|
+ function acc_async_test_l (a) &
|
|
+ bind (C, name = "acc_async_test")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int) :: acc_async_test_l
|
|
+ integer (c_int), value :: a
|
|
+ end function
|
|
+
|
|
+ function acc_async_test_all_l () &
|
|
+ bind (C, name = "acc_async_test_all")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int) :: acc_async_test_all_l
|
|
+ end function
|
|
+
|
|
+ subroutine acc_wait_l (a) &
|
|
+ bind (C, name = "acc_wait")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int), value :: a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_wait_async_l (a1, a2) &
|
|
+ bind (C, name = "acc_wait_async")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int), value :: a1, a2
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_wait_all_l () &
|
|
+ bind (C, name = "acc_wait_all")
|
|
+ use iso_c_binding, only: c_int
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_wait_all_async_l (a) &
|
|
+ bind (C, name = "acc_wait_all_async")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int), value :: a
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_init_l (d) &
|
|
+ bind (C, name = "acc_init")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int), value :: d
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_shutdown_l (d) &
|
|
+ bind (C, name = "acc_shutdown")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int), value :: d
|
|
+ end subroutine
|
|
+
|
|
+ function acc_on_device_l (d) &
|
|
+ bind (C, name = "acc_on_device")
|
|
+ use iso_c_binding, only: c_int
|
|
+ integer (c_int) :: acc_on_device_l
|
|
+ integer (c_int), value :: d
|
|
+ end function
|
|
+
|
|
+ subroutine acc_copyin_l (a, len) &
|
|
+ bind (C, name = "acc_copyin")
|
|
+ use iso_c_binding, only: c_size_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_present_or_copyin_l (a, len) &
|
|
+ bind (C, name = "acc_present_or_copyin")
|
|
+ use iso_c_binding, only: c_size_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_create_l (a, len) &
|
|
+ bind (C, name = "acc_create")
|
|
+ use iso_c_binding, only: c_size_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_present_or_create_l (a, len) &
|
|
+ bind (C, name = "acc_present_or_create")
|
|
+ use iso_c_binding, only: c_size_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_l (a, len) &
|
|
+ bind (C, name = "acc_copyout")
|
|
+ use iso_c_binding, only: c_size_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_finalize_l (a, len) &
|
|
+ bind (C, name = "acc_copyout_finalize")
|
|
+ use iso_c_binding, only: c_size_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_l (a, len) &
|
|
+ bind (C, name = "acc_delete")
|
|
+ use iso_c_binding, only: c_size_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_finalize_l (a, len) &
|
|
+ bind (C, name = "acc_delete_finalize")
|
|
+ use iso_c_binding, only: c_size_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_device_l (a, len) &
|
|
+ bind (C, name = "acc_update_device")
|
|
+ use iso_c_binding, only: c_size_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_self_l (a, len) &
|
|
+ bind (C, name = "acc_update_self")
|
|
+ use iso_c_binding, only: c_size_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ end subroutine
|
|
+
|
|
+ function acc_is_present_l (a, len) &
|
|
+ bind (C, name = "acc_is_present")
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ integer (c_int32_t) :: acc_is_present_l
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ end function
|
|
+
|
|
+ subroutine acc_copyin_async_l (a, len, async) &
|
|
+ bind (C, name = "acc_copyin_async")
|
|
+ use iso_c_binding, only: c_size_t, c_int
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ integer (c_int), value :: async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_create_async_l (a, len, async) &
|
|
+ bind (C, name = "acc_create_async")
|
|
+ use iso_c_binding, only: c_size_t, c_int
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ integer (c_int), value :: async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_copyout_async_l (a, len, async) &
|
|
+ bind (C, name = "acc_copyout_async")
|
|
+ use iso_c_binding, only: c_size_t, c_int
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ integer (c_int), value :: async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_delete_async_l (a, len, async) &
|
|
+ bind (C, name = "acc_delete_async")
|
|
+ use iso_c_binding, only: c_size_t, c_int
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ integer (c_int), value :: async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_device_async_l (a, len, async) &
|
|
+ bind (C, name = "acc_update_device_async")
|
|
+ use iso_c_binding, only: c_size_t, c_int
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ integer (c_int), value :: async
|
|
+ end subroutine
|
|
+
|
|
+ subroutine acc_update_self_async_l (a, len, async) &
|
|
+ bind (C, name = "acc_update_self_async")
|
|
+ use iso_c_binding, only: c_size_t, c_int
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_size_t), value :: len
|
|
+ integer (c_int), value :: async
|
|
+ end subroutine
|
|
+ end interface
|
|
+end module
|
|
+
|
|
+module openacc2
|
|
+ use openacc_kinds2
|
|
+ use openacc_internal2
|
|
+ implicit none
|
|
+
|
|
+ public :: openacc_version
|
|
+
|
|
+ public :: acc_get_num_devices, acc_set_device_type, acc_get_device_type
|
|
+ public :: acc_set_device_num, acc_get_device_num, acc_async_test
|
|
+ public :: acc_async_test_all
|
|
+ public :: acc_wait, acc_async_wait, acc_wait_async
|
|
+ public :: acc_wait_all, acc_async_wait_all, acc_wait_all_async
|
|
+ public :: acc_init, acc_shutdown, acc_on_device
|
|
+ public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create
|
|
+ public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete
|
|
+ public :: acc_update_device, acc_update_self, acc_is_present
|
|
+ public :: acc_copyin_async, acc_create_async, acc_copyout_async
|
|
+ public :: acc_delete_async, acc_update_device_async, acc_update_self_async
|
|
+
|
|
+ integer, parameter :: openacc_version = 201306
|
|
+
|
|
+ interface acc_get_num_devices
|
|
+ procedure :: acc_get_num_devices_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_set_device_type
|
|
+ procedure :: acc_set_device_type_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_get_device_type
|
|
+ procedure :: acc_get_device_type_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_set_device_num
|
|
+ procedure :: acc_set_device_num_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_get_device_num
|
|
+ procedure :: acc_get_device_num_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_async_test
|
|
+ procedure :: acc_async_test_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_async_test_all
|
|
+ procedure :: acc_async_test_all_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_wait
|
|
+ procedure :: acc_wait_h
|
|
+ end interface
|
|
+
|
|
+ ! acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait.
|
|
+ interface acc_async_wait
|
|
+ procedure :: acc_wait_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_wait_async
|
|
+ procedure :: acc_wait_async_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_wait_all
|
|
+ procedure :: acc_wait_all_h
|
|
+ end interface
|
|
+
|
|
+ ! acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all.
|
|
+ interface acc_async_wait_all
|
|
+ procedure :: acc_wait_all_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_wait_all_async
|
|
+ procedure :: acc_wait_all_async_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_init
|
|
+ procedure :: acc_init_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_shutdown
|
|
+ procedure :: acc_shutdown_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_on_device
|
|
+ procedure :: acc_on_device_h
|
|
+ end interface
|
|
+
|
|
+ ! acc_malloc: Only available in C/C++
|
|
+ ! acc_free: Only available in C/C++
|
|
+
|
|
+ ! As vendor extension, the following code supports both 32bit and 64bit
|
|
+ ! arguments for "size"; the OpenACC standard only permits default-kind
|
|
+ ! integers, which are of kind 4 (i.e. 32 bits).
|
|
+ ! Additionally, the two-argument version also takes arrays as argument,
|
|
+ ! and the one-argument version also accepts scalars. Note that the code assumes
|
|
+ ! that the arrays are contiguous.
|
|
+
|
|
+ interface acc_copyin
|
|
+ procedure :: acc_copyin_32_h
|
|
+ procedure :: acc_copyin_64_h
|
|
+ procedure :: acc_copyin_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_present_or_copyin
|
|
+ procedure :: acc_present_or_copyin_32_h
|
|
+ procedure :: acc_present_or_copyin_64_h
|
|
+ procedure :: acc_present_or_copyin_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_pcopyin
|
|
+ procedure :: acc_present_or_copyin_32_h
|
|
+ procedure :: acc_present_or_copyin_64_h
|
|
+ procedure :: acc_present_or_copyin_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_create
|
|
+ procedure :: acc_create_32_h
|
|
+ procedure :: acc_create_64_h
|
|
+ procedure :: acc_create_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_present_or_create
|
|
+ procedure :: acc_present_or_create_32_h
|
|
+ procedure :: acc_present_or_create_64_h
|
|
+ procedure :: acc_present_or_create_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_pcreate
|
|
+ procedure :: acc_present_or_create_32_h
|
|
+ procedure :: acc_present_or_create_64_h
|
|
+ procedure :: acc_present_or_create_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_copyout
|
|
+ procedure :: acc_copyout_32_h
|
|
+ procedure :: acc_copyout_64_h
|
|
+ procedure :: acc_copyout_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_copyout_finalize
|
|
+ procedure :: acc_copyout_finalize_32_h
|
|
+ procedure :: acc_copyout_finalize_64_h
|
|
+ procedure :: acc_copyout_finalize_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_delete
|
|
+ procedure :: acc_delete_32_h
|
|
+ procedure :: acc_delete_64_h
|
|
+ procedure :: acc_delete_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_delete_finalize
|
|
+ procedure :: acc_delete_finalize_32_h
|
|
+ procedure :: acc_delete_finalize_64_h
|
|
+ procedure :: acc_delete_finalize_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_update_device
|
|
+ procedure :: acc_update_device_32_h
|
|
+ procedure :: acc_update_device_64_h
|
|
+ procedure :: acc_update_device_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_update_self
|
|
+ procedure :: acc_update_self_32_h
|
|
+ procedure :: acc_update_self_64_h
|
|
+ procedure :: acc_update_self_array_h
|
|
+ end interface
|
|
+
|
|
+ ! acc_map_data: Only available in C/C++
|
|
+ ! acc_unmap_data: Only available in C/C++
|
|
+ ! acc_deviceptr: Only available in C/C++
|
|
+ ! acc_hostptr: Only available in C/C++
|
|
+
|
|
+ interface acc_is_present
|
|
+ procedure :: acc_is_present_32_h
|
|
+ procedure :: acc_is_present_64_h
|
|
+ procedure :: acc_is_present_array_h
|
|
+ end interface
|
|
+
|
|
+ ! acc_memcpy_to_device: Only available in C/C++
|
|
+ ! acc_memcpy_from_device: Only available in C/C++
|
|
+
|
|
+ interface acc_copyin_async
|
|
+ procedure :: acc_copyin_async_32_h
|
|
+ procedure :: acc_copyin_async_64_h
|
|
+ procedure :: acc_copyin_async_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_create_async
|
|
+ procedure :: acc_create_async_32_h
|
|
+ procedure :: acc_create_async_64_h
|
|
+ procedure :: acc_create_async_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_copyout_async
|
|
+ procedure :: acc_copyout_async_32_h
|
|
+ procedure :: acc_copyout_async_64_h
|
|
+ procedure :: acc_copyout_async_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_delete_async
|
|
+ procedure :: acc_delete_async_32_h
|
|
+ procedure :: acc_delete_async_64_h
|
|
+ procedure :: acc_delete_async_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_update_device_async
|
|
+ procedure :: acc_update_device_async_32_h
|
|
+ procedure :: acc_update_device_async_64_h
|
|
+ procedure :: acc_update_device_async_array_h
|
|
+ end interface
|
|
+
|
|
+ interface acc_update_self_async
|
|
+ procedure :: acc_update_self_async_32_h
|
|
+ procedure :: acc_update_self_async_64_h
|
|
+ procedure :: acc_update_self_async_array_h
|
|
+ end interface
|
|
+
|
|
+end module
|
|
+
|
|
+function acc_get_num_devices_h (d)
|
|
+ use openacc_internal2, only: acc_get_num_devices_l
|
|
+ use openacc_kinds2
|
|
+ integer acc_get_num_devices_h
|
|
+ integer (acc_device_kind) d
|
|
+ acc_get_num_devices_h = acc_get_num_devices_l (d)
|
|
+end function
|
|
+
|
|
+subroutine acc_set_device_type_h (d)
|
|
+ use openacc_internal2, only: acc_set_device_type_l
|
|
+ use openacc_kinds2
|
|
+ integer (acc_device_kind) d
|
|
+ call acc_set_device_type_l (d)
|
|
+end subroutine
|
|
+
|
|
+function acc_get_device_type_h ()
|
|
+ use openacc_internal2, only: acc_get_device_type_l
|
|
+ use openacc_kinds2
|
|
+ integer (acc_device_kind) acc_get_device_type_h
|
|
+ acc_get_device_type_h = acc_get_device_type_l ()
|
|
+end function
|
|
+
|
|
+subroutine acc_set_device_num_h (n, d)
|
|
+ use openacc_internal2, only: acc_set_device_num_l
|
|
+ use openacc_kinds2
|
|
+ integer n
|
|
+ integer (acc_device_kind) d
|
|
+ call acc_set_device_num_l (n, d)
|
|
+end subroutine
|
|
+
|
|
+function acc_get_device_num_h (d)
|
|
+ use openacc_internal2, only: acc_get_device_num_l
|
|
+ use openacc_kinds2
|
|
+ integer acc_get_device_num_h
|
|
+ integer (acc_device_kind) d
|
|
+ acc_get_device_num_h = acc_get_device_num_l (d)
|
|
+end function
|
|
+
|
|
+function acc_async_test_h (a)
|
|
+ use openacc_internal2, only: acc_async_test_l
|
|
+ logical acc_async_test_h
|
|
+ integer a
|
|
+ if (acc_async_test_l (a) .eq. 1) then
|
|
+ acc_async_test_h = .TRUE.
|
|
+ else
|
|
+ acc_async_test_h = .FALSE.
|
|
+ end if
|
|
+end function
|
|
+
|
|
+function acc_async_test_all_h ()
|
|
+ use openacc_internal2, only: acc_async_test_all_l
|
|
+ logical acc_async_test_all_h
|
|
+ if (acc_async_test_all_l () .eq. 1) then
|
|
+ acc_async_test_all_h = .TRUE.
|
|
+ else
|
|
+ acc_async_test_all_h = .FALSE.
|
|
+ end if
|
|
+end function
|
|
+
|
|
+subroutine acc_wait_h (a)
|
|
+ use openacc_internal2, only: acc_wait_l
|
|
+ integer a
|
|
+ call acc_wait_l (a)
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_wait_async_h (a1, a2)
|
|
+ use openacc_internal2, only: acc_wait_async_l
|
|
+ integer a1, a2
|
|
+ call acc_wait_async_l (a1, a2)
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_wait_all_h ()
|
|
+ use openacc_internal2, only: acc_wait_all_l
|
|
+ call acc_wait_all_l ()
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_wait_all_async_h (a)
|
|
+ use openacc_internal2, only: acc_wait_all_async_l
|
|
+ integer a
|
|
+ call acc_wait_all_async_l (a)
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_init_h (d)
|
|
+ use openacc_internal2, only: acc_init_l
|
|
+ use openacc_kinds2
|
|
+ integer (acc_device_kind) d
|
|
+ call acc_init_l (d)
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_shutdown_h (d)
|
|
+ use openacc_internal2, only: acc_shutdown_l
|
|
+ use openacc_kinds2
|
|
+ integer (acc_device_kind) d
|
|
+ call acc_shutdown_l (d)
|
|
+end subroutine
|
|
+
|
|
+function acc_on_device_h (d)
|
|
+ use openacc_internal2, only: acc_on_device_l
|
|
+ use openacc_kinds2
|
|
+ integer (acc_device_kind) d
|
|
+ logical acc_on_device_h
|
|
+ if (acc_on_device_l (d) .eq. 1) then
|
|
+ acc_on_device_h = .TRUE.
|
|
+ else
|
|
+ acc_on_device_h = .FALSE.
|
|
+ end if
|
|
+end function
|
|
+
|
|
+subroutine acc_copyin_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ use openacc_internal2, only: acc_copyin_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ call acc_copyin_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyin_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t
|
|
+ use openacc_internal2, only: acc_copyin_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ call acc_copyin_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyin_array_h (a)
|
|
+ use openacc_internal2, only: acc_copyin_l
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ call acc_copyin_l (a, sizeof (a))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_present_or_copyin_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ use openacc_internal2, only: acc_present_or_copyin_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ call acc_present_or_copyin_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_present_or_copyin_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t
|
|
+ use openacc_internal2, only: acc_present_or_copyin_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ call acc_present_or_copyin_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_present_or_copyin_array_h (a)
|
|
+ use openacc_internal2, only: acc_present_or_copyin_l
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ call acc_present_or_copyin_l (a, sizeof (a))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_create_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ use openacc_internal2, only: acc_create_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ call acc_create_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_create_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t
|
|
+ use openacc_internal2, only: acc_create_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ call acc_create_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_create_array_h (a)
|
|
+ use openacc_internal2, only: acc_create_l
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ call acc_create_l (a, sizeof (a))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_present_or_create_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ use openacc_internal2, only: acc_present_or_create_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ call acc_present_or_create_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_present_or_create_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t
|
|
+ use openacc_internal2, only: acc_present_or_create_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ call acc_present_or_create_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_present_or_create_array_h (a)
|
|
+ use openacc_internal2, only: acc_present_or_create_l
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ call acc_present_or_create_l (a, sizeof (a))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyout_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ use openacc_internal2, only: acc_copyout_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ call acc_copyout_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyout_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t
|
|
+ use openacc_internal2, only: acc_copyout_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ call acc_copyout_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyout_array_h (a)
|
|
+ use openacc_internal2, only: acc_copyout_l
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ call acc_copyout_l (a, sizeof (a))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyout_finalize_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ use openacc_internal2, only: acc_copyout_finalize_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ call acc_copyout_finalize_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyout_finalize_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t
|
|
+ use openacc_internal2, only: acc_copyout_finalize_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ call acc_copyout_finalize_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyout_finalize_array_h (a)
|
|
+ use openacc_internal2, only: acc_copyout_finalize_l
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ call acc_copyout_finalize_l (a, sizeof (a))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_delete_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ use openacc_internal2, only: acc_delete_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ call acc_delete_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_delete_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t
|
|
+ use openacc_internal2, only: acc_delete_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ call acc_delete_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_delete_array_h (a)
|
|
+ use openacc_internal2, only: acc_delete_l
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ call acc_delete_l (a, sizeof (a))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_delete_finalize_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ use openacc_internal2, only: acc_delete_finalize_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ call acc_delete_finalize_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_delete_finalize_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t
|
|
+ use openacc_internal2, only: acc_delete_finalize_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ call acc_delete_finalize_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_delete_finalize_array_h (a)
|
|
+ use openacc_internal2, only: acc_delete_finalize_l
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ call acc_delete_finalize_l (a, sizeof (a))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_device_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ use openacc_internal2, only: acc_update_device_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ call acc_update_device_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_device_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t
|
|
+ use openacc_internal2, only: acc_update_device_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ call acc_update_device_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_device_array_h (a)
|
|
+ use openacc_internal2, only: acc_update_device_l
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ call acc_update_device_l (a, sizeof (a))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_self_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ use openacc_internal2, only: acc_update_self_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ call acc_update_self_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_self_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t
|
|
+ use openacc_internal2, only: acc_update_self_l
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ call acc_update_self_l (a, int (len, kind = c_size_t))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_self_array_h (a)
|
|
+ use openacc_internal2, only: acc_update_self_l
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ call acc_update_self_l (a, sizeof (a))
|
|
+end subroutine
|
|
+
|
|
+function acc_is_present_32_h (a, len)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t
|
|
+ use openacc_internal2, only: acc_is_present_l
|
|
+ logical acc_is_present_32_h
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
|
|
+ acc_is_present_32_h = .TRUE.
|
|
+ else
|
|
+ acc_is_present_32_h = .FALSE.
|
|
+ end if
|
|
+end function
|
|
+
|
|
+function acc_is_present_64_h (a, len)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t
|
|
+ use openacc_internal2, only: acc_is_present_l
|
|
+ logical acc_is_present_64_h
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
|
|
+ acc_is_present_64_h = .TRUE.
|
|
+ else
|
|
+ acc_is_present_64_h = .FALSE.
|
|
+ end if
|
|
+end function
|
|
+
|
|
+function acc_is_present_array_h (a)
|
|
+ use openacc_internal2, only: acc_is_present_l
|
|
+ logical acc_is_present_array_h
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1
|
|
+end function
|
|
+
|
|
+subroutine acc_copyin_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_copyin_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyin_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_copyin_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyin_async_array_h (a, async)
|
|
+ use iso_c_binding, only: c_int
|
|
+ use openacc_internal2, only: acc_copyin_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_copyin_async_l (a, sizeof (a), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_create_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_create_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_create_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_create_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_create_async_array_h (a, async)
|
|
+ use iso_c_binding, only: c_int
|
|
+ use openacc_internal2, only: acc_create_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_create_async_l (a, sizeof (a), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyout_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_copyout_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyout_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_copyout_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_copyout_async_array_h (a, async)
|
|
+ use iso_c_binding, only: c_int
|
|
+ use openacc_internal2, only: acc_copyout_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_copyout_async_l (a, sizeof (a), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_delete_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_delete_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_delete_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_delete_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_delete_async_array_h (a, async)
|
|
+ use iso_c_binding, only: c_int
|
|
+ use openacc_internal2, only: acc_delete_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_delete_async_l (a, sizeof (a), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_device_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_update_device_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_device_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_update_device_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_device_async_array_h (a, async)
|
|
+ use iso_c_binding, only: c_int
|
|
+ use openacc_internal2, only: acc_update_device_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_update_device_async_l (a, sizeof (a), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_self_async_32_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int32_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_update_self_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int32_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_self_async_64_h (a, len, async)
|
|
+ use iso_c_binding, only: c_int64_t, c_size_t, c_int
|
|
+ use openacc_internal2, only: acc_update_self_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
|
|
+ type (*), dimension (*) :: a
|
|
+ integer (c_int64_t) len
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
|
|
+end subroutine
|
|
+
|
|
+subroutine acc_update_self_async_array_h (a, async)
|
|
+ use iso_c_binding, only: c_int
|
|
+ use openacc_internal2, only: acc_update_self_async_l
|
|
+ use openacc_kinds2, only: acc_handle_kind
|
|
+ type (*), dimension (..), contiguous :: a
|
|
+ integer (acc_handle_kind) async
|
|
+ call acc_update_self_async_l (a, sizeof (a), int (async, kind = c_int))
|
|
+end subroutine
|
|
--- libgomp/taskloop.c.jj 2018-04-25 09:40:31.913655581 +0200
|
|
+++ libgomp/taskloop.c 2019-05-07 18:46:36.547109400 +0200
|
|
@@ -149,11 +149,28 @@ GOMP_taskloop (void (*fn) (void *), void
|
|
|
|
if (flags & GOMP_TASK_FLAG_NOGROUP)
|
|
{
|
|
- if (thr->task && thr->task->taskgroup && thr->task->taskgroup->cancelled)
|
|
- return;
|
|
+ if (__builtin_expect (gomp_cancel_var, 0)
|
|
+ && thr->task
|
|
+ && thr->task->taskgroup)
|
|
+ {
|
|
+ if (thr->task->taskgroup->cancelled)
|
|
+ return;
|
|
+ if (thr->task->taskgroup->workshare
|
|
+ && thr->task->taskgroup->prev
|
|
+ && thr->task->taskgroup->prev->cancelled)
|
|
+ return;
|
|
+ }
|
|
}
|
|
else
|
|
- ialias_call (GOMP_taskgroup_start) ();
|
|
+ {
|
|
+ ialias_call (GOMP_taskgroup_start) ();
|
|
+ if (flags & GOMP_TASK_FLAG_REDUCTION)
|
|
+ {
|
|
+ struct gomp_data_head { TYPE t1, t2; uintptr_t *ptr; };
|
|
+ uintptr_t *ptr = ((struct gomp_data_head *) data)->ptr;
|
|
+ ialias_call (GOMP_taskgroup_reduction_register) (ptr);
|
|
+ }
|
|
+ }
|
|
|
|
if (priority > gomp_max_task_priority_var)
|
|
priority = gomp_max_task_priority_var;
|
|
@@ -284,19 +301,31 @@ GOMP_taskloop (void (*fn) (void *), void
|
|
gomp_mutex_lock (&team->task_lock);
|
|
/* If parallel or taskgroup has been cancelled, don't start new
|
|
tasks. */
|
|
- if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
|
|
- || (taskgroup && taskgroup->cancelled))
|
|
- && cpyfn == NULL, 0))
|
|
+ if (__builtin_expect (gomp_cancel_var, 0)
|
|
+ && cpyfn == NULL)
|
|
{
|
|
- gomp_mutex_unlock (&team->task_lock);
|
|
- for (i = 0; i < num_tasks; i++)
|
|
+ if (gomp_team_barrier_cancelled (&team->barrier))
|
|
+ {
|
|
+ do_cancel:
|
|
+ gomp_mutex_unlock (&team->task_lock);
|
|
+ for (i = 0; i < num_tasks; i++)
|
|
+ {
|
|
+ gomp_finish_task (tasks[i]);
|
|
+ free (tasks[i]);
|
|
+ }
|
|
+ if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0)
|
|
+ ialias_call (GOMP_taskgroup_end) ();
|
|
+ return;
|
|
+ }
|
|
+ if (taskgroup)
|
|
{
|
|
- gomp_finish_task (tasks[i]);
|
|
- free (tasks[i]);
|
|
+ if (taskgroup->cancelled)
|
|
+ goto do_cancel;
|
|
+ if (taskgroup->workshare
|
|
+ && taskgroup->prev
|
|
+ && taskgroup->prev->cancelled)
|
|
+ goto do_cancel;
|
|
}
|
|
- if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0)
|
|
- ialias_call (GOMP_taskgroup_end) ();
|
|
- return;
|
|
}
|
|
if (taskgroup)
|
|
taskgroup->num_children += num_tasks;
|
|
--- libgomp/parallel.c.jj 2018-04-25 09:40:31.926655587 +0200
|
|
+++ libgomp/parallel.c 2019-05-07 18:46:36.532109640 +0200
|
|
@@ -123,7 +123,8 @@ void
|
|
GOMP_parallel_start (void (*fn) (void *), void *data, unsigned num_threads)
|
|
{
|
|
num_threads = gomp_resolve_num_threads (num_threads, 0);
|
|
- gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads));
|
|
+ gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads),
|
|
+ NULL);
|
|
}
|
|
|
|
void
|
|
@@ -161,14 +162,33 @@ GOMP_parallel_end (void)
|
|
ialias (GOMP_parallel_end)
|
|
|
|
void
|
|
-GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, unsigned int flags)
|
|
+GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads,
|
|
+ unsigned int flags)
|
|
{
|
|
num_threads = gomp_resolve_num_threads (num_threads, 0);
|
|
- gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads));
|
|
+ gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads),
|
|
+ NULL);
|
|
fn (data);
|
|
ialias_call (GOMP_parallel_end) ();
|
|
}
|
|
|
|
+unsigned
|
|
+GOMP_parallel_reductions (void (*fn) (void *), void *data,
|
|
+ unsigned num_threads, unsigned int flags)
|
|
+{
|
|
+ struct gomp_taskgroup *taskgroup;
|
|
+ num_threads = gomp_resolve_num_threads (num_threads, 0);
|
|
+ uintptr_t *rdata = *(uintptr_t **)data;
|
|
+ taskgroup = gomp_parallel_reduction_register (rdata, num_threads);
|
|
+ gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads),
|
|
+ taskgroup);
|
|
+ fn (data);
|
|
+ ialias_call (GOMP_parallel_end) ();
|
|
+ gomp_sem_destroy (&taskgroup->taskgroup_sem);
|
|
+ free (taskgroup);
|
|
+ return num_threads;
|
|
+}
|
|
+
|
|
bool
|
|
GOMP_cancellation_point (int which)
|
|
{
|
|
@@ -185,8 +205,15 @@ GOMP_cancellation_point (int which)
|
|
}
|
|
else if (which & GOMP_CANCEL_TASKGROUP)
|
|
{
|
|
- if (thr->task->taskgroup && thr->task->taskgroup->cancelled)
|
|
- return true;
|
|
+ if (thr->task->taskgroup)
|
|
+ {
|
|
+ if (thr->task->taskgroup->cancelled)
|
|
+ return true;
|
|
+ if (thr->task->taskgroup->workshare
|
|
+ && thr->task->taskgroup->prev
|
|
+ && thr->task->taskgroup->prev->cancelled)
|
|
+ return true;
|
|
+ }
|
|
/* FALLTHRU into the GOMP_CANCEL_PARALLEL case,
|
|
as #pragma omp cancel parallel also cancels all explicit
|
|
tasks. */
|
|
@@ -218,11 +245,17 @@ GOMP_cancel (int which, bool do_cancel)
|
|
}
|
|
else if (which & GOMP_CANCEL_TASKGROUP)
|
|
{
|
|
- if (thr->task->taskgroup && !thr->task->taskgroup->cancelled)
|
|
+ if (thr->task->taskgroup)
|
|
{
|
|
- gomp_mutex_lock (&team->task_lock);
|
|
- thr->task->taskgroup->cancelled = true;
|
|
- gomp_mutex_unlock (&team->task_lock);
|
|
+ struct gomp_taskgroup *taskgroup = thr->task->taskgroup;
|
|
+ if (taskgroup->workshare && taskgroup->prev)
|
|
+ taskgroup = taskgroup->prev;
|
|
+ if (!taskgroup->cancelled)
|
|
+ {
|
|
+ gomp_mutex_lock (&team->task_lock);
|
|
+ taskgroup->cancelled = true;
|
|
+ gomp_mutex_unlock (&team->task_lock);
|
|
+ }
|
|
}
|
|
return true;
|
|
}
|
|
--- libgomp/oacc-plugin.h.jj 2018-04-25 09:40:31.322655307 +0200
|
|
+++ libgomp/oacc-plugin.h 2019-05-07 18:46:36.531109656 +0200
|
|
@@ -29,5 +29,6 @@
|
|
|
|
extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
|
|
extern void *GOMP_PLUGIN_acc_thread (void);
|
|
+extern int GOMP_PLUGIN_acc_default_dim (unsigned int);
|
|
|
|
#endif
|
|
--- libgomp/target.c.jj 2018-04-25 09:40:31.912655580 +0200
|
|
+++ libgomp/target.c 2019-05-07 19:07:21.032306327 +0200
|
|
@@ -180,16 +180,22 @@ gomp_device_copy (struct gomp_device_des
|
|
/* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
|
|
host to device memory transfers. */
|
|
|
|
+struct gomp_coalesce_chunk
|
|
+{
|
|
+ /* The starting and ending point of a coalesced chunk of memory. */
|
|
+ size_t start, end;
|
|
+};
|
|
+
|
|
struct gomp_coalesce_buf
|
|
{
|
|
/* Buffer into which gomp_copy_host2dev will memcpy data and from which
|
|
it will be copied to the device. */
|
|
void *buf;
|
|
struct target_mem_desc *tgt;
|
|
- /* Array with offsets, chunks[2 * i] is the starting offset and
|
|
- chunks[2 * i + 1] ending offset relative to tgt->tgt_start device address
|
|
+ /* Array with offsets, chunks[i].start is the starting offset and
|
|
+ chunks[i].end ending offset relative to tgt->tgt_start device address
|
|
of chunks which are to be copied to buf and later copied to device. */
|
|
- size_t *chunks;
|
|
+ struct gomp_coalesce_chunk *chunks;
|
|
/* Number of chunks in chunks array, or -1 if coalesce buffering should not
|
|
be performed. */
|
|
long chunk_cnt;
|
|
@@ -222,14 +228,14 @@ gomp_coalesce_buf_add (struct gomp_coale
|
|
{
|
|
if (cbuf->chunk_cnt < 0)
|
|
return;
|
|
- if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
|
|
+ if (start < cbuf->chunks[cbuf->chunk_cnt - 1].end)
|
|
{
|
|
cbuf->chunk_cnt = -1;
|
|
return;
|
|
}
|
|
- if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1] + MAX_COALESCE_BUF_GAP)
|
|
+ if (start < cbuf->chunks[cbuf->chunk_cnt - 1].end + MAX_COALESCE_BUF_GAP)
|
|
{
|
|
- cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len;
|
|
+ cbuf->chunks[cbuf->chunk_cnt - 1].end = start + len;
|
|
cbuf->use_cnt++;
|
|
return;
|
|
}
|
|
@@ -239,8 +245,8 @@ gomp_coalesce_buf_add (struct gomp_coale
|
|
if (cbuf->use_cnt == 1)
|
|
cbuf->chunk_cnt--;
|
|
}
|
|
- cbuf->chunks[2 * cbuf->chunk_cnt] = start;
|
|
- cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len;
|
|
+ cbuf->chunks[cbuf->chunk_cnt].start = start;
|
|
+ cbuf->chunks[cbuf->chunk_cnt].end = start + len;
|
|
cbuf->chunk_cnt++;
|
|
cbuf->use_cnt = 1;
|
|
}
|
|
@@ -271,20 +277,20 @@ gomp_copy_host2dev (struct gomp_device_d
|
|
if (cbuf)
|
|
{
|
|
uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
|
|
- if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
|
|
+ if (doff < cbuf->chunks[cbuf->chunk_cnt - 1].end)
|
|
{
|
|
long first = 0;
|
|
long last = cbuf->chunk_cnt - 1;
|
|
while (first <= last)
|
|
{
|
|
long middle = (first + last) >> 1;
|
|
- if (cbuf->chunks[2 * middle + 1] <= doff)
|
|
+ if (cbuf->chunks[middle].end <= doff)
|
|
first = middle + 1;
|
|
- else if (cbuf->chunks[2 * middle] <= doff)
|
|
+ else if (cbuf->chunks[middle].start <= doff)
|
|
{
|
|
- if (doff + sz > cbuf->chunks[2 * middle + 1])
|
|
+ if (doff + sz > cbuf->chunks[middle].end)
|
|
gomp_fatal ("internal libgomp cbuf error");
|
|
- memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]),
|
|
+ memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0].start),
|
|
h, sz);
|
|
return;
|
|
}
|
|
@@ -510,8 +516,8 @@ gomp_map_vars (struct gomp_device_descr
|
|
cbuf.buf = NULL;
|
|
if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET)
|
|
{
|
|
- cbuf.chunks
|
|
- = (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t));
|
|
+ size_t chunks_size = (mapnum + 1) * sizeof (struct gomp_coalesce_chunk);
|
|
+ cbuf.chunks = (struct gomp_coalesce_chunk *) gomp_alloca (chunks_size);
|
|
cbuf.chunk_cnt = 0;
|
|
}
|
|
if (pragma_kind == GOMP_MAP_VARS_TARGET)
|
|
@@ -521,8 +527,8 @@ gomp_map_vars (struct gomp_device_descr
|
|
tgt_size = mapnum * sizeof (void *);
|
|
cbuf.chunk_cnt = 1;
|
|
cbuf.use_cnt = 1 + (mapnum > 1);
|
|
- cbuf.chunks[0] = 0;
|
|
- cbuf.chunks[1] = tgt_size;
|
|
+ cbuf.chunks[0].start = 0;
|
|
+ cbuf.chunks[0].end = tgt_size;
|
|
}
|
|
|
|
gomp_mutex_lock (&devicep->lock);
|
|
@@ -707,7 +713,7 @@ gomp_map_vars (struct gomp_device_descr
|
|
if (cbuf.chunk_cnt > 0)
|
|
{
|
|
cbuf.buf
|
|
- = malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]);
|
|
+ = malloc (cbuf.chunks[cbuf.chunk_cnt - 1].end - cbuf.chunks[0].start);
|
|
if (cbuf.buf)
|
|
{
|
|
cbuf.tgt = tgt;
|
|
@@ -859,6 +865,7 @@ gomp_map_vars (struct gomp_device_descr
|
|
tgt->list[i].offset = 0;
|
|
tgt->list[i].length = k->host_end - k->host_start;
|
|
k->refcount = 1;
|
|
+ k->dynamic_refcount = 0;
|
|
tgt->refcount++;
|
|
array->left = NULL;
|
|
array->right = NULL;
|
|
@@ -956,9 +963,10 @@ gomp_map_vars (struct gomp_device_descr
|
|
/* Set link pointer on target to the device address of the
|
|
mapped object. */
|
|
void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
|
|
- devicep->host2dev_func (devicep->target_id,
|
|
- (void *) n->tgt_offset,
|
|
- &tgt_addr, sizeof (void *));
|
|
+ /* We intentionally do not use coalescing here, as it's not
|
|
+ data allocated by the current call to this function. */
|
|
+ gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
|
|
+ &tgt_addr, sizeof (void *), NULL);
|
|
}
|
|
array++;
|
|
}
|
|
@@ -981,10 +989,14 @@ gomp_map_vars (struct gomp_device_descr
|
|
{
|
|
long c = 0;
|
|
for (c = 0; c < cbuf.chunk_cnt; ++c)
|
|
- gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cbuf.chunks[2 * c]),
|
|
- (char *) cbuf.buf + (cbuf.chunks[2 * c] - cbuf.chunks[0]),
|
|
- cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL);
|
|
+ gomp_copy_host2dev (devicep,
|
|
+ (void *) (tgt->tgt_start + cbuf.chunks[c].start),
|
|
+ (char *) cbuf.buf + (cbuf.chunks[c].start
|
|
+ - cbuf.chunks[0].start),
|
|
+ cbuf.chunks[c].end - cbuf.chunks[c].start, NULL);
|
|
free (cbuf.buf);
|
|
+ cbuf.buf = NULL;
|
|
+ cbufp = NULL;
|
|
}
|
|
|
|
/* If the variable from "omp target enter data" map-list was already mapped,
|
|
@@ -1011,6 +1023,23 @@ gomp_unmap_tgt (struct target_mem_desc *
|
|
free (tgt);
|
|
}
|
|
|
|
+attribute_hidden bool
|
|
+gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k)
|
|
+{
|
|
+ bool is_tgt_unmapped = false;
|
|
+ splay_tree_remove (&devicep->mem_map, k);
|
|
+ if (k->link_key)
|
|
+ splay_tree_insert (&devicep->mem_map, (splay_tree_node) k->link_key);
|
|
+ if (k->tgt->refcount > 1)
|
|
+ k->tgt->refcount--;
|
|
+ else
|
|
+ {
|
|
+ is_tgt_unmapped = true;
|
|
+ gomp_unmap_tgt (k->tgt);
|
|
+ }
|
|
+ return is_tgt_unmapped;
|
|
+}
|
|
+
|
|
/* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant
|
|
variables back from device to host: if it is false, it is assumed that this
|
|
has been done already. */
|
|
@@ -1059,16 +1088,7 @@ gomp_unmap_vars (struct target_mem_desc
|
|
+ tgt->list[i].offset),
|
|
tgt->list[i].length);
|
|
if (do_unmap)
|
|
- {
|
|
- splay_tree_remove (&devicep->mem_map, k);
|
|
- if (k->link_key)
|
|
- splay_tree_insert (&devicep->mem_map,
|
|
- (splay_tree_node) k->link_key);
|
|
- if (k->tgt->refcount > 1)
|
|
- k->tgt->refcount--;
|
|
- else
|
|
- gomp_unmap_tgt (k->tgt);
|
|
- }
|
|
+ gomp_remove_var (devicep, k);
|
|
}
|
|
|
|
if (tgt->refcount > 1)
|
|
@@ -1298,17 +1318,7 @@ gomp_unload_image_from_device (struct go
|
|
else
|
|
{
|
|
splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &k);
|
|
- splay_tree_remove (&devicep->mem_map, n);
|
|
- if (n->link_key)
|
|
- {
|
|
- if (n->tgt->refcount > 1)
|
|
- n->tgt->refcount--;
|
|
- else
|
|
- {
|
|
- is_tgt_unmapped = true;
|
|
- gomp_unmap_tgt (n->tgt);
|
|
- }
|
|
- }
|
|
+ is_tgt_unmapped = gomp_remove_var (devicep, n);
|
|
}
|
|
}
|
|
|
|
@@ -1855,11 +1865,20 @@ GOMP_target_update_ext (int device, size
|
|
struct gomp_team *team = thr->ts.team;
|
|
/* If parallel or taskgroup has been cancelled, don't start new
|
|
tasks. */
|
|
- if (team
|
|
- && (gomp_team_barrier_cancelled (&team->barrier)
|
|
- || (thr->task->taskgroup
|
|
- && thr->task->taskgroup->cancelled)))
|
|
- return;
|
|
+ if (__builtin_expect (gomp_cancel_var, 0) && team)
|
|
+ {
|
|
+ if (gomp_team_barrier_cancelled (&team->barrier))
|
|
+ return;
|
|
+ if (thr->task->taskgroup)
|
|
+ {
|
|
+ if (thr->task->taskgroup->cancelled)
|
|
+ return;
|
|
+ if (thr->task->taskgroup->workshare
|
|
+ && thr->task->taskgroup->prev
|
|
+ && thr->task->taskgroup->prev->cancelled)
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
|
|
gomp_task_maybe_wait_for_dependencies (depend);
|
|
}
|
|
@@ -1874,10 +1893,20 @@ GOMP_target_update_ext (int device, size
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
struct gomp_team *team = thr->ts.team;
|
|
/* If parallel or taskgroup has been cancelled, don't start new tasks. */
|
|
- if (team
|
|
- && (gomp_team_barrier_cancelled (&team->barrier)
|
|
- || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
|
|
- return;
|
|
+ if (__builtin_expect (gomp_cancel_var, 0) && team)
|
|
+ {
|
|
+ if (gomp_team_barrier_cancelled (&team->barrier))
|
|
+ return;
|
|
+ if (thr->task->taskgroup)
|
|
+ {
|
|
+ if (thr->task->taskgroup->cancelled)
|
|
+ return;
|
|
+ if (thr->task->taskgroup->workshare
|
|
+ && thr->task->taskgroup->prev
|
|
+ && thr->task->taskgroup->prev->cancelled)
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
|
|
gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, true);
|
|
}
|
|
@@ -1986,11 +2015,20 @@ GOMP_target_enter_exit_data (int device,
|
|
struct gomp_team *team = thr->ts.team;
|
|
/* If parallel or taskgroup has been cancelled, don't start new
|
|
tasks. */
|
|
- if (team
|
|
- && (gomp_team_barrier_cancelled (&team->barrier)
|
|
- || (thr->task->taskgroup
|
|
- && thr->task->taskgroup->cancelled)))
|
|
- return;
|
|
+ if (__builtin_expect (gomp_cancel_var, 0) && team)
|
|
+ {
|
|
+ if (gomp_team_barrier_cancelled (&team->barrier))
|
|
+ return;
|
|
+ if (thr->task->taskgroup)
|
|
+ {
|
|
+ if (thr->task->taskgroup->cancelled)
|
|
+ return;
|
|
+ if (thr->task->taskgroup->workshare
|
|
+ && thr->task->taskgroup->prev
|
|
+ && thr->task->taskgroup->prev->cancelled)
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
|
|
gomp_task_maybe_wait_for_dependencies (depend);
|
|
}
|
|
@@ -2005,10 +2043,20 @@ GOMP_target_enter_exit_data (int device,
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
struct gomp_team *team = thr->ts.team;
|
|
/* If parallel or taskgroup has been cancelled, don't start new tasks. */
|
|
- if (team
|
|
- && (gomp_team_barrier_cancelled (&team->barrier)
|
|
- || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
|
|
- return;
|
|
+ if (__builtin_expect (gomp_cancel_var, 0) && team)
|
|
+ {
|
|
+ if (gomp_team_barrier_cancelled (&team->barrier))
|
|
+ return;
|
|
+ if (thr->task->taskgroup)
|
|
+ {
|
|
+ if (thr->task->taskgroup->cancelled)
|
|
+ return;
|
|
+ if (thr->task->taskgroup->workshare
|
|
+ && thr->task->taskgroup->prev
|
|
+ && thr->task->taskgroup->prev->cancelled)
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
|
|
size_t i;
|
|
if ((flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0)
|
|
@@ -2197,8 +2245,9 @@ omp_target_is_present (void *ptr, int de
|
|
}
|
|
|
|
int
|
|
-omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset,
|
|
- size_t src_offset, int dst_device_num, int src_device_num)
|
|
+omp_target_memcpy (void *dst, void *src, size_t length,
|
|
+ size_t dst_offset, size_t src_offset, int dst_device_num,
|
|
+ int src_device_num)
|
|
{
|
|
struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
|
|
bool ret;
|
|
@@ -2287,21 +2336,25 @@ omp_target_memcpy_rect_worker (void *dst
|
|
return EINVAL;
|
|
if (dst_devicep == NULL && src_devicep == NULL)
|
|
{
|
|
- memcpy ((char *) dst + dst_off, (char *) src + src_off, length);
|
|
+ memcpy ((char *) dst + dst_off, (char *) src + src_off,
|
|
+ length);
|
|
ret = 1;
|
|
}
|
|
else if (src_devicep == NULL)
|
|
ret = dst_devicep->host2dev_func (dst_devicep->target_id,
|
|
(char *) dst + dst_off,
|
|
- (char *) src + src_off, length);
|
|
+ (char *) src + src_off,
|
|
+ length);
|
|
else if (dst_devicep == NULL)
|
|
ret = src_devicep->dev2host_func (src_devicep->target_id,
|
|
(char *) dst + dst_off,
|
|
- (char *) src + src_off, length);
|
|
+ (char *) src + src_off,
|
|
+ length);
|
|
else if (src_devicep == dst_devicep)
|
|
ret = src_devicep->dev2dev_func (src_devicep->target_id,
|
|
(char *) dst + dst_off,
|
|
- (char *) src + src_off, length);
|
|
+ (char *) src + src_off,
|
|
+ length);
|
|
else
|
|
ret = 0;
|
|
return ret ? 0 : EINVAL;
|
|
@@ -2396,8 +2449,8 @@ omp_target_memcpy_rect (void *dst, void
|
|
}
|
|
|
|
int
|
|
-omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size,
|
|
- size_t device_offset, int device_num)
|
|
+omp_target_associate_ptr (void *host_ptr, void *device_ptr,
|
|
+ size_t size, size_t device_offset, int device_num)
|
|
{
|
|
if (device_num == GOMP_DEVICE_HOST_FALLBACK)
|
|
return EINVAL;
|
|
@@ -2499,6 +2552,31 @@ omp_target_disassociate_ptr (void *ptr,
|
|
return ret;
|
|
}
|
|
|
|
+int
|
|
+omp_pause_resource (omp_pause_resource_t kind, int device_num)
|
|
+{
|
|
+ (void) kind;
|
|
+ if (device_num == GOMP_DEVICE_HOST_FALLBACK)
|
|
+ return gomp_pause_host ();
|
|
+ if (device_num < 0 || device_num >= gomp_get_num_devices ())
|
|
+ return -1;
|
|
+ /* Do nothing for target devices for now. */
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int
|
|
+omp_pause_resource_all (omp_pause_resource_t kind)
|
|
+{
|
|
+ (void) kind;
|
|
+ if (gomp_pause_host ())
|
|
+ return -1;
|
|
+ /* Do nothing for target devices for now. */
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+ialias (omp_pause_resource)
|
|
+ialias (omp_pause_resource_all)
|
|
+
|
|
#ifdef PLUGIN_SUPPORT
|
|
|
|
/* This function tries to load a plugin for DEVICE. Name of plugin is passed
|
|
@@ -2632,9 +2710,9 @@ gomp_target_fini (void)
|
|
}
|
|
}
|
|
|
|
-/* This function initializes the runtime needed for offloading.
|
|
- It parses the list of offload targets and tries to load the plugins for
|
|
- these targets. On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP
|
|
+/* This function initializes the runtime for offloading.
|
|
+ It parses the list of offload plugins, and tries to load these.
|
|
+ On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP
|
|
will be set, and the array DEVICES initialized, containing descriptors for
|
|
corresponding devices, first the GOMP_OFFLOAD_CAP_OPENMP_400 ones, follows
|
|
by the others. */
|
|
@@ -2651,7 +2729,7 @@ gomp_target_init (void)
|
|
num_devices = 0;
|
|
devices = NULL;
|
|
|
|
- cur = OFFLOAD_TARGETS;
|
|
+ cur = OFFLOAD_PLUGINS;
|
|
if (*cur)
|
|
do
|
|
{
|
|
--- libgomp/ordered.c.jj 2018-04-25 09:40:31.926655587 +0200
|
|
+++ libgomp/ordered.c 2019-05-07 18:46:36.532109640 +0200
|
|
@@ -259,7 +259,8 @@ GOMP_ordered_end (void)
|
|
#define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
|
|
|
|
void
|
|
-gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
|
|
+gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
|
|
+ size_t extra)
|
|
{
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
struct gomp_team *team = thr->ts.team;
|
|
@@ -269,13 +270,24 @@ gomp_doacross_init (unsigned ncounts, lo
|
|
struct gomp_doacross_work_share *doacross;
|
|
|
|
if (team == NULL || team->nthreads == 1)
|
|
- return;
|
|
+ {
|
|
+ empty:
|
|
+ if (!extra)
|
|
+ ws->doacross = NULL;
|
|
+ else
|
|
+ {
|
|
+ doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
|
|
+ doacross->extra = (void *) (doacross + 1);
|
|
+ ws->doacross = doacross;
|
|
+ }
|
|
+ return;
|
|
+ }
|
|
|
|
for (i = 0; i < ncounts; i++)
|
|
{
|
|
/* If any count is 0, GOMP_doacross_{post,wait} can't be called. */
|
|
if (counts[i] == 0)
|
|
- return;
|
|
+ goto empty;
|
|
|
|
if (num_bits <= MAX_COLLAPSED_BITS)
|
|
{
|
|
@@ -314,7 +326,7 @@ gomp_doacross_init (unsigned ncounts, lo
|
|
elt_sz = (elt_sz + 63) & ~63UL;
|
|
|
|
doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
|
|
- + shift_sz);
|
|
+ + shift_sz + extra);
|
|
doacross->chunk_size = chunk_size;
|
|
doacross->elt_sz = elt_sz;
|
|
doacross->ncounts = ncounts;
|
|
@@ -322,6 +334,13 @@ gomp_doacross_init (unsigned ncounts, lo
|
|
doacross->array = (unsigned char *)
|
|
((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
|
|
& ~(uintptr_t) 63);
|
|
+ if (extra)
|
|
+ {
|
|
+ doacross->extra = doacross->array + num_ents * elt_sz;
|
|
+ memset (doacross->extra, '\0', extra);
|
|
+ }
|
|
+ else
|
|
+ doacross->extra = NULL;
|
|
if (num_bits <= MAX_COLLAPSED_BITS)
|
|
{
|
|
unsigned int shift_count = 0;
|
|
@@ -360,7 +379,8 @@ GOMP_doacross_post (long *counts)
|
|
unsigned long ent;
|
|
unsigned int i;
|
|
|
|
- if (__builtin_expect (doacross == NULL, 0))
|
|
+ if (__builtin_expect (doacross == NULL, 0)
|
|
+ || __builtin_expect (doacross->array == NULL, 0))
|
|
{
|
|
__sync_synchronize ();
|
|
return;
|
|
@@ -411,7 +431,8 @@ GOMP_doacross_wait (long first, ...)
|
|
unsigned long ent;
|
|
unsigned int i;
|
|
|
|
- if (__builtin_expect (doacross == NULL, 0))
|
|
+ if (__builtin_expect (doacross == NULL, 0)
|
|
+ || __builtin_expect (doacross->array == NULL, 0))
|
|
{
|
|
__sync_synchronize ();
|
|
return;
|
|
@@ -488,7 +509,8 @@ GOMP_doacross_wait (long first, ...)
|
|
typedef unsigned long long gomp_ull;
|
|
|
|
void
|
|
-gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
|
|
+gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
|
|
+ gomp_ull chunk_size, size_t extra)
|
|
{
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
struct gomp_team *team = thr->ts.team;
|
|
@@ -498,13 +520,24 @@ gomp_doacross_ull_init (unsigned ncounts
|
|
struct gomp_doacross_work_share *doacross;
|
|
|
|
if (team == NULL || team->nthreads == 1)
|
|
- return;
|
|
+ {
|
|
+ empty:
|
|
+ if (!extra)
|
|
+ ws->doacross = NULL;
|
|
+ else
|
|
+ {
|
|
+ doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
|
|
+ doacross->extra = (void *) (doacross + 1);
|
|
+ ws->doacross = doacross;
|
|
+ }
|
|
+ return;
|
|
+ }
|
|
|
|
for (i = 0; i < ncounts; i++)
|
|
{
|
|
/* If any count is 0, GOMP_doacross_{post,wait} can't be called. */
|
|
if (counts[i] == 0)
|
|
- return;
|
|
+ goto empty;
|
|
|
|
if (num_bits <= MAX_COLLAPSED_BITS)
|
|
{
|
|
@@ -557,6 +590,13 @@ gomp_doacross_ull_init (unsigned ncounts
|
|
doacross->array = (unsigned char *)
|
|
((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
|
|
& ~(uintptr_t) 63);
|
|
+ if (extra)
|
|
+ {
|
|
+ doacross->extra = doacross->array + num_ents * elt_sz;
|
|
+ memset (doacross->extra, '\0', extra);
|
|
+ }
|
|
+ else
|
|
+ doacross->extra = NULL;
|
|
if (num_bits <= MAX_COLLAPSED_BITS)
|
|
{
|
|
unsigned int shift_count = 0;
|
|
@@ -595,7 +635,8 @@ GOMP_doacross_ull_post (gomp_ull *counts
|
|
unsigned long ent;
|
|
unsigned int i;
|
|
|
|
- if (__builtin_expect (doacross == NULL, 0))
|
|
+ if (__builtin_expect (doacross == NULL, 0)
|
|
+ || __builtin_expect (doacross->array == NULL, 0))
|
|
{
|
|
__sync_synchronize ();
|
|
return;
|
|
@@ -667,7 +708,8 @@ GOMP_doacross_ull_wait (gomp_ull first,
|
|
unsigned long ent;
|
|
unsigned int i;
|
|
|
|
- if (__builtin_expect (doacross == NULL, 0))
|
|
+ if (__builtin_expect (doacross == NULL, 0)
|
|
+ || __builtin_expect (doacross->array == NULL, 0))
|
|
{
|
|
__sync_synchronize ();
|
|
return;
|
|
--- libgomp/alloc.c.jj 2018-04-25 09:40:31.926655587 +0200
|
|
+++ libgomp/alloc.c 2019-05-07 18:46:36.336112770 +0200
|
|
@@ -57,3 +57,50 @@ gomp_realloc (void *old, size_t size)
|
|
gomp_fatal ("Out of memory allocating %lu bytes", (unsigned long) size);
|
|
return ret;
|
|
}
|
|
+
|
|
+void *
|
|
+gomp_aligned_alloc (size_t al, size_t size)
|
|
+{
|
|
+ void *ret;
|
|
+ if (al < sizeof (void *))
|
|
+ al = sizeof (void *);
|
|
+#ifdef HAVE_ALIGNED_ALLOC
|
|
+ ret = aligned_alloc (al, size);
|
|
+#elif defined(HAVE__ALIGNED_MALLOC)
|
|
+ ret = _aligned_malloc (size, al);
|
|
+#elif defined(HAVE_POSIX_MEMALIGN)
|
|
+ if (posix_memalign (&ret, al, size) != 0)
|
|
+ ret = NULL;
|
|
+#elif defined(HAVE_MEMALIGN)
|
|
+ {
|
|
+ extern void *memalign (size_t, size_t);
|
|
+ ret = memalign (al, size);
|
|
+ }
|
|
+#else
|
|
+ ret = NULL;
|
|
+ if ((al & (al - 1)) == 0 && size)
|
|
+ {
|
|
+ void *p = malloc (size + al);
|
|
+ if (p)
|
|
+ {
|
|
+ void *ap = (void *) (((uintptr_t) p + al) & -al);
|
|
+ ((void **) ap)[-1] = p;
|
|
+ ret = ap;
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
+ if (ret == NULL)
|
|
+ gomp_fatal ("Out of memory allocating %lu bytes", (unsigned long) size);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void
|
|
+gomp_aligned_free (void *ptr)
|
|
+{
|
|
+#ifdef GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC
|
|
+ free (ptr);
|
|
+#else
|
|
+ if (ptr)
|
|
+ free (((void **) ptr)[-1]);
|
|
+#endif
|
|
+}
|
|
--- libgomp/configure.ac.jj 2018-04-25 09:40:31.321655307 +0200
|
|
+++ libgomp/configure.ac 2019-05-07 18:46:36.471110614 +0200
|
|
@@ -219,6 +219,7 @@ m4_include([plugin/configfrag.ac])
|
|
|
|
# Check for functions needed.
|
|
AC_CHECK_FUNCS(getloadavg clock_gettime strtoull)
|
|
+AC_CHECK_FUNCS(aligned_alloc posix_memalign memalign _aligned_malloc)
|
|
|
|
# Check for broken semaphore implementation on darwin.
|
|
# sem_init returns: sem_init error: Function not implemented.
|
|
@@ -266,6 +267,41 @@ if test $ac_cv_func_clock_gettime = no;
|
|
[Define to 1 if you have the `clock_gettime' function.])])
|
|
fi
|
|
|
|
+# Check for uname.
|
|
+AC_COMPILE_IFELSE(
|
|
+ [AC_LANG_PROGRAM(
|
|
+ [#include <string.h>
|
|
+ #include <stdlib.h>
|
|
+ #include <sys/utsname.h>],
|
|
+ [struct utsname buf;
|
|
+ volatile size_t len = 0;
|
|
+ if (!uname (buf))
|
|
+ len = strlen (buf.nodename);])],
|
|
+ AC_DEFINE(HAVE_UNAME, 1,
|
|
+[ Define if uname is supported and struct utsname has nodename field.]))
|
|
+
|
|
+# Check for gethostname.
|
|
+AC_COMPILE_IFELSE(
|
|
+ [AC_LANG_PROGRAM(
|
|
+ [#include <unistd.h>],
|
|
+ [
|
|
+changequote(,)dnl
|
|
+ char buf[256];
|
|
+ if (gethostname (buf, sizeof (buf) - 1) == 0)
|
|
+ buf[255] = '\0';
|
|
+changequote([,])dnl
|
|
+ ])],
|
|
+ AC_DEFINE(HAVE_GETHOSTNAME, 1,
|
|
+[ Define if gethostname is supported.]))
|
|
+
|
|
+# Check for getpid.
|
|
+AC_COMPILE_IFELSE(
|
|
+ [AC_LANG_PROGRAM(
|
|
+ [#include <unistd.h>],
|
|
+ [int pid = getpid ();])],
|
|
+ AC_DEFINE(HAVE_GETPID, 1,
|
|
+[ Define if getpid is supported.]))
|
|
+
|
|
# See if we support thread-local storage.
|
|
GCC_CHECK_TLS
|
|
|
|
--- libgomp/icv.c.jj 2018-04-25 09:40:31.870655561 +0200
|
|
+++ libgomp/icv.c 2019-05-07 18:46:36.501110134 +0200
|
|
@@ -69,7 +69,7 @@ void
|
|
omp_set_schedule (omp_sched_t kind, int chunk_size)
|
|
{
|
|
struct gomp_task_icv *icv = gomp_icv (true);
|
|
- switch (kind)
|
|
+ switch (kind & ~omp_sched_monotonic)
|
|
{
|
|
case omp_sched_static:
|
|
if (chunk_size < 1)
|
|
--- libgomp/configure.jj 2018-04-25 09:40:31.913655581 +0200
|
|
+++ libgomp/configure 2019-05-07 18:47:37.961128420 +0200
|
|
@@ -636,6 +636,8 @@ PLUGIN_NVPTX_FALSE
|
|
PLUGIN_NVPTX_TRUE
|
|
offload_additional_lib_paths
|
|
offload_additional_options
|
|
+offload_targets
|
|
+offload_plugins
|
|
PLUGIN_HSA_LIBS
|
|
PLUGIN_HSA_LDFLAGS
|
|
PLUGIN_HSA_CPPFLAGS
|
|
@@ -648,7 +650,6 @@ PLUGIN_NVPTX_CPPFLAGS
|
|
PLUGIN_NVPTX
|
|
CUDA_DRIVER_LIB
|
|
CUDA_DRIVER_INCLUDE
|
|
-offload_targets
|
|
libtool_VERSION
|
|
ac_ct_FC
|
|
FCFLAGS
|
|
@@ -11157,7 +11158,7 @@ else
|
|
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
|
|
lt_status=$lt_dlunknown
|
|
cat > conftest.$ac_ext <<_LT_EOF
|
|
-#line 11160 "configure"
|
|
+#line 11161 "configure"
|
|
#include "confdefs.h"
|
|
|
|
#if HAVE_DLFCN_H
|
|
@@ -11263,7 +11264,7 @@ else
|
|
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
|
|
lt_status=$lt_dlunknown
|
|
cat > conftest.$ac_ext <<_LT_EOF
|
|
-#line 11266 "configure"
|
|
+#line 11267 "configure"
|
|
#include "confdefs.h"
|
|
|
|
#if HAVE_DLFCN_H
|
|
@@ -15167,8 +15168,6 @@ fi
|
|
# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
# <http://www.gnu.org/licenses/>.
|
|
|
|
-offload_targets=
|
|
-
|
|
plugin_support=yes
|
|
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for dlsym in -ldl" >&5
|
|
$as_echo_n "checking for dlsym in -ldl... " >&6; }
|
|
@@ -15302,7 +15301,11 @@ if test "${with_cuda_driver_lib+set}" =
|
|
fi
|
|
|
|
case "x$with_cuda_driver" in
|
|
- x | xno) ;;
|
|
+ x) ;;
|
|
+ xno)
|
|
+ CUDA_DRIVER_INCLUDE=no
|
|
+ CUDA_DRIVER_LIB=no
|
|
+ ;;
|
|
*) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
|
|
CUDA_DRIVER_LIB=$with_cuda_driver/lib
|
|
;;
|
|
@@ -15313,10 +15316,12 @@ fi
|
|
if test "x$with_cuda_driver_lib" != x; then
|
|
CUDA_DRIVER_LIB=$with_cuda_driver_lib
|
|
fi
|
|
-if test "x$CUDA_DRIVER_INCLUDE" != x; then
|
|
+if test "x$CUDA_DRIVER_INCLUDE" != x \
|
|
+ && test "x$CUDA_DRIVER_INCLUDE" != xno; then
|
|
CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE
|
|
fi
|
|
-if test "x$CUDA_DRIVER_LIB" != x; then
|
|
+if test "x$CUDA_DRIVER_LIB" != x \
|
|
+ && test "x$CUDA_DRIVER_LIB" != xno; then
|
|
CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB
|
|
fi
|
|
|
|
@@ -15383,7 +15388,13 @@ PLUGIN_HSA_LIBS=
|
|
|
|
|
|
|
|
-# Get offload targets and path to install tree of offloading compiler.
|
|
+# Parse '--enable-offload-targets', figure out the corresponding libgomp
|
|
+# plugins, and configure to find the corresponding offload compilers.
|
|
+# 'offload_plugins' and 'offload_targets' will be populated in the same order.
|
|
+offload_plugins=
|
|
+offload_targets=
|
|
+
|
|
+
|
|
offload_additional_options=
|
|
offload_additional_lib_paths=
|
|
|
|
@@ -15392,25 +15403,27 @@ if test x"$enable_offload_targets" != x;
|
|
for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do
|
|
tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'`
|
|
tgt=`echo $tgt | sed 's/=.*//'`
|
|
- tgt_name=
|
|
+ tgt_plugin=
|
|
case $tgt in
|
|
*-intelmic-* | *-intelmicemul-*)
|
|
- tgt_name=intelmic
|
|
+ tgt_plugin=intelmic
|
|
;;
|
|
nvptx*)
|
|
- tgt_name=nvptx
|
|
+ tgt_plugin=nvptx
|
|
PLUGIN_NVPTX=$tgt
|
|
- PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
|
|
- PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
|
|
- PLUGIN_NVPTX_LIBS='-lcuda'
|
|
-
|
|
- PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
|
|
- CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
|
|
- PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
|
|
- LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
|
|
- PLUGIN_NVPTX_save_LIBS=$LIBS
|
|
- LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
|
|
- cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
|
+ if test "x$CUDA_DRIVER_LIB" != xno \
|
|
+ && test "x$CUDA_DRIVER_LIB" != xno; then
|
|
+ PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
|
|
+ PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
|
|
+ PLUGIN_NVPTX_LIBS='-lcuda'
|
|
+
|
|
+ PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
|
|
+ CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
|
|
+ PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
|
|
+ LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
|
|
+ PLUGIN_NVPTX_save_LIBS=$LIBS
|
|
+ LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
|
|
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
|
/* end confdefs.h. */
|
|
#include "cuda.h"
|
|
int
|
|
@@ -15426,13 +15439,16 @@ if ac_fn_c_try_link "$LINENO"; then :
|
|
fi
|
|
rm -f core conftest.err conftest.$ac_objext \
|
|
conftest$ac_exeext conftest.$ac_ext
|
|
- CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
|
|
- LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
|
|
- LIBS=$PLUGIN_NVPTX_save_LIBS
|
|
+ CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
|
|
+ LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
|
|
+ LIBS=$PLUGIN_NVPTX_save_LIBS
|
|
+ fi
|
|
case $PLUGIN_NVPTX in
|
|
nvptx*)
|
|
- if test "x$CUDA_DRIVER_INCLUDE" = x \
|
|
- && test "x$CUDA_DRIVER_LIB" = x; then
|
|
+ if (test "x$CUDA_DRIVER_INCLUDE" = x \
|
|
+ || test "x$CUDA_DRIVER_INCLUDE" = xno) \
|
|
+ && (test "x$CUDA_DRIVER_LIB" = x \
|
|
+ || test "x$CUDA_DRIVER_LIB" = xno); then
|
|
PLUGIN_NVPTX=1
|
|
PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda'
|
|
PLUGIN_NVPTX_LIBS='-ldl'
|
|
@@ -15452,7 +15468,7 @@ rm -f core conftest.err conftest.$ac_obj
|
|
PLUGIN_HSA=0
|
|
;;
|
|
*)
|
|
- tgt_name=hsa
|
|
+ tgt_plugin=hsa
|
|
PLUGIN_HSA=$tgt
|
|
PLUGIN_HSA_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS
|
|
PLUGIN_HSA_LDFLAGS="$HSA_RUNTIME_LDFLAGS"
|
|
@@ -15470,7 +15486,7 @@ rm -f core conftest.err conftest.$ac_obj
|
|
LDFLAGS=$PLUGIN_HSA_save_LDFLAGS
|
|
LIBS=$PLUGIN_HSA_save_LIBS
|
|
case $PLUGIN_HSA in
|
|
- hsa*)
|
|
+ hsa*)
|
|
HSA_PLUGIN=0
|
|
as_fn_error "HSA run-time package required for HSA support" "$LINENO" 5
|
|
;;
|
|
@@ -15487,16 +15503,19 @@ rm -f core conftest.err conftest.$ac_obj
|
|
as_fn_error "unknown offload target specified" "$LINENO" 5
|
|
;;
|
|
esac
|
|
- if test x"$tgt_name" = x; then
|
|
- # Don't configure libgomp for this offloading target if we don't build
|
|
- # the corresponding plugin.
|
|
+ if test x"$tgt_plugin" = x; then
|
|
+ # Not configuring libgomp for this offload target if we're not building
|
|
+ # the corresponding offload plugin.
|
|
continue
|
|
- elif test x"$offload_targets" = x; then
|
|
- offload_targets=$tgt_name
|
|
+ elif test x"$offload_plugins" = x; then
|
|
+ offload_plugins=$tgt_plugin
|
|
+ offload_targets=$tgt
|
|
else
|
|
- offload_targets=$offload_targets,$tgt_name
|
|
+ offload_plugins=$offload_plugins,$tgt_plugin
|
|
+ offload_targets=$offload_targets,$tgt
|
|
fi
|
|
- if test "$tgt_name" = hsa; then
|
|
+ # Configure additional search paths.
|
|
+ if test "$tgt_plugin" = hsa; then
|
|
# Offloading compilation is all handled by the target compiler.
|
|
:
|
|
elif test x"$tgt_dir" != x; then
|
|
@@ -15510,7 +15529,7 @@ rm -f core conftest.err conftest.$ac_obj
|
|
fi
|
|
|
|
cat >>confdefs.h <<_ACEOF
|
|
-#define OFFLOAD_TARGETS "$offload_targets"
|
|
+#define OFFLOAD_PLUGINS "$offload_plugins"
|
|
_ACEOF
|
|
|
|
if test $PLUGIN_NVPTX = 1; then
|
|
@@ -15570,6 +15589,19 @@ _ACEOF
|
|
fi
|
|
done
|
|
|
|
+for ac_func in aligned_alloc posix_memalign memalign _aligned_malloc
|
|
+do :
|
|
+ as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
|
|
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
|
|
+eval as_val=\$$as_ac_var
|
|
+ if test "x$as_val" = x""yes; then :
|
|
+ cat >>confdefs.h <<_ACEOF
|
|
+#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
|
|
+_ACEOF
|
|
+
|
|
+fi
|
|
+done
|
|
+
|
|
|
|
# Check for broken semaphore implementation on darwin.
|
|
# sem_init returns: sem_init error: Function not implemented.
|
|
@@ -15784,6 +15816,72 @@ fi
|
|
|
|
fi
|
|
|
|
+# Check for uname.
|
|
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
|
+/* end confdefs.h. */
|
|
+#include <string.h>
|
|
+ #include <stdlib.h>
|
|
+ #include <sys/utsname.h>
|
|
+int
|
|
+main ()
|
|
+{
|
|
+struct utsname buf;
|
|
+ volatile size_t len = 0;
|
|
+ if (!uname (buf))
|
|
+ len = strlen (buf.nodename);
|
|
+ ;
|
|
+ return 0;
|
|
+}
|
|
+_ACEOF
|
|
+if ac_fn_c_try_compile "$LINENO"; then :
|
|
+
|
|
+$as_echo "#define HAVE_UNAME 1" >>confdefs.h
|
|
+
|
|
+fi
|
|
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
|
+
|
|
+# Check for gethostname.
|
|
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
|
+/* end confdefs.h. */
|
|
+#include <unistd.h>
|
|
+int
|
|
+main ()
|
|
+{
|
|
+
|
|
+ char buf[256];
|
|
+ if (gethostname (buf, sizeof (buf) - 1) == 0)
|
|
+ buf[255] = '\0';
|
|
+
|
|
+ ;
|
|
+ return 0;
|
|
+}
|
|
+_ACEOF
|
|
+if ac_fn_c_try_compile "$LINENO"; then :
|
|
+
|
|
+$as_echo "#define HAVE_GETHOSTNAME 1" >>confdefs.h
|
|
+
|
|
+fi
|
|
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
|
+
|
|
+# Check for getpid.
|
|
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
|
+/* end confdefs.h. */
|
|
+#include <unistd.h>
|
|
+int
|
|
+main ()
|
|
+{
|
|
+int pid = getpid ();
|
|
+ ;
|
|
+ return 0;
|
|
+}
|
|
+_ACEOF
|
|
+if ac_fn_c_try_compile "$LINENO"; then :
|
|
+
|
|
+$as_echo "#define HAVE_GETPID 1" >>confdefs.h
|
|
+
|
|
+fi
|
|
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
|
|
+
|
|
# See if we support thread-local storage.
|
|
|
|
|
|
--- libgomp/Makefile.am.jj 2018-04-25 09:40:31.926655587 +0200
|
|
+++ libgomp/Makefile.am 2019-05-07 19:59:03.683989317 +0200
|
|
@@ -63,12 +63,13 @@ libgomp_la_SOURCES = alloc.c atomic.c ba
|
|
parallel.c sections.c single.c task.c team.c work.c lock.c mutex.c \
|
|
proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
|
|
splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
|
|
- oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c
|
|
+ oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
|
|
+ affinity-fmt.c teams.c
|
|
|
|
include $(top_srcdir)/plugin/Makefrag.am
|
|
|
|
if USE_FORTRAN
|
|
-libgomp_la_SOURCES += openacc.f90
|
|
+libgomp_la_SOURCES += openacc2.f90
|
|
endif
|
|
|
|
nodist_noinst_HEADERS = libgomp_f.h
|
|
@@ -87,8 +88,6 @@ omp_lib_kinds.mod: omp_lib.mod
|
|
:
|
|
openacc_kinds.mod: openacc.mod
|
|
:
|
|
-openacc.mod: openacc.lo
|
|
- :
|
|
%.mod: %.f90
|
|
$(FC) $(FCFLAGS) -fsyntax-only $<
|
|
fortran.lo: libgomp_f.h
|
|
--- libgomp/oacc-mem.c.jj 2018-04-25 09:40:31.924655586 +0200
|
|
+++ libgomp/oacc-mem.c 2019-05-07 18:46:36.530109672 +0200
|
|
@@ -153,8 +153,9 @@ acc_free (void *d)
|
|
gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
|
|
}
|
|
|
|
-void
|
|
-acc_memcpy_to_device (void *d, void *h, size_t s)
|
|
+static void
|
|
+memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
|
|
+ const char *libfnname)
|
|
{
|
|
/* No need to call lazy open here, as the device pointer must have
|
|
been obtained from a routine that did that. */
|
|
@@ -164,31 +165,49 @@ acc_memcpy_to_device (void *d, void *h,
|
|
|
|
if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
|
{
|
|
- memmove (d, h, s);
|
|
+ if (from)
|
|
+ memmove (h, d, s);
|
|
+ else
|
|
+ memmove (d, h, s);
|
|
return;
|
|
}
|
|
|
|
- if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s))
|
|
- gomp_fatal ("error in %s", __FUNCTION__);
|
|
+ if (async > acc_async_sync)
|
|
+ thr->dev->openacc.async_set_async_func (async);
|
|
+
|
|
+ bool ret = (from
|
|
+ ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s)
|
|
+ : thr->dev->host2dev_func (thr->dev->target_id, d, h, s));
|
|
+
|
|
+ if (async > acc_async_sync)
|
|
+ thr->dev->openacc.async_set_async_func (acc_async_sync);
|
|
+
|
|
+ if (!ret)
|
|
+ gomp_fatal ("error in %s", libfnname);
|
|
}
|
|
|
|
void
|
|
-acc_memcpy_from_device (void *h, void *d, size_t s)
|
|
+acc_memcpy_to_device (void *d, void *h, size_t s)
|
|
{
|
|
- /* No need to call lazy open here, as the device pointer must have
|
|
- been obtained from a routine that did that. */
|
|
- struct goacc_thread *thr = goacc_thread ();
|
|
+ memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__);
|
|
+}
|
|
|
|
- assert (thr && thr->dev);
|
|
+void
|
|
+acc_memcpy_to_device_async (void *d, void *h, size_t s, int async)
|
|
+{
|
|
+ memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__);
|
|
+}
|
|
|
|
- if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
|
|
- {
|
|
- memmove (h, d, s);
|
|
- return;
|
|
- }
|
|
+void
|
|
+acc_memcpy_from_device (void *h, void *d, size_t s)
|
|
+{
|
|
+ memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__);
|
|
+}
|
|
|
|
- if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s))
|
|
- gomp_fatal ("error in %s", __FUNCTION__);
|
|
+void
|
|
+acc_memcpy_from_device_async (void *h, void *d, size_t s, int async)
|
|
+{
|
|
+ memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__);
|
|
}
|
|
|
|
/* Return the device pointer that corresponds to host data H. Or NULL
|
|
@@ -347,6 +366,7 @@ acc_map_data (void *h, void *d, size_t s
|
|
|
|
tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
|
|
&kinds, true, GOMP_MAP_VARS_OPENACC);
|
|
+ tgt->list[0].key->refcount = REFCOUNT_INFINITY;
|
|
}
|
|
|
|
gomp_mutex_lock (&acc_dev->lock);
|
|
@@ -389,6 +409,9 @@ acc_unmap_data (void *h)
|
|
(void *) n->host_start, (int) host_size, (void *) h);
|
|
}
|
|
|
|
+ /* Mark for removal. */
|
|
+ n->refcount = 1;
|
|
+
|
|
t = n->tgt;
|
|
|
|
if (t->refcount == 2)
|
|
@@ -424,7 +447,7 @@ acc_unmap_data (void *h)
|
|
#define FLAG_COPY (1 << 2)
|
|
|
|
static void *
|
|
-present_create_copy (unsigned f, void *h, size_t s)
|
|
+present_create_copy (unsigned f, void *h, size_t s, int async)
|
|
{
|
|
void *d;
|
|
splay_tree_key n;
|
|
@@ -460,6 +483,11 @@ present_create_copy (unsigned f, void *h
|
|
gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s);
|
|
}
|
|
|
|
+ if (n->refcount != REFCOUNT_INFINITY)
|
|
+ {
|
|
+ n->refcount++;
|
|
+ n->dynamic_refcount++;
|
|
+ }
|
|
gomp_mutex_unlock (&acc_dev->lock);
|
|
}
|
|
else if (!(f & FLAG_CREATE))
|
|
@@ -481,8 +509,16 @@ present_create_copy (unsigned f, void *h
|
|
|
|
gomp_mutex_unlock (&acc_dev->lock);
|
|
|
|
+ if (async > acc_async_sync)
|
|
+ acc_dev->openacc.async_set_async_func (async);
|
|
+
|
|
tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
|
|
GOMP_MAP_VARS_OPENACC);
|
|
+ /* Initialize dynamic refcount. */
|
|
+ tgt->list[0].key->dynamic_refcount = 1;
|
|
+
|
|
+ if (async > acc_async_sync)
|
|
+ acc_dev->openacc.async_set_async_func (acc_async_sync);
|
|
|
|
gomp_mutex_lock (&acc_dev->lock);
|
|
|
|
@@ -499,53 +535,71 @@ present_create_copy (unsigned f, void *h
|
|
void *
|
|
acc_create (void *h, size_t s)
|
|
{
|
|
- return present_create_copy (FLAG_CREATE, h, s);
|
|
+ return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync);
|
|
}
|
|
|
|
-void *
|
|
-acc_copyin (void *h, size_t s)
|
|
+void
|
|
+acc_create_async (void *h, size_t s, int async)
|
|
{
|
|
- return present_create_copy (FLAG_CREATE | FLAG_COPY, h, s);
|
|
+ present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async);
|
|
}
|
|
|
|
+/* acc_present_or_create used to be what acc_create is now. */
|
|
+/* acc_pcreate is acc_present_or_create by a different name. */
|
|
+#ifdef HAVE_ATTRIBUTE_ALIAS
|
|
+strong_alias (acc_create, acc_present_or_create)
|
|
+strong_alias (acc_create, acc_pcreate)
|
|
+#else
|
|
void *
|
|
acc_present_or_create (void *h, size_t s)
|
|
{
|
|
- return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s);
|
|
+ return acc_create (h, s);
|
|
}
|
|
|
|
-/* acc_pcreate is acc_present_or_create by a different name. */
|
|
-#ifdef HAVE_ATTRIBUTE_ALIAS
|
|
-strong_alias (acc_present_or_create, acc_pcreate)
|
|
-#else
|
|
void *
|
|
acc_pcreate (void *h, size_t s)
|
|
{
|
|
- return acc_present_or_create (h, s);
|
|
+ return acc_create (h, s);
|
|
}
|
|
#endif
|
|
|
|
void *
|
|
-acc_present_or_copyin (void *h, size_t s)
|
|
+acc_copyin (void *h, size_t s)
|
|
+{
|
|
+ return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s,
|
|
+ acc_async_sync);
|
|
+}
|
|
+
|
|
+void
|
|
+acc_copyin_async (void *h, size_t s, int async)
|
|
{
|
|
- return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s);
|
|
+ present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async);
|
|
}
|
|
|
|
+/* acc_present_or_copyin used to be what acc_copyin is now. */
|
|
/* acc_pcopyin is acc_present_or_copyin by a different name. */
|
|
#ifdef HAVE_ATTRIBUTE_ALIAS
|
|
-strong_alias (acc_present_or_copyin, acc_pcopyin)
|
|
+strong_alias (acc_copyin, acc_present_or_copyin)
|
|
+strong_alias (acc_copyin, acc_pcopyin)
|
|
#else
|
|
void *
|
|
+acc_present_or_copyin (void *h, size_t s)
|
|
+{
|
|
+ return acc_copyin (h, s);
|
|
+}
|
|
+
|
|
+void *
|
|
acc_pcopyin (void *h, size_t s)
|
|
{
|
|
- return acc_present_or_copyin (h, s);
|
|
+ return acc_copyin (h, s);
|
|
}
|
|
#endif
|
|
|
|
-#define FLAG_COPYOUT (1 << 0)
|
|
+#define FLAG_COPYOUT (1 << 0)
|
|
+#define FLAG_FINALIZE (1 << 1)
|
|
|
|
static void
|
|
-delete_copyout (unsigned f, void *h, size_t s, const char *libfnname)
|
|
+delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
|
|
{
|
|
size_t host_size;
|
|
splay_tree_key n;
|
|
@@ -581,31 +635,111 @@ delete_copyout (unsigned f, void *h, siz
|
|
(void *) n->host_start, (int) host_size, (void *) h, (int) s);
|
|
}
|
|
|
|
- gomp_mutex_unlock (&acc_dev->lock);
|
|
+ if (n->refcount == REFCOUNT_INFINITY)
|
|
+ {
|
|
+ n->refcount = 0;
|
|
+ n->dynamic_refcount = 0;
|
|
+ }
|
|
+ if (n->refcount < n->dynamic_refcount)
|
|
+ {
|
|
+ gomp_mutex_unlock (&acc_dev->lock);
|
|
+ gomp_fatal ("Dynamic reference counting assert fail\n");
|
|
+ }
|
|
|
|
- if (f & FLAG_COPYOUT)
|
|
- acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
|
|
+ if (f & FLAG_FINALIZE)
|
|
+ {
|
|
+ n->refcount -= n->dynamic_refcount;
|
|
+ n->dynamic_refcount = 0;
|
|
+ }
|
|
+ else if (n->dynamic_refcount)
|
|
+ {
|
|
+ n->dynamic_refcount--;
|
|
+ n->refcount--;
|
|
+ }
|
|
+
|
|
+ if (n->refcount == 0)
|
|
+ {
|
|
+ if (n->tgt->refcount == 2)
|
|
+ {
|
|
+ struct target_mem_desc *tp, *t;
|
|
+ for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
|
|
+ tp = t, t = t->prev)
|
|
+ if (n->tgt == t)
|
|
+ {
|
|
+ if (tp)
|
|
+ tp->prev = t->prev;
|
|
+ else
|
|
+ acc_dev->openacc.data_environ = t->prev;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (f & FLAG_COPYOUT)
|
|
+ {
|
|
+ if (async > acc_async_sync)
|
|
+ acc_dev->openacc.async_set_async_func (async);
|
|
+ acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
|
|
+ if (async > acc_async_sync)
|
|
+ acc_dev->openacc.async_set_async_func (acc_async_sync);
|
|
+ }
|
|
|
|
- acc_unmap_data (h);
|
|
+ gomp_remove_var (acc_dev, n);
|
|
+ }
|
|
|
|
- if (!acc_dev->free_func (acc_dev->target_id, d))
|
|
- gomp_fatal ("error in freeing device memory in %s", libfnname);
|
|
+ gomp_mutex_unlock (&acc_dev->lock);
|
|
}
|
|
|
|
void
|
|
acc_delete (void *h , size_t s)
|
|
{
|
|
- delete_copyout (0, h, s, __FUNCTION__);
|
|
+ delete_copyout (0, h, s, acc_async_sync, __FUNCTION__);
|
|
+}
|
|
+
|
|
+void
|
|
+acc_delete_async (void *h , size_t s, int async)
|
|
+{
|
|
+ delete_copyout (0, h, s, async, __FUNCTION__);
|
|
+}
|
|
+
|
|
+void
|
|
+acc_delete_finalize (void *h , size_t s)
|
|
+{
|
|
+ delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__);
|
|
+}
|
|
+
|
|
+void
|
|
+acc_delete_finalize_async (void *h , size_t s, int async)
|
|
+{
|
|
+ delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__);
|
|
}
|
|
|
|
void
|
|
acc_copyout (void *h, size_t s)
|
|
{
|
|
- delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__);
|
|
+ delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__);
|
|
+}
|
|
+
|
|
+void
|
|
+acc_copyout_async (void *h, size_t s, int async)
|
|
+{
|
|
+ delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__);
|
|
+}
|
|
+
|
|
+void
|
|
+acc_copyout_finalize (void *h, size_t s)
|
|
+{
|
|
+ delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, acc_async_sync,
|
|
+ __FUNCTION__);
|
|
+}
|
|
+
|
|
+void
|
|
+acc_copyout_finalize_async (void *h, size_t s, int async)
|
|
+{
|
|
+ delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__);
|
|
}
|
|
|
|
static void
|
|
-update_dev_host (int is_dev, void *h, size_t s)
|
|
+update_dev_host (int is_dev, void *h, size_t s, int async)
|
|
{
|
|
splay_tree_key n;
|
|
void *d;
|
|
@@ -631,24 +765,42 @@ update_dev_host (int is_dev, void *h, si
|
|
d = (void *) (n->tgt->tgt_start + n->tgt_offset
|
|
+ (uintptr_t) h - n->host_start);
|
|
|
|
+ if (async > acc_async_sync)
|
|
+ acc_dev->openacc.async_set_async_func (async);
|
|
+
|
|
if (is_dev)
|
|
acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
|
|
else
|
|
acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
|
|
|
|
+ if (async > acc_async_sync)
|
|
+ acc_dev->openacc.async_set_async_func (acc_async_sync);
|
|
+
|
|
gomp_mutex_unlock (&acc_dev->lock);
|
|
}
|
|
|
|
void
|
|
acc_update_device (void *h, size_t s)
|
|
{
|
|
- update_dev_host (1, h, s);
|
|
+ update_dev_host (1, h, s, acc_async_sync);
|
|
+}
|
|
+
|
|
+void
|
|
+acc_update_device_async (void *h, size_t s, int async)
|
|
+{
|
|
+ update_dev_host (1, h, s, async);
|
|
}
|
|
|
|
void
|
|
acc_update_self (void *h, size_t s)
|
|
{
|
|
- update_dev_host (0, h, s);
|
|
+ update_dev_host (0, h, s, acc_async_sync);
|
|
+}
|
|
+
|
|
+void
|
|
+acc_update_self_async (void *h, size_t s, int async)
|
|
+{
|
|
+ update_dev_host (0, h, s, async);
|
|
}
|
|
|
|
void
|
|
@@ -659,11 +811,37 @@ gomp_acc_insert_pointer (size_t mapnum,
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
struct gomp_device_descr *acc_dev = thr->dev;
|
|
|
|
+ if (acc_is_present (*hostaddrs, *sizes))
|
|
+ {
|
|
+ splay_tree_key n;
|
|
+ gomp_mutex_lock (&acc_dev->lock);
|
|
+ n = lookup_host (acc_dev, *hostaddrs, *sizes);
|
|
+ gomp_mutex_unlock (&acc_dev->lock);
|
|
+
|
|
+ tgt = n->tgt;
|
|
+ for (size_t i = 0; i < tgt->list_count; i++)
|
|
+ if (tgt->list[i].key == n)
|
|
+ {
|
|
+ for (size_t j = 0; j < mapnum; j++)
|
|
+ if (i + j < tgt->list_count && tgt->list[i + j].key)
|
|
+ {
|
|
+ tgt->list[i + j].key->refcount++;
|
|
+ tgt->list[i + j].key->dynamic_refcount++;
|
|
+ }
|
|
+ return;
|
|
+ }
|
|
+ /* Should not reach here. */
|
|
+ gomp_fatal ("Dynamic refcount incrementing failed for pointer/pset");
|
|
+ }
|
|
+
|
|
gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__);
|
|
tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs,
|
|
NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
|
|
gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__);
|
|
|
|
+ /* Initialize dynamic refcount. */
|
|
+ tgt->list[0].key->dynamic_refcount = 1;
|
|
+
|
|
gomp_mutex_lock (&acc_dev->lock);
|
|
tgt->prev = acc_dev->openacc.data_environ;
|
|
acc_dev->openacc.data_environ = tgt;
|
|
@@ -671,7 +849,8 @@ gomp_acc_insert_pointer (size_t mapnum,
|
|
}
|
|
|
|
void
|
|
-gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
|
|
+gomp_acc_remove_pointer (void *h, size_t s, bool force_copyfrom, int async,
|
|
+ int finalize, int mapnum)
|
|
{
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
struct gomp_device_descr *acc_dev = thr->dev;
|
|
@@ -679,6 +858,9 @@ gomp_acc_remove_pointer (void *h, bool f
|
|
struct target_mem_desc *t;
|
|
int minrefs = (mapnum == 1) ? 2 : 3;
|
|
|
|
+ if (!acc_is_present (h, s))
|
|
+ return;
|
|
+
|
|
gomp_mutex_lock (&acc_dev->lock);
|
|
|
|
n = lookup_host (acc_dev, h, 1);
|
|
@@ -693,40 +875,65 @@ gomp_acc_remove_pointer (void *h, bool f
|
|
|
|
t = n->tgt;
|
|
|
|
- struct target_mem_desc *tp;
|
|
+ if (n->refcount < n->dynamic_refcount)
|
|
+ {
|
|
+ gomp_mutex_unlock (&acc_dev->lock);
|
|
+ gomp_fatal ("Dynamic reference counting assert fail\n");
|
|
+ }
|
|
|
|
- if (t->refcount == minrefs)
|
|
+ if (finalize)
|
|
{
|
|
- /* This is the last reference, so pull the descriptor off the
|
|
- chain. This avoids gomp_unmap_vars via gomp_unmap_tgt from
|
|
- freeing the device memory. */
|
|
- t->tgt_end = 0;
|
|
- t->to_free = 0;
|
|
+ n->refcount -= n->dynamic_refcount;
|
|
+ n->dynamic_refcount = 0;
|
|
+ }
|
|
+ else if (n->dynamic_refcount)
|
|
+ {
|
|
+ n->dynamic_refcount--;
|
|
+ n->refcount--;
|
|
+ }
|
|
|
|
- for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
|
|
- tp = t, t = t->prev)
|
|
+ gomp_mutex_unlock (&acc_dev->lock);
|
|
+
|
|
+ if (n->refcount == 0)
|
|
+ {
|
|
+ if (t->refcount == minrefs)
|
|
{
|
|
- if (n->tgt == t)
|
|
+ /* This is the last reference, so pull the descriptor off the
|
|
+ chain. This prevents gomp_unmap_vars via gomp_unmap_tgt from
|
|
+ freeing the device memory. */
|
|
+ struct target_mem_desc *tp;
|
|
+ for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
|
|
+ tp = t, t = t->prev)
|
|
{
|
|
- if (tp)
|
|
- tp->prev = t->prev;
|
|
- else
|
|
- acc_dev->openacc.data_environ = t->prev;
|
|
- break;
|
|
+ if (n->tgt == t)
|
|
+ {
|
|
+ if (tp)
|
|
+ tp->prev = t->prev;
|
|
+ else
|
|
+ acc_dev->openacc.data_environ = t->prev;
|
|
+ break;
|
|
+ }
|
|
}
|
|
}
|
|
- }
|
|
|
|
- if (force_copyfrom)
|
|
- t->list[0].copy_from = 1;
|
|
+ /* Set refcount to 1 to allow gomp_unmap_vars to unmap it. */
|
|
+ n->refcount = 1;
|
|
+ t->refcount = minrefs;
|
|
+ for (size_t i = 0; i < t->list_count; i++)
|
|
+ if (t->list[i].key == n)
|
|
+ {
|
|
+ t->list[i].copy_from = force_copyfrom ? 1 : 0;
|
|
+ break;
|
|
+ }
|
|
|
|
- gomp_mutex_unlock (&acc_dev->lock);
|
|
+ /* If running synchronously, unmap immediately. */
|
|
+ if (async < acc_async_noval)
|
|
+ gomp_unmap_vars (t, true);
|
|
+ else
|
|
+ t->device_descr->openacc.register_async_cleanup_func (t, async);
|
|
+ }
|
|
|
|
- /* If running synchronously, unmap immediately. */
|
|
- if (async < acc_async_noval)
|
|
- gomp_unmap_vars (t, true);
|
|
- else
|
|
- t->device_descr->openacc.register_async_cleanup_func (t, async);
|
|
+ gomp_mutex_unlock (&acc_dev->lock);
|
|
|
|
gomp_debug (0, " %s: mappings restored\n", __FUNCTION__);
|
|
}
|
|
--- libgomp/env.c.jj 2018-04-25 09:40:31.924655586 +0200
|
|
+++ libgomp/env.c 2019-05-07 18:46:36.482110438 +0200
|
|
@@ -88,8 +88,12 @@ void **gomp_places_list;
|
|
unsigned long gomp_places_list_len;
|
|
int gomp_debug_var;
|
|
unsigned int gomp_num_teams_var;
|
|
+bool gomp_display_affinity_var;
|
|
+char *gomp_affinity_format_var = "level %L thread %i affinity %A";
|
|
+size_t gomp_affinity_format_len;
|
|
char *goacc_device_type;
|
|
int goacc_device_num;
|
|
+int goacc_default_dims[GOMP_DIM_MAX];
|
|
|
|
#ifndef LIBGOMP_OFFLOADED_ONLY
|
|
|
|
@@ -100,6 +104,7 @@ parse_schedule (void)
|
|
{
|
|
char *env, *end;
|
|
unsigned long value;
|
|
+ int monotonic = 0;
|
|
|
|
env = getenv ("OMP_SCHEDULE");
|
|
if (env == NULL)
|
|
@@ -107,6 +112,26 @@ parse_schedule (void)
|
|
|
|
while (isspace ((unsigned char) *env))
|
|
++env;
|
|
+ if (strncasecmp (env, "monotonic", 9) == 0)
|
|
+ {
|
|
+ monotonic = 1;
|
|
+ env += 9;
|
|
+ }
|
|
+ else if (strncasecmp (env, "nonmonotonic", 12) == 0)
|
|
+ {
|
|
+ monotonic = -1;
|
|
+ env += 12;
|
|
+ }
|
|
+ if (monotonic)
|
|
+ {
|
|
+ while (isspace ((unsigned char) *env))
|
|
+ ++env;
|
|
+ if (*env != ':')
|
|
+ goto unknown;
|
|
+ ++env;
|
|
+ while (isspace ((unsigned char) *env))
|
|
+ ++env;
|
|
+ }
|
|
if (strncasecmp (env, "static", 6) == 0)
|
|
{
|
|
gomp_global_icv.run_sched_var = GFS_STATIC;
|
|
@@ -130,12 +155,16 @@ parse_schedule (void)
|
|
else
|
|
goto unknown;
|
|
|
|
+ if (monotonic == 1
|
|
+ || (monotonic == 0 && gomp_global_icv.run_sched_var == GFS_STATIC))
|
|
+ gomp_global_icv.run_sched_var |= GFS_MONOTONIC;
|
|
+
|
|
while (isspace ((unsigned char) *env))
|
|
++env;
|
|
if (*env == '\0')
|
|
{
|
|
gomp_global_icv.run_sched_chunk_size
|
|
- = gomp_global_icv.run_sched_var != GFS_STATIC;
|
|
+ = (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) != GFS_STATIC;
|
|
return;
|
|
}
|
|
if (*env++ != ',')
|
|
@@ -158,7 +187,8 @@ parse_schedule (void)
|
|
if ((int)value != value)
|
|
goto invalid;
|
|
|
|
- if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC)
|
|
+ if (value == 0
|
|
+ && (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) != GFS_STATIC)
|
|
value = 1;
|
|
gomp_global_icv.run_sched_chunk_size = value;
|
|
return;
|
|
@@ -1066,6 +1096,36 @@ parse_acc_device_type (void)
|
|
}
|
|
|
|
static void
|
|
+parse_gomp_openacc_dim (void)
|
|
+{
|
|
+ /* The syntax is the same as for the -fopenacc-dim compilation option. */
|
|
+ const char *var_name = "GOMP_OPENACC_DIM";
|
|
+ const char *env_var = getenv (var_name);
|
|
+ if (!env_var)
|
|
+ return;
|
|
+
|
|
+ const char *pos = env_var;
|
|
+ int i;
|
|
+ for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
|
|
+ {
|
|
+ if (i && *pos++ != ':')
|
|
+ break;
|
|
+
|
|
+ if (*pos == ':')
|
|
+ continue;
|
|
+
|
|
+ const char *eptr;
|
|
+ errno = 0;
|
|
+ long val = strtol (pos, (char **)&eptr, 10);
|
|
+ if (errno || val < 0 || (unsigned)val != val)
|
|
+ break;
|
|
+
|
|
+ goacc_default_dims[i] = (int)val;
|
|
+ pos = eptr;
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
handle_omp_display_env (unsigned long stacksize, int wait_policy)
|
|
{
|
|
const char *env;
|
|
@@ -1119,19 +1179,34 @@ handle_omp_display_env (unsigned long st
|
|
fputs ("'\n", stderr);
|
|
|
|
fprintf (stderr, " OMP_SCHEDULE = '");
|
|
- switch (gomp_global_icv.run_sched_var)
|
|
+ if ((gomp_global_icv.run_sched_var & GFS_MONOTONIC))
|
|
+ {
|
|
+ if (gomp_global_icv.run_sched_var != (GFS_MONOTONIC | GFS_STATIC))
|
|
+ fputs ("MONOTONIC:", stderr);
|
|
+ }
|
|
+ else if (gomp_global_icv.run_sched_var == GFS_STATIC)
|
|
+ fputs ("NONMONOTONIC:", stderr);
|
|
+ switch (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC)
|
|
{
|
|
case GFS_RUNTIME:
|
|
fputs ("RUNTIME", stderr);
|
|
+ if (gomp_global_icv.run_sched_chunk_size != 1)
|
|
+ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size);
|
|
break;
|
|
case GFS_STATIC:
|
|
fputs ("STATIC", stderr);
|
|
+ if (gomp_global_icv.run_sched_chunk_size != 0)
|
|
+ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size);
|
|
break;
|
|
case GFS_DYNAMIC:
|
|
fputs ("DYNAMIC", stderr);
|
|
+ if (gomp_global_icv.run_sched_chunk_size != 1)
|
|
+ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size);
|
|
break;
|
|
case GFS_GUIDED:
|
|
fputs ("GUIDED", stderr);
|
|
+ if (gomp_global_icv.run_sched_chunk_size != 1)
|
|
+ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size);
|
|
break;
|
|
case GFS_AUTO:
|
|
fputs ("AUTO", stderr);
|
|
@@ -1197,6 +1272,10 @@ handle_omp_display_env (unsigned long st
|
|
gomp_global_icv.default_device_var);
|
|
fprintf (stderr, " OMP_MAX_TASK_PRIORITY = '%d'\n",
|
|
gomp_max_task_priority_var);
|
|
+ fprintf (stderr, " OMP_DISPLAY_AFFINITY = '%s'\n",
|
|
+ gomp_display_affinity_var ? "TRUE" : "FALSE");
|
|
+ fprintf (stderr, " OMP_AFFINITY_FORMAT = '%s'\n",
|
|
+ gomp_affinity_format_var);
|
|
|
|
if (verbose)
|
|
{
|
|
@@ -1228,6 +1307,7 @@ initialize_env (void)
|
|
parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var);
|
|
parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var);
|
|
parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var);
|
|
+ parse_boolean ("OMP_DISPLAY_AFFINITY", &gomp_display_affinity_var);
|
|
parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true);
|
|
parse_int ("OMP_MAX_TASK_PRIORITY", &gomp_max_task_priority_var, true);
|
|
parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var,
|
|
@@ -1277,6 +1357,13 @@ initialize_env (void)
|
|
}
|
|
if (gomp_global_icv.bind_var != omp_proc_bind_false)
|
|
gomp_init_affinity ();
|
|
+
|
|
+ {
|
|
+ const char *env = getenv ("OMP_AFFINITY_FORMAT");
|
|
+ if (env != NULL)
|
|
+ gomp_set_affinity_format (env, strlen (env));
|
|
+ }
|
|
+
|
|
wait_policy = parse_wait_policy ();
|
|
if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
|
|
{
|
|
@@ -1302,7 +1389,6 @@ initialize_env (void)
|
|
|
|
/* Not strictly environment related, but ordering constructors is tricky. */
|
|
pthread_attr_init (&gomp_thread_attr);
|
|
- pthread_attr_setdetachstate (&gomp_thread_attr, PTHREAD_CREATE_DETACHED);
|
|
|
|
if (parse_stacksize ("OMP_STACKSIZE", &stacksize)
|
|
|| parse_stacksize ("GOMP_STACKSIZE", &stacksize)
|
|
@@ -1336,6 +1422,7 @@ initialize_env (void)
|
|
goacc_device_num = 0;
|
|
|
|
parse_acc_device_type ();
|
|
+ parse_gomp_openacc_dim ();
|
|
|
|
goacc_runtime_initialize ();
|
|
}
|
|
--- libgomp/fortran.c.jj 2018-04-25 09:40:31.913655581 +0200
|
|
+++ libgomp/fortran.c 2019-05-07 18:46:36.491110295 +0200
|
|
@@ -28,6 +28,8 @@
|
|
#include "libgomp.h"
|
|
#include "libgomp_f.h"
|
|
#include <stdlib.h>
|
|
+#include <stdio.h>
|
|
+#include <string.h>
|
|
#include <limits.h>
|
|
|
|
#ifdef HAVE_ATTRIBUTE_ALIAS
|
|
@@ -82,6 +84,8 @@ ialias_redirect (omp_get_team_num)
|
|
ialias_redirect (omp_is_initial_device)
|
|
ialias_redirect (omp_get_initial_device)
|
|
ialias_redirect (omp_get_max_task_priority)
|
|
+ialias_redirect (omp_pause_resource)
|
|
+ialias_redirect (omp_pause_resource_all)
|
|
#endif
|
|
|
|
#ifndef LIBGOMP_GNU_SYMBOL_VERSIONING
|
|
@@ -368,7 +372,9 @@ omp_get_schedule_ (int32_t *kind, int32_
|
|
omp_sched_t k;
|
|
int cs;
|
|
omp_get_schedule (&k, &cs);
|
|
- *kind = k;
|
|
+ /* For now mask off GFS_MONOTONIC, because OpenMP 4.5 code will not
|
|
+ expect to see it. */
|
|
+ *kind = k & ~GFS_MONOTONIC;
|
|
*chunk_size = cs;
|
|
}
|
|
|
|
@@ -378,7 +384,8 @@ omp_get_schedule_8_ (int32_t *kind, int6
|
|
omp_sched_t k;
|
|
int cs;
|
|
omp_get_schedule (&k, &cs);
|
|
- *kind = k;
|
|
+ /* See above. */
|
|
+ *kind = k & ~GFS_MONOTONIC;
|
|
*chunk_size = cs;
|
|
}
|
|
|
|
@@ -576,3 +583,96 @@ omp_get_max_task_priority_ (void)
|
|
{
|
|
return omp_get_max_task_priority ();
|
|
}
|
|
+
|
|
+void
|
|
+omp_set_affinity_format_ (const char *format, size_t format_len)
|
|
+{
|
|
+ gomp_set_affinity_format (format, format_len);
|
|
+}
|
|
+
|
|
+int32_t
|
|
+omp_get_affinity_format_ (char *buffer, size_t buffer_len)
|
|
+{
|
|
+ size_t len = strlen (gomp_affinity_format_var);
|
|
+ if (buffer_len)
|
|
+ {
|
|
+ if (len < buffer_len)
|
|
+ {
|
|
+ memcpy (buffer, gomp_affinity_format_var, len);
|
|
+ memset (buffer + len, ' ', buffer_len - len);
|
|
+ }
|
|
+ else
|
|
+ memcpy (buffer, gomp_affinity_format_var, buffer_len);
|
|
+ }
|
|
+ return len;
|
|
+}
|
|
+
|
|
+void
|
|
+omp_display_affinity_ (const char *format, size_t format_len)
|
|
+{
|
|
+ char *fmt = NULL, fmt_buf[256];
|
|
+ char buf[512];
|
|
+ if (format_len)
|
|
+ {
|
|
+ fmt = format_len < 256 ? fmt_buf : gomp_malloc (format_len + 1);
|
|
+ memcpy (fmt, format, format_len);
|
|
+ fmt[format_len] = '\0';
|
|
+ }
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ size_t ret
|
|
+ = gomp_display_affinity (buf, sizeof buf,
|
|
+ format_len ? fmt : gomp_affinity_format_var,
|
|
+ gomp_thread_self (), &thr->ts, thr->place);
|
|
+ if (ret < sizeof buf)
|
|
+ {
|
|
+ buf[ret] = '\n';
|
|
+ gomp_print_string (buf, ret + 1);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ char *b = gomp_malloc (ret + 1);
|
|
+ gomp_display_affinity (b, ret + 1,
|
|
+ format_len ? fmt : gomp_affinity_format_var,
|
|
+ gomp_thread_self (), &thr->ts, thr->place);
|
|
+ b[ret] = '\n';
|
|
+ gomp_print_string (b, ret + 1);
|
|
+ free (b);
|
|
+ }
|
|
+ if (fmt && fmt != fmt_buf)
|
|
+ free (fmt);
|
|
+}
|
|
+
|
|
+int32_t
|
|
+omp_capture_affinity_ (char *buffer, const char *format,
|
|
+ size_t buffer_len, size_t format_len)
|
|
+{
|
|
+ char *fmt = NULL, fmt_buf[256];
|
|
+ if (format_len)
|
|
+ {
|
|
+ fmt = format_len < 256 ? fmt_buf : gomp_malloc (format_len + 1);
|
|
+ memcpy (fmt, format, format_len);
|
|
+ fmt[format_len] = '\0';
|
|
+ }
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ size_t ret
|
|
+ = gomp_display_affinity (buffer, buffer_len,
|
|
+ format_len ? fmt : gomp_affinity_format_var,
|
|
+ gomp_thread_self (), &thr->ts, thr->place);
|
|
+ if (fmt && fmt != fmt_buf)
|
|
+ free (fmt);
|
|
+ if (ret < buffer_len)
|
|
+ memset (buffer + ret, ' ', buffer_len - ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int32_t
|
|
+omp_pause_resource_ (const int32_t *kind, const int32_t *device_num)
|
|
+{
|
|
+ return omp_pause_resource (*kind, *device_num);
|
|
+}
|
|
+
|
|
+int32_t
|
|
+omp_pause_resource_all_ (const int32_t *kind)
|
|
+{
|
|
+ return omp_pause_resource_all (*kind);
|
|
+}
|
|
--- libgomp/configure.tgt.jj 2018-04-25 09:40:31.925655587 +0200
|
|
+++ libgomp/configure.tgt 2019-05-07 18:46:36.479110486 +0200
|
|
@@ -18,7 +18,7 @@ if test $gcc_cv_have_tls = yes ; then
|
|
;;
|
|
|
|
*-*-linux* | *-*-gnu*)
|
|
- XCFLAGS="${XCFLAGS} -ftls-model=initial-exec"
|
|
+ XCFLAGS="${XCFLAGS} -ftls-model=initial-exec -DUSING_INITIAL_EXEC_TLS"
|
|
;;
|
|
|
|
*-*-rtems*)
|
|
--- libgomp/icv-device.c.jj 2018-04-25 09:40:31.925655587 +0200
|
|
+++ libgomp/icv-device.c 2019-05-07 18:46:36.513109943 +0200
|
|
@@ -49,20 +49,6 @@ omp_get_num_devices (void)
|
|
}
|
|
|
|
int
|
|
-omp_get_num_teams (void)
|
|
-{
|
|
- /* Hardcoded to 1 on host, MIC, HSAIL? Maybe variable on PTX. */
|
|
- return 1;
|
|
-}
|
|
-
|
|
-int
|
|
-omp_get_team_num (void)
|
|
-{
|
|
- /* Hardcoded to 0 on host, MIC, HSAIL? Maybe variable on PTX. */
|
|
- return 0;
|
|
-}
|
|
-
|
|
-int
|
|
omp_is_initial_device (void)
|
|
{
|
|
/* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX. */
|
|
@@ -72,6 +58,4 @@ omp_is_initial_device (void)
|
|
ialias (omp_set_default_device)
|
|
ialias (omp_get_default_device)
|
|
ialias (omp_get_num_devices)
|
|
-ialias (omp_get_num_teams)
|
|
-ialias (omp_get_team_num)
|
|
ialias (omp_is_initial_device)
|
|
--- libgomp/Makefile.in.jj 2018-04-25 09:40:31.320655306 +0200
|
|
+++ libgomp/Makefile.in 2019-05-07 20:00:01.082077522 +0200
|
|
@@ -90,7 +90,7 @@ DIST_COMMON = $(top_srcdir)/plugin/Makef
|
|
$(srcdir)/libgomp.spec.in $(srcdir)/../depcomp
|
|
@PLUGIN_NVPTX_TRUE@am__append_1 = libgomp-plugin-nvptx.la
|
|
@PLUGIN_HSA_TRUE@am__append_2 = libgomp-plugin-hsa.la
|
|
-@USE_FORTRAN_TRUE@am__append_3 = openacc.f90
|
|
+@USE_FORTRAN_TRUE@am__append_3 = openacc2.f90
|
|
subdir = .
|
|
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
|
am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \
|
|
@@ -172,7 +172,7 @@ libgomp_plugin_nvptx_la_LINK = $(LIBTOOL
|
|
@PLUGIN_NVPTX_TRUE@am_libgomp_plugin_nvptx_la_rpath = -rpath \
|
|
@PLUGIN_NVPTX_TRUE@ $(toolexeclibdir)
|
|
libgomp_la_LIBADD =
|
|
-@USE_FORTRAN_TRUE@am__objects_1 = openacc.lo
|
|
+@USE_FORTRAN_TRUE@am__objects_1 = openacc2.lo
|
|
am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
|
|
env.lo error.lo icv.lo icv-device.lo iter.lo iter_ull.lo \
|
|
loop.lo loop_ull.lo ordered.lo parallel.lo sections.lo \
|
|
@@ -180,7 +180,8 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.
|
|
sem.lo bar.lo ptrlock.lo time.lo fortran.lo affinity.lo \
|
|
target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
|
|
oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
|
|
- oacc-plugin.lo oacc-cuda.lo priority_queue.lo $(am__objects_1)
|
|
+ oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \
|
|
+ teams.lo $(am__objects_1)
|
|
libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
|
|
DEFAULT_INCLUDES = -I.@am__isrc@
|
|
depcomp = $(SHELL) $(top_srcdir)/../depcomp
|
|
@@ -380,6 +381,7 @@ mkdir_p = @mkdir_p@
|
|
multi_basedir = @multi_basedir@
|
|
offload_additional_lib_paths = @offload_additional_lib_paths@
|
|
offload_additional_options = @offload_additional_options@
|
|
+offload_plugins = @offload_plugins@
|
|
offload_targets = @offload_targets@
|
|
oldincludedir = @oldincludedir@
|
|
pdfdir = @pdfdir@
|
|
@@ -436,7 +438,7 @@ libgomp_la_SOURCES = alloc.c atomic.c ba
|
|
affinity.c target.c splay-tree.c libgomp-plugin.c \
|
|
oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
|
|
oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
|
|
- $(am__append_3)
|
|
+ affinity-fmt.c teams.c $(am__append_3)
|
|
|
|
# Nvidia PTX OpenACC plugin.
|
|
@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
|
|
@@ -599,6 +601,7 @@ mostlyclean-compile:
|
|
distclean-compile:
|
|
-rm -f *.tab.c
|
|
|
|
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity-fmt.Plo@am__quote@
|
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity.Plo@am__quote@
|
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alloc.Plo@am__quote@
|
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic.Plo@am__quote@
|
|
@@ -638,6 +641,7 @@ distclean-compile:
|
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@
|
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@
|
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@
|
|
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/teams.Plo@am__quote@
|
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/time.Plo@am__quote@
|
|
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/work.Plo@am__quote@
|
|
|
|
@@ -1292,8 +1296,6 @@ omp_lib_kinds.mod: omp_lib.mod
|
|
:
|
|
openacc_kinds.mod: openacc.mod
|
|
:
|
|
-openacc.mod: openacc.lo
|
|
- :
|
|
%.mod: %.f90
|
|
$(FC) $(FCFLAGS) -fsyntax-only $<
|
|
fortran.lo: libgomp_f.h
|
|
--- libgomp/plugin/cuda/cuda.h.jj 2018-04-25 09:40:31.914655581 +0200
|
|
+++ libgomp/plugin/cuda/cuda.h 2019-05-07 18:46:36.533109624 +0200
|
|
@@ -44,6 +44,7 @@ typedef void *CUevent;
|
|
typedef void *CUfunction;
|
|
typedef void *CUlinkState;
|
|
typedef void *CUmodule;
|
|
+typedef size_t (*CUoccupancyB2DSize)(int);
|
|
typedef void *CUstream;
|
|
|
|
typedef enum {
|
|
@@ -88,6 +89,7 @@ typedef enum {
|
|
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,
|
|
CU_JIT_ERROR_LOG_BUFFER = 5,
|
|
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,
|
|
+ CU_JIT_OPTIMIZATION_LEVEL = 7,
|
|
CU_JIT_LOG_VERBOSE = 12
|
|
} CUjit_option;
|
|
|
|
@@ -169,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr
|
|
CUresult cuModuleLoad (CUmodule *, const char *);
|
|
CUresult cuModuleLoadData (CUmodule *, const void *);
|
|
CUresult cuModuleUnload (CUmodule);
|
|
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
|
|
+ CUoccupancyB2DSize, size_t, int);
|
|
CUresult cuStreamCreate (CUstream *, unsigned);
|
|
#define cuStreamDestroy cuStreamDestroy_v2
|
|
CUresult cuStreamDestroy (CUstream);
|
|
--- libgomp/plugin/cuda-lib.def.jj 2019-05-07 18:46:36.533109624 +0200
|
|
+++ libgomp/plugin/cuda-lib.def 2019-05-07 18:46:36.533109624 +0200
|
|
@@ -0,0 +1,49 @@
|
|
+CUDA_ONE_CALL (cuCtxCreate)
|
|
+CUDA_ONE_CALL (cuCtxDestroy)
|
|
+CUDA_ONE_CALL (cuCtxGetCurrent)
|
|
+CUDA_ONE_CALL (cuCtxGetDevice)
|
|
+CUDA_ONE_CALL (cuCtxPopCurrent)
|
|
+CUDA_ONE_CALL (cuCtxPushCurrent)
|
|
+CUDA_ONE_CALL (cuCtxSynchronize)
|
|
+CUDA_ONE_CALL (cuDeviceGet)
|
|
+CUDA_ONE_CALL (cuDeviceGetAttribute)
|
|
+CUDA_ONE_CALL (cuDeviceGetCount)
|
|
+CUDA_ONE_CALL (cuEventCreate)
|
|
+CUDA_ONE_CALL (cuEventDestroy)
|
|
+CUDA_ONE_CALL (cuEventElapsedTime)
|
|
+CUDA_ONE_CALL (cuEventQuery)
|
|
+CUDA_ONE_CALL (cuEventRecord)
|
|
+CUDA_ONE_CALL (cuEventSynchronize)
|
|
+CUDA_ONE_CALL (cuFuncGetAttribute)
|
|
+CUDA_ONE_CALL_MAYBE_NULL (cuGetErrorString)
|
|
+CUDA_ONE_CALL (cuInit)
|
|
+CUDA_ONE_CALL (cuLaunchKernel)
|
|
+CUDA_ONE_CALL (cuLinkAddData)
|
|
+CUDA_ONE_CALL_MAYBE_NULL (cuLinkAddData_v2)
|
|
+CUDA_ONE_CALL (cuLinkComplete)
|
|
+CUDA_ONE_CALL (cuLinkCreate)
|
|
+CUDA_ONE_CALL_MAYBE_NULL (cuLinkCreate_v2)
|
|
+CUDA_ONE_CALL (cuLinkDestroy)
|
|
+CUDA_ONE_CALL (cuMemAlloc)
|
|
+CUDA_ONE_CALL (cuMemAllocHost)
|
|
+CUDA_ONE_CALL (cuMemcpy)
|
|
+CUDA_ONE_CALL (cuMemcpyDtoDAsync)
|
|
+CUDA_ONE_CALL (cuMemcpyDtoH)
|
|
+CUDA_ONE_CALL (cuMemcpyDtoHAsync)
|
|
+CUDA_ONE_CALL (cuMemcpyHtoD)
|
|
+CUDA_ONE_CALL (cuMemcpyHtoDAsync)
|
|
+CUDA_ONE_CALL (cuMemFree)
|
|
+CUDA_ONE_CALL (cuMemFreeHost)
|
|
+CUDA_ONE_CALL (cuMemGetAddressRange)
|
|
+CUDA_ONE_CALL (cuMemHostGetDevicePointer)
|
|
+CUDA_ONE_CALL (cuModuleGetFunction)
|
|
+CUDA_ONE_CALL (cuModuleGetGlobal)
|
|
+CUDA_ONE_CALL (cuModuleLoad)
|
|
+CUDA_ONE_CALL (cuModuleLoadData)
|
|
+CUDA_ONE_CALL (cuModuleUnload)
|
|
+CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
|
|
+CUDA_ONE_CALL (cuStreamCreate)
|
|
+CUDA_ONE_CALL (cuStreamDestroy)
|
|
+CUDA_ONE_CALL (cuStreamQuery)
|
|
+CUDA_ONE_CALL (cuStreamSynchronize)
|
|
+CUDA_ONE_CALL (cuStreamWaitEvent)
|
|
--- libgomp/plugin/plugin-nvptx.c.jj 2018-04-25 09:40:31.915655582 +0200
|
|
+++ libgomp/plugin/plugin-nvptx.c 2019-05-07 18:46:36.535109592 +0200
|
|
@@ -31,6 +31,7 @@
|
|
is not clear as to what that state might be. Or how one might
|
|
propagate it from one thread to another. */
|
|
|
|
+#define _GNU_SOURCE
|
|
#include "openacc.h"
|
|
#include "config.h"
|
|
#include "libgomp-plugin.h"
|
|
@@ -48,60 +49,41 @@
|
|
#include <assert.h>
|
|
#include <errno.h>
|
|
|
|
+#if CUDA_VERSION < 6000
|
|
+extern CUresult cuGetErrorString (CUresult, const char **);
|
|
+#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
|
|
+#endif
|
|
+
|
|
+#if CUDA_VERSION >= 6050
|
|
+#undef cuLinkCreate
|
|
+#undef cuLinkAddData
|
|
+CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
|
|
+ const char *, unsigned, CUjit_option *, void **);
|
|
+CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
|
|
+#else
|
|
+typedef size_t (*CUoccupancyB2DSize)(int);
|
|
+CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
|
|
+ const char *, unsigned, CUjit_option *, void **);
|
|
+CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
|
|
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
|
|
+ CUoccupancyB2DSize, size_t, int);
|
|
+#endif
|
|
+
|
|
+#define DO_PRAGMA(x) _Pragma (#x)
|
|
+
|
|
#if PLUGIN_NVPTX_DYNAMIC
|
|
# include <dlfcn.h>
|
|
|
|
-# define CUDA_CALLS \
|
|
-CUDA_ONE_CALL (cuCtxCreate) \
|
|
-CUDA_ONE_CALL (cuCtxDestroy) \
|
|
-CUDA_ONE_CALL (cuCtxGetCurrent) \
|
|
-CUDA_ONE_CALL (cuCtxGetDevice) \
|
|
-CUDA_ONE_CALL (cuCtxPopCurrent) \
|
|
-CUDA_ONE_CALL (cuCtxPushCurrent) \
|
|
-CUDA_ONE_CALL (cuCtxSynchronize) \
|
|
-CUDA_ONE_CALL (cuDeviceGet) \
|
|
-CUDA_ONE_CALL (cuDeviceGetAttribute) \
|
|
-CUDA_ONE_CALL (cuDeviceGetCount) \
|
|
-CUDA_ONE_CALL (cuEventCreate) \
|
|
-CUDA_ONE_CALL (cuEventDestroy) \
|
|
-CUDA_ONE_CALL (cuEventElapsedTime) \
|
|
-CUDA_ONE_CALL (cuEventQuery) \
|
|
-CUDA_ONE_CALL (cuEventRecord) \
|
|
-CUDA_ONE_CALL (cuEventSynchronize) \
|
|
-CUDA_ONE_CALL (cuFuncGetAttribute) \
|
|
-CUDA_ONE_CALL (cuGetErrorString) \
|
|
-CUDA_ONE_CALL (cuInit) \
|
|
-CUDA_ONE_CALL (cuLaunchKernel) \
|
|
-CUDA_ONE_CALL (cuLinkAddData) \
|
|
-CUDA_ONE_CALL (cuLinkComplete) \
|
|
-CUDA_ONE_CALL (cuLinkCreate) \
|
|
-CUDA_ONE_CALL (cuLinkDestroy) \
|
|
-CUDA_ONE_CALL (cuMemAlloc) \
|
|
-CUDA_ONE_CALL (cuMemAllocHost) \
|
|
-CUDA_ONE_CALL (cuMemcpy) \
|
|
-CUDA_ONE_CALL (cuMemcpyDtoDAsync) \
|
|
-CUDA_ONE_CALL (cuMemcpyDtoH) \
|
|
-CUDA_ONE_CALL (cuMemcpyDtoHAsync) \
|
|
-CUDA_ONE_CALL (cuMemcpyHtoD) \
|
|
-CUDA_ONE_CALL (cuMemcpyHtoDAsync) \
|
|
-CUDA_ONE_CALL (cuMemFree) \
|
|
-CUDA_ONE_CALL (cuMemFreeHost) \
|
|
-CUDA_ONE_CALL (cuMemGetAddressRange) \
|
|
-CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
|
|
-CUDA_ONE_CALL (cuModuleGetFunction) \
|
|
-CUDA_ONE_CALL (cuModuleGetGlobal) \
|
|
-CUDA_ONE_CALL (cuModuleLoad) \
|
|
-CUDA_ONE_CALL (cuModuleLoadData) \
|
|
-CUDA_ONE_CALL (cuModuleUnload) \
|
|
-CUDA_ONE_CALL (cuStreamCreate) \
|
|
-CUDA_ONE_CALL (cuStreamDestroy) \
|
|
-CUDA_ONE_CALL (cuStreamQuery) \
|
|
-CUDA_ONE_CALL (cuStreamSynchronize) \
|
|
-CUDA_ONE_CALL (cuStreamWaitEvent)
|
|
-# define CUDA_ONE_CALL(call) \
|
|
- __typeof (call) *call;
|
|
struct cuda_lib_s {
|
|
- CUDA_CALLS
|
|
+
|
|
+# define CUDA_ONE_CALL(call) \
|
|
+ __typeof (call) *call;
|
|
+# define CUDA_ONE_CALL_MAYBE_NULL(call) \
|
|
+ CUDA_ONE_CALL (call)
|
|
+#include "cuda-lib.def"
|
|
+# undef CUDA_ONE_CALL
|
|
+# undef CUDA_ONE_CALL_MAYBE_NULL
|
|
+
|
|
} cuda_lib;
|
|
|
|
/* -1 if init_cuda_lib has not been called yet, false
|
|
@@ -120,24 +102,41 @@ init_cuda_lib (void)
|
|
cuda_lib_inited = false;
|
|
if (h == NULL)
|
|
return false;
|
|
-# undef CUDA_ONE_CALL
|
|
-# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
|
|
-# define CUDA_ONE_CALL_1(call) \
|
|
+
|
|
+# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
|
|
+# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
|
|
+# define CUDA_ONE_CALL_1(call, allow_null) \
|
|
cuda_lib.call = dlsym (h, #call); \
|
|
- if (cuda_lib.call == NULL) \
|
|
+ if (!allow_null && cuda_lib.call == NULL) \
|
|
return false;
|
|
- CUDA_CALLS
|
|
+#include "cuda-lib.def"
|
|
+# undef CUDA_ONE_CALL
|
|
+# undef CUDA_ONE_CALL_1
|
|
+# undef CUDA_ONE_CALL_MAYBE_NULL
|
|
+
|
|
cuda_lib_inited = true;
|
|
return true;
|
|
}
|
|
-# undef CUDA_ONE_CALL
|
|
-# undef CUDA_ONE_CALL_1
|
|
# define CUDA_CALL_PREFIX cuda_lib.
|
|
#else
|
|
+
|
|
+# define CUDA_ONE_CALL(call)
|
|
+# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
|
|
+#include "cuda-lib.def"
|
|
+#undef CUDA_ONE_CALL_MAYBE_NULL
|
|
+#undef CUDA_ONE_CALL
|
|
+
|
|
# define CUDA_CALL_PREFIX
|
|
# define init_cuda_lib() true
|
|
#endif
|
|
|
|
+#include "secure_getenv.h"
|
|
+
|
|
+#undef MIN
|
|
+#undef MAX
|
|
+#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
|
|
+#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
|
|
+
|
|
/* Convenience macros for the frequently used CUDA library call and
|
|
error handling sequence as well as CUDA library calls that
|
|
do the error checking themselves or don't do it at all. */
|
|
@@ -171,40 +170,42 @@ init_cuda_lib (void)
|
|
#define CUDA_CALL_NOCHECK(FN, ...) \
|
|
CUDA_CALL_PREFIX FN (__VA_ARGS__)
|
|
|
|
+#define CUDA_CALL_EXISTS(FN) \
|
|
+ CUDA_CALL_PREFIX FN
|
|
+
|
|
static const char *
|
|
cuda_error (CUresult r)
|
|
{
|
|
-#if CUDA_VERSION < 7000
|
|
- /* Specified in documentation and present in library from at least
|
|
- 5.5. Not declared in header file prior to 7.0. */
|
|
- extern CUresult cuGetErrorString (CUresult, const char **);
|
|
-#endif
|
|
+ const char *fallback = "unknown cuda error";
|
|
const char *desc;
|
|
|
|
+ if (!CUDA_CALL_EXISTS (cuGetErrorString))
|
|
+ return fallback;
|
|
+
|
|
r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
|
|
- if (r != CUDA_SUCCESS)
|
|
- desc = "unknown cuda error";
|
|
+ if (r == CUDA_SUCCESS)
|
|
+ return desc;
|
|
|
|
- return desc;
|
|
+ return fallback;
|
|
}
|
|
|
|
static unsigned int instantiated_devices = 0;
|
|
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
|
|
|
|
+struct cuda_map
|
|
+{
|
|
+ CUdeviceptr d;
|
|
+ size_t size;
|
|
+ bool active;
|
|
+ struct cuda_map *next;
|
|
+};
|
|
+
|
|
struct ptx_stream
|
|
{
|
|
CUstream stream;
|
|
pthread_t host_thread;
|
|
bool multithreaded;
|
|
-
|
|
- CUdeviceptr d;
|
|
- void *h;
|
|
- void *h_begin;
|
|
- void *h_end;
|
|
- void *h_next;
|
|
- void *h_prev;
|
|
- void *h_tail;
|
|
-
|
|
+ struct cuda_map *map;
|
|
struct ptx_stream *next;
|
|
};
|
|
|
|
@@ -216,12 +217,64 @@ struct nvptx_thread
|
|
struct ptx_device *ptx_dev;
|
|
};
|
|
|
|
-struct map
|
|
+static struct cuda_map *
|
|
+cuda_map_create (size_t size)
|
|
{
|
|
- int async;
|
|
- size_t size;
|
|
- char mappings[0];
|
|
-};
|
|
+ struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
|
|
+
|
|
+ assert (map);
|
|
+
|
|
+ map->next = NULL;
|
|
+ map->size = size;
|
|
+ map->active = false;
|
|
+
|
|
+ CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
|
|
+ assert (map->d);
|
|
+
|
|
+ return map;
|
|
+}
|
|
+
|
|
+static void
|
|
+cuda_map_destroy (struct cuda_map *map)
|
|
+{
|
|
+ if (map->active)
|
|
+ /* Possible reasons for the map to be still active:
|
|
+ - the associated async kernel might still be running.
|
|
+ - the associated async kernel might have finished, but the
|
|
+ corresponding event that should trigger the pop_map has not been
|
|
+ processed by event_gc.
|
|
+ - the associated sync kernel might have aborted
|
|
+
|
|
+ The async cases could happen if the user specified an async region
|
|
+ without adding a corresponding wait that is guaranteed to be executed
|
|
+ (before returning from main, or in an atexit handler).
|
|
+ We do not want to deallocate a device pointer that is still being
|
|
+ used, so skip it.
|
|
+
|
|
+ In the sync case, the device pointer is no longer used, but deallocating
|
|
+ it using cuMemFree will not succeed, so skip it.
|
|
+
|
|
+ TODO: Handle this in a more constructive way, by f.i. waiting for streams
|
|
+ to finish before de-allocating them (PR88981), or by ensuring the CUDA
|
|
+ lib atexit handler is called before rather than after the libgomp plugin
|
|
+ atexit handler (PR83795). */
|
|
+ ;
|
|
+ else
|
|
+ CUDA_CALL_NOCHECK (cuMemFree, map->d);
|
|
+
|
|
+ free (map);
|
|
+}
|
|
+
|
|
+/* The following map_* routines manage the CUDA device memory that
|
|
+ contains the data mapping arguments for cuLaunchKernel. Each
|
|
+ asynchronous PTX stream may have multiple pending kernel
|
|
+ invocations, which are launched in a FIFO order. As such, the map
|
|
+ routines maintain a queue of cuLaunchKernel arguments.
|
|
+
|
|
+ Calls to map_push and map_pop must be guarded by ptx_event_lock.
|
|
+ Likewise, calls to map_init and map_fini are guarded by
|
|
+ ptx_dev_lock inside GOMP_OFFLOAD_init_device and
|
|
+ GOMP_OFFLOAD_fini_device, respectively. */
|
|
|
|
static bool
|
|
map_init (struct ptx_stream *s)
|
|
@@ -229,109 +282,83 @@ map_init (struct ptx_stream *s)
|
|
int size = getpagesize ();
|
|
|
|
assert (s);
|
|
- assert (!s->d);
|
|
- assert (!s->h);
|
|
-
|
|
- CUDA_CALL (cuMemAllocHost, &s->h, size);
|
|
- CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
|
|
|
|
- assert (s->h);
|
|
+ s->map = cuda_map_create (size);
|
|
|
|
- s->h_begin = s->h;
|
|
- s->h_end = s->h_begin + size;
|
|
- s->h_next = s->h_prev = s->h_tail = s->h_begin;
|
|
-
|
|
- assert (s->h_next);
|
|
- assert (s->h_end);
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
map_fini (struct ptx_stream *s)
|
|
{
|
|
- CUDA_CALL (cuMemFreeHost, s->h);
|
|
+ assert (s->map->next == NULL);
|
|
+
|
|
+ cuda_map_destroy (s->map);
|
|
+
|
|
return true;
|
|
}
|
|
|
|
static void
|
|
map_pop (struct ptx_stream *s)
|
|
{
|
|
- struct map *m;
|
|
+ struct cuda_map *next;
|
|
|
|
assert (s != NULL);
|
|
- assert (s->h_next);
|
|
- assert (s->h_prev);
|
|
- assert (s->h_tail);
|
|
-
|
|
- m = s->h_tail;
|
|
-
|
|
- s->h_tail += m->size;
|
|
-
|
|
- if (s->h_tail >= s->h_end)
|
|
- s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
|
|
-
|
|
- if (s->h_next == s->h_tail)
|
|
- s->h_prev = s->h_next;
|
|
|
|
- assert (s->h_next >= s->h_begin);
|
|
- assert (s->h_tail >= s->h_begin);
|
|
- assert (s->h_prev >= s->h_begin);
|
|
+ if (s->map->next == NULL)
|
|
+ {
|
|
+ s->map->active = false;
|
|
+ return;
|
|
+ }
|
|
|
|
- assert (s->h_next <= s->h_end);
|
|
- assert (s->h_tail <= s->h_end);
|
|
- assert (s->h_prev <= s->h_end);
|
|
+ next = s->map->next;
|
|
+ cuda_map_destroy (s->map);
|
|
+ s->map = next;
|
|
}
|
|
|
|
-static void
|
|
-map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
|
|
+static CUdeviceptr
|
|
+map_push (struct ptx_stream *s, size_t size)
|
|
{
|
|
- int left;
|
|
- int offset;
|
|
- struct map *m;
|
|
+ struct cuda_map *map = NULL;
|
|
+ struct cuda_map **t;
|
|
|
|
- assert (s != NULL);
|
|
-
|
|
- left = s->h_end - s->h_next;
|
|
- size += sizeof (struct map);
|
|
-
|
|
- assert (s->h_prev);
|
|
- assert (s->h_next);
|
|
+ assert (s);
|
|
+ assert (s->map);
|
|
|
|
- if (size >= left)
|
|
+ /* Select an element to push. */
|
|
+ if (s->map->active)
|
|
+ map = cuda_map_create (size);
|
|
+ else
|
|
{
|
|
- m = s->h_prev;
|
|
- m->size += left;
|
|
- s->h_next = s->h_begin;
|
|
-
|
|
- if (s->h_next + size > s->h_end)
|
|
- GOMP_PLUGIN_fatal ("unable to push map");
|
|
- }
|
|
-
|
|
- assert (s->h_next);
|
|
-
|
|
- m = s->h_next;
|
|
- m->async = async;
|
|
- m->size = size;
|
|
+ /* Pop the inactive front element. */
|
|
+ struct cuda_map *pop = s->map;
|
|
+ s->map = pop->next;
|
|
+ pop->next = NULL;
|
|
|
|
- offset = (void *)&m->mappings[0] - s->h;
|
|
+ if (pop->size < size)
|
|
+ {
|
|
+ cuda_map_destroy (pop);
|
|
|
|
- *d = (void *)(s->d + offset);
|
|
- *h = (void *)(s->h + offset);
|
|
+ map = cuda_map_create (size);
|
|
+ }
|
|
+ else
|
|
+ map = pop;
|
|
+ }
|
|
|
|
- s->h_prev = s->h_next;
|
|
- s->h_next += size;
|
|
+ /* Check that the element is as expected. */
|
|
+ assert (map->next == NULL);
|
|
+ assert (!map->active);
|
|
|
|
- assert (s->h_prev);
|
|
- assert (s->h_next);
|
|
+ /* Mark the element active. */
|
|
+ map->active = true;
|
|
|
|
- assert (s->h_next >= s->h_begin);
|
|
- assert (s->h_tail >= s->h_begin);
|
|
- assert (s->h_prev >= s->h_begin);
|
|
- assert (s->h_next <= s->h_end);
|
|
- assert (s->h_tail <= s->h_end);
|
|
- assert (s->h_prev <= s->h_end);
|
|
+ /* Push the element to the back of the list. */
|
|
+ for (t = &s->map; (*t) != NULL; t = &(*t)->next)
|
|
+ ;
|
|
+ assert (t != NULL && *t == NULL);
|
|
+ *t = map;
|
|
|
|
- return;
|
|
+ return map->d;
|
|
}
|
|
|
|
/* Target data function launch information. */
|
|
@@ -411,6 +438,10 @@ struct ptx_device
|
|
int num_sms;
|
|
int regs_per_block;
|
|
int regs_per_sm;
|
|
+ int warp_size;
|
|
+ int max_threads_per_block;
|
|
+ int max_threads_per_multiprocessor;
|
|
+ int default_dims[GOMP_DIM_MAX];
|
|
|
|
struct ptx_image_data *images; /* Images loaded on device. */
|
|
pthread_mutex_t image_lock; /* Lock for above list. */
|
|
@@ -458,8 +489,6 @@ init_streams_for_device (struct ptx_devi
|
|
null_stream->stream = NULL;
|
|
null_stream->host_thread = pthread_self ();
|
|
null_stream->multithreaded = true;
|
|
- null_stream->d = (CUdeviceptr) NULL;
|
|
- null_stream->h = NULL;
|
|
if (!map_init (null_stream))
|
|
return false;
|
|
|
|
@@ -594,8 +623,6 @@ select_stream_for_async (int async, pthr
|
|
s->host_thread = thread;
|
|
s->multithreaded = false;
|
|
|
|
- s->d = (CUdeviceptr) NULL;
|
|
- s->h = NULL;
|
|
if (!map_init (s))
|
|
{
|
|
pthread_mutex_unlock (&ptx_dev->stream_lock);
|
|
@@ -777,9 +804,11 @@ nvptx_open_device (int n)
|
|
&pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
|
|
ptx_dev->regs_per_block = pi;
|
|
|
|
- /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
|
|
+ /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
|
|
in CUDA 6.0 and newer. */
|
|
- r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
|
|
+ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
|
|
+ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
|
|
+ dev);
|
|
/* Fallback: use limit of registers per block, which is usually equal. */
|
|
if (r == CUDA_ERROR_INVALID_VALUE)
|
|
pi = ptx_dev->regs_per_block;
|
|
@@ -797,12 +826,24 @@ nvptx_open_device (int n)
|
|
GOMP_PLUGIN_error ("Only warp size 32 is supported");
|
|
return NULL;
|
|
}
|
|
+ ptx_dev->warp_size = pi;
|
|
+
|
|
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
|
|
+ CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
|
|
+ ptx_dev->max_threads_per_block = pi;
|
|
+
|
|
+ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
|
|
+ CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
|
|
+ ptx_dev->max_threads_per_multiprocessor = pi;
|
|
|
|
r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
|
|
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
|
|
if (r != CUDA_SUCCESS)
|
|
async_engines = 1;
|
|
|
|
+ for (int i = 0; i != GOMP_DIM_MAX; i++)
|
|
+ ptx_dev->default_dims[i] = 0;
|
|
+
|
|
ptx_dev->images = NULL;
|
|
pthread_mutex_init (&ptx_dev->image_lock, NULL);
|
|
|
|
@@ -876,12 +917,42 @@ notify_var (const char *var_name, const
|
|
GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
|
|
}
|
|
|
|
+static void
|
|
+process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
|
|
+{
|
|
+ const char *var_name = "GOMP_NVPTX_JIT";
|
|
+ const char *env_var = secure_getenv (var_name);
|
|
+ notify_var (var_name, env_var);
|
|
+
|
|
+ if (env_var == NULL)
|
|
+ return;
|
|
+
|
|
+ const char *c = env_var;
|
|
+ while (*c != '\0')
|
|
+ {
|
|
+ while (*c == ' ')
|
|
+ c++;
|
|
+
|
|
+ if (c[0] == '-' && c[1] == 'O'
|
|
+ && '0' <= c[2] && c[2] <= '4'
|
|
+ && (c[3] == '\0' || c[3] == ' '))
|
|
+ {
|
|
+ *gomp_nvptx_o = c[2] - '0';
|
|
+ c += 3;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ GOMP_PLUGIN_error ("Error parsing %s", var_name);
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
static bool
|
|
link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
|
|
unsigned num_objs)
|
|
{
|
|
- CUjit_option opts[6];
|
|
- void *optvals[6];
|
|
+ CUjit_option opts[7];
|
|
+ void *optvals[7];
|
|
float elapsed = 0.0;
|
|
char elog[1024];
|
|
char ilog[16384];
|
|
@@ -908,16 +979,41 @@ link_ptx (CUmodule *module, const struct
|
|
opts[5] = CU_JIT_LOG_VERBOSE;
|
|
optvals[5] = (void *) 1;
|
|
|
|
- CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);
|
|
+ static intptr_t gomp_nvptx_o = -1;
|
|
+
|
|
+ static bool init_done = false;
|
|
+ if (!init_done)
|
|
+ {
|
|
+ process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
|
|
+ init_done = true;
|
|
+ }
|
|
+
|
|
+ int nopts = 6;
|
|
+ if (gomp_nvptx_o != -1)
|
|
+ {
|
|
+ opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
|
|
+ optvals[nopts] = (void *) gomp_nvptx_o;
|
|
+ nopts++;
|
|
+ }
|
|
+
|
|
+ if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
|
|
+ CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
|
|
+ else
|
|
+ CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
|
|
|
|
for (; num_objs--; ptx_objs++)
|
|
{
|
|
/* cuLinkAddData's 'data' argument erroneously omits the const
|
|
qualifier. */
|
|
GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
|
|
- r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
|
|
- (char *) ptx_objs->code, ptx_objs->size,
|
|
- 0, 0, 0, 0);
|
|
+ if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
|
|
+ r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
|
|
+ (char *) ptx_objs->code, ptx_objs->size,
|
|
+ 0, 0, 0, 0);
|
|
+ else
|
|
+ r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
|
|
+ (char *) ptx_objs->code, ptx_objs->size,
|
|
+ 0, 0, 0, 0);
|
|
if (r != CUDA_SUCCESS)
|
|
{
|
|
GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
|
|
@@ -1067,8 +1163,10 @@ nvptx_exec (void (*fn), size_t mapnum, v
|
|
int i;
|
|
struct ptx_stream *dev_str;
|
|
void *kargs[1];
|
|
- void *hp, *dp;
|
|
+ void *hp;
|
|
+ CUdeviceptr dp = 0;
|
|
struct nvptx_thread *nvthd = nvptx_thread ();
|
|
+ int warp_size = nvthd->ptx_dev->warp_size;
|
|
const char *maybe_abort_msg = "(perhaps abort was called)";
|
|
|
|
function = targ_fn->fn;
|
|
@@ -1090,68 +1188,36 @@ nvptx_exec (void (*fn), size_t mapnum, v
|
|
|
|
if (seen_zero)
|
|
{
|
|
- /* See if the user provided GOMP_OPENACC_DIM environment
|
|
- variable to specify runtime defaults. */
|
|
- static int default_dims[GOMP_DIM_MAX];
|
|
-
|
|
pthread_mutex_lock (&ptx_dev_lock);
|
|
- if (!default_dims[0])
|
|
- {
|
|
- const char *var_name = "GOMP_OPENACC_DIM";
|
|
- /* We only read the environment variable once. You can't
|
|
- change it in the middle of execution. The syntax is
|
|
- the same as for the -fopenacc-dim compilation option. */
|
|
- const char *env_var = getenv (var_name);
|
|
- notify_var (var_name, env_var);
|
|
- if (env_var)
|
|
- {
|
|
- const char *pos = env_var;
|
|
|
|
- for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
|
|
- {
|
|
- if (i && *pos++ != ':')
|
|
- break;
|
|
- if (*pos != ':')
|
|
- {
|
|
- const char *eptr;
|
|
-
|
|
- errno = 0;
|
|
- long val = strtol (pos, (char **)&eptr, 10);
|
|
- if (errno || val < 0 || (unsigned)val != val)
|
|
- break;
|
|
- default_dims[i] = (int)val;
|
|
- pos = eptr;
|
|
- }
|
|
- }
|
|
- }
|
|
+ static int gomp_openacc_dims[GOMP_DIM_MAX];
|
|
+ if (!gomp_openacc_dims[0])
|
|
+ {
|
|
+ /* See if the user provided GOMP_OPENACC_DIM environment
|
|
+ variable to specify runtime defaults. */
|
|
+ for (int i = 0; i < GOMP_DIM_MAX; ++i)
|
|
+ gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
|
|
+ }
|
|
|
|
- int warp_size, block_size, dev_size, cpu_size;
|
|
- CUdevice dev = nvptx_thread()->ptx_dev->dev;
|
|
- /* 32 is the default for known hardware. */
|
|
- int gang = 0, worker = 32, vector = 32;
|
|
- CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
|
|
-
|
|
- cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
|
|
- cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
|
|
- cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
|
|
- cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
|
|
-
|
|
- if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
|
|
- dev) == CUDA_SUCCESS
|
|
- && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
|
|
- dev) == CUDA_SUCCESS
|
|
- && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
|
|
- dev) == CUDA_SUCCESS
|
|
- && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
|
|
- dev) == CUDA_SUCCESS)
|
|
- {
|
|
- GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
|
|
- " dev_size=%d, cpu_size=%d\n",
|
|
- warp_size, block_size, dev_size, cpu_size);
|
|
- gang = (cpu_size / block_size) * dev_size;
|
|
- worker = block_size / warp_size;
|
|
- vector = warp_size;
|
|
- }
|
|
+ if (!nvthd->ptx_dev->default_dims[0])
|
|
+ {
|
|
+ int default_dims[GOMP_DIM_MAX];
|
|
+ for (int i = 0; i < GOMP_DIM_MAX; ++i)
|
|
+ default_dims[i] = gomp_openacc_dims[i];
|
|
+
|
|
+ int gang, worker, vector;
|
|
+ {
|
|
+ int block_size = nvthd->ptx_dev->max_threads_per_block;
|
|
+ int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
|
|
+ int dev_size = nvthd->ptx_dev->num_sms;
|
|
+ GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
|
|
+ " dev_size=%d, cpu_size=%d\n",
|
|
+ warp_size, block_size, dev_size, cpu_size);
|
|
+
|
|
+ gang = (cpu_size / block_size) * dev_size;
|
|
+ worker = block_size / warp_size;
|
|
+ vector = warp_size;
|
|
+ }
|
|
|
|
/* There is no upper bound on the gang size. The best size
|
|
matches the hardware configuration. Logical gangs are
|
|
@@ -1172,29 +1238,150 @@ nvptx_exec (void (*fn), size_t mapnum, v
|
|
default_dims[GOMP_DIM_GANG],
|
|
default_dims[GOMP_DIM_WORKER],
|
|
default_dims[GOMP_DIM_VECTOR]);
|
|
+
|
|
+ for (i = 0; i != GOMP_DIM_MAX; i++)
|
|
+ nvthd->ptx_dev->default_dims[i] = default_dims[i];
|
|
}
|
|
pthread_mutex_unlock (&ptx_dev_lock);
|
|
|
|
- for (i = 0; i != GOMP_DIM_MAX; i++)
|
|
- if (!dims[i])
|
|
- dims[i] = default_dims[i];
|
|
- }
|
|
-
|
|
- /* This reserves a chunk of a pre-allocated page of memory mapped on both
|
|
- the host and the device. HP is a host pointer to the new chunk, and DP is
|
|
- the corresponding device pointer. */
|
|
- map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
|
|
-
|
|
- GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
|
|
-
|
|
- /* Copy the array of arguments to the mapped page. */
|
|
- for (i = 0; i < mapnum; i++)
|
|
- ((void **) hp)[i] = devaddrs[i];
|
|
-
|
|
- /* Copy the (device) pointers to arguments to the device (dp and hp might in
|
|
- fact have the same value on a unified-memory system). */
|
|
- CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
|
|
- mapnum * sizeof (void *));
|
|
+ {
|
|
+ bool default_dim_p[GOMP_DIM_MAX];
|
|
+ for (i = 0; i != GOMP_DIM_MAX; i++)
|
|
+ default_dim_p[i] = !dims[i];
|
|
+
|
|
+ if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
|
|
+ {
|
|
+ for (i = 0; i != GOMP_DIM_MAX; i++)
|
|
+ if (default_dim_p[i])
|
|
+ dims[i] = nvthd->ptx_dev->default_dims[i];
|
|
+
|
|
+ if (default_dim_p[GOMP_DIM_VECTOR])
|
|
+ dims[GOMP_DIM_VECTOR]
|
|
+ = MIN (dims[GOMP_DIM_VECTOR],
|
|
+ (targ_fn->max_threads_per_block / warp_size
|
|
+ * warp_size));
|
|
+
|
|
+ if (default_dim_p[GOMP_DIM_WORKER])
|
|
+ dims[GOMP_DIM_WORKER]
|
|
+ = MIN (dims[GOMP_DIM_WORKER],
|
|
+ targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ /* Handle the case that the compiler allows the runtime to choose
|
|
+ the vector-length conservatively, by ignoring
|
|
+ gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle
|
|
+ it. */
|
|
+ int vectors = 0;
|
|
+ /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
|
|
+ gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
|
|
+ exceed targ_fn->max_threads_per_block. */
|
|
+ int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
|
|
+ int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
|
|
+ int grids, blocks;
|
|
+
|
|
+ CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
|
|
+ &blocks, function, NULL, 0,
|
|
+ dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
|
|
+ GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
|
|
+ "grid = %d, block = %d\n", grids, blocks);
|
|
+
|
|
+ /* Keep the num_gangs proportional to the block size. In
|
|
+ the case where a block size is limited by shared-memory
|
|
+ or the register file capacity, the runtime will not
|
|
+ excessively over assign gangs to the multiprocessor
|
|
+ units if their state is going to be swapped out even
|
|
+ more than necessary. The constant factor 2 is there to
|
|
+ prevent threads from idling when there is insufficient
|
|
+ work for them. */
|
|
+ if (gangs == 0)
|
|
+ gangs = 2 * grids * (blocks / warp_size);
|
|
+
|
|
+ if (vectors == 0)
|
|
+ vectors = warp_size;
|
|
+
|
|
+ if (workers == 0)
|
|
+ {
|
|
+ int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
|
|
+ ? vectors
|
|
+ : dims[GOMP_DIM_VECTOR]);
|
|
+ workers = blocks / actual_vectors;
|
|
+ workers = MAX (workers, 1);
|
|
+ /* If we need a per-worker barrier ... . */
|
|
+ if (actual_vectors > 32)
|
|
+ /* Don't use more barriers than available. */
|
|
+ workers = MIN (workers, 15);
|
|
+ }
|
|
+
|
|
+ for (i = 0; i != GOMP_DIM_MAX; i++)
|
|
+ if (default_dim_p[i])
|
|
+ switch (i)
|
|
+ {
|
|
+ case GOMP_DIM_GANG: dims[i] = gangs; break;
|
|
+ case GOMP_DIM_WORKER: dims[i] = workers; break;
|
|
+ case GOMP_DIM_VECTOR: dims[i] = vectors; break;
|
|
+ default: GOMP_PLUGIN_fatal ("invalid dim");
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Check if the accelerator has sufficient hardware resources to
|
|
+ launch the offloaded kernel. */
|
|
+ if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
|
|
+ > targ_fn->max_threads_per_block)
|
|
+ {
|
|
+ const char *msg
|
|
+ = ("The Nvidia accelerator has insufficient resources to launch '%s'"
|
|
+ " with num_workers = %d and vector_length = %d"
|
|
+ "; "
|
|
+ "recompile the program with 'num_workers = x and vector_length = y'"
|
|
+ " on that offloaded region or '-fopenacc-dim=:x:y' where"
|
|
+ " x * y <= %d"
|
|
+ ".\n");
|
|
+ GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
|
|
+ dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
|
|
+ }
|
|
+
|
|
+ /* Check if the accelerator has sufficient barrier resources to
|
|
+ launch the offloaded kernel. */
|
|
+ if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
|
|
+ {
|
|
+ const char *msg
|
|
+ = ("The Nvidia accelerator has insufficient barrier resources to launch"
|
|
+ " '%s' with num_workers = %d and vector_length = %d"
|
|
+ "; "
|
|
+ "recompile the program with 'num_workers = x' on that offloaded"
|
|
+ " region or '-fopenacc-dim=:x:' where x <= 15"
|
|
+ "; "
|
|
+ "or, recompile the program with 'vector_length = 32' on that"
|
|
+ " offloaded region or '-fopenacc-dim=::32'"
|
|
+ ".\n");
|
|
+ GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
|
|
+ dims[GOMP_DIM_VECTOR]);
|
|
+ }
|
|
+
|
|
+ if (mapnum > 0)
|
|
+ {
|
|
+ /* This reserves a chunk of device memory for the argument array; DP is
|
|
+ the device pointer to the new chunk. HP is a separate host-side copy of
|
|
+ the array (allocated below) that is then transferred to DP. */
|
|
+ pthread_mutex_lock (&ptx_event_lock);
|
|
+ dp = map_push (dev_str, mapnum * sizeof (void *));
|
|
+ pthread_mutex_unlock (&ptx_event_lock);
|
|
+
|
|
+ GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
|
|
+
|
|
+ /* Copy the array of arguments to a host-side staging buffer. */
|
|
+ hp = alloca(sizeof(void *) * mapnum);
|
|
+ for (i = 0; i < mapnum; i++)
|
|
+ ((void **) hp)[i] = devaddrs[i];
|
|
+
|
|
+ /* Copy the (device) pointers to arguments to the device */
|
|
+ CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
|
|
+ mapnum * sizeof (void *));
|
|
+ }
|
|
+
|
|
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
|
|
" gangs=%u, workers=%u, vectors=%u\n",
|
|
__FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
|
|
@@ -1239,7 +1426,8 @@ nvptx_exec (void (*fn), size_t mapnum, v
|
|
|
|
CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
|
|
|
|
- event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
|
|
+ if (mapnum > 0)
|
|
+ event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
|
|
}
|
|
#else
|
|
r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
|
|
@@ -1256,7 +1444,10 @@ nvptx_exec (void (*fn), size_t mapnum, v
|
|
#ifndef DISABLE_ASYNC
|
|
if (async < acc_async_noval)
|
|
#endif
|
|
- map_pop (dev_str);
|
|
+ {
|
|
+ if (mapnum > 0)
|
|
+ map_pop (dev_str);
|
|
+ }
|
|
}
|
|
|
|
void * openacc_get_current_cuda_context (void);
|
|
@@ -1415,9 +1606,8 @@ nvptx_async_test (int async)
|
|
struct ptx_stream *s;
|
|
|
|
s = select_stream_for_async (async, pthread_self (), false, NULL);
|
|
-
|
|
if (!s)
|
|
- GOMP_PLUGIN_fatal ("unknown async %d", async);
|
|
+ return 1;
|
|
|
|
r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
|
|
if (r == CUDA_SUCCESS)
|
|
@@ -1472,7 +1662,7 @@ nvptx_wait (int async)
|
|
|
|
s = select_stream_for_async (async, pthread_self (), false, NULL);
|
|
if (!s)
|
|
- GOMP_PLUGIN_fatal ("unknown async %d", async);
|
|
+ return;
|
|
|
|
CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);
|
|
|
|
@@ -1486,16 +1676,17 @@ nvptx_wait_async (int async1, int async2
|
|
struct ptx_stream *s1, *s2;
|
|
pthread_t self = pthread_self ();
|
|
|
|
+ s1 = select_stream_for_async (async1, self, false, NULL);
|
|
+ if (!s1)
|
|
+ return;
|
|
+
|
|
/* The stream that is waiting (rather than being waited for) doesn't
|
|
necessarily have to exist already. */
|
|
s2 = select_stream_for_async (async2, self, true, NULL);
|
|
|
|
- s1 = select_stream_for_async (async1, self, false, NULL);
|
|
- if (!s1)
|
|
- GOMP_PLUGIN_fatal ("invalid async 1\n");
|
|
-
|
|
+ /* A stream is always synchronized with itself. */
|
|
if (s1 == s2)
|
|
- GOMP_PLUGIN_fatal ("identical parameters");
|
|
+ return;
|
|
|
|
e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
|
|
|
|
@@ -1629,8 +1820,14 @@ nvptx_set_cuda_stream (int async, void *
|
|
pthread_t self = pthread_self ();
|
|
struct nvptx_thread *nvthd = nvptx_thread ();
|
|
|
|
- if (async < 0)
|
|
- GOMP_PLUGIN_fatal ("bad async %d", async);
|
|
+ /* Due to the "null_stream" usage for "acc_async_sync", this cannot be used
|
|
+ to change the stream handle associated with "acc_async_sync". */
|
|
+ if (async == acc_async_sync)
|
|
+ {
|
|
+ GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated"
|
|
+ " with \"acc_async_sync\"\n");
|
|
+ return 0;
|
|
+ }
|
|
|
|
pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);
|
|
|
|
@@ -1739,6 +1936,12 @@ GOMP_OFFLOAD_fini_device (int n)
|
|
instantiated_devices--;
|
|
}
|
|
|
|
+ if (instantiated_devices == 0)
|
|
+ {
|
|
+ free (ptx_devices);
|
|
+ ptx_devices = NULL;
|
|
+ }
|
|
+
|
|
pthread_mutex_unlock (&ptx_dev_lock);
|
|
return true;
|
|
}
|
|
--- libgomp/plugin/configfrag.ac.jj 2018-04-25 09:40:31.914655581 +0200
|
|
+++ libgomp/plugin/configfrag.ac 2019-05-07 18:46:36.533109624 +0200
|
|
@@ -26,8 +26,6 @@
|
|
# see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
# <http://www.gnu.org/licenses/>.
|
|
|
|
-offload_targets=
|
|
-AC_SUBST(offload_targets)
|
|
plugin_support=yes
|
|
AC_CHECK_LIB(dl, dlsym, , [plugin_support=no])
|
|
if test x"$plugin_support" = xyes; then
|
|
@@ -59,7 +57,11 @@ AC_ARG_WITH(cuda-driver-lib,
|
|
[AS_HELP_STRING([--with-cuda-driver-lib=PATH],
|
|
[specify directory for the installed CUDA driver library])])
|
|
case "x$with_cuda_driver" in
|
|
- x | xno) ;;
|
|
+ x) ;;
|
|
+ xno)
|
|
+ CUDA_DRIVER_INCLUDE=no
|
|
+ CUDA_DRIVER_LIB=no
|
|
+ ;;
|
|
*) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
|
|
CUDA_DRIVER_LIB=$with_cuda_driver/lib
|
|
;;
|
|
@@ -70,10 +72,12 @@ fi
|
|
if test "x$with_cuda_driver_lib" != x; then
|
|
CUDA_DRIVER_LIB=$with_cuda_driver_lib
|
|
fi
|
|
-if test "x$CUDA_DRIVER_INCLUDE" != x; then
|
|
+if test "x$CUDA_DRIVER_INCLUDE" != x \
|
|
+ && test "x$CUDA_DRIVER_INCLUDE" != xno; then
|
|
CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE
|
|
fi
|
|
-if test "x$CUDA_DRIVER_LIB" != x; then
|
|
+if test "x$CUDA_DRIVER_LIB" != x \
|
|
+ && test "x$CUDA_DRIVER_LIB" != xno; then
|
|
CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB
|
|
fi
|
|
|
|
@@ -133,7 +137,13 @@ AC_SUBST(PLUGIN_HSA_CPPFLAGS)
|
|
AC_SUBST(PLUGIN_HSA_LDFLAGS)
|
|
AC_SUBST(PLUGIN_HSA_LIBS)
|
|
|
|
-# Get offload targets and path to install tree of offloading compiler.
|
|
+# Parse '--enable-offload-targets', figure out the corresponding libgomp
|
|
+# plugins, and configure to find the corresponding offload compilers.
|
|
+# 'offload_plugins' and 'offload_targets' will be populated in the same order.
|
|
+offload_plugins=
|
|
+offload_targets=
|
|
+AC_SUBST(offload_plugins)
|
|
+AC_SUBST(offload_targets)
|
|
offload_additional_options=
|
|
offload_additional_lib_paths=
|
|
AC_SUBST(offload_additional_options)
|
|
@@ -142,36 +152,41 @@ if test x"$enable_offload_targets" != x;
|
|
for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do
|
|
tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'`
|
|
tgt=`echo $tgt | sed 's/=.*//'`
|
|
- tgt_name=
|
|
+ tgt_plugin=
|
|
case $tgt in
|
|
*-intelmic-* | *-intelmicemul-*)
|
|
- tgt_name=intelmic
|
|
+ tgt_plugin=intelmic
|
|
;;
|
|
nvptx*)
|
|
- tgt_name=nvptx
|
|
+ tgt_plugin=nvptx
|
|
PLUGIN_NVPTX=$tgt
|
|
- PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
|
|
- PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
|
|
- PLUGIN_NVPTX_LIBS='-lcuda'
|
|
-
|
|
- PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
|
|
- CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
|
|
- PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
|
|
- LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
|
|
- PLUGIN_NVPTX_save_LIBS=$LIBS
|
|
- LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
|
|
- AC_LINK_IFELSE(
|
|
- [AC_LANG_PROGRAM(
|
|
- [#include "cuda.h"],
|
|
- [CUresult r = cuCtxPushCurrent (NULL);])],
|
|
- [PLUGIN_NVPTX=1])
|
|
- CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
|
|
- LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
|
|
- LIBS=$PLUGIN_NVPTX_save_LIBS
|
|
+ if test "x$CUDA_DRIVER_INCLUDE" != xno \
|
|
+ && test "x$CUDA_DRIVER_LIB" != xno; then
|
|
+ PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
|
|
+ PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
|
|
+ PLUGIN_NVPTX_LIBS='-lcuda'
|
|
+
|
|
+ PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
|
|
+ CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
|
|
+ PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
|
|
+ LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
|
|
+ PLUGIN_NVPTX_save_LIBS=$LIBS
|
|
+ LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
|
|
+ AC_LINK_IFELSE(
|
|
+ [AC_LANG_PROGRAM(
|
|
+ [#include "cuda.h"],
|
|
+ [CUresult r = cuCtxPushCurrent (NULL);])],
|
|
+ [PLUGIN_NVPTX=1])
|
|
+ CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
|
|
+ LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
|
|
+ LIBS=$PLUGIN_NVPTX_save_LIBS
|
|
+ fi
|
|
case $PLUGIN_NVPTX in
|
|
nvptx*)
|
|
- if test "x$CUDA_DRIVER_INCLUDE" = x \
|
|
- && test "x$CUDA_DRIVER_LIB" = x; then
|
|
+ if (test "x$CUDA_DRIVER_INCLUDE" = x \
|
|
+ || test "x$CUDA_DRIVER_INCLUDE" = xno) \
|
|
+ && (test "x$CUDA_DRIVER_LIB" = x \
|
|
+ || test "x$CUDA_DRIVER_LIB" = xno); then
|
|
PLUGIN_NVPTX=1
|
|
PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda'
|
|
PLUGIN_NVPTX_LIBS='-ldl'
|
|
@@ -191,7 +206,7 @@ if test x"$enable_offload_targets" != x;
|
|
PLUGIN_HSA=0
|
|
;;
|
|
*)
|
|
- tgt_name=hsa
|
|
+ tgt_plugin=hsa
|
|
PLUGIN_HSA=$tgt
|
|
PLUGIN_HSA_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS
|
|
PLUGIN_HSA_LDFLAGS="$HSA_RUNTIME_LDFLAGS"
|
|
@@ -209,7 +224,7 @@ if test x"$enable_offload_targets" != x;
|
|
LDFLAGS=$PLUGIN_HSA_save_LDFLAGS
|
|
LIBS=$PLUGIN_HSA_save_LIBS
|
|
case $PLUGIN_HSA in
|
|
- hsa*)
|
|
+ hsa*)
|
|
HSA_PLUGIN=0
|
|
AC_MSG_ERROR([HSA run-time package required for HSA support])
|
|
;;
|
|
@@ -226,16 +241,19 @@ if test x"$enable_offload_targets" != x;
|
|
AC_MSG_ERROR([unknown offload target specified])
|
|
;;
|
|
esac
|
|
- if test x"$tgt_name" = x; then
|
|
- # Don't configure libgomp for this offloading target if we don't build
|
|
- # the corresponding plugin.
|
|
+ if test x"$tgt_plugin" = x; then
|
|
+ # Not configuring libgomp for this offload target if we're not building
|
|
+ # the corresponding offload plugin.
|
|
continue
|
|
- elif test x"$offload_targets" = x; then
|
|
- offload_targets=$tgt_name
|
|
+ elif test x"$offload_plugins" = x; then
|
|
+ offload_plugins=$tgt_plugin
|
|
+ offload_targets=$tgt
|
|
else
|
|
- offload_targets=$offload_targets,$tgt_name
|
|
+ offload_plugins=$offload_plugins,$tgt_plugin
|
|
+ offload_targets=$offload_targets,$tgt
|
|
fi
|
|
- if test "$tgt_name" = hsa; then
|
|
+ # Configure additional search paths.
|
|
+ if test "$tgt_plugin" = hsa; then
|
|
# Offloading compilation is all handled by the target compiler.
|
|
:
|
|
elif test x"$tgt_dir" != x; then
|
|
@@ -247,8 +265,8 @@ if test x"$enable_offload_targets" != x;
|
|
fi
|
|
done
|
|
fi
|
|
-AC_DEFINE_UNQUOTED(OFFLOAD_TARGETS, "$offload_targets",
|
|
- [Define to offload targets, separated by commas.])
|
|
+AC_DEFINE_UNQUOTED(OFFLOAD_PLUGINS, "$offload_plugins",
|
|
+ [Define to offload plugins, separated by commas.])
|
|
AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
|
|
AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
|
|
[Define to 1 if the NVIDIA plugin is built, 0 if not.])
|
|
--- libgomp/affinity-fmt.c.jj 2019-05-07 18:46:36.285113585 +0200
|
|
+++ libgomp/affinity-fmt.c 2019-05-07 18:46:36.285113585 +0200
|
|
@@ -0,0 +1,495 @@
|
|
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
|
|
+ Contributed by Jakub Jelinek <jakub@redhat.com>.
|
|
+
|
|
+ This file is part of the GNU Offloading and Multi Processing Library
|
|
+ (libgomp).
|
|
+
|
|
+ Libgomp is free software; you can redistribute it and/or modify it
|
|
+ under the terms of the GNU General Public License as published by
|
|
+ the Free Software Foundation; either version 3, or (at your option)
|
|
+ any later version.
|
|
+
|
|
+ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
+ FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
+ more details.
|
|
+
|
|
+ Under Section 7 of GPL version 3, you are granted additional
|
|
+ permissions described in the GCC Runtime Library Exception, version
|
|
+ 3.1, as published by the Free Software Foundation.
|
|
+
|
|
+ You should have received a copy of the GNU General Public License and
|
|
+ a copy of the GCC Runtime Library Exception along with this program;
|
|
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
+ <http://www.gnu.org/licenses/>. */
|
|
+
|
|
+#include "libgomp.h"
|
|
+#include <string.h>
|
|
+#include <stdio.h>
|
|
+#include <stdlib.h>
|
|
+#ifdef HAVE_UNISTD_H
|
|
+#include <unistd.h>
|
|
+#endif
|
|
+#ifdef HAVE_INTTYPES_H
|
|
+# include <inttypes.h> /* For PRIx64. */
|
|
+#endif
|
|
+#ifdef HAVE_UNAME
|
|
+#include <sys/utsname.h>
|
|
+#endif
|
|
+
|
|
+void
|
|
+gomp_print_string (const char *str, size_t len)
|
|
+{
|
|
+ fwrite (str, 1, len, stderr);
|
|
+}
|
|
+
|
|
+void
|
|
+gomp_set_affinity_format (const char *format, size_t len)
|
|
+{
|
|
+ if (len < gomp_affinity_format_len)
|
|
+ memcpy (gomp_affinity_format_var, format, len);
|
|
+ else
|
|
+ {
|
|
+ char *p;
|
|
+ if (gomp_affinity_format_len)
|
|
+ p = gomp_realloc (gomp_affinity_format_var, len + 1);
|
|
+ else
|
|
+ p = gomp_malloc (len + 1);
|
|
+ memcpy (p, format, len);
|
|
+ gomp_affinity_format_var = p;
|
|
+ gomp_affinity_format_len = len + 1;
|
|
+ }
|
|
+ gomp_affinity_format_var[len] = '\0';
|
|
+}
|
|
+
|
|
+void
|
|
+omp_set_affinity_format (const char *format)
|
|
+{
|
|
+ gomp_set_affinity_format (format, strlen (format));
|
|
+}
|
|
+
|
|
+size_t
|
|
+omp_get_affinity_format (char *buffer, size_t size)
|
|
+{
|
|
+ size_t len = strlen (gomp_affinity_format_var);
|
|
+ if (size)
|
|
+ {
|
|
+ if (len < size)
|
|
+ memcpy (buffer, gomp_affinity_format_var, len + 1);
|
|
+ else
|
|
+ {
|
|
+ memcpy (buffer, gomp_affinity_format_var, size - 1);
|
|
+ buffer[size - 1] = '\0';
|
|
+ }
|
|
+ }
|
|
+ return len;
|
|
+}
|
|
+
|
|
+void
|
|
+gomp_display_string (char *buffer, size_t size, size_t *ret,
|
|
+ const char *str, size_t len)
|
|
+{
|
|
+ size_t r = *ret;
|
|
+ if (size && r < size)
|
|
+ {
|
|
+ size_t l = len;
|
|
+ if (size - r < len)
|
|
+ l = size - r;
|
|
+ memcpy (buffer + r, str, l);
|
|
+ }
|
|
+ *ret += len;
|
|
+ if (__builtin_expect (r > *ret, 0))
|
|
+ gomp_fatal ("overflow in omp_capture_affinity");
|
|
+}
|
|
+
|
|
+static void
|
|
+gomp_display_repeat (char *buffer, size_t size, size_t *ret,
|
|
+ char c, size_t len)
|
|
+{
|
|
+ size_t r = *ret;
|
|
+ if (size && r < size)
|
|
+ {
|
|
+ size_t l = len;
|
|
+ if (size - r < len)
|
|
+ l = size - r;
|
|
+ memset (buffer + r, c, l);
|
|
+ }
|
|
+ *ret += len;
|
|
+ if (__builtin_expect (r > *ret, 0))
|
|
+ gomp_fatal ("overflow in omp_capture_affinity");
|
|
+}
|
|
+
|
|
+static void
|
|
+gomp_display_num (char *buffer, size_t size, size_t *ret,
|
|
+ bool zero, bool right, size_t sz, char *buf)
|
|
+{
|
|
+ size_t l = strlen (buf);
|
|
+ if (sz == (size_t) -1 || l >= sz)
|
|
+ {
|
|
+ gomp_display_string (buffer, size, ret, buf, l);
|
|
+ return;
|
|
+ }
|
|
+ if (zero)
|
|
+ {
|
|
+ if (buf[0] == '-')
|
|
+ gomp_display_string (buffer, size, ret, buf, 1);
|
|
+ else if (buf[0] == '0' && buf[1] == 'x')
|
|
+ gomp_display_string (buffer, size, ret, buf, 2);
|
|
+ gomp_display_repeat (buffer, size, ret, '0', sz - l);
|
|
+ if (buf[0] == '-')
|
|
+ gomp_display_string (buffer, size, ret, buf + 1, l - 1);
|
|
+ else if (buf[0] == '0' && buf[1] == 'x')
|
|
+ gomp_display_string (buffer, size, ret, buf + 2, l - 2);
|
|
+ else
|
|
+ gomp_display_string (buffer, size, ret, buf, l);
|
|
+ }
|
|
+ else if (right)
|
|
+ {
|
|
+ gomp_display_repeat (buffer, size, ret, ' ', sz - l);
|
|
+ gomp_display_string (buffer, size, ret, buf, l);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ gomp_display_string (buffer, size, ret, buf, l);
|
|
+ gomp_display_repeat (buffer, size, ret, ' ', sz - l);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
+gomp_display_int (char *buffer, size_t size, size_t *ret,
|
|
+ bool zero, bool right, size_t sz, int num)
|
|
+{
|
|
+ char buf[3 * sizeof (int) + 2];
|
|
+ sprintf (buf, "%d", num);
|
|
+ gomp_display_num (buffer, size, ret, zero, right, sz, buf);
|
|
+}
|
|
+
|
|
+static void
|
|
+gomp_display_string_len (char *buffer, size_t size, size_t *ret,
|
|
+ bool right, size_t sz, char *str, size_t len)
|
|
+{
|
|
+ if (sz == (size_t) -1 || len >= sz)
|
|
+ {
|
|
+ gomp_display_string (buffer, size, ret, str, len);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (right)
|
|
+ {
|
|
+ gomp_display_repeat (buffer, size, ret, ' ', sz - len);
|
|
+ gomp_display_string (buffer, size, ret, str, len);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ gomp_display_string (buffer, size, ret, str, len);
|
|
+ gomp_display_repeat (buffer, size, ret, ' ', sz - len);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
+gomp_display_hostname (char *buffer, size_t size, size_t *ret,
|
|
+ bool right, size_t sz)
|
|
+{
|
|
+#ifdef HAVE_GETHOSTNAME
|
|
+ {
|
|
+ char buf[256];
|
|
+ char *b = buf;
|
|
+ size_t len = 256;
|
|
+ do
|
|
+ {
|
|
+ b[len - 1] = '\0';
|
|
+ if (gethostname (b, len - 1) == 0)
|
|
+ {
|
|
+ size_t l = strlen (b);
|
|
+ if (l < len - 1)
|
|
+ {
|
|
+ gomp_display_string_len (buffer, size, ret,
|
|
+ right, sz, b, l);
|
|
+ if (b != buf)
|
|
+ free (b);
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+ if (len == 1048576)
|
|
+ break;
|
|
+ len = len * 2;
|
|
+ if (len == 512)
|
|
+ b = gomp_malloc (len);
|
|
+ else
|
|
+ b = gomp_realloc (b, len);
|
|
+ }
|
|
+ while (1);
|
|
+ if (b != buf)
|
|
+ free (b);
|
|
+ }
|
|
+#endif
|
|
+#ifdef HAVE_UNAME
|
|
+ {
|
|
+ struct utsname buf;
|
|
+ if (uname (&buf) == 0)
|
|
+ {
|
|
+ gomp_display_string_len (buffer, size, ret, right, sz,
|
|
+ buf.nodename, strlen (buf.nodename));
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+#endif
|
|
+ gomp_display_string_len (buffer, size, ret, right, sz, "node", 4);
|
|
+}
|
|
+
|
|
+struct affinity_types_struct {
|
|
+ char long_str[18];
|
|
+ char long_len;
|
|
+ char short_c; };
|
|
+
|
|
+static struct affinity_types_struct affinity_types[] =
|
|
+{
|
|
+#define AFFINITY_TYPE(l, s) \
|
|
+ { #l, sizeof (#l) - 1, s }
|
|
+ AFFINITY_TYPE (team_num, 't'),
|
|
+ AFFINITY_TYPE (num_teams, 'T'),
|
|
+ AFFINITY_TYPE (nesting_level, 'L'),
|
|
+ AFFINITY_TYPE (thread_num, 'n'),
|
|
+ AFFINITY_TYPE (num_threads, 'N'),
|
|
+ AFFINITY_TYPE (ancestor_tnum, 'a'),
|
|
+ AFFINITY_TYPE (host, 'H'),
|
|
+ AFFINITY_TYPE (process_id, 'P'),
|
|
+ AFFINITY_TYPE (native_thread_id, 'i'),
|
|
+ AFFINITY_TYPE (thread_affinity, 'A')
|
|
+#undef AFFINITY_TYPE
|
|
+};
|
|
+
|
|
+size_t
|
|
+gomp_display_affinity (char *buffer, size_t size,
|
|
+ const char *format, gomp_thread_handle handle,
|
|
+ struct gomp_team_state *ts, unsigned int place)
|
|
+{
|
|
+ size_t ret = 0;
|
|
+ do
|
|
+ {
|
|
+ const char *p = strchr (format, '%');
|
|
+ bool zero = false;
|
|
+ bool right = false;
|
|
+ size_t sz = -1;
|
|
+ char c;
|
|
+ int val;
|
|
+ if (p == NULL)
|
|
+ p = strchr (format, '\0');
|
|
+ if (p != format)
|
|
+ gomp_display_string (buffer, size, &ret,
|
|
+ format, p - format);
|
|
+ if (*p == '\0')
|
|
+ break;
|
|
+ p++;
|
|
+ if (*p == '%')
|
|
+ {
|
|
+ gomp_display_string (buffer, size, &ret, "%", 1);
|
|
+ format = p + 1;
|
|
+ continue;
|
|
+ }
|
|
+ if (*p == '0')
|
|
+ {
|
|
+ zero = true;
|
|
+ p++;
|
|
+ if (*p != '.')
|
|
+ gomp_fatal ("leading zero not followed by dot in affinity format");
|
|
+ }
|
|
+ if (*p == '.')
|
|
+ {
|
|
+ right = true;
|
|
+ p++;
|
|
+ }
|
|
+ if (*p >= '1' && *p <= '9')
|
|
+ {
|
|
+ char *end;
|
|
+ sz = strtoul (p, &end, 10);
|
|
+ p = end;
|
|
+ }
|
|
+ else if (zero || right)
|
|
+ gomp_fatal ("leading zero or right justification in affinity format "
|
|
+ "requires size");
|
|
+ c = *p;
|
|
+ if (c == '{')
|
|
+ {
|
|
+ int i;
|
|
+ for (i = 0;
|
|
+ i < sizeof (affinity_types) / sizeof (affinity_types[0]); ++i)
|
|
+ if (strncmp (p + 1, affinity_types[i].long_str,
|
|
+ affinity_types[i].long_len) == 0
|
|
+ && p[affinity_types[i].long_len + 1] == '}')
|
|
+ {
|
|
+ c = affinity_types[i].short_c;
|
|
+ p += affinity_types[i].long_len + 1;
|
|
+ break;
|
|
+ }
|
|
+ if (c == '{')
|
|
+ {
|
|
+ char *q = strchr (p + 1, '}');
|
|
+ if (q)
|
|
+ gomp_fatal ("unsupported long type name '%.*s' in affinity "
|
|
+ "format", (int) (q - (p + 1)), p + 1);
|
|
+ else
|
|
+ gomp_fatal ("unterminated long type name '%s' in affinity "
|
|
+ "format", p + 1);
|
|
+ }
|
|
+ }
|
|
+ switch (c)
|
|
+ {
|
|
+ case 't':
|
|
+ val = omp_get_team_num ();
|
|
+ goto do_int;
|
|
+ case 'T':
|
|
+ val = omp_get_num_teams ();
|
|
+ goto do_int;
|
|
+ case 'L':
|
|
+ val = ts->level;
|
|
+ goto do_int;
|
|
+ case 'n':
|
|
+ val = ts->team_id;
|
|
+ goto do_int;
|
|
+ case 'N':
|
|
+ val = ts->team ? ts->team->nthreads : 1;
|
|
+ goto do_int;
|
|
+ case 'a':
|
|
+ val = ts->team ? ts->team->prev_ts.team_id : -1;
|
|
+ goto do_int;
|
|
+ case 'H':
|
|
+ gomp_display_hostname (buffer, size, &ret, right, sz);
|
|
+ break;
|
|
+ case 'P':
|
|
+#ifdef HAVE_GETPID
|
|
+ val = getpid ();
|
|
+#else
|
|
+ val = 0;
|
|
+#endif
|
|
+ goto do_int;
|
|
+ case 'i':
|
|
+#if defined(LIBGOMP_USE_PTHREADS) && defined(__GNUC__)
|
|
+ {
|
|
+ char buf[3 * (sizeof (handle) + sizeof (uintptr_t) + sizeof (int))
|
|
+ + 4];
|
|
+ /* This macro returns expr unmodified for integral or pointer
|
|
+ types and 0 for anything else (e.g. aggregates). */
|
|
+#define gomp_nonaggregate(expr) \
|
|
+ __builtin_choose_expr (__builtin_classify_type (expr) == 1 \
|
|
+ || __builtin_classify_type (expr) == 5, expr, 0)
|
|
+ /* This macro returns expr unmodified for integral types,
|
|
+ (uintptr_t) (expr) for pointer types and 0 for anything else
|
|
+ (e.g. aggregates). */
|
|
+#define gomp_integral(expr) \
|
|
+ __builtin_choose_expr (__builtin_classify_type (expr) == 5, \
|
|
+ (uintptr_t) gomp_nonaggregate (expr), \
|
|
+ gomp_nonaggregate (expr))
|
|
+
|
|
+ if (sizeof (gomp_integral (handle)) == sizeof (unsigned long))
|
|
+ sprintf (buf, "0x%lx", (unsigned long) gomp_integral (handle));
|
|
+#if defined (HAVE_INTTYPES_H) && defined (PRIx64)
|
|
+ else if (sizeof (gomp_integral (handle)) == sizeof (uint64_t))
|
|
+ sprintf (buf, "0x%" PRIx64, (uint64_t) gomp_integral (handle));
|
|
+#else
|
|
+ else if (sizeof (gomp_integral (handle))
|
|
+ == sizeof (unsigned long long))
|
|
+ sprintf (buf, "0x%llx",
|
|
+ (unsigned long long) gomp_integral (handle));
|
|
+#endif
|
|
+ else
|
|
+ sprintf (buf, "0x%x", (unsigned int) gomp_integral (handle));
|
|
+ gomp_display_num (buffer, size, &ret, zero, right, sz, buf);
|
|
+ break;
|
|
+ }
|
|
+#else
|
|
+ val = 0;
|
|
+ goto do_int;
|
|
+#endif
|
|
+ case 'A':
|
|
+ if (sz == (size_t) -1)
|
|
+ gomp_display_affinity_place (buffer, size, &ret,
|
|
+ place - 1);
|
|
+ else if (right)
|
|
+ {
|
|
+ size_t len = 0;
|
|
+ gomp_display_affinity_place (NULL, 0, &len, place - 1);
|
|
+ if (len < sz)
|
|
+ gomp_display_repeat (buffer, size, &ret, ' ', sz - len);
|
|
+ gomp_display_affinity_place (buffer, size, &ret, place - 1);
|
|
+ }
|
|
+ else
|
|
+ {
|
|
+ size_t start = ret;
|
|
+ gomp_display_affinity_place (buffer, size, &ret, place - 1);
|
|
+ if (ret - start < sz)
|
|
+ gomp_display_repeat (buffer, size, &ret, ' ', sz - (ret - start));
|
|
+ }
|
|
+ break;
|
|
+ do_int:
|
|
+ gomp_display_int (buffer, size, &ret, zero, right, sz, val);
|
|
+ break;
|
|
+ default:
|
|
+ gomp_fatal ("unsupported type %c in affinity format", c);
|
|
+ }
|
|
+ format = p + 1;
|
|
+ }
|
|
+ while (1);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+size_t
|
|
+omp_capture_affinity (char *buffer, size_t size, const char *format)
|
|
+{
|
|
+ struct gomp_thread *thr = gomp_thread ();
|
|
+ size_t ret
|
|
+ = gomp_display_affinity (buffer, size,
|
|
+ format && *format
|
|
+ ? format : gomp_affinity_format_var,
|
|
+ gomp_thread_self (), &thr->ts, thr->place);
|
|
+ if (size)
|
|
+ {
|
|
+ if (ret >= size)
|
|
+ buffer[size - 1] = '\0';
|
|
+ else
|
|
+ buffer[ret] = '\0';
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+ialias (omp_capture_affinity)
|
|
+
|
|
+void
|
|
+omp_display_affinity (const char *format)
|
|
+{
|
|
+ char buf[512];
|
|
+ char *b;
|
|
+ size_t ret = ialias_call (omp_capture_affinity) (buf, sizeof buf, format);
|
|
+ if (ret < sizeof buf)
|
|
+ {
|
|
+ buf[ret] = '\n';
|
|
+ gomp_print_string (buf, ret + 1);
|
|
+ return;
|
|
+ }
|
|
+ b = gomp_malloc (ret + 1);
|
|
+ ialias_call (omp_capture_affinity) (b, ret + 1, format);
|
|
+ b[ret] = '\n';
|
|
+ gomp_print_string (b, ret + 1);
|
|
+ free (b);
|
|
+}
|
|
+
|
|
+void
|
|
+gomp_display_affinity_thread (gomp_thread_handle handle,
|
|
+ struct gomp_team_state *ts, unsigned int place)
|
|
+{
|
|
+ char buf[512];
|
|
+ char *b;
|
|
+ size_t ret = gomp_display_affinity (buf, sizeof buf, gomp_affinity_format_var,
|
|
+ handle, ts, place);
|
|
+ if (ret < sizeof buf)
|
|
+ {
|
|
+ buf[ret] = '\n';
|
|
+ gomp_print_string (buf, ret + 1);
|
|
+ return;
|
|
+ }
|
|
+ b = gomp_malloc (ret + 1);
|
|
+ gomp_display_affinity (b, ret + 1, gomp_affinity_format_var,
|
|
+ handle, ts, place);
|
|
+ b[ret] = '\n';
|
|
+ gomp_print_string (b, ret + 1);
|
|
+ free (b);
|
|
+}
|
|
--- libgomp/single.c.jj 2018-04-25 09:40:31.870655561 +0200
|
|
+++ libgomp/single.c 2019-05-07 18:46:36.536109576 +0200
|
|
@@ -47,7 +47,7 @@ GOMP_single_start (void)
|
|
return __sync_bool_compare_and_swap (&team->single_count, single_count,
|
|
single_count + 1L);
|
|
#else
|
|
- bool ret = gomp_work_share_start (false);
|
|
+ bool ret = gomp_work_share_start (0);
|
|
if (ret)
|
|
gomp_work_share_init_done ();
|
|
gomp_work_share_end_nowait ();
|
|
@@ -68,7 +68,7 @@ GOMP_single_copy_start (void)
|
|
bool first;
|
|
void *ret;
|
|
|
|
- first = gomp_work_share_start (false);
|
|
+ first = gomp_work_share_start (0);
|
|
|
|
if (first)
|
|
{
|
|
--- libgomp/oacc-cuda.c.jj 2018-04-25 09:40:31.321655307 +0200
|
|
+++ libgomp/oacc-cuda.c 2019-05-07 18:46:36.528109704 +0200
|
|
@@ -58,7 +58,7 @@ acc_get_cuda_stream (int async)
|
|
{
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
|
|
- if (async < 0)
|
|
+ if (!async_valid_p (async))
|
|
return NULL;
|
|
|
|
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
|
|
@@ -72,7 +72,7 @@ acc_set_cuda_stream (int async, void *st
|
|
{
|
|
struct goacc_thread *thr;
|
|
|
|
- if (async < 0 || stream == NULL)
|
|
+ if (!async_valid_p (async) || stream == NULL)
|
|
return 0;
|
|
|
|
goacc_lazy_initialize ();
|
|
--- libgomp/work.c.jj 2018-04-25 09:40:31.925655587 +0200
|
|
+++ libgomp/work.c 2019-05-07 18:46:36.548109384 +0200
|
|
@@ -76,7 +76,15 @@ alloc_work_share (struct gomp_team *team
|
|
#endif
|
|
|
|
team->work_share_chunk *= 2;
|
|
+ /* Allocating gomp_work_share structures aligned is just an
|
|
+ optimization, don't do it when using the fallback method. */
|
|
+#ifdef GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC
|
|
+ ws = gomp_aligned_alloc (__alignof (struct gomp_work_share),
|
|
+ team->work_share_chunk
|
|
+ * sizeof (struct gomp_work_share));
|
|
+#else
|
|
ws = gomp_malloc (team->work_share_chunk * sizeof (struct gomp_work_share));
|
|
+#endif
|
|
ws->next_alloc = team->work_shares[0].next_alloc;
|
|
team->work_shares[0].next_alloc = ws;
|
|
team->work_share_list_alloc = &ws[1];
|
|
@@ -90,30 +98,35 @@ alloc_work_share (struct gomp_team *team
|
|
This shouldn't touch the next_alloc field. */
|
|
|
|
void
|
|
-gomp_init_work_share (struct gomp_work_share *ws, bool ordered,
|
|
+gomp_init_work_share (struct gomp_work_share *ws, size_t ordered,
|
|
unsigned nthreads)
|
|
{
|
|
gomp_mutex_init (&ws->lock);
|
|
if (__builtin_expect (ordered, 0))
|
|
{
|
|
-#define INLINE_ORDERED_TEAM_IDS_CNT \
|
|
- ((sizeof (struct gomp_work_share) \
|
|
- - offsetof (struct gomp_work_share, inline_ordered_team_ids)) \
|
|
- / sizeof (((struct gomp_work_share *) 0)->inline_ordered_team_ids[0]))
|
|
-
|
|
- if (nthreads > INLINE_ORDERED_TEAM_IDS_CNT)
|
|
- ws->ordered_team_ids
|
|
- = gomp_malloc (nthreads * sizeof (*ws->ordered_team_ids));
|
|
+#define INLINE_ORDERED_TEAM_IDS_SIZE \
|
|
+ (sizeof (struct gomp_work_share) \
|
|
+ - offsetof (struct gomp_work_share, inline_ordered_team_ids))
|
|
+
|
|
+ if (__builtin_expect (ordered != 1, 0)) /* ORDERED > 1 requests extra bytes. */
|
|
+ {
|
|
+ ordered += nthreads * sizeof (*ws->ordered_team_ids) - 1; /* ids + (ORDERED - 1). */
|
|
+ ordered = ordered + __alignof__ (long long) - 1; /* Round the total up ... */
|
|
+ ordered &= ~(__alignof__ (long long) - 1); /* ... to a multiple of alignof (long long). */
|
|
+ }
|
|
+ else
|
|
+ ordered = nthreads * sizeof (*ws->ordered_team_ids); /* Team ids only. */
|
|
+ if (ordered > INLINE_ORDERED_TEAM_IDS_SIZE) /* Does not fit in the inline array. */
|
|
+ ws->ordered_team_ids = gomp_malloc (ordered);
|
|
else
|
|
ws->ordered_team_ids = ws->inline_ordered_team_ids;
|
|
- memset (ws->ordered_team_ids, '\0',
|
|
- nthreads * sizeof (*ws->ordered_team_ids));
|
|
+ memset (ws->ordered_team_ids, '\0', ordered); /* Zero the whole ORDERED-byte area. */
|
|
ws->ordered_num_used = 0;
|
|
ws->ordered_owner = -1;
|
|
ws->ordered_cur = 0;
|
|
}
|
|
else
|
|
- ws->ordered_team_ids = NULL;
|
|
+ ws->ordered_team_ids = ws->inline_ordered_team_ids; /* Not ordered: point at the inline array. */
|
|
gomp_ptrlock_init (&ws->next_ws, NULL);
|
|
ws->threads_completed = 0;
|
|
}
|
|
@@ -166,7 +179,7 @@ free_work_share (struct gomp_team *team,
|
|
if this was the first thread to reach this point. */
|
|
|
|
bool
|
|
-gomp_work_share_start (bool ordered)
|
|
+gomp_work_share_start (size_t ordered)
|
|
{
|
|
struct gomp_thread *thr = gomp_thread ();
|
|
struct gomp_team *team = thr->ts.team;
|
|
@@ -178,7 +191,7 @@ gomp_work_share_start (bool ordered)
|
|
ws = gomp_malloc (sizeof (*ws));
|
|
gomp_init_work_share (ws, ordered, 1);
|
|
thr->ts.work_share = ws;
|
|
- return ws;
|
|
+ return true;
|
|
}
|
|
|
|
ws = thr->ts.work_share;
|
|
--- include/gomp-constants.h.jj 2018-04-25 09:40:39.757659209 +0200
|
|
+++ include/gomp-constants.h 2019-05-07 18:57:33.333627031 +0200
|
|
@@ -189,6 +189,7 @@ enum gomp_map_kind
|
|
#define GOMP_TASK_FLAG_GRAINSIZE (1 << 9)
|
|
#define GOMP_TASK_FLAG_IF (1 << 10)
|
|
#define GOMP_TASK_FLAG_NOGROUP (1 << 11)
|
|
+#define GOMP_TASK_FLAG_REDUCTION (1 << 12)
|
|
|
|
/* GOMP_target{_ext,update_ext,enter_exit_data} flags argument. */
|
|
#define GOMP_TARGET_FLAG_NOWAIT (1 << 0)
|
|
@@ -196,6 +197,18 @@ enum gomp_map_kind
|
|
/* Internal to libgomp. */
|
|
#define GOMP_TARGET_FLAG_UPDATE (1U << 31)
|
|
|
|
+
|
|
+/* OpenACC construct flags. */
|
|
+
|
|
+/* Force host fallback execution. */
|
|
+#define GOACC_FLAG_HOST_FALLBACK (1 << 0)
|
|
+
|
|
+/* For legacy reasons, in the ABI, the GOACC_FLAGs are encoded as an inverted
|
|
+ bitmask. */
|
|
+#define GOACC_FLAGS_MARSHAL_OP BIT_NOT_EXPR
|
|
+#define GOACC_FLAGS_UNMARSHAL(X) (~(X))
|
|
+
|
|
+
|
|
/* Versions of libgomp and device-specific plugins. GOMP_VERSION
|
|
should be incremented whenever an ABI-incompatible change is introduced
|
|
to the plugin interface defined in libgomp/libgomp.h. */
|
|
@@ -251,6 +264,12 @@ enum gomp_map_kind
|
|
at most and shifted by this many bits. */
|
|
#define GOMP_TARGET_ARG_VALUE_SHIFT 16
|
|
|
|
+/* Dependence types in omp_depend_t objects. */
|
|
+#define GOMP_DEPEND_IN 1
|
|
+#define GOMP_DEPEND_OUT 2
|
|
+#define GOMP_DEPEND_INOUT 3
|
|
+#define GOMP_DEPEND_MUTEXINOUTSET 4
|
|
+
|
|
/* HSA specific data structures. */
|
|
|
|
/* Identifiers of device-specific target arguments. */
|