diff --git a/.gcc.metadata b/.gcc.metadata index 99d2974..03b4cd9 100644 --- a/.gcc.metadata +++ b/.gcc.metadata @@ -1,3 +1,3 @@ -1fe3aa7ce95faa0f4d7f08f0dfefd86ff4b43015 SOURCES/gcc-8.2.1-20180905.tar.xz +8ee669ee60997110e6251c72dac66bf69bbe13c7 SOURCES/gcc-8.3.1-20190507.tar.xz 3bdb3cc01fa7690a0e20ea5cfffcbe690f7665eb SOURCES/nvptx-newlib-aadc8eb0ec43b7cd0dd2dfb484bae63c8b05ef24.tar.xz ce8eb83be0ac37fb5d5388df455a980fe37b4f13 SOURCES/nvptx-tools-c28050f60193b3b95a18866a96f03334e874e78f.tar.xz diff --git a/.gitignore b/.gitignore index 25f3c40..fb2c952 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ -SOURCES/gcc-8.2.1-20180905.tar.xz +SOURCES/gcc-8.3.1-20190507.tar.xz SOURCES/nvptx-newlib-aadc8eb0ec43b7cd0dd2dfb484bae63c8b05ef24.tar.xz SOURCES/nvptx-tools-c28050f60193b3b95a18866a96f03334e874e78f.tar.xz diff --git a/SOURCES/gcc8-libgomp-20190503.patch b/SOURCES/gcc8-libgomp-20190503.patch new file mode 100644 index 0000000..caa13f2 --- /dev/null +++ b/SOURCES/gcc8-libgomp-20190503.patch @@ -0,0 +1,10060 @@ +--- libgomp/loop.c.jj 2018-04-25 09:40:31.870655561 +0200 ++++ libgomp/loop.c 2019-05-07 18:46:36.526109736 +0200 +@@ -27,9 +27,13 @@ + + #include <limits.h> + #include <stdlib.h> ++#include <string.h> + #include "libgomp.h" + + ++ialias (GOMP_loop_runtime_next) ++ialias_redirect (GOMP_taskgroup_reduction_register) ++ + /* Initialize the given work share construct from the given arguments. */ + + static inline void +@@ -79,12 +83,12 @@ gomp_loop_init (struct gomp_work_share * + } + + /* The *_start routines are called when first encountering a loop construct +- that is not bound directly to a parallel construct. The first thread ++ that is not bound directly to a parallel construct. The first thread + that arrives will create the work-share construct; subsequent threads + will see the construct exists and allocate work from it. + + START, END, INCR are the bounds of the loop; due to the restrictions of +- OpenMP, these values must be the same in every thread. This is not ++ OpenMP, these values must be the same in every thread. This is not + verified (nor is it entirely verifiable, since START is not necessarily + retained intact in the work-share data structure). CHUNK_SIZE is the + scheduling parameter; again this must be identical in all threads. */
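As a reading aid for the protocol described in that comment: compiler-generated code drives these entry points in a start/next/end pattern. A minimal sketch of how a worksharing loop with schedule(dynamic, 4) is lowered against this ABI might look as follows, assuming a hypothetical loop body body () and trip count n (real compiler output additionally handles lastprivate, cancellation, and nowait):

  long istart, iend;
  /* The first thread to arrive creates the work share; every thread
     then pulls chunks of 4 iterations until the range is exhausted.  */
  if (GOMP_loop_dynamic_start (0, n, 1, 4, &istart, &iend))
    do
      for (long i = istart; i < iend; i++)
        body (i);
    while (GOMP_loop_dynamic_next (&istart, &iend));
  GOMP_loop_end ();  /* Implicit barrier at the end of the construct.  */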
+@@ -101,7 +105,7 @@ gomp_loop_static_start (long start, long + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_STATIC, chunk_size); +@@ -123,7 +127,7 @@ gomp_loop_dynamic_start (long start, lon + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_DYNAMIC, chunk_size); +@@ -151,7 +155,7 @@ gomp_loop_guided_start (long start, long + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_GUIDED, chunk_size); +@@ -174,7 +178,7 @@ GOMP_loop_runtime_start (long start, lon + long *istart, long *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_static_start (start, end, incr, +@@ -197,6 +201,100 @@ GOMP_loop_runtime_start (long start, lon + } + } + ++static long ++gomp_adjust_sched (long sched, long *chunk_size) ++{ ++ sched &= ~GFS_MONOTONIC; ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_DYNAMIC: ++ case GFS_GUIDED: ++ return sched; ++ /* GFS_RUNTIME is used for runtime schedule without monotonic ++ or nonmonotonic modifiers on the clause. ++ GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic ++ modifier. */ ++ case GFS_RUNTIME: ++ /* GFS_AUTO is used for runtime schedule with nonmonotonic ++ modifier. */ ++ case GFS_AUTO: ++ { ++ struct gomp_task_icv *icv = gomp_icv (false); ++ sched = icv->run_sched_var & ~GFS_MONOTONIC; ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_DYNAMIC: ++ case GFS_GUIDED: ++ *chunk_size = icv->run_sched_chunk_size; ++ break; ++ case GFS_AUTO: ++ sched = GFS_STATIC; ++ *chunk_size = 0; ++ break; ++ default: ++ abort (); ++ } ++ return sched; ++ } ++ default: ++ abort (); ++ } ++} ++ ++bool ++GOMP_loop_start (long start, long end, long incr, long sched, ++ long chunk_size, long *istart, long *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (gomp_work_share_start (0)) ++ { ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_init (thr->ts.work_share, start, end, incr, ++ sched, chunk_size); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ if (mem) ++ { ++ uintptr_t size = (uintptr_t) *mem; ++ if (size > (sizeof (struct gomp_work_share) ++ - offsetof (struct gomp_work_share, ++ inline_ordered_team_ids))) ++ thr->ts.work_share->ordered_team_ids ++ = gomp_malloc_cleared (size); ++ else ++ memset (thr->ts.work_share->ordered_team_ids, '\0', size); ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ if (mem) ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ ++ if (!istart) ++ return true; ++ return ialias_call (GOMP_loop_runtime_next) (istart, iend); ++} ++ + /* 
The *_ordered_*_start routines are similar. The only difference is that + this work-share construct is initialized to expect an ORDERED section. */ + +@@ -207,7 +305,7 @@ gomp_loop_ordered_static_start (long sta + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_STATIC, chunk_size); +@@ -225,7 +323,7 @@ gomp_loop_ordered_dynamic_start (long st + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_DYNAMIC, chunk_size); +@@ -250,7 +348,7 @@ gomp_loop_ordered_guided_start (long sta + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_GUIDED, chunk_size); +@@ -273,7 +371,7 @@ GOMP_loop_ordered_runtime_start (long st + long *istart, long *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_ordered_static_start (start, end, incr, +@@ -297,6 +395,81 @@ GOMP_loop_ordered_runtime_start (long st + } + } + ++bool ++GOMP_loop_ordered_start (long start, long end, long incr, long sched, ++ long chunk_size, long *istart, long *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ size_t ordered = 1; ++ bool ret; ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (mem) ++ ordered += (uintptr_t) *mem; ++ if (gomp_work_share_start (ordered)) ++ { ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_init (thr->ts.work_share, start, end, incr, ++ sched, chunk_size); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ if (sched == GFS_STATIC) ++ gomp_ordered_static_init (); ++ else ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ sched = thr->ts.work_share->sched; ++ if (sched != GFS_STATIC) ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ } ++ ++ if (mem) ++ { ++ uintptr_t p ++ = (uintptr_t) (thr->ts.work_share->ordered_team_ids ++ + (thr->ts.team ? thr->ts.team->nthreads : 1)); ++ p += __alignof__ (long long) - 1; ++ p &= ~(__alignof__ (long long) - 1); ++ *mem = (void *) p; ++ } ++ ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_AUTO: ++ return !gomp_iter_static_next (istart, iend); ++ case GFS_DYNAMIC: ++ ret = gomp_iter_dynamic_next_locked (istart, iend); ++ break; ++ case GFS_GUIDED: ++ ret = gomp_iter_guided_next_locked (istart, iend); ++ break; ++ default: ++ abort (); ++ } ++ ++ if (ret) ++ gomp_ordered_first (); ++ gomp_mutex_unlock (&thr->ts.work_share->lock); ++ return ret; ++} ++ + /* The *_doacross_*_start routines are similar. 
The only difference is that + this work-share construct is initialized to expect an ORDERED(N) - DOACROSS + section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 +@@ -310,11 +483,11 @@ gomp_loop_doacross_static_start (unsigne + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, + GFS_STATIC, chunk_size); +- gomp_doacross_init (ncounts, counts, chunk_size); ++ gomp_doacross_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -328,11 +501,11 @@ gomp_loop_doacross_dynamic_start (unsign + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, + GFS_DYNAMIC, chunk_size); +- gomp_doacross_init (ncounts, counts, chunk_size); ++ gomp_doacross_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -354,11 +527,11 @@ gomp_loop_doacross_guided_start (unsigne + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, + GFS_GUIDED, chunk_size); +- gomp_doacross_init (ncounts, counts, chunk_size); ++ gomp_doacross_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -378,7 +551,7 @@ GOMP_loop_doacross_runtime_start (unsign + long *istart, long *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_doacross_static_start (ncounts, counts, +@@ -402,8 +575,52 @@ GOMP_loop_doacross_runtime_start (unsign + } + } + +-/* The *_next routines are called when the thread completes processing of +- the iteration block currently assigned to it. If the work-share ++bool ++GOMP_loop_doacross_start (unsigned ncounts, long *counts, long sched, ++ long chunk_size, long *istart, long *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (gomp_work_share_start (0)) ++ { ++ size_t extra = 0; ++ if (mem) ++ extra = (uintptr_t) *mem; ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, ++ sched, chunk_size); ++ gomp_doacross_init (ncounts, counts, chunk_size, extra); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ sched = thr->ts.work_share->sched; ++ } ++ ++ if (mem) ++ *mem = thr->ts.work_share->doacross->extra; ++ ++ return ialias_call (GOMP_loop_runtime_next) (istart, iend); ++} ++ ++/* The *_next routines are called when the thread completes processing of ++ the iteration block currently assigned to it. If the work-share + construct is bound directly to a parallel construct, then the iteration + bounds may have been set up before the parallel. In which case, this + may be the first iteration for the thread. 
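For contrast with those per-schedule entry points, the new combined GOMP_loop_start added above takes the schedule kind and chunk size as runtime arguments, plus optional task-reduction and scratch-memory pointers. A rough sketch of a caller for schedule(runtime), under the same assumptions as the earlier sketch (hypothetical body () and n; GFS_RUNTIME is the libgomp-internal schedule constant, which real compiler output emits as a plain integer):

  long istart, iend;
  /* reductions == NULL and mem == NULL request neither task reductions
     nor extra ordered/scratch storage; with a non-NULL istart the call
     also fetches the first iteration chunk via GOMP_loop_runtime_next.  */
  if (GOMP_loop_start (0, n, 1, GFS_RUNTIME, 0, &istart, &iend, NULL, NULL))
    do
      for (long i = istart; i < iend; i++)
        body (i);
    while (GOMP_loop_runtime_next (&istart, &iend));
  GOMP_loop_end ();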
+@@ -456,7 +673,7 @@ bool + GOMP_loop_runtime_next (long *istart, long *iend) + { + struct gomp_thread *thr = gomp_thread (); +- ++ + switch (thr->ts.work_share->sched) + { + case GFS_STATIC: +@@ -534,7 +751,7 @@ bool + GOMP_loop_ordered_runtime_next (long *istart, long *iend) + { + struct gomp_thread *thr = gomp_thread (); +- ++ + switch (thr->ts.work_share->sched) + { + case GFS_STATIC: +@@ -563,7 +780,7 @@ gomp_parallel_loop_start (void (*fn) (vo + num_threads = gomp_resolve_num_threads (num_threads, 0); + team = gomp_new_team (num_threads); + gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size); +- gomp_team_start (fn, data, num_threads, flags, team); ++ gomp_team_start (fn, data, num_threads, flags, team, NULL); + } + + void +@@ -600,7 +817,8 @@ GOMP_parallel_loop_runtime_start (void ( + { + struct gomp_task_icv *icv = gomp_icv (false); + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, +- icv->run_sched_var, icv->run_sched_chunk_size, 0); ++ icv->run_sched_var & ~GFS_MONOTONIC, ++ icv->run_sched_chunk_size, 0); + } + + ialias_redirect (GOMP_parallel_end) +@@ -638,11 +856,28 @@ GOMP_parallel_loop_guided (void (*fn) (v + GOMP_parallel_end (); + } + ++void ++GOMP_parallel_loop_runtime (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, long end, ++ long incr, unsigned flags) ++{ ++ struct gomp_task_icv *icv = gomp_icv (false); ++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, ++ icv->run_sched_var & ~GFS_MONOTONIC, ++ icv->run_sched_chunk_size, flags); ++ fn (data); ++ GOMP_parallel_end (); ++} ++ + #ifdef HAVE_ATTRIBUTE_ALIAS + extern __typeof(GOMP_parallel_loop_dynamic) GOMP_parallel_loop_nonmonotonic_dynamic + __attribute__((alias ("GOMP_parallel_loop_dynamic"))); + extern __typeof(GOMP_parallel_loop_guided) GOMP_parallel_loop_nonmonotonic_guided + __attribute__((alias ("GOMP_parallel_loop_guided"))); ++extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_nonmonotonic_runtime ++ __attribute__((alias ("GOMP_parallel_loop_runtime"))); ++extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_maybe_nonmonotonic_runtime ++ __attribute__((alias ("GOMP_parallel_loop_runtime"))); + #else + void + GOMP_parallel_loop_nonmonotonic_dynamic (void (*fn) (void *), void *data, +@@ -667,21 +902,35 @@ GOMP_parallel_loop_nonmonotonic_guided ( + fn (data); + GOMP_parallel_end (); + } +-#endif + + void +-GOMP_parallel_loop_runtime (void (*fn) (void *), void *data, +- unsigned num_threads, long start, long end, +- long incr, unsigned flags) ++GOMP_parallel_loop_nonmonotonic_runtime (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, ++ long end, long incr, unsigned flags) + { + struct gomp_task_icv *icv = gomp_icv (false); + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, +- icv->run_sched_var, icv->run_sched_chunk_size, +- flags); ++ icv->run_sched_var & ~GFS_MONOTONIC, ++ icv->run_sched_chunk_size, flags); + fn (data); + GOMP_parallel_end (); + } + ++void ++GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, ++ long end, long incr, ++ unsigned flags) ++{ ++ struct gomp_task_icv *icv = gomp_icv (false); ++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, ++ icv->run_sched_var & ~GFS_MONOTONIC, ++ icv->run_sched_chunk_size, flags); ++ fn (data); ++ GOMP_parallel_end (); ++} ++#endif ++ + /* The GOMP_loop_end* routines are called after the thread is told that + all loop iterations are 
complete. The first two versions synchronize + all threads; the nowait version does not. */ +@@ -721,6 +970,10 @@ extern __typeof(gomp_loop_dynamic_start) + __attribute__((alias ("gomp_loop_dynamic_start"))); + extern __typeof(gomp_loop_guided_start) GOMP_loop_nonmonotonic_guided_start + __attribute__((alias ("gomp_loop_guided_start"))); ++extern __typeof(GOMP_loop_runtime_start) GOMP_loop_nonmonotonic_runtime_start ++ __attribute__((alias ("GOMP_loop_runtime_start"))); ++extern __typeof(GOMP_loop_runtime_start) GOMP_loop_maybe_nonmonotonic_runtime_start ++ __attribute__((alias ("GOMP_loop_runtime_start"))); + + extern __typeof(gomp_loop_ordered_static_start) GOMP_loop_ordered_static_start + __attribute__((alias ("gomp_loop_ordered_static_start"))); +@@ -746,6 +999,10 @@ extern __typeof(gomp_loop_dynamic_next) + __attribute__((alias ("gomp_loop_dynamic_next"))); + extern __typeof(gomp_loop_guided_next) GOMP_loop_nonmonotonic_guided_next + __attribute__((alias ("gomp_loop_guided_next"))); ++extern __typeof(GOMP_loop_runtime_next) GOMP_loop_nonmonotonic_runtime_next ++ __attribute__((alias ("GOMP_loop_runtime_next"))); ++extern __typeof(GOMP_loop_runtime_next) GOMP_loop_maybe_nonmonotonic_runtime_next ++ __attribute__((alias ("GOMP_loop_runtime_next"))); + + extern __typeof(gomp_loop_ordered_static_next) GOMP_loop_ordered_static_next + __attribute__((alias ("gomp_loop_ordered_static_next"))); +@@ -791,6 +1048,20 @@ GOMP_loop_nonmonotonic_guided_start (lon + } + + bool ++GOMP_loop_nonmonotonic_runtime_start (long start, long end, long incr, ++ long *istart, long *iend) ++{ ++ return GOMP_loop_runtime_start (start, end, incr, istart, iend); ++} ++ ++bool ++GOMP_loop_maybe_nonmonotonic_runtime_start (long start, long end, long incr, ++ long *istart, long *iend) ++{ ++ return GOMP_loop_runtime_start (start, end, incr, istart, iend); ++} ++ ++bool + GOMP_loop_ordered_static_start (long start, long end, long incr, + long chunk_size, long *istart, long *iend) + { +@@ -869,6 +1140,18 @@ GOMP_loop_nonmonotonic_guided_next (long + } + + bool ++GOMP_loop_nonmonotonic_runtime_next (long *istart, long *iend) ++{ ++ return GOMP_loop_runtime_next (istart, iend); ++} ++ ++bool ++GOMP_loop_maybe_nonmonotonic_runtime_next (long *istart, long *iend) ++{ ++ return GOMP_loop_runtime_next (istart, iend); ++} ++ ++bool + GOMP_loop_ordered_static_next (long *istart, long *iend) + { + return gomp_loop_ordered_static_next (istart, iend); +--- libgomp/oacc-plugin.c.jj 2018-04-25 09:40:31.322655307 +0200 ++++ libgomp/oacc-plugin.c 2019-05-07 18:46:36.531109656 +0200 +@@ -49,3 +49,14 @@ GOMP_PLUGIN_acc_thread (void) + struct goacc_thread *thr = goacc_thread (); + return thr ? thr->target_tls : NULL; + } ++ ++int ++GOMP_PLUGIN_acc_default_dim (unsigned int i) ++{ ++ if (i >= GOMP_DIM_MAX) ++ { ++ gomp_fatal ("invalid dimension argument: %d", i); ++ return -1; ++ } ++ return goacc_default_dims[i]; ++} +--- libgomp/libgomp_g.h.jj 2018-04-25 09:40:31.320655306 +0200 ++++ libgomp/libgomp_g.h 2019-05-07 18:46:36.513109943 +0200 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005-2018 Free Software Foundation, Inc. ++/* Copyright (C) 2005-2019 Free Software Foundation, Inc. + Contributed by Richard Henderson . 
+ + This file is part of the GNU Offloading and Multi Processing Library +@@ -31,6 +31,7 @@ + + #include + #include ++#include "gstdint.h" + + /* barrier.c */ + +@@ -56,6 +57,12 @@ extern bool GOMP_loop_nonmonotonic_dynam + long *, long *); + extern bool GOMP_loop_nonmonotonic_guided_start (long, long, long, long, + long *, long *); ++extern bool GOMP_loop_nonmonotonic_runtime_start (long, long, long, ++ long *, long *); ++extern bool GOMP_loop_maybe_nonmonotonic_runtime_start (long, long, long, ++ long *, long *); ++extern bool GOMP_loop_start (long, long, long, long, long, long *, long *, ++ uintptr_t *, void **); + + extern bool GOMP_loop_ordered_static_start (long, long, long, long, + long *, long *); +@@ -64,6 +71,8 @@ extern bool GOMP_loop_ordered_dynamic_st + extern bool GOMP_loop_ordered_guided_start (long, long, long, long, + long *, long *); + extern bool GOMP_loop_ordered_runtime_start (long, long, long, long *, long *); ++extern bool GOMP_loop_ordered_start (long, long, long, long, long, long *, ++ long *, uintptr_t *, void **); + + extern bool GOMP_loop_static_next (long *, long *); + extern bool GOMP_loop_dynamic_next (long *, long *); +@@ -71,6 +80,8 @@ extern bool GOMP_loop_guided_next (long + extern bool GOMP_loop_runtime_next (long *, long *); + extern bool GOMP_loop_nonmonotonic_dynamic_next (long *, long *); + extern bool GOMP_loop_nonmonotonic_guided_next (long *, long *); ++extern bool GOMP_loop_nonmonotonic_runtime_next (long *, long *); ++extern bool GOMP_loop_maybe_nonmonotonic_runtime_next (long *, long *); + + extern bool GOMP_loop_ordered_static_next (long *, long *); + extern bool GOMP_loop_ordered_dynamic_next (long *, long *); +@@ -85,6 +96,8 @@ extern bool GOMP_loop_doacross_guided_st + long *); + extern bool GOMP_loop_doacross_runtime_start (unsigned, long *, long *, + long *); ++extern bool GOMP_loop_doacross_start (unsigned, long *, long, long, long *, ++ long *, uintptr_t *, void **); + + extern void GOMP_parallel_loop_static_start (void (*)(void *), void *, + unsigned, long, long, long, long); +@@ -112,6 +125,13 @@ extern void GOMP_parallel_loop_nonmonoto + extern void GOMP_parallel_loop_nonmonotonic_guided (void (*)(void *), void *, + unsigned, long, long, + long, long, unsigned); ++extern void GOMP_parallel_loop_nonmonotonic_runtime (void (*)(void *), void *, ++ unsigned, long, long, ++ long, unsigned); ++extern void GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*)(void *), ++ void *, unsigned, ++ long, long, ++ long, unsigned); + + extern void GOMP_loop_end (void); + extern void GOMP_loop_end_nowait (void); +@@ -154,6 +174,21 @@ extern bool GOMP_loop_ull_nonmonotonic_g + unsigned long long, + unsigned long long *, + unsigned long long *); ++extern bool GOMP_loop_ull_nonmonotonic_runtime_start (bool, unsigned long long, ++ unsigned long long, ++ unsigned long long, ++ unsigned long long *, ++ unsigned long long *); ++extern bool GOMP_loop_ull_maybe_nonmonotonic_runtime_start (bool, ++ unsigned long long, ++ unsigned long long, ++ unsigned long long, ++ unsigned long long *, ++ unsigned long long *); ++extern bool GOMP_loop_ull_start (bool, unsigned long long, unsigned long long, ++ unsigned long long, long, unsigned long long, ++ unsigned long long *, unsigned long long *, ++ uintptr_t *, void **); + + extern bool GOMP_loop_ull_ordered_static_start (bool, unsigned long long, + unsigned long long, +@@ -178,6 +213,13 @@ extern bool GOMP_loop_ull_ordered_runtim + unsigned long long, + unsigned long long *, + unsigned long long *); ++extern bool 
GOMP_loop_ull_ordered_start (bool, unsigned long long, ++ unsigned long long, ++ unsigned long long, long, ++ unsigned long long, ++ unsigned long long *, ++ unsigned long long *, ++ uintptr_t *, void **); + + extern bool GOMP_loop_ull_static_next (unsigned long long *, + unsigned long long *); +@@ -191,6 +233,10 @@ extern bool GOMP_loop_ull_nonmonotonic_d + unsigned long long *); + extern bool GOMP_loop_ull_nonmonotonic_guided_next (unsigned long long *, + unsigned long long *); ++extern bool GOMP_loop_ull_nonmonotonic_runtime_next (unsigned long long *, ++ unsigned long long *); ++extern bool GOMP_loop_ull_maybe_nonmonotonic_runtime_next (unsigned long long *, ++ unsigned long long *); + + extern bool GOMP_loop_ull_ordered_static_next (unsigned long long *, + unsigned long long *); +@@ -220,6 +266,11 @@ extern bool GOMP_loop_ull_doacross_runti + unsigned long long *, + unsigned long long *, + unsigned long long *); ++extern bool GOMP_loop_ull_doacross_start (unsigned, unsigned long long *, ++ long, unsigned long long, ++ unsigned long long *, ++ unsigned long long *, ++ uintptr_t *, void **); + + /* ordered.c */ + +@@ -235,6 +286,8 @@ extern void GOMP_doacross_ull_wait (unsi + extern void GOMP_parallel_start (void (*) (void *), void *, unsigned); + extern void GOMP_parallel_end (void); + extern void GOMP_parallel (void (*) (void *), void *, unsigned, unsigned); ++extern unsigned GOMP_parallel_reductions (void (*) (void *), void *, unsigned, ++ unsigned); + extern bool GOMP_cancel (int, bool); + extern bool GOMP_cancellation_point (int); + +@@ -251,13 +304,19 @@ extern void GOMP_taskloop_ull (void (*) + unsigned long long, unsigned long long, + unsigned long long); + extern void GOMP_taskwait (void); ++extern void GOMP_taskwait_depend (void **); + extern void GOMP_taskyield (void); + extern void GOMP_taskgroup_start (void); + extern void GOMP_taskgroup_end (void); ++extern void GOMP_taskgroup_reduction_register (uintptr_t *); ++extern void GOMP_taskgroup_reduction_unregister (uintptr_t *); ++extern void GOMP_task_reduction_remap (size_t, size_t, void **); ++extern void GOMP_workshare_task_reduction_unregister (bool); + + /* sections.c */ + + extern unsigned GOMP_sections_start (unsigned); ++extern unsigned GOMP_sections2_start (unsigned, uintptr_t *, void **); + extern unsigned GOMP_sections_next (void); + extern void GOMP_parallel_sections_start (void (*) (void *), void *, + unsigned, unsigned); +@@ -293,6 +352,11 @@ extern void GOMP_target_enter_exit_data + void **); + extern void GOMP_teams (unsigned int, unsigned int); + ++/* teams.c */ ++ ++extern void GOMP_teams_reg (void (*) (void *), void *, unsigned, unsigned, ++ unsigned); ++ + /* oacc-parallel.c */ + + extern void GOACC_parallel_keyed (int, void (*) (void *), size_t, +--- libgomp/affinity.c.jj 2018-04-25 09:40:31.913655581 +0200 ++++ libgomp/affinity.c 2019-05-07 18:46:36.254114081 +0200 +@@ -26,6 +26,8 @@ + /* This is a generic stub implementation of a CPU affinity setting. 
*/ + + #include "libgomp.h" ++#include ++#include + + void + gomp_init_affinity (void) +@@ -138,5 +140,17 @@ gomp_get_place_proc_ids_8 (int place_num + (void) ids; + } + ++void ++gomp_display_affinity_place (char *buffer, size_t size, size_t *ret, ++ int place) ++{ ++ char buf[sizeof (long) * 3 + 4]; ++ if (gomp_available_cpus > 1) ++ sprintf (buf, "0-%lu", gomp_available_cpus - 1); ++ else ++ strcpy (buf, "0"); ++ gomp_display_string (buffer, size, ret, buf, strlen (buf)); ++} ++ + ialias(omp_get_place_num_procs) + ialias(omp_get_place_proc_ids) +--- libgomp/sections.c.jj 2018-04-25 09:40:31.924655586 +0200 ++++ libgomp/sections.c 2019-05-07 18:46:36.535109592 +0200 +@@ -26,8 +26,11 @@ + /* This file handles the SECTIONS construct. */ + + #include "libgomp.h" ++#include + + ++ialias_redirect (GOMP_taskgroup_reduction_register) ++ + /* Initialize the given work share construct from the given arguments. */ + + static inline void +@@ -72,7 +75,7 @@ GOMP_sections_start (unsigned count) + struct gomp_thread *thr = gomp_thread (); + long s, e, ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_sections_init (thr->ts.work_share, count); + gomp_work_share_init_done (); +@@ -95,6 +98,66 @@ GOMP_sections_start (unsigned count) + return ret; + } + ++unsigned ++GOMP_sections2_start (unsigned count, uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ long s, e, ret; ++ ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (gomp_work_share_start (0)) ++ { ++ gomp_sections_init (thr->ts.work_share, count); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ if (mem) ++ { ++ uintptr_t size = (uintptr_t) *mem; ++ if (size > (sizeof (struct gomp_work_share) ++ - offsetof (struct gomp_work_share, ++ inline_ordered_team_ids))) ++ thr->ts.work_share->ordered_team_ids ++ = gomp_malloc_cleared (size); ++ else ++ memset (thr->ts.work_share->ordered_team_ids, '\0', size); ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ if (mem) ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ ++#ifdef HAVE_SYNC_BUILTINS ++ if (gomp_iter_dynamic_next (&s, &e)) ++ ret = s; ++ else ++ ret = 0; ++#else ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ if (gomp_iter_dynamic_next_locked (&s, &e)) ++ ret = s; ++ else ++ ret = 0; ++ gomp_mutex_unlock (&thr->ts.work_share->lock); ++#endif ++ ++ return ret; ++} ++ + /* This routine is called when the thread completes processing of the + section currently assigned to it. 
If the work-share construct is + bound directly to a parallel construct, then the construct may have +@@ -140,7 +203,7 @@ GOMP_parallel_sections_start (void (*fn) + num_threads = gomp_resolve_num_threads (num_threads, count); + team = gomp_new_team (num_threads); + gomp_sections_init (&team->work_shares[0], count); +- gomp_team_start (fn, data, num_threads, 0, team); ++ gomp_team_start (fn, data, num_threads, 0, team, NULL); + } + + ialias_redirect (GOMP_parallel_end) +@@ -154,7 +217,7 @@ GOMP_parallel_sections (void (*fn) (void + num_threads = gomp_resolve_num_threads (num_threads, count); + team = gomp_new_team (num_threads); + gomp_sections_init (&team->work_shares[0], count); +- gomp_team_start (fn, data, num_threads, flags, team); ++ gomp_team_start (fn, data, num_threads, flags, team, NULL); + fn (data); + GOMP_parallel_end (); + } +--- libgomp/config/linux/affinity.c.jj 2018-04-25 09:40:31.875655563 +0200 ++++ libgomp/config/linux/affinity.c 2019-05-07 18:46:36.344112642 +0200 +@@ -396,6 +396,56 @@ gomp_get_place_proc_ids_8 (int place_num + *ids++ = i; + } + ++void ++gomp_display_affinity_place (char *buffer, size_t size, size_t *ret, ++ int place) ++{ ++ cpu_set_t *cpusetp; ++ char buf[sizeof (long) * 3 + 4]; ++ if (place >= 0 && place < gomp_places_list_len) ++ cpusetp = (cpu_set_t *) gomp_places_list[place]; ++ else if (gomp_cpusetp) ++ cpusetp = gomp_cpusetp; ++ else ++ { ++ if (gomp_available_cpus > 1) ++ sprintf (buf, "0-%lu", gomp_available_cpus - 1); ++ else ++ strcpy (buf, "0"); ++ gomp_display_string (buffer, size, ret, buf, strlen (buf)); ++ return; ++ } ++ ++ unsigned long i, max = 8 * gomp_cpuset_size, start; ++ bool prev_set = false; ++ start = max; ++ for (i = 0; i <= max; i++) ++ { ++ bool this_set; ++ if (i == max) ++ this_set = false; ++ else ++ this_set = CPU_ISSET_S (i, gomp_cpuset_size, cpusetp); ++ if (this_set != prev_set) ++ { ++ prev_set = this_set; ++ if (this_set) ++ { ++ char *p = buf; ++ if (start != max) ++ *p++ = ','; ++ sprintf (p, "%lu", i); ++ start = i; ++ } ++ else if (i == start + 1) ++ continue; ++ else ++ sprintf (buf, "-%lu", i - 1); ++ gomp_display_string (buffer, size, ret, buf, strlen (buf)); ++ } ++ } ++} ++ + ialias(omp_get_place_num_procs) + ialias(omp_get_place_proc_ids) + +--- libgomp/config/linux/ia64/futex.h.jj 2018-04-25 09:40:31.877655564 +0200 ++++ libgomp/config/linux/ia64/futex.h 2019-05-07 18:46:36.344112642 +0200 +@@ -45,8 +45,8 @@ sys_futex0(int *addr, int op, int val) + "=r"(r8), "=r"(r10) + : "r"(r15), "r"(out0), "r"(out1), "r"(out2), "r"(out3) + : "memory", "out4", "out5", "out6", "out7", +- /* Non-stacked integer registers, minus r8, r10, r15. */ +- "r2", "r3", "r9", "r11", "r12", "r13", "r14", "r16", "r17", "r18", ++ /* Non-stacked integer registers, minus r8, r10, r12, r15. */ ++ "r2", "r3", "r9", "r11", "r13", "r14", "r16", "r17", "r18", + "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", + "r28", "r29", "r30", "r31", + /* Predicate registers. */ +--- libgomp/config/nvptx/teams.c.jj 2019-05-07 18:46:36.459110805 +0200 ++++ libgomp/config/nvptx/teams.c 2019-05-07 18:46:36.459110805 +0200 +@@ -0,0 +1,57 @@ ++/* Copyright (C) 2015-2019 Free Software Foundation, Inc. ++ Contributed by Alexander Monakov ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). 
++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . */ ++ ++/* This file defines OpenMP API entry points that accelerator targets are ++ expected to replace. */ ++ ++#include "libgomp.h" ++ ++void ++GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams, ++ unsigned int thread_limit, unsigned int flags) ++{ ++ (void) fn; ++ (void) data; ++ (void) flags; ++ (void) num_teams; ++ (void) thread_limit; ++} ++ ++int ++omp_get_num_teams (void) ++{ ++ return gomp_num_teams_var + 1; ++} ++ ++int ++omp_get_team_num (void) ++{ ++ int ctaid; ++ asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid)); ++ return ctaid; ++} ++ ++ialias (omp_get_num_teams) ++ialias (omp_get_team_num) +--- libgomp/config/nvptx/team.c.jj 2018-04-25 09:40:31.890655570 +0200 ++++ libgomp/config/nvptx/team.c 2019-05-07 18:46:36.459110805 +0200 +@@ -116,7 +116,8 @@ gomp_thread_start (struct gomp_thread_po + + void + gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, +- unsigned flags, struct gomp_team *team) ++ unsigned flags, struct gomp_team *team, ++ struct gomp_taskgroup *taskgroup) + { + struct gomp_thread *thr, *nthr; + struct gomp_task *task; +@@ -147,6 +148,7 @@ gomp_team_start (void (*fn) (void *), vo + nthreads_var = icv->nthreads_var; + gomp_init_task (thr->task, task, icv); + team->implicit_task[0].icv.nthreads_var = nthreads_var; ++ team->implicit_task[0].taskgroup = taskgroup; + + if (nthreads == 1) + return; +@@ -166,6 +168,7 @@ gomp_team_start (void (*fn) (void *), vo + nthr->task = &team->implicit_task[i]; + gomp_init_task (nthr->task, task, icv); + team->implicit_task[i].icv.nthreads_var = nthreads_var; ++ team->implicit_task[i].taskgroup = taskgroup; + nthr->fn = fn; + nthr->data = data; + team->ordered_release[i] = &nthr->release; +@@ -174,5 +177,11 @@ gomp_team_start (void (*fn) (void *), vo + gomp_simple_barrier_wait (&pool->threads_dock); + } + ++int ++gomp_pause_host (void) ++{ ++ return -1; ++} ++ + #include "../../team.c" + #endif +--- libgomp/config/nvptx/oacc-parallel.c.jj 2018-04-25 09:40:31.887655569 +0200 ++++ libgomp/config/nvptx/oacc-parallel.c 2019-05-07 18:46:36.453110901 +0200 +@@ -1,358 +0,0 @@ +-/* OpenACC constructs +- +- Copyright (C) 2014-2018 Free Software Foundation, Inc. +- +- Contributed by Mentor Embedded. +- +- This file is part of the GNU Offloading and Multi Processing Library +- (libgomp). +- +- Libgomp is free software; you can redistribute it and/or modify it +- under the terms of the GNU General Public License as published by +- the Free Software Foundation; either version 3, or (at your option) +- any later version. 
+- +- Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY +- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +- FOR A PARTICULAR PURPOSE. See the GNU General Public License for +- more details. +- +- Under Section 7 of GPL version 3, you are granted additional +- permissions described in the GCC Runtime Library Exception, version +- 3.1, as published by the Free Software Foundation. +- +- You should have received a copy of the GNU General Public License and +- a copy of the GCC Runtime Library Exception along with this program; +- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +- . */ +- +-#include "libgomp_g.h" +- +-__asm__ (".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1);\n" +- ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1);\n" +- ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1);\n" +- ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1);\n" +- "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_num_threads\n" +- ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n" +- "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_thread_num\n" +- ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n" +- "// BEGIN GLOBAL FUNCTION DECL: abort\n" +- ".extern .func abort;\n" +- ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1)\n" +- "{\n" +- ".reg .u32 %ar1;\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- ".reg .pred %r30;\n" +- ".reg .u32 %r31;\n" +- ".reg .pred %r32;\n" +- ".reg .u32 %r33;\n" +- ".reg .pred %r34;\n" +- ".local .align 8 .b8 %frame[4];\n" +- "ld.param.u32 %ar1,[%in_ar1];\n" +- "mov.u32 %r27,%ar1;\n" +- "st.local.u32 [%frame],%r27;\n" +- "ld.local.u32 %r28,[%frame];\n" +- "mov.u32 %r29,1;\n" +- "setp.eq.u32 %r30,%r28,%r29;\n" +- "@%r30 bra $L4;\n" +- "mov.u32 %r31,2;\n" +- "setp.eq.u32 %r32,%r28,%r31;\n" +- "@%r32 bra $L5;\n" +- "mov.u32 %r33,0;\n" +- "setp.eq.u32 %r34,%r28,%r33;\n" +- "@!%r34 bra $L8;\n" +- "mov.u32 %r23,%tid.x;\n" +- "mov.u32 %r22,%r23;\n" +- "bra $L7;\n" +- "$L4:\n" +- "mov.u32 %r24,%tid.y;\n" +- "mov.u32 %r22,%r24;\n" +- "bra $L7;\n" +- "$L5:\n" +- "mov.u32 %r25,%tid.z;\n" +- "mov.u32 %r22,%r25;\n" +- "bra $L7;\n" +- "$L8:\n" +- "{\n" +- "{\n" +- "call abort;\n" +- "}\n" +- "}\n" +- "$L7:\n" +- "mov.u32 %r26,%r22;\n" +- "mov.u32 %retval,%r26;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n" +- ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1)\n" +- "{\n" +- ".reg .u32 %ar1;\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- ".reg .pred %r30;\n" +- ".reg .u32 %r31;\n" +- ".reg .pred %r32;\n" +- ".reg .u32 %r33;\n" +- ".reg .pred %r34;\n" +- ".local .align 8 .b8 %frame[4];\n" +- "ld.param.u32 %ar1,[%in_ar1];\n" +- "mov.u32 %r27,%ar1;\n" +- "st.local.u32 [%frame],%r27;\n" +- "ld.local.u32 %r28,[%frame];\n" +- "mov.u32 %r29,1;\n" +- "setp.eq.u32 %r30,%r28,%r29;\n" +- "@%r30 bra $L11;\n" +- "mov.u32 %r31,2;\n" +- "setp.eq.u32 %r32,%r28,%r31;\n" +- "@%r32 bra $L12;\n" +- "mov.u32 %r33,0;\n" +- "setp.eq.u32 %r34,%r28,%r33;\n" +- "@!%r34 bra $L15;\n" +- 
"mov.u32 %r23,%ntid.x;\n" +- "mov.u32 %r22,%r23;\n" +- "bra $L14;\n" +- "$L11:\n" +- "mov.u32 %r24,%ntid.y;\n" +- "mov.u32 %r22,%r24;\n" +- "bra $L14;\n" +- "$L12:\n" +- "mov.u32 %r25,%ntid.z;\n" +- "mov.u32 %r22,%r25;\n" +- "bra $L14;\n" +- "$L15:\n" +- "{\n" +- "{\n" +- "call abort;\n" +- "}\n" +- "}\n" +- "$L14:\n" +- "mov.u32 %r26,%r22;\n" +- "mov.u32 %retval,%r26;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n" +- ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1)\n" +- "{\n" +- ".reg .u32 %ar1;\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- ".reg .pred %r30;\n" +- ".reg .u32 %r31;\n" +- ".reg .pred %r32;\n" +- ".reg .u32 %r33;\n" +- ".reg .pred %r34;\n" +- ".local .align 8 .b8 %frame[4];\n" +- "ld.param.u32 %ar1,[%in_ar1];\n" +- "mov.u32 %r27,%ar1;\n" +- "st.local.u32 [%frame],%r27;\n" +- "ld.local.u32 %r28,[%frame];\n" +- "mov.u32 %r29,1;\n" +- "setp.eq.u32 %r30,%r28,%r29;\n" +- "@%r30 bra $L18;\n" +- "mov.u32 %r31,2;\n" +- "setp.eq.u32 %r32,%r28,%r31;\n" +- "@%r32 bra $L19;\n" +- "mov.u32 %r33,0;\n" +- "setp.eq.u32 %r34,%r28,%r33;\n" +- "@!%r34 bra $L22;\n" +- "mov.u32 %r23,%ctaid.x;\n" +- "mov.u32 %r22,%r23;\n" +- "bra $L21;\n" +- "$L18:\n" +- "mov.u32 %r24,%ctaid.y;\n" +- "mov.u32 %r22,%r24;\n" +- "bra $L21;\n" +- "$L19:\n" +- "mov.u32 %r25,%ctaid.z;\n" +- "mov.u32 %r22,%r25;\n" +- "bra $L21;\n" +- "$L22:\n" +- "{\n" +- "{\n" +- "call abort;\n" +- "}\n" +- "}\n" +- "$L21:\n" +- "mov.u32 %r26,%r22;\n" +- "mov.u32 %retval,%r26;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n" +- ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1)\n" +- "{\n" +- ".reg .u32 %ar1;\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- ".reg .pred %r30;\n" +- ".reg .u32 %r31;\n" +- ".reg .pred %r32;\n" +- ".reg .u32 %r33;\n" +- ".reg .pred %r34;\n" +- ".local .align 8 .b8 %frame[4];\n" +- "ld.param.u32 %ar1,[%in_ar1];\n" +- "mov.u32 %r27,%ar1;\n" +- "st.local.u32 [%frame],%r27;\n" +- "ld.local.u32 %r28,[%frame];\n" +- "mov.u32 %r29,1;\n" +- "setp.eq.u32 %r30,%r28,%r29;\n" +- "@%r30 bra $L25;\n" +- "mov.u32 %r31,2;\n" +- "setp.eq.u32 %r32,%r28,%r31;\n" +- "@%r32 bra $L26;\n" +- "mov.u32 %r33,0;\n" +- "setp.eq.u32 %r34,%r28,%r33;\n" +- "@!%r34 bra $L29;\n" +- "mov.u32 %r23,%nctaid.x;\n" +- "mov.u32 %r22,%r23;\n" +- "bra $L28;\n" +- "$L25:\n" +- "mov.u32 %r24,%nctaid.y;\n" +- "mov.u32 %r22,%r24;\n" +- "bra $L28;\n" +- "$L26:\n" +- "mov.u32 %r25,%nctaid.z;\n" +- "mov.u32 %r22,%r25;\n" +- "bra $L28;\n" +- "$L29:\n" +- "{\n" +- "{\n" +- "call abort;\n" +- "}\n" +- "}\n" +- "$L28:\n" +- "mov.u32 %r26,%r22;\n" +- "mov.u32 %retval,%r26;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n" +- "// BEGIN GLOBAL FUNCTION DEF: GOACC_get_num_threads\n" +- ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads\n" +- "{\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- "mov.u32 %r26,0;\n" +- "{\n" +- ".param .u32 %retval_in;\n" +- "{\n" +- ".param .u32 
%out_arg0;\n" +- "st.param.u32 [%out_arg0],%r26;\n" +- "call (%retval_in),GOACC_ntid,(%out_arg0);\n" +- "}\n" +- "ld.param.u32 %r27,[%retval_in];\n" +- "}\n" +- "mov.u32 %r22,%r27;\n" +- "mov.u32 %r28,0;\n" +- "{\n" +- ".param .u32 %retval_in;\n" +- "{\n" +- ".param .u32 %out_arg0;\n" +- "st.param.u32 [%out_arg0],%r28;\n" +- "call (%retval_in),GOACC_nctaid,(%out_arg0);\n" +- "}\n" +- "ld.param.u32 %r29,[%retval_in];\n" +- "}\n" +- "mov.u32 %r23,%r29;\n" +- "mul.lo.u32 %r24,%r22,%r23;\n" +- "mov.u32 %r25,%r24;\n" +- "mov.u32 %retval,%r25;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n" +- "// BEGIN GLOBAL FUNCTION DEF: GOACC_get_thread_num\n" +- ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num\n" +- "{\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- ".reg .u32 %r30;\n" +- ".reg .u32 %r31;\n" +- ".reg .u32 %r32;\n" +- ".reg .u32 %r33;\n" +- "mov.u32 %r28,0;\n" +- "{\n" +- ".param .u32 %retval_in;\n" +- "{\n" +- ".param .u32 %out_arg0;\n" +- "st.param.u32 [%out_arg0],%r28;\n" +- "call (%retval_in),GOACC_ntid,(%out_arg0);\n" +- "}\n" +- "ld.param.u32 %r29,[%retval_in];\n" +- "}\n" +- "mov.u32 %r22,%r29;\n" +- "mov.u32 %r30,0;\n" +- "{\n" +- ".param .u32 %retval_in;\n" +- "{\n" +- ".param .u32 %out_arg0;\n" +- "st.param.u32 [%out_arg0],%r30;\n" +- "call (%retval_in),GOACC_ctaid,(%out_arg0);\n" +- "}\n" +- "ld.param.u32 %r31,[%retval_in];\n" +- "}\n" +- "mov.u32 %r23,%r31;\n" +- "mul.lo.u32 %r24,%r22,%r23;\n" +- "mov.u32 %r32,0;\n" +- "{\n" +- ".param .u32 %retval_in;\n" +- "{\n" +- ".param .u32 %out_arg0;\n" +- "st.param.u32 [%out_arg0],%r32;\n" +- "call (%retval_in),GOACC_tid,(%out_arg0);\n" +- "}\n" +- "ld.param.u32 %r33,[%retval_in];\n" +- "}\n" +- "mov.u32 %r25,%r33;\n" +- "add.u32 %r26,%r24,%r25;\n" +- "mov.u32 %r27,%r26;\n" +- "mov.u32 %retval,%r27;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n"); +--- libgomp/config/nvptx/target.c.jj 2018-04-25 09:40:31.890655570 +0200 ++++ libgomp/config/nvptx/target.c 2019-05-07 18:46:36.453110901 +0200 +@@ -47,3 +47,21 @@ GOMP_teams (unsigned int num_teams, unsi + } + gomp_num_teams_var = num_teams - 1; + } ++ ++int ++omp_pause_resource (omp_pause_resource_t kind, int device_num) ++{ ++ (void) kind; ++ (void) device_num; ++ return -1; ++} ++ ++int ++omp_pause_resource_all (omp_pause_resource_t kind) ++{ ++ (void) kind; ++ return -1; ++} ++ ++ialias (omp_pause_resource) ++ialias (omp_pause_resource_all) +--- libgomp/config/nvptx/icv-device.c.jj 2018-04-25 09:40:31.889655570 +0200 ++++ libgomp/config/nvptx/icv-device.c 2019-05-07 18:46:36.453110901 +0200 +@@ -46,20 +46,6 @@ omp_get_num_devices (void) + } + + int +-omp_get_num_teams (void) +-{ +- return gomp_num_teams_var + 1; +-} +- +-int +-omp_get_team_num (void) +-{ +- int ctaid; +- asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid)); +- return ctaid; +-} +- +-int + omp_is_initial_device (void) + { + /* NVPTX is an accelerator-only target. 
*/ +@@ -69,6 +55,4 @@ omp_is_initial_device (void) + ialias (omp_set_default_device) + ialias (omp_get_default_device) + ialias (omp_get_num_devices) +-ialias (omp_get_num_teams) +-ialias (omp_get_team_num) + ialias (omp_is_initial_device) +--- libgomp/config/nvptx/affinity-fmt.c.jj 2019-05-07 18:46:36.358112419 +0200 ++++ libgomp/config/nvptx/affinity-fmt.c 2019-05-07 18:46:36.358112419 +0200 +@@ -0,0 +1,51 @@ ++/* Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . */ ++ ++#include "libgomp.h" ++#include ++#include ++#include ++#ifdef HAVE_UNISTD_H ++#include ++#endif ++#ifdef HAVE_INTTYPES_H ++# include /* For PRIx64. */ ++#endif ++#ifdef HAVE_UNAME ++#include ++#endif ++ ++/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for nvptx, ++ while the nvptx newlib implementation does not support those functions. ++ Override the configure test results here. */ ++#undef HAVE_GETPID ++#undef HAVE_GETHOSTNAME ++ ++/* The nvptx newlib implementation does not support fwrite, but it does support ++ write. Map fwrite to write. */ ++#undef fwrite ++#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size)) ++ ++#include "../../affinity-fmt.c" ++ +--- libgomp/config/mingw32/affinity-fmt.c.jj 2019-05-07 18:46:36.344112642 +0200 ++++ libgomp/config/mingw32/affinity-fmt.c 2019-05-07 18:46:36.344112642 +0200 +@@ -0,0 +1,68 @@ ++/* Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek . ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . 
*/ ++ ++#include "libgomp.h" ++#include ++#include ++#include ++#ifdef HAVE_UNISTD_H ++#include ++#endif ++#ifdef HAVE_INTTYPES_H ++# include /* For PRIx64. */ ++#endif ++#define WIN32_LEAN_AND_MEAN ++#include ++#include ++ ++static int ++gomp_gethostname (char *name, size_t len) ++{ ++ /* On Win9x GetComputerName fails if the input size is less ++ than MAX_COMPUTERNAME_LENGTH + 1. */ ++ char buffer[MAX_COMPUTERNAME_LENGTH + 1]; ++ DWORD size = sizeof (buffer); ++ int ret = 0; ++ ++ if (!GetComputerName (buffer, &size)) ++ return -1; ++ ++ if ((size = strlen (buffer) + 1) > len) ++ { ++ errno = EINVAL; ++ /* Truncate as per POSIX spec. We do not NUL-terminate. */ ++ size = len; ++ ret = -1; ++ } ++ memcpy (name, buffer, (size_t) size); ++ ++ return ret; ++} ++ ++#undef gethostname ++#define gethostname gomp_gethostname ++#define HAVE_GETHOSTNAME 1 ++ ++#include "../../affinity-fmt.c" +--- libgomp/config/rtems/bar.c.jj 2018-04-25 09:40:31.902655576 +0200 ++++ libgomp/config/rtems/bar.c 2019-05-07 18:46:36.460110789 +0200 +@@ -72,184 +72,5 @@ do_wait (int *addr, int val) + futex_wait (addr, val); + } + +-/* Everything below this point should be identical to the Linux +- implementation. */ +- +-void +-gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) +-{ +- if (__builtin_expect (state & BAR_WAS_LAST, 0)) +- { +- /* Next time we'll be awaiting TOTAL threads again. */ +- bar->awaited = bar->total; +- __atomic_store_n (&bar->generation, bar->generation + BAR_INCR, +- MEMMODEL_RELEASE); +- futex_wake ((int *) &bar->generation, INT_MAX); +- } +- else +- { +- do +- do_wait ((int *) &bar->generation, state); +- while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) == state); +- } +-} +- +-void +-gomp_barrier_wait (gomp_barrier_t *bar) +-{ +- gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); +-} +- +-/* Like gomp_barrier_wait, except that if the encountering thread +- is not the last one to hit the barrier, it returns immediately. +- The intended usage is that a thread which intends to gomp_barrier_destroy +- this barrier calls gomp_barrier_wait, while all other threads +- call gomp_barrier_wait_last. When gomp_barrier_wait returns, +- the barrier can be safely destroyed. */ +- +-void +-gomp_barrier_wait_last (gomp_barrier_t *bar) +-{ +- gomp_barrier_state_t state = gomp_barrier_wait_start (bar); +- if (state & BAR_WAS_LAST) +- gomp_barrier_wait_end (bar, state); +-} +- +-void +-gomp_team_barrier_wake (gomp_barrier_t *bar, int count) +-{ +- futex_wake ((int *) &bar->generation, count == 0 ? INT_MAX : count); +-} +- +-void +-gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) +-{ +- unsigned int generation, gen; +- +- if (__builtin_expect (state & BAR_WAS_LAST, 0)) +- { +- /* Next time we'll be awaiting TOTAL threads again. 
*/ +- struct gomp_thread *thr = gomp_thread (); +- struct gomp_team *team = thr->ts.team; +- +- bar->awaited = bar->total; +- team->work_share_cancelled = 0; +- if (__builtin_expect (team->task_count, 0)) +- { +- gomp_barrier_handle_tasks (state); +- state &= ~BAR_WAS_LAST; +- } +- else +- { +- state &= ~BAR_CANCELLED; +- state += BAR_INCR - BAR_WAS_LAST; +- __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); +- futex_wake ((int *) &bar->generation, INT_MAX); +- return; +- } +- } +- +- generation = state; +- state &= ~BAR_CANCELLED; +- do +- { +- do_wait ((int *) &bar->generation, generation); +- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); +- if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) +- { +- gomp_barrier_handle_tasks (state); +- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); +- } +- generation |= gen & BAR_WAITING_FOR_TASK; +- } +- while (gen != state + BAR_INCR); +-} +- +-void +-gomp_team_barrier_wait (gomp_barrier_t *bar) +-{ +- gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); +-} +- +-void +-gomp_team_barrier_wait_final (gomp_barrier_t *bar) +-{ +- gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar); +- if (__builtin_expect (state & BAR_WAS_LAST, 0)) +- bar->awaited_final = bar->total; +- gomp_team_barrier_wait_end (bar, state); +-} +- +-bool +-gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar, +- gomp_barrier_state_t state) +-{ +- unsigned int generation, gen; +- +- if (__builtin_expect (state & BAR_WAS_LAST, 0)) +- { +- /* Next time we'll be awaiting TOTAL threads again. */ +- /* BAR_CANCELLED should never be set in state here, because +- cancellation means that at least one of the threads has been +- cancelled, thus on a cancellable barrier we should never see +- all threads to arrive. 
*/ +- struct gomp_thread *thr = gomp_thread (); +- struct gomp_team *team = thr->ts.team; +- +- bar->awaited = bar->total; +- team->work_share_cancelled = 0; +- if (__builtin_expect (team->task_count, 0)) +- { +- gomp_barrier_handle_tasks (state); +- state &= ~BAR_WAS_LAST; +- } +- else +- { +- state += BAR_INCR - BAR_WAS_LAST; +- __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); +- futex_wake ((int *) &bar->generation, INT_MAX); +- return false; +- } +- } +- +- if (__builtin_expect (state & BAR_CANCELLED, 0)) +- return true; +- +- generation = state; +- do +- { +- do_wait ((int *) &bar->generation, generation); +- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); +- if (__builtin_expect (gen & BAR_CANCELLED, 0)) +- return true; +- if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) +- { +- gomp_barrier_handle_tasks (state); +- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); +- } +- generation |= gen & BAR_WAITING_FOR_TASK; +- } +- while (gen != state + BAR_INCR); +- +- return false; +-} +- +-bool +-gomp_team_barrier_wait_cancel (gomp_barrier_t *bar) +-{ +- return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar)); +-} +- +-void +-gomp_team_barrier_cancel (struct gomp_team *team) +-{ +- gomp_mutex_lock (&team->task_lock); +- if (team->barrier.generation & BAR_CANCELLED) +- { +- gomp_mutex_unlock (&team->task_lock); +- return; +- } +- team->barrier.generation |= BAR_CANCELLED; +- gomp_mutex_unlock (&team->task_lock); +- futex_wake ((int *) &team->barrier.generation, INT_MAX); +-} ++#define GOMP_WAIT_H 1 ++#include "../linux/bar.c" +--- libgomp/config/rtems/affinity-fmt.c.jj 2019-05-07 18:46:36.459110805 +0200 ++++ libgomp/config/rtems/affinity-fmt.c 2019-05-07 18:46:36.459110805 +0200 +@@ -0,0 +1,49 @@ ++/* Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . */ ++ ++#include "libgomp.h" ++#include ++#include ++#include ++#ifdef HAVE_UNISTD_H ++#include ++#endif ++#ifdef HAVE_INTTYPES_H ++# include /* For PRIx64. */ ++#endif ++#ifdef HAVE_UNAME ++#include ++#endif ++ ++/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for RTEMS, ++ but the extra information they give are of little value for the user. ++ Override the configure test results here. */ ++#undef HAVE_GETPID ++#undef HAVE_GETHOSTNAME ++ ++/* Avoid the complex fwrite() in favour of the simple write(). 
*/ ++#undef fwrite ++#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size)) ++ ++#include "../../affinity-fmt.c" +--- libgomp/config.h.in.jj 2018-04-25 09:40:31.870655561 +0200 ++++ libgomp/config.h.in 2019-05-07 18:46:36.465110710 +0200 +@@ -1,5 +1,8 @@ + /* config.h.in. Generated from configure.ac by autoheader. */ + ++/* Define to 1 if you have the `aligned_alloc' function. */ ++#undef HAVE_ALIGNED_ALLOC ++ + /* Define to 1 if the target assembler supports .symver directive. */ + #undef HAVE_AS_SYMVER_DIRECTIVE + +@@ -33,9 +36,15 @@ + /* Define to 1 if you have the `getgid' function. */ + #undef HAVE_GETGID + ++/* Define if gethostname is supported. */ ++#undef HAVE_GETHOSTNAME ++ + /* Define to 1 if you have the `getloadavg' function. */ + #undef HAVE_GETLOADAVG + ++/* Define if getpid is supported. */ ++#undef HAVE_GETPID ++ + /* Define to 1 if you have the `getuid' function. */ + #undef HAVE_GETUID + +@@ -45,9 +54,15 @@ + /* Define to 1 if you have the `dl' library (-ldl). */ + #undef HAVE_LIBDL + ++/* Define to 1 if you have the `memalign' function. */ ++#undef HAVE_MEMALIGN ++ + /* Define to 1 if you have the <memory.h> header file. */ + #undef HAVE_MEMORY_H + ++/* Define to 1 if you have the `posix_memalign' function. */ ++#undef HAVE_POSIX_MEMALIGN ++ + /* Define if pthread_{,attr_}{g,s}etaffinity_np is supported. */ + #undef HAVE_PTHREAD_AFFINITY_NP + +@@ -103,9 +118,15 @@ + /* Define to 1 if the target supports thread-local storage. */ + #undef HAVE_TLS + ++/* Define if uname is supported and struct utsname has nodename field. */ ++#undef HAVE_UNAME ++ + /* Define to 1 if you have the <unistd.h> header file. */ + #undef HAVE_UNISTD_H + ++/* Define to 1 if you have the `_aligned_malloc' function. */ ++#undef HAVE__ALIGNED_MALLOC ++ + /* Define to 1 if you have the `__secure_getenv' function. */ + #undef HAVE___SECURE_GETENV + +@@ -125,8 +146,8 @@ + */ + #undef LT_OBJDIR + +-/* Define to offload targets, separated by commas. */ +-#undef OFFLOAD_TARGETS ++/* Define to offload plugins, separated by commas. */ ++#undef OFFLOAD_PLUGINS + + /* Name of package */ + #undef PACKAGE +--- libgomp/teams.c.jj 2019-05-07 18:46:36.548109384 +0200 ++++ libgomp/teams.c 2019-05-07 18:46:36.548109384 +0200 +@@ -0,0 +1,74 @@ ++/* Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek <jakub@redhat.com>. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++/* This file handles the host TEAMS construct.
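The GOMP_teams_reg entry point defined just below runs the outlined region num_teams times in sequence on the host, with omp_get_team_num () reporting the current iteration. A sketch of what a host teams construct conceptually lowers to; the outlined function and the direct call are illustrative, only the GOMP_teams_reg signature matches the real ABI, and it needs the patched libgomp to link:

    #include <stdio.h>
    #include <omp.h>

    extern void GOMP_teams_reg (void (*fn) (void *), void *data,
                                unsigned int num_teams,
                                unsigned int thread_limit,
                                unsigned int flags);

    static void
    teams_body (void *data)
    {
      (void) data;
      printf ("team %d of %d\n", omp_get_team_num (), omp_get_num_teams ());
    }

    int
    main (void)
    {
      /* Roughly what the compiler emits for:
           #pragma omp teams num_teams (4)
           { ... }  */
      GOMP_teams_reg (teams_body, NULL, 4, 0, 0);
      return 0;
    }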
*/ ++ ++#include "libgomp.h" ++#include ++ ++static unsigned gomp_num_teams = 1, gomp_team_num = 0; ++ ++void ++GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams, ++ unsigned int thread_limit, unsigned int flags) ++{ ++ (void) flags; ++ (void) num_teams; ++ unsigned old_thread_limit_var = 0; ++ if (thread_limit) ++ { ++ struct gomp_task_icv *icv = gomp_icv (true); ++ old_thread_limit_var = icv->thread_limit_var; ++ icv->thread_limit_var ++ = thread_limit > INT_MAX ? UINT_MAX : thread_limit; ++ } ++ if (num_teams == 0) ++ num_teams = 3; ++ gomp_num_teams = num_teams; ++ for (gomp_team_num = 0; gomp_team_num < num_teams; gomp_team_num++) ++ fn (data); ++ gomp_num_teams = 1; ++ gomp_team_num = 0; ++ if (thread_limit) ++ { ++ struct gomp_task_icv *icv = gomp_icv (true); ++ icv->thread_limit_var = old_thread_limit_var; ++ } ++} ++ ++int ++omp_get_num_teams (void) ++{ ++ return gomp_num_teams; ++} ++ ++int ++omp_get_team_num (void) ++{ ++ return gomp_team_num; ++} ++ ++ialias (omp_get_num_teams) ++ialias (omp_get_team_num) +--- libgomp/libgomp.map.jj 2018-04-25 09:40:31.321655307 +0200 ++++ libgomp/libgomp.map 2019-05-07 18:46:36.525109751 +0200 +@@ -164,6 +164,22 @@ OMP_4.5 { + omp_target_disassociate_ptr; + } OMP_4.0; + ++OMP_5.0 { ++ global: ++ omp_capture_affinity; ++ omp_capture_affinity_; ++ omp_display_affinity; ++ omp_display_affinity_; ++ omp_get_affinity_format; ++ omp_get_affinity_format_; ++ omp_set_affinity_format; ++ omp_set_affinity_format_; ++ omp_pause_resource; ++ omp_pause_resource_; ++ omp_pause_resource_all; ++ omp_pause_resource_all_; ++} OMP_4.5; ++ + GOMP_1.0 { + global: + GOMP_atomic_end; +@@ -298,6 +314,34 @@ GOMP_4.5 { + GOMP_parallel_loop_nonmonotonic_guided; + } GOMP_4.0.1; + ++GOMP_5.0 { ++ global: ++ GOMP_loop_doacross_start; ++ GOMP_loop_maybe_nonmonotonic_runtime_next; ++ GOMP_loop_maybe_nonmonotonic_runtime_start; ++ GOMP_loop_nonmonotonic_runtime_next; ++ GOMP_loop_nonmonotonic_runtime_start; ++ GOMP_loop_ordered_start; ++ GOMP_loop_start; ++ GOMP_loop_ull_doacross_start; ++ GOMP_loop_ull_maybe_nonmonotonic_runtime_next; ++ GOMP_loop_ull_maybe_nonmonotonic_runtime_start; ++ GOMP_loop_ull_nonmonotonic_runtime_next; ++ GOMP_loop_ull_nonmonotonic_runtime_start; ++ GOMP_loop_ull_ordered_start; ++ GOMP_loop_ull_start; ++ GOMP_parallel_loop_maybe_nonmonotonic_runtime; ++ GOMP_parallel_loop_nonmonotonic_runtime; ++ GOMP_parallel_reductions; ++ GOMP_sections2_start; ++ GOMP_taskgroup_reduction_register; ++ GOMP_taskgroup_reduction_unregister; ++ GOMP_task_reduction_remap; ++ GOMP_taskwait_depend; ++ GOMP_teams_reg; ++ GOMP_workshare_task_reduction_unregister; ++} GOMP_4.5; ++ + OACC_2.0 { + global: + acc_get_num_devices; +@@ -386,6 +430,52 @@ OACC_2.0.1 { + acc_pcreate; + } OACC_2.0; + ++OACC_2.5 { ++ global: ++ acc_copyin_async; ++ acc_copyin_async_32_h_; ++ acc_copyin_async_64_h_; ++ acc_copyin_async_array_h_; ++ acc_copyout_async; ++ acc_copyout_async_32_h_; ++ acc_copyout_async_64_h_; ++ acc_copyout_async_array_h_; ++ acc_copyout_finalize; ++ acc_copyout_finalize_32_h_; ++ acc_copyout_finalize_64_h_; ++ acc_copyout_finalize_array_h_; ++ acc_copyout_finalize_async; ++ acc_copyout_finalize_async_32_h_; ++ acc_copyout_finalize_async_64_h_; ++ acc_copyout_finalize_async_array_h_; ++ acc_create_async; ++ acc_create_async_32_h_; ++ acc_create_async_64_h_; ++ acc_create_async_array_h_; ++ acc_delete_async; ++ acc_delete_async_32_h_; ++ acc_delete_async_64_h_; ++ acc_delete_async_array_h_; ++ acc_delete_finalize; ++ acc_delete_finalize_32_h_; ++ 
acc_delete_finalize_64_h_; ++ acc_delete_finalize_array_h_; ++ acc_delete_finalize_async; ++ acc_delete_finalize_async_32_h_; ++ acc_delete_finalize_async_64_h_; ++ acc_delete_finalize_async_array_h_; ++ acc_memcpy_from_device_async; ++ acc_memcpy_to_device_async; ++ acc_update_device_async; ++ acc_update_device_async_32_h_; ++ acc_update_device_async_64_h_; ++ acc_update_device_async_array_h_; ++ acc_update_self_async; ++ acc_update_self_async_32_h_; ++ acc_update_self_async_64_h_; ++ acc_update_self_async_array_h_; ++} OACC_2.0.1; ++ + GOACC_2.0 { + global: + GOACC_data_end; +@@ -420,3 +510,8 @@ GOMP_PLUGIN_1.1 { + global: + GOMP_PLUGIN_target_task_completion; + } GOMP_PLUGIN_1.0; ++ ++GOMP_PLUGIN_1.2 { ++ global: ++ GOMP_PLUGIN_acc_default_dim; ++} GOMP_PLUGIN_1.1; +--- libgomp/oacc-async.c.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/oacc-async.c 2019-05-07 18:46:36.528109704 +0200 +@@ -34,7 +34,7 @@ + int + acc_async_test (int async) + { +- if (async < acc_async_sync) ++ if (!async_valid_p (async)) + gomp_fatal ("invalid async argument: %d", async); + + struct goacc_thread *thr = goacc_thread (); +@@ -59,7 +59,7 @@ acc_async_test_all (void) + void + acc_wait (int async) + { +- if (async < acc_async_sync) ++ if (!async_valid_p (async)) + gomp_fatal ("invalid async argument: %d", async); + + struct goacc_thread *thr = goacc_thread (); +@@ -117,7 +117,7 @@ acc_async_wait_all (void) + void + acc_wait_all_async (int async) + { +- if (async < acc_async_sync) ++ if (!async_valid_p (async)) + gomp_fatal ("invalid async argument: %d", async); + + struct goacc_thread *thr = goacc_thread (); +--- libgomp/loop_ull.c.jj 2018-04-25 09:40:31.912655580 +0200 ++++ libgomp/loop_ull.c 2019-05-07 18:46:36.527109719 +0200 +@@ -27,8 +27,12 @@ + + #include + #include ++#include + #include "libgomp.h" + ++ialias (GOMP_loop_ull_runtime_next) ++ialias_redirect (GOMP_taskgroup_reduction_register) ++ + typedef unsigned long long gomp_ull; + + /* Initialize the given work share construct from the given arguments. 
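The acc_async_test/acc_wait checks above switch from a raw bounds test to the async_valid_p helper added to oacc-int.h later in this patch. The accepted set is unchanged, since only acc_async_sync (-2), acc_async_noval (-1) and nonnegative queue ids pass either comparison, but the intent is now explicit. Spelled out with the standard OpenACC values:

    /* acc_async_* values as defined by OpenACC; the predicate accepts
       exactly async >= acc_async_sync, the same set as the old
       "async < acc_async_sync" rejection test.  */
    enum { acc_async_noval = -1, acc_async_sync = -2 };

    static inline int
    async_valid_p (int async)
    {
      return (async == acc_async_noval || async == acc_async_sync
              || async >= 0);
    }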
*/ +@@ -104,7 +108,7 @@ gomp_loop_ull_static_start (bool up, gom + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_STATIC, chunk_size); +@@ -122,7 +126,7 @@ gomp_loop_ull_dynamic_start (bool up, go + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_DYNAMIC, chunk_size); +@@ -148,7 +152,7 @@ gomp_loop_ull_guided_start (bool up, gom + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_GUIDED, chunk_size); +@@ -171,7 +175,7 @@ GOMP_loop_ull_runtime_start (bool up, go + gomp_ull incr, gomp_ull *istart, gomp_ull *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_ull_static_start (up, start, end, incr, +@@ -195,6 +199,99 @@ GOMP_loop_ull_runtime_start (bool up, go + } + } + ++static long ++gomp_adjust_sched (long sched, gomp_ull *chunk_size) ++{ ++ sched &= ~GFS_MONOTONIC; ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_DYNAMIC: ++ case GFS_GUIDED: ++ return sched; ++ /* GFS_RUNTIME is used for runtime schedule without monotonic ++ or nonmonotonic modifiers on the clause. ++ GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic ++ modifier. */ ++ case GFS_RUNTIME: ++ /* GFS_AUTO is used for runtime schedule with nonmonotonic ++ modifier. */ ++ case GFS_AUTO: ++ { ++ struct gomp_task_icv *icv = gomp_icv (false); ++ sched = icv->run_sched_var & ~GFS_MONOTONIC; ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_DYNAMIC: ++ case GFS_GUIDED: ++ *chunk_size = icv->run_sched_chunk_size; ++ break; ++ case GFS_AUTO: ++ sched = GFS_STATIC; ++ *chunk_size = 0; ++ break; ++ default: ++ abort (); ++ } ++ return sched; ++ } ++ default: ++ abort (); ++ } ++} ++ ++bool ++GOMP_loop_ull_start (bool up, gomp_ull start, gomp_ull end, ++ gomp_ull incr, long sched, gomp_ull chunk_size, ++ gomp_ull *istart, gomp_ull *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (gomp_work_share_start (0)) ++ { ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, ++ sched, chunk_size); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ if (mem) ++ { ++ uintptr_t size = (uintptr_t) *mem; ++ if (size > (sizeof (struct gomp_work_share) ++ - offsetof (struct gomp_work_share, ++ inline_ordered_team_ids))) ++ thr->ts.work_share->ordered_team_ids ++ = gomp_malloc_cleared (size); ++ else ++ memset (thr->ts.work_share->ordered_team_ids, '\0', size); ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ if (mem) ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ 
++ if (!istart) ++ return true;
++ return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend); ++} ++ + /* The *_ordered_*_start routines are similar. The only difference is that + this work-share construct is initialized to expect an ORDERED section. */ + +@@ -206,7 +303,7 @@ gomp_loop_ull_ordered_static_start (bool + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_STATIC, chunk_size); +@@ -225,7 +322,7 @@ gomp_loop_ull_ordered_dynamic_start (boo + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_DYNAMIC, chunk_size); +@@ -251,7 +348,7 @@ gomp_loop_ull_ordered_guided_start (bool + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_GUIDED, chunk_size); +@@ -275,7 +372,7 @@ GOMP_loop_ull_ordered_runtime_start (boo + gomp_ull *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_ull_ordered_static_start (up, start, end, incr, +@@ -299,6 +396,82 @@ GOMP_loop_ull_ordered_runtime_start (boo + } + } + ++bool ++GOMP_loop_ull_ordered_start (bool up, gomp_ull start, gomp_ull end, ++ gomp_ull incr, long sched, gomp_ull chunk_size, ++ gomp_ull *istart, gomp_ull *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ size_t ordered = 1; ++ bool ret; ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (mem) ++ ordered += (uintptr_t) *mem; ++ if (gomp_work_share_start (ordered)) ++ { ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, ++ sched, chunk_size); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ if (sched == GFS_STATIC) ++ gomp_ordered_static_init (); ++ else ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ sched = thr->ts.work_share->sched; ++ if (sched != GFS_STATIC) ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ } ++ ++ if (mem) ++ { ++ uintptr_t p ++ = (uintptr_t) (thr->ts.work_share->ordered_team_ids ++ + (thr->ts.team ? thr->ts.team->nthreads : 1)); ++ p += __alignof__ (long long) - 1; ++ p &= ~(__alignof__ (long long) - 1); ++ *mem = (void *) p; ++ } ++ ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_AUTO: ++ return !gomp_iter_ull_static_next (istart, iend); ++ case GFS_DYNAMIC: ++ ret = gomp_iter_ull_dynamic_next_locked (istart, iend); ++ break; ++ case GFS_GUIDED: ++ ret = gomp_iter_ull_guided_next_locked (istart, iend); ++ break; ++ default: ++ abort (); ++ } ++ ++ if (ret) ++ gomp_ordered_first (); ++ gomp_mutex_unlock (&thr->ts.work_share->lock); ++ return ret; ++} ++ + /* The *_doacross_*_start routines are similar. 
The only difference is that + this work-share construct is initialized to expect an ORDERED(N) - DOACROSS + section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 +@@ -313,11 +486,11 @@ gomp_loop_ull_doacross_static_start (uns + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_STATIC, chunk_size); +- gomp_doacross_ull_init (ncounts, counts, chunk_size); ++ gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -332,11 +505,11 @@ gomp_loop_ull_doacross_dynamic_start (un + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_DYNAMIC, chunk_size); +- gomp_doacross_ull_init (ncounts, counts, chunk_size); ++ gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -359,11 +532,11 @@ gomp_loop_ull_doacross_guided_start (uns + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_GUIDED, chunk_size); +- gomp_doacross_ull_init (ncounts, counts, chunk_size); ++ gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -383,7 +556,7 @@ GOMP_loop_ull_doacross_runtime_start (un + gomp_ull *istart, gomp_ull *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_ull_doacross_static_start (ncounts, counts, +@@ -407,6 +580,51 @@ GOMP_loop_ull_doacross_runtime_start (un + } + } + ++bool ++GOMP_loop_ull_doacross_start (unsigned ncounts, gomp_ull *counts, ++ long sched, gomp_ull chunk_size, ++ gomp_ull *istart, gomp_ull *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (gomp_work_share_start (0)) ++ { ++ size_t extra = 0; ++ if (mem) ++ extra = (uintptr_t) *mem; ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, ++ sched, chunk_size); ++ gomp_doacross_ull_init (ncounts, counts, chunk_size, extra); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ sched = thr->ts.work_share->sched; ++ } ++ ++ if (mem) ++ *mem = thr->ts.work_share->doacross->extra; ++ ++ return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend); ++} ++ + /* The *_next routines are called when the thread completes processing of + the iteration block currently assigned to it. 
If the work-share + construct is bound directly to a parallel construct, then the iteration +@@ -570,6 +788,10 @@ extern __typeof(gomp_loop_ull_dynamic_st + __attribute__((alias ("gomp_loop_ull_dynamic_start"))); + extern __typeof(gomp_loop_ull_guided_start) GOMP_loop_ull_nonmonotonic_guided_start + __attribute__((alias ("gomp_loop_ull_guided_start"))); ++extern __typeof(GOMP_loop_ull_runtime_start) GOMP_loop_ull_nonmonotonic_runtime_start ++ __attribute__((alias ("GOMP_loop_ull_runtime_start"))); ++extern __typeof(GOMP_loop_ull_runtime_start) GOMP_loop_ull_maybe_nonmonotonic_runtime_start ++ __attribute__((alias ("GOMP_loop_ull_runtime_start"))); + + extern __typeof(gomp_loop_ull_ordered_static_start) GOMP_loop_ull_ordered_static_start + __attribute__((alias ("gomp_loop_ull_ordered_static_start"))); +@@ -595,6 +817,10 @@ extern __typeof(gomp_loop_ull_dynamic_ne + __attribute__((alias ("gomp_loop_ull_dynamic_next"))); + extern __typeof(gomp_loop_ull_guided_next) GOMP_loop_ull_nonmonotonic_guided_next + __attribute__((alias ("gomp_loop_ull_guided_next"))); ++extern __typeof(GOMP_loop_ull_runtime_next) GOMP_loop_ull_nonmonotonic_runtime_next ++ __attribute__((alias ("GOMP_loop_ull_runtime_next"))); ++extern __typeof(GOMP_loop_ull_runtime_next) GOMP_loop_ull_maybe_nonmonotonic_runtime_next ++ __attribute__((alias ("GOMP_loop_ull_runtime_next"))); + + extern __typeof(gomp_loop_ull_ordered_static_next) GOMP_loop_ull_ordered_static_next + __attribute__((alias ("gomp_loop_ull_ordered_static_next"))); +@@ -650,6 +876,23 @@ GOMP_loop_ull_nonmonotonic_guided_start + } + + bool ++GOMP_loop_ull_nonmonotonic_runtime_start (bool up, gomp_ull start, ++ gomp_ull end, gomp_ull incr, ++ gomp_ull *istart, gomp_ull *iend) ++{ ++ return GOMP_loop_ull_runtime_start (up, start, end, incr, istart, iend); ++} ++ ++bool ++GOMP_loop_ull_maybe_nonmonotonic_runtime_start (bool up, gomp_ull start, ++ gomp_ull end, gomp_ull incr, ++ gomp_ull *istart, ++ gomp_ull *iend) ++{ ++ return GOMP_loop_ull_runtime_start (up, start, end, incr, istart, iend); ++} ++ ++bool + GOMP_loop_ull_ordered_static_start (bool up, gomp_ull start, gomp_ull end, + gomp_ull incr, gomp_ull chunk_size, + gomp_ull *istart, gomp_ull *iend) +@@ -734,6 +977,19 @@ GOMP_loop_ull_nonmonotonic_guided_next ( + } + + bool ++GOMP_loop_ull_nonmonotonic_runtime_next (gomp_ull *istart, gomp_ull *iend) ++{ ++ return GOMP_loop_ull_runtime_next (istart, iend); ++} ++ ++bool ++GOMP_loop_ull_maybe_nonmonotonic_runtime_next (gomp_ull *istart, ++ gomp_ull *iend) ++{ ++ return GOMP_loop_ull_runtime_next (istart, iend); ++} ++ ++bool + GOMP_loop_ull_ordered_static_next (gomp_ull *istart, gomp_ull *iend) + { + return gomp_loop_ull_ordered_static_next (istart, iend); +--- libgomp/oacc-int.h.jj 2018-04-25 09:40:31.320655306 +0200 ++++ libgomp/oacc-int.h 2019-05-07 18:46:36.529109688 +0200 +@@ -99,6 +99,28 @@ void goacc_restore_bind (void); + void goacc_lazy_initialize (void); + void goacc_host_init (void); + ++static inline bool ++async_valid_stream_id_p (int async) ++{ ++ return async >= 0; ++} ++ ++static inline bool ++async_valid_p (int async) ++{ ++ return (async == acc_async_noval || async == acc_async_sync ++ || async_valid_stream_id_p (async)); ++} ++ ++static inline bool ++async_synchronous_p (int async) ++{ ++ if (!async_valid_p (async)) ++ return true; ++ ++ return async == acc_async_sync; ++} ++ + #ifdef HAVE_ATTRIBUTE_VISIBILITY + # pragma GCC visibility pop + #endif +--- libgomp/testsuite/Makefile.in.jj 2018-04-25 09:40:31.452655368 +0200 ++++ 
libgomp/testsuite/Makefile.in 2019-05-07 18:51:35.754330084 +0200 +@@ -223,6 +223,7 @@ mkdir_p = @mkdir_p@ + multi_basedir = @multi_basedir@ + offload_additional_lib_paths = @offload_additional_lib_paths@ + offload_additional_options = @offload_additional_options@ ++offload_plugins = @offload_plugins@ + offload_targets = @offload_targets@ + oldincludedir = @oldincludedir@ + pdfdir = @pdfdir@ +--- libgomp/task.c.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/task.c 2019-05-07 18:46:36.547109400 +0200 +@@ -166,21 +166,72 @@ gomp_task_handle_depend (struct gomp_tas + void **depend) + { + size_t ndepend = (uintptr_t) depend[0]; +- size_t nout = (uintptr_t) depend[1]; + size_t i; + hash_entry_type ent; + ++ if (ndepend) ++ { ++ /* depend[0] is total # */ ++ size_t nout = (uintptr_t) depend[1]; /* # of out: and inout: */ ++ /* ndepend - nout is # of in: */ ++ for (i = 0; i < ndepend; i++) ++ { ++ task->depend[i].addr = depend[2 + i]; ++ task->depend[i].is_in = i >= nout; ++ } ++ } ++ else ++ { ++ ndepend = (uintptr_t) depend[1]; /* total # */ ++ size_t nout = (uintptr_t) depend[2]; /* # of out: and inout: */ ++ size_t nmutexinoutset = (uintptr_t) depend[3]; /* # of mutexinoutset: */ ++ /* For now we treat mutexinoutset like out, which is compliant, but ++ inefficient. */ ++ size_t nin = (uintptr_t) depend[4]; /* # of in: */ ++ /* ndepend - nout - nmutexinoutset - nin is # of depobjs */ ++ size_t normal = nout + nmutexinoutset + nin; ++ size_t n = 0; ++ for (i = normal; i < ndepend; i++) ++ { ++ void **d = (void **) (uintptr_t) depend[5 + i]; ++ switch ((uintptr_t) d[1]) ++ { ++ case GOMP_DEPEND_OUT: ++ case GOMP_DEPEND_INOUT: ++ case GOMP_DEPEND_MUTEXINOUTSET: ++ break; ++ case GOMP_DEPEND_IN: ++ continue; ++ default: ++ gomp_fatal ("unknown omp_depend_t dependence type %d", ++ (int) (uintptr_t) d[1]); ++ } ++ task->depend[n].addr = d[0]; ++ task->depend[n++].is_in = 0; ++ } ++ for (i = 0; i < normal; i++) ++ { ++ task->depend[n].addr = depend[5 + i]; ++ task->depend[n++].is_in = i >= nout + nmutexinoutset; ++ } ++ for (i = normal; i < ndepend; i++) ++ { ++ void **d = (void **) (uintptr_t) depend[5 + i]; ++ if ((uintptr_t) d[1] != GOMP_DEPEND_IN) ++ continue; ++ task->depend[n].addr = d[0]; ++ task->depend[n++].is_in = 1; ++ } ++ } + task->depend_count = ndepend; + task->num_dependees = 0; + if (parent->depend_hash == NULL) + parent->depend_hash = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12); + for (i = 0; i < ndepend; i++) + { +- task->depend[i].addr = depend[2 + i]; + task->depend[i].next = NULL; + task->depend[i].prev = NULL; + task->depend[i].task = task; +- task->depend[i].is_in = i >= nout; + task->depend[i].redundant = false; + task->depend[i].redundant_out = false; + +@@ -205,7 +256,7 @@ gomp_task_handle_depend (struct gomp_tas + last = ent; + + /* depend(in:...) doesn't depend on earlier depend(in:...). */ +- if (i >= nout && ent->is_in) ++ if (task->depend[i].is_in && ent->is_in) + continue; + + if (!ent->is_in) +@@ -280,9 +331,18 @@ gomp_task_handle_depend (struct gomp_tas + then the task may be executed by any member of the team. + + DEPEND is an array containing: ++ if depend[0] is non-zero, then: + depend[0]: number of depend elements. +- depend[1]: number of depend elements of type "out". +- depend[2..N+1]: address of [1..N]th depend element. */ ++ depend[1]: number of depend elements of type "out/inout". ++ depend[2..N+1]: address of [1..N]th depend element. ++ otherwise, when depend[0] is zero, then: ++ depend[1]: number of depend elements. 
++ depend[2]: number of depend elements of type "out/inout". ++ depend[3]: number of depend elements of type "mutexinoutset". ++ depend[4]: number of depend elements of type "in". ++ depend[5..4+depend[2]+depend[3]+depend[4]]: address of depend elements ++ depend[5+depend[2]+depend[3]+depend[4]..4+depend[1]]: address of ++ omp_depend_t objects. */ + + void + GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), +@@ -303,10 +363,20 @@ GOMP_task (void (*fn) (void *), void *da + #endif + + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } + + if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0) + priority = 0; +@@ -377,7 +447,7 @@ GOMP_task (void (*fn) (void *), void *da + size_t depend_size = 0; + + if (flags & GOMP_TASK_FLAG_DEPEND) +- depend_size = ((uintptr_t) depend[0] ++ depend_size = ((uintptr_t) (depend[0] ? depend[0] : depend[1]) + * sizeof (struct gomp_task_depend_entry)); + task = gomp_malloc (sizeof (*task) + depend_size + + arg_size + arg_align - 1); +@@ -404,14 +474,26 @@ GOMP_task (void (*fn) (void *), void *da + gomp_mutex_lock (&team->task_lock); + /* If parallel or taskgroup has been cancelled, don't start new + tasks. */ +- if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier) +- || (taskgroup && taskgroup->cancelled)) +- && !task->copy_ctors_done, 0)) ++ if (__builtin_expect (gomp_cancel_var, 0) ++ && !task->copy_ctors_done) + { +- gomp_mutex_unlock (&team->task_lock); +- gomp_finish_task (task); +- free (task); +- return; ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ { ++ do_cancel: ++ gomp_mutex_unlock (&team->task_lock); ++ gomp_finish_task (task); ++ free (task); ++ return; ++ } ++ if (taskgroup) ++ { ++ if (taskgroup->cancelled) ++ goto do_cancel; ++ if (taskgroup->workshare ++ && taskgroup->prev ++ && taskgroup->prev->cancelled) ++ goto do_cancel; ++ } + } + if (taskgroup) + taskgroup->num_children++; +@@ -463,6 +545,7 @@ GOMP_task (void (*fn) (void *), void *da + + ialias (GOMP_taskgroup_start) + ialias (GOMP_taskgroup_end) ++ialias (GOMP_taskgroup_reduction_register) + + #define TYPE long + #define UTYPE unsigned long +@@ -601,10 +684,20 @@ gomp_create_target_task (struct gomp_dev + struct gomp_team *team = thr->ts.team; + + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) +- return true; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return true; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return true; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return true; ++ } ++ } + + struct gomp_target_task *ttask; + struct gomp_task *task; +@@ -617,7 +710,7 @@ gomp_create_target_task (struct gomp_dev + + if (depend != NULL) + { +- depend_cnt = (uintptr_t) depend[0]; ++ depend_cnt = (uintptr_t) (depend[0] ? 
depend[0] : depend[1]); + depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry); + } + if (fn) +@@ -687,13 +780,25 @@ gomp_create_target_task (struct gomp_dev + task->final_task = 0; + gomp_mutex_lock (&team->task_lock); + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ +- if (__builtin_expect (gomp_team_barrier_cancelled (&team->barrier) +- || (taskgroup && taskgroup->cancelled), 0)) ++ if (__builtin_expect (gomp_cancel_var, 0)) + { +- gomp_mutex_unlock (&team->task_lock); +- gomp_finish_task (task); +- free (task); +- return true; ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ { ++ do_cancel: ++ gomp_mutex_unlock (&team->task_lock); ++ gomp_finish_task (task); ++ free (task); ++ return true; ++ } ++ if (taskgroup) ++ { ++ if (taskgroup->cancelled) ++ goto do_cancel; ++ if (taskgroup->workshare ++ && taskgroup->prev ++ && taskgroup->prev->cancelled) ++ goto do_cancel; ++ } + } + if (depend_size) + { +@@ -986,10 +1091,21 @@ gomp_task_run_pre (struct gomp_task *chi + + if (--team->task_queued_count == 0) + gomp_team_barrier_clear_task_pending (&team->barrier); +- if ((gomp_team_barrier_cancelled (&team->barrier) +- || (taskgroup && taskgroup->cancelled)) ++ if (__builtin_expect (gomp_cancel_var, 0) + && !child_task->copy_ctors_done) +- return true; ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return true; ++ if (taskgroup) ++ { ++ if (taskgroup->cancelled) ++ return true; ++ if (taskgroup->workshare ++ && taskgroup->prev ++ && taskgroup->prev->cancelled) ++ return true; ++ } ++ } + return false; + } + +@@ -1456,6 +1572,35 @@ GOMP_taskwait (void) + } + } + ++/* Called when encountering a taskwait directive with depend clause(s). ++ Wait as if it were a mergeable included task construct with an empty body. */ ++ ++void ++GOMP_taskwait_depend (void **depend) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ ++ /* If parallel or taskgroup has been cancelled, return early. */ ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } ++ ++ if (thr->task && thr->task->depend_hash) ++ gomp_task_maybe_wait_for_dependencies (depend); ++} ++ + /* An undeferred task is about to run. Wait for all tasks that this + undeferred task depends on. + +@@ -1464,7 +1609,7 @@ GOMP_taskwait (void) + the scheduling queues. Then we iterate through these imminently + ready tasks (and possibly other high priority tasks), and run them. + If we run out of ready dependencies to execute, we either wait for +- the reamining dependencies to finish, or wait for them to get ++ the remaining dependencies to finish, or wait for them to get + scheduled so we can run them. + + DEPEND is as in GOMP_task.
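Since GOMP_taskwait_depend and gomp_task_maybe_wait_for_dependencies both consume the DEPEND array documented at GOMP_task, here is what the two encodings look like when built by hand. In real programs the compiler emits these arrays; the variables are illustrative:

    #include <stdint.h>

    int x, y, z;

    void
    build_depend_arrays (void)
    {
      /* Old encoding (depend[0] != 0), e.g. for
         #pragma omp task depend(out: x) depend(in: y, z)  */
      void *depend_old[] = {
        (void *) (uintptr_t) 3,  /* total number of dependences */
        (void *) (uintptr_t) 1,  /* leading out/inout entries */
        &x,                      /* out/inout addresses first */
        &y, &z                   /* in addresses afterwards */
      };

      /* New encoding (depend[0] == 0), needed once mutexinoutset or
         omp_depend_t entries appear, e.g. for
         #pragma omp task depend(out: x) depend(mutexinoutset: y) \
                          depend(in: z)  */
      void *depend_new[] = {
        (void *) (uintptr_t) 0,  /* selects the new layout */
        (void *) (uintptr_t) 3,  /* total number of dependences */
        (void *) (uintptr_t) 1,  /* out/inout count */
        (void *) (uintptr_t) 1,  /* mutexinoutset count */
        (void *) (uintptr_t) 1,  /* in count */
        &x, &y, &z               /* addresses, grouped in that order */
      };

      (void) depend_old;
      (void) depend_new;
    }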
*/ +@@ -1477,21 +1622,50 @@ gomp_task_maybe_wait_for_dependencies (v + struct gomp_team *team = thr->ts.team; + struct gomp_task_depend_entry elem, *ent = NULL; + struct gomp_taskwait taskwait; +- size_t ndepend = (uintptr_t) depend[0]; ++ size_t orig_ndepend = (uintptr_t) depend[0]; + size_t nout = (uintptr_t) depend[1]; ++ size_t ndepend = orig_ndepend; ++ size_t normal = ndepend; ++ size_t n = 2; + size_t i; + size_t num_awaited = 0; + struct gomp_task *child_task = NULL; + struct gomp_task *to_free = NULL; + int do_wake = 0; + ++ if (ndepend == 0) ++ { ++ ndepend = nout; ++ nout = (uintptr_t) depend[2] + (uintptr_t) depend[3]; ++ normal = nout + (uintptr_t) depend[4]; ++ n = 5; ++ } + gomp_mutex_lock (&team->task_lock); + for (i = 0; i < ndepend; i++) + { +- elem.addr = depend[i + 2]; ++ elem.addr = depend[i + n]; ++ elem.is_in = i >= nout; ++ if (__builtin_expect (i >= normal, 0)) ++ { ++ void **d = (void **) elem.addr; ++ switch ((uintptr_t) d[1]) ++ { ++ case GOMP_DEPEND_IN: ++ break; ++ case GOMP_DEPEND_OUT: ++ case GOMP_DEPEND_INOUT: ++ case GOMP_DEPEND_MUTEXINOUTSET: ++ elem.is_in = 0; ++ break; ++ default: ++ gomp_fatal ("unknown omp_depend_t dependence type %d", ++ (int) (uintptr_t) d[1]); ++ } ++ elem.addr = d[0]; ++ } + ent = htab_find (task->depend_hash, &elem); + for (; ent; ent = ent->next) +- if (i >= nout && ent->is_in) ++ if (elem.is_in && ent->is_in) + continue; + else + { +@@ -1654,13 +1828,28 @@ GOMP_taskyield (void) + /* Nothing at the moment. */ + } + ++static inline struct gomp_taskgroup * ++gomp_taskgroup_init (struct gomp_taskgroup *prev) ++{ ++ struct gomp_taskgroup *taskgroup ++ = gomp_malloc (sizeof (struct gomp_taskgroup)); ++ taskgroup->prev = prev; ++ priority_queue_init (&taskgroup->taskgroup_queue); ++ taskgroup->reductions = prev ? prev->reductions : NULL; ++ taskgroup->in_taskgroup_wait = false; ++ taskgroup->cancelled = false; ++ taskgroup->workshare = false; ++ taskgroup->num_children = 0; ++ gomp_sem_init (&taskgroup->taskgroup_sem, 0); ++ return taskgroup; ++} ++ + void + GOMP_taskgroup_start (void) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + struct gomp_task *task = thr->task; +- struct gomp_taskgroup *taskgroup; + + /* If team is NULL, all tasks are executed as + GOMP_TASK_UNDEFERRED tasks and thus all children tasks of +@@ -1668,14 +1857,7 @@ GOMP_taskgroup_start (void) + by the time GOMP_taskgroup_end is called. */ + if (team == NULL) + return; +- taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup)); +- taskgroup->prev = task->taskgroup; +- priority_queue_init (&taskgroup->taskgroup_queue); +- taskgroup->in_taskgroup_wait = false; +- taskgroup->cancelled = false; +- taskgroup->num_children = 0; +- gomp_sem_init (&taskgroup->taskgroup_sem, 0); +- task->taskgroup = taskgroup; ++ task->taskgroup = gomp_taskgroup_init (task->taskgroup); + } + + void +@@ -1840,6 +2022,302 @@ GOMP_taskgroup_end (void) + free (taskgroup); + } + ++static inline __attribute__((always_inline)) void ++gomp_reduction_register (uintptr_t *data, uintptr_t *old, uintptr_t *orig, ++ unsigned nthreads) ++{ ++ size_t total_cnt = 0; ++ uintptr_t *d = data; ++ struct htab *old_htab = NULL, *new_htab; ++ do ++ { ++ if (__builtin_expect (orig != NULL, 0)) ++ { ++ /* For worksharing task reductions, memory has been allocated ++ already by some other thread that encountered the construct ++ earlier. 
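The registration machinery beginning here backs OpenMP 5.0 task reductions. At the source level, the feature it implements looks like this (requires a compiler and runtime with OpenMP 5.0 task reduction support, such as this patched libgomp):

    #include <stdio.h>

    int
    main (void)
    {
      long sum = 0;
      #pragma omp parallel
      #pragma omp single
      #pragma omp taskgroup task_reduction (+: sum)
      for (int i = 1; i <= 100; i++)
        #pragma omp task in_reduction (+: sum)
        sum += i;
      printf ("%ld\n", sum);   /* prints 5050 */
      return 0;
    }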
*/ ++ d[2] = orig[2]; ++ d[6] = orig[6]; ++ orig = (uintptr_t *) orig[4]; ++ } ++ else ++ { ++ size_t sz = d[1] * nthreads; ++ /* Should use omp_alloc if d[3] is not -1. */ ++ void *ptr = gomp_aligned_alloc (d[2], sz); ++ memset (ptr, '\0', sz); ++ d[2] = (uintptr_t) ptr; ++ d[6] = d[2] + sz; ++ } ++ d[5] = 0; ++ total_cnt += d[0]; ++ if (d[4] == 0) ++ { ++ d[4] = (uintptr_t) old; ++ break; ++ } ++ else ++ d = (uintptr_t *) d[4]; ++ } ++ while (1); ++ if (old && old[5]) ++ { ++ old_htab = (struct htab *) old[5]; ++ total_cnt += htab_elements (old_htab); ++ } ++ new_htab = htab_create (total_cnt); ++ if (old_htab) ++ { ++ /* Copy old hash table, like in htab_expand. */ ++ hash_entry_type *p, *olimit; ++ new_htab->n_elements = htab_elements (old_htab); ++ olimit = old_htab->entries + old_htab->size; ++ p = old_htab->entries; ++ do ++ { ++ hash_entry_type x = *p; ++ if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY) ++ *find_empty_slot_for_expand (new_htab, htab_hash (x)) = x; ++ p++; ++ } ++ while (p < olimit); ++ } ++ d = data; ++ do ++ { ++ size_t j; ++ for (j = 0; j < d[0]; ++j) ++ { ++ uintptr_t *p = d + 7 + j * 3; ++ p[2] = (uintptr_t) d; ++ /* Ugly hack, hash_entry_type is defined for the task dependencies, ++ which hash on the first element which is a pointer. We need ++ to hash also on the first sizeof (uintptr_t) bytes which contain ++ a pointer. Hide the cast from the compiler. */ ++ hash_entry_type n; ++ __asm ("" : "=g" (n) : "0" (p)); ++ *htab_find_slot (&new_htab, n, INSERT) = n; ++ } ++ if (d[4] == (uintptr_t) old) ++ break; ++ else ++ d = (uintptr_t *) d[4]; ++ } ++ while (1); ++ d[5] = (uintptr_t) new_htab; ++} ++ ++static void ++gomp_create_artificial_team (void) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_task_icv *icv; ++ struct gomp_team *team = gomp_new_team (1); ++ struct gomp_task *task = thr->task; ++ icv = task ? &task->icv : &gomp_global_icv; ++ team->prev_ts = thr->ts; ++ thr->ts.team = team; ++ thr->ts.team_id = 0; ++ thr->ts.work_share = &team->work_shares[0]; ++ thr->ts.last_work_share = NULL; ++#ifdef HAVE_SYNC_BUILTINS ++ thr->ts.single_count = 0; ++#endif ++ thr->ts.static_trip = 0; ++ thr->task = &team->implicit_task[0]; ++ gomp_init_task (thr->task, NULL, icv); ++ if (task) ++ { ++ thr->task = task; ++ gomp_end_task (); ++ free (task); ++ thr->task = &team->implicit_task[0]; ++ } ++#ifdef LIBGOMP_USE_PTHREADS ++ else ++ pthread_setspecific (gomp_thread_destructor, thr); ++#endif ++} ++ ++/* The format of data is: ++ data[0] cnt ++ data[1] size ++ data[2] alignment (on output array pointer) ++ data[3] allocator (-1 if malloc allocator) ++ data[4] next pointer ++ data[5] used internally (htab pointer) ++ data[6] used internally (end of array) ++ cnt times ++ ent[0] address ++ ent[1] offset ++ ent[2] used internally (pointer to data[0]) ++ The entries are sorted by increasing offset, so that a binary ++ search can be performed. Normally, data[8] is 0, exception is ++ for worksharing construct task reductions in cancellable parallel, ++ where at offset 0 there should be space for a pointer and an integer ++ which are used internally. */ ++ ++void ++GOMP_taskgroup_reduction_register (uintptr_t *data) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ struct gomp_task *task; ++ unsigned nthreads; ++ if (__builtin_expect (team == NULL, 0)) ++ { ++ /* The task reduction code needs a team and task, so for ++ orphaned taskgroups just create the implicit team. 
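Given the descriptor layout documented above, the address of one thread's private copy of a reduction variable falls out of three fields; this is the same arithmetic GOMP_task_reduction_remap performs below. A sketch with the descriptor words named for readability:

    #include <stdint.h>

    static inline void *
    private_copy (uintptr_t *data, unsigned thread_id, uintptr_t *ent)
    {
      uintptr_t base = data[2];     /* start of the per-team array */
      uintptr_t chunk = data[1];    /* bytes allocated per thread */
      uintptr_t offset = ent[1];    /* variable's offset within a chunk */
      return (void *) (base + thread_id * chunk + offset);
    }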
*/ ++ gomp_create_artificial_team (); ++ ialias_call (GOMP_taskgroup_start) (); ++ team = thr->ts.team; ++ } ++ nthreads = team->nthreads; ++ task = thr->task; ++ gomp_reduction_register (data, task->taskgroup->reductions, NULL, nthreads); ++ task->taskgroup->reductions = data; ++} ++ ++void ++GOMP_taskgroup_reduction_unregister (uintptr_t *data) ++{ ++ uintptr_t *d = data; ++ htab_free ((struct htab *) data[5]); ++ do ++ { ++ gomp_aligned_free ((void *) d[2]); ++ d = (uintptr_t *) d[4]; ++ } ++ while (d && !d[5]); ++} ++ialias (GOMP_taskgroup_reduction_unregister) ++ ++/* For i = 0 to cnt-1, remap ptrs[i] which is either address of the ++ original list item or address of previously remapped original list ++ item to address of the private copy, store that to ptrs[i]. ++ For i < cntorig, additionally set ptrs[cnt+i] to the address of ++ the original list item. */ ++ ++void ++GOMP_task_reduction_remap (size_t cnt, size_t cntorig, void **ptrs) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_task *task = thr->task; ++ unsigned id = thr->ts.team_id; ++ uintptr_t *data = task->taskgroup->reductions; ++ uintptr_t *d; ++ struct htab *reduction_htab = (struct htab *) data[5]; ++ size_t i; ++ for (i = 0; i < cnt; ++i) ++ { ++ hash_entry_type ent, n; ++ __asm ("" : "=g" (ent) : "0" (ptrs + i)); ++ n = htab_find (reduction_htab, ent); ++ if (n) ++ { ++ uintptr_t *p; ++ __asm ("" : "=g" (p) : "0" (n)); ++ /* At this point, p[0] should be equal to (uintptr_t) ptrs[i], ++ p[1] is the offset within the allocated chunk for each ++ thread, p[2] is the array registered with ++ GOMP_taskgroup_reduction_register, d[2] is the base of the ++ allocated memory and d[1] is the size of the allocated chunk ++ for one thread. */ ++ d = (uintptr_t *) p[2]; ++ ptrs[i] = (void *) (d[2] + id * d[1] + p[1]); ++ if (__builtin_expect (i < cntorig, 0)) ++ ptrs[cnt + i] = (void *) p[0]; ++ continue; ++ } ++ d = data; ++ while (d != NULL) ++ { ++ if ((uintptr_t) ptrs[i] >= d[2] && (uintptr_t) ptrs[i] < d[6]) ++ break; ++ d = (uintptr_t *) d[4]; ++ } ++ if (d == NULL) ++ gomp_fatal ("couldn't find matching task_reduction or reduction with " ++ "task modifier for %p", ptrs[i]); ++ uintptr_t off = ((uintptr_t) ptrs[i] - d[2]) % d[1]; ++ ptrs[i] = (void *) (d[2] + id * d[1] + off); ++ if (__builtin_expect (i < cntorig, 0)) ++ { ++ size_t lo = 0, hi = d[0] - 1; ++ while (lo <= hi) ++ { ++ size_t m = (lo + hi) / 2; ++ if (d[7 + 3 * m + 1] < off) ++ lo = m + 1; ++ else if (d[7 + 3 * m + 1] == off) ++ { ++ ptrs[cnt + i] = (void *) d[7 + 3 * m]; ++ break; ++ } ++ else ++ hi = m - 1; ++ } ++ if (lo > hi) ++ gomp_fatal ("couldn't find matching task_reduction or reduction " ++ "with task modifier for %p", ptrs[i]); ++ } ++ } ++} ++ ++struct gomp_taskgroup * ++gomp_parallel_reduction_register (uintptr_t *data, unsigned nthreads) ++{ ++ struct gomp_taskgroup *taskgroup = gomp_taskgroup_init (NULL); ++ gomp_reduction_register (data, NULL, NULL, nthreads); ++ taskgroup->reductions = data; ++ return taskgroup; ++} ++ ++void ++gomp_workshare_task_reduction_register (uintptr_t *data, uintptr_t *orig) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ struct gomp_task *task = thr->task; ++ unsigned nthreads = team->nthreads; ++ gomp_reduction_register (data, task->taskgroup->reductions, orig, nthreads); ++ task->taskgroup->reductions = data; ++} ++ ++void ++gomp_workshare_taskgroup_start (void) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ 
struct gomp_task *task; ++ ++ if (team == NULL) ++ { ++ gomp_create_artificial_team (); ++ team = thr->ts.team; ++ } ++ task = thr->task; ++ task->taskgroup = gomp_taskgroup_init (task->taskgroup); ++ task->taskgroup->workshare = true; ++} ++ ++void ++GOMP_workshare_task_reduction_unregister (bool cancelled) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_task *task = thr->task; ++ struct gomp_team *team = thr->ts.team; ++ uintptr_t *data = task->taskgroup->reductions; ++ ialias_call (GOMP_taskgroup_end) (); ++ if (thr->ts.team_id == 0) ++ ialias_call (GOMP_taskgroup_reduction_unregister) (data); ++ else ++ htab_free ((struct htab *) data[5]); ++ ++ if (!cancelled) ++ gomp_team_barrier_wait (&team->barrier); ++} ++ + int + omp_in_final (void) + { +--- libgomp/team.c.jj 2018-04-25 09:40:31.322655307 +0200 ++++ libgomp/team.c 2019-05-07 18:46:36.548109384 +0200 +@@ -32,7 +32,6 @@ + #include + + #ifdef LIBGOMP_USE_PTHREADS +-/* This attribute contains PTHREAD_CREATE_DETACHED. */ + pthread_attr_t gomp_thread_attr; + + /* This key is for the thread destructor. */ +@@ -58,6 +57,7 @@ struct gomp_thread_start_data + struct gomp_thread_pool *thread_pool; + unsigned int place; + bool nested; ++ pthread_t handle; + }; + + +@@ -89,6 +89,9 @@ gomp_thread_start (void *xdata) + thr->ts = data->ts; + thr->task = data->task; + thr->place = data->place; ++#ifdef GOMP_NEEDS_THREAD_HANDLE ++ thr->handle = data->handle; ++#endif + + thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release; + +@@ -131,6 +134,7 @@ gomp_thread_start (void *xdata) + } + + gomp_sem_destroy (&thr->release); ++ pthread_detach (pthread_self ()); + thr->thread_pool = NULL; + thr->task = NULL; + return NULL; +@@ -183,7 +187,7 @@ gomp_new_team (unsigned nthreads) + team->single_count = 0; + #endif + team->work_shares_to_free = &team->work_shares[0]; +- gomp_init_work_share (&team->work_shares[0], false, nthreads); ++ gomp_init_work_share (&team->work_shares[0], 0, nthreads); + team->work_shares[0].next_alloc = NULL; + team->work_share_list_free = NULL; + team->work_share_list_alloc = &team->work_shares[1]; +@@ -231,6 +235,7 @@ gomp_free_pool_helper (void *thread_pool + thr->thread_pool = NULL; + thr->task = NULL; + #ifdef LIBGOMP_USE_PTHREADS ++ pthread_detach (pthread_self ()); + pthread_exit (NULL); + #elif defined(__nvptx__) + asm ("exit;"); +@@ -297,7 +302,8 @@ gomp_free_thread (void *arg __attribute_ + #ifdef LIBGOMP_USE_PTHREADS + void + gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, +- unsigned flags, struct gomp_team *team) ++ unsigned flags, struct gomp_team *team, ++ struct gomp_taskgroup *taskgroup) + { + struct gomp_thread_start_data *start_data; + struct gomp_thread *thr, *nthr; +@@ -312,6 +318,7 @@ gomp_team_start (void (*fn) (void *), vo + unsigned int s = 0, rest = 0, p = 0, k = 0; + unsigned int affinity_count = 0; + struct gomp_thread **affinity_thr = NULL; ++ bool force_display = false; + + thr = gomp_thread (); + nested = thr->ts.level; +@@ -319,7 +326,12 @@ gomp_team_start (void (*fn) (void *), vo + task = thr->task; + icv = task ? &task->icv : &gomp_global_icv; + if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0) +- gomp_init_affinity (); ++ { ++ gomp_init_affinity (); ++ if (__builtin_expect (gomp_display_affinity_var, 0) && nthreads == 1) ++ gomp_display_affinity_thread (gomp_thread_self (), &thr->ts, ++ thr->place); ++ } + + /* Always save the previous state, even if this isn't a nested team. 
+ In particular, we should save any work share state from an outer +@@ -338,6 +350,9 @@ gomp_team_start (void (*fn) (void *), vo + #endif + thr->ts.static_trip = 0; + thr->task = &team->implicit_task[0]; ++#ifdef GOMP_NEEDS_THREAD_HANDLE ++ thr->handle = pthread_self (); ++#endif + nthreads_var = icv->nthreads_var; + if (__builtin_expect (gomp_nthreads_var_list != NULL, 0) + && thr->ts.level < gomp_nthreads_var_list_len) +@@ -350,6 +365,7 @@ gomp_team_start (void (*fn) (void *), vo + && thr->ts.level < gomp_bind_var_list_len) + bind_var = gomp_bind_var_list[thr->ts.level]; + gomp_init_task (thr->task, task, icv); ++ thr->task->taskgroup = taskgroup; + team->implicit_task[0].icv.nthreads_var = nthreads_var; + team->implicit_task[0].icv.bind_var = bind_var; + +@@ -465,7 +481,9 @@ gomp_team_start (void (*fn) (void *), vo + pool->threads + = gomp_realloc (pool->threads, + pool->threads_size +- * sizeof (struct gomp_thread_data *)); ++ * sizeof (struct gomp_thread *)); ++ /* Add current (master) thread to threads[]. */ ++ pool->threads[0] = thr; + } + + /* Release existing idle threads. */ +@@ -540,6 +558,7 @@ gomp_team_start (void (*fn) (void *), vo + + place_partition_len)) + { + unsigned int l; ++ force_display = true; + if (affinity_thr == NULL) + { + unsigned int j; +@@ -623,6 +642,7 @@ gomp_team_start (void (*fn) (void *), vo + gomp_init_task (nthr->task, task, icv); + team->implicit_task[i].icv.nthreads_var = nthreads_var; + team->implicit_task[i].icv.bind_var = bind_var; ++ nthr->task->taskgroup = taskgroup; + nthr->fn = fn; + nthr->data = data; + team->ordered_release[i] = &nthr->release; +@@ -712,19 +732,17 @@ gomp_team_start (void (*fn) (void *), vo + { + size_t stacksize; + pthread_attr_init (&thread_attr); +- pthread_attr_setdetachstate (&thread_attr, PTHREAD_CREATE_DETACHED); + if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize)) + pthread_attr_setstacksize (&thread_attr, stacksize); + attr = &thread_attr; + } + + start_data = gomp_alloca (sizeof (struct gomp_thread_start_data) +- * (nthreads-i)); ++ * (nthreads - i)); + + /* Launch new threads. 
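The surrounding hunks replace the old throwaway pthread_t with a handle stored in each thread's start-data block, so the creator can later hand it to the affinity display code. The pattern in isolation, as a stand-alone sketch where only the creator consults the saved handles:

    #include <pthread.h>
    #include <stdio.h>

    struct start_data
    {
      pthread_t handle;   /* filled in by pthread_create */
      int id;
    };

    static void *
    worker (void *xdata)
    {
      struct start_data *data = xdata;
      printf ("worker %d running\n", data->id);
      return 0;
    }

    int
    main (void)
    {
      struct start_data data[2] = { { 0, 0 }, { 0, 1 } };
      for (int i = 0; i < 2; i++)
        pthread_create (&data[i].handle, 0, worker, &data[i]);
      for (int i = 0; i < 2; i++)
        pthread_join (data[i].handle, 0);  /* creator uses saved handle */
      return 0;
    }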
*/ + for (; i < nthreads; ++i) + { +- pthread_t pt; + int err; + + start_data->ts.place_partition_off = thr->ts.place_partition_off; +@@ -810,11 +828,14 @@ gomp_team_start (void (*fn) (void *), vo + gomp_init_task (start_data->task, task, icv); + team->implicit_task[i].icv.nthreads_var = nthreads_var; + team->implicit_task[i].icv.bind_var = bind_var; ++ start_data->task->taskgroup = taskgroup; + start_data->thread_pool = pool; + start_data->nested = nested; + + attr = gomp_adjust_thread_attr (attr, &thread_attr); +- err = pthread_create (&pt, attr, gomp_thread_start, start_data++); ++ err = pthread_create (&start_data->handle, attr, gomp_thread_start, ++ start_data); ++ start_data++; + if (err != 0) + gomp_fatal ("Thread creation failed: %s", strerror (err)); + } +@@ -854,6 +875,42 @@ gomp_team_start (void (*fn) (void *), vo + gomp_mutex_unlock (&gomp_managed_threads_lock); + #endif + } ++ if (__builtin_expect (gomp_display_affinity_var, 0)) ++ { ++ if (nested ++ || nthreads != old_threads_used ++ || force_display) ++ { ++ gomp_display_affinity_thread (gomp_thread_self (), &thr->ts, ++ thr->place); ++ if (nested) ++ { ++ start_data -= nthreads - 1; ++ for (i = 1; i < nthreads; ++i) ++ { ++ gomp_display_affinity_thread ( ++#ifdef LIBGOMP_USE_PTHREADS ++ start_data->handle, ++#else ++ gomp_thread_self (), ++#endif ++ &start_data->ts, ++ start_data->place); ++ start_data++; ++ } ++ } ++ else ++ { ++ for (i = 1; i < nthreads; ++i) ++ { ++ gomp_thread_handle handle ++ = gomp_thread_to_pthread_t (pool->threads[i]); ++ gomp_display_affinity_thread (handle, &pool->threads[i]->ts, ++ pool->threads[i]->place); ++ } ++ } ++ } ++ } + if (__builtin_expect (affinity_thr != NULL, 0) + && team->prev_ts.place_partition_len > 64) + free (affinity_thr); +@@ -894,7 +951,7 @@ gomp_team_end (void) + gomp_end_task (); + thr->ts = team->prev_ts; + +- if (__builtin_expect (thr->ts.team != NULL, 0)) ++ if (__builtin_expect (thr->ts.level != 0, 0)) + { + #ifdef HAVE_SYNC_BUILTINS + __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads); +@@ -959,6 +1016,76 @@ team_destructor (void) + crashes. */ + pthread_key_delete (gomp_thread_destructor); + } ++ ++/* Similar to gomp_free_pool_helper, but don't detach itself, ++ gomp_pause_host will pthread_join those threads. */ ++ ++static void ++gomp_pause_pool_helper (void *thread_pool) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_thread_pool *pool ++ = (struct gomp_thread_pool *) thread_pool; ++ gomp_simple_barrier_wait_last (&pool->threads_dock); ++ gomp_sem_destroy (&thr->release); ++ thr->thread_pool = NULL; ++ thr->task = NULL; ++ pthread_exit (NULL); ++} ++ ++/* Free a thread pool and release its threads. Return non-zero on ++ failure. */ ++ ++int ++gomp_pause_host (void) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_thread_pool *pool = thr->thread_pool; ++ if (thr->ts.level) ++ return -1; ++ if (pool) ++ { ++ if (pool->threads_used > 0) ++ { ++ int i; ++ pthread_t *thrs ++ = gomp_alloca (sizeof (pthread_t) * pool->threads_used); ++ for (i = 1; i < pool->threads_used; i++) ++ { ++ struct gomp_thread *nthr = pool->threads[i]; ++ nthr->fn = gomp_pause_pool_helper; ++ nthr->data = pool; ++ thrs[i] = gomp_thread_to_pthread_t (nthr); ++ } ++ /* This barrier undocks threads docked on pool->threads_dock. */ ++ gomp_simple_barrier_wait (&pool->threads_dock); ++ /* And this waits till all threads have called gomp_barrier_wait_last ++ in gomp_pause_pool_helper. 
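gomp_pause_host here is the host-side worker behind the new omp_pause_resource and omp_pause_resource_all entry points exported under OMP_5.0 earlier in this patch. From user code the feature looks like this:

    #include <omp.h>
    #include <stdio.h>

    int
    main (void)
    {
      #pragma omp parallel
      { /* first phase: the worker pool gets created */ }

      /* Ask the runtime to release reusable resources (here: the
         pool threads); returns 0 on success.  */
      if (omp_pause_resource_all (omp_pause_soft) != 0)
        printf ("pause failed\n");

      #pragma omp parallel
      { /* second phase: the pool is recreated on demand */ }
      return 0;
    }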
*/ ++ gomp_simple_barrier_wait (&pool->threads_dock); ++ /* Now it is safe to destroy the barrier and free the pool. */ ++ gomp_simple_barrier_destroy (&pool->threads_dock); ++ ++#ifdef HAVE_SYNC_BUILTINS ++ __sync_fetch_and_add (&gomp_managed_threads, ++ 1L - pool->threads_used); ++#else ++ gomp_mutex_lock (&gomp_managed_threads_lock); ++ gomp_managed_threads -= pool->threads_used - 1L; ++ gomp_mutex_unlock (&gomp_managed_threads_lock); ++#endif ++ for (i = 1; i < pool->threads_used; i++) ++ pthread_join (thrs[i], NULL); ++ } ++ if (pool->last_team) ++ free_team (pool->last_team); ++#ifndef __nvptx__ ++ free (pool->threads); ++ free (pool); ++#endif ++ thr->thread_pool = NULL; ++ } ++ return 0; ++} + #endif + + struct gomp_task_icv * +--- libgomp/libgomp.h.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/libgomp.h 2019-05-07 19:01:51.285535999 +0200 +@@ -44,6 +44,7 @@ + #include "config.h" + #include "gstdint.h" + #include "libgomp-plugin.h" ++#include "gomp-constants.h" + + #ifdef HAVE_PTHREAD_H + #include +@@ -85,9 +86,21 @@ enum memmodel + + /* alloc.c */ + ++#if defined(HAVE_ALIGNED_ALLOC) \ ++ || defined(HAVE__ALIGNED_MALLOC) \ ++ || defined(HAVE_POSIX_MEMALIGN) \ ++ || defined(HAVE_MEMALIGN) ++/* Defined if gomp_aligned_alloc doesn't use fallback version ++ and free can be used instead of gomp_aligned_free. */ ++#define GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC 1 ++#endif ++ + extern void *gomp_malloc (size_t) __attribute__((malloc)); + extern void *gomp_malloc_cleared (size_t) __attribute__((malloc)); + extern void *gomp_realloc (void *, size_t); ++extern void *gomp_aligned_alloc (size_t, size_t) ++ __attribute__((malloc, alloc_size (2))); ++extern void gomp_aligned_free (void *); + + /* Avoid conflicting prototypes of alloca() in system headers by using + GCC's builtin alloca(). */ +@@ -137,7 +150,8 @@ enum gomp_schedule_type + GFS_STATIC, + GFS_DYNAMIC, + GFS_GUIDED, +- GFS_AUTO ++ GFS_AUTO, ++ GFS_MONOTONIC = 0x80000000U + }; + + struct gomp_doacross_work_share +@@ -174,6 +188,8 @@ struct gomp_doacross_work_share + /* Likewise, but for the ull implementation. */ + unsigned long long boundary_ull; + }; ++ /* Pointer to extra memory if needed for lastprivate(conditional). */ ++ void *extra; + /* Array of shift counts for each dimension if they can be flattened. */ + unsigned int shift_counts[]; + }; +@@ -275,6 +291,9 @@ struct gomp_work_share + struct gomp_work_share *next_free; + }; + ++ /* Task reductions for this work-sharing construct. */ ++ uintptr_t *task_reductions; ++ + /* If only few threads are in the team, ordered_team_ids can point + to this array which fills the padding at the end of this struct. */ + unsigned inline_ordered_team_ids[0]; +@@ -365,8 +384,12 @@ extern void **gomp_places_list; + extern unsigned long gomp_places_list_len; + extern unsigned int gomp_num_teams_var; + extern int gomp_debug_var; ++extern bool gomp_display_affinity_var; ++extern char *gomp_affinity_format_var; ++extern size_t gomp_affinity_format_len; + extern int goacc_device_num; + extern char *goacc_device_type; ++extern int goacc_default_dims[GOMP_DIM_MAX]; + + enum gomp_task_kind + { +@@ -469,8 +492,10 @@ struct gomp_taskgroup + struct gomp_taskgroup *prev; + /* Queue of tasks that belong in this taskgroup. 
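The GFS_MONOTONIC bit added to enum gomp_schedule_type above rides in the top bit of run_sched_var, which is why every schedule switch in this patch first masks it off with & ~GFS_MONOTONIC. In isolation (the values mirror the libgomp enum):

    enum gomp_schedule_sketch
    {
      GFS_RUNTIME, GFS_STATIC, GFS_DYNAMIC, GFS_GUIDED, GFS_AUTO,
      GFS_MONOTONIC = 0x80000000U
    };

    /* e.g. schedule (monotonic: dynamic, 4) arrives as
       GFS_DYNAMIC | GFS_MONOTONIC; the base kind is recovered with: */
    static inline long
    base_sched (long sched)
    {
      return sched & ~GFS_MONOTONIC;
    }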
*/ + struct priority_queue taskgroup_queue; ++ uintptr_t *reductions; + bool in_taskgroup_wait; + bool cancelled; ++ bool workshare; + gomp_sem_t taskgroup_sem; + size_t num_children; + }; +@@ -613,6 +638,19 @@ struct gomp_thread + + /* User pthread thread pool */ + struct gomp_thread_pool *thread_pool; ++ ++#if defined(LIBGOMP_USE_PTHREADS) \ ++ && (!defined(HAVE_TLS) \ ++ || !defined(__GLIBC__) \ ++ || !defined(USING_INITIAL_EXEC_TLS)) ++ /* pthread_t of the thread containing this gomp_thread. ++ On Linux when using initial-exec TLS, ++ (typeof (pthread_t)) gomp_thread () - pthread_self () ++ is constant in all threads, so we can optimize and not ++ store it. */ ++#define GOMP_NEEDS_THREAD_HANDLE 1 ++ pthread_t handle; ++#endif + }; + + +@@ -709,6 +747,25 @@ extern bool gomp_affinity_finalize_place + extern bool gomp_affinity_init_level (int, unsigned long, bool); + extern void gomp_affinity_print_place (void *); + extern void gomp_get_place_proc_ids_8 (int, int64_t *); ++extern void gomp_display_affinity_place (char *, size_t, size_t *, int); ++ ++/* affinity-fmt.c */ ++ ++extern void gomp_print_string (const char *str, size_t len); ++extern void gomp_set_affinity_format (const char *, size_t); ++extern void gomp_display_string (char *, size_t, size_t *, const char *, ++ size_t); ++#ifdef LIBGOMP_USE_PTHREADS ++typedef pthread_t gomp_thread_handle; ++#else ++typedef struct {} gomp_thread_handle; ++#endif ++extern size_t gomp_display_affinity (char *, size_t, const char *, ++ gomp_thread_handle, ++ struct gomp_team_state *, unsigned int); ++extern void gomp_display_affinity_thread (gomp_thread_handle, ++ struct gomp_team_state *, ++ unsigned int) __attribute__((cold)); + + /* iter.c */ + +@@ -745,9 +802,9 @@ extern void gomp_ordered_next (void); + extern void gomp_ordered_static_init (void); + extern void gomp_ordered_static_next (void); + extern void gomp_ordered_sync (void); +-extern void gomp_doacross_init (unsigned, long *, long); ++extern void gomp_doacross_init (unsigned, long *, long, size_t); + extern void gomp_doacross_ull_init (unsigned, unsigned long long *, +- unsigned long long); ++ unsigned long long, size_t); + + /* parallel.c */ + +@@ -770,6 +827,10 @@ extern bool gomp_create_target_task (str + size_t *, unsigned short *, unsigned int, + void **, void **, + enum gomp_target_task_state); ++extern struct gomp_taskgroup *gomp_parallel_reduction_register (uintptr_t *, ++ unsigned); ++extern void gomp_workshare_taskgroup_start (void); ++extern void gomp_workshare_task_reduction_register (uintptr_t *, uintptr_t *); + + static void inline + gomp_finish_task (struct gomp_task *task) +@@ -782,9 +843,11 @@ gomp_finish_task (struct gomp_task *task + + extern struct gomp_team *gomp_new_team (unsigned); + extern void gomp_team_start (void (*) (void *), void *, unsigned, +- unsigned, struct gomp_team *); ++ unsigned, struct gomp_team *, ++ struct gomp_taskgroup *); + extern void gomp_team_end (void); + extern void gomp_free_thread (void *); ++extern int gomp_pause_host (void); + + /* target.c */ + +@@ -851,6 +914,8 @@ struct splay_tree_key_s { + uintptr_t tgt_offset; + /* Reference count. */ + uintptr_t refcount; ++ /* Dynamic reference count. */ ++ uintptr_t dynamic_refcount; + /* Pointer to the original mapping of "omp declare target link" object. 
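The GOMP_NEEDS_THREAD_HANDLE comment above describes the case this field avoids: with initial-exec TLS on glibc, the distance between a thread's gomp_thread () block and its pthread_t is the same in every thread, so one subtraction computed in the current thread recovers any other thread's handle, as gomp_thread_to_pthread_t does below. A sketch of the arithmetic (assumes pthread_t is an integral, pointer-sized type, as on glibc):

    #include <pthread.h>
    #include <stdint.h>

    struct gomp_thread;   /* opaque for the sketch */

    static pthread_t
    handle_of (struct gomp_thread *thr, struct gomp_thread *self)
    {
      /* delta is a process-wide constant under initial-exec TLS.  */
      uintptr_t delta = (uintptr_t) self - (uintptr_t) pthread_self ();
      return (pthread_t) ((uintptr_t) thr - delta);
    }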
*/ + splay_tree_key link_key; + }; +@@ -989,7 +1054,9 @@ enum gomp_map_vars_kind + }; + + extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *); +-extern void gomp_acc_remove_pointer (void *, bool, int, int); ++extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int); ++extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *, ++ unsigned short *); + + extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *, + size_t, void **, void **, +@@ -999,12 +1066,13 @@ extern void gomp_unmap_vars (struct targ + extern void gomp_init_device (struct gomp_device_descr *); + extern void gomp_free_memmap (struct splay_tree_s *); + extern void gomp_unload_device (struct gomp_device_descr *); ++extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key); + + /* work.c */ + +-extern void gomp_init_work_share (struct gomp_work_share *, bool, unsigned); ++extern void gomp_init_work_share (struct gomp_work_share *, size_t, unsigned); + extern void gomp_fini_work_share (struct gomp_work_share *); +-extern bool gomp_work_share_start (bool); ++extern bool gomp_work_share_start (size_t); + extern void gomp_work_share_end (void); + extern bool gomp_work_share_end_cancel (void); + extern void gomp_work_share_end_nowait (void); +@@ -1028,6 +1096,14 @@ gomp_work_share_init_done (void) + #include "omp-lock.h" + #define _LIBGOMP_OMP_LOCK_DEFINED 1 + #include "omp.h.in" ++#define omp_sched_monotonic 0x80000000U ++typedef enum omp_pause_resource_t ++{ ++ omp_pause_soft = 1, ++ omp_pause_hard = 2 ++} omp_pause_resource_t; ++extern int omp_pause_resource (omp_pause_resource_t, int) __GOMP_NOTHROW; ++extern int omp_pause_resource_all (omp_pause_resource_t) __GOMP_NOTHROW; + + #if !defined (HAVE_ATTRIBUTE_VISIBILITY) \ + || !defined (HAVE_ATTRIBUTE_ALIAS) \ +@@ -1082,16 +1158,26 @@ extern int gomp_test_nest_lock_25 (omp_n + # define attribute_hidden + #endif + ++#if __GNUC__ >= 9 ++# define HAVE_ATTRIBUTE_COPY ++#endif ++ ++#ifdef HAVE_ATTRIBUTE_COPY ++# define attribute_copy(arg) __attribute__ ((copy (arg))) ++#else ++# define attribute_copy(arg) ++#endif ++ + #ifdef HAVE_ATTRIBUTE_ALIAS + # define strong_alias(fn, al) \ +- extern __typeof (fn) al __attribute__ ((alias (#fn))); ++ extern __typeof (fn) al __attribute__ ((alias (#fn))) attribute_copy (fn); + + # define ialias_ulp ialias_str1(__USER_LABEL_PREFIX__) + # define ialias_str1(x) ialias_str2(x) + # define ialias_str2(x) #x + # define ialias(fn) \ + extern __typeof (fn) gomp_ialias_##fn \ +- __attribute__ ((alias (#fn))) attribute_hidden; ++ __attribute__ ((alias (#fn))) attribute_hidden attribute_copy (fn); + # define ialias_redirect(fn) \ + extern __typeof (fn) fn __asm__ (ialias_ulp "gomp_ialias_" #fn) attribute_hidden; + # define ialias_call(fn) gomp_ialias_ ## fn +@@ -1131,4 +1217,42 @@ task_to_priority_node (enum priority_que + return (struct priority_node *) ((char *) task + + priority_queue_offset (type)); + } ++ ++#ifdef LIBGOMP_USE_PTHREADS ++static inline gomp_thread_handle ++gomp_thread_self (void) ++{ ++ return pthread_self (); ++} ++ ++static inline gomp_thread_handle ++gomp_thread_to_pthread_t (struct gomp_thread *thr) ++{ ++ struct gomp_thread *this_thr = gomp_thread (); ++ if (thr == this_thr) ++ return pthread_self (); ++#ifdef GOMP_NEEDS_THREAD_HANDLE ++ return thr->handle; ++#else ++ /* On Linux with initial-exec TLS, the pthread_t of the thread containing ++ thr can be computed from thr, this_thr and pthread_self (), ++ as the distance between this_thr and pthread_self 
() is constant. */ ++ return pthread_self () + ((uintptr_t) thr - (uintptr_t) this_thr); ++#endif ++} ++#else ++static inline gomp_thread_handle ++gomp_thread_self (void) ++{ ++ return (gomp_thread_handle) {}; ++} ++ ++static inline gomp_thread_handle ++gomp_thread_to_pthread_t (struct gomp_thread *thr) ++{ ++ (void) thr; ++ return gomp_thread_self (); ++} ++#endif ++ + #endif /* LIBGOMP_H */ +--- libgomp/oacc-parallel.c.jj 2018-04-25 09:40:31.319655306 +0200 ++++ libgomp/oacc-parallel.c 2019-05-07 19:09:47.010991153 +0200 +@@ -27,6 +27,8 @@ + /* This file handles OpenACC constructs. */ + + #include "openacc.h" ++void acc_copyout_finalize (void *, size_t) __GOACC_NOTHROW; ++void acc_delete_finalize (void *, size_t) __GOACC_NOTHROW; + #include "libgomp.h" + #include "libgomp_g.h" + #include "gomp-constants.h" +@@ -38,31 +40,95 @@ + #include + #include + ++ ++/* In the ABI, the GOACC_FLAGs are encoded as an inverted bitmask, so that we ++ continue to support the following two legacy values. */ ++_Static_assert (GOACC_FLAGS_UNMARSHAL (GOMP_DEVICE_ICV) == 0, ++ "legacy GOMP_DEVICE_ICV broken"); ++_Static_assert (GOACC_FLAGS_UNMARSHAL (GOMP_DEVICE_HOST_FALLBACK) ++ == GOACC_FLAG_HOST_FALLBACK, ++ "legacy GOMP_DEVICE_HOST_FALLBACK broken"); ++ ++ ++/* Returns the number of mappings associated with the pointer or pset. PSETs ++ have three mappings, whereas pointers have two. */ ++ + static int +-find_pset (int pos, size_t mapnum, unsigned short *kinds) ++find_pointer (int pos, size_t mapnum, unsigned short *kinds) + { + if (pos + 1 >= mapnum) + return 0; + + unsigned char kind = kinds[pos+1] & 0xff; + +- return kind == GOMP_MAP_TO_PSET; ++ if (kind == GOMP_MAP_TO_PSET) ++ return 3; ++ else if (kind == GOMP_MAP_POINTER) ++ return 2; ++ ++ return 0; ++} ++ ++/* Handle the mapping pairs that are presented when a ++ deviceptr clause is used with Fortran. */ ++ ++static void ++handle_ftn_pointers (size_t mapnum, void **hostaddrs, size_t *sizes, ++ unsigned short *kinds) ++{ ++ int i; ++ ++ for (i = 0; i < mapnum; i++) ++ { ++ unsigned short kind1 = kinds[i] & 0xff; ++ ++ /* Handle Fortran deviceptr clause. */ ++ if (kind1 == GOMP_MAP_FORCE_DEVICEPTR) ++ { ++ unsigned short kind2; ++ ++ if (i < (signed)mapnum - 1) ++ kind2 = kinds[i + 1] & 0xff; ++ else ++ kind2 = 0xffff; ++ ++ if (sizes[i] == sizeof (void *)) ++ continue; ++ ++ /* At this point, we're dealing with a Fortran deviceptr. ++ If the next element is not what we're expecting, then ++ this is a case where the deviceptr variable was ++ not used within the region and the pointer was removed ++ by the gimplifier. */ ++ if (kind2 == GOMP_MAP_POINTER ++ && sizes[i + 1] == 0 ++ && hostaddrs[i] == *(void **)hostaddrs[i + 1]) ++ { ++ kinds[i+1] = kinds[i]; ++ sizes[i+1] = sizeof (void *); ++ } ++ ++ /* Invalidate the entry. */ ++ hostaddrs[i] = NULL; ++ } ++ } + } + + static void goacc_wait (int async, int num_waits, va_list *ap); + + +-/* Launch a possibly offloaded function on DEVICE. FN is the host fn ++/* Launch a possibly offloaded function with FLAGS. FN is the host fn + address. MAPNUM, HOSTADDRS, SIZES & KINDS describe the memory + blocks to be copied to/from the device. Variadic arguments are + keyed optional parameters terminated with a zero. */ + + void +-GOACC_parallel_keyed (int device, void (*fn) (void *), ++GOACC_parallel_keyed (int flags_m, void (*fn) (void *), + size_t mapnum, void **hostaddrs, size_t *sizes, + unsigned short *kinds, ...)
+ { +- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; ++ int flags = GOACC_FLAGS_UNMARSHAL (flags_m); ++ + va_list ap; + struct goacc_thread *thr; + struct gomp_device_descr *acc_dev; +@@ -88,9 +154,11 @@ GOACC_parallel_keyed (int device, void ( + thr = goacc_thread (); + acc_dev = thr->dev; + ++ handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds); ++ + /* Host fallback if "if" clause is false or if the current device is set to + the host. */ +- if (host_fallback) ++ if (flags & GOACC_FLAG_HOST_FALLBACK) + { + goacc_save_and_set_bind (acc_device_host); + fn (hostaddrs); +@@ -140,9 +208,7 @@ GOACC_parallel_keyed (int device, void ( + case GOMP_LAUNCH_WAIT: + { + unsigned num_waits = GOMP_LAUNCH_OP (tag); +- +- if (num_waits) +- goacc_wait (async, num_waits, &ap); ++ goacc_wait (async, num_waits, &ap); + break; + } + +@@ -177,16 +243,36 @@ GOACC_parallel_keyed (int device, void ( + devaddrs = gomp_alloca (sizeof (void *) * mapnum); + for (i = 0; i < mapnum; i++) + devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start +- + tgt->list[i].key->tgt_offset); ++ + tgt->list[i].key->tgt_offset ++ + tgt->list[i].offset); + + acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, + async, dims, tgt); + + /* If running synchronously, unmap immediately. */ +- if (async < acc_async_noval) ++ bool copyfrom = true; ++ if (async_synchronous_p (async)) + gomp_unmap_vars (tgt, true); + else +- tgt->device_descr->openacc.register_async_cleanup_func (tgt, async); ++ { ++ bool async_unmap = false; ++ for (size_t i = 0; i < tgt->list_count; i++) ++ { ++ splay_tree_key k = tgt->list[i].key; ++ if (k && k->refcount == 1) ++ { ++ async_unmap = true; ++ break; ++ } ++ } ++ if (async_unmap) ++ tgt->device_descr->openacc.register_async_cleanup_func (tgt, async); ++ else ++ { ++ copyfrom = false; ++ gomp_unmap_vars (tgt, copyfrom); ++ } ++ } + + acc_dev->openacc.async_set_async_func (acc_async_sync); + } +@@ -194,7 +280,7 @@ GOACC_parallel_keyed (int device, void ( + /* Legacy entry point, only provide host execution. */ + + void +-GOACC_parallel (int device, void (*fn) (void *), ++GOACC_parallel (int flags_m, void (*fn) (void *), + size_t mapnum, void **hostaddrs, size_t *sizes, + unsigned short *kinds, + int num_gangs, int num_workers, int vector_length, +@@ -206,10 +292,11 @@ GOACC_parallel (int device, void (*fn) ( + } + + void +-GOACC_data_start (int device, size_t mapnum, ++GOACC_data_start (int flags_m, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds) + { +- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; ++ int flags = GOACC_FLAGS_UNMARSHAL (flags_m); ++ + struct target_mem_desc *tgt; + + #ifdef HAVE_INTTYPES_H +@@ -227,7 +314,7 @@ GOACC_data_start (int device, size_t map + + /* Host fallback or 'do nothing'. */ + if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) +- || host_fallback) ++ || (flags & GOACC_FLAG_HOST_FALLBACK)) + { + tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true, + GOMP_MAP_VARS_OPENACC); +@@ -258,13 +345,14 @@ GOACC_data_end (void) + } + + void +-GOACC_enter_exit_data (int device, size_t mapnum, ++GOACC_enter_exit_data (int flags_m, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds, + int async, int num_waits, ...) 
{ ++ int flags = GOACC_FLAGS_UNMARSHAL (flags_m); ++ + struct goacc_thread *thr; + struct gomp_device_descr *acc_dev; +- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; + bool data_enter = false; + size_t i; + +@@ -274,7 +362,7 @@ GOACC_enter_exit_data (int device, size_ + acc_dev = thr->dev; + + if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) +- || host_fallback) ++ || (flags & GOACC_FLAG_HOST_FALLBACK)) + return; + + if (num_waits) +@@ -286,6 +374,17 @@ GOACC_enter_exit_data (int device, size_ + va_end (ap); + } + ++ /* Determine whether "finalize" semantics apply to all mappings of this ++ OpenACC directive. */ ++ bool finalize = false; ++ if (mapnum > 0) ++ { ++ unsigned char kind = kinds[0] & 0xff; ++ if (kind == GOMP_MAP_DELETE ++ || kind == GOMP_MAP_FORCE_FROM) ++ finalize = true; ++ } ++ + acc_dev->openacc.async_set_async_func (async); + + /* Determine if this is an "acc enter data". */ +@@ -298,13 +397,17 @@ GOACC_enter_exit_data (int device, size_ + + if (kind == GOMP_MAP_FORCE_ALLOC + || kind == GOMP_MAP_FORCE_PRESENT +- || kind == GOMP_MAP_FORCE_TO) ++ || kind == GOMP_MAP_FORCE_TO ++ || kind == GOMP_MAP_TO ++ || kind == GOMP_MAP_ALLOC) + { + data_enter = true; + break; + } + +- if (kind == GOMP_MAP_DELETE ++ if (kind == GOMP_MAP_RELEASE ++ || kind == GOMP_MAP_DELETE ++ || kind == GOMP_MAP_FROM + || kind == GOMP_MAP_FORCE_FROM) + break; + +@@ -312,31 +415,35 @@ GOACC_enter_exit_data (int device, size_ + kind); + } + ++ /* In C, non-pointers and arrays are represented by a single data clause. ++ Dynamically allocated arrays and subarrays are represented by a data ++ clause followed by an internal GOMP_MAP_POINTER. ++ ++ In Fortran, scalars and unallocated arrays are represented by a ++ single data clause. Allocated arrays and subarrays have three mappings: ++ 1) the original data clause, 2) a PSET, and 3) a pointer to the array data. ++ */ ++ + if (data_enter) + { + for (i = 0; i < mapnum; i++) + { + unsigned char kind = kinds[i] & 0xff; + +- /* Scan for PSETs. */ +- int psets = find_pset (i, mapnum, kinds); ++ /* Scan for pointers and PSETs. */ ++ int pointer = find_pointer (i, mapnum, kinds); + +- if (!psets) ++ if (!pointer) + { + switch (kind) + { +- case GOMP_MAP_POINTER: +- gomp_acc_insert_pointer (1, &hostaddrs[i], &sizes[i], +- &kinds[i]); +- break; ++ case GOMP_MAP_ALLOC: + case GOMP_MAP_FORCE_ALLOC: + acc_create (hostaddrs[i], sizes[i]); + break; +- case GOMP_MAP_FORCE_PRESENT: +- acc_present_or_copyin (hostaddrs[i], sizes[i]); +- break; ++ case GOMP_MAP_TO: + case GOMP_MAP_FORCE_TO: +- acc_present_or_copyin (hostaddrs[i], sizes[i]); ++ acc_copyin (hostaddrs[i], sizes[i]); + break; + default: + gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", +@@ -346,12 +453,13 @@ GOACC_enter_exit_data (int device, size_ + } + else + { +- gomp_acc_insert_pointer (3, &hostaddrs[i], &sizes[i], &kinds[i]); ++ gomp_acc_insert_pointer (pointer, &hostaddrs[i], ++ &sizes[i], &kinds[i]); + /* Increment 'i' by two because OpenACC requires Fortran + arrays to be contiguous, so each PSET is associated with + one of MAP_FORCE_ALLOC/MAP_FORCE_PRESENT/MAP_FORCE_TO, and + one MAP_POINTER.
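
The enter/exit loops step through kinds[] one mapping group at a time, using the value find_pointer returns. The stepping rule, restated in isolation as a sketch (only the GOMP_MAP_* constants are assumed, from the in-tree gomp-constants.h; map_group_len is a hypothetical helper, not library code):

#include <stddef.h>
#include "gomp-constants.h"	/* GOMP_MAP_TO_PSET, GOMP_MAP_POINTER.  */

/* Number of kinds[] slots one source-level data clause occupies: 3 for a
   Fortran allocated array (clause + PSET + pointer), 2 for a dynamically
   allocated array or subarray (clause + pointer), 1 for everything else.
   This is find_pointer's result, with 1 instead of 0 for the plain case.  */
static size_t
map_group_len (size_t pos, size_t mapnum, const unsigned short *kinds)
{
  if (pos + 1 < mapnum)
    {
      unsigned char next = kinds[pos + 1] & 0xff;
      if (next == GOMP_MAP_TO_PSET)
	return 3;
      if (next == GOMP_MAP_POINTER)
	return 2;
    }
  return 1;
}

/* A caller would then advance whole groups at a time:
   for (i = 0; i < mapnum; i += map_group_len (i, mapnum, kinds))
     ...
   which is what the "i += pointer - 1" below achieves inside a loop
   that already increments i once per iteration.  */
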
*/ +- i += 2; ++ i += pointer - 1; + } + } + } +@@ -360,22 +468,28 @@ GOACC_enter_exit_data (int device, size_ + { + unsigned char kind = kinds[i] & 0xff; + +- int psets = find_pset (i, mapnum, kinds); ++ int pointer = find_pointer (i, mapnum, kinds); + +- if (!psets) ++ if (!pointer) + { + switch (kind) + { +- case GOMP_MAP_POINTER: +- gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff) +- == GOMP_MAP_FORCE_FROM, +- async, 1); +- break; ++ case GOMP_MAP_RELEASE: + case GOMP_MAP_DELETE: +- acc_delete (hostaddrs[i], sizes[i]); ++ if (acc_is_present (hostaddrs[i], sizes[i])) ++ { ++ if (finalize) ++ acc_delete_finalize (hostaddrs[i], sizes[i]); ++ else ++ acc_delete (hostaddrs[i], sizes[i]); ++ } + break; ++ case GOMP_MAP_FROM: + case GOMP_MAP_FORCE_FROM: +- acc_copyout (hostaddrs[i], sizes[i]); ++ if (finalize) ++ acc_copyout_finalize (hostaddrs[i], sizes[i]); ++ else ++ acc_copyout (hostaddrs[i], sizes[i]); + break; + default: + gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", +@@ -385,10 +499,12 @@ GOACC_enter_exit_data (int device, size_ + } + else + { +- gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff) +- == GOMP_MAP_FORCE_FROM, async, 3); ++ bool copyfrom = (kind == GOMP_MAP_FORCE_FROM ++ || kind == GOMP_MAP_FROM); ++ gomp_acc_remove_pointer (hostaddrs[i], sizes[i], copyfrom, async, ++ finalize, pointer); + /* See the above comment. */ +- i += 2; ++ i += pointer - 1; + } + } + +@@ -398,13 +514,20 @@ GOACC_enter_exit_data (int device, size_ + static void + goacc_wait (int async, int num_waits, va_list *ap) + { +- struct goacc_thread *thr = goacc_thread (); +- struct gomp_device_descr *acc_dev = thr->dev; +- + while (num_waits--) + { + int qid = va_arg (*ap, int); +- ++ ++ /* Waiting on ACC_ASYNC_NOVAL maps to 'wait all'. */ ++ if (qid == acc_async_noval) ++ { ++ if (async == acc_async_sync) ++ acc_wait_all (); ++ else ++ acc_wait_all_async (async); ++ break; ++ } ++ + if (acc_async_test (qid)) + continue; + +@@ -415,16 +538,17 @@ goacc_wait (int async, int num_waits, va + launching on, the queue itself will order work as + required, so there's no need to wait explicitly. */ + else +- acc_dev->openacc.async_wait_async_func (qid, async); ++ acc_wait_async (qid, async); + } + } + + void +-GOACC_update (int device, size_t mapnum, ++GOACC_update (int flags_m, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds, + int async, int num_waits, ...) + { +- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; ++ int flags = GOACC_FLAGS_UNMARSHAL (flags_m); ++ + size_t i; + + goacc_lazy_initialize (); +@@ -433,7 +557,7 @@ GOACC_update (int device, size_t mapnum, + struct gomp_device_descr *acc_dev = thr->dev; + + if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) +- || host_fallback) ++ || (flags & GOACC_FLAG_HOST_FALLBACK)) + return; + + if (num_waits) +@@ -447,6 +571,7 @@ GOACC_update (int device, size_t mapnum, + + acc_dev->openacc.async_set_async_func (async); + ++ bool update_device = false; + for (i = 0; i < mapnum; ++i) + { + unsigned char kind = kinds[i] & 0xff; +@@ -457,11 +582,46 @@ GOACC_update (int device, size_t mapnum, + case GOMP_MAP_TO_PSET: + break; + ++ case GOMP_MAP_ALWAYS_POINTER: ++ if (update_device) ++ { ++ /* Save the contents of the host pointer. */ ++ void *dptr = acc_deviceptr (hostaddrs[i-1]); ++ uintptr_t t = *(uintptr_t *) hostaddrs[i]; ++ ++ /* Update the contents of the host pointer to reflect ++ the value of the allocated device memory in the ++ previous pointer. 
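
Spelled out against the public OpenACC API, the save/patch/restore sequence this comment describes looks roughly as follows. update_device_pointer is a hypothetical, simplified helper: it takes the device address of the pointer's own target, whereas the code below gets it from the preceding mapping (hostaddrs[i-1]). It is a sketch of the technique, not the implementation:

#include <openacc.h>

/* Make the device copy of the host pointer *HOSTPTR_SLOT point at the
   device mirror of its target: temporarily overwrite the host value with
   the device address, copy sizeof (void *) bytes to the device, then put
   the host value back.  Assumes the target is already mapped.  */
static void
update_device_pointer (void **hostptr_slot)
{
  void *saved = *hostptr_slot;
  *hostptr_slot = acc_deviceptr (saved);	/* Device address of target.  */
  acc_update_device (hostptr_slot, sizeof (void *));
  *hostptr_slot = saved;			/* Restore the host view.  */
}
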
*/ ++ *(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr; ++ acc_update_device (hostaddrs[i], sizeof (uintptr_t)); ++ ++ /* Restore the host pointer. */ ++ *(uintptr_t *) hostaddrs[i] = t; ++ update_device = false; ++ } ++ break; ++ ++ case GOMP_MAP_TO: ++ if (!acc_is_present (hostaddrs[i], sizes[i])) ++ { ++ update_device = false; ++ break; ++ } ++ /* Fallthru */ + case GOMP_MAP_FORCE_TO: ++ update_device = true; + acc_update_device (hostaddrs[i], sizes[i]); + break; + ++ case GOMP_MAP_FROM: ++ if (!acc_is_present (hostaddrs[i], sizes[i])) ++ { ++ update_device = false; ++ break; ++ } ++ /* Fallthru */ + case GOMP_MAP_FORCE_FROM: ++ update_device = false; + acc_update_self (hostaddrs[i], sizes[i]); + break; + +@@ -487,8 +647,8 @@ GOACC_wait (int async, int num_waits, .. + } + else if (async == acc_async_sync) + acc_wait_all (); +- else if (async == acc_async_noval) +- goacc_thread ()->dev->openacc.async_wait_all_async_func (acc_async_noval); ++ else ++ acc_wait_all_async (async); + } + + int +@@ -504,7 +664,7 @@ GOACC_get_thread_num (void) + } + + void +-GOACC_declare (int device, size_t mapnum, ++GOACC_declare (int flags_m, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds) + { + int i; +@@ -522,9 +682,10 @@ GOACC_declare (int device, size_t mapnum + case GOMP_MAP_FORCE_FROM: + case GOMP_MAP_FORCE_TO: + case GOMP_MAP_POINTER: ++ case GOMP_MAP_RELEASE: + case GOMP_MAP_DELETE: +- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], +- &kinds[i], 0, 0); ++ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], GOMP_ASYNC_SYNC, 0); + break; + + case GOMP_MAP_FORCE_DEVICEPTR: +@@ -532,20 +693,19 @@ GOACC_declare (int device, size_t mapnum + + case GOMP_MAP_ALLOC: + if (!acc_is_present (hostaddrs[i], sizes[i])) +- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], +- &kinds[i], 0, 0); ++ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], GOMP_ASYNC_SYNC, 0); + break; + + case GOMP_MAP_TO: +- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], +- &kinds[i], 0, 0); ++ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], GOMP_ASYNC_SYNC, 0); + + break; + + case GOMP_MAP_FROM: +- kinds[i] = GOMP_MAP_FORCE_FROM; +- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], +- &kinds[i], 0, 0); ++ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], GOMP_ASYNC_SYNC, 0); + break; + + case GOMP_MAP_FORCE_PRESENT: +--- libgomp/openacc2.f90.jj 2019-05-07 19:54:18.828514375 +0200 ++++ libgomp/openacc2.f90 2019-05-07 19:56:38.454296347 +0200 +@@ -0,0 +1,1502 @@ ++! OpenACC Runtime Library Definitions. ++ ++! Copyright (C) 2014-2019 Free Software Foundation, Inc. ++ ++! Contributed by Tobias Burnus ++! and Mentor Embedded. ++ ++! This file is part of the GNU Offloading and Multi Processing Library ++! (libgomp). ++ ++! Libgomp is free software; you can redistribute it and/or modify it ++! under the terms of the GNU General Public License as published by ++! the Free Software Foundation; either version 3, or (at your option) ++! any later version. ++ ++! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++! FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++! more details. ++ ++! Under Section 7 of GPL version 3, you are granted additional ++! permissions described in the GCC Runtime Library Exception, version ++! 
3.1, as published by the Free Software Foundation. ++ ++! You should have received a copy of the GNU General Public License and ++! a copy of the GCC Runtime Library Exception along with this program; ++! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++! <http://www.gnu.org/licenses/>. ++ ++module openacc_kinds2 ++ use iso_fortran_env, only: int32 ++ implicit none ++ ++ private :: int32 ++ public :: acc_device_kind ++ ++ integer, parameter :: acc_device_kind = int32 ++ ++ public :: acc_device_none, acc_device_default, acc_device_host ++ public :: acc_device_not_host, acc_device_nvidia ++ ++ ! Keep in sync with include/gomp-constants.h. ++ integer (acc_device_kind), parameter :: acc_device_none = 0 ++ integer (acc_device_kind), parameter :: acc_device_default = 1 ++ integer (acc_device_kind), parameter :: acc_device_host = 2 ++ ! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed. ++ integer (acc_device_kind), parameter :: acc_device_not_host = 4 ++ integer (acc_device_kind), parameter :: acc_device_nvidia = 5 ++ ++ public :: acc_handle_kind ++ ++ integer, parameter :: acc_handle_kind = int32 ++ ++ public :: acc_async_noval, acc_async_sync ++ ++ ! Keep in sync with include/gomp-constants.h. ++ integer (acc_handle_kind), parameter :: acc_async_noval = -1 ++ integer (acc_handle_kind), parameter :: acc_async_sync = -2 ++ ++end module ++ ++module openacc_internal2 ++ use openacc_kinds2 ++ implicit none ++ ++ interface ++ function acc_get_num_devices_h (d) ++ import ++ integer acc_get_num_devices_h ++ integer (acc_device_kind) d ++ end function ++ ++ subroutine acc_set_device_type_h (d) ++ import ++ integer (acc_device_kind) d ++ end subroutine ++ ++ function acc_get_device_type_h () ++ import ++ integer (acc_device_kind) acc_get_device_type_h ++ end function ++ ++ subroutine acc_set_device_num_h (n, d) ++ import ++ integer n ++ integer (acc_device_kind) d ++ end subroutine ++ ++ function acc_get_device_num_h (d) ++ import ++ integer acc_get_device_num_h ++ integer (acc_device_kind) d ++ end function ++ ++ function acc_async_test_h (a) ++ logical acc_async_test_h ++ integer a ++ end function ++ ++ function acc_async_test_all_h () ++ logical acc_async_test_all_h ++ end function ++ ++ subroutine acc_wait_h (a) ++ integer a ++ end subroutine ++ ++ subroutine acc_wait_async_h (a1, a2) ++ integer a1, a2 ++ end subroutine ++ ++ subroutine acc_wait_all_h () ++ end subroutine ++ ++ subroutine acc_wait_all_async_h (a) ++ integer a ++ end subroutine ++ ++ subroutine acc_init_h (d) ++ import ++ integer (acc_device_kind) d ++ end subroutine ++ ++ subroutine acc_shutdown_h (d) ++ import ++ integer (acc_device_kind) d ++ end subroutine ++ ++ function acc_on_device_h (d) ++ import ++ integer (acc_device_kind) d ++ logical acc_on_device_h ++ end function ++ ++ subroutine acc_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_copyin_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_present_or_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_present_or_copyin_64_h (a, len) ++ 
use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_present_or_copyin_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_create_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_present_or_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_present_or_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_present_or_create_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_copyout_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_copyout_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_copyout_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_copyout_finalize_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_copyout_finalize_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_copyout_finalize_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_delete_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_delete_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_delete_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_delete_finalize_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_delete_finalize_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_delete_finalize_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_update_device_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine 
acc_update_device_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_update_device_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_update_self_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_update_self_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_update_self_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ function acc_is_present_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ logical acc_is_present_32_h ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end function ++ ++ function acc_is_present_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ logical acc_is_present_64_h ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end function ++ ++ function acc_is_present_array_h (a) ++ logical acc_is_present_array_h ++ type (*), dimension (..), contiguous :: a ++ end function ++ ++ subroutine acc_copyin_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_copyin_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_copyin_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_create_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_create_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_create_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_copyout_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_copyout_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine 
acc_copyout_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_delete_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_delete_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_delete_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_device_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_device_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_device_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_self_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_self_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_self_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ end interface ++ ++ interface ++ function acc_get_num_devices_l (d) & ++ bind (C, name = "acc_get_num_devices") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_get_num_devices_l ++ integer (c_int), value :: d ++ end function ++ ++ subroutine acc_set_device_type_l (d) & ++ bind (C, name = "acc_set_device_type") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: d ++ end subroutine ++ ++ function acc_get_device_type_l () & ++ bind (C, name = "acc_get_device_type") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_get_device_type_l ++ end function ++ ++ subroutine acc_set_device_num_l (n, d) & ++ bind (C, name = "acc_set_device_num") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: n, d ++ end subroutine ++ ++ function acc_get_device_num_l (d) & ++ bind (C, name = "acc_get_device_num") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_get_device_num_l ++ integer (c_int), value :: d ++ end function ++ ++ function acc_async_test_l (a) & ++ bind (C, 
name = "acc_async_test") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_async_test_l ++ integer (c_int), value :: a ++ end function ++ ++ function acc_async_test_all_l () & ++ bind (C, name = "acc_async_test_all") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_async_test_all_l ++ end function ++ ++ subroutine acc_wait_l (a) & ++ bind (C, name = "acc_wait") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: a ++ end subroutine ++ ++ subroutine acc_wait_async_l (a1, a2) & ++ bind (C, name = "acc_wait_async") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: a1, a2 ++ end subroutine ++ ++ subroutine acc_wait_all_l () & ++ bind (C, name = "acc_wait_all") ++ use iso_c_binding, only: c_int ++ end subroutine ++ ++ subroutine acc_wait_all_async_l (a) & ++ bind (C, name = "acc_wait_all_async") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: a ++ end subroutine ++ ++ subroutine acc_init_l (d) & ++ bind (C, name = "acc_init") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: d ++ end subroutine ++ ++ subroutine acc_shutdown_l (d) & ++ bind (C, name = "acc_shutdown") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: d ++ end subroutine ++ ++ function acc_on_device_l (d) & ++ bind (C, name = "acc_on_device") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_on_device_l ++ integer (c_int), value :: d ++ end function ++ ++ subroutine acc_copyin_l (a, len) & ++ bind (C, name = "acc_copyin") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_present_or_copyin_l (a, len) & ++ bind (C, name = "acc_present_or_copyin") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_create_l (a, len) & ++ bind (C, name = "acc_create") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_present_or_create_l (a, len) & ++ bind (C, name = "acc_present_or_create") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_copyout_l (a, len) & ++ bind (C, name = "acc_copyout") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_copyout_finalize_l (a, len) & ++ bind (C, name = "acc_copyout_finalize") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_delete_l (a, len) & ++ bind (C, name = "acc_delete") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_delete_finalize_l (a, len) & ++ bind (C, name = "acc_delete_finalize") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_update_device_l (a, len) & ++ bind (C, name = "acc_update_device") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a 
++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_update_self_l (a, len) & ++ bind (C, name = "acc_update_self") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ function acc_is_present_l (a, len) & ++ bind (C, name = "acc_is_present") ++ use iso_c_binding, only: c_int32_t, c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ integer (c_int32_t) :: acc_is_present_l ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end function ++ ++ subroutine acc_copyin_async_l (a, len, async) & ++ bind (C, name = "acc_copyin_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ ++ subroutine acc_create_async_l (a, len, async) & ++ bind (C, name = "acc_create_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ ++ subroutine acc_copyout_async_l (a, len, async) & ++ bind (C, name = "acc_copyout_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ ++ subroutine acc_delete_async_l (a, len, async) & ++ bind (C, name = "acc_delete_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ ++ subroutine acc_update_device_async_l (a, len, async) & ++ bind (C, name = "acc_update_device_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ ++ subroutine acc_update_self_async_l (a, len, async) & ++ bind (C, name = "acc_update_self_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ end interface ++end module ++ ++module openacc2 ++ use openacc_kinds2 ++ use openacc_internal2 ++ implicit none ++ ++ public :: openacc_version ++ ++ public :: acc_get_num_devices, acc_set_device_type, acc_get_device_type ++ public :: acc_set_device_num, acc_get_device_num, acc_async_test ++ public :: acc_async_test_all ++ public :: acc_wait, acc_async_wait, acc_wait_async ++ public :: acc_wait_all, acc_async_wait_all, acc_wait_all_async ++ public :: acc_init, acc_shutdown, acc_on_device ++ public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create ++ public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete ++ public :: acc_update_device, acc_update_self, acc_is_present ++ public :: acc_copyin_async, acc_create_async, acc_copyout_async ++ public :: acc_delete_async, acc_update_device_async, acc_update_self_async ++ ++ integer, parameter :: openacc_version = 201306 ++ ++ interface acc_get_num_devices ++ procedure :: acc_get_num_devices_h ++ end interface ++ ++ interface acc_set_device_type ++ procedure :: acc_set_device_type_h ++ end interface ++ ++ interface acc_get_device_type ++ 
procedure :: acc_get_device_type_h ++ end interface ++ ++ interface acc_set_device_num ++ procedure :: acc_set_device_num_h ++ end interface ++ ++ interface acc_get_device_num ++ procedure :: acc_get_device_num_h ++ end interface ++ ++ interface acc_async_test ++ procedure :: acc_async_test_h ++ end interface ++ ++ interface acc_async_test_all ++ procedure :: acc_async_test_all_h ++ end interface ++ ++ interface acc_wait ++ procedure :: acc_wait_h ++ end interface ++ ++ ! acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. ++ interface acc_async_wait ++ procedure :: acc_wait_h ++ end interface ++ ++ interface acc_wait_async ++ procedure :: acc_wait_async_h ++ end interface ++ ++ interface acc_wait_all ++ procedure :: acc_wait_all_h ++ end interface ++ ++ ! acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all. ++ interface acc_async_wait_all ++ procedure :: acc_wait_all_h ++ end interface ++ ++ interface acc_wait_all_async ++ procedure :: acc_wait_all_async_h ++ end interface ++ ++ interface acc_init ++ procedure :: acc_init_h ++ end interface ++ ++ interface acc_shutdown ++ procedure :: acc_shutdown_h ++ end interface ++ ++ interface acc_on_device ++ procedure :: acc_on_device_h ++ end interface ++ ++ ! acc_malloc: Only available in C/C++ ++ ! acc_free: Only available in C/C++ ++ ++ ! As a vendor extension, the following code supports both 32-bit and 64-bit ++ ! arguments for "size"; the OpenACC standard only permits default-kind ++ ! integers, which are of kind 4 (i.e. 32 bits). ++ ! Additionally, the two-argument version also takes arrays as arguments, ++ ! and the one-argument version also scalars. Note that the code assumes ++ ! that the arrays are contiguous. ++ ++ interface acc_copyin ++ procedure :: acc_copyin_32_h ++ procedure :: acc_copyin_64_h ++ procedure :: acc_copyin_array_h ++ end interface ++ ++ interface acc_present_or_copyin ++ procedure :: acc_present_or_copyin_32_h ++ procedure :: acc_present_or_copyin_64_h ++ procedure :: acc_present_or_copyin_array_h ++ end interface ++ ++ interface acc_pcopyin ++ procedure :: acc_present_or_copyin_32_h ++ procedure :: acc_present_or_copyin_64_h ++ procedure :: acc_present_or_copyin_array_h ++ end interface ++ ++ interface acc_create ++ procedure :: acc_create_32_h ++ procedure :: acc_create_64_h ++ procedure :: acc_create_array_h ++ end interface ++ ++ interface acc_present_or_create ++ procedure :: acc_present_or_create_32_h ++ procedure :: acc_present_or_create_64_h ++ procedure :: acc_present_or_create_array_h ++ end interface ++ ++ interface acc_pcreate ++ procedure :: acc_present_or_create_32_h ++ procedure :: acc_present_or_create_64_h ++ procedure :: acc_present_or_create_array_h ++ end interface ++ ++ interface acc_copyout ++ procedure :: acc_copyout_32_h ++ procedure :: acc_copyout_64_h ++ procedure :: acc_copyout_array_h ++ end interface ++ ++ interface acc_copyout_finalize ++ procedure :: acc_copyout_finalize_32_h ++ procedure :: acc_copyout_finalize_64_h ++ procedure :: acc_copyout_finalize_array_h ++ end interface ++ ++ interface acc_delete ++ procedure :: acc_delete_32_h ++ procedure :: acc_delete_64_h ++ procedure :: acc_delete_array_h ++ end interface ++ ++ interface acc_delete_finalize ++ procedure :: acc_delete_finalize_32_h ++ procedure :: acc_delete_finalize_64_h ++ procedure :: acc_delete_finalize_array_h ++ end interface ++ ++ interface acc_update_device ++ procedure :: acc_update_device_32_h ++ procedure :: acc_update_device_64_h ++ procedure :: acc_update_device_array_h ++ end 
interface ++ ++ interface acc_update_self ++ procedure :: acc_update_self_32_h ++ procedure :: acc_update_self_64_h ++ procedure :: acc_update_self_array_h ++ end interface ++ ++ ! acc_map_data: Only available in C/C++ ++ ! acc_unmap_data: Only available in C/C++ ++ ! acc_deviceptr: Only available in C/C++ ++ ! acc_hostptr: Only available in C/C++ ++ ++ interface acc_is_present ++ procedure :: acc_is_present_32_h ++ procedure :: acc_is_present_64_h ++ procedure :: acc_is_present_array_h ++ end interface ++ ++ ! acc_memcpy_to_device: Only available in C/C++ ++ ! acc_memcpy_from_device: Only available in C/C++ ++ ++ interface acc_copyin_async ++ procedure :: acc_copyin_async_32_h ++ procedure :: acc_copyin_async_64_h ++ procedure :: acc_copyin_async_array_h ++ end interface ++ ++ interface acc_create_async ++ procedure :: acc_create_async_32_h ++ procedure :: acc_create_async_64_h ++ procedure :: acc_create_async_array_h ++ end interface ++ ++ interface acc_copyout_async ++ procedure :: acc_copyout_async_32_h ++ procedure :: acc_copyout_async_64_h ++ procedure :: acc_copyout_async_array_h ++ end interface ++ ++ interface acc_delete_async ++ procedure :: acc_delete_async_32_h ++ procedure :: acc_delete_async_64_h ++ procedure :: acc_delete_async_array_h ++ end interface ++ ++ interface acc_update_device_async ++ procedure :: acc_update_device_async_32_h ++ procedure :: acc_update_device_async_64_h ++ procedure :: acc_update_device_async_array_h ++ end interface ++ ++ interface acc_update_self_async ++ procedure :: acc_update_self_async_32_h ++ procedure :: acc_update_self_async_64_h ++ procedure :: acc_update_self_async_array_h ++ end interface ++ ++end module ++ ++function acc_get_num_devices_h (d) ++ use openacc_internal2, only: acc_get_num_devices_l ++ use openacc_kinds2 ++ integer acc_get_num_devices_h ++ integer (acc_device_kind) d ++ acc_get_num_devices_h = acc_get_num_devices_l (d) ++end function ++ ++subroutine acc_set_device_type_h (d) ++ use openacc_internal2, only: acc_set_device_type_l ++ use openacc_kinds2 ++ integer (acc_device_kind) d ++ call acc_set_device_type_l (d) ++end subroutine ++ ++function acc_get_device_type_h () ++ use openacc_internal2, only: acc_get_device_type_l ++ use openacc_kinds2 ++ integer (acc_device_kind) acc_get_device_type_h ++ acc_get_device_type_h = acc_get_device_type_l () ++end function ++ ++subroutine acc_set_device_num_h (n, d) ++ use openacc_internal2, only: acc_set_device_num_l ++ use openacc_kinds2 ++ integer n ++ integer (acc_device_kind) d ++ call acc_set_device_num_l (n, d) ++end subroutine ++ ++function acc_get_device_num_h (d) ++ use openacc_internal2, only: acc_get_device_num_l ++ use openacc_kinds2 ++ integer acc_get_device_num_h ++ integer (acc_device_kind) d ++ acc_get_device_num_h = acc_get_device_num_l (d) ++end function ++ ++function acc_async_test_h (a) ++ use openacc_internal2, only: acc_async_test_l ++ logical acc_async_test_h ++ integer a ++ if (acc_async_test_l (a) .eq. 1) then ++ acc_async_test_h = .TRUE. ++ else ++ acc_async_test_h = .FALSE. ++ end if ++end function ++ ++function acc_async_test_all_h () ++ use openacc_internal2, only: acc_async_test_all_l ++ logical acc_async_test_all_h ++ if (acc_async_test_all_l () .eq. 1) then ++ acc_async_test_all_h = .TRUE. ++ else ++ acc_async_test_all_h = .FALSE. 
++ end if ++end function ++ ++subroutine acc_wait_h (a) ++ use openacc_internal2, only: acc_wait_l ++ integer a ++ call acc_wait_l (a) ++end subroutine ++ ++subroutine acc_wait_async_h (a1, a2) ++ use openacc_internal2, only: acc_wait_async_l ++ integer a1, a2 ++ call acc_wait_async_l (a1, a2) ++end subroutine ++ ++subroutine acc_wait_all_h () ++ use openacc_internal2, only: acc_wait_all_l ++ call acc_wait_all_l () ++end subroutine ++ ++subroutine acc_wait_all_async_h (a) ++ use openacc_internal2, only: acc_wait_all_async_l ++ integer a ++ call acc_wait_all_async_l (a) ++end subroutine ++ ++subroutine acc_init_h (d) ++ use openacc_internal2, only: acc_init_l ++ use openacc_kinds2 ++ integer (acc_device_kind) d ++ call acc_init_l (d) ++end subroutine ++ ++subroutine acc_shutdown_h (d) ++ use openacc_internal2, only: acc_shutdown_l ++ use openacc_kinds2 ++ integer (acc_device_kind) d ++ call acc_shutdown_l (d) ++end subroutine ++ ++function acc_on_device_h (d) ++ use openacc_internal2, only: acc_on_device_l ++ use openacc_kinds2 ++ integer (acc_device_kind) d ++ logical acc_on_device_h ++ if (acc_on_device_l (d) .eq. 1) then ++ acc_on_device_h = .TRUE. ++ else ++ acc_on_device_h = .FALSE. ++ end if ++end function ++ ++subroutine acc_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_copyin_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_copyin_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyin_array_h (a) ++ use openacc_internal2, only: acc_copyin_l ++ type (*), dimension (..), contiguous :: a ++ call acc_copyin_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_present_or_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_present_or_copyin_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_present_or_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_present_or_copyin_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_present_or_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_copyin_array_h (a) ++ use openacc_internal2, only: acc_present_or_copyin_l ++ type (*), dimension (..), contiguous :: a ++ call acc_present_or_copyin_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_create_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_create_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_create_array_h (a) ++ use 
openacc_internal2, only: acc_create_l ++ type (*), dimension (..), contiguous :: a ++ call acc_create_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_present_or_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_present_or_create_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_present_or_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_present_or_create_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_present_or_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_create_array_h (a) ++ use openacc_internal2, only: acc_present_or_create_l ++ type (*), dimension (..), contiguous :: a ++ call acc_present_or_create_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_copyout_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_copyout_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_copyout_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyout_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_copyout_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_copyout_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyout_array_h (a) ++ use openacc_internal2, only: acc_copyout_l ++ type (*), dimension (..), contiguous :: a ++ call acc_copyout_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_copyout_finalize_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_copyout_finalize_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_copyout_finalize_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyout_finalize_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_copyout_finalize_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_copyout_finalize_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyout_finalize_array_h (a) ++ use openacc_internal2, only: acc_copyout_finalize_l ++ type (*), dimension (..), contiguous :: a ++ call acc_copyout_finalize_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_delete_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_delete_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_delete_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_delete_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_delete_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_delete_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_delete_array_h (a) ++ use openacc_internal2, only: acc_delete_l ++ type (*), dimension (..), contiguous :: a ++ call acc_delete_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_delete_finalize_32_h (a, len) ++ use iso_c_binding, only: 
c_int32_t, c_size_t ++ use openacc_internal2, only: acc_delete_finalize_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_delete_finalize_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_delete_finalize_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_delete_finalize_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_delete_finalize_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_delete_finalize_array_h (a) ++ use openacc_internal2, only: acc_delete_finalize_l ++ type (*), dimension (..), contiguous :: a ++ call acc_delete_finalize_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_update_device_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_update_device_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_update_device_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_update_device_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_update_device_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_update_device_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_update_device_array_h (a) ++ use openacc_internal2, only: acc_update_device_l ++ type (*), dimension (..), contiguous :: a ++ call acc_update_device_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_update_self_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_update_self_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_update_self_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_update_self_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_update_self_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_update_self_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_update_self_array_h (a) ++ use openacc_internal2, only: acc_update_self_l ++ type (*), dimension (..), contiguous :: a ++ call acc_update_self_l (a, sizeof (a)) ++end subroutine ++ ++function acc_is_present_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_is_present_l ++ logical acc_is_present_32_h ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then ++ acc_is_present_32_h = .TRUE. ++ else ++ acc_is_present_32_h = .FALSE. ++ end if ++end function ++ ++function acc_is_present_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_is_present_l ++ logical acc_is_present_64_h ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then ++ acc_is_present_64_h = .TRUE. ++ else ++ acc_is_present_64_h = .FALSE. 
++ end if ++end function ++ ++function acc_is_present_array_h (a) ++ use openacc_internal2, only: acc_is_present_l ++ logical acc_is_present_array_h ++ type (*), dimension (..), contiguous :: a ++ acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1 ++end function ++ ++subroutine acc_copyin_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_copyin_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_copyin_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: acc_copyin_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_copyin_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_copyin_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_copyin_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_create_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_create_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_create_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: acc_create_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_create_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_create_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_create_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_copyout_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_copyout_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_copyout_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: acc_copyout_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_copyout_async_l (a, int 
(len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_copyout_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_copyout_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_copyout_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_delete_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_delete_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_delete_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: acc_delete_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_delete_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_delete_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_delete_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_device_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_update_device_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_device_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: acc_update_device_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_device_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_update_device_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_update_device_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_self_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_update_self_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_self_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: 
acc_update_self_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_self_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_update_self_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_update_self_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine +--- libgomp/taskloop.c.jj 2018-04-25 09:40:31.913655581 +0200 ++++ libgomp/taskloop.c 2019-05-07 18:46:36.547109400 +0200 +@@ -149,11 +149,28 @@ GOMP_taskloop (void (*fn) (void *), void + + if (flags & GOMP_TASK_FLAG_NOGROUP) + { +- if (thr->task && thr->task->taskgroup && thr->task->taskgroup->cancelled) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) ++ && thr->task ++ && thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } + } + else +- ialias_call (GOMP_taskgroup_start) (); ++ { ++ ialias_call (GOMP_taskgroup_start) (); ++ if (flags & GOMP_TASK_FLAG_REDUCTION) ++ { ++ struct gomp_data_head { TYPE t1, t2; uintptr_t *ptr; }; ++ uintptr_t *ptr = ((struct gomp_data_head *) data)->ptr; ++ ialias_call (GOMP_taskgroup_reduction_register) (ptr); ++ } ++ } + + if (priority > gomp_max_task_priority_var) + priority = gomp_max_task_priority_var; +@@ -284,19 +301,31 @@ GOMP_taskloop (void (*fn) (void *), void + gomp_mutex_lock (&team->task_lock); + /* If parallel or taskgroup has been cancelled, don't start new + tasks. 
*/ +- if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier) +- || (taskgroup && taskgroup->cancelled)) +- && cpyfn == NULL, 0)) ++ if (__builtin_expect (gomp_cancel_var, 0) ++ && cpyfn == NULL) + { +- gomp_mutex_unlock (&team->task_lock); +- for (i = 0; i < num_tasks; i++) ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ { ++ do_cancel: ++ gomp_mutex_unlock (&team->task_lock); ++ for (i = 0; i < num_tasks; i++) ++ { ++ gomp_finish_task (tasks[i]); ++ free (tasks[i]); ++ } ++ if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) ++ ialias_call (GOMP_taskgroup_end) (); ++ return; ++ } ++ if (taskgroup) + { +- gomp_finish_task (tasks[i]); +- free (tasks[i]); ++ if (taskgroup->cancelled) ++ goto do_cancel; ++ if (taskgroup->workshare ++ && taskgroup->prev ++ && taskgroup->prev->cancelled) ++ goto do_cancel; + } +- if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) +- ialias_call (GOMP_taskgroup_end) (); +- return; + } + if (taskgroup) + taskgroup->num_children += num_tasks; +--- libgomp/parallel.c.jj 2018-04-25 09:40:31.926655587 +0200 ++++ libgomp/parallel.c 2019-05-07 18:46:36.532109640 +0200 +@@ -123,7 +123,8 @@ void + GOMP_parallel_start (void (*fn) (void *), void *data, unsigned num_threads) + { + num_threads = gomp_resolve_num_threads (num_threads, 0); +- gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads)); ++ gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads), ++ NULL); + } + + void +@@ -161,14 +162,33 @@ GOMP_parallel_end (void) + ialias (GOMP_parallel_end) + + void +-GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, unsigned int flags) ++GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, ++ unsigned int flags) + { + num_threads = gomp_resolve_num_threads (num_threads, 0); +- gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads)); ++ gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads), ++ NULL); + fn (data); + ialias_call (GOMP_parallel_end) (); + } + ++unsigned ++GOMP_parallel_reductions (void (*fn) (void *), void *data, ++ unsigned num_threads, unsigned int flags) ++{ ++ struct gomp_taskgroup *taskgroup; ++ num_threads = gomp_resolve_num_threads (num_threads, 0); ++ uintptr_t *rdata = *(uintptr_t **)data; ++ taskgroup = gomp_parallel_reduction_register (rdata, num_threads); ++ gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads), ++ taskgroup); ++ fn (data); ++ ialias_call (GOMP_parallel_end) (); ++ gomp_sem_destroy (&taskgroup->taskgroup_sem); ++ free (taskgroup); ++ return num_threads; ++} ++ + bool + GOMP_cancellation_point (int which) + { +@@ -185,8 +205,15 @@ GOMP_cancellation_point (int which) + } + else if (which & GOMP_CANCEL_TASKGROUP) + { +- if (thr->task->taskgroup && thr->task->taskgroup->cancelled) +- return true; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return true; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return true; ++ } + /* FALLTHRU into the GOMP_CANCEL_PARALLEL case, + as #pragma omp cancel parallel also cancels all explicit + tasks. 
*/ +@@ -218,11 +245,17 @@ GOMP_cancel (int which, bool do_cancel) + } + else if (which & GOMP_CANCEL_TASKGROUP) + { +- if (thr->task->taskgroup && !thr->task->taskgroup->cancelled) ++ if (thr->task->taskgroup) + { +- gomp_mutex_lock (&team->task_lock); +- thr->task->taskgroup->cancelled = true; +- gomp_mutex_unlock (&team->task_lock); ++ struct gomp_taskgroup *taskgroup = thr->task->taskgroup; ++ if (taskgroup->workshare && taskgroup->prev) ++ taskgroup = taskgroup->prev; ++ if (!taskgroup->cancelled) ++ { ++ gomp_mutex_lock (&team->task_lock); ++ taskgroup->cancelled = true; ++ gomp_mutex_unlock (&team->task_lock); ++ } + } + return true; + } +--- libgomp/oacc-plugin.h.jj 2018-04-25 09:40:31.322655307 +0200 ++++ libgomp/oacc-plugin.h 2019-05-07 18:46:36.531109656 +0200 +@@ -29,5 +29,6 @@ + + extern void GOMP_PLUGIN_async_unmap_vars (void *, int); + extern void *GOMP_PLUGIN_acc_thread (void); ++extern int GOMP_PLUGIN_acc_default_dim (unsigned int); + + #endif +--- libgomp/target.c.jj 2018-04-25 09:40:31.912655580 +0200 ++++ libgomp/target.c 2019-05-07 19:07:21.032306327 +0200 +@@ -180,16 +180,22 @@ gomp_device_copy (struct gomp_device_des + /* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses) + host to device memory transfers. */ + ++struct gomp_coalesce_chunk ++{ ++ /* The starting and ending point of a coalesced chunk of memory. */ ++ size_t start, end; ++}; ++ + struct gomp_coalesce_buf + { + /* Buffer into which gomp_copy_host2dev will memcpy data and from which + it will be copied to the device. */ + void *buf; + struct target_mem_desc *tgt; +- /* Array with offsets, chunks[2 * i] is the starting offset and +- chunks[2 * i + 1] ending offset relative to tgt->tgt_start device address ++ /* Array with offsets, chunks[i].start is the starting offset and ++ chunks[i].end ending offset relative to tgt->tgt_start device address + of chunks which are to be copied to buf and later copied to device. */ +- size_t *chunks; ++ struct gomp_coalesce_chunk *chunks; + /* Number of chunks in chunks array, or -1 if coalesce buffering should not + be performed. 
*/ + long chunk_cnt; +@@ -222,14 +228,14 @@ gomp_coalesce_buf_add (struct gomp_coale + { + if (cbuf->chunk_cnt < 0) + return; +- if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1]) ++ if (start < cbuf->chunks[cbuf->chunk_cnt - 1].end) + { + cbuf->chunk_cnt = -1; + return; + } +- if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1] + MAX_COALESCE_BUF_GAP) ++ if (start < cbuf->chunks[cbuf->chunk_cnt - 1].end + MAX_COALESCE_BUF_GAP) + { +- cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len; ++ cbuf->chunks[cbuf->chunk_cnt - 1].end = start + len; + cbuf->use_cnt++; + return; + } +@@ -239,8 +245,8 @@ gomp_coalesce_buf_add (struct gomp_coale + if (cbuf->use_cnt == 1) + cbuf->chunk_cnt--; + } +- cbuf->chunks[2 * cbuf->chunk_cnt] = start; +- cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len; ++ cbuf->chunks[cbuf->chunk_cnt].start = start; ++ cbuf->chunks[cbuf->chunk_cnt].end = start + len; + cbuf->chunk_cnt++; + cbuf->use_cnt = 1; + } +@@ -271,20 +277,20 @@ gomp_copy_host2dev (struct gomp_device_d + if (cbuf) + { + uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start; +- if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1]) ++ if (doff < cbuf->chunks[cbuf->chunk_cnt - 1].end) + { + long first = 0; + long last = cbuf->chunk_cnt - 1; + while (first <= last) + { + long middle = (first + last) >> 1; +- if (cbuf->chunks[2 * middle + 1] <= doff) ++ if (cbuf->chunks[middle].end <= doff) + first = middle + 1; +- else if (cbuf->chunks[2 * middle] <= doff) ++ else if (cbuf->chunks[middle].start <= doff) + { +- if (doff + sz > cbuf->chunks[2 * middle + 1]) ++ if (doff + sz > cbuf->chunks[middle].end) + gomp_fatal ("internal libgomp cbuf error"); +- memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]), ++ memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0].start), + h, sz); + return; + } +@@ -510,8 +516,8 @@ gomp_map_vars (struct gomp_device_descr + cbuf.buf = NULL; + if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET) + { +- cbuf.chunks +- = (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t)); ++ size_t chunks_size = (mapnum + 1) * sizeof (struct gomp_coalesce_chunk); ++ cbuf.chunks = (struct gomp_coalesce_chunk *) gomp_alloca (chunks_size); + cbuf.chunk_cnt = 0; + } + if (pragma_kind == GOMP_MAP_VARS_TARGET) +@@ -521,8 +527,8 @@ gomp_map_vars (struct gomp_device_descr + tgt_size = mapnum * sizeof (void *); + cbuf.chunk_cnt = 1; + cbuf.use_cnt = 1 + (mapnum > 1); +- cbuf.chunks[0] = 0; +- cbuf.chunks[1] = tgt_size; ++ cbuf.chunks[0].start = 0; ++ cbuf.chunks[0].end = tgt_size; + } + + gomp_mutex_lock (&devicep->lock); +@@ -707,7 +713,7 @@ gomp_map_vars (struct gomp_device_descr + if (cbuf.chunk_cnt > 0) + { + cbuf.buf +- = malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]); ++ = malloc (cbuf.chunks[cbuf.chunk_cnt - 1].end - cbuf.chunks[0].start); + if (cbuf.buf) + { + cbuf.tgt = tgt; +@@ -859,6 +865,7 @@ gomp_map_vars (struct gomp_device_descr + tgt->list[i].offset = 0; + tgt->list[i].length = k->host_end - k->host_start; + k->refcount = 1; ++ k->dynamic_refcount = 0; + tgt->refcount++; + array->left = NULL; + array->right = NULL; +@@ -956,9 +963,10 @@ gomp_map_vars (struct gomp_device_descr + /* Set link pointer on target to the device address of the + mapped object. */ + void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset); +- devicep->host2dev_func (devicep->target_id, +- (void *) n->tgt_offset, +- &tgt_addr, sizeof (void *)); ++ /* We intentionally do not use coalescing here, as it's not ++ data allocated by the current call to this function. 
*/ ++ gomp_copy_host2dev (devicep, (void *) n->tgt_offset, ++ &tgt_addr, sizeof (void *), NULL); + } + array++; + } +@@ -981,10 +989,14 @@ gomp_map_vars (struct gomp_device_descr + { + long c = 0; + for (c = 0; c < cbuf.chunk_cnt; ++c) +- gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cbuf.chunks[2 * c]), +- (char *) cbuf.buf + (cbuf.chunks[2 * c] - cbuf.chunks[0]), +- cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL); ++ gomp_copy_host2dev (devicep, ++ (void *) (tgt->tgt_start + cbuf.chunks[c].start), ++ (char *) cbuf.buf + (cbuf.chunks[c].start ++ - cbuf.chunks[0].start), ++ cbuf.chunks[c].end - cbuf.chunks[c].start, NULL); + free (cbuf.buf); ++ cbuf.buf = NULL; ++ cbufp = NULL; + } + + /* If the variable from "omp target enter data" map-list was already mapped, +@@ -1011,6 +1023,23 @@ gomp_unmap_tgt (struct target_mem_desc * + free (tgt); + } + ++attribute_hidden bool ++gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k) ++{ ++ bool is_tgt_unmapped = false; ++ splay_tree_remove (&devicep->mem_map, k); ++ if (k->link_key) ++ splay_tree_insert (&devicep->mem_map, (splay_tree_node) k->link_key); ++ if (k->tgt->refcount > 1) ++ k->tgt->refcount--; ++ else ++ { ++ is_tgt_unmapped = true; ++ gomp_unmap_tgt (k->tgt); ++ } ++ return is_tgt_unmapped; ++} ++ + /* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant + variables back from device to host: if it is false, it is assumed that this + has been done already. */ +@@ -1059,16 +1088,7 @@ gomp_unmap_vars (struct target_mem_desc + + tgt->list[i].offset), + tgt->list[i].length); + if (do_unmap) +- { +- splay_tree_remove (&devicep->mem_map, k); +- if (k->link_key) +- splay_tree_insert (&devicep->mem_map, +- (splay_tree_node) k->link_key); +- if (k->tgt->refcount > 1) +- k->tgt->refcount--; +- else +- gomp_unmap_tgt (k->tgt); +- } ++ gomp_remove_var (devicep, k); + } + + if (tgt->refcount > 1) +@@ -1298,17 +1318,7 @@ gomp_unload_image_from_device (struct go + else + { + splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &k); +- splay_tree_remove (&devicep->mem_map, n); +- if (n->link_key) +- { +- if (n->tgt->refcount > 1) +- n->tgt->refcount--; +- else +- { +- is_tgt_unmapped = true; +- gomp_unmap_tgt (n->tgt); +- } +- } ++ is_tgt_unmapped = gomp_remove_var (devicep, n); + } + } + +@@ -1855,11 +1865,20 @@ GOMP_target_update_ext (int device, size + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new + tasks. */ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup +- && thr->task->taskgroup->cancelled))) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } + + gomp_task_maybe_wait_for_dependencies (depend); + } +@@ -1874,10 +1893,20 @@ GOMP_target_update_ext (int device, size + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new tasks. 
*/ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } + + gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, true); + } +@@ -1986,11 +2015,20 @@ GOMP_target_enter_exit_data (int device, + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new + tasks. */ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup +- && thr->task->taskgroup->cancelled))) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } + + gomp_task_maybe_wait_for_dependencies (depend); + } +@@ -2005,10 +2043,20 @@ GOMP_target_enter_exit_data (int device, + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } + + size_t i; + if ((flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0) +@@ -2197,8 +2245,9 @@ omp_target_is_present (void *ptr, int de + } + + int +-omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset, +- size_t src_offset, int dst_device_num, int src_device_num) ++omp_target_memcpy (void *dst, void *src, size_t length, ++ size_t dst_offset, size_t src_offset, int dst_device_num, ++ int src_device_num) + { + struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL; + bool ret; +@@ -2287,21 +2336,25 @@ omp_target_memcpy_rect_worker (void *dst + return EINVAL; + if (dst_devicep == NULL && src_devicep == NULL) + { +- memcpy ((char *) dst + dst_off, (char *) src + src_off, length); ++ memcpy ((char *) dst + dst_off, (char *) src + src_off, ++ length); + ret = 1; + } + else if (src_devicep == NULL) + ret = dst_devicep->host2dev_func (dst_devicep->target_id, + (char *) dst + dst_off, +- (char *) src + src_off, length); ++ (char *) src + src_off, ++ length); + else if (dst_devicep == NULL) + ret = src_devicep->dev2host_func (src_devicep->target_id, + (char *) dst + dst_off, +- (char *) src + src_off, length); ++ (char *) src + src_off, ++ length); + else if (src_devicep == dst_devicep) + ret = src_devicep->dev2dev_func (src_devicep->target_id, + (char *) dst + dst_off, +- (char *) src + src_off, length); ++ (char *) src + src_off, ++ length); + else + ret = 0; + return ret ? 
0 : EINVAL; +@@ -2396,8 +2449,8 @@ omp_target_memcpy_rect (void *dst, void + } + + int +-omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size, +- size_t device_offset, int device_num) ++omp_target_associate_ptr (void *host_ptr, void *device_ptr, ++ size_t size, size_t device_offset, int device_num) + { + if (device_num == GOMP_DEVICE_HOST_FALLBACK) + return EINVAL; +@@ -2499,6 +2552,31 @@ omp_target_disassociate_ptr (void *ptr, + return ret; + } + ++int ++omp_pause_resource (omp_pause_resource_t kind, int device_num) ++{ ++ (void) kind; ++ if (device_num == GOMP_DEVICE_HOST_FALLBACK) ++ return gomp_pause_host (); ++ if (device_num < 0 || device_num >= gomp_get_num_devices ()) ++ return -1; ++ /* Do nothing for target devices for now. */ ++ return 0; ++} ++ ++int ++omp_pause_resource_all (omp_pause_resource_t kind) ++{ ++ (void) kind; ++ if (gomp_pause_host ()) ++ return -1; ++ /* Do nothing for target devices for now. */ ++ return 0; ++} ++ ++ialias (omp_pause_resource) ++ialias (omp_pause_resource_all) ++ + #ifdef PLUGIN_SUPPORT + + /* This function tries to load a plugin for DEVICE. Name of plugin is passed +@@ -2632,9 +2710,9 @@ gomp_target_fini (void) + } + } + +-/* This function initializes the runtime needed for offloading. +- It parses the list of offload targets and tries to load the plugins for +- these targets. On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP ++/* This function initializes the runtime for offloading. ++ It parses the list of offload plugins, and tries to load these. ++ On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP + will be set, and the array DEVICES initialized, containing descriptors for + corresponding devices, first the GOMP_OFFLOAD_CAP_OPENMP_400 ones, follows + by the others. */ +@@ -2651,7 +2729,7 @@ gomp_target_init (void) + num_devices = 0; + devices = NULL; + +- cur = OFFLOAD_TARGETS; ++ cur = OFFLOAD_PLUGINS; + if (*cur) + do + { +--- libgomp/ordered.c.jj 2018-04-25 09:40:31.926655587 +0200 ++++ libgomp/ordered.c 2019-05-07 18:46:36.532109640 +0200 +@@ -259,7 +259,8 @@ GOMP_ordered_end (void) + #define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__) + + void +-gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size) ++gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size, ++ size_t extra) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; +@@ -269,13 +270,24 @@ gomp_doacross_init (unsigned ncounts, lo + struct gomp_doacross_work_share *doacross; + + if (team == NULL || team->nthreads == 1) +- return; ++ { ++ empty: ++ if (!extra) ++ ws->doacross = NULL; ++ else ++ { ++ doacross = gomp_malloc_cleared (sizeof (*doacross) + extra); ++ doacross->extra = (void *) (doacross + 1); ++ ws->doacross = doacross; ++ } ++ return; ++ } + + for (i = 0; i < ncounts; i++) + { + /* If any count is 0, GOMP_doacross_{post,wait} can't be called. 
*/ + if (counts[i] == 0) +- return; ++ goto empty; + + if (num_bits <= MAX_COLLAPSED_BITS) + { +@@ -314,7 +326,7 @@ gomp_doacross_init (unsigned ncounts, lo + elt_sz = (elt_sz + 63) & ~63UL; + + doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz +- + shift_sz); ++ + shift_sz + extra); + doacross->chunk_size = chunk_size; + doacross->elt_sz = elt_sz; + doacross->ncounts = ncounts; +@@ -322,6 +334,13 @@ gomp_doacross_init (unsigned ncounts, lo + doacross->array = (unsigned char *) + ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) + & ~(uintptr_t) 63); ++ if (extra) ++ { ++ doacross->extra = doacross->array + num_ents * elt_sz; ++ memset (doacross->extra, '\0', extra); ++ } ++ else ++ doacross->extra = NULL; + if (num_bits <= MAX_COLLAPSED_BITS) + { + unsigned int shift_count = 0; +@@ -360,7 +379,8 @@ GOMP_doacross_post (long *counts) + unsigned long ent; + unsigned int i; + +- if (__builtin_expect (doacross == NULL, 0)) ++ if (__builtin_expect (doacross == NULL, 0) ++ || __builtin_expect (doacross->array == NULL, 0)) + { + __sync_synchronize (); + return; +@@ -411,7 +431,8 @@ GOMP_doacross_wait (long first, ...) + unsigned long ent; + unsigned int i; + +- if (__builtin_expect (doacross == NULL, 0)) ++ if (__builtin_expect (doacross == NULL, 0) ++ || __builtin_expect (doacross->array == NULL, 0)) + { + __sync_synchronize (); + return; +@@ -488,7 +509,8 @@ GOMP_doacross_wait (long first, ...) + typedef unsigned long long gomp_ull; + + void +-gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size) ++gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, ++ gomp_ull chunk_size, size_t extra) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; +@@ -498,13 +520,24 @@ gomp_doacross_ull_init (unsigned ncounts + struct gomp_doacross_work_share *doacross; + + if (team == NULL || team->nthreads == 1) +- return; ++ { ++ empty: ++ if (!extra) ++ ws->doacross = NULL; ++ else ++ { ++ doacross = gomp_malloc_cleared (sizeof (*doacross) + extra); ++ doacross->extra = (void *) (doacross + 1); ++ ws->doacross = doacross; ++ } ++ return; ++ } + + for (i = 0; i < ncounts; i++) + { + /* If any count is 0, GOMP_doacross_{post,wait} can't be called. 
*/
+       if (counts[i] == 0)
+-	return;
++	goto empty;
+ 
+       if (num_bits <= MAX_COLLAPSED_BITS)
+	{
+@@ -557,6 +590,13 @@ gomp_doacross_ull_init (unsigned ncounts
+   doacross->array = (unsigned char *)
+		     ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
+		      & ~(uintptr_t) 63);
++  if (extra)
++    {
++      doacross->extra = doacross->array + num_ents * elt_sz;
++      memset (doacross->extra, '\0', extra);
++    }
++  else
++    doacross->extra = NULL;
+   if (num_bits <= MAX_COLLAPSED_BITS)
+     {
+       unsigned int shift_count = 0;
+@@ -595,7 +635,8 @@ GOMP_doacross_ull_post (gomp_ull *counts
+   unsigned long ent;
+   unsigned int i;
+ 
+-  if (__builtin_expect (doacross == NULL, 0))
++  if (__builtin_expect (doacross == NULL, 0)
++      || __builtin_expect (doacross->array == NULL, 0))
+     {
+       __sync_synchronize ();
+       return;
+@@ -667,7 +708,8 @@ GOMP_doacross_ull_wait (gomp_ull first,
+   unsigned long ent;
+   unsigned int i;
+ 
+-  if (__builtin_expect (doacross == NULL, 0))
++  if (__builtin_expect (doacross == NULL, 0)
++      || __builtin_expect (doacross->array == NULL, 0))
+     {
+       __sync_synchronize ();
+       return;
+--- libgomp/alloc.c.jj	2018-04-25 09:40:31.926655587 +0200
++++ libgomp/alloc.c	2019-05-07 18:46:36.336112770 +0200
+@@ -57,3 +57,50 @@ gomp_realloc (void *old, size_t size)
+     gomp_fatal ("Out of memory allocating %lu bytes", (unsigned long) size);
+   return ret;
+ }
++
++void *
++gomp_aligned_alloc (size_t al, size_t size)
++{
++  void *ret;
++  if (al < sizeof (void *))
++    al = sizeof (void *);
++#ifdef HAVE_ALIGNED_ALLOC
++  ret = aligned_alloc (al, size);
++#elif defined(HAVE__ALIGNED_MALLOC)
++  ret = _aligned_malloc (size, al);
++#elif defined(HAVE_POSIX_MEMALIGN)
++  if (posix_memalign (&ret, al, size) != 0)
++    ret = NULL;
++#elif defined(HAVE_MEMALIGN)
++  {
++    extern void *memalign (size_t, size_t);
++    ret = memalign (al, size);
++  }
++#else
++  ret = NULL;
++  if ((al & (al - 1)) == 0 && size)
++    {
++      void *p = malloc (size + al);
++      if (p)
++	{
++	  void *ap = (void *) (((uintptr_t) p + al) & -al);
++	  ((void **) ap)[-1] = p;
++	  ret = ap;
++	}
++    }
++#endif
++  if (ret == NULL)
++    gomp_fatal ("Out of memory allocating %lu bytes", (unsigned long) size);
++  return ret;
++}
++
++void
++gomp_aligned_free (void *ptr)
++{
++#ifdef GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC
++  free (ptr);
++#else
++  if (ptr)
++    free (((void **) ptr)[-1]);
++#endif
++}
+--- libgomp/configure.ac.jj	2018-04-25 09:40:31.321655307 +0200
++++ libgomp/configure.ac	2019-05-07 18:46:36.471110614 +0200
+@@ -219,6 +219,7 @@ m4_include([plugin/configfrag.ac])
+ 
+ # Check for functions needed.
+ AC_CHECK_FUNCS(getloadavg clock_gettime strtoull)
++AC_CHECK_FUNCS(aligned_alloc posix_memalign memalign _aligned_malloc)
+ 
+ # Check for broken semaphore implementation on darwin.
+ # sem_init returns: sem_init error: Function not implemented.
+@@ -266,6 +267,41 @@ if test $ac_cv_func_clock_gettime = no;
+    [Define to 1 if you have the `clock_gettime' function.])])
+ fi
+ 
++# Check for uname.
++AC_COMPILE_IFELSE(
++ [AC_LANG_PROGRAM(
++   [#include <string.h>
++    #include <stdlib.h>
++    #include <sys/utsname.h>],
++   [struct utsname buf;
++    volatile size_t len = 0;
++    if (!uname (buf))
++      len = strlen (buf.nodename);])],
++  AC_DEFINE(HAVE_UNAME, 1,
++[  Define if uname is supported and struct utsname has nodename field.]))
++
++# Check for gethostname.
++AC_COMPILE_IFELSE(
++ [AC_LANG_PROGRAM(
++   [#include <unistd.h>],
++   [
++changequote(,)dnl
++    char buf[256];
++    if (gethostname (buf, sizeof (buf) - 1) == 0)
++      buf[255] = '\0';
++changequote([,])dnl
++   ])],
++  AC_DEFINE(HAVE_GETHOSTNAME, 1,
++[  Define if gethostname is supported.]))
++
++# Check for getpid.
++AC_COMPILE_IFELSE(
++ [AC_LANG_PROGRAM(
++   [#include <unistd.h>],
++   [int pid = getpid ();])],
++  AC_DEFINE(HAVE_GETPID, 1,
++[  Define if getpid is supported.]))
++
+ # See if we support thread-local storage.
+ GCC_CHECK_TLS
+ 
+--- libgomp/icv.c.jj	2018-04-25 09:40:31.870655561 +0200
++++ libgomp/icv.c	2019-05-07 18:46:36.501110134 +0200
+@@ -69,7 +69,7 @@ void
+ omp_set_schedule (omp_sched_t kind, int chunk_size)
+ {
+   struct gomp_task_icv *icv = gomp_icv (true);
+-  switch (kind)
++  switch (kind & ~omp_sched_monotonic)
+     {
+     case omp_sched_static:
+       if (chunk_size < 1)
+--- libgomp/configure.jj	2018-04-25 09:40:31.913655581 +0200
++++ libgomp/configure	2019-05-07 18:47:37.961128420 +0200
+@@ -636,6 +636,8 @@ PLUGIN_NVPTX_FALSE
+ PLUGIN_NVPTX_TRUE
+ offload_additional_lib_paths
+ offload_additional_options
++offload_targets
++offload_plugins
+ PLUGIN_HSA_LIBS
+ PLUGIN_HSA_LDFLAGS
+ PLUGIN_HSA_CPPFLAGS
+@@ -648,7 +650,6 @@ PLUGIN_NVPTX_CPPFLAGS
+ PLUGIN_NVPTX
+ CUDA_DRIVER_LIB
+ CUDA_DRIVER_INCLUDE
+-offload_targets
+ libtool_VERSION
+ ac_ct_FC
+ FCFLAGS
+@@ -11157,7 +11158,7 @@ else
+   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+   lt_status=$lt_dlunknown
+   cat > conftest.$ac_ext <<_LT_EOF
+-#line 11160 "configure"
++#line 11161 "configure"
+ #include "confdefs.h"
+ 
+ #if HAVE_DLFCN_H
+@@ -11263,7 +11264,7 @@ else
+   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+   lt_status=$lt_dlunknown
+   cat > conftest.$ac_ext <<_LT_EOF
+-#line 11266 "configure"
++#line 11267 "configure"
+ #include "confdefs.h"
+ 
+ #if HAVE_DLFCN_H
+@@ -15167,8 +15168,6 @@ fi
+ # see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ # <http://www.gnu.org/licenses/>.
+ 
+-offload_targets=
+ 
+ plugin_support=yes
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dlsym in -ldl" >&5
+ $as_echo_n "checking for dlsym in -ldl... " >&6; }
+@@ -15302,7 +15301,11 @@ if test "${with_cuda_driver_lib+set}" =
+ fi
+ 
+ case "x$with_cuda_driver" in
+-  x | xno) ;;
++  x) ;;
++  xno)
++    CUDA_DRIVER_INCLUDE=no
++    CUDA_DRIVER_LIB=no
++    ;;
+   *) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
+      CUDA_DRIVER_LIB=$with_cuda_driver/lib
+      ;;
+@@ -15313,10 +15316,12 @@ fi
+ if test "x$with_cuda_driver_lib" != x; then
+   CUDA_DRIVER_LIB=$with_cuda_driver_lib
+ fi
+-if test "x$CUDA_DRIVER_INCLUDE" != x; then
++if test "x$CUDA_DRIVER_INCLUDE" != x \
++   && test "x$CUDA_DRIVER_INCLUDE" != xno; then
+   CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE
+ fi
+-if test "x$CUDA_DRIVER_LIB" != x; then
++if test "x$CUDA_DRIVER_LIB" != x \
++   && test "x$CUDA_DRIVER_LIB" != xno; then
+   CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB
+ fi
+ 
+@@ -15383,7 +15388,13 @@ PLUGIN_HSA_LIBS=
+ 
+ 
+ 
+-# Get offload targets and path to install tree of offloading compiler.
++# Parse '--enable-offload-targets', figure out the corresponding libgomp
++# plugins, and configure to find the corresponding offload compilers.
++# 'offload_plugins' and 'offload_targets' will be populated in the same order.
++offload_plugins= ++offload_targets= ++ ++ + offload_additional_options= + offload_additional_lib_paths= + +@@ -15392,25 +15403,27 @@ if test x"$enable_offload_targets" != x; + for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do + tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'` + tgt=`echo $tgt | sed 's/=.*//'` +- tgt_name= ++ tgt_plugin= + case $tgt in + *-intelmic-* | *-intelmicemul-*) +- tgt_name=intelmic ++ tgt_plugin=intelmic + ;; + nvptx*) +- tgt_name=nvptx ++ tgt_plugin=nvptx + PLUGIN_NVPTX=$tgt +- PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS +- PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS +- PLUGIN_NVPTX_LIBS='-lcuda' +- +- PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS +- CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" +- PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS +- LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" +- PLUGIN_NVPTX_save_LIBS=$LIBS +- LIBS="$PLUGIN_NVPTX_LIBS $LIBS" +- cat confdefs.h - <<_ACEOF >conftest.$ac_ext ++ if test "x$CUDA_DRIVER_LIB" != xno \ ++ && test "x$CUDA_DRIVER_LIB" != xno; then ++ PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS ++ PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS ++ PLUGIN_NVPTX_LIBS='-lcuda' ++ ++ PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS ++ CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" ++ PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS ++ LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" ++ PLUGIN_NVPTX_save_LIBS=$LIBS ++ LIBS="$PLUGIN_NVPTX_LIBS $LIBS" ++ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include "cuda.h" + int +@@ -15426,13 +15439,16 @@ if ac_fn_c_try_link "$LINENO"; then : + fi + rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +- CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS +- LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS +- LIBS=$PLUGIN_NVPTX_save_LIBS ++ CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS ++ LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS ++ LIBS=$PLUGIN_NVPTX_save_LIBS ++ fi + case $PLUGIN_NVPTX in + nvptx*) +- if test "x$CUDA_DRIVER_INCLUDE" = x \ +- && test "x$CUDA_DRIVER_LIB" = x; then ++ if (test "x$CUDA_DRIVER_INCLUDE" = x \ ++ || test "x$CUDA_DRIVER_INCLUDE" = xno) \ ++ && (test "x$CUDA_DRIVER_LIB" = x \ ++ || test "x$CUDA_DRIVER_LIB" = xno); then + PLUGIN_NVPTX=1 + PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda' + PLUGIN_NVPTX_LIBS='-ldl' +@@ -15452,7 +15468,7 @@ rm -f core conftest.err conftest.$ac_obj + PLUGIN_HSA=0 + ;; + *) +- tgt_name=hsa ++ tgt_plugin=hsa + PLUGIN_HSA=$tgt + PLUGIN_HSA_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS + PLUGIN_HSA_LDFLAGS="$HSA_RUNTIME_LDFLAGS" +@@ -15470,7 +15486,7 @@ rm -f core conftest.err conftest.$ac_obj + LDFLAGS=$PLUGIN_HSA_save_LDFLAGS + LIBS=$PLUGIN_HSA_save_LIBS + case $PLUGIN_HSA in +- hsa*) ++ hsa*) + HSA_PLUGIN=0 + as_fn_error "HSA run-time package required for HSA support" "$LINENO" 5 + ;; +@@ -15487,16 +15503,19 @@ rm -f core conftest.err conftest.$ac_obj + as_fn_error "unknown offload target specified" "$LINENO" 5 + ;; + esac +- if test x"$tgt_name" = x; then +- # Don't configure libgomp for this offloading target if we don't build +- # the corresponding plugin. ++ if test x"$tgt_plugin" = x; then ++ # Not configuring libgomp for this offload target if we're not building ++ # the corresponding offload plugin. 
+    continue
+-  elif test x"$offload_targets" = x; then
+-    offload_targets=$tgt_name
++  elif test x"$offload_plugins" = x; then
++    offload_plugins=$tgt_plugin
++    offload_targets=$tgt
+   else
+-    offload_targets=$offload_targets,$tgt_name
++    offload_plugins=$offload_plugins,$tgt_plugin
++    offload_targets=$offload_targets,$tgt
+   fi
+-  if test "$tgt_name" = hsa; then
++  # Configure additional search paths.
++  if test "$tgt_plugin" = hsa; then
+     # Offloading compilation is all handled by the target compiler.
+     :
+   elif test x"$tgt_dir" != x; then
+@@ -15510,7 +15529,7 @@ rm -f core conftest.err conftest.$ac_obj
+ fi
+ 
+ cat >>confdefs.h <<_ACEOF
+-#define OFFLOAD_TARGETS "$offload_targets"
++#define OFFLOAD_PLUGINS "$offload_plugins"
+ _ACEOF
+ 
+ if test $PLUGIN_NVPTX = 1; then
+@@ -15570,6 +15589,19 @@ _ACEOF
+ fi
+ done
+ 
++for ac_func in aligned_alloc posix_memalign memalign _aligned_malloc
++do :
++  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
++ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
++eval as_val=\$$as_ac_var
++   if test "x$as_val" = x""yes; then :
++  cat >>confdefs.h <<_ACEOF
++#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
++_ACEOF
++
++fi
++done
++
+ 
+ # Check for broken semaphore implementation on darwin.
+ # sem_init returns: sem_init error: Function not implemented.
+@@ -15784,6 +15816,72 @@ fi
+ 
+ fi
+ 
++# Check for uname.
++cat confdefs.h - <<_ACEOF >conftest.$ac_ext
++/* end confdefs.h.  */
++#include <string.h>
++ #include <stdlib.h>
++ #include <sys/utsname.h>
++int
++main ()
++{
++struct utsname buf;
++ volatile size_t len = 0;
++ if (!uname (buf))
++   len = strlen (buf.nodename);
++  ;
++  return 0;
++}
++_ACEOF
++if ac_fn_c_try_compile "$LINENO"; then :
++
++$as_echo "#define HAVE_UNAME 1" >>confdefs.h
++
++fi
++rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
++
++# Check for gethostname.
++cat confdefs.h - <<_ACEOF >conftest.$ac_ext
++/* end confdefs.h.  */
++#include <unistd.h>
++int
++main ()
++{
++
++ char buf[256];
++ if (gethostname (buf, sizeof (buf) - 1) == 0)
++   buf[255] = '\0';
++
++  ;
++  return 0;
++}
++_ACEOF
++if ac_fn_c_try_compile "$LINENO"; then :
++
++$as_echo "#define HAVE_GETHOSTNAME 1" >>confdefs.h
++
++fi
++rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
++
++# Check for getpid.
++cat confdefs.h - <<_ACEOF >conftest.$ac_ext
++/* end confdefs.h.  */
++#include <unistd.h>
++int
++main ()
++{
++int pid = getpid ();
++  ;
++  return 0;
++}
++_ACEOF
++if ac_fn_c_try_compile "$LINENO"; then :
++
++$as_echo "#define HAVE_GETPID 1" >>confdefs.h
++
++fi
++rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
++
+ # See if we support thread-local storage.
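The three compile checks just above only define HAVE_UNAME, HAVE_GETHOSTNAME and HAVE_GETPID; their consumer is the affinity-display support this patch adds (see the affinity-fmt.c addition in the Makefile.am hunk below). The following is a minimal sketch of how such feature macros are typically consumed, assuming only POSIX; gomp_sketch_node_name is an invented name for illustration, not an actual libgomp symbol.

  /* Hypothetical consumer of the HAVE_* macros probed above;
     len is assumed to be non-zero.  */
  #include <string.h>
  #ifdef HAVE_GETHOSTNAME
  # include <unistd.h>
  #endif
  #ifdef HAVE_UNAME
  # include <sys/utsname.h>
  #endif

  static void
  gomp_sketch_node_name (char *buf, size_t len)
  {
    buf[0] = '\0';
  #if defined(HAVE_GETHOSTNAME)
    /* gethostname need not NUL-terminate on truncation, so do it here.  */
    if (gethostname (buf, len - 1) == 0)
      buf[len - 1] = '\0';
  #elif defined(HAVE_UNAME)
    struct utsname u;
    if (uname (&u) == 0)
      {
        strncpy (buf, u.nodename, len - 1);
        buf[len - 1] = '\0';
      }
  #endif
  }

Preferring gethostname and falling back to uname's nodename is one plausible policy; with neither macro defined the buffer is simply left empty rather than failing.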
+ + +--- libgomp/Makefile.am.jj 2018-04-25 09:40:31.926655587 +0200 ++++ libgomp/Makefile.am 2019-05-07 19:59:03.683989317 +0200 +@@ -63,12 +63,13 @@ libgomp_la_SOURCES = alloc.c atomic.c ba + parallel.c sections.c single.c task.c team.c work.c lock.c mutex.c \ + proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \ + splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \ +- oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c ++ oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \ ++ affinity-fmt.c teams.c + + include $(top_srcdir)/plugin/Makefrag.am + + if USE_FORTRAN +-libgomp_la_SOURCES += openacc.f90 ++libgomp_la_SOURCES += openacc2.f90 + endif + + nodist_noinst_HEADERS = libgomp_f.h +@@ -87,8 +88,6 @@ omp_lib_kinds.mod: omp_lib.mod + : + openacc_kinds.mod: openacc.mod + : +-openacc.mod: openacc.lo +- : + %.mod: %.f90 + $(FC) $(FCFLAGS) -fsyntax-only $< + fortran.lo: libgomp_f.h +--- libgomp/oacc-mem.c.jj 2018-04-25 09:40:31.924655586 +0200 ++++ libgomp/oacc-mem.c 2019-05-07 18:46:36.530109672 +0200 +@@ -153,8 +153,9 @@ acc_free (void *d) + gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); + } + +-void +-acc_memcpy_to_device (void *d, void *h, size_t s) ++static void ++memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, ++ const char *libfnname) + { + /* No need to call lazy open here, as the device pointer must have + been obtained from a routine that did that. */ +@@ -164,31 +165,49 @@ acc_memcpy_to_device (void *d, void *h, + + if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) + { +- memmove (d, h, s); ++ if (from) ++ memmove (h, d, s); ++ else ++ memmove (d, h, s); + return; + } + +- if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s)) +- gomp_fatal ("error in %s", __FUNCTION__); ++ if (async > acc_async_sync) ++ thr->dev->openacc.async_set_async_func (async); ++ ++ bool ret = (from ++ ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s) ++ : thr->dev->host2dev_func (thr->dev->target_id, d, h, s)); ++ ++ if (async > acc_async_sync) ++ thr->dev->openacc.async_set_async_func (acc_async_sync); ++ ++ if (!ret) ++ gomp_fatal ("error in %s", libfnname); + } + + void +-acc_memcpy_from_device (void *h, void *d, size_t s) ++acc_memcpy_to_device (void *d, void *h, size_t s) + { +- /* No need to call lazy open here, as the device pointer must have +- been obtained from a routine that did that. */ +- struct goacc_thread *thr = goacc_thread (); ++ memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__); ++} + +- assert (thr && thr->dev); ++void ++acc_memcpy_to_device_async (void *d, void *h, size_t s, int async) ++{ ++ memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__); ++} + +- if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) +- { +- memmove (h, d, s); +- return; +- } ++void ++acc_memcpy_from_device (void *h, void *d, size_t s) ++{ ++ memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__); ++} + +- if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s)) +- gomp_fatal ("error in %s", __FUNCTION__); ++void ++acc_memcpy_from_device_async (void *h, void *d, size_t s, int async) ++{ ++ memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__); + } + + /* Return the device pointer that corresponds to host data H. 
Or NULL +@@ -347,6 +366,7 @@ acc_map_data (void *h, void *d, size_t s + + tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes, + &kinds, true, GOMP_MAP_VARS_OPENACC); ++ tgt->list[0].key->refcount = REFCOUNT_INFINITY; + } + + gomp_mutex_lock (&acc_dev->lock); +@@ -389,6 +409,9 @@ acc_unmap_data (void *h) + (void *) n->host_start, (int) host_size, (void *) h); + } + ++ /* Mark for removal. */ ++ n->refcount = 1; ++ + t = n->tgt; + + if (t->refcount == 2) +@@ -424,7 +447,7 @@ acc_unmap_data (void *h) + #define FLAG_COPY (1 << 2) + + static void * +-present_create_copy (unsigned f, void *h, size_t s) ++present_create_copy (unsigned f, void *h, size_t s, int async) + { + void *d; + splay_tree_key n; +@@ -460,6 +483,11 @@ present_create_copy (unsigned f, void *h + gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s); + } + ++ if (n->refcount != REFCOUNT_INFINITY) ++ { ++ n->refcount++; ++ n->dynamic_refcount++; ++ } + gomp_mutex_unlock (&acc_dev->lock); + } + else if (!(f & FLAG_CREATE)) +@@ -481,8 +509,16 @@ present_create_copy (unsigned f, void *h + + gomp_mutex_unlock (&acc_dev->lock); + ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (async); ++ + tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true, + GOMP_MAP_VARS_OPENACC); ++ /* Initialize dynamic refcount. */ ++ tgt->list[0].key->dynamic_refcount = 1; ++ ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (acc_async_sync); + + gomp_mutex_lock (&acc_dev->lock); + +@@ -499,53 +535,71 @@ present_create_copy (unsigned f, void *h + void * + acc_create (void *h, size_t s) + { +- return present_create_copy (FLAG_CREATE, h, s); ++ return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync); + } + +-void * +-acc_copyin (void *h, size_t s) ++void ++acc_create_async (void *h, size_t s, int async) + { +- return present_create_copy (FLAG_CREATE | FLAG_COPY, h, s); ++ present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async); + } + ++/* acc_present_or_create used to be what acc_create is now. */ ++/* acc_pcreate is acc_present_or_create by a different name. */ ++#ifdef HAVE_ATTRIBUTE_ALIAS ++strong_alias (acc_create, acc_present_or_create) ++strong_alias (acc_create, acc_pcreate) ++#else + void * + acc_present_or_create (void *h, size_t s) + { +- return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); ++ return acc_create (h, s); + } + +-/* acc_pcreate is acc_present_or_create by a different name. */ +-#ifdef HAVE_ATTRIBUTE_ALIAS +-strong_alias (acc_present_or_create, acc_pcreate) +-#else + void * + acc_pcreate (void *h, size_t s) + { +- return acc_present_or_create (h, s); ++ return acc_create (h, s); + } + #endif + + void * +-acc_present_or_copyin (void *h, size_t s) ++acc_copyin (void *h, size_t s) ++{ ++ return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, ++ acc_async_sync); ++} ++ ++void ++acc_copyin_async (void *h, size_t s, int async) + { +- return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); ++ present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async); + } + ++/* acc_present_or_copyin used to be what acc_copyin is now. */ + /* acc_pcopyin is acc_present_or_copyin by a different name. 
*/ + #ifdef HAVE_ATTRIBUTE_ALIAS +-strong_alias (acc_present_or_copyin, acc_pcopyin) ++strong_alias (acc_copyin, acc_present_or_copyin) ++strong_alias (acc_copyin, acc_pcopyin) + #else + void * ++acc_present_or_copyin (void *h, size_t s) ++{ ++ return acc_copyin (h, s); ++} ++ ++void * + acc_pcopyin (void *h, size_t s) + { +- return acc_present_or_copyin (h, s); ++ return acc_copyin (h, s); + } + #endif + +-#define FLAG_COPYOUT (1 << 0) ++#define FLAG_COPYOUT (1 << 0) ++#define FLAG_FINALIZE (1 << 1) + + static void +-delete_copyout (unsigned f, void *h, size_t s, const char *libfnname) ++delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname) + { + size_t host_size; + splay_tree_key n; +@@ -581,31 +635,111 @@ delete_copyout (unsigned f, void *h, siz + (void *) n->host_start, (int) host_size, (void *) h, (int) s); + } + +- gomp_mutex_unlock (&acc_dev->lock); ++ if (n->refcount == REFCOUNT_INFINITY) ++ { ++ n->refcount = 0; ++ n->dynamic_refcount = 0; ++ } ++ if (n->refcount < n->dynamic_refcount) ++ { ++ gomp_mutex_unlock (&acc_dev->lock); ++ gomp_fatal ("Dynamic reference counting assert fail\n"); ++ } + +- if (f & FLAG_COPYOUT) +- acc_dev->dev2host_func (acc_dev->target_id, h, d, s); ++ if (f & FLAG_FINALIZE) ++ { ++ n->refcount -= n->dynamic_refcount; ++ n->dynamic_refcount = 0; ++ } ++ else if (n->dynamic_refcount) ++ { ++ n->dynamic_refcount--; ++ n->refcount--; ++ } ++ ++ if (n->refcount == 0) ++ { ++ if (n->tgt->refcount == 2) ++ { ++ struct target_mem_desc *tp, *t; ++ for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL; ++ tp = t, t = t->prev) ++ if (n->tgt == t) ++ { ++ if (tp) ++ tp->prev = t->prev; ++ else ++ acc_dev->openacc.data_environ = t->prev; ++ break; ++ } ++ } ++ ++ if (f & FLAG_COPYOUT) ++ { ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (async); ++ acc_dev->dev2host_func (acc_dev->target_id, h, d, s); ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (acc_async_sync); ++ } + +- acc_unmap_data (h); ++ gomp_remove_var (acc_dev, n); ++ } + +- if (!acc_dev->free_func (acc_dev->target_id, d)) +- gomp_fatal ("error in freeing device memory in %s", libfnname); ++ gomp_mutex_unlock (&acc_dev->lock); + } + + void + acc_delete (void *h , size_t s) + { +- delete_copyout (0, h, s, __FUNCTION__); ++ delete_copyout (0, h, s, acc_async_sync, __FUNCTION__); ++} ++ ++void ++acc_delete_async (void *h , size_t s, int async) ++{ ++ delete_copyout (0, h, s, async, __FUNCTION__); ++} ++ ++void ++acc_delete_finalize (void *h , size_t s) ++{ ++ delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__); ++} ++ ++void ++acc_delete_finalize_async (void *h , size_t s, int async) ++{ ++ delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__); + } + + void + acc_copyout (void *h, size_t s) + { +- delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__); ++ delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__); ++} ++ ++void ++acc_copyout_async (void *h, size_t s, int async) ++{ ++ delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__); ++} ++ ++void ++acc_copyout_finalize (void *h, size_t s) ++{ ++ delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, acc_async_sync, ++ __FUNCTION__); ++} ++ ++void ++acc_copyout_finalize_async (void *h, size_t s, int async) ++{ ++ delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__); + } + + static void +-update_dev_host (int is_dev, void *h, size_t s) ++update_dev_host (int is_dev, void *h, size_t s, int async) + { + splay_tree_key n; + void *d; 
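The present_create_copy and delete_copyout changes above split each mapping's reference count into a structured part and a dynamic part: repeated acc_copyin/acc_create calls on an already-present host range now nest, and the data is copied back and unmapped only once the dynamic count drains, or at once with the _finalize variants. A minimal usage sketch against the public OpenACC API, assuming a discrete (non-shared-memory) device:

  /* Sketch of the nesting semantics implemented above.  */
  #include <openacc.h>
  #include <stdlib.h>

  int
  main (void)
  {
    size_t bytes = 1024 * sizeof (float);
    float *a = malloc (bytes);

    acc_copyin (a, bytes);   /* maps a and copies in; dynamic refcount 1 */
    acc_copyin (a, bytes);   /* already present; refcount becomes 2 */

    acc_delete (a, bytes);   /* refcount back to 1; still mapped */
    acc_copyout (a, bytes);  /* refcount 0: copy back to host, unmap */

    /* acc_delete_finalize (a, bytes) would instead drop the whole
       dynamic count in one call, without any copy back.  */
    free (a);
    return 0;
  }

The async variants added above follow the same counting; only the final device-to-host transfer is queued on the requested async queue instead of completing synchronously.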
+@@ -631,24 +765,42 @@ update_dev_host (int is_dev, void *h, si + d = (void *) (n->tgt->tgt_start + n->tgt_offset + + (uintptr_t) h - n->host_start); + ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (async); ++ + if (is_dev) + acc_dev->host2dev_func (acc_dev->target_id, d, h, s); + else + acc_dev->dev2host_func (acc_dev->target_id, h, d, s); + ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (acc_async_sync); ++ + gomp_mutex_unlock (&acc_dev->lock); + } + + void + acc_update_device (void *h, size_t s) + { +- update_dev_host (1, h, s); ++ update_dev_host (1, h, s, acc_async_sync); ++} ++ ++void ++acc_update_device_async (void *h, size_t s, int async) ++{ ++ update_dev_host (1, h, s, async); + } + + void + acc_update_self (void *h, size_t s) + { +- update_dev_host (0, h, s); ++ update_dev_host (0, h, s, acc_async_sync); ++} ++ ++void ++acc_update_self_async (void *h, size_t s, int async) ++{ ++ update_dev_host (0, h, s, async); + } + + void +@@ -659,11 +811,37 @@ gomp_acc_insert_pointer (size_t mapnum, + struct goacc_thread *thr = goacc_thread (); + struct gomp_device_descr *acc_dev = thr->dev; + ++ if (acc_is_present (*hostaddrs, *sizes)) ++ { ++ splay_tree_key n; ++ gomp_mutex_lock (&acc_dev->lock); ++ n = lookup_host (acc_dev, *hostaddrs, *sizes); ++ gomp_mutex_unlock (&acc_dev->lock); ++ ++ tgt = n->tgt; ++ for (size_t i = 0; i < tgt->list_count; i++) ++ if (tgt->list[i].key == n) ++ { ++ for (size_t j = 0; j < mapnum; j++) ++ if (i + j < tgt->list_count && tgt->list[i + j].key) ++ { ++ tgt->list[i + j].key->refcount++; ++ tgt->list[i + j].key->dynamic_refcount++; ++ } ++ return; ++ } ++ /* Should not reach here. */ ++ gomp_fatal ("Dynamic refcount incrementing failed for pointer/pset"); ++ } ++ + gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__); + tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, + NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC); + gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__); + ++ /* Initialize dynamic refcount. */ ++ tgt->list[0].key->dynamic_refcount = 1; ++ + gomp_mutex_lock (&acc_dev->lock); + tgt->prev = acc_dev->openacc.data_environ; + acc_dev->openacc.data_environ = tgt; +@@ -671,7 +849,8 @@ gomp_acc_insert_pointer (size_t mapnum, + } + + void +-gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum) ++gomp_acc_remove_pointer (void *h, size_t s, bool force_copyfrom, int async, ++ int finalize, int mapnum) + { + struct goacc_thread *thr = goacc_thread (); + struct gomp_device_descr *acc_dev = thr->dev; +@@ -679,6 +858,9 @@ gomp_acc_remove_pointer (void *h, bool f + struct target_mem_desc *t; + int minrefs = (mapnum == 1) ? 2 : 3; + ++ if (!acc_is_present (h, s)) ++ return; ++ + gomp_mutex_lock (&acc_dev->lock); + + n = lookup_host (acc_dev, h, 1); +@@ -693,40 +875,65 @@ gomp_acc_remove_pointer (void *h, bool f + + t = n->tgt; + +- struct target_mem_desc *tp; ++ if (n->refcount < n->dynamic_refcount) ++ { ++ gomp_mutex_unlock (&acc_dev->lock); ++ gomp_fatal ("Dynamic reference counting assert fail\n"); ++ } + +- if (t->refcount == minrefs) ++ if (finalize) + { +- /* This is the last reference, so pull the descriptor off the +- chain. This avoids gomp_unmap_vars via gomp_unmap_tgt from +- freeing the device memory. 
*/ +- t->tgt_end = 0; +- t->to_free = 0; ++ n->refcount -= n->dynamic_refcount; ++ n->dynamic_refcount = 0; ++ } ++ else if (n->dynamic_refcount) ++ { ++ n->dynamic_refcount--; ++ n->refcount--; ++ } + +- for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL; +- tp = t, t = t->prev) ++ gomp_mutex_unlock (&acc_dev->lock); ++ ++ if (n->refcount == 0) ++ { ++ if (t->refcount == minrefs) + { +- if (n->tgt == t) ++ /* This is the last reference, so pull the descriptor off the ++ chain. This prevents gomp_unmap_vars via gomp_unmap_tgt from ++ freeing the device memory. */ ++ struct target_mem_desc *tp; ++ for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL; ++ tp = t, t = t->prev) + { +- if (tp) +- tp->prev = t->prev; +- else +- acc_dev->openacc.data_environ = t->prev; +- break; ++ if (n->tgt == t) ++ { ++ if (tp) ++ tp->prev = t->prev; ++ else ++ acc_dev->openacc.data_environ = t->prev; ++ break; ++ } + } + } +- } + +- if (force_copyfrom) +- t->list[0].copy_from = 1; ++ /* Set refcount to 1 to allow gomp_unmap_vars to unmap it. */ ++ n->refcount = 1; ++ t->refcount = minrefs; ++ for (size_t i = 0; i < t->list_count; i++) ++ if (t->list[i].key == n) ++ { ++ t->list[i].copy_from = force_copyfrom ? 1 : 0; ++ break; ++ } + +- gomp_mutex_unlock (&acc_dev->lock); ++ /* If running synchronously, unmap immediately. */ ++ if (async < acc_async_noval) ++ gomp_unmap_vars (t, true); ++ else ++ t->device_descr->openacc.register_async_cleanup_func (t, async); ++ } + +- /* If running synchronously, unmap immediately. */ +- if (async < acc_async_noval) +- gomp_unmap_vars (t, true); +- else +- t->device_descr->openacc.register_async_cleanup_func (t, async); ++ gomp_mutex_unlock (&acc_dev->lock); + + gomp_debug (0, " %s: mappings restored\n", __FUNCTION__); + } +--- libgomp/env.c.jj 2018-04-25 09:40:31.924655586 +0200 ++++ libgomp/env.c 2019-05-07 18:46:36.482110438 +0200 +@@ -88,8 +88,12 @@ void **gomp_places_list; + unsigned long gomp_places_list_len; + int gomp_debug_var; + unsigned int gomp_num_teams_var; ++bool gomp_display_affinity_var; ++char *gomp_affinity_format_var = "level %L thread %i affinity %A"; ++size_t gomp_affinity_format_len; + char *goacc_device_type; + int goacc_device_num; ++int goacc_default_dims[GOMP_DIM_MAX]; + + #ifndef LIBGOMP_OFFLOADED_ONLY + +@@ -100,6 +104,7 @@ parse_schedule (void) + { + char *env, *end; + unsigned long value; ++ int monotonic = 0; + + env = getenv ("OMP_SCHEDULE"); + if (env == NULL) +@@ -107,6 +112,26 @@ parse_schedule (void) + + while (isspace ((unsigned char) *env)) + ++env; ++ if (strncasecmp (env, "monotonic", 9) == 0) ++ { ++ monotonic = 1; ++ env += 9; ++ } ++ else if (strncasecmp (env, "nonmonotonic", 12) == 0) ++ { ++ monotonic = -1; ++ env += 12; ++ } ++ if (monotonic) ++ { ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env != ':') ++ goto unknown; ++ ++env; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ } + if (strncasecmp (env, "static", 6) == 0) + { + gomp_global_icv.run_sched_var = GFS_STATIC; +@@ -130,12 +155,16 @@ parse_schedule (void) + else + goto unknown; + ++ if (monotonic == 1 ++ || (monotonic == 0 && gomp_global_icv.run_sched_var == GFS_STATIC)) ++ gomp_global_icv.run_sched_var |= GFS_MONOTONIC; ++ + while (isspace ((unsigned char) *env)) + ++env; + if (*env == '\0') + { + gomp_global_icv.run_sched_chunk_size +- = gomp_global_icv.run_sched_var != GFS_STATIC; ++ = (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) != GFS_STATIC; + return; + } + if (*env++ != ',') +@@ -158,7 +187,8 @@ parse_schedule (void) + 
if ((int)value != value) + goto invalid; + +- if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC) ++ if (value == 0 ++ && (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) != GFS_STATIC) + value = 1; + gomp_global_icv.run_sched_chunk_size = value; + return; +@@ -1066,6 +1096,36 @@ parse_acc_device_type (void) + } + + static void ++parse_gomp_openacc_dim (void) ++{ ++ /* The syntax is the same as for the -fopenacc-dim compilation option. */ ++ const char *var_name = "GOMP_OPENACC_DIM"; ++ const char *env_var = getenv (var_name); ++ if (!env_var) ++ return; ++ ++ const char *pos = env_var; ++ int i; ++ for (i = 0; *pos && i != GOMP_DIM_MAX; i++) ++ { ++ if (i && *pos++ != ':') ++ break; ++ ++ if (*pos == ':') ++ continue; ++ ++ const char *eptr; ++ errno = 0; ++ long val = strtol (pos, (char **)&eptr, 10); ++ if (errno || val < 0 || (unsigned)val != val) ++ break; ++ ++ goacc_default_dims[i] = (int)val; ++ pos = eptr; ++ } ++} ++ ++static void + handle_omp_display_env (unsigned long stacksize, int wait_policy) + { + const char *env; +@@ -1119,19 +1179,34 @@ handle_omp_display_env (unsigned long st + fputs ("'\n", stderr); + + fprintf (stderr, " OMP_SCHEDULE = '"); +- switch (gomp_global_icv.run_sched_var) ++ if ((gomp_global_icv.run_sched_var & GFS_MONOTONIC)) ++ { ++ if (gomp_global_icv.run_sched_var != (GFS_MONOTONIC | GFS_STATIC)) ++ fputs ("MONOTONIC:", stderr); ++ } ++ else if (gomp_global_icv.run_sched_var == GFS_STATIC) ++ fputs ("NONMONOTONIC:", stderr); ++ switch (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) + { + case GFS_RUNTIME: + fputs ("RUNTIME", stderr); ++ if (gomp_global_icv.run_sched_chunk_size != 1) ++ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); + break; + case GFS_STATIC: + fputs ("STATIC", stderr); ++ if (gomp_global_icv.run_sched_chunk_size != 0) ++ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); + break; + case GFS_DYNAMIC: + fputs ("DYNAMIC", stderr); ++ if (gomp_global_icv.run_sched_chunk_size != 1) ++ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); + break; + case GFS_GUIDED: + fputs ("GUIDED", stderr); ++ if (gomp_global_icv.run_sched_chunk_size != 1) ++ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); + break; + case GFS_AUTO: + fputs ("AUTO", stderr); +@@ -1197,6 +1272,10 @@ handle_omp_display_env (unsigned long st + gomp_global_icv.default_device_var); + fprintf (stderr, " OMP_MAX_TASK_PRIORITY = '%d'\n", + gomp_max_task_priority_var); ++ fprintf (stderr, " OMP_DISPLAY_AFFINITY = '%s'\n", ++ gomp_display_affinity_var ? 
"TRUE" : "FALSE"); ++ fprintf (stderr, " OMP_AFFINITY_FORMAT = '%s'\n", ++ gomp_affinity_format_var); + + if (verbose) + { +@@ -1228,6 +1307,7 @@ initialize_env (void) + parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var); + parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var); + parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var); ++ parse_boolean ("OMP_DISPLAY_AFFINITY", &gomp_display_affinity_var); + parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true); + parse_int ("OMP_MAX_TASK_PRIORITY", &gomp_max_task_priority_var, true); + parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var, +@@ -1277,6 +1357,13 @@ initialize_env (void) + } + if (gomp_global_icv.bind_var != omp_proc_bind_false) + gomp_init_affinity (); ++ ++ { ++ const char *env = getenv ("OMP_AFFINITY_FORMAT"); ++ if (env != NULL) ++ gomp_set_affinity_format (env, strlen (env)); ++ } ++ + wait_policy = parse_wait_policy (); + if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var)) + { +@@ -1302,7 +1389,6 @@ initialize_env (void) + + /* Not strictly environment related, but ordering constructors is tricky. */ + pthread_attr_init (&gomp_thread_attr); +- pthread_attr_setdetachstate (&gomp_thread_attr, PTHREAD_CREATE_DETACHED); + + if (parse_stacksize ("OMP_STACKSIZE", &stacksize) + || parse_stacksize ("GOMP_STACKSIZE", &stacksize) +@@ -1336,6 +1422,7 @@ initialize_env (void) + goacc_device_num = 0; + + parse_acc_device_type (); ++ parse_gomp_openacc_dim (); + + goacc_runtime_initialize (); + } +--- libgomp/fortran.c.jj 2018-04-25 09:40:31.913655581 +0200 ++++ libgomp/fortran.c 2019-05-07 18:46:36.491110295 +0200 +@@ -28,6 +28,8 @@ + #include "libgomp.h" + #include "libgomp_f.h" + #include ++#include ++#include + #include + + #ifdef HAVE_ATTRIBUTE_ALIAS +@@ -82,6 +84,8 @@ ialias_redirect (omp_get_team_num) + ialias_redirect (omp_is_initial_device) + ialias_redirect (omp_get_initial_device) + ialias_redirect (omp_get_max_task_priority) ++ialias_redirect (omp_pause_resource) ++ialias_redirect (omp_pause_resource_all) + #endif + + #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING +@@ -368,7 +372,9 @@ omp_get_schedule_ (int32_t *kind, int32_ + omp_sched_t k; + int cs; + omp_get_schedule (&k, &cs); +- *kind = k; ++ /* For now mask off GFS_MONOTONIC, because OpenMP 4.5 code will not ++ expect to see it. */ ++ *kind = k & ~GFS_MONOTONIC; + *chunk_size = cs; + } + +@@ -378,7 +384,8 @@ omp_get_schedule_8_ (int32_t *kind, int6 + omp_sched_t k; + int cs; + omp_get_schedule (&k, &cs); +- *kind = k; ++ /* See above. */ ++ *kind = k & ~GFS_MONOTONIC; + *chunk_size = cs; + } + +@@ -576,3 +583,96 @@ omp_get_max_task_priority_ (void) + { + return omp_get_max_task_priority (); + } ++ ++void ++omp_set_affinity_format_ (const char *format, size_t format_len) ++{ ++ gomp_set_affinity_format (format, format_len); ++} ++ ++int32_t ++omp_get_affinity_format_ (char *buffer, size_t buffer_len) ++{ ++ size_t len = strlen (gomp_affinity_format_var); ++ if (buffer_len) ++ { ++ if (len < buffer_len) ++ { ++ memcpy (buffer, gomp_affinity_format_var, len); ++ memset (buffer + len, ' ', buffer_len - len); ++ } ++ else ++ memcpy (buffer, gomp_affinity_format_var, buffer_len); ++ } ++ return len; ++} ++ ++void ++omp_display_affinity_ (const char *format, size_t format_len) ++{ ++ char *fmt = NULL, fmt_buf[256]; ++ char buf[512]; ++ if (format_len) ++ { ++ fmt = format_len < 256 ? 
fmt_buf : gomp_malloc (format_len + 1); ++ memcpy (fmt, format, format_len); ++ fmt[format_len] = '\0'; ++ } ++ struct gomp_thread *thr = gomp_thread (); ++ size_t ret ++ = gomp_display_affinity (buf, sizeof buf, ++ format_len ? fmt : gomp_affinity_format_var, ++ gomp_thread_self (), &thr->ts, thr->place); ++ if (ret < sizeof buf) ++ { ++ buf[ret] = '\n'; ++ gomp_print_string (buf, ret + 1); ++ } ++ else ++ { ++ char *b = gomp_malloc (ret + 1); ++ gomp_display_affinity (buf, sizeof buf, ++ format_len ? fmt : gomp_affinity_format_var, ++ gomp_thread_self (), &thr->ts, thr->place); ++ b[ret] = '\n'; ++ gomp_print_string (b, ret + 1); ++ free (b); ++ } ++ if (fmt && fmt != fmt_buf) ++ free (fmt); ++} ++ ++int32_t ++omp_capture_affinity_ (char *buffer, const char *format, ++ size_t buffer_len, size_t format_len) ++{ ++ char *fmt = NULL, fmt_buf[256]; ++ if (format_len) ++ { ++ fmt = format_len < 256 ? fmt_buf : gomp_malloc (format_len + 1); ++ memcpy (fmt, format, format_len); ++ fmt[format_len] = '\0'; ++ } ++ struct gomp_thread *thr = gomp_thread (); ++ size_t ret ++ = gomp_display_affinity (buffer, buffer_len, ++ format_len ? fmt : gomp_affinity_format_var, ++ gomp_thread_self (), &thr->ts, thr->place); ++ if (fmt && fmt != fmt_buf) ++ free (fmt); ++ if (ret < buffer_len) ++ memset (buffer + ret, ' ', buffer_len - ret); ++ return ret; ++} ++ ++int32_t ++omp_pause_resource_ (const int32_t *kind, const int32_t *device_num) ++{ ++ return omp_pause_resource (*kind, *device_num); ++} ++ ++int32_t ++omp_pause_resource_all_ (const int32_t *kind) ++{ ++ return omp_pause_resource_all (*kind); ++} +--- libgomp/configure.tgt.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/configure.tgt 2019-05-07 18:46:36.479110486 +0200 +@@ -18,7 +18,7 @@ if test $gcc_cv_have_tls = yes ; then + ;; + + *-*-linux* | *-*-gnu*) +- XCFLAGS="${XCFLAGS} -ftls-model=initial-exec" ++ XCFLAGS="${XCFLAGS} -ftls-model=initial-exec -DUSING_INITIAL_EXEC_TLS" + ;; + + *-*-rtems*) +--- libgomp/icv-device.c.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/icv-device.c 2019-05-07 18:46:36.513109943 +0200 +@@ -49,20 +49,6 @@ omp_get_num_devices (void) + } + + int +-omp_get_num_teams (void) +-{ +- /* Hardcoded to 1 on host, MIC, HSAIL? Maybe variable on PTX. */ +- return 1; +-} +- +-int +-omp_get_team_num (void) +-{ +- /* Hardcoded to 0 on host, MIC, HSAIL? Maybe variable on PTX. */ +- return 0; +-} +- +-int + omp_is_initial_device (void) + { + /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX. */ +@@ -72,6 +58,4 @@ omp_is_initial_device (void) + ialias (omp_set_default_device) + ialias (omp_get_default_device) + ialias (omp_get_num_devices) +-ialias (omp_get_num_teams) +-ialias (omp_get_team_num) + ialias (omp_is_initial_device) +--- libgomp/Makefile.in.jj 2018-04-25 09:40:31.320655306 +0200 ++++ libgomp/Makefile.in 2019-05-07 20:00:01.082077522 +0200 +@@ -90,7 +90,7 @@ DIST_COMMON = $(top_srcdir)/plugin/Makef + $(srcdir)/libgomp.spec.in $(srcdir)/../depcomp + @PLUGIN_NVPTX_TRUE@am__append_1 = libgomp-plugin-nvptx.la + @PLUGIN_HSA_TRUE@am__append_2 = libgomp-plugin-hsa.la +-@USE_FORTRAN_TRUE@am__append_3 = openacc.f90 ++@USE_FORTRAN_TRUE@am__append_3 = openacc2.f90 + subdir = . 
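
The fortran.c wrappers above are thin shims over the new OpenMP 5.0 entry
points (omp_set_affinity_format, omp_capture_affinity, omp_display_affinity,
omp_pause_resource) implemented in affinity-fmt.c further down.  A minimal C
usage sketch of that API; the format string and thread count are illustrative,
not taken from the patch:

#include <omp.h>
#include <stdio.h>

int
main (void)
{
  /* Equivalent to setting OMP_AFFINITY_FORMAT in the environment.
     '%0.4n' zero-fills the thread number to four digits; '%.8H' would
     right-justify the host name in eight columns, '%8H' left-justify it.  */
  omp_set_affinity_format ("thread %0.4n of %N on host %H");

#pragma omp parallel num_threads (2)
  {
    char buf[128];
    /* Like snprintf, the return value is the untruncated length.  */
    size_t len = omp_capture_affinity (buf, sizeof buf, NULL);
    if (len < sizeof buf)
      printf ("captured: %s\n", buf);

    /* A NULL (or empty) format means: use the affinity-format ICV.  */
    omp_display_affinity (NULL);
  }
  return 0;
}
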
+ ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 + am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \ +@@ -172,7 +172,7 @@ libgomp_plugin_nvptx_la_LINK = $(LIBTOOL + @PLUGIN_NVPTX_TRUE@am_libgomp_plugin_nvptx_la_rpath = -rpath \ + @PLUGIN_NVPTX_TRUE@ $(toolexeclibdir) + libgomp_la_LIBADD = +-@USE_FORTRAN_TRUE@am__objects_1 = openacc.lo ++@USE_FORTRAN_TRUE@am__objects_1 = openacc2.lo + am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \ + env.lo error.lo icv.lo icv-device.lo iter.lo iter_ull.lo \ + loop.lo loop_ull.lo ordered.lo parallel.lo sections.lo \ +@@ -180,7 +180,8 @@ am_libgomp_la_OBJECTS = alloc.lo atomic. + sem.lo bar.lo ptrlock.lo time.lo fortran.lo affinity.lo \ + target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \ + oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \ +- oacc-plugin.lo oacc-cuda.lo priority_queue.lo $(am__objects_1) ++ oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \ ++ teams.lo $(am__objects_1) + libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) + DEFAULT_INCLUDES = -I.@am__isrc@ + depcomp = $(SHELL) $(top_srcdir)/../depcomp +@@ -380,6 +381,7 @@ mkdir_p = @mkdir_p@ + multi_basedir = @multi_basedir@ + offload_additional_lib_paths = @offload_additional_lib_paths@ + offload_additional_options = @offload_additional_options@ ++offload_plugins = @offload_plugins@ + offload_targets = @offload_targets@ + oldincludedir = @oldincludedir@ + pdfdir = @pdfdir@ +@@ -436,7 +438,7 @@ libgomp_la_SOURCES = alloc.c atomic.c ba + affinity.c target.c splay-tree.c libgomp-plugin.c \ + oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \ + oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \ +- $(am__append_3) ++ affinity-fmt.c teams.c $(am__append_3) + + # Nvidia PTX OpenACC plugin. + @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION) +@@ -599,6 +601,7 @@ mostlyclean-compile: + distclean-compile: + -rm -f *.tab.c + ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity-fmt.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alloc.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic.Plo@am__quote@ +@@ -638,6 +641,7 @@ distclean-compile: + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/teams.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/time.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/work.Plo@am__quote@ + +@@ -1292,8 +1296,6 @@ omp_lib_kinds.mod: omp_lib.mod + : + openacc_kinds.mod: openacc.mod + : +-openacc.mod: openacc.lo +- : + %.mod: %.f90 + $(FC) $(FCFLAGS) -fsyntax-only $< + fortran.lo: libgomp_f.h +--- libgomp/plugin/cuda/cuda.h.jj 2018-04-25 09:40:31.914655581 +0200 ++++ libgomp/plugin/cuda/cuda.h 2019-05-07 18:46:36.533109624 +0200 +@@ -44,6 +44,7 @@ typedef void *CUevent; + typedef void *CUfunction; + typedef void *CUlinkState; + typedef void *CUmodule; ++typedef size_t (*CUoccupancyB2DSize)(int); + typedef void *CUstream; + + typedef enum { +@@ -88,6 +89,7 @@ typedef enum { + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4, + CU_JIT_ERROR_LOG_BUFFER = 5, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6, ++ CU_JIT_OPTIMIZATION_LEVEL = 7, + CU_JIT_LOG_VERBOSE = 12 + } CUjit_option; + +@@ -169,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr + 
CUresult cuModuleLoad (CUmodule *, const char *); + CUresult cuModuleLoadData (CUmodule *, const void *); + CUresult cuModuleUnload (CUmodule); ++CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, ++ CUoccupancyB2DSize, size_t, int); + CUresult cuStreamCreate (CUstream *, unsigned); + #define cuStreamDestroy cuStreamDestroy_v2 + CUresult cuStreamDestroy (CUstream); +--- libgomp/plugin/cuda-lib.def.jj 2019-05-07 18:46:36.533109624 +0200 ++++ libgomp/plugin/cuda-lib.def 2019-05-07 18:46:36.533109624 +0200 +@@ -0,0 +1,49 @@ ++CUDA_ONE_CALL (cuCtxCreate) ++CUDA_ONE_CALL (cuCtxDestroy) ++CUDA_ONE_CALL (cuCtxGetCurrent) ++CUDA_ONE_CALL (cuCtxGetDevice) ++CUDA_ONE_CALL (cuCtxPopCurrent) ++CUDA_ONE_CALL (cuCtxPushCurrent) ++CUDA_ONE_CALL (cuCtxSynchronize) ++CUDA_ONE_CALL (cuDeviceGet) ++CUDA_ONE_CALL (cuDeviceGetAttribute) ++CUDA_ONE_CALL (cuDeviceGetCount) ++CUDA_ONE_CALL (cuEventCreate) ++CUDA_ONE_CALL (cuEventDestroy) ++CUDA_ONE_CALL (cuEventElapsedTime) ++CUDA_ONE_CALL (cuEventQuery) ++CUDA_ONE_CALL (cuEventRecord) ++CUDA_ONE_CALL (cuEventSynchronize) ++CUDA_ONE_CALL (cuFuncGetAttribute) ++CUDA_ONE_CALL_MAYBE_NULL (cuGetErrorString) ++CUDA_ONE_CALL (cuInit) ++CUDA_ONE_CALL (cuLaunchKernel) ++CUDA_ONE_CALL (cuLinkAddData) ++CUDA_ONE_CALL_MAYBE_NULL (cuLinkAddData_v2) ++CUDA_ONE_CALL (cuLinkComplete) ++CUDA_ONE_CALL (cuLinkCreate) ++CUDA_ONE_CALL_MAYBE_NULL (cuLinkCreate_v2) ++CUDA_ONE_CALL (cuLinkDestroy) ++CUDA_ONE_CALL (cuMemAlloc) ++CUDA_ONE_CALL (cuMemAllocHost) ++CUDA_ONE_CALL (cuMemcpy) ++CUDA_ONE_CALL (cuMemcpyDtoDAsync) ++CUDA_ONE_CALL (cuMemcpyDtoH) ++CUDA_ONE_CALL (cuMemcpyDtoHAsync) ++CUDA_ONE_CALL (cuMemcpyHtoD) ++CUDA_ONE_CALL (cuMemcpyHtoDAsync) ++CUDA_ONE_CALL (cuMemFree) ++CUDA_ONE_CALL (cuMemFreeHost) ++CUDA_ONE_CALL (cuMemGetAddressRange) ++CUDA_ONE_CALL (cuMemHostGetDevicePointer) ++CUDA_ONE_CALL (cuModuleGetFunction) ++CUDA_ONE_CALL (cuModuleGetGlobal) ++CUDA_ONE_CALL (cuModuleLoad) ++CUDA_ONE_CALL (cuModuleLoadData) ++CUDA_ONE_CALL (cuModuleUnload) ++CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) ++CUDA_ONE_CALL (cuStreamCreate) ++CUDA_ONE_CALL (cuStreamDestroy) ++CUDA_ONE_CALL (cuStreamQuery) ++CUDA_ONE_CALL (cuStreamSynchronize) ++CUDA_ONE_CALL (cuStreamWaitEvent) +--- libgomp/plugin/plugin-nvptx.c.jj 2018-04-25 09:40:31.915655582 +0200 ++++ libgomp/plugin/plugin-nvptx.c 2019-05-07 18:46:36.535109592 +0200 +@@ -31,6 +31,7 @@ + is not clear as to what that state might be. Or how one might + propagate it from one thread to another. 
*/ + ++#define _GNU_SOURCE + #include "openacc.h" + #include "config.h" + #include "libgomp-plugin.h" +@@ -48,60 +49,41 @@ + #include + #include + ++#if CUDA_VERSION < 6000 ++extern CUresult cuGetErrorString (CUresult, const char **); ++#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82 ++#endif ++ ++#if CUDA_VERSION >= 6050 ++#undef cuLinkCreate ++#undef cuLinkAddData ++CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t, ++ const char *, unsigned, CUjit_option *, void **); ++CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *); ++#else ++typedef size_t (*CUoccupancyB2DSize)(int); ++CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t, ++ const char *, unsigned, CUjit_option *, void **); ++CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *); ++CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, ++ CUoccupancyB2DSize, size_t, int); ++#endif ++ ++#define DO_PRAGMA(x) _Pragma (#x) ++ + #if PLUGIN_NVPTX_DYNAMIC + # include + +-# define CUDA_CALLS \ +-CUDA_ONE_CALL (cuCtxCreate) \ +-CUDA_ONE_CALL (cuCtxDestroy) \ +-CUDA_ONE_CALL (cuCtxGetCurrent) \ +-CUDA_ONE_CALL (cuCtxGetDevice) \ +-CUDA_ONE_CALL (cuCtxPopCurrent) \ +-CUDA_ONE_CALL (cuCtxPushCurrent) \ +-CUDA_ONE_CALL (cuCtxSynchronize) \ +-CUDA_ONE_CALL (cuDeviceGet) \ +-CUDA_ONE_CALL (cuDeviceGetAttribute) \ +-CUDA_ONE_CALL (cuDeviceGetCount) \ +-CUDA_ONE_CALL (cuEventCreate) \ +-CUDA_ONE_CALL (cuEventDestroy) \ +-CUDA_ONE_CALL (cuEventElapsedTime) \ +-CUDA_ONE_CALL (cuEventQuery) \ +-CUDA_ONE_CALL (cuEventRecord) \ +-CUDA_ONE_CALL (cuEventSynchronize) \ +-CUDA_ONE_CALL (cuFuncGetAttribute) \ +-CUDA_ONE_CALL (cuGetErrorString) \ +-CUDA_ONE_CALL (cuInit) \ +-CUDA_ONE_CALL (cuLaunchKernel) \ +-CUDA_ONE_CALL (cuLinkAddData) \ +-CUDA_ONE_CALL (cuLinkComplete) \ +-CUDA_ONE_CALL (cuLinkCreate) \ +-CUDA_ONE_CALL (cuLinkDestroy) \ +-CUDA_ONE_CALL (cuMemAlloc) \ +-CUDA_ONE_CALL (cuMemAllocHost) \ +-CUDA_ONE_CALL (cuMemcpy) \ +-CUDA_ONE_CALL (cuMemcpyDtoDAsync) \ +-CUDA_ONE_CALL (cuMemcpyDtoH) \ +-CUDA_ONE_CALL (cuMemcpyDtoHAsync) \ +-CUDA_ONE_CALL (cuMemcpyHtoD) \ +-CUDA_ONE_CALL (cuMemcpyHtoDAsync) \ +-CUDA_ONE_CALL (cuMemFree) \ +-CUDA_ONE_CALL (cuMemFreeHost) \ +-CUDA_ONE_CALL (cuMemGetAddressRange) \ +-CUDA_ONE_CALL (cuMemHostGetDevicePointer)\ +-CUDA_ONE_CALL (cuModuleGetFunction) \ +-CUDA_ONE_CALL (cuModuleGetGlobal) \ +-CUDA_ONE_CALL (cuModuleLoad) \ +-CUDA_ONE_CALL (cuModuleLoadData) \ +-CUDA_ONE_CALL (cuModuleUnload) \ +-CUDA_ONE_CALL (cuStreamCreate) \ +-CUDA_ONE_CALL (cuStreamDestroy) \ +-CUDA_ONE_CALL (cuStreamQuery) \ +-CUDA_ONE_CALL (cuStreamSynchronize) \ +-CUDA_ONE_CALL (cuStreamWaitEvent) +-# define CUDA_ONE_CALL(call) \ +- __typeof (call) *call; + struct cuda_lib_s { +- CUDA_CALLS ++ ++# define CUDA_ONE_CALL(call) \ ++ __typeof (call) *call; ++# define CUDA_ONE_CALL_MAYBE_NULL(call) \ ++ CUDA_ONE_CALL (call) ++#include "cuda-lib.def" ++# undef CUDA_ONE_CALL ++# undef CUDA_ONE_CALL_MAYBE_NULL ++ + } cuda_lib; + + /* -1 if init_cuda_lib has not been called yet, false +@@ -120,24 +102,41 @@ init_cuda_lib (void) + cuda_lib_inited = false; + if (h == NULL) + return false; +-# undef CUDA_ONE_CALL +-# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call) +-# define CUDA_ONE_CALL_1(call) \ ++ ++# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false) ++# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true) ++# define CUDA_ONE_CALL_1(call, allow_null) \ + cuda_lib.call = dlsym (h, #call); \ +- if (cuda_lib.call == NULL) \ 
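
The dlopen path is rebuilt here around cuda-lib.def as an X-macro list: the
same file is included several times with different definitions of
CUDA_ONE_CALL and CUDA_ONE_CALL_MAYBE_NULL, expanding once into the cuda_lib
struct of function pointers, once into the dlsym resolution loop, and, in the
non-dlopen build, into weak declarations for the calls that may be absent.
A self-contained sketch of the pattern; the two-entry list and the toy
resolver are hypothetical stand-ins for cuda-lib.def and dlsym:

#include <stdio.h>
#include <string.h>

/* Stand-in for cuda-lib.def: one entry per library function.  */
#define LIB_CALLS \
  LIB_ONE_CALL (alpha) \
  LIB_ONE_CALL (beta)

/* Expansion 1: a struct with one function pointer per entry.  */
#define LIB_ONE_CALL(call) void (*call) (void);
static struct lib_s { LIB_CALLS } lib;
#undef LIB_ONE_CALL

static void do_alpha (void) { puts ("alpha"); }
static void do_beta (void) { puts ("beta"); }

/* Toy resolver standing in for dlsym.  */
static void *
resolve (const char *name)
{
  if (strcmp (name, "alpha") == 0)
    return (void *) do_alpha;
  if (strcmp (name, "beta") == 0)
    return (void *) do_beta;
  return NULL;
}

static int
init_lib (void)
{
  /* Expansion 2: resolve every entry by its stringified name, just as
     init_cuda_lib does over cuda-lib.def.  */
#define LIB_ONE_CALL(call) \
  lib.call = (void (*) (void)) resolve (#call); \
  if (lib.call == NULL) \
    return 0;
  LIB_CALLS
#undef LIB_ONE_CALL
  return 1;
}

int
main (void)
{
  if (init_lib ())
    {
      lib.alpha ();
      lib.beta ();
    }
  return 0;
}
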
++ if (!allow_null && cuda_lib.call == NULL) \ + return false; +- CUDA_CALLS ++#include "cuda-lib.def" ++# undef CUDA_ONE_CALL ++# undef CUDA_ONE_CALL_1 ++# undef CUDA_ONE_CALL_MAYBE_NULL ++ + cuda_lib_inited = true; + return true; + } +-# undef CUDA_ONE_CALL +-# undef CUDA_ONE_CALL_1 + # define CUDA_CALL_PREFIX cuda_lib. + #else ++ ++# define CUDA_ONE_CALL(call) ++# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call) ++#include "cuda-lib.def" ++#undef CUDA_ONE_CALL_MAYBE_NULL ++#undef CUDA_ONE_CALL ++ + # define CUDA_CALL_PREFIX + # define init_cuda_lib() true + #endif + ++#include "secure_getenv.h" ++ ++#undef MIN ++#undef MAX ++#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) ++#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) ++ + /* Convenience macros for the frequently used CUDA library call and + error handling sequence as well as CUDA library calls that + do the error checking themselves or don't do it at all. */ +@@ -171,40 +170,42 @@ init_cuda_lib (void) + #define CUDA_CALL_NOCHECK(FN, ...) \ + CUDA_CALL_PREFIX FN (__VA_ARGS__) + ++#define CUDA_CALL_EXISTS(FN) \ ++ CUDA_CALL_PREFIX FN ++ + static const char * + cuda_error (CUresult r) + { +-#if CUDA_VERSION < 7000 +- /* Specified in documentation and present in library from at least +- 5.5. Not declared in header file prior to 7.0. */ +- extern CUresult cuGetErrorString (CUresult, const char **); +-#endif ++ const char *fallback = "unknown cuda error"; + const char *desc; + ++ if (!CUDA_CALL_EXISTS (cuGetErrorString)) ++ return fallback; ++ + r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc); +- if (r != CUDA_SUCCESS) +- desc = "unknown cuda error"; ++ if (r == CUDA_SUCCESS) ++ return desc; + +- return desc; ++ return fallback; + } + + static unsigned int instantiated_devices = 0; + static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; + ++struct cuda_map ++{ ++ CUdeviceptr d; ++ size_t size; ++ bool active; ++ struct cuda_map *next; ++}; ++ + struct ptx_stream + { + CUstream stream; + pthread_t host_thread; + bool multithreaded; +- +- CUdeviceptr d; +- void *h; +- void *h_begin; +- void *h_end; +- void *h_next; +- void *h_prev; +- void *h_tail; +- ++ struct cuda_map *map; + struct ptx_stream *next; + }; + +@@ -216,12 +217,64 @@ struct nvptx_thread + struct ptx_device *ptx_dev; + }; + +-struct map ++static struct cuda_map * ++cuda_map_create (size_t size) + { +- int async; +- size_t size; +- char mappings[0]; +-}; ++ struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map)); ++ ++ assert (map); ++ ++ map->next = NULL; ++ map->size = size; ++ map->active = false; ++ ++ CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size); ++ assert (map->d); ++ ++ return map; ++} ++ ++static void ++cuda_map_destroy (struct cuda_map *map) ++{ ++ if (map->active) ++ /* Possible reasons for the map to be still active: ++ - the associated async kernel might still be running. ++ - the associated async kernel might have finished, but the ++ corresponding event that should trigger the pop_map has not been ++ processed by event_gc. ++ - the associated sync kernel might have aborted ++ ++ The async cases could happen if the user specified an async region ++ without adding a corresponding wait that is guaranteed to be executed ++ (before returning from main, or in an atexit handler). ++ We do not want to deallocate a device pointer that is still being ++ used, so skip it. ++ ++ In the sync case, the device pointer is no longer used, but deallocating ++ it using cuMemFree will not succeed, so skip it. 
++ ++ TODO: Handle this in a more constructive way, by f.i. waiting for streams ++ to finish before de-allocating them (PR88981), or by ensuring the CUDA ++ lib atexit handler is called before rather than after the libgomp plugin ++ atexit handler (PR83795). */ ++ ; ++ else ++ CUDA_CALL_NOCHECK (cuMemFree, map->d); ++ ++ free (map); ++} ++ ++/* The following map_* routines manage the CUDA device memory that ++ contains the data mapping arguments for cuLaunchKernel. Each ++ asynchronous PTX stream may have multiple pending kernel ++ invocations, which are launched in a FIFO order. As such, the map ++ routines maintains a queue of cuLaunchKernel arguments. ++ ++ Calls to map_push and map_pop must be guarded by ptx_event_lock. ++ Likewise, calls to map_init and map_fini are guarded by ++ ptx_dev_lock inside GOMP_OFFLOAD_init_device and ++ GOMP_OFFLOAD_fini_device, respectively. */ + + static bool + map_init (struct ptx_stream *s) +@@ -229,109 +282,83 @@ map_init (struct ptx_stream *s) + int size = getpagesize (); + + assert (s); +- assert (!s->d); +- assert (!s->h); +- +- CUDA_CALL (cuMemAllocHost, &s->h, size); +- CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0); + +- assert (s->h); ++ s->map = cuda_map_create (size); + +- s->h_begin = s->h; +- s->h_end = s->h_begin + size; +- s->h_next = s->h_prev = s->h_tail = s->h_begin; +- +- assert (s->h_next); +- assert (s->h_end); + return true; + } + + static bool + map_fini (struct ptx_stream *s) + { +- CUDA_CALL (cuMemFreeHost, s->h); ++ assert (s->map->next == NULL); ++ ++ cuda_map_destroy (s->map); ++ + return true; + } + + static void + map_pop (struct ptx_stream *s) + { +- struct map *m; ++ struct cuda_map *next; + + assert (s != NULL); +- assert (s->h_next); +- assert (s->h_prev); +- assert (s->h_tail); +- +- m = s->h_tail; +- +- s->h_tail += m->size; +- +- if (s->h_tail >= s->h_end) +- s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); +- +- if (s->h_next == s->h_tail) +- s->h_prev = s->h_next; + +- assert (s->h_next >= s->h_begin); +- assert (s->h_tail >= s->h_begin); +- assert (s->h_prev >= s->h_begin); ++ if (s->map->next == NULL) ++ { ++ s->map->active = false; ++ return; ++ } + +- assert (s->h_next <= s->h_end); +- assert (s->h_tail <= s->h_end); +- assert (s->h_prev <= s->h_end); ++ next = s->map->next; ++ cuda_map_destroy (s->map); ++ s->map = next; + } + +-static void +-map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d) ++static CUdeviceptr ++map_push (struct ptx_stream *s, size_t size) + { +- int left; +- int offset; +- struct map *m; ++ struct cuda_map *map = NULL; ++ struct cuda_map **t; + +- assert (s != NULL); +- +- left = s->h_end - s->h_next; +- size += sizeof (struct map); +- +- assert (s->h_prev); +- assert (s->h_next); ++ assert (s); ++ assert (s->map); + +- if (size >= left) ++ /* Select an element to push. */ ++ if (s->map->active) ++ map = cuda_map_create (size); ++ else + { +- m = s->h_prev; +- m->size += left; +- s->h_next = s->h_begin; +- +- if (s->h_next + size > s->h_end) +- GOMP_PLUGIN_fatal ("unable to push map"); +- } +- +- assert (s->h_next); +- +- m = s->h_next; +- m->async = async; +- m->size = size; ++ /* Pop the inactive front element. 
*/ ++ struct cuda_map *pop = s->map; ++ s->map = pop->next; ++ pop->next = NULL; + +- offset = (void *)&m->mappings[0] - s->h; ++ if (pop->size < size) ++ { ++ cuda_map_destroy (pop); + +- *d = (void *)(s->d + offset); +- *h = (void *)(s->h + offset); ++ map = cuda_map_create (size); ++ } ++ else ++ map = pop; ++ } + +- s->h_prev = s->h_next; +- s->h_next += size; ++ /* Check that the element is as expected. */ ++ assert (map->next == NULL); ++ assert (!map->active); + +- assert (s->h_prev); +- assert (s->h_next); ++ /* Mark the element active. */ ++ map->active = true; + +- assert (s->h_next >= s->h_begin); +- assert (s->h_tail >= s->h_begin); +- assert (s->h_prev >= s->h_begin); +- assert (s->h_next <= s->h_end); +- assert (s->h_tail <= s->h_end); +- assert (s->h_prev <= s->h_end); ++ /* Push the element to the back of the list. */ ++ for (t = &s->map; (*t) != NULL; t = &(*t)->next) ++ ; ++ assert (t != NULL && *t == NULL); ++ *t = map; + +- return; ++ return map->d; + } + + /* Target data function launch information. */ +@@ -411,6 +438,10 @@ struct ptx_device + int num_sms; + int regs_per_block; + int regs_per_sm; ++ int warp_size; ++ int max_threads_per_block; ++ int max_threads_per_multiprocessor; ++ int default_dims[GOMP_DIM_MAX]; + + struct ptx_image_data *images; /* Images loaded on device. */ + pthread_mutex_t image_lock; /* Lock for above list. */ +@@ -458,8 +489,6 @@ init_streams_for_device (struct ptx_devi + null_stream->stream = NULL; + null_stream->host_thread = pthread_self (); + null_stream->multithreaded = true; +- null_stream->d = (CUdeviceptr) NULL; +- null_stream->h = NULL; + if (!map_init (null_stream)) + return false; + +@@ -594,8 +623,6 @@ select_stream_for_async (int async, pthr + s->host_thread = thread; + s->multithreaded = false; + +- s->d = (CUdeviceptr) NULL; +- s->h = NULL; + if (!map_init (s)) + { + pthread_mutex_unlock (&ptx_dev->stream_lock); +@@ -777,9 +804,11 @@ nvptx_open_device (int n) + &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev); + ptx_dev->regs_per_block = pi; + +- /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only ++ /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only + in CUDA 6.0 and newer. */ +- r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev); ++ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, ++ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, ++ dev); + /* Fallback: use limit of registers per block, which is usually equal. 
*/ + if (r == CUDA_ERROR_INVALID_VALUE) + pi = ptx_dev->regs_per_block; +@@ -797,12 +826,24 @@ nvptx_open_device (int n) + GOMP_PLUGIN_error ("Only warp size 32 is supported"); + return NULL; + } ++ ptx_dev->warp_size = pi; ++ ++ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi, ++ CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev); ++ ptx_dev->max_threads_per_block = pi; ++ ++ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi, ++ CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); ++ ptx_dev->max_threads_per_multiprocessor = pi; + + r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines, + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); + if (r != CUDA_SUCCESS) + async_engines = 1; + ++ for (int i = 0; i != GOMP_DIM_MAX; i++) ++ ptx_dev->default_dims[i] = 0; ++ + ptx_dev->images = NULL; + pthread_mutex_init (&ptx_dev->image_lock, NULL); + +@@ -876,12 +917,42 @@ notify_var (const char *var_name, const + GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var); + } + ++static void ++process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o) ++{ ++ const char *var_name = "GOMP_NVPTX_JIT"; ++ const char *env_var = secure_getenv (var_name); ++ notify_var (var_name, env_var); ++ ++ if (env_var == NULL) ++ return; ++ ++ const char *c = env_var; ++ while (*c != '\0') ++ { ++ while (*c == ' ') ++ c++; ++ ++ if (c[0] == '-' && c[1] == 'O' ++ && '0' <= c[2] && c[2] <= '4' ++ && (c[3] == '\0' || c[3] == ' ')) ++ { ++ *gomp_nvptx_o = c[2] - '0'; ++ c += 3; ++ continue; ++ } ++ ++ GOMP_PLUGIN_error ("Error parsing %s", var_name); ++ break; ++ } ++} ++ + static bool + link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, + unsigned num_objs) + { +- CUjit_option opts[6]; +- void *optvals[6]; ++ CUjit_option opts[7]; ++ void *optvals[7]; + float elapsed = 0.0; + char elog[1024]; + char ilog[16384]; +@@ -908,16 +979,41 @@ link_ptx (CUmodule *module, const struct + opts[5] = CU_JIT_LOG_VERBOSE; + optvals[5] = (void *) 1; + +- CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate); ++ static intptr_t gomp_nvptx_o = -1; ++ ++ static bool init_done = false; ++ if (!init_done) ++ { ++ process_GOMP_NVPTX_JIT (&gomp_nvptx_o); ++ init_done = true; ++ } ++ ++ int nopts = 6; ++ if (gomp_nvptx_o != -1) ++ { ++ opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL; ++ optvals[nopts] = (void *) gomp_nvptx_o; ++ nopts++; ++ } ++ ++ if (CUDA_CALL_EXISTS (cuLinkCreate_v2)) ++ CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate); ++ else ++ CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate); + + for (; num_objs--; ptx_objs++) + { + /* cuLinkAddData's 'data' argument erroneously omits the const + qualifier. 
*/ + GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code); +- r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX, +- (char *) ptx_objs->code, ptx_objs->size, +- 0, 0, 0, 0); ++ if (CUDA_CALL_EXISTS (cuLinkAddData_v2)) ++ r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX, ++ (char *) ptx_objs->code, ptx_objs->size, ++ 0, 0, 0, 0); ++ else ++ r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX, ++ (char *) ptx_objs->code, ptx_objs->size, ++ 0, 0, 0, 0); + if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); +@@ -1067,8 +1163,10 @@ nvptx_exec (void (*fn), size_t mapnum, v + int i; + struct ptx_stream *dev_str; + void *kargs[1]; +- void *hp, *dp; ++ void *hp; ++ CUdeviceptr dp = 0; + struct nvptx_thread *nvthd = nvptx_thread (); ++ int warp_size = nvthd->ptx_dev->warp_size; + const char *maybe_abort_msg = "(perhaps abort was called)"; + + function = targ_fn->fn; +@@ -1090,68 +1188,36 @@ nvptx_exec (void (*fn), size_t mapnum, v + + if (seen_zero) + { +- /* See if the user provided GOMP_OPENACC_DIM environment +- variable to specify runtime defaults. */ +- static int default_dims[GOMP_DIM_MAX]; +- + pthread_mutex_lock (&ptx_dev_lock); +- if (!default_dims[0]) +- { +- const char *var_name = "GOMP_OPENACC_DIM"; +- /* We only read the environment variable once. You can't +- change it in the middle of execution. The syntax is +- the same as for the -fopenacc-dim compilation option. */ +- const char *env_var = getenv (var_name); +- notify_var (var_name, env_var); +- if (env_var) +- { +- const char *pos = env_var; + +- for (i = 0; *pos && i != GOMP_DIM_MAX; i++) +- { +- if (i && *pos++ != ':') +- break; +- if (*pos != ':') +- { +- const char *eptr; +- +- errno = 0; +- long val = strtol (pos, (char **)&eptr, 10); +- if (errno || val < 0 || (unsigned)val != val) +- break; +- default_dims[i] = (int)val; +- pos = eptr; +- } +- } +- } ++ static int gomp_openacc_dims[GOMP_DIM_MAX]; ++ if (!gomp_openacc_dims[0]) ++ { ++ /* See if the user provided GOMP_OPENACC_DIM environment ++ variable to specify runtime defaults. */ ++ for (int i = 0; i < GOMP_DIM_MAX; ++i) ++ gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i); ++ } + +- int warp_size, block_size, dev_size, cpu_size; +- CUdevice dev = nvptx_thread()->ptx_dev->dev; +- /* 32 is the default for known hardware. 
*/ +- int gang = 0, worker = 32, vector = 32; +- CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm; +- +- cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK; +- cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE; +- cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT; +- cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR; +- +- if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb, +- dev) == CUDA_SUCCESS +- && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws, +- dev) == CUDA_SUCCESS +- && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc, +- dev) == CUDA_SUCCESS +- && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm, +- dev) == CUDA_SUCCESS) +- { +- GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d," +- " dev_size=%d, cpu_size=%d\n", +- warp_size, block_size, dev_size, cpu_size); +- gang = (cpu_size / block_size) * dev_size; +- worker = block_size / warp_size; +- vector = warp_size; +- } ++ if (!nvthd->ptx_dev->default_dims[0]) ++ { ++ int default_dims[GOMP_DIM_MAX]; ++ for (int i = 0; i < GOMP_DIM_MAX; ++i) ++ default_dims[i] = gomp_openacc_dims[i]; ++ ++ int gang, worker, vector; ++ { ++ int block_size = nvthd->ptx_dev->max_threads_per_block; ++ int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor; ++ int dev_size = nvthd->ptx_dev->num_sms; ++ GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d," ++ " dev_size=%d, cpu_size=%d\n", ++ warp_size, block_size, dev_size, cpu_size); ++ ++ gang = (cpu_size / block_size) * dev_size; ++ worker = block_size / warp_size; ++ vector = warp_size; ++ } + + /* There is no upper bound on the gang size. The best size + matches the hardware configuration. Logical gangs are +@@ -1172,29 +1238,150 @@ nvptx_exec (void (*fn), size_t mapnum, v + default_dims[GOMP_DIM_GANG], + default_dims[GOMP_DIM_WORKER], + default_dims[GOMP_DIM_VECTOR]); ++ ++ for (i = 0; i != GOMP_DIM_MAX; i++) ++ nvthd->ptx_dev->default_dims[i] = default_dims[i]; + } + pthread_mutex_unlock (&ptx_dev_lock); + +- for (i = 0; i != GOMP_DIM_MAX; i++) +- if (!dims[i]) +- dims[i] = default_dims[i]; +- } +- +- /* This reserves a chunk of a pre-allocated page of memory mapped on both +- the host and the device. HP is a host pointer to the new chunk, and DP is +- the corresponding device pointer. */ +- map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp); +- +- GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); +- +- /* Copy the array of arguments to the mapped page. */ +- for (i = 0; i < mapnum; i++) +- ((void **) hp)[i] = devaddrs[i]; +- +- /* Copy the (device) pointers to arguments to the device (dp and hp might in +- fact have the same value on a unified-memory system). 
*/ +- CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp, +- mapnum * sizeof (void *)); ++ { ++ bool default_dim_p[GOMP_DIM_MAX]; ++ for (i = 0; i != GOMP_DIM_MAX; i++) ++ default_dim_p[i] = !dims[i]; ++ ++ if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize)) ++ { ++ for (i = 0; i != GOMP_DIM_MAX; i++) ++ if (default_dim_p[i]) ++ dims[i] = nvthd->ptx_dev->default_dims[i]; ++ ++ if (default_dim_p[GOMP_DIM_VECTOR]) ++ dims[GOMP_DIM_VECTOR] ++ = MIN (dims[GOMP_DIM_VECTOR], ++ (targ_fn->max_threads_per_block / warp_size ++ * warp_size)); ++ ++ if (default_dim_p[GOMP_DIM_WORKER]) ++ dims[GOMP_DIM_WORKER] ++ = MIN (dims[GOMP_DIM_WORKER], ++ targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]); ++ } ++ else ++ { ++ /* Handle the case that the compiler allows the runtime to choose ++ the vector-length conservatively, by ignoring ++ gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle ++ it. */ ++ int vectors = 0; ++ /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that ++ gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not ++ exceed targ_fn->max_threads_per_block. */ ++ int workers = gomp_openacc_dims[GOMP_DIM_WORKER]; ++ int gangs = gomp_openacc_dims[GOMP_DIM_GANG]; ++ int grids, blocks; ++ ++ CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids, ++ &blocks, function, NULL, 0, ++ dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]); ++ GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: " ++ "grid = %d, block = %d\n", grids, blocks); ++ ++ /* Keep the num_gangs proportional to the block size. In ++ the case were a block size is limited by shared-memory ++ or the register file capacity, the runtime will not ++ excessively over assign gangs to the multiprocessor ++ units if their state is going to be swapped out even ++ more than necessary. The constant factor 2 is there to ++ prevent threads from idling when there is insufficient ++ work for them. */ ++ if (gangs == 0) ++ gangs = 2 * grids * (blocks / warp_size); ++ ++ if (vectors == 0) ++ vectors = warp_size; ++ ++ if (workers == 0) ++ { ++ int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR] ++ ? vectors ++ : dims[GOMP_DIM_VECTOR]); ++ workers = blocks / actual_vectors; ++ workers = MAX (workers, 1); ++ /* If we need a per-worker barrier ... . */ ++ if (actual_vectors > 32) ++ /* Don't use more barriers than available. */ ++ workers = MIN (workers, 15); ++ } ++ ++ for (i = 0; i != GOMP_DIM_MAX; i++) ++ if (default_dim_p[i]) ++ switch (i) ++ { ++ case GOMP_DIM_GANG: dims[i] = gangs; break; ++ case GOMP_DIM_WORKER: dims[i] = workers; break; ++ case GOMP_DIM_VECTOR: dims[i] = vectors; break; ++ default: GOMP_PLUGIN_fatal ("invalid dim"); ++ } ++ } ++ } ++ } ++ ++ /* Check if the accelerator has sufficient hardware resources to ++ launch the offloaded kernel. */ ++ if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] ++ > targ_fn->max_threads_per_block) ++ { ++ const char *msg ++ = ("The Nvidia accelerator has insufficient resources to launch '%s'" ++ " with num_workers = %d and vector_length = %d" ++ "; " ++ "recompile the program with 'num_workers = x and vector_length = y'" ++ " on that offloaded region or '-fopenacc-dim=:x:y' where" ++ " x * y <= %d" ++ ".\n"); ++ GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER], ++ dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block); ++ } ++ ++ /* Check if the accelerator has sufficient barrier resources to ++ launch the offloaded kernel. 
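++     PTX exposes 16 named barriers per thread block; the check below
++     assumes one of them stays reserved, leaving 15 for per-worker
++     barriers, which only become necessary once vector_length exceeds
++     the 32-lane warp size.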
*/ ++ if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32) ++ { ++ const char *msg ++ = ("The Nvidia accelerator has insufficient barrier resources to launch" ++ " '%s' with num_workers = %d and vector_length = %d" ++ "; " ++ "recompile the program with 'num_workers = x' on that offloaded" ++ " region or '-fopenacc-dim=:x:' where x <= 15" ++ "; " ++ "or, recompile the program with 'vector_length = 32' on that" ++ " offloaded region or '-fopenacc-dim=::32'" ++ ".\n"); ++ GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER], ++ dims[GOMP_DIM_VECTOR]); ++ } ++ ++ if (mapnum > 0) ++ { ++ /* This reserves a chunk of a pre-allocated page of memory mapped on both ++ the host and the device. HP is a host pointer to the new chunk, and DP is ++ the corresponding device pointer. */ ++ pthread_mutex_lock (&ptx_event_lock); ++ dp = map_push (dev_str, mapnum * sizeof (void *)); ++ pthread_mutex_unlock (&ptx_event_lock); ++ ++ GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); ++ ++ /* Copy the array of arguments to the mapped page. */ ++ hp = alloca(sizeof(void *) * mapnum); ++ for (i = 0; i < mapnum; i++) ++ ((void **) hp)[i] = devaddrs[i]; ++ ++ /* Copy the (device) pointers to arguments to the device */ ++ CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, ++ mapnum * sizeof (void *)); ++ } ++ + GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" + " gangs=%u, workers=%u, vectors=%u\n", + __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG], +@@ -1239,7 +1426,8 @@ nvptx_exec (void (*fn), size_t mapnum, v + + CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream); + +- event_add (PTX_EVT_KNL, e, (void *)dev_str, 0); ++ if (mapnum > 0) ++ event_add (PTX_EVT_KNL, e, (void *)dev_str, 0); + } + #else + r = CUDA_CALL_NOCHECK (cuCtxSynchronize, ); +@@ -1256,7 +1444,10 @@ nvptx_exec (void (*fn), size_t mapnum, v + #ifndef DISABLE_ASYNC + if (async < acc_async_noval) + #endif +- map_pop (dev_str); ++ { ++ if (mapnum > 0) ++ map_pop (dev_str); ++ } + } + + void * openacc_get_current_cuda_context (void); +@@ -1415,9 +1606,8 @@ nvptx_async_test (int async) + struct ptx_stream *s; + + s = select_stream_for_async (async, pthread_self (), false, NULL); +- + if (!s) +- GOMP_PLUGIN_fatal ("unknown async %d", async); ++ return 1; + + r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream); + if (r == CUDA_SUCCESS) +@@ -1472,7 +1662,7 @@ nvptx_wait (int async) + + s = select_stream_for_async (async, pthread_self (), false, NULL); + if (!s) +- GOMP_PLUGIN_fatal ("unknown async %d", async); ++ return; + + CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream); + +@@ -1486,16 +1676,17 @@ nvptx_wait_async (int async1, int async2 + struct ptx_stream *s1, *s2; + pthread_t self = pthread_self (); + ++ s1 = select_stream_for_async (async1, self, false, NULL); ++ if (!s1) ++ return; ++ + /* The stream that is waiting (rather than being waited for) doesn't + necessarily have to exist already. */ + s2 = select_stream_for_async (async2, self, true, NULL); + +- s1 = select_stream_for_async (async1, self, false, NULL); +- if (!s1) +- GOMP_PLUGIN_fatal ("invalid async 1\n"); +- ++ /* A stream is always synchronized with itself. 
*/ + if (s1 == s2) +- GOMP_PLUGIN_fatal ("identical parameters"); ++ return; + + e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); + +@@ -1629,8 +1820,14 @@ nvptx_set_cuda_stream (int async, void * + pthread_t self = pthread_self (); + struct nvptx_thread *nvthd = nvptx_thread (); + +- if (async < 0) +- GOMP_PLUGIN_fatal ("bad async %d", async); ++ /* Due to the "null_stream" usage for "acc_async_sync", this cannot be used ++ to change the stream handle associated with "acc_async_sync". */ ++ if (async == acc_async_sync) ++ { ++ GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated" ++ " with \"acc_async_sync\"\n"); ++ return 0; ++ } + + pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); + +@@ -1739,6 +1936,12 @@ GOMP_OFFLOAD_fini_device (int n) + instantiated_devices--; + } + ++ if (instantiated_devices == 0) ++ { ++ free (ptx_devices); ++ ptx_devices = NULL; ++ } ++ + pthread_mutex_unlock (&ptx_dev_lock); + return true; + } +--- libgomp/plugin/configfrag.ac.jj 2018-04-25 09:40:31.914655581 +0200 ++++ libgomp/plugin/configfrag.ac 2019-05-07 18:46:36.533109624 +0200 +@@ -26,8 +26,6 @@ + # see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + # . + +-offload_targets= +-AC_SUBST(offload_targets) + plugin_support=yes + AC_CHECK_LIB(dl, dlsym, , [plugin_support=no]) + if test x"$plugin_support" = xyes; then +@@ -59,7 +57,11 @@ AC_ARG_WITH(cuda-driver-lib, + [AS_HELP_STRING([--with-cuda-driver-lib=PATH], + [specify directory for the installed CUDA driver library])]) + case "x$with_cuda_driver" in +- x | xno) ;; ++ x) ;; ++ xno) ++ CUDA_DRIVER_INCLUDE=no ++ CUDA_DRIVER_LIB=no ++ ;; + *) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include + CUDA_DRIVER_LIB=$with_cuda_driver/lib + ;; +@@ -70,10 +72,12 @@ fi + if test "x$with_cuda_driver_lib" != x; then + CUDA_DRIVER_LIB=$with_cuda_driver_lib + fi +-if test "x$CUDA_DRIVER_INCLUDE" != x; then ++if test "x$CUDA_DRIVER_INCLUDE" != x \ ++ && test "x$CUDA_DRIVER_INCLUDE" != xno; then + CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE + fi +-if test "x$CUDA_DRIVER_LIB" != x; then ++if test "x$CUDA_DRIVER_LIB" != x \ ++ && test "x$CUDA_DRIVER_LIB" != xno; then + CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB + fi + +@@ -133,7 +137,13 @@ AC_SUBST(PLUGIN_HSA_CPPFLAGS) + AC_SUBST(PLUGIN_HSA_LDFLAGS) + AC_SUBST(PLUGIN_HSA_LIBS) + +-# Get offload targets and path to install tree of offloading compiler. ++# Parse '--enable-offload-targets', figure out the corresponding libgomp ++# plugins, and configure to find the corresponding offload compilers. ++# 'offload_plugins' and 'offload_targets' will be populated in the same order. 
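
The comment above is where the offload_plugins/offload_targets split starts
paying off: offload_plugins feeds the OFFLOAD_PLUGINS define at the bottom of
this file, and libgomp's target.c (not part of this hunk) walks that
comma-separated list to load one plugin DSO per entry.  A standalone sketch of
such a walk; the two-entry list and the exact DSO naming are illustrative
assumptions:

#include <stdio.h>
#include <string.h>

/* Stand-in for the configure-generated OFFLOAD_PLUGINS define.  */
#define OFFLOAD_PLUGINS "nvptx,hsa"

int
main (void)
{
  const char *cur = OFFLOAD_PLUGINS;
  while (*cur)
    {
      const char *next = strchr (cur, ',');
      size_t len = next ? (size_t) (next - cur) : strlen (cur);
      /* libgomp derives the plugin DSO name from each entry.  */
      printf ("would load: libgomp-plugin-%.*s.so.1\n", (int) len, cur);
      cur += len + (next != NULL);
    }
  return 0;
}
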
++offload_plugins= ++offload_targets= ++AC_SUBST(offload_plugins) ++AC_SUBST(offload_targets) + offload_additional_options= + offload_additional_lib_paths= + AC_SUBST(offload_additional_options) +@@ -142,36 +152,41 @@ if test x"$enable_offload_targets" != x; + for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do + tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'` + tgt=`echo $tgt | sed 's/=.*//'` +- tgt_name= ++ tgt_plugin= + case $tgt in + *-intelmic-* | *-intelmicemul-*) +- tgt_name=intelmic ++ tgt_plugin=intelmic + ;; + nvptx*) +- tgt_name=nvptx ++ tgt_plugin=nvptx + PLUGIN_NVPTX=$tgt +- PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS +- PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS +- PLUGIN_NVPTX_LIBS='-lcuda' +- +- PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS +- CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" +- PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS +- LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" +- PLUGIN_NVPTX_save_LIBS=$LIBS +- LIBS="$PLUGIN_NVPTX_LIBS $LIBS" +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM( +- [#include "cuda.h"], +- [CUresult r = cuCtxPushCurrent (NULL);])], +- [PLUGIN_NVPTX=1]) +- CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS +- LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS +- LIBS=$PLUGIN_NVPTX_save_LIBS ++ if test "x$CUDA_DRIVER_LIB" != xno \ ++ && test "x$CUDA_DRIVER_LIB" != xno; then ++ PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS ++ PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS ++ PLUGIN_NVPTX_LIBS='-lcuda' ++ ++ PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS ++ CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" ++ PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS ++ LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" ++ PLUGIN_NVPTX_save_LIBS=$LIBS ++ LIBS="$PLUGIN_NVPTX_LIBS $LIBS" ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM( ++ [#include "cuda.h"], ++ [CUresult r = cuCtxPushCurrent (NULL);])], ++ [PLUGIN_NVPTX=1]) ++ CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS ++ LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS ++ LIBS=$PLUGIN_NVPTX_save_LIBS ++ fi + case $PLUGIN_NVPTX in + nvptx*) +- if test "x$CUDA_DRIVER_INCLUDE" = x \ +- && test "x$CUDA_DRIVER_LIB" = x; then ++ if (test "x$CUDA_DRIVER_INCLUDE" = x \ ++ || test "x$CUDA_DRIVER_INCLUDE" = xno) \ ++ && (test "x$CUDA_DRIVER_LIB" = x \ ++ || test "x$CUDA_DRIVER_LIB" = xno); then + PLUGIN_NVPTX=1 + PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda' + PLUGIN_NVPTX_LIBS='-ldl' +@@ -191,7 +206,7 @@ if test x"$enable_offload_targets" != x; + PLUGIN_HSA=0 + ;; + *) +- tgt_name=hsa ++ tgt_plugin=hsa + PLUGIN_HSA=$tgt + PLUGIN_HSA_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS + PLUGIN_HSA_LDFLAGS="$HSA_RUNTIME_LDFLAGS" +@@ -209,7 +224,7 @@ if test x"$enable_offload_targets" != x; + LDFLAGS=$PLUGIN_HSA_save_LDFLAGS + LIBS=$PLUGIN_HSA_save_LIBS + case $PLUGIN_HSA in +- hsa*) ++ hsa*) + HSA_PLUGIN=0 + AC_MSG_ERROR([HSA run-time package required for HSA support]) + ;; +@@ -226,16 +241,19 @@ if test x"$enable_offload_targets" != x; + AC_MSG_ERROR([unknown offload target specified]) + ;; + esac +- if test x"$tgt_name" = x; then +- # Don't configure libgomp for this offloading target if we don't build +- # the corresponding plugin. ++ if test x"$tgt_plugin" = x; then ++ # Not configuring libgomp for this offload target if we're not building ++ # the corresponding offload plugin. 
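
Back in the OpenACC runtime, the nvptx_set_cuda_stream and async_valid_p
changes earlier in this patch turn two formerly fatal misuses into soft
failures.  A hedged usage sketch; the async queue number is illustrative, and
the returned handle may be NULL if the queue has not been created yet:

#include <openacc.h>
#include <stdio.h>

int
main (void)
{
  acc_init (acc_device_default);

  /* Interop: the CUstream libgomp associates with async queue 1;
     NULL on non-CUDA devices or if the queue does not exist yet.  */
  void *stream = acc_get_cuda_stream (1);
  printf ("CUDA stream for async queue 1: %p\n", stream);

  /* Rebinding the stream tied to acc_async_sync is now refused
     (returns 0) instead of aborting the program.  */
  if (!acc_set_cuda_stream (acc_async_sync, stream))
    printf ("acc_set_cuda_stream (acc_async_sync, ...) refused\n");

  acc_shutdown (acc_device_default);
  return 0;
}
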
+ continue +- elif test x"$offload_targets" = x; then +- offload_targets=$tgt_name ++ elif test x"$offload_plugins" = x; then ++ offload_plugins=$tgt_plugin ++ offload_targets=$tgt + else +- offload_targets=$offload_targets,$tgt_name ++ offload_plugins=$offload_plugins,$tgt_plugin ++ offload_targets=$offload_targets,$tgt + fi +- if test "$tgt_name" = hsa; then ++ # Configure additional search paths. ++ if test "$tgt_plugin" = hsa; then + # Offloading compilation is all handled by the target compiler. + : + elif test x"$tgt_dir" != x; then +@@ -247,8 +265,8 @@ if test x"$enable_offload_targets" != x; + fi + done + fi +-AC_DEFINE_UNQUOTED(OFFLOAD_TARGETS, "$offload_targets", +- [Define to offload targets, separated by commas.]) ++AC_DEFINE_UNQUOTED(OFFLOAD_PLUGINS, "$offload_plugins", ++ [Define to offload plugins, separated by commas.]) + AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1]) + AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX], + [Define to 1 if the NVIDIA plugin is built, 0 if not.]) +--- libgomp/affinity-fmt.c.jj 2019-05-07 18:46:36.285113585 +0200 ++++ libgomp/affinity-fmt.c 2019-05-07 18:46:36.285113585 +0200 +@@ -0,0 +1,495 @@ ++/* Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek . ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . */ ++ ++#include "libgomp.h" ++#include ++#include ++#include ++#ifdef HAVE_UNISTD_H ++#include ++#endif ++#ifdef HAVE_INTTYPES_H ++# include /* For PRIx64. 
*/ ++#endif ++#ifdef HAVE_UNAME ++#include ++#endif ++ ++void ++gomp_print_string (const char *str, size_t len) ++{ ++ fwrite (str, 1, len, stderr); ++} ++ ++void ++gomp_set_affinity_format (const char *format, size_t len) ++{ ++ if (len < gomp_affinity_format_len) ++ memcpy (gomp_affinity_format_var, format, len); ++ else ++ { ++ char *p; ++ if (gomp_affinity_format_len) ++ p = gomp_realloc (gomp_affinity_format_var, len + 1); ++ else ++ p = gomp_malloc (len + 1); ++ memcpy (p, format, len); ++ gomp_affinity_format_var = p; ++ gomp_affinity_format_len = len + 1; ++ } ++ gomp_affinity_format_var[len] = '\0'; ++} ++ ++void ++omp_set_affinity_format (const char *format) ++{ ++ gomp_set_affinity_format (format, strlen (format)); ++} ++ ++size_t ++omp_get_affinity_format (char *buffer, size_t size) ++{ ++ size_t len = strlen (gomp_affinity_format_var); ++ if (size) ++ { ++ if (len < size) ++ memcpy (buffer, gomp_affinity_format_var, len + 1); ++ else ++ { ++ memcpy (buffer, gomp_affinity_format_var, size - 1); ++ buffer[size - 1] = '\0'; ++ } ++ } ++ return len; ++} ++ ++void ++gomp_display_string (char *buffer, size_t size, size_t *ret, ++ const char *str, size_t len) ++{ ++ size_t r = *ret; ++ if (size && r < size) ++ { ++ size_t l = len; ++ if (size - r < len) ++ l = size - r; ++ memcpy (buffer + r, str, l); ++ } ++ *ret += len; ++ if (__builtin_expect (r > *ret, 0)) ++ gomp_fatal ("overflow in omp_capture_affinity"); ++} ++ ++static void ++gomp_display_repeat (char *buffer, size_t size, size_t *ret, ++ char c, size_t len) ++{ ++ size_t r = *ret; ++ if (size && r < size) ++ { ++ size_t l = len; ++ if (size - r < len) ++ l = size - r; ++ memset (buffer + r, c, l); ++ } ++ *ret += len; ++ if (__builtin_expect (r > *ret, 0)) ++ gomp_fatal ("overflow in omp_capture_affinity"); ++} ++ ++static void ++gomp_display_num (char *buffer, size_t size, size_t *ret, ++ bool zero, bool right, size_t sz, char *buf) ++{ ++ size_t l = strlen (buf); ++ if (sz == (size_t) -1 || l >= sz) ++ { ++ gomp_display_string (buffer, size, ret, buf, l); ++ return; ++ } ++ if (zero) ++ { ++ if (buf[0] == '-') ++ gomp_display_string (buffer, size, ret, buf, 1); ++ else if (buf[0] == '0' && buf[1] == 'x') ++ gomp_display_string (buffer, size, ret, buf, 2); ++ gomp_display_repeat (buffer, size, ret, '0', sz - l); ++ if (buf[0] == '-') ++ gomp_display_string (buffer, size, ret, buf + 1, l - 1); ++ else if (buf[0] == '0' && buf[1] == 'x') ++ gomp_display_string (buffer, size, ret, buf + 2, l - 2); ++ else ++ gomp_display_string (buffer, size, ret, buf, l); ++ } ++ else if (right) ++ { ++ gomp_display_repeat (buffer, size, ret, ' ', sz - l); ++ gomp_display_string (buffer, size, ret, buf, l); ++ } ++ else ++ { ++ gomp_display_string (buffer, size, ret, buf, l); ++ gomp_display_repeat (buffer, size, ret, ' ', sz - l); ++ } ++} ++ ++static void ++gomp_display_int (char *buffer, size_t size, size_t *ret, ++ bool zero, bool right, size_t sz, int num) ++{ ++ char buf[3 * sizeof (int) + 2]; ++ sprintf (buf, "%d", num); ++ gomp_display_num (buffer, size, ret, zero, right, sz, buf); ++} ++ ++static void ++gomp_display_string_len (char *buffer, size_t size, size_t *ret, ++ bool right, size_t sz, char *str, size_t len) ++{ ++ if (sz == (size_t) -1 || len >= sz) ++ { ++ gomp_display_string (buffer, size, ret, str, len); ++ return; ++ } ++ ++ if (right) ++ { ++ gomp_display_repeat (buffer, size, ret, ' ', sz - len); ++ gomp_display_string (buffer, size, ret, str, len); ++ } ++ else ++ { ++ gomp_display_string (buffer, size, ret, str, len); 
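++      /* Pad with spaces on the right: the text is left-justified
++	 in the field.  */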
++ gomp_display_repeat (buffer, size, ret, ' ', sz - len); ++ } ++} ++ ++static void ++gomp_display_hostname (char *buffer, size_t size, size_t *ret, ++ bool right, size_t sz) ++{ ++#ifdef HAVE_GETHOSTNAME ++ { ++ char buf[256]; ++ char *b = buf; ++ size_t len = 256; ++ do ++ { ++ b[len - 1] = '\0'; ++ if (gethostname (b, len - 1) == 0) ++ { ++ size_t l = strlen (b); ++ if (l < len - 1) ++ { ++ gomp_display_string_len (buffer, size, ret, ++ right, sz, b, l); ++ if (b != buf) ++ free (b); ++ return; ++ } ++ } ++ if (len == 1048576) ++ break; ++ len = len * 2; ++ if (len == 512) ++ b = gomp_malloc (len); ++ else ++ b = gomp_realloc (b, len); ++ } ++ while (1); ++ if (b != buf) ++ free (b); ++ } ++#endif ++#ifdef HAVE_UNAME ++ { ++ struct utsname buf; ++ if (uname (&buf) == 0) ++ { ++ gomp_display_string_len (buffer, size, ret, right, sz, ++ buf.nodename, strlen (buf.nodename)); ++ return; ++ } ++ } ++#endif ++ gomp_display_string_len (buffer, size, ret, right, sz, "node", 4); ++} ++ ++struct affinity_types_struct { ++ char long_str[18]; ++ char long_len; ++ char short_c; }; ++ ++static struct affinity_types_struct affinity_types[] = ++{ ++#define AFFINITY_TYPE(l, s) \ ++ { #l, sizeof (#l) - 1, s } ++ AFFINITY_TYPE (team_num, 't'), ++ AFFINITY_TYPE (num_teams, 'T'), ++ AFFINITY_TYPE (nesting_level, 'L'), ++ AFFINITY_TYPE (thread_num, 'n'), ++ AFFINITY_TYPE (num_threads, 'N'), ++ AFFINITY_TYPE (ancestor_tnum, 'a'), ++ AFFINITY_TYPE (host, 'H'), ++ AFFINITY_TYPE (process_id, 'P'), ++ AFFINITY_TYPE (native_thread_id, 'i'), ++ AFFINITY_TYPE (thread_affinity, 'A') ++#undef AFFINITY_TYPE ++}; ++ ++size_t ++gomp_display_affinity (char *buffer, size_t size, ++ const char *format, gomp_thread_handle handle, ++ struct gomp_team_state *ts, unsigned int place) ++{ ++ size_t ret = 0; ++ do ++ { ++ const char *p = strchr (format, '%'); ++ bool zero = false; ++ bool right = false; ++ size_t sz = -1; ++ char c; ++ int val; ++ if (p == NULL) ++ p = strchr (format, '\0'); ++ if (p != format) ++ gomp_display_string (buffer, size, &ret, ++ format, p - format); ++ if (*p == '\0') ++ break; ++ p++; ++ if (*p == '%') ++ { ++ gomp_display_string (buffer, size, &ret, "%", 1); ++ format = p + 1; ++ continue; ++ } ++ if (*p == '0') ++ { ++ zero = true; ++ p++; ++ if (*p != '.') ++ gomp_fatal ("leading zero not followed by dot in affinity format"); ++ } ++ if (*p == '.') ++ { ++ right = true; ++ p++; ++ } ++ if (*p >= '1' && *p <= '9') ++ { ++ char *end; ++ sz = strtoul (p, &end, 10); ++ p = end; ++ } ++ else if (zero || right) ++ gomp_fatal ("leading zero or right justification in affinity format " ++ "requires size"); ++ c = *p; ++ if (c == '{') ++ { ++ int i; ++ for (i = 0; ++ i < sizeof (affinity_types) / sizeof (affinity_types[0]); ++i) ++ if (strncmp (p + 1, affinity_types[i].long_str, ++ affinity_types[i].long_len) == 0 ++ && p[affinity_types[i].long_len + 1] == '}') ++ { ++ c = affinity_types[i].short_c; ++ p += affinity_types[i].long_len + 1; ++ break; ++ } ++ if (c == '{') ++ { ++ char *q = strchr (p + 1, '}'); ++ if (q) ++ gomp_fatal ("unsupported long type name '%.*s' in affinity " ++ "format", (int) (q - (p + 1)), p + 1); ++ else ++ gomp_fatal ("unterminated long type name '%s' in affinity " ++ "format", p + 1); ++ } ++ } ++ switch (c) ++ { ++ case 't': ++ val = omp_get_team_num (); ++ goto do_int; ++ case 'T': ++ val = omp_get_num_teams (); ++ goto do_int; ++ case 'L': ++ val = ts->level; ++ goto do_int; ++ case 'n': ++ val = ts->team_id; ++ goto do_int; ++ case 'N': ++ val = ts->team ? 
ts->team->nthreads : 1; ++ goto do_int; ++ case 'a': ++ val = ts->team ? ts->team->prev_ts.team_id : -1; ++ goto do_int; ++ case 'H': ++ gomp_display_hostname (buffer, size, &ret, right, sz); ++ break; ++ case 'P': ++#ifdef HAVE_GETPID ++ val = getpid (); ++#else ++ val = 0; ++#endif ++ goto do_int; ++ case 'i': ++#if defined(LIBGOMP_USE_PTHREADS) && defined(__GNUC__) ++ { ++ char buf[3 * (sizeof (handle) + sizeof (uintptr_t) + sizeof (int)) ++ + 4]; ++ /* This macro returns expr unmodified for integral or pointer ++ types and 0 for anything else (e.g. aggregates). */ ++#define gomp_nonaggregate(expr) \ ++ __builtin_choose_expr (__builtin_classify_type (expr) == 1 \ ++ || __builtin_classify_type (expr) == 5, expr, 0) ++ /* This macro returns expr unmodified for integral types, ++ (uintptr_t) (expr) for pointer types and 0 for anything else ++ (e.g. aggregates). */ ++#define gomp_integral(expr) \ ++ __builtin_choose_expr (__builtin_classify_type (expr) == 5, \ ++ (uintptr_t) gomp_nonaggregate (expr), \ ++ gomp_nonaggregate (expr)) ++ ++ if (sizeof (gomp_integral (handle)) == sizeof (unsigned long)) ++ sprintf (buf, "0x%lx", (unsigned long) gomp_integral (handle)); ++#if defined (HAVE_INTTYPES_H) && defined (PRIx64) ++ else if (sizeof (gomp_integral (handle)) == sizeof (uint64_t)) ++ sprintf (buf, "0x%" PRIx64, (uint64_t) gomp_integral (handle)); ++#else ++ else if (sizeof (gomp_integral (handle)) ++ == sizeof (unsigned long long)) ++ sprintf (buf, "0x%llx", ++ (unsigned long long) gomp_integral (handle)); ++#endif ++ else ++ sprintf (buf, "0x%x", (unsigned int) gomp_integral (handle)); ++ gomp_display_num (buffer, size, &ret, zero, right, sz, buf); ++ break; ++ } ++#else ++ val = 0; ++ goto do_int; ++#endif ++ case 'A': ++ if (sz == (size_t) -1) ++ gomp_display_affinity_place (buffer, size, &ret, ++ place - 1); ++ else if (right) ++ { ++ size_t len = 0; ++ gomp_display_affinity_place (NULL, 0, &len, place - 1); ++ if (len < sz) ++ gomp_display_repeat (buffer, size, &ret, ' ', sz - len); ++ gomp_display_affinity_place (buffer, size, &ret, place - 1); ++ } ++ else ++ { ++ size_t start = ret; ++ gomp_display_affinity_place (buffer, size, &ret, place - 1); ++ if (ret - start < sz) ++ gomp_display_repeat (buffer, size, &ret, ' ', sz - (ret - start)); ++ } ++ break; ++ do_int: ++ gomp_display_int (buffer, size, &ret, zero, right, sz, val); ++ break; ++ default: ++ gomp_fatal ("unsupported type %c in affinity format", c); ++ } ++ format = p + 1; ++ } ++ while (1); ++ return ret; ++} ++ ++size_t ++omp_capture_affinity (char *buffer, size_t size, const char *format) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ size_t ret ++ = gomp_display_affinity (buffer, size, ++ format && *format ++ ? 
format : gomp_affinity_format_var, ++ gomp_thread_self (), &thr->ts, thr->place); ++ if (size) ++ { ++ if (ret >= size) ++ buffer[size - 1] = '\0'; ++ else ++ buffer[ret] = '\0'; ++ } ++ return ret; ++} ++ialias (omp_capture_affinity) ++ ++void ++omp_display_affinity (const char *format) ++{ ++ char buf[512]; ++ char *b; ++ size_t ret = ialias_call (omp_capture_affinity) (buf, sizeof buf, format); ++ if (ret < sizeof buf) ++ { ++ buf[ret] = '\n'; ++ gomp_print_string (buf, ret + 1); ++ return; ++ } ++ b = gomp_malloc (ret + 1); ++ ialias_call (omp_capture_affinity) (b, ret + 1, format); ++ b[ret] = '\n'; ++ gomp_print_string (b, ret + 1); ++ free (b); ++} ++ ++void ++gomp_display_affinity_thread (gomp_thread_handle handle, ++ struct gomp_team_state *ts, unsigned int place) ++{ ++ char buf[512]; ++ char *b; ++ size_t ret = gomp_display_affinity (buf, sizeof buf, gomp_affinity_format_var, ++ handle, ts, place); ++ if (ret < sizeof buf) ++ { ++ buf[ret] = '\n'; ++ gomp_print_string (buf, ret + 1); ++ return; ++ } ++ b = gomp_malloc (ret + 1); ++ gomp_display_affinity (b, ret + 1, gomp_affinity_format_var, ++ handle, ts, place); ++ b[ret] = '\n'; ++ gomp_print_string (b, ret + 1); ++ free (b); ++} +--- libgomp/single.c.jj 2018-04-25 09:40:31.870655561 +0200 ++++ libgomp/single.c 2019-05-07 18:46:36.536109576 +0200 +@@ -47,7 +47,7 @@ GOMP_single_start (void) + return __sync_bool_compare_and_swap (&team->single_count, single_count, + single_count + 1L); + #else +- bool ret = gomp_work_share_start (false); ++ bool ret = gomp_work_share_start (0); + if (ret) + gomp_work_share_init_done (); + gomp_work_share_end_nowait (); +@@ -68,7 +68,7 @@ GOMP_single_copy_start (void) + bool first; + void *ret; + +- first = gomp_work_share_start (false); ++ first = gomp_work_share_start (0); + + if (first) + { +--- libgomp/oacc-cuda.c.jj 2018-04-25 09:40:31.321655307 +0200 ++++ libgomp/oacc-cuda.c 2019-05-07 18:46:36.528109704 +0200 +@@ -58,7 +58,7 @@ acc_get_cuda_stream (int async) + { + struct goacc_thread *thr = goacc_thread (); + +- if (async < 0) ++ if (!async_valid_p (async)) + return NULL; + + if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func) +@@ -72,7 +72,7 @@ acc_set_cuda_stream (int async, void *st + { + struct goacc_thread *thr; + +- if (async < 0 || stream == NULL) ++ if (!async_valid_p (async) || stream == NULL) + return 0; + + goacc_lazy_initialize (); +--- libgomp/work.c.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/work.c 2019-05-07 18:46:36.548109384 +0200 +@@ -76,7 +76,15 @@ alloc_work_share (struct gomp_team *team + #endif + + team->work_share_chunk *= 2; ++ /* Allocating gomp_work_share structures aligned is just an ++ optimization, don't do it when using the fallback method. */ ++#ifdef GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC ++ ws = gomp_aligned_alloc (__alignof (struct gomp_work_share), ++ team->work_share_chunk ++ * sizeof (struct gomp_work_share)); ++#else + ws = gomp_malloc (team->work_share_chunk * sizeof (struct gomp_work_share)); ++#endif + ws->next_alloc = team->work_shares[0].next_alloc; + team->work_shares[0].next_alloc = ws; + team->work_share_list_alloc = &ws[1]; +@@ -90,30 +98,35 @@ alloc_work_share (struct gomp_team *team + This shouldn't touch the next_alloc field. 
*/ + + void +-gomp_init_work_share (struct gomp_work_share *ws, bool ordered, ++gomp_init_work_share (struct gomp_work_share *ws, size_t ordered, + unsigned nthreads) + { + gomp_mutex_init (&ws->lock); + if (__builtin_expect (ordered, 0)) + { +-#define INLINE_ORDERED_TEAM_IDS_CNT \ +- ((sizeof (struct gomp_work_share) \ +- - offsetof (struct gomp_work_share, inline_ordered_team_ids)) \ +- / sizeof (((struct gomp_work_share *) 0)->inline_ordered_team_ids[0])) +- +- if (nthreads > INLINE_ORDERED_TEAM_IDS_CNT) +- ws->ordered_team_ids +- = gomp_malloc (nthreads * sizeof (*ws->ordered_team_ids)); ++#define INLINE_ORDERED_TEAM_IDS_SIZE \ ++ (sizeof (struct gomp_work_share) \ ++ - offsetof (struct gomp_work_share, inline_ordered_team_ids)) ++ ++ if (__builtin_expect (ordered != 1, 0)) ++ { ++ ordered += nthreads * sizeof (*ws->ordered_team_ids) - 1; ++ ordered = ordered + __alignof__ (long long) - 1; ++ ordered &= ~(__alignof__ (long long) - 1); ++ } ++ else ++ ordered = nthreads * sizeof (*ws->ordered_team_ids); ++ if (ordered > INLINE_ORDERED_TEAM_IDS_SIZE) ++ ws->ordered_team_ids = gomp_malloc (ordered); + else + ws->ordered_team_ids = ws->inline_ordered_team_ids; +- memset (ws->ordered_team_ids, '\0', +- nthreads * sizeof (*ws->ordered_team_ids)); ++ memset (ws->ordered_team_ids, '\0', ordered); + ws->ordered_num_used = 0; + ws->ordered_owner = -1; + ws->ordered_cur = 0; + } + else +- ws->ordered_team_ids = NULL; ++ ws->ordered_team_ids = ws->inline_ordered_team_ids; + gomp_ptrlock_init (&ws->next_ws, NULL); + ws->threads_completed = 0; + } +@@ -166,7 +179,7 @@ free_work_share (struct gomp_team *team, + if this was the first thread to reach this point. */ + + bool +-gomp_work_share_start (bool ordered) ++gomp_work_share_start (size_t ordered) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; +@@ -178,7 +191,7 @@ gomp_work_share_start (bool ordered) + ws = gomp_malloc (sizeof (*ws)); + gomp_init_work_share (ws, ordered, 1); + thr->ts.work_share = ws; +- return ws; ++ return true; + } + + ws = thr->ts.work_share; +--- include/gomp-constants.h.jj 2018-04-25 09:40:39.757659209 +0200 ++++ include/gomp-constants.h 2019-05-07 18:57:33.333627031 +0200 +@@ -189,6 +189,7 @@ enum gomp_map_kind + #define GOMP_TASK_FLAG_GRAINSIZE (1 << 9) + #define GOMP_TASK_FLAG_IF (1 << 10) + #define GOMP_TASK_FLAG_NOGROUP (1 << 11) ++#define GOMP_TASK_FLAG_REDUCTION (1 << 12) + + /* GOMP_target{_ext,update_ext,enter_exit_data} flags argument. */ + #define GOMP_TARGET_FLAG_NOWAIT (1 << 0) +@@ -196,6 +197,18 @@ enum gomp_map_kind + /* Internal to libgomp. */ + #define GOMP_TARGET_FLAG_UPDATE (1U << 31) + ++ ++/* OpenACC construct flags. */ ++ ++/* Force host fallback execution. */ ++#define GOACC_FLAG_HOST_FALLBACK (1 << 0) ++ ++/* For legacy reasons, in the ABI, the GOACC_FLAGs are encoded as an inverted ++ bitmask. */ ++#define GOACC_FLAGS_MARSHAL_OP BIT_NOT_EXPR ++#define GOACC_FLAGS_UNMARSHAL(X) (~(X)) ++ ++ + /* Versions of libgomp and device-specific plugins. GOMP_VERSION + should be incremented whenever an ABI-incompatible change is introduced + to the plugin interface defined in libgomp/libgomp.h. */ +@@ -251,6 +264,12 @@ enum gomp_map_kind + at most and shifted by this many bits. */ + #define GOMP_TARGET_ARG_VALUE_SHIFT 16 + ++/* Dependence types in omp_depend_t objects. */ ++#define GOMP_DEPEND_IN 1 ++#define GOMP_DEPEND_OUT 2 ++#define GOMP_DEPEND_INOUT 3 ++#define GOMP_DEPEND_MUTEXINOUTSET 4 ++ + /* HSA specific data structures. 
*/
+ 
+ /* Identifiers of device-specific target arguments.  */
diff --git a/SOURCES/gcc8-libgomp-testsuite.patch b/SOURCES/gcc8-libgomp-testsuite.patch
new file mode 100644
index 0000000..502ee22
--- /dev/null
+++ b/SOURCES/gcc8-libgomp-testsuite.patch
@@ -0,0 +1,41 @@
+--- libgomp/testsuite/libgomp-test-support.exp.in.jj	2018-04-25 09:40:31.323655308 +0200
++++ libgomp/testsuite/libgomp-test-support.exp.in	2019-04-25 20:01:50.028243827 +0200
+@@ -2,4 +2,5 @@ set cuda_driver_include "@CUDA_DRIVER_IN
+ set cuda_driver_lib "@CUDA_DRIVER_LIB@"
+ set hsa_runtime_lib "@HSA_RUNTIME_LIB@"
+ 
++set offload_plugins "@offload_plugins@"
+ set offload_targets "@offload_targets@"
+--- libgomp/testsuite/lib/libgomp.exp.jj	2018-04-25 09:40:31.584655429 +0200
++++ libgomp/testsuite/lib/libgomp.exp	2019-05-24 11:41:51.015822702 +0200
+@@ -40,7 +40,7 @@ load_file libgomp-test-support.exp
+ # Populate offload_targets_s (offloading targets separated by a space), and
+ # offload_targets_s_openacc (the same, but with OpenACC names; OpenACC spells
+ # some of them a little differently).
+-set offload_targets_s [split $offload_targets ","]
++set offload_targets_s [split $offload_plugins ","]
+ set offload_targets_s_openacc {}
+ foreach offload_target_openacc $offload_targets_s {
+     # Translate to OpenACC names, or skip if not yet supported.
+@@ -137,8 +137,8 @@ proc libgomp_init { args } {
+ 
+     # Add liboffloadmic build directory in LD_LIBRARY_PATH to support
+     # non-fallback testing for Intel MIC targets
+-    global offload_targets
+-    if { [string match "*,intelmic,*" ",$offload_targets,"] } {
++    global offload_plugins
++    if { [string match "*,intelmic,*" ",$offload_plugins,"] } {
+ 	append always_ld_library_path ":${blddir}/../liboffloadmic/.libs"
+ 	append always_ld_library_path ":${blddir}/../liboffloadmic/plugin/.libs"
+ 	# libstdc++ is required by liboffloadmic
+@@ -362,8 +362,8 @@ proc check_effective_target_offload_devi
+ # Return 1 if configured for nvptx offloading.
+ 
+ proc check_effective_target_openacc_nvidia_accel_configured { } {
+-    global offload_targets
+-    if { ![string match "*,nvptx,*" ",$offload_targets,"] } {
++    global offload_plugins
++    if { ![string match "*,nvptx,*" ",$offload_plugins,"] } {
+ 	return 0
+     }
+     # PR libgomp/65099: Currently, we only support offloading in 64-bit
diff --git a/SOURCES/gcc8-pr60790.patch b/SOURCES/gcc8-pr60790.patch
deleted file mode 100644
index 810919f..0000000
--- a/SOURCES/gcc8-pr60790.patch
+++ /dev/null
@@ -1,84 +0,0 @@
-	PR libgcc/60790
-	x86: Do not assume ELF constructors run before IFUNC resolvers.
-	* config/x86/host-config.h (libat_feat1_ecx, libat_feat1_edx):
-	Remove declarations.
-	(__libat_feat1, __libat_feat1_init): Declare.
-	(FEAT1_REGISTER): Define.
-	(load_feat1): New function.
-	(IFUNC_COND_1): Adjust.
-	* config/x86/init.c (libat_feat1_ecx, libat_feat1_edx)
-	(init_cpuid): Remove definitions.
-	(__libat_feat1): New variable.
-	(__libat_feat1_init): New function.
-
---- libatomic/config/x86/host-config.h	(revision 264990)
-+++ libatomic/config/x86/host-config.h	(working copy)
-@@ -25,13 +25,39 @@
- #if HAVE_IFUNC
- #include <cpuid.h>
- 
--extern unsigned int libat_feat1_ecx HIDDEN;
--extern unsigned int libat_feat1_edx HIDDEN;
-+#ifdef __x86_64__
-+# define FEAT1_REGISTER ecx
-+#else
-+# define FEAT1_REGISTER edx
-+#endif
- 
-+/* Value of the CPUID feature register FEAT1_REGISTER for the cmpxchg
-+   bit for IFUNC_COND1 below.  */
-+extern unsigned int __libat_feat1 HIDDEN;
-+
-+/* Initialize libat_feat1 and return its value.
*/ -+unsigned int __libat_feat1_init (void) HIDDEN; -+ -+/* Return the value of the relevant feature register for the relevant -+ cmpxchg bit, or 0 if there is no CPUID support. */ -+static inline unsigned int -+__attribute__ ((const)) -+load_feat1 (void) -+{ -+ /* See the store in __libat_feat1_init. */ -+ unsigned int feat1 = __atomic_load_n (&__libat_feat1, __ATOMIC_RELAXED); -+ if (feat1 == 0) -+ /* Assume that initialization has not happened yet. This may get -+ called repeatedly if the CPU does not have any feature bits at -+ all. */ -+ feat1 = __libat_feat1_init (); -+ return feat1; -+} -+ - #ifdef __x86_64__ --# define IFUNC_COND_1 (libat_feat1_ecx & bit_CMPXCHG16B) -+# define IFUNC_COND_1 (load_feat1 () & bit_CMPXCHG16B) - #else --# define IFUNC_COND_1 (libat_feat1_edx & bit_CMPXCHG8B) -+# define IFUNC_COND_1 (load_feat1 () & bit_CMPXCHG8B) - #endif - - #ifdef __x86_64__ ---- libatomic/config/x86/init.c (revision 264990) -+++ libatomic/config/x86/init.c (working copy) -@@ -26,13 +26,17 @@ - - #if HAVE_IFUNC - --unsigned int libat_feat1_ecx, libat_feat1_edx; -+unsigned int __libat_feat1; - --static void __attribute__((constructor)) --init_cpuid (void) -+unsigned int -+__libat_feat1_init (void) - { -- unsigned int eax, ebx; -- __get_cpuid (1, &eax, &ebx, &libat_feat1_ecx, &libat_feat1_edx); -+ unsigned int eax, ebx, ecx, edx; -+ FEAT1_REGISTER = 0; -+ __get_cpuid (1, &eax, &ebx, &ecx, &edx); -+ /* See the load in load_feat1. */ -+ __atomic_store_n (&__libat_feat1, FEAT1_REGISTER, __ATOMIC_RELAXED); -+ return FEAT1_REGISTER; - } - - #endif /* HAVE_IFUNC */ diff --git a/SOURCES/gcc8-pr85400.patch b/SOURCES/gcc8-pr85400.patch new file mode 100644 index 0000000..0c0d887 --- /dev/null +++ b/SOURCES/gcc8-pr85400.patch @@ -0,0 +1,94 @@ +2018-05-10 Eric Botcazou + + PR c++/85400 + * c-attribs.c (handle_visibility_attribute): Do not set no_add_attrs. + + * decl2.c (adjust_var_decl_tls_model): New static function. + (comdat_linkage): Call it on a variable. + (maybe_make_one_only): Likewise. + +--- gcc/c-family/c-attribs.c ++++ gcc/c-family/c-attribs.c +@@ -2299,14 +2299,13 @@ handle_visibility_attribute (tree *node, tree name, tree args, + + static tree + handle_tls_model_attribute (tree *node, tree name, tree args, +- int ARG_UNUSED (flags), bool *no_add_attrs) ++ int ARG_UNUSED (flags), ++ bool *ARG_UNUSED (no_add_attrs)) + { + tree id; + tree decl = *node; + enum tls_model kind; + +- *no_add_attrs = true; +- + if (!VAR_P (decl) || !DECL_THREAD_LOCAL_P (decl)) + { + warning (OPT_Wattributes, "%qE attribute ignored", name); +--- gcc/cp/decl2.c ++++ gcc/cp/decl2.c +@@ -1838,6 +1838,17 @@ mark_vtable_entries (tree decl) + } + } + ++/* Adjust the TLS model on variable DECL if need be, typically after ++ the linkage of DECL has been modified. */ ++ ++static void ++adjust_var_decl_tls_model (tree decl) ++{ ++ if (CP_DECL_THREAD_LOCAL_P (decl) ++ && !lookup_attribute ("tls_model", DECL_ATTRIBUTES (decl))) ++ set_decl_tls_model (decl, decl_default_tls_model (decl)); ++} ++ + /* Set DECL up to have the closest approximation of "initialized common" + linkage available. */ + +@@ -1888,6 +1899,9 @@ comdat_linkage (tree decl) + + if (TREE_PUBLIC (decl)) + DECL_COMDAT (decl) = 1; ++ ++ if (VAR_P (decl)) ++ adjust_var_decl_tls_model (decl); + } + + /* For win32 we also want to put explicit instantiations in +@@ -1926,6 +1940,8 @@ maybe_make_one_only (tree decl) + /* Mark it needed so we don't forget to emit it. 
*/
+	  node->forced_by_abi = true;
+	  TREE_USED (decl) = 1;
++
++	  adjust_var_decl_tls_model (decl);
+	}
+     }
+ }
+--- /dev/null
++++ gcc/testsuite/g++.dg/tls/pr85400.C
+@@ -0,0 +1,24 @@
++// PR c++/85400
++// Testcase by Brian Vandenberg
++
++// { dg-do link { target c++11 } }
++// { dg-require-effective-target fpic }
++// { dg-require-effective-target shared }
++// { dg-require-effective-target tls }
++// { dg-options "-shared -fPIC -O" }
++// { dg-add-options tls }
++
++struct Test
++{
++  int blah (int y)
++  {
++    thread_local int mything = 3;
++    mything = y > 0 ? y : mything;
++    return mything;
++  }
++};
++
++int stuff (Test& test, int y)
++{
++  return test.blah(y);
++}
diff --git a/SOURCES/gcc8-pr86098.patch b/SOURCES/gcc8-pr86098.patch
new file mode 100644
index 0000000..5f5a651
--- /dev/null
+++ b/SOURCES/gcc8-pr86098.patch
@@ -0,0 +1,39 @@
+2018-06-12  Jason Merrill  <jason@redhat.com>
+
+	PR c++/86098 - ICE with template placeholder for TTP.
+	* typeck.c (structural_comptypes) [TEMPLATE_TYPE_PARM]: Check
+	CLASS_PLACEHOLDER_TEMPLATE.
+
+--- gcc/cp/typeck.c
++++ gcc/cp/typeck.c
+@@ -1375,6 +1375,11 @@ structural_comptypes (tree t1, tree t2, int strict)
+ 	 template parameters set, they can't be equal.  */
+       if (!comp_template_parms_position (t1, t2))
+ 	return false;
++      /* If T1 and T2 don't represent the same class template deduction,
++	 they aren't equal.  */
++      if (CLASS_PLACEHOLDER_TEMPLATE (t1)
++	  != CLASS_PLACEHOLDER_TEMPLATE (t2))
++	return false;
+       /* Constrained 'auto's are distinct from parms that don't have the same
+ 	 constraints.  */
+       if (!equivalent_placeholder_constraints (t1, t2))
+--- /dev/null
++++ gcc/testsuite/g++.dg/cpp1z/class-deduction58.C
+@@ -0,0 +1,16 @@
++// PR c++/86098
++// { dg-additional-options -std=c++17 }
++
++template <class T> class future;
++template <class T> T&& declval();
++
++template
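
The libgomp hunks above backport the OpenMP 5.0 affinity-display API: omp_set_affinity_format, omp_get_affinity_format, omp_display_affinity and omp_capture_affinity, together with the format specifiers parsed in gomp_display_affinity (%t/%T/%L/%n/%N/%a/%H/%P/%i/%A, plus %0.N zero-padding, %.N right-justification, and %{long_name} spellings). A minimal sketch of how user code would exercise this API once the patch is applied; the file name and compile command are illustrative, not part of the patch set:

  /* demo-affinity.c — build with: gcc -fopenmp demo-affinity.c */
  #include <omp.h>
  #include <stdio.h>

  int main (void)
  {
    /* %H = host, %P = pid, %0.4n = thread num zero-padded to width 4,
       %N = team size, %A = the place the thread is bound to.  */
    omp_set_affinity_format ("host=%H pid=%P thread %0.4n of %N on %A");

  #pragma omp parallel
    {
      char buf[512];

      /* Prints one line per thread to stderr (via gomp_print_string);
	 a NULL format falls back to the format set above.  */
      omp_display_affinity (NULL);

      /* Same text captured into a buffer; the return value is the
	 untruncated length, snprintf-style.  */
      size_t len = omp_capture_affinity (buf, sizeof buf, NULL);
      if (len >= sizeof buf)
	fprintf (stderr, "output truncated (%zu bytes needed)\n", len);
    }
    return 0;
  }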
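
gcc8-pr60790.patch is dropped above, presumably because the gcc-8.3.1-20190507 tarball this update rebases onto already carries the upstream fix. The pattern that patch introduced is worth noting: an x86 IFUNC resolver may run before ELF constructors, so libatomic's CPUID feature word has to be computed lazily instead of in a constructor. A condensed sketch of the pattern, with identifiers simplified from the deleted patch (the real code selects ecx or edx via FEAT1_REGISTER depending on the target; this sketch hard-codes ecx as on x86_64):

  #include <cpuid.h>

  static unsigned int feat1;	/* 0 until the first CPUID probe.  */

  static unsigned int
  feat1_init (void)
  {
    unsigned int eax, ebx, ecx = 0, edx = 0;
    /* ecx stays 0 if CPUID leaf 1 is unavailable.  */
    __get_cpuid (1, &eax, &ebx, &ecx, &edx);
    /* Relaxed ordering suffices: racing threads simply re-probe CPUID
       and store the same value.  */
    __atomic_store_n (&feat1, ecx, __ATOMIC_RELAXED);
    return ecx;
  }

  /* Safe to call from an IFUNC resolver, before constructors run.  */
  static inline unsigned int
  load_feat1 (void)
  {
    unsigned int f = __atomic_load_n (&feat1, __ATOMIC_RELAXED);
    return f != 0 ? f : feat1_init ();
  }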