gcc/SOURCES/gcc8-libgomp-20190503.patch

--- libgomp/loop.c.jj	2018-04-25 09:40:31.870655561 +0200
+++ libgomp/loop.c	2019-05-07 18:46:36.526109736 +0200
@@ -27,9 +27,13 @@

 #include <limits.h>
 #include <stdlib.h>
+#include <string.h>
 #include "libgomp.h"


+ialias (GOMP_loop_runtime_next)
+ialias_redirect (GOMP_taskgroup_reduction_register)
+
 /* Initialize the given work share construct from the given arguments.  */

 static inline void
@@ -79,12 +83,12 @@ gomp_loop_init (struct gomp_work_share *
 }

 /* The *_start routines are called when first encountering a loop construct
-   that is not bound directly to a parallel construct.  The first thread
+   that is not bound directly to a parallel construct.  The first thread
    that arrives will create the work-share construct; subsequent threads
    will see the construct exists and allocate work from it.

    START, END, INCR are the bounds of the loop; due to the restrictions of
-   OpenMP, these values must be the same in every thread.  This is not
+   OpenMP, these values must be the same in every thread.  This is not
    verified (nor is it entirely verifiable, since START is not necessarily
    retained intact in the work-share data structure).  CHUNK_SIZE is the
    scheduling parameter; again this must be identical in all threads.
@@ -101,7 +105,7 @@ gomp_loop_static_start (long start, long
   struct gomp_thread *thr = gomp_thread ();

   thr->ts.static_trip = 0;
-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
 		      GFS_STATIC, chunk_size);
@@ -123,7 +127,7 @@ gomp_loop_dynamic_start (long start, lon
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
 		      GFS_DYNAMIC, chunk_size);
@@ -151,7 +155,7 @@ gomp_loop_guided_start (long start, long
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
 		      GFS_GUIDED, chunk_size);
@@ -174,7 +178,7 @@ GOMP_loop_runtime_start (long start, lon
 			 long *istart, long *iend)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
-  switch (icv->run_sched_var)
+  switch (icv->run_sched_var & ~GFS_MONOTONIC)
     {
     case GFS_STATIC:
       return gomp_loop_static_start (start, end, incr,
@@ -197,6 +201,100 @@ GOMP_loop_runtime_start (long start, lon
     }
 }

+static long
+gomp_adjust_sched (long sched, long *chunk_size)
+{
+  sched &= ~GFS_MONOTONIC;
+  switch (sched)
+    {
+    case GFS_STATIC:
+    case GFS_DYNAMIC:
+    case GFS_GUIDED:
+      return sched;
+    /* GFS_RUNTIME is used for runtime schedule without monotonic
+       or nonmonotonic modifiers on the clause.
+       GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic
+       modifier.  */
+    case GFS_RUNTIME:
+    /* GFS_AUTO is used for runtime schedule with nonmonotonic
+       modifier.  */
+    case GFS_AUTO:
+      {
+	struct gomp_task_icv *icv = gomp_icv (false);
+	sched = icv->run_sched_var & ~GFS_MONOTONIC;
+	switch (sched)
+	  {
+	  case GFS_STATIC:
+	  case GFS_DYNAMIC:
+	  case GFS_GUIDED:
+	    *chunk_size = icv->run_sched_chunk_size;
+	    break;
+	  case GFS_AUTO:
+	    sched = GFS_STATIC;
+	    *chunk_size = 0;
+	    break;
+	  default:
+	    abort ();
+	  }
+	return sched;
+      }
+    default:
+      abort ();
+    }
+}
+
+bool
+GOMP_loop_start (long start, long end, long incr, long sched,
+		 long chunk_size, long *istart, long *iend,
+		 uintptr_t *reductions, void **mem)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  thr->ts.static_trip = 0;
+  if (reductions)
+    gomp_workshare_taskgroup_start ();
+  if (gomp_work_share_start (0))
+    {
+      sched = gomp_adjust_sched (sched, &chunk_size);
+      gomp_loop_init (thr->ts.work_share, start, end, incr,
+		      sched, chunk_size);
+      if (reductions)
+	{
+	  GOMP_taskgroup_reduction_register (reductions);
+	  thr->task->taskgroup->workshare = true;
+	  thr->ts.work_share->task_reductions = reductions;
+	}
+      if (mem)
+	{
+	  uintptr_t size = (uintptr_t) *mem;
+	  if (size > (sizeof (struct gomp_work_share)
+		      - offsetof (struct gomp_work_share,
+				  inline_ordered_team_ids)))
+	    thr->ts.work_share->ordered_team_ids
+	      = gomp_malloc_cleared (size);
+	  else
+	    memset (thr->ts.work_share->ordered_team_ids, '\0', size);
+	  *mem = (void *) thr->ts.work_share->ordered_team_ids;
+	}
+      gomp_work_share_init_done ();
+    }
+  else
+    {
+      if (reductions)
+	{
+	  uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
+	  gomp_workshare_task_reduction_register (reductions,
+						  first_reductions);
+	}
+      if (mem)
+	*mem = (void *) thr->ts.work_share->ordered_team_ids;
+    }
+
+  if (!istart)
+    return true;
+  return ialias_call (GOMP_loop_runtime_next) (istart, iend);
+}
+
 /* The *_ordered_*_start routines are similar.  The only difference is that
    this work-share construct is initialized to expect an ORDERED section.  */

@@ -207,7 +305,7 @@ gomp_loop_ordered_static_start (long sta
   struct gomp_thread *thr = gomp_thread ();

   thr->ts.static_trip = 0;
-  if (gomp_work_share_start (true))
+  if (gomp_work_share_start (1))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
 		      GFS_STATIC, chunk_size);
@@ -225,7 +323,7 @@ gomp_loop_ordered_dynamic_start (long st
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (true))
+  if (gomp_work_share_start (1))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
 		      GFS_DYNAMIC, chunk_size);
@@ -250,7 +348,7 @@ gomp_loop_ordered_guided_start (long sta
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (true))
+  if (gomp_work_share_start (1))
     {
       gomp_loop_init (thr->ts.work_share, start, end, incr,
 		      GFS_GUIDED, chunk_size);
@@ -273,7 +371,7 @@ GOMP_loop_ordered_runtime_start (long st
 				 long *istart, long *iend)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
-  switch (icv->run_sched_var)
+  switch (icv->run_sched_var & ~GFS_MONOTONIC)
     {
     case GFS_STATIC:
       return gomp_loop_ordered_static_start (start, end, incr,
@@ -297,6 +395,81 @@ GOMP_loop_ordered_runtime_start (long st
     }
 }

+bool
+GOMP_loop_ordered_start (long start, long end, long incr, long sched,
+			 long chunk_size, long *istart, long *iend,
+			 uintptr_t *reductions, void **mem)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  size_t ordered = 1;
+  bool ret;
+
+  thr->ts.static_trip = 0;
+  if (reductions)
+    gomp_workshare_taskgroup_start ();
+  if (mem)
+    ordered += (uintptr_t) *mem;
+  if (gomp_work_share_start (ordered))
+    {
+      sched = gomp_adjust_sched (sched, &chunk_size);
+      gomp_loop_init (thr->ts.work_share, start, end, incr,
+		      sched, chunk_size);
+      if (reductions)
+	{
+	  GOMP_taskgroup_reduction_register (reductions);
+	  thr->task->taskgroup->workshare = true;
+	  thr->ts.work_share->task_reductions = reductions;
+	}
+      if (sched == GFS_STATIC)
+	gomp_ordered_static_init ();
+      else
+	gomp_mutex_lock (&thr->ts.work_share->lock);
+      gomp_work_share_init_done ();
+    }
+  else
+    {
+      if (reductions)
+	{
+	  uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
+	  gomp_workshare_task_reduction_register (reductions,
+						  first_reductions);
+	}
+      sched = thr->ts.work_share->sched;
+      if (sched != GFS_STATIC)
+	gomp_mutex_lock (&thr->ts.work_share->lock);
+    }
+
+  if (mem)
+    {
+      uintptr_t p
+	= (uintptr_t) (thr->ts.work_share->ordered_team_ids
+		       + (thr->ts.team ? thr->ts.team->nthreads : 1));
+      p += __alignof__ (long long) - 1;
+      p &= ~(__alignof__ (long long) - 1);
+      *mem = (void *) p;
+    }
+
+  switch (sched)
+    {
+    case GFS_STATIC:
+    case GFS_AUTO:
+      return !gomp_iter_static_next (istart, iend);
+    case GFS_DYNAMIC:
+      ret = gomp_iter_dynamic_next_locked (istart, iend);
+      break;
+    case GFS_GUIDED:
+      ret = gomp_iter_guided_next_locked (istart, iend);
+      break;
+    default:
+      abort ();
+    }
+
+  if (ret)
+    gomp_ordered_first ();
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+  return ret;
+}
+
 /* The *_doacross_*_start routines are similar.  The only difference is that
    this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
    section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1
@@ -310,11 +483,11 @@ gomp_loop_doacross_static_start (unsigne
   struct gomp_thread *thr = gomp_thread ();

   thr->ts.static_trip = 0;
-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
 		      GFS_STATIC, chunk_size);
-      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size, 0);
       gomp_work_share_init_done ();
     }

@@ -328,11 +501,11 @@ gomp_loop_doacross_dynamic_start (unsign
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
 		      GFS_DYNAMIC, chunk_size);
-      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size, 0);
       gomp_work_share_init_done ();
     }

@@ -354,11 +527,11 @@ gomp_loop_doacross_guided_start (unsigne
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
 		      GFS_GUIDED, chunk_size);
-      gomp_doacross_init (ncounts, counts, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size, 0);
       gomp_work_share_init_done ();
     }

@@ -378,7 +551,7 @@ GOMP_loop_doacross_runtime_start (unsign
 				  long *istart, long *iend)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
-  switch (icv->run_sched_var)
+  switch (icv->run_sched_var & ~GFS_MONOTONIC)
     {
     case GFS_STATIC:
       return gomp_loop_doacross_static_start (ncounts, counts,
@@ -402,8 +575,52 @@ GOMP_loop_doacross_runtime_start (unsign
     }
 }

-/* The *_next routines are called when the thread completes processing of
-   the iteration block currently assigned to it.  If the work-share
+bool
+GOMP_loop_doacross_start (unsigned ncounts, long *counts, long sched,
+			  long chunk_size, long *istart, long *iend,
+			  uintptr_t *reductions, void **mem)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  thr->ts.static_trip = 0;
+  if (reductions)
+    gomp_workshare_taskgroup_start ();
+  if (gomp_work_share_start (0))
+    {
+      size_t extra = 0;
+      if (mem)
+	extra = (uintptr_t) *mem;
+      sched = gomp_adjust_sched (sched, &chunk_size);
+      gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
+		      sched, chunk_size);
+      gomp_doacross_init (ncounts, counts, chunk_size, extra);
+      if (reductions)
+	{
+	  GOMP_taskgroup_reduction_register (reductions);
+	  thr->task->taskgroup->workshare = true;
+	  thr->ts.work_share->task_reductions = reductions;
+	}
+      gomp_work_share_init_done ();
+    }
+  else
+    {
+      if (reductions)
+	{
+	  uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
+	  gomp_workshare_task_reduction_register (reductions,
+						  first_reductions);
+	}
+      sched = thr->ts.work_share->sched;
+    }
+
+  if (mem)
+    *mem = thr->ts.work_share->doacross->extra;
+
+  return ialias_call (GOMP_loop_runtime_next) (istart, iend);
+}
+
+/* The *_next routines are called when the thread completes processing of
+   the iteration block currently assigned to it.  If the work-share
    construct is bound directly to a parallel construct, then the iteration
    bounds may have been set up before the parallel.  In which case, this
    may be the first iteration for the thread.
@@ -456,7 +673,7 @@ bool
 GOMP_loop_runtime_next (long *istart, long *iend)
 {
   struct gomp_thread *thr = gomp_thread ();
-
+
   switch (thr->ts.work_share->sched)
     {
     case GFS_STATIC:
@@ -534,7 +751,7 @@ bool
 GOMP_loop_ordered_runtime_next (long *istart, long *iend)
 {
   struct gomp_thread *thr = gomp_thread ();
-
+
   switch (thr->ts.work_share->sched)
     {
     case GFS_STATIC:
@@ -563,7 +780,7 @@ gomp_parallel_loop_start (void (*fn) (vo
   num_threads = gomp_resolve_num_threads (num_threads, 0);
   team = gomp_new_team (num_threads);
   gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size);
-  gomp_team_start (fn, data, num_threads, flags, team);
+  gomp_team_start (fn, data, num_threads, flags, team, NULL);
 }

 void
@@ -600,7 +817,8 @@ GOMP_parallel_loop_runtime_start (void (
 {
   struct gomp_task_icv *icv = gomp_icv (false);
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    icv->run_sched_var, icv->run_sched_chunk_size, 0);
+			    icv->run_sched_var & ~GFS_MONOTONIC,
+			    icv->run_sched_chunk_size, 0);
 }

 ialias_redirect (GOMP_parallel_end)
@@ -638,11 +856,28 @@ GOMP_parallel_loop_guided (void (*fn) (v
   GOMP_parallel_end ();
 }

+void
+GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
+			    unsigned num_threads, long start, long end,
+			    long incr, unsigned flags)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+			    icv->run_sched_var & ~GFS_MONOTONIC,
+			    icv->run_sched_chunk_size, flags);
+  fn (data);
+  GOMP_parallel_end ();
+}
+
 #ifdef HAVE_ATTRIBUTE_ALIAS
 extern __typeof(GOMP_parallel_loop_dynamic) GOMP_parallel_loop_nonmonotonic_dynamic
 	__attribute__((alias ("GOMP_parallel_loop_dynamic")));
 extern __typeof(GOMP_parallel_loop_guided) GOMP_parallel_loop_nonmonotonic_guided
 	__attribute__((alias ("GOMP_parallel_loop_guided")));
+extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_nonmonotonic_runtime
+	__attribute__((alias ("GOMP_parallel_loop_runtime")));
+extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_maybe_nonmonotonic_runtime
+	__attribute__((alias ("GOMP_parallel_loop_runtime")));
 #else
 void
 GOMP_parallel_loop_nonmonotonic_dynamic (void (*fn) (void *), void *data,
@@ -667,21 +902,35 @@ GOMP_parallel_loop_nonmonotonic_guided (
   fn (data);
   GOMP_parallel_end ();
 }
-#endif

 void
-GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
-			    unsigned num_threads, long start, long end,
-			    long incr, unsigned flags)
+GOMP_parallel_loop_nonmonotonic_runtime (void (*fn) (void *), void *data,
+					 unsigned num_threads, long start,
+					 long end, long incr, unsigned flags)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
   gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
-			    icv->run_sched_var, icv->run_sched_chunk_size,
-			    flags);
+			    icv->run_sched_var & ~GFS_MONOTONIC,
+			    icv->run_sched_chunk_size, flags);
   fn (data);
   GOMP_parallel_end ();
 }

+void
+GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*fn) (void *), void *data,
+					       unsigned num_threads, long start,
+					       long end, long incr,
+					       unsigned flags)
+{
+  struct gomp_task_icv *icv = gomp_icv (false);
+  gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
+			    icv->run_sched_var & ~GFS_MONOTONIC,
+			    icv->run_sched_chunk_size, flags);
+  fn (data);
+  GOMP_parallel_end ();
+}
+#endif
+
 /* The GOMP_loop_end* routines are called after the thread is told that
    all loop iterations are complete.  The first two versions synchronize
    all threads; the nowait version does not.  */
@@ -721,6 +970,10 @@ extern __typeof(gomp_loop_dynamic_start)
 	__attribute__((alias ("gomp_loop_dynamic_start")));
 extern __typeof(gomp_loop_guided_start) GOMP_loop_nonmonotonic_guided_start
 	__attribute__((alias ("gomp_loop_guided_start")));
+extern __typeof(GOMP_loop_runtime_start) GOMP_loop_nonmonotonic_runtime_start
+	__attribute__((alias ("GOMP_loop_runtime_start")));
+extern __typeof(GOMP_loop_runtime_start) GOMP_loop_maybe_nonmonotonic_runtime_start
+	__attribute__((alias ("GOMP_loop_runtime_start")));

 extern __typeof(gomp_loop_ordered_static_start) GOMP_loop_ordered_static_start
 	__attribute__((alias ("gomp_loop_ordered_static_start")));
@@ -746,6 +999,10 @@ extern __typeof(gomp_loop_dynamic_next)
 	__attribute__((alias ("gomp_loop_dynamic_next")));
 extern __typeof(gomp_loop_guided_next) GOMP_loop_nonmonotonic_guided_next
 	__attribute__((alias ("gomp_loop_guided_next")));
+extern __typeof(GOMP_loop_runtime_next) GOMP_loop_nonmonotonic_runtime_next
+	__attribute__((alias ("GOMP_loop_runtime_next")));
+extern __typeof(GOMP_loop_runtime_next) GOMP_loop_maybe_nonmonotonic_runtime_next
+	__attribute__((alias ("GOMP_loop_runtime_next")));

 extern __typeof(gomp_loop_ordered_static_next) GOMP_loop_ordered_static_next
 	__attribute__((alias ("gomp_loop_ordered_static_next")));
@@ -791,6 +1048,20 @@ GOMP_loop_nonmonotonic_guided_start (lon
 }

 bool
+GOMP_loop_nonmonotonic_runtime_start (long start, long end, long incr,
+				      long *istart, long *iend)
+{
+  return GOMP_loop_runtime_start (start, end, incr, istart, iend);
+}
+
+bool
+GOMP_loop_maybe_nonmonotonic_runtime_start (long start, long end, long incr,
+					    long *istart, long *iend)
+{
+  return GOMP_loop_runtime_start (start, end, incr, istart, iend);
+}
+
+bool
 GOMP_loop_ordered_static_start (long start, long end, long incr,
 				long chunk_size, long *istart, long *iend)
 {
@@ -869,6 +1140,18 @@ GOMP_loop_nonmonotonic_guided_next (long
 }

 bool
+GOMP_loop_nonmonotonic_runtime_next (long *istart, long *iend)
+{
+  return GOMP_loop_runtime_next (istart, iend);
+}
+
+bool
+GOMP_loop_maybe_nonmonotonic_runtime_next (long *istart, long *iend)
+{
+  return GOMP_loop_runtime_next (istart, iend);
+}
+
+bool
 GOMP_loop_ordered_static_next (long *istart, long *iend)
 {
   return gomp_loop_ordered_static_next (istart, iend);
--- libgomp/oacc-plugin.c.jj	2018-04-25 09:40:31.322655307 +0200
+++ libgomp/oacc-plugin.c	2019-05-07 18:46:36.531109656 +0200
@@ -49,3 +49,14 @@ GOMP_PLUGIN_acc_thread (void)
   struct goacc_thread *thr = goacc_thread ();
   return thr ? thr->target_tls : NULL;
 }
+
+int
+GOMP_PLUGIN_acc_default_dim (unsigned int i)
+{
+  if (i >= GOMP_DIM_MAX)
+    {
+      gomp_fatal ("invalid dimension argument: %d", i);
+      return -1;
+    }
+  return goacc_default_dims[i];
+}
--- libgomp/libgomp_g.h.jj	2018-04-25 09:40:31.320655306 +0200
+++ libgomp/libgomp_g.h	2019-05-07 18:46:36.513109943 +0200
@@ -1,4 +1,4 @@
-/* Copyright (C) 2005-2018 Free Software Foundation, Inc.
+/* Copyright (C) 2005-2019 Free Software Foundation, Inc.
    Contributed by Richard Henderson <rth@redhat.com>.

    This file is part of the GNU Offloading and Multi Processing Library
@@ -31,6 +31,7 @@

 #include <stdbool.h>
 #include <stddef.h>
+#include "gstdint.h"

 /* barrier.c */

@@ -56,6 +57,12 @@ extern bool GOMP_loop_nonmonotonic_dynam
 						  long *, long *);
 extern bool GOMP_loop_nonmonotonic_guided_start (long, long, long, long,
 						 long *, long *);
+extern bool GOMP_loop_nonmonotonic_runtime_start (long, long, long,
+						  long *, long *);
+extern bool GOMP_loop_maybe_nonmonotonic_runtime_start (long, long, long,
+							long *, long *);
+extern bool GOMP_loop_start (long, long, long, long, long, long *, long *,
+			     uintptr_t *, void **);

 extern bool GOMP_loop_ordered_static_start (long, long, long, long,
 					    long *, long *);
@@ -64,6 +71,8 @@ extern bool GOMP_loop_ordered_dynamic_st
 extern bool GOMP_loop_ordered_guided_start (long, long, long, long,
 					    long *, long *);
 extern bool GOMP_loop_ordered_runtime_start (long, long, long, long *, long *);
+extern bool GOMP_loop_ordered_start (long, long, long, long, long, long *,
+				     long *, uintptr_t *, void **);

 extern bool GOMP_loop_static_next (long *, long *);
 extern bool GOMP_loop_dynamic_next (long *, long *);
@@ -71,6 +80,8 @@ extern bool GOMP_loop_guided_next (long
 extern bool GOMP_loop_runtime_next (long *, long *);
 extern bool GOMP_loop_nonmonotonic_dynamic_next (long *, long *);
 extern bool GOMP_loop_nonmonotonic_guided_next (long *, long *);
+extern bool GOMP_loop_nonmonotonic_runtime_next (long *, long *);
+extern bool GOMP_loop_maybe_nonmonotonic_runtime_next (long *, long *);

 extern bool GOMP_loop_ordered_static_next (long *, long *);
 extern bool GOMP_loop_ordered_dynamic_next (long *, long *);
@@ -85,6 +96,8 @@ extern bool GOMP_loop_doacross_guided_st
 					     long *);
 extern bool GOMP_loop_doacross_runtime_start (unsigned, long *, long *,
 					      long *);
+extern bool GOMP_loop_doacross_start (unsigned, long *, long, long, long *,
+				      long *, uintptr_t *, void **);

 extern void GOMP_parallel_loop_static_start (void (*)(void *), void *,
 					     unsigned, long, long, long, long);
@@ -112,6 +125,13 @@ extern void GOMP_parallel_loop_nonmonoto
 extern void GOMP_parallel_loop_nonmonotonic_guided (void (*)(void *), void *,
 						    unsigned, long, long,
 						    long, long, unsigned);
+extern void GOMP_parallel_loop_nonmonotonic_runtime (void (*)(void *), void *,
+						     unsigned, long, long,
+						     long, unsigned);
+extern void GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*)(void *),
+							   void *, unsigned,
+							   long, long,
+							   long, unsigned);

 extern void GOMP_loop_end (void);
 extern void GOMP_loop_end_nowait (void);
@@ -154,6 +174,21 @@ extern bool GOMP_loop_ull_nonmonotonic_g
 						     unsigned long long,
 						     unsigned long long *,
 						     unsigned long long *);
+extern bool GOMP_loop_ull_nonmonotonic_runtime_start (bool, unsigned long long,
+						      unsigned long long,
+						      unsigned long long,
+						      unsigned long long *,
+						      unsigned long long *);
+extern bool GOMP_loop_ull_maybe_nonmonotonic_runtime_start (bool,
+							    unsigned long long,
+							    unsigned long long,
+							    unsigned long long,
+							    unsigned long long *,
+							    unsigned long long *);
+extern bool GOMP_loop_ull_start (bool, unsigned long long, unsigned long long,
+				 unsigned long long, long, unsigned long long,
+				 unsigned long long *, unsigned long long *,
+				 uintptr_t *, void **);

 extern bool GOMP_loop_ull_ordered_static_start (bool, unsigned long long,
 						unsigned long long,
@@ -178,6 +213,13 @@ extern bool GOMP_loop_ull_ordered_runtim
 						 unsigned long long,
 						 unsigned long long *,
 						 unsigned long long *);
+extern bool GOMP_loop_ull_ordered_start (bool, unsigned long long,
+					 unsigned long long,
+					 unsigned long long, long,
+					 unsigned long long,
+					 unsigned long long *,
+					 unsigned long long *,
+					 uintptr_t *, void **);

 extern bool GOMP_loop_ull_static_next (unsigned long long *,
 				       unsigned long long *);
@@ -191,6 +233,10 @@ extern bool GOMP_loop_ull_nonmonotonic_d
 						     unsigned long long *);
 extern bool GOMP_loop_ull_nonmonotonic_guided_next (unsigned long long *,
 						    unsigned long long *);
+extern bool GOMP_loop_ull_nonmonotonic_runtime_next (unsigned long long *,
+						     unsigned long long *);
+extern bool GOMP_loop_ull_maybe_nonmonotonic_runtime_next (unsigned long long *,
+							   unsigned long long *);

 extern bool GOMP_loop_ull_ordered_static_next (unsigned long long *,
 					       unsigned long long *);
@@ -220,6 +266,11 @@ extern bool GOMP_loop_ull_doacross_runti
 						  unsigned long long *,
 						  unsigned long long *,
 						  unsigned long long *);
+extern bool GOMP_loop_ull_doacross_start (unsigned, unsigned long long *,
+					  long, unsigned long long,
+					  unsigned long long *,
+					  unsigned long long *,
+					  uintptr_t *, void **);

 /* ordered.c */

@@ -235,6 +286,8 @@ extern void GOMP_doacross_ull_wait (unsi
 extern void GOMP_parallel_start (void (*) (void *), void *, unsigned);
 extern void GOMP_parallel_end (void);
 extern void GOMP_parallel (void (*) (void *), void *, unsigned, unsigned);
+extern unsigned GOMP_parallel_reductions (void (*) (void *), void *, unsigned,
+					  unsigned);
 extern bool GOMP_cancel (int, bool);
 extern bool GOMP_cancellation_point (int);

@@ -251,13 +304,19 @@ extern void GOMP_taskloop_ull (void (*)
 			       unsigned long long, unsigned long long,
 			       unsigned long long);
 extern void GOMP_taskwait (void);
+extern void GOMP_taskwait_depend (void **);
 extern void GOMP_taskyield (void);
 extern void GOMP_taskgroup_start (void);
 extern void GOMP_taskgroup_end (void);
+extern void GOMP_taskgroup_reduction_register (uintptr_t *);
+extern void GOMP_taskgroup_reduction_unregister (uintptr_t *);
+extern void GOMP_task_reduction_remap (size_t, size_t, void **);
+extern void GOMP_workshare_task_reduction_unregister (bool);

 /* sections.c */

 extern unsigned GOMP_sections_start (unsigned);
+extern unsigned GOMP_sections2_start (unsigned, uintptr_t *, void **);
 extern unsigned GOMP_sections_next (void);
 extern void GOMP_parallel_sections_start (void (*) (void *), void *,
 					  unsigned, unsigned);
@@ -293,6 +352,11 @@ extern void GOMP_target_enter_exit_data
 					 void **);
 extern void GOMP_teams (unsigned int, unsigned int);

+/* teams.c */
+
+extern void GOMP_teams_reg (void (*) (void *), void *, unsigned, unsigned,
+			    unsigned);
+
 /* oacc-parallel.c */

 extern void GOACC_parallel_keyed (int, void (*) (void *), size_t,
--- libgomp/affinity.c.jj	2018-04-25 09:40:31.913655581 +0200
+++ libgomp/affinity.c	2019-05-07 18:46:36.254114081 +0200
@@ -26,6 +26,8 @@
 /* This is a generic stub implementation of a CPU affinity setting.  */

 #include "libgomp.h"
+#include <string.h>
+#include <stdio.h>

 void
 gomp_init_affinity (void)
@@ -138,5 +140,17 @@ gomp_get_place_proc_ids_8 (int place_num
   (void) ids;
 }

+void
+gomp_display_affinity_place (char *buffer, size_t size, size_t *ret,
+			     int place)
+{
+  char buf[sizeof (long) * 3 + 4];
+  if (gomp_available_cpus > 1)
+    sprintf (buf, "0-%lu", gomp_available_cpus - 1);
+  else
+    strcpy (buf, "0");
+  gomp_display_string (buffer, size, ret, buf, strlen (buf));
+}
+
 ialias(omp_get_place_num_procs)
 ialias(omp_get_place_proc_ids)
--- libgomp/sections.c.jj	2018-04-25 09:40:31.924655586 +0200
+++ libgomp/sections.c	2019-05-07 18:46:36.535109592 +0200
@@ -26,8 +26,11 @@
 /* This file handles the SECTIONS construct.  */

 #include "libgomp.h"
+#include <string.h>


+ialias_redirect (GOMP_taskgroup_reduction_register)
+
 /* Initialize the given work share construct from the given arguments.  */

 static inline void
@@ -72,7 +75,7 @@ GOMP_sections_start (unsigned count)
   struct gomp_thread *thr = gomp_thread ();
   long s, e, ret;

-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_sections_init (thr->ts.work_share, count);
       gomp_work_share_init_done ();
@@ -95,6 +98,66 @@ GOMP_sections_start (unsigned count)
   return ret;
 }

+unsigned
+GOMP_sections2_start (unsigned count, uintptr_t *reductions, void **mem)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  long s, e, ret;
+
+  if (reductions)
+    gomp_workshare_taskgroup_start ();
+  if (gomp_work_share_start (0))
+    {
+      gomp_sections_init (thr->ts.work_share, count);
+      if (reductions)
+	{
+	  GOMP_taskgroup_reduction_register (reductions);
+	  thr->task->taskgroup->workshare = true;
+	  thr->ts.work_share->task_reductions = reductions;
+	}
+      if (mem)
+	{
+	  uintptr_t size = (uintptr_t) *mem;
+	  if (size > (sizeof (struct gomp_work_share)
+		      - offsetof (struct gomp_work_share,
+				  inline_ordered_team_ids)))
+	    thr->ts.work_share->ordered_team_ids
+	      = gomp_malloc_cleared (size);
+	  else
+	    memset (thr->ts.work_share->ordered_team_ids, '\0', size);
+	  *mem = (void *) thr->ts.work_share->ordered_team_ids;
+	}
+      gomp_work_share_init_done ();
+    }
+  else
+    {
+      if (reductions)
+	{
+	  uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
+	  gomp_workshare_task_reduction_register (reductions,
+						  first_reductions);
+	}
+      if (mem)
+	*mem = (void *) thr->ts.work_share->ordered_team_ids;
+    }
+
+#ifdef HAVE_SYNC_BUILTINS
+  if (gomp_iter_dynamic_next (&s, &e))
+    ret = s;
+  else
+    ret = 0;
+#else
+  gomp_mutex_lock (&thr->ts.work_share->lock);
+  if (gomp_iter_dynamic_next_locked (&s, &e))
+    ret = s;
+  else
+    ret = 0;
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+#endif
+
+  return ret;
+}
+
 /* This routine is called when the thread completes processing of the
    section currently assigned to it.  If the work-share construct is
    bound directly to a parallel construct, then the construct may have
@@ -140,7 +203,7 @@ GOMP_parallel_sections_start (void (*fn)
   num_threads = gomp_resolve_num_threads (num_threads, count);
   team = gomp_new_team (num_threads);
   gomp_sections_init (&team->work_shares[0], count);
-  gomp_team_start (fn, data, num_threads, 0, team);
+  gomp_team_start (fn, data, num_threads, 0, team, NULL);
 }

 ialias_redirect (GOMP_parallel_end)
@@ -154,7 +217,7 @@ GOMP_parallel_sections (void (*fn) (void
   num_threads = gomp_resolve_num_threads (num_threads, count);
   team = gomp_new_team (num_threads);
   gomp_sections_init (&team->work_shares[0], count);
-  gomp_team_start (fn, data, num_threads, flags, team);
+  gomp_team_start (fn, data, num_threads, flags, team, NULL);
   fn (data);
   GOMP_parallel_end ();
 }
--- libgomp/config/linux/affinity.c.jj	2018-04-25 09:40:31.875655563 +0200
+++ libgomp/config/linux/affinity.c	2019-05-07 18:46:36.344112642 +0200
@@ -396,6 +396,56 @@ gomp_get_place_proc_ids_8 (int place_num
       *ids++ = i;
 }

+void
+gomp_display_affinity_place (char *buffer, size_t size, size_t *ret,
+			     int place)
+{
+  cpu_set_t *cpusetp;
+  char buf[sizeof (long) * 3 + 4];
+  if (place >= 0 && place < gomp_places_list_len)
+    cpusetp = (cpu_set_t *) gomp_places_list[place];
+  else if (gomp_cpusetp)
+    cpusetp = gomp_cpusetp;
+  else
+    {
+      if (gomp_available_cpus > 1)
+	sprintf (buf, "0-%lu", gomp_available_cpus - 1);
+      else
+	strcpy (buf, "0");
+      gomp_display_string (buffer, size, ret, buf, strlen (buf));
+      return;
+    }
+
+  unsigned long i, max = 8 * gomp_cpuset_size, start;
+  bool prev_set = false;
+  start = max;
+  for (i = 0; i <= max; i++)
+    {
+      bool this_set;
+      if (i == max)
+	this_set = false;
+      else
+	this_set = CPU_ISSET_S (i, gomp_cpuset_size, cpusetp);
+      if (this_set != prev_set)
+	{
+	  prev_set = this_set;
+	  if (this_set)
+	    {
+	      char *p = buf;
+	      if (start != max)
+		*p++ = ',';
+	      sprintf (p, "%lu", i);
+	      start = i;
+	    }
+	  else if (i == start + 1)
+	    continue;
+	  else
+	    sprintf (buf, "-%lu", i - 1);
+	  gomp_display_string (buffer, size, ret, buf, strlen (buf));
+	}
+    }
+}
+
 ialias(omp_get_place_num_procs)
 ialias(omp_get_place_proc_ids)

--- libgomp/config/linux/ia64/futex.h.jj	2018-04-25 09:40:31.877655564 +0200
+++ libgomp/config/linux/ia64/futex.h	2019-05-07 18:46:36.344112642 +0200
@@ -45,8 +45,8 @@ sys_futex0(int *addr, int op, int val)
 	  "=r"(r8), "=r"(r10)
 	: "r"(r15), "r"(out0), "r"(out1), "r"(out2), "r"(out3)
 	: "memory", "out4", "out5", "out6", "out7",
-	  /* Non-stacked integer registers, minus r8, r10, r15.  */
-	  "r2", "r3", "r9", "r11", "r12", "r13", "r14", "r16", "r17", "r18",
+	  /* Non-stacked integer registers, minus r8, r10, r12, r15.  */
+	  "r2", "r3", "r9", "r11", "r13", "r14", "r16", "r17", "r18",
 	  "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27",
 	  "r28", "r29", "r30", "r31",
 	  /* Predicate registers.  */
--- libgomp/config/nvptx/teams.c.jj	2019-05-07 18:46:36.459110805 +0200
+++ libgomp/config/nvptx/teams.c	2019-05-07 18:46:36.459110805 +0200
@@ -0,0 +1,57 @@
+/* Copyright (C) 2015-2019 Free Software Foundation, Inc.
+   Contributed by Alexander Monakov <amonakov@ispras.ru>
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file defines OpenMP API entry points that accelerator targets are
+   expected to replace.  */
+
+#include "libgomp.h"
+
+void
+GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams,
+		unsigned int thread_limit, unsigned int flags)
+{
+  (void) fn;
+  (void) data;
+  (void) flags;
+  (void) num_teams;
+  (void) thread_limit;
+}
+
+int
+omp_get_num_teams (void)
+{
+  return gomp_num_teams_var + 1;
+}
+
+int
+omp_get_team_num (void)
+{
+  int ctaid;
+  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid));
+  return ctaid;
+}
+
+ialias (omp_get_num_teams)
+ialias (omp_get_team_num)
--- libgomp/config/nvptx/team.c.jj	2018-04-25 09:40:31.890655570 +0200
+++ libgomp/config/nvptx/team.c	2019-05-07 18:46:36.459110805 +0200
@@ -116,7 +116,8 @@ gomp_thread_start (struct gomp_thread_po

 void
 gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
-		 unsigned flags, struct gomp_team *team)
+		 unsigned flags, struct gomp_team *team,
+		 struct gomp_taskgroup *taskgroup)
 {
   struct gomp_thread *thr, *nthr;
   struct gomp_task *task;
@@ -147,6 +148,7 @@ gomp_team_start (void (*fn) (void *), vo
   nthreads_var = icv->nthreads_var;
   gomp_init_task (thr->task, task, icv);
   team->implicit_task[0].icv.nthreads_var = nthreads_var;
+  team->implicit_task[0].taskgroup = taskgroup;

   if (nthreads == 1)
     return;
@@ -166,6 +168,7 @@ gomp_team_start (void (*fn) (void *), vo
       nthr->task = &team->implicit_task[i];
       gomp_init_task (nthr->task, task, icv);
       team->implicit_task[i].icv.nthreads_var = nthreads_var;
+      team->implicit_task[i].taskgroup = taskgroup;
       nthr->fn = fn;
       nthr->data = data;
       team->ordered_release[i] = &nthr->release;
@@ -174,5 +177,11 @@ gomp_team_start (void (*fn) (void *), vo
   gomp_simple_barrier_wait (&pool->threads_dock);
 }

+int
+gomp_pause_host (void)
+{
+  return -1;
+}
+
 #include "../../team.c"
 #endif
--- libgomp/config/nvptx/oacc-parallel.c.jj	2018-04-25 09:40:31.887655569 +0200
+++ libgomp/config/nvptx/oacc-parallel.c	2019-05-07 18:46:36.453110901 +0200
@@ -1,358 +0,0 @@
-/* OpenACC constructs
-
-   Copyright (C) 2014-2018 Free Software Foundation, Inc.
-
-   Contributed by Mentor Embedded.
-
-   This file is part of the GNU Offloading and Multi Processing Library
-   (libgomp).
-
-   Libgomp is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published by
-   the Free Software Foundation; either version 3, or (at your option)
-   any later version.
-
-   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
-   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-   more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include "libgomp_g.h"
-
-__asm__ (".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1);\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1);\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1);\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1);\n"
-	 "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_num_threads\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n"
-	 "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_thread_num\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n"
-	 "// BEGIN GLOBAL FUNCTION DECL: abort\n"
-	 ".extern .func abort;\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1)\n"
-	 "{\n"
-	 ".reg .u32 %ar1;\n"
-	 ".reg .u32 %retval;\n"
-	 ".reg .u64 %hr10;\n"
-	 ".reg .u32 %r22;\n"
-	 ".reg .u32 %r23;\n"
-	 ".reg .u32 %r24;\n"
-	 ".reg .u32 %r25;\n"
-	 ".reg .u32 %r26;\n"
-	 ".reg .u32 %r27;\n"
-	 ".reg .u32 %r28;\n"
-	 ".reg .u32 %r29;\n"
-	 ".reg .pred %r30;\n"
-	 ".reg .u32 %r31;\n"
-	 ".reg .pred %r32;\n"
-	 ".reg .u32 %r33;\n"
-	 ".reg .pred %r34;\n"
-	 ".local .align 8 .b8 %frame[4];\n"
-	 "ld.param.u32 %ar1,[%in_ar1];\n"
-	 "mov.u32 %r27,%ar1;\n"
-	 "st.local.u32 [%frame],%r27;\n"
-	 "ld.local.u32 %r28,[%frame];\n"
-	 "mov.u32 %r29,1;\n"
-	 "setp.eq.u32 %r30,%r28,%r29;\n"
-	 "@%r30 bra $L4;\n"
-	 "mov.u32 %r31,2;\n"
-	 "setp.eq.u32 %r32,%r28,%r31;\n"
-	 "@%r32 bra $L5;\n"
-	 "mov.u32 %r33,0;\n"
-	 "setp.eq.u32 %r34,%r28,%r33;\n"
-	 "@!%r34 bra $L8;\n"
-	 "mov.u32 %r23,%tid.x;\n"
-	 "mov.u32 %r22,%r23;\n"
-	 "bra $L7;\n"
-	 "$L4:\n"
-	 "mov.u32 %r24,%tid.y;\n"
-	 "mov.u32 %r22,%r24;\n"
-	 "bra $L7;\n"
-	 "$L5:\n"
-	 "mov.u32 %r25,%tid.z;\n"
-	 "mov.u32 %r22,%r25;\n"
-	 "bra $L7;\n"
-	 "$L8:\n"
-	 "{\n"
-	 "{\n"
-	 "call abort;\n"
-	 "}\n"
-	 "}\n"
-	 "$L7:\n"
-	 "mov.u32 %r26,%r22;\n"
-	 "mov.u32 %retval,%r26;\n"
-	 "st.param.u32 [%out_retval],%retval;\n"
-	 "ret;\n"
-	 "}\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1)\n"
-	 "{\n"
-	 ".reg .u32 %ar1;\n"
-	 ".reg .u32 %retval;\n"
-	 ".reg .u64 %hr10;\n"
-	 ".reg .u32 %r22;\n"
-	 ".reg .u32 %r23;\n"
-	 ".reg .u32 %r24;\n"
-	 ".reg .u32 %r25;\n"
-	 ".reg .u32 %r26;\n"
-	 ".reg .u32 %r27;\n"
-	 ".reg .u32 %r28;\n"
-	 ".reg .u32 %r29;\n"
-	 ".reg .pred %r30;\n"
-	 ".reg .u32 %r31;\n"
-	 ".reg .pred %r32;\n"
-	 ".reg .u32 %r33;\n"
-	 ".reg .pred %r34;\n"
-	 ".local .align 8 .b8 %frame[4];\n"
-	 "ld.param.u32 %ar1,[%in_ar1];\n"
-	 "mov.u32 %r27,%ar1;\n"
-	 "st.local.u32 [%frame],%r27;\n"
-	 "ld.local.u32 %r28,[%frame];\n"
-	 "mov.u32 %r29,1;\n"
-	 "setp.eq.u32 %r30,%r28,%r29;\n"
-	 "@%r30 bra $L11;\n"
-	 "mov.u32 %r31,2;\n"
-	 "setp.eq.u32 %r32,%r28,%r31;\n"
-	 "@%r32 bra $L12;\n"
-	 "mov.u32 %r33,0;\n"
-	 "setp.eq.u32 %r34,%r28,%r33;\n"
-	 "@!%r34 bra $L15;\n"
-	 "mov.u32 %r23,%ntid.x;\n"
-	 "mov.u32 %r22,%r23;\n"
-	 "bra $L14;\n"
-	 "$L11:\n"
-	 "mov.u32 %r24,%ntid.y;\n"
-	 "mov.u32 %r22,%r24;\n"
-	 "bra $L14;\n"
-	 "$L12:\n"
-	 "mov.u32 %r25,%ntid.z;\n"
-	 "mov.u32 %r22,%r25;\n"
-	 "bra $L14;\n"
-	 "$L15:\n"
-	 "{\n"
-	 "{\n"
-	 "call abort;\n"
-	 "}\n"
-	 "}\n"
-	 "$L14:\n"
-	 "mov.u32 %r26,%r22;\n"
-	 "mov.u32 %retval,%r26;\n"
-	 "st.param.u32 [%out_retval],%retval;\n"
-	 "ret;\n"
-	 "}\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1)\n"
-	 "{\n"
-	 ".reg .u32 %ar1;\n"
-	 ".reg .u32 %retval;\n"
-	 ".reg .u64 %hr10;\n"
-	 ".reg .u32 %r22;\n"
-	 ".reg .u32 %r23;\n"
-	 ".reg .u32 %r24;\n"
-	 ".reg .u32 %r25;\n"
-	 ".reg .u32 %r26;\n"
-	 ".reg .u32 %r27;\n"
-	 ".reg .u32 %r28;\n"
-	 ".reg .u32 %r29;\n"
-	 ".reg .pred %r30;\n"
-	 ".reg .u32 %r31;\n"
-	 ".reg .pred %r32;\n"
-	 ".reg .u32 %r33;\n"
-	 ".reg .pred %r34;\n"
-	 ".local .align 8 .b8 %frame[4];\n"
-	 "ld.param.u32 %ar1,[%in_ar1];\n"
-	 "mov.u32 %r27,%ar1;\n"
-	 "st.local.u32 [%frame],%r27;\n"
-	 "ld.local.u32 %r28,[%frame];\n"
-	 "mov.u32 %r29,1;\n"
-	 "setp.eq.u32 %r30,%r28,%r29;\n"
-	 "@%r30 bra $L18;\n"
-	 "mov.u32 %r31,2;\n"
-	 "setp.eq.u32 %r32,%r28,%r31;\n"
-	 "@%r32 bra $L19;\n"
-	 "mov.u32 %r33,0;\n"
-	 "setp.eq.u32 %r34,%r28,%r33;\n"
-	 "@!%r34 bra $L22;\n"
-	 "mov.u32 %r23,%ctaid.x;\n"
-	 "mov.u32 %r22,%r23;\n"
-	 "bra $L21;\n"
-	 "$L18:\n"
-	 "mov.u32 %r24,%ctaid.y;\n"
-	 "mov.u32 %r22,%r24;\n"
-	 "bra $L21;\n"
-	 "$L19:\n"
-	 "mov.u32 %r25,%ctaid.z;\n"
-	 "mov.u32 %r22,%r25;\n"
-	 "bra $L21;\n"
-	 "$L22:\n"
-	 "{\n"
-	 "{\n"
-	 "call abort;\n"
-	 "}\n"
-	 "}\n"
-	 "$L21:\n"
-	 "mov.u32 %r26,%r22;\n"
-	 "mov.u32 %retval,%r26;\n"
-	 "st.param.u32 [%out_retval],%retval;\n"
-	 "ret;\n"
-	 "}\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1)\n"
-	 "{\n"
-	 ".reg .u32 %ar1;\n"
-	 ".reg .u32 %retval;\n"
-	 ".reg .u64 %hr10;\n"
-	 ".reg .u32 %r22;\n"
-	 ".reg .u32 %r23;\n"
-	 ".reg .u32 %r24;\n"
-	 ".reg .u32 %r25;\n"
-	 ".reg .u32 %r26;\n"
-	 ".reg .u32 %r27;\n"
-	 ".reg .u32 %r28;\n"
-	 ".reg .u32 %r29;\n"
-	 ".reg .pred %r30;\n"
-	 ".reg .u32 %r31;\n"
-	 ".reg .pred %r32;\n"
-	 ".reg .u32 %r33;\n"
-	 ".reg .pred %r34;\n"
-	 ".local .align 8 .b8 %frame[4];\n"
-	 "ld.param.u32 %ar1,[%in_ar1];\n"
-	 "mov.u32 %r27,%ar1;\n"
-	 "st.local.u32 [%frame],%r27;\n"
-	 "ld.local.u32 %r28,[%frame];\n"
-	 "mov.u32 %r29,1;\n"
-	 "setp.eq.u32 %r30,%r28,%r29;\n"
-	 "@%r30 bra $L25;\n"
-	 "mov.u32 %r31,2;\n"
-	 "setp.eq.u32 %r32,%r28,%r31;\n"
-	 "@%r32 bra $L26;\n"
-	 "mov.u32 %r33,0;\n"
-	 "setp.eq.u32 %r34,%r28,%r33;\n"
-	 "@!%r34 bra $L29;\n"
-	 "mov.u32 %r23,%nctaid.x;\n"
-	 "mov.u32 %r22,%r23;\n"
-	 "bra $L28;\n"
-	 "$L25:\n"
-	 "mov.u32 %r24,%nctaid.y;\n"
-	 "mov.u32 %r22,%r24;\n"
-	 "bra $L28;\n"
-	 "$L26:\n"
-	 "mov.u32 %r25,%nctaid.z;\n"
-	 "mov.u32 %r22,%r25;\n"
-	 "bra $L28;\n"
-	 "$L29:\n"
-	 "{\n"
-	 "{\n"
-	 "call abort;\n"
-	 "}\n"
-	 "}\n"
-	 "$L28:\n"
-	 "mov.u32 %r26,%r22;\n"
-	 "mov.u32 %retval,%r26;\n"
-	 "st.param.u32 [%out_retval],%retval;\n"
-	 "ret;\n"
-	 "}\n"
-	 "// BEGIN GLOBAL FUNCTION DEF: GOACC_get_num_threads\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads\n"
-	 "{\n"
-	 ".reg .u32 %retval;\n"
-	 ".reg .u64 %hr10;\n"
-	 ".reg .u32 %r22;\n"
-	 ".reg .u32 %r23;\n"
-	 ".reg .u32 %r24;\n"
-	 ".reg .u32 %r25;\n"
-	 ".reg .u32 %r26;\n"
-	 ".reg .u32 %r27;\n"
-	 ".reg .u32 %r28;\n"
-	 ".reg .u32 %r29;\n"
-	 "mov.u32 %r26,0;\n"
-	 "{\n"
-	 ".param .u32 %retval_in;\n"
-	 "{\n"
-	 ".param .u32 %out_arg0;\n"
-	 "st.param.u32 [%out_arg0],%r26;\n"
-	 "call (%retval_in),GOACC_ntid,(%out_arg0);\n"
-	 "}\n"
-	 "ld.param.u32 %r27,[%retval_in];\n"
-	 "}\n"
-	 "mov.u32 %r22,%r27;\n"
-	 "mov.u32 %r28,0;\n"
-	 "{\n"
-	 ".param .u32 %retval_in;\n"
-	 "{\n"
-	 ".param .u32 %out_arg0;\n"
-	 "st.param.u32 [%out_arg0],%r28;\n"
-	 "call (%retval_in),GOACC_nctaid,(%out_arg0);\n"
-	 "}\n"
-	 "ld.param.u32 %r29,[%retval_in];\n"
-	 "}\n"
-	 "mov.u32 %r23,%r29;\n"
-	 "mul.lo.u32 %r24,%r22,%r23;\n"
-	 "mov.u32 %r25,%r24;\n"
-	 "mov.u32 %retval,%r25;\n"
-	 "st.param.u32 [%out_retval],%retval;\n"
-	 "ret;\n"
-	 "}\n"
-	 "// BEGIN GLOBAL FUNCTION DEF: GOACC_get_thread_num\n"
-	 ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num\n"
-	 "{\n"
-	 ".reg .u32 %retval;\n"
-	 ".reg .u64 %hr10;\n"
-	 ".reg .u32 %r22;\n"
-	 ".reg .u32 %r23;\n"
-	 ".reg .u32 %r24;\n"
-	 ".reg .u32 %r25;\n"
-	 ".reg .u32 %r26;\n"
-	 ".reg .u32 %r27;\n"
-	 ".reg .u32 %r28;\n"
-	 ".reg .u32 %r29;\n"
-	 ".reg .u32 %r30;\n"
-	 ".reg .u32 %r31;\n"
-	 ".reg .u32 %r32;\n"
-	 ".reg .u32 %r33;\n"
-	 "mov.u32 %r28,0;\n"
-	 "{\n"
-	 ".param .u32 %retval_in;\n"
-	 "{\n"
-	 ".param .u32 %out_arg0;\n"
-	 "st.param.u32 [%out_arg0],%r28;\n"
-	 "call (%retval_in),GOACC_ntid,(%out_arg0);\n"
-	 "}\n"
-	 "ld.param.u32 %r29,[%retval_in];\n"
-	 "}\n"
-	 "mov.u32 %r22,%r29;\n"
-	 "mov.u32 %r30,0;\n"
-	 "{\n"
-	 ".param .u32 %retval_in;\n"
-	 "{\n"
-	 ".param .u32 %out_arg0;\n"
-	 "st.param.u32 [%out_arg0],%r30;\n"
-	 "call (%retval_in),GOACC_ctaid,(%out_arg0);\n"
-	 "}\n"
-	 "ld.param.u32 %r31,[%retval_in];\n"
-	 "}\n"
-	 "mov.u32 %r23,%r31;\n"
-	 "mul.lo.u32 %r24,%r22,%r23;\n"
-	 "mov.u32 %r32,0;\n"
-	 "{\n"
-	 ".param .u32 %retval_in;\n"
-	 "{\n"
-	 ".param .u32 %out_arg0;\n"
-	 "st.param.u32 [%out_arg0],%r32;\n"
-	 "call (%retval_in),GOACC_tid,(%out_arg0);\n"
-	 "}\n"
-	 "ld.param.u32 %r33,[%retval_in];\n"
-	 "}\n"
-	 "mov.u32 %r25,%r33;\n"
-	 "add.u32 %r26,%r24,%r25;\n"
-	 "mov.u32 %r27,%r26;\n"
-	 "mov.u32 %retval,%r27;\n"
-	 "st.param.u32 [%out_retval],%retval;\n"
-	 "ret;\n"
-	 "}\n");
--- libgomp/config/nvptx/target.c.jj	2018-04-25 09:40:31.890655570 +0200
+++ libgomp/config/nvptx/target.c	2019-05-07 18:46:36.453110901 +0200
@@ -47,3 +47,21 @@ GOMP_teams (unsigned int num_teams, unsi
     }
   gomp_num_teams_var = num_teams - 1;
 }
+
+int
+omp_pause_resource (omp_pause_resource_t kind, int device_num)
+{
+  (void) kind;
+  (void) device_num;
+  return -1;
+}
+
+int
+omp_pause_resource_all (omp_pause_resource_t kind)
+{
+  (void) kind;
+  return -1;
+}
+
+ialias (omp_pause_resource)
+ialias (omp_pause_resource_all)
--- libgomp/config/nvptx/icv-device.c.jj	2018-04-25 09:40:31.889655570 +0200
+++ libgomp/config/nvptx/icv-device.c	2019-05-07 18:46:36.453110901 +0200
@@ -46,20 +46,6 @@ omp_get_num_devices (void)
 }

 int
-omp_get_num_teams (void)
-{
-  return gomp_num_teams_var + 1;
-}
-
-int
-omp_get_team_num (void)
-{
-  int ctaid;
-  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid));
-  return ctaid;
-}
-
-int
 omp_is_initial_device (void)
 {
   /* NVPTX is an accelerator-only target.  */
@@ -69,6 +55,4 @@ omp_is_initial_device (void)
 ialias (omp_set_default_device)
 ialias (omp_get_default_device)
 ialias (omp_get_num_devices)
-ialias (omp_get_num_teams)
-ialias (omp_get_team_num)
 ialias (omp_is_initial_device)
--- libgomp/config/nvptx/affinity-fmt.c.jj	2019-05-07 18:46:36.358112419 +0200
+++ libgomp/config/nvptx/affinity-fmt.c	2019-05-07 18:46:36.358112419 +0200
@@ -0,0 +1,51 @@
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "libgomp.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>  /* For PRIx64.  */
+#endif
+#ifdef HAVE_UNAME
+#include <sys/utsname.h>
+#endif
+
+/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for nvptx,
+   while the nvptx newlib implementation does not support those functions.
+   Override the configure test results here.  */
+#undef HAVE_GETPID
+#undef HAVE_GETHOSTNAME
+
+/* The nvptx newlib implementation does not support fwrite, but it does support
+   write.  Map fwrite to write.  */
+#undef fwrite
+#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size))
+
+#include "../../affinity-fmt.c"
+
--- libgomp/config/mingw32/affinity-fmt.c.jj	2019-05-07 18:46:36.344112642 +0200
+++ libgomp/config/mingw32/affinity-fmt.c	2019-05-07 18:46:36.344112642 +0200
@@ -0,0 +1,68 @@
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
+   Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "libgomp.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>  /* For PRIx64.  */
+#endif
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <errno.h>
+
+static int
+gomp_gethostname (char *name, size_t len)
+{
+  /* On Win9x GetComputerName fails if the input size is less
+     than MAX_COMPUTERNAME_LENGTH + 1.  */
+  char buffer[MAX_COMPUTERNAME_LENGTH + 1];
+  DWORD size = sizeof (buffer);
+  int ret = 0;
+
+  if (!GetComputerName (buffer, &size))
+    return -1;
+
+  if ((size = strlen (buffer) + 1) > len)
+    {
+      errno = EINVAL;
+      /* Truncate as per POSIX spec.  We do not NUL-terminate. */
+      size = len;
+      ret = -1;
+    }
+  memcpy (name, buffer, (size_t) size);
+
+  return ret;
+}
+
+#undef gethostname
+#define gethostname gomp_gethostname
+#define  HAVE_GETHOSTNAME 1
+
+#include "../../affinity-fmt.c"
--- libgomp/config/rtems/bar.c.jj	2018-04-25 09:40:31.902655576 +0200
+++ libgomp/config/rtems/bar.c	2019-05-07 18:46:36.460110789 +0200
@@ -72,184 +72,5 @@ do_wait (int *addr, int val)
     futex_wait (addr, val);
 }

-/* Everything below this point should be identical to the Linux
-   implementation.  */
-
-void
-gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
-{
-  if (__builtin_expect (state & BAR_WAS_LAST, 0))
-    {
-      /* Next time we'll be awaiting TOTAL threads again.  */
-      bar->awaited = bar->total;
-      __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
-			MEMMODEL_RELEASE);
-      futex_wake ((int *) &bar->generation, INT_MAX);
-    }
-  else
-    {
-      do
-	do_wait ((int *) &bar->generation, state);
-      while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) == state);
-    }
-}
-
-void
-gomp_barrier_wait (gomp_barrier_t *bar)
-{
-  gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
-}
-
-/* Like gomp_barrier_wait, except that if the encountering thread
-   is not the last one to hit the barrier, it returns immediately.
-   The intended usage is that a thread which intends to gomp_barrier_destroy
-   this barrier calls gomp_barrier_wait, while all other threads
-   call gomp_barrier_wait_last.  When gomp_barrier_wait returns,
-   the barrier can be safely destroyed.  */
-
-void
-gomp_barrier_wait_last (gomp_barrier_t *bar)
-{
-  gomp_barrier_state_t state = gomp_barrier_wait_start (bar);
-  if (state & BAR_WAS_LAST)
-    gomp_barrier_wait_end (bar, state);
-}
-
-void
-gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
-{
-  futex_wake ((int *) &bar->generation, count == 0 ? INT_MAX : count);
-}
-
-void
-gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
-{
-  unsigned int generation, gen;
-
-  if (__builtin_expect (state & BAR_WAS_LAST, 0))
-    {
-      /* Next time we'll be awaiting TOTAL threads again.  */
-      struct gomp_thread *thr = gomp_thread ();
-      struct gomp_team *team = thr->ts.team;
-
-      bar->awaited = bar->total;
-      team->work_share_cancelled = 0;
-      if (__builtin_expect (team->task_count, 0))
-	{
-	  gomp_barrier_handle_tasks (state);
-	  state &= ~BAR_WAS_LAST;
-	}
-      else
-	{
-	  state &= ~BAR_CANCELLED;
-	  state += BAR_INCR - BAR_WAS_LAST;
-	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-	  futex_wake ((int *) &bar->generation, INT_MAX);
-	  return;
-	}
-    }
-
-  generation = state;
-  state &= ~BAR_CANCELLED;
-  do
-    {
-      do_wait ((int *) &bar->generation, generation);
-      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
-      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
-	{
-	  gomp_barrier_handle_tasks (state);
-	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
-	}
-      generation |= gen & BAR_WAITING_FOR_TASK;
-    }
-  while (gen != state + BAR_INCR);
-}
-
-void
-gomp_team_barrier_wait (gomp_barrier_t *bar)
-{
-  gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
-}
-
-void
-gomp_team_barrier_wait_final (gomp_barrier_t *bar)
-{
-  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
-  if (__builtin_expect (state & BAR_WAS_LAST, 0))
-    bar->awaited_final = bar->total;
-  gomp_team_barrier_wait_end (bar, state);
-}
-
-bool
-gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
-				   gomp_barrier_state_t state)
-{
-  unsigned int generation, gen;
-
-  if (__builtin_expect (state & BAR_WAS_LAST, 0))
-    {
-      /* Next time we'll be awaiting TOTAL threads again.  */
-      /* BAR_CANCELLED should never be set in state here, because
-	 cancellation means that at least one of the threads has been
-	 cancelled, thus on a cancellable barrier we should never see
-	 all threads to arrive.  */
-      struct gomp_thread *thr = gomp_thread ();
-      struct gomp_team *team = thr->ts.team;
-
-      bar->awaited = bar->total;
-      team->work_share_cancelled = 0;
-      if (__builtin_expect (team->task_count, 0))
-	{
-	  gomp_barrier_handle_tasks (state);
-	  state &= ~BAR_WAS_LAST;
-	}
-      else
-	{
-	  state += BAR_INCR - BAR_WAS_LAST;
-	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-	  futex_wake ((int *) &bar->generation, INT_MAX);
-	  return false;
-	}
-    }
-
-  if (__builtin_expect (state & BAR_CANCELLED, 0))
-    return true;
-
-  generation = state;
-  do
-    {
-      do_wait ((int *) &bar->generation, generation);
-      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
-      if (__builtin_expect (gen & BAR_CANCELLED, 0))
-	return true;
-      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
-	{
-	  gomp_barrier_handle_tasks (state);
-	  gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
-	}
-      generation |= gen & BAR_WAITING_FOR_TASK;
-    }
-  while (gen != state + BAR_INCR);
-
-  return false;
-}
-
-bool
-gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
-{
-  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
-}
-
-void
-gomp_team_barrier_cancel (struct gomp_team *team)
-{
-  gomp_mutex_lock (&team->task_lock);
-  if (team->barrier.generation & BAR_CANCELLED)
-    {
-      gomp_mutex_unlock (&team->task_lock);
-      return;
-    }
-  team->barrier.generation |= BAR_CANCELLED;
-  gomp_mutex_unlock (&team->task_lock);
-  futex_wake ((int *) &team->barrier.generation, INT_MAX);
-}
+#define GOMP_WAIT_H 1
+#include "../linux/bar.c"
--- libgomp/config/rtems/affinity-fmt.c.jj	2019-05-07 18:46:36.459110805 +0200
+++ libgomp/config/rtems/affinity-fmt.c	2019-05-07 18:46:36.459110805 +0200
@@ -0,0 +1,49 @@
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "libgomp.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>  /* For PRIx64.  */
+#endif
+#ifdef HAVE_UNAME
+#include <sys/utsname.h>
+#endif
+
+/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for RTEMS,
+   but the extra information they give are of little value for the user.
+   Override the configure test results here.  */
+#undef HAVE_GETPID
+#undef HAVE_GETHOSTNAME
+
+/* Avoid the complex fwrite() in favour of the simple write().  */
+#undef fwrite
+#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size))
+
+#include "../../affinity-fmt.c"
--- libgomp/config.h.in.jj	2018-04-25 09:40:31.870655561 +0200
+++ libgomp/config.h.in	2019-05-07 18:46:36.465110710 +0200
@@ -1,5 +1,8 @@
 /* config.h.in.  Generated from configure.ac by autoheader.  */

+/* Define to 1 if you have the `aligned_alloc' function. */
+#undef HAVE_ALIGNED_ALLOC
+
 /* Define to 1 if the target assembler supports .symver directive. */
 #undef HAVE_AS_SYMVER_DIRECTIVE

@@ -33,9 +36,15 @@
 /* Define to 1 if you have the `getgid' function. */
 #undef HAVE_GETGID

+/* Define if gethostname is supported. */
+#undef HAVE_GETHOSTNAME
+
 /* Define to 1 if you have the `getloadavg' function. */
 #undef HAVE_GETLOADAVG

+/* Define if getpid is supported. */
+#undef HAVE_GETPID
+
 /* Define to 1 if you have the `getuid' function. */
 #undef HAVE_GETUID

@@ -45,9 +54,15 @@
 /* Define to 1 if you have the `dl' library (-ldl). */
 #undef HAVE_LIBDL

+/* Define to 1 if you have the `memalign' function. */
+#undef HAVE_MEMALIGN
+
 /* Define to 1 if you have the <memory.h> header file. */
 #undef HAVE_MEMORY_H

+/* Define to 1 if you have the `posix_memalign' function. */
+#undef HAVE_POSIX_MEMALIGN
+
 /* Define if pthread_{,attr_}{g,s}etaffinity_np is supported. */
 #undef HAVE_PTHREAD_AFFINITY_NP

@@ -103,9 +118,15 @@
 /* Define to 1 if the target supports thread-local storage. */
 #undef HAVE_TLS

+/* Define if uname is supported and struct utsname has nodename field. */
+#undef HAVE_UNAME
+
 /* Define to 1 if you have the <unistd.h> header file. */
 #undef HAVE_UNISTD_H

+/* Define to 1 if you have the `_aligned_malloc' function. */
+#undef HAVE__ALIGNED_MALLOC
+
 /* Define to 1 if you have the `__secure_getenv' function. */
 #undef HAVE___SECURE_GETENV

@@ -125,8 +146,8 @@
    */
 #undef LT_OBJDIR

-/* Define to offload targets, separated by commas. */
-#undef OFFLOAD_TARGETS
+/* Define to offload plugins, separated by commas. */
+#undef OFFLOAD_PLUGINS

 /* Name of package */
 #undef PACKAGE
--- libgomp/teams.c.jj	2019-05-07 18:46:36.548109384 +0200
+++ libgomp/teams.c	2019-05-07 18:46:36.548109384 +0200
@@ -0,0 +1,74 @@
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
+   Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* This file handles the host TEAMS construct.  */
+
+#include "libgomp.h"
+#include <limits.h>
+
+static unsigned gomp_num_teams = 1, gomp_team_num = 0;
+
+void
+GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams,
+		unsigned int thread_limit, unsigned int flags)
+{
+  (void) flags;
+  (void) num_teams;
+  unsigned old_thread_limit_var = 0;
+  if (thread_limit)
+    {
+      struct gomp_task_icv *icv = gomp_icv (true);
+      old_thread_limit_var = icv->thread_limit_var;
+      icv->thread_limit_var
+	= thread_limit > INT_MAX ? UINT_MAX : thread_limit;
+    }
+  if (num_teams == 0)
+    num_teams = 3;
+  gomp_num_teams = num_teams;
+  for (gomp_team_num = 0; gomp_team_num < num_teams; gomp_team_num++)
+    fn (data);
+  gomp_num_teams = 1;
+  gomp_team_num = 0;
+  if (thread_limit)
+    {
+      struct gomp_task_icv *icv = gomp_icv (true);
+      icv->thread_limit_var = old_thread_limit_var;
+    }
+}
+
+int
+omp_get_num_teams (void)
+{
+  return gomp_num_teams;
+}
+
+int
+omp_get_team_num (void)
+{
+  return gomp_team_num;
+}
+
+ialias (omp_get_num_teams)
+ialias (omp_get_team_num)
--- libgomp/libgomp.map.jj	2018-04-25 09:40:31.321655307 +0200
+++ libgomp/libgomp.map	2019-05-07 18:46:36.525109751 +0200
@@ -164,6 +164,22 @@ OMP_4.5 {
 	omp_target_disassociate_ptr;
 } OMP_4.0;

+OMP_5.0 {
+  global:
+	omp_capture_affinity;
+	omp_capture_affinity_;
+	omp_display_affinity;
+	omp_display_affinity_;
+	omp_get_affinity_format;
+	omp_get_affinity_format_;
+	omp_set_affinity_format;
+	omp_set_affinity_format_;
+	omp_pause_resource;
+	omp_pause_resource_;
+	omp_pause_resource_all;
+	omp_pause_resource_all_;
+} OMP_4.5;
+
 GOMP_1.0 {
   global:
 	GOMP_atomic_end;
@@ -298,6 +314,34 @@ GOMP_4.5 {
 	GOMP_parallel_loop_nonmonotonic_guided;
 } GOMP_4.0.1;

+GOMP_5.0 {
+  global:
+	GOMP_loop_doacross_start;
+	GOMP_loop_maybe_nonmonotonic_runtime_next;
+	GOMP_loop_maybe_nonmonotonic_runtime_start;
+	GOMP_loop_nonmonotonic_runtime_next;
+	GOMP_loop_nonmonotonic_runtime_start;
+	GOMP_loop_ordered_start;
+	GOMP_loop_start;
+	GOMP_loop_ull_doacross_start;
+	GOMP_loop_ull_maybe_nonmonotonic_runtime_next;
+	GOMP_loop_ull_maybe_nonmonotonic_runtime_start;
+	GOMP_loop_ull_nonmonotonic_runtime_next;
+	GOMP_loop_ull_nonmonotonic_runtime_start;
+	GOMP_loop_ull_ordered_start;
+	GOMP_loop_ull_start;
+	GOMP_parallel_loop_maybe_nonmonotonic_runtime;
+	GOMP_parallel_loop_nonmonotonic_runtime;
+	GOMP_parallel_reductions;
+	GOMP_sections2_start;
+	GOMP_taskgroup_reduction_register;
+	GOMP_taskgroup_reduction_unregister;
+	GOMP_task_reduction_remap;
+	GOMP_taskwait_depend;
+	GOMP_teams_reg;
+	GOMP_workshare_task_reduction_unregister;
+} GOMP_4.5;
+
 OACC_2.0 {
   global:
 	acc_get_num_devices;
@@ -386,6 +430,52 @@ OACC_2.0.1 {
 	acc_pcreate;
 } OACC_2.0;

+OACC_2.5 {
+  global:
+	acc_copyin_async;
+	acc_copyin_async_32_h_;
+	acc_copyin_async_64_h_;
+	acc_copyin_async_array_h_;
+	acc_copyout_async;
+	acc_copyout_async_32_h_;
+	acc_copyout_async_64_h_;
+	acc_copyout_async_array_h_;
+	acc_copyout_finalize;
+	acc_copyout_finalize_32_h_;
+	acc_copyout_finalize_64_h_;
+	acc_copyout_finalize_array_h_;
+	acc_copyout_finalize_async;
+	acc_copyout_finalize_async_32_h_;
+	acc_copyout_finalize_async_64_h_;
+	acc_copyout_finalize_async_array_h_;
+	acc_create_async;
+	acc_create_async_32_h_;
+	acc_create_async_64_h_;
+	acc_create_async_array_h_;
+	acc_delete_async;
+	acc_delete_async_32_h_;
+	acc_delete_async_64_h_;
+	acc_delete_async_array_h_;
+	acc_delete_finalize;
+	acc_delete_finalize_32_h_;
+	acc_delete_finalize_64_h_;
+	acc_delete_finalize_array_h_;
+	acc_delete_finalize_async;
+	acc_delete_finalize_async_32_h_;
+	acc_delete_finalize_async_64_h_;
+	acc_delete_finalize_async_array_h_;
+	acc_memcpy_from_device_async;
+	acc_memcpy_to_device_async;
+	acc_update_device_async;
+	acc_update_device_async_32_h_;
+	acc_update_device_async_64_h_;
+	acc_update_device_async_array_h_;
+	acc_update_self_async;
+	acc_update_self_async_32_h_;
+	acc_update_self_async_64_h_;
+	acc_update_self_async_array_h_;
+} OACC_2.0.1;
+
 GOACC_2.0 {
   global:
 	GOACC_data_end;
@@ -420,3 +510,8 @@ GOMP_PLUGIN_1.1 {
   global:
 	GOMP_PLUGIN_target_task_completion;
 } GOMP_PLUGIN_1.0;
+
+GOMP_PLUGIN_1.2 {
+  global:
+	GOMP_PLUGIN_acc_default_dim;
+} GOMP_PLUGIN_1.1;
--- libgomp/oacc-async.c.jj	2018-04-25 09:40:31.925655587 +0200
+++ libgomp/oacc-async.c	2019-05-07 18:46:36.528109704 +0200
@@ -34,7 +34,7 @@
 int
 acc_async_test (int async)
 {
-  if (async < acc_async_sync)
+  if (!async_valid_p (async))
     gomp_fatal ("invalid async argument: %d", async);

   struct goacc_thread *thr = goacc_thread ();
@@ -59,7 +59,7 @@ acc_async_test_all (void)
 void
 acc_wait (int async)
 {
-  if (async < acc_async_sync)
+  if (!async_valid_p (async))
     gomp_fatal ("invalid async argument: %d", async);

   struct goacc_thread *thr = goacc_thread ();
@@ -117,7 +117,7 @@ acc_async_wait_all (void)
 void
 acc_wait_all_async (int async)
 {
-  if (async < acc_async_sync)
+  if (!async_valid_p (async))
     gomp_fatal ("invalid async argument: %d", async);

   struct goacc_thread *thr = goacc_thread ();
--- libgomp/loop_ull.c.jj	2018-04-25 09:40:31.912655580 +0200
+++ libgomp/loop_ull.c	2019-05-07 18:46:36.527109719 +0200
@@ -27,8 +27,12 @@

 #include <limits.h>
 #include <stdlib.h>
+#include <string.h>
 #include "libgomp.h"

+ialias (GOMP_loop_ull_runtime_next)
+ialias_redirect (GOMP_taskgroup_reduction_register)
+
 typedef unsigned long long gomp_ull;

 /* Initialize the given work share construct from the given arguments.  */
@@ -104,7 +108,7 @@ gomp_loop_ull_static_start (bool up, gom
   struct gomp_thread *thr = gomp_thread ();

   thr->ts.static_trip = 0;
-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
 			  GFS_STATIC, chunk_size);
@@ -122,7 +126,7 @@ gomp_loop_ull_dynamic_start (bool up, go
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
 			  GFS_DYNAMIC, chunk_size);
@@ -148,7 +152,7 @@ gomp_loop_ull_guided_start (bool up, gom
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
 			  GFS_GUIDED, chunk_size);
@@ -171,7 +175,7 @@ GOMP_loop_ull_runtime_start (bool up, go
 			     gomp_ull incr, gomp_ull *istart, gomp_ull *iend)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
-  switch (icv->run_sched_var)
+  switch (icv->run_sched_var & ~GFS_MONOTONIC)
     {
     case GFS_STATIC:
       return gomp_loop_ull_static_start (up, start, end, incr,
@@ -195,6 +199,99 @@ GOMP_loop_ull_runtime_start (bool up, go
     }
 }

+static long
+gomp_adjust_sched (long sched, gomp_ull *chunk_size)
+{
+  sched &= ~GFS_MONOTONIC;
+  switch (sched)
+    {
+    case GFS_STATIC:
+    case GFS_DYNAMIC:
+    case GFS_GUIDED:
+      return sched;
+    /* GFS_RUNTIME is used for runtime schedule without monotonic
+       or nonmonotonic modifiers on the clause.
+       GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic
+       modifier.  */
+    case GFS_RUNTIME:
+    /* GFS_AUTO is used for runtime schedule with nonmonotonic
+       modifier.  */
+    case GFS_AUTO:
+      {
+	struct gomp_task_icv *icv = gomp_icv (false);
+	sched = icv->run_sched_var & ~GFS_MONOTONIC;
+	switch (sched)
+	  {
+	  case GFS_STATIC:
+	  case GFS_DYNAMIC:
+	  case GFS_GUIDED:
+	    *chunk_size = icv->run_sched_chunk_size;
+	    break;
+	  case GFS_AUTO:
+	    sched = GFS_STATIC;
+	    *chunk_size = 0;
+	    break;
+	  default:
+	    abort ();
+	  }
+	return sched;
+      }
+    default:
+      abort ();
+    }
+}
+
+bool
+GOMP_loop_ull_start (bool up, gomp_ull start, gomp_ull end,
+		     gomp_ull incr, long sched, gomp_ull chunk_size,
+		     gomp_ull *istart, gomp_ull *iend,
+		     uintptr_t *reductions, void **mem)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  thr->ts.static_trip = 0;
+  if (reductions)
+    gomp_workshare_taskgroup_start ();
+  if (gomp_work_share_start (0))
+    {
+      sched = gomp_adjust_sched (sched, &chunk_size);
+      gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
+      			  sched, chunk_size);
+      if (reductions)
+	{
+	  GOMP_taskgroup_reduction_register (reductions);
+	  thr->task->taskgroup->workshare = true;
+	  thr->ts.work_share->task_reductions = reductions;
+	}
+      if (mem)
+	{
+	  uintptr_t size = (uintptr_t) *mem;
+	  if (size > (sizeof (struct gomp_work_share)
+		      - offsetof (struct gomp_work_share,
+				  inline_ordered_team_ids)))
+	    thr->ts.work_share->ordered_team_ids
+	      = gomp_malloc_cleared (size);
+	  else
+	    memset (thr->ts.work_share->ordered_team_ids, '\0', size);
+	  *mem = (void *) thr->ts.work_share->ordered_team_ids;
+	}
+      gomp_work_share_init_done ();
+    }
+  else
+    {
+      if (reductions)
+	{
+	  uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
+	  gomp_workshare_task_reduction_register (reductions,
+						  first_reductions);
+	}
+      if (mem)
+	*mem = (void *) thr->ts.work_share->ordered_team_ids;
+    }
+
+  return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend);
+}
+
 /* The *_ordered_*_start routines are similar.  The only difference is that
    this work-share construct is initialized to expect an ORDERED section.  */

@@ -206,7 +303,7 @@ gomp_loop_ull_ordered_static_start (bool
   struct gomp_thread *thr = gomp_thread ();

   thr->ts.static_trip = 0;
-  if (gomp_work_share_start (true))
+  if (gomp_work_share_start (1))
     {
       gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
 			  GFS_STATIC, chunk_size);
@@ -225,7 +322,7 @@ gomp_loop_ull_ordered_dynamic_start (boo
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (true))
+  if (gomp_work_share_start (1))
     {
       gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
 			  GFS_DYNAMIC, chunk_size);
@@ -251,7 +348,7 @@ gomp_loop_ull_ordered_guided_start (bool
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (true))
+  if (gomp_work_share_start (1))
     {
       gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
 			  GFS_GUIDED, chunk_size);
@@ -275,7 +372,7 @@ GOMP_loop_ull_ordered_runtime_start (boo
 				     gomp_ull *iend)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
-  switch (icv->run_sched_var)
+  switch (icv->run_sched_var & ~GFS_MONOTONIC)
     {
     case GFS_STATIC:
       return gomp_loop_ull_ordered_static_start (up, start, end, incr,
@@ -299,6 +396,82 @@ GOMP_loop_ull_ordered_runtime_start (boo
     }
 }

+bool
+GOMP_loop_ull_ordered_start (bool up, gomp_ull start, gomp_ull end,
+			     gomp_ull incr, long sched, gomp_ull chunk_size,
+			     gomp_ull *istart, gomp_ull *iend,
+			     uintptr_t *reductions, void **mem)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  size_t ordered = 1;
+  bool ret;
+
+  thr->ts.static_trip = 0;
+  if (reductions)
+    gomp_workshare_taskgroup_start ();
+  if (mem)
+    ordered += (uintptr_t) *mem;
+  if (gomp_work_share_start (ordered))
+    {
+      sched = gomp_adjust_sched (sched, &chunk_size);
+      gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr,
+			  sched, chunk_size);
+      if (reductions)
+	{
+	  GOMP_taskgroup_reduction_register (reductions);
+	  thr->task->taskgroup->workshare = true;
+	  thr->ts.work_share->task_reductions = reductions;
+	}
+      if (sched == GFS_STATIC)
+	gomp_ordered_static_init ();
+      else
+	gomp_mutex_lock (&thr->ts.work_share->lock);
+      gomp_work_share_init_done ();
+    }
+  else
+    {
+      if (reductions)
+	{
+	  uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
+	  gomp_workshare_task_reduction_register (reductions,
+						  first_reductions);
+	}
+      sched = thr->ts.work_share->sched;
+      if (sched != GFS_STATIC)
+	gomp_mutex_lock (&thr->ts.work_share->lock);
+    }
+
+  if (mem)
+    {
+      uintptr_t p
+	= (uintptr_t) (thr->ts.work_share->ordered_team_ids
+		       + (thr->ts.team ? thr->ts.team->nthreads : 1));
+      p += __alignof__ (long long) - 1;
+      p &= ~(__alignof__ (long long) - 1);
+      *mem = (void *) p;
+    }
+
+  switch (sched)
+    {
+    case GFS_STATIC:
+    case GFS_AUTO:
+      return !gomp_iter_ull_static_next (istart, iend);
+    case GFS_DYNAMIC:
+      ret = gomp_iter_ull_dynamic_next_locked (istart, iend);
+      break;
+    case GFS_GUIDED:
+      ret = gomp_iter_ull_guided_next_locked (istart, iend);
+      break;
+    default:
+      abort ();
+    }
+
+  if (ret)
+    gomp_ordered_first ();
+  gomp_mutex_unlock (&thr->ts.work_share->lock);
+  return ret;
+}
+
 /* The *_doacross_*_start routines are similar.  The only difference is that
    this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
    section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1
@@ -313,11 +486,11 @@ gomp_loop_ull_doacross_static_start (uns
   struct gomp_thread *thr = gomp_thread ();

   thr->ts.static_trip = 0;
-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
 			  GFS_STATIC, chunk_size);
-      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size, 0);
       gomp_work_share_init_done ();
     }

@@ -332,11 +505,11 @@ gomp_loop_ull_doacross_dynamic_start (un
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
 			  GFS_DYNAMIC, chunk_size);
-      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size, 0);
       gomp_work_share_init_done ();
     }

@@ -359,11 +532,11 @@ gomp_loop_ull_doacross_guided_start (uns
   struct gomp_thread *thr = gomp_thread ();
   bool ret;

-  if (gomp_work_share_start (false))
+  if (gomp_work_share_start (0))
     {
       gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
 			  GFS_GUIDED, chunk_size);
-      gomp_doacross_ull_init (ncounts, counts, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size, 0);
       gomp_work_share_init_done ();
     }

@@ -383,7 +556,7 @@ GOMP_loop_ull_doacross_runtime_start (un
 				      gomp_ull *istart, gomp_ull *iend)
 {
   struct gomp_task_icv *icv = gomp_icv (false);
-  switch (icv->run_sched_var)
+  switch (icv->run_sched_var & ~GFS_MONOTONIC)
     {
     case GFS_STATIC:
       return gomp_loop_ull_doacross_static_start (ncounts, counts,
@@ -407,6 +580,51 @@ GOMP_loop_ull_doacross_runtime_start (un
     }
 }

+bool
+GOMP_loop_ull_doacross_start (unsigned ncounts, gomp_ull *counts,
+			      long sched, gomp_ull chunk_size,
+			      gomp_ull *istart, gomp_ull *iend,
+			      uintptr_t *reductions, void **mem)
+{
+  struct gomp_thread *thr = gomp_thread ();
+
+  thr->ts.static_trip = 0;
+  if (reductions)
+    gomp_workshare_taskgroup_start ();
+  if (gomp_work_share_start (0))
+    {
+      size_t extra = 0;
+      if (mem)
+	extra = (uintptr_t) *mem;
+      sched = gomp_adjust_sched (sched, &chunk_size);
+      gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1,
+			  sched, chunk_size);
+      gomp_doacross_ull_init (ncounts, counts, chunk_size, extra);
+      if (reductions)
+	{
+	  GOMP_taskgroup_reduction_register (reductions);
+	  thr->task->taskgroup->workshare = true;
+	  thr->ts.work_share->task_reductions = reductions;
+	}
+      gomp_work_share_init_done ();
+    }
+  else
+    {
+      if (reductions)
+	{
+	  uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
+	  gomp_workshare_task_reduction_register (reductions,
+						  first_reductions);
+	}
+      sched = thr->ts.work_share->sched;
+    }
+
+  if (mem)
+    *mem = thr->ts.work_share->doacross->extra;
+
+  return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend);
+}
+
 /* The *_next routines are called when the thread completes processing of
    the iteration block currently assigned to it.  If the work-share
    construct is bound directly to a parallel construct, then the iteration
@@ -570,6 +788,10 @@ extern __typeof(gomp_loop_ull_dynamic_st
 	__attribute__((alias ("gomp_loop_ull_dynamic_start")));
 extern __typeof(gomp_loop_ull_guided_start) GOMP_loop_ull_nonmonotonic_guided_start
 	__attribute__((alias ("gomp_loop_ull_guided_start")));
+extern __typeof(GOMP_loop_ull_runtime_start) GOMP_loop_ull_nonmonotonic_runtime_start
+	__attribute__((alias ("GOMP_loop_ull_runtime_start")));
+extern __typeof(GOMP_loop_ull_runtime_start) GOMP_loop_ull_maybe_nonmonotonic_runtime_start
+	__attribute__((alias ("GOMP_loop_ull_runtime_start")));

 extern __typeof(gomp_loop_ull_ordered_static_start) GOMP_loop_ull_ordered_static_start
 	__attribute__((alias ("gomp_loop_ull_ordered_static_start")));
@@ -595,6 +817,10 @@ extern __typeof(gomp_loop_ull_dynamic_ne
 	__attribute__((alias ("gomp_loop_ull_dynamic_next")));
 extern __typeof(gomp_loop_ull_guided_next) GOMP_loop_ull_nonmonotonic_guided_next
 	__attribute__((alias ("gomp_loop_ull_guided_next")));
+extern __typeof(GOMP_loop_ull_runtime_next) GOMP_loop_ull_nonmonotonic_runtime_next
+	__attribute__((alias ("GOMP_loop_ull_runtime_next")));
+extern __typeof(GOMP_loop_ull_runtime_next) GOMP_loop_ull_maybe_nonmonotonic_runtime_next
+	__attribute__((alias ("GOMP_loop_ull_runtime_next")));

 extern __typeof(gomp_loop_ull_ordered_static_next) GOMP_loop_ull_ordered_static_next
 	__attribute__((alias ("gomp_loop_ull_ordered_static_next")));
@@ -650,6 +876,23 @@ GOMP_loop_ull_nonmonotonic_guided_start
 }

 bool
+GOMP_loop_ull_nonmonotonic_runtime_start (bool up, gomp_ull start,
+					  gomp_ull end, gomp_ull incr,
+					  gomp_ull *istart, gomp_ull *iend)
+{
+  return GOMP_loop_ull_runtime_start (up, start, end, incr, istart, iend);
+}
+
+bool
+GOMP_loop_ull_maybe_nonmonotonic_runtime_start (bool up, gomp_ull start,
+						gomp_ull end, gomp_ull incr,
+						gomp_ull *istart,
+						gomp_ull *iend)
+{
+  return GOMP_loop_ull_runtime_start (up, start, end, incr, istart, iend);
+}
+
+bool
 GOMP_loop_ull_ordered_static_start (bool up, gomp_ull start, gomp_ull end,
 				    gomp_ull incr, gomp_ull chunk_size,
 				    gomp_ull *istart, gomp_ull *iend)
@@ -734,6 +977,19 @@ GOMP_loop_ull_nonmonotonic_guided_next (
 }

 bool
+GOMP_loop_ull_nonmonotonic_runtime_next (gomp_ull *istart, gomp_ull *iend)
+{
+  return GOMP_loop_ull_runtime_next (istart, iend);
+}
+
+bool
+GOMP_loop_ull_maybe_nonmonotonic_runtime_next (gomp_ull *istart,
+					       gomp_ull *iend)
+{
+  return GOMP_loop_ull_runtime_next (istart, iend);
+}
+
+bool
 GOMP_loop_ull_ordered_static_next (gomp_ull *istart, gomp_ull *iend)
 {
   return gomp_loop_ull_ordered_static_next (istart, iend);
--- libgomp/oacc-int.h.jj	2018-04-25 09:40:31.320655306 +0200
+++ libgomp/oacc-int.h	2019-05-07 18:46:36.529109688 +0200
@@ -99,6 +99,28 @@ void goacc_restore_bind (void);
 void goacc_lazy_initialize (void);
 void goacc_host_init (void);

+static inline bool
+async_valid_stream_id_p (int async)
+{
+  return async >= 0;
+}
+
+static inline bool
+async_valid_p (int async)
+{
+  return (async == acc_async_noval || async == acc_async_sync
+	  || async_valid_stream_id_p (async));
+}
+
+static inline bool
+async_synchronous_p (int async)
+{
+  if (!async_valid_p (async))
+    return true;
+
+  return async == acc_async_sync;
+}
+
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility pop
 #endif
--- libgomp/testsuite/Makefile.in.jj	2018-04-25 09:40:31.452655368 +0200
+++ libgomp/testsuite/Makefile.in	2019-05-07 18:51:35.754330084 +0200
@@ -223,6 +223,7 @@ mkdir_p = @mkdir_p@
 multi_basedir = @multi_basedir@
 offload_additional_lib_paths = @offload_additional_lib_paths@
 offload_additional_options = @offload_additional_options@
+offload_plugins = @offload_plugins@
 offload_targets = @offload_targets@
 oldincludedir = @oldincludedir@
 pdfdir = @pdfdir@
--- libgomp/task.c.jj	2018-04-25 09:40:31.925655587 +0200
+++ libgomp/task.c	2019-05-07 18:46:36.547109400 +0200
@@ -166,21 +166,72 @@ gomp_task_handle_depend (struct gomp_tas
 			 void **depend)
 {
   size_t ndepend = (uintptr_t) depend[0];
-  size_t nout = (uintptr_t) depend[1];
   size_t i;
   hash_entry_type ent;

+  if (ndepend)
+    {
+      /* depend[0] is total # */
+      size_t nout = (uintptr_t) depend[1]; /* # of out: and inout: */
+      /* ndepend - nout is # of in: */
+      for (i = 0; i < ndepend; i++)
+	{
+	  task->depend[i].addr = depend[2 + i];
+	  task->depend[i].is_in = i >= nout;
+	}
+    }
+  else
+    {
+      ndepend = (uintptr_t) depend[1]; /* total # */
+      size_t nout = (uintptr_t) depend[2]; /* # of out: and inout: */
+      size_t nmutexinoutset = (uintptr_t) depend[3]; /* # of mutexinoutset: */
+      /* For now we treat mutexinoutset like out, which is compliant, but
+	 inefficient.  */
+      size_t nin = (uintptr_t) depend[4]; /* # of in: */
+      /* ndepend - nout - nmutexinoutset - nin is # of depobjs */
+      size_t normal = nout + nmutexinoutset + nin;
+      size_t n = 0;
+      for (i = normal; i < ndepend; i++)
+	{
+	  void **d = (void **) (uintptr_t) depend[5 + i];
+	  switch ((uintptr_t) d[1])
+	    {
+	    case GOMP_DEPEND_OUT:
+	    case GOMP_DEPEND_INOUT:
+	    case GOMP_DEPEND_MUTEXINOUTSET:
+	      break;
+	    case GOMP_DEPEND_IN:
+	      continue;
+	    default:
+	      gomp_fatal ("unknown omp_depend_t dependence type %d",
+			  (int) (uintptr_t) d[1]);
+	    }
+	  task->depend[n].addr = d[0];
+	  task->depend[n++].is_in = 0;
+	}
+      for (i = 0; i < normal; i++)
+	{
+	  task->depend[n].addr = depend[5 + i];
+	  task->depend[n++].is_in = i >= nout + nmutexinoutset;
+	}
+      for (i = normal; i < ndepend; i++)
+	{
+	  void **d = (void **) (uintptr_t) depend[5 + i];
+	  if ((uintptr_t) d[1] != GOMP_DEPEND_IN)
+	    continue;
+	  task->depend[n].addr = d[0];
+	  task->depend[n++].is_in = 1;
+	}
+    }
   task->depend_count = ndepend;
   task->num_dependees = 0;
   if (parent->depend_hash == NULL)
     parent->depend_hash = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
   for (i = 0; i < ndepend; i++)
     {
-      task->depend[i].addr = depend[2 + i];
       task->depend[i].next = NULL;
       task->depend[i].prev = NULL;
       task->depend[i].task = task;
-      task->depend[i].is_in = i >= nout;
       task->depend[i].redundant = false;
       task->depend[i].redundant_out = false;

@@ -205,7 +256,7 @@ gomp_task_handle_depend (struct gomp_tas
 	      last = ent;

 	      /* depend(in:...) doesn't depend on earlier depend(in:...).  */
-	      if (i >= nout && ent->is_in)
+	      if (task->depend[i].is_in && ent->is_in)
 		continue;

 	      if (!ent->is_in)
@@ -280,9 +331,18 @@ gomp_task_handle_depend (struct gomp_tas
    then the task may be executed by any member of the team.

    DEPEND is an array containing:
+     if depend[0] is non-zero, then:
 	depend[0]: number of depend elements.
-	depend[1]: number of depend elements of type "out".
-	depend[2..N+1]: address of [1..N]th depend element.  */
+	depend[1]: number of depend elements of type "out/inout".
+	depend[2..N+1]: address of [1..N]th depend element.
+     otherwise, when depend[0] is zero, then:
+	depend[1]: number of depend elements.
+	depend[2]: number of depend elements of type "out/inout".
+	depend[3]: number of depend elements of type "mutexinoutset".
+	depend[4]: number of depend elements of type "in".
+	depend[5..4+depend[2]+depend[3]+depend[4]]: address of depend elements
+	depend[5+depend[2]+depend[3]+depend[4]..4+depend[1]]: address of
+		   omp_depend_t objects.  */

 void
 GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
@@ -303,10 +363,20 @@ GOMP_task (void (*fn) (void *), void *da
 #endif

   /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
-  if (team
-      && (gomp_team_barrier_cancelled (&team->barrier)
-	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
-    return;
+  if (__builtin_expect (gomp_cancel_var, 0) && team)
+    {
+      if (gomp_team_barrier_cancelled (&team->barrier))
+	return;
+      if (thr->task->taskgroup)
+	{
+	  if (thr->task->taskgroup->cancelled)
+	    return;
+	  if (thr->task->taskgroup->workshare
+	      && thr->task->taskgroup->prev
+	      && thr->task->taskgroup->prev->cancelled)
+	    return;
+	}
+    }

   if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0)
     priority = 0;
@@ -377,7 +447,7 @@ GOMP_task (void (*fn) (void *), void *da
       size_t depend_size = 0;

       if (flags & GOMP_TASK_FLAG_DEPEND)
-	depend_size = ((uintptr_t) depend[0]
+	depend_size = ((uintptr_t) (depend[0] ? depend[0] : depend[1])
 		       * sizeof (struct gomp_task_depend_entry));
       task = gomp_malloc (sizeof (*task) + depend_size
 			  + arg_size + arg_align - 1);
@@ -404,14 +474,26 @@ GOMP_task (void (*fn) (void *), void *da
       gomp_mutex_lock (&team->task_lock);
       /* If parallel or taskgroup has been cancelled, don't start new
 	 tasks.  */
-      if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
-			     || (taskgroup && taskgroup->cancelled))
-			    && !task->copy_ctors_done, 0))
+      if (__builtin_expect (gomp_cancel_var, 0)
+	  && !task->copy_ctors_done)
 	{
-	  gomp_mutex_unlock (&team->task_lock);
-	  gomp_finish_task (task);
-	  free (task);
-	  return;
+	  if (gomp_team_barrier_cancelled (&team->barrier))
+	    {
+	    do_cancel:
+	      gomp_mutex_unlock (&team->task_lock);
+	      gomp_finish_task (task);
+	      free (task);
+	      return;
+	    }
+	  if (taskgroup)
+	    {
+	      if (taskgroup->cancelled)
+		goto do_cancel;
+	      if (taskgroup->workshare
+		  && taskgroup->prev
+		  && taskgroup->prev->cancelled)
+		goto do_cancel;
+	    }
 	}
       if (taskgroup)
 	taskgroup->num_children++;
@@ -463,6 +545,7 @@ GOMP_task (void (*fn) (void *), void *da

 ialias (GOMP_taskgroup_start)
 ialias (GOMP_taskgroup_end)
+ialias (GOMP_taskgroup_reduction_register)

 #define TYPE long
 #define UTYPE unsigned long
@@ -601,10 +684,20 @@ gomp_create_target_task (struct gomp_dev
   struct gomp_team *team = thr->ts.team;

   /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
-  if (team
-      && (gomp_team_barrier_cancelled (&team->barrier)
-	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
-    return true;
+  if (__builtin_expect (gomp_cancel_var, 0) && team)
+    {
+      if (gomp_team_barrier_cancelled (&team->barrier))
+	return true;
+      if (thr->task->taskgroup)
+	{
+	  if (thr->task->taskgroup->cancelled)
+	    return true;
+	  if (thr->task->taskgroup->workshare
+	      && thr->task->taskgroup->prev
+	      && thr->task->taskgroup->prev->cancelled)
+	    return true;
+	}
+    }

   struct gomp_target_task *ttask;
   struct gomp_task *task;
@@ -617,7 +710,7 @@ gomp_create_target_task (struct gomp_dev

   if (depend != NULL)
     {
-      depend_cnt = (uintptr_t) depend[0];
+      depend_cnt = (uintptr_t) (depend[0] ? depend[0] : depend[1]);
       depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry);
     }
   if (fn)
@@ -687,13 +780,25 @@ gomp_create_target_task (struct gomp_dev
   task->final_task = 0;
   gomp_mutex_lock (&team->task_lock);
   /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
-  if (__builtin_expect (gomp_team_barrier_cancelled (&team->barrier)
-			|| (taskgroup && taskgroup->cancelled), 0))
+  if (__builtin_expect (gomp_cancel_var, 0))
     {
-      gomp_mutex_unlock (&team->task_lock);
-      gomp_finish_task (task);
-      free (task);
-      return true;
+      if (gomp_team_barrier_cancelled (&team->barrier))
+	{
+	do_cancel:
+	  gomp_mutex_unlock (&team->task_lock);
+	  gomp_finish_task (task);
+	  free (task);
+	  return true;
+	}
+      if (taskgroup)
+	{
+	  if (taskgroup->cancelled)
+	    goto do_cancel;
+	  if (taskgroup->workshare
+	      && taskgroup->prev
+	      && taskgroup->prev->cancelled)
+	    goto do_cancel;
+	}
     }
   if (depend_size)
     {
@@ -986,10 +1091,21 @@ gomp_task_run_pre (struct gomp_task *chi

   if (--team->task_queued_count == 0)
     gomp_team_barrier_clear_task_pending (&team->barrier);
-  if ((gomp_team_barrier_cancelled (&team->barrier)
-       || (taskgroup && taskgroup->cancelled))
+  if (__builtin_expect (gomp_cancel_var, 0)
       && !child_task->copy_ctors_done)
-    return true;
+    {
+      if (gomp_team_barrier_cancelled (&team->barrier))
+	return true;
+      if (taskgroup)
+	{
+	  if (taskgroup->cancelled)
+	    return true;
+	  if (taskgroup->workshare
+	      && taskgroup->prev
+	      && taskgroup->prev->cancelled)
+	    return true;
+	}
+    }
   return false;
 }

@@ -1456,6 +1572,35 @@ GOMP_taskwait (void)
     }
 }

+/* Called when encountering a taskwait directive with depend clause(s).
+   Wait as if it was an mergeable included task construct with empty body.  */
+
+void
+GOMP_taskwait_depend (void **depend)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+
+  /* If parallel or taskgroup has been cancelled, return early.  */
+  if (__builtin_expect (gomp_cancel_var, 0) && team)
+    {
+      if (gomp_team_barrier_cancelled (&team->barrier))
+	return;
+      if (thr->task->taskgroup)
+	{
+	  if (thr->task->taskgroup->cancelled)
+	    return;
+	  if (thr->task->taskgroup->workshare
+	      && thr->task->taskgroup->prev
+	      && thr->task->taskgroup->prev->cancelled)
+	    return;
+	}
+    }
+
+  if (thr->task && thr->task->depend_hash)
+    gomp_task_maybe_wait_for_dependencies (depend);
+}
+
 /* An undeferred task is about to run.  Wait for all tasks that this
    undeferred task depends on.

@@ -1464,7 +1609,7 @@ GOMP_taskwait (void)
    the scheduling queues.  Then we iterate through these imminently
    ready tasks (and possibly other high priority tasks), and run them.
    If we run out of ready dependencies to execute, we either wait for
-   the reamining dependencies to finish, or wait for them to get
+   the remaining dependencies to finish, or wait for them to get
    scheduled so we can run them.

    DEPEND is as in GOMP_task.  */
@@ -1477,21 +1622,50 @@ gomp_task_maybe_wait_for_dependencies (v
   struct gomp_team *team = thr->ts.team;
   struct gomp_task_depend_entry elem, *ent = NULL;
   struct gomp_taskwait taskwait;
-  size_t ndepend = (uintptr_t) depend[0];
+  size_t orig_ndepend = (uintptr_t) depend[0];
   size_t nout = (uintptr_t) depend[1];
+  size_t ndepend = orig_ndepend;
+  size_t normal = ndepend;
+  size_t n = 2;
   size_t i;
   size_t num_awaited = 0;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
   int do_wake = 0;

+  if (ndepend == 0)
+    {
+      ndepend = nout;
+      nout = (uintptr_t) depend[2] + (uintptr_t) depend[3];
+      normal = nout + (uintptr_t) depend[4];
+      n = 5;
+    }
   gomp_mutex_lock (&team->task_lock);
   for (i = 0; i < ndepend; i++)
     {
-      elem.addr = depend[i + 2];
+      elem.addr = depend[i + n];
+      elem.is_in = i >= nout;
+      if (__builtin_expect (i >= normal, 0))
+	{
+	  void **d = (void **) elem.addr;
+	  switch ((uintptr_t) d[1])
+	    {
+	    case GOMP_DEPEND_IN:
+	      break;
+	    case GOMP_DEPEND_OUT:
+	    case GOMP_DEPEND_INOUT:
+	    case GOMP_DEPEND_MUTEXINOUTSET:
+	      elem.is_in = 0;
+	      break;
+	    default:
+	      gomp_fatal ("unknown omp_depend_t dependence type %d",
+			  (int) (uintptr_t) d[1]);
+	    }
+	  elem.addr = d[0];
+	}
       ent = htab_find (task->depend_hash, &elem);
       for (; ent; ent = ent->next)
-	if (i >= nout && ent->is_in)
+	if (elem.is_in && ent->is_in)
 	  continue;
 	else
 	  {
@@ -1654,13 +1828,28 @@ GOMP_taskyield (void)
   /* Nothing at the moment.  */
 }

+static inline struct gomp_taskgroup *
+gomp_taskgroup_init (struct gomp_taskgroup *prev)
+{
+  struct gomp_taskgroup *taskgroup
+    = gomp_malloc (sizeof (struct gomp_taskgroup));
+  taskgroup->prev = prev;
+  priority_queue_init (&taskgroup->taskgroup_queue);
+  taskgroup->reductions = prev ? prev->reductions : NULL;
+  taskgroup->in_taskgroup_wait = false;
+  taskgroup->cancelled = false;
+  taskgroup->workshare = false;
+  taskgroup->num_children = 0;
+  gomp_sem_init (&taskgroup->taskgroup_sem, 0);
+  return taskgroup;
+}
+
 void
 GOMP_taskgroup_start (void)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
   struct gomp_task *task = thr->task;
-  struct gomp_taskgroup *taskgroup;

   /* If team is NULL, all tasks are executed as
      GOMP_TASK_UNDEFERRED tasks and thus all children tasks of
@@ -1668,14 +1857,7 @@ GOMP_taskgroup_start (void)
      by the time GOMP_taskgroup_end is called.  */
   if (team == NULL)
     return;
-  taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup));
-  taskgroup->prev = task->taskgroup;
-  priority_queue_init (&taskgroup->taskgroup_queue);
-  taskgroup->in_taskgroup_wait = false;
-  taskgroup->cancelled = false;
-  taskgroup->num_children = 0;
-  gomp_sem_init (&taskgroup->taskgroup_sem, 0);
-  task->taskgroup = taskgroup;
+  task->taskgroup = gomp_taskgroup_init (task->taskgroup);
 }

 void
@@ -1840,6 +2022,302 @@ GOMP_taskgroup_end (void)
   free (taskgroup);
 }

+static inline __attribute__((always_inline)) void
+gomp_reduction_register (uintptr_t *data, uintptr_t *old, uintptr_t *orig,
+			 unsigned nthreads)
+{
+  size_t total_cnt = 0;
+  uintptr_t *d = data;
+  struct htab *old_htab = NULL, *new_htab;
+  do
+    {
+      if (__builtin_expect (orig != NULL, 0))
+	{
+	  /* For worksharing task reductions, memory has been allocated
+	     already by some other thread that encountered the construct
+	     earlier.  */
+	  d[2] = orig[2];
+	  d[6] = orig[6];
+	  orig = (uintptr_t *) orig[4];
+	}
+      else
+	{
+	  size_t sz = d[1] * nthreads;
+	  /* Should use omp_alloc if d[3] is not -1.  */
+	  void *ptr = gomp_aligned_alloc (d[2], sz);
+	  memset (ptr, '\0', sz);
+	  d[2] = (uintptr_t) ptr;
+	  d[6] = d[2] + sz;
+	}
+      d[5] = 0;
+      total_cnt += d[0];
+      if (d[4] == 0)
+	{
+	  d[4] = (uintptr_t) old;
+	  break;
+	}
+      else
+	d = (uintptr_t *) d[4];
+    }
+  while (1);
+  if (old && old[5])
+    {
+      old_htab = (struct htab *) old[5];
+      total_cnt += htab_elements (old_htab);
+    }
+  new_htab = htab_create (total_cnt);
+  if (old_htab)
+    {
+      /* Copy old hash table, like in htab_expand.  */
+      hash_entry_type *p, *olimit;
+      new_htab->n_elements = htab_elements (old_htab);
+      olimit = old_htab->entries + old_htab->size;
+      p = old_htab->entries;
+      do
+	{
+	  hash_entry_type x = *p;
+	  if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY)
+	    *find_empty_slot_for_expand (new_htab, htab_hash (x)) = x;
+	  p++;
+	}
+      while (p < olimit);
+    }
+  d = data;
+  do
+    {
+      size_t j;
+      for (j = 0; j < d[0]; ++j)
+	{
+	  uintptr_t *p = d + 7 + j * 3;
+	  p[2] = (uintptr_t) d;
+	  /* Ugly hack, hash_entry_type is defined for the task dependencies,
+	     which hash on the first element which is a pointer.  We need
+	     to hash also on the first sizeof (uintptr_t) bytes which contain
+	     a pointer.  Hide the cast from the compiler.  */
+	  hash_entry_type n;
+	  __asm ("" : "=g" (n) : "0" (p));
+	  *htab_find_slot (&new_htab, n, INSERT) = n;
+	}
+      if (d[4] == (uintptr_t) old)
+	break;
+      else
+	d = (uintptr_t *) d[4];
+    }
+  while (1);
+  d[5] = (uintptr_t) new_htab;
+}
+
+static void
+gomp_create_artificial_team (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_task_icv *icv;
+  struct gomp_team *team = gomp_new_team (1);
+  struct gomp_task *task = thr->task;
+  icv = task ? &task->icv : &gomp_global_icv;
+  team->prev_ts = thr->ts;
+  thr->ts.team = team;
+  thr->ts.team_id = 0;
+  thr->ts.work_share = &team->work_shares[0];
+  thr->ts.last_work_share = NULL;
+#ifdef HAVE_SYNC_BUILTINS
+  thr->ts.single_count = 0;
+#endif
+  thr->ts.static_trip = 0;
+  thr->task = &team->implicit_task[0];
+  gomp_init_task (thr->task, NULL, icv);
+  if (task)
+    {
+      thr->task = task;
+      gomp_end_task ();
+      free (task);
+      thr->task = &team->implicit_task[0];
+    }
+#ifdef LIBGOMP_USE_PTHREADS
+  else
+    pthread_setspecific (gomp_thread_destructor, thr);
+#endif
+}
+
+/* The format of data is:
+   data[0]	cnt
+   data[1]	size
+   data[2]	alignment (on output array pointer)
+   data[3]	allocator (-1 if malloc allocator)
+   data[4]	next pointer
+   data[5]	used internally (htab pointer)
+   data[6]	used internally (end of array)
+   cnt times
+   ent[0]	address
+   ent[1]	offset
+   ent[2]	used internally (pointer to data[0])
+   The entries are sorted by increasing offset, so that a binary
+   search can be performed.  Normally, data[8] is 0, exception is
+   for worksharing construct task reductions in cancellable parallel,
+   where at offset 0 there should be space for a pointer and an integer
+   which are used internally.  */
+
+void
+GOMP_taskgroup_reduction_register (uintptr_t *data)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_task *task;
+  unsigned nthreads;
+  if (__builtin_expect (team == NULL, 0))
+    {
+      /* The task reduction code needs a team and task, so for
+	 orphaned taskgroups just create the implicit team.  */
+      gomp_create_artificial_team ();
+      ialias_call (GOMP_taskgroup_start) ();
+      team = thr->ts.team;
+    }
+  nthreads = team->nthreads;
+  task = thr->task;
+  gomp_reduction_register (data, task->taskgroup->reductions, NULL, nthreads);
+  task->taskgroup->reductions = data;
+}
+
+void
+GOMP_taskgroup_reduction_unregister (uintptr_t *data)
+{
+  uintptr_t *d = data;
+  htab_free ((struct htab *) data[5]);
+  do
+    {
+      gomp_aligned_free ((void *) d[2]);
+      d = (uintptr_t *) d[4];
+    }
+  while (d && !d[5]);
+}
+ialias (GOMP_taskgroup_reduction_unregister)
+
+/* For i = 0 to cnt-1, remap ptrs[i] which is either address of the
+   original list item or address of previously remapped original list
+   item to address of the private copy, store that to ptrs[i].
+   For i < cntorig, additionally set ptrs[cnt+i] to the address of
+   the original list item.  */
+
+void
+GOMP_task_reduction_remap (size_t cnt, size_t cntorig, void **ptrs)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_task *task = thr->task;
+  unsigned id = thr->ts.team_id;
+  uintptr_t *data = task->taskgroup->reductions;
+  uintptr_t *d;
+  struct htab *reduction_htab = (struct htab *) data[5];
+  size_t i;
+  for (i = 0; i < cnt; ++i)
+    {
+      hash_entry_type ent, n;
+      __asm ("" : "=g" (ent) : "0" (ptrs + i));
+      n = htab_find (reduction_htab, ent);
+      if (n)
+	{
+	  uintptr_t *p;
+	  __asm ("" : "=g" (p) : "0" (n));
+	  /* At this point, p[0] should be equal to (uintptr_t) ptrs[i],
+	     p[1] is the offset within the allocated chunk for each
+	     thread, p[2] is the array registered with
+	     GOMP_taskgroup_reduction_register, d[2] is the base of the
+	     allocated memory and d[1] is the size of the allocated chunk
+	     for one thread.  */
+	  d = (uintptr_t *) p[2];
+	  ptrs[i] = (void *) (d[2] + id * d[1] + p[1]);
+	  if (__builtin_expect (i < cntorig, 0))
+	    ptrs[cnt + i] = (void *) p[0];
+	  continue;
+	}
+      d = data;
+      while (d != NULL)
+	{
+	  if ((uintptr_t) ptrs[i] >= d[2] && (uintptr_t) ptrs[i] < d[6])
+	    break;
+	  d = (uintptr_t *) d[4];
+	}
+      if (d == NULL)
+	gomp_fatal ("couldn't find matching task_reduction or reduction with "
+		    "task modifier for %p", ptrs[i]);
+      uintptr_t off = ((uintptr_t) ptrs[i] - d[2]) % d[1];
+      ptrs[i] = (void *) (d[2] + id * d[1] + off);
+      if (__builtin_expect (i < cntorig, 0))
+	{
+	  size_t lo = 0, hi = d[0] - 1;
+	  while (lo <= hi)
+	    {
+	      size_t m = (lo + hi) / 2;
+	      if (d[7 + 3 * m + 1] < off)
+		lo = m + 1;
+	      else if (d[7 + 3 * m + 1] == off)
+		{
+		  ptrs[cnt + i] = (void *) d[7 + 3 * m];
+		  break;
+		}
+	      else
+		hi = m - 1;
+	    }
+	  if (lo > hi)
+	    gomp_fatal ("couldn't find matching task_reduction or reduction "
+			"with task modifier for %p", ptrs[i]);
+	}
+    }
+}
+
+struct gomp_taskgroup *
+gomp_parallel_reduction_register (uintptr_t *data, unsigned nthreads)
+{
+  struct gomp_taskgroup *taskgroup = gomp_taskgroup_init (NULL);
+  gomp_reduction_register (data, NULL, NULL, nthreads);
+  taskgroup->reductions = data;
+  return taskgroup;
+}
+
+void
+gomp_workshare_task_reduction_register (uintptr_t *data, uintptr_t *orig)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_task *task = thr->task;
+  unsigned nthreads = team->nthreads;
+  gomp_reduction_register (data, task->taskgroup->reductions, orig, nthreads);
+  task->taskgroup->reductions = data;
+}
+
+void
+gomp_workshare_taskgroup_start (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+  struct gomp_task *task;
+
+  if (team == NULL)
+    {
+      gomp_create_artificial_team ();
+      team = thr->ts.team;
+    }
+  task = thr->task;
+  task->taskgroup = gomp_taskgroup_init (task->taskgroup);
+  task->taskgroup->workshare = true;
+}
+
+void
+GOMP_workshare_task_reduction_unregister (bool cancelled)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_task *task = thr->task;
+  struct gomp_team *team = thr->ts.team;
+  uintptr_t *data = task->taskgroup->reductions;
+  ialias_call (GOMP_taskgroup_end) ();
+  if (thr->ts.team_id == 0)
+    ialias_call (GOMP_taskgroup_reduction_unregister) (data);
+  else
+    htab_free ((struct htab *) data[5]);
+
+  if (!cancelled)
+    gomp_team_barrier_wait (&team->barrier);
+}
+
 int
 omp_in_final (void)
 {
--- libgomp/team.c.jj	2018-04-25 09:40:31.322655307 +0200
+++ libgomp/team.c	2019-05-07 18:46:36.548109384 +0200
@@ -32,7 +32,6 @@
 #include <string.h>

 #ifdef LIBGOMP_USE_PTHREADS
-/* This attribute contains PTHREAD_CREATE_DETACHED.  */
 pthread_attr_t gomp_thread_attr;

 /* This key is for the thread destructor.  */
@@ -58,6 +57,7 @@ struct gomp_thread_start_data
   struct gomp_thread_pool *thread_pool;
   unsigned int place;
   bool nested;
+  pthread_t handle;
 };


@@ -89,6 +89,9 @@ gomp_thread_start (void *xdata)
   thr->ts = data->ts;
   thr->task = data->task;
   thr->place = data->place;
+#ifdef GOMP_NEEDS_THREAD_HANDLE
+  thr->handle = data->handle;
+#endif

   thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;

@@ -131,6 +134,7 @@ gomp_thread_start (void *xdata)
     }

   gomp_sem_destroy (&thr->release);
+  pthread_detach (pthread_self ());
   thr->thread_pool = NULL;
   thr->task = NULL;
   return NULL;
@@ -183,7 +187,7 @@ gomp_new_team (unsigned nthreads)
   team->single_count = 0;
 #endif
   team->work_shares_to_free = &team->work_shares[0];
-  gomp_init_work_share (&team->work_shares[0], false, nthreads);
+  gomp_init_work_share (&team->work_shares[0], 0, nthreads);
   team->work_shares[0].next_alloc = NULL;
   team->work_share_list_free = NULL;
   team->work_share_list_alloc = &team->work_shares[1];
@@ -231,6 +235,7 @@ gomp_free_pool_helper (void *thread_pool
   thr->thread_pool = NULL;
   thr->task = NULL;
 #ifdef LIBGOMP_USE_PTHREADS
+  pthread_detach (pthread_self ());
   pthread_exit (NULL);
 #elif defined(__nvptx__)
   asm ("exit;");
@@ -297,7 +302,8 @@ gomp_free_thread (void *arg __attribute_
 #ifdef LIBGOMP_USE_PTHREADS
 void
 gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
-		 unsigned flags, struct gomp_team *team)
+		 unsigned flags, struct gomp_team *team,
+		 struct gomp_taskgroup *taskgroup)
 {
   struct gomp_thread_start_data *start_data;
   struct gomp_thread *thr, *nthr;
@@ -312,6 +318,7 @@ gomp_team_start (void (*fn) (void *), vo
   unsigned int s = 0, rest = 0, p = 0, k = 0;
   unsigned int affinity_count = 0;
   struct gomp_thread **affinity_thr = NULL;
+  bool force_display = false;

   thr = gomp_thread ();
   nested = thr->ts.level;
@@ -319,7 +326,12 @@ gomp_team_start (void (*fn) (void *), vo
   task = thr->task;
   icv = task ? &task->icv : &gomp_global_icv;
   if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0)
-    gomp_init_affinity ();
+    {
+      gomp_init_affinity ();
+      if (__builtin_expect (gomp_display_affinity_var, 0) && nthreads == 1)
+	gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
+				      thr->place);
+    }

   /* Always save the previous state, even if this isn't a nested team.
      In particular, we should save any work share state from an outer
@@ -338,6 +350,9 @@ gomp_team_start (void (*fn) (void *), vo
 #endif
   thr->ts.static_trip = 0;
   thr->task = &team->implicit_task[0];
+#ifdef GOMP_NEEDS_THREAD_HANDLE
+  thr->handle = pthread_self ();
+#endif
   nthreads_var = icv->nthreads_var;
   if (__builtin_expect (gomp_nthreads_var_list != NULL, 0)
       && thr->ts.level < gomp_nthreads_var_list_len)
@@ -350,6 +365,7 @@ gomp_team_start (void (*fn) (void *), vo
       && thr->ts.level < gomp_bind_var_list_len)
     bind_var = gomp_bind_var_list[thr->ts.level];
   gomp_init_task (thr->task, task, icv);
+  thr->task->taskgroup = taskgroup;
   team->implicit_task[0].icv.nthreads_var = nthreads_var;
   team->implicit_task[0].icv.bind_var = bind_var;

@@ -465,7 +481,9 @@ gomp_team_start (void (*fn) (void *), vo
 	  pool->threads
 	    = gomp_realloc (pool->threads,
 			    pool->threads_size
-			    * sizeof (struct gomp_thread_data *));
+			    * sizeof (struct gomp_thread *));
+	  /* Add current (master) thread to threads[].  */
+	  pool->threads[0] = thr;
 	}

       /* Release existing idle threads.  */
@@ -540,6 +558,7 @@ gomp_team_start (void (*fn) (void *), vo
 						+ place_partition_len))
 		{
 		  unsigned int l;
+		  force_display = true;
 		  if (affinity_thr == NULL)
 		    {
 		      unsigned int j;
@@ -623,6 +642,7 @@ gomp_team_start (void (*fn) (void *), vo
 	  gomp_init_task (nthr->task, task, icv);
 	  team->implicit_task[i].icv.nthreads_var = nthreads_var;
 	  team->implicit_task[i].icv.bind_var = bind_var;
+	  nthr->task->taskgroup = taskgroup;
 	  nthr->fn = fn;
 	  nthr->data = data;
 	  team->ordered_release[i] = &nthr->release;
@@ -712,19 +732,17 @@ gomp_team_start (void (*fn) (void *), vo
     {
       size_t stacksize;
       pthread_attr_init (&thread_attr);
-      pthread_attr_setdetachstate (&thread_attr, PTHREAD_CREATE_DETACHED);
       if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize))
 	pthread_attr_setstacksize (&thread_attr, stacksize);
       attr = &thread_attr;
     }

   start_data = gomp_alloca (sizeof (struct gomp_thread_start_data)
-			    * (nthreads-i));
+			    * (nthreads - i));

   /* Launch new threads.  */
   for (; i < nthreads; ++i)
     {
-      pthread_t pt;
       int err;

       start_data->ts.place_partition_off = thr->ts.place_partition_off;
@@ -810,11 +828,14 @@ gomp_team_start (void (*fn) (void *), vo
       gomp_init_task (start_data->task, task, icv);
       team->implicit_task[i].icv.nthreads_var = nthreads_var;
       team->implicit_task[i].icv.bind_var = bind_var;
+      start_data->task->taskgroup = taskgroup;
       start_data->thread_pool = pool;
       start_data->nested = nested;

       attr = gomp_adjust_thread_attr (attr, &thread_attr);
-      err = pthread_create (&pt, attr, gomp_thread_start, start_data++);
+      err = pthread_create (&start_data->handle, attr, gomp_thread_start,
+			    start_data);
+      start_data++;
       if (err != 0)
 	gomp_fatal ("Thread creation failed: %s", strerror (err));
     }
@@ -854,6 +875,42 @@ gomp_team_start (void (*fn) (void *), vo
       gomp_mutex_unlock (&gomp_managed_threads_lock);
 #endif
     }
+  if (__builtin_expect (gomp_display_affinity_var, 0))
+    {
+      if (nested
+	  || nthreads != old_threads_used
+	  || force_display)
+	{
+	  gomp_display_affinity_thread (gomp_thread_self (), &thr->ts,
+					thr->place);
+	  if (nested)
+	    {
+	      start_data -= nthreads - 1;
+	      for (i = 1; i < nthreads; ++i)
+		{
+		  gomp_display_affinity_thread (
+#ifdef LIBGOMP_USE_PTHREADS
+						start_data->handle,
+#else
+						gomp_thread_self (),
+#endif
+						&start_data->ts,
+						start_data->place);
+		  start_data++;
+		}
+	    }
+	  else
+	    {
+	      for (i = 1; i < nthreads; ++i)
+		{
+		  gomp_thread_handle handle
+		    = gomp_thread_to_pthread_t (pool->threads[i]);
+		  gomp_display_affinity_thread (handle, &pool->threads[i]->ts,
+						pool->threads[i]->place);
+		}
+	    }
+	}
+    }
   if (__builtin_expect (affinity_thr != NULL, 0)
       && team->prev_ts.place_partition_len > 64)
     free (affinity_thr);
@@ -894,7 +951,7 @@ gomp_team_end (void)
   gomp_end_task ();
   thr->ts = team->prev_ts;

-  if (__builtin_expect (thr->ts.team != NULL, 0))
+  if (__builtin_expect (thr->ts.level != 0, 0))
     {
 #ifdef HAVE_SYNC_BUILTINS
       __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads);
@@ -959,6 +1016,76 @@ team_destructor (void)
      crashes.  */
   pthread_key_delete (gomp_thread_destructor);
 }
+
+/* Similar to gomp_free_pool_helper, but don't detach itself,
+   gomp_pause_host will pthread_join those threads.  */
+
+static void
+gomp_pause_pool_helper (void *thread_pool)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_thread_pool *pool
+    = (struct gomp_thread_pool *) thread_pool;
+  gomp_simple_barrier_wait_last (&pool->threads_dock);
+  gomp_sem_destroy (&thr->release);
+  thr->thread_pool = NULL;
+  thr->task = NULL;
+  pthread_exit (NULL);
+}
+
+/* Free a thread pool and release its threads.  Return non-zero on
+   failure.  */
+
+int
+gomp_pause_host (void)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_thread_pool *pool = thr->thread_pool;
+  if (thr->ts.level)
+    return -1;
+  if (pool)
+    {
+      if (pool->threads_used > 0)
+	{
+	  int i;
+	  pthread_t *thrs
+	    = gomp_alloca (sizeof (pthread_t) * pool->threads_used);
+	  for (i = 1; i < pool->threads_used; i++)
+	    {
+	      struct gomp_thread *nthr = pool->threads[i];
+	      nthr->fn = gomp_pause_pool_helper;
+	      nthr->data = pool;
+	      thrs[i] = gomp_thread_to_pthread_t (nthr);
+	    }
+	  /* This barrier undocks threads docked on pool->threads_dock.  */
+	  gomp_simple_barrier_wait (&pool->threads_dock);
+	  /* And this waits till all threads have called gomp_barrier_wait_last
+	     in gomp_pause_pool_helper.  */
+	  gomp_simple_barrier_wait (&pool->threads_dock);
+	  /* Now it is safe to destroy the barrier and free the pool.  */
+	  gomp_simple_barrier_destroy (&pool->threads_dock);
+
+#ifdef HAVE_SYNC_BUILTINS
+	  __sync_fetch_and_add (&gomp_managed_threads,
+				1L - pool->threads_used);
+#else
+	  gomp_mutex_lock (&gomp_managed_threads_lock);
+	  gomp_managed_threads -= pool->threads_used - 1L;
+	  gomp_mutex_unlock (&gomp_managed_threads_lock);
+#endif
+	  for (i = 1; i < pool->threads_used; i++)
+	    pthread_join (thrs[i], NULL);
+	}
+      if (pool->last_team)
+	free_team (pool->last_team);
+#ifndef __nvptx__
+      free (pool->threads);
+      free (pool);
+#endif
+      thr->thread_pool = NULL;
+    }
+  return 0;
+}
 #endif

 struct gomp_task_icv *
--- libgomp/libgomp.h.jj	2018-04-25 09:40:31.925655587 +0200
+++ libgomp/libgomp.h	2019-05-07 19:01:51.285535999 +0200
@@ -44,6 +44,7 @@
 #include "config.h"
 #include "gstdint.h"
 #include "libgomp-plugin.h"
+#include "gomp-constants.h"

 #ifdef HAVE_PTHREAD_H
 #include <pthread.h>
@@ -85,9 +86,21 @@ enum memmodel

 /* alloc.c */

+#if defined(HAVE_ALIGNED_ALLOC) \
+    || defined(HAVE__ALIGNED_MALLOC) \
+    || defined(HAVE_POSIX_MEMALIGN) \
+    || defined(HAVE_MEMALIGN)
+/* Defined if gomp_aligned_alloc doesn't use fallback version
+   and free can be used instead of gomp_aligned_free.  */
+#define GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC 1
+#endif
+
 extern void *gomp_malloc (size_t) __attribute__((malloc));
 extern void *gomp_malloc_cleared (size_t) __attribute__((malloc));
 extern void *gomp_realloc (void *, size_t);
+extern void *gomp_aligned_alloc (size_t, size_t)
+  __attribute__((malloc, alloc_size (2)));
+extern void gomp_aligned_free (void *);

 /* Avoid conflicting prototypes of alloca() in system headers by using
    GCC's builtin alloca().  */
@@ -137,7 +150,8 @@ enum gomp_schedule_type
   GFS_STATIC,
   GFS_DYNAMIC,
   GFS_GUIDED,
-  GFS_AUTO
+  GFS_AUTO,
+  GFS_MONOTONIC = 0x80000000U
 };

 struct gomp_doacross_work_share
@@ -174,6 +188,8 @@ struct gomp_doacross_work_share
     /* Likewise, but for the ull implementation.  */
     unsigned long long boundary_ull;
   };
+  /* Pointer to extra memory if needed for lastprivate(conditional).  */
+  void *extra;
   /* Array of shift counts for each dimension if they can be flattened.  */
   unsigned int shift_counts[];
 };
@@ -275,6 +291,9 @@ struct gomp_work_share
     struct gomp_work_share *next_free;
   };

+  /* Task reductions for this work-sharing construct.  */
+  uintptr_t *task_reductions;
+
   /* If only few threads are in the team, ordered_team_ids can point
      to this array which fills the padding at the end of this struct.  */
   unsigned inline_ordered_team_ids[0];
@@ -365,8 +384,12 @@ extern void **gomp_places_list;
 extern unsigned long gomp_places_list_len;
 extern unsigned int gomp_num_teams_var;
 extern int gomp_debug_var;
+extern bool gomp_display_affinity_var;
+extern char *gomp_affinity_format_var;
+extern size_t gomp_affinity_format_len;
 extern int goacc_device_num;
 extern char *goacc_device_type;
+extern int goacc_default_dims[GOMP_DIM_MAX];

 enum gomp_task_kind
 {
@@ -469,8 +492,10 @@ struct gomp_taskgroup
   struct gomp_taskgroup *prev;
   /* Queue of tasks that belong in this taskgroup.  */
   struct priority_queue taskgroup_queue;
+  uintptr_t *reductions;
   bool in_taskgroup_wait;
   bool cancelled;
+  bool workshare;
   gomp_sem_t taskgroup_sem;
   size_t num_children;
 };
@@ -613,6 +638,19 @@ struct gomp_thread

   /* User pthread thread pool */
   struct gomp_thread_pool *thread_pool;
+
+#if defined(LIBGOMP_USE_PTHREADS) \
+    && (!defined(HAVE_TLS) \
+	|| !defined(__GLIBC__) \
+	|| !defined(USING_INITIAL_EXEC_TLS))
+  /* pthread_t of the thread containing this gomp_thread.
+     On Linux when using initial-exec TLS,
+     (typeof (pthread_t)) gomp_thread () - pthread_self ()
+     is constant in all threads, so we can optimize and not
+     store it.  */
+#define GOMP_NEEDS_THREAD_HANDLE 1
+  pthread_t handle;
+#endif
 };


@@ -709,6 +747,25 @@ extern bool gomp_affinity_finalize_place
 extern bool gomp_affinity_init_level (int, unsigned long, bool);
 extern void gomp_affinity_print_place (void *);
 extern void gomp_get_place_proc_ids_8 (int, int64_t *);
+extern void gomp_display_affinity_place (char *, size_t, size_t *, int);
+
+/* affinity-fmt.c */
+
+extern void gomp_print_string (const char *str, size_t len);
+extern void gomp_set_affinity_format (const char *, size_t);
+extern void gomp_display_string (char *, size_t, size_t *, const char *,
+				 size_t);
+#ifdef LIBGOMP_USE_PTHREADS
+typedef pthread_t gomp_thread_handle;
+#else
+typedef struct {} gomp_thread_handle;
+#endif
+extern size_t gomp_display_affinity (char *, size_t, const char *,
+				     gomp_thread_handle,
+				     struct gomp_team_state *, unsigned int);
+extern void gomp_display_affinity_thread (gomp_thread_handle,
+					  struct gomp_team_state *,
+					  unsigned int) __attribute__((cold));

 /* iter.c */

@@ -745,9 +802,9 @@ extern void gomp_ordered_next (void);
 extern void gomp_ordered_static_init (void);
 extern void gomp_ordered_static_next (void);
 extern void gomp_ordered_sync (void);
-extern void gomp_doacross_init (unsigned, long *, long);
+extern void gomp_doacross_init (unsigned, long *, long, size_t);
 extern void gomp_doacross_ull_init (unsigned, unsigned long long *,
-				    unsigned long long);
+				    unsigned long long, size_t);

 /* parallel.c */

@@ -770,6 +827,10 @@ extern bool gomp_create_target_task (str
 				     size_t *, unsigned short *, unsigned int,
 				     void **, void **,
 				     enum gomp_target_task_state);
+extern struct gomp_taskgroup *gomp_parallel_reduction_register (uintptr_t *,
+								unsigned);
+extern void gomp_workshare_taskgroup_start (void);
+extern void gomp_workshare_task_reduction_register (uintptr_t *, uintptr_t *);

 static void inline
 gomp_finish_task (struct gomp_task *task)
@@ -782,9 +843,11 @@ gomp_finish_task (struct gomp_task *task

 extern struct gomp_team *gomp_new_team (unsigned);
 extern void gomp_team_start (void (*) (void *), void *, unsigned,
-			     unsigned, struct gomp_team *);
+			     unsigned, struct gomp_team *,
+			     struct gomp_taskgroup *);
 extern void gomp_team_end (void);
 extern void gomp_free_thread (void *);
+extern int gomp_pause_host (void);

 /* target.c */

@@ -851,6 +914,8 @@ struct splay_tree_key_s {
   uintptr_t tgt_offset;
   /* Reference count.  */
   uintptr_t refcount;
+  /* Dynamic reference count.  */
+  uintptr_t dynamic_refcount;
   /* Pointer to the original mapping of "omp declare target link" object.  */
   splay_tree_key link_key;
 };
@@ -989,7 +1054,9 @@ enum gomp_map_vars_kind
 };

 extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *);
-extern void gomp_acc_remove_pointer (void *, bool, int, int);
+extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int);
+extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *,
+				       unsigned short *);

 extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
 					      size_t, void **, void **,
@@ -999,12 +1066,13 @@ extern void gomp_unmap_vars (struct targ
 extern void gomp_init_device (struct gomp_device_descr *);
 extern void gomp_free_memmap (struct splay_tree_s *);
 extern void gomp_unload_device (struct gomp_device_descr *);
+extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key);

 /* work.c */

-extern void gomp_init_work_share (struct gomp_work_share *, bool, unsigned);
+extern void gomp_init_work_share (struct gomp_work_share *, size_t, unsigned);
 extern void gomp_fini_work_share (struct gomp_work_share *);
-extern bool gomp_work_share_start (bool);
+extern bool gomp_work_share_start (size_t);
 extern void gomp_work_share_end (void);
 extern bool gomp_work_share_end_cancel (void);
 extern void gomp_work_share_end_nowait (void);
@@ -1028,6 +1096,14 @@ gomp_work_share_init_done (void)
 #include "omp-lock.h"
 #define _LIBGOMP_OMP_LOCK_DEFINED 1
 #include "omp.h.in"
+#define omp_sched_monotonic 0x80000000U
+typedef enum omp_pause_resource_t
+{
+  omp_pause_soft = 1,
+  omp_pause_hard = 2
+} omp_pause_resource_t;
+extern int omp_pause_resource (omp_pause_resource_t, int) __GOMP_NOTHROW;
+extern int omp_pause_resource_all (omp_pause_resource_t) __GOMP_NOTHROW;

 #if !defined (HAVE_ATTRIBUTE_VISIBILITY) \
     || !defined (HAVE_ATTRIBUTE_ALIAS) \
@@ -1082,16 +1158,26 @@ extern int gomp_test_nest_lock_25 (omp_n
 # define attribute_hidden
 #endif

+#if __GNUC__ >= 9
+#  define HAVE_ATTRIBUTE_COPY
+#endif
+
+#ifdef HAVE_ATTRIBUTE_COPY
+# define attribute_copy(arg) __attribute__ ((copy (arg)))
+#else
+# define attribute_copy(arg)
+#endif
+
 #ifdef HAVE_ATTRIBUTE_ALIAS
 # define strong_alias(fn, al) \
-  extern __typeof (fn) al __attribute__ ((alias (#fn)));
+  extern __typeof (fn) al __attribute__ ((alias (#fn))) attribute_copy (fn);

 # define ialias_ulp	ialias_str1(__USER_LABEL_PREFIX__)
 # define ialias_str1(x)	ialias_str2(x)
 # define ialias_str2(x)	#x
 # define ialias(fn) \
   extern __typeof (fn) gomp_ialias_##fn \
-    __attribute__ ((alias (#fn))) attribute_hidden;
+    __attribute__ ((alias (#fn))) attribute_hidden attribute_copy (fn);
 # define ialias_redirect(fn) \
   extern __typeof (fn) fn __asm__ (ialias_ulp "gomp_ialias_" #fn) attribute_hidden;
 # define ialias_call(fn) gomp_ialias_ ## fn
@@ -1131,4 +1217,42 @@ task_to_priority_node (enum priority_que
   return (struct priority_node *) ((char *) task
 				   + priority_queue_offset (type));
 }
+
+#ifdef LIBGOMP_USE_PTHREADS
+static inline gomp_thread_handle
+gomp_thread_self (void)
+{
+  return pthread_self ();
+}
+
+static inline gomp_thread_handle
+gomp_thread_to_pthread_t (struct gomp_thread *thr)
+{
+  struct gomp_thread *this_thr = gomp_thread ();
+  if (thr == this_thr)
+    return pthread_self ();
+#ifdef GOMP_NEEDS_THREAD_HANDLE
+  return thr->handle;
+#else
+  /* On Linux with initial-exec TLS, the pthread_t of the thread containing
+     thr can be computed from thr, this_thr and pthread_self (),
+     as the distance between this_thr and pthread_self () is constant.  */
+  return pthread_self () + ((uintptr_t) thr - (uintptr_t) this_thr);
+#endif
+}
+#else
+static inline gomp_thread_handle
+gomp_thread_self (void)
+{
+  return (gomp_thread_handle) {};
+}
+
+static inline gomp_thread_handle
+gomp_thread_to_pthread_t (struct gomp_thread *thr)
+{
+  (void) thr;
+  return gomp_thread_self ();
+}
+#endif
+
 #endif /* LIBGOMP_H */
--- libgomp/oacc-parallel.c.jj	2018-04-25 09:40:31.319655306 +0200
+++ libgomp/oacc-parallel.c	2019-05-07 19:09:47.010991153 +0200
@@ -27,6 +27,8 @@
 /* This file handles OpenACC constructs.  */

 #include "openacc.h"
+void acc_copyout_finalize (void *, size_t) __GOACC_NOTHROW;
+void acc_delete_finalize (void *, size_t) __GOACC_NOTHROW;
 #include "libgomp.h"
 #include "libgomp_g.h"
 #include "gomp-constants.h"
@@ -38,31 +40,95 @@
 #include <stdarg.h>
 #include <assert.h>

+
+/* In the ABI, the GOACC_FLAGs are encoded as an inverted bitmask, so that we
+   continue to support the following two legacy values.  */
+_Static_assert (GOACC_FLAGS_UNMARSHAL (GOMP_DEVICE_ICV) == 0,
+		"legacy GOMP_DEVICE_ICV broken");
+_Static_assert (GOACC_FLAGS_UNMARSHAL (GOMP_DEVICE_HOST_FALLBACK)
+		== GOACC_FLAG_HOST_FALLBACK,
+		"legacy GOMP_DEVICE_HOST_FALLBACK broken");
+
+
+/* Returns the number of mappings associated with the pointer or pset. PSET
+   have three mappings, whereas pointer have two.  */
+
 static int
-find_pset (int pos, size_t mapnum, unsigned short *kinds)
+find_pointer (int pos, size_t mapnum, unsigned short *kinds)
 {
   if (pos + 1 >= mapnum)
     return 0;

   unsigned char kind = kinds[pos+1] & 0xff;

-  return kind == GOMP_MAP_TO_PSET;
+  if (kind == GOMP_MAP_TO_PSET)
+    return 3;
+  else if (kind == GOMP_MAP_POINTER)
+    return 2;
+
+  return 0;
+}
+
+/* Handle the mapping pair that are presented when a
+   deviceptr clause is used with Fortran.  */
+
+static void
+handle_ftn_pointers (size_t mapnum, void **hostaddrs, size_t *sizes,
+		     unsigned short *kinds)
+{
+  int i;
+
+  for (i = 0; i < mapnum; i++)
+    {
+      unsigned short kind1 = kinds[i] & 0xff;
+
+      /* Handle Fortran deviceptr clause.  */
+      if (kind1 == GOMP_MAP_FORCE_DEVICEPTR)
+	{
+	  unsigned short kind2;
+
+	  if (i < (signed)mapnum - 1)
+	    kind2 = kinds[i + 1] & 0xff;
+	  else
+	    kind2 = 0xffff;
+
+	  if (sizes[i] == sizeof (void *))
+	    continue;
+
+	  /* At this point, we're dealing with a Fortran deviceptr.
+	     If the next element is not what we're expecting, then
+	     this is an instance of where the deviceptr variable was
+	     not used within the region and the pointer was removed
+	     by the gimplifier.  */
+	  if (kind2 == GOMP_MAP_POINTER
+	      && sizes[i + 1] == 0
+	      && hostaddrs[i] == *(void **)hostaddrs[i + 1])
+	    {
+	      kinds[i+1] = kinds[i];
+	      sizes[i+1] = sizeof (void *);
+	    }
+
+	  /* Invalidate the entry.  */
+	  hostaddrs[i] = NULL;
+	}
+    }
 }

 static void goacc_wait (int async, int num_waits, va_list *ap);


-/* Launch a possibly offloaded function on DEVICE.  FN is the host fn
+/* Launch a possibly offloaded function with FLAGS.  FN is the host fn
    address.  MAPNUM, HOSTADDRS, SIZES & KINDS  describe the memory
    blocks to be copied to/from the device.  Varadic arguments are
    keyed optional parameters terminated with a zero.  */

 void
-GOACC_parallel_keyed (int device, void (*fn) (void *),
+GOACC_parallel_keyed (int flags_m, void (*fn) (void *),
 		      size_t mapnum, void **hostaddrs, size_t *sizes,
 		      unsigned short *kinds, ...)
 {
-  bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
+  int flags = GOACC_FLAGS_UNMARSHAL (flags_m);
+
   va_list ap;
   struct goacc_thread *thr;
   struct gomp_device_descr *acc_dev;
@@ -88,9 +154,11 @@ GOACC_parallel_keyed (int device, void (
   thr = goacc_thread ();
   acc_dev = thr->dev;

+  handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
+
   /* Host fallback if "if" clause is false or if the current device is set to
      the host.  */
-  if (host_fallback)
+  if (flags & GOACC_FLAG_HOST_FALLBACK)
     {
       goacc_save_and_set_bind (acc_device_host);
       fn (hostaddrs);
@@ -140,9 +208,7 @@ GOACC_parallel_keyed (int device, void (
 	case GOMP_LAUNCH_WAIT:
 	  {
 	    unsigned num_waits = GOMP_LAUNCH_OP (tag);
-
-	    if (num_waits)
-	      goacc_wait (async, num_waits, &ap);
+	    goacc_wait (async, num_waits, &ap);
 	    break;
 	  }

@@ -177,16 +243,36 @@ GOACC_parallel_keyed (int device, void (
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
     devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start
-			    + tgt->list[i].key->tgt_offset);
+			    + tgt->list[i].key->tgt_offset
+			    + tgt->list[i].offset);

   acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs,
 			      async, dims, tgt);

   /* If running synchronously, unmap immediately.  */
-  if (async < acc_async_noval)
+  bool copyfrom = true;
+  if (async_synchronous_p (async))
     gomp_unmap_vars (tgt, true);
   else
-    tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
+    {
+      bool async_unmap = false;
+      for (size_t i = 0; i < tgt->list_count; i++)
+	{
+	  splay_tree_key k = tgt->list[i].key;
+	  if (k && k->refcount == 1)
+	    {
+	      async_unmap = true;
+	      break;
+	    }
+	}
+      if (async_unmap)
+	tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
+      else
+	{
+	  copyfrom = false;
+	  gomp_unmap_vars (tgt, copyfrom);
+	}
+    }

   acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
@@ -194,7 +280,7 @@ GOACC_parallel_keyed (int device, void (
 /* Legacy entry point, only provide host execution.  */

 void
-GOACC_parallel (int device, void (*fn) (void *),
+GOACC_parallel (int flags_m, void (*fn) (void *),
 		size_t mapnum, void **hostaddrs, size_t *sizes,
 		unsigned short *kinds,
 		int num_gangs, int num_workers, int vector_length,
@@ -206,10 +292,11 @@ GOACC_parallel (int device, void (*fn) (
 }

 void
-GOACC_data_start (int device, size_t mapnum,
+GOACC_data_start (int flags_m, size_t mapnum,
 		  void **hostaddrs, size_t *sizes, unsigned short *kinds)
 {
-  bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
+  int flags = GOACC_FLAGS_UNMARSHAL (flags_m);
+
   struct target_mem_desc *tgt;

 #ifdef HAVE_INTTYPES_H
@@ -227,7 +314,7 @@ GOACC_data_start (int device, size_t map

   /* Host fallback or 'do nothing'.  */
   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-      || host_fallback)
+      || (flags & GOACC_FLAG_HOST_FALLBACK))
     {
       tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true,
 			   GOMP_MAP_VARS_OPENACC);
@@ -258,13 +345,14 @@ GOACC_data_end (void)
 }

 void
-GOACC_enter_exit_data (int device, size_t mapnum,
+GOACC_enter_exit_data (int flags_m, size_t mapnum,
 		       void **hostaddrs, size_t *sizes, unsigned short *kinds,
 		       int async, int num_waits, ...)
 {
+  int flags = GOACC_FLAGS_UNMARSHAL (flags_m);
+
   struct goacc_thread *thr;
   struct gomp_device_descr *acc_dev;
-  bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
   bool data_enter = false;
   size_t i;

@@ -274,7 +362,7 @@ GOACC_enter_exit_data (int device, size_
   acc_dev = thr->dev;

   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-      || host_fallback)
+      || (flags & GOACC_FLAG_HOST_FALLBACK))
     return;

   if (num_waits)
@@ -286,6 +374,17 @@ GOACC_enter_exit_data (int device, size_
       va_end (ap);
     }

+  /* Determine whether "finalize" semantics apply to all mappings of this
+     OpenACC directive.  */
+  bool finalize = false;
+  if (mapnum > 0)
+    {
+      unsigned char kind = kinds[0] & 0xff;
+      if (kind == GOMP_MAP_DELETE
+	  || kind == GOMP_MAP_FORCE_FROM)
+	finalize = true;
+    }
+
   acc_dev->openacc.async_set_async_func (async);

   /* Determine if this is an "acc enter data".  */
@@ -298,13 +397,17 @@ GOACC_enter_exit_data (int device, size_

       if (kind == GOMP_MAP_FORCE_ALLOC
 	  || kind == GOMP_MAP_FORCE_PRESENT
-	  || kind == GOMP_MAP_FORCE_TO)
+	  || kind == GOMP_MAP_FORCE_TO
+	  || kind == GOMP_MAP_TO
+	  || kind == GOMP_MAP_ALLOC)
 	{
 	  data_enter = true;
 	  break;
 	}

-      if (kind == GOMP_MAP_DELETE
+      if (kind == GOMP_MAP_RELEASE
+	  || kind == GOMP_MAP_DELETE
+	  || kind == GOMP_MAP_FROM
 	  || kind == GOMP_MAP_FORCE_FROM)
 	break;

@@ -312,31 +415,35 @@ GOACC_enter_exit_data (int device, size_
 		      kind);
     }

+  /* In c, non-pointers and arrays are represented by a single data clause.
+     Dynamically allocated arrays and subarrays are represented by a data
+     clause followed by an internal GOMP_MAP_POINTER.
+
+     In fortran, scalars and not allocated arrays are represented by a
+     single data clause. Allocated arrays and subarrays have three mappings:
+     1) the original data clause, 2) a PSET 3) a pointer to the array data.
+  */
+
   if (data_enter)
     {
       for (i = 0; i < mapnum; i++)
 	{
 	  unsigned char kind = kinds[i] & 0xff;

-	  /* Scan for PSETs.  */
-	  int psets = find_pset (i, mapnum, kinds);
+	  /* Scan for pointers and PSETs.  */
+	  int pointer = find_pointer (i, mapnum, kinds);

-	  if (!psets)
+	  if (!pointer)
 	    {
 	      switch (kind)
 		{
-		case GOMP_MAP_POINTER:
-		  gomp_acc_insert_pointer (1, &hostaddrs[i], &sizes[i],
-					&kinds[i]);
-		  break;
+		case GOMP_MAP_ALLOC:
 		case GOMP_MAP_FORCE_ALLOC:
 		  acc_create (hostaddrs[i], sizes[i]);
 		  break;
-		case GOMP_MAP_FORCE_PRESENT:
-		  acc_present_or_copyin (hostaddrs[i], sizes[i]);
-		  break;
+		case GOMP_MAP_TO:
 		case GOMP_MAP_FORCE_TO:
-		  acc_present_or_copyin (hostaddrs[i], sizes[i]);
+		  acc_copyin (hostaddrs[i], sizes[i]);
 		  break;
 		default:
 		  gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -346,12 +453,13 @@ GOACC_enter_exit_data (int device, size_
 	    }
 	  else
 	    {
-	      gomp_acc_insert_pointer (3, &hostaddrs[i], &sizes[i], &kinds[i]);
+	      gomp_acc_insert_pointer (pointer, &hostaddrs[i],
+				       &sizes[i], &kinds[i]);
 	      /* Increment 'i' by two because OpenACC requires fortran
 		 arrays to be contiguous, so each PSET is associated with
 		 one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
 		 one MAP_POINTER.  */
-	      i += 2;
+	      i += pointer - 1;
 	    }
 	}
     }
@@ -360,22 +468,28 @@ GOACC_enter_exit_data (int device, size_
       {
 	unsigned char kind = kinds[i] & 0xff;

-	int psets = find_pset (i, mapnum, kinds);
+	int pointer = find_pointer (i, mapnum, kinds);

-	if (!psets)
+	if (!pointer)
 	  {
 	    switch (kind)
 	      {
-	      case GOMP_MAP_POINTER:
-		gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff)
-					 == GOMP_MAP_FORCE_FROM,
-					 async, 1);
-		break;
+	      case GOMP_MAP_RELEASE:
 	      case GOMP_MAP_DELETE:
-		acc_delete (hostaddrs[i], sizes[i]);
+		if (acc_is_present (hostaddrs[i], sizes[i]))
+		  {
+		    if (finalize)
+		      acc_delete_finalize (hostaddrs[i], sizes[i]);
+		    else
+		      acc_delete (hostaddrs[i], sizes[i]);
+		  }
 		break;
+	      case GOMP_MAP_FROM:
 	      case GOMP_MAP_FORCE_FROM:
-		acc_copyout (hostaddrs[i], sizes[i]);
+		if (finalize)
+		  acc_copyout_finalize (hostaddrs[i], sizes[i]);
+		else
+		  acc_copyout (hostaddrs[i], sizes[i]);
 		break;
 	      default:
 		gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x",
@@ -385,10 +499,12 @@ GOACC_enter_exit_data (int device, size_
 	  }
 	else
 	  {
-	    gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff)
-				     == GOMP_MAP_FORCE_FROM, async, 3);
+	    bool copyfrom = (kind == GOMP_MAP_FORCE_FROM
+			     || kind == GOMP_MAP_FROM);
+	    gomp_acc_remove_pointer (hostaddrs[i], sizes[i], copyfrom, async,
+				     finalize, pointer);
 	    /* See the above comment.  */
-	    i += 2;
+	    i += pointer - 1;
 	  }
       }

@@ -398,13 +514,20 @@ GOACC_enter_exit_data (int device, size_
 static void
 goacc_wait (int async, int num_waits, va_list *ap)
 {
-  struct goacc_thread *thr = goacc_thread ();
-  struct gomp_device_descr *acc_dev = thr->dev;
-
   while (num_waits--)
     {
       int qid = va_arg (*ap, int);
-
+
+      /* Waiting on ACC_ASYNC_NOVAL maps to 'wait all'.  */
+      if (qid == acc_async_noval)
+	{
+	  if (async == acc_async_sync)
+	    acc_wait_all ();
+	  else
+	    acc_wait_all_async (async);
+	  break;
+	}
+
       if (acc_async_test (qid))
 	continue;

@@ -415,16 +538,17 @@ goacc_wait (int async, int num_waits, va
 	    launching on, the queue itself will order work as
 	    required, so there's no need to wait explicitly.  */
       else
-	acc_dev->openacc.async_wait_async_func (qid, async);
+	acc_wait_async (qid, async);
     }
 }

 void
-GOACC_update (int device, size_t mapnum,
+GOACC_update (int flags_m, size_t mapnum,
 	      void **hostaddrs, size_t *sizes, unsigned short *kinds,
 	      int async, int num_waits, ...)
 {
-  bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK;
+  int flags = GOACC_FLAGS_UNMARSHAL (flags_m);
+
   size_t i;

   goacc_lazy_initialize ();
@@ -433,7 +557,7 @@ GOACC_update (int device, size_t mapnum,
   struct gomp_device_descr *acc_dev = thr->dev;

   if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-      || host_fallback)
+      || (flags & GOACC_FLAG_HOST_FALLBACK))
     return;

   if (num_waits)
@@ -447,6 +571,7 @@ GOACC_update (int device, size_t mapnum,

   acc_dev->openacc.async_set_async_func (async);

+  bool update_device = false;
   for (i = 0; i < mapnum; ++i)
     {
       unsigned char kind = kinds[i] & 0xff;
@@ -457,11 +582,46 @@ GOACC_update (int device, size_t mapnum,
 	case GOMP_MAP_TO_PSET:
 	  break;

+	case GOMP_MAP_ALWAYS_POINTER:
+	  if (update_device)
+	    {
+	      /* Save the contents of the host pointer.  */
+	      void *dptr = acc_deviceptr (hostaddrs[i-1]);
+	      uintptr_t t = *(uintptr_t *) hostaddrs[i];
+
+	      /* Update the contents of the host pointer to reflect
+		 the value of the allocated device memory in the
+		 previous pointer.  */
+	      *(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr;
+	      acc_update_device (hostaddrs[i], sizeof (uintptr_t));
+
+	      /* Restore the host pointer.  */
+	      *(uintptr_t *) hostaddrs[i] = t;
+	      update_device = false;
+	    }
+	  break;
+
+	case GOMP_MAP_TO:
+	  if (!acc_is_present (hostaddrs[i], sizes[i]))
+	    {
+	      update_device = false;
+	      break;
+	    }
+	  /* Fallthru  */
 	case GOMP_MAP_FORCE_TO:
+	  update_device = true;
 	  acc_update_device (hostaddrs[i], sizes[i]);
 	  break;

+	case GOMP_MAP_FROM:
+	  if (!acc_is_present (hostaddrs[i], sizes[i]))
+	    {
+	      update_device = false;
+	      break;
+	    }
+	  /* Fallthru  */
 	case GOMP_MAP_FORCE_FROM:
+	  update_device = false;
 	  acc_update_self (hostaddrs[i], sizes[i]);
 	  break;

@@ -487,8 +647,8 @@ GOACC_wait (int async, int num_waits, ..
     }
   else if (async == acc_async_sync)
     acc_wait_all ();
-  else if (async == acc_async_noval)
-    goacc_thread ()->dev->openacc.async_wait_all_async_func (acc_async_noval);
+  else
+    acc_wait_all_async (async);
 }

 int
@@ -504,7 +664,7 @@ GOACC_get_thread_num (void)
 }

 void
-GOACC_declare (int device, size_t mapnum,
+GOACC_declare (int flags_m, size_t mapnum,
 	       void **hostaddrs, size_t *sizes, unsigned short *kinds)
 {
   int i;
@@ -522,9 +682,10 @@ GOACC_declare (int device, size_t mapnum
 	  case GOMP_MAP_FORCE_FROM:
 	  case GOMP_MAP_FORCE_TO:
 	  case GOMP_MAP_POINTER:
+	  case GOMP_MAP_RELEASE:
 	  case GOMP_MAP_DELETE:
-	    GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
-				   &kinds[i], 0, 0);
+	    GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i],
+				   &kinds[i], GOMP_ASYNC_SYNC, 0);
 	    break;

 	  case GOMP_MAP_FORCE_DEVICEPTR:
@@ -532,20 +693,19 @@ GOACC_declare (int device, size_t mapnum

 	  case GOMP_MAP_ALLOC:
 	    if (!acc_is_present (hostaddrs[i], sizes[i]))
-	      GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
-				     &kinds[i], 0, 0);
+	      GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i],
+				     &kinds[i], GOMP_ASYNC_SYNC, 0);
 	    break;

 	  case GOMP_MAP_TO:
-	    GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
-				   &kinds[i], 0, 0);
+	    GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i],
+				   &kinds[i], GOMP_ASYNC_SYNC, 0);

 	    break;

 	  case GOMP_MAP_FROM:
-	    kinds[i] = GOMP_MAP_FORCE_FROM;
-	    GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i],
-				   &kinds[i], 0, 0);
+	    GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i],
+				   &kinds[i], GOMP_ASYNC_SYNC, 0);
 	    break;

 	  case GOMP_MAP_FORCE_PRESENT:
--- libgomp/openacc2.f90.jj	2019-05-07 19:54:18.828514375 +0200
+++ libgomp/openacc2.f90	2019-05-07 19:56:38.454296347 +0200
@@ -0,0 +1,1502 @@
+!  OpenACC Runtime Library Definitions.
+
+!  Copyright (C) 2014-2019 Free Software Foundation, Inc.
+
+!  Contributed by Tobias Burnus <burnus@net-b.de>
+!              and Mentor Embedded.
+
+!  This file is part of the GNU Offloading and Multi Processing Library
+!  (libgomp).
+
+!  Libgomp is free software; you can redistribute it and/or modify it
+!  under the terms of the GNU General Public License as published by
+!  the Free Software Foundation; either version 3, or (at your option)
+!  any later version.
+
+!  Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+!  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+!  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+!  more details.
+
+!  Under Section 7 of GPL version 3, you are granted additional
+!  permissions described in the GCC Runtime Library Exception, version
+!  3.1, as published by the Free Software Foundation.
+
+!  You should have received a copy of the GNU General Public License and
+!  a copy of the GCC Runtime Library Exception along with this program;
+!  see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+!  <http://www.gnu.org/licenses/>.
+
+module openacc_kinds2
+  use iso_fortran_env, only: int32
+  implicit none
+
+  private :: int32
+  public :: acc_device_kind
+
+  integer, parameter :: acc_device_kind = int32
+
+  public :: acc_device_none, acc_device_default, acc_device_host
+  public :: acc_device_not_host, acc_device_nvidia
+
+  ! Keep in sync with include/gomp-constants.h.
+  integer (acc_device_kind), parameter :: acc_device_none = 0
+  integer (acc_device_kind), parameter :: acc_device_default = 1
+  integer (acc_device_kind), parameter :: acc_device_host = 2
+  ! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed.
+  integer (acc_device_kind), parameter :: acc_device_not_host = 4
+  integer (acc_device_kind), parameter :: acc_device_nvidia = 5
+
+  public :: acc_handle_kind
+
+  integer, parameter :: acc_handle_kind = int32
+
+  public :: acc_async_noval, acc_async_sync
+
+  ! Keep in sync with include/gomp-constants.h.
+  integer (acc_handle_kind), parameter :: acc_async_noval = -1
+  integer (acc_handle_kind), parameter :: acc_async_sync = -2
+
+end module
+
+module openacc_internal2
+  use openacc_kinds2
+  implicit none
+
+  interface
+    function acc_get_num_devices_h (d)
+      import
+      integer acc_get_num_devices_h
+      integer (acc_device_kind) d
+    end function
+
+    subroutine acc_set_device_type_h (d)
+      import
+      integer (acc_device_kind) d
+    end subroutine
+
+    function acc_get_device_type_h ()
+      import
+      integer (acc_device_kind) acc_get_device_type_h
+    end function
+
+    subroutine acc_set_device_num_h (n, d)
+      import
+      integer n
+      integer (acc_device_kind) d
+    end subroutine
+
+    function acc_get_device_num_h (d)
+      import
+      integer acc_get_device_num_h
+      integer (acc_device_kind) d
+    end function
+
+    function acc_async_test_h (a)
+      logical acc_async_test_h
+      integer a
+    end function
+
+    function acc_async_test_all_h ()
+      logical acc_async_test_all_h
+    end function
+
+    subroutine acc_wait_h (a)
+      integer a
+    end subroutine
+
+    subroutine acc_wait_async_h (a1, a2)
+      integer a1, a2
+    end subroutine
+
+    subroutine acc_wait_all_h ()
+    end subroutine
+
+    subroutine acc_wait_all_async_h (a)
+      integer a
+    end subroutine
+
+    subroutine acc_init_h (d)
+      import
+      integer (acc_device_kind) d
+    end subroutine
+
+    subroutine acc_shutdown_h (d)
+      import
+      integer (acc_device_kind) d
+    end subroutine
+
+    function acc_on_device_h (d)
+      import
+      integer (acc_device_kind) d
+      logical acc_on_device_h
+    end function
+
+    subroutine acc_copyin_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_copyin_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_copyin_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
+    subroutine acc_present_or_copyin_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_present_or_copyin_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_present_or_copyin_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
+    subroutine acc_create_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_create_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_create_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
+    subroutine acc_present_or_create_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_present_or_create_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_present_or_create_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
+    subroutine acc_copyout_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_copyout_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_copyout_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
+    subroutine acc_copyout_finalize_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_copyout_finalize_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_copyout_finalize_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
+    subroutine acc_delete_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_delete_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_delete_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
+    subroutine acc_delete_finalize_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_delete_finalize_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_delete_finalize_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
+    subroutine acc_update_device_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_update_device_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_update_device_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
+    subroutine acc_update_self_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end subroutine
+
+    subroutine acc_update_self_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end subroutine
+
+    subroutine acc_update_self_array_h (a)
+      type (*), dimension (..), contiguous :: a
+    end subroutine
+
+    function acc_is_present_32_h (a, len)
+      use iso_c_binding, only: c_int32_t
+      logical acc_is_present_32_h
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+    end function
+
+    function acc_is_present_64_h (a, len)
+      use iso_c_binding, only: c_int64_t
+      logical acc_is_present_64_h
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+    end function
+
+    function acc_is_present_array_h (a)
+      logical acc_is_present_array_h
+      type (*), dimension (..), contiguous :: a
+    end function
+
+    subroutine acc_copyin_async_32_h (a, len, async)
+      use iso_c_binding, only: c_int32_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_copyin_async_64_h (a, len, async)
+      use iso_c_binding, only: c_int64_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_copyin_async_array_h (a, async)
+      use openacc_kinds2, only: acc_handle_kind
+      type (*), dimension (..), contiguous :: a
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_create_async_32_h (a, len, async)
+      use iso_c_binding, only: c_int32_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_create_async_64_h (a, len, async)
+      use iso_c_binding, only: c_int64_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_create_async_array_h (a, async)
+      use openacc_kinds2, only: acc_handle_kind
+      type (*), dimension (..), contiguous :: a
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_copyout_async_32_h (a, len, async)
+      use iso_c_binding, only: c_int32_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_copyout_async_64_h (a, len, async)
+      use iso_c_binding, only: c_int64_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_copyout_async_array_h (a, async)
+      use openacc_kinds2, only: acc_handle_kind
+      type (*), dimension (..), contiguous :: a
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_delete_async_32_h (a, len, async)
+      use iso_c_binding, only: c_int32_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_delete_async_64_h (a, len, async)
+      use iso_c_binding, only: c_int64_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_delete_async_array_h (a, async)
+      use openacc_kinds2, only: acc_handle_kind
+      type (*), dimension (..), contiguous :: a
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_update_device_async_32_h (a, len, async)
+      use iso_c_binding, only: c_int32_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_update_device_async_64_h (a, len, async)
+      use iso_c_binding, only: c_int64_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_update_device_async_array_h (a, async)
+      use openacc_kinds2, only: acc_handle_kind
+      type (*), dimension (..), contiguous :: a
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_update_self_async_32_h (a, len, async)
+      use iso_c_binding, only: c_int32_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int32_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_update_self_async_64_h (a, len, async)
+      use iso_c_binding, only: c_int64_t
+      use openacc_kinds2, only: acc_handle_kind
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_int64_t) len
+      integer (acc_handle_kind) async
+    end subroutine
+
+    subroutine acc_update_self_async_array_h (a, async)
+      use openacc_kinds2, only: acc_handle_kind
+      type (*), dimension (..), contiguous :: a
+      integer (acc_handle_kind) async
+    end subroutine
+  end interface
+
+  interface
+    function acc_get_num_devices_l (d) &
+        bind (C, name = "acc_get_num_devices")
+      use iso_c_binding, only: c_int
+      integer (c_int) :: acc_get_num_devices_l
+      integer (c_int), value :: d
+    end function
+
+    subroutine acc_set_device_type_l (d) &
+        bind (C, name = "acc_set_device_type")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: d
+    end subroutine
+
+    function acc_get_device_type_l () &
+        bind (C, name = "acc_get_device_type")
+      use iso_c_binding, only: c_int
+      integer (c_int) :: acc_get_device_type_l
+    end function
+
+    subroutine acc_set_device_num_l (n, d) &
+        bind (C, name = "acc_set_device_num")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: n, d
+    end subroutine
+
+    function acc_get_device_num_l (d) &
+        bind (C, name = "acc_get_device_num")
+      use iso_c_binding, only: c_int
+      integer (c_int) :: acc_get_device_num_l
+      integer (c_int), value :: d
+    end function
+
+    function acc_async_test_l (a) &
+        bind (C, name = "acc_async_test")
+      use iso_c_binding, only: c_int
+      integer (c_int) :: acc_async_test_l
+      integer (c_int), value :: a
+    end function
+
+    function acc_async_test_all_l () &
+        bind (C, name = "acc_async_test_all")
+      use iso_c_binding, only: c_int
+      integer (c_int) :: acc_async_test_all_l
+    end function
+
+    subroutine acc_wait_l (a) &
+        bind (C, name = "acc_wait")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: a
+    end subroutine
+
+    subroutine acc_wait_async_l (a1, a2) &
+        bind (C, name = "acc_wait_async")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: a1, a2
+    end subroutine
+
+    subroutine acc_wait_all_l () &
+        bind (C, name = "acc_wait_all")
+      use iso_c_binding, only: c_int
+    end subroutine
+
+    subroutine acc_wait_all_async_l (a) &
+        bind (C, name = "acc_wait_all_async")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: a
+    end subroutine
+
+    subroutine acc_init_l (d) &
+        bind (C, name = "acc_init")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: d
+    end subroutine
+
+    subroutine acc_shutdown_l (d) &
+        bind (C, name = "acc_shutdown")
+      use iso_c_binding, only: c_int
+      integer (c_int), value :: d
+    end subroutine
+
+    function acc_on_device_l (d) &
+        bind (C, name = "acc_on_device")
+      use iso_c_binding, only: c_int
+      integer (c_int) :: acc_on_device_l
+      integer (c_int), value :: d
+    end function
+
+    subroutine acc_copyin_l (a, len) &
+        bind (C, name = "acc_copyin")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_present_or_copyin_l (a, len) &
+        bind (C, name = "acc_present_or_copyin")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_create_l (a, len) &
+        bind (C, name = "acc_create")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_present_or_create_l (a, len) &
+        bind (C, name = "acc_present_or_create")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_copyout_l (a, len) &
+        bind (C, name = "acc_copyout")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_copyout_finalize_l (a, len) &
+        bind (C, name = "acc_copyout_finalize")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_delete_l (a, len) &
+        bind (C, name = "acc_delete")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_delete_finalize_l (a, len) &
+        bind (C, name = "acc_delete_finalize")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_update_device_l (a, len) &
+        bind (C, name = "acc_update_device")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    subroutine acc_update_self_l (a, len) &
+        bind (C, name = "acc_update_self")
+      use iso_c_binding, only: c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end subroutine
+
+    function acc_is_present_l (a, len) &
+        bind (C, name = "acc_is_present")
+      use iso_c_binding, only: c_int32_t, c_size_t
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      integer (c_int32_t) :: acc_is_present_l
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+    end function
+
+    subroutine acc_copyin_async_l (a, len, async) &
+        bind (C, name = "acc_copyin_async")
+      use iso_c_binding, only: c_size_t, c_int
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+      integer (c_int), value :: async
+    end subroutine
+
+    subroutine acc_create_async_l (a, len, async) &
+        bind (C, name = "acc_create_async")
+      use iso_c_binding, only: c_size_t, c_int
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+      integer (c_int), value :: async
+    end subroutine
+
+    subroutine acc_copyout_async_l (a, len, async) &
+        bind (C, name = "acc_copyout_async")
+      use iso_c_binding, only: c_size_t, c_int
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+      integer (c_int), value :: async
+    end subroutine
+
+    subroutine acc_delete_async_l (a, len, async) &
+        bind (C, name = "acc_delete_async")
+      use iso_c_binding, only: c_size_t, c_int
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+      integer (c_int), value :: async
+    end subroutine
+
+    subroutine acc_update_device_async_l (a, len, async) &
+        bind (C, name = "acc_update_device_async")
+      use iso_c_binding, only: c_size_t, c_int
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+      integer (c_int), value :: async
+    end subroutine
+
+    subroutine acc_update_self_async_l (a, len, async) &
+        bind (C, name = "acc_update_self_async")
+      use iso_c_binding, only: c_size_t, c_int
+      !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+      type (*), dimension (*) :: a
+      integer (c_size_t), value :: len
+      integer (c_int), value :: async
+    end subroutine
+  end interface
+end module
+
+module openacc2
+  use openacc_kinds2
+  use openacc_internal2
+  implicit none
+
+  public :: openacc_version
+
+  public :: acc_get_num_devices, acc_set_device_type, acc_get_device_type
+  public :: acc_set_device_num, acc_get_device_num, acc_async_test
+  public :: acc_async_test_all
+  public :: acc_wait, acc_async_wait, acc_wait_async
+  public :: acc_wait_all, acc_async_wait_all, acc_wait_all_async
+  public :: acc_init, acc_shutdown, acc_on_device
+  public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create
+  public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete
+  public :: acc_update_device, acc_update_self, acc_is_present
+  public :: acc_copyin_async, acc_create_async, acc_copyout_async
+  public :: acc_delete_async, acc_update_device_async, acc_update_self_async
+
+  integer, parameter :: openacc_version = 201306
+
+  interface acc_get_num_devices
+    procedure :: acc_get_num_devices_h
+  end interface
+
+  interface acc_set_device_type
+    procedure :: acc_set_device_type_h
+  end interface
+
+  interface acc_get_device_type
+    procedure :: acc_get_device_type_h
+  end interface
+
+  interface acc_set_device_num
+    procedure :: acc_set_device_num_h
+  end interface
+
+  interface acc_get_device_num
+    procedure :: acc_get_device_num_h
+  end interface
+
+  interface acc_async_test
+    procedure :: acc_async_test_h
+  end interface
+
+  interface acc_async_test_all
+    procedure :: acc_async_test_all_h
+  end interface
+
+  interface acc_wait
+    procedure :: acc_wait_h
+  end interface
+
+  ! acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait.
+  interface acc_async_wait
+    procedure :: acc_wait_h
+  end interface
+
+  interface acc_wait_async
+    procedure :: acc_wait_async_h
+  end interface
+
+  interface acc_wait_all
+    procedure :: acc_wait_all_h
+  end interface
+
+  ! acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all.
+  interface acc_async_wait_all
+    procedure :: acc_wait_all_h
+  end interface
+
+  interface acc_wait_all_async
+    procedure :: acc_wait_all_async_h
+  end interface
+
+  interface acc_init
+    procedure :: acc_init_h
+  end interface
+
+  interface acc_shutdown
+    procedure :: acc_shutdown_h
+  end interface
+
+  interface acc_on_device
+    procedure :: acc_on_device_h
+  end interface
+
+  ! acc_malloc: Only available in C/C++
+  ! acc_free: Only available in C/C++
+
+  ! As vendor extension, the following code supports both 32bit and 64bit
+  ! arguments for "size"; the OpenACC standard only permits default-kind
+  ! integers, which are of kind 4 (i.e. 32 bits).
+  ! Additionally, the two-argument version also takes arrays as argument.
+  ! and the one argument version also scalars. Note that the code assumes
+  ! that the arrays are contiguous.
+
+  interface acc_copyin
+    procedure :: acc_copyin_32_h
+    procedure :: acc_copyin_64_h
+    procedure :: acc_copyin_array_h
+  end interface
+
+  interface acc_present_or_copyin
+    procedure :: acc_present_or_copyin_32_h
+    procedure :: acc_present_or_copyin_64_h
+    procedure :: acc_present_or_copyin_array_h
+  end interface
+
+  interface acc_pcopyin
+    procedure :: acc_present_or_copyin_32_h
+    procedure :: acc_present_or_copyin_64_h
+    procedure :: acc_present_or_copyin_array_h
+  end interface
+
+  interface acc_create
+    procedure :: acc_create_32_h
+    procedure :: acc_create_64_h
+    procedure :: acc_create_array_h
+  end interface
+
+  interface acc_present_or_create
+    procedure :: acc_present_or_create_32_h
+    procedure :: acc_present_or_create_64_h
+    procedure :: acc_present_or_create_array_h
+  end interface
+
+  interface acc_pcreate
+    procedure :: acc_present_or_create_32_h
+    procedure :: acc_present_or_create_64_h
+    procedure :: acc_present_or_create_array_h
+  end interface
+
+  interface acc_copyout
+    procedure :: acc_copyout_32_h
+    procedure :: acc_copyout_64_h
+    procedure :: acc_copyout_array_h
+  end interface
+
+  interface acc_copyout_finalize
+    procedure :: acc_copyout_finalize_32_h
+    procedure :: acc_copyout_finalize_64_h
+    procedure :: acc_copyout_finalize_array_h
+  end interface
+
+  interface acc_delete
+    procedure :: acc_delete_32_h
+    procedure :: acc_delete_64_h
+    procedure :: acc_delete_array_h
+  end interface
+
+  interface acc_delete_finalize
+    procedure :: acc_delete_finalize_32_h
+    procedure :: acc_delete_finalize_64_h
+    procedure :: acc_delete_finalize_array_h
+  end interface
+
+  interface acc_update_device
+    procedure :: acc_update_device_32_h
+    procedure :: acc_update_device_64_h
+    procedure :: acc_update_device_array_h
+  end interface
+
+  interface acc_update_self
+    procedure :: acc_update_self_32_h
+    procedure :: acc_update_self_64_h
+    procedure :: acc_update_self_array_h
+  end interface
+
+  ! acc_map_data: Only available in C/C++
+  ! acc_unmap_data: Only available in C/C++
+  ! acc_deviceptr: Only available in C/C++
+  ! acc_hostptr: Only available in C/C++
+
+  interface acc_is_present
+    procedure :: acc_is_present_32_h
+    procedure :: acc_is_present_64_h
+    procedure :: acc_is_present_array_h
+  end interface
+
+  ! acc_memcpy_to_device: Only available in C/C++
+  ! acc_memcpy_from_device: Only available in C/C++
+
+  interface acc_copyin_async
+    procedure :: acc_copyin_async_32_h
+    procedure :: acc_copyin_async_64_h
+    procedure :: acc_copyin_async_array_h
+  end interface
+
+  interface acc_create_async
+    procedure :: acc_create_async_32_h
+    procedure :: acc_create_async_64_h
+    procedure :: acc_create_async_array_h
+  end interface
+
+  interface acc_copyout_async
+    procedure :: acc_copyout_async_32_h
+    procedure :: acc_copyout_async_64_h
+    procedure :: acc_copyout_async_array_h
+  end interface
+
+  interface acc_delete_async
+    procedure :: acc_delete_async_32_h
+    procedure :: acc_delete_async_64_h
+    procedure :: acc_delete_async_array_h
+  end interface
+
+  interface acc_update_device_async
+    procedure :: acc_update_device_async_32_h
+    procedure :: acc_update_device_async_64_h
+    procedure :: acc_update_device_async_array_h
+  end interface
+
+  interface acc_update_self_async
+    procedure :: acc_update_self_async_32_h
+    procedure :: acc_update_self_async_64_h
+    procedure :: acc_update_self_async_array_h
+  end interface
+
+end module
+
+function acc_get_num_devices_h (d)
+  use openacc_internal2, only: acc_get_num_devices_l
+  use openacc_kinds2
+  integer acc_get_num_devices_h
+  integer (acc_device_kind) d
+  acc_get_num_devices_h = acc_get_num_devices_l (d)
+end function
+
+subroutine acc_set_device_type_h (d)
+  use openacc_internal2, only: acc_set_device_type_l
+  use openacc_kinds2
+  integer (acc_device_kind) d
+  call acc_set_device_type_l (d)
+end subroutine
+
+function acc_get_device_type_h ()
+  use openacc_internal2, only: acc_get_device_type_l
+  use openacc_kinds2
+  integer (acc_device_kind) acc_get_device_type_h
+  acc_get_device_type_h = acc_get_device_type_l ()
+end function
+
+subroutine acc_set_device_num_h (n, d)
+  use openacc_internal2, only: acc_set_device_num_l
+  use openacc_kinds2
+  integer n
+  integer (acc_device_kind) d
+  call acc_set_device_num_l (n, d)
+end subroutine
+
+function acc_get_device_num_h (d)
+  use openacc_internal2, only: acc_get_device_num_l
+  use openacc_kinds2
+  integer acc_get_device_num_h
+  integer (acc_device_kind) d
+  acc_get_device_num_h = acc_get_device_num_l (d)
+end function
+
+function acc_async_test_h (a)
+  use openacc_internal2, only: acc_async_test_l
+  logical acc_async_test_h
+  integer a
+  if (acc_async_test_l (a) .eq. 1) then
+    acc_async_test_h = .TRUE.
+  else
+    acc_async_test_h = .FALSE.
+  end if
+end function
+
+function acc_async_test_all_h ()
+  use openacc_internal2, only: acc_async_test_all_l
+  logical acc_async_test_all_h
+  if (acc_async_test_all_l () .eq. 1) then
+    acc_async_test_all_h = .TRUE.
+  else
+    acc_async_test_all_h = .FALSE.
+  end if
+end function
+
+subroutine acc_wait_h (a)
+  use openacc_internal2, only: acc_wait_l
+  integer a
+  call acc_wait_l (a)
+end subroutine
+
+subroutine acc_wait_async_h (a1, a2)
+  use openacc_internal2, only: acc_wait_async_l
+  integer a1, a2
+  call acc_wait_async_l (a1, a2)
+end subroutine
+
+subroutine acc_wait_all_h ()
+  use openacc_internal2, only: acc_wait_all_l
+  call acc_wait_all_l ()
+end subroutine
+
+subroutine acc_wait_all_async_h (a)
+  use openacc_internal2, only: acc_wait_all_async_l
+  integer a
+  call acc_wait_all_async_l (a)
+end subroutine
+
+subroutine acc_init_h (d)
+  use openacc_internal2, only: acc_init_l
+  use openacc_kinds2
+  integer (acc_device_kind) d
+  call acc_init_l (d)
+end subroutine
+
+subroutine acc_shutdown_h (d)
+  use openacc_internal2, only: acc_shutdown_l
+  use openacc_kinds2
+  integer (acc_device_kind) d
+  call acc_shutdown_l (d)
+end subroutine
+
+function acc_on_device_h (d)
+  use openacc_internal2, only: acc_on_device_l
+  use openacc_kinds2
+  integer (acc_device_kind) d
+  logical acc_on_device_h
+  if (acc_on_device_l (d) .eq. 1) then
+    acc_on_device_h = .TRUE.
+  else
+    acc_on_device_h = .FALSE.
+  end if
+end function
+
+subroutine acc_copyin_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal2, only: acc_copyin_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyin_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal2, only: acc_copyin_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyin_array_h (a)
+  use openacc_internal2, only: acc_copyin_l
+  type (*), dimension (..), contiguous :: a
+  call acc_copyin_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_present_or_copyin_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal2, only: acc_present_or_copyin_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_present_or_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_copyin_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal2, only: acc_present_or_copyin_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_present_or_copyin_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_copyin_array_h (a)
+  use openacc_internal2, only: acc_present_or_copyin_l
+  type (*), dimension (..), contiguous :: a
+  call acc_present_or_copyin_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_create_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal2, only: acc_create_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_create_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal2, only: acc_create_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_create_array_h (a)
+  use openacc_internal2, only: acc_create_l
+  type (*), dimension (..), contiguous :: a
+  call acc_create_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_present_or_create_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal2, only: acc_present_or_create_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_present_or_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_create_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal2, only: acc_present_or_create_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_present_or_create_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_present_or_create_array_h (a)
+  use openacc_internal2, only: acc_present_or_create_l
+  type (*), dimension (..), contiguous :: a
+  call acc_present_or_create_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_copyout_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal2, only: acc_copyout_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_copyout_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyout_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal2, only: acc_copyout_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_copyout_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyout_array_h (a)
+  use openacc_internal2, only: acc_copyout_l
+  type (*), dimension (..), contiguous :: a
+  call acc_copyout_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_copyout_finalize_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal2, only: acc_copyout_finalize_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_copyout_finalize_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyout_finalize_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal2, only: acc_copyout_finalize_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_copyout_finalize_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_copyout_finalize_array_h (a)
+  use openacc_internal2, only: acc_copyout_finalize_l
+  type (*), dimension (..), contiguous :: a
+  call acc_copyout_finalize_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_delete_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal2, only: acc_delete_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_delete_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_delete_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal2, only: acc_delete_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_delete_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_delete_array_h (a)
+  use openacc_internal2, only: acc_delete_l
+  type (*), dimension (..), contiguous :: a
+  call acc_delete_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_delete_finalize_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal2, only: acc_delete_finalize_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_delete_finalize_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_delete_finalize_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal2, only: acc_delete_finalize_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_delete_finalize_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_delete_finalize_array_h (a)
+  use openacc_internal2, only: acc_delete_finalize_l
+  type (*), dimension (..), contiguous :: a
+  call acc_delete_finalize_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_update_device_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal2, only: acc_update_device_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_update_device_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_device_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal2, only: acc_update_device_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_update_device_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_device_array_h (a)
+  use openacc_internal2, only: acc_update_device_l
+  type (*), dimension (..), contiguous :: a
+  call acc_update_device_l (a, sizeof (a))
+end subroutine
+
+subroutine acc_update_self_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal2, only: acc_update_self_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  call acc_update_self_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_self_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal2, only: acc_update_self_l
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  call acc_update_self_l (a, int (len, kind = c_size_t))
+end subroutine
+
+subroutine acc_update_self_array_h (a)
+  use openacc_internal2, only: acc_update_self_l
+  type (*), dimension (..), contiguous :: a
+  call acc_update_self_l (a, sizeof (a))
+end subroutine
+
+function acc_is_present_32_h (a, len)
+  use iso_c_binding, only: c_int32_t, c_size_t
+  use openacc_internal2, only: acc_is_present_l
+  logical acc_is_present_32_h
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
+    acc_is_present_32_h = .TRUE.
+  else
+    acc_is_present_32_h = .FALSE.
+  end if
+end function
+
+function acc_is_present_64_h (a, len)
+  use iso_c_binding, only: c_int64_t, c_size_t
+  use openacc_internal2, only: acc_is_present_l
+  logical acc_is_present_64_h
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then
+    acc_is_present_64_h = .TRUE.
+  else
+    acc_is_present_64_h = .FALSE.
+  end if
+end function
+
+function acc_is_present_array_h (a)
+  use openacc_internal2, only: acc_is_present_l
+  logical acc_is_present_array_h
+  type (*), dimension (..), contiguous :: a
+  acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1
+end function
+
+subroutine acc_copyin_async_32_h (a, len, async)
+  use iso_c_binding, only: c_int32_t, c_size_t, c_int
+  use openacc_internal2, only: acc_copyin_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  integer (acc_handle_kind) async
+  call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_copyin_async_64_h (a, len, async)
+  use iso_c_binding, only: c_int64_t, c_size_t, c_int
+  use openacc_internal2, only: acc_copyin_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  integer (acc_handle_kind) async
+  call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_copyin_async_array_h (a, async)
+  use iso_c_binding, only: c_int
+  use openacc_internal2, only: acc_copyin_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  type (*), dimension (..), contiguous :: a
+  integer (acc_handle_kind) async
+  call acc_copyin_async_l (a, sizeof (a), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_create_async_32_h (a, len, async)
+  use iso_c_binding, only: c_int32_t, c_size_t, c_int
+  use openacc_internal2, only: acc_create_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  integer (acc_handle_kind) async
+  call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_create_async_64_h (a, len, async)
+  use iso_c_binding, only: c_int64_t, c_size_t, c_int
+  use openacc_internal2, only: acc_create_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  integer (acc_handle_kind) async
+  call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_create_async_array_h (a, async)
+  use iso_c_binding, only: c_int
+  use openacc_internal2, only: acc_create_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  type (*), dimension (..), contiguous :: a
+  integer (acc_handle_kind) async
+  call acc_create_async_l (a, sizeof (a), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_copyout_async_32_h (a, len, async)
+  use iso_c_binding, only: c_int32_t, c_size_t, c_int
+  use openacc_internal2, only: acc_copyout_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  integer (acc_handle_kind) async
+  call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_copyout_async_64_h (a, len, async)
+  use iso_c_binding, only: c_int64_t, c_size_t, c_int
+  use openacc_internal2, only: acc_copyout_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  integer (acc_handle_kind) async
+  call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_copyout_async_array_h (a, async)
+  use iso_c_binding, only: c_int
+  use openacc_internal2, only: acc_copyout_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  type (*), dimension (..), contiguous :: a
+  integer (acc_handle_kind) async
+  call acc_copyout_async_l (a, sizeof (a), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_delete_async_32_h (a, len, async)
+  use iso_c_binding, only: c_int32_t, c_size_t, c_int
+  use openacc_internal2, only: acc_delete_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  integer (acc_handle_kind) async
+  call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_delete_async_64_h (a, len, async)
+  use iso_c_binding, only: c_int64_t, c_size_t, c_int
+  use openacc_internal2, only: acc_delete_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  integer (acc_handle_kind) async
+  call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_delete_async_array_h (a, async)
+  use iso_c_binding, only: c_int
+  use openacc_internal2, only: acc_delete_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  type (*), dimension (..), contiguous :: a
+  integer (acc_handle_kind) async
+  call acc_delete_async_l (a, sizeof (a), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_update_device_async_32_h (a, len, async)
+  use iso_c_binding, only: c_int32_t, c_size_t, c_int
+  use openacc_internal2, only: acc_update_device_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  integer (acc_handle_kind) async
+  call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_update_device_async_64_h (a, len, async)
+  use iso_c_binding, only: c_int64_t, c_size_t, c_int
+  use openacc_internal2, only: acc_update_device_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  integer (acc_handle_kind) async
+  call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_update_device_async_array_h (a, async)
+  use iso_c_binding, only: c_int
+  use openacc_internal2, only: acc_update_device_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  type (*), dimension (..), contiguous :: a
+  integer (acc_handle_kind) async
+  call acc_update_device_async_l (a, sizeof (a), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_update_self_async_32_h (a, len, async)
+  use iso_c_binding, only: c_int32_t, c_size_t, c_int
+  use openacc_internal2, only: acc_update_self_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int32_t) len
+  integer (acc_handle_kind) async
+  call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_update_self_async_64_h (a, len, async)
+  use iso_c_binding, only: c_int64_t, c_size_t, c_int
+  use openacc_internal2, only: acc_update_self_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  !GCC$ ATTRIBUTES NO_ARG_CHECK :: a
+  type (*), dimension (*) :: a
+  integer (c_int64_t) len
+  integer (acc_handle_kind) async
+  call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int))
+end subroutine
+
+subroutine acc_update_self_async_array_h (a, async)
+  use iso_c_binding, only: c_int
+  use openacc_internal2, only: acc_update_self_async_l
+  use openacc_kinds2, only: acc_handle_kind
+  type (*), dimension (..), contiguous :: a
+  integer (acc_handle_kind) async
+  call acc_update_self_async_l (a, sizeof (a), int (async, kind = c_int))
+end subroutine
--- libgomp/taskloop.c.jj	2018-04-25 09:40:31.913655581 +0200
+++ libgomp/taskloop.c	2019-05-07 18:46:36.547109400 +0200
@@ -149,11 +149,28 @@ GOMP_taskloop (void (*fn) (void *), void

   if (flags & GOMP_TASK_FLAG_NOGROUP)
     {
-      if (thr->task && thr->task->taskgroup && thr->task->taskgroup->cancelled)
-	return;
+      if (__builtin_expect (gomp_cancel_var, 0)
+	  && thr->task
+	  && thr->task->taskgroup)
+	{
+	  if (thr->task->taskgroup->cancelled)
+	    return;
+	  if (thr->task->taskgroup->workshare
+	      && thr->task->taskgroup->prev
+	      && thr->task->taskgroup->prev->cancelled)
+	    return;
+	}
     }
   else
-    ialias_call (GOMP_taskgroup_start) ();
+    {
+      ialias_call (GOMP_taskgroup_start) ();
+      if (flags & GOMP_TASK_FLAG_REDUCTION)
+	{
+	  struct gomp_data_head { TYPE t1, t2; uintptr_t *ptr; };
+	  uintptr_t *ptr = ((struct gomp_data_head *) data)->ptr;
+	  ialias_call (GOMP_taskgroup_reduction_register) (ptr);
+	}
+    }

   if (priority > gomp_max_task_priority_var)
     priority = gomp_max_task_priority_var;
@@ -284,19 +301,31 @@ GOMP_taskloop (void (*fn) (void *), void
       gomp_mutex_lock (&team->task_lock);
       /* If parallel or taskgroup has been cancelled, don't start new
 	 tasks.  */
-      if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier)
-			     || (taskgroup && taskgroup->cancelled))
-			    && cpyfn == NULL, 0))
+      if (__builtin_expect (gomp_cancel_var, 0)
+	  && cpyfn == NULL)
 	{
-	  gomp_mutex_unlock (&team->task_lock);
-	  for (i = 0; i < num_tasks; i++)
+	  if (gomp_team_barrier_cancelled (&team->barrier))
+	    {
+	    do_cancel:
+	      gomp_mutex_unlock (&team->task_lock);
+	      for (i = 0; i < num_tasks; i++)
+		{
+		  gomp_finish_task (tasks[i]);
+		  free (tasks[i]);
+		}
+	      if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0)
+		ialias_call (GOMP_taskgroup_end) ();
+	      return;
+	    }
+	  if (taskgroup)
 	    {
-	      gomp_finish_task (tasks[i]);
-	      free (tasks[i]);
+	      if (taskgroup->cancelled)
+		goto do_cancel;
+	      if (taskgroup->workshare
+		  && taskgroup->prev
+		  && taskgroup->prev->cancelled)
+		goto do_cancel;
 	    }
-	  if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0)
-	    ialias_call (GOMP_taskgroup_end) ();
-	  return;
 	}
       if (taskgroup)
 	taskgroup->num_children += num_tasks;
--- libgomp/parallel.c.jj	2018-04-25 09:40:31.926655587 +0200
+++ libgomp/parallel.c	2019-05-07 18:46:36.532109640 +0200
@@ -123,7 +123,8 @@ void
 GOMP_parallel_start (void (*fn) (void *), void *data, unsigned num_threads)
 {
   num_threads = gomp_resolve_num_threads (num_threads, 0);
-  gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads));
+  gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads),
+		   NULL);
 }

 void
@@ -161,14 +162,33 @@ GOMP_parallel_end (void)
 ialias (GOMP_parallel_end)

 void
-GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, unsigned int flags)
+GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads,
+	       unsigned int flags)
 {
   num_threads = gomp_resolve_num_threads (num_threads, 0);
-  gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads));
+  gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads),
+		   NULL);
   fn (data);
   ialias_call (GOMP_parallel_end) ();
 }

+unsigned
+GOMP_parallel_reductions (void (*fn) (void *), void *data,
+			  unsigned num_threads, unsigned int flags)
+{
+  struct gomp_taskgroup *taskgroup;
+  num_threads = gomp_resolve_num_threads (num_threads, 0);
+  uintptr_t *rdata = *(uintptr_t **)data;
+  taskgroup = gomp_parallel_reduction_register (rdata, num_threads);
+  gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads),
+		   taskgroup);
+  fn (data);
+  ialias_call (GOMP_parallel_end) ();
+  gomp_sem_destroy (&taskgroup->taskgroup_sem);
+  free (taskgroup);
+  return num_threads;
+}
+
 bool
 GOMP_cancellation_point (int which)
 {
@@ -185,8 +205,15 @@ GOMP_cancellation_point (int which)
     }
   else if (which & GOMP_CANCEL_TASKGROUP)
     {
-      if (thr->task->taskgroup && thr->task->taskgroup->cancelled)
-	return true;
+      if (thr->task->taskgroup)
+	{
+	  if (thr->task->taskgroup->cancelled)
+	    return true;
+	  if (thr->task->taskgroup->workshare
+	      && thr->task->taskgroup->prev
+	      && thr->task->taskgroup->prev->cancelled)
+	    return true;
+	}
       /* FALLTHRU into the GOMP_CANCEL_PARALLEL case,
 	 as #pragma omp cancel parallel also cancels all explicit
 	 tasks.  */
@@ -218,11 +245,17 @@ GOMP_cancel (int which, bool do_cancel)
     }
   else if (which & GOMP_CANCEL_TASKGROUP)
     {
-      if (thr->task->taskgroup && !thr->task->taskgroup->cancelled)
+      if (thr->task->taskgroup)
 	{
-	  gomp_mutex_lock (&team->task_lock);
-	  thr->task->taskgroup->cancelled = true;
-	  gomp_mutex_unlock (&team->task_lock);
+	  struct gomp_taskgroup *taskgroup = thr->task->taskgroup;
+	  if (taskgroup->workshare && taskgroup->prev)
+	    taskgroup = taskgroup->prev;
+	  if (!taskgroup->cancelled)
+	    {
+	      gomp_mutex_lock (&team->task_lock);
+	      taskgroup->cancelled = true;
+	      gomp_mutex_unlock (&team->task_lock);
+	    }
 	}
       return true;
     }
--- libgomp/oacc-plugin.h.jj	2018-04-25 09:40:31.322655307 +0200
+++ libgomp/oacc-plugin.h	2019-05-07 18:46:36.531109656 +0200
@@ -29,5 +29,6 @@

 extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
 extern void *GOMP_PLUGIN_acc_thread (void);
+extern int GOMP_PLUGIN_acc_default_dim (unsigned int);

 #endif
--- libgomp/target.c.jj	2018-04-25 09:40:31.912655580 +0200
+++ libgomp/target.c	2019-05-07 19:07:21.032306327 +0200
@@ -180,16 +180,22 @@ gomp_device_copy (struct gomp_device_des
 /* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses)
    host to device memory transfers.  */

+struct gomp_coalesce_chunk
+{
+  /* The starting and ending point of a coalesced chunk of memory.  */
+  size_t start, end;
+};
+
 struct gomp_coalesce_buf
 {
   /* Buffer into which gomp_copy_host2dev will memcpy data and from which
      it will be copied to the device.  */
   void *buf;
   struct target_mem_desc *tgt;
-  /* Array with offsets, chunks[2 * i] is the starting offset and
-     chunks[2 * i + 1] ending offset relative to tgt->tgt_start device address
+  /* Array with offsets, chunks[i].start is the starting offset and
+     chunks[i].end ending offset relative to tgt->tgt_start device address
      of chunks which are to be copied to buf and later copied to device.  */
-  size_t *chunks;
+  struct gomp_coalesce_chunk *chunks;
   /* Number of chunks in chunks array, or -1 if coalesce buffering should not
      be performed.  */
   long chunk_cnt;
@@ -222,14 +228,14 @@ gomp_coalesce_buf_add (struct gomp_coale
     {
       if (cbuf->chunk_cnt < 0)
 	return;
-      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
+      if (start < cbuf->chunks[cbuf->chunk_cnt - 1].end)
 	{
 	  cbuf->chunk_cnt = -1;
 	  return;
 	}
-      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1] + MAX_COALESCE_BUF_GAP)
+      if (start < cbuf->chunks[cbuf->chunk_cnt - 1].end + MAX_COALESCE_BUF_GAP)
 	{
-	  cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len;
+	  cbuf->chunks[cbuf->chunk_cnt - 1].end = start + len;
 	  cbuf->use_cnt++;
 	  return;
 	}
@@ -239,8 +245,8 @@ gomp_coalesce_buf_add (struct gomp_coale
       if (cbuf->use_cnt == 1)
 	cbuf->chunk_cnt--;
     }
-  cbuf->chunks[2 * cbuf->chunk_cnt] = start;
-  cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len;
+  cbuf->chunks[cbuf->chunk_cnt].start = start;
+  cbuf->chunks[cbuf->chunk_cnt].end = start + len;
   cbuf->chunk_cnt++;
   cbuf->use_cnt = 1;
 }
@@ -271,20 +277,20 @@ gomp_copy_host2dev (struct gomp_device_d
   if (cbuf)
     {
       uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
-      if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
+      if (doff < cbuf->chunks[cbuf->chunk_cnt - 1].end)
 	{
 	  long first = 0;
 	  long last = cbuf->chunk_cnt - 1;
 	  while (first <= last)
 	    {
 	      long middle = (first + last) >> 1;
-	      if (cbuf->chunks[2 * middle + 1] <= doff)
+	      if (cbuf->chunks[middle].end <= doff)
 		first = middle + 1;
-	      else if (cbuf->chunks[2 * middle] <= doff)
+	      else if (cbuf->chunks[middle].start <= doff)
 		{
-		  if (doff + sz > cbuf->chunks[2 * middle + 1])
+		  if (doff + sz > cbuf->chunks[middle].end)
 		    gomp_fatal ("internal libgomp cbuf error");
-		  memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]),
+		  memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0].start),
 			  h, sz);
 		  return;
 		}
@@ -510,8 +516,8 @@ gomp_map_vars (struct gomp_device_descr
   cbuf.buf = NULL;
   if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET)
     {
-      cbuf.chunks
-	= (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t));
+      size_t chunks_size = (mapnum + 1) * sizeof (struct gomp_coalesce_chunk);
+      cbuf.chunks = (struct gomp_coalesce_chunk *) gomp_alloca (chunks_size);
       cbuf.chunk_cnt = 0;
     }
   if (pragma_kind == GOMP_MAP_VARS_TARGET)
@@ -521,8 +527,8 @@ gomp_map_vars (struct gomp_device_descr
       tgt_size = mapnum * sizeof (void *);
       cbuf.chunk_cnt = 1;
       cbuf.use_cnt = 1 + (mapnum > 1);
-      cbuf.chunks[0] = 0;
-      cbuf.chunks[1] = tgt_size;
+      cbuf.chunks[0].start = 0;
+      cbuf.chunks[0].end = tgt_size;
     }

   gomp_mutex_lock (&devicep->lock);
@@ -707,7 +713,7 @@ gomp_map_vars (struct gomp_device_descr
       if (cbuf.chunk_cnt > 0)
 	{
 	  cbuf.buf
-	    = malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]);
+	    = malloc (cbuf.chunks[cbuf.chunk_cnt - 1].end - cbuf.chunks[0].start);
 	  if (cbuf.buf)
 	    {
 	      cbuf.tgt = tgt;
@@ -859,6 +865,7 @@ gomp_map_vars (struct gomp_device_descr
 		tgt->list[i].offset = 0;
 		tgt->list[i].length = k->host_end - k->host_start;
 		k->refcount = 1;
+		k->dynamic_refcount = 0;
 		tgt->refcount++;
 		array->left = NULL;
 		array->right = NULL;
@@ -956,9 +963,10 @@ gomp_map_vars (struct gomp_device_descr
 		    /* Set link pointer on target to the device address of the
 		       mapped object.  */
 		    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
-		    devicep->host2dev_func (devicep->target_id,
-					    (void *) n->tgt_offset,
-					    &tgt_addr, sizeof (void *));
+		    /* We intentionally do not use coalescing here, as it's not
+		       data allocated by the current call to this function.  */
+		    gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
+					&tgt_addr, sizeof (void *), NULL);
 		  }
 		array++;
 	      }
@@ -981,10 +989,14 @@ gomp_map_vars (struct gomp_device_descr
     {
       long c = 0;
       for (c = 0; c < cbuf.chunk_cnt; ++c)
-	gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cbuf.chunks[2 * c]),
-			    (char *) cbuf.buf + (cbuf.chunks[2 * c] - cbuf.chunks[0]),
-			    cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL);
+	gomp_copy_host2dev (devicep,
+			    (void *) (tgt->tgt_start + cbuf.chunks[c].start),
+			    (char *) cbuf.buf + (cbuf.chunks[c].start
+						 - cbuf.chunks[0].start),
+			    cbuf.chunks[c].end - cbuf.chunks[c].start, NULL);
       free (cbuf.buf);
+      cbuf.buf = NULL;
+      cbufp = NULL;
     }

   /* If the variable from "omp target enter data" map-list was already mapped,
@@ -1011,6 +1023,23 @@ gomp_unmap_tgt (struct target_mem_desc *
   free (tgt);
 }

+attribute_hidden bool
+gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k)
+{
+  bool is_tgt_unmapped = false;
+  splay_tree_remove (&devicep->mem_map, k);
+  if (k->link_key)
+    splay_tree_insert (&devicep->mem_map, (splay_tree_node) k->link_key);
+  if (k->tgt->refcount > 1)
+    k->tgt->refcount--;
+  else
+    {
+      is_tgt_unmapped = true;
+      gomp_unmap_tgt (k->tgt);
+    }
+  return is_tgt_unmapped;
+}
+
 /* Unmap variables described by TGT.  If DO_COPYFROM is true, copy relevant
    variables back from device to host: if it is false, it is assumed that this
    has been done already.  */
@@ -1059,16 +1088,7 @@ gomp_unmap_vars (struct target_mem_desc
 				      + tgt->list[i].offset),
 			    tgt->list[i].length);
       if (do_unmap)
-	{
-	  splay_tree_remove (&devicep->mem_map, k);
-	  if (k->link_key)
-	    splay_tree_insert (&devicep->mem_map,
-			       (splay_tree_node) k->link_key);
-	  if (k->tgt->refcount > 1)
-	    k->tgt->refcount--;
-	  else
-	    gomp_unmap_tgt (k->tgt);
-	}
+	gomp_remove_var (devicep, k);
     }

   if (tgt->refcount > 1)
@@ -1298,17 +1318,7 @@ gomp_unload_image_from_device (struct go
       else
 	{
 	  splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &k);
-	  splay_tree_remove (&devicep->mem_map, n);
-	  if (n->link_key)
-	    {
-	      if (n->tgt->refcount > 1)
-		n->tgt->refcount--;
-	      else
-		{
-		  is_tgt_unmapped = true;
-		  gomp_unmap_tgt (n->tgt);
-		}
-	    }
+	  is_tgt_unmapped = gomp_remove_var (devicep, n);
 	}
     }

@@ -1855,11 +1865,20 @@ GOMP_target_update_ext (int device, size
 	      struct gomp_team *team = thr->ts.team;
 	      /* If parallel or taskgroup has been cancelled, don't start new
 		 tasks.  */
-	      if (team
-		  && (gomp_team_barrier_cancelled (&team->barrier)
-		      || (thr->task->taskgroup
-			  && thr->task->taskgroup->cancelled)))
-		return;
+	      if (__builtin_expect (gomp_cancel_var, 0) && team)
+		{
+		  if (gomp_team_barrier_cancelled (&team->barrier))
+		    return;
+		  if (thr->task->taskgroup)
+		    {
+		      if (thr->task->taskgroup->cancelled)
+			return;
+		      if (thr->task->taskgroup->workshare
+			  && thr->task->taskgroup->prev
+			  && thr->task->taskgroup->prev->cancelled)
+			return;
+		    }
+		}

 	      gomp_task_maybe_wait_for_dependencies (depend);
 	    }
@@ -1874,10 +1893,20 @@ GOMP_target_update_ext (int device, size
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
   /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
-  if (team
-      && (gomp_team_barrier_cancelled (&team->barrier)
-	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
-    return;
+  if (__builtin_expect (gomp_cancel_var, 0) && team)
+    {
+      if (gomp_team_barrier_cancelled (&team->barrier))
+	return;
+      if (thr->task->taskgroup)
+	{
+	  if (thr->task->taskgroup->cancelled)
+	    return;
+	  if (thr->task->taskgroup->workshare
+	      && thr->task->taskgroup->prev
+	      && thr->task->taskgroup->prev->cancelled)
+	    return;
+	}
+    }

   gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, true);
 }
@@ -1986,11 +2015,20 @@ GOMP_target_enter_exit_data (int device,
 	      struct gomp_team *team = thr->ts.team;
 	      /* If parallel or taskgroup has been cancelled, don't start new
 		 tasks.  */
-	      if (team
-		  && (gomp_team_barrier_cancelled (&team->barrier)
-		      || (thr->task->taskgroup
-			  && thr->task->taskgroup->cancelled)))
-		return;
+	      if (__builtin_expect (gomp_cancel_var, 0) && team)
+		{
+		  if (gomp_team_barrier_cancelled (&team->barrier))
+		    return;
+		  if (thr->task->taskgroup)
+		    {
+		      if (thr->task->taskgroup->cancelled)
+			return;
+		      if (thr->task->taskgroup->workshare
+			  && thr->task->taskgroup->prev
+			  && thr->task->taskgroup->prev->cancelled)
+			return;
+		    }
+		}

 	      gomp_task_maybe_wait_for_dependencies (depend);
 	    }
@@ -2005,10 +2043,20 @@ GOMP_target_enter_exit_data (int device,
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
   /* If parallel or taskgroup has been cancelled, don't start new tasks.  */
-  if (team
-      && (gomp_team_barrier_cancelled (&team->barrier)
-	  || (thr->task->taskgroup && thr->task->taskgroup->cancelled)))
-    return;
+  if (__builtin_expect (gomp_cancel_var, 0) && team)
+    {
+      if (gomp_team_barrier_cancelled (&team->barrier))
+	return;
+      if (thr->task->taskgroup)
+	{
+	  if (thr->task->taskgroup->cancelled)
+	    return;
+	  if (thr->task->taskgroup->workshare
+	      && thr->task->taskgroup->prev
+	      && thr->task->taskgroup->prev->cancelled)
+	    return;
+	}
+    }

   size_t i;
   if ((flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0)
@@ -2197,8 +2245,9 @@ omp_target_is_present (void *ptr, int de
 }

 int
-omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset,
-		   size_t src_offset, int dst_device_num, int src_device_num)
+omp_target_memcpy (void *dst, void *src, size_t length,
+		   size_t dst_offset, size_t src_offset, int dst_device_num,
+		   int src_device_num)
 {
   struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
   bool ret;
@@ -2287,21 +2336,25 @@ omp_target_memcpy_rect_worker (void *dst
 	return EINVAL;
       if (dst_devicep == NULL && src_devicep == NULL)
 	{
-	  memcpy ((char *) dst + dst_off, (char *) src + src_off, length);
+	  memcpy ((char *) dst + dst_off, (char *) src + src_off,
+		  length);
 	  ret = 1;
 	}
       else if (src_devicep == NULL)
 	ret = dst_devicep->host2dev_func (dst_devicep->target_id,
 					  (char *) dst + dst_off,
-					  (char *) src + src_off, length);
+					  (char *) src + src_off,
+					  length);
       else if (dst_devicep == NULL)
 	ret = src_devicep->dev2host_func (src_devicep->target_id,
 					  (char *) dst + dst_off,
-					  (char *) src + src_off, length);
+					  (char *) src + src_off,
+					  length);
       else if (src_devicep == dst_devicep)
 	ret = src_devicep->dev2dev_func (src_devicep->target_id,
 					 (char *) dst + dst_off,
-					 (char *) src + src_off, length);
+					 (char *) src + src_off,
+					 length);
       else
 	ret = 0;
       return ret ? 0 : EINVAL;
@@ -2396,8 +2449,8 @@ omp_target_memcpy_rect (void *dst, void
 }

 int
-omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size,
-			  size_t device_offset, int device_num)
+omp_target_associate_ptr (void *host_ptr, void *device_ptr,
+			  size_t size, size_t device_offset, int device_num)
 {
   if (device_num == GOMP_DEVICE_HOST_FALLBACK)
     return EINVAL;
@@ -2499,6 +2552,31 @@ omp_target_disassociate_ptr (void *ptr,
   return ret;
 }

+int
+omp_pause_resource (omp_pause_resource_t kind, int device_num)
+{
+  (void) kind;
+  if (device_num == GOMP_DEVICE_HOST_FALLBACK)
+    return gomp_pause_host ();
+  if (device_num < 0 || device_num >= gomp_get_num_devices ())
+    return -1;
+  /* Do nothing for target devices for now.  */
+  return 0;
+}
+
+int
+omp_pause_resource_all (omp_pause_resource_t kind)
+{
+  (void) kind;
+  if (gomp_pause_host ())
+    return -1;
+  /* Do nothing for target devices for now.  */
+  return 0;
+}
+
+ialias (omp_pause_resource)
+ialias (omp_pause_resource_all)
+
 #ifdef PLUGIN_SUPPORT

 /* This function tries to load a plugin for DEVICE.  Name of plugin is passed
@@ -2632,9 +2710,9 @@ gomp_target_fini (void)
     }
 }

-/* This function initializes the runtime needed for offloading.
-   It parses the list of offload targets and tries to load the plugins for
-   these targets.  On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP
+/* This function initializes the runtime for offloading.
+   It parses the list of offload plugins, and tries to load these.
+   On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP
    will be set, and the array DEVICES initialized, containing descriptors for
    corresponding devices, first the GOMP_OFFLOAD_CAP_OPENMP_400 ones, follows
    by the others.  */
@@ -2651,7 +2729,7 @@ gomp_target_init (void)
   num_devices = 0;
   devices = NULL;

-  cur = OFFLOAD_TARGETS;
+  cur = OFFLOAD_PLUGINS;
   if (*cur)
     do
       {
--- libgomp/ordered.c.jj	2018-04-25 09:40:31.926655587 +0200
+++ libgomp/ordered.c	2019-05-07 18:46:36.532109640 +0200
@@ -259,7 +259,8 @@ GOMP_ordered_end (void)
 #define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)

 void
-gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size)
+gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
+		    size_t extra)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -269,13 +270,24 @@ gomp_doacross_init (unsigned ncounts, lo
   struct gomp_doacross_work_share *doacross;

   if (team == NULL || team->nthreads == 1)
-    return;
+    {
+    empty:
+      if (!extra)
+	ws->doacross = NULL;
+      else
+	{
+	  doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
+	  doacross->extra = (void *) (doacross + 1);
+	  ws->doacross = doacross;
+	}
+      return;
+    }

   for (i = 0; i < ncounts; i++)
     {
       /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
       if (counts[i] == 0)
-	return;
+	goto empty;

       if (num_bits <= MAX_COLLAPSED_BITS)
 	{
@@ -314,7 +326,7 @@ gomp_doacross_init (unsigned ncounts, lo
   elt_sz = (elt_sz + 63) & ~63UL;

   doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
-			  + shift_sz);
+			  + shift_sz + extra);
   doacross->chunk_size = chunk_size;
   doacross->elt_sz = elt_sz;
   doacross->ncounts = ncounts;
@@ -322,6 +334,13 @@ gomp_doacross_init (unsigned ncounts, lo
   doacross->array = (unsigned char *)
 		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
 		     & ~(uintptr_t) 63);
+  if (extra)
+    {
+      doacross->extra = doacross->array + num_ents * elt_sz;
+      memset (doacross->extra, '\0', extra);
+    }
+  else
+    doacross->extra = NULL;
   if (num_bits <= MAX_COLLAPSED_BITS)
     {
       unsigned int shift_count = 0;
@@ -360,7 +379,8 @@ GOMP_doacross_post (long *counts)
   unsigned long ent;
   unsigned int i;

-  if (__builtin_expect (doacross == NULL, 0))
+  if (__builtin_expect (doacross == NULL, 0)
+      || __builtin_expect (doacross->array == NULL, 0))
     {
       __sync_synchronize ();
       return;
@@ -411,7 +431,8 @@ GOMP_doacross_wait (long first, ...)
   unsigned long ent;
   unsigned int i;

-  if (__builtin_expect (doacross == NULL, 0))
+  if (__builtin_expect (doacross == NULL, 0)
+      || __builtin_expect (doacross->array == NULL, 0))
     {
       __sync_synchronize ();
       return;
@@ -488,7 +509,8 @@ GOMP_doacross_wait (long first, ...)
 typedef unsigned long long gomp_ull;

 void
-gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size)
+gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
+			gomp_ull chunk_size, size_t extra)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -498,13 +520,24 @@ gomp_doacross_ull_init (unsigned ncounts
   struct gomp_doacross_work_share *doacross;

   if (team == NULL || team->nthreads == 1)
-    return;
+    {
+    empty:
+      if (!extra)
+	ws->doacross = NULL;
+      else
+	{
+	  doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
+	  doacross->extra = (void *) (doacross + 1);
+	  ws->doacross = doacross;
+	}
+      return;
+    }

   for (i = 0; i < ncounts; i++)
     {
       /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
       if (counts[i] == 0)
-	return;
+	goto empty;

       if (num_bits <= MAX_COLLAPSED_BITS)
 	{
@@ -557,6 +590,13 @@ gomp_doacross_ull_init (unsigned ncounts
   doacross->array = (unsigned char *)
 		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
 		     & ~(uintptr_t) 63);
+  if (extra)
+    {
+      doacross->extra = doacross->array + num_ents * elt_sz;
+      memset (doacross->extra, '\0', extra);
+    }
+  else
+    doacross->extra = NULL;
   if (num_bits <= MAX_COLLAPSED_BITS)
     {
       unsigned int shift_count = 0;
@@ -595,7 +635,8 @@ GOMP_doacross_ull_post (gomp_ull *counts
   unsigned long ent;
   unsigned int i;

-  if (__builtin_expect (doacross == NULL, 0))
+  if (__builtin_expect (doacross == NULL, 0)
+      || __builtin_expect (doacross->array == NULL, 0))
     {
       __sync_synchronize ();
       return;
@@ -667,7 +708,8 @@ GOMP_doacross_ull_wait (gomp_ull first,
   unsigned long ent;
   unsigned int i;

-  if (__builtin_expect (doacross == NULL, 0))
+  if (__builtin_expect (doacross == NULL, 0)
+      || __builtin_expect (doacross->array == NULL, 0))
     {
       __sync_synchronize ();
       return;
--- libgomp/alloc.c.jj	2018-04-25 09:40:31.926655587 +0200
+++ libgomp/alloc.c	2019-05-07 18:46:36.336112770 +0200
@@ -57,3 +57,50 @@ gomp_realloc (void *old, size_t size)
     gomp_fatal ("Out of memory allocating %lu bytes", (unsigned long) size);
   return ret;
 }
+
+void *
+gomp_aligned_alloc (size_t al, size_t size)
+{
+  void *ret;
+  if (al < sizeof (void *))
+    al = sizeof (void *);
+#ifdef HAVE_ALIGNED_ALLOC
+  ret = aligned_alloc (al, size);
+#elif defined(HAVE__ALIGNED_MALLOC)
+  ret = _aligned_malloc (size, al);
+#elif defined(HAVE_POSIX_MEMALIGN)
+  if (posix_memalign (&ret, al, size) != 0)
+    ret = NULL;
+#elif defined(HAVE_MEMALIGN)
+  {
+    extern void *memalign (size_t, size_t);
+    ret = memalign (al, size);
+  }
+#else
+  ret = NULL;
+  if ((al & (al - 1)) == 0 && size)
+    {
+      void *p = malloc (size + al);
+      if (p)
+	{
+	  void *ap = (void *) (((uintptr_t) p + al) & -al);
+	  ((void **) ap)[-1] = p;
+	  ret = ap;
+	}
+    }
+#endif
+  if (ret == NULL)
+    gomp_fatal ("Out of memory allocating %lu bytes", (unsigned long) size);
+  return ret;
+}
+
+void
+gomp_aligned_free (void *ptr)
+{
+#ifdef GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC
+  free (ptr);
+#else
+  if (ptr)
+    free (((void **) ptr)[-1]);
+#endif
+}
--- libgomp/configure.ac.jj	2018-04-25 09:40:31.321655307 +0200
+++ libgomp/configure.ac	2019-05-07 18:46:36.471110614 +0200
@@ -219,6 +219,7 @@ m4_include([plugin/configfrag.ac])

 # Check for functions needed.
 AC_CHECK_FUNCS(getloadavg clock_gettime strtoull)
+AC_CHECK_FUNCS(aligned_alloc posix_memalign memalign _aligned_malloc)

 # Check for broken semaphore implementation on darwin.
 # sem_init returns: sem_init error: Function not implemented.
@@ -266,6 +267,41 @@ if test $ac_cv_func_clock_gettime = no;
 	       [Define to 1 if you have the `clock_gettime' function.])])
 fi

+# Check for uname.
+AC_COMPILE_IFELSE(
+ [AC_LANG_PROGRAM(
+  [#include <string.h>
+   #include <stdlib.h>
+   #include <sys/utsname.h>],
+  [struct utsname buf;
+   volatile size_t len = 0;
+   if (!uname (buf))
+     len = strlen (buf.nodename);])],
+  AC_DEFINE(HAVE_UNAME, 1,
+[	Define if uname is supported and struct utsname has nodename field.]))
+
+# Check for gethostname.
+AC_COMPILE_IFELSE(
+ [AC_LANG_PROGRAM(
+  [#include <unistd.h>],
+  [
+changequote(,)dnl
+   char buf[256];
+   if (gethostname (buf, sizeof (buf) - 1) == 0)
+     buf[255] = '\0';
+changequote([,])dnl
+  ])],
+  AC_DEFINE(HAVE_GETHOSTNAME, 1,
+[	Define if gethostname is supported.]))
+
+# Check for getpid.
+AC_COMPILE_IFELSE(
+ [AC_LANG_PROGRAM(
+  [#include <unistd.h>],
+  [int pid = getpid ();])],
+  AC_DEFINE(HAVE_GETPID, 1,
+[	Define if getpid is supported.]))
+
 # See if we support thread-local storage.
 GCC_CHECK_TLS

--- libgomp/icv.c.jj	2018-04-25 09:40:31.870655561 +0200
+++ libgomp/icv.c	2019-05-07 18:46:36.501110134 +0200
@@ -69,7 +69,7 @@ void
 omp_set_schedule (omp_sched_t kind, int chunk_size)
 {
   struct gomp_task_icv *icv = gomp_icv (true);
-  switch (kind)
+  switch (kind & ~omp_sched_monotonic)
     {
     case omp_sched_static:
       if (chunk_size < 1)
--- libgomp/configure.jj	2018-04-25 09:40:31.913655581 +0200
+++ libgomp/configure	2019-05-07 18:47:37.961128420 +0200
@@ -636,6 +636,8 @@ PLUGIN_NVPTX_FALSE
 PLUGIN_NVPTX_TRUE
 offload_additional_lib_paths
 offload_additional_options
+offload_targets
+offload_plugins
 PLUGIN_HSA_LIBS
 PLUGIN_HSA_LDFLAGS
 PLUGIN_HSA_CPPFLAGS
@@ -648,7 +650,6 @@ PLUGIN_NVPTX_CPPFLAGS
 PLUGIN_NVPTX
 CUDA_DRIVER_LIB
 CUDA_DRIVER_INCLUDE
-offload_targets
 libtool_VERSION
 ac_ct_FC
 FCFLAGS
@@ -11157,7 +11158,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 11160 "configure"
+#line 11161 "configure"
 #include "confdefs.h"

 #if HAVE_DLFCN_H
@@ -11263,7 +11264,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<_LT_EOF
-#line 11266 "configure"
+#line 11267 "configure"
 #include "confdefs.h"

 #if HAVE_DLFCN_H
@@ -15167,8 +15168,6 @@ fi
 # see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 # <http://www.gnu.org/licenses/>.

-offload_targets=
-
 plugin_support=yes
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dlsym in -ldl" >&5
 $as_echo_n "checking for dlsym in -ldl... " >&6; }
@@ -15302,7 +15301,11 @@ if test "${with_cuda_driver_lib+set}" =
 fi

 case "x$with_cuda_driver" in
-  x | xno) ;;
+  x) ;;
+  xno)
+    CUDA_DRIVER_INCLUDE=no
+    CUDA_DRIVER_LIB=no
+    ;;
   *) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
      CUDA_DRIVER_LIB=$with_cuda_driver/lib
      ;;
@@ -15313,10 +15316,12 @@ fi
 if test "x$with_cuda_driver_lib" != x; then
   CUDA_DRIVER_LIB=$with_cuda_driver_lib
 fi
-if test "x$CUDA_DRIVER_INCLUDE" != x; then
+if test "x$CUDA_DRIVER_INCLUDE" != x \
+   && test "x$CUDA_DRIVER_INCLUDE" != xno; then
   CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE
 fi
-if test "x$CUDA_DRIVER_LIB" != x; then
+if test "x$CUDA_DRIVER_LIB" != x \
+   && test "x$CUDA_DRIVER_LIB" != xno; then
   CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB
 fi

@@ -15383,7 +15388,13 @@ PLUGIN_HSA_LIBS=


-# Get offload targets and path to install tree of offloading compiler.
+# Parse '--enable-offload-targets', figure out the corresponding libgomp
+# plugins, and configure to find the corresponding offload compilers.
+# 'offload_plugins' and 'offload_targets' will be populated in the same order.
+offload_plugins=
+offload_targets=
+
+
 offload_additional_options=
 offload_additional_lib_paths=

@@ -15403,10 +15403,10 @@ if test x"$enable_offload_targets" != x;
   for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do
     tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'`
     tgt=`echo $tgt | sed 's/=.*//'`
-    tgt_name=
+    tgt_plugin=
     case $tgt in
       *-intelmic-* | *-intelmicemul-*)
-	tgt_name=intelmic
+	tgt_plugin=intelmic
 	;;
       nvptx*)
 	case "${target}" in
@@ -15418,19 +15418,21 @@ if test x"$enable_offload_targets" != x;
 		PLUGIN_NVPTX=0
 		;;
 	      *)
-		tgt_name=nvptx
+		tgt_plugin=nvptx
 		PLUGIN_NVPTX=$tgt
-		PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
-		PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
-		PLUGIN_NVPTX_LIBS='-lcuda'
+		if test "x$CUDA_DRIVER_LIB" != xno \
+		   && test "x$CUDA_DRIVER_LIB" != xno; then
+		  PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
+		  PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
+		  PLUGIN_NVPTX_LIBS='-lcuda'

-		PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
-		CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
-		PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
-		LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
-		PLUGIN_NVPTX_save_LIBS=$LIBS
-		LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
-		cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+		  PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
+		  CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
+		  PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
+		  LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
+		  PLUGIN_NVPTX_save_LIBS=$LIBS
+		  LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
+		  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include "cuda.h"
 int
@@ -15446,13 +15448,16 @@ if ac_fn_c_try_link "$LINENO"; then :
 fi
 rm -f core conftest.err conftest.$ac_objext \
     conftest$ac_exeext conftest.$ac_ext
-		CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
-		LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
-		LIBS=$PLUGIN_NVPTX_save_LIBS
+		  CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
+		  LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
+		  LIBS=$PLUGIN_NVPTX_save_LIBS
+		fi
 		case $PLUGIN_NVPTX in
 		  nvptx*)
-		    if test "x$CUDA_DRIVER_INCLUDE" = x \
-		       && test "x$CUDA_DRIVER_LIB" = x; then
+		    if (test "x$CUDA_DRIVER_INCLUDE" = x \
+			|| test "x$CUDA_DRIVER_INCLUDE" = xno) \
+		       && (test "x$CUDA_DRIVER_LIB" = x \
+			   || test "x$CUDA_DRIVER_LIB" = xno); then
 		      PLUGIN_NVPTX=1
 		      PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda'
 		      PLUGIN_NVPTX_LIBS='-ldl'
@@ -15452,7 +15468,7 @@ rm -f core conftest.err conftest.$ac_obj
 	        PLUGIN_HSA=0
 		;;
 	      *)
-	        tgt_name=hsa
+		tgt_plugin=hsa
 	        PLUGIN_HSA=$tgt
 	        PLUGIN_HSA_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS
 	        PLUGIN_HSA_LDFLAGS="$HSA_RUNTIME_LDFLAGS"
@@ -15470,7 +15486,7 @@ rm -f core conftest.err conftest.$ac_obj
 	        LDFLAGS=$PLUGIN_HSA_save_LDFLAGS
 	        LIBS=$PLUGIN_HSA_save_LIBS
 	        case $PLUGIN_HSA in
-	          hsa*)
+		  hsa*)
 	            HSA_PLUGIN=0
 	            as_fn_error "HSA run-time package required for HSA support" "$LINENO" 5
 	            ;;
@@ -15487,16 +15503,19 @@ rm -f core conftest.err conftest.$ac_obj
 	as_fn_error "unknown offload target specified" "$LINENO" 5
 	;;
     esac
-    if test x"$tgt_name" = x; then
-      # Don't configure libgomp for this offloading target if we don't build
-      # the corresponding plugin.
+    if test x"$tgt_plugin" = x; then
+      # Not configuring libgomp for this offload target if we're not building
+      # the corresponding offload plugin.
       continue
-    elif test x"$offload_targets" = x; then
-      offload_targets=$tgt_name
+    elif test x"$offload_plugins" = x; then
+      offload_plugins=$tgt_plugin
+      offload_targets=$tgt
     else
-      offload_targets=$offload_targets,$tgt_name
+      offload_plugins=$offload_plugins,$tgt_plugin
+      offload_targets=$offload_targets,$tgt
     fi
-    if test "$tgt_name" = hsa; then
+    # Configure additional search paths.
+    if test "$tgt_plugin" = hsa; then
       # Offloading compilation is all handled by the target compiler.
       :
     elif test x"$tgt_dir" != x; then
@@ -15510,7 +15529,7 @@ rm -f core conftest.err conftest.$ac_obj
 fi

 cat >>confdefs.h <<_ACEOF
-#define OFFLOAD_TARGETS "$offload_targets"
+#define OFFLOAD_PLUGINS "$offload_plugins"
 _ACEOF

  if test $PLUGIN_NVPTX = 1; then
@@ -15570,6 +15589,19 @@ _ACEOF
 fi
 done

+for ac_func in aligned_alloc posix_memalign memalign _aligned_malloc
+do :
+  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
+eval as_val=\$$as_ac_var
+   if test "x$as_val" = x""yes; then :
+  cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+done
+

 # Check for broken semaphore implementation on darwin.
 # sem_init returns: sem_init error: Function not implemented.
@@ -15784,6 +15816,72 @@ fi

 fi

+# Check for uname.
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <string.h>
+   #include <stdlib.h>
+   #include <sys/utsname.h>
+int
+main ()
+{
+struct utsname buf;
+   volatile size_t len = 0;
+   if (!uname (buf))
+     len = strlen (buf.nodename);
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_UNAME 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+# Check for gethostname.
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <unistd.h>
+int
+main ()
+{
+
+   char buf[256];
+   if (gethostname (buf, sizeof (buf) - 1) == 0)
+     buf[255] = '\0';
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_GETHOSTNAME 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+# Check for getpid.
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <unistd.h>
+int
+main ()
+{
+int pid = getpid ();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_GETPID 1" >>confdefs.h
+
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
 # See if we support thread-local storage.


--- libgomp/Makefile.am.jj	2018-04-25 09:40:31.926655587 +0200
+++ libgomp/Makefile.am	2019-05-07 19:59:03.683989317 +0200
@@ -63,12 +63,13 @@ libgomp_la_SOURCES = alloc.c atomic.c ba
 	parallel.c sections.c single.c task.c team.c work.c lock.c mutex.c \
 	proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
 	splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
-	oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c
+	oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
+	affinity-fmt.c teams.c

 include $(top_srcdir)/plugin/Makefrag.am

 if USE_FORTRAN
-libgomp_la_SOURCES += openacc.f90
+libgomp_la_SOURCES += openacc2.f90
 endif

 nodist_noinst_HEADERS = libgomp_f.h
@@ -87,8 +88,6 @@ omp_lib_kinds.mod: omp_lib.mod
 	:
 openacc_kinds.mod: openacc.mod
 	:
-openacc.mod: openacc.lo
-	:
 %.mod: %.f90
 	$(FC) $(FCFLAGS) -fsyntax-only $<
 fortran.lo: libgomp_f.h
--- libgomp/oacc-mem.c.jj	2018-04-25 09:40:31.924655586 +0200
+++ libgomp/oacc-mem.c	2019-05-07 18:46:36.530109672 +0200
@@ -153,8 +153,9 @@ acc_free (void *d)
     gomp_fatal ("error in freeing device memory in %s", __FUNCTION__);
 }

-void
-acc_memcpy_to_device (void *d, void *h, size_t s)
+static void
+memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
+		      const char *libfnname)
 {
   /* No need to call lazy open here, as the device pointer must have
      been obtained from a routine that did that.  */
@@ -164,31 +165,49 @@ acc_memcpy_to_device (void *d, void *h,

   if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
     {
-      memmove (d, h, s);
+      if (from)
+	memmove (h, d, s);
+      else
+	memmove (d, h, s);
       return;
     }

-  if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s))
-    gomp_fatal ("error in %s", __FUNCTION__);
+  if (async > acc_async_sync)
+    thr->dev->openacc.async_set_async_func (async);
+
+  bool ret = (from
+	      ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s)
+	      : thr->dev->host2dev_func (thr->dev->target_id, d, h, s));
+
+  if (async > acc_async_sync)
+    thr->dev->openacc.async_set_async_func (acc_async_sync);
+
+  if (!ret)
+    gomp_fatal ("error in %s", libfnname);
 }

 void
-acc_memcpy_from_device (void *h, void *d, size_t s)
+acc_memcpy_to_device (void *d, void *h, size_t s)
 {
-  /* No need to call lazy open here, as the device pointer must have
-     been obtained from a routine that did that.  */
-  struct goacc_thread *thr = goacc_thread ();
+  memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__);
+}

-  assert (thr && thr->dev);
+void
+acc_memcpy_to_device_async (void *d, void *h, size_t s, int async)
+{
+  memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__);
+}

-  if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-    {
-      memmove (h, d, s);
-      return;
-    }
+void
+acc_memcpy_from_device (void *h, void *d, size_t s)
+{
+  memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__);
+}

-  if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s))
-    gomp_fatal ("error in %s", __FUNCTION__);
+void
+acc_memcpy_from_device_async (void *h, void *d, size_t s, int async)
+{
+  memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__);
 }

 /* Return the device pointer that corresponds to host data H.  Or NULL
@@ -347,6 +366,7 @@ acc_map_data (void *h, void *d, size_t s

       tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes,
 			   &kinds, true, GOMP_MAP_VARS_OPENACC);
+      tgt->list[0].key->refcount = REFCOUNT_INFINITY;
     }

   gomp_mutex_lock (&acc_dev->lock);
@@ -389,6 +409,9 @@ acc_unmap_data (void *h)
 		  (void *) n->host_start, (int) host_size, (void *) h);
     }

+  /* Mark for removal.  */
+  n->refcount = 1;
+
   t = n->tgt;

   if (t->refcount == 2)
@@ -424,7 +447,7 @@ acc_unmap_data (void *h)
 #define FLAG_COPY (1 << 2)

 static void *
-present_create_copy (unsigned f, void *h, size_t s)
+present_create_copy (unsigned f, void *h, size_t s, int async)
 {
   void *d;
   splay_tree_key n;
@@ -460,6 +483,11 @@ present_create_copy (unsigned f, void *h
 	  gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s);
 	}

+      if (n->refcount != REFCOUNT_INFINITY)
+	{
+	  n->refcount++;
+	  n->dynamic_refcount++;
+	}
       gomp_mutex_unlock (&acc_dev->lock);
     }
   else if (!(f & FLAG_CREATE))
@@ -481,8 +509,16 @@ present_create_copy (unsigned f, void *h

       gomp_mutex_unlock (&acc_dev->lock);

+      if (async > acc_async_sync)
+	acc_dev->openacc.async_set_async_func (async);
+
       tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true,
 			   GOMP_MAP_VARS_OPENACC);
+      /* Initialize dynamic refcount.  */
+      tgt->list[0].key->dynamic_refcount = 1;
+
+      if (async > acc_async_sync)
+	acc_dev->openacc.async_set_async_func (acc_async_sync);

       gomp_mutex_lock (&acc_dev->lock);

@@ -499,53 +535,71 @@ present_create_copy (unsigned f, void *h
 void *
 acc_create (void *h, size_t s)
 {
-  return present_create_copy (FLAG_CREATE, h, s);
+  return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync);
 }

-void *
-acc_copyin (void *h, size_t s)
+void
+acc_create_async (void *h, size_t s, int async)
 {
-  return present_create_copy (FLAG_CREATE | FLAG_COPY, h, s);
+  present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async);
 }

+/* acc_present_or_create used to be what acc_create is now.  */
+/* acc_pcreate is acc_present_or_create by a different name.  */
+#ifdef HAVE_ATTRIBUTE_ALIAS
+strong_alias (acc_create, acc_present_or_create)
+strong_alias (acc_create, acc_pcreate)
+#else
 void *
 acc_present_or_create (void *h, size_t s)
 {
-  return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s);
+  return acc_create (h, s);
 }

-/* acc_pcreate is acc_present_or_create by a different name.  */
-#ifdef HAVE_ATTRIBUTE_ALIAS
-strong_alias (acc_present_or_create, acc_pcreate)
-#else
 void *
 acc_pcreate (void *h, size_t s)
 {
-  return acc_present_or_create (h, s);
+  return acc_create (h, s);
 }
 #endif

 void *
-acc_present_or_copyin (void *h, size_t s)
+acc_copyin (void *h, size_t s)
+{
+  return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s,
+			      acc_async_sync);
+}
+
+void
+acc_copyin_async (void *h, size_t s, int async)
 {
-  return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s);
+  present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async);
 }

+/* acc_present_or_copyin used to be what acc_copyin is now.  */
 /* acc_pcopyin is acc_present_or_copyin by a different name.  */
 #ifdef HAVE_ATTRIBUTE_ALIAS
-strong_alias (acc_present_or_copyin, acc_pcopyin)
+strong_alias (acc_copyin, acc_present_or_copyin)
+strong_alias (acc_copyin, acc_pcopyin)
 #else
 void *
+acc_present_or_copyin (void *h, size_t s)
+{
+  return acc_copyin (h, s);
+}
+
+void *
 acc_pcopyin (void *h, size_t s)
 {
-  return acc_present_or_copyin (h, s);
+  return acc_copyin (h, s);
 }
 #endif

-#define FLAG_COPYOUT (1 << 0)
+#define FLAG_COPYOUT  (1 << 0)
+#define FLAG_FINALIZE (1 << 1)

 static void
-delete_copyout (unsigned f, void *h, size_t s, const char *libfnname)
+delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname)
 {
   size_t host_size;
   splay_tree_key n;
@@ -581,31 +635,111 @@ delete_copyout (unsigned f, void *h, siz
 		  (void *) n->host_start, (int) host_size, (void *) h, (int) s);
     }

-  gomp_mutex_unlock (&acc_dev->lock);
+  if (n->refcount == REFCOUNT_INFINITY)
+    {
+      n->refcount = 0;
+      n->dynamic_refcount = 0;
+    }
+  if (n->refcount < n->dynamic_refcount)
+    {
+      gomp_mutex_unlock (&acc_dev->lock);
+      gomp_fatal ("Dynamic reference counting assert fail\n");
+    }

-  if (f & FLAG_COPYOUT)
-    acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
+  if (f & FLAG_FINALIZE)
+    {
+      n->refcount -= n->dynamic_refcount;
+      n->dynamic_refcount = 0;
+    }
+  else if (n->dynamic_refcount)
+    {
+      n->dynamic_refcount--;
+      n->refcount--;
+    }
+
+  if (n->refcount == 0)
+    {
+      if (n->tgt->refcount == 2)
+	{
+	  struct target_mem_desc *tp, *t;
+	  for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
+	       tp = t, t = t->prev)
+	    if (n->tgt == t)
+	      {
+		if (tp)
+		  tp->prev = t->prev;
+		else
+		  acc_dev->openacc.data_environ = t->prev;
+		break;
+	      }
+	}
+
+      if (f & FLAG_COPYOUT)
+	{
+	  if (async > acc_async_sync)
+	    acc_dev->openacc.async_set_async_func (async);
+	  acc_dev->dev2host_func (acc_dev->target_id, h, d, s);
+	  if (async > acc_async_sync)
+	    acc_dev->openacc.async_set_async_func (acc_async_sync);
+	}

-  acc_unmap_data (h);
+      gomp_remove_var (acc_dev, n);
+    }

-  if (!acc_dev->free_func (acc_dev->target_id, d))
-    gomp_fatal ("error in freeing device memory in %s", libfnname);
+  gomp_mutex_unlock (&acc_dev->lock);
 }

 void
 acc_delete (void *h , size_t s)
 {
-  delete_copyout (0, h, s, __FUNCTION__);
+  delete_copyout (0, h, s, acc_async_sync, __FUNCTION__);
+}
+
+void
+acc_delete_async (void *h , size_t s, int async)
+{
+  delete_copyout (0, h, s, async, __FUNCTION__);
+}
+
+void
+acc_delete_finalize (void *h , size_t s)
+{
+  delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__);
+}
+
+void
+acc_delete_finalize_async (void *h , size_t s, int async)
+{
+  delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__);
 }

 void
 acc_copyout (void *h, size_t s)
 {
-  delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__);
+  delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__);
+}
+
+void
+acc_copyout_async (void *h, size_t s, int async)
+{
+  delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__);
+}
+
+void
+acc_copyout_finalize (void *h, size_t s)
+{
+  delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, acc_async_sync,
+		  __FUNCTION__);
+}
+
+void
+acc_copyout_finalize_async (void *h, size_t s, int async)
+{
+  delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__);
 }

 static void
-update_dev_host (int is_dev, void *h, size_t s)
+update_dev_host (int is_dev, void *h, size_t s, int async)
 {
   splay_tree_key n;
   void *d;
@@ -631,24 +765,42 @@ update_dev_host (int is_dev, void *h, si
   d = (void *) (n->tgt->tgt_start + n->tgt_offset
 		+ (uintptr_t) h - n->host_start);

+  if (async > acc_async_sync)
+    acc_dev->openacc.async_set_async_func (async);
+
   if (is_dev)
     acc_dev->host2dev_func (acc_dev->target_id, d, h, s);
   else
     acc_dev->dev2host_func (acc_dev->target_id, h, d, s);

+  if (async > acc_async_sync)
+    acc_dev->openacc.async_set_async_func (acc_async_sync);
+
   gomp_mutex_unlock (&acc_dev->lock);
 }

 void
 acc_update_device (void *h, size_t s)
 {
-  update_dev_host (1, h, s);
+  update_dev_host (1, h, s, acc_async_sync);
+}
+
+void
+acc_update_device_async (void *h, size_t s, int async)
+{
+  update_dev_host (1, h, s, async);
 }

 void
 acc_update_self (void *h, size_t s)
 {
-  update_dev_host (0, h, s);
+  update_dev_host (0, h, s, acc_async_sync);
+}
+
+void
+acc_update_self_async (void *h, size_t s, int async)
+{
+  update_dev_host (0, h, s, async);
 }

 void
@@ -659,11 +811,37 @@ gomp_acc_insert_pointer (size_t mapnum,
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;

+  if (acc_is_present (*hostaddrs, *sizes))
+    {
+      splay_tree_key n;
+      gomp_mutex_lock (&acc_dev->lock);
+      n = lookup_host (acc_dev, *hostaddrs, *sizes);
+      gomp_mutex_unlock (&acc_dev->lock);
+
+      tgt = n->tgt;
+      for (size_t i = 0; i < tgt->list_count; i++)
+	if (tgt->list[i].key == n)
+	  {
+	    for (size_t j = 0; j < mapnum; j++)
+	      if (i + j < tgt->list_count && tgt->list[i + j].key)
+		{
+		  tgt->list[i + j].key->refcount++;
+		  tgt->list[i + j].key->dynamic_refcount++;
+		}
+	    return;
+	  }
+      /* Should not reach here.  */
+      gomp_fatal ("Dynamic refcount incrementing failed for pointer/pset");
+    }
+
   gomp_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
   tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs,
 		       NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC);
   gomp_debug (0, "  %s: mappings prepared\n", __FUNCTION__);

+  /* Initialize dynamic refcount.  */
+  tgt->list[0].key->dynamic_refcount = 1;
+
   gomp_mutex_lock (&acc_dev->lock);
   tgt->prev = acc_dev->openacc.data_environ;
   acc_dev->openacc.data_environ = tgt;
@@ -671,7 +849,8 @@ gomp_acc_insert_pointer (size_t mapnum,
 }

 void
-gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
+gomp_acc_remove_pointer (void *h, size_t s, bool force_copyfrom, int async,
+			 int finalize, int mapnum)
 {
   struct goacc_thread *thr = goacc_thread ();
   struct gomp_device_descr *acc_dev = thr->dev;
@@ -679,6 +858,9 @@ gomp_acc_remove_pointer (void *h, bool f
   struct target_mem_desc *t;
   int minrefs = (mapnum == 1) ? 2 : 3;

+  if (!acc_is_present (h, s))
+    return;
+
   gomp_mutex_lock (&acc_dev->lock);

   n = lookup_host (acc_dev, h, 1);
@@ -693,40 +875,65 @@ gomp_acc_remove_pointer (void *h, bool f

   t = n->tgt;

-  struct target_mem_desc *tp;
+  if (n->refcount < n->dynamic_refcount)
+    {
+      gomp_mutex_unlock (&acc_dev->lock);
+      gomp_fatal ("Dynamic reference counting assert fail\n");
+    }

-  if (t->refcount == minrefs)
+  if (finalize)
     {
-      /* This is the last reference, so pull the descriptor off the
-	 chain. This avoids gomp_unmap_vars via gomp_unmap_tgt from
-	 freeing the device memory. */
-      t->tgt_end = 0;
-      t->to_free = 0;
+      n->refcount -= n->dynamic_refcount;
+      n->dynamic_refcount = 0;
+    }
+  else if (n->dynamic_refcount)
+    {
+      n->dynamic_refcount--;
+      n->refcount--;
+    }

-      for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
-	   tp = t, t = t->prev)
+  gomp_mutex_unlock (&acc_dev->lock);
+
+  if (n->refcount == 0)
+    {
+      if (t->refcount == minrefs)
 	{
-	  if (n->tgt == t)
+	  /* This is the last reference, so pull the descriptor off the
+	     chain. This prevents gomp_unmap_vars via gomp_unmap_tgt from
+	     freeing the device memory. */
+	  struct target_mem_desc *tp;
+	  for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
+	       tp = t, t = t->prev)
 	    {
-	      if (tp)
-		tp->prev = t->prev;
-	      else
-		acc_dev->openacc.data_environ = t->prev;
-	      break;
+	      if (n->tgt == t)
+		{
+		  if (tp)
+		    tp->prev = t->prev;
+		  else
+		    acc_dev->openacc.data_environ = t->prev;
+		  break;
+		}
 	    }
 	}
-    }

-  if (force_copyfrom)
-    t->list[0].copy_from = 1;
+      /* Set refcount to 1 to allow gomp_unmap_vars to unmap it.  */
+      n->refcount = 1;
+      t->refcount = minrefs;
+      for (size_t i = 0; i < t->list_count; i++)
+	if (t->list[i].key == n)
+	  {
+	    t->list[i].copy_from = force_copyfrom ? 1 : 0;
+	    break;
+	  }

-  gomp_mutex_unlock (&acc_dev->lock);
+      /* If running synchronously, unmap immediately.  */
+      if (async < acc_async_noval)
+	gomp_unmap_vars (t, true);
+      else
+	t->device_descr->openacc.register_async_cleanup_func (t, async);
+    }

-  /* If running synchronously, unmap immediately.  */
-  if (async < acc_async_noval)
-    gomp_unmap_vars (t, true);
-  else
-    t->device_descr->openacc.register_async_cleanup_func (t, async);
+  gomp_mutex_unlock (&acc_dev->lock);

   gomp_debug (0, "  %s: mappings restored\n", __FUNCTION__);
 }
--- libgomp/env.c.jj	2018-04-25 09:40:31.924655586 +0200
+++ libgomp/env.c	2019-05-07 18:46:36.482110438 +0200
@@ -88,8 +88,12 @@ void **gomp_places_list;
 unsigned long gomp_places_list_len;
 int gomp_debug_var;
 unsigned int gomp_num_teams_var;
+bool gomp_display_affinity_var;
+char *gomp_affinity_format_var = "level %L thread %i affinity %A";
+size_t gomp_affinity_format_len;
 char *goacc_device_type;
 int goacc_device_num;
+int goacc_default_dims[GOMP_DIM_MAX];

 #ifndef LIBGOMP_OFFLOADED_ONLY

@@ -100,6 +104,7 @@ parse_schedule (void)
 {
   char *env, *end;
   unsigned long value;
+  int monotonic = 0;

   env = getenv ("OMP_SCHEDULE");
   if (env == NULL)
@@ -107,6 +112,26 @@ parse_schedule (void)

   while (isspace ((unsigned char) *env))
     ++env;
+  if (strncasecmp (env, "monotonic", 9) == 0)
+    {
+      monotonic = 1;
+      env += 9;
+    }
+  else if (strncasecmp (env, "nonmonotonic", 12) == 0)
+    {
+      monotonic = -1;
+      env += 12;
+    }
+  if (monotonic)
+    {
+      while (isspace ((unsigned char) *env))
+	++env;
+      if (*env != ':')
+	goto unknown;
+      ++env;
+      while (isspace ((unsigned char) *env))
+	++env;
+    }
   if (strncasecmp (env, "static", 6) == 0)
     {
       gomp_global_icv.run_sched_var = GFS_STATIC;
@@ -130,12 +155,16 @@ parse_schedule (void)
   else
     goto unknown;

+  if (monotonic == 1
+      || (monotonic == 0 && gomp_global_icv.run_sched_var == GFS_STATIC))
+    gomp_global_icv.run_sched_var |= GFS_MONOTONIC;
+
   while (isspace ((unsigned char) *env))
     ++env;
   if (*env == '\0')
     {
       gomp_global_icv.run_sched_chunk_size
-	= gomp_global_icv.run_sched_var != GFS_STATIC;
+	= (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) != GFS_STATIC;
       return;
     }
   if (*env++ != ',')
@@ -158,7 +187,8 @@ parse_schedule (void)
   if ((int)value != value)
     goto invalid;

-  if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC)
+  if (value == 0
+      && (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) != GFS_STATIC)
     value = 1;
   gomp_global_icv.run_sched_chunk_size = value;
   return;
@@ -1066,6 +1096,36 @@ parse_acc_device_type (void)
 }

 static void
+parse_gomp_openacc_dim (void)
+{
+  /* The syntax is the same as for the -fopenacc-dim compilation option.  */
+  const char *var_name = "GOMP_OPENACC_DIM";
+  const char *env_var = getenv (var_name);
+  if (!env_var)
+    return;
+
+  const char *pos = env_var;
+  int i;
+  for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+    {
+      if (i && *pos++ != ':')
+	break;
+
+      if (*pos == ':')
+	continue;
+
+      const char *eptr;
+      errno = 0;
+      long val = strtol (pos, (char **)&eptr, 10);
+      if (errno || val < 0 || (unsigned)val != val)
+	break;
+
+      goacc_default_dims[i] = (int)val;
+      pos = eptr;
+    }
+}
+
+static void
 handle_omp_display_env (unsigned long stacksize, int wait_policy)
 {
   const char *env;
@@ -1119,19 +1179,34 @@ handle_omp_display_env (unsigned long st
   fputs ("'\n", stderr);

   fprintf (stderr, "  OMP_SCHEDULE = '");
-  switch (gomp_global_icv.run_sched_var)
+  if ((gomp_global_icv.run_sched_var & GFS_MONOTONIC))
+    {
+      if (gomp_global_icv.run_sched_var != (GFS_MONOTONIC | GFS_STATIC))
+	fputs ("MONOTONIC:", stderr);
+    }
+  else if (gomp_global_icv.run_sched_var == GFS_STATIC)
+    fputs ("NONMONOTONIC:", stderr);
+  switch (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC)
     {
     case GFS_RUNTIME:
       fputs ("RUNTIME", stderr);
+      if (gomp_global_icv.run_sched_chunk_size != 1)
+	fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size);
       break;
     case GFS_STATIC:
       fputs ("STATIC", stderr);
+      if (gomp_global_icv.run_sched_chunk_size != 0)
+	fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size);
       break;
     case GFS_DYNAMIC:
       fputs ("DYNAMIC", stderr);
+      if (gomp_global_icv.run_sched_chunk_size != 1)
+	fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size);
       break;
     case GFS_GUIDED:
       fputs ("GUIDED", stderr);
+      if (gomp_global_icv.run_sched_chunk_size != 1)
+	fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size);
       break;
     case GFS_AUTO:
       fputs ("AUTO", stderr);
@@ -1197,6 +1272,10 @@ handle_omp_display_env (unsigned long st
 	   gomp_global_icv.default_device_var);
   fprintf (stderr, "  OMP_MAX_TASK_PRIORITY = '%d'\n",
 	   gomp_max_task_priority_var);
+  fprintf (stderr, "  OMP_DISPLAY_AFFINITY = '%s'\n",
+	   gomp_display_affinity_var ? "TRUE" : "FALSE");
+  fprintf (stderr, "  OMP_AFFINITY_FORMAT = '%s'\n",
+	   gomp_affinity_format_var);

   if (verbose)
     {
@@ -1228,6 +1307,7 @@ initialize_env (void)
   parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var);
   parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var);
   parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var);
+  parse_boolean ("OMP_DISPLAY_AFFINITY", &gomp_display_affinity_var);
   parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true);
   parse_int ("OMP_MAX_TASK_PRIORITY", &gomp_max_task_priority_var, true);
   parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var,
@@ -1277,6 +1357,13 @@ initialize_env (void)
     }
   if (gomp_global_icv.bind_var != omp_proc_bind_false)
     gomp_init_affinity ();
+
+  {
+    const char *env = getenv ("OMP_AFFINITY_FORMAT");
+    if (env != NULL)
+      gomp_set_affinity_format (env, strlen (env));
+  }
+
   wait_policy = parse_wait_policy ();
   if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var))
     {
@@ -1302,7 +1389,6 @@ initialize_env (void)

   /* Not strictly environment related, but ordering constructors is tricky.  */
   pthread_attr_init (&gomp_thread_attr);
-  pthread_attr_setdetachstate (&gomp_thread_attr, PTHREAD_CREATE_DETACHED);

   if (parse_stacksize ("OMP_STACKSIZE", &stacksize)
       || parse_stacksize ("GOMP_STACKSIZE", &stacksize)
@@ -1336,6 +1422,7 @@ initialize_env (void)
     goacc_device_num = 0;

   parse_acc_device_type ();
+  parse_gomp_openacc_dim ();

   goacc_runtime_initialize ();
 }
--- libgomp/fortran.c.jj	2018-04-25 09:40:31.913655581 +0200
+++ libgomp/fortran.c	2019-05-07 18:46:36.491110295 +0200
@@ -28,6 +28,8 @@
 #include "libgomp.h"
 #include "libgomp_f.h"
 #include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
 #include <limits.h>

 #ifdef HAVE_ATTRIBUTE_ALIAS
@@ -82,6 +84,8 @@ ialias_redirect (omp_get_team_num)
 ialias_redirect (omp_is_initial_device)
 ialias_redirect (omp_get_initial_device)
 ialias_redirect (omp_get_max_task_priority)
+ialias_redirect (omp_pause_resource)
+ialias_redirect (omp_pause_resource_all)
 #endif

 #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING
@@ -368,7 +372,9 @@ omp_get_schedule_ (int32_t *kind, int32_
   omp_sched_t k;
   int cs;
   omp_get_schedule (&k, &cs);
-  *kind = k;
+  /* For now mask off GFS_MONOTONIC, because OpenMP 4.5 code will not
+     expect to see it.  */
+  *kind = k & ~GFS_MONOTONIC;
   *chunk_size = cs;
 }

@@ -378,7 +384,8 @@ omp_get_schedule_8_ (int32_t *kind, int6
   omp_sched_t k;
   int cs;
   omp_get_schedule (&k, &cs);
-  *kind = k;
+  /* See above.  */
+  *kind = k & ~GFS_MONOTONIC;
   *chunk_size = cs;
 }

@@ -576,3 +583,96 @@ omp_get_max_task_priority_ (void)
 {
   return omp_get_max_task_priority ();
 }
+
+void
+omp_set_affinity_format_ (const char *format, size_t format_len)
+{
+  gomp_set_affinity_format (format, format_len);
+}
+
+int32_t
+omp_get_affinity_format_ (char *buffer, size_t buffer_len)
+{
+  size_t len = strlen (gomp_affinity_format_var);
+  if (buffer_len)
+    {
+      if (len < buffer_len)
+	{
+	  memcpy (buffer, gomp_affinity_format_var, len);
+	  memset (buffer + len, ' ', buffer_len - len);
+	}
+      else
+	memcpy (buffer, gomp_affinity_format_var, buffer_len);
+    }
+  return len;
+}
+
+void
+omp_display_affinity_ (const char *format, size_t format_len)
+{
+  char *fmt = NULL, fmt_buf[256];
+  char buf[512];
+  if (format_len)
+    {
+      fmt = format_len < 256 ? fmt_buf : gomp_malloc (format_len + 1);
+      memcpy (fmt, format, format_len);
+      fmt[format_len] = '\0';
+    }
+  struct gomp_thread *thr = gomp_thread ();
+  size_t ret
+    = gomp_display_affinity (buf, sizeof buf,
+			     format_len ? fmt : gomp_affinity_format_var,
+			     gomp_thread_self (), &thr->ts, thr->place);
+  if (ret < sizeof buf)
+    {
+      buf[ret] = '\n';
+      gomp_print_string (buf, ret + 1);
+    }
+  else
+    {
+      char *b = gomp_malloc (ret + 1);
+      gomp_display_affinity (buf, sizeof buf,
+			     format_len ? fmt : gomp_affinity_format_var,
+			     gomp_thread_self (), &thr->ts, thr->place);
+      b[ret] = '\n';
+      gomp_print_string (b, ret + 1);
+      free (b);
+    }
+  if (fmt && fmt != fmt_buf)
+    free (fmt);
+}
+
+int32_t
+omp_capture_affinity_ (char *buffer, const char *format,
+		       size_t buffer_len, size_t format_len)
+{
+  char *fmt = NULL, fmt_buf[256];
+  if (format_len)
+    {
+      fmt = format_len < 256 ? fmt_buf : gomp_malloc (format_len + 1);
+      memcpy (fmt, format, format_len);
+      fmt[format_len] = '\0';
+    }
+  struct gomp_thread *thr = gomp_thread ();
+  size_t ret
+    = gomp_display_affinity (buffer, buffer_len,
+			     format_len ? fmt : gomp_affinity_format_var,
+			     gomp_thread_self (), &thr->ts, thr->place);
+  if (fmt && fmt != fmt_buf)
+    free (fmt);
+  if (ret < buffer_len)
+    memset (buffer + ret, ' ', buffer_len - ret);
+  return ret;
+}
+
+int32_t
+omp_pause_resource_ (const int32_t *kind, const int32_t *device_num)
+{
+  return omp_pause_resource (*kind, *device_num);
+}
+
+int32_t
+omp_pause_resource_all_ (const int32_t *kind)
+{
+  return omp_pause_resource_all (*kind);
+}
--- libgomp/configure.tgt.jj	2018-04-25 09:40:31.925655587 +0200
+++ libgomp/configure.tgt	2019-05-07 18:46:36.479110486 +0200
@@ -18,7 +18,7 @@ if test $gcc_cv_have_tls = yes ; then
 	;;

     *-*-linux* | *-*-gnu*)
-	XCFLAGS="${XCFLAGS} -ftls-model=initial-exec"
+	XCFLAGS="${XCFLAGS} -ftls-model=initial-exec -DUSING_INITIAL_EXEC_TLS"
 	;;

     *-*-rtems*)
--- libgomp/icv-device.c.jj	2018-04-25 09:40:31.925655587 +0200
+++ libgomp/icv-device.c	2019-05-07 18:46:36.513109943 +0200
@@ -49,20 +49,6 @@ omp_get_num_devices (void)
 }

 int
-omp_get_num_teams (void)
-{
-  /* Hardcoded to 1 on host, MIC, HSAIL?  Maybe variable on PTX.  */
-  return 1;
-}
-
-int
-omp_get_team_num (void)
-{
-  /* Hardcoded to 0 on host, MIC, HSAIL?  Maybe variable on PTX.  */
-  return 0;
-}
-
-int
 omp_is_initial_device (void)
 {
   /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX.  */
@@ -72,6 +58,4 @@ omp_is_initial_device (void)
 ialias (omp_set_default_device)
 ialias (omp_get_default_device)
 ialias (omp_get_num_devices)
-ialias (omp_get_num_teams)
-ialias (omp_get_team_num)
 ialias (omp_is_initial_device)
--- libgomp/Makefile.in.jj	2018-04-25 09:40:31.320655306 +0200
+++ libgomp/Makefile.in	2019-05-07 20:00:01.082077522 +0200
@@ -90,7 +90,7 @@ DIST_COMMON = $(top_srcdir)/plugin/Makef
 	$(srcdir)/libgomp.spec.in $(srcdir)/../depcomp
 @PLUGIN_NVPTX_TRUE@am__append_1 = libgomp-plugin-nvptx.la
 @PLUGIN_HSA_TRUE@am__append_2 = libgomp-plugin-hsa.la
-@USE_FORTRAN_TRUE@am__append_3 = openacc.f90
+@USE_FORTRAN_TRUE@am__append_3 = openacc2.f90
 subdir = .
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \
@@ -172,7 +172,7 @@ libgomp_plugin_nvptx_la_LINK = $(LIBTOOL
 @PLUGIN_NVPTX_TRUE@am_libgomp_plugin_nvptx_la_rpath = -rpath \
 @PLUGIN_NVPTX_TRUE@	$(toolexeclibdir)
 libgomp_la_LIBADD =
-@USE_FORTRAN_TRUE@am__objects_1 = openacc.lo
+@USE_FORTRAN_TRUE@am__objects_1 = openacc2.lo
 am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
 	env.lo error.lo icv.lo icv-device.lo iter.lo iter_ull.lo \
 	loop.lo loop_ull.lo ordered.lo parallel.lo sections.lo \
@@ -180,7 +180,8 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.
 	sem.lo bar.lo ptrlock.lo time.lo fortran.lo affinity.lo \
 	target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
 	oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
-	oacc-plugin.lo oacc-cuda.lo priority_queue.lo $(am__objects_1)
+	oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \
+	teams.lo $(am__objects_1)
 libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
 DEFAULT_INCLUDES = -I.@am__isrc@
 depcomp = $(SHELL) $(top_srcdir)/../depcomp
@@ -380,6 +381,7 @@ mkdir_p = @mkdir_p@
 multi_basedir = @multi_basedir@
 offload_additional_lib_paths = @offload_additional_lib_paths@
 offload_additional_options = @offload_additional_options@
+offload_plugins = @offload_plugins@
 offload_targets = @offload_targets@
 oldincludedir = @oldincludedir@
 pdfdir = @pdfdir@
@@ -436,7 +438,7 @@ libgomp_la_SOURCES = alloc.c atomic.c ba
 	affinity.c target.c splay-tree.c libgomp-plugin.c \
 	oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
 	oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
-	$(am__append_3)
+	affinity-fmt.c teams.c $(am__append_3)

 # Nvidia PTX OpenACC plugin.
 @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
@@ -599,6 +601,7 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c

+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity-fmt.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alloc.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic.Plo@am__quote@
@@ -638,6 +641,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/teams.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/time.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/work.Plo@am__quote@

@@ -1292,8 +1296,6 @@ omp_lib_kinds.mod: omp_lib.mod
 	:
 openacc_kinds.mod: openacc.mod
 	:
-openacc.mod: openacc.lo
-	:
 %.mod: %.f90
 	$(FC) $(FCFLAGS) -fsyntax-only $<
 fortran.lo: libgomp_f.h
--- libgomp/plugin/cuda/cuda.h.jj	2018-04-25 09:40:31.914655581 +0200
+++ libgomp/plugin/cuda/cuda.h	2019-05-07 18:46:36.533109624 +0200
@@ -44,6 +44,7 @@ typedef void *CUevent;
 typedef void *CUfunction;
 typedef void *CUlinkState;
 typedef void *CUmodule;
+typedef size_t (*CUoccupancyB2DSize)(int);
 typedef void *CUstream;

 typedef enum {
@@ -88,6 +89,7 @@ typedef enum {
   CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,
   CU_JIT_ERROR_LOG_BUFFER = 5,
   CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,
+  CU_JIT_OPTIMIZATION_LEVEL = 7,
   CU_JIT_LOG_VERBOSE = 12
 } CUjit_option;

@@ -169,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr
 CUresult cuModuleLoad (CUmodule *, const char *);
 CUresult cuModuleLoadData (CUmodule *, const void *);
 CUresult cuModuleUnload (CUmodule);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+					  CUoccupancyB2DSize, size_t, int);
 CUresult cuStreamCreate (CUstream *, unsigned);
 #define cuStreamDestroy cuStreamDestroy_v2
 CUresult cuStreamDestroy (CUstream);
--- libgomp/plugin/cuda-lib.def.jj	2019-05-07 18:46:36.533109624 +0200
+++ libgomp/plugin/cuda-lib.def	2019-05-07 18:46:36.533109624 +0200
@@ -0,0 +1,49 @@
+CUDA_ONE_CALL (cuCtxCreate)
+CUDA_ONE_CALL (cuCtxDestroy)
+CUDA_ONE_CALL (cuCtxGetCurrent)
+CUDA_ONE_CALL (cuCtxGetDevice)
+CUDA_ONE_CALL (cuCtxPopCurrent)
+CUDA_ONE_CALL (cuCtxPushCurrent)
+CUDA_ONE_CALL (cuCtxSynchronize)
+CUDA_ONE_CALL (cuDeviceGet)
+CUDA_ONE_CALL (cuDeviceGetAttribute)
+CUDA_ONE_CALL (cuDeviceGetCount)
+CUDA_ONE_CALL (cuEventCreate)
+CUDA_ONE_CALL (cuEventDestroy)
+CUDA_ONE_CALL (cuEventElapsedTime)
+CUDA_ONE_CALL (cuEventQuery)
+CUDA_ONE_CALL (cuEventRecord)
+CUDA_ONE_CALL (cuEventSynchronize)
+CUDA_ONE_CALL (cuFuncGetAttribute)
+CUDA_ONE_CALL_MAYBE_NULL (cuGetErrorString)
+CUDA_ONE_CALL (cuInit)
+CUDA_ONE_CALL (cuLaunchKernel)
+CUDA_ONE_CALL (cuLinkAddData)
+CUDA_ONE_CALL_MAYBE_NULL (cuLinkAddData_v2)
+CUDA_ONE_CALL (cuLinkComplete)
+CUDA_ONE_CALL (cuLinkCreate)
+CUDA_ONE_CALL_MAYBE_NULL (cuLinkCreate_v2)
+CUDA_ONE_CALL (cuLinkDestroy)
+CUDA_ONE_CALL (cuMemAlloc)
+CUDA_ONE_CALL (cuMemAllocHost)
+CUDA_ONE_CALL (cuMemcpy)
+CUDA_ONE_CALL (cuMemcpyDtoDAsync)
+CUDA_ONE_CALL (cuMemcpyDtoH)
+CUDA_ONE_CALL (cuMemcpyDtoHAsync)
+CUDA_ONE_CALL (cuMemcpyHtoD)
+CUDA_ONE_CALL (cuMemcpyHtoDAsync)
+CUDA_ONE_CALL (cuMemFree)
+CUDA_ONE_CALL (cuMemFreeHost)
+CUDA_ONE_CALL (cuMemGetAddressRange)
+CUDA_ONE_CALL (cuMemHostGetDevicePointer)
+CUDA_ONE_CALL (cuModuleGetFunction)
+CUDA_ONE_CALL (cuModuleGetGlobal)
+CUDA_ONE_CALL (cuModuleLoad)
+CUDA_ONE_CALL (cuModuleLoadData)
+CUDA_ONE_CALL (cuModuleUnload)
+CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize)
+CUDA_ONE_CALL (cuStreamCreate)
+CUDA_ONE_CALL (cuStreamDestroy)
+CUDA_ONE_CALL (cuStreamQuery)
+CUDA_ONE_CALL (cuStreamSynchronize)
+CUDA_ONE_CALL (cuStreamWaitEvent)
--- libgomp/plugin/plugin-nvptx.c.jj	2018-04-25 09:40:31.915655582 +0200
+++ libgomp/plugin/plugin-nvptx.c	2019-05-07 18:46:36.535109592 +0200
@@ -31,6 +31,7 @@
    is not clear as to what that state might be.  Or how one might
    propagate it from one thread to another.  */

+#define _GNU_SOURCE
 #include "openacc.h"
 #include "config.h"
 #include "libgomp-plugin.h"
@@ -48,60 +49,41 @@
 #include <assert.h>
 #include <errno.h>

+#if CUDA_VERSION < 6000
+extern CUresult cuGetErrorString (CUresult, const char **);
+#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
+#endif
+
+#if CUDA_VERSION >= 6050
+#undef cuLinkCreate
+#undef cuLinkAddData
+CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
+			const char *, unsigned, CUjit_option *, void **);
+CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
+#else
+typedef size_t (*CUoccupancyB2DSize)(int);
+CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
+			   const char *, unsigned, CUjit_option *, void **);
+CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+					  CUoccupancyB2DSize, size_t, int);
+#endif
+
+#define DO_PRAGMA(x) _Pragma (#x)
+
 #if PLUGIN_NVPTX_DYNAMIC
 # include <dlfcn.h>

-# define CUDA_CALLS \
-CUDA_ONE_CALL (cuCtxCreate)		\
-CUDA_ONE_CALL (cuCtxDestroy)		\
-CUDA_ONE_CALL (cuCtxGetCurrent)		\
-CUDA_ONE_CALL (cuCtxGetDevice)		\
-CUDA_ONE_CALL (cuCtxPopCurrent)		\
-CUDA_ONE_CALL (cuCtxPushCurrent)	\
-CUDA_ONE_CALL (cuCtxSynchronize)	\
-CUDA_ONE_CALL (cuDeviceGet)		\
-CUDA_ONE_CALL (cuDeviceGetAttribute)	\
-CUDA_ONE_CALL (cuDeviceGetCount)	\
-CUDA_ONE_CALL (cuEventCreate)		\
-CUDA_ONE_CALL (cuEventDestroy)		\
-CUDA_ONE_CALL (cuEventElapsedTime)	\
-CUDA_ONE_CALL (cuEventQuery)		\
-CUDA_ONE_CALL (cuEventRecord)		\
-CUDA_ONE_CALL (cuEventSynchronize)	\
-CUDA_ONE_CALL (cuFuncGetAttribute)	\
-CUDA_ONE_CALL (cuGetErrorString)	\
-CUDA_ONE_CALL (cuInit)			\
-CUDA_ONE_CALL (cuLaunchKernel)		\
-CUDA_ONE_CALL (cuLinkAddData)		\
-CUDA_ONE_CALL (cuLinkComplete)		\
-CUDA_ONE_CALL (cuLinkCreate)		\
-CUDA_ONE_CALL (cuLinkDestroy)		\
-CUDA_ONE_CALL (cuMemAlloc)		\
-CUDA_ONE_CALL (cuMemAllocHost)		\
-CUDA_ONE_CALL (cuMemcpy)		\
-CUDA_ONE_CALL (cuMemcpyDtoDAsync)	\
-CUDA_ONE_CALL (cuMemcpyDtoH)		\
-CUDA_ONE_CALL (cuMemcpyDtoHAsync)	\
-CUDA_ONE_CALL (cuMemcpyHtoD)		\
-CUDA_ONE_CALL (cuMemcpyHtoDAsync)	\
-CUDA_ONE_CALL (cuMemFree)		\
-CUDA_ONE_CALL (cuMemFreeHost)		\
-CUDA_ONE_CALL (cuMemGetAddressRange)	\
-CUDA_ONE_CALL (cuMemHostGetDevicePointer)\
-CUDA_ONE_CALL (cuModuleGetFunction)	\
-CUDA_ONE_CALL (cuModuleGetGlobal)	\
-CUDA_ONE_CALL (cuModuleLoad)		\
-CUDA_ONE_CALL (cuModuleLoadData)	\
-CUDA_ONE_CALL (cuModuleUnload)		\
-CUDA_ONE_CALL (cuStreamCreate)		\
-CUDA_ONE_CALL (cuStreamDestroy)		\
-CUDA_ONE_CALL (cuStreamQuery)		\
-CUDA_ONE_CALL (cuStreamSynchronize)	\
-CUDA_ONE_CALL (cuStreamWaitEvent)
-# define CUDA_ONE_CALL(call) \
-  __typeof (call) *call;
 struct cuda_lib_s {
-  CUDA_CALLS
+
+# define CUDA_ONE_CALL(call)			\
+  __typeof (call) *call;
+# define CUDA_ONE_CALL_MAYBE_NULL(call)		\
+  CUDA_ONE_CALL (call)
+#include "cuda-lib.def"
+# undef CUDA_ONE_CALL
+# undef CUDA_ONE_CALL_MAYBE_NULL
+
 } cuda_lib;

 /* -1 if init_cuda_lib has not been called yet, false
@@ -120,24 +102,41 @@ init_cuda_lib (void)
   cuda_lib_inited = false;
   if (h == NULL)
     return false;
-# undef CUDA_ONE_CALL
-# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call)
-# define CUDA_ONE_CALL_1(call) \
+
+# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
+# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
+# define CUDA_ONE_CALL_1(call, allow_null)		\
   cuda_lib.call = dlsym (h, #call);	\
-  if (cuda_lib.call == NULL)		\
+  if (!allow_null && cuda_lib.call == NULL)		\
     return false;
-  CUDA_CALLS
+#include "cuda-lib.def"
+# undef CUDA_ONE_CALL
+# undef CUDA_ONE_CALL_1
+# undef CUDA_ONE_CALL_MAYBE_NULL
+
   cuda_lib_inited = true;
   return true;
 }
-# undef CUDA_ONE_CALL
-# undef CUDA_ONE_CALL_1
 # define CUDA_CALL_PREFIX cuda_lib.
 #else
+
+# define CUDA_ONE_CALL(call)
+# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
+#include "cuda-lib.def"
+#undef CUDA_ONE_CALL_MAYBE_NULL
+#undef CUDA_ONE_CALL
+
 # define CUDA_CALL_PREFIX
 # define init_cuda_lib() true
 #endif

+#include "secure_getenv.h"
+
+#undef MIN
+#undef MAX
+#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
+#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
+
 /* Convenience macros for the frequently used CUDA library call and
    error handling sequence as well as CUDA library calls that
    do the error checking themselves or don't do it at all.  */
@@ -171,40 +170,42 @@ init_cuda_lib (void)
 #define CUDA_CALL_NOCHECK(FN, ...)		\
   CUDA_CALL_PREFIX FN (__VA_ARGS__)

+#define CUDA_CALL_EXISTS(FN)			\
+  CUDA_CALL_PREFIX FN
+
 static const char *
 cuda_error (CUresult r)
 {
-#if CUDA_VERSION < 7000
-  /* Specified in documentation and present in library from at least
-     5.5.  Not declared in header file prior to 7.0.  */
-  extern CUresult cuGetErrorString (CUresult, const char **);
-#endif
+  const char *fallback = "unknown cuda error";
   const char *desc;

+  if (!CUDA_CALL_EXISTS (cuGetErrorString))
+    return fallback;
+
   r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
-  if (r != CUDA_SUCCESS)
-    desc = "unknown cuda error";
+  if (r == CUDA_SUCCESS)
+    return desc;

-  return desc;
+  return fallback;
 }

 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;

+struct cuda_map
+{
+  CUdeviceptr d;
+  size_t size;
+  bool active;
+  struct cuda_map *next;
+};
+
 struct ptx_stream
 {
   CUstream stream;
   pthread_t host_thread;
   bool multithreaded;
-
-  CUdeviceptr d;
-  void *h;
-  void *h_begin;
-  void *h_end;
-  void *h_next;
-  void *h_prev;
-  void *h_tail;
-
+  struct cuda_map *map;
   struct ptx_stream *next;
 };

@@ -216,12 +217,64 @@ struct nvptx_thread
   struct ptx_device *ptx_dev;
 };

-struct map
+static struct cuda_map *
+cuda_map_create (size_t size)
 {
-  int     async;
-  size_t  size;
-  char    mappings[0];
-};
+  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
+
+  assert (map);
+
+  map->next = NULL;
+  map->size = size;
+  map->active = false;
+
+  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
+  assert (map->d);
+
+  return map;
+}
+
+static void
+cuda_map_destroy (struct cuda_map *map)
+{
+  if (map->active)
+    /* Possible reasons for the map to be still active:
+       - the associated async kernel might still be running.
+       - the associated async kernel might have finished, but the
+         corresponding event that should trigger the pop_map has not been
+	 processed by event_gc.
+       - the associated sync kernel might have aborted
+
+       The async cases could happen if the user specified an async region
+       without adding a corresponding wait that is guaranteed to be executed
+       (before returning from main, or in an atexit handler).
+       We do not want to deallocate a device pointer that is still being
+       used, so skip it.
+
+       In the sync case, the device pointer is no longer used, but deallocating
+       it using cuMemFree will not succeed, so skip it.
+
+       TODO: Handle this in a more constructive way, by f.i. waiting for streams
+       to finish before de-allocating them (PR88981), or by ensuring the CUDA
+       lib atexit handler is called before rather than after the libgomp plugin
+       atexit handler (PR83795).  */
+    ;
+  else
+    CUDA_CALL_NOCHECK (cuMemFree, map->d);
+
+  free (map);
+}
+
+/* The following map_* routines manage the CUDA device memory that
+   contains the data mapping arguments for cuLaunchKernel.  Each
+   asynchronous PTX stream may have multiple pending kernel
+   invocations, which are launched in a FIFO order.  As such, the map
+   routines maintains a queue of cuLaunchKernel arguments.
+
+   Calls to map_push and map_pop must be guarded by ptx_event_lock.
+   Likewise, calls to map_init and map_fini are guarded by
+   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
+   GOMP_OFFLOAD_fini_device, respectively.  */

 static bool
 map_init (struct ptx_stream *s)
@@ -229,109 +282,83 @@ map_init (struct ptx_stream *s)
   int size = getpagesize ();

   assert (s);
-  assert (!s->d);
-  assert (!s->h);
-
-  CUDA_CALL (cuMemAllocHost, &s->h, size);
-  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);

-  assert (s->h);
+  s->map = cuda_map_create (size);

-  s->h_begin = s->h;
-  s->h_end = s->h_begin + size;
-  s->h_next = s->h_prev = s->h_tail = s->h_begin;
-
-  assert (s->h_next);
-  assert (s->h_end);
   return true;
 }

 static bool
 map_fini (struct ptx_stream *s)
 {
-  CUDA_CALL (cuMemFreeHost, s->h);
+  assert (s->map->next == NULL);
+
+  cuda_map_destroy (s->map);
+
   return true;
 }

 static void
 map_pop (struct ptx_stream *s)
 {
-  struct map *m;
+  struct cuda_map *next;

   assert (s != NULL);
-  assert (s->h_next);
-  assert (s->h_prev);
-  assert (s->h_tail);
-
-  m = s->h_tail;
-
-  s->h_tail += m->size;
-
-  if (s->h_tail >= s->h_end)
-    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
-
-  if (s->h_next == s->h_tail)
-    s->h_prev = s->h_next;

-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
+  if (s->map->next == NULL)
+    {
+      s->map->active = false;
+      return;
+    }

-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  next = s->map->next;
+  cuda_map_destroy (s->map);
+  s->map = next;
 }

-static void
-map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d)
+static CUdeviceptr
+map_push (struct ptx_stream *s, size_t size)
 {
-  int left;
-  int offset;
-  struct map *m;
+  struct cuda_map *map = NULL;
+  struct cuda_map **t;

-  assert (s != NULL);
-
-  left = s->h_end - s->h_next;
-  size += sizeof (struct map);
-
-  assert (s->h_prev);
-  assert (s->h_next);
+  assert (s);
+  assert (s->map);

-  if (size >= left)
+  /* Select an element to push.  */
+  if (s->map->active)
+    map = cuda_map_create (size);
+  else
     {
-      m = s->h_prev;
-      m->size += left;
-      s->h_next = s->h_begin;
-
-      if (s->h_next + size > s->h_end)
-	GOMP_PLUGIN_fatal ("unable to push map");
-    }
-
-  assert (s->h_next);
-
-  m = s->h_next;
-  m->async = async;
-  m->size = size;
+      /* Pop the inactive front element.  */
+      struct cuda_map *pop = s->map;
+      s->map = pop->next;
+      pop->next = NULL;

-  offset = (void *)&m->mappings[0] - s->h;
+      if (pop->size < size)
+	{
+	  cuda_map_destroy (pop);

-  *d = (void *)(s->d + offset);
-  *h = (void *)(s->h + offset);
+	  map = cuda_map_create (size);
+	}
+      else
+	map = pop;
+    }

-  s->h_prev = s->h_next;
-  s->h_next += size;
+  /* Check that the element is as expected.  */
+  assert (map->next == NULL);
+  assert (!map->active);

-  assert (s->h_prev);
-  assert (s->h_next);
+  /* Mark the element active.  */
+  map->active = true;

-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  /* Push the element to the back of the list.  */
+  for (t = &s->map; (*t) != NULL; t = &(*t)->next)
+    ;
+  assert (t != NULL && *t == NULL);
+  *t = map;

-  return;
+  return map->d;
 }

 /* Target data function launch information.  */
@@ -411,6 +438,10 @@ struct ptx_device
   int num_sms;
   int regs_per_block;
   int regs_per_sm;
+  int warp_size;
+  int max_threads_per_block;
+  int max_threads_per_multiprocessor;
+  int default_dims[GOMP_DIM_MAX];

   struct ptx_image_data *images;  /* Images loaded on device.  */
   pthread_mutex_t image_lock;     /* Lock for above list.  */
@@ -458,8 +489,6 @@ init_streams_for_device (struct ptx_devi
   null_stream->stream = NULL;
   null_stream->host_thread = pthread_self ();
   null_stream->multithreaded = true;
-  null_stream->d = (CUdeviceptr) NULL;
-  null_stream->h = NULL;
   if (!map_init (null_stream))
     return false;

@@ -594,8 +623,6 @@ select_stream_for_async (int async, pthr
 	  s->host_thread = thread;
 	  s->multithreaded = false;

-	  s->d = (CUdeviceptr) NULL;
-	  s->h = NULL;
 	  if (!map_init (s))
 	    {
 	      pthread_mutex_unlock (&ptx_dev->stream_lock);
@@ -777,9 +804,11 @@ nvptx_open_device (int n)
 		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
   ptx_dev->regs_per_block = pi;

-  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
+  /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
      in CUDA 6.0 and newer.  */
-  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev);
+  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
+			 dev);
   /* Fallback: use limit of registers per block, which is usually equal.  */
   if (r == CUDA_ERROR_INVALID_VALUE)
     pi = ptx_dev->regs_per_block;
@@ -797,12 +826,24 @@ nvptx_open_device (int n)
       GOMP_PLUGIN_error ("Only warp size 32 is supported");
       return NULL;
     }
+  ptx_dev->warp_size = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
+		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+  ptx_dev->max_threads_per_block = pi;
+
+  CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
+		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
+  ptx_dev->max_threads_per_multiprocessor = pi;

   r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines,
 			 CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
   if (r != CUDA_SUCCESS)
     async_engines = 1;

+  for (int i = 0; i != GOMP_DIM_MAX; i++)
+    ptx_dev->default_dims[i] = 0;
+
   ptx_dev->images = NULL;
   pthread_mutex_init (&ptx_dev->image_lock, NULL);

@@ -876,12 +917,42 @@ notify_var (const char *var_name, const
     GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
 }

+static void
+process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
+{
+  const char *var_name = "GOMP_NVPTX_JIT";
+  const char *env_var = secure_getenv (var_name);
+  notify_var (var_name, env_var);
+
+  if (env_var == NULL)
+    return;
+
+  const char *c = env_var;
+  while (*c != '\0')
+    {
+      while (*c == ' ')
+	c++;
+
+      if (c[0] == '-' && c[1] == 'O'
+	  && '0' <= c[2] && c[2] <= '4'
+	  && (c[3] == '\0' || c[3] == ' '))
+	{
+	  *gomp_nvptx_o = c[2] - '0';
+	  c += 3;
+	  continue;
+	}
+
+      GOMP_PLUGIN_error ("Error parsing %s", var_name);
+      break;
+    }
+}
+
 static bool
 link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
 	  unsigned num_objs)
 {
-  CUjit_option opts[6];
-  void *optvals[6];
+  CUjit_option opts[7];
+  void *optvals[7];
   float elapsed = 0.0;
   char elog[1024];
   char ilog[16384];
@@ -908,16 +979,41 @@ link_ptx (CUmodule *module, const struct
   opts[5] = CU_JIT_LOG_VERBOSE;
   optvals[5] = (void *) 1;

-  CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate);
+  static intptr_t gomp_nvptx_o = -1;
+
+  static bool init_done = false;
+  if (!init_done)
+    {
+      process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
+      init_done = true;
+  }
+
+  int nopts = 6;
+  if (gomp_nvptx_o != -1)
+    {
+      opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
+      optvals[nopts] = (void *) gomp_nvptx_o;
+      nopts++;
+    }
+
+  if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
+    CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
+  else
+    CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);

   for (; num_objs--; ptx_objs++)
     {
       /* cuLinkAddData's 'data' argument erroneously omits the const
 	 qualifier.  */
       GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
-      r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
-			     (char *) ptx_objs->code, ptx_objs->size,
-			     0, 0, 0, 0);
+      if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
+	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
+			       (char *) ptx_objs->code, ptx_objs->size,
+			       0, 0, 0, 0);
+      else
+	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
+			       (char *) ptx_objs->code, ptx_objs->size,
+			       0, 0, 0, 0);
       if (r != CUDA_SUCCESS)
 	{
 	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
@@ -1067,8 +1163,10 @@ nvptx_exec (void (*fn), size_t mapnum, v
   int i;
   struct ptx_stream *dev_str;
   void *kargs[1];
-  void *hp, *dp;
+  void *hp;
+  CUdeviceptr dp = 0;
   struct nvptx_thread *nvthd = nvptx_thread ();
+  int warp_size = nvthd->ptx_dev->warp_size;
   const char *maybe_abort_msg = "(perhaps abort was called)";

   function = targ_fn->fn;
@@ -1090,68 +1188,36 @@ nvptx_exec (void (*fn), size_t mapnum, v

   if (seen_zero)
     {
-      /* See if the user provided GOMP_OPENACC_DIM environment
-	 variable to specify runtime defaults. */
-      static int default_dims[GOMP_DIM_MAX];
-
       pthread_mutex_lock (&ptx_dev_lock);
-      if (!default_dims[0])
-	{
-	  const char *var_name = "GOMP_OPENACC_DIM";
-	  /* We only read the environment variable once.  You can't
-	     change it in the middle of execution.  The syntax  is
-	     the same as for the -fopenacc-dim compilation option.  */
-	  const char *env_var = getenv (var_name);
-	  notify_var (var_name, env_var);
-	  if (env_var)
-	    {
-	      const char *pos = env_var;

-	      for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
-		{
-		  if (i && *pos++ != ':')
-		    break;
-		  if (*pos != ':')
-		    {
-		      const char *eptr;
-
-		      errno = 0;
-		      long val = strtol (pos, (char **)&eptr, 10);
-		      if (errno || val < 0 || (unsigned)val != val)
-			break;
-		      default_dims[i] = (int)val;
-		      pos = eptr;
-		    }
-		}
-	    }
+      static int gomp_openacc_dims[GOMP_DIM_MAX];
+      if (!gomp_openacc_dims[0])
+	{
+	  /* See if the user provided GOMP_OPENACC_DIM environment
+	     variable to specify runtime defaults.  */
+	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
+	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
+	}

-	  int warp_size, block_size, dev_size, cpu_size;
-	  CUdevice dev = nvptx_thread()->ptx_dev->dev;
-	  /* 32 is the default for known hardware.  */
-	  int gang = 0, worker = 32, vector = 32;
-	  CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm;
-
-	  cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
-	  cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE;
-	  cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
-	  cu_tpm  = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR;
-
-	  if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb,
-				 dev) == CUDA_SUCCESS
-	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws,
-				    dev) == CUDA_SUCCESS
-	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc,
-				    dev) == CUDA_SUCCESS
-	      && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm,
-				    dev) == CUDA_SUCCESS)
-	    {
-	      GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
-				 " dev_size=%d, cpu_size=%d\n",
-				 warp_size, block_size, dev_size, cpu_size);
-	      gang = (cpu_size / block_size) * dev_size;
-	      worker = block_size / warp_size;
-	      vector = warp_size;
-	    }
+      if (!nvthd->ptx_dev->default_dims[0])
+	{
+	  int default_dims[GOMP_DIM_MAX];
+	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
+	    default_dims[i] = gomp_openacc_dims[i];
+
+	  int gang, worker, vector;
+	  {
+	    int block_size = nvthd->ptx_dev->max_threads_per_block;
+	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
+	    int dev_size = nvthd->ptx_dev->num_sms;
+	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
+			       " dev_size=%d, cpu_size=%d\n",
+			       warp_size, block_size, dev_size, cpu_size);
+
+	    gang = (cpu_size / block_size) * dev_size;
+	    worker = block_size / warp_size;
+	    vector = warp_size;
+	  }

 	  /* There is no upper bound on the gang size.  The best size
 	     matches the hardware configuration.  Logical gangs are
@@ -1172,29 +1238,150 @@ nvptx_exec (void (*fn), size_t mapnum, v
 			     default_dims[GOMP_DIM_GANG],
 			     default_dims[GOMP_DIM_WORKER],
 			     default_dims[GOMP_DIM_VECTOR]);
+
+	  for (i = 0; i != GOMP_DIM_MAX; i++)
+	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
 	}
       pthread_mutex_unlock (&ptx_dev_lock);

-      for (i = 0; i != GOMP_DIM_MAX; i++)
-	if (!dims[i])
-	  dims[i] = default_dims[i];
-    }
-
-  /* This reserves a chunk of a pre-allocated page of memory mapped on both
-     the host and the device. HP is a host pointer to the new chunk, and DP is
-     the corresponding device pointer.  */
-  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);
-
-  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
-
-  /* Copy the array of arguments to the mapped page.  */
-  for (i = 0; i < mapnum; i++)
-    ((void **) hp)[i] = devaddrs[i];
-
-  /* Copy the (device) pointers to arguments to the device (dp and hp might in
-     fact have the same value on a unified-memory system).  */
-  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
-		    mapnum * sizeof (void *));
+      {
+	bool default_dim_p[GOMP_DIM_MAX];
+	for (i = 0; i != GOMP_DIM_MAX; i++)
+	  default_dim_p[i] = !dims[i];
+
+	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
+	  {
+	    for (i = 0; i != GOMP_DIM_MAX; i++)
+	      if (default_dim_p[i])
+		dims[i] = nvthd->ptx_dev->default_dims[i];
+
+	    if (default_dim_p[GOMP_DIM_VECTOR])
+	      dims[GOMP_DIM_VECTOR]
+		= MIN (dims[GOMP_DIM_VECTOR],
+		       (targ_fn->max_threads_per_block / warp_size
+			* warp_size));
+
+	    if (default_dim_p[GOMP_DIM_WORKER])
+	      dims[GOMP_DIM_WORKER]
+		= MIN (dims[GOMP_DIM_WORKER],
+		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
+	  }
+	else
+	  {
+	    /* Handle the case that the compiler allows the runtime to choose
+	       the vector-length conservatively, by ignoring
+	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
+	       it.  */
+	    int vectors = 0;
+	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that
+	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
+	       exceed targ_fn->max_threads_per_block. */
+	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
+	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
+	    int grids, blocks;
+
+	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
+			      &blocks, function, NULL, 0,
+			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
+	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
+			       "grid = %d, block = %d\n", grids, blocks);
+
+	    /* Keep the num_gangs proportional to the block size.  In
+	       the case were a block size is limited by shared-memory
+	       or the register file capacity, the runtime will not
+	       excessively over assign gangs to the multiprocessor
+	       units if their state is going to be swapped out even
+	       more than necessary. The constant factor 2 is there to
+	       prevent threads from idling when there is insufficient
+	       work for them.  */
+	    if (gangs == 0)
+	      gangs = 2 * grids * (blocks / warp_size);
+
+	    if (vectors == 0)
+	      vectors = warp_size;
+
+	    if (workers == 0)
+	      {
+		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
+				      ? vectors
+				      : dims[GOMP_DIM_VECTOR]);
+		workers = blocks / actual_vectors;
+		workers = MAX (workers, 1);
+		/* If we need a per-worker barrier ... .  */
+		if (actual_vectors > 32)
+		  /* Don't use more barriers than available.  */
+		  workers = MIN (workers, 15);
+	      }
+
+	    for (i = 0; i != GOMP_DIM_MAX; i++)
+	      if (default_dim_p[i])
+		switch (i)
+		  {
+		  case GOMP_DIM_GANG: dims[i] = gangs; break;
+		  case GOMP_DIM_WORKER: dims[i] = workers; break;
+		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
+		  default: GOMP_PLUGIN_fatal ("invalid dim");
+		  }
+	  }
+      }
+    }
+
+  /* Check if the accelerator has sufficient hardware resources to
+     launch the offloaded kernel.  */
+  if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
+      > targ_fn->max_threads_per_block)
+    {
+      const char *msg
+	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
+	   " with num_workers = %d and vector_length = %d"
+	   "; "
+	   "recompile the program with 'num_workers = x and vector_length = y'"
+	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
+	   " x * y <= %d"
+	   ".\n");
+      GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
+			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
+    }
+
+  /* Check if the accelerator has sufficient barrier resources to
+     launch the offloaded kernel.  */
+  if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
+    {
+      const char *msg
+	= ("The Nvidia accelerator has insufficient barrier resources to launch"
+	   " '%s' with num_workers = %d and vector_length = %d"
+	   "; "
+	   "recompile the program with 'num_workers = x' on that offloaded"
+	   " region or '-fopenacc-dim=:x:' where x <= 15"
+	   "; "
+	   "or, recompile the program with 'vector_length = 32' on that"
+	   " offloaded region or '-fopenacc-dim=::32'"
+	   ".\n");
+	GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
+			   dims[GOMP_DIM_VECTOR]);
+    }
+
+  if (mapnum > 0)
+    {
+      /* This reserves a chunk of a pre-allocated page of memory mapped on both
+	 the host and the device. HP is a host pointer to the new chunk, and DP is
+	 the corresponding device pointer.  */
+      pthread_mutex_lock (&ptx_event_lock);
+      dp = map_push (dev_str, mapnum * sizeof (void *));
+      pthread_mutex_unlock (&ptx_event_lock);
+
+      GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
+
+      /* Copy the array of arguments to the mapped page.  */
+      hp = alloca(sizeof(void *) * mapnum);
+      for (i = 0; i < mapnum; i++)
+	((void **) hp)[i] = devaddrs[i];
+
+      /* Copy the (device) pointers to arguments to the device */
+      CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
+			mapnum * sizeof (void *));
+    }
+
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " gangs=%u, workers=%u, vectors=%u\n",
 		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
@@ -1239,7 +1426,8 @@ nvptx_exec (void (*fn), size_t mapnum, v

       CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);

-      event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
+      if (mapnum > 0)
+	event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
     }
 #else
   r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
@@ -1256,7 +1444,10 @@ nvptx_exec (void (*fn), size_t mapnum, v
 #ifndef DISABLE_ASYNC
   if (async < acc_async_noval)
 #endif
-    map_pop (dev_str);
+    {
+      if (mapnum > 0)
+	map_pop (dev_str);
+    }
 }

 void * openacc_get_current_cuda_context (void);
@@ -1415,9 +1606,8 @@ nvptx_async_test (int async)
   struct ptx_stream *s;

   s = select_stream_for_async (async, pthread_self (), false, NULL);
-
   if (!s)
-    GOMP_PLUGIN_fatal ("unknown async %d", async);
+    return 1;

   r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream);
   if (r == CUDA_SUCCESS)
@@ -1472,7 +1662,7 @@ nvptx_wait (int async)

   s = select_stream_for_async (async, pthread_self (), false, NULL);
   if (!s)
-    GOMP_PLUGIN_fatal ("unknown async %d", async);
+    return;

   CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream);

@@ -1486,16 +1676,17 @@ nvptx_wait_async (int async1, int async2
   struct ptx_stream *s1, *s2;
   pthread_t self = pthread_self ();

+  s1 = select_stream_for_async (async1, self, false, NULL);
+  if (!s1)
+    return;
+
   /* The stream that is waiting (rather than being waited for) doesn't
      necessarily have to exist already.  */
   s2 = select_stream_for_async (async2, self, true, NULL);

-  s1 = select_stream_for_async (async1, self, false, NULL);
-  if (!s1)
-    GOMP_PLUGIN_fatal ("invalid async 1\n");
-
+  /* A stream is always synchronized with itself.  */
   if (s1 == s2)
-    GOMP_PLUGIN_fatal ("identical parameters");
+    return;

   e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

@@ -1629,8 +1820,14 @@ nvptx_set_cuda_stream (int async, void *
   pthread_t self = pthread_self ();
   struct nvptx_thread *nvthd = nvptx_thread ();

-  if (async < 0)
-    GOMP_PLUGIN_fatal ("bad async %d", async);
+  /* Due to the "null_stream" usage for "acc_async_sync", this cannot be used
+     to change the stream handle associated with "acc_async_sync".  */
+  if (async == acc_async_sync)
+    {
+      GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated"
+			 " with \"acc_async_sync\"\n");
+      return 0;
+    }

   pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

@@ -1739,6 +1936,12 @@ GOMP_OFFLOAD_fini_device (int n)
       instantiated_devices--;
     }

+  if (instantiated_devices == 0)
+    {
+      free (ptx_devices);
+      ptx_devices = NULL;
+    }
+
   pthread_mutex_unlock (&ptx_dev_lock);
   return true;
 }
--- libgomp/plugin/configfrag.ac.jj	2018-04-25 09:40:31.914655581 +0200
+++ libgomp/plugin/configfrag.ac	2019-05-07 18:46:36.533109624 +0200
@@ -26,8 +26,6 @@
 # see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 # <http://www.gnu.org/licenses/>.

-offload_targets=
-AC_SUBST(offload_targets)
 plugin_support=yes
 AC_CHECK_LIB(dl, dlsym, , [plugin_support=no])
 if test x"$plugin_support" = xyes; then
@@ -59,7 +57,11 @@ AC_ARG_WITH(cuda-driver-lib,
 	[AS_HELP_STRING([--with-cuda-driver-lib=PATH],
 		[specify directory for the installed CUDA driver library])])
 case "x$with_cuda_driver" in
-  x | xno) ;;
+  x) ;;
+  xno)
+    CUDA_DRIVER_INCLUDE=no
+    CUDA_DRIVER_LIB=no
+    ;;
   *) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
      CUDA_DRIVER_LIB=$with_cuda_driver/lib
      ;;
@@ -70,10 +72,12 @@ fi
 if test "x$with_cuda_driver_lib" != x; then
   CUDA_DRIVER_LIB=$with_cuda_driver_lib
 fi
-if test "x$CUDA_DRIVER_INCLUDE" != x; then
+if test "x$CUDA_DRIVER_INCLUDE" != x \
+   && test "x$CUDA_DRIVER_INCLUDE" != xno; then
   CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE
 fi
-if test "x$CUDA_DRIVER_LIB" != x; then
+if test "x$CUDA_DRIVER_LIB" != x \
+   && test "x$CUDA_DRIVER_LIB" != xno; then
   CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB
 fi

@@ -133,7 +137,13 @@ AC_SUBST(PLUGIN_HSA_CPPFLAGS)
 AC_SUBST(PLUGIN_HSA_LDFLAGS)
 AC_SUBST(PLUGIN_HSA_LIBS)

-# Get offload targets and path to install tree of offloading compiler.
+# Parse '--enable-offload-targets', figure out the corresponding libgomp
+# plugins, and configure to find the corresponding offload compilers.
+# 'offload_plugins' and 'offload_targets' will be populated in the same order.
+offload_plugins=
+offload_targets=
+AC_SUBST(offload_plugins)
+AC_SUBST(offload_targets)
 offload_additional_options=
 offload_additional_lib_paths=
 AC_SUBST(offload_additional_options)
@@ -152,10 +152,10 @@ if test x"$enable_offload_targets" != x;
   for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do
     tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'`
     tgt=`echo $tgt | sed 's/=.*//'`
-    tgt_name=
+    tgt_plugin=
     case $tgt in
       *-intelmic-* | *-intelmicemul-*)
-	tgt_name=intelmic
+	tgt_plugin=intelmic
 	;;
       nvptx*)
 	case "${target}" in
@@ -167,30 +167,35 @@ if test x"$enable_offload_targets" != x;
 		PLUGIN_NVPTX=0
 		;;
 	      *)
-		tgt_name=nvptx
+		tgt_plugin=nvptx
 		PLUGIN_NVPTX=$tgt
-		PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
-		PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
-		PLUGIN_NVPTX_LIBS='-lcuda'
+		if test "x$CUDA_DRIVER_LIB" != xno \
+		   && test "x$CUDA_DRIVER_LIB" != xno; then
+		  PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS
+		  PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS
+		  PLUGIN_NVPTX_LIBS='-lcuda'

-		PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
-		CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
-		PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
-		LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
-		PLUGIN_NVPTX_save_LIBS=$LIBS
-		LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
-		AC_LINK_IFELSE(
-		  [AC_LANG_PROGRAM(
-		    [#include "cuda.h"],
-		      [CUresult r = cuCtxPushCurrent (NULL);])],
-		  [PLUGIN_NVPTX=1])
-		CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
-		LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
-		LIBS=$PLUGIN_NVPTX_save_LIBS
+		  PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS
+		  CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS"
+		  PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS
+		  LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS"
+		  PLUGIN_NVPTX_save_LIBS=$LIBS
+		  LIBS="$PLUGIN_NVPTX_LIBS $LIBS"
+		  AC_LINK_IFELSE(
+		    [AC_LANG_PROGRAM(
+		      [#include "cuda.h"],
+			[CUresult r = cuCtxPushCurrent (NULL);])],
+		    [PLUGIN_NVPTX=1])
+		  CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS
+		  LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS
+		  LIBS=$PLUGIN_NVPTX_save_LIBS
+		fi
 		case $PLUGIN_NVPTX in
 		  nvptx*)
-		    if test "x$CUDA_DRIVER_INCLUDE" = x \
-		       && test "x$CUDA_DRIVER_LIB" = x; then
+		    if (test "x$CUDA_DRIVER_INCLUDE" = x \
+			|| test "x$CUDA_DRIVER_INCLUDE" = xno) \
+		       && (test "x$CUDA_DRIVER_LIB" = x \
+			   || test "x$CUDA_DRIVER_LIB" = xno); then
 		      PLUGIN_NVPTX=1
 		      PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda'
 		      PLUGIN_NVPTX_LIBS='-ldl'
@@ -191,7 +206,7 @@ if test x"$enable_offload_targets" != x;
 	        PLUGIN_HSA=0
 		;;
 	      *)
-	        tgt_name=hsa
+		tgt_plugin=hsa
 	        PLUGIN_HSA=$tgt
 	        PLUGIN_HSA_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS
 	        PLUGIN_HSA_LDFLAGS="$HSA_RUNTIME_LDFLAGS"
@@ -209,7 +224,7 @@ if test x"$enable_offload_targets" != x;
 	        LDFLAGS=$PLUGIN_HSA_save_LDFLAGS
 	        LIBS=$PLUGIN_HSA_save_LIBS
 	        case $PLUGIN_HSA in
-	          hsa*)
+		  hsa*)
 	            HSA_PLUGIN=0
 	            AC_MSG_ERROR([HSA run-time package required for HSA support])
 	            ;;
@@ -226,16 +241,19 @@ if test x"$enable_offload_targets" != x;
 	AC_MSG_ERROR([unknown offload target specified])
 	;;
     esac
-    if test x"$tgt_name" = x; then
-      # Don't configure libgomp for this offloading target if we don't build
-      # the corresponding plugin.
+    if test x"$tgt_plugin" = x; then
+      # Not configuring libgomp for this offload target if we're not building
+      # the corresponding offload plugin.
       continue
-    elif test x"$offload_targets" = x; then
-      offload_targets=$tgt_name
+    elif test x"$offload_plugins" = x; then
+      offload_plugins=$tgt_plugin
+      offload_targets=$tgt
     else
-      offload_targets=$offload_targets,$tgt_name
+      offload_plugins=$offload_plugins,$tgt_plugin
+      offload_targets=$offload_targets,$tgt
     fi
-    if test "$tgt_name" = hsa; then
+    # Configure additional search paths.
+    if test "$tgt_plugin" = hsa; then
       # Offloading compilation is all handled by the target compiler.
       :
     elif test x"$tgt_dir" != x; then
@@ -247,8 +265,8 @@ if test x"$enable_offload_targets" != x;
     fi
   done
 fi
-AC_DEFINE_UNQUOTED(OFFLOAD_TARGETS, "$offload_targets",
-  [Define to offload targets, separated by commas.])
+AC_DEFINE_UNQUOTED(OFFLOAD_PLUGINS, "$offload_plugins",
+  [Define to offload plugins, separated by commas.])
 AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1])
 AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX],
   [Define to 1 if the NVIDIA plugin is built, 0 if not.])
--- libgomp/affinity-fmt.c.jj	2019-05-07 18:46:36.285113585 +0200
+++ libgomp/affinity-fmt.c	2019-05-07 18:46:36.285113585 +0200
@@ -0,0 +1,495 @@
+/* Copyright (C) 2018-2019 Free Software Foundation, Inc.
+   Contributed by Jakub Jelinek <jakub@redhat.com>.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "libgomp.h"
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_INTTYPES_H
+# include <inttypes.h>  /* For PRIx64.  */
+#endif
+#ifdef HAVE_UNAME
+#include <sys/utsname.h>
+#endif
+
+void
+gomp_print_string (const char *str, size_t len)
+{
+  fwrite (str, 1, len, stderr);
+}
+
+void
+gomp_set_affinity_format (const char *format, size_t len)
+{
+  if (len < gomp_affinity_format_len)
+    memcpy (gomp_affinity_format_var, format, len);
+  else
+    {
+      char *p;
+      if (gomp_affinity_format_len)
+	p = gomp_realloc (gomp_affinity_format_var, len + 1);
+      else
+	p = gomp_malloc (len + 1);
+      memcpy (p, format, len);
+      gomp_affinity_format_var = p;
+      gomp_affinity_format_len = len + 1;
+    }
+  gomp_affinity_format_var[len] = '\0';
+}
+
+void
+omp_set_affinity_format (const char *format)
+{
+  gomp_set_affinity_format (format, strlen (format));
+}
+
+size_t
+omp_get_affinity_format (char *buffer, size_t size)
+{
+  size_t len = strlen (gomp_affinity_format_var);
+  if (size)
+    {
+      if (len < size)
+	memcpy (buffer, gomp_affinity_format_var, len + 1);
+      else
+	{
+	  memcpy (buffer, gomp_affinity_format_var, size - 1);
+	  buffer[size - 1] = '\0';
+	}
+    }
+  return len;
+}
+
+void
+gomp_display_string (char *buffer, size_t size, size_t *ret,
+		     const char *str, size_t len)
+{
+  size_t r = *ret;
+  if (size && r < size)
+    {
+      size_t l = len;
+      if (size - r < len)
+	l = size - r;
+      memcpy (buffer + r, str, l);
+    }
+  *ret += len;
+  if (__builtin_expect (r > *ret, 0))
+    gomp_fatal ("overflow in omp_capture_affinity");
+}
+
+static void
+gomp_display_repeat (char *buffer, size_t size, size_t *ret,
+		     char c, size_t len)
+{
+  size_t r = *ret;
+  if (size && r < size)
+    {
+      size_t l = len;
+      if (size - r < len)
+	l = size - r;
+      memset (buffer + r, c, l);
+    }
+  *ret += len;
+  if (__builtin_expect (r > *ret, 0))
+    gomp_fatal ("overflow in omp_capture_affinity");
+}
+
+static void
+gomp_display_num (char *buffer, size_t size, size_t *ret,
+		  bool zero, bool right, size_t sz, char *buf)
+{
+  size_t l = strlen (buf);
+  if (sz == (size_t) -1 || l >= sz)
+    {
+      gomp_display_string (buffer, size, ret, buf, l);
+      return;
+    }
+  if (zero)
+    {
+      if (buf[0] == '-')
+	gomp_display_string (buffer, size, ret, buf, 1);
+      else if (buf[0] == '0' && buf[1] == 'x')
+	gomp_display_string (buffer, size, ret, buf, 2);
+      gomp_display_repeat (buffer, size, ret, '0', sz - l);
+      if (buf[0] == '-')
+	gomp_display_string (buffer, size, ret, buf + 1, l - 1);
+      else if (buf[0] == '0' && buf[1] == 'x')
+	gomp_display_string (buffer, size, ret, buf + 2, l - 2);
+      else
+	gomp_display_string (buffer, size, ret, buf, l);
+    }
+  else if (right)
+    {
+      gomp_display_repeat (buffer, size, ret, ' ', sz - l);
+      gomp_display_string (buffer, size, ret, buf, l);
+    }
+  else
+    {
+      gomp_display_string (buffer, size, ret, buf, l);
+      gomp_display_repeat (buffer, size, ret, ' ', sz - l);
+    }
+}
+
+static void
+gomp_display_int (char *buffer, size_t size, size_t *ret,
+		  bool zero, bool right, size_t sz, int num)
+{
+  char buf[3 * sizeof (int) + 2];
+  sprintf (buf, "%d", num);
+  gomp_display_num (buffer, size, ret, zero, right, sz, buf);
+}
+
+static void
+gomp_display_string_len (char *buffer, size_t size, size_t *ret,
+			 bool right, size_t sz, char *str, size_t len)
+{
+  if (sz == (size_t) -1 || len >= sz)
+    {
+      gomp_display_string (buffer, size, ret, str, len);
+      return;
+    }
+
+  if (right)
+    {
+      gomp_display_repeat (buffer, size, ret, ' ', sz - len);
+      gomp_display_string (buffer, size, ret, str, len);
+    }
+  else
+    {
+      gomp_display_string (buffer, size, ret, str, len);
+      gomp_display_repeat (buffer, size, ret, ' ', sz - len);
+    }
+}
+
+static void
+gomp_display_hostname (char *buffer, size_t size, size_t *ret,
+		       bool right, size_t sz)
+{
+#ifdef HAVE_GETHOSTNAME
+  {
+    char buf[256];
+    char *b = buf;
+    size_t len = 256;
+    do
+      {
+	b[len - 1] = '\0';
+	if (gethostname (b, len - 1) == 0)
+	  {
+	    size_t l = strlen (b);
+	    if (l < len - 1)
+	      {
+		gomp_display_string_len (buffer, size, ret,
+					 right, sz, b, l);
+		if (b != buf)
+		  free (b);
+		return;
+	      }
+	  }
+	if (len == 1048576)
+	  break;
+	len = len * 2;
+	if (len == 512)
+	  b = gomp_malloc (len);
+	else
+	  b = gomp_realloc (b, len);
+      }
+    while (1);
+    if (b != buf)
+      free (b);
+  }
+#endif
+#ifdef HAVE_UNAME
+  {
+    struct utsname buf;
+    if (uname (&buf) == 0)
+      {
+	gomp_display_string_len (buffer, size, ret, right, sz,
+				 buf.nodename, strlen (buf.nodename));
+	return;
+      }
+  }
+#endif
+  gomp_display_string_len (buffer, size, ret, right, sz, "node", 4);
+}
+
+struct affinity_types_struct {
+  char long_str[18];
+  char long_len;
+  char short_c; };
+
+static struct affinity_types_struct affinity_types[] =
+{
+#define AFFINITY_TYPE(l, s) \
+  { #l, sizeof (#l) - 1, s }
+  AFFINITY_TYPE (team_num, 't'),
+  AFFINITY_TYPE (num_teams, 'T'),
+  AFFINITY_TYPE (nesting_level, 'L'),
+  AFFINITY_TYPE (thread_num, 'n'),
+  AFFINITY_TYPE (num_threads, 'N'),
+  AFFINITY_TYPE (ancestor_tnum, 'a'),
+  AFFINITY_TYPE (host, 'H'),
+  AFFINITY_TYPE (process_id, 'P'),
+  AFFINITY_TYPE (native_thread_id, 'i'),
+  AFFINITY_TYPE (thread_affinity, 'A')
+#undef AFFINITY_TYPE
+};
+
+size_t
+gomp_display_affinity (char *buffer, size_t size,
+		       const char *format, gomp_thread_handle handle,
+		       struct gomp_team_state *ts, unsigned int place)
+{
+  size_t ret = 0;
+  do
+    {
+      const char *p = strchr (format, '%');
+      bool zero = false;
+      bool right = false;
+      size_t sz = -1;
+      char c;
+      int val;
+      if (p == NULL)
+	p = strchr (format, '\0');
+      if (p != format)
+	gomp_display_string (buffer, size, &ret,
+			     format, p - format);
+      if (*p == '\0')
+	break;
+      p++;
+      if (*p == '%')
+	{
+	  gomp_display_string (buffer, size, &ret, "%", 1);
+	  format = p + 1;
+	  continue;
+	}
+      if (*p == '0')
+	{
+	  zero = true;
+	  p++;
+	  if (*p != '.')
+	    gomp_fatal ("leading zero not followed by dot in affinity format");
+	}
+      if (*p == '.')
+	{
+	  right = true;
+	  p++;
+	}
+      if (*p >= '1' && *p <= '9')
+	{
+	  char *end;
+	  sz = strtoul (p, &end, 10);
+	  p = end;
+	}
+      else if (zero || right)
+	gomp_fatal ("leading zero or right justification in affinity format "
+		    "requires size");
+      c = *p;
+      if (c == '{')
+	{
+	  int i;
+	  for (i = 0;
+	       i < sizeof (affinity_types) / sizeof (affinity_types[0]); ++i)
+	    if (strncmp (p + 1, affinity_types[i].long_str,
+			 affinity_types[i].long_len) == 0
+		&& p[affinity_types[i].long_len + 1] == '}')
+	      {
+		c = affinity_types[i].short_c;
+		p += affinity_types[i].long_len + 1;
+		break;
+	      }
+	  if (c == '{')
+	    {
+	      char *q = strchr (p + 1, '}');
+	      if (q)
+		gomp_fatal ("unsupported long type name '%.*s' in affinity "
+			    "format", (int) (q - (p + 1)), p + 1);
+	      else
+		gomp_fatal ("unterminated long type name '%s' in affinity "
+			    "format", p + 1);
+	    }
+	}
+      switch (c)
+	{
+	case 't':
+	  val = omp_get_team_num ();
+	  goto do_int;
+	case 'T':
+	  val = omp_get_num_teams ();
+	  goto do_int;
+	case 'L':
+	  val = ts->level;
+	  goto do_int;
+	case 'n':
+	  val = ts->team_id;
+	  goto do_int;
+	case 'N':
+	  val = ts->team ? ts->team->nthreads : 1;
+	  goto do_int;
+	case 'a':
+	  val = ts->team ? ts->team->prev_ts.team_id : -1;
+	  goto do_int;
+	case 'H':
+	  gomp_display_hostname (buffer, size, &ret, right, sz);
+	  break;
+	case 'P':
+#ifdef HAVE_GETPID
+	  val = getpid ();
+#else
+	  val = 0;
+#endif
+	  goto do_int;
+	case 'i':
+#if defined(LIBGOMP_USE_PTHREADS) && defined(__GNUC__)
+	  {
+	    char buf[3 * (sizeof (handle) + sizeof (uintptr_t) + sizeof (int))
+		     + 4];
+	    /* This macro returns expr unmodified for integral or pointer
+	       types and 0 for anything else (e.g. aggregates).  */
+#define gomp_nonaggregate(expr) \
+  __builtin_choose_expr (__builtin_classify_type (expr) == 1		    \
+			 || __builtin_classify_type (expr) == 5, expr, 0)
+	    /* This macro returns expr unmodified for integral types,
+	       (uintptr_t) (expr) for pointer types and 0 for anything else
+	       (e.g. aggregates).  */
+#define gomp_integral(expr) \
+  __builtin_choose_expr (__builtin_classify_type (expr) == 5,		    \
+			 (uintptr_t) gomp_nonaggregate (expr),		    \
+			 gomp_nonaggregate (expr))
+
+	    if (sizeof (gomp_integral (handle)) == sizeof (unsigned long))
+	      sprintf (buf, "0x%lx", (unsigned long) gomp_integral (handle));
+#if defined (HAVE_INTTYPES_H) && defined (PRIx64)
+	    else if (sizeof (gomp_integral (handle)) == sizeof (uint64_t))
+	      sprintf (buf, "0x%" PRIx64, (uint64_t) gomp_integral (handle));
+#else
+	    else if (sizeof (gomp_integral (handle))
+		     == sizeof (unsigned long long))
+	      sprintf (buf, "0x%llx",
+		       (unsigned long long) gomp_integral (handle));
+#endif
+	    else
+	      sprintf (buf, "0x%x", (unsigned int) gomp_integral (handle));
+	    gomp_display_num (buffer, size, &ret, zero, right, sz, buf);
+	    break;
+	  }
+#else
+	  val = 0;
+	  goto do_int;
+#endif
+	case 'A':
+	  if (sz == (size_t) -1)
+	    gomp_display_affinity_place (buffer, size, &ret,
+					 place - 1);
+	  else if (right)
+	    {
+	      size_t len = 0;
+	      gomp_display_affinity_place (NULL, 0, &len, place - 1);
+	      if (len < sz)
+		gomp_display_repeat (buffer, size, &ret, ' ', sz - len);
+	      gomp_display_affinity_place (buffer, size, &ret, place - 1);
+	    }
+	  else
+	    {
+	      size_t start = ret;
+	      gomp_display_affinity_place (buffer, size, &ret, place - 1);
+	      if (ret - start < sz)
+		gomp_display_repeat (buffer, size, &ret, ' ', sz - (ret - start));
+	    }
+	  break;
+	do_int:
+	  gomp_display_int (buffer, size, &ret, zero, right, sz, val);
+	  break;
+	default:
+	  gomp_fatal ("unsupported type %c in affinity format", c);
+	}
+      format = p + 1;
+    }
+  while (1);
+  return ret;
+}
+
+size_t
+omp_capture_affinity (char *buffer, size_t size, const char *format)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  size_t ret
+    = gomp_display_affinity (buffer, size,
+			     format && *format
+			     ? format : gomp_affinity_format_var,
+			     gomp_thread_self (), &thr->ts, thr->place);
+  if (size)
+    {
+      if (ret >= size)
+	buffer[size - 1] = '\0';
+      else
+	buffer[ret] = '\0';
+    }
+  return ret;
+}
+ialias (omp_capture_affinity)
+
+void
+omp_display_affinity (const char *format)
+{
+  char buf[512];
+  char *b;
+  size_t ret = ialias_call (omp_capture_affinity) (buf, sizeof buf, format);
+  if (ret < sizeof buf)
+    {
+      buf[ret] = '\n';
+      gomp_print_string (buf, ret + 1);
+      return;
+    }
+  b = gomp_malloc (ret + 1);
+  ialias_call (omp_capture_affinity) (b, ret + 1, format);
+  b[ret] = '\n';
+  gomp_print_string (b, ret + 1);
+  free (b);
+}
+
+void
+gomp_display_affinity_thread (gomp_thread_handle handle,
+			      struct gomp_team_state *ts, unsigned int place)
+{
+  char buf[512];
+  char *b;
+  size_t ret = gomp_display_affinity (buf, sizeof buf, gomp_affinity_format_var,
+				      handle, ts, place);
+  if (ret < sizeof buf)
+    {
+      buf[ret] = '\n';
+      gomp_print_string (buf, ret + 1);
+      return;
+    }
+  b = gomp_malloc (ret + 1);
+  gomp_display_affinity (b, ret + 1, gomp_affinity_format_var,
+  			 handle, ts, place);
+  b[ret] = '\n';
+  gomp_print_string (b, ret + 1);
+  free (b);
+}
--- libgomp/single.c.jj	2018-04-25 09:40:31.870655561 +0200
+++ libgomp/single.c	2019-05-07 18:46:36.536109576 +0200
@@ -47,7 +47,7 @@ GOMP_single_start (void)
   return __sync_bool_compare_and_swap (&team->single_count, single_count,
 				       single_count + 1L);
 #else
-  bool ret = gomp_work_share_start (false);
+  bool ret = gomp_work_share_start (0);
   if (ret)
     gomp_work_share_init_done ();
   gomp_work_share_end_nowait ();
@@ -68,7 +68,7 @@ GOMP_single_copy_start (void)
   bool first;
   void *ret;

-  first = gomp_work_share_start (false);
+  first = gomp_work_share_start (0);

   if (first)
     {
--- libgomp/oacc-cuda.c.jj	2018-04-25 09:40:31.321655307 +0200
+++ libgomp/oacc-cuda.c	2019-05-07 18:46:36.528109704 +0200
@@ -58,7 +58,7 @@ acc_get_cuda_stream (int async)
 {
   struct goacc_thread *thr = goacc_thread ();

-  if (async < 0)
+  if (!async_valid_p (async))
     return NULL;

   if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
@@ -72,7 +72,7 @@ acc_set_cuda_stream (int async, void *st
 {
   struct goacc_thread *thr;

-  if (async < 0 || stream == NULL)
+  if (!async_valid_p (async) || stream == NULL)
     return 0;

   goacc_lazy_initialize ();
--- libgomp/work.c.jj	2018-04-25 09:40:31.925655587 +0200
+++ libgomp/work.c	2019-05-07 18:46:36.548109384 +0200
@@ -76,7 +76,15 @@ alloc_work_share (struct gomp_team *team
 #endif

   team->work_share_chunk *= 2;
+  /* Allocating gomp_work_share structures aligned is just an
+     optimization, don't do it when using the fallback method.  */
+#ifdef GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC
+  ws = gomp_aligned_alloc (__alignof (struct gomp_work_share),
+			   team->work_share_chunk
+			   * sizeof (struct gomp_work_share));
+#else
   ws = gomp_malloc (team->work_share_chunk * sizeof (struct gomp_work_share));
+#endif
   ws->next_alloc = team->work_shares[0].next_alloc;
   team->work_shares[0].next_alloc = ws;
   team->work_share_list_alloc = &ws[1];
@@ -90,30 +98,35 @@ alloc_work_share (struct gomp_team *team
    This shouldn't touch the next_alloc field.  */

 void
-gomp_init_work_share (struct gomp_work_share *ws, bool ordered,
+gomp_init_work_share (struct gomp_work_share *ws, size_t ordered,
 		      unsigned nthreads)
 {
   gomp_mutex_init (&ws->lock);
   if (__builtin_expect (ordered, 0))
     {
-#define INLINE_ORDERED_TEAM_IDS_CNT \
-  ((sizeof (struct gomp_work_share) \
-    - offsetof (struct gomp_work_share, inline_ordered_team_ids)) \
-   / sizeof (((struct gomp_work_share *) 0)->inline_ordered_team_ids[0]))
-
-      if (nthreads > INLINE_ORDERED_TEAM_IDS_CNT)
-	ws->ordered_team_ids
-	  = gomp_malloc (nthreads * sizeof (*ws->ordered_team_ids));
+#define INLINE_ORDERED_TEAM_IDS_SIZE \
+  (sizeof (struct gomp_work_share) \
+   - offsetof (struct gomp_work_share, inline_ordered_team_ids))
+
+      if (__builtin_expect (ordered != 1, 0))
+	{
+	  ordered += nthreads * sizeof (*ws->ordered_team_ids) - 1;
+	  ordered = ordered + __alignof__ (long long) - 1;
+	  ordered &= ~(__alignof__ (long long) - 1);
+	}
+      else
+	ordered = nthreads * sizeof (*ws->ordered_team_ids);
+      if (ordered > INLINE_ORDERED_TEAM_IDS_SIZE)
+	ws->ordered_team_ids = gomp_malloc (ordered);
       else
 	ws->ordered_team_ids = ws->inline_ordered_team_ids;
-      memset (ws->ordered_team_ids, '\0',
-	      nthreads * sizeof (*ws->ordered_team_ids));
+      memset (ws->ordered_team_ids, '\0', ordered);
       ws->ordered_num_used = 0;
       ws->ordered_owner = -1;
       ws->ordered_cur = 0;
     }
   else
-    ws->ordered_team_ids = NULL;
+    ws->ordered_team_ids = ws->inline_ordered_team_ids;
   gomp_ptrlock_init (&ws->next_ws, NULL);
   ws->threads_completed = 0;
 }
@@ -166,7 +179,7 @@ free_work_share (struct gomp_team *team,
    if this was the first thread to reach this point.  */

 bool
-gomp_work_share_start (bool ordered)
+gomp_work_share_start (size_t ordered)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -178,7 +191,7 @@ gomp_work_share_start (bool ordered)
       ws = gomp_malloc (sizeof (*ws));
       gomp_init_work_share (ws, ordered, 1);
       thr->ts.work_share = ws;
-      return ws;
+      return true;
     }

   ws = thr->ts.work_share;
--- include/gomp-constants.h.jj	2018-04-25 09:40:39.757659209 +0200
+++ include/gomp-constants.h	2019-05-07 18:57:33.333627031 +0200
@@ -189,6 +189,7 @@ enum gomp_map_kind
 #define GOMP_TASK_FLAG_GRAINSIZE	(1 << 9)
 #define GOMP_TASK_FLAG_IF		(1 << 10)
 #define GOMP_TASK_FLAG_NOGROUP		(1 << 11)
+#define GOMP_TASK_FLAG_REDUCTION	(1 << 12)

 /* GOMP_target{_ext,update_ext,enter_exit_data} flags argument.  */
 #define GOMP_TARGET_FLAG_NOWAIT		(1 << 0)
@@ -196,6 +197,18 @@ enum gomp_map_kind
 /* Internal to libgomp.  */
 #define GOMP_TARGET_FLAG_UPDATE		(1U << 31)

+
+/* OpenACC construct flags.  */
+
+/* Force host fallback execution.  */
+#define GOACC_FLAG_HOST_FALLBACK	(1 << 0)
+
+/* For legacy reasons, in the ABI, the GOACC_FLAGs are encoded as an inverted
+   bitmask.  */
+#define GOACC_FLAGS_MARSHAL_OP		BIT_NOT_EXPR
+#define GOACC_FLAGS_UNMARSHAL(X)	(~(X))
+
+
 /* Versions of libgomp and device-specific plugins.  GOMP_VERSION
    should be incremented whenever an ABI-incompatible change is introduced
    to the plugin interface defined in libgomp/libgomp.h.  */
@@ -251,6 +264,12 @@ enum gomp_map_kind
    at most and shifted by this many bits.  */
 #define GOMP_TARGET_ARG_VALUE_SHIFT		16

+/* Dependence types in omp_depend_t objects.  */
+#define GOMP_DEPEND_IN			1
+#define GOMP_DEPEND_OUT			2
+#define GOMP_DEPEND_INOUT		3
+#define GOMP_DEPEND_MUTEXINOUTSET	4
+
 /* HSA specific data structures.  */

 /* Identifiers of device-specific target arguments.  */