diff --git a/.gcc.metadata b/.gcc.metadata index 99d2974..03b4cd9 100644 --- a/.gcc.metadata +++ b/.gcc.metadata @@ -1,3 +1,3 @@ -1fe3aa7ce95faa0f4d7f08f0dfefd86ff4b43015 SOURCES/gcc-8.2.1-20180905.tar.xz +8ee669ee60997110e6251c72dac66bf69bbe13c7 SOURCES/gcc-8.3.1-20190507.tar.xz 3bdb3cc01fa7690a0e20ea5cfffcbe690f7665eb SOURCES/nvptx-newlib-aadc8eb0ec43b7cd0dd2dfb484bae63c8b05ef24.tar.xz ce8eb83be0ac37fb5d5388df455a980fe37b4f13 SOURCES/nvptx-tools-c28050f60193b3b95a18866a96f03334e874e78f.tar.xz diff --git a/.gitignore b/.gitignore index 25f3c40..fb2c952 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ -SOURCES/gcc-8.2.1-20180905.tar.xz +SOURCES/gcc-8.3.1-20190507.tar.xz SOURCES/nvptx-newlib-aadc8eb0ec43b7cd0dd2dfb484bae63c8b05ef24.tar.xz SOURCES/nvptx-tools-c28050f60193b3b95a18866a96f03334e874e78f.tar.xz diff --git a/SOURCES/gcc8-libgomp-20190503.patch b/SOURCES/gcc8-libgomp-20190503.patch new file mode 100644 index 0000000..caa13f2 --- /dev/null +++ b/SOURCES/gcc8-libgomp-20190503.patch @@ -0,0 +1,10060 @@ +--- libgomp/loop.c.jj 2018-04-25 09:40:31.870655561 +0200 ++++ libgomp/loop.c 2019-05-07 18:46:36.526109736 +0200 +@@ -27,9 +27,13 @@ + + #include <limits.h> + #include <stdlib.h> ++#include <string.h> + #include "libgomp.h" + + ++ialias (GOMP_loop_runtime_next) ++ialias_redirect (GOMP_taskgroup_reduction_register) ++ + /* Initialize the given work share construct from the given arguments. */ + + static inline void +@@ -79,12 +83,12 @@ gomp_loop_init (struct gomp_work_share * + } + + /* The *_start routines are called when first encountering a loop construct +- that is not bound directly to a parallel construct. The first thread ++ that is not bound directly to a parallel construct. The first thread + that arrives will create the work-share construct; subsequent threads + will see the construct exists and allocate work from it. + + START, END, INCR are the bounds of the loop; due to the restrictions of +- OpenMP, these values must be the same in every thread. This is not ++ OpenMP, these values must be the same in every thread. This is not + verified (nor is it entirely verifiable, since START is not necessarily + retained intact in the work-share data structure). CHUNK_SIZE is the + scheduling parameter; again this must be identical in all threads. */
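As a reading aid for the protocol described in that comment: compiler-generated code drives these entry points in a start/next/end pattern. A minimal sketch of how a worksharing loop with schedule(dynamic, 4) is lowered against this ABI might look as follows, assuming a hypothetical loop body body () and trip count n (real compiler output additionally handles lastprivate, cancellation, and nowait):

  long istart, iend;
  /* The first thread to arrive creates the work share; every thread
     then pulls chunks of 4 iterations until the range is exhausted.  */
  if (GOMP_loop_dynamic_start (0, n, 1, 4, &istart, &iend))
    do
      for (long i = istart; i < iend; i++)
        body (i);
    while (GOMP_loop_dynamic_next (&istart, &iend));
  GOMP_loop_end ();  /* Implicit barrier at the end of the construct.  */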
+@@ -101,7 +105,7 @@ gomp_loop_static_start (long start, long + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_STATIC, chunk_size); +@@ -123,7 +127,7 @@ gomp_loop_dynamic_start (long start, lon + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_DYNAMIC, chunk_size); +@@ -151,7 +155,7 @@ gomp_loop_guided_start (long start, long + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_GUIDED, chunk_size); +@@ -174,7 +178,7 @@ GOMP_loop_runtime_start (long start, lon + long *istart, long *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_static_start (start, end, incr, +@@ -197,6 +201,100 @@ GOMP_loop_runtime_start (long start, lon + } + } + ++static long ++gomp_adjust_sched (long sched, long *chunk_size) ++{ ++ sched &= ~GFS_MONOTONIC; ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_DYNAMIC: ++ case GFS_GUIDED: ++ return sched; ++ /* GFS_RUNTIME is used for runtime schedule without monotonic ++ or nonmonotonic modifiers on the clause. ++ GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic ++ modifier. */ ++ case GFS_RUNTIME: ++ /* GFS_AUTO is used for runtime schedule with nonmonotonic ++ modifier. */ ++ case GFS_AUTO: ++ { ++ struct gomp_task_icv *icv = gomp_icv (false); ++ sched = icv->run_sched_var & ~GFS_MONOTONIC; ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_DYNAMIC: ++ case GFS_GUIDED: ++ *chunk_size = icv->run_sched_chunk_size; ++ break; ++ case GFS_AUTO: ++ sched = GFS_STATIC; ++ *chunk_size = 0; ++ break; ++ default: ++ abort (); ++ } ++ return sched; ++ } ++ default: ++ abort (); ++ } ++} ++ ++bool ++GOMP_loop_start (long start, long end, long incr, long sched, ++ long chunk_size, long *istart, long *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (gomp_work_share_start (0)) ++ { ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_init (thr->ts.work_share, start, end, incr, ++ sched, chunk_size); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ if (mem) ++ { ++ uintptr_t size = (uintptr_t) *mem; ++ if (size > (sizeof (struct gomp_work_share) ++ - offsetof (struct gomp_work_share, ++ inline_ordered_team_ids))) ++ thr->ts.work_share->ordered_team_ids ++ = gomp_malloc_cleared (size); ++ else ++ memset (thr->ts.work_share->ordered_team_ids, '\0', size); ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ if (mem) ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ ++ if (!istart) ++ return true; ++ return ialias_call (GOMP_loop_runtime_next) (istart, iend); ++} ++ + /* 
The *_ordered_*_start routines are similar. The only difference is that + this work-share construct is initialized to expect an ORDERED section. */ + +@@ -207,7 +305,7 @@ gomp_loop_ordered_static_start (long sta + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_STATIC, chunk_size); +@@ -225,7 +323,7 @@ gomp_loop_ordered_dynamic_start (long st + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_DYNAMIC, chunk_size); +@@ -250,7 +348,7 @@ gomp_loop_ordered_guided_start (long sta + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_init (thr->ts.work_share, start, end, incr, + GFS_GUIDED, chunk_size); +@@ -273,7 +371,7 @@ GOMP_loop_ordered_runtime_start (long st + long *istart, long *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_ordered_static_start (start, end, incr, +@@ -297,6 +395,81 @@ GOMP_loop_ordered_runtime_start (long st + } + } + ++bool ++GOMP_loop_ordered_start (long start, long end, long incr, long sched, ++ long chunk_size, long *istart, long *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ size_t ordered = 1; ++ bool ret; ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (mem) ++ ordered += (uintptr_t) *mem; ++ if (gomp_work_share_start (ordered)) ++ { ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_init (thr->ts.work_share, start, end, incr, ++ sched, chunk_size); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ if (sched == GFS_STATIC) ++ gomp_ordered_static_init (); ++ else ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ sched = thr->ts.work_share->sched; ++ if (sched != GFS_STATIC) ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ } ++ ++ if (mem) ++ { ++ uintptr_t p ++ = (uintptr_t) (thr->ts.work_share->ordered_team_ids ++ + (thr->ts.team ? thr->ts.team->nthreads : 1)); ++ p += __alignof__ (long long) - 1; ++ p &= ~(__alignof__ (long long) - 1); ++ *mem = (void *) p; ++ } ++ ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_AUTO: ++ return !gomp_iter_static_next (istart, iend); ++ case GFS_DYNAMIC: ++ ret = gomp_iter_dynamic_next_locked (istart, iend); ++ break; ++ case GFS_GUIDED: ++ ret = gomp_iter_guided_next_locked (istart, iend); ++ break; ++ default: ++ abort (); ++ } ++ ++ if (ret) ++ gomp_ordered_first (); ++ gomp_mutex_unlock (&thr->ts.work_share->lock); ++ return ret; ++} ++ + /* The *_doacross_*_start routines are similar. 
The only difference is that + this work-share construct is initialized to expect an ORDERED(N) - DOACROSS + section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 +@@ -310,11 +483,11 @@ gomp_loop_doacross_static_start (unsigne + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, + GFS_STATIC, chunk_size); +- gomp_doacross_init (ncounts, counts, chunk_size); ++ gomp_doacross_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -328,11 +501,11 @@ gomp_loop_doacross_dynamic_start (unsign + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, + GFS_DYNAMIC, chunk_size); +- gomp_doacross_init (ncounts, counts, chunk_size); ++ gomp_doacross_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -354,11 +527,11 @@ gomp_loop_doacross_guided_start (unsigne + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, + GFS_GUIDED, chunk_size); +- gomp_doacross_init (ncounts, counts, chunk_size); ++ gomp_doacross_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -378,7 +551,7 @@ GOMP_loop_doacross_runtime_start (unsign + long *istart, long *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_doacross_static_start (ncounts, counts, +@@ -402,8 +575,52 @@ GOMP_loop_doacross_runtime_start (unsign + } + } + +-/* The *_next routines are called when the thread completes processing of +- the iteration block currently assigned to it. If the work-share ++bool ++GOMP_loop_doacross_start (unsigned ncounts, long *counts, long sched, ++ long chunk_size, long *istart, long *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (gomp_work_share_start (0)) ++ { ++ size_t extra = 0; ++ if (mem) ++ extra = (uintptr_t) *mem; ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_init (thr->ts.work_share, 0, counts[0], 1, ++ sched, chunk_size); ++ gomp_doacross_init (ncounts, counts, chunk_size, extra); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ sched = thr->ts.work_share->sched; ++ } ++ ++ if (mem) ++ *mem = thr->ts.work_share->doacross->extra; ++ ++ return ialias_call (GOMP_loop_runtime_next) (istart, iend); ++} ++ ++/* The *_next routines are called when the thread completes processing of ++ the iteration block currently assigned to it. If the work-share + construct is bound directly to a parallel construct, then the iteration + bounds may have been set up before the parallel. In which case, this + may be the first iteration for the thread. 
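For contrast with those per-schedule entry points, the new combined GOMP_loop_start added above takes the schedule kind and chunk size as runtime arguments, plus optional task-reduction and scratch-memory pointers. A rough sketch of a caller for schedule(runtime), under the same assumptions as the earlier sketch (hypothetical body () and n; GFS_RUNTIME is the libgomp-internal schedule constant, which real compiler output emits as a plain integer):

  long istart, iend;
  /* reductions == NULL and mem == NULL request neither task reductions
     nor extra ordered/scratch storage; with a non-NULL istart the call
     also fetches the first iteration chunk via GOMP_loop_runtime_next.  */
  if (GOMP_loop_start (0, n, 1, GFS_RUNTIME, 0, &istart, &iend, NULL, NULL))
    do
      for (long i = istart; i < iend; i++)
        body (i);
    while (GOMP_loop_runtime_next (&istart, &iend));
  GOMP_loop_end ();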
+@@ -456,7 +673,7 @@ bool + GOMP_loop_runtime_next (long *istart, long *iend) + { + struct gomp_thread *thr = gomp_thread (); +- ++ + switch (thr->ts.work_share->sched) + { + case GFS_STATIC: +@@ -534,7 +751,7 @@ bool + GOMP_loop_ordered_runtime_next (long *istart, long *iend) + { + struct gomp_thread *thr = gomp_thread (); +- ++ + switch (thr->ts.work_share->sched) + { + case GFS_STATIC: +@@ -563,7 +780,7 @@ gomp_parallel_loop_start (void (*fn) (vo + num_threads = gomp_resolve_num_threads (num_threads, 0); + team = gomp_new_team (num_threads); + gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size); +- gomp_team_start (fn, data, num_threads, flags, team); ++ gomp_team_start (fn, data, num_threads, flags, team, NULL); + } + + void +@@ -600,7 +817,8 @@ GOMP_parallel_loop_runtime_start (void ( + { + struct gomp_task_icv *icv = gomp_icv (false); + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, +- icv->run_sched_var, icv->run_sched_chunk_size, 0); ++ icv->run_sched_var & ~GFS_MONOTONIC, ++ icv->run_sched_chunk_size, 0); + } + + ialias_redirect (GOMP_parallel_end) +@@ -638,11 +856,28 @@ GOMP_parallel_loop_guided (void (*fn) (v + GOMP_parallel_end (); + } + ++void ++GOMP_parallel_loop_runtime (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, long end, ++ long incr, unsigned flags) ++{ ++ struct gomp_task_icv *icv = gomp_icv (false); ++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, ++ icv->run_sched_var & ~GFS_MONOTONIC, ++ icv->run_sched_chunk_size, flags); ++ fn (data); ++ GOMP_parallel_end (); ++} ++ + #ifdef HAVE_ATTRIBUTE_ALIAS + extern __typeof(GOMP_parallel_loop_dynamic) GOMP_parallel_loop_nonmonotonic_dynamic + __attribute__((alias ("GOMP_parallel_loop_dynamic"))); + extern __typeof(GOMP_parallel_loop_guided) GOMP_parallel_loop_nonmonotonic_guided + __attribute__((alias ("GOMP_parallel_loop_guided"))); ++extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_nonmonotonic_runtime ++ __attribute__((alias ("GOMP_parallel_loop_runtime"))); ++extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_maybe_nonmonotonic_runtime ++ __attribute__((alias ("GOMP_parallel_loop_runtime"))); + #else + void + GOMP_parallel_loop_nonmonotonic_dynamic (void (*fn) (void *), void *data, +@@ -667,21 +902,35 @@ GOMP_parallel_loop_nonmonotonic_guided ( + fn (data); + GOMP_parallel_end (); + } +-#endif + + void +-GOMP_parallel_loop_runtime (void (*fn) (void *), void *data, +- unsigned num_threads, long start, long end, +- long incr, unsigned flags) ++GOMP_parallel_loop_nonmonotonic_runtime (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, ++ long end, long incr, unsigned flags) + { + struct gomp_task_icv *icv = gomp_icv (false); + gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, +- icv->run_sched_var, icv->run_sched_chunk_size, +- flags); ++ icv->run_sched_var & ~GFS_MONOTONIC, ++ icv->run_sched_chunk_size, flags); + fn (data); + GOMP_parallel_end (); + } + ++void ++GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*fn) (void *), void *data, ++ unsigned num_threads, long start, ++ long end, long incr, ++ unsigned flags) ++{ ++ struct gomp_task_icv *icv = gomp_icv (false); ++ gomp_parallel_loop_start (fn, data, num_threads, start, end, incr, ++ icv->run_sched_var & ~GFS_MONOTONIC, ++ icv->run_sched_chunk_size, flags); ++ fn (data); ++ GOMP_parallel_end (); ++} ++#endif ++ + /* The GOMP_loop_end* routines are called after the thread is told that + all loop iterations are 
complete. The first two versions synchronize + all threads; the nowait version does not. */ +@@ -721,6 +970,10 @@ extern __typeof(gomp_loop_dynamic_start) + __attribute__((alias ("gomp_loop_dynamic_start"))); + extern __typeof(gomp_loop_guided_start) GOMP_loop_nonmonotonic_guided_start + __attribute__((alias ("gomp_loop_guided_start"))); ++extern __typeof(GOMP_loop_runtime_start) GOMP_loop_nonmonotonic_runtime_start ++ __attribute__((alias ("GOMP_loop_runtime_start"))); ++extern __typeof(GOMP_loop_runtime_start) GOMP_loop_maybe_nonmonotonic_runtime_start ++ __attribute__((alias ("GOMP_loop_runtime_start"))); + + extern __typeof(gomp_loop_ordered_static_start) GOMP_loop_ordered_static_start + __attribute__((alias ("gomp_loop_ordered_static_start"))); +@@ -746,6 +999,10 @@ extern __typeof(gomp_loop_dynamic_next) + __attribute__((alias ("gomp_loop_dynamic_next"))); + extern __typeof(gomp_loop_guided_next) GOMP_loop_nonmonotonic_guided_next + __attribute__((alias ("gomp_loop_guided_next"))); ++extern __typeof(GOMP_loop_runtime_next) GOMP_loop_nonmonotonic_runtime_next ++ __attribute__((alias ("GOMP_loop_runtime_next"))); ++extern __typeof(GOMP_loop_runtime_next) GOMP_loop_maybe_nonmonotonic_runtime_next ++ __attribute__((alias ("GOMP_loop_runtime_next"))); + + extern __typeof(gomp_loop_ordered_static_next) GOMP_loop_ordered_static_next + __attribute__((alias ("gomp_loop_ordered_static_next"))); +@@ -791,6 +1048,20 @@ GOMP_loop_nonmonotonic_guided_start (lon + } + + bool ++GOMP_loop_nonmonotonic_runtime_start (long start, long end, long incr, ++ long *istart, long *iend) ++{ ++ return GOMP_loop_runtime_start (start, end, incr, istart, iend); ++} ++ ++bool ++GOMP_loop_maybe_nonmonotonic_runtime_start (long start, long end, long incr, ++ long *istart, long *iend) ++{ ++ return GOMP_loop_runtime_start (start, end, incr, istart, iend); ++} ++ ++bool + GOMP_loop_ordered_static_start (long start, long end, long incr, + long chunk_size, long *istart, long *iend) + { +@@ -869,6 +1140,18 @@ GOMP_loop_nonmonotonic_guided_next (long + } + + bool ++GOMP_loop_nonmonotonic_runtime_next (long *istart, long *iend) ++{ ++ return GOMP_loop_runtime_next (istart, iend); ++} ++ ++bool ++GOMP_loop_maybe_nonmonotonic_runtime_next (long *istart, long *iend) ++{ ++ return GOMP_loop_runtime_next (istart, iend); ++} ++ ++bool + GOMP_loop_ordered_static_next (long *istart, long *iend) + { + return gomp_loop_ordered_static_next (istart, iend); +--- libgomp/oacc-plugin.c.jj 2018-04-25 09:40:31.322655307 +0200 ++++ libgomp/oacc-plugin.c 2019-05-07 18:46:36.531109656 +0200 +@@ -49,3 +49,14 @@ GOMP_PLUGIN_acc_thread (void) + struct goacc_thread *thr = goacc_thread (); + return thr ? thr->target_tls : NULL; + } ++ ++int ++GOMP_PLUGIN_acc_default_dim (unsigned int i) ++{ ++ if (i >= GOMP_DIM_MAX) ++ { ++ gomp_fatal ("invalid dimension argument: %d", i); ++ return -1; ++ } ++ return goacc_default_dims[i]; ++} +--- libgomp/libgomp_g.h.jj 2018-04-25 09:40:31.320655306 +0200 ++++ libgomp/libgomp_g.h 2019-05-07 18:46:36.513109943 +0200 +@@ -1,4 +1,4 @@ +-/* Copyright (C) 2005-2018 Free Software Foundation, Inc. ++/* Copyright (C) 2005-2019 Free Software Foundation, Inc. + Contributed by Richard Henderson . 
+ + This file is part of the GNU Offloading and Multi Processing Library +@@ -31,6 +31,7 @@ + + #include + #include ++#include "gstdint.h" + + /* barrier.c */ + +@@ -56,6 +57,12 @@ extern bool GOMP_loop_nonmonotonic_dynam + long *, long *); + extern bool GOMP_loop_nonmonotonic_guided_start (long, long, long, long, + long *, long *); ++extern bool GOMP_loop_nonmonotonic_runtime_start (long, long, long, ++ long *, long *); ++extern bool GOMP_loop_maybe_nonmonotonic_runtime_start (long, long, long, ++ long *, long *); ++extern bool GOMP_loop_start (long, long, long, long, long, long *, long *, ++ uintptr_t *, void **); + + extern bool GOMP_loop_ordered_static_start (long, long, long, long, + long *, long *); +@@ -64,6 +71,8 @@ extern bool GOMP_loop_ordered_dynamic_st + extern bool GOMP_loop_ordered_guided_start (long, long, long, long, + long *, long *); + extern bool GOMP_loop_ordered_runtime_start (long, long, long, long *, long *); ++extern bool GOMP_loop_ordered_start (long, long, long, long, long, long *, ++ long *, uintptr_t *, void **); + + extern bool GOMP_loop_static_next (long *, long *); + extern bool GOMP_loop_dynamic_next (long *, long *); +@@ -71,6 +80,8 @@ extern bool GOMP_loop_guided_next (long + extern bool GOMP_loop_runtime_next (long *, long *); + extern bool GOMP_loop_nonmonotonic_dynamic_next (long *, long *); + extern bool GOMP_loop_nonmonotonic_guided_next (long *, long *); ++extern bool GOMP_loop_nonmonotonic_runtime_next (long *, long *); ++extern bool GOMP_loop_maybe_nonmonotonic_runtime_next (long *, long *); + + extern bool GOMP_loop_ordered_static_next (long *, long *); + extern bool GOMP_loop_ordered_dynamic_next (long *, long *); +@@ -85,6 +96,8 @@ extern bool GOMP_loop_doacross_guided_st + long *); + extern bool GOMP_loop_doacross_runtime_start (unsigned, long *, long *, + long *); ++extern bool GOMP_loop_doacross_start (unsigned, long *, long, long, long *, ++ long *, uintptr_t *, void **); + + extern void GOMP_parallel_loop_static_start (void (*)(void *), void *, + unsigned, long, long, long, long); +@@ -112,6 +125,13 @@ extern void GOMP_parallel_loop_nonmonoto + extern void GOMP_parallel_loop_nonmonotonic_guided (void (*)(void *), void *, + unsigned, long, long, + long, long, unsigned); ++extern void GOMP_parallel_loop_nonmonotonic_runtime (void (*)(void *), void *, ++ unsigned, long, long, ++ long, unsigned); ++extern void GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*)(void *), ++ void *, unsigned, ++ long, long, ++ long, unsigned); + + extern void GOMP_loop_end (void); + extern void GOMP_loop_end_nowait (void); +@@ -154,6 +174,21 @@ extern bool GOMP_loop_ull_nonmonotonic_g + unsigned long long, + unsigned long long *, + unsigned long long *); ++extern bool GOMP_loop_ull_nonmonotonic_runtime_start (bool, unsigned long long, ++ unsigned long long, ++ unsigned long long, ++ unsigned long long *, ++ unsigned long long *); ++extern bool GOMP_loop_ull_maybe_nonmonotonic_runtime_start (bool, ++ unsigned long long, ++ unsigned long long, ++ unsigned long long, ++ unsigned long long *, ++ unsigned long long *); ++extern bool GOMP_loop_ull_start (bool, unsigned long long, unsigned long long, ++ unsigned long long, long, unsigned long long, ++ unsigned long long *, unsigned long long *, ++ uintptr_t *, void **); + + extern bool GOMP_loop_ull_ordered_static_start (bool, unsigned long long, + unsigned long long, +@@ -178,6 +213,13 @@ extern bool GOMP_loop_ull_ordered_runtim + unsigned long long, + unsigned long long *, + unsigned long long *); ++extern bool 
GOMP_loop_ull_ordered_start (bool, unsigned long long, ++ unsigned long long, ++ unsigned long long, long, ++ unsigned long long, ++ unsigned long long *, ++ unsigned long long *, ++ uintptr_t *, void **); + + extern bool GOMP_loop_ull_static_next (unsigned long long *, + unsigned long long *); +@@ -191,6 +233,10 @@ extern bool GOMP_loop_ull_nonmonotonic_d + unsigned long long *); + extern bool GOMP_loop_ull_nonmonotonic_guided_next (unsigned long long *, + unsigned long long *); ++extern bool GOMP_loop_ull_nonmonotonic_runtime_next (unsigned long long *, ++ unsigned long long *); ++extern bool GOMP_loop_ull_maybe_nonmonotonic_runtime_next (unsigned long long *, ++ unsigned long long *); + + extern bool GOMP_loop_ull_ordered_static_next (unsigned long long *, + unsigned long long *); +@@ -220,6 +266,11 @@ extern bool GOMP_loop_ull_doacross_runti + unsigned long long *, + unsigned long long *, + unsigned long long *); ++extern bool GOMP_loop_ull_doacross_start (unsigned, unsigned long long *, ++ long, unsigned long long, ++ unsigned long long *, ++ unsigned long long *, ++ uintptr_t *, void **); + + /* ordered.c */ + +@@ -235,6 +286,8 @@ extern void GOMP_doacross_ull_wait (unsi + extern void GOMP_parallel_start (void (*) (void *), void *, unsigned); + extern void GOMP_parallel_end (void); + extern void GOMP_parallel (void (*) (void *), void *, unsigned, unsigned); ++extern unsigned GOMP_parallel_reductions (void (*) (void *), void *, unsigned, ++ unsigned); + extern bool GOMP_cancel (int, bool); + extern bool GOMP_cancellation_point (int); + +@@ -251,13 +304,19 @@ extern void GOMP_taskloop_ull (void (*) + unsigned long long, unsigned long long, + unsigned long long); + extern void GOMP_taskwait (void); ++extern void GOMP_taskwait_depend (void **); + extern void GOMP_taskyield (void); + extern void GOMP_taskgroup_start (void); + extern void GOMP_taskgroup_end (void); ++extern void GOMP_taskgroup_reduction_register (uintptr_t *); ++extern void GOMP_taskgroup_reduction_unregister (uintptr_t *); ++extern void GOMP_task_reduction_remap (size_t, size_t, void **); ++extern void GOMP_workshare_task_reduction_unregister (bool); + + /* sections.c */ + + extern unsigned GOMP_sections_start (unsigned); ++extern unsigned GOMP_sections2_start (unsigned, uintptr_t *, void **); + extern unsigned GOMP_sections_next (void); + extern void GOMP_parallel_sections_start (void (*) (void *), void *, + unsigned, unsigned); +@@ -293,6 +352,11 @@ extern void GOMP_target_enter_exit_data + void **); + extern void GOMP_teams (unsigned int, unsigned int); + ++/* teams.c */ ++ ++extern void GOMP_teams_reg (void (*) (void *), void *, unsigned, unsigned, ++ unsigned); ++ + /* oacc-parallel.c */ + + extern void GOACC_parallel_keyed (int, void (*) (void *), size_t, +--- libgomp/affinity.c.jj 2018-04-25 09:40:31.913655581 +0200 ++++ libgomp/affinity.c 2019-05-07 18:46:36.254114081 +0200 +@@ -26,6 +26,8 @@ + /* This is a generic stub implementation of a CPU affinity setting. 
*/ + + #include "libgomp.h" ++#include ++#include + + void + gomp_init_affinity (void) +@@ -138,5 +140,17 @@ gomp_get_place_proc_ids_8 (int place_num + (void) ids; + } + ++void ++gomp_display_affinity_place (char *buffer, size_t size, size_t *ret, ++ int place) ++{ ++ char buf[sizeof (long) * 3 + 4]; ++ if (gomp_available_cpus > 1) ++ sprintf (buf, "0-%lu", gomp_available_cpus - 1); ++ else ++ strcpy (buf, "0"); ++ gomp_display_string (buffer, size, ret, buf, strlen (buf)); ++} ++ + ialias(omp_get_place_num_procs) + ialias(omp_get_place_proc_ids) +--- libgomp/sections.c.jj 2018-04-25 09:40:31.924655586 +0200 ++++ libgomp/sections.c 2019-05-07 18:46:36.535109592 +0200 +@@ -26,8 +26,11 @@ + /* This file handles the SECTIONS construct. */ + + #include "libgomp.h" ++#include + + ++ialias_redirect (GOMP_taskgroup_reduction_register) ++ + /* Initialize the given work share construct from the given arguments. */ + + static inline void +@@ -72,7 +75,7 @@ GOMP_sections_start (unsigned count) + struct gomp_thread *thr = gomp_thread (); + long s, e, ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_sections_init (thr->ts.work_share, count); + gomp_work_share_init_done (); +@@ -95,6 +98,66 @@ GOMP_sections_start (unsigned count) + return ret; + } + ++unsigned ++GOMP_sections2_start (unsigned count, uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ long s, e, ret; ++ ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (gomp_work_share_start (0)) ++ { ++ gomp_sections_init (thr->ts.work_share, count); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ if (mem) ++ { ++ uintptr_t size = (uintptr_t) *mem; ++ if (size > (sizeof (struct gomp_work_share) ++ - offsetof (struct gomp_work_share, ++ inline_ordered_team_ids))) ++ thr->ts.work_share->ordered_team_ids ++ = gomp_malloc_cleared (size); ++ else ++ memset (thr->ts.work_share->ordered_team_ids, '\0', size); ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ if (mem) ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ ++#ifdef HAVE_SYNC_BUILTINS ++ if (gomp_iter_dynamic_next (&s, &e)) ++ ret = s; ++ else ++ ret = 0; ++#else ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ if (gomp_iter_dynamic_next_locked (&s, &e)) ++ ret = s; ++ else ++ ret = 0; ++ gomp_mutex_unlock (&thr->ts.work_share->lock); ++#endif ++ ++ return ret; ++} ++ + /* This routine is called when the thread completes processing of the + section currently assigned to it. 
If the work-share construct is + bound directly to a parallel construct, then the construct may have +@@ -140,7 +203,7 @@ GOMP_parallel_sections_start (void (*fn) + num_threads = gomp_resolve_num_threads (num_threads, count); + team = gomp_new_team (num_threads); + gomp_sections_init (&team->work_shares[0], count); +- gomp_team_start (fn, data, num_threads, 0, team); ++ gomp_team_start (fn, data, num_threads, 0, team, NULL); + } + + ialias_redirect (GOMP_parallel_end) +@@ -154,7 +217,7 @@ GOMP_parallel_sections (void (*fn) (void + num_threads = gomp_resolve_num_threads (num_threads, count); + team = gomp_new_team (num_threads); + gomp_sections_init (&team->work_shares[0], count); +- gomp_team_start (fn, data, num_threads, flags, team); ++ gomp_team_start (fn, data, num_threads, flags, team, NULL); + fn (data); + GOMP_parallel_end (); + } +--- libgomp/config/linux/affinity.c.jj 2018-04-25 09:40:31.875655563 +0200 ++++ libgomp/config/linux/affinity.c 2019-05-07 18:46:36.344112642 +0200 +@@ -396,6 +396,56 @@ gomp_get_place_proc_ids_8 (int place_num + *ids++ = i; + } + ++void ++gomp_display_affinity_place (char *buffer, size_t size, size_t *ret, ++ int place) ++{ ++ cpu_set_t *cpusetp; ++ char buf[sizeof (long) * 3 + 4]; ++ if (place >= 0 && place < gomp_places_list_len) ++ cpusetp = (cpu_set_t *) gomp_places_list[place]; ++ else if (gomp_cpusetp) ++ cpusetp = gomp_cpusetp; ++ else ++ { ++ if (gomp_available_cpus > 1) ++ sprintf (buf, "0-%lu", gomp_available_cpus - 1); ++ else ++ strcpy (buf, "0"); ++ gomp_display_string (buffer, size, ret, buf, strlen (buf)); ++ return; ++ } ++ ++ unsigned long i, max = 8 * gomp_cpuset_size, start; ++ bool prev_set = false; ++ start = max; ++ for (i = 0; i <= max; i++) ++ { ++ bool this_set; ++ if (i == max) ++ this_set = false; ++ else ++ this_set = CPU_ISSET_S (i, gomp_cpuset_size, cpusetp); ++ if (this_set != prev_set) ++ { ++ prev_set = this_set; ++ if (this_set) ++ { ++ char *p = buf; ++ if (start != max) ++ *p++ = ','; ++ sprintf (p, "%lu", i); ++ start = i; ++ } ++ else if (i == start + 1) ++ continue; ++ else ++ sprintf (buf, "-%lu", i - 1); ++ gomp_display_string (buffer, size, ret, buf, strlen (buf)); ++ } ++ } ++} ++ + ialias(omp_get_place_num_procs) + ialias(omp_get_place_proc_ids) + +--- libgomp/config/linux/ia64/futex.h.jj 2018-04-25 09:40:31.877655564 +0200 ++++ libgomp/config/linux/ia64/futex.h 2019-05-07 18:46:36.344112642 +0200 +@@ -45,8 +45,8 @@ sys_futex0(int *addr, int op, int val) + "=r"(r8), "=r"(r10) + : "r"(r15), "r"(out0), "r"(out1), "r"(out2), "r"(out3) + : "memory", "out4", "out5", "out6", "out7", +- /* Non-stacked integer registers, minus r8, r10, r15. */ +- "r2", "r3", "r9", "r11", "r12", "r13", "r14", "r16", "r17", "r18", ++ /* Non-stacked integer registers, minus r8, r10, r12, r15. */ ++ "r2", "r3", "r9", "r11", "r13", "r14", "r16", "r17", "r18", + "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", + "r28", "r29", "r30", "r31", + /* Predicate registers. */ +--- libgomp/config/nvptx/teams.c.jj 2019-05-07 18:46:36.459110805 +0200 ++++ libgomp/config/nvptx/teams.c 2019-05-07 18:46:36.459110805 +0200 +@@ -0,0 +1,57 @@ ++/* Copyright (C) 2015-2019 Free Software Foundation, Inc. ++ Contributed by Alexander Monakov ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). 
++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . */ ++ ++/* This file defines OpenMP API entry points that accelerator targets are ++ expected to replace. */ ++ ++#include "libgomp.h" ++ ++void ++GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams, ++ unsigned int thread_limit, unsigned int flags) ++{ ++ (void) fn; ++ (void) data; ++ (void) flags; ++ (void) num_teams; ++ (void) thread_limit; ++} ++ ++int ++omp_get_num_teams (void) ++{ ++ return gomp_num_teams_var + 1; ++} ++ ++int ++omp_get_team_num (void) ++{ ++ int ctaid; ++ asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid)); ++ return ctaid; ++} ++ ++ialias (omp_get_num_teams) ++ialias (omp_get_team_num) +--- libgomp/config/nvptx/team.c.jj 2018-04-25 09:40:31.890655570 +0200 ++++ libgomp/config/nvptx/team.c 2019-05-07 18:46:36.459110805 +0200 +@@ -116,7 +116,8 @@ gomp_thread_start (struct gomp_thread_po + + void + gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, +- unsigned flags, struct gomp_team *team) ++ unsigned flags, struct gomp_team *team, ++ struct gomp_taskgroup *taskgroup) + { + struct gomp_thread *thr, *nthr; + struct gomp_task *task; +@@ -147,6 +148,7 @@ gomp_team_start (void (*fn) (void *), vo + nthreads_var = icv->nthreads_var; + gomp_init_task (thr->task, task, icv); + team->implicit_task[0].icv.nthreads_var = nthreads_var; ++ team->implicit_task[0].taskgroup = taskgroup; + + if (nthreads == 1) + return; +@@ -166,6 +168,7 @@ gomp_team_start (void (*fn) (void *), vo + nthr->task = &team->implicit_task[i]; + gomp_init_task (nthr->task, task, icv); + team->implicit_task[i].icv.nthreads_var = nthreads_var; ++ team->implicit_task[i].taskgroup = taskgroup; + nthr->fn = fn; + nthr->data = data; + team->ordered_release[i] = &nthr->release; +@@ -174,5 +177,11 @@ gomp_team_start (void (*fn) (void *), vo + gomp_simple_barrier_wait (&pool->threads_dock); + } + ++int ++gomp_pause_host (void) ++{ ++ return -1; ++} ++ + #include "../../team.c" + #endif +--- libgomp/config/nvptx/oacc-parallel.c.jj 2018-04-25 09:40:31.887655569 +0200 ++++ libgomp/config/nvptx/oacc-parallel.c 2019-05-07 18:46:36.453110901 +0200 +@@ -1,358 +0,0 @@ +-/* OpenACC constructs +- +- Copyright (C) 2014-2018 Free Software Foundation, Inc. +- +- Contributed by Mentor Embedded. +- +- This file is part of the GNU Offloading and Multi Processing Library +- (libgomp). +- +- Libgomp is free software; you can redistribute it and/or modify it +- under the terms of the GNU General Public License as published by +- the Free Software Foundation; either version 3, or (at your option) +- any later version. 
+- +- Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY +- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +- FOR A PARTICULAR PURPOSE. See the GNU General Public License for +- more details. +- +- Under Section 7 of GPL version 3, you are granted additional +- permissions described in the GCC Runtime Library Exception, version +- 3.1, as published by the Free Software Foundation. +- +- You should have received a copy of the GNU General Public License and +- a copy of the GCC Runtime Library Exception along with this program; +- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +- . */ +- +-#include "libgomp_g.h" +- +-__asm__ (".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1);\n" +- ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1);\n" +- ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1);\n" +- ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1);\n" +- "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_num_threads\n" +- ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads;\n" +- "// BEGIN GLOBAL FUNCTION DECL: GOACC_get_thread_num\n" +- ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num;\n" +- "// BEGIN GLOBAL FUNCTION DECL: abort\n" +- ".extern .func abort;\n" +- ".visible .func (.param .u32 %out_retval) GOACC_tid (.param .u32 %in_ar1)\n" +- "{\n" +- ".reg .u32 %ar1;\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- ".reg .pred %r30;\n" +- ".reg .u32 %r31;\n" +- ".reg .pred %r32;\n" +- ".reg .u32 %r33;\n" +- ".reg .pred %r34;\n" +- ".local .align 8 .b8 %frame[4];\n" +- "ld.param.u32 %ar1,[%in_ar1];\n" +- "mov.u32 %r27,%ar1;\n" +- "st.local.u32 [%frame],%r27;\n" +- "ld.local.u32 %r28,[%frame];\n" +- "mov.u32 %r29,1;\n" +- "setp.eq.u32 %r30,%r28,%r29;\n" +- "@%r30 bra $L4;\n" +- "mov.u32 %r31,2;\n" +- "setp.eq.u32 %r32,%r28,%r31;\n" +- "@%r32 bra $L5;\n" +- "mov.u32 %r33,0;\n" +- "setp.eq.u32 %r34,%r28,%r33;\n" +- "@!%r34 bra $L8;\n" +- "mov.u32 %r23,%tid.x;\n" +- "mov.u32 %r22,%r23;\n" +- "bra $L7;\n" +- "$L4:\n" +- "mov.u32 %r24,%tid.y;\n" +- "mov.u32 %r22,%r24;\n" +- "bra $L7;\n" +- "$L5:\n" +- "mov.u32 %r25,%tid.z;\n" +- "mov.u32 %r22,%r25;\n" +- "bra $L7;\n" +- "$L8:\n" +- "{\n" +- "{\n" +- "call abort;\n" +- "}\n" +- "}\n" +- "$L7:\n" +- "mov.u32 %r26,%r22;\n" +- "mov.u32 %retval,%r26;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n" +- ".visible .func (.param .u32 %out_retval) GOACC_ntid (.param .u32 %in_ar1)\n" +- "{\n" +- ".reg .u32 %ar1;\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- ".reg .pred %r30;\n" +- ".reg .u32 %r31;\n" +- ".reg .pred %r32;\n" +- ".reg .u32 %r33;\n" +- ".reg .pred %r34;\n" +- ".local .align 8 .b8 %frame[4];\n" +- "ld.param.u32 %ar1,[%in_ar1];\n" +- "mov.u32 %r27,%ar1;\n" +- "st.local.u32 [%frame],%r27;\n" +- "ld.local.u32 %r28,[%frame];\n" +- "mov.u32 %r29,1;\n" +- "setp.eq.u32 %r30,%r28,%r29;\n" +- "@%r30 bra $L11;\n" +- "mov.u32 %r31,2;\n" +- "setp.eq.u32 %r32,%r28,%r31;\n" +- "@%r32 bra $L12;\n" +- "mov.u32 %r33,0;\n" +- "setp.eq.u32 %r34,%r28,%r33;\n" +- "@!%r34 bra $L15;\n" +- 
"mov.u32 %r23,%ntid.x;\n" +- "mov.u32 %r22,%r23;\n" +- "bra $L14;\n" +- "$L11:\n" +- "mov.u32 %r24,%ntid.y;\n" +- "mov.u32 %r22,%r24;\n" +- "bra $L14;\n" +- "$L12:\n" +- "mov.u32 %r25,%ntid.z;\n" +- "mov.u32 %r22,%r25;\n" +- "bra $L14;\n" +- "$L15:\n" +- "{\n" +- "{\n" +- "call abort;\n" +- "}\n" +- "}\n" +- "$L14:\n" +- "mov.u32 %r26,%r22;\n" +- "mov.u32 %retval,%r26;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n" +- ".visible .func (.param .u32 %out_retval) GOACC_ctaid (.param .u32 %in_ar1)\n" +- "{\n" +- ".reg .u32 %ar1;\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- ".reg .pred %r30;\n" +- ".reg .u32 %r31;\n" +- ".reg .pred %r32;\n" +- ".reg .u32 %r33;\n" +- ".reg .pred %r34;\n" +- ".local .align 8 .b8 %frame[4];\n" +- "ld.param.u32 %ar1,[%in_ar1];\n" +- "mov.u32 %r27,%ar1;\n" +- "st.local.u32 [%frame],%r27;\n" +- "ld.local.u32 %r28,[%frame];\n" +- "mov.u32 %r29,1;\n" +- "setp.eq.u32 %r30,%r28,%r29;\n" +- "@%r30 bra $L18;\n" +- "mov.u32 %r31,2;\n" +- "setp.eq.u32 %r32,%r28,%r31;\n" +- "@%r32 bra $L19;\n" +- "mov.u32 %r33,0;\n" +- "setp.eq.u32 %r34,%r28,%r33;\n" +- "@!%r34 bra $L22;\n" +- "mov.u32 %r23,%ctaid.x;\n" +- "mov.u32 %r22,%r23;\n" +- "bra $L21;\n" +- "$L18:\n" +- "mov.u32 %r24,%ctaid.y;\n" +- "mov.u32 %r22,%r24;\n" +- "bra $L21;\n" +- "$L19:\n" +- "mov.u32 %r25,%ctaid.z;\n" +- "mov.u32 %r22,%r25;\n" +- "bra $L21;\n" +- "$L22:\n" +- "{\n" +- "{\n" +- "call abort;\n" +- "}\n" +- "}\n" +- "$L21:\n" +- "mov.u32 %r26,%r22;\n" +- "mov.u32 %retval,%r26;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n" +- ".visible .func (.param .u32 %out_retval) GOACC_nctaid (.param .u32 %in_ar1)\n" +- "{\n" +- ".reg .u32 %ar1;\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- ".reg .pred %r30;\n" +- ".reg .u32 %r31;\n" +- ".reg .pred %r32;\n" +- ".reg .u32 %r33;\n" +- ".reg .pred %r34;\n" +- ".local .align 8 .b8 %frame[4];\n" +- "ld.param.u32 %ar1,[%in_ar1];\n" +- "mov.u32 %r27,%ar1;\n" +- "st.local.u32 [%frame],%r27;\n" +- "ld.local.u32 %r28,[%frame];\n" +- "mov.u32 %r29,1;\n" +- "setp.eq.u32 %r30,%r28,%r29;\n" +- "@%r30 bra $L25;\n" +- "mov.u32 %r31,2;\n" +- "setp.eq.u32 %r32,%r28,%r31;\n" +- "@%r32 bra $L26;\n" +- "mov.u32 %r33,0;\n" +- "setp.eq.u32 %r34,%r28,%r33;\n" +- "@!%r34 bra $L29;\n" +- "mov.u32 %r23,%nctaid.x;\n" +- "mov.u32 %r22,%r23;\n" +- "bra $L28;\n" +- "$L25:\n" +- "mov.u32 %r24,%nctaid.y;\n" +- "mov.u32 %r22,%r24;\n" +- "bra $L28;\n" +- "$L26:\n" +- "mov.u32 %r25,%nctaid.z;\n" +- "mov.u32 %r22,%r25;\n" +- "bra $L28;\n" +- "$L29:\n" +- "{\n" +- "{\n" +- "call abort;\n" +- "}\n" +- "}\n" +- "$L28:\n" +- "mov.u32 %r26,%r22;\n" +- "mov.u32 %retval,%r26;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n" +- "// BEGIN GLOBAL FUNCTION DEF: GOACC_get_num_threads\n" +- ".visible .func (.param .u32 %out_retval) GOACC_get_num_threads\n" +- "{\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- "mov.u32 %r26,0;\n" +- "{\n" +- ".param .u32 %retval_in;\n" +- "{\n" +- ".param .u32 
%out_arg0;\n" +- "st.param.u32 [%out_arg0],%r26;\n" +- "call (%retval_in),GOACC_ntid,(%out_arg0);\n" +- "}\n" +- "ld.param.u32 %r27,[%retval_in];\n" +- "}\n" +- "mov.u32 %r22,%r27;\n" +- "mov.u32 %r28,0;\n" +- "{\n" +- ".param .u32 %retval_in;\n" +- "{\n" +- ".param .u32 %out_arg0;\n" +- "st.param.u32 [%out_arg0],%r28;\n" +- "call (%retval_in),GOACC_nctaid,(%out_arg0);\n" +- "}\n" +- "ld.param.u32 %r29,[%retval_in];\n" +- "}\n" +- "mov.u32 %r23,%r29;\n" +- "mul.lo.u32 %r24,%r22,%r23;\n" +- "mov.u32 %r25,%r24;\n" +- "mov.u32 %retval,%r25;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n" +- "// BEGIN GLOBAL FUNCTION DEF: GOACC_get_thread_num\n" +- ".visible .func (.param .u32 %out_retval) GOACC_get_thread_num\n" +- "{\n" +- ".reg .u32 %retval;\n" +- ".reg .u64 %hr10;\n" +- ".reg .u32 %r22;\n" +- ".reg .u32 %r23;\n" +- ".reg .u32 %r24;\n" +- ".reg .u32 %r25;\n" +- ".reg .u32 %r26;\n" +- ".reg .u32 %r27;\n" +- ".reg .u32 %r28;\n" +- ".reg .u32 %r29;\n" +- ".reg .u32 %r30;\n" +- ".reg .u32 %r31;\n" +- ".reg .u32 %r32;\n" +- ".reg .u32 %r33;\n" +- "mov.u32 %r28,0;\n" +- "{\n" +- ".param .u32 %retval_in;\n" +- "{\n" +- ".param .u32 %out_arg0;\n" +- "st.param.u32 [%out_arg0],%r28;\n" +- "call (%retval_in),GOACC_ntid,(%out_arg0);\n" +- "}\n" +- "ld.param.u32 %r29,[%retval_in];\n" +- "}\n" +- "mov.u32 %r22,%r29;\n" +- "mov.u32 %r30,0;\n" +- "{\n" +- ".param .u32 %retval_in;\n" +- "{\n" +- ".param .u32 %out_arg0;\n" +- "st.param.u32 [%out_arg0],%r30;\n" +- "call (%retval_in),GOACC_ctaid,(%out_arg0);\n" +- "}\n" +- "ld.param.u32 %r31,[%retval_in];\n" +- "}\n" +- "mov.u32 %r23,%r31;\n" +- "mul.lo.u32 %r24,%r22,%r23;\n" +- "mov.u32 %r32,0;\n" +- "{\n" +- ".param .u32 %retval_in;\n" +- "{\n" +- ".param .u32 %out_arg0;\n" +- "st.param.u32 [%out_arg0],%r32;\n" +- "call (%retval_in),GOACC_tid,(%out_arg0);\n" +- "}\n" +- "ld.param.u32 %r33,[%retval_in];\n" +- "}\n" +- "mov.u32 %r25,%r33;\n" +- "add.u32 %r26,%r24,%r25;\n" +- "mov.u32 %r27,%r26;\n" +- "mov.u32 %retval,%r27;\n" +- "st.param.u32 [%out_retval],%retval;\n" +- "ret;\n" +- "}\n"); +--- libgomp/config/nvptx/target.c.jj 2018-04-25 09:40:31.890655570 +0200 ++++ libgomp/config/nvptx/target.c 2019-05-07 18:46:36.453110901 +0200 +@@ -47,3 +47,21 @@ GOMP_teams (unsigned int num_teams, unsi + } + gomp_num_teams_var = num_teams - 1; + } ++ ++int ++omp_pause_resource (omp_pause_resource_t kind, int device_num) ++{ ++ (void) kind; ++ (void) device_num; ++ return -1; ++} ++ ++int ++omp_pause_resource_all (omp_pause_resource_t kind) ++{ ++ (void) kind; ++ return -1; ++} ++ ++ialias (omp_pause_resource) ++ialias (omp_pause_resource_all) +--- libgomp/config/nvptx/icv-device.c.jj 2018-04-25 09:40:31.889655570 +0200 ++++ libgomp/config/nvptx/icv-device.c 2019-05-07 18:46:36.453110901 +0200 +@@ -46,20 +46,6 @@ omp_get_num_devices (void) + } + + int +-omp_get_num_teams (void) +-{ +- return gomp_num_teams_var + 1; +-} +- +-int +-omp_get_team_num (void) +-{ +- int ctaid; +- asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid)); +- return ctaid; +-} +- +-int + omp_is_initial_device (void) + { + /* NVPTX is an accelerator-only target. 
*/ +@@ -69,6 +55,4 @@ omp_is_initial_device (void) + ialias (omp_set_default_device) + ialias (omp_get_default_device) + ialias (omp_get_num_devices) +-ialias (omp_get_num_teams) +-ialias (omp_get_team_num) + ialias (omp_is_initial_device) +--- libgomp/config/nvptx/affinity-fmt.c.jj 2019-05-07 18:46:36.358112419 +0200 ++++ libgomp/config/nvptx/affinity-fmt.c 2019-05-07 18:46:36.358112419 +0200 +@@ -0,0 +1,51 @@ ++/* Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . */ ++ ++#include "libgomp.h" ++#include ++#include ++#include ++#ifdef HAVE_UNISTD_H ++#include ++#endif ++#ifdef HAVE_INTTYPES_H ++# include /* For PRIx64. */ ++#endif ++#ifdef HAVE_UNAME ++#include ++#endif ++ ++/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for nvptx, ++ while the nvptx newlib implementation does not support those functions. ++ Override the configure test results here. */ ++#undef HAVE_GETPID ++#undef HAVE_GETHOSTNAME ++ ++/* The nvptx newlib implementation does not support fwrite, but it does support ++ write. Map fwrite to write. */ ++#undef fwrite ++#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size)) ++ ++#include "../../affinity-fmt.c" ++ +--- libgomp/config/mingw32/affinity-fmt.c.jj 2019-05-07 18:46:36.344112642 +0200 ++++ libgomp/config/mingw32/affinity-fmt.c 2019-05-07 18:46:36.344112642 +0200 +@@ -0,0 +1,68 @@ ++/* Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek . ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . 
*/ ++ ++#include "libgomp.h" ++#include ++#include ++#include ++#ifdef HAVE_UNISTD_H ++#include ++#endif ++#ifdef HAVE_INTTYPES_H ++# include /* For PRIx64. */ ++#endif ++#define WIN32_LEAN_AND_MEAN ++#include ++#include ++ ++static int ++gomp_gethostname (char *name, size_t len) ++{ ++ /* On Win9x GetComputerName fails if the input size is less ++ than MAX_COMPUTERNAME_LENGTH + 1. */ ++ char buffer[MAX_COMPUTERNAME_LENGTH + 1]; ++ DWORD size = sizeof (buffer); ++ int ret = 0; ++ ++ if (!GetComputerName (buffer, &size)) ++ return -1; ++ ++ if ((size = strlen (buffer) + 1) > len) ++ { ++ errno = EINVAL; ++ /* Truncate as per POSIX spec. We do not NUL-terminate. */ ++ size = len; ++ ret = -1; ++ } ++ memcpy (name, buffer, (size_t) size); ++ ++ return ret; ++} ++ ++#undef gethostname ++#define gethostname gomp_gethostname ++#define HAVE_GETHOSTNAME 1 ++ ++#include "../../affinity-fmt.c" +--- libgomp/config/rtems/bar.c.jj 2018-04-25 09:40:31.902655576 +0200 ++++ libgomp/config/rtems/bar.c 2019-05-07 18:46:36.460110789 +0200 +@@ -72,184 +72,5 @@ do_wait (int *addr, int val) + futex_wait (addr, val); + } + +-/* Everything below this point should be identical to the Linux +- implementation. */ +- +-void +-gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) +-{ +- if (__builtin_expect (state & BAR_WAS_LAST, 0)) +- { +- /* Next time we'll be awaiting TOTAL threads again. */ +- bar->awaited = bar->total; +- __atomic_store_n (&bar->generation, bar->generation + BAR_INCR, +- MEMMODEL_RELEASE); +- futex_wake ((int *) &bar->generation, INT_MAX); +- } +- else +- { +- do +- do_wait ((int *) &bar->generation, state); +- while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE) == state); +- } +-} +- +-void +-gomp_barrier_wait (gomp_barrier_t *bar) +-{ +- gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); +-} +- +-/* Like gomp_barrier_wait, except that if the encountering thread +- is not the last one to hit the barrier, it returns immediately. +- The intended usage is that a thread which intends to gomp_barrier_destroy +- this barrier calls gomp_barrier_wait, while all other threads +- call gomp_barrier_wait_last. When gomp_barrier_wait returns, +- the barrier can be safely destroyed. */ +- +-void +-gomp_barrier_wait_last (gomp_barrier_t *bar) +-{ +- gomp_barrier_state_t state = gomp_barrier_wait_start (bar); +- if (state & BAR_WAS_LAST) +- gomp_barrier_wait_end (bar, state); +-} +- +-void +-gomp_team_barrier_wake (gomp_barrier_t *bar, int count) +-{ +- futex_wake ((int *) &bar->generation, count == 0 ? INT_MAX : count); +-} +- +-void +-gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) +-{ +- unsigned int generation, gen; +- +- if (__builtin_expect (state & BAR_WAS_LAST, 0)) +- { +- /* Next time we'll be awaiting TOTAL threads again. 
*/ +- struct gomp_thread *thr = gomp_thread (); +- struct gomp_team *team = thr->ts.team; +- +- bar->awaited = bar->total; +- team->work_share_cancelled = 0; +- if (__builtin_expect (team->task_count, 0)) +- { +- gomp_barrier_handle_tasks (state); +- state &= ~BAR_WAS_LAST; +- } +- else +- { +- state &= ~BAR_CANCELLED; +- state += BAR_INCR - BAR_WAS_LAST; +- __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); +- futex_wake ((int *) &bar->generation, INT_MAX); +- return; +- } +- } +- +- generation = state; +- state &= ~BAR_CANCELLED; +- do +- { +- do_wait ((int *) &bar->generation, generation); +- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); +- if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) +- { +- gomp_barrier_handle_tasks (state); +- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); +- } +- generation |= gen & BAR_WAITING_FOR_TASK; +- } +- while (gen != state + BAR_INCR); +-} +- +-void +-gomp_team_barrier_wait (gomp_barrier_t *bar) +-{ +- gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar)); +-} +- +-void +-gomp_team_barrier_wait_final (gomp_barrier_t *bar) +-{ +- gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar); +- if (__builtin_expect (state & BAR_WAS_LAST, 0)) +- bar->awaited_final = bar->total; +- gomp_team_barrier_wait_end (bar, state); +-} +- +-bool +-gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar, +- gomp_barrier_state_t state) +-{ +- unsigned int generation, gen; +- +- if (__builtin_expect (state & BAR_WAS_LAST, 0)) +- { +- /* Next time we'll be awaiting TOTAL threads again. */ +- /* BAR_CANCELLED should never be set in state here, because +- cancellation means that at least one of the threads has been +- cancelled, thus on a cancellable barrier we should never see +- all threads to arrive. 
*/ +- struct gomp_thread *thr = gomp_thread (); +- struct gomp_team *team = thr->ts.team; +- +- bar->awaited = bar->total; +- team->work_share_cancelled = 0; +- if (__builtin_expect (team->task_count, 0)) +- { +- gomp_barrier_handle_tasks (state); +- state &= ~BAR_WAS_LAST; +- } +- else +- { +- state += BAR_INCR - BAR_WAS_LAST; +- __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); +- futex_wake ((int *) &bar->generation, INT_MAX); +- return false; +- } +- } +- +- if (__builtin_expect (state & BAR_CANCELLED, 0)) +- return true; +- +- generation = state; +- do +- { +- do_wait ((int *) &bar->generation, generation); +- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); +- if (__builtin_expect (gen & BAR_CANCELLED, 0)) +- return true; +- if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) +- { +- gomp_barrier_handle_tasks (state); +- gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); +- } +- generation |= gen & BAR_WAITING_FOR_TASK; +- } +- while (gen != state + BAR_INCR); +- +- return false; +-} +- +-bool +-gomp_team_barrier_wait_cancel (gomp_barrier_t *bar) +-{ +- return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar)); +-} +- +-void +-gomp_team_barrier_cancel (struct gomp_team *team) +-{ +- gomp_mutex_lock (&team->task_lock); +- if (team->barrier.generation & BAR_CANCELLED) +- { +- gomp_mutex_unlock (&team->task_lock); +- return; +- } +- team->barrier.generation |= BAR_CANCELLED; +- gomp_mutex_unlock (&team->task_lock); +- futex_wake ((int *) &team->barrier.generation, INT_MAX); +-} ++#define GOMP_WAIT_H 1 ++#include "../linux/bar.c" +--- libgomp/config/rtems/affinity-fmt.c.jj 2019-05-07 18:46:36.459110805 +0200 ++++ libgomp/config/rtems/affinity-fmt.c 2019-05-07 18:46:36.459110805 +0200 +@@ -0,0 +1,49 @@ ++/* Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . */ ++ ++#include "libgomp.h" ++#include ++#include ++#include ++#ifdef HAVE_UNISTD_H ++#include ++#endif ++#ifdef HAVE_INTTYPES_H ++# include /* For PRIx64. */ ++#endif ++#ifdef HAVE_UNAME ++#include ++#endif ++ ++/* The HAVE_GETPID and HAVE_GETHOSTNAME configure tests are passing for RTEMS, ++ but the extra information they give are of little value for the user. ++ Override the configure test results here. */ ++#undef HAVE_GETPID ++#undef HAVE_GETHOSTNAME ++ ++/* Avoid the complex fwrite() in favour of the simple write(). 
*/ ++#undef fwrite ++#define fwrite(ptr, size, nmemb, stream) write (1, (ptr), (nmemb) * (size)) ++ ++#include "../../affinity-fmt.c" +--- libgomp/config.h.in.jj 2018-04-25 09:40:31.870655561 +0200 ++++ libgomp/config.h.in 2019-05-07 18:46:36.465110710 +0200 +@@ -1,5 +1,8 @@ + /* config.h.in. Generated from configure.ac by autoheader. */ + ++/* Define to 1 if you have the `aligned_alloc' function. */ ++#undef HAVE_ALIGNED_ALLOC ++ + /* Define to 1 if the target assembler supports .symver directive. */ + #undef HAVE_AS_SYMVER_DIRECTIVE + +@@ -33,9 +36,15 @@ + /* Define to 1 if you have the `getgid' function. */ + #undef HAVE_GETGID + ++/* Define if gethostname is supported. */ ++#undef HAVE_GETHOSTNAME ++ + /* Define to 1 if you have the `getloadavg' function. */ + #undef HAVE_GETLOADAVG + ++/* Define if getpid is supported. */ ++#undef HAVE_GETPID ++ + /* Define to 1 if you have the `getuid' function. */ + #undef HAVE_GETUID + +@@ -45,9 +54,15 @@ + /* Define to 1 if you have the `dl' library (-ldl). */ + #undef HAVE_LIBDL + ++/* Define to 1 if you have the `memalign' function. */ ++#undef HAVE_MEMALIGN ++ + /* Define to 1 if you have the <memory.h> header file. */ + #undef HAVE_MEMORY_H + ++/* Define to 1 if you have the `posix_memalign' function. */ ++#undef HAVE_POSIX_MEMALIGN ++ + /* Define if pthread_{,attr_}{g,s}etaffinity_np is supported. */ + #undef HAVE_PTHREAD_AFFINITY_NP + +@@ -103,9 +118,15 @@ + /* Define to 1 if the target supports thread-local storage. */ + #undef HAVE_TLS + ++/* Define if uname is supported and struct utsname has nodename field. */ ++#undef HAVE_UNAME ++ + /* Define to 1 if you have the <unistd.h> header file. */ + #undef HAVE_UNISTD_H + ++/* Define to 1 if you have the `_aligned_malloc' function. */ ++#undef HAVE__ALIGNED_MALLOC ++ + /* Define to 1 if you have the `__secure_getenv' function. */ + #undef HAVE___SECURE_GETENV + +@@ -125,8 +146,8 @@ + */ + #undef LT_OBJDIR + +-/* Define to offload targets, separated by commas. */ +-#undef OFFLOAD_TARGETS ++/* Define to offload plugins, separated by commas. */ ++#undef OFFLOAD_PLUGINS + + /* Name of package */ + #undef PACKAGE +--- libgomp/teams.c.jj 2019-05-07 18:46:36.548109384 +0200 ++++ libgomp/teams.c 2019-05-07 18:46:36.548109384 +0200 +@@ -0,0 +1,74 @@ ++/* Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek <jakub@redhat.com>. ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ <http://www.gnu.org/licenses/>. */ ++ ++/* This file handles the host TEAMS construct.
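The GOMP_teams_reg entry point defined just below runs the outlined region num_teams times in sequence on the host, with omp_get_team_num () reporting the current iteration. A sketch of what a host teams construct conceptually lowers to; the outlined function and the direct call are illustrative, only the GOMP_teams_reg signature matches the real ABI, and it needs the patched libgomp to link:

    #include <stdio.h>
    #include <omp.h>

    extern void GOMP_teams_reg (void (*fn) (void *), void *data,
                                unsigned int num_teams,
                                unsigned int thread_limit,
                                unsigned int flags);

    static void
    teams_body (void *data)
    {
      (void) data;
      printf ("team %d of %d\n", omp_get_team_num (), omp_get_num_teams ());
    }

    int
    main (void)
    {
      /* Roughly what the compiler emits for:
           #pragma omp teams num_teams (4)
           { ... }  */
      GOMP_teams_reg (teams_body, NULL, 4, 0, 0);
      return 0;
    }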
*/ ++ ++#include "libgomp.h" ++#include ++ ++static unsigned gomp_num_teams = 1, gomp_team_num = 0; ++ ++void ++GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams, ++ unsigned int thread_limit, unsigned int flags) ++{ ++ (void) flags; ++ (void) num_teams; ++ unsigned old_thread_limit_var = 0; ++ if (thread_limit) ++ { ++ struct gomp_task_icv *icv = gomp_icv (true); ++ old_thread_limit_var = icv->thread_limit_var; ++ icv->thread_limit_var ++ = thread_limit > INT_MAX ? UINT_MAX : thread_limit; ++ } ++ if (num_teams == 0) ++ num_teams = 3; ++ gomp_num_teams = num_teams; ++ for (gomp_team_num = 0; gomp_team_num < num_teams; gomp_team_num++) ++ fn (data); ++ gomp_num_teams = 1; ++ gomp_team_num = 0; ++ if (thread_limit) ++ { ++ struct gomp_task_icv *icv = gomp_icv (true); ++ icv->thread_limit_var = old_thread_limit_var; ++ } ++} ++ ++int ++omp_get_num_teams (void) ++{ ++ return gomp_num_teams; ++} ++ ++int ++omp_get_team_num (void) ++{ ++ return gomp_team_num; ++} ++ ++ialias (omp_get_num_teams) ++ialias (omp_get_team_num) +--- libgomp/libgomp.map.jj 2018-04-25 09:40:31.321655307 +0200 ++++ libgomp/libgomp.map 2019-05-07 18:46:36.525109751 +0200 +@@ -164,6 +164,22 @@ OMP_4.5 { + omp_target_disassociate_ptr; + } OMP_4.0; + ++OMP_5.0 { ++ global: ++ omp_capture_affinity; ++ omp_capture_affinity_; ++ omp_display_affinity; ++ omp_display_affinity_; ++ omp_get_affinity_format; ++ omp_get_affinity_format_; ++ omp_set_affinity_format; ++ omp_set_affinity_format_; ++ omp_pause_resource; ++ omp_pause_resource_; ++ omp_pause_resource_all; ++ omp_pause_resource_all_; ++} OMP_4.5; ++ + GOMP_1.0 { + global: + GOMP_atomic_end; +@@ -298,6 +314,34 @@ GOMP_4.5 { + GOMP_parallel_loop_nonmonotonic_guided; + } GOMP_4.0.1; + ++GOMP_5.0 { ++ global: ++ GOMP_loop_doacross_start; ++ GOMP_loop_maybe_nonmonotonic_runtime_next; ++ GOMP_loop_maybe_nonmonotonic_runtime_start; ++ GOMP_loop_nonmonotonic_runtime_next; ++ GOMP_loop_nonmonotonic_runtime_start; ++ GOMP_loop_ordered_start; ++ GOMP_loop_start; ++ GOMP_loop_ull_doacross_start; ++ GOMP_loop_ull_maybe_nonmonotonic_runtime_next; ++ GOMP_loop_ull_maybe_nonmonotonic_runtime_start; ++ GOMP_loop_ull_nonmonotonic_runtime_next; ++ GOMP_loop_ull_nonmonotonic_runtime_start; ++ GOMP_loop_ull_ordered_start; ++ GOMP_loop_ull_start; ++ GOMP_parallel_loop_maybe_nonmonotonic_runtime; ++ GOMP_parallel_loop_nonmonotonic_runtime; ++ GOMP_parallel_reductions; ++ GOMP_sections2_start; ++ GOMP_taskgroup_reduction_register; ++ GOMP_taskgroup_reduction_unregister; ++ GOMP_task_reduction_remap; ++ GOMP_taskwait_depend; ++ GOMP_teams_reg; ++ GOMP_workshare_task_reduction_unregister; ++} GOMP_4.5; ++ + OACC_2.0 { + global: + acc_get_num_devices; +@@ -386,6 +430,52 @@ OACC_2.0.1 { + acc_pcreate; + } OACC_2.0; + ++OACC_2.5 { ++ global: ++ acc_copyin_async; ++ acc_copyin_async_32_h_; ++ acc_copyin_async_64_h_; ++ acc_copyin_async_array_h_; ++ acc_copyout_async; ++ acc_copyout_async_32_h_; ++ acc_copyout_async_64_h_; ++ acc_copyout_async_array_h_; ++ acc_copyout_finalize; ++ acc_copyout_finalize_32_h_; ++ acc_copyout_finalize_64_h_; ++ acc_copyout_finalize_array_h_; ++ acc_copyout_finalize_async; ++ acc_copyout_finalize_async_32_h_; ++ acc_copyout_finalize_async_64_h_; ++ acc_copyout_finalize_async_array_h_; ++ acc_create_async; ++ acc_create_async_32_h_; ++ acc_create_async_64_h_; ++ acc_create_async_array_h_; ++ acc_delete_async; ++ acc_delete_async_32_h_; ++ acc_delete_async_64_h_; ++ acc_delete_async_array_h_; ++ acc_delete_finalize; ++ acc_delete_finalize_32_h_; ++ 
acc_delete_finalize_64_h_; ++ acc_delete_finalize_array_h_; ++ acc_delete_finalize_async; ++ acc_delete_finalize_async_32_h_; ++ acc_delete_finalize_async_64_h_; ++ acc_delete_finalize_async_array_h_; ++ acc_memcpy_from_device_async; ++ acc_memcpy_to_device_async; ++ acc_update_device_async; ++ acc_update_device_async_32_h_; ++ acc_update_device_async_64_h_; ++ acc_update_device_async_array_h_; ++ acc_update_self_async; ++ acc_update_self_async_32_h_; ++ acc_update_self_async_64_h_; ++ acc_update_self_async_array_h_; ++} OACC_2.0.1; ++ + GOACC_2.0 { + global: + GOACC_data_end; +@@ -420,3 +510,8 @@ GOMP_PLUGIN_1.1 { + global: + GOMP_PLUGIN_target_task_completion; + } GOMP_PLUGIN_1.0; ++ ++GOMP_PLUGIN_1.2 { ++ global: ++ GOMP_PLUGIN_acc_default_dim; ++} GOMP_PLUGIN_1.1; +--- libgomp/oacc-async.c.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/oacc-async.c 2019-05-07 18:46:36.528109704 +0200 +@@ -34,7 +34,7 @@ + int + acc_async_test (int async) + { +- if (async < acc_async_sync) ++ if (!async_valid_p (async)) + gomp_fatal ("invalid async argument: %d", async); + + struct goacc_thread *thr = goacc_thread (); +@@ -59,7 +59,7 @@ acc_async_test_all (void) + void + acc_wait (int async) + { +- if (async < acc_async_sync) ++ if (!async_valid_p (async)) + gomp_fatal ("invalid async argument: %d", async); + + struct goacc_thread *thr = goacc_thread (); +@@ -117,7 +117,7 @@ acc_async_wait_all (void) + void + acc_wait_all_async (int async) + { +- if (async < acc_async_sync) ++ if (!async_valid_p (async)) + gomp_fatal ("invalid async argument: %d", async); + + struct goacc_thread *thr = goacc_thread (); +--- libgomp/loop_ull.c.jj 2018-04-25 09:40:31.912655580 +0200 ++++ libgomp/loop_ull.c 2019-05-07 18:46:36.527109719 +0200 +@@ -27,8 +27,12 @@ + + #include + #include ++#include + #include "libgomp.h" + ++ialias (GOMP_loop_ull_runtime_next) ++ialias_redirect (GOMP_taskgroup_reduction_register) ++ + typedef unsigned long long gomp_ull; + + /* Initialize the given work share construct from the given arguments. 
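The acc_async_test/acc_wait checks above switch from a raw bounds test to the async_valid_p helper added to oacc-int.h later in this patch. The accepted set is unchanged, since only acc_async_sync (-2), acc_async_noval (-1) and nonnegative queue ids pass either comparison, but the intent is now explicit. Spelled out with the standard OpenACC values:

    /* acc_async_* values as defined by OpenACC; the predicate accepts
       exactly async >= acc_async_sync, the same set as the old
       "async < acc_async_sync" rejection test.  */
    enum { acc_async_noval = -1, acc_async_sync = -2 };

    static inline int
    async_valid_p (int async)
    {
      return (async == acc_async_noval || async == acc_async_sync
              || async >= 0);
    }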
*/ +@@ -104,7 +108,7 @@ gomp_loop_ull_static_start (bool up, gom + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_STATIC, chunk_size); +@@ -122,7 +126,7 @@ gomp_loop_ull_dynamic_start (bool up, go + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_DYNAMIC, chunk_size); +@@ -148,7 +152,7 @@ gomp_loop_ull_guided_start (bool up, gom + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_GUIDED, chunk_size); +@@ -171,7 +175,7 @@ GOMP_loop_ull_runtime_start (bool up, go + gomp_ull incr, gomp_ull *istart, gomp_ull *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_ull_static_start (up, start, end, incr, +@@ -195,6 +199,99 @@ GOMP_loop_ull_runtime_start (bool up, go + } + } + ++static long ++gomp_adjust_sched (long sched, gomp_ull *chunk_size) ++{ ++ sched &= ~GFS_MONOTONIC; ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_DYNAMIC: ++ case GFS_GUIDED: ++ return sched; ++ /* GFS_RUNTIME is used for runtime schedule without monotonic ++ or nonmonotonic modifiers on the clause. ++ GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic ++ modifier. */ ++ case GFS_RUNTIME: ++ /* GFS_AUTO is used for runtime schedule with nonmonotonic ++ modifier. */ ++ case GFS_AUTO: ++ { ++ struct gomp_task_icv *icv = gomp_icv (false); ++ sched = icv->run_sched_var & ~GFS_MONOTONIC; ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_DYNAMIC: ++ case GFS_GUIDED: ++ *chunk_size = icv->run_sched_chunk_size; ++ break; ++ case GFS_AUTO: ++ sched = GFS_STATIC; ++ *chunk_size = 0; ++ break; ++ default: ++ abort (); ++ } ++ return sched; ++ } ++ default: ++ abort (); ++ } ++} ++ ++bool ++GOMP_loop_ull_start (bool up, gomp_ull start, gomp_ull end, ++ gomp_ull incr, long sched, gomp_ull chunk_size, ++ gomp_ull *istart, gomp_ull *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (gomp_work_share_start (0)) ++ { ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, ++ sched, chunk_size); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ if (mem) ++ { ++ uintptr_t size = (uintptr_t) *mem; ++ if (size > (sizeof (struct gomp_work_share) ++ - offsetof (struct gomp_work_share, ++ inline_ordered_team_ids))) ++ thr->ts.work_share->ordered_team_ids ++ = gomp_malloc_cleared (size); ++ else ++ memset (thr->ts.work_share->ordered_team_ids, '\0', size); ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ if (mem) ++ *mem = (void *) thr->ts.work_share->ordered_team_ids; ++ } ++ 
++ if (!istart) ++ return true;
++ return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend); ++} ++ + /* The *_ordered_*_start routines are similar. The only difference is that + this work-share construct is initialized to expect an ORDERED section. */ + +@@ -206,7 +303,7 @@ gomp_loop_ull_ordered_static_start (bool + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_STATIC, chunk_size); +@@ -225,7 +322,7 @@ gomp_loop_ull_ordered_dynamic_start (boo + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_DYNAMIC, chunk_size); +@@ -251,7 +348,7 @@ gomp_loop_ull_ordered_guided_start (bool + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (true)) ++ if (gomp_work_share_start (1)) + { + gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, + GFS_GUIDED, chunk_size); +@@ -275,7 +372,7 @@ GOMP_loop_ull_ordered_runtime_start (boo + gomp_ull *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_ull_ordered_static_start (up, start, end, incr, +@@ -299,6 +396,82 @@ GOMP_loop_ull_ordered_runtime_start (boo + } + } + ++bool ++GOMP_loop_ull_ordered_start (bool up, gomp_ull start, gomp_ull end, ++ gomp_ull incr, long sched, gomp_ull chunk_size, ++ gomp_ull *istart, gomp_ull *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ size_t ordered = 1; ++ bool ret; ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (mem) ++ ordered += (uintptr_t) *mem; ++ if (gomp_work_share_start (ordered)) ++ { ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_ull_init (thr->ts.work_share, up, start, end, incr, ++ sched, chunk_size); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ if (sched == GFS_STATIC) ++ gomp_ordered_static_init (); ++ else ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ sched = thr->ts.work_share->sched; ++ if (sched != GFS_STATIC) ++ gomp_mutex_lock (&thr->ts.work_share->lock); ++ } ++ ++ if (mem) ++ { ++ uintptr_t p ++ = (uintptr_t) (thr->ts.work_share->ordered_team_ids ++ + (thr->ts.team ? thr->ts.team->nthreads : 1)); ++ p += __alignof__ (long long) - 1; ++ p &= ~(__alignof__ (long long) - 1); ++ *mem = (void *) p; ++ } ++ ++ switch (sched) ++ { ++ case GFS_STATIC: ++ case GFS_AUTO: ++ return !gomp_iter_ull_static_next (istart, iend); ++ case GFS_DYNAMIC: ++ ret = gomp_iter_ull_dynamic_next_locked (istart, iend); ++ break; ++ case GFS_GUIDED: ++ ret = gomp_iter_ull_guided_next_locked (istart, iend); ++ break; ++ default: ++ abort (); ++ } ++ ++ if (ret) ++ gomp_ordered_first (); ++ gomp_mutex_unlock (&thr->ts.work_share->lock); ++ return ret; ++} ++ + /* The *_doacross_*_start routines are similar. 
The only difference is that + this work-share construct is initialized to expect an ORDERED(N) - DOACROSS + section, and the worksharing loop iterates always from 0 to COUNTS[0] - 1 +@@ -313,11 +486,11 @@ gomp_loop_ull_doacross_static_start (uns + struct gomp_thread *thr = gomp_thread (); + + thr->ts.static_trip = 0; +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_STATIC, chunk_size); +- gomp_doacross_ull_init (ncounts, counts, chunk_size); ++ gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -332,11 +505,11 @@ gomp_loop_ull_doacross_dynamic_start (un + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_DYNAMIC, chunk_size); +- gomp_doacross_ull_init (ncounts, counts, chunk_size); ++ gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -359,11 +532,11 @@ gomp_loop_ull_doacross_guided_start (uns + struct gomp_thread *thr = gomp_thread (); + bool ret; + +- if (gomp_work_share_start (false)) ++ if (gomp_work_share_start (0)) + { + gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, + GFS_GUIDED, chunk_size); +- gomp_doacross_ull_init (ncounts, counts, chunk_size); ++ gomp_doacross_ull_init (ncounts, counts, chunk_size, 0); + gomp_work_share_init_done (); + } + +@@ -383,7 +556,7 @@ GOMP_loop_ull_doacross_runtime_start (un + gomp_ull *istart, gomp_ull *iend) + { + struct gomp_task_icv *icv = gomp_icv (false); +- switch (icv->run_sched_var) ++ switch (icv->run_sched_var & ~GFS_MONOTONIC) + { + case GFS_STATIC: + return gomp_loop_ull_doacross_static_start (ncounts, counts, +@@ -407,6 +580,51 @@ GOMP_loop_ull_doacross_runtime_start (un + } + } + ++bool ++GOMP_loop_ull_doacross_start (unsigned ncounts, gomp_ull *counts, ++ long sched, gomp_ull chunk_size, ++ gomp_ull *istart, gomp_ull *iend, ++ uintptr_t *reductions, void **mem) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ ++ thr->ts.static_trip = 0; ++ if (reductions) ++ gomp_workshare_taskgroup_start (); ++ if (gomp_work_share_start (0)) ++ { ++ size_t extra = 0; ++ if (mem) ++ extra = (uintptr_t) *mem; ++ sched = gomp_adjust_sched (sched, &chunk_size); ++ gomp_loop_ull_init (thr->ts.work_share, true, 0, counts[0], 1, ++ sched, chunk_size); ++ gomp_doacross_ull_init (ncounts, counts, chunk_size, extra); ++ if (reductions) ++ { ++ GOMP_taskgroup_reduction_register (reductions); ++ thr->task->taskgroup->workshare = true; ++ thr->ts.work_share->task_reductions = reductions; ++ } ++ gomp_work_share_init_done (); ++ } ++ else ++ { ++ if (reductions) ++ { ++ uintptr_t *first_reductions = thr->ts.work_share->task_reductions; ++ gomp_workshare_task_reduction_register (reductions, ++ first_reductions); ++ } ++ sched = thr->ts.work_share->sched; ++ } ++ ++ if (mem) ++ *mem = thr->ts.work_share->doacross->extra; ++ ++ return ialias_call (GOMP_loop_ull_runtime_next) (istart, iend); ++} ++ + /* The *_next routines are called when the thread completes processing of + the iteration block currently assigned to it. 
If the work-share + construct is bound directly to a parallel construct, then the iteration +@@ -570,6 +788,10 @@ extern __typeof(gomp_loop_ull_dynamic_st + __attribute__((alias ("gomp_loop_ull_dynamic_start"))); + extern __typeof(gomp_loop_ull_guided_start) GOMP_loop_ull_nonmonotonic_guided_start + __attribute__((alias ("gomp_loop_ull_guided_start"))); ++extern __typeof(GOMP_loop_ull_runtime_start) GOMP_loop_ull_nonmonotonic_runtime_start ++ __attribute__((alias ("GOMP_loop_ull_runtime_start"))); ++extern __typeof(GOMP_loop_ull_runtime_start) GOMP_loop_ull_maybe_nonmonotonic_runtime_start ++ __attribute__((alias ("GOMP_loop_ull_runtime_start"))); + + extern __typeof(gomp_loop_ull_ordered_static_start) GOMP_loop_ull_ordered_static_start + __attribute__((alias ("gomp_loop_ull_ordered_static_start"))); +@@ -595,6 +817,10 @@ extern __typeof(gomp_loop_ull_dynamic_ne + __attribute__((alias ("gomp_loop_ull_dynamic_next"))); + extern __typeof(gomp_loop_ull_guided_next) GOMP_loop_ull_nonmonotonic_guided_next + __attribute__((alias ("gomp_loop_ull_guided_next"))); ++extern __typeof(GOMP_loop_ull_runtime_next) GOMP_loop_ull_nonmonotonic_runtime_next ++ __attribute__((alias ("GOMP_loop_ull_runtime_next"))); ++extern __typeof(GOMP_loop_ull_runtime_next) GOMP_loop_ull_maybe_nonmonotonic_runtime_next ++ __attribute__((alias ("GOMP_loop_ull_runtime_next"))); + + extern __typeof(gomp_loop_ull_ordered_static_next) GOMP_loop_ull_ordered_static_next + __attribute__((alias ("gomp_loop_ull_ordered_static_next"))); +@@ -650,6 +876,23 @@ GOMP_loop_ull_nonmonotonic_guided_start + } + + bool ++GOMP_loop_ull_nonmonotonic_runtime_start (bool up, gomp_ull start, ++ gomp_ull end, gomp_ull incr, ++ gomp_ull *istart, gomp_ull *iend) ++{ ++ return GOMP_loop_ull_runtime_start (up, start, end, incr, istart, iend); ++} ++ ++bool ++GOMP_loop_ull_maybe_nonmonotonic_runtime_start (bool up, gomp_ull start, ++ gomp_ull end, gomp_ull incr, ++ gomp_ull *istart, ++ gomp_ull *iend) ++{ ++ return GOMP_loop_ull_runtime_start (up, start, end, incr, istart, iend); ++} ++ ++bool + GOMP_loop_ull_ordered_static_start (bool up, gomp_ull start, gomp_ull end, + gomp_ull incr, gomp_ull chunk_size, + gomp_ull *istart, gomp_ull *iend) +@@ -734,6 +977,19 @@ GOMP_loop_ull_nonmonotonic_guided_next ( + } + + bool ++GOMP_loop_ull_nonmonotonic_runtime_next (gomp_ull *istart, gomp_ull *iend) ++{ ++ return GOMP_loop_ull_runtime_next (istart, iend); ++} ++ ++bool ++GOMP_loop_ull_maybe_nonmonotonic_runtime_next (gomp_ull *istart, ++ gomp_ull *iend) ++{ ++ return GOMP_loop_ull_runtime_next (istart, iend); ++} ++ ++bool + GOMP_loop_ull_ordered_static_next (gomp_ull *istart, gomp_ull *iend) + { + return gomp_loop_ull_ordered_static_next (istart, iend); +--- libgomp/oacc-int.h.jj 2018-04-25 09:40:31.320655306 +0200 ++++ libgomp/oacc-int.h 2019-05-07 18:46:36.529109688 +0200 +@@ -99,6 +99,28 @@ void goacc_restore_bind (void); + void goacc_lazy_initialize (void); + void goacc_host_init (void); + ++static inline bool ++async_valid_stream_id_p (int async) ++{ ++ return async >= 0; ++} ++ ++static inline bool ++async_valid_p (int async) ++{ ++ return (async == acc_async_noval || async == acc_async_sync ++ || async_valid_stream_id_p (async)); ++} ++ ++static inline bool ++async_synchronous_p (int async) ++{ ++ if (!async_valid_p (async)) ++ return true; ++ ++ return async == acc_async_sync; ++} ++ + #ifdef HAVE_ATTRIBUTE_VISIBILITY + # pragma GCC visibility pop + #endif +--- libgomp/testsuite/Makefile.in.jj 2018-04-25 09:40:31.452655368 +0200 ++++ 
libgomp/testsuite/Makefile.in 2019-05-07 18:51:35.754330084 +0200 +@@ -223,6 +223,7 @@ mkdir_p = @mkdir_p@ + multi_basedir = @multi_basedir@ + offload_additional_lib_paths = @offload_additional_lib_paths@ + offload_additional_options = @offload_additional_options@ ++offload_plugins = @offload_plugins@ + offload_targets = @offload_targets@ + oldincludedir = @oldincludedir@ + pdfdir = @pdfdir@ +--- libgomp/task.c.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/task.c 2019-05-07 18:46:36.547109400 +0200 +@@ -166,21 +166,72 @@ gomp_task_handle_depend (struct gomp_tas + void **depend) + { + size_t ndepend = (uintptr_t) depend[0]; +- size_t nout = (uintptr_t) depend[1]; + size_t i; + hash_entry_type ent; + ++ if (ndepend) ++ { ++ /* depend[0] is total # */ ++ size_t nout = (uintptr_t) depend[1]; /* # of out: and inout: */ ++ /* ndepend - nout is # of in: */ ++ for (i = 0; i < ndepend; i++) ++ { ++ task->depend[i].addr = depend[2 + i]; ++ task->depend[i].is_in = i >= nout; ++ } ++ } ++ else ++ { ++ ndepend = (uintptr_t) depend[1]; /* total # */ ++ size_t nout = (uintptr_t) depend[2]; /* # of out: and inout: */ ++ size_t nmutexinoutset = (uintptr_t) depend[3]; /* # of mutexinoutset: */ ++ /* For now we treat mutexinoutset like out, which is compliant, but ++ inefficient. */ ++ size_t nin = (uintptr_t) depend[4]; /* # of in: */ ++ /* ndepend - nout - nmutexinoutset - nin is # of depobjs */ ++ size_t normal = nout + nmutexinoutset + nin; ++ size_t n = 0; ++ for (i = normal; i < ndepend; i++) ++ { ++ void **d = (void **) (uintptr_t) depend[5 + i]; ++ switch ((uintptr_t) d[1]) ++ { ++ case GOMP_DEPEND_OUT: ++ case GOMP_DEPEND_INOUT: ++ case GOMP_DEPEND_MUTEXINOUTSET: ++ break; ++ case GOMP_DEPEND_IN: ++ continue; ++ default: ++ gomp_fatal ("unknown omp_depend_t dependence type %d", ++ (int) (uintptr_t) d[1]); ++ } ++ task->depend[n].addr = d[0]; ++ task->depend[n++].is_in = 0; ++ } ++ for (i = 0; i < normal; i++) ++ { ++ task->depend[n].addr = depend[5 + i]; ++ task->depend[n++].is_in = i >= nout + nmutexinoutset; ++ } ++ for (i = normal; i < ndepend; i++) ++ { ++ void **d = (void **) (uintptr_t) depend[5 + i]; ++ if ((uintptr_t) d[1] != GOMP_DEPEND_IN) ++ continue; ++ task->depend[n].addr = d[0]; ++ task->depend[n++].is_in = 1; ++ } ++ } + task->depend_count = ndepend; + task->num_dependees = 0; + if (parent->depend_hash == NULL) + parent->depend_hash = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12); + for (i = 0; i < ndepend; i++) + { +- task->depend[i].addr = depend[2 + i]; + task->depend[i].next = NULL; + task->depend[i].prev = NULL; + task->depend[i].task = task; +- task->depend[i].is_in = i >= nout; + task->depend[i].redundant = false; + task->depend[i].redundant_out = false; + +@@ -205,7 +256,7 @@ gomp_task_handle_depend (struct gomp_tas + last = ent; + + /* depend(in:...) doesn't depend on earlier depend(in:...). */ +- if (i >= nout && ent->is_in) ++ if (task->depend[i].is_in && ent->is_in) + continue; + + if (!ent->is_in) +@@ -280,9 +331,18 @@ gomp_task_handle_depend (struct gomp_tas + then the task may be executed by any member of the team. + + DEPEND is an array containing: ++ if depend[0] is non-zero, then: + depend[0]: number of depend elements. +- depend[1]: number of depend elements of type "out". +- depend[2..N+1]: address of [1..N]th depend element. */ ++ depend[1]: number of depend elements of type "out/inout". ++ depend[2..N+1]: address of [1..N]th depend element. ++ otherwise, when depend[0] is zero, then: ++ depend[1]: number of depend elements. 
++ depend[2]: number of depend elements of type "out/inout". ++ depend[3]: number of depend elements of type "mutexinoutset". ++ depend[4]: number of depend elements of type "in". ++ depend[5..4+depend[2]+depend[3]+depend[4]]: address of depend elements ++ depend[5+depend[2]+depend[3]+depend[4]..4+depend[1]]: address of ++ omp_depend_t objects. */ + + void + GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), +@@ -303,10 +363,20 @@ GOMP_task (void (*fn) (void *), void *da + #endif + + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } + + if ((flags & GOMP_TASK_FLAG_PRIORITY) == 0) + priority = 0; +@@ -377,7 +447,7 @@ GOMP_task (void (*fn) (void *), void *da + size_t depend_size = 0; + + if (flags & GOMP_TASK_FLAG_DEPEND) +- depend_size = ((uintptr_t) depend[0] ++ depend_size = ((uintptr_t) (depend[0] ? depend[0] : depend[1]) + * sizeof (struct gomp_task_depend_entry)); + task = gomp_malloc (sizeof (*task) + depend_size + + arg_size + arg_align - 1); +@@ -404,14 +474,26 @@ GOMP_task (void (*fn) (void *), void *da + gomp_mutex_lock (&team->task_lock); + /* If parallel or taskgroup has been cancelled, don't start new + tasks. */ +- if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier) +- || (taskgroup && taskgroup->cancelled)) +- && !task->copy_ctors_done, 0)) ++ if (__builtin_expect (gomp_cancel_var, 0) ++ && !task->copy_ctors_done) + { +- gomp_mutex_unlock (&team->task_lock); +- gomp_finish_task (task); +- free (task); +- return; ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ { ++ do_cancel: ++ gomp_mutex_unlock (&team->task_lock); ++ gomp_finish_task (task); ++ free (task); ++ return; ++ } ++ if (taskgroup) ++ { ++ if (taskgroup->cancelled) ++ goto do_cancel; ++ if (taskgroup->workshare ++ && taskgroup->prev ++ && taskgroup->prev->cancelled) ++ goto do_cancel; ++ } + } + if (taskgroup) + taskgroup->num_children++; +@@ -463,6 +545,7 @@ GOMP_task (void (*fn) (void *), void *da + + ialias (GOMP_taskgroup_start) + ialias (GOMP_taskgroup_end) ++ialias (GOMP_taskgroup_reduction_register) + + #define TYPE long + #define UTYPE unsigned long +@@ -601,10 +684,20 @@ gomp_create_target_task (struct gomp_dev + struct gomp_team *team = thr->ts.team; + + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) +- return true; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return true; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return true; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return true; ++ } ++ } + + struct gomp_target_task *ttask; + struct gomp_task *task; +@@ -617,7 +710,7 @@ gomp_create_target_task (struct gomp_dev + + if (depend != NULL) + { +- depend_cnt = (uintptr_t) depend[0]; ++ depend_cnt = (uintptr_t) (depend[0] ? 
depend[0] : depend[1]); + depend_size = depend_cnt * sizeof (struct gomp_task_depend_entry); + } + if (fn) +@@ -687,13 +780,25 @@ gomp_create_target_task (struct gomp_dev + task->final_task = 0; + gomp_mutex_lock (&team->task_lock); + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ +- if (__builtin_expect (gomp_team_barrier_cancelled (&team->barrier) +- || (taskgroup && taskgroup->cancelled), 0)) ++ if (__builtin_expect (gomp_cancel_var, 0)) + { +- gomp_mutex_unlock (&team->task_lock); +- gomp_finish_task (task); +- free (task); +- return true; ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ { ++ do_cancel: ++ gomp_mutex_unlock (&team->task_lock); ++ gomp_finish_task (task); ++ free (task); ++ return true; ++ } ++ if (taskgroup) ++ { ++ if (taskgroup->cancelled) ++ goto do_cancel; ++ if (taskgroup->workshare ++ && taskgroup->prev ++ && taskgroup->prev->cancelled) ++ goto do_cancel; ++ } + } + if (depend_size) + { +@@ -986,10 +1091,21 @@ gomp_task_run_pre (struct gomp_task *chi + + if (--team->task_queued_count == 0) + gomp_team_barrier_clear_task_pending (&team->barrier); +- if ((gomp_team_barrier_cancelled (&team->barrier) +- || (taskgroup && taskgroup->cancelled)) ++ if (__builtin_expect (gomp_cancel_var, 0) + && !child_task->copy_ctors_done) +- return true; ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return true; ++ if (taskgroup) ++ { ++ if (taskgroup->cancelled) ++ return true; ++ if (taskgroup->workshare ++ && taskgroup->prev ++ && taskgroup->prev->cancelled) ++ return true; ++ } ++ } + return false; + } + +@@ -1456,6 +1572,35 @@ GOMP_taskwait (void) + } + } + ++/* Called when encountering a taskwait directive with depend clause(s). ++ Wait as if it were a mergeable included task construct with an empty body. */ ++ ++void ++GOMP_taskwait_depend (void **depend) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ ++ /* If parallel or taskgroup has been cancelled, return early. */ ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } ++ ++ if (thr->task && thr->task->depend_hash) ++ gomp_task_maybe_wait_for_dependencies (depend); ++} ++ + /* An undeferred task is about to run. Wait for all tasks that this + undeferred task depends on. + +@@ -1464,7 +1609,7 @@ GOMP_taskwait (void) + the scheduling queues. Then we iterate through these imminently + ready tasks (and possibly other high priority tasks), and run them. + If we run out of ready dependencies to execute, we either wait for +- the reamining dependencies to finish, or wait for them to get ++ the remaining dependencies to finish, or wait for them to get + scheduled so we can run them. + + DEPEND is as in GOMP_task.
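Since GOMP_taskwait_depend and gomp_task_maybe_wait_for_dependencies both consume the DEPEND array documented at GOMP_task, here is what the two encodings look like when built by hand. In real programs the compiler emits these arrays; the variables are illustrative:

    #include <stdint.h>

    int x, y, z;

    void
    build_depend_arrays (void)
    {
      /* Old encoding (depend[0] != 0), e.g. for
         #pragma omp task depend(out: x) depend(in: y, z)  */
      void *depend_old[] = {
        (void *) (uintptr_t) 3,  /* total number of dependences */
        (void *) (uintptr_t) 1,  /* leading out/inout entries */
        &x,                      /* out/inout addresses first */
        &y, &z                   /* in addresses afterwards */
      };

      /* New encoding (depend[0] == 0), needed once mutexinoutset or
         omp_depend_t entries appear, e.g. for
         #pragma omp task depend(out: x) depend(mutexinoutset: y) \
                          depend(in: z)  */
      void *depend_new[] = {
        (void *) (uintptr_t) 0,  /* selects the new layout */
        (void *) (uintptr_t) 3,  /* total number of dependences */
        (void *) (uintptr_t) 1,  /* out/inout count */
        (void *) (uintptr_t) 1,  /* mutexinoutset count */
        (void *) (uintptr_t) 1,  /* in count */
        &x, &y, &z               /* addresses, grouped in that order */
      };

      (void) depend_old;
      (void) depend_new;
    }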
*/ +@@ -1477,21 +1622,50 @@ gomp_task_maybe_wait_for_dependencies (v + struct gomp_team *team = thr->ts.team; + struct gomp_task_depend_entry elem, *ent = NULL; + struct gomp_taskwait taskwait; +- size_t ndepend = (uintptr_t) depend[0]; ++ size_t orig_ndepend = (uintptr_t) depend[0]; + size_t nout = (uintptr_t) depend[1]; ++ size_t ndepend = orig_ndepend; ++ size_t normal = ndepend; ++ size_t n = 2; + size_t i; + size_t num_awaited = 0; + struct gomp_task *child_task = NULL; + struct gomp_task *to_free = NULL; + int do_wake = 0; + ++ if (ndepend == 0) ++ { ++ ndepend = nout; ++ nout = (uintptr_t) depend[2] + (uintptr_t) depend[3]; ++ normal = nout + (uintptr_t) depend[4]; ++ n = 5; ++ } + gomp_mutex_lock (&team->task_lock); + for (i = 0; i < ndepend; i++) + { +- elem.addr = depend[i + 2]; ++ elem.addr = depend[i + n]; ++ elem.is_in = i >= nout; ++ if (__builtin_expect (i >= normal, 0)) ++ { ++ void **d = (void **) elem.addr; ++ switch ((uintptr_t) d[1]) ++ { ++ case GOMP_DEPEND_IN: ++ break; ++ case GOMP_DEPEND_OUT: ++ case GOMP_DEPEND_INOUT: ++ case GOMP_DEPEND_MUTEXINOUTSET: ++ elem.is_in = 0; ++ break; ++ default: ++ gomp_fatal ("unknown omp_depend_t dependence type %d", ++ (int) (uintptr_t) d[1]); ++ } ++ elem.addr = d[0]; ++ } + ent = htab_find (task->depend_hash, &elem); + for (; ent; ent = ent->next) +- if (i >= nout && ent->is_in) ++ if (elem.is_in && ent->is_in) + continue; + else + { +@@ -1654,13 +1828,28 @@ GOMP_taskyield (void) + /* Nothing at the moment. */ + } + ++static inline struct gomp_taskgroup * ++gomp_taskgroup_init (struct gomp_taskgroup *prev) ++{ ++ struct gomp_taskgroup *taskgroup ++ = gomp_malloc (sizeof (struct gomp_taskgroup)); ++ taskgroup->prev = prev; ++ priority_queue_init (&taskgroup->taskgroup_queue); ++ taskgroup->reductions = prev ? prev->reductions : NULL; ++ taskgroup->in_taskgroup_wait = false; ++ taskgroup->cancelled = false; ++ taskgroup->workshare = false; ++ taskgroup->num_children = 0; ++ gomp_sem_init (&taskgroup->taskgroup_sem, 0); ++ return taskgroup; ++} ++ + void + GOMP_taskgroup_start (void) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + struct gomp_task *task = thr->task; +- struct gomp_taskgroup *taskgroup; + + /* If team is NULL, all tasks are executed as + GOMP_TASK_UNDEFERRED tasks and thus all children tasks of +@@ -1668,14 +1857,7 @@ GOMP_taskgroup_start (void) + by the time GOMP_taskgroup_end is called. */ + if (team == NULL) + return; +- taskgroup = gomp_malloc (sizeof (struct gomp_taskgroup)); +- taskgroup->prev = task->taskgroup; +- priority_queue_init (&taskgroup->taskgroup_queue); +- taskgroup->in_taskgroup_wait = false; +- taskgroup->cancelled = false; +- taskgroup->num_children = 0; +- gomp_sem_init (&taskgroup->taskgroup_sem, 0); +- task->taskgroup = taskgroup; ++ task->taskgroup = gomp_taskgroup_init (task->taskgroup); + } + + void +@@ -1840,6 +2022,302 @@ GOMP_taskgroup_end (void) + free (taskgroup); + } + ++static inline __attribute__((always_inline)) void ++gomp_reduction_register (uintptr_t *data, uintptr_t *old, uintptr_t *orig, ++ unsigned nthreads) ++{ ++ size_t total_cnt = 0; ++ uintptr_t *d = data; ++ struct htab *old_htab = NULL, *new_htab; ++ do ++ { ++ if (__builtin_expect (orig != NULL, 0)) ++ { ++ /* For worksharing task reductions, memory has been allocated ++ already by some other thread that encountered the construct ++ earlier. 
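The registration machinery beginning here backs OpenMP 5.0 task reductions. At the source level, the feature it implements looks like this (requires a compiler and runtime with OpenMP 5.0 task reduction support, such as this patched libgomp):

    #include <stdio.h>

    int
    main (void)
    {
      long sum = 0;
      #pragma omp parallel
      #pragma omp single
      #pragma omp taskgroup task_reduction (+: sum)
      for (int i = 1; i <= 100; i++)
        #pragma omp task in_reduction (+: sum)
        sum += i;
      printf ("%ld\n", sum);   /* prints 5050 */
      return 0;
    }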
*/ ++ d[2] = orig[2]; ++ d[6] = orig[6]; ++ orig = (uintptr_t *) orig[4]; ++ } ++ else ++ { ++ size_t sz = d[1] * nthreads; ++ /* Should use omp_alloc if d[3] is not -1. */ ++ void *ptr = gomp_aligned_alloc (d[2], sz); ++ memset (ptr, '\0', sz); ++ d[2] = (uintptr_t) ptr; ++ d[6] = d[2] + sz; ++ } ++ d[5] = 0; ++ total_cnt += d[0]; ++ if (d[4] == 0) ++ { ++ d[4] = (uintptr_t) old; ++ break; ++ } ++ else ++ d = (uintptr_t *) d[4]; ++ } ++ while (1); ++ if (old && old[5]) ++ { ++ old_htab = (struct htab *) old[5]; ++ total_cnt += htab_elements (old_htab); ++ } ++ new_htab = htab_create (total_cnt); ++ if (old_htab) ++ { ++ /* Copy old hash table, like in htab_expand. */ ++ hash_entry_type *p, *olimit; ++ new_htab->n_elements = htab_elements (old_htab); ++ olimit = old_htab->entries + old_htab->size; ++ p = old_htab->entries; ++ do ++ { ++ hash_entry_type x = *p; ++ if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY) ++ *find_empty_slot_for_expand (new_htab, htab_hash (x)) = x; ++ p++; ++ } ++ while (p < olimit); ++ } ++ d = data; ++ do ++ { ++ size_t j; ++ for (j = 0; j < d[0]; ++j) ++ { ++ uintptr_t *p = d + 7 + j * 3; ++ p[2] = (uintptr_t) d; ++ /* Ugly hack, hash_entry_type is defined for the task dependencies, ++ which hash on the first element which is a pointer. We need ++ to hash also on the first sizeof (uintptr_t) bytes which contain ++ a pointer. Hide the cast from the compiler. */ ++ hash_entry_type n; ++ __asm ("" : "=g" (n) : "0" (p)); ++ *htab_find_slot (&new_htab, n, INSERT) = n; ++ } ++ if (d[4] == (uintptr_t) old) ++ break; ++ else ++ d = (uintptr_t *) d[4]; ++ } ++ while (1); ++ d[5] = (uintptr_t) new_htab; ++} ++ ++static void ++gomp_create_artificial_team (void) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_task_icv *icv; ++ struct gomp_team *team = gomp_new_team (1); ++ struct gomp_task *task = thr->task; ++ icv = task ? &task->icv : &gomp_global_icv; ++ team->prev_ts = thr->ts; ++ thr->ts.team = team; ++ thr->ts.team_id = 0; ++ thr->ts.work_share = &team->work_shares[0]; ++ thr->ts.last_work_share = NULL; ++#ifdef HAVE_SYNC_BUILTINS ++ thr->ts.single_count = 0; ++#endif ++ thr->ts.static_trip = 0; ++ thr->task = &team->implicit_task[0]; ++ gomp_init_task (thr->task, NULL, icv); ++ if (task) ++ { ++ thr->task = task; ++ gomp_end_task (); ++ free (task); ++ thr->task = &team->implicit_task[0]; ++ } ++#ifdef LIBGOMP_USE_PTHREADS ++ else ++ pthread_setspecific (gomp_thread_destructor, thr); ++#endif ++} ++ ++/* The format of data is: ++ data[0] cnt ++ data[1] size ++ data[2] alignment (on output array pointer) ++ data[3] allocator (-1 if malloc allocator) ++ data[4] next pointer ++ data[5] used internally (htab pointer) ++ data[6] used internally (end of array) ++ cnt times ++ ent[0] address ++ ent[1] offset ++ ent[2] used internally (pointer to data[0]) ++ The entries are sorted by increasing offset, so that a binary ++ search can be performed. Normally, data[8] is 0, exception is ++ for worksharing construct task reductions in cancellable parallel, ++ where at offset 0 there should be space for a pointer and an integer ++ which are used internally. */ ++ ++void ++GOMP_taskgroup_reduction_register (uintptr_t *data) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ struct gomp_task *task; ++ unsigned nthreads; ++ if (__builtin_expect (team == NULL, 0)) ++ { ++ /* The task reduction code needs a team and task, so for ++ orphaned taskgroups just create the implicit team. 
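Given the descriptor layout documented above, the address of one thread's private copy of a reduction variable falls out of three fields; this is the same arithmetic GOMP_task_reduction_remap performs below. A sketch with the descriptor words named for readability:

    #include <stdint.h>

    static inline void *
    private_copy (uintptr_t *data, unsigned thread_id, uintptr_t *ent)
    {
      uintptr_t base = data[2];     /* start of the per-team array */
      uintptr_t chunk = data[1];    /* bytes allocated per thread */
      uintptr_t offset = ent[1];    /* variable's offset within a chunk */
      return (void *) (base + thread_id * chunk + offset);
    }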
*/ ++ gomp_create_artificial_team (); ++ ialias_call (GOMP_taskgroup_start) (); ++ team = thr->ts.team; ++ } ++ nthreads = team->nthreads; ++ task = thr->task; ++ gomp_reduction_register (data, task->taskgroup->reductions, NULL, nthreads); ++ task->taskgroup->reductions = data; ++} ++ ++void ++GOMP_taskgroup_reduction_unregister (uintptr_t *data) ++{ ++ uintptr_t *d = data; ++ htab_free ((struct htab *) data[5]); ++ do ++ { ++ gomp_aligned_free ((void *) d[2]); ++ d = (uintptr_t *) d[4]; ++ } ++ while (d && !d[5]); ++} ++ialias (GOMP_taskgroup_reduction_unregister) ++ ++/* For i = 0 to cnt-1, remap ptrs[i] which is either address of the ++ original list item or address of previously remapped original list ++ item to address of the private copy, store that to ptrs[i]. ++ For i < cntorig, additionally set ptrs[cnt+i] to the address of ++ the original list item. */ ++ ++void ++GOMP_task_reduction_remap (size_t cnt, size_t cntorig, void **ptrs) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_task *task = thr->task; ++ unsigned id = thr->ts.team_id; ++ uintptr_t *data = task->taskgroup->reductions; ++ uintptr_t *d; ++ struct htab *reduction_htab = (struct htab *) data[5]; ++ size_t i; ++ for (i = 0; i < cnt; ++i) ++ { ++ hash_entry_type ent, n; ++ __asm ("" : "=g" (ent) : "0" (ptrs + i)); ++ n = htab_find (reduction_htab, ent); ++ if (n) ++ { ++ uintptr_t *p; ++ __asm ("" : "=g" (p) : "0" (n)); ++ /* At this point, p[0] should be equal to (uintptr_t) ptrs[i], ++ p[1] is the offset within the allocated chunk for each ++ thread, p[2] is the array registered with ++ GOMP_taskgroup_reduction_register, d[2] is the base of the ++ allocated memory and d[1] is the size of the allocated chunk ++ for one thread. */ ++ d = (uintptr_t *) p[2]; ++ ptrs[i] = (void *) (d[2] + id * d[1] + p[1]); ++ if (__builtin_expect (i < cntorig, 0)) ++ ptrs[cnt + i] = (void *) p[0]; ++ continue; ++ } ++ d = data; ++ while (d != NULL) ++ { ++ if ((uintptr_t) ptrs[i] >= d[2] && (uintptr_t) ptrs[i] < d[6]) ++ break; ++ d = (uintptr_t *) d[4]; ++ } ++ if (d == NULL) ++ gomp_fatal ("couldn't find matching task_reduction or reduction with " ++ "task modifier for %p", ptrs[i]); ++ uintptr_t off = ((uintptr_t) ptrs[i] - d[2]) % d[1]; ++ ptrs[i] = (void *) (d[2] + id * d[1] + off); ++ if (__builtin_expect (i < cntorig, 0)) ++ { ++ size_t lo = 0, hi = d[0] - 1; ++ while (lo <= hi) ++ { ++ size_t m = (lo + hi) / 2; ++ if (d[7 + 3 * m + 1] < off) ++ lo = m + 1; ++ else if (d[7 + 3 * m + 1] == off) ++ { ++ ptrs[cnt + i] = (void *) d[7 + 3 * m]; ++ break; ++ } ++ else ++ hi = m - 1; ++ } ++ if (lo > hi) ++ gomp_fatal ("couldn't find matching task_reduction or reduction " ++ "with task modifier for %p", ptrs[i]); ++ } ++ } ++} ++ ++struct gomp_taskgroup * ++gomp_parallel_reduction_register (uintptr_t *data, unsigned nthreads) ++{ ++ struct gomp_taskgroup *taskgroup = gomp_taskgroup_init (NULL); ++ gomp_reduction_register (data, NULL, NULL, nthreads); ++ taskgroup->reductions = data; ++ return taskgroup; ++} ++ ++void ++gomp_workshare_task_reduction_register (uintptr_t *data, uintptr_t *orig) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ struct gomp_task *task = thr->task; ++ unsigned nthreads = team->nthreads; ++ gomp_reduction_register (data, task->taskgroup->reductions, orig, nthreads); ++ task->taskgroup->reductions = data; ++} ++ ++void ++gomp_workshare_taskgroup_start (void) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_team *team = thr->ts.team; ++ 
struct gomp_task *task; ++ ++ if (team == NULL) ++ { ++ gomp_create_artificial_team (); ++ team = thr->ts.team; ++ } ++ task = thr->task; ++ task->taskgroup = gomp_taskgroup_init (task->taskgroup); ++ task->taskgroup->workshare = true; ++} ++ ++void ++GOMP_workshare_task_reduction_unregister (bool cancelled) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_task *task = thr->task; ++ struct gomp_team *team = thr->ts.team; ++ uintptr_t *data = task->taskgroup->reductions; ++ ialias_call (GOMP_taskgroup_end) (); ++ if (thr->ts.team_id == 0) ++ ialias_call (GOMP_taskgroup_reduction_unregister) (data); ++ else ++ htab_free ((struct htab *) data[5]); ++ ++ if (!cancelled) ++ gomp_team_barrier_wait (&team->barrier); ++} ++ + int + omp_in_final (void) + { +--- libgomp/team.c.jj 2018-04-25 09:40:31.322655307 +0200 ++++ libgomp/team.c 2019-05-07 18:46:36.548109384 +0200 +@@ -32,7 +32,6 @@ + #include + + #ifdef LIBGOMP_USE_PTHREADS +-/* This attribute contains PTHREAD_CREATE_DETACHED. */ + pthread_attr_t gomp_thread_attr; + + /* This key is for the thread destructor. */ +@@ -58,6 +57,7 @@ struct gomp_thread_start_data + struct gomp_thread_pool *thread_pool; + unsigned int place; + bool nested; ++ pthread_t handle; + }; + + +@@ -89,6 +89,9 @@ gomp_thread_start (void *xdata) + thr->ts = data->ts; + thr->task = data->task; + thr->place = data->place; ++#ifdef GOMP_NEEDS_THREAD_HANDLE ++ thr->handle = data->handle; ++#endif + + thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release; + +@@ -131,6 +134,7 @@ gomp_thread_start (void *xdata) + } + + gomp_sem_destroy (&thr->release); ++ pthread_detach (pthread_self ()); + thr->thread_pool = NULL; + thr->task = NULL; + return NULL; +@@ -183,7 +187,7 @@ gomp_new_team (unsigned nthreads) + team->single_count = 0; + #endif + team->work_shares_to_free = &team->work_shares[0]; +- gomp_init_work_share (&team->work_shares[0], false, nthreads); ++ gomp_init_work_share (&team->work_shares[0], 0, nthreads); + team->work_shares[0].next_alloc = NULL; + team->work_share_list_free = NULL; + team->work_share_list_alloc = &team->work_shares[1]; +@@ -231,6 +235,7 @@ gomp_free_pool_helper (void *thread_pool + thr->thread_pool = NULL; + thr->task = NULL; + #ifdef LIBGOMP_USE_PTHREADS ++ pthread_detach (pthread_self ()); + pthread_exit (NULL); + #elif defined(__nvptx__) + asm ("exit;"); +@@ -297,7 +302,8 @@ gomp_free_thread (void *arg __attribute_ + #ifdef LIBGOMP_USE_PTHREADS + void + gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads, +- unsigned flags, struct gomp_team *team) ++ unsigned flags, struct gomp_team *team, ++ struct gomp_taskgroup *taskgroup) + { + struct gomp_thread_start_data *start_data; + struct gomp_thread *thr, *nthr; +@@ -312,6 +318,7 @@ gomp_team_start (void (*fn) (void *), vo + unsigned int s = 0, rest = 0, p = 0, k = 0; + unsigned int affinity_count = 0; + struct gomp_thread **affinity_thr = NULL; ++ bool force_display = false; + + thr = gomp_thread (); + nested = thr->ts.level; +@@ -319,7 +326,12 @@ gomp_team_start (void (*fn) (void *), vo + task = thr->task; + icv = task ? &task->icv : &gomp_global_icv; + if (__builtin_expect (gomp_places_list != NULL, 0) && thr->place == 0) +- gomp_init_affinity (); ++ { ++ gomp_init_affinity (); ++ if (__builtin_expect (gomp_display_affinity_var, 0) && nthreads == 1) ++ gomp_display_affinity_thread (gomp_thread_self (), &thr->ts, ++ thr->place); ++ } + + /* Always save the previous state, even if this isn't a nested team. 
+ In particular, we should save any work share state from an outer +@@ -338,6 +350,9 @@ gomp_team_start (void (*fn) (void *), vo + #endif + thr->ts.static_trip = 0; + thr->task = &team->implicit_task[0]; ++#ifdef GOMP_NEEDS_THREAD_HANDLE ++ thr->handle = pthread_self (); ++#endif + nthreads_var = icv->nthreads_var; + if (__builtin_expect (gomp_nthreads_var_list != NULL, 0) + && thr->ts.level < gomp_nthreads_var_list_len) +@@ -350,6 +365,7 @@ gomp_team_start (void (*fn) (void *), vo + && thr->ts.level < gomp_bind_var_list_len) + bind_var = gomp_bind_var_list[thr->ts.level]; + gomp_init_task (thr->task, task, icv); ++ thr->task->taskgroup = taskgroup; + team->implicit_task[0].icv.nthreads_var = nthreads_var; + team->implicit_task[0].icv.bind_var = bind_var; + +@@ -465,7 +481,9 @@ gomp_team_start (void (*fn) (void *), vo + pool->threads + = gomp_realloc (pool->threads, + pool->threads_size +- * sizeof (struct gomp_thread_data *)); ++ * sizeof (struct gomp_thread *)); ++ /* Add current (master) thread to threads[]. */ ++ pool->threads[0] = thr; + } + + /* Release existing idle threads. */ +@@ -540,6 +558,7 @@ gomp_team_start (void (*fn) (void *), vo + + place_partition_len)) + { + unsigned int l; ++ force_display = true; + if (affinity_thr == NULL) + { + unsigned int j; +@@ -623,6 +642,7 @@ gomp_team_start (void (*fn) (void *), vo + gomp_init_task (nthr->task, task, icv); + team->implicit_task[i].icv.nthreads_var = nthreads_var; + team->implicit_task[i].icv.bind_var = bind_var; ++ nthr->task->taskgroup = taskgroup; + nthr->fn = fn; + nthr->data = data; + team->ordered_release[i] = &nthr->release; +@@ -712,19 +732,17 @@ gomp_team_start (void (*fn) (void *), vo + { + size_t stacksize; + pthread_attr_init (&thread_attr); +- pthread_attr_setdetachstate (&thread_attr, PTHREAD_CREATE_DETACHED); + if (! pthread_attr_getstacksize (&gomp_thread_attr, &stacksize)) + pthread_attr_setstacksize (&thread_attr, stacksize); + attr = &thread_attr; + } + + start_data = gomp_alloca (sizeof (struct gomp_thread_start_data) +- * (nthreads-i)); ++ * (nthreads - i)); + + /* Launch new threads. 
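The surrounding hunks replace the old throwaway pthread_t with a handle stored in each thread's start-data block, so the creator can later hand it to the affinity display code. The pattern in isolation, as a stand-alone sketch where only the creator consults the saved handles:

    #include <pthread.h>
    #include <stdio.h>

    struct start_data
    {
      pthread_t handle;   /* filled in by pthread_create */
      int id;
    };

    static void *
    worker (void *xdata)
    {
      struct start_data *data = xdata;
      printf ("worker %d running\n", data->id);
      return 0;
    }

    int
    main (void)
    {
      struct start_data data[2] = { { 0, 0 }, { 0, 1 } };
      for (int i = 0; i < 2; i++)
        pthread_create (&data[i].handle, 0, worker, &data[i]);
      for (int i = 0; i < 2; i++)
        pthread_join (data[i].handle, 0);  /* creator uses saved handle */
      return 0;
    }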
*/ + for (; i < nthreads; ++i) + { +- pthread_t pt; + int err; + + start_data->ts.place_partition_off = thr->ts.place_partition_off; +@@ -810,11 +828,14 @@ gomp_team_start (void (*fn) (void *), vo + gomp_init_task (start_data->task, task, icv); + team->implicit_task[i].icv.nthreads_var = nthreads_var; + team->implicit_task[i].icv.bind_var = bind_var; ++ start_data->task->taskgroup = taskgroup; + start_data->thread_pool = pool; + start_data->nested = nested; + + attr = gomp_adjust_thread_attr (attr, &thread_attr); +- err = pthread_create (&pt, attr, gomp_thread_start, start_data++); ++ err = pthread_create (&start_data->handle, attr, gomp_thread_start, ++ start_data); ++ start_data++; + if (err != 0) + gomp_fatal ("Thread creation failed: %s", strerror (err)); + } +@@ -854,6 +875,42 @@ gomp_team_start (void (*fn) (void *), vo + gomp_mutex_unlock (&gomp_managed_threads_lock); + #endif + } ++ if (__builtin_expect (gomp_display_affinity_var, 0)) ++ { ++ if (nested ++ || nthreads != old_threads_used ++ || force_display) ++ { ++ gomp_display_affinity_thread (gomp_thread_self (), &thr->ts, ++ thr->place); ++ if (nested) ++ { ++ start_data -= nthreads - 1; ++ for (i = 1; i < nthreads; ++i) ++ { ++ gomp_display_affinity_thread ( ++#ifdef LIBGOMP_USE_PTHREADS ++ start_data->handle, ++#else ++ gomp_thread_self (), ++#endif ++ &start_data->ts, ++ start_data->place); ++ start_data++; ++ } ++ } ++ else ++ { ++ for (i = 1; i < nthreads; ++i) ++ { ++ gomp_thread_handle handle ++ = gomp_thread_to_pthread_t (pool->threads[i]); ++ gomp_display_affinity_thread (handle, &pool->threads[i]->ts, ++ pool->threads[i]->place); ++ } ++ } ++ } ++ } + if (__builtin_expect (affinity_thr != NULL, 0) + && team->prev_ts.place_partition_len > 64) + free (affinity_thr); +@@ -894,7 +951,7 @@ gomp_team_end (void) + gomp_end_task (); + thr->ts = team->prev_ts; + +- if (__builtin_expect (thr->ts.team != NULL, 0)) ++ if (__builtin_expect (thr->ts.level != 0, 0)) + { + #ifdef HAVE_SYNC_BUILTINS + __sync_fetch_and_add (&gomp_managed_threads, 1L - team->nthreads); +@@ -959,6 +1016,76 @@ team_destructor (void) + crashes. */ + pthread_key_delete (gomp_thread_destructor); + } ++ ++/* Similar to gomp_free_pool_helper, but don't detach itself, ++ gomp_pause_host will pthread_join those threads. */ ++ ++static void ++gomp_pause_pool_helper (void *thread_pool) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_thread_pool *pool ++ = (struct gomp_thread_pool *) thread_pool; ++ gomp_simple_barrier_wait_last (&pool->threads_dock); ++ gomp_sem_destroy (&thr->release); ++ thr->thread_pool = NULL; ++ thr->task = NULL; ++ pthread_exit (NULL); ++} ++ ++/* Free a thread pool and release its threads. Return non-zero on ++ failure. */ ++ ++int ++gomp_pause_host (void) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ struct gomp_thread_pool *pool = thr->thread_pool; ++ if (thr->ts.level) ++ return -1; ++ if (pool) ++ { ++ if (pool->threads_used > 0) ++ { ++ int i; ++ pthread_t *thrs ++ = gomp_alloca (sizeof (pthread_t) * pool->threads_used); ++ for (i = 1; i < pool->threads_used; i++) ++ { ++ struct gomp_thread *nthr = pool->threads[i]; ++ nthr->fn = gomp_pause_pool_helper; ++ nthr->data = pool; ++ thrs[i] = gomp_thread_to_pthread_t (nthr); ++ } ++ /* This barrier undocks threads docked on pool->threads_dock. */ ++ gomp_simple_barrier_wait (&pool->threads_dock); ++ /* And this waits till all threads have called gomp_barrier_wait_last ++ in gomp_pause_pool_helper. 
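gomp_pause_host here is the host-side worker behind the new omp_pause_resource and omp_pause_resource_all entry points exported under OMP_5.0 earlier in this patch. From user code the feature looks like this:

    #include <omp.h>
    #include <stdio.h>

    int
    main (void)
    {
      #pragma omp parallel
      { /* first phase: the worker pool gets created */ }

      /* Ask the runtime to release reusable resources (here: the
         pool threads); returns 0 on success.  */
      if (omp_pause_resource_all (omp_pause_soft) != 0)
        printf ("pause failed\n");

      #pragma omp parallel
      { /* second phase: the pool is recreated on demand */ }
      return 0;
    }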
*/ ++ gomp_simple_barrier_wait (&pool->threads_dock); ++ /* Now it is safe to destroy the barrier and free the pool. */ ++ gomp_simple_barrier_destroy (&pool->threads_dock); ++ ++#ifdef HAVE_SYNC_BUILTINS ++ __sync_fetch_and_add (&gomp_managed_threads, ++ 1L - pool->threads_used); ++#else ++ gomp_mutex_lock (&gomp_managed_threads_lock); ++ gomp_managed_threads -= pool->threads_used - 1L; ++ gomp_mutex_unlock (&gomp_managed_threads_lock); ++#endif ++ for (i = 1; i < pool->threads_used; i++) ++ pthread_join (thrs[i], NULL); ++ } ++ if (pool->last_team) ++ free_team (pool->last_team); ++#ifndef __nvptx__ ++ free (pool->threads); ++ free (pool); ++#endif ++ thr->thread_pool = NULL; ++ } ++ return 0; ++} + #endif + + struct gomp_task_icv * +--- libgomp/libgomp.h.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/libgomp.h 2019-05-07 19:01:51.285535999 +0200 +@@ -44,6 +44,7 @@ + #include "config.h" + #include "gstdint.h" + #include "libgomp-plugin.h" ++#include "gomp-constants.h" + + #ifdef HAVE_PTHREAD_H + #include +@@ -85,9 +86,21 @@ enum memmodel + + /* alloc.c */ + ++#if defined(HAVE_ALIGNED_ALLOC) \ ++ || defined(HAVE__ALIGNED_MALLOC) \ ++ || defined(HAVE_POSIX_MEMALIGN) \ ++ || defined(HAVE_MEMALIGN) ++/* Defined if gomp_aligned_alloc doesn't use fallback version ++ and free can be used instead of gomp_aligned_free. */ ++#define GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC 1 ++#endif ++ + extern void *gomp_malloc (size_t) __attribute__((malloc)); + extern void *gomp_malloc_cleared (size_t) __attribute__((malloc)); + extern void *gomp_realloc (void *, size_t); ++extern void *gomp_aligned_alloc (size_t, size_t) ++ __attribute__((malloc, alloc_size (2))); ++extern void gomp_aligned_free (void *); + + /* Avoid conflicting prototypes of alloca() in system headers by using + GCC's builtin alloca(). */ +@@ -137,7 +150,8 @@ enum gomp_schedule_type + GFS_STATIC, + GFS_DYNAMIC, + GFS_GUIDED, +- GFS_AUTO ++ GFS_AUTO, ++ GFS_MONOTONIC = 0x80000000U + }; + + struct gomp_doacross_work_share +@@ -174,6 +188,8 @@ struct gomp_doacross_work_share + /* Likewise, but for the ull implementation. */ + unsigned long long boundary_ull; + }; ++ /* Pointer to extra memory if needed for lastprivate(conditional). */ ++ void *extra; + /* Array of shift counts for each dimension if they can be flattened. */ + unsigned int shift_counts[]; + }; +@@ -275,6 +291,9 @@ struct gomp_work_share + struct gomp_work_share *next_free; + }; + ++ /* Task reductions for this work-sharing construct. */ ++ uintptr_t *task_reductions; ++ + /* If only few threads are in the team, ordered_team_ids can point + to this array which fills the padding at the end of this struct. */ + unsigned inline_ordered_team_ids[0]; +@@ -365,8 +384,12 @@ extern void **gomp_places_list; + extern unsigned long gomp_places_list_len; + extern unsigned int gomp_num_teams_var; + extern int gomp_debug_var; ++extern bool gomp_display_affinity_var; ++extern char *gomp_affinity_format_var; ++extern size_t gomp_affinity_format_len; + extern int goacc_device_num; + extern char *goacc_device_type; ++extern int goacc_default_dims[GOMP_DIM_MAX]; + + enum gomp_task_kind + { +@@ -469,8 +492,10 @@ struct gomp_taskgroup + struct gomp_taskgroup *prev; + /* Queue of tasks that belong in this taskgroup. 
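The GFS_MONOTONIC bit added to enum gomp_schedule_type above rides in the top bit of run_sched_var, which is why every schedule switch in this patch first masks it off with & ~GFS_MONOTONIC. In isolation (the values mirror the libgomp enum):

    enum gomp_schedule_sketch
    {
      GFS_RUNTIME, GFS_STATIC, GFS_DYNAMIC, GFS_GUIDED, GFS_AUTO,
      GFS_MONOTONIC = 0x80000000U
    };

    /* e.g. schedule (monotonic: dynamic, 4) arrives as
       GFS_DYNAMIC | GFS_MONOTONIC; the base kind is recovered with: */
    static inline long
    base_sched (long sched)
    {
      return sched & ~GFS_MONOTONIC;
    }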
*/ + struct priority_queue taskgroup_queue; ++ uintptr_t *reductions; + bool in_taskgroup_wait; + bool cancelled; ++ bool workshare; + gomp_sem_t taskgroup_sem; + size_t num_children; + }; +@@ -613,6 +638,19 @@ struct gomp_thread + + /* User pthread thread pool */ + struct gomp_thread_pool *thread_pool; ++ ++#if defined(LIBGOMP_USE_PTHREADS) \ ++ && (!defined(HAVE_TLS) \ ++ || !defined(__GLIBC__) \ ++ || !defined(USING_INITIAL_EXEC_TLS)) ++ /* pthread_t of the thread containing this gomp_thread. ++ On Linux when using initial-exec TLS, ++ (typeof (pthread_t)) gomp_thread () - pthread_self () ++ is constant in all threads, so we can optimize and not ++ store it. */ ++#define GOMP_NEEDS_THREAD_HANDLE 1 ++ pthread_t handle; ++#endif + }; + + +@@ -709,6 +747,25 @@ extern bool gomp_affinity_finalize_place + extern bool gomp_affinity_init_level (int, unsigned long, bool); + extern void gomp_affinity_print_place (void *); + extern void gomp_get_place_proc_ids_8 (int, int64_t *); ++extern void gomp_display_affinity_place (char *, size_t, size_t *, int); ++ ++/* affinity-fmt.c */ ++ ++extern void gomp_print_string (const char *str, size_t len); ++extern void gomp_set_affinity_format (const char *, size_t); ++extern void gomp_display_string (char *, size_t, size_t *, const char *, ++ size_t); ++#ifdef LIBGOMP_USE_PTHREADS ++typedef pthread_t gomp_thread_handle; ++#else ++typedef struct {} gomp_thread_handle; ++#endif ++extern size_t gomp_display_affinity (char *, size_t, const char *, ++ gomp_thread_handle, ++ struct gomp_team_state *, unsigned int); ++extern void gomp_display_affinity_thread (gomp_thread_handle, ++ struct gomp_team_state *, ++ unsigned int) __attribute__((cold)); + + /* iter.c */ + +@@ -745,9 +802,9 @@ extern void gomp_ordered_next (void); + extern void gomp_ordered_static_init (void); + extern void gomp_ordered_static_next (void); + extern void gomp_ordered_sync (void); +-extern void gomp_doacross_init (unsigned, long *, long); ++extern void gomp_doacross_init (unsigned, long *, long, size_t); + extern void gomp_doacross_ull_init (unsigned, unsigned long long *, +- unsigned long long); ++ unsigned long long, size_t); + + /* parallel.c */ + +@@ -770,6 +827,10 @@ extern bool gomp_create_target_task (str + size_t *, unsigned short *, unsigned int, + void **, void **, + enum gomp_target_task_state); ++extern struct gomp_taskgroup *gomp_parallel_reduction_register (uintptr_t *, ++ unsigned); ++extern void gomp_workshare_taskgroup_start (void); ++extern void gomp_workshare_task_reduction_register (uintptr_t *, uintptr_t *); + + static void inline + gomp_finish_task (struct gomp_task *task) +@@ -782,9 +843,11 @@ gomp_finish_task (struct gomp_task *task + + extern struct gomp_team *gomp_new_team (unsigned); + extern void gomp_team_start (void (*) (void *), void *, unsigned, +- unsigned, struct gomp_team *); ++ unsigned, struct gomp_team *, ++ struct gomp_taskgroup *); + extern void gomp_team_end (void); + extern void gomp_free_thread (void *); ++extern int gomp_pause_host (void); + + /* target.c */ + +@@ -851,6 +914,8 @@ struct splay_tree_key_s { + uintptr_t tgt_offset; + /* Reference count. */ + uintptr_t refcount; ++ /* Dynamic reference count. */ ++ uintptr_t dynamic_refcount; + /* Pointer to the original mapping of "omp declare target link" object. 
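The GOMP_NEEDS_THREAD_HANDLE comment above describes the case this field avoids: with initial-exec TLS on glibc, the distance between a thread's gomp_thread () block and its pthread_t is the same in every thread, so one subtraction computed in the current thread recovers any other thread's handle, as gomp_thread_to_pthread_t does below. A sketch of the arithmetic (assumes pthread_t is an integral, pointer-sized type, as on glibc):

    #include <pthread.h>
    #include <stdint.h>

    struct gomp_thread;   /* opaque for the sketch */

    static pthread_t
    handle_of (struct gomp_thread *thr, struct gomp_thread *self)
    {
      /* delta is a process-wide constant under initial-exec TLS.  */
      uintptr_t delta = (uintptr_t) self - (uintptr_t) pthread_self ();
      return (pthread_t) ((uintptr_t) thr - delta);
    }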
*/ + splay_tree_key link_key; + }; +@@ -989,7 +1054,9 @@ enum gomp_map_vars_kind + }; + + extern void gomp_acc_insert_pointer (size_t, void **, size_t *, void *); +-extern void gomp_acc_remove_pointer (void *, bool, int, int); ++extern void gomp_acc_remove_pointer (void *, size_t, bool, int, int, int); ++extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *, ++ unsigned short *); + + extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *, + size_t, void **, void **, +@@ -999,12 +1066,13 @@ extern void gomp_unmap_vars (struct targ + extern void gomp_init_device (struct gomp_device_descr *); + extern void gomp_free_memmap (struct splay_tree_s *); + extern void gomp_unload_device (struct gomp_device_descr *); ++extern bool gomp_remove_var (struct gomp_device_descr *, splay_tree_key); + + /* work.c */ + +-extern void gomp_init_work_share (struct gomp_work_share *, bool, unsigned); ++extern void gomp_init_work_share (struct gomp_work_share *, size_t, unsigned); + extern void gomp_fini_work_share (struct gomp_work_share *); +-extern bool gomp_work_share_start (bool); ++extern bool gomp_work_share_start (size_t); + extern void gomp_work_share_end (void); + extern bool gomp_work_share_end_cancel (void); + extern void gomp_work_share_end_nowait (void); +@@ -1028,6 +1096,14 @@ gomp_work_share_init_done (void) + #include "omp-lock.h" + #define _LIBGOMP_OMP_LOCK_DEFINED 1 + #include "omp.h.in" ++#define omp_sched_monotonic 0x80000000U ++typedef enum omp_pause_resource_t ++{ ++ omp_pause_soft = 1, ++ omp_pause_hard = 2 ++} omp_pause_resource_t; ++extern int omp_pause_resource (omp_pause_resource_t, int) __GOMP_NOTHROW; ++extern int omp_pause_resource_all (omp_pause_resource_t) __GOMP_NOTHROW; + + #if !defined (HAVE_ATTRIBUTE_VISIBILITY) \ + || !defined (HAVE_ATTRIBUTE_ALIAS) \ +@@ -1082,16 +1158,26 @@ extern int gomp_test_nest_lock_25 (omp_n + # define attribute_hidden + #endif + ++#if __GNUC__ >= 9 ++# define HAVE_ATTRIBUTE_COPY ++#endif ++ ++#ifdef HAVE_ATTRIBUTE_COPY ++# define attribute_copy(arg) __attribute__ ((copy (arg))) ++#else ++# define attribute_copy(arg) ++#endif ++ + #ifdef HAVE_ATTRIBUTE_ALIAS + # define strong_alias(fn, al) \ +- extern __typeof (fn) al __attribute__ ((alias (#fn))); ++ extern __typeof (fn) al __attribute__ ((alias (#fn))) attribute_copy (fn); + + # define ialias_ulp ialias_str1(__USER_LABEL_PREFIX__) + # define ialias_str1(x) ialias_str2(x) + # define ialias_str2(x) #x + # define ialias(fn) \ + extern __typeof (fn) gomp_ialias_##fn \ +- __attribute__ ((alias (#fn))) attribute_hidden; ++ __attribute__ ((alias (#fn))) attribute_hidden attribute_copy (fn); + # define ialias_redirect(fn) \ + extern __typeof (fn) fn __asm__ (ialias_ulp "gomp_ialias_" #fn) attribute_hidden; + # define ialias_call(fn) gomp_ialias_ ## fn +@@ -1131,4 +1217,42 @@ task_to_priority_node (enum priority_que + return (struct priority_node *) ((char *) task + + priority_queue_offset (type)); + } ++ ++#ifdef LIBGOMP_USE_PTHREADS ++static inline gomp_thread_handle ++gomp_thread_self (void) ++{ ++ return pthread_self (); ++} ++ ++static inline gomp_thread_handle ++gomp_thread_to_pthread_t (struct gomp_thread *thr) ++{ ++ struct gomp_thread *this_thr = gomp_thread (); ++ if (thr == this_thr) ++ return pthread_self (); ++#ifdef GOMP_NEEDS_THREAD_HANDLE ++ return thr->handle; ++#else ++ /* On Linux with initial-exec TLS, the pthread_t of the thread containing ++ thr can be computed from thr, this_thr and pthread_self (), ++ as the distance between this_thr and pthread_self 
() is constant. */ ++ return pthread_self () + ((uintptr_t) thr - (uintptr_t) this_thr); ++#endif ++} ++#else ++static inline gomp_thread_handle ++gomp_thread_self (void) ++{ ++ return (gomp_thread_handle) {}; ++} ++ ++static inline gomp_thread_handle ++gomp_thread_to_pthread_t (struct gomp_thread *thr) ++{ ++ (void) thr; ++ return gomp_thread_self (); ++} ++#endif ++ + #endif /* LIBGOMP_H */ +--- libgomp/oacc-parallel.c.jj 2018-04-25 09:40:31.319655306 +0200 ++++ libgomp/oacc-parallel.c 2019-05-07 19:09:47.010991153 +0200 +@@ -27,6 +27,8 @@ + /* This file handles OpenACC constructs. */ + + #include "openacc.h" ++void acc_copyout_finalize (void *, size_t) __GOACC_NOTHROW; ++void acc_delete_finalize (void *, size_t) __GOACC_NOTHROW; + #include "libgomp.h" + #include "libgomp_g.h" + #include "gomp-constants.h" +@@ -38,31 +40,95 @@ + #include + #include + ++ ++/* In the ABI, the GOACC_FLAGs are encoded as an inverted bitmask, so that we ++ continue to support the following two legacy values. */ ++_Static_assert (GOACC_FLAGS_UNMARSHAL (GOMP_DEVICE_ICV) == 0, ++ "legacy GOMP_DEVICE_ICV broken"); ++_Static_assert (GOACC_FLAGS_UNMARSHAL (GOMP_DEVICE_HOST_FALLBACK) ++ == GOACC_FLAG_HOST_FALLBACK, ++ "legacy GOMP_DEVICE_HOST_FALLBACK broken"); ++ ++ ++/* Returns the number of mappings associated with the pointer or pset. PSETs ++ have three mappings, whereas pointers have two. */ ++ + static int +-find_pset (int pos, size_t mapnum, unsigned short *kinds) ++find_pointer (int pos, size_t mapnum, unsigned short *kinds) + { + if (pos + 1 >= mapnum) + return 0; + + unsigned char kind = kinds[pos+1] & 0xff; + +- return kind == GOMP_MAP_TO_PSET; ++ if (kind == GOMP_MAP_TO_PSET) ++ return 3; ++ else if (kind == GOMP_MAP_POINTER) ++ return 2; ++ ++ return 0; ++} ++ ++/* Handle the mapping pairs that are presented when a ++ deviceptr clause is used with Fortran. */ ++ ++static void ++handle_ftn_pointers (size_t mapnum, void **hostaddrs, size_t *sizes, ++ unsigned short *kinds) ++{ ++ int i; ++ ++ for (i = 0; i < mapnum; i++) ++ { ++ unsigned short kind1 = kinds[i] & 0xff; ++ ++ /* Handle Fortran deviceptr clause. */ ++ if (kind1 == GOMP_MAP_FORCE_DEVICEPTR) ++ { ++ unsigned short kind2; ++ ++ if (i < (signed)mapnum - 1) ++ kind2 = kinds[i + 1] & 0xff; ++ else ++ kind2 = 0xffff; ++ ++ if (sizes[i] == sizeof (void *)) ++ continue; ++ ++ /* At this point, we're dealing with a Fortran deviceptr. ++ If the next element is not what we're expecting, then ++ this is a case where the deviceptr variable was ++ not used within the region and the pointer was removed ++ by the gimplifier. */ ++ if (kind2 == GOMP_MAP_POINTER ++ && sizes[i + 1] == 0 ++ && hostaddrs[i] == *(void **)hostaddrs[i + 1]) ++ { ++ kinds[i+1] = kinds[i]; ++ sizes[i+1] = sizeof (void *); ++ } ++ ++ /* Invalidate the entry. */ ++ hostaddrs[i] = NULL; ++ } ++ } + } + + static void goacc_wait (int async, int num_waits, va_list *ap); + + +-/* Launch a possibly offloaded function on DEVICE. FN is the host fn ++/* Launch a possibly offloaded function with FLAGS. FN is the host fn + address. MAPNUM, HOSTADDRS, SIZES & KINDS describe the memory + blocks to be copied to/from the device. Variadic arguments are + keyed optional parameters terminated with a zero. */ + + void +-GOACC_parallel_keyed (int device, void (*fn) (void *), ++GOACC_parallel_keyed (int flags_m, void (*fn) (void *), + size_t mapnum, void **hostaddrs, size_t *sizes, + unsigned short *kinds, ...)
+ { +- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; ++ int flags = GOACC_FLAGS_UNMARSHAL (flags_m); ++ + va_list ap; + struct goacc_thread *thr; + struct gomp_device_descr *acc_dev; +@@ -88,9 +154,11 @@ GOACC_parallel_keyed (int device, void ( + thr = goacc_thread (); + acc_dev = thr->dev; + ++ handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds); ++ + /* Host fallback if "if" clause is false or if the current device is set to + the host. */ +- if (host_fallback) ++ if (flags & GOACC_FLAG_HOST_FALLBACK) + { + goacc_save_and_set_bind (acc_device_host); + fn (hostaddrs); +@@ -140,9 +208,7 @@ GOACC_parallel_keyed (int device, void ( + case GOMP_LAUNCH_WAIT: + { + unsigned num_waits = GOMP_LAUNCH_OP (tag); +- +- if (num_waits) +- goacc_wait (async, num_waits, &ap); ++ goacc_wait (async, num_waits, &ap); + break; + } + +@@ -177,16 +243,36 @@ GOACC_parallel_keyed (int device, void ( + devaddrs = gomp_alloca (sizeof (void *) * mapnum); + for (i = 0; i < mapnum; i++) + devaddrs[i] = (void *) (tgt->list[i].key->tgt->tgt_start +- + tgt->list[i].key->tgt_offset); ++ + tgt->list[i].key->tgt_offset ++ + tgt->list[i].offset); + + acc_dev->openacc.exec_func (tgt_fn, mapnum, hostaddrs, devaddrs, + async, dims, tgt); + + /* If running synchronously, unmap immediately. */ +- if (async < acc_async_noval) ++ bool copyfrom = true; ++ if (async_synchronous_p (async)) + gomp_unmap_vars (tgt, true); + else +- tgt->device_descr->openacc.register_async_cleanup_func (tgt, async); ++ { ++ bool async_unmap = false; ++ for (size_t i = 0; i < tgt->list_count; i++) ++ { ++ splay_tree_key k = tgt->list[i].key; ++ if (k && k->refcount == 1) ++ { ++ async_unmap = true; ++ break; ++ } ++ } ++ if (async_unmap) ++ tgt->device_descr->openacc.register_async_cleanup_func (tgt, async); ++ else ++ { ++ copyfrom = false; ++ gomp_unmap_vars (tgt, copyfrom); ++ } ++ } + + acc_dev->openacc.async_set_async_func (acc_async_sync); + } +@@ -194,7 +280,7 @@ GOACC_parallel_keyed (int device, void ( + /* Legacy entry point, only provide host execution. */ + + void +-GOACC_parallel (int device, void (*fn) (void *), ++GOACC_parallel (int flags_m, void (*fn) (void *), + size_t mapnum, void **hostaddrs, size_t *sizes, + unsigned short *kinds, + int num_gangs, int num_workers, int vector_length, +@@ -206,10 +292,11 @@ GOACC_parallel (int device, void (*fn) ( + } + + void +-GOACC_data_start (int device, size_t mapnum, ++GOACC_data_start (int flags_m, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds) + { +- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; ++ int flags = GOACC_FLAGS_UNMARSHAL (flags_m); ++ + struct target_mem_desc *tgt; + + #ifdef HAVE_INTTYPES_H +@@ -227,7 +314,7 @@ GOACC_data_start (int device, size_t map + + /* Host fallback or 'do nothing'. */ + if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) +- || host_fallback) ++ || (flags & GOACC_FLAG_HOST_FALLBACK)) + { + tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, true, + GOMP_MAP_VARS_OPENACC); +@@ -258,13 +345,14 @@ GOACC_data_end (void) + } + + void +-GOACC_enter_exit_data (int device, size_t mapnum, ++GOACC_enter_exit_data (int flags_m, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds, + int async, int num_waits, ...) 
{ ++ int flags = GOACC_FLAGS_UNMARSHAL (flags_m); ++ + struct goacc_thread *thr; + struct gomp_device_descr *acc_dev; +- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; + bool data_enter = false; + size_t i; + +@@ -274,7 +362,7 @@ GOACC_enter_exit_data (int device, size_ + acc_dev = thr->dev; + + if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) +- || host_fallback) ++ || (flags & GOACC_FLAG_HOST_FALLBACK)) + return; + + if (num_waits) +@@ -286,6 +374,17 @@ GOACC_enter_exit_data (int device, size_ + va_end (ap); + } + ++ /* Determine whether "finalize" semantics apply to all mappings of this ++ OpenACC directive. */ ++ bool finalize = false; ++ if (mapnum > 0) ++ { ++ unsigned char kind = kinds[0] & 0xff; ++ if (kind == GOMP_MAP_DELETE ++ || kind == GOMP_MAP_FORCE_FROM) ++ finalize = true; ++ } ++ + acc_dev->openacc.async_set_async_func (async); + + /* Determine if this is an "acc enter data". */ +@@ -298,13 +397,17 @@ GOACC_enter_exit_data (int device, size_ + + if (kind == GOMP_MAP_FORCE_ALLOC + || kind == GOMP_MAP_FORCE_PRESENT +- || kind == GOMP_MAP_FORCE_TO) ++ || kind == GOMP_MAP_FORCE_TO ++ || kind == GOMP_MAP_TO ++ || kind == GOMP_MAP_ALLOC) + { + data_enter = true; + break; + } + +- if (kind == GOMP_MAP_DELETE ++ if (kind == GOMP_MAP_RELEASE ++ || kind == GOMP_MAP_DELETE ++ || kind == GOMP_MAP_FROM + || kind == GOMP_MAP_FORCE_FROM) + break; + +@@ -312,31 +415,35 @@ GOACC_enter_exit_data (int device, size_ + kind); + } + ++ /* In C, non-pointers and arrays are represented by a single data clause. ++ Dynamically allocated arrays and subarrays are represented by a data ++ clause followed by an internal GOMP_MAP_POINTER. ++ ++ In Fortran, scalars and unallocated arrays are represented by a ++ single data clause. Allocated arrays and subarrays have three mappings: ++ 1) the original data clause, 2) a PSET, and 3) a pointer to the array data. ++ */ ++ + if (data_enter) + { + for (i = 0; i < mapnum; i++) + { + unsigned char kind = kinds[i] & 0xff; + +- /* Scan for PSETs. */ +- int psets = find_pset (i, mapnum, kinds); ++ /* Scan for pointers and PSETs. */ ++ int pointer = find_pointer (i, mapnum, kinds); + +- if (!psets) ++ if (!pointer) + { + switch (kind) + { +- case GOMP_MAP_POINTER: +- gomp_acc_insert_pointer (1, &hostaddrs[i], &sizes[i], +- &kinds[i]); +- break; ++ case GOMP_MAP_ALLOC: + case GOMP_MAP_FORCE_ALLOC: + acc_create (hostaddrs[i], sizes[i]); + break; +- case GOMP_MAP_FORCE_PRESENT: +- acc_present_or_copyin (hostaddrs[i], sizes[i]); +- break; ++ case GOMP_MAP_TO: + case GOMP_MAP_FORCE_TO: +- acc_present_or_copyin (hostaddrs[i], sizes[i]); ++ acc_copyin (hostaddrs[i], sizes[i]); + break; + default: + gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", +@@ -346,12 +453,13 @@ GOACC_enter_exit_data (int device, size_ + } + else + { +- gomp_acc_insert_pointer (3, &hostaddrs[i], &sizes[i], &kinds[i]); ++ gomp_acc_insert_pointer (pointer, &hostaddrs[i], ++ &sizes[i], &kinds[i]); + /* Increment 'i' by two because OpenACC requires Fortran + arrays to be contiguous, so each PSET is associated with + one of MAP_FORCE_ALLOC/MAP_FORCE_PRESENT/MAP_FORCE_TO, and + one MAP_POINTER.
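
The enter/exit loops step through kinds[] one mapping group at a time, using the value find_pointer returns. The stepping rule, restated in isolation as a sketch (only the GOMP_MAP_* constants are assumed, from the in-tree gomp-constants.h; map_group_len is a hypothetical helper, not library code):

#include <stddef.h>
#include "gomp-constants.h"	/* GOMP_MAP_TO_PSET, GOMP_MAP_POINTER.  */

/* Number of kinds[] slots one source-level data clause occupies: 3 for a
   Fortran allocated array (clause + PSET + pointer), 2 for a dynamically
   allocated array or subarray (clause + pointer), 1 for everything else.
   This is find_pointer's result, with 1 instead of 0 for the plain case.  */
static size_t
map_group_len (size_t pos, size_t mapnum, const unsigned short *kinds)
{
  if (pos + 1 < mapnum)
    {
      unsigned char next = kinds[pos + 1] & 0xff;
      if (next == GOMP_MAP_TO_PSET)
	return 3;
      if (next == GOMP_MAP_POINTER)
	return 2;
    }
  return 1;
}

/* A caller would then advance whole groups at a time:
   for (i = 0; i < mapnum; i += map_group_len (i, mapnum, kinds))
     ...
   which is what the "i += pointer - 1" below achieves inside a loop
   that already increments i once per iteration.  */
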
*/ +- i += 2; ++ i += pointer - 1; + } + } + } +@@ -360,22 +468,28 @@ GOACC_enter_exit_data (int device, size_ + { + unsigned char kind = kinds[i] & 0xff; + +- int psets = find_pset (i, mapnum, kinds); ++ int pointer = find_pointer (i, mapnum, kinds); + +- if (!psets) ++ if (!pointer) + { + switch (kind) + { +- case GOMP_MAP_POINTER: +- gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff) +- == GOMP_MAP_FORCE_FROM, +- async, 1); +- break; ++ case GOMP_MAP_RELEASE: + case GOMP_MAP_DELETE: +- acc_delete (hostaddrs[i], sizes[i]); ++ if (acc_is_present (hostaddrs[i], sizes[i])) ++ { ++ if (finalize) ++ acc_delete_finalize (hostaddrs[i], sizes[i]); ++ else ++ acc_delete (hostaddrs[i], sizes[i]); ++ } + break; ++ case GOMP_MAP_FROM: + case GOMP_MAP_FORCE_FROM: +- acc_copyout (hostaddrs[i], sizes[i]); ++ if (finalize) ++ acc_copyout_finalize (hostaddrs[i], sizes[i]); ++ else ++ acc_copyout (hostaddrs[i], sizes[i]); + break; + default: + gomp_fatal (">>>> GOACC_enter_exit_data UNHANDLED kind 0x%.2x", +@@ -385,10 +499,12 @@ GOACC_enter_exit_data (int device, size_ + } + else + { +- gomp_acc_remove_pointer (hostaddrs[i], (kinds[i] & 0xff) +- == GOMP_MAP_FORCE_FROM, async, 3); ++ bool copyfrom = (kind == GOMP_MAP_FORCE_FROM ++ || kind == GOMP_MAP_FROM); ++ gomp_acc_remove_pointer (hostaddrs[i], sizes[i], copyfrom, async, ++ finalize, pointer); + /* See the above comment. */ +- i += 2; ++ i += pointer - 1; + } + } + +@@ -398,13 +514,20 @@ GOACC_enter_exit_data (int device, size_ + static void + goacc_wait (int async, int num_waits, va_list *ap) + { +- struct goacc_thread *thr = goacc_thread (); +- struct gomp_device_descr *acc_dev = thr->dev; +- + while (num_waits--) + { + int qid = va_arg (*ap, int); +- ++ ++ /* Waiting on ACC_ASYNC_NOVAL maps to 'wait all'. */ ++ if (qid == acc_async_noval) ++ { ++ if (async == acc_async_sync) ++ acc_wait_all (); ++ else ++ acc_wait_all_async (async); ++ break; ++ } ++ + if (acc_async_test (qid)) + continue; + +@@ -415,16 +538,17 @@ goacc_wait (int async, int num_waits, va + launching on, the queue itself will order work as + required, so there's no need to wait explicitly. */ + else +- acc_dev->openacc.async_wait_async_func (qid, async); ++ acc_wait_async (qid, async); + } + } + + void +-GOACC_update (int device, size_t mapnum, ++GOACC_update (int flags_m, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds, + int async, int num_waits, ...) + { +- bool host_fallback = device == GOMP_DEVICE_HOST_FALLBACK; ++ int flags = GOACC_FLAGS_UNMARSHAL (flags_m); ++ + size_t i; + + goacc_lazy_initialize (); +@@ -433,7 +557,7 @@ GOACC_update (int device, size_t mapnum, + struct gomp_device_descr *acc_dev = thr->dev; + + if ((acc_dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) +- || host_fallback) ++ || (flags & GOACC_FLAG_HOST_FALLBACK)) + return; + + if (num_waits) +@@ -447,6 +571,7 @@ GOACC_update (int device, size_t mapnum, + + acc_dev->openacc.async_set_async_func (async); + ++ bool update_device = false; + for (i = 0; i < mapnum; ++i) + { + unsigned char kind = kinds[i] & 0xff; +@@ -457,11 +582,46 @@ GOACC_update (int device, size_t mapnum, + case GOMP_MAP_TO_PSET: + break; + ++ case GOMP_MAP_ALWAYS_POINTER: ++ if (update_device) ++ { ++ /* Save the contents of the host pointer. */ ++ void *dptr = acc_deviceptr (hostaddrs[i-1]); ++ uintptr_t t = *(uintptr_t *) hostaddrs[i]; ++ ++ /* Update the contents of the host pointer to reflect ++ the value of the allocated device memory in the ++ previous pointer. 
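
Spelled out against the public OpenACC API, the save/patch/restore sequence this comment describes looks roughly as follows. update_device_pointer is a hypothetical, simplified helper: it takes the device address of the pointer's own target, whereas the code below gets it from the preceding mapping (hostaddrs[i-1]). It is a sketch of the technique, not the implementation:

#include <openacc.h>

/* Make the device copy of the host pointer *HOSTPTR_SLOT point at the
   device mirror of its target: temporarily overwrite the host value with
   the device address, copy sizeof (void *) bytes to the device, then put
   the host value back.  Assumes the target is already mapped.  */
static void
update_device_pointer (void **hostptr_slot)
{
  void *saved = *hostptr_slot;
  *hostptr_slot = acc_deviceptr (saved);	/* Device address of target.  */
  acc_update_device (hostptr_slot, sizeof (void *));
  *hostptr_slot = saved;			/* Restore the host view.  */
}
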
*/ ++ *(uintptr_t *) hostaddrs[i] = (uintptr_t)dptr; ++ acc_update_device (hostaddrs[i], sizeof (uintptr_t)); ++ ++ /* Restore the host pointer. */ ++ *(uintptr_t *) hostaddrs[i] = t; ++ update_device = false; ++ } ++ break; ++ ++ case GOMP_MAP_TO: ++ if (!acc_is_present (hostaddrs[i], sizes[i])) ++ { ++ update_device = false; ++ break; ++ } ++ /* Fallthru */ + case GOMP_MAP_FORCE_TO: ++ update_device = true; + acc_update_device (hostaddrs[i], sizes[i]); + break; + ++ case GOMP_MAP_FROM: ++ if (!acc_is_present (hostaddrs[i], sizes[i])) ++ { ++ update_device = false; ++ break; ++ } ++ /* Fallthru */ + case GOMP_MAP_FORCE_FROM: ++ update_device = false; + acc_update_self (hostaddrs[i], sizes[i]); + break; + +@@ -487,8 +647,8 @@ GOACC_wait (int async, int num_waits, .. + } + else if (async == acc_async_sync) + acc_wait_all (); +- else if (async == acc_async_noval) +- goacc_thread ()->dev->openacc.async_wait_all_async_func (acc_async_noval); ++ else ++ acc_wait_all_async (async); + } + + int +@@ -504,7 +664,7 @@ GOACC_get_thread_num (void) + } + + void +-GOACC_declare (int device, size_t mapnum, ++GOACC_declare (int flags_m, size_t mapnum, + void **hostaddrs, size_t *sizes, unsigned short *kinds) + { + int i; +@@ -522,9 +682,10 @@ GOACC_declare (int device, size_t mapnum + case GOMP_MAP_FORCE_FROM: + case GOMP_MAP_FORCE_TO: + case GOMP_MAP_POINTER: ++ case GOMP_MAP_RELEASE: + case GOMP_MAP_DELETE: +- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], +- &kinds[i], 0, 0); ++ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], GOMP_ASYNC_SYNC, 0); + break; + + case GOMP_MAP_FORCE_DEVICEPTR: +@@ -532,20 +693,19 @@ GOACC_declare (int device, size_t mapnum + + case GOMP_MAP_ALLOC: + if (!acc_is_present (hostaddrs[i], sizes[i])) +- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], +- &kinds[i], 0, 0); ++ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], GOMP_ASYNC_SYNC, 0); + break; + + case GOMP_MAP_TO: +- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], +- &kinds[i], 0, 0); ++ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], GOMP_ASYNC_SYNC, 0); + + break; + + case GOMP_MAP_FROM: +- kinds[i] = GOMP_MAP_FORCE_FROM; +- GOACC_enter_exit_data (device, 1, &hostaddrs[i], &sizes[i], +- &kinds[i], 0, 0); ++ GOACC_enter_exit_data (flags_m, 1, &hostaddrs[i], &sizes[i], ++ &kinds[i], GOMP_ASYNC_SYNC, 0); + break; + + case GOMP_MAP_FORCE_PRESENT: +--- libgomp/openacc2.f90.jj 2019-05-07 19:54:18.828514375 +0200 ++++ libgomp/openacc2.f90 2019-05-07 19:56:38.454296347 +0200 +@@ -0,0 +1,1502 @@ ++! OpenACC Runtime Library Definitions. ++ ++! Copyright (C) 2014-2019 Free Software Foundation, Inc. ++ ++! Contributed by Tobias Burnus ++! and Mentor Embedded. ++ ++! This file is part of the GNU Offloading and Multi Processing Library ++! (libgomp). ++ ++! Libgomp is free software; you can redistribute it and/or modify it ++! under the terms of the GNU General Public License as published by ++! the Free Software Foundation; either version 3, or (at your option) ++! any later version. ++ ++! Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++! FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++! more details. ++ ++! Under Section 7 of GPL version 3, you are granted additional ++! permissions described in the GCC Runtime Library Exception, version ++! 
3.1, as published by the Free Software Foundation. ++ ++! You should have received a copy of the GNU General Public License and ++! a copy of the GCC Runtime Library Exception along with this program; ++! see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++! <http://www.gnu.org/licenses/>. ++ ++module openacc_kinds2 ++ use iso_fortran_env, only: int32 ++ implicit none ++ ++ private :: int32 ++ public :: acc_device_kind ++ ++ integer, parameter :: acc_device_kind = int32 ++ ++ public :: acc_device_none, acc_device_default, acc_device_host ++ public :: acc_device_not_host, acc_device_nvidia ++ ++ ! Keep in sync with include/gomp-constants.h. ++ integer (acc_device_kind), parameter :: acc_device_none = 0 ++ integer (acc_device_kind), parameter :: acc_device_default = 1 ++ integer (acc_device_kind), parameter :: acc_device_host = 2 ++ ! integer (acc_device_kind), parameter :: acc_device_host_nonshm = 3 removed. ++ integer (acc_device_kind), parameter :: acc_device_not_host = 4 ++ integer (acc_device_kind), parameter :: acc_device_nvidia = 5 ++ ++ public :: acc_handle_kind ++ ++ integer, parameter :: acc_handle_kind = int32 ++ ++ public :: acc_async_noval, acc_async_sync ++ ++ ! Keep in sync with include/gomp-constants.h. ++ integer (acc_handle_kind), parameter :: acc_async_noval = -1 ++ integer (acc_handle_kind), parameter :: acc_async_sync = -2 ++ ++end module ++ ++module openacc_internal2 ++ use openacc_kinds2 ++ implicit none ++ ++ interface ++ function acc_get_num_devices_h (d) ++ import ++ integer acc_get_num_devices_h ++ integer (acc_device_kind) d ++ end function ++ ++ subroutine acc_set_device_type_h (d) ++ import ++ integer (acc_device_kind) d ++ end subroutine ++ ++ function acc_get_device_type_h () ++ import ++ integer (acc_device_kind) acc_get_device_type_h ++ end function ++ ++ subroutine acc_set_device_num_h (n, d) ++ import ++ integer n ++ integer (acc_device_kind) d ++ end subroutine ++ ++ function acc_get_device_num_h (d) ++ import ++ integer acc_get_device_num_h ++ integer (acc_device_kind) d ++ end function ++ ++ function acc_async_test_h (a) ++ logical acc_async_test_h ++ integer a ++ end function ++ ++ function acc_async_test_all_h () ++ logical acc_async_test_all_h ++ end function ++ ++ subroutine acc_wait_h (a) ++ integer a ++ end subroutine ++ ++ subroutine acc_wait_async_h (a1, a2) ++ integer a1, a2 ++ end subroutine ++ ++ subroutine acc_wait_all_h () ++ end subroutine ++ ++ subroutine acc_wait_all_async_h (a) ++ integer a ++ end subroutine ++ ++ subroutine acc_init_h (d) ++ import ++ integer (acc_device_kind) d ++ end subroutine ++ ++ subroutine acc_shutdown_h (d) ++ import ++ integer (acc_device_kind) d ++ end subroutine ++ ++ function acc_on_device_h (d) ++ import ++ integer (acc_device_kind) d ++ logical acc_on_device_h ++ end function ++ ++ subroutine acc_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_copyin_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_present_or_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_present_or_copyin_64_h (a, len) ++ 
use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_present_or_copyin_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_create_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_present_or_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_present_or_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_present_or_create_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_copyout_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_copyout_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_copyout_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_copyout_finalize_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_copyout_finalize_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_copyout_finalize_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_delete_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_delete_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_delete_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_delete_finalize_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_delete_finalize_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_delete_finalize_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_update_device_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine 
acc_update_device_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_update_device_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ subroutine acc_update_self_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end subroutine ++ ++ subroutine acc_update_self_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end subroutine ++ ++ subroutine acc_update_self_array_h (a) ++ type (*), dimension (..), contiguous :: a ++ end subroutine ++ ++ function acc_is_present_32_h (a, len) ++ use iso_c_binding, only: c_int32_t ++ logical acc_is_present_32_h ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ end function ++ ++ function acc_is_present_64_h (a, len) ++ use iso_c_binding, only: c_int64_t ++ logical acc_is_present_64_h ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ end function ++ ++ function acc_is_present_array_h (a) ++ logical acc_is_present_array_h ++ type (*), dimension (..), contiguous :: a ++ end function ++ ++ subroutine acc_copyin_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_copyin_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_copyin_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_create_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_create_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_create_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_copyout_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_copyout_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine 
acc_copyout_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_delete_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_delete_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_delete_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_device_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_device_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_device_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_self_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_self_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ end subroutine ++ ++ subroutine acc_update_self_async_array_h (a, async) ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ end subroutine ++ end interface ++ ++ interface ++ function acc_get_num_devices_l (d) & ++ bind (C, name = "acc_get_num_devices") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_get_num_devices_l ++ integer (c_int), value :: d ++ end function ++ ++ subroutine acc_set_device_type_l (d) & ++ bind (C, name = "acc_set_device_type") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: d ++ end subroutine ++ ++ function acc_get_device_type_l () & ++ bind (C, name = "acc_get_device_type") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_get_device_type_l ++ end function ++ ++ subroutine acc_set_device_num_l (n, d) & ++ bind (C, name = "acc_set_device_num") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: n, d ++ end subroutine ++ ++ function acc_get_device_num_l (d) & ++ bind (C, name = "acc_get_device_num") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_get_device_num_l ++ integer (c_int), value :: d ++ end function ++ ++ function acc_async_test_l (a) & ++ bind (C, 
name = "acc_async_test") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_async_test_l ++ integer (c_int), value :: a ++ end function ++ ++ function acc_async_test_all_l () & ++ bind (C, name = "acc_async_test_all") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_async_test_all_l ++ end function ++ ++ subroutine acc_wait_l (a) & ++ bind (C, name = "acc_wait") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: a ++ end subroutine ++ ++ subroutine acc_wait_async_l (a1, a2) & ++ bind (C, name = "acc_wait_async") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: a1, a2 ++ end subroutine ++ ++ subroutine acc_wait_all_l () & ++ bind (C, name = "acc_wait_all") ++ use iso_c_binding, only: c_int ++ end subroutine ++ ++ subroutine acc_wait_all_async_l (a) & ++ bind (C, name = "acc_wait_all_async") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: a ++ end subroutine ++ ++ subroutine acc_init_l (d) & ++ bind (C, name = "acc_init") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: d ++ end subroutine ++ ++ subroutine acc_shutdown_l (d) & ++ bind (C, name = "acc_shutdown") ++ use iso_c_binding, only: c_int ++ integer (c_int), value :: d ++ end subroutine ++ ++ function acc_on_device_l (d) & ++ bind (C, name = "acc_on_device") ++ use iso_c_binding, only: c_int ++ integer (c_int) :: acc_on_device_l ++ integer (c_int), value :: d ++ end function ++ ++ subroutine acc_copyin_l (a, len) & ++ bind (C, name = "acc_copyin") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_present_or_copyin_l (a, len) & ++ bind (C, name = "acc_present_or_copyin") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_create_l (a, len) & ++ bind (C, name = "acc_create") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_present_or_create_l (a, len) & ++ bind (C, name = "acc_present_or_create") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_copyout_l (a, len) & ++ bind (C, name = "acc_copyout") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_copyout_finalize_l (a, len) & ++ bind (C, name = "acc_copyout_finalize") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_delete_l (a, len) & ++ bind (C, name = "acc_delete") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_delete_finalize_l (a, len) & ++ bind (C, name = "acc_delete_finalize") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_update_device_l (a, len) & ++ bind (C, name = "acc_update_device") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a 
++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ subroutine acc_update_self_l (a, len) & ++ bind (C, name = "acc_update_self") ++ use iso_c_binding, only: c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end subroutine ++ ++ function acc_is_present_l (a, len) & ++ bind (C, name = "acc_is_present") ++ use iso_c_binding, only: c_int32_t, c_size_t ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ integer (c_int32_t) :: acc_is_present_l ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ end function ++ ++ subroutine acc_copyin_async_l (a, len, async) & ++ bind (C, name = "acc_copyin_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ ++ subroutine acc_create_async_l (a, len, async) & ++ bind (C, name = "acc_create_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ ++ subroutine acc_copyout_async_l (a, len, async) & ++ bind (C, name = "acc_copyout_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ ++ subroutine acc_delete_async_l (a, len, async) & ++ bind (C, name = "acc_delete_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ ++ subroutine acc_update_device_async_l (a, len, async) & ++ bind (C, name = "acc_update_device_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ ++ subroutine acc_update_self_async_l (a, len, async) & ++ bind (C, name = "acc_update_self_async") ++ use iso_c_binding, only: c_size_t, c_int ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_size_t), value :: len ++ integer (c_int), value :: async ++ end subroutine ++ end interface ++end module ++ ++module openacc2 ++ use openacc_kinds2 ++ use openacc_internal2 ++ implicit none ++ ++ public :: openacc_version ++ ++ public :: acc_get_num_devices, acc_set_device_type, acc_get_device_type ++ public :: acc_set_device_num, acc_get_device_num, acc_async_test ++ public :: acc_async_test_all ++ public :: acc_wait, acc_async_wait, acc_wait_async ++ public :: acc_wait_all, acc_async_wait_all, acc_wait_all_async ++ public :: acc_init, acc_shutdown, acc_on_device ++ public :: acc_copyin, acc_present_or_copyin, acc_pcopyin, acc_create ++ public :: acc_present_or_create, acc_pcreate, acc_copyout, acc_delete ++ public :: acc_update_device, acc_update_self, acc_is_present ++ public :: acc_copyin_async, acc_create_async, acc_copyout_async ++ public :: acc_delete_async, acc_update_device_async, acc_update_self_async ++ ++ integer, parameter :: openacc_version = 201306 ++ ++ interface acc_get_num_devices ++ procedure :: acc_get_num_devices_h ++ end interface ++ ++ interface acc_set_device_type ++ procedure :: acc_set_device_type_h ++ end interface ++ ++ interface acc_get_device_type ++ 
procedure :: acc_get_device_type_h ++ end interface ++ ++ interface acc_set_device_num ++ procedure :: acc_set_device_num_h ++ end interface ++ ++ interface acc_get_device_num ++ procedure :: acc_get_device_num_h ++ end interface ++ ++ interface acc_async_test ++ procedure :: acc_async_test_h ++ end interface ++ ++ interface acc_async_test_all ++ procedure :: acc_async_test_all_h ++ end interface ++ ++ interface acc_wait ++ procedure :: acc_wait_h ++ end interface ++ ++ ! acc_async_wait is an OpenACC 1.0 compatibility name for acc_wait. ++ interface acc_async_wait ++ procedure :: acc_wait_h ++ end interface ++ ++ interface acc_wait_async ++ procedure :: acc_wait_async_h ++ end interface ++ ++ interface acc_wait_all ++ procedure :: acc_wait_all_h ++ end interface ++ ++ ! acc_async_wait_all is an OpenACC 1.0 compatibility name for acc_wait_all. ++ interface acc_async_wait_all ++ procedure :: acc_wait_all_h ++ end interface ++ ++ interface acc_wait_all_async ++ procedure :: acc_wait_all_async_h ++ end interface ++ ++ interface acc_init ++ procedure :: acc_init_h ++ end interface ++ ++ interface acc_shutdown ++ procedure :: acc_shutdown_h ++ end interface ++ ++ interface acc_on_device ++ procedure :: acc_on_device_h ++ end interface ++ ++ ! acc_malloc: Only available in C/C++ ++ ! acc_free: Only available in C/C++ ++ ++ ! As a vendor extension, the following code supports both 32-bit and 64-bit ++ ! arguments for "size"; the OpenACC standard only permits default-kind ++ ! integers, which are of kind 4 (i.e. 32 bits). ++ ! Additionally, the two-argument version also takes arrays as arguments, ++ ! and the one-argument version also scalars. Note that the code assumes ++ ! that the arrays are contiguous. ++ ++ interface acc_copyin ++ procedure :: acc_copyin_32_h ++ procedure :: acc_copyin_64_h ++ procedure :: acc_copyin_array_h ++ end interface ++ ++ interface acc_present_or_copyin ++ procedure :: acc_present_or_copyin_32_h ++ procedure :: acc_present_or_copyin_64_h ++ procedure :: acc_present_or_copyin_array_h ++ end interface ++ ++ interface acc_pcopyin ++ procedure :: acc_present_or_copyin_32_h ++ procedure :: acc_present_or_copyin_64_h ++ procedure :: acc_present_or_copyin_array_h ++ end interface ++ ++ interface acc_create ++ procedure :: acc_create_32_h ++ procedure :: acc_create_64_h ++ procedure :: acc_create_array_h ++ end interface ++ ++ interface acc_present_or_create ++ procedure :: acc_present_or_create_32_h ++ procedure :: acc_present_or_create_64_h ++ procedure :: acc_present_or_create_array_h ++ end interface ++ ++ interface acc_pcreate ++ procedure :: acc_present_or_create_32_h ++ procedure :: acc_present_or_create_64_h ++ procedure :: acc_present_or_create_array_h ++ end interface ++ ++ interface acc_copyout ++ procedure :: acc_copyout_32_h ++ procedure :: acc_copyout_64_h ++ procedure :: acc_copyout_array_h ++ end interface ++ ++ interface acc_copyout_finalize ++ procedure :: acc_copyout_finalize_32_h ++ procedure :: acc_copyout_finalize_64_h ++ procedure :: acc_copyout_finalize_array_h ++ end interface ++ ++ interface acc_delete ++ procedure :: acc_delete_32_h ++ procedure :: acc_delete_64_h ++ procedure :: acc_delete_array_h ++ end interface ++ ++ interface acc_delete_finalize ++ procedure :: acc_delete_finalize_32_h ++ procedure :: acc_delete_finalize_64_h ++ procedure :: acc_delete_finalize_array_h ++ end interface ++ ++ interface acc_update_device ++ procedure :: acc_update_device_32_h ++ procedure :: acc_update_device_64_h ++ procedure :: acc_update_device_array_h ++ end 
interface ++ ++ interface acc_update_self ++ procedure :: acc_update_self_32_h ++ procedure :: acc_update_self_64_h ++ procedure :: acc_update_self_array_h ++ end interface ++ ++ ! acc_map_data: Only available in C/C++ ++ ! acc_unmap_data: Only available in C/C++ ++ ! acc_deviceptr: Only available in C/C++ ++ ! acc_hostptr: Only available in C/C++ ++ ++ interface acc_is_present ++ procedure :: acc_is_present_32_h ++ procedure :: acc_is_present_64_h ++ procedure :: acc_is_present_array_h ++ end interface ++ ++ ! acc_memcpy_to_device: Only available in C/C++ ++ ! acc_memcpy_from_device: Only available in C/C++ ++ ++ interface acc_copyin_async ++ procedure :: acc_copyin_async_32_h ++ procedure :: acc_copyin_async_64_h ++ procedure :: acc_copyin_async_array_h ++ end interface ++ ++ interface acc_create_async ++ procedure :: acc_create_async_32_h ++ procedure :: acc_create_async_64_h ++ procedure :: acc_create_async_array_h ++ end interface ++ ++ interface acc_copyout_async ++ procedure :: acc_copyout_async_32_h ++ procedure :: acc_copyout_async_64_h ++ procedure :: acc_copyout_async_array_h ++ end interface ++ ++ interface acc_delete_async ++ procedure :: acc_delete_async_32_h ++ procedure :: acc_delete_async_64_h ++ procedure :: acc_delete_async_array_h ++ end interface ++ ++ interface acc_update_device_async ++ procedure :: acc_update_device_async_32_h ++ procedure :: acc_update_device_async_64_h ++ procedure :: acc_update_device_async_array_h ++ end interface ++ ++ interface acc_update_self_async ++ procedure :: acc_update_self_async_32_h ++ procedure :: acc_update_self_async_64_h ++ procedure :: acc_update_self_async_array_h ++ end interface ++ ++end module ++ ++function acc_get_num_devices_h (d) ++ use openacc_internal2, only: acc_get_num_devices_l ++ use openacc_kinds2 ++ integer acc_get_num_devices_h ++ integer (acc_device_kind) d ++ acc_get_num_devices_h = acc_get_num_devices_l (d) ++end function ++ ++subroutine acc_set_device_type_h (d) ++ use openacc_internal2, only: acc_set_device_type_l ++ use openacc_kinds2 ++ integer (acc_device_kind) d ++ call acc_set_device_type_l (d) ++end subroutine ++ ++function acc_get_device_type_h () ++ use openacc_internal2, only: acc_get_device_type_l ++ use openacc_kinds2 ++ integer (acc_device_kind) acc_get_device_type_h ++ acc_get_device_type_h = acc_get_device_type_l () ++end function ++ ++subroutine acc_set_device_num_h (n, d) ++ use openacc_internal2, only: acc_set_device_num_l ++ use openacc_kinds2 ++ integer n ++ integer (acc_device_kind) d ++ call acc_set_device_num_l (n, d) ++end subroutine ++ ++function acc_get_device_num_h (d) ++ use openacc_internal2, only: acc_get_device_num_l ++ use openacc_kinds2 ++ integer acc_get_device_num_h ++ integer (acc_device_kind) d ++ acc_get_device_num_h = acc_get_device_num_l (d) ++end function ++ ++function acc_async_test_h (a) ++ use openacc_internal2, only: acc_async_test_l ++ logical acc_async_test_h ++ integer a ++ if (acc_async_test_l (a) .eq. 1) then ++ acc_async_test_h = .TRUE. ++ else ++ acc_async_test_h = .FALSE. ++ end if ++end function ++ ++function acc_async_test_all_h () ++ use openacc_internal2, only: acc_async_test_all_l ++ logical acc_async_test_all_h ++ if (acc_async_test_all_l () .eq. 1) then ++ acc_async_test_all_h = .TRUE. ++ else ++ acc_async_test_all_h = .FALSE. 
++ end if ++end function ++ ++subroutine acc_wait_h (a) ++ use openacc_internal2, only: acc_wait_l ++ integer a ++ call acc_wait_l (a) ++end subroutine ++ ++subroutine acc_wait_async_h (a1, a2) ++ use openacc_internal2, only: acc_wait_async_l ++ integer a1, a2 ++ call acc_wait_async_l (a1, a2) ++end subroutine ++ ++subroutine acc_wait_all_h () ++ use openacc_internal2, only: acc_wait_all_l ++ call acc_wait_all_l () ++end subroutine ++ ++subroutine acc_wait_all_async_h (a) ++ use openacc_internal2, only: acc_wait_all_async_l ++ integer a ++ call acc_wait_all_async_l (a) ++end subroutine ++ ++subroutine acc_init_h (d) ++ use openacc_internal2, only: acc_init_l ++ use openacc_kinds2 ++ integer (acc_device_kind) d ++ call acc_init_l (d) ++end subroutine ++ ++subroutine acc_shutdown_h (d) ++ use openacc_internal2, only: acc_shutdown_l ++ use openacc_kinds2 ++ integer (acc_device_kind) d ++ call acc_shutdown_l (d) ++end subroutine ++ ++function acc_on_device_h (d) ++ use openacc_internal2, only: acc_on_device_l ++ use openacc_kinds2 ++ integer (acc_device_kind) d ++ logical acc_on_device_h ++ if (acc_on_device_l (d) .eq. 1) then ++ acc_on_device_h = .TRUE. ++ else ++ acc_on_device_h = .FALSE. ++ end if ++end function ++ ++subroutine acc_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_copyin_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_copyin_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyin_array_h (a) ++ use openacc_internal2, only: acc_copyin_l ++ type (*), dimension (..), contiguous :: a ++ call acc_copyin_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_present_or_copyin_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_present_or_copyin_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_present_or_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_copyin_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_present_or_copyin_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_present_or_copyin_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_copyin_array_h (a) ++ use openacc_internal2, only: acc_present_or_copyin_l ++ type (*), dimension (..), contiguous :: a ++ call acc_present_or_copyin_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_create_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_create_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_create_array_h (a) ++ use 
openacc_internal2, only: acc_create_l ++ type (*), dimension (..), contiguous :: a ++ call acc_create_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_present_or_create_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_present_or_create_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_present_or_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_create_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_present_or_create_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_present_or_create_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_present_or_create_array_h (a) ++ use openacc_internal2, only: acc_present_or_create_l ++ type (*), dimension (..), contiguous :: a ++ call acc_present_or_create_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_copyout_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_copyout_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_copyout_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyout_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_copyout_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_copyout_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyout_array_h (a) ++ use openacc_internal2, only: acc_copyout_l ++ type (*), dimension (..), contiguous :: a ++ call acc_copyout_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_copyout_finalize_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_copyout_finalize_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_copyout_finalize_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyout_finalize_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_copyout_finalize_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_copyout_finalize_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_copyout_finalize_array_h (a) ++ use openacc_internal2, only: acc_copyout_finalize_l ++ type (*), dimension (..), contiguous :: a ++ call acc_copyout_finalize_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_delete_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_delete_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_delete_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_delete_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_delete_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_delete_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_delete_array_h (a) ++ use openacc_internal2, only: acc_delete_l ++ type (*), dimension (..), contiguous :: a ++ call acc_delete_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_delete_finalize_32_h (a, len) ++ use iso_c_binding, only: 
c_int32_t, c_size_t ++ use openacc_internal2, only: acc_delete_finalize_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_delete_finalize_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_delete_finalize_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_delete_finalize_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_delete_finalize_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_delete_finalize_array_h (a) ++ use openacc_internal2, only: acc_delete_finalize_l ++ type (*), dimension (..), contiguous :: a ++ call acc_delete_finalize_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_update_device_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_update_device_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_update_device_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_update_device_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_update_device_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_update_device_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_update_device_array_h (a) ++ use openacc_internal2, only: acc_update_device_l ++ type (*), dimension (..), contiguous :: a ++ call acc_update_device_l (a, sizeof (a)) ++end subroutine ++ ++subroutine acc_update_self_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_update_self_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ call acc_update_self_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_update_self_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_update_self_l ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ call acc_update_self_l (a, int (len, kind = c_size_t)) ++end subroutine ++ ++subroutine acc_update_self_array_h (a) ++ use openacc_internal2, only: acc_update_self_l ++ type (*), dimension (..), contiguous :: a ++ call acc_update_self_l (a, sizeof (a)) ++end subroutine ++ ++function acc_is_present_32_h (a, len) ++ use iso_c_binding, only: c_int32_t, c_size_t ++ use openacc_internal2, only: acc_is_present_l ++ logical acc_is_present_32_h ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then ++ acc_is_present_32_h = .TRUE. ++ else ++ acc_is_present_32_h = .FALSE. ++ end if ++end function ++ ++function acc_is_present_64_h (a, len) ++ use iso_c_binding, only: c_int64_t, c_size_t ++ use openacc_internal2, only: acc_is_present_l ++ logical acc_is_present_64_h ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ if (acc_is_present_l (a, int (len, kind = c_size_t)) .eq. 1) then ++ acc_is_present_64_h = .TRUE. ++ else ++ acc_is_present_64_h = .FALSE. 
++ end if ++end function ++ ++function acc_is_present_array_h (a) ++ use openacc_internal2, only: acc_is_present_l ++ logical acc_is_present_array_h ++ type (*), dimension (..), contiguous :: a ++ acc_is_present_array_h = acc_is_present_l (a, sizeof (a)) == 1 ++end function ++ ++subroutine acc_copyin_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_copyin_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_copyin_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: acc_copyin_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_copyin_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_copyin_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_copyin_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_copyin_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_create_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_create_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_create_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: acc_create_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_create_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_create_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_create_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_create_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_copyout_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_copyout_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_copyout_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_copyout_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: acc_copyout_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_copyout_async_l (a, int 
(len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_copyout_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_copyout_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_copyout_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_delete_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_delete_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_delete_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: acc_delete_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_delete_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_delete_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_delete_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_delete_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_device_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_update_device_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_device_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: acc_update_device_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_update_device_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_device_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_update_device_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_update_device_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_self_async_32_h (a, len, async) ++ use iso_c_binding, only: c_int32_t, c_size_t, c_int ++ use openacc_internal2, only: acc_update_self_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int32_t) len ++ integer (acc_handle_kind) async ++ call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_self_async_64_h (a, len, async) ++ use iso_c_binding, only: c_int64_t, c_size_t, c_int ++ use openacc_internal2, only: 
acc_update_self_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ !GCC$ ATTRIBUTES NO_ARG_CHECK :: a ++ type (*), dimension (*) :: a ++ integer (c_int64_t) len ++ integer (acc_handle_kind) async ++ call acc_update_self_async_l (a, int (len, kind = c_size_t), int (async, kind = c_int)) ++end subroutine ++ ++subroutine acc_update_self_async_array_h (a, async) ++ use iso_c_binding, only: c_int ++ use openacc_internal2, only: acc_update_self_async_l ++ use openacc_kinds2, only: acc_handle_kind ++ type (*), dimension (..), contiguous :: a ++ integer (acc_handle_kind) async ++ call acc_update_self_async_l (a, sizeof (a), int (async, kind = c_int)) ++end subroutine +--- libgomp/taskloop.c.jj 2018-04-25 09:40:31.913655581 +0200 ++++ libgomp/taskloop.c 2019-05-07 18:46:36.547109400 +0200 +@@ -149,11 +149,28 @@ GOMP_taskloop (void (*fn) (void *), void + + if (flags & GOMP_TASK_FLAG_NOGROUP) + { +- if (thr->task && thr->task->taskgroup && thr->task->taskgroup->cancelled) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) ++ && thr->task ++ && thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } + } + else +- ialias_call (GOMP_taskgroup_start) (); ++ { ++ ialias_call (GOMP_taskgroup_start) (); ++ if (flags & GOMP_TASK_FLAG_REDUCTION) ++ { ++ struct gomp_data_head { TYPE t1, t2; uintptr_t *ptr; }; ++ uintptr_t *ptr = ((struct gomp_data_head *) data)->ptr; ++ ialias_call (GOMP_taskgroup_reduction_register) (ptr); ++ } ++ } + + if (priority > gomp_max_task_priority_var) + priority = gomp_max_task_priority_var; +@@ -284,19 +301,31 @@ GOMP_taskloop (void (*fn) (void *), void + gomp_mutex_lock (&team->task_lock); + /* If parallel or taskgroup has been cancelled, don't start new + tasks. 
*/ +- if (__builtin_expect ((gomp_team_barrier_cancelled (&team->barrier) +- || (taskgroup && taskgroup->cancelled)) +- && cpyfn == NULL, 0)) ++ if (__builtin_expect (gomp_cancel_var, 0) ++ && cpyfn == NULL) + { +- gomp_mutex_unlock (&team->task_lock); +- for (i = 0; i < num_tasks; i++) ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ { ++ do_cancel: ++ gomp_mutex_unlock (&team->task_lock); ++ for (i = 0; i < num_tasks; i++) ++ { ++ gomp_finish_task (tasks[i]); ++ free (tasks[i]); ++ } ++ if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) ++ ialias_call (GOMP_taskgroup_end) (); ++ return; ++ } ++ if (taskgroup) + { +- gomp_finish_task (tasks[i]); +- free (tasks[i]); ++ if (taskgroup->cancelled) ++ goto do_cancel; ++ if (taskgroup->workshare ++ && taskgroup->prev ++ && taskgroup->prev->cancelled) ++ goto do_cancel; + } +- if ((flags & GOMP_TASK_FLAG_NOGROUP) == 0) +- ialias_call (GOMP_taskgroup_end) (); +- return; + } + if (taskgroup) + taskgroup->num_children += num_tasks; +--- libgomp/parallel.c.jj 2018-04-25 09:40:31.926655587 +0200 ++++ libgomp/parallel.c 2019-05-07 18:46:36.532109640 +0200 +@@ -123,7 +123,8 @@ void + GOMP_parallel_start (void (*fn) (void *), void *data, unsigned num_threads) + { + num_threads = gomp_resolve_num_threads (num_threads, 0); +- gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads)); ++ gomp_team_start (fn, data, num_threads, 0, gomp_new_team (num_threads), ++ NULL); + } + + void +@@ -161,14 +162,33 @@ GOMP_parallel_end (void) + ialias (GOMP_parallel_end) + + void +-GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, unsigned int flags) ++GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, ++ unsigned int flags) + { + num_threads = gomp_resolve_num_threads (num_threads, 0); +- gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads)); ++ gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads), ++ NULL); + fn (data); + ialias_call (GOMP_parallel_end) (); + } + ++unsigned ++GOMP_parallel_reductions (void (*fn) (void *), void *data, ++ unsigned num_threads, unsigned int flags) ++{ ++ struct gomp_taskgroup *taskgroup; ++ num_threads = gomp_resolve_num_threads (num_threads, 0); ++ uintptr_t *rdata = *(uintptr_t **)data; ++ taskgroup = gomp_parallel_reduction_register (rdata, num_threads); ++ gomp_team_start (fn, data, num_threads, flags, gomp_new_team (num_threads), ++ taskgroup); ++ fn (data); ++ ialias_call (GOMP_parallel_end) (); ++ gomp_sem_destroy (&taskgroup->taskgroup_sem); ++ free (taskgroup); ++ return num_threads; ++} ++ + bool + GOMP_cancellation_point (int which) + { +@@ -185,8 +205,15 @@ GOMP_cancellation_point (int which) + } + else if (which & GOMP_CANCEL_TASKGROUP) + { +- if (thr->task->taskgroup && thr->task->taskgroup->cancelled) +- return true; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return true; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return true; ++ } + /* FALLTHRU into the GOMP_CANCEL_PARALLEL case, + as #pragma omp cancel parallel also cancels all explicit + tasks. 
*/ +@@ -218,11 +245,17 @@ GOMP_cancel (int which, bool do_cancel) + } + else if (which & GOMP_CANCEL_TASKGROUP) + { +- if (thr->task->taskgroup && !thr->task->taskgroup->cancelled) ++ if (thr->task->taskgroup) + { +- gomp_mutex_lock (&team->task_lock); +- thr->task->taskgroup->cancelled = true; +- gomp_mutex_unlock (&team->task_lock); ++ struct gomp_taskgroup *taskgroup = thr->task->taskgroup; ++ if (taskgroup->workshare && taskgroup->prev) ++ taskgroup = taskgroup->prev; ++ if (!taskgroup->cancelled) ++ { ++ gomp_mutex_lock (&team->task_lock); ++ taskgroup->cancelled = true; ++ gomp_mutex_unlock (&team->task_lock); ++ } + } + return true; + } +--- libgomp/oacc-plugin.h.jj 2018-04-25 09:40:31.322655307 +0200 ++++ libgomp/oacc-plugin.h 2019-05-07 18:46:36.531109656 +0200 +@@ -29,5 +29,6 @@ + + extern void GOMP_PLUGIN_async_unmap_vars (void *, int); + extern void *GOMP_PLUGIN_acc_thread (void); ++extern int GOMP_PLUGIN_acc_default_dim (unsigned int); + + #endif +--- libgomp/target.c.jj 2018-04-25 09:40:31.912655580 +0200 ++++ libgomp/target.c 2019-05-07 19:07:21.032306327 +0200 +@@ -180,16 +180,22 @@ gomp_device_copy (struct gomp_device_des + /* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses) + host to device memory transfers. */ + ++struct gomp_coalesce_chunk ++{ ++ /* The starting and ending point of a coalesced chunk of memory. */ ++ size_t start, end; ++}; ++ + struct gomp_coalesce_buf + { + /* Buffer into which gomp_copy_host2dev will memcpy data and from which + it will be copied to the device. */ + void *buf; + struct target_mem_desc *tgt; +- /* Array with offsets, chunks[2 * i] is the starting offset and +- chunks[2 * i + 1] ending offset relative to tgt->tgt_start device address ++ /* Array with offsets, chunks[i].start is the starting offset and ++ chunks[i].end ending offset relative to tgt->tgt_start device address + of chunks which are to be copied to buf and later copied to device. */ +- size_t *chunks; ++ struct gomp_coalesce_chunk *chunks; + /* Number of chunks in chunks array, or -1 if coalesce buffering should not + be performed. 
*/ + long chunk_cnt; +@@ -222,14 +228,14 @@ gomp_coalesce_buf_add (struct gomp_coale + { + if (cbuf->chunk_cnt < 0) + return; +- if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1]) ++ if (start < cbuf->chunks[cbuf->chunk_cnt - 1].end) + { + cbuf->chunk_cnt = -1; + return; + } +- if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1] + MAX_COALESCE_BUF_GAP) ++ if (start < cbuf->chunks[cbuf->chunk_cnt - 1].end + MAX_COALESCE_BUF_GAP) + { +- cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len; ++ cbuf->chunks[cbuf->chunk_cnt - 1].end = start + len; + cbuf->use_cnt++; + return; + } +@@ -239,8 +245,8 @@ gomp_coalesce_buf_add (struct gomp_coale + if (cbuf->use_cnt == 1) + cbuf->chunk_cnt--; + } +- cbuf->chunks[2 * cbuf->chunk_cnt] = start; +- cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len; ++ cbuf->chunks[cbuf->chunk_cnt].start = start; ++ cbuf->chunks[cbuf->chunk_cnt].end = start + len; + cbuf->chunk_cnt++; + cbuf->use_cnt = 1; + } +@@ -271,20 +277,20 @@ gomp_copy_host2dev (struct gomp_device_d + if (cbuf) + { + uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start; +- if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1]) ++ if (doff < cbuf->chunks[cbuf->chunk_cnt - 1].end) + { + long first = 0; + long last = cbuf->chunk_cnt - 1; + while (first <= last) + { + long middle = (first + last) >> 1; +- if (cbuf->chunks[2 * middle + 1] <= doff) ++ if (cbuf->chunks[middle].end <= doff) + first = middle + 1; +- else if (cbuf->chunks[2 * middle] <= doff) ++ else if (cbuf->chunks[middle].start <= doff) + { +- if (doff + sz > cbuf->chunks[2 * middle + 1]) ++ if (doff + sz > cbuf->chunks[middle].end) + gomp_fatal ("internal libgomp cbuf error"); +- memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]), ++ memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0].start), + h, sz); + return; + } +@@ -510,8 +516,8 @@ gomp_map_vars (struct gomp_device_descr + cbuf.buf = NULL; + if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET) + { +- cbuf.chunks +- = (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t)); ++ size_t chunks_size = (mapnum + 1) * sizeof (struct gomp_coalesce_chunk); ++ cbuf.chunks = (struct gomp_coalesce_chunk *) gomp_alloca (chunks_size); + cbuf.chunk_cnt = 0; + } + if (pragma_kind == GOMP_MAP_VARS_TARGET) +@@ -521,8 +527,8 @@ gomp_map_vars (struct gomp_device_descr + tgt_size = mapnum * sizeof (void *); + cbuf.chunk_cnt = 1; + cbuf.use_cnt = 1 + (mapnum > 1); +- cbuf.chunks[0] = 0; +- cbuf.chunks[1] = tgt_size; ++ cbuf.chunks[0].start = 0; ++ cbuf.chunks[0].end = tgt_size; + } + + gomp_mutex_lock (&devicep->lock); +@@ -707,7 +713,7 @@ gomp_map_vars (struct gomp_device_descr + if (cbuf.chunk_cnt > 0) + { + cbuf.buf +- = malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]); ++ = malloc (cbuf.chunks[cbuf.chunk_cnt - 1].end - cbuf.chunks[0].start); + if (cbuf.buf) + { + cbuf.tgt = tgt; +@@ -859,6 +865,7 @@ gomp_map_vars (struct gomp_device_descr + tgt->list[i].offset = 0; + tgt->list[i].length = k->host_end - k->host_start; + k->refcount = 1; ++ k->dynamic_refcount = 0; + tgt->refcount++; + array->left = NULL; + array->right = NULL; +@@ -956,9 +963,10 @@ gomp_map_vars (struct gomp_device_descr + /* Set link pointer on target to the device address of the + mapped object. */ + void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset); +- devicep->host2dev_func (devicep->target_id, +- (void *) n->tgt_offset, +- &tgt_addr, sizeof (void *)); ++ /* We intentionally do not use coalescing here, as it's not ++ data allocated by the current call to this function. 
*/ ++ gomp_copy_host2dev (devicep, (void *) n->tgt_offset, ++ &tgt_addr, sizeof (void *), NULL); + } + array++; + } +@@ -981,10 +989,14 @@ gomp_map_vars (struct gomp_device_descr + { + long c = 0; + for (c = 0; c < cbuf.chunk_cnt; ++c) +- gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cbuf.chunks[2 * c]), +- (char *) cbuf.buf + (cbuf.chunks[2 * c] - cbuf.chunks[0]), +- cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL); ++ gomp_copy_host2dev (devicep, ++ (void *) (tgt->tgt_start + cbuf.chunks[c].start), ++ (char *) cbuf.buf + (cbuf.chunks[c].start ++ - cbuf.chunks[0].start), ++ cbuf.chunks[c].end - cbuf.chunks[c].start, NULL); + free (cbuf.buf); ++ cbuf.buf = NULL; ++ cbufp = NULL; + } + + /* If the variable from "omp target enter data" map-list was already mapped, +@@ -1011,6 +1023,23 @@ gomp_unmap_tgt (struct target_mem_desc * + free (tgt); + } + ++attribute_hidden bool ++gomp_remove_var (struct gomp_device_descr *devicep, splay_tree_key k) ++{ ++ bool is_tgt_unmapped = false; ++ splay_tree_remove (&devicep->mem_map, k); ++ if (k->link_key) ++ splay_tree_insert (&devicep->mem_map, (splay_tree_node) k->link_key); ++ if (k->tgt->refcount > 1) ++ k->tgt->refcount--; ++ else ++ { ++ is_tgt_unmapped = true; ++ gomp_unmap_tgt (k->tgt); ++ } ++ return is_tgt_unmapped; ++} ++ + /* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant + variables back from device to host: if it is false, it is assumed that this + has been done already. */ +@@ -1059,16 +1088,7 @@ gomp_unmap_vars (struct target_mem_desc + + tgt->list[i].offset), + tgt->list[i].length); + if (do_unmap) +- { +- splay_tree_remove (&devicep->mem_map, k); +- if (k->link_key) +- splay_tree_insert (&devicep->mem_map, +- (splay_tree_node) k->link_key); +- if (k->tgt->refcount > 1) +- k->tgt->refcount--; +- else +- gomp_unmap_tgt (k->tgt); +- } ++ gomp_remove_var (devicep, k); + } + + if (tgt->refcount > 1) +@@ -1298,17 +1318,7 @@ gomp_unload_image_from_device (struct go + else + { + splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &k); +- splay_tree_remove (&devicep->mem_map, n); +- if (n->link_key) +- { +- if (n->tgt->refcount > 1) +- n->tgt->refcount--; +- else +- { +- is_tgt_unmapped = true; +- gomp_unmap_tgt (n->tgt); +- } +- } ++ is_tgt_unmapped = gomp_remove_var (devicep, n); + } + } + +@@ -1855,11 +1865,20 @@ GOMP_target_update_ext (int device, size + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new + tasks. */ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup +- && thr->task->taskgroup->cancelled))) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } + + gomp_task_maybe_wait_for_dependencies (depend); + } +@@ -1874,10 +1893,20 @@ GOMP_target_update_ext (int device, size + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new tasks. 
*/ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } + + gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, true); + } +@@ -1986,11 +2015,20 @@ GOMP_target_enter_exit_data (int device, + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new + tasks. */ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup +- && thr->task->taskgroup->cancelled))) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } + + gomp_task_maybe_wait_for_dependencies (depend); + } +@@ -2005,10 +2043,20 @@ GOMP_target_enter_exit_data (int device, + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; + /* If parallel or taskgroup has been cancelled, don't start new tasks. */ +- if (team +- && (gomp_team_barrier_cancelled (&team->barrier) +- || (thr->task->taskgroup && thr->task->taskgroup->cancelled))) +- return; ++ if (__builtin_expect (gomp_cancel_var, 0) && team) ++ { ++ if (gomp_team_barrier_cancelled (&team->barrier)) ++ return; ++ if (thr->task->taskgroup) ++ { ++ if (thr->task->taskgroup->cancelled) ++ return; ++ if (thr->task->taskgroup->workshare ++ && thr->task->taskgroup->prev ++ && thr->task->taskgroup->prev->cancelled) ++ return; ++ } ++ } + + size_t i; + if ((flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0) +@@ -2197,8 +2245,9 @@ omp_target_is_present (void *ptr, int de + } + + int +-omp_target_memcpy (void *dst, void *src, size_t length, size_t dst_offset, +- size_t src_offset, int dst_device_num, int src_device_num) ++omp_target_memcpy (void *dst, void *src, size_t length, ++ size_t dst_offset, size_t src_offset, int dst_device_num, ++ int src_device_num) + { + struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL; + bool ret; +@@ -2287,21 +2336,25 @@ omp_target_memcpy_rect_worker (void *dst + return EINVAL; + if (dst_devicep == NULL && src_devicep == NULL) + { +- memcpy ((char *) dst + dst_off, (char *) src + src_off, length); ++ memcpy ((char *) dst + dst_off, (char *) src + src_off, ++ length); + ret = 1; + } + else if (src_devicep == NULL) + ret = dst_devicep->host2dev_func (dst_devicep->target_id, + (char *) dst + dst_off, +- (char *) src + src_off, length); ++ (char *) src + src_off, ++ length); + else if (dst_devicep == NULL) + ret = src_devicep->dev2host_func (src_devicep->target_id, + (char *) dst + dst_off, +- (char *) src + src_off, length); ++ (char *) src + src_off, ++ length); + else if (src_devicep == dst_devicep) + ret = src_devicep->dev2dev_func (src_devicep->target_id, + (char *) dst + dst_off, +- (char *) src + src_off, length); ++ (char *) src + src_off, ++ length); + else + ret = 0; + return ret ? 
0 : EINVAL; +@@ -2396,8 +2449,8 @@ omp_target_memcpy_rect (void *dst, void + } + + int +-omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size, +- size_t device_offset, int device_num) ++omp_target_associate_ptr (void *host_ptr, void *device_ptr, ++ size_t size, size_t device_offset, int device_num) + { + if (device_num == GOMP_DEVICE_HOST_FALLBACK) + return EINVAL; +@@ -2499,6 +2552,31 @@ omp_target_disassociate_ptr (void *ptr, + return ret; + } + ++int ++omp_pause_resource (omp_pause_resource_t kind, int device_num) ++{ ++ (void) kind; ++ if (device_num == GOMP_DEVICE_HOST_FALLBACK) ++ return gomp_pause_host (); ++ if (device_num < 0 || device_num >= gomp_get_num_devices ()) ++ return -1; ++ /* Do nothing for target devices for now. */ ++ return 0; ++} ++ ++int ++omp_pause_resource_all (omp_pause_resource_t kind) ++{ ++ (void) kind; ++ if (gomp_pause_host ()) ++ return -1; ++ /* Do nothing for target devices for now. */ ++ return 0; ++} ++ ++ialias (omp_pause_resource) ++ialias (omp_pause_resource_all) ++ + #ifdef PLUGIN_SUPPORT + + /* This function tries to load a plugin for DEVICE. Name of plugin is passed +@@ -2632,9 +2710,9 @@ gomp_target_fini (void) + } + } + +-/* This function initializes the runtime needed for offloading. +- It parses the list of offload targets and tries to load the plugins for +- these targets. On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP ++/* This function initializes the runtime for offloading. ++ It parses the list of offload plugins, and tries to load these. ++ On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP + will be set, and the array DEVICES initialized, containing descriptors for + corresponding devices, first the GOMP_OFFLOAD_CAP_OPENMP_400 ones, follows + by the others. */ +@@ -2651,7 +2729,7 @@ gomp_target_init (void) + num_devices = 0; + devices = NULL; + +- cur = OFFLOAD_TARGETS; ++ cur = OFFLOAD_PLUGINS; + if (*cur) + do + { +--- libgomp/ordered.c.jj 2018-04-25 09:40:31.926655587 +0200 ++++ libgomp/ordered.c 2019-05-07 18:46:36.532109640 +0200 +@@ -259,7 +259,8 @@ GOMP_ordered_end (void) + #define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__) + + void +-gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size) ++gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size, ++ size_t extra) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; +@@ -269,13 +270,24 @@ gomp_doacross_init (unsigned ncounts, lo + struct gomp_doacross_work_share *doacross; + + if (team == NULL || team->nthreads == 1) +- return; ++ { ++ empty: ++ if (!extra) ++ ws->doacross = NULL; ++ else ++ { ++ doacross = gomp_malloc_cleared (sizeof (*doacross) + extra); ++ doacross->extra = (void *) (doacross + 1); ++ ws->doacross = doacross; ++ } ++ return; ++ } + + for (i = 0; i < ncounts; i++) + { + /* If any count is 0, GOMP_doacross_{post,wait} can't be called. 
*/ + if (counts[i] == 0) +- return; ++ goto empty; + + if (num_bits <= MAX_COLLAPSED_BITS) + { +@@ -314,7 +326,7 @@ gomp_doacross_init (unsigned ncounts, lo + elt_sz = (elt_sz + 63) & ~63UL; + + doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz +- + shift_sz); ++ + shift_sz + extra); + doacross->chunk_size = chunk_size; + doacross->elt_sz = elt_sz; + doacross->ncounts = ncounts; +@@ -322,6 +334,13 @@ gomp_doacross_init (unsigned ncounts, lo + doacross->array = (unsigned char *) + ((((uintptr_t) (doacross + 1)) + 63 + shift_sz) + & ~(uintptr_t) 63); ++ if (extra) ++ { ++ doacross->extra = doacross->array + num_ents * elt_sz; ++ memset (doacross->extra, '\0', extra); ++ } ++ else ++ doacross->extra = NULL; + if (num_bits <= MAX_COLLAPSED_BITS) + { + unsigned int shift_count = 0; +@@ -360,7 +379,8 @@ GOMP_doacross_post (long *counts) + unsigned long ent; + unsigned int i; + +- if (__builtin_expect (doacross == NULL, 0)) ++ if (__builtin_expect (doacross == NULL, 0) ++ || __builtin_expect (doacross->array == NULL, 0)) + { + __sync_synchronize (); + return; +@@ -411,7 +431,8 @@ GOMP_doacross_wait (long first, ...) + unsigned long ent; + unsigned int i; + +- if (__builtin_expect (doacross == NULL, 0)) ++ if (__builtin_expect (doacross == NULL, 0) ++ || __builtin_expect (doacross->array == NULL, 0)) + { + __sync_synchronize (); + return; +@@ -488,7 +509,8 @@ GOMP_doacross_wait (long first, ...) + typedef unsigned long long gomp_ull; + + void +-gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, gomp_ull chunk_size) ++gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts, ++ gomp_ull chunk_size, size_t extra) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; +@@ -498,13 +520,24 @@ gomp_doacross_ull_init (unsigned ncounts + struct gomp_doacross_work_share *doacross; + + if (team == NULL || team->nthreads == 1) +- return; ++ { ++ empty: ++ if (!extra) ++ ws->doacross = NULL; ++ else ++ { ++ doacross = gomp_malloc_cleared (sizeof (*doacross) + extra); ++ doacross->extra = (void *) (doacross + 1); ++ ws->doacross = doacross; ++ } ++ return; ++ } + + for (i = 0; i < ncounts; i++) + { + /* If any count is 0, GOMP_doacross_{post,wait} can't be called. 
*/
+       if (counts[i] == 0)
+-	return;
++	goto empty;
+ 
+       if (num_bits <= MAX_COLLAPSED_BITS)
+	{
+@@ -557,6 +590,13 @@ gomp_doacross_ull_init (unsigned ncounts
+   doacross->array = (unsigned char *)
+		     ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
+		      & ~(uintptr_t) 63);
++  if (extra)
++    {
++      doacross->extra = doacross->array + num_ents * elt_sz;
++      memset (doacross->extra, '\0', extra);
++    }
++  else
++    doacross->extra = NULL;
+   if (num_bits <= MAX_COLLAPSED_BITS)
+     {
+       unsigned int shift_count = 0;
+@@ -595,7 +635,8 @@ GOMP_doacross_ull_post (gomp_ull *counts
+   unsigned long ent;
+   unsigned int i;
+ 
+-  if (__builtin_expect (doacross == NULL, 0))
++  if (__builtin_expect (doacross == NULL, 0)
++      || __builtin_expect (doacross->array == NULL, 0))
+     {
+       __sync_synchronize ();
+       return;
+@@ -667,7 +708,8 @@ GOMP_doacross_ull_wait (gomp_ull first,
+   unsigned long ent;
+   unsigned int i;
+ 
+-  if (__builtin_expect (doacross == NULL, 0))
++  if (__builtin_expect (doacross == NULL, 0)
++      || __builtin_expect (doacross->array == NULL, 0))
+     {
+       __sync_synchronize ();
+       return;
+--- libgomp/alloc.c.jj	2018-04-25 09:40:31.926655587 +0200
++++ libgomp/alloc.c	2019-05-07 18:46:36.336112770 +0200
+@@ -57,3 +57,50 @@ gomp_realloc (void *old, size_t size)
+     gomp_fatal ("Out of memory allocating %lu bytes", (unsigned long) size);
+   return ret;
+ }
++
++void *
++gomp_aligned_alloc (size_t al, size_t size)
++{
++  void *ret;
++  if (al < sizeof (void *))
++    al = sizeof (void *);
++#ifdef HAVE_ALIGNED_ALLOC
++  ret = aligned_alloc (al, size);
++#elif defined(HAVE__ALIGNED_MALLOC)
++  ret = _aligned_malloc (size, al);
++#elif defined(HAVE_POSIX_MEMALIGN)
++  if (posix_memalign (&ret, al, size) != 0)
++    ret = NULL;
++#elif defined(HAVE_MEMALIGN)
++  {
++    extern void *memalign (size_t, size_t);
++    ret = memalign (al, size);
++  }
++#else
++  ret = NULL;
++  if ((al & (al - 1)) == 0 && size)
++    {
++      void *p = malloc (size + al);
++      if (p)
++	{
++	  void *ap = (void *) (((uintptr_t) p + al) & -al);
++	  ((void **) ap)[-1] = p;
++	  ret = ap;
++	}
++    }
++#endif
++  if (ret == NULL)
++    gomp_fatal ("Out of memory allocating %lu bytes", (unsigned long) size);
++  return ret;
++}
++
++void
++gomp_aligned_free (void *ptr)
++{
++#ifdef GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC
++  free (ptr);
++#else
++  if (ptr)
++    free (((void **) ptr)[-1]);
++#endif
++}
+--- libgomp/configure.ac.jj	2018-04-25 09:40:31.321655307 +0200
++++ libgomp/configure.ac	2019-05-07 18:46:36.471110614 +0200
+@@ -219,6 +219,7 @@ m4_include([plugin/configfrag.ac])
+ 
+ # Check for functions needed.
+ AC_CHECK_FUNCS(getloadavg clock_gettime strtoull)
++AC_CHECK_FUNCS(aligned_alloc posix_memalign memalign _aligned_malloc)
+ 
+ # Check for broken semaphore implementation on darwin.
+ # sem_init returns: sem_init error: Function not implemented.
+@@ -266,6 +267,41 @@ if test $ac_cv_func_clock_gettime = no;
+    [Define to 1 if you have the `clock_gettime' function.])])
+ fi
+ 
++# Check for uname.
++AC_COMPILE_IFELSE(
++ [AC_LANG_PROGRAM(
++   [#include <string.h>
++    #include <stdlib.h>
++    #include <sys/utsname.h>],
++   [struct utsname buf;
++    volatile size_t len = 0;
++    if (!uname (buf))
++      len = strlen (buf.nodename);])],
++  AC_DEFINE(HAVE_UNAME, 1,
++[  Define if uname is supported and struct utsname has nodename field.]))
++
++# Check for gethostname.
++AC_COMPILE_IFELSE(
++ [AC_LANG_PROGRAM(
++   [#include <unistd.h>],
++   [
++changequote(,)dnl
++    char buf[256];
++    if (gethostname (buf, sizeof (buf) - 1) == 0)
++      buf[255] = '\0';
++changequote([,])dnl
++   ])],
++  AC_DEFINE(HAVE_GETHOSTNAME, 1,
++[  Define if gethostname is supported.]))
++
++# Check for getpid.
++AC_COMPILE_IFELSE(
++ [AC_LANG_PROGRAM(
++   [#include <unistd.h>],
++   [int pid = getpid ();])],
++  AC_DEFINE(HAVE_GETPID, 1,
++[  Define if getpid is supported.]))
++
+ # See if we support thread-local storage.
+ GCC_CHECK_TLS
+ 
+--- libgomp/icv.c.jj	2018-04-25 09:40:31.870655561 +0200
++++ libgomp/icv.c	2019-05-07 18:46:36.501110134 +0200
+@@ -69,7 +69,7 @@ void
+ omp_set_schedule (omp_sched_t kind, int chunk_size)
+ {
+   struct gomp_task_icv *icv = gomp_icv (true);
+-  switch (kind)
++  switch (kind & ~omp_sched_monotonic)
+     {
+     case omp_sched_static:
+       if (chunk_size < 1)
+--- libgomp/configure.jj	2018-04-25 09:40:31.913655581 +0200
++++ libgomp/configure	2019-05-07 18:47:37.961128420 +0200
+@@ -636,6 +636,8 @@ PLUGIN_NVPTX_FALSE
+ PLUGIN_NVPTX_TRUE
+ offload_additional_lib_paths
+ offload_additional_options
++offload_targets
++offload_plugins
+ PLUGIN_HSA_LIBS
+ PLUGIN_HSA_LDFLAGS
+ PLUGIN_HSA_CPPFLAGS
+@@ -648,7 +650,6 @@ PLUGIN_NVPTX_CPPFLAGS
+ PLUGIN_NVPTX
+ CUDA_DRIVER_LIB
+ CUDA_DRIVER_INCLUDE
+-offload_targets
+ libtool_VERSION
+ ac_ct_FC
+ FCFLAGS
+@@ -11157,7 +11158,7 @@ else
+   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+   lt_status=$lt_dlunknown
+   cat > conftest.$ac_ext <<_LT_EOF
+-#line 11160 "configure"
++#line 11161 "configure"
+ #include "confdefs.h"
+ 
+ #if HAVE_DLFCN_H
+@@ -11263,7 +11264,7 @@ else
+   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
+   lt_status=$lt_dlunknown
+   cat > conftest.$ac_ext <<_LT_EOF
+-#line 11266 "configure"
++#line 11267 "configure"
+ #include "confdefs.h"
+ 
+ #if HAVE_DLFCN_H
+@@ -15167,8 +15168,6 @@ fi
+ # see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ # <http://www.gnu.org/licenses/>.
+ 
+-offload_targets=
+ 
+ plugin_support=yes
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for dlsym in -ldl" >&5
+ $as_echo_n "checking for dlsym in -ldl... " >&6; }
+@@ -15302,7 +15301,11 @@ if test "${with_cuda_driver_lib+set}" =
+ fi
+ 
+ case "x$with_cuda_driver" in
+-  x | xno) ;;
++  x) ;;
++  xno)
++    CUDA_DRIVER_INCLUDE=no
++    CUDA_DRIVER_LIB=no
++    ;;
+   *) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include
+      CUDA_DRIVER_LIB=$with_cuda_driver/lib
+      ;;
+@@ -15313,10 +15316,12 @@ fi
+ if test "x$with_cuda_driver_lib" != x; then
+   CUDA_DRIVER_LIB=$with_cuda_driver_lib
+ fi
+-if test "x$CUDA_DRIVER_INCLUDE" != x; then
++if test "x$CUDA_DRIVER_INCLUDE" != x \
++   && test "x$CUDA_DRIVER_INCLUDE" != xno; then
+   CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE
+ fi
+-if test "x$CUDA_DRIVER_LIB" != x; then
++if test "x$CUDA_DRIVER_LIB" != x \
++   && test "x$CUDA_DRIVER_LIB" != xno; then
+   CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB
+ fi
+ 
+@@ -15383,7 +15388,13 @@ PLUGIN_HSA_LIBS=
+ 
+ 
+ 
+-# Get offload targets and path to install tree of offloading compiler.
++# Parse '--enable-offload-targets', figure out the corresponding libgomp
++# plugins, and configure to find the corresponding offload compilers.
++# 'offload_plugins' and 'offload_targets' will be populated in the same order.
++offload_plugins= ++offload_targets= ++ ++ + offload_additional_options= + offload_additional_lib_paths= + +@@ -15392,25 +15403,27 @@ if test x"$enable_offload_targets" != x; + for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do + tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'` + tgt=`echo $tgt | sed 's/=.*//'` +- tgt_name= ++ tgt_plugin= + case $tgt in + *-intelmic-* | *-intelmicemul-*) +- tgt_name=intelmic ++ tgt_plugin=intelmic + ;; + nvptx*) +- tgt_name=nvptx ++ tgt_plugin=nvptx + PLUGIN_NVPTX=$tgt +- PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS +- PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS +- PLUGIN_NVPTX_LIBS='-lcuda' +- +- PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS +- CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" +- PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS +- LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" +- PLUGIN_NVPTX_save_LIBS=$LIBS +- LIBS="$PLUGIN_NVPTX_LIBS $LIBS" +- cat confdefs.h - <<_ACEOF >conftest.$ac_ext ++ if test "x$CUDA_DRIVER_LIB" != xno \ ++ && test "x$CUDA_DRIVER_LIB" != xno; then ++ PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS ++ PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS ++ PLUGIN_NVPTX_LIBS='-lcuda' ++ ++ PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS ++ CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" ++ PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS ++ LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" ++ PLUGIN_NVPTX_save_LIBS=$LIBS ++ LIBS="$PLUGIN_NVPTX_LIBS $LIBS" ++ cat confdefs.h - <<_ACEOF >conftest.$ac_ext + /* end confdefs.h. */ + #include "cuda.h" + int +@@ -15426,13 +15439,16 @@ if ac_fn_c_try_link "$LINENO"; then : + fi + rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +- CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS +- LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS +- LIBS=$PLUGIN_NVPTX_save_LIBS ++ CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS ++ LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS ++ LIBS=$PLUGIN_NVPTX_save_LIBS ++ fi + case $PLUGIN_NVPTX in + nvptx*) +- if test "x$CUDA_DRIVER_INCLUDE" = x \ +- && test "x$CUDA_DRIVER_LIB" = x; then ++ if (test "x$CUDA_DRIVER_INCLUDE" = x \ ++ || test "x$CUDA_DRIVER_INCLUDE" = xno) \ ++ && (test "x$CUDA_DRIVER_LIB" = x \ ++ || test "x$CUDA_DRIVER_LIB" = xno); then + PLUGIN_NVPTX=1 + PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda' + PLUGIN_NVPTX_LIBS='-ldl' +@@ -15452,7 +15468,7 @@ rm -f core conftest.err conftest.$ac_obj + PLUGIN_HSA=0 + ;; + *) +- tgt_name=hsa ++ tgt_plugin=hsa + PLUGIN_HSA=$tgt + PLUGIN_HSA_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS + PLUGIN_HSA_LDFLAGS="$HSA_RUNTIME_LDFLAGS" +@@ -15470,7 +15486,7 @@ rm -f core conftest.err conftest.$ac_obj + LDFLAGS=$PLUGIN_HSA_save_LDFLAGS + LIBS=$PLUGIN_HSA_save_LIBS + case $PLUGIN_HSA in +- hsa*) ++ hsa*) + HSA_PLUGIN=0 + as_fn_error "HSA run-time package required for HSA support" "$LINENO" 5 + ;; +@@ -15487,16 +15503,19 @@ rm -f core conftest.err conftest.$ac_obj + as_fn_error "unknown offload target specified" "$LINENO" 5 + ;; + esac +- if test x"$tgt_name" = x; then +- # Don't configure libgomp for this offloading target if we don't build +- # the corresponding plugin. ++ if test x"$tgt_plugin" = x; then ++ # Not configuring libgomp for this offload target if we're not building ++ # the corresponding offload plugin. 
+    continue
+-  elif test x"$offload_targets" = x; then
+-    offload_targets=$tgt_name
++  elif test x"$offload_plugins" = x; then
++    offload_plugins=$tgt_plugin
++    offload_targets=$tgt
+   else
+-    offload_targets=$offload_targets,$tgt_name
++    offload_plugins=$offload_plugins,$tgt_plugin
++    offload_targets=$offload_targets,$tgt
+   fi
+-  if test "$tgt_name" = hsa; then
++  # Configure additional search paths.
++  if test "$tgt_plugin" = hsa; then
+     # Offloading compilation is all handled by the target compiler.
+     :
+   elif test x"$tgt_dir" != x; then
+@@ -15510,7 +15529,7 @@ rm -f core conftest.err conftest.$ac_obj
+ fi
+ 
+ cat >>confdefs.h <<_ACEOF
+-#define OFFLOAD_TARGETS "$offload_targets"
++#define OFFLOAD_PLUGINS "$offload_plugins"
+ _ACEOF
+ 
+ if test $PLUGIN_NVPTX = 1; then
+@@ -15570,6 +15589,19 @@ _ACEOF
+ fi
+ done
+ 
++for ac_func in aligned_alloc posix_memalign memalign _aligned_malloc
++do :
++  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
++ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
++eval as_val=\$$as_ac_var
++   if test "x$as_val" = x""yes; then :
++  cat >>confdefs.h <<_ACEOF
++#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
++_ACEOF
++
++fi
++done
++
+ 
+ # Check for broken semaphore implementation on darwin.
+ # sem_init returns: sem_init error: Function not implemented.
+@@ -15784,6 +15816,72 @@ fi
+ 
+ fi
+ 
++# Check for uname.
++cat confdefs.h - <<_ACEOF >conftest.$ac_ext
++/* end confdefs.h.  */
++#include <string.h>
++ #include <stdlib.h>
++ #include <sys/utsname.h>
++int
++main ()
++{
++struct utsname buf;
++ volatile size_t len = 0;
++ if (!uname (buf))
++   len = strlen (buf.nodename);
++  ;
++  return 0;
++}
++_ACEOF
++if ac_fn_c_try_compile "$LINENO"; then :
++
++$as_echo "#define HAVE_UNAME 1" >>confdefs.h
++
++fi
++rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
++
++# Check for gethostname.
++cat confdefs.h - <<_ACEOF >conftest.$ac_ext
++/* end confdefs.h.  */
++#include <unistd.h>
++int
++main ()
++{
++
++ char buf[256];
++ if (gethostname (buf, sizeof (buf) - 1) == 0)
++   buf[255] = '\0';
++
++  ;
++  return 0;
++}
++_ACEOF
++if ac_fn_c_try_compile "$LINENO"; then :
++
++$as_echo "#define HAVE_GETHOSTNAME 1" >>confdefs.h
++
++fi
++rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
++
++# Check for getpid.
++cat confdefs.h - <<_ACEOF >conftest.$ac_ext
++/* end confdefs.h.  */
++#include <unistd.h>
++int
++main ()
++{
++int pid = getpid ();
++  ;
++  return 0;
++}
++_ACEOF
++if ac_fn_c_try_compile "$LINENO"; then :
++
++$as_echo "#define HAVE_GETPID 1" >>confdefs.h
++
++fi
++rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
++
+ # See if we support thread-local storage.
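The three compile checks just above only define HAVE_UNAME, HAVE_GETHOSTNAME and HAVE_GETPID; their consumer is the affinity-display support this patch adds (see the affinity-fmt.c addition in the Makefile.am hunk below). The following is a minimal sketch of how such feature macros are typically consumed, assuming only POSIX; gomp_sketch_node_name is an invented name for illustration, not an actual libgomp symbol.

  /* Hypothetical consumer of the HAVE_* macros probed above;
     len is assumed to be non-zero.  */
  #include <string.h>
  #ifdef HAVE_GETHOSTNAME
  # include <unistd.h>
  #endif
  #ifdef HAVE_UNAME
  # include <sys/utsname.h>
  #endif

  static void
  gomp_sketch_node_name (char *buf, size_t len)
  {
    buf[0] = '\0';
  #if defined(HAVE_GETHOSTNAME)
    /* gethostname need not NUL-terminate on truncation, so do it here.  */
    if (gethostname (buf, len - 1) == 0)
      buf[len - 1] = '\0';
  #elif defined(HAVE_UNAME)
    struct utsname u;
    if (uname (&u) == 0)
      {
        strncpy (buf, u.nodename, len - 1);
        buf[len - 1] = '\0';
      }
  #endif
  }

Preferring gethostname and falling back to uname's nodename is one plausible policy; with neither macro defined the buffer is simply left empty rather than failing.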
+ + +--- libgomp/Makefile.am.jj 2018-04-25 09:40:31.926655587 +0200 ++++ libgomp/Makefile.am 2019-05-07 19:59:03.683989317 +0200 +@@ -63,12 +63,13 @@ libgomp_la_SOURCES = alloc.c atomic.c ba + parallel.c sections.c single.c task.c team.c work.c lock.c mutex.c \ + proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \ + splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \ +- oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c ++ oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \ ++ affinity-fmt.c teams.c + + include $(top_srcdir)/plugin/Makefrag.am + + if USE_FORTRAN +-libgomp_la_SOURCES += openacc.f90 ++libgomp_la_SOURCES += openacc2.f90 + endif + + nodist_noinst_HEADERS = libgomp_f.h +@@ -87,8 +88,6 @@ omp_lib_kinds.mod: omp_lib.mod + : + openacc_kinds.mod: openacc.mod + : +-openacc.mod: openacc.lo +- : + %.mod: %.f90 + $(FC) $(FCFLAGS) -fsyntax-only $< + fortran.lo: libgomp_f.h +--- libgomp/oacc-mem.c.jj 2018-04-25 09:40:31.924655586 +0200 ++++ libgomp/oacc-mem.c 2019-05-07 18:46:36.530109672 +0200 +@@ -153,8 +153,9 @@ acc_free (void *d) + gomp_fatal ("error in freeing device memory in %s", __FUNCTION__); + } + +-void +-acc_memcpy_to_device (void *d, void *h, size_t s) ++static void ++memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, ++ const char *libfnname) + { + /* No need to call lazy open here, as the device pointer must have + been obtained from a routine that did that. */ +@@ -164,31 +165,49 @@ acc_memcpy_to_device (void *d, void *h, + + if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) + { +- memmove (d, h, s); ++ if (from) ++ memmove (h, d, s); ++ else ++ memmove (d, h, s); + return; + } + +- if (!thr->dev->host2dev_func (thr->dev->target_id, d, h, s)) +- gomp_fatal ("error in %s", __FUNCTION__); ++ if (async > acc_async_sync) ++ thr->dev->openacc.async_set_async_func (async); ++ ++ bool ret = (from ++ ? thr->dev->dev2host_func (thr->dev->target_id, h, d, s) ++ : thr->dev->host2dev_func (thr->dev->target_id, d, h, s)); ++ ++ if (async > acc_async_sync) ++ thr->dev->openacc.async_set_async_func (acc_async_sync); ++ ++ if (!ret) ++ gomp_fatal ("error in %s", libfnname); + } + + void +-acc_memcpy_from_device (void *h, void *d, size_t s) ++acc_memcpy_to_device (void *d, void *h, size_t s) + { +- /* No need to call lazy open here, as the device pointer must have +- been obtained from a routine that did that. */ +- struct goacc_thread *thr = goacc_thread (); ++ memcpy_tofrom_device (false, d, h, s, acc_async_sync, __FUNCTION__); ++} + +- assert (thr && thr->dev); ++void ++acc_memcpy_to_device_async (void *d, void *h, size_t s, int async) ++{ ++ memcpy_tofrom_device (false, d, h, s, async, __FUNCTION__); ++} + +- if (thr->dev->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) +- { +- memmove (h, d, s); +- return; +- } ++void ++acc_memcpy_from_device (void *h, void *d, size_t s) ++{ ++ memcpy_tofrom_device (true, d, h, s, acc_async_sync, __FUNCTION__); ++} + +- if (!thr->dev->dev2host_func (thr->dev->target_id, h, d, s)) +- gomp_fatal ("error in %s", __FUNCTION__); ++void ++acc_memcpy_from_device_async (void *h, void *d, size_t s, int async) ++{ ++ memcpy_tofrom_device (true, d, h, s, async, __FUNCTION__); + } + + /* Return the device pointer that corresponds to host data H. 
Or NULL +@@ -347,6 +366,7 @@ acc_map_data (void *h, void *d, size_t s + + tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, &devaddrs, &sizes, + &kinds, true, GOMP_MAP_VARS_OPENACC); ++ tgt->list[0].key->refcount = REFCOUNT_INFINITY; + } + + gomp_mutex_lock (&acc_dev->lock); +@@ -389,6 +409,9 @@ acc_unmap_data (void *h) + (void *) n->host_start, (int) host_size, (void *) h); + } + ++ /* Mark for removal. */ ++ n->refcount = 1; ++ + t = n->tgt; + + if (t->refcount == 2) +@@ -424,7 +447,7 @@ acc_unmap_data (void *h) + #define FLAG_COPY (1 << 2) + + static void * +-present_create_copy (unsigned f, void *h, size_t s) ++present_create_copy (unsigned f, void *h, size_t s, int async) + { + void *d; + splay_tree_key n; +@@ -460,6 +483,11 @@ present_create_copy (unsigned f, void *h + gomp_fatal ("[%p,+%d] not mapped", (void *)h, (int)s); + } + ++ if (n->refcount != REFCOUNT_INFINITY) ++ { ++ n->refcount++; ++ n->dynamic_refcount++; ++ } + gomp_mutex_unlock (&acc_dev->lock); + } + else if (!(f & FLAG_CREATE)) +@@ -481,8 +509,16 @@ present_create_copy (unsigned f, void *h + + gomp_mutex_unlock (&acc_dev->lock); + ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (async); ++ + tgt = gomp_map_vars (acc_dev, mapnum, &hostaddrs, NULL, &s, &kinds, true, + GOMP_MAP_VARS_OPENACC); ++ /* Initialize dynamic refcount. */ ++ tgt->list[0].key->dynamic_refcount = 1; ++ ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (acc_async_sync); + + gomp_mutex_lock (&acc_dev->lock); + +@@ -499,53 +535,71 @@ present_create_copy (unsigned f, void *h + void * + acc_create (void *h, size_t s) + { +- return present_create_copy (FLAG_CREATE, h, s); ++ return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, acc_async_sync); + } + +-void * +-acc_copyin (void *h, size_t s) ++void ++acc_create_async (void *h, size_t s, int async) + { +- return present_create_copy (FLAG_CREATE | FLAG_COPY, h, s); ++ present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s, async); + } + ++/* acc_present_or_create used to be what acc_create is now. */ ++/* acc_pcreate is acc_present_or_create by a different name. */ ++#ifdef HAVE_ATTRIBUTE_ALIAS ++strong_alias (acc_create, acc_present_or_create) ++strong_alias (acc_create, acc_pcreate) ++#else + void * + acc_present_or_create (void *h, size_t s) + { +- return present_create_copy (FLAG_PRESENT | FLAG_CREATE, h, s); ++ return acc_create (h, s); + } + +-/* acc_pcreate is acc_present_or_create by a different name. */ +-#ifdef HAVE_ATTRIBUTE_ALIAS +-strong_alias (acc_present_or_create, acc_pcreate) +-#else + void * + acc_pcreate (void *h, size_t s) + { +- return acc_present_or_create (h, s); ++ return acc_create (h, s); + } + #endif + + void * +-acc_present_or_copyin (void *h, size_t s) ++acc_copyin (void *h, size_t s) ++{ ++ return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, ++ acc_async_sync); ++} ++ ++void ++acc_copyin_async (void *h, size_t s, int async) + { +- return present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s); ++ present_create_copy (FLAG_PRESENT | FLAG_CREATE | FLAG_COPY, h, s, async); + } + ++/* acc_present_or_copyin used to be what acc_copyin is now. */ + /* acc_pcopyin is acc_present_or_copyin by a different name. 
*/ + #ifdef HAVE_ATTRIBUTE_ALIAS +-strong_alias (acc_present_or_copyin, acc_pcopyin) ++strong_alias (acc_copyin, acc_present_or_copyin) ++strong_alias (acc_copyin, acc_pcopyin) + #else + void * ++acc_present_or_copyin (void *h, size_t s) ++{ ++ return acc_copyin (h, s); ++} ++ ++void * + acc_pcopyin (void *h, size_t s) + { +- return acc_present_or_copyin (h, s); ++ return acc_copyin (h, s); + } + #endif + +-#define FLAG_COPYOUT (1 << 0) ++#define FLAG_COPYOUT (1 << 0) ++#define FLAG_FINALIZE (1 << 1) + + static void +-delete_copyout (unsigned f, void *h, size_t s, const char *libfnname) ++delete_copyout (unsigned f, void *h, size_t s, int async, const char *libfnname) + { + size_t host_size; + splay_tree_key n; +@@ -581,31 +635,111 @@ delete_copyout (unsigned f, void *h, siz + (void *) n->host_start, (int) host_size, (void *) h, (int) s); + } + +- gomp_mutex_unlock (&acc_dev->lock); ++ if (n->refcount == REFCOUNT_INFINITY) ++ { ++ n->refcount = 0; ++ n->dynamic_refcount = 0; ++ } ++ if (n->refcount < n->dynamic_refcount) ++ { ++ gomp_mutex_unlock (&acc_dev->lock); ++ gomp_fatal ("Dynamic reference counting assert fail\n"); ++ } + +- if (f & FLAG_COPYOUT) +- acc_dev->dev2host_func (acc_dev->target_id, h, d, s); ++ if (f & FLAG_FINALIZE) ++ { ++ n->refcount -= n->dynamic_refcount; ++ n->dynamic_refcount = 0; ++ } ++ else if (n->dynamic_refcount) ++ { ++ n->dynamic_refcount--; ++ n->refcount--; ++ } ++ ++ if (n->refcount == 0) ++ { ++ if (n->tgt->refcount == 2) ++ { ++ struct target_mem_desc *tp, *t; ++ for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL; ++ tp = t, t = t->prev) ++ if (n->tgt == t) ++ { ++ if (tp) ++ tp->prev = t->prev; ++ else ++ acc_dev->openacc.data_environ = t->prev; ++ break; ++ } ++ } ++ ++ if (f & FLAG_COPYOUT) ++ { ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (async); ++ acc_dev->dev2host_func (acc_dev->target_id, h, d, s); ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (acc_async_sync); ++ } + +- acc_unmap_data (h); ++ gomp_remove_var (acc_dev, n); ++ } + +- if (!acc_dev->free_func (acc_dev->target_id, d)) +- gomp_fatal ("error in freeing device memory in %s", libfnname); ++ gomp_mutex_unlock (&acc_dev->lock); + } + + void + acc_delete (void *h , size_t s) + { +- delete_copyout (0, h, s, __FUNCTION__); ++ delete_copyout (0, h, s, acc_async_sync, __FUNCTION__); ++} ++ ++void ++acc_delete_async (void *h , size_t s, int async) ++{ ++ delete_copyout (0, h, s, async, __FUNCTION__); ++} ++ ++void ++acc_delete_finalize (void *h , size_t s) ++{ ++ delete_copyout (FLAG_FINALIZE, h, s, acc_async_sync, __FUNCTION__); ++} ++ ++void ++acc_delete_finalize_async (void *h , size_t s, int async) ++{ ++ delete_copyout (FLAG_FINALIZE, h, s, async, __FUNCTION__); + } + + void + acc_copyout (void *h, size_t s) + { +- delete_copyout (FLAG_COPYOUT, h, s, __FUNCTION__); ++ delete_copyout (FLAG_COPYOUT, h, s, acc_async_sync, __FUNCTION__); ++} ++ ++void ++acc_copyout_async (void *h, size_t s, int async) ++{ ++ delete_copyout (FLAG_COPYOUT, h, s, async, __FUNCTION__); ++} ++ ++void ++acc_copyout_finalize (void *h, size_t s) ++{ ++ delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, acc_async_sync, ++ __FUNCTION__); ++} ++ ++void ++acc_copyout_finalize_async (void *h, size_t s, int async) ++{ ++ delete_copyout (FLAG_COPYOUT | FLAG_FINALIZE, h, s, async, __FUNCTION__); + } + + static void +-update_dev_host (int is_dev, void *h, size_t s) ++update_dev_host (int is_dev, void *h, size_t s, int async) + { + splay_tree_key n; + void *d; 
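The present_create_copy and delete_copyout changes above split each mapping's reference count into a structured part and a dynamic part: repeated acc_copyin/acc_create calls on an already-present host range now nest, and the data is copied back and unmapped only once the dynamic count drains, or at once with the _finalize variants. A minimal usage sketch against the public OpenACC API, assuming a discrete (non-shared-memory) device:

  /* Sketch of the nesting semantics implemented above.  */
  #include <openacc.h>
  #include <stdlib.h>

  int
  main (void)
  {
    size_t bytes = 1024 * sizeof (float);
    float *a = malloc (bytes);

    acc_copyin (a, bytes);   /* maps a and copies in; dynamic refcount 1 */
    acc_copyin (a, bytes);   /* already present; refcount becomes 2 */

    acc_delete (a, bytes);   /* refcount back to 1; still mapped */
    acc_copyout (a, bytes);  /* refcount 0: copy back to host, unmap */

    /* acc_delete_finalize (a, bytes) would instead drop the whole
       dynamic count in one call, without any copy back.  */
    free (a);
    return 0;
  }

The async variants added above follow the same counting; only the final device-to-host transfer is queued on the requested async queue instead of completing synchronously.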
+@@ -631,24 +765,42 @@ update_dev_host (int is_dev, void *h, si + d = (void *) (n->tgt->tgt_start + n->tgt_offset + + (uintptr_t) h - n->host_start); + ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (async); ++ + if (is_dev) + acc_dev->host2dev_func (acc_dev->target_id, d, h, s); + else + acc_dev->dev2host_func (acc_dev->target_id, h, d, s); + ++ if (async > acc_async_sync) ++ acc_dev->openacc.async_set_async_func (acc_async_sync); ++ + gomp_mutex_unlock (&acc_dev->lock); + } + + void + acc_update_device (void *h, size_t s) + { +- update_dev_host (1, h, s); ++ update_dev_host (1, h, s, acc_async_sync); ++} ++ ++void ++acc_update_device_async (void *h, size_t s, int async) ++{ ++ update_dev_host (1, h, s, async); + } + + void + acc_update_self (void *h, size_t s) + { +- update_dev_host (0, h, s); ++ update_dev_host (0, h, s, acc_async_sync); ++} ++ ++void ++acc_update_self_async (void *h, size_t s, int async) ++{ ++ update_dev_host (0, h, s, async); + } + + void +@@ -659,11 +811,37 @@ gomp_acc_insert_pointer (size_t mapnum, + struct goacc_thread *thr = goacc_thread (); + struct gomp_device_descr *acc_dev = thr->dev; + ++ if (acc_is_present (*hostaddrs, *sizes)) ++ { ++ splay_tree_key n; ++ gomp_mutex_lock (&acc_dev->lock); ++ n = lookup_host (acc_dev, *hostaddrs, *sizes); ++ gomp_mutex_unlock (&acc_dev->lock); ++ ++ tgt = n->tgt; ++ for (size_t i = 0; i < tgt->list_count; i++) ++ if (tgt->list[i].key == n) ++ { ++ for (size_t j = 0; j < mapnum; j++) ++ if (i + j < tgt->list_count && tgt->list[i + j].key) ++ { ++ tgt->list[i + j].key->refcount++; ++ tgt->list[i + j].key->dynamic_refcount++; ++ } ++ return; ++ } ++ /* Should not reach here. */ ++ gomp_fatal ("Dynamic refcount incrementing failed for pointer/pset"); ++ } ++ + gomp_debug (0, " %s: prepare mappings\n", __FUNCTION__); + tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, + NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC); + gomp_debug (0, " %s: mappings prepared\n", __FUNCTION__); + ++ /* Initialize dynamic refcount. */ ++ tgt->list[0].key->dynamic_refcount = 1; ++ + gomp_mutex_lock (&acc_dev->lock); + tgt->prev = acc_dev->openacc.data_environ; + acc_dev->openacc.data_environ = tgt; +@@ -671,7 +849,8 @@ gomp_acc_insert_pointer (size_t mapnum, + } + + void +-gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum) ++gomp_acc_remove_pointer (void *h, size_t s, bool force_copyfrom, int async, ++ int finalize, int mapnum) + { + struct goacc_thread *thr = goacc_thread (); + struct gomp_device_descr *acc_dev = thr->dev; +@@ -679,6 +858,9 @@ gomp_acc_remove_pointer (void *h, bool f + struct target_mem_desc *t; + int minrefs = (mapnum == 1) ? 2 : 3; + ++ if (!acc_is_present (h, s)) ++ return; ++ + gomp_mutex_lock (&acc_dev->lock); + + n = lookup_host (acc_dev, h, 1); +@@ -693,40 +875,65 @@ gomp_acc_remove_pointer (void *h, bool f + + t = n->tgt; + +- struct target_mem_desc *tp; ++ if (n->refcount < n->dynamic_refcount) ++ { ++ gomp_mutex_unlock (&acc_dev->lock); ++ gomp_fatal ("Dynamic reference counting assert fail\n"); ++ } + +- if (t->refcount == minrefs) ++ if (finalize) + { +- /* This is the last reference, so pull the descriptor off the +- chain. This avoids gomp_unmap_vars via gomp_unmap_tgt from +- freeing the device memory. 
*/ +- t->tgt_end = 0; +- t->to_free = 0; ++ n->refcount -= n->dynamic_refcount; ++ n->dynamic_refcount = 0; ++ } ++ else if (n->dynamic_refcount) ++ { ++ n->dynamic_refcount--; ++ n->refcount--; ++ } + +- for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL; +- tp = t, t = t->prev) ++ gomp_mutex_unlock (&acc_dev->lock); ++ ++ if (n->refcount == 0) ++ { ++ if (t->refcount == minrefs) + { +- if (n->tgt == t) ++ /* This is the last reference, so pull the descriptor off the ++ chain. This prevents gomp_unmap_vars via gomp_unmap_tgt from ++ freeing the device memory. */ ++ struct target_mem_desc *tp; ++ for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL; ++ tp = t, t = t->prev) + { +- if (tp) +- tp->prev = t->prev; +- else +- acc_dev->openacc.data_environ = t->prev; +- break; ++ if (n->tgt == t) ++ { ++ if (tp) ++ tp->prev = t->prev; ++ else ++ acc_dev->openacc.data_environ = t->prev; ++ break; ++ } + } + } +- } + +- if (force_copyfrom) +- t->list[0].copy_from = 1; ++ /* Set refcount to 1 to allow gomp_unmap_vars to unmap it. */ ++ n->refcount = 1; ++ t->refcount = minrefs; ++ for (size_t i = 0; i < t->list_count; i++) ++ if (t->list[i].key == n) ++ { ++ t->list[i].copy_from = force_copyfrom ? 1 : 0; ++ break; ++ } + +- gomp_mutex_unlock (&acc_dev->lock); ++ /* If running synchronously, unmap immediately. */ ++ if (async < acc_async_noval) ++ gomp_unmap_vars (t, true); ++ else ++ t->device_descr->openacc.register_async_cleanup_func (t, async); ++ } + +- /* If running synchronously, unmap immediately. */ +- if (async < acc_async_noval) +- gomp_unmap_vars (t, true); +- else +- t->device_descr->openacc.register_async_cleanup_func (t, async); ++ gomp_mutex_unlock (&acc_dev->lock); + + gomp_debug (0, " %s: mappings restored\n", __FUNCTION__); + } +--- libgomp/env.c.jj 2018-04-25 09:40:31.924655586 +0200 ++++ libgomp/env.c 2019-05-07 18:46:36.482110438 +0200 +@@ -88,8 +88,12 @@ void **gomp_places_list; + unsigned long gomp_places_list_len; + int gomp_debug_var; + unsigned int gomp_num_teams_var; ++bool gomp_display_affinity_var; ++char *gomp_affinity_format_var = "level %L thread %i affinity %A"; ++size_t gomp_affinity_format_len; + char *goacc_device_type; + int goacc_device_num; ++int goacc_default_dims[GOMP_DIM_MAX]; + + #ifndef LIBGOMP_OFFLOADED_ONLY + +@@ -100,6 +104,7 @@ parse_schedule (void) + { + char *env, *end; + unsigned long value; ++ int monotonic = 0; + + env = getenv ("OMP_SCHEDULE"); + if (env == NULL) +@@ -107,6 +112,26 @@ parse_schedule (void) + + while (isspace ((unsigned char) *env)) + ++env; ++ if (strncasecmp (env, "monotonic", 9) == 0) ++ { ++ monotonic = 1; ++ env += 9; ++ } ++ else if (strncasecmp (env, "nonmonotonic", 12) == 0) ++ { ++ monotonic = -1; ++ env += 12; ++ } ++ if (monotonic) ++ { ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ if (*env != ':') ++ goto unknown; ++ ++env; ++ while (isspace ((unsigned char) *env)) ++ ++env; ++ } + if (strncasecmp (env, "static", 6) == 0) + { + gomp_global_icv.run_sched_var = GFS_STATIC; +@@ -130,12 +155,16 @@ parse_schedule (void) + else + goto unknown; + ++ if (monotonic == 1 ++ || (monotonic == 0 && gomp_global_icv.run_sched_var == GFS_STATIC)) ++ gomp_global_icv.run_sched_var |= GFS_MONOTONIC; ++ + while (isspace ((unsigned char) *env)) + ++env; + if (*env == '\0') + { + gomp_global_icv.run_sched_chunk_size +- = gomp_global_icv.run_sched_var != GFS_STATIC; ++ = (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) != GFS_STATIC; + return; + } + if (*env++ != ',') +@@ -158,7 +187,8 @@ parse_schedule (void) + 
if ((int)value != value) + goto invalid; + +- if (value == 0 && gomp_global_icv.run_sched_var != GFS_STATIC) ++ if (value == 0 ++ && (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) != GFS_STATIC) + value = 1; + gomp_global_icv.run_sched_chunk_size = value; + return; +@@ -1066,6 +1096,36 @@ parse_acc_device_type (void) + } + + static void ++parse_gomp_openacc_dim (void) ++{ ++ /* The syntax is the same as for the -fopenacc-dim compilation option. */ ++ const char *var_name = "GOMP_OPENACC_DIM"; ++ const char *env_var = getenv (var_name); ++ if (!env_var) ++ return; ++ ++ const char *pos = env_var; ++ int i; ++ for (i = 0; *pos && i != GOMP_DIM_MAX; i++) ++ { ++ if (i && *pos++ != ':') ++ break; ++ ++ if (*pos == ':') ++ continue; ++ ++ const char *eptr; ++ errno = 0; ++ long val = strtol (pos, (char **)&eptr, 10); ++ if (errno || val < 0 || (unsigned)val != val) ++ break; ++ ++ goacc_default_dims[i] = (int)val; ++ pos = eptr; ++ } ++} ++ ++static void + handle_omp_display_env (unsigned long stacksize, int wait_policy) + { + const char *env; +@@ -1119,19 +1179,34 @@ handle_omp_display_env (unsigned long st + fputs ("'\n", stderr); + + fprintf (stderr, " OMP_SCHEDULE = '"); +- switch (gomp_global_icv.run_sched_var) ++ if ((gomp_global_icv.run_sched_var & GFS_MONOTONIC)) ++ { ++ if (gomp_global_icv.run_sched_var != (GFS_MONOTONIC | GFS_STATIC)) ++ fputs ("MONOTONIC:", stderr); ++ } ++ else if (gomp_global_icv.run_sched_var == GFS_STATIC) ++ fputs ("NONMONOTONIC:", stderr); ++ switch (gomp_global_icv.run_sched_var & ~GFS_MONOTONIC) + { + case GFS_RUNTIME: + fputs ("RUNTIME", stderr); ++ if (gomp_global_icv.run_sched_chunk_size != 1) ++ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); + break; + case GFS_STATIC: + fputs ("STATIC", stderr); ++ if (gomp_global_icv.run_sched_chunk_size != 0) ++ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); + break; + case GFS_DYNAMIC: + fputs ("DYNAMIC", stderr); ++ if (gomp_global_icv.run_sched_chunk_size != 1) ++ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); + break; + case GFS_GUIDED: + fputs ("GUIDED", stderr); ++ if (gomp_global_icv.run_sched_chunk_size != 1) ++ fprintf (stderr, ",%d", gomp_global_icv.run_sched_chunk_size); + break; + case GFS_AUTO: + fputs ("AUTO", stderr); +@@ -1197,6 +1272,10 @@ handle_omp_display_env (unsigned long st + gomp_global_icv.default_device_var); + fprintf (stderr, " OMP_MAX_TASK_PRIORITY = '%d'\n", + gomp_max_task_priority_var); ++ fprintf (stderr, " OMP_DISPLAY_AFFINITY = '%s'\n", ++ gomp_display_affinity_var ? 
"TRUE" : "FALSE"); ++ fprintf (stderr, " OMP_AFFINITY_FORMAT = '%s'\n", ++ gomp_affinity_format_var); + + if (verbose) + { +@@ -1228,6 +1307,7 @@ initialize_env (void) + parse_boolean ("OMP_DYNAMIC", &gomp_global_icv.dyn_var); + parse_boolean ("OMP_NESTED", &gomp_global_icv.nest_var); + parse_boolean ("OMP_CANCELLATION", &gomp_cancel_var); ++ parse_boolean ("OMP_DISPLAY_AFFINITY", &gomp_display_affinity_var); + parse_int ("OMP_DEFAULT_DEVICE", &gomp_global_icv.default_device_var, true); + parse_int ("OMP_MAX_TASK_PRIORITY", &gomp_max_task_priority_var, true); + parse_unsigned_long ("OMP_MAX_ACTIVE_LEVELS", &gomp_max_active_levels_var, +@@ -1277,6 +1357,13 @@ initialize_env (void) + } + if (gomp_global_icv.bind_var != omp_proc_bind_false) + gomp_init_affinity (); ++ ++ { ++ const char *env = getenv ("OMP_AFFINITY_FORMAT"); ++ if (env != NULL) ++ gomp_set_affinity_format (env, strlen (env)); ++ } ++ + wait_policy = parse_wait_policy (); + if (!parse_spincount ("GOMP_SPINCOUNT", &gomp_spin_count_var)) + { +@@ -1302,7 +1389,6 @@ initialize_env (void) + + /* Not strictly environment related, but ordering constructors is tricky. */ + pthread_attr_init (&gomp_thread_attr); +- pthread_attr_setdetachstate (&gomp_thread_attr, PTHREAD_CREATE_DETACHED); + + if (parse_stacksize ("OMP_STACKSIZE", &stacksize) + || parse_stacksize ("GOMP_STACKSIZE", &stacksize) +@@ -1336,6 +1422,7 @@ initialize_env (void) + goacc_device_num = 0; + + parse_acc_device_type (); ++ parse_gomp_openacc_dim (); + + goacc_runtime_initialize (); + } +--- libgomp/fortran.c.jj 2018-04-25 09:40:31.913655581 +0200 ++++ libgomp/fortran.c 2019-05-07 18:46:36.491110295 +0200 +@@ -28,6 +28,8 @@ + #include "libgomp.h" + #include "libgomp_f.h" + #include ++#include ++#include + #include + + #ifdef HAVE_ATTRIBUTE_ALIAS +@@ -82,6 +84,8 @@ ialias_redirect (omp_get_team_num) + ialias_redirect (omp_is_initial_device) + ialias_redirect (omp_get_initial_device) + ialias_redirect (omp_get_max_task_priority) ++ialias_redirect (omp_pause_resource) ++ialias_redirect (omp_pause_resource_all) + #endif + + #ifndef LIBGOMP_GNU_SYMBOL_VERSIONING +@@ -368,7 +372,9 @@ omp_get_schedule_ (int32_t *kind, int32_ + omp_sched_t k; + int cs; + omp_get_schedule (&k, &cs); +- *kind = k; ++ /* For now mask off GFS_MONOTONIC, because OpenMP 4.5 code will not ++ expect to see it. */ ++ *kind = k & ~GFS_MONOTONIC; + *chunk_size = cs; + } + +@@ -378,7 +384,8 @@ omp_get_schedule_8_ (int32_t *kind, int6 + omp_sched_t k; + int cs; + omp_get_schedule (&k, &cs); +- *kind = k; ++ /* See above. */ ++ *kind = k & ~GFS_MONOTONIC; + *chunk_size = cs; + } + +@@ -576,3 +583,96 @@ omp_get_max_task_priority_ (void) + { + return omp_get_max_task_priority (); + } ++ ++void ++omp_set_affinity_format_ (const char *format, size_t format_len) ++{ ++ gomp_set_affinity_format (format, format_len); ++} ++ ++int32_t ++omp_get_affinity_format_ (char *buffer, size_t buffer_len) ++{ ++ size_t len = strlen (gomp_affinity_format_var); ++ if (buffer_len) ++ { ++ if (len < buffer_len) ++ { ++ memcpy (buffer, gomp_affinity_format_var, len); ++ memset (buffer + len, ' ', buffer_len - len); ++ } ++ else ++ memcpy (buffer, gomp_affinity_format_var, buffer_len); ++ } ++ return len; ++} ++ ++void ++omp_display_affinity_ (const char *format, size_t format_len) ++{ ++ char *fmt = NULL, fmt_buf[256]; ++ char buf[512]; ++ if (format_len) ++ { ++ fmt = format_len < 256 ? 
fmt_buf : gomp_malloc (format_len + 1); ++ memcpy (fmt, format, format_len); ++ fmt[format_len] = '\0'; ++ } ++ struct gomp_thread *thr = gomp_thread (); ++ size_t ret ++ = gomp_display_affinity (buf, sizeof buf, ++ format_len ? fmt : gomp_affinity_format_var, ++ gomp_thread_self (), &thr->ts, thr->place); ++ if (ret < sizeof buf) ++ { ++ buf[ret] = '\n'; ++ gomp_print_string (buf, ret + 1); ++ } ++ else ++ { ++ char *b = gomp_malloc (ret + 1); ++ gomp_display_affinity (buf, sizeof buf, ++ format_len ? fmt : gomp_affinity_format_var, ++ gomp_thread_self (), &thr->ts, thr->place); ++ b[ret] = '\n'; ++ gomp_print_string (b, ret + 1); ++ free (b); ++ } ++ if (fmt && fmt != fmt_buf) ++ free (fmt); ++} ++ ++int32_t ++omp_capture_affinity_ (char *buffer, const char *format, ++ size_t buffer_len, size_t format_len) ++{ ++ char *fmt = NULL, fmt_buf[256]; ++ if (format_len) ++ { ++ fmt = format_len < 256 ? fmt_buf : gomp_malloc (format_len + 1); ++ memcpy (fmt, format, format_len); ++ fmt[format_len] = '\0'; ++ } ++ struct gomp_thread *thr = gomp_thread (); ++ size_t ret ++ = gomp_display_affinity (buffer, buffer_len, ++ format_len ? fmt : gomp_affinity_format_var, ++ gomp_thread_self (), &thr->ts, thr->place); ++ if (fmt && fmt != fmt_buf) ++ free (fmt); ++ if (ret < buffer_len) ++ memset (buffer + ret, ' ', buffer_len - ret); ++ return ret; ++} ++ ++int32_t ++omp_pause_resource_ (const int32_t *kind, const int32_t *device_num) ++{ ++ return omp_pause_resource (*kind, *device_num); ++} ++ ++int32_t ++omp_pause_resource_all_ (const int32_t *kind) ++{ ++ return omp_pause_resource_all (*kind); ++} +--- libgomp/configure.tgt.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/configure.tgt 2019-05-07 18:46:36.479110486 +0200 +@@ -18,7 +18,7 @@ if test $gcc_cv_have_tls = yes ; then + ;; + + *-*-linux* | *-*-gnu*) +- XCFLAGS="${XCFLAGS} -ftls-model=initial-exec" ++ XCFLAGS="${XCFLAGS} -ftls-model=initial-exec -DUSING_INITIAL_EXEC_TLS" + ;; + + *-*-rtems*) +--- libgomp/icv-device.c.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/icv-device.c 2019-05-07 18:46:36.513109943 +0200 +@@ -49,20 +49,6 @@ omp_get_num_devices (void) + } + + int +-omp_get_num_teams (void) +-{ +- /* Hardcoded to 1 on host, MIC, HSAIL? Maybe variable on PTX. */ +- return 1; +-} +- +-int +-omp_get_team_num (void) +-{ +- /* Hardcoded to 0 on host, MIC, HSAIL? Maybe variable on PTX. */ +- return 0; +-} +- +-int + omp_is_initial_device (void) + { + /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX. */ +@@ -72,6 +58,4 @@ omp_is_initial_device (void) + ialias (omp_set_default_device) + ialias (omp_get_default_device) + ialias (omp_get_num_devices) +-ialias (omp_get_num_teams) +-ialias (omp_get_team_num) + ialias (omp_is_initial_device) +--- libgomp/Makefile.in.jj 2018-04-25 09:40:31.320655306 +0200 ++++ libgomp/Makefile.in 2019-05-07 20:00:01.082077522 +0200 +@@ -90,7 +90,7 @@ DIST_COMMON = $(top_srcdir)/plugin/Makef + $(srcdir)/libgomp.spec.in $(srcdir)/../depcomp + @PLUGIN_NVPTX_TRUE@am__append_1 = libgomp-plugin-nvptx.la + @PLUGIN_HSA_TRUE@am__append_2 = libgomp-plugin-hsa.la +-@USE_FORTRAN_TRUE@am__append_3 = openacc.f90 ++@USE_FORTRAN_TRUE@am__append_3 = openacc2.f90 + subdir = . 
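
The fortran.c wrappers above are thin shims over the new OpenMP 5.0 entry
points (omp_set_affinity_format, omp_capture_affinity, omp_display_affinity,
omp_pause_resource) implemented in affinity-fmt.c further down.  A minimal C
usage sketch of that API; the format string and thread count are illustrative,
not taken from the patch:

#include <omp.h>
#include <stdio.h>

int
main (void)
{
  /* Equivalent to setting OMP_AFFINITY_FORMAT in the environment.
     '%0.4n' zero-fills the thread number to four digits; '%.8H' would
     right-justify the host name in eight columns, '%8H' left-justify it.  */
  omp_set_affinity_format ("thread %0.4n of %N on host %H");

#pragma omp parallel num_threads (2)
  {
    char buf[128];
    /* Like snprintf, the return value is the untruncated length.  */
    size_t len = omp_capture_affinity (buf, sizeof buf, NULL);
    if (len < sizeof buf)
      printf ("captured: %s\n", buf);

    /* A NULL (or empty) format means: use the affinity-format ICV.  */
    omp_display_affinity (NULL);
  }
  return 0;
}
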
+ ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 + am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \ +@@ -172,7 +172,7 @@ libgomp_plugin_nvptx_la_LINK = $(LIBTOOL + @PLUGIN_NVPTX_TRUE@am_libgomp_plugin_nvptx_la_rpath = -rpath \ + @PLUGIN_NVPTX_TRUE@ $(toolexeclibdir) + libgomp_la_LIBADD = +-@USE_FORTRAN_TRUE@am__objects_1 = openacc.lo ++@USE_FORTRAN_TRUE@am__objects_1 = openacc2.lo + am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \ + env.lo error.lo icv.lo icv-device.lo iter.lo iter_ull.lo \ + loop.lo loop_ull.lo ordered.lo parallel.lo sections.lo \ +@@ -180,7 +180,8 @@ am_libgomp_la_OBJECTS = alloc.lo atomic. + sem.lo bar.lo ptrlock.lo time.lo fortran.lo affinity.lo \ + target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \ + oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \ +- oacc-plugin.lo oacc-cuda.lo priority_queue.lo $(am__objects_1) ++ oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \ ++ teams.lo $(am__objects_1) + libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) + DEFAULT_INCLUDES = -I.@am__isrc@ + depcomp = $(SHELL) $(top_srcdir)/../depcomp +@@ -380,6 +381,7 @@ mkdir_p = @mkdir_p@ + multi_basedir = @multi_basedir@ + offload_additional_lib_paths = @offload_additional_lib_paths@ + offload_additional_options = @offload_additional_options@ ++offload_plugins = @offload_plugins@ + offload_targets = @offload_targets@ + oldincludedir = @oldincludedir@ + pdfdir = @pdfdir@ +@@ -436,7 +438,7 @@ libgomp_la_SOURCES = alloc.c atomic.c ba + affinity.c target.c splay-tree.c libgomp-plugin.c \ + oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \ + oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \ +- $(am__append_3) ++ affinity-fmt.c teams.c $(am__append_3) + + # Nvidia PTX OpenACC plugin. + @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION) +@@ -599,6 +601,7 @@ mostlyclean-compile: + distclean-compile: + -rm -f *.tab.c + ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity-fmt.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alloc.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic.Plo@am__quote@ +@@ -638,6 +641,7 @@ distclean-compile: + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/target.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/task.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/team.Plo@am__quote@ ++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/teams.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/time.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/work.Plo@am__quote@ + +@@ -1292,8 +1296,6 @@ omp_lib_kinds.mod: omp_lib.mod + : + openacc_kinds.mod: openacc.mod + : +-openacc.mod: openacc.lo +- : + %.mod: %.f90 + $(FC) $(FCFLAGS) -fsyntax-only $< + fortran.lo: libgomp_f.h +--- libgomp/plugin/cuda/cuda.h.jj 2018-04-25 09:40:31.914655581 +0200 ++++ libgomp/plugin/cuda/cuda.h 2019-05-07 18:46:36.533109624 +0200 +@@ -44,6 +44,7 @@ typedef void *CUevent; + typedef void *CUfunction; + typedef void *CUlinkState; + typedef void *CUmodule; ++typedef size_t (*CUoccupancyB2DSize)(int); + typedef void *CUstream; + + typedef enum { +@@ -88,6 +89,7 @@ typedef enum { + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4, + CU_JIT_ERROR_LOG_BUFFER = 5, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6, ++ CU_JIT_OPTIMIZATION_LEVEL = 7, + CU_JIT_LOG_VERBOSE = 12 + } CUjit_option; + +@@ -169,6 +171,8 @@ CUresult cuModuleGetGlobal (CUdeviceptr + 
CUresult cuModuleLoad (CUmodule *, const char *); + CUresult cuModuleLoadData (CUmodule *, const void *); + CUresult cuModuleUnload (CUmodule); ++CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, ++ CUoccupancyB2DSize, size_t, int); + CUresult cuStreamCreate (CUstream *, unsigned); + #define cuStreamDestroy cuStreamDestroy_v2 + CUresult cuStreamDestroy (CUstream); +--- libgomp/plugin/cuda-lib.def.jj 2019-05-07 18:46:36.533109624 +0200 ++++ libgomp/plugin/cuda-lib.def 2019-05-07 18:46:36.533109624 +0200 +@@ -0,0 +1,49 @@ ++CUDA_ONE_CALL (cuCtxCreate) ++CUDA_ONE_CALL (cuCtxDestroy) ++CUDA_ONE_CALL (cuCtxGetCurrent) ++CUDA_ONE_CALL (cuCtxGetDevice) ++CUDA_ONE_CALL (cuCtxPopCurrent) ++CUDA_ONE_CALL (cuCtxPushCurrent) ++CUDA_ONE_CALL (cuCtxSynchronize) ++CUDA_ONE_CALL (cuDeviceGet) ++CUDA_ONE_CALL (cuDeviceGetAttribute) ++CUDA_ONE_CALL (cuDeviceGetCount) ++CUDA_ONE_CALL (cuEventCreate) ++CUDA_ONE_CALL (cuEventDestroy) ++CUDA_ONE_CALL (cuEventElapsedTime) ++CUDA_ONE_CALL (cuEventQuery) ++CUDA_ONE_CALL (cuEventRecord) ++CUDA_ONE_CALL (cuEventSynchronize) ++CUDA_ONE_CALL (cuFuncGetAttribute) ++CUDA_ONE_CALL_MAYBE_NULL (cuGetErrorString) ++CUDA_ONE_CALL (cuInit) ++CUDA_ONE_CALL (cuLaunchKernel) ++CUDA_ONE_CALL (cuLinkAddData) ++CUDA_ONE_CALL_MAYBE_NULL (cuLinkAddData_v2) ++CUDA_ONE_CALL (cuLinkComplete) ++CUDA_ONE_CALL (cuLinkCreate) ++CUDA_ONE_CALL_MAYBE_NULL (cuLinkCreate_v2) ++CUDA_ONE_CALL (cuLinkDestroy) ++CUDA_ONE_CALL (cuMemAlloc) ++CUDA_ONE_CALL (cuMemAllocHost) ++CUDA_ONE_CALL (cuMemcpy) ++CUDA_ONE_CALL (cuMemcpyDtoDAsync) ++CUDA_ONE_CALL (cuMemcpyDtoH) ++CUDA_ONE_CALL (cuMemcpyDtoHAsync) ++CUDA_ONE_CALL (cuMemcpyHtoD) ++CUDA_ONE_CALL (cuMemcpyHtoDAsync) ++CUDA_ONE_CALL (cuMemFree) ++CUDA_ONE_CALL (cuMemFreeHost) ++CUDA_ONE_CALL (cuMemGetAddressRange) ++CUDA_ONE_CALL (cuMemHostGetDevicePointer) ++CUDA_ONE_CALL (cuModuleGetFunction) ++CUDA_ONE_CALL (cuModuleGetGlobal) ++CUDA_ONE_CALL (cuModuleLoad) ++CUDA_ONE_CALL (cuModuleLoadData) ++CUDA_ONE_CALL (cuModuleUnload) ++CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) ++CUDA_ONE_CALL (cuStreamCreate) ++CUDA_ONE_CALL (cuStreamDestroy) ++CUDA_ONE_CALL (cuStreamQuery) ++CUDA_ONE_CALL (cuStreamSynchronize) ++CUDA_ONE_CALL (cuStreamWaitEvent) +--- libgomp/plugin/plugin-nvptx.c.jj 2018-04-25 09:40:31.915655582 +0200 ++++ libgomp/plugin/plugin-nvptx.c 2019-05-07 18:46:36.535109592 +0200 +@@ -31,6 +31,7 @@ + is not clear as to what that state might be. Or how one might + propagate it from one thread to another. 
*/ + ++#define _GNU_SOURCE + #include "openacc.h" + #include "config.h" + #include "libgomp-plugin.h" +@@ -48,60 +49,41 @@ + #include + #include + ++#if CUDA_VERSION < 6000 ++extern CUresult cuGetErrorString (CUresult, const char **); ++#define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82 ++#endif ++ ++#if CUDA_VERSION >= 6050 ++#undef cuLinkCreate ++#undef cuLinkAddData ++CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t, ++ const char *, unsigned, CUjit_option *, void **); ++CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *); ++#else ++typedef size_t (*CUoccupancyB2DSize)(int); ++CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t, ++ const char *, unsigned, CUjit_option *, void **); ++CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *); ++CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, ++ CUoccupancyB2DSize, size_t, int); ++#endif ++ ++#define DO_PRAGMA(x) _Pragma (#x) ++ + #if PLUGIN_NVPTX_DYNAMIC + # include + +-# define CUDA_CALLS \ +-CUDA_ONE_CALL (cuCtxCreate) \ +-CUDA_ONE_CALL (cuCtxDestroy) \ +-CUDA_ONE_CALL (cuCtxGetCurrent) \ +-CUDA_ONE_CALL (cuCtxGetDevice) \ +-CUDA_ONE_CALL (cuCtxPopCurrent) \ +-CUDA_ONE_CALL (cuCtxPushCurrent) \ +-CUDA_ONE_CALL (cuCtxSynchronize) \ +-CUDA_ONE_CALL (cuDeviceGet) \ +-CUDA_ONE_CALL (cuDeviceGetAttribute) \ +-CUDA_ONE_CALL (cuDeviceGetCount) \ +-CUDA_ONE_CALL (cuEventCreate) \ +-CUDA_ONE_CALL (cuEventDestroy) \ +-CUDA_ONE_CALL (cuEventElapsedTime) \ +-CUDA_ONE_CALL (cuEventQuery) \ +-CUDA_ONE_CALL (cuEventRecord) \ +-CUDA_ONE_CALL (cuEventSynchronize) \ +-CUDA_ONE_CALL (cuFuncGetAttribute) \ +-CUDA_ONE_CALL (cuGetErrorString) \ +-CUDA_ONE_CALL (cuInit) \ +-CUDA_ONE_CALL (cuLaunchKernel) \ +-CUDA_ONE_CALL (cuLinkAddData) \ +-CUDA_ONE_CALL (cuLinkComplete) \ +-CUDA_ONE_CALL (cuLinkCreate) \ +-CUDA_ONE_CALL (cuLinkDestroy) \ +-CUDA_ONE_CALL (cuMemAlloc) \ +-CUDA_ONE_CALL (cuMemAllocHost) \ +-CUDA_ONE_CALL (cuMemcpy) \ +-CUDA_ONE_CALL (cuMemcpyDtoDAsync) \ +-CUDA_ONE_CALL (cuMemcpyDtoH) \ +-CUDA_ONE_CALL (cuMemcpyDtoHAsync) \ +-CUDA_ONE_CALL (cuMemcpyHtoD) \ +-CUDA_ONE_CALL (cuMemcpyHtoDAsync) \ +-CUDA_ONE_CALL (cuMemFree) \ +-CUDA_ONE_CALL (cuMemFreeHost) \ +-CUDA_ONE_CALL (cuMemGetAddressRange) \ +-CUDA_ONE_CALL (cuMemHostGetDevicePointer)\ +-CUDA_ONE_CALL (cuModuleGetFunction) \ +-CUDA_ONE_CALL (cuModuleGetGlobal) \ +-CUDA_ONE_CALL (cuModuleLoad) \ +-CUDA_ONE_CALL (cuModuleLoadData) \ +-CUDA_ONE_CALL (cuModuleUnload) \ +-CUDA_ONE_CALL (cuStreamCreate) \ +-CUDA_ONE_CALL (cuStreamDestroy) \ +-CUDA_ONE_CALL (cuStreamQuery) \ +-CUDA_ONE_CALL (cuStreamSynchronize) \ +-CUDA_ONE_CALL (cuStreamWaitEvent) +-# define CUDA_ONE_CALL(call) \ +- __typeof (call) *call; + struct cuda_lib_s { +- CUDA_CALLS ++ ++# define CUDA_ONE_CALL(call) \ ++ __typeof (call) *call; ++# define CUDA_ONE_CALL_MAYBE_NULL(call) \ ++ CUDA_ONE_CALL (call) ++#include "cuda-lib.def" ++# undef CUDA_ONE_CALL ++# undef CUDA_ONE_CALL_MAYBE_NULL ++ + } cuda_lib; + + /* -1 if init_cuda_lib has not been called yet, false +@@ -120,24 +102,41 @@ init_cuda_lib (void) + cuda_lib_inited = false; + if (h == NULL) + return false; +-# undef CUDA_ONE_CALL +-# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call) +-# define CUDA_ONE_CALL_1(call) \ ++ ++# define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false) ++# define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true) ++# define CUDA_ONE_CALL_1(call, allow_null) \ + cuda_lib.call = dlsym (h, #call); \ +- if (cuda_lib.call == NULL) \ 
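
The dlopen path is rebuilt here around cuda-lib.def as an X-macro list: the
same file is included several times with different definitions of
CUDA_ONE_CALL and CUDA_ONE_CALL_MAYBE_NULL, expanding once into the cuda_lib
struct of function pointers, once into the dlsym resolution loop, and, in the
non-dlopen build, into weak declarations for the calls that may be absent.
A self-contained sketch of the pattern; the two-entry list and the toy
resolver are hypothetical stand-ins for cuda-lib.def and dlsym:

#include <stdio.h>
#include <string.h>

/* Stand-in for cuda-lib.def: one entry per library function.  */
#define LIB_CALLS \
  LIB_ONE_CALL (alpha) \
  LIB_ONE_CALL (beta)

/* Expansion 1: a struct with one function pointer per entry.  */
#define LIB_ONE_CALL(call) void (*call) (void);
static struct lib_s { LIB_CALLS } lib;
#undef LIB_ONE_CALL

static void do_alpha (void) { puts ("alpha"); }
static void do_beta (void) { puts ("beta"); }

/* Toy resolver standing in for dlsym.  */
static void *
resolve (const char *name)
{
  if (strcmp (name, "alpha") == 0)
    return (void *) do_alpha;
  if (strcmp (name, "beta") == 0)
    return (void *) do_beta;
  return NULL;
}

static int
init_lib (void)
{
  /* Expansion 2: resolve every entry by its stringified name, just as
     init_cuda_lib does over cuda-lib.def.  */
#define LIB_ONE_CALL(call) \
  lib.call = (void (*) (void)) resolve (#call); \
  if (lib.call == NULL) \
    return 0;
  LIB_CALLS
#undef LIB_ONE_CALL
  return 1;
}

int
main (void)
{
  if (init_lib ())
    {
      lib.alpha ();
      lib.beta ();
    }
  return 0;
}
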
++ if (!allow_null && cuda_lib.call == NULL) \ + return false; +- CUDA_CALLS ++#include "cuda-lib.def" ++# undef CUDA_ONE_CALL ++# undef CUDA_ONE_CALL_1 ++# undef CUDA_ONE_CALL_MAYBE_NULL ++ + cuda_lib_inited = true; + return true; + } +-# undef CUDA_ONE_CALL +-# undef CUDA_ONE_CALL_1 + # define CUDA_CALL_PREFIX cuda_lib. + #else ++ ++# define CUDA_ONE_CALL(call) ++# define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call) ++#include "cuda-lib.def" ++#undef CUDA_ONE_CALL_MAYBE_NULL ++#undef CUDA_ONE_CALL ++ + # define CUDA_CALL_PREFIX + # define init_cuda_lib() true + #endif + ++#include "secure_getenv.h" ++ ++#undef MIN ++#undef MAX ++#define MIN(X,Y) ((X) < (Y) ? (X) : (Y)) ++#define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) ++ + /* Convenience macros for the frequently used CUDA library call and + error handling sequence as well as CUDA library calls that + do the error checking themselves or don't do it at all. */ +@@ -171,40 +170,42 @@ init_cuda_lib (void) + #define CUDA_CALL_NOCHECK(FN, ...) \ + CUDA_CALL_PREFIX FN (__VA_ARGS__) + ++#define CUDA_CALL_EXISTS(FN) \ ++ CUDA_CALL_PREFIX FN ++ + static const char * + cuda_error (CUresult r) + { +-#if CUDA_VERSION < 7000 +- /* Specified in documentation and present in library from at least +- 5.5. Not declared in header file prior to 7.0. */ +- extern CUresult cuGetErrorString (CUresult, const char **); +-#endif ++ const char *fallback = "unknown cuda error"; + const char *desc; + ++ if (!CUDA_CALL_EXISTS (cuGetErrorString)) ++ return fallback; ++ + r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc); +- if (r != CUDA_SUCCESS) +- desc = "unknown cuda error"; ++ if (r == CUDA_SUCCESS) ++ return desc; + +- return desc; ++ return fallback; + } + + static unsigned int instantiated_devices = 0; + static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; + ++struct cuda_map ++{ ++ CUdeviceptr d; ++ size_t size; ++ bool active; ++ struct cuda_map *next; ++}; ++ + struct ptx_stream + { + CUstream stream; + pthread_t host_thread; + bool multithreaded; +- +- CUdeviceptr d; +- void *h; +- void *h_begin; +- void *h_end; +- void *h_next; +- void *h_prev; +- void *h_tail; +- ++ struct cuda_map *map; + struct ptx_stream *next; + }; + +@@ -216,12 +217,64 @@ struct nvptx_thread + struct ptx_device *ptx_dev; + }; + +-struct map ++static struct cuda_map * ++cuda_map_create (size_t size) + { +- int async; +- size_t size; +- char mappings[0]; +-}; ++ struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map)); ++ ++ assert (map); ++ ++ map->next = NULL; ++ map->size = size; ++ map->active = false; ++ ++ CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size); ++ assert (map->d); ++ ++ return map; ++} ++ ++static void ++cuda_map_destroy (struct cuda_map *map) ++{ ++ if (map->active) ++ /* Possible reasons for the map to be still active: ++ - the associated async kernel might still be running. ++ - the associated async kernel might have finished, but the ++ corresponding event that should trigger the pop_map has not been ++ processed by event_gc. ++ - the associated sync kernel might have aborted ++ ++ The async cases could happen if the user specified an async region ++ without adding a corresponding wait that is guaranteed to be executed ++ (before returning from main, or in an atexit handler). ++ We do not want to deallocate a device pointer that is still being ++ used, so skip it. ++ ++ In the sync case, the device pointer is no longer used, but deallocating ++ it using cuMemFree will not succeed, so skip it. 
++ ++ TODO: Handle this in a more constructive way, by f.i. waiting for streams ++ to finish before de-allocating them (PR88981), or by ensuring the CUDA ++ lib atexit handler is called before rather than after the libgomp plugin ++ atexit handler (PR83795). */ ++ ; ++ else ++ CUDA_CALL_NOCHECK (cuMemFree, map->d); ++ ++ free (map); ++} ++ ++/* The following map_* routines manage the CUDA device memory that ++ contains the data mapping arguments for cuLaunchKernel. Each ++ asynchronous PTX stream may have multiple pending kernel ++ invocations, which are launched in a FIFO order. As such, the map ++ routines maintains a queue of cuLaunchKernel arguments. ++ ++ Calls to map_push and map_pop must be guarded by ptx_event_lock. ++ Likewise, calls to map_init and map_fini are guarded by ++ ptx_dev_lock inside GOMP_OFFLOAD_init_device and ++ GOMP_OFFLOAD_fini_device, respectively. */ + + static bool + map_init (struct ptx_stream *s) +@@ -229,109 +282,83 @@ map_init (struct ptx_stream *s) + int size = getpagesize (); + + assert (s); +- assert (!s->d); +- assert (!s->h); +- +- CUDA_CALL (cuMemAllocHost, &s->h, size); +- CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0); + +- assert (s->h); ++ s->map = cuda_map_create (size); + +- s->h_begin = s->h; +- s->h_end = s->h_begin + size; +- s->h_next = s->h_prev = s->h_tail = s->h_begin; +- +- assert (s->h_next); +- assert (s->h_end); + return true; + } + + static bool + map_fini (struct ptx_stream *s) + { +- CUDA_CALL (cuMemFreeHost, s->h); ++ assert (s->map->next == NULL); ++ ++ cuda_map_destroy (s->map); ++ + return true; + } + + static void + map_pop (struct ptx_stream *s) + { +- struct map *m; ++ struct cuda_map *next; + + assert (s != NULL); +- assert (s->h_next); +- assert (s->h_prev); +- assert (s->h_tail); +- +- m = s->h_tail; +- +- s->h_tail += m->size; +- +- if (s->h_tail >= s->h_end) +- s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); +- +- if (s->h_next == s->h_tail) +- s->h_prev = s->h_next; + +- assert (s->h_next >= s->h_begin); +- assert (s->h_tail >= s->h_begin); +- assert (s->h_prev >= s->h_begin); ++ if (s->map->next == NULL) ++ { ++ s->map->active = false; ++ return; ++ } + +- assert (s->h_next <= s->h_end); +- assert (s->h_tail <= s->h_end); +- assert (s->h_prev <= s->h_end); ++ next = s->map->next; ++ cuda_map_destroy (s->map); ++ s->map = next; + } + +-static void +-map_push (struct ptx_stream *s, int async, size_t size, void **h, void **d) ++static CUdeviceptr ++map_push (struct ptx_stream *s, size_t size) + { +- int left; +- int offset; +- struct map *m; ++ struct cuda_map *map = NULL; ++ struct cuda_map **t; + +- assert (s != NULL); +- +- left = s->h_end - s->h_next; +- size += sizeof (struct map); +- +- assert (s->h_prev); +- assert (s->h_next); ++ assert (s); ++ assert (s->map); + +- if (size >= left) ++ /* Select an element to push. */ ++ if (s->map->active) ++ map = cuda_map_create (size); ++ else + { +- m = s->h_prev; +- m->size += left; +- s->h_next = s->h_begin; +- +- if (s->h_next + size > s->h_end) +- GOMP_PLUGIN_fatal ("unable to push map"); +- } +- +- assert (s->h_next); +- +- m = s->h_next; +- m->async = async; +- m->size = size; ++ /* Pop the inactive front element. 
*/ ++ struct cuda_map *pop = s->map; ++ s->map = pop->next; ++ pop->next = NULL; + +- offset = (void *)&m->mappings[0] - s->h; ++ if (pop->size < size) ++ { ++ cuda_map_destroy (pop); + +- *d = (void *)(s->d + offset); +- *h = (void *)(s->h + offset); ++ map = cuda_map_create (size); ++ } ++ else ++ map = pop; ++ } + +- s->h_prev = s->h_next; +- s->h_next += size; ++ /* Check that the element is as expected. */ ++ assert (map->next == NULL); ++ assert (!map->active); + +- assert (s->h_prev); +- assert (s->h_next); ++ /* Mark the element active. */ ++ map->active = true; + +- assert (s->h_next >= s->h_begin); +- assert (s->h_tail >= s->h_begin); +- assert (s->h_prev >= s->h_begin); +- assert (s->h_next <= s->h_end); +- assert (s->h_tail <= s->h_end); +- assert (s->h_prev <= s->h_end); ++ /* Push the element to the back of the list. */ ++ for (t = &s->map; (*t) != NULL; t = &(*t)->next) ++ ; ++ assert (t != NULL && *t == NULL); ++ *t = map; + +- return; ++ return map->d; + } + + /* Target data function launch information. */ +@@ -411,6 +438,10 @@ struct ptx_device + int num_sms; + int regs_per_block; + int regs_per_sm; ++ int warp_size; ++ int max_threads_per_block; ++ int max_threads_per_multiprocessor; ++ int default_dims[GOMP_DIM_MAX]; + + struct ptx_image_data *images; /* Images loaded on device. */ + pthread_mutex_t image_lock; /* Lock for above list. */ +@@ -458,8 +489,6 @@ init_streams_for_device (struct ptx_devi + null_stream->stream = NULL; + null_stream->host_thread = pthread_self (); + null_stream->multithreaded = true; +- null_stream->d = (CUdeviceptr) NULL; +- null_stream->h = NULL; + if (!map_init (null_stream)) + return false; + +@@ -594,8 +623,6 @@ select_stream_for_async (int async, pthr + s->host_thread = thread; + s->multithreaded = false; + +- s->d = (CUdeviceptr) NULL; +- s->h = NULL; + if (!map_init (s)) + { + pthread_mutex_unlock (&ptx_dev->stream_lock); +@@ -777,9 +804,11 @@ nvptx_open_device (int n) + &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev); + ptx_dev->regs_per_block = pi; + +- /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only ++ /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only + in CUDA 6.0 and newer. */ +- r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, 82, dev); ++ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, ++ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR, ++ dev); + /* Fallback: use limit of registers per block, which is usually equal. 
*/ + if (r == CUDA_ERROR_INVALID_VALUE) + pi = ptx_dev->regs_per_block; +@@ -797,12 +826,24 @@ nvptx_open_device (int n) + GOMP_PLUGIN_error ("Only warp size 32 is supported"); + return NULL; + } ++ ptx_dev->warp_size = pi; ++ ++ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi, ++ CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev); ++ ptx_dev->max_threads_per_block = pi; ++ ++ CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi, ++ CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev); ++ ptx_dev->max_threads_per_multiprocessor = pi; + + r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &async_engines, + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev); + if (r != CUDA_SUCCESS) + async_engines = 1; + ++ for (int i = 0; i != GOMP_DIM_MAX; i++) ++ ptx_dev->default_dims[i] = 0; ++ + ptx_dev->images = NULL; + pthread_mutex_init (&ptx_dev->image_lock, NULL); + +@@ -876,12 +917,42 @@ notify_var (const char *var_name, const + GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var); + } + ++static void ++process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o) ++{ ++ const char *var_name = "GOMP_NVPTX_JIT"; ++ const char *env_var = secure_getenv (var_name); ++ notify_var (var_name, env_var); ++ ++ if (env_var == NULL) ++ return; ++ ++ const char *c = env_var; ++ while (*c != '\0') ++ { ++ while (*c == ' ') ++ c++; ++ ++ if (c[0] == '-' && c[1] == 'O' ++ && '0' <= c[2] && c[2] <= '4' ++ && (c[3] == '\0' || c[3] == ' ')) ++ { ++ *gomp_nvptx_o = c[2] - '0'; ++ c += 3; ++ continue; ++ } ++ ++ GOMP_PLUGIN_error ("Error parsing %s", var_name); ++ break; ++ } ++} ++ + static bool + link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs, + unsigned num_objs) + { +- CUjit_option opts[6]; +- void *optvals[6]; ++ CUjit_option opts[7]; ++ void *optvals[7]; + float elapsed = 0.0; + char elog[1024]; + char ilog[16384]; +@@ -908,16 +979,41 @@ link_ptx (CUmodule *module, const struct + opts[5] = CU_JIT_LOG_VERBOSE; + optvals[5] = (void *) 1; + +- CUDA_CALL (cuLinkCreate, 6, opts, optvals, &linkstate); ++ static intptr_t gomp_nvptx_o = -1; ++ ++ static bool init_done = false; ++ if (!init_done) ++ { ++ process_GOMP_NVPTX_JIT (&gomp_nvptx_o); ++ init_done = true; ++ } ++ ++ int nopts = 6; ++ if (gomp_nvptx_o != -1) ++ { ++ opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL; ++ optvals[nopts] = (void *) gomp_nvptx_o; ++ nopts++; ++ } ++ ++ if (CUDA_CALL_EXISTS (cuLinkCreate_v2)) ++ CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate); ++ else ++ CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate); + + for (; num_objs--; ptx_objs++) + { + /* cuLinkAddData's 'data' argument erroneously omits the const + qualifier. 
*/ + GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code); +- r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX, +- (char *) ptx_objs->code, ptx_objs->size, +- 0, 0, 0, 0); ++ if (CUDA_CALL_EXISTS (cuLinkAddData_v2)) ++ r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX, ++ (char *) ptx_objs->code, ptx_objs->size, ++ 0, 0, 0, 0); ++ else ++ r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX, ++ (char *) ptx_objs->code, ptx_objs->size, ++ 0, 0, 0, 0); + if (r != CUDA_SUCCESS) + { + GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]); +@@ -1067,8 +1163,10 @@ nvptx_exec (void (*fn), size_t mapnum, v + int i; + struct ptx_stream *dev_str; + void *kargs[1]; +- void *hp, *dp; ++ void *hp; ++ CUdeviceptr dp = 0; + struct nvptx_thread *nvthd = nvptx_thread (); ++ int warp_size = nvthd->ptx_dev->warp_size; + const char *maybe_abort_msg = "(perhaps abort was called)"; + + function = targ_fn->fn; +@@ -1090,68 +1188,36 @@ nvptx_exec (void (*fn), size_t mapnum, v + + if (seen_zero) + { +- /* See if the user provided GOMP_OPENACC_DIM environment +- variable to specify runtime defaults. */ +- static int default_dims[GOMP_DIM_MAX]; +- + pthread_mutex_lock (&ptx_dev_lock); +- if (!default_dims[0]) +- { +- const char *var_name = "GOMP_OPENACC_DIM"; +- /* We only read the environment variable once. You can't +- change it in the middle of execution. The syntax is +- the same as for the -fopenacc-dim compilation option. */ +- const char *env_var = getenv (var_name); +- notify_var (var_name, env_var); +- if (env_var) +- { +- const char *pos = env_var; + +- for (i = 0; *pos && i != GOMP_DIM_MAX; i++) +- { +- if (i && *pos++ != ':') +- break; +- if (*pos != ':') +- { +- const char *eptr; +- +- errno = 0; +- long val = strtol (pos, (char **)&eptr, 10); +- if (errno || val < 0 || (unsigned)val != val) +- break; +- default_dims[i] = (int)val; +- pos = eptr; +- } +- } +- } ++ static int gomp_openacc_dims[GOMP_DIM_MAX]; ++ if (!gomp_openacc_dims[0]) ++ { ++ /* See if the user provided GOMP_OPENACC_DIM environment ++ variable to specify runtime defaults. */ ++ for (int i = 0; i < GOMP_DIM_MAX; ++i) ++ gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i); ++ } + +- int warp_size, block_size, dev_size, cpu_size; +- CUdevice dev = nvptx_thread()->ptx_dev->dev; +- /* 32 is the default for known hardware. 
*/ +- int gang = 0, worker = 32, vector = 32; +- CUdevice_attribute cu_tpb, cu_ws, cu_mpc, cu_tpm; +- +- cu_tpb = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK; +- cu_ws = CU_DEVICE_ATTRIBUTE_WARP_SIZE; +- cu_mpc = CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT; +- cu_tpm = CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR; +- +- if (CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &block_size, cu_tpb, +- dev) == CUDA_SUCCESS +- && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &warp_size, cu_ws, +- dev) == CUDA_SUCCESS +- && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &dev_size, cu_mpc, +- dev) == CUDA_SUCCESS +- && CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &cpu_size, cu_tpm, +- dev) == CUDA_SUCCESS) +- { +- GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d," +- " dev_size=%d, cpu_size=%d\n", +- warp_size, block_size, dev_size, cpu_size); +- gang = (cpu_size / block_size) * dev_size; +- worker = block_size / warp_size; +- vector = warp_size; +- } ++ if (!nvthd->ptx_dev->default_dims[0]) ++ { ++ int default_dims[GOMP_DIM_MAX]; ++ for (int i = 0; i < GOMP_DIM_MAX; ++i) ++ default_dims[i] = gomp_openacc_dims[i]; ++ ++ int gang, worker, vector; ++ { ++ int block_size = nvthd->ptx_dev->max_threads_per_block; ++ int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor; ++ int dev_size = nvthd->ptx_dev->num_sms; ++ GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d," ++ " dev_size=%d, cpu_size=%d\n", ++ warp_size, block_size, dev_size, cpu_size); ++ ++ gang = (cpu_size / block_size) * dev_size; ++ worker = block_size / warp_size; ++ vector = warp_size; ++ } + + /* There is no upper bound on the gang size. The best size + matches the hardware configuration. Logical gangs are +@@ -1172,29 +1238,150 @@ nvptx_exec (void (*fn), size_t mapnum, v + default_dims[GOMP_DIM_GANG], + default_dims[GOMP_DIM_WORKER], + default_dims[GOMP_DIM_VECTOR]); ++ ++ for (i = 0; i != GOMP_DIM_MAX; i++) ++ nvthd->ptx_dev->default_dims[i] = default_dims[i]; + } + pthread_mutex_unlock (&ptx_dev_lock); + +- for (i = 0; i != GOMP_DIM_MAX; i++) +- if (!dims[i]) +- dims[i] = default_dims[i]; +- } +- +- /* This reserves a chunk of a pre-allocated page of memory mapped on both +- the host and the device. HP is a host pointer to the new chunk, and DP is +- the corresponding device pointer. */ +- map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp); +- +- GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); +- +- /* Copy the array of arguments to the mapped page. */ +- for (i = 0; i < mapnum; i++) +- ((void **) hp)[i] = devaddrs[i]; +- +- /* Copy the (device) pointers to arguments to the device (dp and hp might in +- fact have the same value on a unified-memory system). 
*/ +- CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp, +- mapnum * sizeof (void *)); ++ { ++ bool default_dim_p[GOMP_DIM_MAX]; ++ for (i = 0; i != GOMP_DIM_MAX; i++) ++ default_dim_p[i] = !dims[i]; ++ ++ if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize)) ++ { ++ for (i = 0; i != GOMP_DIM_MAX; i++) ++ if (default_dim_p[i]) ++ dims[i] = nvthd->ptx_dev->default_dims[i]; ++ ++ if (default_dim_p[GOMP_DIM_VECTOR]) ++ dims[GOMP_DIM_VECTOR] ++ = MIN (dims[GOMP_DIM_VECTOR], ++ (targ_fn->max_threads_per_block / warp_size ++ * warp_size)); ++ ++ if (default_dim_p[GOMP_DIM_WORKER]) ++ dims[GOMP_DIM_WORKER] ++ = MIN (dims[GOMP_DIM_WORKER], ++ targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]); ++ } ++ else ++ { ++ /* Handle the case that the compiler allows the runtime to choose ++ the vector-length conservatively, by ignoring ++ gomp_openacc_dims[GOMP_DIM_VECTOR]. TODO: actually handle ++ it. */ ++ int vectors = 0; ++ /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that that ++ gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not ++ exceed targ_fn->max_threads_per_block. */ ++ int workers = gomp_openacc_dims[GOMP_DIM_WORKER]; ++ int gangs = gomp_openacc_dims[GOMP_DIM_GANG]; ++ int grids, blocks; ++ ++ CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids, ++ &blocks, function, NULL, 0, ++ dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]); ++ GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: " ++ "grid = %d, block = %d\n", grids, blocks); ++ ++ /* Keep the num_gangs proportional to the block size. In ++ the case were a block size is limited by shared-memory ++ or the register file capacity, the runtime will not ++ excessively over assign gangs to the multiprocessor ++ units if their state is going to be swapped out even ++ more than necessary. The constant factor 2 is there to ++ prevent threads from idling when there is insufficient ++ work for them. */ ++ if (gangs == 0) ++ gangs = 2 * grids * (blocks / warp_size); ++ ++ if (vectors == 0) ++ vectors = warp_size; ++ ++ if (workers == 0) ++ { ++ int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR] ++ ? vectors ++ : dims[GOMP_DIM_VECTOR]); ++ workers = blocks / actual_vectors; ++ workers = MAX (workers, 1); ++ /* If we need a per-worker barrier ... . */ ++ if (actual_vectors > 32) ++ /* Don't use more barriers than available. */ ++ workers = MIN (workers, 15); ++ } ++ ++ for (i = 0; i != GOMP_DIM_MAX; i++) ++ if (default_dim_p[i]) ++ switch (i) ++ { ++ case GOMP_DIM_GANG: dims[i] = gangs; break; ++ case GOMP_DIM_WORKER: dims[i] = workers; break; ++ case GOMP_DIM_VECTOR: dims[i] = vectors; break; ++ default: GOMP_PLUGIN_fatal ("invalid dim"); ++ } ++ } ++ } ++ } ++ ++ /* Check if the accelerator has sufficient hardware resources to ++ launch the offloaded kernel. */ ++ if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] ++ > targ_fn->max_threads_per_block) ++ { ++ const char *msg ++ = ("The Nvidia accelerator has insufficient resources to launch '%s'" ++ " with num_workers = %d and vector_length = %d" ++ "; " ++ "recompile the program with 'num_workers = x and vector_length = y'" ++ " on that offloaded region or '-fopenacc-dim=:x:y' where" ++ " x * y <= %d" ++ ".\n"); ++ GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER], ++ dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block); ++ } ++ ++ /* Check if the accelerator has sufficient barrier resources to ++ launch the offloaded kernel. 
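++     PTX exposes 16 named barriers per thread block; the check below
++     assumes one of them stays reserved, leaving 15 for per-worker
++     barriers, which only become necessary once vector_length exceeds
++     the 32-lane warp size.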
*/ ++ if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32) ++ { ++ const char *msg ++ = ("The Nvidia accelerator has insufficient barrier resources to launch" ++ " '%s' with num_workers = %d and vector_length = %d" ++ "; " ++ "recompile the program with 'num_workers = x' on that offloaded" ++ " region or '-fopenacc-dim=:x:' where x <= 15" ++ "; " ++ "or, recompile the program with 'vector_length = 32' on that" ++ " offloaded region or '-fopenacc-dim=::32'" ++ ".\n"); ++ GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER], ++ dims[GOMP_DIM_VECTOR]); ++ } ++ ++ if (mapnum > 0) ++ { ++ /* This reserves a chunk of a pre-allocated page of memory mapped on both ++ the host and the device. HP is a host pointer to the new chunk, and DP is ++ the corresponding device pointer. */ ++ pthread_mutex_lock (&ptx_event_lock); ++ dp = map_push (dev_str, mapnum * sizeof (void *)); ++ pthread_mutex_unlock (&ptx_event_lock); ++ ++ GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); ++ ++ /* Copy the array of arguments to the mapped page. */ ++ hp = alloca(sizeof(void *) * mapnum); ++ for (i = 0; i < mapnum; i++) ++ ((void **) hp)[i] = devaddrs[i]; ++ ++ /* Copy the (device) pointers to arguments to the device */ ++ CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, ++ mapnum * sizeof (void *)); ++ } ++ + GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" + " gangs=%u, workers=%u, vectors=%u\n", + __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG], +@@ -1239,7 +1426,8 @@ nvptx_exec (void (*fn), size_t mapnum, v + + CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream); + +- event_add (PTX_EVT_KNL, e, (void *)dev_str, 0); ++ if (mapnum > 0) ++ event_add (PTX_EVT_KNL, e, (void *)dev_str, 0); + } + #else + r = CUDA_CALL_NOCHECK (cuCtxSynchronize, ); +@@ -1256,7 +1444,10 @@ nvptx_exec (void (*fn), size_t mapnum, v + #ifndef DISABLE_ASYNC + if (async < acc_async_noval) + #endif +- map_pop (dev_str); ++ { ++ if (mapnum > 0) ++ map_pop (dev_str); ++ } + } + + void * openacc_get_current_cuda_context (void); +@@ -1415,9 +1606,8 @@ nvptx_async_test (int async) + struct ptx_stream *s; + + s = select_stream_for_async (async, pthread_self (), false, NULL); +- + if (!s) +- GOMP_PLUGIN_fatal ("unknown async %d", async); ++ return 1; + + r = CUDA_CALL_NOCHECK (cuStreamQuery, s->stream); + if (r == CUDA_SUCCESS) +@@ -1472,7 +1662,7 @@ nvptx_wait (int async) + + s = select_stream_for_async (async, pthread_self (), false, NULL); + if (!s) +- GOMP_PLUGIN_fatal ("unknown async %d", async); ++ return; + + CUDA_CALL_ASSERT (cuStreamSynchronize, s->stream); + +@@ -1486,16 +1676,17 @@ nvptx_wait_async (int async1, int async2 + struct ptx_stream *s1, *s2; + pthread_t self = pthread_self (); + ++ s1 = select_stream_for_async (async1, self, false, NULL); ++ if (!s1) ++ return; ++ + /* The stream that is waiting (rather than being waited for) doesn't + necessarily have to exist already. */ + s2 = select_stream_for_async (async2, self, true, NULL); + +- s1 = select_stream_for_async (async1, self, false, NULL); +- if (!s1) +- GOMP_PLUGIN_fatal ("invalid async 1\n"); +- ++ /* A stream is always synchronized with itself. 
*/ + if (s1 == s2) +- GOMP_PLUGIN_fatal ("identical parameters"); ++ return; + + e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent)); + +@@ -1629,8 +1820,14 @@ nvptx_set_cuda_stream (int async, void * + pthread_t self = pthread_self (); + struct nvptx_thread *nvthd = nvptx_thread (); + +- if (async < 0) +- GOMP_PLUGIN_fatal ("bad async %d", async); ++ /* Due to the "null_stream" usage for "acc_async_sync", this cannot be used ++ to change the stream handle associated with "acc_async_sync". */ ++ if (async == acc_async_sync) ++ { ++ GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated" ++ " with \"acc_async_sync\"\n"); ++ return 0; ++ } + + pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); + +@@ -1739,6 +1936,12 @@ GOMP_OFFLOAD_fini_device (int n) + instantiated_devices--; + } + ++ if (instantiated_devices == 0) ++ { ++ free (ptx_devices); ++ ptx_devices = NULL; ++ } ++ + pthread_mutex_unlock (&ptx_dev_lock); + return true; + } +--- libgomp/plugin/configfrag.ac.jj 2018-04-25 09:40:31.914655581 +0200 ++++ libgomp/plugin/configfrag.ac 2019-05-07 18:46:36.533109624 +0200 +@@ -26,8 +26,6 @@ + # see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + # . + +-offload_targets= +-AC_SUBST(offload_targets) + plugin_support=yes + AC_CHECK_LIB(dl, dlsym, , [plugin_support=no]) + if test x"$plugin_support" = xyes; then +@@ -59,7 +57,11 @@ AC_ARG_WITH(cuda-driver-lib, + [AS_HELP_STRING([--with-cuda-driver-lib=PATH], + [specify directory for the installed CUDA driver library])]) + case "x$with_cuda_driver" in +- x | xno) ;; ++ x) ;; ++ xno) ++ CUDA_DRIVER_INCLUDE=no ++ CUDA_DRIVER_LIB=no ++ ;; + *) CUDA_DRIVER_INCLUDE=$with_cuda_driver/include + CUDA_DRIVER_LIB=$with_cuda_driver/lib + ;; +@@ -70,10 +72,12 @@ fi + if test "x$with_cuda_driver_lib" != x; then + CUDA_DRIVER_LIB=$with_cuda_driver_lib + fi +-if test "x$CUDA_DRIVER_INCLUDE" != x; then ++if test "x$CUDA_DRIVER_INCLUDE" != x \ ++ && test "x$CUDA_DRIVER_INCLUDE" != xno; then + CUDA_DRIVER_CPPFLAGS=-I$CUDA_DRIVER_INCLUDE + fi +-if test "x$CUDA_DRIVER_LIB" != x; then ++if test "x$CUDA_DRIVER_LIB" != x \ ++ && test "x$CUDA_DRIVER_LIB" != xno; then + CUDA_DRIVER_LDFLAGS=-L$CUDA_DRIVER_LIB + fi + +@@ -133,7 +137,13 @@ AC_SUBST(PLUGIN_HSA_CPPFLAGS) + AC_SUBST(PLUGIN_HSA_LDFLAGS) + AC_SUBST(PLUGIN_HSA_LIBS) + +-# Get offload targets and path to install tree of offloading compiler. ++# Parse '--enable-offload-targets', figure out the corresponding libgomp ++# plugins, and configure to find the corresponding offload compilers. ++# 'offload_plugins' and 'offload_targets' will be populated in the same order. 
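
The comment above is where the offload_plugins/offload_targets split starts
paying off: offload_plugins feeds the OFFLOAD_PLUGINS define at the bottom of
this file, and libgomp's target.c (not part of this hunk) walks that
comma-separated list to load one plugin DSO per entry.  A standalone sketch of
such a walk; the two-entry list and the exact DSO naming are illustrative
assumptions:

#include <stdio.h>
#include <string.h>

/* Stand-in for the configure-generated OFFLOAD_PLUGINS define.  */
#define OFFLOAD_PLUGINS "nvptx,hsa"

int
main (void)
{
  const char *cur = OFFLOAD_PLUGINS;
  while (*cur)
    {
      const char *next = strchr (cur, ',');
      size_t len = next ? (size_t) (next - cur) : strlen (cur);
      /* libgomp derives the plugin DSO name from each entry.  */
      printf ("would load: libgomp-plugin-%.*s.so.1\n", (int) len, cur);
      cur += len + (next != NULL);
    }
  return 0;
}
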
++offload_plugins= ++offload_targets= ++AC_SUBST(offload_plugins) ++AC_SUBST(offload_targets) + offload_additional_options= + offload_additional_lib_paths= + AC_SUBST(offload_additional_options) +@@ -142,36 +152,41 @@ if test x"$enable_offload_targets" != x; + for tgt in `echo $enable_offload_targets | sed -e 's#,# #g'`; do + tgt_dir=`echo $tgt | grep '=' | sed 's/.*=//'` + tgt=`echo $tgt | sed 's/=.*//'` +- tgt_name= ++ tgt_plugin= + case $tgt in + *-intelmic-* | *-intelmicemul-*) +- tgt_name=intelmic ++ tgt_plugin=intelmic + ;; + nvptx*) +- tgt_name=nvptx ++ tgt_plugin=nvptx + PLUGIN_NVPTX=$tgt +- PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS +- PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS +- PLUGIN_NVPTX_LIBS='-lcuda' +- +- PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS +- CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" +- PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS +- LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" +- PLUGIN_NVPTX_save_LIBS=$LIBS +- LIBS="$PLUGIN_NVPTX_LIBS $LIBS" +- AC_LINK_IFELSE( +- [AC_LANG_PROGRAM( +- [#include "cuda.h"], +- [CUresult r = cuCtxPushCurrent (NULL);])], +- [PLUGIN_NVPTX=1]) +- CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS +- LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS +- LIBS=$PLUGIN_NVPTX_save_LIBS ++ if test "x$CUDA_DRIVER_LIB" != xno \ ++ && test "x$CUDA_DRIVER_LIB" != xno; then ++ PLUGIN_NVPTX_CPPFLAGS=$CUDA_DRIVER_CPPFLAGS ++ PLUGIN_NVPTX_LDFLAGS=$CUDA_DRIVER_LDFLAGS ++ PLUGIN_NVPTX_LIBS='-lcuda' ++ ++ PLUGIN_NVPTX_save_CPPFLAGS=$CPPFLAGS ++ CPPFLAGS="$PLUGIN_NVPTX_CPPFLAGS $CPPFLAGS" ++ PLUGIN_NVPTX_save_LDFLAGS=$LDFLAGS ++ LDFLAGS="$PLUGIN_NVPTX_LDFLAGS $LDFLAGS" ++ PLUGIN_NVPTX_save_LIBS=$LIBS ++ LIBS="$PLUGIN_NVPTX_LIBS $LIBS" ++ AC_LINK_IFELSE( ++ [AC_LANG_PROGRAM( ++ [#include "cuda.h"], ++ [CUresult r = cuCtxPushCurrent (NULL);])], ++ [PLUGIN_NVPTX=1]) ++ CPPFLAGS=$PLUGIN_NVPTX_save_CPPFLAGS ++ LDFLAGS=$PLUGIN_NVPTX_save_LDFLAGS ++ LIBS=$PLUGIN_NVPTX_save_LIBS ++ fi + case $PLUGIN_NVPTX in + nvptx*) +- if test "x$CUDA_DRIVER_INCLUDE" = x \ +- && test "x$CUDA_DRIVER_LIB" = x; then ++ if (test "x$CUDA_DRIVER_INCLUDE" = x \ ++ || test "x$CUDA_DRIVER_INCLUDE" = xno) \ ++ && (test "x$CUDA_DRIVER_LIB" = x \ ++ || test "x$CUDA_DRIVER_LIB" = xno); then + PLUGIN_NVPTX=1 + PLUGIN_NVPTX_CPPFLAGS='-I$(srcdir)/plugin/cuda' + PLUGIN_NVPTX_LIBS='-ldl' +@@ -191,7 +206,7 @@ if test x"$enable_offload_targets" != x; + PLUGIN_HSA=0 + ;; + *) +- tgt_name=hsa ++ tgt_plugin=hsa + PLUGIN_HSA=$tgt + PLUGIN_HSA_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS + PLUGIN_HSA_LDFLAGS="$HSA_RUNTIME_LDFLAGS" +@@ -209,7 +224,7 @@ if test x"$enable_offload_targets" != x; + LDFLAGS=$PLUGIN_HSA_save_LDFLAGS + LIBS=$PLUGIN_HSA_save_LIBS + case $PLUGIN_HSA in +- hsa*) ++ hsa*) + HSA_PLUGIN=0 + AC_MSG_ERROR([HSA run-time package required for HSA support]) + ;; +@@ -226,16 +241,19 @@ if test x"$enable_offload_targets" != x; + AC_MSG_ERROR([unknown offload target specified]) + ;; + esac +- if test x"$tgt_name" = x; then +- # Don't configure libgomp for this offloading target if we don't build +- # the corresponding plugin. ++ if test x"$tgt_plugin" = x; then ++ # Not configuring libgomp for this offload target if we're not building ++ # the corresponding offload plugin. 
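
Back in the OpenACC runtime, the nvptx_set_cuda_stream and async_valid_p
changes earlier in this patch turn two formerly fatal misuses into soft
failures.  A hedged usage sketch; the async queue number is illustrative, and
the returned handle may be NULL if the queue has not been created yet:

#include <openacc.h>
#include <stdio.h>

int
main (void)
{
  acc_init (acc_device_default);

  /* Interop: the CUstream libgomp associates with async queue 1;
     NULL on non-CUDA devices or if the queue does not exist yet.  */
  void *stream = acc_get_cuda_stream (1);
  printf ("CUDA stream for async queue 1: %p\n", stream);

  /* Rebinding the stream tied to acc_async_sync is now refused
     (returns 0) instead of aborting the program.  */
  if (!acc_set_cuda_stream (acc_async_sync, stream))
    printf ("acc_set_cuda_stream (acc_async_sync, ...) refused\n");

  acc_shutdown (acc_device_default);
  return 0;
}
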
+ continue +- elif test x"$offload_targets" = x; then +- offload_targets=$tgt_name ++ elif test x"$offload_plugins" = x; then ++ offload_plugins=$tgt_plugin ++ offload_targets=$tgt + else +- offload_targets=$offload_targets,$tgt_name ++ offload_plugins=$offload_plugins,$tgt_plugin ++ offload_targets=$offload_targets,$tgt + fi +- if test "$tgt_name" = hsa; then ++ # Configure additional search paths. ++ if test "$tgt_plugin" = hsa; then + # Offloading compilation is all handled by the target compiler. + : + elif test x"$tgt_dir" != x; then +@@ -247,8 +265,8 @@ if test x"$enable_offload_targets" != x; + fi + done + fi +-AC_DEFINE_UNQUOTED(OFFLOAD_TARGETS, "$offload_targets", +- [Define to offload targets, separated by commas.]) ++AC_DEFINE_UNQUOTED(OFFLOAD_PLUGINS, "$offload_plugins", ++ [Define to offload plugins, separated by commas.]) + AM_CONDITIONAL([PLUGIN_NVPTX], [test $PLUGIN_NVPTX = 1]) + AC_DEFINE_UNQUOTED([PLUGIN_NVPTX], [$PLUGIN_NVPTX], + [Define to 1 if the NVIDIA plugin is built, 0 if not.]) +--- libgomp/affinity-fmt.c.jj 2019-05-07 18:46:36.285113585 +0200 ++++ libgomp/affinity-fmt.c 2019-05-07 18:46:36.285113585 +0200 +@@ -0,0 +1,495 @@ ++/* Copyright (C) 2018-2019 Free Software Foundation, Inc. ++ Contributed by Jakub Jelinek . ++ ++ This file is part of the GNU Offloading and Multi Processing Library ++ (libgomp). ++ ++ Libgomp is free software; you can redistribute it and/or modify it ++ under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3, or (at your option) ++ any later version. ++ ++ Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY ++ WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS ++ FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++ more details. ++ ++ Under Section 7 of GPL version 3, you are granted additional ++ permissions described in the GCC Runtime Library Exception, version ++ 3.1, as published by the Free Software Foundation. ++ ++ You should have received a copy of the GNU General Public License and ++ a copy of the GCC Runtime Library Exception along with this program; ++ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see ++ . */ ++ ++#include "libgomp.h" ++#include ++#include ++#include ++#ifdef HAVE_UNISTD_H ++#include ++#endif ++#ifdef HAVE_INTTYPES_H ++# include /* For PRIx64. 
*/ ++#endif ++#ifdef HAVE_UNAME ++#include ++#endif ++ ++void ++gomp_print_string (const char *str, size_t len) ++{ ++ fwrite (str, 1, len, stderr); ++} ++ ++void ++gomp_set_affinity_format (const char *format, size_t len) ++{ ++ if (len < gomp_affinity_format_len) ++ memcpy (gomp_affinity_format_var, format, len); ++ else ++ { ++ char *p; ++ if (gomp_affinity_format_len) ++ p = gomp_realloc (gomp_affinity_format_var, len + 1); ++ else ++ p = gomp_malloc (len + 1); ++ memcpy (p, format, len); ++ gomp_affinity_format_var = p; ++ gomp_affinity_format_len = len + 1; ++ } ++ gomp_affinity_format_var[len] = '\0'; ++} ++ ++void ++omp_set_affinity_format (const char *format) ++{ ++ gomp_set_affinity_format (format, strlen (format)); ++} ++ ++size_t ++omp_get_affinity_format (char *buffer, size_t size) ++{ ++ size_t len = strlen (gomp_affinity_format_var); ++ if (size) ++ { ++ if (len < size) ++ memcpy (buffer, gomp_affinity_format_var, len + 1); ++ else ++ { ++ memcpy (buffer, gomp_affinity_format_var, size - 1); ++ buffer[size - 1] = '\0'; ++ } ++ } ++ return len; ++} ++ ++void ++gomp_display_string (char *buffer, size_t size, size_t *ret, ++ const char *str, size_t len) ++{ ++ size_t r = *ret; ++ if (size && r < size) ++ { ++ size_t l = len; ++ if (size - r < len) ++ l = size - r; ++ memcpy (buffer + r, str, l); ++ } ++ *ret += len; ++ if (__builtin_expect (r > *ret, 0)) ++ gomp_fatal ("overflow in omp_capture_affinity"); ++} ++ ++static void ++gomp_display_repeat (char *buffer, size_t size, size_t *ret, ++ char c, size_t len) ++{ ++ size_t r = *ret; ++ if (size && r < size) ++ { ++ size_t l = len; ++ if (size - r < len) ++ l = size - r; ++ memset (buffer + r, c, l); ++ } ++ *ret += len; ++ if (__builtin_expect (r > *ret, 0)) ++ gomp_fatal ("overflow in omp_capture_affinity"); ++} ++ ++static void ++gomp_display_num (char *buffer, size_t size, size_t *ret, ++ bool zero, bool right, size_t sz, char *buf) ++{ ++ size_t l = strlen (buf); ++ if (sz == (size_t) -1 || l >= sz) ++ { ++ gomp_display_string (buffer, size, ret, buf, l); ++ return; ++ } ++ if (zero) ++ { ++ if (buf[0] == '-') ++ gomp_display_string (buffer, size, ret, buf, 1); ++ else if (buf[0] == '0' && buf[1] == 'x') ++ gomp_display_string (buffer, size, ret, buf, 2); ++ gomp_display_repeat (buffer, size, ret, '0', sz - l); ++ if (buf[0] == '-') ++ gomp_display_string (buffer, size, ret, buf + 1, l - 1); ++ else if (buf[0] == '0' && buf[1] == 'x') ++ gomp_display_string (buffer, size, ret, buf + 2, l - 2); ++ else ++ gomp_display_string (buffer, size, ret, buf, l); ++ } ++ else if (right) ++ { ++ gomp_display_repeat (buffer, size, ret, ' ', sz - l); ++ gomp_display_string (buffer, size, ret, buf, l); ++ } ++ else ++ { ++ gomp_display_string (buffer, size, ret, buf, l); ++ gomp_display_repeat (buffer, size, ret, ' ', sz - l); ++ } ++} ++ ++static void ++gomp_display_int (char *buffer, size_t size, size_t *ret, ++ bool zero, bool right, size_t sz, int num) ++{ ++ char buf[3 * sizeof (int) + 2]; ++ sprintf (buf, "%d", num); ++ gomp_display_num (buffer, size, ret, zero, right, sz, buf); ++} ++ ++static void ++gomp_display_string_len (char *buffer, size_t size, size_t *ret, ++ bool right, size_t sz, char *str, size_t len) ++{ ++ if (sz == (size_t) -1 || len >= sz) ++ { ++ gomp_display_string (buffer, size, ret, str, len); ++ return; ++ } ++ ++ if (right) ++ { ++ gomp_display_repeat (buffer, size, ret, ' ', sz - len); ++ gomp_display_string (buffer, size, ret, str, len); ++ } ++ else ++ { ++ gomp_display_string (buffer, size, ret, str, len); 
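++      /* Pad with spaces on the right: the text is left-justified
++	 in the field.  */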
++ gomp_display_repeat (buffer, size, ret, ' ', sz - len); ++ } ++} ++ ++static void ++gomp_display_hostname (char *buffer, size_t size, size_t *ret, ++ bool right, size_t sz) ++{ ++#ifdef HAVE_GETHOSTNAME ++ { ++ char buf[256]; ++ char *b = buf; ++ size_t len = 256; ++ do ++ { ++ b[len - 1] = '\0'; ++ if (gethostname (b, len - 1) == 0) ++ { ++ size_t l = strlen (b); ++ if (l < len - 1) ++ { ++ gomp_display_string_len (buffer, size, ret, ++ right, sz, b, l); ++ if (b != buf) ++ free (b); ++ return; ++ } ++ } ++ if (len == 1048576) ++ break; ++ len = len * 2; ++ if (len == 512) ++ b = gomp_malloc (len); ++ else ++ b = gomp_realloc (b, len); ++ } ++ while (1); ++ if (b != buf) ++ free (b); ++ } ++#endif ++#ifdef HAVE_UNAME ++ { ++ struct utsname buf; ++ if (uname (&buf) == 0) ++ { ++ gomp_display_string_len (buffer, size, ret, right, sz, ++ buf.nodename, strlen (buf.nodename)); ++ return; ++ } ++ } ++#endif ++ gomp_display_string_len (buffer, size, ret, right, sz, "node", 4); ++} ++ ++struct affinity_types_struct { ++ char long_str[18]; ++ char long_len; ++ char short_c; }; ++ ++static struct affinity_types_struct affinity_types[] = ++{ ++#define AFFINITY_TYPE(l, s) \ ++ { #l, sizeof (#l) - 1, s } ++ AFFINITY_TYPE (team_num, 't'), ++ AFFINITY_TYPE (num_teams, 'T'), ++ AFFINITY_TYPE (nesting_level, 'L'), ++ AFFINITY_TYPE (thread_num, 'n'), ++ AFFINITY_TYPE (num_threads, 'N'), ++ AFFINITY_TYPE (ancestor_tnum, 'a'), ++ AFFINITY_TYPE (host, 'H'), ++ AFFINITY_TYPE (process_id, 'P'), ++ AFFINITY_TYPE (native_thread_id, 'i'), ++ AFFINITY_TYPE (thread_affinity, 'A') ++#undef AFFINITY_TYPE ++}; ++ ++size_t ++gomp_display_affinity (char *buffer, size_t size, ++ const char *format, gomp_thread_handle handle, ++ struct gomp_team_state *ts, unsigned int place) ++{ ++ size_t ret = 0; ++ do ++ { ++ const char *p = strchr (format, '%'); ++ bool zero = false; ++ bool right = false; ++ size_t sz = -1; ++ char c; ++ int val; ++ if (p == NULL) ++ p = strchr (format, '\0'); ++ if (p != format) ++ gomp_display_string (buffer, size, &ret, ++ format, p - format); ++ if (*p == '\0') ++ break; ++ p++; ++ if (*p == '%') ++ { ++ gomp_display_string (buffer, size, &ret, "%", 1); ++ format = p + 1; ++ continue; ++ } ++ if (*p == '0') ++ { ++ zero = true; ++ p++; ++ if (*p != '.') ++ gomp_fatal ("leading zero not followed by dot in affinity format"); ++ } ++ if (*p == '.') ++ { ++ right = true; ++ p++; ++ } ++ if (*p >= '1' && *p <= '9') ++ { ++ char *end; ++ sz = strtoul (p, &end, 10); ++ p = end; ++ } ++ else if (zero || right) ++ gomp_fatal ("leading zero or right justification in affinity format " ++ "requires size"); ++ c = *p; ++ if (c == '{') ++ { ++ int i; ++ for (i = 0; ++ i < sizeof (affinity_types) / sizeof (affinity_types[0]); ++i) ++ if (strncmp (p + 1, affinity_types[i].long_str, ++ affinity_types[i].long_len) == 0 ++ && p[affinity_types[i].long_len + 1] == '}') ++ { ++ c = affinity_types[i].short_c; ++ p += affinity_types[i].long_len + 1; ++ break; ++ } ++ if (c == '{') ++ { ++ char *q = strchr (p + 1, '}'); ++ if (q) ++ gomp_fatal ("unsupported long type name '%.*s' in affinity " ++ "format", (int) (q - (p + 1)), p + 1); ++ else ++ gomp_fatal ("unterminated long type name '%s' in affinity " ++ "format", p + 1); ++ } ++ } ++ switch (c) ++ { ++ case 't': ++ val = omp_get_team_num (); ++ goto do_int; ++ case 'T': ++ val = omp_get_num_teams (); ++ goto do_int; ++ case 'L': ++ val = ts->level; ++ goto do_int; ++ case 'n': ++ val = ts->team_id; ++ goto do_int; ++ case 'N': ++ val = ts->team ? 
ts->team->nthreads : 1; ++ goto do_int; ++ case 'a': ++ val = ts->team ? ts->team->prev_ts.team_id : -1; ++ goto do_int; ++ case 'H': ++ gomp_display_hostname (buffer, size, &ret, right, sz); ++ break; ++ case 'P': ++#ifdef HAVE_GETPID ++ val = getpid (); ++#else ++ val = 0; ++#endif ++ goto do_int; ++ case 'i': ++#if defined(LIBGOMP_USE_PTHREADS) && defined(__GNUC__) ++ { ++ char buf[3 * (sizeof (handle) + sizeof (uintptr_t) + sizeof (int)) ++ + 4]; ++ /* This macro returns expr unmodified for integral or pointer ++ types and 0 for anything else (e.g. aggregates). */ ++#define gomp_nonaggregate(expr) \ ++ __builtin_choose_expr (__builtin_classify_type (expr) == 1 \ ++ || __builtin_classify_type (expr) == 5, expr, 0) ++ /* This macro returns expr unmodified for integral types, ++ (uintptr_t) (expr) for pointer types and 0 for anything else ++ (e.g. aggregates). */ ++#define gomp_integral(expr) \ ++ __builtin_choose_expr (__builtin_classify_type (expr) == 5, \ ++ (uintptr_t) gomp_nonaggregate (expr), \ ++ gomp_nonaggregate (expr)) ++ ++ if (sizeof (gomp_integral (handle)) == sizeof (unsigned long)) ++ sprintf (buf, "0x%lx", (unsigned long) gomp_integral (handle)); ++#if defined (HAVE_INTTYPES_H) && defined (PRIx64) ++ else if (sizeof (gomp_integral (handle)) == sizeof (uint64_t)) ++ sprintf (buf, "0x%" PRIx64, (uint64_t) gomp_integral (handle)); ++#else ++ else if (sizeof (gomp_integral (handle)) ++ == sizeof (unsigned long long)) ++ sprintf (buf, "0x%llx", ++ (unsigned long long) gomp_integral (handle)); ++#endif ++ else ++ sprintf (buf, "0x%x", (unsigned int) gomp_integral (handle)); ++ gomp_display_num (buffer, size, &ret, zero, right, sz, buf); ++ break; ++ } ++#else ++ val = 0; ++ goto do_int; ++#endif ++ case 'A': ++ if (sz == (size_t) -1) ++ gomp_display_affinity_place (buffer, size, &ret, ++ place - 1); ++ else if (right) ++ { ++ size_t len = 0; ++ gomp_display_affinity_place (NULL, 0, &len, place - 1); ++ if (len < sz) ++ gomp_display_repeat (buffer, size, &ret, ' ', sz - len); ++ gomp_display_affinity_place (buffer, size, &ret, place - 1); ++ } ++ else ++ { ++ size_t start = ret; ++ gomp_display_affinity_place (buffer, size, &ret, place - 1); ++ if (ret - start < sz) ++ gomp_display_repeat (buffer, size, &ret, ' ', sz - (ret - start)); ++ } ++ break; ++ do_int: ++ gomp_display_int (buffer, size, &ret, zero, right, sz, val); ++ break; ++ default: ++ gomp_fatal ("unsupported type %c in affinity format", c); ++ } ++ format = p + 1; ++ } ++ while (1); ++ return ret; ++} ++ ++size_t ++omp_capture_affinity (char *buffer, size_t size, const char *format) ++{ ++ struct gomp_thread *thr = gomp_thread (); ++ size_t ret ++ = gomp_display_affinity (buffer, size, ++ format && *format ++ ? 
format : gomp_affinity_format_var, ++ gomp_thread_self (), &thr->ts, thr->place); ++ if (size) ++ { ++ if (ret >= size) ++ buffer[size - 1] = '\0'; ++ else ++ buffer[ret] = '\0'; ++ } ++ return ret; ++} ++ialias (omp_capture_affinity) ++ ++void ++omp_display_affinity (const char *format) ++{ ++ char buf[512]; ++ char *b; ++ size_t ret = ialias_call (omp_capture_affinity) (buf, sizeof buf, format); ++ if (ret < sizeof buf) ++ { ++ buf[ret] = '\n'; ++ gomp_print_string (buf, ret + 1); ++ return; ++ } ++ b = gomp_malloc (ret + 1); ++ ialias_call (omp_capture_affinity) (b, ret + 1, format); ++ b[ret] = '\n'; ++ gomp_print_string (b, ret + 1); ++ free (b); ++} ++ ++void ++gomp_display_affinity_thread (gomp_thread_handle handle, ++ struct gomp_team_state *ts, unsigned int place) ++{ ++ char buf[512]; ++ char *b; ++ size_t ret = gomp_display_affinity (buf, sizeof buf, gomp_affinity_format_var, ++ handle, ts, place); ++ if (ret < sizeof buf) ++ { ++ buf[ret] = '\n'; ++ gomp_print_string (buf, ret + 1); ++ return; ++ } ++ b = gomp_malloc (ret + 1); ++ gomp_display_affinity (b, ret + 1, gomp_affinity_format_var, ++ handle, ts, place); ++ b[ret] = '\n'; ++ gomp_print_string (b, ret + 1); ++ free (b); ++} +--- libgomp/single.c.jj 2018-04-25 09:40:31.870655561 +0200 ++++ libgomp/single.c 2019-05-07 18:46:36.536109576 +0200 +@@ -47,7 +47,7 @@ GOMP_single_start (void) + return __sync_bool_compare_and_swap (&team->single_count, single_count, + single_count + 1L); + #else +- bool ret = gomp_work_share_start (false); ++ bool ret = gomp_work_share_start (0); + if (ret) + gomp_work_share_init_done (); + gomp_work_share_end_nowait (); +@@ -68,7 +68,7 @@ GOMP_single_copy_start (void) + bool first; + void *ret; + +- first = gomp_work_share_start (false); ++ first = gomp_work_share_start (0); + + if (first) + { +--- libgomp/oacc-cuda.c.jj 2018-04-25 09:40:31.321655307 +0200 ++++ libgomp/oacc-cuda.c 2019-05-07 18:46:36.528109704 +0200 +@@ -58,7 +58,7 @@ acc_get_cuda_stream (int async) + { + struct goacc_thread *thr = goacc_thread (); + +- if (async < 0) ++ if (!async_valid_p (async)) + return NULL; + + if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func) +@@ -72,7 +72,7 @@ acc_set_cuda_stream (int async, void *st + { + struct goacc_thread *thr; + +- if (async < 0 || stream == NULL) ++ if (!async_valid_p (async) || stream == NULL) + return 0; + + goacc_lazy_initialize (); +--- libgomp/work.c.jj 2018-04-25 09:40:31.925655587 +0200 ++++ libgomp/work.c 2019-05-07 18:46:36.548109384 +0200 +@@ -76,7 +76,15 @@ alloc_work_share (struct gomp_team *team + #endif + + team->work_share_chunk *= 2; ++ /* Allocating gomp_work_share structures aligned is just an ++ optimization, don't do it when using the fallback method. */ ++#ifdef GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC ++ ws = gomp_aligned_alloc (__alignof (struct gomp_work_share), ++ team->work_share_chunk ++ * sizeof (struct gomp_work_share)); ++#else + ws = gomp_malloc (team->work_share_chunk * sizeof (struct gomp_work_share)); ++#endif + ws->next_alloc = team->work_shares[0].next_alloc; + team->work_shares[0].next_alloc = ws; + team->work_share_list_alloc = &ws[1]; +@@ -90,30 +98,35 @@ alloc_work_share (struct gomp_team *team + This shouldn't touch the next_alloc field. 
*/ + + void +-gomp_init_work_share (struct gomp_work_share *ws, bool ordered, ++gomp_init_work_share (struct gomp_work_share *ws, size_t ordered, + unsigned nthreads) + { + gomp_mutex_init (&ws->lock); + if (__builtin_expect (ordered, 0)) + { +-#define INLINE_ORDERED_TEAM_IDS_CNT \ +- ((sizeof (struct gomp_work_share) \ +- - offsetof (struct gomp_work_share, inline_ordered_team_ids)) \ +- / sizeof (((struct gomp_work_share *) 0)->inline_ordered_team_ids[0])) +- +- if (nthreads > INLINE_ORDERED_TEAM_IDS_CNT) +- ws->ordered_team_ids +- = gomp_malloc (nthreads * sizeof (*ws->ordered_team_ids)); ++#define INLINE_ORDERED_TEAM_IDS_SIZE \ ++ (sizeof (struct gomp_work_share) \ ++ - offsetof (struct gomp_work_share, inline_ordered_team_ids)) ++ ++ if (__builtin_expect (ordered != 1, 0)) ++ { ++ ordered += nthreads * sizeof (*ws->ordered_team_ids) - 1; ++ ordered = ordered + __alignof__ (long long) - 1; ++ ordered &= ~(__alignof__ (long long) - 1); ++ } ++ else ++ ordered = nthreads * sizeof (*ws->ordered_team_ids); ++ if (ordered > INLINE_ORDERED_TEAM_IDS_SIZE) ++ ws->ordered_team_ids = gomp_malloc (ordered); + else + ws->ordered_team_ids = ws->inline_ordered_team_ids; +- memset (ws->ordered_team_ids, '\0', +- nthreads * sizeof (*ws->ordered_team_ids)); ++ memset (ws->ordered_team_ids, '\0', ordered); + ws->ordered_num_used = 0; + ws->ordered_owner = -1; + ws->ordered_cur = 0; + } + else +- ws->ordered_team_ids = NULL; ++ ws->ordered_team_ids = ws->inline_ordered_team_ids; + gomp_ptrlock_init (&ws->next_ws, NULL); + ws->threads_completed = 0; + } +@@ -166,7 +179,7 @@ free_work_share (struct gomp_team *team, + if this was the first thread to reach this point. */ + + bool +-gomp_work_share_start (bool ordered) ++gomp_work_share_start (size_t ordered) + { + struct gomp_thread *thr = gomp_thread (); + struct gomp_team *team = thr->ts.team; +@@ -178,7 +191,7 @@ gomp_work_share_start (bool ordered) + ws = gomp_malloc (sizeof (*ws)); + gomp_init_work_share (ws, ordered, 1); + thr->ts.work_share = ws; +- return ws; ++ return true; + } + + ws = thr->ts.work_share; +--- include/gomp-constants.h.jj 2018-04-25 09:40:39.757659209 +0200 ++++ include/gomp-constants.h 2019-05-07 18:57:33.333627031 +0200 +@@ -189,6 +189,7 @@ enum gomp_map_kind + #define GOMP_TASK_FLAG_GRAINSIZE (1 << 9) + #define GOMP_TASK_FLAG_IF (1 << 10) + #define GOMP_TASK_FLAG_NOGROUP (1 << 11) ++#define GOMP_TASK_FLAG_REDUCTION (1 << 12) + + /* GOMP_target{_ext,update_ext,enter_exit_data} flags argument. */ + #define GOMP_TARGET_FLAG_NOWAIT (1 << 0) +@@ -196,6 +197,18 @@ enum gomp_map_kind + /* Internal to libgomp. */ + #define GOMP_TARGET_FLAG_UPDATE (1U << 31) + ++ ++/* OpenACC construct flags. */ ++ ++/* Force host fallback execution. */ ++#define GOACC_FLAG_HOST_FALLBACK (1 << 0) ++ ++/* For legacy reasons, in the ABI, the GOACC_FLAGs are encoded as an inverted ++ bitmask. */ ++#define GOACC_FLAGS_MARSHAL_OP BIT_NOT_EXPR ++#define GOACC_FLAGS_UNMARSHAL(X) (~(X)) ++ ++ + /* Versions of libgomp and device-specific plugins. GOMP_VERSION + should be incremented whenever an ABI-incompatible change is introduced + to the plugin interface defined in libgomp/libgomp.h. */ +@@ -251,6 +264,12 @@ enum gomp_map_kind + at most and shifted by this many bits. */ + #define GOMP_TARGET_ARG_VALUE_SHIFT 16 + ++/* Dependence types in omp_depend_t objects. */ ++#define GOMP_DEPEND_IN 1 ++#define GOMP_DEPEND_OUT 2 ++#define GOMP_DEPEND_INOUT 3 ++#define GOMP_DEPEND_MUTEXINOUTSET 4 ++ + /* HSA specific data structures. 
*/
+ 
+ /* Identifiers of device-specific target arguments.  */
diff --git a/SOURCES/gcc8-libgomp-testsuite.patch b/SOURCES/gcc8-libgomp-testsuite.patch
new file mode 100644
index 0000000..502ee22
--- /dev/null
+++ b/SOURCES/gcc8-libgomp-testsuite.patch
@@ -0,0 +1,41 @@
+--- libgomp/testsuite/libgomp-test-support.exp.in.jj	2018-04-25 09:40:31.323655308 +0200
++++ libgomp/testsuite/libgomp-test-support.exp.in	2019-04-25 20:01:50.028243827 +0200
+@@ -2,4 +2,5 @@ set cuda_driver_include "@CUDA_DRIVER_IN
+ set cuda_driver_lib "@CUDA_DRIVER_LIB@"
+ set hsa_runtime_lib "@HSA_RUNTIME_LIB@"
+ 
++set offload_plugins "@offload_plugins@"
+ set offload_targets "@offload_targets@"
+--- libgomp/testsuite/lib/libgomp.exp.jj	2018-04-25 09:40:31.584655429 +0200
++++ libgomp/testsuite/lib/libgomp.exp	2019-05-24 11:41:51.015822702 +0200
+@@ -40,7 +40,7 @@ load_file libgomp-test-support.exp
+ # Populate offload_targets_s (offloading targets separated by a space), and
+ # offload_targets_s_openacc (the same, but with OpenACC names; OpenACC spells
+ # some of them a little differently).
+-set offload_targets_s [split $offload_targets ","]
++set offload_targets_s [split $offload_plugins ","]
+ set offload_targets_s_openacc {}
+ foreach offload_target_openacc $offload_targets_s {
+     # Translate to OpenACC names, or skip if not yet supported.
+@@ -137,8 +137,8 @@ proc libgomp_init { args } {
+ 
+     # Add liboffloadmic build directory in LD_LIBRARY_PATH to support
+     # non-fallback testing for Intel MIC targets
+-    global offload_targets
+-    if { [string match "*,intelmic,*" ",$offload_targets,"] } {
++    global offload_plugins
++    if { [string match "*,intelmic,*" ",$offload_plugins,"] } {
+ 	append always_ld_library_path ":${blddir}/../liboffloadmic/.libs"
+ 	append always_ld_library_path ":${blddir}/../liboffloadmic/plugin/.libs"
+ 	# libstdc++ is required by liboffloadmic
+@@ -362,8 +362,8 @@ proc check_effective_target_offload_devi
+ # Return 1 if configured for nvptx offloading.
+ 
+ proc check_effective_target_openacc_nvidia_accel_configured { } {
+-    global offload_targets
+-    if { ![string match "*,nvptx,*" ",$offload_targets,"] } {
++    global offload_plugins
++    if { ![string match "*,nvptx,*" ",$offload_plugins,"] } {
+ 	return 0
+     }
+     # PR libgomp/65099: Currently, we only support offloading in 64-bit
diff --git a/SOURCES/gcc8-pr60790.patch b/SOURCES/gcc8-pr60790.patch
deleted file mode 100644
index 810919f..0000000
--- a/SOURCES/gcc8-pr60790.patch
+++ /dev/null
@@ -1,84 +0,0 @@
-	PR libgcc/60790
-	x86: Do not assume ELF constructors run before IFUNC resolvers.
-	* config/x86/host-config.h (libat_feat1_ecx, libat_feat1_edx):
-	Remove declarations.
-	(__libat_feat1, __libat_feat1_init): Declare.
-	(FEAT1_REGISTER): Define.
-	(load_feat1): New function.
-	(IFUNC_COND_1): Adjust.
-	* config/x86/init.c (libat_feat1_ecx, libat_feat1_edx)
-	(init_cpuid): Remove definitions.
-	(__libat_feat1): New variable.
-	(__libat_feat1_init): New function.
-
---- libatomic/config/x86/host-config.h	(revision 264990)
-+++ libatomic/config/x86/host-config.h	(working copy)
-@@ -25,13 +25,39 @@
- #if HAVE_IFUNC
- #include <cpuid.h>
- 
--extern unsigned int libat_feat1_ecx HIDDEN;
--extern unsigned int libat_feat1_edx HIDDEN;
-+#ifdef __x86_64__
-+# define FEAT1_REGISTER ecx
-+#else
-+# define FEAT1_REGISTER edx
-+#endif
- 
-+/* Value of the CPUID feature register FEAT1_REGISTER for the cmpxchg
-+   bit for IFUNC_COND1 below.  */
-+extern unsigned int __libat_feat1 HIDDEN;
-+
-+/* Initialize libat_feat1 and return its value.
*/ -+unsigned int __libat_feat1_init (void) HIDDEN; -+ -+/* Return the value of the relevant feature register for the relevant -+ cmpxchg bit, or 0 if there is no CPUID support. */ -+static inline unsigned int -+__attribute__ ((const)) -+load_feat1 (void) -+{ -+ /* See the store in __libat_feat1_init. */ -+ unsigned int feat1 = __atomic_load_n (&__libat_feat1, __ATOMIC_RELAXED); -+ if (feat1 == 0) -+ /* Assume that initialization has not happened yet. This may get -+ called repeatedly if the CPU does not have any feature bits at -+ all. */ -+ feat1 = __libat_feat1_init (); -+ return feat1; -+} -+ - #ifdef __x86_64__ --# define IFUNC_COND_1 (libat_feat1_ecx & bit_CMPXCHG16B) -+# define IFUNC_COND_1 (load_feat1 () & bit_CMPXCHG16B) - #else --# define IFUNC_COND_1 (libat_feat1_edx & bit_CMPXCHG8B) -+# define IFUNC_COND_1 (load_feat1 () & bit_CMPXCHG8B) - #endif - - #ifdef __x86_64__ ---- libatomic/config/x86/init.c (revision 264990) -+++ libatomic/config/x86/init.c (working copy) -@@ -26,13 +26,17 @@ - - #if HAVE_IFUNC - --unsigned int libat_feat1_ecx, libat_feat1_edx; -+unsigned int __libat_feat1; - --static void __attribute__((constructor)) --init_cpuid (void) -+unsigned int -+__libat_feat1_init (void) - { -- unsigned int eax, ebx; -- __get_cpuid (1, &eax, &ebx, &libat_feat1_ecx, &libat_feat1_edx); -+ unsigned int eax, ebx, ecx, edx; -+ FEAT1_REGISTER = 0; -+ __get_cpuid (1, &eax, &ebx, &ecx, &edx); -+ /* See the load in load_feat1. */ -+ __atomic_store_n (&__libat_feat1, FEAT1_REGISTER, __ATOMIC_RELAXED); -+ return FEAT1_REGISTER; - } - - #endif /* HAVE_IFUNC */ diff --git a/SOURCES/gcc8-pr85400.patch b/SOURCES/gcc8-pr85400.patch new file mode 100644 index 0000000..0c0d887 --- /dev/null +++ b/SOURCES/gcc8-pr85400.patch @@ -0,0 +1,94 @@ +2018-05-10 Eric Botcazou + + PR c++/85400 + * c-attribs.c (handle_visibility_attribute): Do not set no_add_attrs. + + * decl2.c (adjust_var_decl_tls_model): New static function. + (comdat_linkage): Call it on a variable. + (maybe_make_one_only): Likewise. + +--- gcc/c-family/c-attribs.c ++++ gcc/c-family/c-attribs.c +@@ -2299,14 +2299,13 @@ handle_visibility_attribute (tree *node, tree name, tree args, + + static tree + handle_tls_model_attribute (tree *node, tree name, tree args, +- int ARG_UNUSED (flags), bool *no_add_attrs) ++ int ARG_UNUSED (flags), ++ bool *ARG_UNUSED (no_add_attrs)) + { + tree id; + tree decl = *node; + enum tls_model kind; + +- *no_add_attrs = true; +- + if (!VAR_P (decl) || !DECL_THREAD_LOCAL_P (decl)) + { + warning (OPT_Wattributes, "%qE attribute ignored", name); +--- gcc/cp/decl2.c ++++ gcc/cp/decl2.c +@@ -1838,6 +1838,17 @@ mark_vtable_entries (tree decl) + } + } + ++/* Adjust the TLS model on variable DECL if need be, typically after ++ the linkage of DECL has been modified. */ ++ ++static void ++adjust_var_decl_tls_model (tree decl) ++{ ++ if (CP_DECL_THREAD_LOCAL_P (decl) ++ && !lookup_attribute ("tls_model", DECL_ATTRIBUTES (decl))) ++ set_decl_tls_model (decl, decl_default_tls_model (decl)); ++} ++ + /* Set DECL up to have the closest approximation of "initialized common" + linkage available. */ + +@@ -1888,6 +1899,9 @@ comdat_linkage (tree decl) + + if (TREE_PUBLIC (decl)) + DECL_COMDAT (decl) = 1; ++ ++ if (VAR_P (decl)) ++ adjust_var_decl_tls_model (decl); + } + + /* For win32 we also want to put explicit instantiations in +@@ -1926,6 +1940,8 @@ maybe_make_one_only (tree decl) + /* Mark it needed so we don't forget to emit it. 
*/
+	  node->forced_by_abi = true;
+	  TREE_USED (decl) = 1;
++
++	  adjust_var_decl_tls_model (decl);
+	}
+     }
+ }
+--- /dev/null
++++ gcc/testsuite/g++.dg/tls/pr85400.C
+@@ -0,0 +1,24 @@
++// PR c++/85400
++// Testcase by Brian Vandenberg
++
++// { dg-do link { target c++11 } }
++// { dg-require-effective-target fpic }
++// { dg-require-effective-target shared }
++// { dg-require-effective-target tls }
++// { dg-options "-shared -fPIC -O" }
++// { dg-add-options tls }
++
++struct Test
++{
++  int blah (int y)
++  {
++    thread_local int mything = 3;
++    mything = y > 0 ? y : mything;
++    return mything;
++  }
++};
++
++int stuff (Test& test, int y)
++{
++  return test.blah(y);
++}
diff --git a/SOURCES/gcc8-pr86098.patch b/SOURCES/gcc8-pr86098.patch
new file mode 100644
index 0000000..5f5a651
--- /dev/null
+++ b/SOURCES/gcc8-pr86098.patch
@@ -0,0 +1,39 @@
+2018-06-12  Jason Merrill  <jason@redhat.com>
+
+	PR c++/86098 - ICE with template placeholder for TTP.
+	* typeck.c (structural_comptypes) [TEMPLATE_TYPE_PARM]: Check
+	CLASS_PLACEHOLDER_TEMPLATE.
+
+--- gcc/cp/typeck.c
++++ gcc/cp/typeck.c
+@@ -1375,6 +1375,11 @@ structural_comptypes (tree t1, tree t2, int strict)
+ 	 template parameters set, they can't be equal.  */
+       if (!comp_template_parms_position (t1, t2))
+ 	return false;
++      /* If T1 and T2 don't represent the same class template deduction,
++	 they aren't equal.  */
++      if (CLASS_PLACEHOLDER_TEMPLATE (t1)
++	  != CLASS_PLACEHOLDER_TEMPLATE (t2))
++	return false;
+       /* Constrained 'auto's are distinct from parms that don't have the same
+ 	 constraints.  */
+       if (!equivalent_placeholder_constraints (t1, t2))
+--- /dev/null
++++ gcc/testsuite/g++.dg/cpp1z/class-deduction58.C
+@@ -0,0 +1,16 @@
++// PR c++/86098
++// { dg-additional-options -std=c++17 }
++
++template <class T> class future;
++template <class T> T&& declval();
++
++template
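
The libgomp hunks above backport the OpenMP 5.0 affinity-display API: omp_set_affinity_format, omp_get_affinity_format, omp_display_affinity and omp_capture_affinity, together with the format specifiers parsed in gomp_display_affinity (%t/%T/%L/%n/%N/%a/%H/%P/%i/%A, plus %0.N zero-padding, %.N right-justification, and %{long_name} spellings). A minimal sketch of how user code would exercise this API once the patch is applied; the file name and compile command are illustrative, not part of the patch set:

  /* demo-affinity.c — build with: gcc -fopenmp demo-affinity.c */
  #include <omp.h>
  #include <stdio.h>

  int main (void)
  {
    /* %H = host, %P = pid, %0.4n = thread num zero-padded to width 4,
       %N = team size, %A = the place the thread is bound to.  */
    omp_set_affinity_format ("host=%H pid=%P thread %0.4n of %N on %A");

  #pragma omp parallel
    {
      char buf[512];

      /* Prints one line per thread to stderr (via gomp_print_string);
	 a NULL format falls back to the format set above.  */
      omp_display_affinity (NULL);

      /* Same text captured into a buffer; the return value is the
	 untruncated length, snprintf-style.  */
      size_t len = omp_capture_affinity (buf, sizeof buf, NULL);
      if (len >= sizeof buf)
	fprintf (stderr, "output truncated (%zu bytes needed)\n", len);
    }
    return 0;
  }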
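
gcc8-pr60790.patch is dropped above, presumably because the gcc-8.3.1-20190507 tarball this update rebases onto already carries the upstream fix. The pattern that patch introduced is worth noting: an x86 IFUNC resolver may run before ELF constructors, so libatomic's CPUID feature word has to be computed lazily instead of in a constructor. A condensed sketch of the pattern, with identifiers simplified from the deleted patch (the real code selects ecx or edx via FEAT1_REGISTER depending on the target; this sketch hard-codes ecx as on x86_64):

  #include <cpuid.h>

  static unsigned int feat1;	/* 0 until the first CPUID probe.  */

  static unsigned int
  feat1_init (void)
  {
    unsigned int eax, ebx, ecx = 0, edx = 0;
    /* ecx stays 0 if CPUID leaf 1 is unavailable.  */
    __get_cpuid (1, &eax, &ebx, &ecx, &edx);
    /* Relaxed ordering suffices: racing threads simply re-probe CPUID
       and store the same value.  */
    __atomic_store_n (&feat1, ecx, __ATOMIC_RELAXED);
    return ecx;
  }

  /* Safe to call from an IFUNC resolver, before constructors run.  */
  static inline unsigned int
  load_feat1 (void)
  {
    unsigned int f = __atomic_load_n (&feat1, __ATOMIC_RELAXED);
    return f != 0 ? f : feat1_init ();
  }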