aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrey Fedorov <andrey.fedorov@intel.com>2021-02-16 12:02:37 +0300
committerGitHub <noreply@github.com>2021-02-16 12:02:37 +0300
commit94189435085a09fcca528f6313b0006d179bbf00 (patch)
treee31c81155879cdb7bf15a44217098202c7957a02
parentAdd FPGA_EMU tests (#118) (diff)
downloadllvm-project-94189435085a09fcca528f6313b0006d179bbf00.tar.gz
llvm-project-94189435085a09fcca528f6313b0006d179bbf00.tar.bz2
llvm-project-94189435085a09fcca528f6313b0006d179bbf00.zip
Refactor generated names of kernels (#113)
* reworked kernel names * more testing is added * make shift_left_right run with unnamed lambdas * some fixes for CI * applied some suggestions from review * addressed feedback from review * cmake changes + kernel name for shift_left, shift_right algorithms * make some tests compile * remove unused variables
-rw-r--r--CMakeLists.txt11
-rw-r--r--cmake/README.md2
-rw-r--r--include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h14
-rw-r--r--include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h8
-rw-r--r--include/oneapi/dpl/internal/reduce_by_segment_impl.h37
-rw-r--r--include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h41
-rw-r--r--include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h184
-rw-r--r--include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h51
-rw-r--r--include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h37
-rw-r--r--test/general/lambda_naming.pass.cpp18
-rw-r--r--test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp38
-rw-r--r--test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp4
-rw-r--r--test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp4
-rw-r--r--test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp4
-rw-r--r--test/parallel_api/ranges/merge_ranges_sycl.pass.cpp2
15 files changed, 252 insertions, 203 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b2b86ccd19bc..d0e8c230964b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,7 +30,6 @@ if (FIND_GXX_EXE)
execute_process(COMMAND ${FIND_GXX_EXE} -dumpfullversion OUTPUT_VARIABLE _onedpl_gxx_version)
endif()
-option(ONEDPL_USE_UNNAMED_LAMBDA "Pass -fsycl-unnamed-lambda compile option" OFF)
option(ONEDPL_FPGA_STATIC_REPORT "Enable the static report generation for the FPGA device" OFF)
option(ONEDPL_USE_AOT_COMPILATION "Enable the ahead of time compilation via OCLOC compiler" OFF)
@@ -147,9 +146,17 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$")
# DPC++ specific compiler options
target_compile_options(oneDPL INTERFACE
- $<$<BOOL:${ONEDPL_USE_UNNAMED_LAMBDA}>:-fsycl-unnamed-lambda>
$<$<OR:$<BOOL:${ONEDPL_USE_DEVICE_FPGA_HW}>,$<BOOL:${ONEDPL_USE_DEVICE_FPGA_EMU}>>:-fintelfpga>
)
+ if (DEFINED ONEDPL_USE_UNNAMED_LAMBDA)
+ if(ONEDPL_USE_UNNAMED_LAMBDA)
+ message(STATUS "Use unnamed lambdas")
+ target_compile_options(oneDPL INTERFACE -fsycl-unnamed-lambda)
+ else()
+ message(STATUS "Don't use unnamed lambdas")
+ target_compile_options(oneDPL INTERFACE -fno-sycl-unnamed-lambda)
+ endif()
+ endif()
# DPC++ specific macro
target_compile_definitions(oneDPL INTERFACE
diff --git a/cmake/README.md b/cmake/README.md
index 93fac833f635..ad08fb0e69fd 100644
--- a/cmake/README.md
+++ b/cmake/README.md
@@ -10,7 +10,7 @@ The following variables are provided for oneDPL configuration:
|------------------------------|--------|-----------------------------------------------------------------------------------------------|---------------|
| ONEDPL_BACKEND | STRING | Threading backend; supported values: tbb, dpcpp, dpcpp_only, serial, ...; the default value is defined by compiler: dpcpp for DPC++ and tbb for others | tbb/dpcpp |
| ONEDPL_DEVICE_TYPE | STRING | Device type, applicable only for sycl backends; supported values: GPU, CPU, FPGA_HW, FPGA_EMU | GPU |
-| ONEDPL_USE_UNNAMED_LAMBDA | BOOL | Pass `-fsycl-unnamed-lambda` compile option | OFF |
+| ONEDPL_USE_UNNAMED_LAMBDA | BOOL | Pass `-fsycl-unnamed-lambda`, `-fno-sycl-unnamed-lambda` compile options or nothing | |
| ONEDPL_FPGA_STATIC_REPORT | BOOL | Enable the static report generation for the FPGA_HW device type | OFF |
| ONEDPL_USE_AOT_COMPILATION | BOOL | Enable the ahead of time compilation via OpenCL™ Offline Compiler (OCLOC) | OFF |
| ONEDPL_AOT_ARCH | STRING | Architecture options for the ahead of time compilation, supported values can be found [here](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html); the default value `*` means compilation for all available options | * |
diff --git a/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h b/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h
index 6f997d5c772d..44b6dbca381a 100644
--- a/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h
+++ b/include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h
@@ -29,6 +29,12 @@ namespace dpl
{
namespace internal
{
+
+template <typename Name>
+class ExclusiveScan1;
+template <typename Name>
+class ExclusiveScan2;
+
template <typename Policy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename T,
typename BinaryPredicate, typename Operator>
oneapi::dpl::__internal::__enable_if_host_execution_policy<typename ::std::decay<Policy>::type, OutputIterator>
@@ -67,7 +73,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
temp[0] = init;
- typename internal::rebind_policy<policy_type, class ExclusiveScan1>::type policy1(policy);
+ typename internal::rebind_policy<policy_type, ExclusiveScan1<policy_type>>::type policy1(policy);
// TODO : add stencil form of replace_copy_if to oneDPL if the
// transform call here is difficult to understand and maintain.
@@ -78,7 +84,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
replace_copy_if(policy1, first2, last2 - 1, _flags.get() + 1, _temp.get() + 1, ::std::negate<FlagType>(), init);
#endif
- typename internal::rebind_policy<policy_type, class ExclusiveScan2>::type policy2(policy);
+ typename internal::rebind_policy<policy_type, ExclusiveScan2<policy_type>>::type policy2(policy);
// scan key-flag tuples
inclusive_scan(policy2, make_zip_iterator(_temp.get(), _flags.get()),
@@ -133,7 +139,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
temp[0] = init;
}
- typename internal::rebind_policy<policy_type, class ExclusiveScan1>::type policy1(policy);
+ typename internal::rebind_policy<policy_type, ExclusiveScan1<policy_type>>::type policy1(policy);
// TODO : add stencil form of replace_copy_if to oneDPL if the
// transform call here is difficult to understand and maintain.
@@ -144,7 +150,7 @@ exclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
replace_copy_if(policy1, first2, last2 - 1, _flags.get() + 1, _temp.get() + 1, ::std::negate<FlagType>(), init);
# endif
- typename internal::rebind_policy<policy_type, class ExclusiveScan2>::type policy2(policy);
+ typename internal::rebind_policy<policy_type, ExclusiveScan2<policy_type>>::type policy2(policy);
// scan key-flag tuples
transform_inclusive_scan(policy2, make_zip_iterator(_temp.get(), _flags.get()),
diff --git a/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h b/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h
index 59dfc5ccc233..5e2df863130e 100644
--- a/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h
+++ b/include/oneapi/dpl/internal/inclusive_scan_by_segment_impl.h
@@ -30,6 +30,10 @@ namespace dpl
{
namespace internal
{
+
+template <typename Name>
+class InclusiveScan1;
+
template <typename Policy, typename InputIterator1, typename InputIterator2, typename OutputIterator,
typename BinaryPredicate, typename BinaryOperator>
oneapi::dpl::__internal::__enable_if_host_execution_policy<typename ::std::decay<Policy>::type, OutputIterator>
@@ -58,7 +62,7 @@ inclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
transform(::std::forward<Policy>(policy), first1, last1 - 1, first1 + 1, _mask.get() + 1, ::std::not2(binary_pred));
- typename internal::rebind_policy<policy_type, class InclusiveScan1>::type policy1(policy);
+ typename internal::rebind_policy<policy_type, InclusiveScan1<policy_type>>::type policy1(policy);
inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()), make_zip_iterator(first2, _mask.get()) + n,
make_zip_iterator(result, _mask.get()),
internal::segmented_scan_fun<ValueType, FlagType, BinaryOperator>(binary_op));
@@ -106,7 +110,7 @@ inclusive_scan_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIter
transform(::std::forward<Policy>(policy), first1, last1 - 1, first1 + 1, _mask.get() + 1, ::std::not2(binary_pred));
- typename internal::rebind_policy<policy_type, class InclusiveScan1>::type policy1(policy);
+ typename internal::rebind_policy<policy_type, InclusiveScan1<policy_type>>::type policy1(policy);
transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()),
make_zip_iterator(first2, _mask.get()) + n, make_zip_iterator(result, _mask.get()),
internal::segmented_scan_fun<ValueType, FlagType, BinaryOperator>(binary_op),
diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h
index 18e93b94199f..db36c8c9a621 100644
--- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h
+++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h
@@ -30,6 +30,15 @@ namespace dpl
namespace internal
{
+template <typename Name>
+class Reduce1;
+template <typename Name>
+class Reduce2;
+template <typename Name>
+class Reduce3;
+template <typename Name>
+class Reduce4;
+
template <typename Policy, typename InputIterator1, typename InputIterator2, typename OutputIterator1,
typename OutputIterator2, typename BinaryPred, typename BinaryOperator>
oneapi::dpl::__internal::__enable_if_host_execution_policy<typename ::std::decay<Policy>::type,
@@ -87,7 +96,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
oneapi::dpl::__par_backend::__buffer<policy_type, FlagType> _scanned_tail_flags(n);
// Compute the sum of the segments. scanned_tail_flags values are not used.
- typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy);
+ typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy);
inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()), make_zip_iterator(first2, _mask.get()) + n,
make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()),
internal::segmented_scan_fun<ValueType, FlagType, BinaryOperator>(binary_op));
@@ -95,7 +104,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
// for example: _scanned_values = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 }
// Compute the indicies each segment sum should be written
- typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy);
+ typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy);
oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0),
::std::plus<CountType>());
@@ -108,7 +117,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
CountType N = scanned_tail_flags[n - 1] + 1;
// scatter the keys and accumulated values
- typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy);
+ typename internal::rebind_policy<policy_type, Reduce3<policy_type>>::type policy3(policy);
oneapi::dpl::for_each(policy3, make_zip_iterator(first1, scanned_tail_flags, mask, scanned_values, mask + 1),
make_zip_iterator(first1, scanned_tail_flags, mask, scanned_values, mask + 1) + n,
internal::scatter_and_accumulate_fun<OutputIterator1, OutputIterator2>(result1, result2));
@@ -193,7 +202,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
internal::__buffer<policy_type, FlagType> _scanned_tail_flags(policy, n);
// Compute the sum of the segments. scanned_tail_flags values are not used.
- typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy);
+ typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy);
transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()),
make_zip_iterator(first2, _mask.get()) + n,
make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()),
@@ -203,7 +212,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
// for example: _scanned_values = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 }
// Compute the indicies each segment sum should be written
- typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy);
+ typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy);
oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0),
::std::plus<CountType>());
@@ -222,8 +231,8 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
}
// scatter the keys and accumulated values
- typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy);
- typename internal::rebind_policy<policy_type, class ReduceByKey4>::type policy4(policy);
+ typename internal::rebind_policy<policy_type, Reduce3<policy_type>>::type policy3(policy);
+ typename internal::rebind_policy<policy_type, Reduce4<policy_type>>::type policy4(policy);
// permutation iterator reorders elements in result1 so the element at index
// _scanned_tail_flags[i] is returned when index i of the iterator is accessed. The result
@@ -320,7 +329,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
internal::__buffer<policy_type, FlagType> _scanned_tail_flags(policy, n);
// Compute the sum of the segments. scanned_tail_flags values are not used.
- typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy);
+ typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy);
transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()),
make_zip_iterator(first2, _mask.get()) + n,
make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()),
@@ -330,7 +339,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
// for example: _scanned_values = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 }
// Compute the indicies each segment sum should be written
- typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy);
+ typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy);
oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0),
::std::plus<CountType>());
@@ -349,8 +358,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
}
// scatter the keys and accumulated values
- typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy);
- typename internal::rebind_policy<policy_type, class ReduceByKey4>::type policy4(policy);
+ typename internal::rebind_policy<policy_type, Reduce4<policy_type>>::type policy4(policy);
// result1 is a discard_iterator instance so we omit the write to it.
@@ -440,7 +448,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
internal::__buffer<policy_type, FlagType> _scanned_tail_flags(policy, n);
// Compute the sum of the segments. scanned_tail_flags values are not used.
- typename internal::rebind_policy<policy_type, class ReduceByKey1>::type policy1(policy);
+ typename internal::rebind_policy<policy_type, Reduce1<policy_type>>::type policy1(policy);
transform_inclusive_scan(policy1, make_zip_iterator(first2, _mask.get()),
make_zip_iterator(first2, _mask.get()) + n,
make_zip_iterator(_scanned_values.get(), _scanned_tail_flags.get()),
@@ -450,7 +458,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
// for example: _scanned_values = { 1, 2, 3, 4, 1, 2, 3, 6, 1, 2, 3, 6, 0 }
// Compute the indicies each segment sum should be written
- typename internal::rebind_policy<policy_type, class ReduceByKey2>::type policy2(policy);
+ typename internal::rebind_policy<policy_type, Reduce2<policy_type>>::type policy2(policy);
oneapi::dpl::exclusive_scan(policy2, _mask.get() + 1, _mask.get() + n + 1, _scanned_tail_flags.get(), CountType(0),
::std::plus<CountType>());
@@ -469,8 +477,7 @@ reduce_by_segment_impl(Policy&& policy, InputIterator1 first1, InputIterator1 la
}
// scatter the keys and accumulated values
- typename internal::rebind_policy<policy_type, class ReduceByKey3>::type policy3(policy);
- typename internal::rebind_policy<policy_type, class ReduceByKey4>::type policy4(policy);
+ typename internal::rebind_policy<policy_type, Reduce3<policy_type>>::type policy3(policy);
// permutation iterator reorders elements in result1 so the element at index
// _scanned_tail_flags[i] is returned when index i of the iterator is accessed. The result
diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
index cdb001fa0043..eb41600c2ad6 100644
--- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
+++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
@@ -1064,10 +1064,14 @@ __pattern_unique_copy(_ExecutionPolicy&& __exec, _Iterator1 __first, _Iterator1
return __result_first + __result.second;
}
-template <typename Name>
+template <typename _Name>
class copy_back_wrapper
{
};
+template <typename _Name>
+class copy_back_wrapper2
+{
+};
template <typename _ExecutionPolicy, typename _Iterator, typename _Predicate>
oneapi::dpl::__internal::__enable_if_hetero_execution_policy<_ExecutionPolicy, _Iterator>
@@ -1247,12 +1251,16 @@ __pattern_merge(_ExecutionPolicy&& __exec, _Iterator1 __first1, _Iterator1 __las
//To consider the direct copying pattern call in case just one of sequences is empty.
if (__n1 == 0)
oneapi::dpl::__internal::__pattern_walk2_brick(
- ::std::forward<_ExecutionPolicy>(__exec), __first2, __last2, __d_first,
- oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{}, ::std::true_type());
+ oneapi::dpl::__par_backend_hetero::make_wrapped_policy<copy_back_wrapper>(
+ ::std::forward<_ExecutionPolicy>(__exec)),
+ __first2, __last2, __d_first, oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{},
+ ::std::true_type());
else if (__n2 == 0)
oneapi::dpl::__internal::__pattern_walk2_brick(
- ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __d_first,
- oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{}, ::std::true_type());
+ oneapi::dpl::__par_backend_hetero::make_wrapped_policy<copy_back_wrapper2>(
+ ::std::forward<_ExecutionPolicy>(__exec)),
+ __first1, __last1, __d_first, oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{},
+ ::std::true_type());
else
{
auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>();
@@ -1339,11 +1347,6 @@ __pattern_stable_sort(_ExecutionPolicy&& __exec, _Iterator __first, _Iterator __
.wait();
}
-template <typename Name>
-class copy_back_wrapper2
-{
-};
-
template <typename _ExecutionPolicy, typename _Iterator, typename _UnaryPredicate>
oneapi::dpl::__internal::__enable_if_hetero_execution_policy<_ExecutionPolicy, _Iterator>
__pattern_stable_partition(_ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, _UnaryPredicate __pred,
@@ -1902,9 +1905,11 @@ __pattern_set_union(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _Forw
__comp, unseq_backend::_DifferenceTag()) -
__buf;
//2. Merge {1} and the difference
- return oneapi::dpl::__internal::__pattern_merge(::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __buf,
- __buf + __n_diff, __result, __comp,
- /*vector=*/::std::true_type(), /*parallel=*/::std::true_type());
+ return oneapi::dpl::__internal::__pattern_merge(
+ oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__set_union_copy_case_2>(
+ ::std::forward<_ExecutionPolicy>(__exec)),
+ __first1, __last1, __buf, __buf + __n_diff, __result, __comp,
+ /*vector=*/::std::true_type(), /*parallel=*/::std::true_type());
}
//Dummy names to avoid kernel problems
@@ -1996,6 +2001,11 @@ __pattern_set_symmetric_difference(_ExecutionPolicy&& __exec, _ForwardIterator1
__comp, ::std::true_type(), ::std::true_type());
}
+template <typename _Name>
+class __shift_left_right
+{
+};
+
template <typename _ExecutionPolicy, typename _Range>
oneapi::dpl::__internal::__enable_if_hetero_execution_policy<_ExecutionPolicy,
oneapi::dpl::__internal::__difference_t<_Range>>
@@ -2027,7 +2037,10 @@ __pattern_shift_left(_ExecutionPolicy&& __exec, _Range __rng, oneapi::dpl::__int
else //2. n < size/2; 'n' parallel copying
{
auto __brick = unseq_backend::__brick_shift_left<_ExecutionPolicy, _DiffType>{__size, __n};
- oneapi::dpl::__par_backend_hetero::__parallel_for(::std::forward<_ExecutionPolicy>(__exec), __brick, __n, __rng)
+ oneapi::dpl::__par_backend_hetero::__parallel_for(
+ oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__shift_left_right>(
+ ::std::forward<_ExecutionPolicy>(__exec)),
+ __brick, __n, __rng)
.wait();
}
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 9643fedcd336..4a9f7910da11 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -175,33 +175,6 @@ make_iter_mode(const _Iterator& __it) -> decltype(iter_mode<outMode>()(__it))
return iter_mode<outMode>()(__it);
}
-// function is needed to wrap kernel name into another class
-template <template <typename> class _NewKernelName, typename _Policy,
- oneapi::dpl::__internal::__enable_if_device_execution_policy<_Policy, int> = 0>
-auto
-make_wrapped_policy(_Policy&& __policy)
- -> decltype(oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
- ::std::forward<_Policy>(__policy)))
-{
- return oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
- ::std::forward<_Policy>(__policy));
-}
-
-#if _ONEDPL_FPGA_DEVICE
-template <template <typename> class _NewKernelName, typename _Policy,
- oneapi::dpl::__internal::__enable_if_fpga_execution_policy<_Policy, int> = 0>
-auto
-make_wrapped_policy(_Policy&& __policy)
- -> decltype(oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor,
- _NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
- ::std::forward<_Policy>(__policy)))
-{
- return oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor,
- _NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
- ::std::forward<_Policy>(__policy));
-}
-#endif
-
// set of templated classes to name kernels
template <typename _DerivedKernelName>
class __kernel_name_base
@@ -227,19 +200,19 @@ class __parallel_reduce_kernel : public __kernel_name_base<__parallel_reduce_ker
{
};
template <typename... _Name>
-class __parallel_scan_kernel_1 : public __kernel_name_base<__parallel_scan_kernel_1<_Name...>>
+class __parallel_scan_local_kernel : public __kernel_name_base<__parallel_scan_local_kernel<_Name...>>
{
};
template <typename... _Name>
-class __parallel_scan_kernel_2 : public __kernel_name_base<__parallel_scan_kernel_2<_Name...>>
+class __parallel_scan_global_kernel : public __kernel_name_base<__parallel_scan_global_kernel<_Name...>>
{
};
template <typename... _Name>
-class __parallel_scan_kernel_3 : public __kernel_name_base<__parallel_scan_kernel_3<_Name...>>
+class __parallel_scan_propagate_kernel : public __kernel_name_base<__parallel_scan_propagate_kernel<_Name...>>
{
};
template <typename... _Name>
-class __parallel_find_or_kernel_1 : public __kernel_name_base<__parallel_find_or_kernel_1<_Name...>>
+class __parallel_find_or_kernel : public __kernel_name_base<__parallel_find_or_kernel<_Name...>>
{
};
template <typename... _Name>
@@ -247,15 +220,15 @@ class __parallel_merge_kernel : public __kernel_name_base<__parallel_merge_kerne
{
};
template <typename... _Name>
-class __parallel_sort_kernel_1 : public __kernel_name_base<__parallel_sort_kernel_1<_Name...>>
+class __parallel_sort_leaf_kernel : public __kernel_name_base<__parallel_sort_leaf_kernel<_Name...>>
{
};
template <typename... _Name>
-class __parallel_sort_kernel_2 : public __kernel_name_base<__parallel_sort_kernel_2<_Name...>>
+class __parallel_sort_global_kernel : public __kernel_name_base<__parallel_sort_global_kernel<_Name...>>
{
};
template <typename... _Name>
-class __parallel_sort_kernel_3 : public __kernel_name_base<__parallel_sort_kernel_3<_Name...>>
+class __parallel_sort_copy_back_kernel : public __kernel_name_base<__parallel_sort_copy_back_kernel<_Name...>>
{
};
@@ -272,19 +245,16 @@ __parallel_for(_ExecutionPolicy&& __exec, _Fp __brick, _Index __count, _Ranges&&
assert(__get_first_range(::std::forward<_Ranges>(__rngs)...).size() > 0);
using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
- using __kernel_name = typename _Policy::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
- using __kernel_name_t = __parallel_for_kernel<_Fp, __kernel_name, _Ranges...>;
-#else
- using __kernel_name_t = __parallel_for_kernel<__kernel_name>;
-#endif
+ using _CustomName = typename _Policy::kernel_name;
+ using _ForKernel = oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_for_kernel, _CustomName,
+ _Fp, _Ranges...>;
_PRINT_INFO_IN_DEBUG_MODE(__exec);
auto __event = __exec.queue().submit([&__rngs..., &__brick, __count](sycl::handler& __cgh) {
//get an access to data under SYCL buffer:
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
- __cgh.parallel_for<__kernel_name_t>(sycl::range</*dim=*/1>(__count), [=](sycl::item</*dim=*/1> __item_id) {
+ __cgh.parallel_for<_ForKernel>(sycl::range</*dim=*/1>(__count), [=](sycl::item</*dim=*/1> __item_id) {
auto __idx = __item_id.get_linear_id();
__brick(__idx, __rngs...);
});
@@ -306,12 +276,10 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _Up __u, _Cp __combine, _
using _Size = decltype(__n);
using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
- using __kernel_name = typename _Policy::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
- using __kernel_name_t = __parallel_reduce_kernel<_Up, _Cp, _Rp, __kernel_name, _Ranges...>;
-#else
- using __kernel_name_t = __parallel_reduce_kernel<__kernel_name>;
-#endif
+ using _CustomName = typename _Policy::kernel_name;
+ using _ReduceKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_reduce_kernel, _CustomName, _Up, _Cp,
+ _Rp, _Ranges...>;
sycl::cl_uint __max_compute_units = oneapi::dpl::__internal::__max_compute_units(__exec);
::std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
@@ -319,7 +287,7 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _Up __u, _Cp __combine, _
__work_group_size = oneapi::dpl::__internal::__max_local_allocation_size<_ExecutionPolicy, _Tp>(
::std::forward<_ExecutionPolicy>(__exec), __work_group_size);
#if _ONEDPL_COMPILE_KERNEL
- sycl::kernel __kernel = __kernel_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+ sycl::kernel __kernel = _ReduceKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
__work_group_size = ::std::min(__work_group_size, oneapi::dpl::__internal::__kernel_work_group_size(
::std::forward<_ExecutionPolicy>(__exec), __kernel));
#endif
@@ -354,7 +322,7 @@ __parallel_transform_reduce(_ExecutionPolicy&& __exec, _Up __u, _Cp __combine, _
auto __temp_acc = __temp.template get_access<access_mode::read_write>(__cgh);
sycl::accessor<_Tp, 1, access_mode::read_write, sycl::access::target::local> __temp_local(
sycl::range<1>(__work_group_size), __cgh);
- __cgh.parallel_for<__kernel_name_t>(
+ __cgh.parallel_for<_ReduceKernel>(
#if _ONEDPL_COMPILE_KERNEL
__kernel,
#endif
@@ -404,21 +372,21 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&&
_InitType __init, _LocalScan __local_scan, _GroupScan __group_scan, _GlobalScan __global_scan)
{
using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
- using _KernelName = typename _Policy::kernel_name;
+ using _CustomName = typename _Policy::kernel_name;
using _Type = typename _InitType::__value_type;
-#if __SYCL_UNNAMED_LAMBDA__
- using __kernel_1_name_t = __parallel_scan_kernel_1<_Range1, _Range2, _BinaryOperation, _Type, _LocalScan,
- _GroupScan, _GlobalScan, _KernelName>;
- using __kernel_2_name_t = __parallel_scan_kernel_2<_Range1, _Range2, _BinaryOperation, _Type, _LocalScan,
- _GroupScan, _GlobalScan, _KernelName>;
- using __kernel_3_name_t = __parallel_scan_kernel_3<_Range1, _Range2, _BinaryOperation, _Type, _LocalScan,
- _GroupScan, _GlobalScan, _KernelName>;
-#else
- using __kernel_1_name_t = __parallel_scan_kernel_1<_KernelName>;
- using __kernel_2_name_t = __parallel_scan_kernel_2<_KernelName>;
- using __kernel_3_name_t = __parallel_scan_kernel_3<_KernelName>;
-#endif
+ using _LocalScanKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_scan_local_kernel, _CustomName, _Range1,
+ _Range2, _BinaryOperation, _Type, _LocalScan,
+ _GroupScan, _GlobalScan>;
+ using _GlobalScanKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_scan_global_kernel, _CustomName,
+ _Range1, _Range2, _BinaryOperation, _Type,
+ _LocalScan, _GroupScan, _GlobalScan>;
+ using _PropagateKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_scan_propagate_kernel, _CustomName,
+ _Range1, _Range2, _BinaryOperation, _Type,
+ _LocalScan, _GroupScan, _GlobalScan>;
auto __n = __rng1.size();
assert(__n > 0);
@@ -430,8 +398,8 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&&
::std::forward<_ExecutionPolicy>(__exec), __wgroup_size);
#if _ONEDPL_COMPILE_KERNEL
- auto __kernel_1 = __kernel_1_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
- auto __kernel_2 = __kernel_2_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+ auto __kernel_1 = _LocalScanKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+ auto __kernel_2 = _GlobalScanKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
auto __wgroup_size_kernel_1 =
oneapi::dpl::__internal::__kernel_work_group_size(::std::forward<_ExecutionPolicy>(__exec), __kernel_1);
auto __wgroup_size_kernel_2 =
@@ -455,7 +423,7 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&&
sycl::accessor<_Type, 1, access_mode::discard_read_write, sycl::access::target::local> __local_acc(
__wgroup_size, __cgh);
- __cgh.parallel_for<__kernel_1_name_t>(
+ __cgh.parallel_for<_LocalScanKernel>(
#if _ONEDPL_COMPILE_KERNEL
__kernel_1,
#endif
@@ -475,7 +443,7 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&&
sycl::accessor<_Type, 1, access_mode::discard_read_write, sycl::access::target::local> __local_acc(
__wgroup_size, __cgh);
- __cgh.parallel_for<__kernel_2_name_t>(
+ __cgh.parallel_for<_GlobalScanKernel>(
#if _ONEDPL_COMPILE_KERNEL
__kernel_2,
#endif
@@ -492,7 +460,7 @@ __parallel_transform_scan(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&&
__cgh.depends_on(__submit_event);
oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer
auto __wg_sums_acc = __wg_sums.template get_access<access_mode::read>(__cgh);
- __cgh.parallel_for<__kernel_3_name_t>(sycl::range<1>(__n_groups * __size_per_wg), [=](sycl::item<1> __item) {
+ __cgh.parallel_for<_PropagateKernel>(sycl::range<1>(__n_groups * __size_per_wg), [=](sycl::item<1> __item) {
__global_scan(__item, __rng2, __rng1, __wg_sums_acc, __n, __size_per_wg);
});
});
@@ -648,14 +616,10 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy<
__parallel_find_or(_ExecutionPolicy&& __exec, _Brick __f, _BrickTag __brick_tag, _Ranges&&... __rngs)
{
using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
- using __kernel_name = typename _Policy::kernel_name;
+ using _CustomName = typename _Policy::kernel_name;
using _AtomicType = typename _BrickTag::_AtomicType;
-
-#if __SYCL_UNNAMED_LAMBDA__
- using __kernel_1_name_t = __parallel_find_or_kernel_1<_Ranges..., _Brick, __kernel_name>;
-#else
- using __kernel_1_name_t = __parallel_find_or_kernel_1<__kernel_name>;
-#endif
+ using _FindOrKernel = oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_find_or_kernel,
+ _CustomName, _Brick, _Ranges...>;
auto __or_tag_check = ::std::is_same<_BrickTag, __parallel_or_tag>{};
auto __rng_n = oneapi::dpl::__ranges::__get_first_range(::std::forward<_Ranges>(__rngs)...).size();
@@ -663,7 +627,7 @@ __parallel_find_or(_ExecutionPolicy&& __exec, _Brick __f, _BrickTag __brick_tag,
auto __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(::std::forward<_ExecutionPolicy>(__exec));
#if _ONEDPL_COMPILE_KERNEL
- auto __kernel = __kernel_1_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+ auto __kernel = _FindOrKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
__wgroup_size = ::std::min(__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(
::std::forward<_ExecutionPolicy>(__exec), __kernel));
#endif
@@ -693,7 +657,7 @@ __parallel_find_or(_ExecutionPolicy&& __exec, _Brick __f, _BrickTag __brick_tag,
// create local accessor to connect atomic with
sycl::accessor<_AtomicType, 1, access_mode::read_write, sycl::access::target::local> __temp_local(1, __cgh);
- __cgh.parallel_for<__kernel_1_name_t>(
+ __cgh.parallel_for<_FindOrKernel>(
#if _ONEDPL_COMPILE_KERNEL
__kernel,
#endif
@@ -1036,12 +1000,10 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, _
__parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __rng3, _Compare __comp)
{
using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
- using __kernel_name = typename _Policy::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
- using __kernel_1_name_t = __parallel_merge_kernel<_Range1, _Range2, _Range3, _Compare, __kernel_name>;
-#else
- using __kernel_1_name_t = __parallel_merge_kernel<__kernel_name>;
-#endif
+ using _CustomName = typename _Policy::kernel_name;
+ using _MergeKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_merge_kernel, _CustomName, _Range1,
+ _Range2, _Range3, _Compare>;
auto __n = __rng1.size();
auto __n_2 = __rng2.size();
@@ -1055,7 +1017,7 @@ __parallel_merge(_ExecutionPolicy&& __exec, _Range1&& __rng1, _Range2&& __rng2,
auto __event = __exec.queue().submit([&](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2, __rng3);
- __cgh.parallel_for<__kernel_1_name_t>(sycl::range</*dim=*/1>(__steps), [=](sycl::item</*dim=*/1> __item_id) {
+ __cgh.parallel_for<_MergeKernel>(sycl::range</*dim=*/1>(__steps), [=](sycl::item</*dim=*/1> __item_id) {
__full_merge_kernel()(__item_id.get_linear_id() * __chunk, __rng1, decltype(__n)(0), __n, __rng2,
decltype(__n_2)(0), __n_2, __rng3, decltype(__n)(0), __comp, __chunk);
});
@@ -1095,16 +1057,16 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, _
__parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, _Compare __comp)
{
using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
- using __kernel_name = typename _Policy::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
- using __kernel_1_name_t = __parallel_sort_kernel_1<_Range, _Merge, _Compare, __kernel_name>;
- using __kernel_2_name_t = __parallel_sort_kernel_2<_Range, _Merge, _Compare, __kernel_name>;
- using __kernel_3_name_t = __parallel_sort_kernel_3<_Range, _Merge, _Compare, __kernel_name>;
-#else
- using __kernel_1_name_t = __parallel_sort_kernel_1<__kernel_name>;
- using __kernel_2_name_t = __parallel_sort_kernel_2<__kernel_name>;
- using __kernel_3_name_t = __parallel_sort_kernel_3<__kernel_name>;
-#endif
+ using _CustomName = typename _Policy::kernel_name;
+ using _LeafSortKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_leaf_kernel, _CustomName, _Range,
+ _Merge, _Compare>;
+ using _GlobalSortKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_global_kernel, _CustomName, _Range,
+ _Merge, _Compare>;
+ using _CopyBackKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_copy_back_kernel, _CustomName,
+ _Range, _Merge, _Compare>;
using _Tp = oneapi::dpl::__internal::__value_t<_Range>;
using _Size = oneapi::dpl::__internal::__difference_t<_Range>;
@@ -1133,13 +1095,12 @@ __parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge,
// 1. Perform sorting of the leaves of the merge sort tree
sycl::event __event1 = __exec.queue().submit([&](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __rng);
- __cgh.parallel_for<__kernel_1_name_t>(sycl::range</*dim=*/1>(__leaf_steps),
- [=](sycl::item</*dim=*/1> __item_id) {
- const _Size __idx = __item_id.get_linear_id() * __leaf;
- const _Size __start = __idx;
- const _Size __end = sycl::min(__start + __leaf, __n);
- __leaf_sort_kernel()(__rng, __start, __end, __comp);
- });
+ __cgh.parallel_for<_LeafSortKernel>(sycl::range</*dim=*/1>(__leaf_steps), [=](sycl::item</*dim=*/1> __item_id) {
+ const _Size __idx = __item_id.get_linear_id() * __leaf;
+ const _Size __start = __idx;
+ const _Size __end = sycl::min(__start + __leaf, __n);
+ __leaf_sort_kernel()(__rng, __start, __end, __comp);
+ });
});
_Size __sorted = __leaf;
@@ -1183,7 +1144,7 @@ __parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge,
__cgh.depends_on(__event1);
oneapi::dpl::__ranges::__require_access(__cgh, __rng);
auto __temp_acc = __temp.template get_access<__par_backend_hetero::access_mode::read_write>(__cgh);
- __cgh.parallel_for<__kernel_2_name_t>(
+ __cgh.parallel_for<_GlobalSortKernel>(
sycl::range</*dim=*/1>(__steps), [=](sycl::item</*dim=*/1> __item_id) {
const _Size __idx = __item_id.get_linear_id();
// Borders of the first and the second sorted sequences
@@ -1221,7 +1182,7 @@ __parallel_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge,
oneapi::dpl::__ranges::__require_access(__cgh, __rng);
auto __temp_acc = __temp.template get_access<access_mode::read>(__cgh);
// We cannot use __cgh.copy here because of zip_iterator usage
- __cgh.parallel_for<__kernel_3_name_t>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
+ __cgh.parallel_for<_CopyBackKernel>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
__rng[__item_id.get_linear_id()] = __temp_acc[__item_id];
});
});
@@ -1235,14 +1196,13 @@ oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, _
__parallel_partial_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge __merge, _Compare __comp)
{
using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
- using __kernel_name = typename _Policy::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
- using __kernel_1_name_t = __parallel_sort_kernel_1<_Range, _Merge, _Compare, __kernel_name>;
- using __kernel_2_name_t = __parallel_sort_kernel_2<_Range, _Merge, _Compare, __kernel_name>;
-#else
- using __kernel_1_name_t = __parallel_sort_kernel_1<__kernel_name>;
- using __kernel_2_name_t = __parallel_sort_kernel_2<__kernel_name>;
-#endif
+ using _CustomName = typename _Policy::kernel_name;
+ using _GlobalSortKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_global_kernel, _CustomName, _Range,
+ _Merge, _Compare>;
+ using _CopyBackKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__parallel_sort_copy_back_kernel, _CustomName,
+ _Range, _Merge, _Compare>;
using _Tp = oneapi::dpl::__internal::__value_t<_Range>;
using _Size = oneapi::dpl::__internal::__difference_t<_Range>;
@@ -1262,7 +1222,7 @@ __parallel_partial_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge _
__cgh.depends_on(__event1);
oneapi::dpl::__ranges::__require_access(__cgh, __rng);
auto __temp_acc = __temp.template get_access<access_mode::read_write>(__cgh);
- __cgh.parallel_for<__kernel_1_name_t>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
+ __cgh.parallel_for<_GlobalSortKernel>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
auto __global_idx = __item_id.get_linear_id();
_Size __start = 2 * __k * (__global_idx / (2 * __k));
@@ -1293,7 +1253,7 @@ __parallel_partial_sort_impl(_ExecutionPolicy&& __exec, _Range&& __rng, _Merge _
oneapi::dpl::__ranges::__require_access(__cgh, __rng);
auto __temp_acc = __temp.template get_access<access_mode::read>(__cgh);
// we cannot use __cgh.copy here because of zip_iterator usage
- __cgh.parallel_for<__kernel_2_name_t>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
+ __cgh.parallel_for<_CopyBackKernel>(sycl::range</*dim=*/1>(__n), [=](sycl::item</*dim=*/1> __item_id) {
__rng[__item_id.get_linear_id()] = __temp_acc[__item_id];
});
});
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
index 4915b1f0a396..13d21dd380b4 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
@@ -55,6 +55,9 @@ class __radix_sort_reorder_kernel : public __kernel_name_base<__radix_sort_reord
{
};
+template <typename _Name>
+class __odd_iteration;
+
//------------------------------------------------------------------------
// radix sort: ordered traits for a given size and integral/float flag
//------------------------------------------------------------------------
@@ -562,30 +565,28 @@ __parallel_radix_sort_iteration(_ExecutionPolicy&& __exec, ::std::size_t __segme
_InRange&& __in_rng, _OutRange&& __out_rng, _TmpBuf&& __tmp_buf,
sycl::event __dependency_event)
{
- using _KernelName = typename __decay_t<_ExecutionPolicy>::kernel_name;
-#if __SYCL_UNNAMED_LAMBDA__
- using __in_range_t = __decay_t<_InRange>;
- using __out_range_t = __decay_t<_OutRange>;
- using __tmp_buf_t = __decay_t<_TmpBuf>;
- using __count_kernel_name = __radix_sort_count_kernel<__in_range_t, __tmp_buf_t, _KernelName>;
- using __scan_kernel_name_1 = __radix_sort_scan_kernel_1<__tmp_buf_t, _KernelName>;
- using __scan_kernel_name_2 = __radix_sort_scan_kernel_2<__tmp_buf_t, _KernelName>;
- using __reorder_kernel_name = __radix_sort_reorder_kernel<__in_range_t, __out_range_t, ::std::size_t, _KernelName>;
-#else
- using __count_kernel_name = __radix_sort_count_kernel<_KernelName>;
- using __scan_kernel_name_1 = __radix_sort_scan_kernel_1<_KernelName>;
- using __scan_kernel_name_2 = __radix_sort_scan_kernel_2<_KernelName>;
- using __reorder_kernel_name = __radix_sort_reorder_kernel<_KernelName>;
-#endif
+ using _CustomName = typename __decay_t<_ExecutionPolicy>::kernel_name;
+ using _RadixCountKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_count_kernel, _CustomName,
+ __decay_t<_InRange>, __decay_t<_TmpBuf>>;
+ using _RadixLocalScanKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_scan_kernel_1, _CustomName,
+ __decay_t<_TmpBuf>>;
+ using _RadixGlobalScanKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_scan_kernel_2, _CustomName,
+ __decay_t<_TmpBuf>>;
+ using _RadixReorderKernel =
+ oneapi::dpl::__par_backend_hetero::__internal::_KernelName_t<__radix_sort_reorder_kernel, _CustomName,
+ __decay_t<_InRange>, __decay_t<_OutRange>>;
::std::size_t __max_sg_size = oneapi::dpl::__internal::__max_sub_group_size(__exec);
::std::size_t __scan_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
::std::size_t __block_size = __max_sg_size;
::std::size_t __reorder_sg_size = __max_sg_size;
#if _ONEDPL_COMPILE_KERNEL
- auto __count_kernel = __count_kernel_name::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
- auto __scan_kernel_1 = __scan_kernel_name_1::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
- auto __reorder_kernel = __reorder_kernel_name::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+ auto __count_kernel = _RadixCountKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+ auto __scan_kernel_1 = _RadixLocalScanKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+ auto __reorder_kernel = _RadixReorderKernel::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
::std::size_t __count_sg_size = oneapi::dpl::__internal::__kernel_sub_group_size(__exec, __count_kernel);
__reorder_sg_size = oneapi::dpl::__internal::__kernel_sub_group_size(__exec, __reorder_kernel);
__block_size = sycl::max(__count_sg_size, __reorder_sg_size);
@@ -596,7 +597,7 @@ __parallel_radix_sort_iteration(_ExecutionPolicy&& __exec, ::std::size_t __segme
__block_size = __radix_states;
// 1. Count Phase
- sycl::event __count_event = __radix_sort_count_submit<__count_kernel_name, __radix_bits, __is_comp_asc>(
+ sycl::event __count_event = __radix_sort_count_submit<_RadixCountKernel, __radix_bits, __is_comp_asc>(
__exec, __segments, __block_size, __radix_iter, __in_rng, __tmp_buf, __dependency_event
#if _ONEDPL_COMPILE_KERNEL
,
@@ -605,11 +606,11 @@ __parallel_radix_sort_iteration(_ExecutionPolicy&& __exec, ::std::size_t __segme
);
// 2. Scan Phase
- sycl::event __scan_event = __radix_sort_scan_submit<__scan_kernel_name_1, __scan_kernel_name_2, __radix_bits>(
+ sycl::event __scan_event = __radix_sort_scan_submit<_RadixLocalScanKernel, _RadixGlobalScanKernel, __radix_bits>(
__exec, __scan_wg_size, __segments, __tmp_buf, __count_event);
// 3. Reorder Phase
- sycl::event __reorder_event = __radix_sort_reorder_submit<__reorder_kernel_name, __radix_bits, __is_comp_asc>(
+ sycl::event __reorder_event = __radix_sort_reorder_submit<_RadixReorderKernel, __radix_bits, __is_comp_asc>(
__exec, __segments, __block_size, __reorder_sg_size, __radix_iter, __in_rng, __out_rng, __tmp_buf, __scan_event
#if _ONEDPL_COMPILE_KERNEL
,
@@ -662,10 +663,12 @@ __parallel_radix_sort(_ExecutionPolicy&& __exec, _Range&& __in_rng)
// TODO: convert to ordered type once at the first iteration and convert back at the last one
if (__radix_iter % 2 == 0)
__iteration_event = __parallel_radix_sort_iteration<__radix_bits, __is_comp_asc>(
- __exec, __segments, __radix_iter, __in_rng, __out_rng, __tmp_buf, __iteration_event);
- else //swap __in_rng and__out_rng
+ ::std::forward<_ExecutionPolicy>(__exec), __segments, __radix_iter, __in_rng, __out_rng, __tmp_buf,
+ __iteration_event);
+ else //swap __in_rng and __out_rng
__iteration_event = __parallel_radix_sort_iteration<__radix_bits, __is_comp_asc>(
- __exec, __segments, __radix_iter, __out_rng, __in_rng, __tmp_buf, __iteration_event);
+ make_wrapped_policy<__odd_iteration>(::std::forward<_ExecutionPolicy>(__exec)), __segments,
+ __radix_iter, __out_rng, __in_rng, __tmp_buf, __iteration_event);
// TODO: since reassign to __iteration_event does not work, we have to make explicit wait on the event
explicit_wait_if<::std::is_pointer<decltype(__in_rng.begin())>::value>{}(__iteration_event);
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index 67892b8a7c1e..ee275c252881 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -82,8 +82,45 @@ struct explicit_wait_if<true>
}
};
+// Helper that wraps the policy's kernel name into another class template (_NewKernelName)
+template <template <typename> class _NewKernelName, typename _Policy,
+ oneapi::dpl::__internal::__enable_if_device_execution_policy<_Policy, int> = 0>
+auto
+make_wrapped_policy(_Policy&& __policy)
+ -> decltype(oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
+ ::std::forward<_Policy>(__policy)))
+{
+ return oneapi::dpl::execution::make_device_policy<_NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
+ ::std::forward<_Policy>(__policy));
+}
+
+#if _ONEDPL_FPGA_DEVICE
+template <template <typename> class _NewKernelName, typename _Policy,
+ oneapi::dpl::__internal::__enable_if_fpga_execution_policy<_Policy, int> = 0>
+auto
+make_wrapped_policy(_Policy&& __policy)
+ -> decltype(oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor,
+ _NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
+ ::std::forward<_Policy>(__policy)))
+{
+ return oneapi::dpl::execution::make_fpga_policy<__decay_t<_Policy>::unroll_factor,
+ _NewKernelName<typename __decay_t<_Policy>::kernel_name>>(
+ ::std::forward<_Policy>(__policy));
+}
+#endif
+
namespace __internal
{
+
+template <template <typename...> class _BaseName, typename _CustomName, typename... _Args>
+using _KernelName_t =
+#if __SYCL_UNNAMED_LAMBDA__
+ typename std::conditional<std::is_same<_CustomName, oneapi::dpl::execution::DefaultKernelName>::value,
+ _BaseName<_CustomName, _Args...>, _BaseName<_CustomName>>::type;
+#else
+ _BaseName<_CustomName>;
+#endif
+
#if _ONEDPL_DEBUG_SYCL
template <typename _Policy>
inline void
diff --git a/test/general/lambda_naming.pass.cpp b/test/general/lambda_naming.pass.cpp
index 2b9b5c0a7d67..b3040432bc56 100644
--- a/test/general/lambda_naming.pass.cpp
+++ b/test/general/lambda_naming.pass.cpp
@@ -30,7 +30,7 @@ using namespace TestUtils;
// This is the simple test for compilation only, to check if lambda naming works correctly
int main() {
-#if __SYCL_UNNAMED_LAMBDA__ && _ONEDPL_BACKEND_SYCL
+#if _ONEDPL_BACKEND_SYCL
const int n = 1000;
sycl::buffer<int, 1> buf{ sycl::range<1>(n) };
sycl::buffer<int, 1> out_buf{ sycl::range<1>(n) };
@@ -44,10 +44,13 @@ int main() {
sycl::noinit);
#else
sycl::property::noinit{});
-#endif
+#endif // __cplusplus >= 201703L
+
::std::fill(policy, buf_begin_discard_write, buf_begin_discard_write + n, 1);
+#if __SYCL_UNNAMED_LAMBDA__
::std::sort(policy, buf_begin, buf_end);
::std::for_each(policy, buf_begin, buf_end, [](int& x) { x += 41; });
+
#if !_ONEDPL_FPGA_DEVICE
::std::inplace_merge(policy, buf_begin, buf_begin + n / 2, buf_end);
auto red_val = ::std::reduce(policy, buf_begin, buf_end, 1);
@@ -58,8 +61,15 @@ int main() {
EXPECT_TRUE(!is_equal, "wrong return value from equal");
auto does_1_exist = ::std::find(policy, buf_begin, buf_end, 1);
EXPECT_TRUE(does_1_exist - buf_begin == 1000, "wrong return value from find");
-#endif
-#endif
+#endif // !_ONEDPL_FPGA_DEVICE
+
+#else
+ // ::std::for_each(policy, buf_begin, buf_end, [](int& x) { x++; }); // Not allowed: without unnamed-lambda support, reusing the same policy requires a distinctly named kernel
+ ::std::for_each(oneapi::dpl::execution::make_device_policy<class ForEach>(policy), buf_begin, buf_end, [](int& x) { x++; });
+ auto red_val = ::std::reduce(policy, buf_begin, buf_end, 1);
+ EXPECT_TRUE(red_val == 2001, "wrong return value from reduce");
+#endif // __SYCL_UNNAMED_LAMBDA__
+#endif // _ONEDPL_BACKEND_SYCL
::std::cout << done() << ::std::endl;
return 0;
}
diff --git a/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp b/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp
index 6384864b1963..f608f6aeb287 100644
--- a/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp
+++ b/test/parallel_api/algorithm/alg.modifying.operations/shift_left_right.pass.cpp
@@ -31,6 +31,9 @@
#define _PSTL_TEST_SHIFT_RIGHT
#endif
+template<typename Name>
+struct USM;
+
struct test_shift
{
template <typename Policy, typename It, typename Algo>
@@ -86,7 +89,7 @@ struct test_shift
//copying data to USM buffer
::std::copy_n(first, m, ptr.get());
- auto het_res = algo(::std::forward<Policy>(exec), ptr.get(), ptr.get() + m, n);
+ auto het_res = algo(oneapi::dpl::execution::make_device_policy<USM<Algo>>(::std::forward<Policy>(exec)), ptr.get(), ptr.get() + m, n);
res_idx = het_res - ptr.get();
//3.2 check result
@@ -97,16 +100,6 @@ struct test_shift
#endif
};
-template <typename T, typename Size, typename Algo>
-void
-test_shift_by_type(Size m, Size n, Algo algo)
-{
- TestUtils::Sequence<T> orig(m, [](::std::size_t v) -> T { return T(v); }); //fill data
- TestUtils::Sequence<T> in(m, [](::std::size_t v) -> T { return T(v); }); //fill data
-
- TestUtils::invoke_on_all_policies<>()(test_shift(), in.begin(), m, orig.begin(), n, algo);
-}
-
struct shift_left_algo
{
template <typename Policy, typename It>
@@ -188,6 +181,21 @@ struct shift_right_algo
}
};
+template <typename T, typename Size>
+void
+test_shift_by_type(Size m, Size n)
+{
+ TestUtils::Sequence<T> orig(m, [](::std::size_t v) -> T { return T(v); }); //fill data
+ TestUtils::Sequence<T> in(m, [](::std::size_t v) -> T { return T(v); }); //fill data
+
+#ifdef _PSTL_TEST_SHIFT_LEFT
+ TestUtils::invoke_on_all_policies<0>()(test_shift(), in.begin(), m, orig.begin(), n, shift_left_algo{});
+#endif
+#ifdef _PSTL_TEST_SHIFT_RIGHT
+ TestUtils::invoke_on_all_policies<1>()(test_shift(), in.begin(), m, orig.begin(), n, shift_right_algo{});
+#endif
+}
+
int
main()
{
@@ -195,13 +203,7 @@ main()
for (long m = 0; m < N; m = m < 16 ? m + 1 : long(3.1415 * m))
for (long n = 0; n < N; n = n < 16 ? n + 1 : long(3.1415 * n))
{
- //std::cout << "m: " << m << " n: " << n << std::endl;
-#ifdef _PSTL_TEST_SHIFT_LEFT
- test_shift_by_type<int32_t>(m, n, shift_left_algo{});
-#endif
-#ifdef _PSTL_TEST_SHIFT_RIGHT
- test_shift_by_type<int32_t>(m, n, shift_right_algo{});
-#endif
+ test_shift_by_type<int32_t>(m, n);
}
::std::cout << TestUtils::done() << ::std::endl;
diff --git a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp
index 0a6609e51b54..8317e0bfa910 100644
--- a/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/exclusive_scan_by_segment.pass.cpp
@@ -103,7 +103,7 @@ void test_with_usm()
res_head[5] = 9; res_head[6] = 9; res_head[7] = 9; res_head[8] = 9; res_head[9] = 9;
// call algorithm
- auto new_policy = oneapi::dpl::execution::make_device_policy(q);
+ auto new_policy = oneapi::dpl::execution::make_device_policy<class exclusive_scan_by_segment_1>(q);
oneapi::dpl::exclusive_scan_by_segment(new_policy, key_head, key_head+n, val_head, res_head,
(uint64_t)0, std::equal_to<uint64_t>(), std::plus<uint64_t>());
q.wait();
@@ -120,7 +120,7 @@ void test_with_usm()
// call algorithm on single element range
res_head[0] = 9;
- auto new_policy2 = oneapi::dpl::execution::make_device_policy(q);
+ auto new_policy2 = oneapi::dpl::execution::make_device_policy<class exclusive_scan_by_segment_2>(q);
oneapi::dpl::exclusive_scan_by_segment(new_policy2, key_head, key_head+1, val_head, res_head,
(uint64_t)0);
diff --git a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp
index 0c130857f894..c9ab5a00d8cf 100644
--- a/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/inclusive_scan_by_segment.pass.cpp
@@ -102,7 +102,7 @@ void test_with_usm()
res_head[5] = 9; res_head[6] = 9; res_head[7] = 9; res_head[8] = 9; res_head[9] = 9;
// call algorithm
- auto new_policy = oneapi::dpl::execution::make_device_policy(q);
+ auto new_policy = oneapi::dpl::execution::make_device_policy<class inclusive_scan_by_segment_1>(q);
oneapi::dpl::inclusive_scan_by_segment(new_policy, key_head, key_head+n, val_head, res_head,
std::equal_to<uint64_t>(), std::plus<uint64_t>());
@@ -118,7 +118,7 @@ void test_with_usm()
// call algorithm on single element range
res_head[0] = 9;
- auto new_policy2 = oneapi::dpl::execution::make_device_policy(q);
+ auto new_policy2 = oneapi::dpl::execution::make_device_policy<class inclusive_scan_by_segment_2>(q);
oneapi::dpl::inclusive_scan_by_segment(new_policy2, key_head, key_head+1, val_head, res_head);
// check values
diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
index 2d125d0a3b7d..96e48a104f37 100644
--- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
@@ -152,7 +152,7 @@ void test_with_usm()
val_head[12] = 0;
// call algorithm
- auto new_policy = oneapi::dpl::execution::make_device_policy(q);
+ auto new_policy = oneapi::dpl::execution::make_device_policy<class reduce_by_segment_1>(q);
auto res1 = oneapi::dpl::reduce_by_segment(new_policy, key_head, key_head + n, val_head, key_res_head, val_res_head);
// check values
@@ -179,7 +179,7 @@ void test_with_usm()
key_res_head[0] = 9;
val_res_head[0] = 9;
- auto new_policy2 = oneapi::dpl::execution::make_device_policy(q);
+ auto new_policy2 = oneapi::dpl::execution::make_device_policy<class reduce_by_segment_2>(q);
auto res2 = oneapi::dpl::reduce_by_segment(new_policy2, key_head, key_head + 1, val_head, key_res_head, val_res_head);
// check values
diff --git a/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp b/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp
index 49ce643713d8..76fe8c246cc1 100644
--- a/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp
+++ b/test/parallel_api/ranges/merge_ranges_sycl.pass.cpp
@@ -50,7 +50,7 @@ main()
auto exec = TestUtils::default_dpcpp_policy;
merge(exec, all_view(A), all_view(B), all_view<T, sycl::access::mode::write>(D));
- merge(exec, all_view(A), all_view(B), all_view<T, sycl::access::mode::write>(E), ::std::less<T>());
+ merge(oneapi::dpl::execution::make_device_policy<class merge_2>(exec), all_view(A), all_view(B), all_view<T, sycl::access::mode::write>(E), ::std::less<T>());
}
//check result