Skip to content

Commit f1ae68f

Browse files
Use sequential sorting for small arrays
1 parent d901322 commit f1ae68f

File tree

1 file changed

+37
-24
lines changed

1 file changed

+37
-24
lines changed

dpctl/tensor/libtensor/include/kernels/sorting.hpp

Lines changed: 37 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -384,28 +384,26 @@ class sort_base_step_contig_krn;
384384
template <typename InpAcc, typename OutAcc, typename Comp>
385385
sycl::event
386386
sort_base_step_contig_impl(sycl::queue &q,
387-
size_t iter_nelems,
388-
size_t sort_nelems,
387+
const size_t iter_nelems,
388+
const size_t sort_nelems,
389389
const InpAcc input,
390390
OutAcc output,
391391
const Comp &comp,
392-
size_t &conseq_nelems_sorted,
392+
const size_t conseq_nelems_sorted,
393393
const std::vector<sycl::event> &depends = {})
394394
{
395395

396396
using inpT = typename GetValueType<InpAcc>::value_type;
397397
using outT = typename GetValueType<OutAcc>::value_type;
398398
using KernelName = sort_base_step_contig_krn<inpT, outT, Comp>;
399399

400-
conseq_nelems_sorted = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4);
401-
402-
size_t n_segments =
400+
const size_t n_segments =
403401
quotient_ceil<size_t>(sort_nelems, conseq_nelems_sorted);
404402

405403
sycl::event base_sort = q.submit([&](sycl::handler &cgh) {
406404
cgh.depends_on(depends);
407405

408-
sycl::range<1> gRange{iter_nelems * n_segments};
406+
const sycl::range<1> gRange{iter_nelems * n_segments};
409407

410408
auto input_acc = GetReadOnlyAccess<InpAcc>{}(input, cgh);
411409
auto output_acc = GetWriteDiscardAccess<OutAcc>{}(output, cgh);
@@ -478,7 +476,8 @@ sort_over_work_group_contig_impl(sycl::queue &q,
478476
nelems_wg_sorts = elems_per_wi * lws;
479477

480478
if (nelems_wg_sorts > nelems_per_slm) {
481-
nelems_wg_sorts = 0;
479+
nelems_wg_sorts = (q.get_device().has(sycl::aspect::cpu) ? 16 : 4);
480+
482481
return sort_base_step_contig_impl<InpAcc, OutAcc, Comp>(
483482
q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts,
484483
depends);
@@ -781,24 +780,38 @@ sycl::event stable_sort_axis1_contig_impl(
781780

782781
auto comp = Comp{};
783782

784-
static constexpr size_t determine_automatically = 0;
785-
size_t sorted_block_size =
786-
(sort_nelems >= 512) ? 512 : determine_automatically;
783+
constexpr size_t sequential_sorting_threshold = 64;
787784

788-
// Sort segments of the array
789-
sycl::event base_sort_ev = sort_detail::sort_over_work_group_contig_impl<
790-
const argTy *, argTy *, Comp>(
791-
exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp,
792-
sorted_block_size, // modified in place with size of sorted block size
793-
depends);
794-
795-
// Merge segments in parallel until all elements are sorted
796-
sycl::event merges_ev =
797-
sort_detail::merge_sorted_block_contig_impl<argTy *, Comp>(
798-
exec_q, iter_nelems, sort_nelems, res_tp, comp, sorted_block_size,
799-
{base_sort_ev});
785+
if (sort_nelems < sequential_sorting_threshold) {
786+
// each work-item sorts an entire row
787+
sycl::event sequential_sorting_ev =
788+
sort_detail::sort_base_step_contig_impl<const argTy *, argTy *,
789+
Comp>(
790+
exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp,
791+
sort_nelems, depends);
800792

801-
return merges_ev;
793+
return sequential_sorting_ev;
794+
}
795+
else {
796+
size_t sorted_block_size{};
797+
798+
// Sort segments of the array
799+
sycl::event base_sort_ev =
800+
sort_detail::sort_over_work_group_contig_impl<const argTy *,
801+
argTy *, Comp>(
802+
exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp,
803+
sorted_block_size, // set in place to the size of each sorted
804+
// block
805+
depends);
806+
807+
// Merge segments in parallel until all elements are sorted
808+
sycl::event merges_ev =
809+
sort_detail::merge_sorted_block_contig_impl<argTy *, Comp>(
810+
exec_q, iter_nelems, sort_nelems, res_tp, comp,
811+
sorted_block_size, {base_sort_ev});
812+
813+
return merges_ev;
814+
}
802815
}
803816

804817
template <typename T1, typename T2, typename T3>

0 commit comments

Comments
 (0)