// Provides the various *_LOOP macros
#include "fast_loop_macros.h"

+
+#define DIVIDEBYZERO_OVERFLOW_CHECK(x, y, min_val, signed) \
+    (NPY_UNLIKELY(                                         \
+        (signed) ?                                         \
+            ((y == 0) || ((x == min_val) && (y == -1))) :  \
+            (y == 0))                                      \
+    )
+
+#define FLAG_IF_DIVIDEBYZERO(x) do {     \
+    if (NPY_UNLIKELY(x == 0)) {          \
+        npy_set_floatstatus_divbyzero(); \
+    }                                    \
+} while (0)
+
+
#if NPY_SIMD && defined(NPY_HAVE_VSX4)
typedef struct {
    npyv_u32x2 hi;
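
For orientation, the two helpers above implement the error checks that the rest of the patch reuses: for signed integers, division is invalid when the divisor is zero or when the minimum value is divided by -1 (the true quotient does not fit the type); for unsigned integers, only a zero divisor can go wrong. A standalone sketch of the same check, with hypothetical stand-ins (UNLIKELY and DIV_ERROR_CHECK below are illustrations, not the NumPy definitions):

    #include <limits.h>
    #include <stdio.h>

    /* stand-in for NPY_UNLIKELY (a branch-prediction hint in NumPy) */
    #define UNLIKELY(x) (x)

    /* mirrors what DIVIDEBYZERO_OVERFLOW_CHECK tests in the patch */
    #define DIV_ERROR_CHECK(x, y, min_val, is_signed)       \
        (UNLIKELY((is_signed)                               \
            ? ((y) == 0 || ((x) == (min_val) && (y) == -1)) \
            : ((y) == 0)))

    int main(void)
    {
        printf("%d\n", DIV_ERROR_CHECK(5, 0, INT_MIN, 1));        /* 1: divide by zero */
        printf("%d\n", DIV_ERROR_CHECK(INT_MIN, -1, INT_MIN, 1)); /* 1: would overflow */
        printf("%d\n", DIV_ERROR_CHECK(5, 3, INT_MIN, 1));        /* 0: ordinary case */
        return 0;
    }
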
@@ -166,7 +181,6 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
    const int vstep = npyv_nlanes_@sfx@;
#if @id@ == 2 /* divmod */
    npyv_lanetype_@sfx@ *dst2 = (npyv_lanetype_@sfx@ *) args[3];
-   const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1);
    npyv_b@len@ warn = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
@@ -176,11 +190,11 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
        npyv_@sfx@ quo = vsx4_div_@sfx@(a, b);
        npyv_@sfx@ rem = npyv_sub_@sfx@(a, vec_mul(b, quo));
        npyv_b@len@ bzero = npyv_cmpeq_@sfx@(b, vzero);
-       // when b is 0, 'cvtozero' forces the modulo to be 0 too
-       npyv_@sfx@ cvtozero = npyv_select_@sfx@(bzero, vzero, vneg_one);
+       // when b is 0, force the remainder to be 0 too
+       rem = npyv_select_@sfx@(bzero, vzero, rem);
        warn = npyv_or_@sfx@(bzero, warn);
        npyv_store_@sfx@(dst1, quo);
-       npyv_store_@sfx@(dst2, npyv_and_@sfx@(cvtozero, rem));
+       npyv_store_@sfx@(dst2, rem);
    }

    if (!vec_all_eq(warn, vzero)) {
@@ -290,7 +304,8 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
    npyv_lanetype_@sfx@ *dst2 = (npyv_lanetype_@sfx@ *) args[3];
    const npyv_@sfx@ vneg_one = npyv_setall_@sfx@(-1);
    const npyv_@sfx@ vmin = npyv_setall_@sfx@(NPY_MIN_INT@len@);
-   npyv_b@len@ warn = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());
+   npyv_b@len@ warn_zero = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());
+   npyv_b@len@ warn_overflow = npyv_cvt_b@len@_@sfx@(npyv_zero_@sfx@());

    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
                         dst1 += vstep, dst2 += vstep) {
@@ -310,10 +325,8 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
        npyv_b@len@ amin = npyv_cmpeq_@sfx@(a, vmin);
        npyv_b@len@ bneg_one = npyv_cmpeq_@sfx@(b, vneg_one);
        npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin);
-       npyv_b@len@ error = npyv_or_@sfx@(bzero, overflow);
-       // in case of overflow or b = 0, 'cvtozero' forces quo/rem to be 0
-       npyv_@sfx@ cvtozero = npyv_select_@sfx@(error, vzero, vneg_one);
-       warn = npyv_or_@sfx@(error, warn);
+       warn_zero = npyv_or_@sfx@(bzero, warn_zero);
+       warn_overflow = npyv_or_@sfx@(overflow, warn_overflow);
#endif
#if @id@ >= 1 /* remainder and divmod */
        // handle mixed case the way Python does
@@ -329,8 +342,14 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
#if @id@ == 2 /* divmod */
        npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one);
        quo = npyv_add_@sfx@(quo, to_sub);
-       npyv_store_@sfx@(dst1, npyv_and_@sfx@(cvtozero, quo));
-       npyv_store_@sfx@(dst2, npyv_and_@sfx@(cvtozero, rem));
+       // divide by zero: quo and rem are forced to 0
+       quo = npyv_select_@sfx@(bzero, vzero, quo);
+       rem = npyv_select_@sfx@(bzero, vzero, rem);
+       // overflow: quo saturates to the minimum, rem becomes 0
+       quo = npyv_select_@sfx@(overflow, vmin, quo);
+       rem = npyv_select_@sfx@(overflow, vzero, rem);
+       npyv_store_@sfx@(dst1, quo);
+       npyv_store_@sfx@(dst2, rem);
#else /* fmod and remainder */
        npyv_store_@sfx@(dst1, rem);
        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
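
The essential change in this hunk: the old code AND-ed quo/rem with an all-zeros or all-ones cvtozero mask, which can only clear error lanes to 0, whereas npyv_select_@sfx@ blends per lane and can substitute any value, which is what allows overflow lanes to carry the saturated minimum. A rough scalar model of the new per-lane logic (divmod_lane is a hypothetical illustration; the Python-style sign fix-up that the real loop applies is omitted):

    #include <stdbool.h>
    #include <stdint.h>

    /* hypothetical scalar model of one 32-bit divmod lane */
    static void divmod_lane(int32_t a, int32_t b, int32_t *quo, int32_t *rem)
    {
        bool bzero    = (b == 0);
        bool overflow = (a == INT32_MIN) && (b == -1);
        /* the vector path divides every lane and patches error lanes afterwards;
         * plain C division would be undefined there, hence the guard here */
        int32_t q = (bzero || overflow) ? 0 : a / b;
        int32_t r = (bzero || overflow) ? 0 : a - b * q;
        /* npyv_select(bzero, vzero, x): divide-by-zero lanes become 0, 0 */
        q = bzero ? 0 : q;
        r = bzero ? 0 : r;
        /* npyv_select(overflow, vmin/vzero, x): overflow lanes become INT_MIN, 0 */
        q = overflow ? INT32_MIN : q;
        r = overflow ? 0 : r;
        *quo = q;
        *rem = r;
    }
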
@@ -340,17 +359,27 @@ vsx4_simd_@func@_contig_@sfx@(char **args, npy_intp len)
    }

#if @id@ == 2 /* divmod */
-   if (!vec_all_eq(warn, vzero)) {
+   if (!vec_all_eq(warn_zero, vzero)) {
        npy_set_floatstatus_divbyzero();
    }
+   if (!vec_all_eq(warn_overflow, vzero)) {
+       npy_set_floatstatus_overflow();
+   }

    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
        const npyv_lanetype_@sfx@ a = *src1;
        const npyv_lanetype_@sfx@ b = *src2;
-       if (b == 0 || (a == NPY_MIN_INT@len@ && b == -1)) {
-           npy_set_floatstatus_divbyzero();
-           *dst1 = 0;
-           *dst2 = 0;
+       if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT@len@, NPY_TRUE)) {
+           if (b == 0) {
+               npy_set_floatstatus_divbyzero();
+               *dst1 = 0;
+               *dst2 = 0;
+           }
+           else {
+               npy_set_floatstatus_overflow();
+               *dst1 = NPY_MIN_INT@len@;
+               *dst2 = 0;
+           }
        }
        else {
            *dst1 = a / b;
@@ -415,8 +444,6 @@ vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len)
        // (a == NPY_MIN_INT@len@ && b == -1)
        npyv_b@len@ amin = npyv_cmpeq_@sfx@(a, vmin);
        npyv_b@len@ overflow = npyv_and_@sfx@(bneg_one, amin);
-       // in case of overflow, 'cvtozero' forces quo/rem to be 0
-       npyv_@sfx@ cvtozero = npyv_select_@sfx@(overflow, vzero, vneg_one);
        warn = npyv_or_@sfx@(overflow, warn);
#endif
#if @id@ >= 1 /* remainder and divmod */
@@ -432,23 +459,26 @@ vsx4_simd_@func@_by_scalar_contig_@sfx@(char **args, npy_intp len)
#if @id@ == 2 /* divmod */
        npyv_@sfx@ to_sub = npyv_select_@sfx@(or, vzero, vneg_one);
        quo = npyv_add_@sfx@(quo, to_sub);
-       npyv_store_@sfx@(dst1, npyv_and_@sfx@(cvtozero, quo));
-       npyv_store_@sfx@(dst2, npyv_and_@sfx@(cvtozero, rem));
+       // Overflow: set quo to minimum and rem to 0
+       quo = npyv_select_@sfx@(overflow, vmin, quo);
+       rem = npyv_select_@sfx@(overflow, vzero, rem);
+       npyv_store_@sfx@(dst1, quo);
+       npyv_store_@sfx@(dst2, rem);
#else /* fmod and remainder */
        npyv_store_@sfx@(dst1, rem);
#endif
    }

#if @id@ == 2 /* divmod */
    if (!vec_all_eq(warn, vzero)) {
-       npy_set_floatstatus_divbyzero();
+       npy_set_floatstatus_overflow();
    }

    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
        const npyv_lanetype_@sfx@ a = *src1;
-       if (a == NPY_MIN_INT@len@ && scalar == -1) {
-           npy_set_floatstatus_divbyzero();
-           *dst1 = 0;
+       if (NPY_UNLIKELY(a == NPY_MIN_INT@len@ && scalar == -1)) {
+           npy_set_floatstatus_overflow();
+           *dst1 = NPY_MIN_INT@len@;
            *dst2 = 0;
        }
        else {
@@ -524,8 +554,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_fmod)
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
-       if (NPY_UNLIKELY(in2 == 0)) {
-           npy_set_floatstatus_divbyzero();
+#if @signed@
+       if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, NPY_MIN_@TYPE@, NPY_TRUE)) {
+#else
+       if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, 0, NPY_FALSE)) {
+#endif
+           FLAG_IF_DIVIDEBYZERO(in2);
            *((@type@ *)op1) = 0;
        } else {
            *((@type@ *)op1) = in1 % in2;
@@ -552,8 +586,12 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_remainder)
    BINARY_LOOP {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
-       if (NPY_UNLIKELY(in2 == 0)) {
-           npy_set_floatstatus_divbyzero();
+#if @signed@
+       if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, NPY_MIN_@TYPE@, NPY_TRUE)) {
+#else
+       if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, 0, NPY_FALSE)) {
+#endif
+           FLAG_IF_DIVIDEBYZERO(in2);
            *((@type@ *)op1) = 0;
        } else {
#if @signed@
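
The @signed@ branch this hunk leads into performs the fix-up referenced by the "handle mixed case the way Python does" comments: C's % truncates toward zero, while NumPy's remainder follows Python's floored convention, so when the remainder and the divisor have opposite signs the divisor is added once. A small illustration of the difference (py_mod is a hypothetical example, not code from the patch):

    #include <stdio.h>

    /* Python-style modulo built from C's truncated %: if the remainder is
     * nonzero and its sign differs from the divisor's, shift by one divisor */
    static int py_mod(int a, int b)
    {
        int r = a % b;
        if (r != 0 && ((r < 0) != (b < 0))) {
            r += b;
        }
        return r;
    }

    int main(void)
    {
        printf("%d %d\n", -7 % 3, py_mod(-7, 3));  /* C: -1, Python: 2 */
        printf("%d %d\n", 7 % -3, py_mod(7, -3));  /* C: 1, Python: -2 */
        return 0;
    }
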
@@ -593,10 +631,17 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divmod)
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
        /* see FIXME note for divide above */
-       if (NPY_UNLIKELY(in2 == 0 || (in1 == NPY_MIN_@TYPE@ && in2 == -1))) {
-           npy_set_floatstatus_divbyzero();
-           *((@type@ *)op1) = 0;
-           *((@type@ *)op2) = 0;
+       if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, NPY_MIN_@TYPE@, NPY_TRUE)) {
+           if (in2 == 0) {
+               npy_set_floatstatus_divbyzero();
+               *((@type@ *)op1) = 0;
+               *((@type@ *)op2) = 0;
+           }
+           else {
+               npy_set_floatstatus_overflow();
+               *((@type@ *)op1) = NPY_MIN_@TYPE@;
+               *((@type@ *)op2) = 0;
+           }
        }
        else {
            /* handle mixed case the way Python does */
@@ -616,7 +661,7 @@ NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divmod)
    BINARY_LOOP_TWO_OUT {
        const @type@ in1 = *(@type@ *)ip1;
        const @type@ in2 = *(@type@ *)ip2;
-       if (NPY_UNLIKELY(in2 == 0)) {
+       if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, 0, NPY_FALSE)) {
            npy_set_floatstatus_divbyzero();
            *((@type@ *)op1) = 0;
            *((@type@ *)op2) = 0;