Skip to content

Commit 0b43a0e

Browse files
authored
Merge pull request numpy#25891 from lysnikolaou/string-ufuncs-expandtabs
ENH: Add expandtabs ufunc for string & unicode dtypes
2 parents c0617ac + c5df089 commit 0b43a0e

File tree

8 files changed

+455
-27
lines changed

8 files changed

+455
-27
lines changed

numpy/_core/code_generators/generate_umath.py

+11
Original file line numberDiff line numberDiff line change
@@ -1270,6 +1270,17 @@ def english_upper(s):
12701270
docstrings.get('numpy._core.umath._rstrip_whitespace'),
12711271
None,
12721272
),
1273+
'_expandtabs_length':
1274+
Ufunc(2, 1, None,
1275+
docstrings.get('numpy._core.umath._expandtabs_length'),
1276+
None,
1277+
),
1278+
'_expandtabs':
1279+
Ufunc(2, 1, None,
1280+
docstrings.get('numpy._core.umath._expandtabs'),
1281+
None,
1282+
),
1283+
12731284
}
12741285

12751286
def indent(st, spaces):

numpy/_core/code_generators/ufunc_docstrings.py

+3
Original file line numberDiff line numberDiff line change
@@ -4882,3 +4882,6 @@ def add_newdoc(place, name, doc):
48824882
add_newdoc('numpy._core.umath', '_strip_whitespace', '')
48834883
add_newdoc('numpy._core.umath', '_lstrip_whitespace', '')
48844884
add_newdoc('numpy._core.umath', '_rstrip_whitespace', '')
4885+
4886+
add_newdoc('numpy._core.umath', '_expandtabs_length', '')
4887+
add_newdoc('numpy._core.umath', '_expandtabs', '')

numpy/_core/src/umath/string_buffer.h

+84-8
Original file line numberDiff line numberDiff line change
@@ -404,28 +404,36 @@ struct Buffer {
404404
}
405405
}
406406

407-
inline void
407+
inline npy_intp
408408
buffer_memset(npy_ucs4 fill_char, size_t n_chars)
409409
{
410410
if (n_chars == 0) {
411-
return;
411+
return 0;
412412
}
413413
switch (enc) {
414414
case ENCODING::ASCII:
415-
memset(buf, fill_char, n_chars);
416-
break;
415+
memset(this->buf, fill_char, n_chars);
416+
return n_chars;
417417
case ENCODING::UTF32:
418418
{
419-
char *tmp = buf;
419+
char *tmp = this->buf;
420420
for (size_t i = 0; i < n_chars; i++) {
421421
*(npy_ucs4 *)tmp = fill_char;
422422
tmp += sizeof(npy_ucs4);
423423
}
424-
break;
424+
return n_chars;
425425
}
426426
case ENCODING::UTF8:
427-
assert(false); // buffer_memset not used by stringdtype
428-
break;
427+
{
428+
char utf8_c[4] = {0};
429+
char *tmp = this->buf;
430+
size_t num_bytes = ucs4_code_to_utf8_char(fill_char, utf8_c);
431+
for (size_t i = 0; i < n_chars; i++) {
432+
memcpy(tmp, utf8_c, num_bytes);
433+
tmp += num_bytes;
434+
}
435+
return num_bytes * n_chars;
436+
}
429437
}
430438
}
431439

@@ -1428,5 +1436,73 @@ string_replace(Buffer<enc> buf1, Buffer<enc> buf2, Buffer<enc> buf3, npy_int64 c
14281436
}
14291437

14301438

1439+
template <ENCODING enc>
1440+
static inline npy_intp
1441+
string_expandtabs_length(Buffer<enc> buf, npy_int64 tabsize)
1442+
{
1443+
size_t len = buf.num_codepoints();
1444+
1445+
npy_intp new_len = 0, line_pos = 0;
1446+
1447+
Buffer<enc> tmp = buf;
1448+
for (size_t i = 0; i < len; i++) {
1449+
npy_ucs4 ch = *tmp;
1450+
if (ch == '\t') {
1451+
if (tabsize > 0) {
1452+
npy_intp incr = tabsize - (line_pos % tabsize);
1453+
line_pos += incr;
1454+
new_len += incr;
1455+
}
1456+
}
1457+
else {
1458+
line_pos += 1;
1459+
size_t n_bytes = tmp.num_bytes_next_character();
1460+
new_len += n_bytes;
1461+
if (ch == '\n' || ch == '\r') {
1462+
line_pos = 0;
1463+
}
1464+
}
1465+
if (new_len == PY_SSIZE_T_MAX || new_len < 0) {
1466+
npy_gil_error(PyExc_OverflowError, "new string is too long");
1467+
return -1;
1468+
}
1469+
tmp++;
1470+
}
1471+
return new_len;
1472+
}
1473+
1474+
1475+
template <ENCODING enc>
1476+
static inline npy_intp
1477+
string_expandtabs(Buffer<enc> buf, npy_int64 tabsize, Buffer<enc> out)
1478+
{
1479+
size_t len = buf.num_codepoints();
1480+
1481+
npy_intp new_len = 0, line_pos = 0;
1482+
1483+
Buffer<enc> tmp = buf;
1484+
for (size_t i = 0; i < len; i++) {
1485+
npy_ucs4 ch = *tmp;
1486+
if (ch == '\t') {
1487+
if (tabsize > 0) {
1488+
npy_intp incr = tabsize - (line_pos % tabsize);
1489+
line_pos += incr;
1490+
new_len += out.buffer_memset((npy_ucs4) ' ', incr);
1491+
out += incr;
1492+
}
1493+
}
1494+
else {
1495+
line_pos++;
1496+
new_len += out.buffer_memset(ch, 1);
1497+
out++;
1498+
if (ch == '\n' || ch == '\r') {
1499+
line_pos = 0;
1500+
}
1501+
}
1502+
tmp++;
1503+
}
1504+
return new_len;
1505+
}
1506+
14311507

14321508
#endif /* _NPY_CORE_SRC_UMATH_STRING_BUFFER_H_ */

numpy/_core/src/umath/string_ufuncs.cpp

+158-4
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,63 @@ string_lrstrip_chars_loop(PyArrayMethod_Context *context,
450450
}
451451

452452

453+
template <ENCODING enc>
454+
static int
455+
string_expandtabs_length_loop(PyArrayMethod_Context *context,
456+
char *const data[], npy_intp const dimensions[],
457+
npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata))
458+
{
459+
int elsize = context->descriptors[0]->elsize;
460+
461+
char *in1 = data[0];
462+
char *in2 = data[1];
463+
char *out = data[2];
464+
465+
npy_intp N = dimensions[0];
466+
467+
while (N--) {
468+
Buffer<enc> buf(in1, elsize);
469+
*(npy_intp *)out = string_expandtabs_length(buf, *(npy_int64 *)in2);
470+
471+
in1 += strides[0];
472+
in2 += strides[1];
473+
out += strides[2];
474+
}
475+
476+
return 0;
477+
}
478+
479+
480+
template <ENCODING enc>
481+
static int
482+
string_expandtabs_loop(PyArrayMethod_Context *context,
483+
char *const data[], npy_intp const dimensions[],
484+
npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata))
485+
{
486+
int elsize = context->descriptors[0]->elsize;
487+
int outsize = context->descriptors[2]->elsize;
488+
489+
char *in1 = data[0];
490+
char *in2 = data[1];
491+
char *out = data[2];
492+
493+
npy_intp N = dimensions[0];
494+
495+
while (N--) {
496+
Buffer<enc> buf(in1, elsize);
497+
Buffer<enc> outbuf(out, outsize);
498+
npy_intp new_len = string_expandtabs(buf, *(npy_int64 *)in2, outbuf);
499+
outbuf.buffer_fill_with_zeros_after_index(new_len);
500+
501+
in1 += strides[0];
502+
in2 += strides[1];
503+
out += strides[2];
504+
}
505+
506+
return 0;
507+
}
508+
509+
453510
/* Resolve descriptors & promoter functions */
454511

455512
static NPY_CASTING
@@ -517,9 +574,9 @@ string_multiply_resolve_descriptors(
517574
static NPY_CASTING
518575
string_strip_whitespace_resolve_descriptors(
519576
PyArrayMethodObject *NPY_UNUSED(self),
520-
PyArray_DTypeMeta *NPY_UNUSED(dtypes[3]),
521-
PyArray_Descr *given_descrs[3],
522-
PyArray_Descr *loop_descrs[3],
577+
PyArray_DTypeMeta *NPY_UNUSED(dtypes[2]),
578+
PyArray_Descr *given_descrs[2],
579+
PyArray_Descr *loop_descrs[2],
523580
npy_intp *NPY_UNUSED(view_offset))
524581
{
525582
loop_descrs[0] = NPY_DT_CALL_ensure_canonical(given_descrs[0]);
@@ -600,7 +657,7 @@ string_replace_promoter(PyObject *NPY_UNUSED(ufunc),
600657
static NPY_CASTING
601658
string_replace_resolve_descriptors(
602659
PyArrayMethodObject *NPY_UNUSED(self),
603-
PyArray_DTypeMeta *NPY_UNUSED(dtypes[3]),
660+
PyArray_DTypeMeta *NPY_UNUSED(dtypes[5]),
604661
PyArray_Descr *given_descrs[5],
605662
PyArray_Descr *loop_descrs[5],
606663
npy_intp *NPY_UNUSED(view_offset))
@@ -651,6 +708,67 @@ string_startswith_endswith_promoter(PyObject *NPY_UNUSED(ufunc),
651708
}
652709

653710

711+
static int
712+
string_expandtabs_length_promoter(PyObject *NPY_UNUSED(ufunc),
713+
PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
714+
PyArray_DTypeMeta *new_op_dtypes[])
715+
{
716+
Py_INCREF(op_dtypes[0]);
717+
new_op_dtypes[0] = op_dtypes[0];
718+
new_op_dtypes[1] = NPY_DT_NewRef(&PyArray_Int64DType);
719+
new_op_dtypes[2] = PyArray_DTypeFromTypeNum(NPY_DEFAULT_INT);
720+
return 0;
721+
}
722+
723+
724+
static int
725+
string_expandtabs_promoter(PyObject *NPY_UNUSED(ufunc),
726+
PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[],
727+
PyArray_DTypeMeta *new_op_dtypes[])
728+
{
729+
Py_INCREF(op_dtypes[0]);
730+
new_op_dtypes[0] = op_dtypes[0];
731+
new_op_dtypes[1] = NPY_DT_NewRef(&PyArray_Int64DType);
732+
Py_INCREF(op_dtypes[0]);
733+
new_op_dtypes[2] = op_dtypes[0];
734+
return 0;
735+
}
736+
737+
738+
/*
 * Descriptor resolution for `_expandtabs`.
 *
 * The output descriptor must be supplied by the caller: when
 * `given_descrs[2]` is NULL a TypeError is raised and
 * `_NPY_ERROR_OCCURRED_IN_CAST` is returned. Otherwise each of the three
 * given descriptors is passed through `NPY_DT_CALL_ensure_canonical`, in
 * order, and stored in `loop_descrs`; the first NULL result stops the
 * process with `_NPY_ERROR_OCCURRED_IN_CAST`. Returns NPY_NO_CASTING on
 * success.
 */
static NPY_CASTING
string_expandtabs_resolve_descriptors(
        PyArrayMethodObject *NPY_UNUSED(self),
        PyArray_DTypeMeta *NPY_UNUSED(dtypes[3]),
        PyArray_Descr *given_descrs[3],
        PyArray_Descr *loop_descrs[3],
        npy_intp *NPY_UNUSED(view_offset))
{
    if (given_descrs[2] == NULL) {
        PyErr_SetString(
            PyExc_TypeError,
            "The 'out' kwarg is necessary. Use numpy.strings.expandtabs without it.");
        return _NPY_ERROR_OCCURRED_IN_CAST;
    }

    for (int idx = 0; idx < 3; idx++) {
        PyArray_Descr *canonical = NPY_DT_CALL_ensure_canonical(given_descrs[idx]);
        loop_descrs[idx] = canonical;
        if (canonical == NULL) {
            return _NPY_ERROR_OCCURRED_IN_CAST;
        }
    }

    return NPY_NO_CASTING;
}
770+
771+
654772
/*
655773
* Machinery to add the string loops to the existing ufuncs.
656774
*/
@@ -1130,6 +1248,42 @@ init_string_ufuncs(PyObject *umath)
11301248
}
11311249
}
11321250

1251+
dtypes[0] = NPY_OBJECT;
1252+
dtypes[1] = NPY_INT64;
1253+
dtypes[2] = NPY_DEFAULT_INT;
1254+
if (init_ufunc(
1255+
umath, "_expandtabs_length", 2, 1, dtypes, ENCODING::ASCII,
1256+
string_expandtabs_length_loop<ENCODING::ASCII>, NULL, NULL) < 0) {
1257+
return -1;
1258+
}
1259+
if (init_ufunc(
1260+
umath, "_expandtabs_length", 2, 1, dtypes, ENCODING::UTF32,
1261+
string_expandtabs_length_loop<ENCODING::UTF32>, NULL, NULL) < 0) {
1262+
return -1;
1263+
}
1264+
if (init_promoter(umath, "_expandtabs_length", 2, 1, string_expandtabs_length_promoter) < 0) {
1265+
return -1;
1266+
}
1267+
1268+
dtypes[0] = NPY_OBJECT;
1269+
dtypes[1] = NPY_INT64;
1270+
dtypes[2] = NPY_OBJECT;
1271+
if (init_ufunc(
1272+
umath, "_expandtabs", 2, 1, dtypes, ENCODING::ASCII,
1273+
string_expandtabs_loop<ENCODING::ASCII>,
1274+
string_expandtabs_resolve_descriptors, NULL) < 0) {
1275+
return -1;
1276+
}
1277+
if (init_ufunc(
1278+
umath, "_expandtabs", 2, 1, dtypes, ENCODING::UTF32,
1279+
string_expandtabs_loop<ENCODING::UTF32>,
1280+
string_expandtabs_resolve_descriptors, NULL) < 0) {
1281+
return -1;
1282+
}
1283+
if (init_promoter(umath, "_expandtabs", 2, 1, string_expandtabs_promoter) < 0) {
1284+
return -1;
1285+
}
1286+
11331287
return 0;
11341288
}
11351289

0 commit comments

Comments
 (0)