Skip to content

Remove Encoding of values in char** For Labels #27618

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Aug 23, 2019
Merged
Changes from 7 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
648d2b5
with segfaults
WillAyd Jul 26, 2019
37226d7
Merge remote-tracking branch 'upstream/master' into remove-buf-writes
WillAyd Jul 26, 2019
e890578
Works save datetimes
WillAyd Jul 26, 2019
24a585a
Less failures, still not correct dt support
WillAyd Jul 26, 2019
42cc1ec
Removed encoder attributes in encodeLabels
WillAyd Jul 26, 2019
7385fd8
Added docstring
WillAyd Jul 26, 2019
4aca787
whitespace cleanup
WillAyd Jul 26, 2019
bc5e69a
Removed unused headers
WillAyd Jul 26, 2019
d53527f
Moved macro back to vendor c file
WillAyd Jul 26, 2019
eefadfa
Merge remote-tracking branch 'upstream/master' into remove-buf-writes
WillAyd Aug 1, 2019
7b2af9f
Merge remote-tracking branch 'upstream/master' into remove-buf-writes
WillAyd Aug 5, 2019
2e25bef
Working TS conversions
WillAyd Aug 7, 2019
0820f57
Parametrized test
WillAyd Aug 7, 2019
185aa61
Fixed epoch precision issue for labels
WillAyd Aug 7, 2019
221e43d
Parametrized failing test
WillAyd Aug 7, 2019
18d28c8
Parametrized frame test
WillAyd Aug 7, 2019
7a5fc9c
Removed UTC cast for index
WillAyd Aug 7, 2019
ad06e9c
Merge remote-tracking branch 'upstream/master' into remove-buf-writes
WillAyd Aug 7, 2019
3e22820
blackify
WillAyd Aug 7, 2019
2389a33
Added test for timedelta
WillAyd Aug 7, 2019
104d2f2
Added fix for timedelta as label
WillAyd Aug 7, 2019
dbdaed1
Blackify
WillAyd Aug 8, 2019
3ed7979
Comment cleanup and error handling
WillAyd Aug 8, 2019
01ae178
Added test for tuples
WillAyd Aug 8, 2019
0372794
Whatsnew
WillAyd Aug 8, 2019
e15bede
Warnings fixup
WillAyd Aug 8, 2019
d698255
Simplified object construction
WillAyd Aug 8, 2019
d453ed0
Macro for int64 copying
WillAyd Aug 8, 2019
5ab7ecf
Removed unnecessary padding in sprintf
WillAyd Aug 9, 2019
8ea895e
Merge remote-tracking branch 'upstream/master' into remove-buf-writes
WillAyd Aug 23, 2019
439b695
whitespace fixup
WillAyd Aug 23, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 56 additions & 66 deletions pandas/_libs/src/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -787,30 +787,23 @@ JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) {
return GET_TC(tc)->itemValue;
}

static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen,
npy_intp idx, char **labels) {
JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder;
PRINTMARK();
*outLen = strlen(labels[idx]);
Buffer_Reserve(enc, *outLen);
memcpy(enc->offset, labels[idx], sizeof(char) * (*outLen));
enc->offset += *outLen;
*outLen = 0;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it looks like replacing this function with the simpler version below is orthogonal ot everything else. am i reading it wrong?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not orthogonal just simplified. Previously since the labels would all be encoded into a buffer this function would help return the unencoded label. Instead now all labels are stored unencoded in the char ** and can easily be accessed by index, leaving the ultimate encoding up to ujson


char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
NpyArrContext *npyarr = GET_TC(tc)->npyarr;
npy_intp idx;
PRINTMARK();
char *cStr;

if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) {
idx = npyarr->index[npyarr->stridedim] - 1;
NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels);
cStr = npyarr->columnLabels[idx];
} else {
idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1;
NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels);
cStr = npyarr->rowLabels[idx];
}
return NULL;

*outLen = strlen(cStr);

return cStr;
}

//=============================================================================
Expand Down Expand Up @@ -852,36 +845,42 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) {
PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
NpyArrContext *npyarr = blkCtxt->npyCtxts[0];
npy_intp idx;
char *cStr;
PRINTMARK();

if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) {
idx = blkCtxt->colIdx - 1;
NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels);
cStr = npyarr->columnLabels[idx];
} else {
idx = GET_TC(tc)->iterNext != PdBlock_iterNext
? npyarr->index[npyarr->stridedim - npyarr->inc] - 1
: npyarr->index[npyarr->stridedim];

NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels);
cStr = npyarr->rowLabels[idx];
}
return NULL;

*outLen = strlen(cStr);
return cStr;
}

char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc,
size_t *outLen) {
PdBlockContext *blkCtxt = GET_TC(tc)->pdblock;
NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx];
npy_intp idx;
char *cStr;
PRINTMARK();

if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) {
idx = npyarr->index[npyarr->stridedim] - 1;
NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels);
cStr = npyarr->columnLabels[idx];
} else {
idx = blkCtxt->colIdx;
NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels);
cStr = npyarr->rowLabels[idx];
}
return NULL;

*outLen = strlen(cStr);
return cStr;
}

int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) {
Expand Down Expand Up @@ -1578,15 +1577,31 @@ void NpyArr_freeLabels(char **labels, npy_intp len) {
}
}

/*
* Function: NpyArr_encodeLabels
* -----------------------------
*
* Builds an array of "encoded" labels.
*
* labels: PyArrayObject pointer for labels to be "encoded"
* num : number of labels
*
* "encode" is quoted above because we aren't really doing encoding
* For historical reasons this function would actually encode the entire
* array into a separate buffer with a separate call to JSON_Encode
* and would leave it to complex pointer manipulation from there to
* unpack values as needed. To make things simpler and more idiomatic
* this has instead just stringified any input save for datetime values,
* which may need to be represented in various formats.
*/
char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc,
npy_intp num) {
// NOTE this function steals a reference to labels.
PyObjectEncoder *pyenc = (PyObjectEncoder *)enc;
PyObject *item = NULL;
npy_intp i, stride, len, need_quotes;
npy_intp i, stride, len;
char **ret;
char *dataptr, *cLabel, *origend, *origst, *origoffset;
char labelBuffer[NPY_JSON_BUFSIZE];
char *dataptr, *cLabel;
PyArray_GetItemFunc *getitem;
int type_num;
PRINTMARK();
Expand Down Expand Up @@ -1614,68 +1629,44 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc,
ret[i] = NULL;
}

origst = enc->start;
origend = enc->end;
origoffset = enc->offset;

stride = PyArray_STRIDE(labels, 0);
dataptr = PyArray_DATA(labels);
getitem = (PyArray_GetItemFunc *)PyArray_DESCR(labels)->f->getitem;
type_num = PyArray_TYPE(labels);

for (i = 0; i < num; i++) {
if (PyTypeNum_ISDATETIME(type_num) || PyTypeNum_ISNUMBER(type_num))
{
item = (PyObject *)labels;
pyenc->npyType = type_num;
pyenc->npyValue = dataptr;
} else {
item = getitem(dataptr, labels);
if (!item) {
NpyArr_freeLabels(ret, num);
ret = 0;
break;
}
}

cLabel = JSON_EncodeObject(item, enc, labelBuffer, NPY_JSON_BUFSIZE);

if (item != (PyObject *)labels) {
Py_DECREF(item);
}

if (PyErr_Occurred() || enc->errorMsg) {
item = getitem(dataptr, labels);
if (!item) {
NpyArr_freeLabels(ret, num);
ret = 0;
break;
}

PyObject *str = PyObject_Str(item);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel last comment...here is where I think that check needs to go. We are getting the str of the object but when passed in as a DTA the introspection is different than when passed in an object array. Wondering if we have shared utilities to already handle that ambiguity I could leverage instead of trying to do it all in this extension

cLabel = PyUnicode_AsUTF8(str);
len = strlen(cLabel);
Py_DECREF(str);

Py_DECREF(item);
// Add 1 to include NULL terminator
ret[i] = PyObject_Malloc(len + 1);
memcpy(ret[i], cLabel, len + 1);

if (PyErr_Occurred()) {
NpyArr_freeLabels(ret, num);
ret = 0;
break;
}

need_quotes = ((*cLabel) != '"');
len = enc->offset - cLabel + 1 + 2 * need_quotes;
ret[i] = PyObject_Malloc(sizeof(char) * len);

if (!ret[i]) {
PyErr_NoMemory();
ret = 0;
break;
}

if (need_quotes) {
ret[i][0] = '"';
memcpy(ret[i] + 1, cLabel, sizeof(char) * (len - 4));
ret[i][len - 3] = '"';
} else {
memcpy(ret[i], cLabel, sizeof(char) * (len - 2));
}
ret[i][len - 2] = ':';
ret[i][len - 1] = '\0';
dataptr += stride;
}

enc->start = origst;
enc->end = origend;
enc->offset = origoffset;

Py_DECREF(labels);
return ret;
}
Expand Down Expand Up @@ -2429,7 +2420,6 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) {
PRINTMARK();
ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer));
PRINTMARK();

if (PyErr_Occurred()) {
PRINTMARK();
return NULL;
Expand Down