diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 5c03408cbf20f..fe57ed4a54975 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -21,7 +21,7 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - +- Improved Performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) diff --git a/pandas/io/json.py b/pandas/io/json.py index 66a8e76c09a6f..1e258101a5d86 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -605,25 +605,9 @@ def _convert_to_line_delimits(s): if not s[0] == '[' and s[-1] == ']': return s s = s[1:-1] - num_open_brackets_seen = 0 - commas_to_replace = [] - in_quotes = False - for idx, char in enumerate(s): # iter through to find all - if char == '"' and idx > 0 and s[idx - 1] != '\\': - in_quotes = ~in_quotes - elif char == ',': # commas that should be \n - if num_open_brackets_seen == 0 and not in_quotes: - commas_to_replace.append(idx) - elif char == '{': - if not in_quotes: - num_open_brackets_seen += 1 - elif char == '}': - if not in_quotes: - num_open_brackets_seen -= 1 - s_arr = np.array(list(s)) # Turn to an array to set - s_arr[commas_to_replace] = '\n' # all commas at once. 
-    s = ''.join(s_arr)
-    return s
+
+    from pandas.lib import convert_json_to_lines
+    return convert_json_to_lines(s)
 
 
 def nested_to_record(ds, prefix="", level=0):
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index e7672de5c835e..b56a02b245d69 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -1087,6 +1087,65 @@ def string_array_replace_from_nan_rep(
     return arr
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def convert_json_to_lines(object arr):
+    """
+    Replace the commas that separate top-level JSON records with line
+    feeds, leaving commas inside quoted strings or ``{}`` objects alone.
+
+    Parameters
+    ----------
+    arr : str
+        JSON text of comma-separated records; the caller strips the
+        enclosing ``[`` and ``]`` before passing it in.
+
+    Returns
+    -------
+    str
+        The text with every record-separating comma replaced by a newline.
+    """
+    cdef:
+        Py_ssize_t i = 0, num_open_brackets_seen = 0, length
+        bint in_quotes = 0, is_escaping = 0
+        ndarray[uint8_t] narr
+        unsigned char v, comma, left_bracket, right_bracket, newline
+        unsigned char quote, backslash
+
+    newline = ord('\n')
+    comma = ord(',')
+    left_bracket = ord('{')
+    right_bracket = ord('}')
+    quote = ord('"')
+    backslash = ord('\\')
+
+    # Scan a writable copy of the UTF-8 bytes: every delimiter is ASCII,
+    # so it can never match a byte inside a multi-byte character.
+    narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
+    length = narr.shape[0]
+    for i in range(length):
+        v = narr[i]
+        # a quote only opens/closes a string when it is not escaped
+        if v == quote and not is_escaping:
+            in_quotes = not in_quotes
+        # is_escaping is true only for the byte right after an unescaped
+        # backslash, so in '\\"' the second backslash is consumed as the
+        # escaped character and the quote still terminates the string
+        if v == backslash or is_escaping:
+            is_escaping = not is_escaping
+        if v == comma:  # commas that should be \n
+            if num_open_brackets_seen == 0 and not in_quotes:
+                narr[i] = newline
+        elif v == left_bracket:
+            if not in_quotes:
+                num_open_brackets_seen += 1
+        elif v == right_bracket:
+            if not in_quotes:
+                num_open_brackets_seen -= 1
+
+    return narr.tostring().decode('utf-8')
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def write_csv_rows(list data, ndarray data_index,