Skip to content

Commit 8b15934

Browse files
committed
split out json readlines to sep test class
1 parent c2544f1 commit 8b15934

File tree

2 files changed

+167
-150
lines changed

2 files changed

+167
-150
lines changed

pandas/tests/io/json/test_pandas.py

-150
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
read_json, compat)
1111
from datetime import timedelta
1212
import pandas as pd
13-
from pandas.io.json.json import JsonReader
1413

1514
from pandas.util.testing import (assert_almost_equal, assert_frame_equal,
1615
assert_series_equal, network,
@@ -36,12 +35,6 @@
3635
_mixed_frame = _frame.copy()
3736

3837

39-
@pytest.fixture
40-
def lines_json_df():
41-
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
42-
return df.to_json(lines=True, orient="records")
43-
44-
4538
class TestPandasContainer(object):
4639

4740
def setup_method(self, method):
@@ -1046,146 +1039,3 @@ def test_data_frame_size_after_to_json(self):
10461039
assert size_before == size_after
10471040

10481041

1049-
class TestPandasJsonLines(object):
1050-
1051-
def test_read_jsonl(self):
1052-
# GH9180
1053-
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
1054-
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
1055-
assert_frame_equal(result, expected)
1056-
1057-
def test_read_jsonl_unicode_chars(self):
1058-
# GH15132: non-ascii unicode characters
1059-
# \u201d == RIGHT DOUBLE QUOTATION MARK
1060-
1061-
# simulate file handle
1062-
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
1063-
json = StringIO(json)
1064-
result = read_json(json, lines=True)
1065-
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
1066-
columns=['a', 'b'])
1067-
assert_frame_equal(result, expected)
1068-
1069-
# simulate string
1070-
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
1071-
result = read_json(json, lines=True)
1072-
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
1073-
columns=['a', 'b'])
1074-
assert_frame_equal(result, expected)
1075-
1076-
def test_to_jsonl(self):
1077-
# GH9180
1078-
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
1079-
result = df.to_json(orient="records", lines=True)
1080-
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
1081-
assert result == expected
1082-
1083-
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
1084-
result = df.to_json(orient="records", lines=True)
1085-
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
1086-
assert result == expected
1087-
assert_frame_equal(pd.read_json(result, lines=True), df)
1088-
1089-
# GH15096: escaped characters in columns and data
1090-
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
1091-
columns=["a\\", 'b'])
1092-
result = df.to_json(orient="records", lines=True)
1093-
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
1094-
'{"a\\\\":"foo\\"","b":"bar"}')
1095-
assert result == expected
1096-
assert_frame_equal(pd.read_json(result, lines=True), df)
1097-
1098-
@pytest.mark.parametrize("chunksize", [1, 1.0])
1099-
def test_readjson_chunks(self, lines_json_df, chunksize):
1100-
# Basic test that read_json(chunks=True) gives the same result as
1101-
# read_json(chunks=False)
1102-
# GH17048: memory usage when lines=True
1103-
1104-
unchunked = pd.read_json(StringIO(lines_json_df), lines=True)
1105-
reader = pd.read_json(StringIO(lines_json_df), lines=True,
1106-
chunksize=chunksize)
1107-
chunked = pd.concat(reader)
1108-
1109-
assert_frame_equal(chunked, unchunked)
1110-
1111-
def test_readjson_chunksize_requires_lines(self, lines_json_df):
1112-
msg = "chunksize can only be passed if lines=True"
1113-
with tm.assert_raises_regex(ValueError, msg):
1114-
pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
1115-
1116-
def test_readjson_chunks_series(self):
1117-
# Test reading line-format JSON to Series with chunksize param
1118-
s = pd.Series({'A': 1, 'B': 2})
1119-
1120-
strio = StringIO(s.to_json(lines=True, orient="records"))
1121-
unchunked = pd.read_json(strio, lines=True, typ='Series')
1122-
1123-
strio = StringIO(s.to_json(lines=True, orient="records"))
1124-
chunked = pd.concat(pd.read_json(
1125-
strio, lines=True, typ='Series', chunksize=1
1126-
))
1127-
1128-
assert_series_equal(chunked, unchunked)
1129-
1130-
def test_readjson_each_chunk(self, lines_json_df):
1131-
# Other tests check that the final result of read_json(chunksize=True)
1132-
# is correct. This checks the intermediate chunks.
1133-
chunks = list(
1134-
pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
1135-
)
1136-
assert chunks[0].shape == (2, 2)
1137-
assert chunks[1].shape == (1, 2)
1138-
1139-
def test_readjson_chunks_from_file(self):
1140-
with ensure_clean('test.json') as path:
1141-
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
1142-
df.to_json(path, lines=True, orient="records")
1143-
chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
1144-
unchunked = pd.read_json(path, lines=True)
1145-
assert_frame_equal(unchunked, chunked)
1146-
1147-
@pytest.mark.parametrize("chunksize", [None, 1])
1148-
def test_readjson_chunks_closes(self, chunksize):
1149-
with ensure_clean('test.json') as path:
1150-
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
1151-
df.to_json(path, lines=True, orient="records")
1152-
reader = JsonReader(
1153-
path, orient=None, typ="frame", dtype=True, convert_axes=True,
1154-
convert_dates=True, keep_default_dates=True, numpy=False,
1155-
precise_float=False, date_unit=None, encoding=None,
1156-
lines=True, chunksize=chunksize)
1157-
reader.read()
1158-
assert reader.open_stream.closed, "didn't close stream with \
1159-
chunksize = %s" % chunksize
1160-
1161-
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
1162-
def test_readjson_invalid_chunksize(self, lines_json_df, chunksize):
1163-
msg = r"'chunksize' must be an integer >=1"
1164-
1165-
with tm.assert_raises_regex(ValueError, msg):
1166-
pd.read_json(StringIO(lines_json_df), lines=True,
1167-
chunksize=chunksize)
1168-
1169-
@pytest.mark.parametrize("chunksize", [None, 1, 2])
1170-
def test_readjson_chunks_multiple_empty_lines(self, chunksize):
1171-
j = """
1172-
1173-
{"A":1,"B":4}
1174-
1175-
1176-
1177-
{"A":2,"B":5}
1178-
1179-
1180-
1181-
1182-
1183-
1184-
1185-
{"A":3,"B":6}
1186-
"""
1187-
orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
1188-
test = pd.read_json(j, lines=True, chunksize=chunksize)
1189-
if chunksize is not None:
1190-
test = pd.concat(test)
1191-
tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize)
+167
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
import pytest
2+
import pandas as pd
3+
from pandas import DataFrame, read_json
4+
from pandas.compat import StringIO
5+
from pandas.io.json.json import JsonReader
6+
import pandas.util.testing as tm
7+
from pandas.util.testing import (assert_frame_equal, assert_series_equal,
8+
ensure_clean)
9+
10+
11+
@pytest.fixture
def lines_json_df():
    """Return a 3x2 frame serialized as line-delimited JSON records."""
    frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    return frame.to_json(lines=True, orient="records")
15+
16+
17+
def test_read_jsonl():
    # GH9180: line-delimited JSON parses one record per line, even when
    # the records list their keys in different orders.
    data = '{"a": 1, "b": 2}\n{"b":2, "a" :1}\n'
    expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
    assert_frame_equal(read_json(data, lines=True), expected)
22+
23+
24+
def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK
    expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                         columns=['a', 'b'])

    # simulate file handle
    json = StringIO('{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n')
    assert_frame_equal(read_json(json, lines=True), expected)

    # simulate string
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    assert_frame_equal(read_json(json, lines=True), expected)
42+
43+
44+
def test_to_jsonl():
45+
# GH9180
46+
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
47+
result = df.to_json(orient="records", lines=True)
48+
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
49+
assert result == expected
50+
51+
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
52+
result = df.to_json(orient="records", lines=True)
53+
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
54+
assert result == expected
55+
assert_frame_equal(read_json(result, lines=True), df)
56+
57+
# GH15096: escaped characters in columns and data
58+
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
59+
columns=["a\\", 'b'])
60+
result = df.to_json(orient="records", lines=True)
61+
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
62+
'{"a\\\\":"foo\\"","b":"bar"}')
63+
assert result == expected
64+
assert_frame_equal(read_json(result, lines=True), df)
65+
66+
67+
@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(lines_json_df, chunksize):
    # Basic test that read_json(chunks=True) gives the same result as
    # read_json(chunks=False)
    # GH17048: memory usage when lines=True
    whole = read_json(StringIO(lines_json_df), lines=True)
    pieces = read_json(StringIO(lines_json_df), lines=True,
                       chunksize=chunksize)

    assert_frame_equal(pd.concat(pieces), whole)
79+
80+
81+
def test_readjson_chunksize_requires_lines(lines_json_df):
    # chunksize is only meaningful for line-delimited input.
    with tm.assert_raises_regex(ValueError,
                                "chunksize can only be passed if lines=True"):
        pd.read_json(StringIO(lines_json_df), lines=False, chunksize=2)
85+
86+
87+
def test_readjson_chunks_series():
    # Test reading line-format JSON to Series with chunksize param
    s = pd.Series({'A': 1, 'B': 2})
    json_lines = s.to_json(lines=True, orient="records")

    unchunked = pd.read_json(StringIO(json_lines), lines=True, typ='Series')
    chunked = pd.concat(pd.read_json(
        StringIO(json_lines), lines=True, typ='Series', chunksize=1
    ))

    assert_series_equal(chunked, unchunked)
100+
101+
102+
def test_readjson_each_chunk(lines_json_df):
103+
# Other tests check that the final result of read_json(chunksize=True)
104+
# is correct. This checks the intermediate chunks.
105+
chunks = list(
106+
pd.read_json(StringIO(lines_json_df), lines=True, chunksize=2)
107+
)
108+
assert chunks[0].shape == (2, 2)
109+
assert chunks[1].shape == (1, 2)
110+
111+
112+
def test_readjson_chunks_from_file():
    # Chunked and unchunked reads of the same on-disk file must agree.
    with ensure_clean('test.json') as path:
        frame = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        frame.to_json(path, lines=True, orient="records")
        unchunked = pd.read_json(path, lines=True)
        chunked = pd.concat(pd.read_json(path, lines=True, chunksize=1))
        assert_frame_equal(unchunked, chunked)
119+
120+
121+
@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    # GH17048: the reader must close its file handle whether or not the
    # read was chunked.
    with ensure_clean('test.json') as path:
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        reader = JsonReader(
            path, orient=None, typ="frame", dtype=True, convert_axes=True,
            convert_dates=True, keep_default_dates=True, numpy=False,
            precise_float=False, date_unit=None, encoding=None,
            lines=True, chunksize=chunksize)
        reader.read()
        # The original message used a backslash continuation *inside* the
        # string literal, embedding a raw newline plus source indentation
        # into the failure text; parenthesized concatenation fixes that.
        assert reader.open_stream.closed, (
            "didn't close stream with chunksize = %s" % chunksize)
134+
135+
136+
@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize):
    # Zero, negative, non-integral and non-numeric chunksize values are
    # all rejected up front.
    with tm.assert_raises_regex(ValueError,
                                r"'chunksize' must be an integer >=1"):
        pd.read_json(StringIO(lines_json_df), lines=True,
                     chunksize=chunksize)
143+
144+
145+
@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    # Runs of blank lines between records are skipped, and chunked reads
    # must match the unchunked result.
    j = """

{"A":1,"B":4}



{"A":2,"B":5}








{"A":3,"B":6}
"""
    orig = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
    test = pd.read_json(j, lines=True, chunksize=chunksize)
    if chunksize is not None:
        test = pd.concat(test)
    tm.assert_frame_equal(orig, test, obj="chunksize: %s" % chunksize)

0 commit comments

Comments
 (0)