Skip to content

Commit 4fd104a

Browse files
Kevin Kuhl and jreback
Kevin Kuhl
authored and committed
COMPAT: reading json with lines=True from s3, xref #17200 (#17201)
1 parent f7c79be commit 4fd104a

File tree

6 files changed

+152
-59
lines changed

6 files changed

+152
-59
lines changed

doc/source/whatsnew/v0.21.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ I/O
8888
- :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`)
8989
- Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`).
9090
- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`)
91-
91+
- Bug in :func:`read_json` not decoding when reading line delimited JSON from S3 (:issue:`17200`)
9292

9393
Plotting
9494
^^^^^^^^

pandas/io/json/json.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import pandas._libs.json as json
77
from pandas._libs.tslib import iNaT
8-
from pandas.compat import StringIO, long, u
8+
from pandas.compat import StringIO, long, u, to_str
99
from pandas import compat, isna
1010
from pandas import Series, DataFrame, to_datetime, MultiIndex
1111
from pandas.io.common import (get_filepath_or_buffer, _get_handle,
@@ -458,8 +458,10 @@ def read(self):
458458
if self.lines and self.chunksize:
459459
obj = concat(self)
460460
elif self.lines:
461+
462+
data = to_str(self.data)
461463
obj = self._get_object_parser(
462-
self._combine_lines(self.data.split('\n'))
464+
self._combine_lines(data.split('\n'))
463465
)
464466
else:
465467
obj = self._get_object_parser(self.data)
@@ -612,7 +614,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
612614
try:
613615
dtype = np.dtype(dtype)
614616
return data.astype(dtype), True
615-
except:
617+
except (TypeError, ValueError):
616618
return data, False
617619

618620
if convert_dates:
@@ -628,7 +630,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
628630
try:
629631
data = data.astype('float64')
630632
result = True
631-
except:
633+
except (TypeError, ValueError):
632634
pass
633635

634636
if data.dtype.kind == 'f':
@@ -639,7 +641,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
639641
try:
640642
data = data.astype('float64')
641643
result = True
642-
except:
644+
except (TypeError, ValueError):
643645
pass
644646

645647
# don't coerce 0-len data
@@ -651,7 +653,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
651653
if (new_data == data).all():
652654
data = new_data
653655
result = True
654-
except:
656+
except (TypeError, ValueError):
655657
pass
656658

657659
# coerce ints to 64
@@ -661,7 +663,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
661663
try:
662664
data = data.astype('int64')
663665
result = True
664-
except:
666+
except (TypeError, ValueError):
665667
pass
666668

667669
return data, result
@@ -680,7 +682,7 @@ def _try_convert_to_date(self, data):
680682
if new_data.dtype == 'object':
681683
try:
682684
new_data = data.astype('int64')
683-
except:
685+
except (TypeError, ValueError):
684686
pass
685687

686688
# ignore numbers that are out of range
@@ -697,7 +699,7 @@ def _try_convert_to_date(self, data):
697699
unit=date_unit)
698700
except ValueError:
699701
continue
700-
except:
702+
except Exception:
701703
break
702704
return new_data, True
703705
return data, False

pandas/tests/io/conftest.py

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import os
2+
3+
import moto
4+
import pytest
5+
from pandas.io.parsers import read_table
6+
7+
HERE = os.path.dirname(__file__)
8+
9+
10+
@pytest.fixture(scope='module')
11+
def tips_file():
12+
"""Path to the tips dataset"""
13+
return os.path.join(HERE, 'parser', 'data', 'tips.csv')
14+
15+
16+
@pytest.fixture(scope='module')
17+
def jsonl_file():
18+
"""Path a JSONL dataset"""
19+
return os.path.join(HERE, 'parser', 'data', 'items.jsonl')
20+
21+
22+
@pytest.fixture(scope='module')
23+
def salaries_table():
24+
"""DataFrame with the salaries dataset"""
25+
path = os.path.join(HERE, 'parser', 'data', 'salaries.csv')
26+
return read_table(path)
27+
28+
29+
@pytest.fixture(scope='module')
30+
def s3_resource(tips_file, jsonl_file):
31+
"""Fixture for mocking S3 interaction.
32+
33+
The primary bucket name is "pandas-test". The following datasets
34+
are loaded.
35+
36+
- tips.csv
37+
- tips.csv.gz
38+
- tips.csv.bz2
39+
- items.jsonl
40+
41+
A private bucket "cant_get_it" is also created. The boto3 s3 resource
42+
is yielded by the fixture.
43+
"""
44+
pytest.importorskip('s3fs')
45+
moto.mock_s3().start()
46+
47+
test_s3_files = [
48+
('tips.csv', tips_file),
49+
('tips.csv.gz', tips_file + '.gz'),
50+
('tips.csv.bz2', tips_file + '.bz2'),
51+
('items.jsonl', jsonl_file),
52+
]
53+
54+
def add_tips_files(bucket_name):
55+
for s3_key, file_name in test_s3_files:
56+
with open(file_name, 'rb') as f:
57+
conn.Bucket(bucket_name).put_object(
58+
Key=s3_key,
59+
Body=f)
60+
61+
boto3 = pytest.importorskip('boto3')
62+
# see gh-16135
63+
bucket = 'pandas-test'
64+
65+
conn = boto3.resource("s3", region_name="us-east-1")
66+
conn.create_bucket(Bucket=bucket)
67+
add_tips_files(bucket)
68+
69+
conn.create_bucket(Bucket='cant_get_it', ACL='private')
70+
add_tips_files('cant_get_it')
71+
72+
yield conn
73+
74+
moto.mock_s3().stop()

pandas/tests/io/json/test_pandas.py

+64-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from pandas.compat import (range, lrange, StringIO,
55
OrderedDict, is_platform_32bit)
66
import os
7-
87
import numpy as np
98
from pandas import (Series, DataFrame, DatetimeIndex, Timestamp,
109
read_json, compat)
@@ -1032,6 +1031,70 @@ def test_tz_range_is_utc(self):
10321031
df = DataFrame({'DT': dti})
10331032
assert dumps(df, iso_dates=True) == dfexp
10341033

1034+
def test_read_inline_jsonl(self):
1035+
# GH9180
1036+
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
1037+
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
1038+
assert_frame_equal(result, expected)
1039+
1040+
def test_read_s3_jsonl(self, s3_resource):
1041+
pytest.importorskip('s3fs')
1042+
# GH17200
1043+
1044+
result = read_json('s3n://pandas-test/items.jsonl', lines=True)
1045+
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
1046+
assert_frame_equal(result, expected)
1047+
1048+
def test_read_local_jsonl(self):
1049+
# GH17200
1050+
with ensure_clean('tmp_items.json') as path:
1051+
with open(path, 'w') as infile:
1052+
infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
1053+
result = read_json(path, lines=True)
1054+
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
1055+
assert_frame_equal(result, expected)
1056+
1057+
def test_read_jsonl_unicode_chars(self):
1058+
# GH15132: non-ascii unicode characters
1059+
# \u201d == RIGHT DOUBLE QUOTATION MARK
1060+
1061+
# simulate file handle
1062+
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
1063+
json = StringIO(json)
1064+
result = read_json(json, lines=True)
1065+
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
1066+
columns=['a', 'b'])
1067+
assert_frame_equal(result, expected)
1068+
1069+
# simulate string
1070+
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
1071+
result = read_json(json, lines=True)
1072+
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
1073+
columns=['a', 'b'])
1074+
assert_frame_equal(result, expected)
1075+
1076+
def test_to_jsonl(self):
1077+
# GH9180
1078+
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
1079+
result = df.to_json(orient="records", lines=True)
1080+
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
1081+
assert result == expected
1082+
1083+
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
1084+
result = df.to_json(orient="records", lines=True)
1085+
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
1086+
assert result == expected
1087+
assert_frame_equal(pd.read_json(result, lines=True), df)
1088+
1089+
# GH15096: escaped characters in columns and data
1090+
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
1091+
columns=["a\\", 'b'])
1092+
result = df.to_json(orient="records", lines=True)
1093+
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
1094+
'{"a\\\\":"foo\\"","b":"bar"}')
1095+
assert result == expected
1096+
assert_frame_equal(pd.read_json(result, lines=True), df)
1097+
10351098
def test_latin_encoding(self):
10361099
if compat.PY2:
10371100
tm.assert_raises_regex(
+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"a": 1, "b": 2}
2+
{"b":2, "a" :1}

pandas/tests/io/parser/test_network.py

-48
Original file line numberDiff line numberDiff line change
@@ -4,62 +4,14 @@
44
Tests parsers ability to read and parse non-local files
55
and hence require a network connection to be read.
66
"""
7-
import os
8-
97
import pytest
10-
import moto
118

129
import pandas.util.testing as tm
1310
from pandas import DataFrame
1411
from pandas.io.parsers import read_csv, read_table
1512
from pandas.compat import BytesIO
1613

1714

18-
@pytest.fixture(scope='module')
19-
def tips_file():
20-
return os.path.join(tm.get_data_path(), 'tips.csv')
21-
22-
23-
@pytest.fixture(scope='module')
24-
def salaries_table():
25-
path = os.path.join(tm.get_data_path(), 'salaries.csv')
26-
return read_table(path)
27-
28-
29-
@pytest.fixture(scope='module')
30-
def s3_resource(tips_file):
31-
pytest.importorskip('s3fs')
32-
moto.mock_s3().start()
33-
34-
test_s3_files = [
35-
('tips.csv', tips_file),
36-
('tips.csv.gz', tips_file + '.gz'),
37-
('tips.csv.bz2', tips_file + '.bz2'),
38-
]
39-
40-
def add_tips_files(bucket_name):
41-
for s3_key, file_name in test_s3_files:
42-
with open(file_name, 'rb') as f:
43-
conn.Bucket(bucket_name).put_object(
44-
Key=s3_key,
45-
Body=f)
46-
47-
boto3 = pytest.importorskip('boto3')
48-
# see gh-16135
49-
bucket = 'pandas-test'
50-
51-
conn = boto3.resource("s3", region_name="us-east-1")
52-
conn.create_bucket(Bucket=bucket)
53-
add_tips_files(bucket)
54-
55-
conn.create_bucket(Bucket='cant_get_it', ACL='private')
56-
add_tips_files('cant_get_it')
57-
58-
yield conn
59-
60-
moto.mock_s3().stop()
61-
62-
6315
@pytest.mark.network
6416
@pytest.mark.parametrize(
6517
"compression,extension",

0 commit comments

Comments
 (0)