-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
COMPAT: reading json with lines=True from s3, xref #17200 #17201
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 20 commits
8255cd0
caa3c80
4709251
a042e3c
0c164e7
ac99133
125f049
533d404
c7f13b8
17973a1
2ae5a9d
38f043b
1d03d7a
78ee720
9d7e75b
b21401b
6979fb8
bb600cb
5b036ec
c5c4d07
f1122ca
b913972
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
xarray==0.9.1 | ||
pandas-gbq | ||
moto |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,3 +26,4 @@ sqlalchemy | |
bottleneck | ||
pymysql | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this is ok |
||
Jinja2 | ||
s3fs |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,4 +5,4 @@ cython | |
pytest>=3.1.0 | ||
pytest-cov | ||
flake8 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. revert this, s3fs is NOT a requirement for dev; we should be robust to not having this installed |
||
moto | ||
s3fs |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -341,12 +341,20 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, | |
json = filepath_or_buffer | ||
elif hasattr(filepath_or_buffer, 'read'): | ||
json = filepath_or_buffer.read() | ||
|
||
else: | ||
json = filepath_or_buffer | ||
|
||
if lines: | ||
# If given a json lines file, we break the string into lines, add | ||
# commas and put it in a json list to make a valid json object. | ||
|
||
""" | ||
If PY3 and/or isinstance(json, bytes) | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. just a 1-line comment is fine;
|
||
if isinstance(json, bytes): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 1 line comment |
||
json = json.decode('utf-8') | ||
|
||
lines = list(StringIO(json.strip())) | ||
json = '[' + ','.join(lines) + ']' | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import os | ||
|
||
import moto | ||
import pytest | ||
from pandas.io.parsers import read_table | ||
|
||
HERE = os.path.dirname(__file__) | ||
|
||
|
||
@pytest.fixture(scope='module') | ||
def tips_file(): | ||
"""Path to the tips dataset""" | ||
return os.path.join(HERE, 'parser', 'data', 'tips.csv') | ||
|
||
|
||
@pytest.fixture(scope='module') | ||
def jsonl_file(): | ||
"""Path a JSONL dataset""" | ||
return os.path.join(HERE, 'parser', 'data', 'items.jsonl') | ||
|
||
|
||
@pytest.fixture(scope='module') | ||
def salaries_table(): | ||
"""DataFrame with the salaries dataset""" | ||
path = os.path.join(HERE, 'parser', 'data', 'salaries.csv') | ||
return read_table(path) | ||
|
||
|
||
@pytest.fixture(scope='module') | ||
def s3_resource(tips_file, jsonl_file): | ||
"""Fixture for mocking S3 interaction. | ||
|
||
The primary bucket name is "pandas-test". The following datasets | ||
are loaded. | ||
|
||
- tips.csv | ||
- tips.csv.gz | ||
- tips.csv.bz2 | ||
- items.jsonl | ||
|
||
A private bucket "cant_get_it" is also created. The boto3 s3 resource | ||
is yielded by the fixture. | ||
""" | ||
pytest.importorskip('s3fs') | ||
moto.mock_s3().start() | ||
|
||
test_s3_files = [ | ||
('tips.csv', tips_file), | ||
('tips.csv.gz', tips_file + '.gz'), | ||
('tips.csv.bz2', tips_file + '.bz2'), | ||
('items.jsonl', jsonl_file), | ||
] | ||
|
||
def add_tips_files(bucket_name): | ||
for s3_key, file_name in test_s3_files: | ||
with open(file_name, 'rb') as f: | ||
conn.Bucket(bucket_name).put_object( | ||
Key=s3_key, | ||
Body=f) | ||
|
||
boto3 = pytest.importorskip('boto3') | ||
# see gh-16135 | ||
bucket = 'pandas-test' | ||
|
||
conn = boto3.resource("s3", region_name="us-east-1") | ||
conn.create_bucket(Bucket=bucket) | ||
add_tips_files(bucket) | ||
|
||
conn.create_bucket(Bucket='cant_get_it', ACL='private') | ||
add_tips_files('cant_get_it') | ||
|
||
yield conn | ||
|
||
moto.mock_s3().stop() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,6 @@ | |
from pandas.compat import (range, lrange, StringIO, | ||
OrderedDict, is_platform_32bit) | ||
import os | ||
|
||
import numpy as np | ||
from pandas import (Series, DataFrame, DatetimeIndex, Timestamp, | ||
read_json, compat) | ||
|
@@ -985,12 +984,29 @@ def test_tz_range_is_utc(self): | |
df = DataFrame({'DT': dti}) | ||
assert dumps(df, iso_dates=True) == dfexp | ||
|
||
def test_read_jsonl(self): | ||
def test_read_inline_jsonl(self): | ||
# GH9180 | ||
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) | ||
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) | ||
assert_frame_equal(result, expected) | ||
|
||
def test_read_s3_jsonl(self, s3_resource): | ||
pytest.importorskip('s3fs') | ||
# GH17200 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a |
||
|
||
result = read_json('s3n://pandas-test/items.jsonl', lines=True) | ||
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) | ||
assert_frame_equal(result, expected) | ||
|
||
def test_read_local_jsonl(self): | ||
# GH17200 | ||
with ensure_clean('tmp_items.json') as path: | ||
with open(path, 'w') as infile: | ||
infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') | ||
result = read_json(path, lines=True) | ||
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) | ||
assert_frame_equal(result, expected) | ||
|
||
def test_read_jsonl_unicode_chars(self): | ||
# GH15132: non-ascii unicode characters | ||
# \u201d == RIGHT DOUBLE QUOTATION MARK | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"a": 1, "b": 2} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. what is the purpose of this file? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see, ok you have to have this named |
||
{"b":2, "a" :1} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,62 +4,14 @@ | |
Tests parsers ability to read and parse non-local files | ||
and hence require a network connection to be read. | ||
""" | ||
import os | ||
|
||
import pytest | ||
import moto | ||
|
||
import pandas.util.testing as tm | ||
from pandas import DataFrame | ||
from pandas.io.parsers import read_csv, read_table | ||
from pandas.compat import BytesIO | ||
|
||
|
||
@pytest.fixture(scope='module') | ||
def tips_file(): | ||
return os.path.join(tm.get_data_path(), 'tips.csv') | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. cool |
||
|
||
@pytest.fixture(scope='module') | ||
def salaries_table(): | ||
path = os.path.join(tm.get_data_path(), 'salaries.csv') | ||
return read_table(path) | ||
|
||
|
||
@pytest.fixture(scope='module') | ||
def s3_resource(tips_file): | ||
pytest.importorskip('s3fs') | ||
moto.mock_s3().start() | ||
|
||
test_s3_files = [ | ||
('tips.csv', tips_file), | ||
('tips.csv.gz', tips_file + '.gz'), | ||
('tips.csv.bz2', tips_file + '.bz2'), | ||
] | ||
|
||
def add_tips_files(bucket_name): | ||
for s3_key, file_name in test_s3_files: | ||
with open(file_name, 'rb') as f: | ||
conn.Bucket(bucket_name).put_object( | ||
Key=s3_key, | ||
Body=f) | ||
|
||
boto3 = pytest.importorskip('boto3') | ||
# see gh-16135 | ||
bucket = 'pandas-test' | ||
|
||
conn = boto3.resource("s3", region_name="us-east-1") | ||
conn.create_bucket(Bucket=bucket) | ||
add_tips_files(bucket) | ||
|
||
conn.create_bucket(Bucket='cant_get_it', ACL='private') | ||
add_tips_files('cant_get_it') | ||
|
||
yield conn | ||
|
||
moto.mock_s3().stop() | ||
|
||
|
||
@pytest.mark.network | ||
@pytest.mark.parametrize( | ||
"compression,extension", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
revert
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this needs to be reverted