
COMPAT: reading json with lines=True from s3, xref #17200 #17201

Merged: 22 commits, Nov 27, 2017
Changes from 20 commits

1 change: 1 addition & 0 deletions ci/requirements-3.5.pip
@@ -1,2 +1,3 @@
xarray==0.9.1
pandas-gbq

Contributor: revert

Contributor: this needs to be reverted

moto
1 change: 1 addition & 0 deletions ci/requirements_all.txt
@@ -26,3 +26,4 @@ sqlalchemy
bottleneck
pymysql

Contributor: this is ok

Jinja2
s3fs
2 changes: 1 addition & 1 deletion ci/requirements_dev.txt
@@ -5,4 +5,4 @@ cython
pytest>=3.1.0
pytest-cov
flake8

Contributor: revert this, s3fs is NOT a requirement for dev; we should be robust to not having this installed

moto
s3fs
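
To illustrate the reviewer's point about being robust when s3fs is not installed, here is a minimal sketch (not part of this diff) of the pytest.importorskip pattern used for optional dependencies; the test name and body are hypothetical:

import pytest

# importorskip returns the module if it imports successfully; otherwise it
# raises a skip exception, so the test is skipped rather than failed.
s3fs = pytest.importorskip('s3fs')


def test_something_that_needs_s3fs():
    # hypothetical test body, for illustration only
    assert s3fs is not None
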
8 changes: 8 additions & 0 deletions pandas/io/json/json.py
@@ -341,12 +341,20 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
            json = filepath_or_buffer
    elif hasattr(filepath_or_buffer, 'read'):
        json = filepath_or_buffer.read()
    else:
        json = filepath_or_buffer

    if lines:
        # If given a json lines file, we break the string into lines, add
        # commas and put it in a json list to make a valid json object.

        """
        If PY3 and/or isinstance(json, bytes)
        """

Contributor: just a 1-line comment is fine;

if PY3 and isinstance(json, bytes):
    ...

        if isinstance(json, bytes):

Contributor: 1 line comment

            json = json.decode('utf-8')

        lines = list(StringIO(json.strip()))
        json = '[' + ','.join(lines) + ']'

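As a rough standalone sketch of what this hunk does (plain Python only, pandas internals and error handling omitted): decode a bytes payload on Python 3, split the JSON-lines text on newlines, and join the records into one JSON array string. The sample payload below is illustrative.

from io import StringIO

raw = b'{"a": 1, "b": 2}\n{"b": 2, "a": 1}\n'  # e.g. bytes read via s3fs on PY3

# Bytes must be decoded to str before line splitting.
if isinstance(raw, bytes):
    raw = raw.decode('utf-8')

# Break the payload into lines, join them with commas and wrap in brackets
# so the result is a single valid JSON array.
records = list(StringIO(raw.strip()))
combined = '[' + ','.join(records) + ']'
# combined == '[{"a": 1, "b": 2}\n,{"b": 2, "a": 1}]'
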
74 changes: 74 additions & 0 deletions pandas/tests/io/conftest.py
@@ -0,0 +1,74 @@
import os

import moto
import pytest
from pandas.io.parsers import read_table

HERE = os.path.dirname(__file__)


@pytest.fixture(scope='module')
def tips_file():
"""Path to the tips dataset"""
return os.path.join(HERE, 'parser', 'data', 'tips.csv')


@pytest.fixture(scope='module')
def jsonl_file():
"""Path a JSONL dataset"""
return os.path.join(HERE, 'parser', 'data', 'items.jsonl')


@pytest.fixture(scope='module')
def salaries_table():
"""DataFrame with the salaries dataset"""
path = os.path.join(HERE, 'parser', 'data', 'salaries.csv')
return read_table(path)


@pytest.fixture(scope='module')
def s3_resource(tips_file, jsonl_file):
"""Fixture for mocking S3 interaction.

The primary bucket name is "pandas-test". The following datasets
are loaded.

- tips.csv
- tips.csv.gz
- tips.csv.bz2
- items.jsonl

A private bucket "cant_get_it" is also created. The boto3 s3 resource
is yielded by the fixture.
"""
pytest.importorskip('s3fs')
moto.mock_s3().start()

test_s3_files = [
('tips.csv', tips_file),
('tips.csv.gz', tips_file + '.gz'),
('tips.csv.bz2', tips_file + '.bz2'),
('items.jsonl', jsonl_file),
]

def add_tips_files(bucket_name):
for s3_key, file_name in test_s3_files:
with open(file_name, 'rb') as f:
conn.Bucket(bucket_name).put_object(
Key=s3_key,
Body=f)

boto3 = pytest.importorskip('boto3')
# see gh-16135
bucket = 'pandas-test'

conn = boto3.resource("s3", region_name="us-east-1")
conn.create_bucket(Bucket=bucket)
add_tips_files(bucket)

conn.create_bucket(Bucket='cant_get_it', ACL='private')
add_tips_files('cant_get_it')

yield conn

moto.mock_s3().stop()
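
For context, a hypothetical test consuming this fixture (not part of this diff) simply requests it by name; pytest injects it and moto serves the bucket from memory, so no network access or AWS credentials are needed. The test name and assertion are illustrative only, and s3fs must be installed.

from pandas.io.parsers import read_csv


def test_read_tips_from_mock_s3(s3_resource):
    # The "pandas-test" bucket was populated by the fixture above.
    df = read_csv('s3://pandas-test/tips.csv')
    assert len(df) > 0
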
20 changes: 18 additions & 2 deletions pandas/tests/io/json/test_pandas.py
@@ -4,7 +4,6 @@
from pandas.compat import (range, lrange, StringIO,
                           OrderedDict, is_platform_32bit)
import os

import numpy as np
from pandas import (Series, DataFrame, DatetimeIndex, Timestamp,
                    read_json, compat)
@@ -985,12 +984,29 @@ def test_tz_range_is_utc(self):
        df = DataFrame({'DT': dti})
        assert dumps(df, iso_dates=True) == dfexp

    def test_read_jsonl(self):
    def test_read_inline_jsonl(self):
        # GH9180
        result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
        expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        assert_frame_equal(result, expected)

    def test_read_s3_jsonl(self, s3_resource):
        pytest.importorskip('s3fs')
        # GH17200

Contributor: Add a pytest.importorskip() here I think.

        result = read_json('s3n://pandas-test/items.jsonl', lines=True)
        expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        assert_frame_equal(result, expected)

    def test_read_local_jsonl(self):
        # GH17200
        with ensure_clean('tmp_items.json') as path:
            with open(path, 'w') as infile:
                infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
            result = read_json(path, lines=True)
            expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
            assert_frame_equal(result, expected)

    def test_read_jsonl_unicode_chars(self):
        # GH15132: non-ascii unicode characters
        # \u201d == RIGHT DOUBLE QUOTATION MARK
2 changes: 2 additions & 0 deletions pandas/tests/io/parser/data/items.jsonl
@@ -0,0 +1,2 @@
{"a": 1, "b": 2}

Contributor: what is the purpose of this file?

Contributor: I see, ok you have to have this named .json otherwise it won't be picked up by setup.py (IOW the install test will fail).

{"b":2, "a" :1}
48 changes: 0 additions & 48 deletions pandas/tests/io/parser/test_network.py
@@ -4,62 +4,14 @@
Tests parsers ability to read and parse non-local files
and hence require a network connection to be read.
"""
import os

import pytest
import moto

import pandas.util.testing as tm
from pandas import DataFrame
from pandas.io.parsers import read_csv, read_table
from pandas.compat import BytesIO


@pytest.fixture(scope='module')
def tips_file():
    return os.path.join(tm.get_data_path(), 'tips.csv')

Contributor: cool


@pytest.fixture(scope='module')
def salaries_table():
    path = os.path.join(tm.get_data_path(), 'salaries.csv')
    return read_table(path)


@pytest.fixture(scope='module')
def s3_resource(tips_file):
    pytest.importorskip('s3fs')
    moto.mock_s3().start()

    test_s3_files = [
        ('tips.csv', tips_file),
        ('tips.csv.gz', tips_file + '.gz'),
        ('tips.csv.bz2', tips_file + '.bz2'),
    ]

    def add_tips_files(bucket_name):
        for s3_key, file_name in test_s3_files:
            with open(file_name, 'rb') as f:
                conn.Bucket(bucket_name).put_object(
                    Key=s3_key,
                    Body=f)

    boto3 = pytest.importorskip('boto3')
    # see gh-16135
    bucket = 'pandas-test'

    conn = boto3.resource("s3", region_name="us-east-1")
    conn.create_bucket(Bucket=bucket)
    add_tips_files(bucket)

    conn.create_bucket(Bucket='cant_get_it', ACL='private')
    add_tips_files('cant_get_it')

    yield conn

    moto.mock_s3().stop()


@pytest.mark.network
@pytest.mark.parametrize(
"compression,extension",