From 8255cd0aa006a9d2da131cdea4297dc0af9992f6 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Tue, 8 Aug 2017 13:38:40 -0500 Subject: [PATCH 01/19] Fix for #17200 Attempt to decode the bytes array with `encoding` passed to the call. --- pandas/io/json/json.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index a1d48719ba9c0..5403eab34c1f3 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -346,7 +346,8 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if lines: # If given a json lines file, we break the string into lines, add # commas and put it in a json list to make a valid json object. - lines = list(StringIO(json.strip())) + enc = encoding if encoding else 'utf-8' + lines = list(StringIO(json.decode(enc).strip())) json = '[' + ','.join(lines) + ']' obj = None From caa3c80a2e1dd70700efd14af3e8ddbb677c797f Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Tue, 8 Aug 2017 15:34:12 -0500 Subject: [PATCH 02/19] Revert "Fix for #17200" This reverts commit 8255cd0aa006a9d2da131cdea4297dc0af9992f6. --- pandas/io/json/json.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 5403eab34c1f3..a1d48719ba9c0 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -346,8 +346,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, if lines: # If given a json lines file, we break the string into lines, add # commas and put it in a json list to make a valid json object. 
- enc = encoding if encoding else 'utf-8' - lines = list(StringIO(json.decode(enc).strip())) + lines = list(StringIO(json.strip())) json = '[' + ','.join(lines) + ']' obj = None From 47092510cdea48eb254e70bd624123e3041a3483 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Tue, 8 Aug 2017 15:37:24 -0500 Subject: [PATCH 03/19] Wrap BytesIO based streams when using --- pandas/io/common.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index cbfc33dbebb81..e3d9876e75be6 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -194,6 +194,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, """ filepath_or_buffer = _stringify_path(filepath_or_buffer) + from io import TextIOWrapper if _is_url(filepath_or_buffer): req = _urlopen(filepath_or_buffer) content_encoding = req.headers.get('Content-Encoding', None) @@ -201,13 +202,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, # Override compression based on Content-Encoding header compression = 'gzip' reader = BytesIO(req.read()) - return reader, encoding, compression + return TextIOWrapper(reader, encoding=encoding), encoding, compression if _is_s3_url(filepath_or_buffer): from pandas.io import s3 - return s3.get_filepath_or_buffer(filepath_or_buffer, - encoding=encoding, - compression=compression) + ret = s3.get_filepath_or_buffer(filepath_or_buffer, + encoding=encoding, + compression=compression) + return TextIOWrapper(ret[0], encoding=encoding), ret[1], ret[2] if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, From a042e3c771597e2606b38b80c624e4720548bd54 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Tue, 29 Aug 2017 14:31:53 -0500 Subject: [PATCH 04/19] Added and deps for mocking s3 testing calls. 
--- ci/requirements-2.7.pip | 1 + ci/requirements-2.7.run | 1 + ci/requirements-2.7_SLOW.run | 1 + ci/requirements-3.5.pip | 1 + ci/requirements-3.5.run | 1 + ci/requirements-3.5_OSX.run | 1 + ci/requirements-3.6.pip | 1 + ci/requirements-3.6.run | 1 + ci/requirements-3.6_LOCALE.run | 1 + ci/requirements-3.6_LOCALE_SLOW.run | 1 + ci/requirements_dev.txt | 2 ++ pandas/tests/io/json/test_pandas.py | 28 +++++++++++++++++++++++++++- 12 files changed, 39 insertions(+), 1 deletion(-) diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index 876d9e978fa84..6316e64417e18 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -8,3 +8,4 @@ py PyCrypto mock ipython +moto \ No newline at end of file diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index 7152cb2c8b605..0220428cd4e6b 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -18,3 +18,4 @@ patsy pymysql=0.6.3 jinja2=2.8 xarray=0.8.0 +boto3 \ No newline at end of file diff --git a/ci/requirements-2.7_SLOW.run b/ci/requirements-2.7_SLOW.run index 0a549554f5219..0fab2120a9273 100644 --- a/ci/requirements-2.7_SLOW.run +++ b/ci/requirements-2.7_SLOW.run @@ -17,3 +17,4 @@ psycopg2 pymysql html5lib beautiful-soup +boto3 \ No newline at end of file diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip index 6e4f7b65f9728..9f8f7ee0a3563 100644 --- a/ci/requirements-3.5.pip +++ b/ci/requirements-3.5.pip @@ -1,2 +1,3 @@ xarray==0.9.1 pandas-gbq +moto \ No newline at end of file diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run index 52828b5220997..055df57338760 100644 --- a/ci/requirements-3.5.run +++ b/ci/requirements-3.5.run @@ -18,3 +18,4 @@ psycopg2 s3fs beautifulsoup4 ipython +boto3 \ No newline at end of file diff --git a/ci/requirements-3.5_OSX.run b/ci/requirements-3.5_OSX.run index 1d83474d10f2f..64cea248edec2 100644 --- a/ci/requirements-3.5_OSX.run +++ b/ci/requirements-3.5_OSX.run @@ -14,3 +14,4 @@ bottleneck xarray s3fs beautifulsoup4 
+boto3 \ No newline at end of file diff --git a/ci/requirements-3.6.pip b/ci/requirements-3.6.pip index 753a60d6c119a..42237d0d03a16 100644 --- a/ci/requirements-3.6.pip +++ b/ci/requirements-3.6.pip @@ -1 +1,2 @@ brotlipy +moto \ No newline at end of file diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run index 822144a80bc9a..06352d48c3547 100644 --- a/ci/requirements-3.6.run +++ b/ci/requirements-3.6.run @@ -23,3 +23,4 @@ beautifulsoup4 s3fs xarray ipython +boto3 \ No newline at end of file diff --git a/ci/requirements-3.6_LOCALE.run b/ci/requirements-3.6_LOCALE.run index ad54284c6f7e3..4dc44d2884070 100644 --- a/ci/requirements-3.6_LOCALE.run +++ b/ci/requirements-3.6_LOCALE.run @@ -20,3 +20,4 @@ beautifulsoup4 s3fs xarray ipython +boto3 \ No newline at end of file diff --git a/ci/requirements-3.6_LOCALE_SLOW.run b/ci/requirements-3.6_LOCALE_SLOW.run index ad54284c6f7e3..4dc44d2884070 100644 --- a/ci/requirements-3.6_LOCALE_SLOW.run +++ b/ci/requirements-3.6_LOCALE_SLOW.run @@ -20,3 +20,4 @@ beautifulsoup4 s3fs xarray ipython +boto3 \ No newline at end of file diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index 1e051802ec9f8..3ee4035769c9d 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -5,3 +5,5 @@ cython pytest pytest-cov flake8 +s3fs +boto3 \ No newline at end of file diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 671d4248818e4..2c32134f5face 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -4,11 +4,13 @@ from pandas.compat import (range, lrange, StringIO, OrderedDict, is_platform_32bit) import os +import boto3 import numpy as np from pandas import (Series, DataFrame, DatetimeIndex, Timestamp, read_json, compat) from datetime import timedelta +from moto import mock_s3 import pandas as pd from pandas.util.testing import (assert_almost_equal, assert_frame_equal, @@ -985,12 +987,36 @@ def test_tz_range_is_utc(self): df = 
DataFrame({'DT': dti}) assert dumps(df, iso_dates=True) == dfexp - def test_read_jsonl(self): + def test_read_inline_jsonl(self): # GH9180 result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) + @mock_s3 + def test_read_s3_jsonl(self): + # GH17200 + conn = boto3.client('s3') + conn.create_bucket(Bucket='testbucket') + conn.put_object(Body=b'{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', Key='items.jsonl', Bucket='testbucket') + + result = read_json('s3://testbucket/items.jsonl', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_read_local_jsonl(self): + # GH17200 + fname = "./tmp_items.jsonl" + try: + with open(fname, "w") as infile: + infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') + result = read_json(fname, lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + finally: + import os + os.remove(fname) + def test_read_jsonl_unicode_chars(self): # GH15132: non-ascii unicode characters # \u201d == RIGHT DOUBLE QUOTATION MARK From 0c164e787c3c360022c49cdfd776399da5a2f234 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Tue, 29 Aug 2017 15:21:48 -0500 Subject: [PATCH 05/19] Removed boto3 per code review --- ci/requirements-2.7.run | 3 +-- ci/requirements-2.7_SLOW.run | 3 +-- ci/requirements-3.5.run | 3 +-- ci/requirements-3.5_OSX.run | 3 +-- ci/requirements-3.6.run | 3 +-- ci/requirements-3.6_LOCALE.run | 3 +-- ci/requirements-3.6_LOCALE_SLOW.run | 3 +-- ci/requirements_dev.txt | 3 +-- 8 files changed, 8 insertions(+), 16 deletions(-) diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index 0220428cd4e6b..b1e5c33f1b0d2 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -17,5 +17,4 @@ psycopg2 patsy pymysql=0.6.3 jinja2=2.8 -xarray=0.8.0 -boto3 \ No newline at end of file +xarray=0.8.0 \ No newline at 
end of file diff --git a/ci/requirements-2.7_SLOW.run b/ci/requirements-2.7_SLOW.run index 0fab2120a9273..fd8962049901e 100644 --- a/ci/requirements-2.7_SLOW.run +++ b/ci/requirements-2.7_SLOW.run @@ -16,5 +16,4 @@ s3fs psycopg2 pymysql html5lib -beautiful-soup -boto3 \ No newline at end of file +beautiful-soup \ No newline at end of file diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run index 055df57338760..94faf4e6af1ad 100644 --- a/ci/requirements-3.5.run +++ b/ci/requirements-3.5.run @@ -17,5 +17,4 @@ pymysql psycopg2 s3fs beautifulsoup4 -ipython -boto3 \ No newline at end of file +ipython \ No newline at end of file diff --git a/ci/requirements-3.5_OSX.run b/ci/requirements-3.5_OSX.run index 64cea248edec2..78ae7c1ea703e 100644 --- a/ci/requirements-3.5_OSX.run +++ b/ci/requirements-3.5_OSX.run @@ -13,5 +13,4 @@ jinja2 bottleneck xarray s3fs -beautifulsoup4 -boto3 \ No newline at end of file +beautifulsoup4 \ No newline at end of file diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run index 06352d48c3547..2df5719ca77b6 100644 --- a/ci/requirements-3.6.run +++ b/ci/requirements-3.6.run @@ -22,5 +22,4 @@ fastparquet beautifulsoup4 s3fs xarray -ipython -boto3 \ No newline at end of file +ipython \ No newline at end of file diff --git a/ci/requirements-3.6_LOCALE.run b/ci/requirements-3.6_LOCALE.run index 4dc44d2884070..9b651b337d349 100644 --- a/ci/requirements-3.6_LOCALE.run +++ b/ci/requirements-3.6_LOCALE.run @@ -19,5 +19,4 @@ psycopg2 beautifulsoup4 s3fs xarray -ipython -boto3 \ No newline at end of file +ipython \ No newline at end of file diff --git a/ci/requirements-3.6_LOCALE_SLOW.run b/ci/requirements-3.6_LOCALE_SLOW.run index 4dc44d2884070..9b651b337d349 100644 --- a/ci/requirements-3.6_LOCALE_SLOW.run +++ b/ci/requirements-3.6_LOCALE_SLOW.run @@ -19,5 +19,4 @@ psycopg2 beautifulsoup4 s3fs xarray -ipython -boto3 \ No newline at end of file +ipython \ No newline at end of file diff --git a/ci/requirements_dev.txt 
b/ci/requirements_dev.txt index 3ee4035769c9d..6a1797d21b1a4 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -5,5 +5,4 @@ cython pytest pytest-cov flake8 -s3fs -boto3 \ No newline at end of file +s3fs \ No newline at end of file From ac9913363e85bb08f21aa68ca0bf5086baee3bd9 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Tue, 29 Aug 2017 16:13:24 -0500 Subject: [PATCH 06/19] Skip if imports don't exist. Create fixture for test setup. --- pandas/tests/io/json/test_pandas.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 2c32134f5face..4d1fe73cc6018 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -4,13 +4,11 @@ from pandas.compat import (range, lrange, StringIO, OrderedDict, is_platform_32bit) import os -import boto3 - import numpy as np from pandas import (Series, DataFrame, DatetimeIndex, Timestamp, read_json, compat) from datetime import timedelta -from moto import mock_s3 +moto = pytest.importorskip("moto") import pandas as pd from pandas.util.testing import (assert_almost_equal, assert_frame_equal, @@ -993,12 +991,22 @@ def test_read_inline_jsonl(self): expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) - @mock_s3 - def test_read_s3_jsonl(self): - # GH17200 + @pytest.yield_fixture(scope="function") + def testbucket_conn(self): + """ Fixture for test_read_s3_jsonl""" + boto3 = pytest.importorskip('boto3') + moto.mock_s3().start() # Start and stop mocking only once, surrounding the test run + # to ensure single context is kept. 
+ conn = boto3.client('s3') conn.create_bucket(Bucket='testbucket') - conn.put_object(Body=b'{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', Key='items.jsonl', Bucket='testbucket') + yield conn + + moto.mock_s3().stop() + + def test_read_s3_jsonl(self, testbucket_conn): + # GH17200 + testbucket_conn.put_object(Body=b'{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', Key='items.jsonl', Bucket='testbucket') result = read_json('s3://testbucket/items.jsonl', lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) From 125f0494e9da1ddba5df6dd69ac65310e3fd1dc5 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Wed, 30 Aug 2017 10:37:38 -0500 Subject: [PATCH 07/19] compatibility for Python 2.7 --- pandas/io/common.py | 5 ++++- pandas/tests/io/json/test_pandas.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index e3d9876e75be6..02b4c6468cf67 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -202,7 +202,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, # Override compression based on Content-Encoding header compression = 'gzip' reader = BytesIO(req.read()) - return TextIOWrapper(reader, encoding=encoding), encoding, compression + if compat.PY3: + return TextIOWrapper(reader, encoding=encoding), encoding, compression + else: + return reader, encoding, compression if _is_s3_url(filepath_or_buffer): from pandas.io import s3 diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 4d1fe73cc6018..e10e2ff87c3df 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -996,7 +996,7 @@ def testbucket_conn(self): """ Fixture for test_read_s3_jsonl""" boto3 = pytest.importorskip('boto3') moto.mock_s3().start() # Start and stop mocking only once, surrounding the test run - # to ensure single context is kept. + # to ensure single context is kept. 
conn = boto3.client('s3') conn.create_bucket(Bucket='testbucket') From 533d404cdb2e00632de52804b5ffc4c763b2d180 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Wed, 20 Sep 2017 18:12:54 -0500 Subject: [PATCH 08/19] Add object handling in the read_json function instead of handling in common.py. Revert handling in common.py to avoid issues with other clients. --- pandas/io/common.py | 8 ++------ pandas/io/json/json.py | 9 +++++++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 02b4c6468cf67..52b3a07eb6726 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -202,17 +202,13 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, # Override compression based on Content-Encoding header compression = 'gzip' reader = BytesIO(req.read()) - if compat.PY3: - return TextIOWrapper(reader, encoding=encoding), encoding, compression - else: - return reader, encoding, compression + return reader, encoding, compression if _is_s3_url(filepath_or_buffer): from pandas.io import s3 - ret = s3.get_filepath_or_buffer(filepath_or_buffer, + return s3.get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, compression=compression) - return TextIOWrapper(ret[0], encoding=encoding), ret[1], ret[2] if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index a1d48719ba9c0..1ab216f9350ef 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -340,12 +340,21 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, json = filepath_or_buffer elif hasattr(filepath_or_buffer, 'read'): json = filepath_or_buffer.read() + else: json = filepath_or_buffer if lines: # If given a json lines file, we break the string into lines, add # commas and put it in a json list to make a valid json object. + + """ + Handle encoded bytes arrays in PY3 and bytes objects from certain + readables before using StringIO. 
+ """ + if isinstance(json, bytes): + json = json.decode('utf-8') + lines = list(StringIO(json.strip())) json = '[' + ','.join(lines) + ']' From 17973a19181d794a48c24ca10c738df45ab25c0e Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Thu, 21 Sep 2017 10:13:48 -0500 Subject: [PATCH 09/19] add s3fs to requirements_all. Will be needed to run read_json tests anywhere --- ci/requirements_all.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/requirements_all.txt b/ci/requirements_all.txt index b153b6989df86..29361780dabdc 100644 --- a/ci/requirements_all.txt +++ b/ci/requirements_all.txt @@ -26,3 +26,4 @@ sqlalchemy bottleneck pymysql Jinja2 +s3fs \ No newline at end of file From 2ae5a9d2d59f6ea113e63167d7d93714c781d771 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Thu, 21 Sep 2017 14:22:59 -0500 Subject: [PATCH 10/19] PEP-8 fixes. Ignore tests without s3fs available. Use ensure_clean --- pandas/io/common.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index b41ba382e1f87..33cbb5a2b4ff9 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -194,7 +194,6 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, """ filepath_or_buffer = _stringify_path(filepath_or_buffer) - from io import TextIOWrapper if _is_url(filepath_or_buffer): req = _urlopen(filepath_or_buffer) content_encoding = req.headers.get('Content-Encoding', None) @@ -207,8 +206,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, if _is_s3_url(filepath_or_buffer): from pandas.io import s3 return s3.get_filepath_or_buffer(filepath_or_buffer, - encoding=encoding, - compression=compression) + encoding=encoding, + compression=compression) if isinstance(filepath_or_buffer, (compat.string_types, compat.binary_type, From 38f043b57d0ddf8e71c6abf7a0a5f210c8ec8a64 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Thu, 21 Sep 2017 20:25:59 -0500 Subject: [PATCH 11/19] PEP-8 fixes. Ignore tests without s3fs available. 
Use ensure_clean --- pandas/tests/io/json/test_pandas.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index e10e2ff87c3df..99b199a56d219 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -8,7 +8,6 @@ from pandas import (Series, DataFrame, DatetimeIndex, Timestamp, read_json, compat) from datetime import timedelta -moto = pytest.importorskip("moto") import pandas as pd from pandas.util.testing import (assert_almost_equal, assert_frame_equal, @@ -995,8 +994,13 @@ def test_read_inline_jsonl(self): def testbucket_conn(self): """ Fixture for test_read_s3_jsonl""" boto3 = pytest.importorskip('boto3') - moto.mock_s3().start() # Start and stop mocking only once, surrounding the test run - # to ensure single context is kept. + moto = pytest.importorskip("moto") + + """ + Start and stop mocking only once, surrounding the test run + to ensure single context is kept. 
+ """ + moto.mock_s3().start() conn = boto3.client('s3') conn.create_bucket(Bucket='testbucket') @@ -1005,8 +1009,13 @@ def testbucket_conn(self): moto.mock_s3().stop() def test_read_s3_jsonl(self, testbucket_conn): + pytest.importorskip('s3fs') # GH17200 - testbucket_conn.put_object(Body=b'{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', Key='items.jsonl', Bucket='testbucket') + + body = b'{"a": 1, "b": 2}\n{"b":2, "a" :1}\n' + testbucket_conn.put_object(Body=body, + Key='items.jsonl', + Bucket='testbucket') result = read_json('s3://testbucket/items.jsonl', lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) @@ -1014,16 +1023,11 @@ def test_read_s3_jsonl(self, testbucket_conn): def test_read_local_jsonl(self): # GH17200 - fname = "./tmp_items.jsonl" - try: - with open(fname, "w") as infile: - infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') + with ensure_clean('tmp_items.json') as infile: + infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') result = read_json(fname, lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) - finally: - import os - os.remove(fname) def test_read_jsonl_unicode_chars(self): # GH15132: non-ascii unicode characters From 1d03d7af0c71a56df6855cb273c90afe0cfa88e7 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Thu, 21 Sep 2017 22:19:34 -0500 Subject: [PATCH 12/19] remove bad dev deps. 
fix tempfile context in tests --- ci/requirements_dev.txt | 1 - pandas/tests/io/json/test_pandas.py | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index 8cbc947b5964d..037db6a11a3f1 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -6,4 +6,3 @@ pytest>=3.1.0 pytest-cov flake8 s3fs -moto diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 99b199a56d219..9958b1131d62c 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1023,9 +1023,10 @@ def test_read_s3_jsonl(self, testbucket_conn): def test_read_local_jsonl(self): # GH17200 - with ensure_clean('tmp_items.json') as infile: - infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') - result = read_json(fname, lines=True) + with ensure_clean('tmp_items.json') as path: + with open(path, 'w') as infile: + infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') + result = read_json(path, lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) From 78ee720addb3a16631594dca7284490198b1cc69 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Fri, 22 Sep 2017 09:37:03 -0500 Subject: [PATCH 13/19] Fixing link errors --- pandas/io/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 33cbb5a2b4ff9..69a7e69ea724b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -206,7 +206,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, if _is_s3_url(filepath_or_buffer): from pandas.io import s3 return s3.get_filepath_or_buffer(filepath_or_buffer, - encoding=encoding, + encoding=encoding, compression=compression) if isinstance(filepath_or_buffer, (compat.string_types, From 9d7e75b32dcdba23a994cc3257091dd093e8b670 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Mon, 25 Sep 2017 10:49:23 -0500 Subject: [PATCH 14/19] Code review 
formatting and compliance. Also use for testing --- ci/requirements-2.7.pip | 1 - ci/requirements-3.6.pip | 1 - ci/requirements-3.6.run | 2 +- ci/requirements-3.6_LOCALE.run | 2 +- ci/requirements-3.6_LOCALE_SLOW.run | 2 +- pandas/io/json/json.py | 3 +-- pandas/tests/io/json/test_pandas.py | 30 +++++-------------------- pandas/tests/io/parser/data/items.jsonl | 2 ++ pandas/tests/io/parser/test_network.py | 6 ++++- 9 files changed, 16 insertions(+), 33 deletions(-) create mode 100644 pandas/tests/io/parser/data/items.jsonl diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index 6316e64417e18..876d9e978fa84 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -8,4 +8,3 @@ py PyCrypto mock ipython -moto \ No newline at end of file diff --git a/ci/requirements-3.6.pip b/ci/requirements-3.6.pip index 42237d0d03a16..753a60d6c119a 100644 --- a/ci/requirements-3.6.pip +++ b/ci/requirements-3.6.pip @@ -1,2 +1 @@ brotlipy -moto \ No newline at end of file diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run index 2df5719ca77b6..822144a80bc9a 100644 --- a/ci/requirements-3.6.run +++ b/ci/requirements-3.6.run @@ -22,4 +22,4 @@ fastparquet beautifulsoup4 s3fs xarray -ipython \ No newline at end of file +ipython diff --git a/ci/requirements-3.6_LOCALE.run b/ci/requirements-3.6_LOCALE.run index 9b651b337d349..ad54284c6f7e3 100644 --- a/ci/requirements-3.6_LOCALE.run +++ b/ci/requirements-3.6_LOCALE.run @@ -19,4 +19,4 @@ psycopg2 beautifulsoup4 s3fs xarray -ipython \ No newline at end of file +ipython diff --git a/ci/requirements-3.6_LOCALE_SLOW.run b/ci/requirements-3.6_LOCALE_SLOW.run index 9b651b337d349..ad54284c6f7e3 100644 --- a/ci/requirements-3.6_LOCALE_SLOW.run +++ b/ci/requirements-3.6_LOCALE_SLOW.run @@ -19,4 +19,4 @@ psycopg2 beautifulsoup4 s3fs xarray -ipython \ No newline at end of file +ipython diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 1ab216f9350ef..5108123927b5e 100644 --- a/pandas/io/json/json.py +++ 
b/pandas/io/json/json.py @@ -349,8 +349,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, # commas and put it in a json list to make a valid json object. """ - Handle encoded bytes arrays in PY3 and bytes objects from certain - readables before using StringIO. + If PY3 and/or isinstance(json, bytes) """ if isinstance(json, bytes): json = json.decode('utf-8') diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9958b1131d62c..bdf4fcb963113 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -4,7 +4,10 @@ from pandas.compat import (range, lrange, StringIO, OrderedDict, is_platform_32bit) import os +import moto import numpy as np +from pandas.tests.io.parser.test_network import s3_resource +from pandas.tests.io.parser.test_network import tips_file, jsonl_file from pandas import (Series, DataFrame, DatetimeIndex, Timestamp, read_json, compat) from datetime import timedelta @@ -990,34 +993,11 @@ def test_read_inline_jsonl(self): expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) - @pytest.yield_fixture(scope="function") - def testbucket_conn(self): - """ Fixture for test_read_s3_jsonl""" - boto3 = pytest.importorskip('boto3') - moto = pytest.importorskip("moto") - - """ - Start and stop mocking only once, surrounding the test run - to ensure single context is kept. 
- """ - moto.mock_s3().start() - - conn = boto3.client('s3') - conn.create_bucket(Bucket='testbucket') - yield conn - - moto.mock_s3().stop() - - def test_read_s3_jsonl(self, testbucket_conn): + def test_read_s3_jsonl(self, s3_resource): pytest.importorskip('s3fs') # GH17200 - body = b'{"a": 1, "b": 2}\n{"b":2, "a" :1}\n' - testbucket_conn.put_object(Body=body, - Key='items.jsonl', - Bucket='testbucket') - - result = read_json('s3://testbucket/items.jsonl', lines=True) + result = read_json('s3n://pandas-test/items.jsonl', lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/data/items.jsonl b/pandas/tests/io/parser/data/items.jsonl new file mode 100644 index 0000000000000..f784d37befa82 --- /dev/null +++ b/pandas/tests/io/parser/data/items.jsonl @@ -0,0 +1,2 @@ +{"a": 1, "b": 2} +{"b":2, "a" :1} diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 27cc708889fa2..a527dc0bd56ae 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -19,6 +19,9 @@ def tips_file(): return os.path.join(tm.get_data_path(), 'tips.csv') +@pytest.fixture(scope='module') +def jsonl_file(): + return os.path.join(tm.get_data_path(), 'items.jsonl') @pytest.fixture(scope='module') def salaries_table(): @@ -27,7 +30,7 @@ def salaries_table(): @pytest.fixture(scope='module') -def s3_resource(tips_file): +def s3_resource(tips_file, jsonl_file): pytest.importorskip('s3fs') moto.mock_s3().start() @@ -35,6 +38,7 @@ def s3_resource(tips_file): ('tips.csv', tips_file), ('tips.csv.gz', tips_file + '.gz'), ('tips.csv.bz2', tips_file + '.bz2'), + ('items.jsonl', jsonl_file), ] def add_tips_files(bucket_name): From 6979fb80690e9376e665e43b9cf21e90edb54109 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 25 Sep 2017 16:20:47 -0500 Subject: [PATCH 15/19] REF: Move s3 mocking to conftest --- pandas/tests/io/conftest.py | 74 
++++++++++++++++++++++++++ pandas/tests/io/json/test_pandas.py | 3 -- pandas/tests/io/parser/test_network.py | 52 ------------------ 3 files changed, 74 insertions(+), 55 deletions(-) create mode 100644 pandas/tests/io/conftest.py diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py new file mode 100644 index 0000000000000..828d5d0ccd3c6 --- /dev/null +++ b/pandas/tests/io/conftest.py @@ -0,0 +1,74 @@ +import os + +import moto +import pytest +from pandas.io.parsers import read_table + +HERE = os.path.dirname(__file__) + + +@pytest.fixture(scope='module') +def tips_file(): + """Path to the tips dataset""" + return os.path.join(HERE, 'parser', 'data', 'tips.csv') + + +@pytest.fixture(scope='module') +def jsonl_file(): + """Path to a JSONL dataset""" + return os.path.join(HERE, 'parser', 'data', 'items.jsonl') + + +@pytest.fixture(scope='module') +def salaries_table(): + """DataFrame with the salaries dataset""" + path = os.path.join(HERE, 'parser', 'data', 'salaries.csv') + return read_table(path) + + +@pytest.fixture(scope='module') +def s3_resource(tips_file, jsonl_file): + """Fixture for mocking S3 interaction. + + The primary bucket name is "pandas-test". The following datasets + are loaded. + + - tips.csv + - tips.csv.gz + - tips.csv.bz2 + - items.jsonl + + A private bucket "cant_get_it" is also created. The boto3 s3 resource + is yielded by the fixture.
+ """ + pytest.importorskip('s3fs') + moto.mock_s3().start() + + test_s3_files = [ + ('tips.csv', tips_file), + ('tips.csv.gz', tips_file + '.gz'), + ('tips.csv.bz2', tips_file + '.bz2'), + ('items.jsonl', jsonl_file), + ] + + def add_tips_files(bucket_name): + for s3_key, file_name in test_s3_files: + with open(file_name, 'rb') as f: + conn.Bucket(bucket_name).put_object( + Key=s3_key, + Body=f) + + boto3 = pytest.importorskip('boto3') + # see gh-16135 + bucket = 'pandas-test' + + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket=bucket) + add_tips_files(bucket) + + conn.create_bucket(Bucket='cant_get_it', ACL='private') + add_tips_files('cant_get_it') + + yield conn + + moto.mock_s3().stop() diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index bdf4fcb963113..5d4d4e6603293 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -4,10 +4,7 @@ from pandas.compat import (range, lrange, StringIO, OrderedDict, is_platform_32bit) import os -import moto import numpy as np -from pandas.tests.io.parser.test_network import s3_resource -from pandas.tests.io.parser.test_network import tips_file, jsonl_file from pandas import (Series, DataFrame, DatetimeIndex, Timestamp, read_json, compat) from datetime import timedelta diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index a527dc0bd56ae..d00d3f31ce189 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -4,10 +4,7 @@ Tests parsers ability to read and parse non-local files and hence require a network connection to be read. 
""" -import os - import pytest -import moto import pandas.util.testing as tm from pandas import DataFrame @@ -15,55 +12,6 @@ from pandas.compat import BytesIO -@pytest.fixture(scope='module') -def tips_file(): - return os.path.join(tm.get_data_path(), 'tips.csv') - -@pytest.fixture(scope='module') -def jsonl_file(): - return os.path.join(tm.get_data_path(), 'items.jsonl') - -@pytest.fixture(scope='module') -def salaries_table(): - path = os.path.join(tm.get_data_path(), 'salaries.csv') - return read_table(path) - - -@pytest.fixture(scope='module') -def s3_resource(tips_file, jsonl_file): - pytest.importorskip('s3fs') - moto.mock_s3().start() - - test_s3_files = [ - ('tips.csv', tips_file), - ('tips.csv.gz', tips_file + '.gz'), - ('tips.csv.bz2', tips_file + '.bz2'), - ('items.jsonl', jsonl_file), - ] - - def add_tips_files(bucket_name): - for s3_key, file_name in test_s3_files: - with open(file_name, 'rb') as f: - conn.Bucket(bucket_name).put_object( - Key=s3_key, - Body=f) - - boto3 = pytest.importorskip('boto3') - # see gh-16135 - bucket = 'pandas-test' - - conn = boto3.resource("s3", region_name="us-east-1") - conn.create_bucket(Bucket=bucket) - add_tips_files(bucket) - - conn.create_bucket(Bucket='cant_get_it', ACL='private') - add_tips_files('cant_get_it') - - yield conn - - moto.mock_s3().stop() - - @pytest.mark.network @pytest.mark.parametrize( "compression,extension", From bb600cb74f9c7a36e9b281bf108fbe99784e352e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Sep 2017 12:33:01 -0500 Subject: [PATCH 16/19] Update requirements-2.7.run End in newline --- ci/requirements-2.7.run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index b1e5c33f1b0d2..7152cb2c8b605 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -17,4 +17,4 @@ psycopg2 patsy pymysql=0.6.3 jinja2=2.8 -xarray=0.8.0 \ No newline at end of file +xarray=0.8.0 From 
5b036ec6f1acceff6dd6d3dbcebe0a7583d10ea5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Sep 2017 12:33:32 -0500 Subject: [PATCH 17/19] Update requirements-2.7_SLOW.run End in newline --- ci/requirements-2.7_SLOW.run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7_SLOW.run b/ci/requirements-2.7_SLOW.run index 0a19363fa3d43..f7708283ad04a 100644 --- a/ci/requirements-2.7_SLOW.run +++ b/ci/requirements-2.7_SLOW.run @@ -16,4 +16,4 @@ s3fs psycopg2 pymysql html5lib -beautiful-soup \ No newline at end of file +beautiful-soup From c5c4d07c13fe8803d2b5ab45b9425b234162747f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 26 Sep 2017 12:35:10 -0500 Subject: [PATCH 18/19] End in newlines --- ci/requirements-3.5.pip | 2 +- ci/requirements-3.5.run | 2 +- ci/requirements-3.5_OSX.run | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip index 9f8f7ee0a3563..336f73530eb2d 100644 --- a/ci/requirements-3.5.pip +++ b/ci/requirements-3.5.pip @@ -1,3 +1,3 @@ xarray==0.9.1 pandas-gbq -moto \ No newline at end of file +moto diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run index 94faf4e6af1ad..52828b5220997 100644 --- a/ci/requirements-3.5.run +++ b/ci/requirements-3.5.run @@ -17,4 +17,4 @@ pymysql psycopg2 s3fs beautifulsoup4 -ipython \ No newline at end of file +ipython diff --git a/ci/requirements-3.5_OSX.run b/ci/requirements-3.5_OSX.run index 78ae7c1ea703e..1d83474d10f2f 100644 --- a/ci/requirements-3.5_OSX.run +++ b/ci/requirements-3.5_OSX.run @@ -13,4 +13,4 @@ jinja2 bottleneck xarray s3fs -beautifulsoup4 \ No newline at end of file +beautifulsoup4 From b913972f71e896b4812966960f9cecaf7cf0c7ae Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 26 Nov 2017 18:59:34 -0500 Subject: [PATCH 19/19] linting & fix --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/io/json/json.py | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 
deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 4c6cdb9846305..84583a8b6af67 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -88,7 +88,7 @@ I/O - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`) - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`). - Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`) - +- Bug in :func:`read_json` not decoding when reading line delimited JSON from S3 (:issue:`17200`) Plotting ^^^^^^^^ diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index 11bf3a9363953..21736673350d8 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -5,7 +5,7 @@ import pandas._libs.json as json from pandas._libs.tslib import iNaT -from pandas.compat import StringIO, long, u +from pandas.compat import StringIO, long, u, to_str from pandas import compat, isna from pandas import Series, DataFrame, to_datetime, MultiIndex from pandas.io.common import (get_filepath_or_buffer, _get_handle, @@ -458,8 +458,10 @@ def read(self): if self.lines and self.chunksize: obj = concat(self) elif self.lines: + + data = to_str(self.data) obj = self._get_object_parser( - self._combine_lines(self.data.split('\n')) + self._combine_lines(data.split('\n')) ) else: obj = self._get_object_parser(self.data) @@ -612,7 +614,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: dtype = np.dtype(dtype) return data.astype(dtype), True - except: + except (TypeError, ValueError): return data, False if convert_dates: @@ -628,7 +630,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data = data.astype('float64') result = True - except: + except (TypeError, ValueError): pass if data.dtype.kind == 'f': @@ -639,7 +641,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data =
data.astype('float64') result = True - except: + except (TypeError, ValueError): pass # do't coerce 0-len data @@ -651,7 +653,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, if (new_data == data).all(): data = new_data result = True - except: + except (TypeError, ValueError): pass # coerce ints to 64 @@ -661,7 +663,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data = data.astype('int64') result = True - except: + except (TypeError, ValueError): pass return data, result @@ -680,7 +682,7 @@ def _try_convert_to_date(self, data): if new_data.dtype == 'object': try: new_data = data.astype('int64') - except: + except (TypeError, ValueError): pass # ignore numbers that are out of range @@ -697,7 +699,7 @@ def _try_convert_to_date(self, data): unit=date_unit) except ValueError: continue - except: + except Exception: break return new_data, True return data, False