Skip to content

Commit 4fd104a

Browse files
Kevin Kuhl and jreback
Kevin Kuhl
authored and committed
COMPAT: reading json with lines=True from s3, xref #17200 (#17201)
1 parent f7c79be commit 4fd104a

File tree

6 files changed

+152
-59
lines changed

6 files changed

+152
-59
lines changed

doc/source/whatsnew/v0.21.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ I/O
8888
- :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`)
8989
- Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`).
9090
- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`)
91-
91+
- Bug in :func:`read_json` not decoding when reading line delimited JSON from S3 (:issue:`17200`)
9292

9393
Plotting
9494
^^^^^^^^

pandas/io/json/json.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
import pandas._libs.json as json
77
from pandas._libs.tslib import iNaT
8-
from pandas.compat import StringIO, long, u
8+
from pandas.compat import StringIO, long, u, to_str
99
from pandas import compat, isna
1010
from pandas import Series, DataFrame, to_datetime, MultiIndex
1111
from pandas.io.common import (get_filepath_or_buffer, _get_handle,
@@ -458,8 +458,10 @@ def read(self):
458458
if self.lines and self.chunksize:
459459
obj = concat(self)
460460
elif self.lines:
461+
462+
data = to_str(self.data)
461463
obj = self._get_object_parser(
462-
self._combine_lines(self.data.split('\n'))
464+
self._combine_lines(data.split('\n'))
463465
)
464466
else:
465467
obj = self._get_object_parser(self.data)
@@ -612,7 +614,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
612614
try:
613615
dtype = np.dtype(dtype)
614616
return data.astype(dtype), True
615-
except:
617+
except (TypeError, ValueError):
616618
return data, False
617619

618620
if convert_dates:
@@ -628,7 +630,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
628630
try:
629631
data = data.astype('float64')
630632
result = True
631-
except:
633+
except (TypeError, ValueError):
632634
pass
633635

634636
if data.dtype.kind == 'f':
@@ -639,7 +641,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
639641
try:
640642
data = data.astype('float64')
641643
result = True
642-
except:
644+
except (TypeError, ValueError):
643645
pass
644646

645647
# don't coerce 0-len data
@@ -651,7 +653,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
651653
if (new_data == data).all():
652654
data = new_data
653655
result = True
654-
except:
656+
except (TypeError, ValueError):
655657
pass
656658

657659
# coerce ints to 64
@@ -661,7 +663,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
661663
try:
662664
data = data.astype('int64')
663665
result = True
664-
except:
666+
except (TypeError, ValueError):
665667
pass
666668

667669
return data, result
@@ -680,7 +682,7 @@ def _try_convert_to_date(self, data):
680682
if new_data.dtype == 'object':
681683
try:
682684
new_data = data.astype('int64')
683-
except:
685+
except (TypeError, ValueError):
684686
pass
685687

686688
# ignore numbers that are out of range
@@ -697,7 +699,7 @@ def _try_convert_to_date(self, data):
697699
unit=date_unit)
698700
except ValueError:
699701
continue
700-
except:
702+
except Exception:
701703
break
702704
return new_data, True
703705
return data, False

pandas/tests/io/conftest.py

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import os
2+
3+
import moto
4+
import pytest
5+
from pandas.io.parsers import read_table
6+
7+
HERE = os.path.dirname(__file__)
8+
9+
10+
@pytest.fixture(scope='module')
11+
def tips_file():
12+
"""Path to the tips dataset"""
13+
return os.path.join(HERE, 'parser', 'data', 'tips.csv')
14+
15+
16+
@pytest.fixture(scope='module')
17+
def jsonl_file():
18+
"""Path a JSONL dataset"""
19+
return os.path.join(HERE, 'parser', 'data', 'items.jsonl')
20+
21+
22+
@pytest.fixture(scope='module')
23+
def salaries_table():
24+
"""DataFrame with the salaries dataset"""
25+
path = os.path.join(HERE, 'parser', 'data', 'salaries.csv')
26+
return read_table(path)
27+
28+
29+
@pytest.fixture(scope='module')
30+
def s3_resource(tips_file, jsonl_file):
31+
"""Fixture for mocking S3 interaction.
32+
33+
The primary bucket name is "pandas-test". The following datasets
34+
are loaded.
35+
36+
- tips.csv
37+
- tips.csv.gz
38+
- tips.csv.bz2
39+
- items.jsonl
40+
41+
A private bucket "cant_get_it" is also created. The boto3 s3 resource
42+
is yielded by the fixture.
43+
"""
44+
pytest.importorskip('s3fs')
45+
moto.mock_s3().start()
46+
47+
test_s3_files = [
48+
('tips.csv', tips_file),
49+
('tips.csv.gz', tips_file + '.gz'),
50+
('tips.csv.bz2', tips_file + '.bz2'),
51+
('items.jsonl', jsonl_file),
52+
]
53+
54+
def add_tips_files(bucket_name):
55+
for s3_key, file_name in test_s3_files:
56+
with open(file_name, 'rb') as f:
57+
conn.Bucket(bucket_name).put_object(
58+
Key=s3_key,
59+
Body=f)
60+
61+
boto3 = pytest.importorskip('boto3')
62+
# see gh-16135
63+
bucket = 'pandas-test'
64+
65+
conn = boto3.resource("s3", region_name="us-east-1")
66+
conn.create_bucket(Bucket=bucket)
67+
add_tips_files(bucket)
68+
69+
conn.create_bucket(Bucket='cant_get_it', ACL='private')
70+
add_tips_files('cant_get_it')
71+
72+
yield conn
73+
74+
moto.mock_s3().stop()

pandas/tests/io/json/test_pandas.py

+64-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from pandas.compat import (range, lrange, StringIO,
55
OrderedDict, is_platform_32bit)
66
import os
7-
87
import numpy as np
98
from pandas import (Series, DataFrame, DatetimeIndex, Timestamp,
109
read_json, compat)
@@ -1032,6 +1031,70 @@ def test_tz_range_is_utc(self):
10321031
df = DataFrame({'DT': dti})
10331032
assert dumps(df, iso_dates=True) == dfexp
10341033

1034+
def test_read_inline_jsonl(self):
1035+
# GH9180
1036+
result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
1037+
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
1038+
assert_frame_equal(result, expected)
1039+
1040+
def test_read_s3_jsonl(self, s3_resource):
1041+
pytest.importorskip('s3fs')
1042+
# GH17200
1043+
1044+
result = read_json('s3n://pandas-test/items.jsonl', lines=True)
1045+
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
1046+
assert_frame_equal(result, expected)
1047+
1048+
def test_read_local_jsonl(self):
1049+
# GH17200
1050+
with ensure_clean('tmp_items.json') as path:
1051+
with open(path, 'w') as infile:
1052+
infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
1053+
result = read_json(path, lines=True)
1054+
expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
1055+
assert_frame_equal(result, expected)
1056+
1057+
def test_read_jsonl_unicode_chars(self):
1058+
# GH15132: non-ascii unicode characters
1059+
# \u201d == RIGHT DOUBLE QUOTATION MARK
1060+
1061+
# simulate file handle
1062+
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
1063+
json = StringIO(json)
1064+
result = read_json(json, lines=True)
1065+
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
1066+
columns=['a', 'b'])
1067+
assert_frame_equal(result, expected)
1068+
1069+
# simulate string
1070+
json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
1071+
result = read_json(json, lines=True)
1072+
expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
1073+
columns=['a', 'b'])
1074+
assert_frame_equal(result, expected)
1075+
1076+
def test_to_jsonl(self):
1077+
# GH9180
1078+
df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
1079+
result = df.to_json(orient="records", lines=True)
1080+
expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
1081+
assert result == expected
1082+
1083+
df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b'])
1084+
result = df.to_json(orient="records", lines=True)
1085+
expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
1086+
assert result == expected
1087+
assert_frame_equal(pd.read_json(result, lines=True), df)
1088+
1089+
# GH15096: escaped characters in columns and data
1090+
df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
1091+
columns=["a\\", 'b'])
1092+
result = df.to_json(orient="records", lines=True)
1093+
expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
1094+
'{"a\\\\":"foo\\"","b":"bar"}')
1095+
assert result == expected
1096+
assert_frame_equal(pd.read_json(result, lines=True), df)
1097+
10351098
def test_latin_encoding(self):
10361099
if compat.PY2:
10371100
tm.assert_raises_regex(
+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"a": 1, "b": 2}
2+
{"b":2, "a" :1}

pandas/tests/io/parser/test_network.py

-48
Original file line numberDiff line numberDiff line change
@@ -4,62 +4,14 @@
44
Tests parsers ability to read and parse non-local files
55
and hence require a network connection to be read.
66
"""
7-
import os
8-
97
import pytest
10-
import moto
118

129
import pandas.util.testing as tm
1310
from pandas import DataFrame
1411
from pandas.io.parsers import read_csv, read_table
1512
from pandas.compat import BytesIO
1613

1714

18-
@pytest.fixture(scope='module')
19-
def tips_file():
20-
return os.path.join(tm.get_data_path(), 'tips.csv')
21-
22-
23-
@pytest.fixture(scope='module')
24-
def salaries_table():
25-
path = os.path.join(tm.get_data_path(), 'salaries.csv')
26-
return read_table(path)
27-
28-
29-
@pytest.fixture(scope='module')
30-
def s3_resource(tips_file):
31-
pytest.importorskip('s3fs')
32-
moto.mock_s3().start()
33-
34-
test_s3_files = [
35-
('tips.csv', tips_file),
36-
('tips.csv.gz', tips_file + '.gz'),
37-
('tips.csv.bz2', tips_file + '.bz2'),
38-
]
39-
40-
def add_tips_files(bucket_name):
41-
for s3_key, file_name in test_s3_files:
42-
with open(file_name, 'rb') as f:
43-
conn.Bucket(bucket_name).put_object(
44-
Key=s3_key,
45-
Body=f)
46-
47-
boto3 = pytest.importorskip('boto3')
48-
# see gh-16135
49-
bucket = 'pandas-test'
50-
51-
conn = boto3.resource("s3", region_name="us-east-1")
52-
conn.create_bucket(Bucket=bucket)
53-
add_tips_files(bucket)
54-
55-
conn.create_bucket(Bucket='cant_get_it', ACL='private')
56-
add_tips_files('cant_get_it')
57-
58-
yield conn
59-
60-
moto.mock_s3().stop()
61-
62-
6315
@pytest.mark.network
6416
@pytest.mark.parametrize(
6517
"compression,extension",

0 commit comments

Comments
 (0)