From a560331b254390174259fe6df084830ce2b377a7 Mon Sep 17 00:00:00 2001 From: dickreuter Date: Wed, 26 Oct 2016 21:14:55 +0100 Subject: [PATCH 01/29] Avoids exception when pandas.io.json.json_normalize contains items in meta parameter that don't always occur in every item of the list --- pandas/io/json.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 1e258101a5d86..2ab6120dc8bdd 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -792,7 +792,10 @@ def _recursive_extract(data, path, seen_meta, level=0): if level + 1 > len(val): meta_val = seen_meta[key] else: - meta_val = _pull_field(obj, val[level:]) + try: + meta_val = _pull_field(obj, val[level:]) + except: + meta_val = np.nan meta_vals[key].append(meta_val) records.extend(recs) From 050bf60edd9e551eb6927f2c167b974d1f8eade5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 26 Oct 2016 18:18:34 -0400 Subject: [PATCH 02/29] COMPAT/TST: fix test for range testing of negative integers to neg powers xref https://github.com/numpy/numpy/pull/8127 closes #14489 Author: Jeff Reback Closes #14498 from jreback/compat and squashes the following commits: 882872e [Jeff Reback] COMPAT/TST: fix test for range testing of negative integers to neg powers --- pandas/tests/indexes/test_range.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 26d50aa55431f..38e715fce2720 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -29,12 +29,7 @@ def setUp(self): def create_index(self): return RangeIndex(5) - def test_binops(self): - ops = [operator.add, operator.sub, operator.mul, operator.floordiv, - operator.truediv, pow] - scalars = [-1, 1, 2] - idxs = [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2), - RangeIndex(-10, 10, 2), RangeIndex(5, -5, -1)] + def check_binop(self, ops, scalars, idxs): for op in ops: for a, b in combinations(idxs, 2): result = op(a, b) @@ -46,6 +41,23 @@ def test_binops(self): expected = op(Int64Index(idx), scalar) tm.assert_index_equal(result, expected) + def test_binops(self): + ops = [operator.add, operator.sub, operator.mul, operator.floordiv, + operator.truediv] + scalars = [-1, 1, 2] + idxs = [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2), + RangeIndex(-10, 10, 2), RangeIndex(5, -5, -1)] + self.check_binop(ops, scalars, idxs) + + def test_binops_pow(self): + # later versions of numpy don't allow powers of negative integers + # so test separately + # https://github.com/numpy/numpy/pull/8127 + ops = [pow] + scalars = [1, 2] + idxs = [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2)] + self.check_binop(ops, scalars, idxs) + def test_too_many_names(self): def testit(): self.index.names = ["roger", "harold"] From 66b4c835f3fd7c9a05233603792a5ac51a04193f Mon Sep 17 00:00:00 2001 From: Robert Bradshaw Date: Tue, 25 Oct 2016 22:17:35 -0700 Subject: [PATCH 03/29] BLD: Support Cython 0.25 closes #14496 --- doc/source/whatsnew/v0.19.1.txt | 2 +- setup.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 3ee4cc1dde92d..c5822ba5ea254 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -33,7 +33,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - +- Compat with Cython 0.25 for building (:issue:`14496`) - Bug in localizing an ambiguous timezone when a boolean is passed 
(:issue:`14402`) diff --git a/setup.py b/setup.py index 3f8667cd6fe42..a17dd502d7706 100755 --- a/setup.py +++ b/setup.py @@ -85,7 +85,11 @@ def is_platform_mac(): try: if not _CYTHON_INSTALLED: raise ImportError('No supported version of Cython installed.') - from Cython.Distutils import build_ext as _build_ext + try: + from Cython.Distutils.old_build_ext import old_build_ext as _build_ext + except ImportError: + # Pre 0.25 + from Cython.Distutils import build_ext as _build_ext cython = True except ImportError: cython = False From 6130e77fb7c9d44fde5d98f9719bd67bb9ec2ade Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 26 Oct 2016 18:31:03 -0400 Subject: [PATCH 04/29] BUG: Accept unicode quotechars again in pd.read_csv Title is self-explanatory. Affects Python 2.x only. Closes #14477. Author: gfyoung Closes #14492 from gfyoung/quotechar-unicode-2.x and squashes the following commits: ec9f59a [gfyoung] BUG: Accept unicode quotechars again in pd.read_csv --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/io/parsers.py | 3 +++ pandas/io/tests/parser/quoting.py | 15 ++++++++++++++- pandas/parser.pyx | 3 ++- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index c5822ba5ea254..7594478ada41a 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -36,6 +36,7 @@ Bug Fixes - Compat with Cython 0.25 for building (:issue:`14496`) +- Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`) - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`) - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f8cf04e08ab03..e0127c3544971 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1759,6 +1759,9 @@ def __init__(self, f, **kwds): self.delimiter = kwds['delimiter'] self.quotechar = kwds['quotechar'] + if isinstance(self.quotechar, compat.text_type): + self.quotechar = str(self.quotechar) + self.escapechar = kwds['escapechar'] self.doublequote = kwds['doublequote'] self.skipinitialspace = kwds['skipinitialspace'] diff --git a/pandas/io/tests/parser/quoting.py b/pandas/io/tests/parser/quoting.py index d0f1493be0621..765cec8243a0a 100644 --- a/pandas/io/tests/parser/quoting.py +++ b/pandas/io/tests/parser/quoting.py @@ -9,7 +9,7 @@ import pandas.util.testing as tm from pandas import DataFrame -from pandas.compat import StringIO +from pandas.compat import PY3, StringIO, u class QuotingTests(object): @@ -138,3 +138,16 @@ def test_double_quote(self): result = self.read_csv(StringIO(data), quotechar='"', doublequote=False) tm.assert_frame_equal(result, expected) + + def test_quotechar_unicode(self): + # See gh-14477 + data = 'a\n1' + expected = DataFrame({'a': [1]}) + + result = self.read_csv(StringIO(data), quotechar=u('"')) + tm.assert_frame_equal(result, expected) + + # Compared to Python 3.x, Python 2.x does not handle unicode well. 
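# A minimal usage sketch (hypothetical example, not part of this patch):
# with the ``compat.text_type`` coercion added in pandas/io/parsers.py
# above, an ASCII quote character supplied as unicode should be accepted
# on Python 2 as well as Python 3:
#
#     import pandas as pd
#     from pandas.compat import StringIO, u
#
#     df = pd.read_csv(StringIO('a\n1'), quotechar=u('"'))
#     assert df['a'].tolist() == [1]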
+ if PY3: + result = self.read_csv(StringIO(data), quotechar=u('\u0394')) + tm.assert_frame_equal(result, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 12525c7a9c587..0a2824e74120c 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -570,7 +570,8 @@ cdef class TextReader: if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE: raise TypeError('bad "quoting" value') - if not isinstance(quote_char, (str, bytes)) and quote_char is not None: + if not isinstance(quote_char, (str, compat.text_type, + bytes)) and quote_char is not None: dtype = type(quote_char).__name__ raise TypeError('"quotechar" must be string, ' 'not {dtype}'.format(dtype=dtype)) From 6ac759d5e24f8a1b8eb9f39f08b139079cad401e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 26 Oct 2016 18:32:41 -0400 Subject: [PATCH 05/29] BLD: fix 3.4 build for cython to 0.24.1 --- ci/requirements-3.4.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-3.4.build b/ci/requirements-3.4.build index 4a4bd9d433428..e6e59dcba63fe 100644 --- a/ci/requirements-3.4.build +++ b/ci/requirements-3.4.build @@ -1,3 +1,3 @@ numpy=1.8.1 -cython +cython=0.24.1 libgfortran=1.0 From 31ca7170edd1fa3cfcfd96b283d6821491324711 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 27 Oct 2016 09:11:41 +0200 Subject: [PATCH 06/29] TST: simplify tests for GH14346 (#14502) --- pandas/tests/indexes/test_datetimelike.py | 119 +++++++++------------- 1 file changed, 48 insertions(+), 71 deletions(-) diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index b04e840ffc849..68db163be6fde 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -732,30 +732,21 @@ def test_fillna_datetime64(self): dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - def test_difference_of_union(self): - # GH14323: Test taking the union of differences of an Index. - # Difference of DatetimeIndex does not preserve frequency, - # so a differencing operation should not retain the freq field of the - # original index. 
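# A compact sketch of the convention these tests pin down (hypothetical
# session, not part of the patch): ``DatetimeIndex.difference`` does not
# preserve the ``freq`` of the original index:
#
#     import pandas as pd
#
#     i = pd.date_range("20160920", "20160925", freq="D")
#     a = pd.date_range("20160921", "20160924", freq="D")
#     diff = i.difference(a)
#     assert diff.freq is None   # freq is dropped by the difference
#     assert diff.equals(pd.DatetimeIndex(["20160920", "20160925"]))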
- i = pd.date_range("20160920", "20160925", freq="D") - - a = pd.date_range("20160921", "20160924", freq="D") - expected = pd.DatetimeIndex(["20160920", "20160925"], freq=None) - a_diff = i.difference(a) - tm.assert_index_equal(a_diff, expected) - tm.assert_attr_equal('freq', a_diff, expected) - - b = pd.date_range("20160922", "20160925", freq="D") - b_diff = i.difference(b) - expected = pd.DatetimeIndex(["20160920", "20160921"], freq=None) - tm.assert_index_equal(b_diff, expected) - tm.assert_attr_equal('freq', b_diff, expected) - - union_of_diff = a_diff.union(b_diff) - expected = pd.DatetimeIndex(["20160920", "20160921", "20160925"], - freq=None) - tm.assert_index_equal(union_of_diff, expected) - tm.assert_attr_equal('freq', union_of_diff, expected) + def test_difference_freq(self): + # GH14323: difference of DatetimeIndex should not preserve frequency + + index = date_range("20160920", "20160925", freq="D") + other = date_range("20160921", "20160924", freq="D") + expected = DatetimeIndex(["20160920", "20160925"], freq=None) + idx_diff = index.difference(other) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) + + other = date_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other) + expected = DatetimeIndex(["20160920", "20160921"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) class TestPeriodIndex(DatetimeLike, tm.TestCase): @@ -963,29 +954,23 @@ def test_no_millisecond_field(self): with self.assertRaises(AttributeError): DatetimeIndex([]).millisecond - def test_difference_of_union(self): - # GH14323: Test taking the union of differences of an Index. - # Difference of Period MUST preserve frequency, but the ability - # to union results must be preserved - i = pd.period_range("20160920", "20160925", freq="D") - - a = pd.period_range("20160921", "20160924", freq="D") - expected = pd.PeriodIndex(["20160920", "20160925"], freq='D') - a_diff = i.difference(a) - tm.assert_index_equal(a_diff, expected) - tm.assert_attr_equal('freq', a_diff, expected) - - b = pd.period_range("20160922", "20160925", freq="D") - b_diff = i.difference(b) - expected = pd.PeriodIndex(["20160920", "20160921"], freq='D') - tm.assert_index_equal(b_diff, expected) - tm.assert_attr_equal('freq', b_diff, expected) - - union_of_diff = a_diff.union(b_diff) - expected = pd.PeriodIndex(["20160920", "20160921", "20160925"], - freq='D') - tm.assert_index_equal(union_of_diff, expected) - tm.assert_attr_equal('freq', union_of_diff, expected) + def test_difference_freq(self): + # GH14323: difference of Period MUST preserve frequency + # but the ability to union results must be preserved + + index = period_range("20160920", "20160925", freq="D") + + other = period_range("20160921", "20160924", freq="D") + expected = PeriodIndex(["20160920", "20160925"], freq='D') + idx_diff = index.difference(other) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) + + other = period_range("20160922", "20160925", freq="D") + idx_diff = index.difference(other) + expected = PeriodIndex(["20160920", "20160921"], freq='D') + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) class TestTimedeltaIndex(DatetimeLike, tm.TestCase): @@ -1199,27 +1184,19 @@ def test_fillna_timedelta(self): [pd.Timedelta('1 day'), 'x', pd.Timedelta('3 day')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - def test_difference_of_union(self): - # 
GH14323: Test taking the union of differences of an Index. - # Difference of TimedeltaIndex does not preserve frequency, - # so a differencing operation should not retain the freq field of the - # original index. - i = pd.timedelta_range("0 days", "5 days", freq="D") - - a = pd.timedelta_range("1 days", "4 days", freq="D") - expected = pd.TimedeltaIndex(["0 days", "5 days"], freq=None) - a_diff = i.difference(a) - tm.assert_index_equal(a_diff, expected) - tm.assert_attr_equal('freq', a_diff, expected) - - b = pd.timedelta_range("2 days", "5 days", freq="D") - b_diff = i.difference(b) - expected = pd.TimedeltaIndex(["0 days", "1 days"], freq=None) - tm.assert_index_equal(b_diff, expected) - tm.assert_attr_equal('freq', b_diff, expected) - - union_of_difference = a_diff.union(b_diff) - expected = pd.TimedeltaIndex(["0 days", "1 days", "5 days"], - freq=None) - tm.assert_index_equal(union_of_difference, expected) - tm.assert_attr_equal('freq', union_of_difference, expected) + def test_difference_freq(self): + # GH14323: Difference of TimedeltaIndex should not preserve frequency + + index = timedelta_range("0 days", "5 days", freq="D") + + other = timedelta_range("1 days", "4 days", freq="D") + expected = TimedeltaIndex(["0 days", "5 days"], freq=None) + idx_diff = index.difference(other) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) + + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other) + expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal('freq', idx_diff, expected) From e7ac84d2988284604bff781c67a50974e51afdec Mon Sep 17 00:00:00 2001 From: Chris Warth Date: Thu, 27 Oct 2016 06:51:10 -0700 Subject: [PATCH 07/29] DOC: Expand on reference docs for read_json() (#14442) --- pandas/io/json.py | 91 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index 1e258101a5d86..878506a6ddc05 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -123,32 +123,38 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, file. For file URLs, a host is expected. For instance, a local file could be ``file://localhost/path/to/table.json`` - orient - - * `Series` - + orient : string, + Indication of expected JSON string format. + Compatible JSON strings can be produced by ``to_json()`` with a + corresponding orient value. + The set of possible orients is: + + - ``'split'`` : dict like + ``{index -> [index], columns -> [columns], data -> [values]}`` + - ``'records'`` : list like + ``[{column -> value}, ... , {column -> value}]`` + - ``'index'`` : dict like ``{index -> {column -> value}}`` + - ``'columns'`` : dict like ``{column -> {index -> value}}`` + - ``'values'`` : just the values array + + The allowed and default values depend on the value + of the `typ` parameter. + + * when ``typ == 'series'``, + + - allowed orients are ``{'split','records','index'}`` - default is ``'index'`` - - allowed values are: ``{'split','records','index'}`` - The Series index must be unique for orient ``'index'``. - * `DataFrame` + * when ``typ == 'frame'``, + - allowed orients are ``{'split','records','index', + 'columns','values'}`` - default is ``'columns'`` - - allowed values are: {'split','records','index','columns','values'} - - The DataFrame index must be unique for orients 'index' and - 'columns'. 
- - The DataFrame columns must be unique for orients 'index', - 'columns', and 'records'. - - * The format of the JSON string - - - split : dict like - ``{index -> [index], columns -> [columns], data -> [values]}`` - - records : list like - ``[{column -> value}, ... , {column -> value}]`` - - index : dict like ``{index -> {column -> value}}`` - - columns : dict like ``{column -> {index -> value}}`` - - values : just the values array + - The DataFrame index must be unique for orients ``'index'`` and + ``'columns'``. + - The DataFrame columns must be unique for orients ``'index'``, + ``'columns'``, and ``'records'``. typ : type of object to recover (series or frame), default 'frame' dtype : boolean or dict, default True @@ -197,7 +203,48 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, Returns ------- - result : Series or DataFrame + result : Series or DataFrame, depending on the value of `typ`. + + See Also + -------- + DataFrame.to_json + + Examples + -------- + + >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']], + ... index=['row 1', 'row 2'], + ... columns=['col 1', 'col 2']) + + Encoding/decoding a Dataframe using ``'split'`` formatted JSON: + + >>> df.to_json(orient='split') + '{"columns":["col 1","col 2"], + "index":["row 1","row 2"], + "data":[["a","b"],["c","d"]]}' + >>> pd.read_json(_, orient='split') + col 1 col 2 + row 1 a b + row 2 c d + + Encoding/decoding a Dataframe using ``'index'`` formatted JSON: + + >>> df.to_json(orient='index') + '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}' + >>> pd.read_json(_, orient='index') + col 1 col 2 + row 1 a b + row 2 c d + + Encoding/decoding a Dataframe using ``'records'`` formatted JSON. + Note that index labels are not preserved with this encoding. + + >>> df.to_json(orient='records') + '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]' + >>> pd.read_json(_, orient='records') + col 1 col 2 + 0 a b + 1 c d """ filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, From d7fb5bd310edcab9875c1f9339b62e92baae8291 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 27 Oct 2016 15:53:09 +0200 Subject: [PATCH 08/29] BUG: fix DatetimeIndex._maybe_cast_slice_bound for empty index (GH14354) (#14501) --- doc/source/whatsnew/v0.19.1.txt | 4 +++- pandas/tseries/index.py | 5 +++-- pandas/tseries/tests/test_timeseries.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 7594478ada41a..a81ab6ed0311c 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -48,6 +48,7 @@ Bug Fixes - Bug in ``RangeIndex.intersection`` when result is a empty set (:issue:`14364`). - Bug in union of differences from a ``DatetimeIndex`; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`) +- Regression in ``DatetimeIndex._maybe_cast_slice_bound`` when index is empty (:issue:`14354`). 
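  A short sketch of the regression entry above (hypothetical session, not
  part of the patch): string slicing a Series backed by an empty
  ``DatetimeIndex`` should return an empty result::

      import pandas as pd

      s = pd.Series([], index=pd.DatetimeIndex([]))
      result = s.loc['2015-01-02':'2015-01-03']
      assert len(result) == 0   # empty result rather than swapped bounds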
- Bug in groupby-transform broadcasting that could cause incorrect dtype coercion (:issue:`14457`) @@ -78,4 +79,5 @@ Bug Fixes -- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`) +- Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` + is not scalar and ``values`` is not specified (:issue:`14380`) \ No newline at end of file diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index f68750e242f1f..70e2d2c121773 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1453,8 +1453,9 @@ def _maybe_cast_slice_bound(self, label, side, kind): # lower, upper form the half-open interval: # [parsed, parsed + 1 freq) # because label may be passed to searchsorted - # the bounds need swapped if index is reverse sorted - if self.is_monotonic_decreasing: + # the bounds need swapped if index is reverse sorted and has a + # length (is_monotonic_decreasing gives True for empty index) + if self.is_monotonic_decreasing and len(self): return upper if side == 'left' else lower return lower if side == 'left' else upper else: diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index c13805d383e5d..aa8a5d10cd9d3 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -3911,6 +3911,18 @@ def test_slice_with_zero_step_raises(self): self.assertRaisesRegexp(ValueError, 'slice step cannot be zero', lambda: ts.ix[::0]) + def test_slice_bounds_empty(self): + # GH 14354 + empty_idx = DatetimeIndex(freq='1H', periods=0, end='2015') + + right = empty_idx._maybe_cast_slice_bound('2015-01-02', 'right', 'loc') + exp = Timestamp('2015-01-02 23:59:59.999999999') + self.assertEqual(right, exp) + + left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc') + exp = Timestamp('2015-01-02 00:00:00') + self.assertEqual(left, exp) + class TestDatetime64(tm.TestCase): """ From 096d8866a90c8cbb44ab8243320e811fc24190bd Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 28 Oct 2016 04:37:55 -0400 Subject: [PATCH 09/29] MAINT: Expand lint for *.py (#14516) --- ci/lint.sh | 14 ++++---------- pandas/core/groupby.py | 2 +- pandas/core/internals.py | 3 ++- pandas/io/parsers.py | 10 +++++----- pandas/io/tests/parser/common.py | 4 ++-- pandas/msgpack/__init__.py | 14 ++++++-------- pandas/tests/indexes/test_base.py | 3 +-- pandas/util/testing.py | 2 +- 8 files changed, 22 insertions(+), 30 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index a866b04445f96..115a2cdaf7899 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -7,16 +7,10 @@ source activate pandas RET=0 if [ "$LINT" ]; then - echo "Linting" - for path in 'api' 'core' 'indexes' 'types' 'formats' 'io' 'stats' 'compat' 'sparse' 'tools' 'tseries' 'tests' 'computation' 'util' - do - echo "linting -> pandas/$path" - flake8 pandas/$path --filename '*.py' - if [ $? -ne "0" ]; then - RET=1 - fi - - done + # pandas/rpy is deprecated and will be removed. + # pandas/src is C code, so no need to search there. 
+ echo "Linting *.py" + flake8 pandas --filename '*.py' --exclude pandas/rpy,pandas/src echo "Linting *.py DONE" echo "Linting *.pyx" diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2a7f896e1b871..afddb86988970 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -6,7 +6,7 @@ import warnings import copy -from pandas.compat import( +from pandas.compat import ( zip, range, long, lzip, callable, map ) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 11721a5bdac29..d9d4bb0d14228 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1147,8 +1147,9 @@ def get_result(other): def handle_error(): if raise_on_error: + # The 'detail' variable is defined in outer scope. raise TypeError('Could not operate %s with block values %s' % - (repr(other), str(detail))) + (repr(other), str(detail))) # noqa else: # return the values result = np.empty(values.shape, dtype='O') diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e0127c3544971..9e5fcd406a750 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2194,16 +2194,16 @@ def _handle_usecols(self, columns, usecols_key): usecols_key is used if there are string usecols. """ if self.usecols is not None: - if any([isinstance(u, string_types) for u in self.usecols]): + if any([isinstance(col, string_types) for col in self.usecols]): if len(columns) > 1: raise ValueError("If using multiple headers, usecols must " "be integers.") col_indices = [] - for u in self.usecols: - if isinstance(u, string_types): - col_indices.append(usecols_key.index(u)) + for col in self.usecols: + if isinstance(col, string_types): + col_indices.append(usecols_key.index(col)) else: - col_indices.append(u) + col_indices.append(col) else: col_indices = self.usecols diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 0219e16391be8..0364b3bf42fff 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -17,8 +17,8 @@ import pandas.util.testing as tm from pandas import DataFrame, Series, Index, MultiIndex from pandas import compat -from pandas.compat import(StringIO, BytesIO, PY3, - range, lrange, u) +from pandas.compat import (StringIO, BytesIO, PY3, + range, lrange, u) from pandas.io.common import DtypeWarning, EmptyDataError, URLError from pandas.io.parsers import TextFileReader, TextParser diff --git a/pandas/msgpack/__init__.py b/pandas/msgpack/__init__.py index 0c2370df936a4..33d60a12ef0a3 100644 --- a/pandas/msgpack/__init__.py +++ b/pandas/msgpack/__init__.py @@ -1,11 +1,10 @@ # coding: utf-8 -# flake8: noqa - -from pandas.msgpack._version import version -from pandas.msgpack.exceptions import * from collections import namedtuple +from pandas.msgpack.exceptions import * # noqa +from pandas.msgpack._version import version # noqa + class ExtType(namedtuple('ExtType', 'code data')): """ExtType represents ext type in msgpack.""" @@ -18,11 +17,10 @@ def __new__(cls, code, data): raise ValueError("code must be 0~127") return super(ExtType, cls).__new__(cls, code, data) +import os # noqa -import os -from pandas.msgpack._packer import Packer -from pandas.msgpack._unpacker import unpack, unpackb, Unpacker - +from pandas.msgpack._packer import Packer # noqa +from pandas.msgpack._unpacker import unpack, unpackb, Unpacker # noqa def pack(o, stream, **kwargs): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 21471b1883209..b839ed6331457 100644 --- a/pandas/tests/indexes/test_base.py +++ 
b/pandas/tests/indexes/test_base.py @@ -1576,11 +1576,10 @@ def test_string_index_repr(self): # py3/py2 repr can differ because of "u" prefix # which also affects to displayed element size - # suppress flake8 warnings if PY3: coerce = lambda x: x else: - coerce = unicode + coerce = unicode # noqa # short idx = pd.Index(['a', 'bb', 'ccc']) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 57bb01e5e0406..05517bf6cf53a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -35,7 +35,7 @@ from pandas.core.algorithms import take_1d import pandas.compat as compat -from pandas.compat import( +from pandas.compat import ( filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter, raise_with_traceback, httplib, is_platform_windows, is_platform_32bit, PY3 From 7f5a45c1b388b3f7f309f82bfa0733b7b9980c3a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 29 Oct 2016 15:24:19 +0200 Subject: [PATCH 10/29] BUG/ERR: raise correct error when sql driver is not installed (#14527) When the driver was not installed, but sqlalchemy itself was, when passing a URI string, you got an error indicating that SQLAlchemy was not installed, instead of the driver not being installed. This was because the import error for the driver was captured as import error for sqlalchemy. --- pandas/io/sql.py | 5 +++-- pandas/io/tests/test_sql.py | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 47642c2e2bc28..c9f8d32e1b504 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -507,10 +507,11 @@ def _engine_builder(con): if isinstance(con, string_types): try: import sqlalchemy - con = sqlalchemy.create_engine(con) - return con except ImportError: _SQLALCHEMY_INSTALLED = False + else: + con = sqlalchemy.create_engine(con) + return con return con diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index af8989baabbc0..e9d19bbd8be66 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -944,7 +944,7 @@ def test_sqlalchemy_type_mapping(self): self.assertTrue(isinstance( table.table.c['time'].type, sqltypes.DateTime)) - def test_to_sql_read_sql_with_database_uri(self): + def test_database_uri_string(self): # Test read_sql and .to_sql method with a database URI (GH10654) test_frame1 = self.test_frame1 @@ -963,6 +963,12 @@ def test_to_sql_read_sql_with_database_uri(self): tm.assert_frame_equal(test_frame1, test_frame3) tm.assert_frame_equal(test_frame1, test_frame4) + # using driver that will not be installed on Travis to trigger error + # in sqlalchemy.create_engine -> test passing of this error to user + db_uri = "postgresql+pg8000://user:pass@host/dbname" + with tm.assertRaisesRegexp(ImportError, "pg8000"): + sql.read_sql("select * from table", db_uri) + def _make_iris_table_metadata(self): sa = sqlalchemy metadata = sa.MetaData() From b793443030fe60eef6d198538d6ebd1d5fe6f55e Mon Sep 17 00:00:00 2001 From: dickreuter Date: Sun, 30 Oct 2016 20:06:24 +0000 Subject: [PATCH 11/29] Added documentation and test for issue #14505 --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/io/tests/json/test_json_norm.py | 45 ++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 3ee4cc1dde92d..78a98453ed9ef 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -78,3 +78,4 @@ Bug Fixes - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` 
   or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`)
+- Bug in ``pandas.io.json.json_normalize`` where a ``KeyError`` was raised when an item did not contain all of the fields listed in the ``meta`` parameter; such missing meta fields are now filled with ``np.nan`` instead of raising an error (:issue:`14505`)
\ No newline at end of file
diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py
index 4848db97194d9..ef0e3c620ef72 100644
--- a/pandas/io/tests/json/test_json_norm.py
+++ b/pandas/io/tests/json/test_json_norm.py
@@ -225,6 +225,51 @@ def test_nested_flattens(self):

         self.assertEqual(result, expected)

+    def test_json_normalise_fix(self):
+        j = {
+            "Trades": [{
+                "general": {
+                    "tradeid": 100,
+                    "trade_version": 1,
+                    "stocks": [{
+
+                        "symbol": "AAPL",
+                        "name": "Apple",
+                        "price": "0"
+
+                    }, {
+
+                        "symbol": "GOOG",
+                        "name": "Google",
+                        "price": "0"
+
+                    }
+                    ]
+                },
+            }, {
+                "general": {
+                    "tradeid": 100,
+                    "stocks": [{
+
+                        "symbol": "AAPL",
+                        "name": "Apple",
+                        "price": "0"
+
+                    }, {
+                        "symbol": "GOOG",
+                        "name": "Google",
+                        "price": "0"
+
+                    }
+                    ]
+                },
+            }
+            ]
+        }
+        j = json_normalize(data=j['Trades'], record_path=[['general', 'stocks']],
+                           meta=[['general', 'tradeid'], ['general', 'trade_version']])
+        self.assertEqual(len(j), 4)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure',
                          '-s'], exit=False)

From 1ce62992ac086d40353228902048a13e8765ceb5 Mon Sep 17 00:00:00 2001
From: Anthonios Partheniou
Date: Mon, 31 Oct 2016 08:23:39 -0400
Subject: [PATCH 12/29] DOC: Simplify the gbq integration testing procedure
 for contributors (#14541)

---
 ci/travis_encrypt_gbq.sh            | 11 +++++------
 ci/travis_gbq_config.txt            |  1 -
 ci/travis_process_gbq_encryption.sh |  6 ++++--
 doc/source/contributing.rst         | 30 +++++++++++++++--------------
 4 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/ci/travis_encrypt_gbq.sh b/ci/travis_encrypt_gbq.sh
index 719db67f384e0..e404ca73a405e 100755
--- a/ci/travis_encrypt_gbq.sh
+++ b/ci/travis_encrypt_gbq.sh
@@ -1,11 +1,10 @@
 #!/bin/bash

 GBQ_JSON_FILE=$1
-GBQ_PROJECT_ID=$2

-if [[ $# -ne 2 ]]; then
+if [[ $# -ne 1 ]]; then
     echo -e "Too few arguments.\nUsage: ./travis_encrypt_gbq.sh "\
-        "<gbq-json-credentials-file> <gbq-project-id>"
+        "<gbq-json-credentials-file>"
     exit 1
 fi
@@ -23,9 +22,9 @@ echo "Encrypting $GBQ_JSON_FILE..."
 read -d "\n" TRAVIS_KEY TRAVIS_IV <<<$(travis encrypt-file $GBQ_JSON_FILE \
 travis_gbq.json.enc -f | grep -o "\w*_iv\|\w*_key");

-echo "Adding your secure key and project id to travis_gbq_config.txt ..."
-echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY\n"\
-"GBQ_PROJECT_ID='$GBQ_PROJECT_ID'" > travis_gbq_config.txt
+echo "Adding your secure key to travis_gbq_config.txt ..."
+echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY"\
+> travis_gbq_config.txt

 echo "Done.
Removing file $GBQ_JSON_FILE" rm $GBQ_JSON_FILE diff --git a/ci/travis_gbq_config.txt b/ci/travis_gbq_config.txt index 3b68d62f177cc..0b28cdedbd0d7 100644 --- a/ci/travis_gbq_config.txt +++ b/ci/travis_gbq_config.txt @@ -1,3 +1,2 @@ TRAVIS_IV_ENV=encrypted_1d9d7b1f171b_iv TRAVIS_KEY_ENV=encrypted_1d9d7b1f171b_key -GBQ_PROJECT_ID='pandas-travis' diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh index 7ff4c08f78e37..9967d40e49f0a 100755 --- a/ci/travis_process_gbq_encryption.sh +++ b/ci/travis_process_gbq_encryption.sh @@ -2,10 +2,12 @@ source ci/travis_gbq_config.txt -if [[ -n ${!TRAVIS_IV_ENV} ]]; then +if [[ -n ${SERVICE_ACCOUNT_KEY} ]]; then + echo "${SERVICE_ACCOUNT_KEY}" > ci/travis_gbq.json; +elif [[ -n ${!TRAVIS_IV_ENV} ]]; then openssl aes-256-cbc -K ${!TRAVIS_KEY_ENV} -iv ${!TRAVIS_IV_ENV} \ -in ci/travis_gbq.json.enc -out ci/travis_gbq.json -d; - export GBQ_PROJECT_ID=$GBQ_PROJECT_ID; + export GBQ_PROJECT_ID='pandas-travis'; echo 'Successfully decrypted gbq credentials' fi diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index a8a47a9d979c0..44ee6223d5ee1 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -643,20 +643,22 @@ on Travis-CI and are only accessible from the pandas-dev/pandas repository. The credentials won't be available on forks of pandas. Here are the steps to run gbq integration tests on a forked repository: -#. First, complete all the steps in the `Encrypting Files Prerequisites - `__ section. -#. Sign into `Travis `__ using your GitHub account. -#. Enable your forked repository of pandas for testing in `Travis - `__. -#. Run the following command from terminal where the current working directory - is the ``ci`` folder:: - - ./travis_encrypt_gbq.sh - -#. Create a new branch from the branch used in your pull request. Commit the - encrypted file called ``travis_gbq.json.enc`` as well as the file - ``travis_gbq_config.txt``, in an otherwise empty commit. DO NOT commit the - ``*.json`` file which contains your unencrypted private key. +#. Go to `Travis CI `__ and sign in with your GitHub + account. +#. Click on the ``+`` icon next to the ``My Repositories`` list and enable + Travis builds for your fork. +#. Click on the gear icon to edit your travis build, and add two environment + variables: + + - ``GBQ_PROJECT_ID`` with the value being the ID of your BigQuery project. + + - ``SERVICE_ACCOUNT_KEY`` with the value being the contents of the JSON key + that you downloaded for your service account. Use single quotes around + your JSON key to ensure that it is treated as a string. + + For both environment variables, keep the "Display value in build log" option + DISABLED. These variables contain sensitive data and you do not want their + contents being exposed in build logs. #. Your branch should be tested automatically once it is pushed. You can check the status by visiting your Travis branches page which exists at the following location: https://travis-ci.org/your-user-name/pandas/branches . 
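A bridging sketch of the ``json_normalize`` change that PATCH 01 and PATCH 11
above introduce (hypothetical data, not taken from the patches): records that
lack one of the requested ``meta`` fields no longer raise a ``KeyError``; the
missing entries come back as ``np.nan``. PATCH 17 below later makes raising
the default again and adds ``errors='ignore'`` to opt into this behavior::

    from pandas.io.json import json_normalize

    data = [{'id': 1, 'info': {'tag': 'a'}, 'values': [{'x': 1}, {'x': 2}]},
            {'id': 2, 'values': [{'x': 3}]}]  # second record has no 'info'

    result = json_normalize(data, record_path='values',
                            meta=['id', ['info', 'tag']])
    # with the patch applied: result['info.tag'] -> ['a', 'a', nan]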
From 47f117d18a99f8bbaf2ecbc7829d198d304b8c2a Mon Sep 17 00:00:00 2001 From: Piotr Chromiec Date: Mon, 31 Oct 2016 13:24:05 +0100 Subject: [PATCH 13/29] BUG: tseries ceil doc fix (#14543) --- pandas/tseries/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 96213a4aec34d..4645ae24684ff 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -100,7 +100,7 @@ def round(self, freq, *args, **kwargs): def floor(self, freq): return self._round(freq, np.floor) - @Appender(_round_doc % "floor") + @Appender(_round_doc % "ceil") def ceil(self, freq): return self._round(freq, np.ceil) From b08811220459ac8271dd904a94e82278bd4066c4 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 31 Oct 2016 16:39:59 -0400 Subject: [PATCH 14/29] BUG: Don't parse inline quotes in skipped lines (#14514) Closes gh-14459. --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/io/tests/parser/skiprows.py | 8 ++++++ pandas/src/parser/tokenizer.c | 45 ++++++++++++++++++++---------- pandas/src/parser/tokenizer.h | 7 +++-- 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index a81ab6ed0311c..ab999643d575b 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -36,6 +36,7 @@ Bug Fixes - Compat with Cython 0.25 for building (:issue:`14496`) +- Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`) - Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`) - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`) - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py index c9f50dec6c01e..9f01adb6fabcb 100644 --- a/pandas/io/tests/parser/skiprows.py +++ b/pandas/io/tests/parser/skiprows.py @@ -190,3 +190,11 @@ def test_skiprows_lineterminator(self): skiprows=1, delim_whitespace=True, names=['date', 'time', 'var', 'flag', 'oflag']) tm.assert_frame_equal(df, expected) + + def test_skiprows_infield_quote(self): + # see gh-14459 + data = 'a"\nb"\na\n1' + expected = DataFrame({'a': [1]}) + + df = self.read_csv(StringIO(data), skiprows=2) + tm.assert_frame_equal(df, expected) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index af85b7b894d26..748edc7fcacc5 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -478,9 +478,10 @@ static int end_line(parser_t *self) { } } - if (self->state == SKIP_LINE || \ - self->state == QUOTE_IN_SKIP_LINE || \ - self->state == QUOTE_IN_QUOTE_IN_SKIP_LINE + if (self->state == START_FIELD_IN_SKIP_LINE || \ + self->state == IN_FIELD_IN_SKIP_LINE || \ + self->state == IN_QUOTED_FIELD_IN_SKIP_LINE || \ + self->state == QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE ) { TRACE(("end_line: Skipping row %d\n", self->file_lines)); // increment file line count @@ -761,38 +762,54 @@ int tokenize_bytes(parser_t *self, size_t line_limit) switch(self->state) { - case SKIP_LINE: - TRACE(("tokenize_bytes SKIP_LINE 0x%x, state %d\n", c, self->state)); + case START_FIELD_IN_SKIP_LINE: if (IS_TERMINATOR(c)) { END_LINE(); } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; } else if (IS_QUOTE(c)) { - self->state = 
QUOTE_IN_SKIP_LINE; + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; + } else if (IS_DELIMITER(c)) { + // Do nothing, we're starting a new field again. + } else { + self->state = IN_FIELD_IN_SKIP_LINE; + } + break; + + case IN_FIELD_IN_SKIP_LINE: + if (IS_TERMINATOR(c)) { + END_LINE(); + } else if (IS_CARRIAGE(c)) { + self->file_lines++; + self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; } break; - case QUOTE_IN_SKIP_LINE: + case IN_QUOTED_FIELD_IN_SKIP_LINE: if (IS_QUOTE(c)) { if (self->doublequote) { - self->state = QUOTE_IN_QUOTE_IN_SKIP_LINE; + self->state = QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE; } else { - self->state = SKIP_LINE; + self->state = IN_FIELD_IN_SKIP_LINE; } } break; - case QUOTE_IN_QUOTE_IN_SKIP_LINE: + case QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE: if (IS_QUOTE(c)) { - self->state = QUOTE_IN_SKIP_LINE; + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; } else if (IS_TERMINATOR(c)) { END_LINE(); } else if (IS_CARRIAGE(c)) { self->file_lines++; self->state = EAT_CRNL_NOP; + } else if (IS_DELIMITER(c)) { + self->state = START_FIELD_IN_SKIP_LINE; } else { - self->state = SKIP_LINE; + self->state = IN_FIELD_IN_SKIP_LINE; } break; @@ -846,9 +863,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit) // start of record if (skip_this_line(self, self->file_lines)) { if (IS_QUOTE(c)) { - self->state = QUOTE_IN_SKIP_LINE; + self->state = IN_QUOTED_FIELD_IN_SKIP_LINE; } else { - self->state = SKIP_LINE; + self->state = IN_FIELD_IN_SKIP_LINE; if (IS_TERMINATOR(c)) { END_LINE(); diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 8f7ae436bb7b7..487c1265d9358 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -123,9 +123,10 @@ typedef enum { EAT_COMMENT, EAT_LINE_COMMENT, WHITESPACE_LINE, - SKIP_LINE, - QUOTE_IN_SKIP_LINE, - QUOTE_IN_QUOTE_IN_SKIP_LINE, + START_FIELD_IN_SKIP_LINE, + IN_FIELD_IN_SKIP_LINE, + IN_QUOTED_FIELD_IN_SKIP_LINE, + QUOTE_IN_QUOTED_FIELD_IN_SKIP_LINE, FINISHED } ParserState; From 60a335e457bc40d009d5be99b384a62fa34ba3fc Mon Sep 17 00:00:00 2001 From: "Brandon M. 
Burroughs" Date: Mon, 31 Oct 2016 16:53:51 -0400 Subject: [PATCH 15/29] BUG: Dataframe constructor when given dict with None value (#14392) --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/core/series.py | 4 ++-- pandas/tests/frame/test_constructors.py | 8 ++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index ab999643d575b..cb02c3a5009ab 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -41,6 +41,7 @@ Bug Fixes - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`) - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) +- Bug in ``pd.DataFrame`` where constructor fails when given dict with ``None`` value (:issue:`14381`) - Bug in string indexing against data with ``object`` ``Index`` may raise ``AttributeError`` (:issue:`14424`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 1c6b13885dd01..188204d83d985 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2915,8 +2915,8 @@ def create_from_value(value, index, dtype): return subarr - # scalar like - if subarr.ndim == 0: + # scalar like, GH + if getattr(subarr, 'ndim', 0) == 0: if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) elif index is not None: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index d21db5ba52a45..e55ba3e161ed9 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -259,6 +259,14 @@ def test_constructor_dict(self): frame = DataFrame({'A': [], 'B': []}, columns=['A', 'B']) self.assert_index_equal(frame.index, Index([], dtype=np.int64)) + # GH 14381 + # Dict with None value + frame_none = DataFrame(dict(a=None), index=[0]) + frame_none_list = DataFrame(dict(a=[None]), index=[0]) + tm.assert_equal(frame_none.get_value(0, 'a'), None) + tm.assert_equal(frame_none_list.get_value(0, 'a'), None) + tm.assert_frame_equal(frame_none, frame_none_list) + # GH10856 # dict with scalar values should raise error, even if columns passed with tm.assertRaises(ValueError): From e5443622e413138f626db64825adb0c9e3efa4a2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 1 Nov 2016 17:15:06 +0100 Subject: [PATCH 16/29] Update ISSUE_TEMPLATE: be more explicit on where to paste the output of show_versions --- .github/ISSUE_TEMPLATE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 6f91eba1ad239..c7d731249f9cf 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -10,6 +10,6 @@ #### Output of ``pd.show_versions()``
-# Paste the output here
+# Paste the output here pd.show_versions() here
From c3e25c6a769d85709980cbc0c1d942e0dca1f402 Mon Sep 17 00:00:00 2001 From: dickreuter Date: Tue, 1 Nov 2016 20:56:06 +0000 Subject: [PATCH 17/29] Added keyword errors {'raise'|'ignore} Added documenation Shortened what's new Removed commas in dictionary for linting compatibility --- doc/source/whatsnew/v0.19.1.txt | 2 +- pandas/io/json.py | 35 ++++++++++++++------------ pandas/io/tests/json/test_json_norm.py | 16 +++++++++--- 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 78a98453ed9ef..d8a50f4a69461 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -78,4 +78,4 @@ Bug Fixes - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns`` is not scalar and ``values`` is not specified (:issue:`14380`) -- Bug in ``pandas.io.json.json_normalize``When parsing a nested json and convert it to a dataframe, the meta parameter can be used to use fields as metadata for each record in resulting table. In some cases, not all items may contain all of the specified meta fields. This change will avoid throwing an error and output np.nan instead. (:issue '14505') \ No newline at end of file +- Bug in ``pandas.io.json.json_normalize``If meta keys are not always present a new option to set errors='ignore' has been implemented (:issue:`14505`) \ No newline at end of file diff --git a/pandas/io/json.py b/pandas/io/json.py index 2ab6120dc8bdd..a02aa3beb847d 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -22,10 +22,9 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', default_handler=None, lines=False): - if lines and orient != 'records': - raise ValueError( - "'lines' keyword only valid when 'orient' is records") + raise ValueError( + "'lines' keyword only valid when 'orient' is records") if isinstance(obj, Series): s = SeriesWriter( @@ -53,7 +52,6 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', class Writer(object): - def __init__(self, obj, orient, date_format, double_precision, ensure_ascii, date_unit, default_handler=None): self.obj = obj @@ -244,7 +242,6 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, class Parser(object): - _STAMP_UNITS = ('s', 'ms', 'us', 'ns') _MIN_STAMPS = { 's': long(31536000), @@ -445,8 +442,8 @@ def _parse_no_numpy(self): if orient == "split": decoded = dict((str(k), v) for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) + json, + precise_float=self.precise_float))) self.check_keys_split(decoded) self.obj = Series(dtype=None, **decoded) else: @@ -520,8 +517,8 @@ def _parse_no_numpy(self): elif orient == "split": decoded = dict((str(k), v) for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) + json, + precise_float=self.precise_float))) self.check_keys_split(decoded) self.obj = DataFrame(dtype=None, **decoded) elif orient == "index": @@ -548,7 +545,6 @@ def _process_converter(self, f, filt=None): new_obj[i] = c if needs_new_obj: - # possibly handle dup columns new_obj = DataFrame(new_obj, index=self.obj.index) new_obj.columns = self.obj.columns @@ -581,9 +577,9 @@ def is_ok(col): col_lower = col.lower() if (col_lower.endswith('_at') or col_lower.endswith('_time') or - col_lower == 'modified' or - col_lower == 'date' or - col_lower == 'datetime' or + col_lower == 'modified' or + col_lower == 'date' or + col_lower == 'datetime' or 
col_lower.startswith('timestamp')): return True return False @@ -593,6 +589,7 @@ def is_ok(col): lambda col, c: ((self.keep_default_dates and is_ok(col)) or col in convert_dates)) + # --------------------------------------------------------------------- # JSON normalization routines @@ -676,7 +673,7 @@ def nested_to_record(ds, prefix="", level=0): def json_normalize(data, record_path=None, meta=None, meta_prefix=None, - record_prefix=None): + record_prefix=None, errors='raise'): """ "Normalize" semi-structured JSON data into a flat table @@ -693,6 +690,8 @@ def json_normalize(data, record_path=None, meta=None, If True, prefix records with dotted (?) path, e.g. foo.bar.field if path to records is ['foo', 'bar'] meta_prefix : string, default None + error: {'raise', 'ignore'}, default 'raise' + * ignore: will ignore keyErrors if keys listed in meta are not always present Returns ------- @@ -728,6 +727,7 @@ def json_normalize(data, record_path=None, meta=None, 4 Cuyahoga 1337 John Kasich Ohio OH """ + def _pull_field(js, spec): result = js if isinstance(spec, list): @@ -794,8 +794,11 @@ def _recursive_extract(data, path, seen_meta, level=0): else: try: meta_val = _pull_field(obj, val[level:]) - except: - meta_val = np.nan + except KeyError as e: + if errors == 'ignore': + meta_val = np.nan + else: + raise KeyError("Try running with errors='ignore' as the following key may not always be present: "+str(e)) meta_vals[key].append(meta_val) records.extend(recs) diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py index ef0e3c620ef72..4877728d9ec52 100644 --- a/pandas/io/tests/json/test_json_norm.py +++ b/pandas/io/tests/json/test_json_norm.py @@ -225,7 +225,9 @@ def test_nested_flattens(self): self.assertEqual(result, expected) + def test_json_normalise_fix(self): + # issue 14505 j = { "Trades": [{ "general": { @@ -245,7 +247,7 @@ def test_json_normalise_fix(self): } ] - }, + } }, { "general": { "tradeid": 100, @@ -262,13 +264,19 @@ def test_json_normalise_fix(self): } ] - }, + } } ] } j = json_normalize(data=j['Trades'], record_path=[['general', 'stocks']], - meta=[['general', 'tradeid'], ['general', 'trade_version']]) - self.assertEqual(len(j), 4) + meta=[['general', 'tradeid'], ['general', 'trade_version']], errors='ignore') + expected={'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''}, + 'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100}, + 'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'}, + 'price': {0: '0', 1: '0', 2: '0', 3: '0'}, + 'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}} + + self.assertEqual(j.fillna('').to_dict(), expected) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', From 8b805626cfd5af4d8c93c4db44b59fa85ee1b091 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 2 Nov 2016 05:57:41 -0400 Subject: [PATCH 18/29] asv compat for py3 --- asv_bench/benchmarks/inference.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 0f9689dadcbb0..2e394ed4268f3 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -148,12 +148,12 @@ class to_numeric(object): N = 500000 data_dict = { - 'string-int': (['1'] * (N / 2)) + ([2] * (N / 2)), - 'string-nint': (['-1'] * (N / 2)) + ([2] * (N / 2)), + 'string-int': (['1'] * (N // 2)) + ([2] * (N // 2)), + 'string-nint': (['-1'] * (N // 2)) + ([2] * (N // 2)), 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], 
dtype='datetime64[D]'), N), - 'string-float': (['1.1'] * (N / 2)) + ([2] * (N / 2)), - 'int-list': ([1] * (N / 2)) + ([2] * (N / 2)), + 'string-float': (['1.1'] * (N // 2)) + ([2] * (N // 2)), + 'int-list': ([1] * (N // 2)) + ([2] * (N // 2)), 'int32': np.repeat(np.int32(1), N) } From eb7bd993da0b8cbfbbe716ca67a4a8745de41e23 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Nov 2016 12:27:40 +0100 Subject: [PATCH 19/29] BUG: don't close user-provided file handles in C parser (GH14418) (#14520) --- doc/source/whatsnew/v0.19.1.txt | 1 + pandas/io/parsers.py | 2 ++ pandas/io/tests/parser/common.py | 23 +++++++++++++++++++++++ pandas/parser.pyx | 9 ++++----- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index cb02c3a5009ab..a604ead87b2ab 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -38,6 +38,7 @@ Bug Fixes - Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`) - Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`) +- Fixed regression where user-provided file handles were closed in ``read_csv`` (c engine) (:issue:`14418`). - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`) - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9e5fcd406a750..090a21632cddb 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1456,6 +1456,8 @@ def __init__(self, src, **kwds): def close(self): for f in self.handles: f.close() + + # close additional handles opened by C parser (for compression) try: self._reader.close() except: diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 0364b3bf42fff..3be02c55ea10a 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1602,3 +1602,26 @@ def test_internal_eof_byte(self): expected = pd.DataFrame([["1\x1a", 2]], columns=['a', 'b']) result = self.read_csv(StringIO(data)) tm.assert_frame_equal(result, expected) + + def test_file_handles(self): + # GH 14418 - don't close user provided file handles + + fh = StringIO('a,b\n1,2') + self.read_csv(fh) + self.assertFalse(fh.closed) + + with open(self.csv1, 'r') as f: + self.read_csv(f) + self.assertFalse(f.closed) + + # mmap not working with python engine + if self.engine != 'python': + + import mmap + with open(self.csv1, 'r') as f: + m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + self.read_csv(m) + # closed attribute new in python 3.2 + if PY3: + self.assertFalse(m.closed) + m.close() diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 0a2824e74120c..93a494c176b99 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -272,7 +272,7 @@ cdef class TextReader: parser_t *parser object file_handle, na_fvalues object true_values, false_values - object dsource + object handle bint na_filter, verbose, has_usecols, has_mi_columns int parser_start list clocks @@ -554,9 +554,9 @@ cdef class TextReader: def close(self): # we need to properly close an open derived # filehandle here, e.g. 
and UTFRecoder - if self.dsource is not None: + if self.handle is not None: try: - self.dsource.close() + self.handle.close() except: pass @@ -641,6 +641,7 @@ cdef class TextReader: else: raise ValueError('Unrecognized compression type: %s' % self.compression) + self.handle = source if isinstance(source, basestring): if not isinstance(source, bytes): @@ -684,8 +685,6 @@ cdef class TextReader: raise IOError('Expected file path name or file-like object,' ' got %s type' % type(source)) - self.dsource = source - cdef _get_header(self): # header is now a list of lists, so field_count should use header[0] From 52f31d470d779204e4c1388cdb56351c68332c3f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Nov 2016 14:01:04 +0100 Subject: [PATCH 20/29] BUG: DataFrame.quantile with NaNs (GH14357) (#14536) --- doc/source/whatsnew/v0.19.1.txt | 2 +- pandas/core/internals.py | 51 ++++++++++----- pandas/tests/frame/test_quantile.py | 97 ++++++++++++++++++++++++++++ pandas/tests/series/test_quantile.py | 32 +++++++++ 4 files changed, 166 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index a604ead87b2ab..5de59ed373523 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -66,7 +66,7 @@ Bug Fixes - Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`) - +- Regression in ``DataFrame.quantile`` when missing values where present in some columns (:issue:`14357`). - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`) - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`) - Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index d9d4bb0d14228..43beefffd448e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -6,7 +6,6 @@ from collections import defaultdict import numpy as np -from numpy import percentile as _quantile from pandas.core.base import PandasObject @@ -1316,16 +1315,38 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): values = self.get_values() values, _, _, _ = self._try_coerce_args(values, values) - mask = isnull(self.values) - if not lib.isscalar(mask) and mask.any(): - # even though this could be a 2-d mask it appears - # as a 1-d result - mask = mask.reshape(values.shape) - result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1)) - values = _block_shape(values[~mask], ndim=self.ndim) - if self.ndim > 1: - values = values.reshape(result_shape) + def _nanpercentile1D(values, mask, q, **kw): + values = values[~mask] + + if len(values) == 0: + if is_scalar(q): + return self._na_value + else: + return np.array([self._na_value] * len(q), + dtype=values.dtype) + + return np.percentile(values, q, **kw) + + def _nanpercentile(values, q, axis, **kw): + + mask = isnull(self.values) + if not is_scalar(mask) and mask.any(): + if self.ndim == 1: + return _nanpercentile1D(values, mask, q, **kw) + else: + # for nonconsolidatable blocks mask is 1D, but values 2D + if mask.ndim < values.ndim: + mask = mask.reshape(values.shape) + if axis == 0: + values = values.T + mask = mask.T + result = [_nanpercentile1D(val, m, q, **kw) for (val, m) + in zip(list(values), list(mask))] + result = np.array(result, dtype=values.dtype, copy=False).T + return result + else: + return 
np.percentile(values, q, axis=axis, **kw) from pandas import Float64Index is_empty = values.shape[axis] == 0 @@ -1344,13 +1365,13 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): else: try: - result = _quantile(values, np.array(qs) * 100, - axis=axis, **kw) + result = _nanpercentile(values, np.array(qs) * 100, + axis=axis, **kw) except ValueError: # older numpies don't handle an array for q - result = [_quantile(values, q * 100, - axis=axis, **kw) for q in qs] + result = [_nanpercentile(values, q * 100, + axis=axis, **kw) for q in qs] result = np.array(result, copy=False) if self.ndim > 1: @@ -1369,7 +1390,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): else: result = np.array([self._na_value] * len(self)) else: - result = _quantile(values, qs * 100, axis=axis, **kw) + result = _nanpercentile(values, qs * 100, axis=axis, **kw) ndim = getattr(result, 'ndim', None) or 0 result = self._try_coerce_result(result) diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py index 52e8697abe850..22414a6ba8a53 100644 --- a/pandas/tests/frame/test_quantile.py +++ b/pandas/tests/frame/test_quantile.py @@ -262,6 +262,11 @@ def test_quantile_datetime(self): index=[0.5], columns=[0, 1]) assert_frame_equal(result, expected) + # empty when numeric_only=True + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # result = df[['a', 'c']].quantile(.5) + # result = df[['a', 'c']].quantile([.5]) + def test_quantile_invalid(self): msg = 'percentiles should all be in the interval \\[0, 1\\]' for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: @@ -340,3 +345,95 @@ def test_quantile_box(self): pd.Timedelta('2 days')]], index=[0.5], columns=list('AaBbCc')) tm.assert_frame_equal(res, exp) + + def test_quantile_nan(self): + + # GH 14357 - float block where some cols have missing values + df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)}) + df.iloc[-1, 1] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + res = df.quantile(0.5, axis=1) + exp = Series(np.arange(1.0, 6.0), name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75], axis=1) + exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + # full-nan column + df['b'] = np.nan + + res = df.quantile(0.5) + exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5, 0.75]) + exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]}, + index=[0.5, 0.75]) + tm.assert_frame_equal(res, exp) + + def test_quantile_nat(self): + + # full NaT column + df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.NaT], index=['a'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = DataFrame({'a': [pd.NaT]}, index=[0.5]) + tm.assert_frame_equal(res, exp) + + # mixed non-null / full null column + df = DataFrame({'a': [pd.Timestamp('2012-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2012-01-03')], + 'b': [pd.NaT, pd.NaT, pd.NaT]}) + + res = df.quantile(0.5, numeric_only=False) + exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'], + name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = 
DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5], + columns=['a', 'b']) + tm.assert_frame_equal(res, exp) + + def test_quantile_empty(self): + + # floats + df = DataFrame(columns=['a', 'b'], dtype='float64') + + res = df.quantile(0.5) + exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5]) + exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5]) + tm.assert_frame_equal(res, exp) + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5, axis=1) + # res = df.quantile([0.5], axis=1) + + # ints + df = DataFrame(columns=['a', 'b'], dtype='int64') + + # FIXME (gives empty frame in 0.18.1, broken in 0.19.0) + # res = df.quantile(0.5) + + # datetimes + df = DataFrame(columns=['a', 'b'], dtype='datetime64') + + # FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0) + # res = df.quantile(0.5, numeric_only=False) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index 7d2517987e526..76db6c90a685f 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -184,3 +184,35 @@ def test_quantile_nat(self): res = Series([pd.NaT, pd.NaT]).quantile([0.5]) tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5])) + + def test_quantile_empty(self): + + # floats + s = Series([], dtype='float64') + + res = s.quantile(0.5) + self.assertTrue(np.isnan(res)) + + res = s.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + # int + s = Series([], dtype='int64') + + res = s.quantile(0.5) + self.assertTrue(np.isnan(res)) + + res = s.quantile([0.5]) + exp = Series([np.nan], index=[0.5]) + tm.assert_series_equal(res, exp) + + # datetime + s = Series([], dtype='datetime64[ns]') + + res = s.quantile(0.5) + self.assertTrue(res is pd.NaT) + + res = s.quantile([0.5]) + exp = Series([pd.NaT], index=[0.5]) + tm.assert_series_equal(res, exp) From 1d951794187b72841d59b3dfde0a98309a64dec7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Nov 2016 16:16:15 +0100 Subject: [PATCH 21/29] PERF: casting loc to labels dtype before searchsorted (#14551) --- doc/source/whatsnew/v0.19.1.txt | 2 +- pandas/indexes/multi.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index 5de59ed373523..80a3e38fd5edd 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -24,7 +24,7 @@ Performance Improvements - Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) - Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461) - Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461) - +- Improved performance in certain types of `loc` indexing with a MultiIndex (:issue:`14551`). 
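The cast added to ``pandas/indexes/multi.py`` below is the whole optimization: handing ``searchsorted`` a scalar of the labels' own dtype keeps NumPy on its fast comparison path instead of repeatedly upcasting the (potentially huge) label array. A minimal sketch of the effect, using made-up label codes rather than a real ``MultiIndex``::

    import numpy as np

    labels = np.zeros(10 ** 7, dtype=np.int8)  # hypothetical int8 label codes
    loc = 0                                    # lookup key as a plain Python int

    key = labels.dtype.type(loc)               # cast the key to the labels' dtype
    i = labels.searchsorted(key, side='left')
    j = labels.searchsorted(key, side='right')
    slice(i, j)                                # slice returned on the sorted fast path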
diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py
index a9f452db69659..f9576d92d8a49 100644
--- a/pandas/indexes/multi.py
+++ b/pandas/indexes/multi.py
@@ -1907,6 +1907,13 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels):
                 return np.array(labels == loc, dtype=bool)
             else:
                 # sorted, so can return slice object -> view
+                try:
+                    loc = labels.dtype.type(loc)
+                except TypeError:
+                    # this occurs when loc is a slice (partial string indexing)
+                    # but the TypeError raised by searchsorted in this case
+                    # is caught in Index._has_valid_type()
+                    pass
                 i = labels.searchsorted(loc, side='left')
                 j = labels.searchsorted(loc, side='right')
                 return slice(i, j)

From 093aa8231eae92ff6cf7ef9564d62289b458aaff Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 3 Nov 2016 10:36:10 +0100
Subject: [PATCH 22/29] DEPR: add deprecation warning for com.array_equivalent
 (#14567)

pandas.core.common.array_equivalent was removed without a deprecation
warning. This commit adds it back to the core.common namespace with a
deprecation warning.
---
 doc/source/whatsnew/v0.19.1.txt | 2 +-
 pandas/api/tests/test_api.py    | 7 +++++++
 pandas/core/common.py           | 9 +++++++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
index 80a3e38fd5edd..cbdacb95a3d4a 100644
--- a/doc/source/whatsnew/v0.19.1.txt
+++ b/doc/source/whatsnew/v0.19.1.txt
@@ -34,7 +34,7 @@ Bug Fixes
 ~~~~~~~~~
 
 - Compat with Cython 0.25 for building (:issue:`14496`)
-
+- Added back ``pandas.core.common.array_equivalent`` with a deprecation warning (:issue:`14555`).
 
 - Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`)
 - Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`)
diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py
index d4d8b7e4e9747..49aa31c375e25 100644
--- a/pandas/api/tests/test_api.py
+++ b/pandas/api/tests/test_api.py
@@ -1,5 +1,7 @@
 # -*- coding: utf-8 -*-
 
+import numpy as np
+
 import pandas as pd
 from pandas.core import common as com
 from pandas import api
@@ -184,6 +186,11 @@ def test_deprecation_core_common(self):
         for t in self.allowed:
             self.check_deprecation(getattr(com, t), getattr(types, t))
 
+    def test_deprecation_core_common_array_equivalent(self):
+
+        with tm.assert_produces_warning(DeprecationWarning):
+            com.array_equivalent(np.array([1, 2]), np.array([1, 2]))
+
     def test_deprecation_core_common_moved(self):
 
         # these are in pandas.types.common
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 341bd3b4cc845..295947bbc1166 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -64,6 +64,15 @@ def wrapper(*args, **kwargs):
     setattr(m, t, outer(t))
 
 
+# deprecate array_equivalent
+
+def array_equivalent(*args, **kwargs):
+    warnings.warn("'pandas.core.common.array_equivalent' is deprecated and "
+                  "is no longer public API", DeprecationWarning, stacklevel=2)
+    from pandas.types import missing
+    return missing.array_equivalent(*args, **kwargs)
+
+
 class PandasError(Exception):
     pass
 
From 7f0c4e084dd5fe3c010fd4f07633368004c6a0f5 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Thu, 3 Nov 2016 12:10:43 +0100
Subject: [PATCH 23/29] DOC: rst fixes
---
 doc/source/ecosystem.rst        | 2 +-
 doc/source/io.rst               | 2 +-
 doc/source/whatsnew/v0.13.0.txt | 2 +-
 doc/source/whatsnew/v0.19.1.txt | 6 +++---
 pandas/core/generic.py          | 2 ++
 5 files changed, 8 insertions(+), 6
deletions(-) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index a37b1e89c7cc3..087b265ee83f2 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -143,7 +143,7 @@ both "column wise min/max and global min/max coloring." API ----- -`pandas-datareader `__ +`pandas-datareader `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ``pandas-datareader`` is a remote data access library for pandas. ``pandas.io`` from pandas < 0.17.0 is now refactored/split-off to and importable from ``pandas_datareader`` (PyPI:``pandas-datareader``). Many/most of the supported APIs have at least a documentation paragraph in the `pandas-datareader docs `_: diff --git a/doc/source/io.rst b/doc/source/io.rst index ae71587c8b46b..ba1bd328d2991 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2789,7 +2789,7 @@ both on the writing (serialization), and reading (deserialization). | 0.17 / Python 3 | >=0.18 / any Python | +----------------------+------------------------+ | 0.18 | >= 0.18 | - +======================+========================+ + +----------------------+------------------------+ Reading (files packed by older versions) is backward-compatibile, except for files packed with 0.17 in Python 2, in which case only they can only be unpacked in Python 2. diff --git a/doc/source/whatsnew/v0.13.0.txt b/doc/source/whatsnew/v0.13.0.txt index 0944d849cfafd..6ecd4b487c798 100644 --- a/doc/source/whatsnew/v0.13.0.txt +++ b/doc/source/whatsnew/v0.13.0.txt @@ -600,7 +600,7 @@ Enhancements .. ipython:: python t = Timestamp('20130101 09:01:02') - t + pd.datetools.Nano(123) + t + pd.tseries.offsets.Nano(123) - A new method, ``isin`` for DataFrames, which plays nicely with boolean indexing. The argument to ``isin``, what we're comparing the DataFrame to, can be a DataFrame, Series, dict, or array of values. See :ref:`the docs` for more. diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt index cbdacb95a3d4a..19964a499c4e4 100644 --- a/doc/source/whatsnew/v0.19.1.txt +++ b/doc/source/whatsnew/v0.19.1.txt @@ -22,8 +22,8 @@ Performance Improvements - Fixed performance regression in factorization of ``Period`` data (:issue:`14338`) - Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) -- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461) -- Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461) +- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`) +- Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461`) - Improved performance in certain types of `loc` indexing with a MultiIndex (:issue:`14551`). @@ -50,7 +50,7 @@ Bug Fixes - Bug in ``RangeIndex.intersection`` when result is a empty set (:issue:`14364`). -- Bug in union of differences from a ``DatetimeIndex`; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`) +- Bug in union of differences from a ``DatetimeIndex``; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`) - Regression in ``DatetimeIndex._maybe_cast_slice_bound`` when index is empty (:issue:`14354`). 
- Bug in groupby-transform broadcasting that could cause incorrect dtype coercion (:issue:`14457`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 037ab900e6150..8e18b65e80385 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4008,6 +4008,8 @@ def asfreq(self, freq, method=None, how=None, normalize=False): ------- converted : type of caller + Notes + ----- To learn more about the frequency strings, please see `this link `__. """ From 252526cc0f197fb4c6b93cad41ca7cbcc5a82ed7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Nov 2016 12:12:19 +0100 Subject: [PATCH 24/29] BUG/API: Index.append with mixed object/Categorical indices (#14545) * BUG/API: Index.append with mixed object/Categorical indices * Only coerce to object if the calling index is not categorical * Add test for the df.info() case (GH14298) --- pandas/indexes/base.py | 8 ++++---- pandas/tests/frame/test_repr_info.py | 8 ++++++++ pandas/tests/indexes/test_category.py | 5 +++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 4d2dcd259e623..54eaf86315a88 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1464,13 +1464,13 @@ def append(self, other): names = set([obj.name for obj in to_concat]) name = None if len(names) > 1 else self.name - typs = _concat.get_dtype_kinds(to_concat) - - if 'category' in typs: - # if any of the to_concat is category + if self.is_categorical(): + # if calling index is category, don't check dtype of others from pandas.indexes.category import CategoricalIndex return CategoricalIndex._append_same_dtype(self, to_concat, name) + typs = _concat.get_dtype_kinds(to_concat) + if len(typs) == 1: return self._append_same_dtype(to_concat, name=name) return _concat._concat_index_asobject(to_concat, name=name) diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 5e5e9abda1200..12cd62f8b4cc0 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -405,3 +405,11 @@ def memory_usage(f): # high upper bound self.assertTrue(memory_usage(unstacked) - memory_usage(df) < 2000) + + def test_info_categorical(self): + # GH14298 + idx = pd.CategoricalIndex(['a', 'b']) + df = pd.DataFrame(np.zeros((2, 2)), index=idx, columns=idx) + + buf = StringIO() + df.info(buf=buf) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 9f8405bcc2e1e..c76f5ff22c534 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -278,6 +278,11 @@ def test_append(self): # invalid objects self.assertRaises(TypeError, lambda: ci.append(Index(['a', 'd']))) + # GH14298 - if base object is not categorical -> coerce to object + result = Index(['c', 'a']).append(ci) + expected = Index(list('caaabbca')) + tm.assert_index_equal(result, expected, exact=True) + def test_insert(self): ci = self.create_index() From e1cdc4b83d7e970e9683056be722d5fa2a00fa70 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 3 Nov 2016 15:17:42 +0100 Subject: [PATCH 25/29] DOC: update whatsnew/release notes for 0.19.1 (#14573) --- doc/source/release.rst | 44 ++++++++++++++++++++++++++++ doc/source/whatsnew/v0.19.1.txt | 51 +++++++++------------------------ 2 files changed, 57 insertions(+), 38 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index d210065f04459..622e9a53ff8f0 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ 
-37,6 +37,50 @@ analysis / manipulation tool available in any language.
 
 * Binary installers on PyPI: http://pypi.python.org/pypi/pandas
 * Documentation: http://pandas.pydata.org
 
+
+pandas 0.19.1
+-------------
+
+**Release date:** November 3, 2016
+
+This is a minor bug-fix release from 0.19.0 and includes some small regression fixes,
+bug fixes and performance improvements.
+
+See the :ref:`v0.19.1 Whatsnew <whatsnew_0191>` page for an overview of all
+bugs that have been fixed in 0.19.1.
+
+Thanks
+~~~~~~
+
+- Adam Chainz
+- Anthonios Partheniou
+- Arash Rouhani
+- Ben Kandel
+- Brandon M. Burroughs
+- Chris
+- chris-b1
+- Chris Warth
+- David Krych
+- dubourg
+- gfyoung
+- Iván Vallés Pérez
+- Jeff Reback
+- Joe Jevnik
+- Jon M. Mease
+- Joris Van den Bossche
+- Josh Owen
+- Keshav Ramaswamy
+- Larry Ren
+- mattrijk
+- Michael Felt
+- paul-mannino
+- Piotr Chromiec
+- Robert Bradshaw
+- Sinhrks
+- Thiago Serafim
+- Tom Bird
+
+
 pandas 0.19.0
 -------------
 
diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
index 19964a499c4e4..db5bd22393e64 100644
--- a/doc/source/whatsnew/v0.19.1.txt
+++ b/doc/source/whatsnew/v0.19.1.txt
@@ -1,15 +1,12 @@
 .. _whatsnew_0191:
 
-v0.19.1 (????, 2016)
---------------------
+v0.19.1 (November 3, 2016)
+--------------------------
 
-This is a minor bug-fix release from 0.19.0 and includes a large number of
-bug fixes along with several new features, enhancements, and performance improvements.
+This is a minor bug-fix release from 0.19.0 and includes some small regression fixes,
+bug fixes and performance improvements.
 We recommend that all users upgrade to this version.
 
-Highlights include:
-
-
 .. contents:: What's new in v0.19.1
     :local:
     :backlinks: none
@@ -21,52 +18,38 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
 - Fixed performance regression in factorization of ``Period`` data (:issue:`14338`)
-- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)
-- Improved performance in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`)
+- Fixed performance regression in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`)
 - Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461`)
+- Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`)
 - Improved performance in certain types of `loc` indexing with a MultiIndex (:issue:`14551`).
 
-
 .. _whatsnew_0191.bug_fixes:
 
 Bug Fixes
 ~~~~~~~~~
 
+- Source installs from PyPI will now again work without ``cython`` installed, as in previous versions (:issue:`14204`)
 - Compat with Cython 0.25 for building (:issue:`14496`)
+- Fixed regression where user-provided file handles were closed in ``read_csv`` (c engine) (:issue:`14418`).
+- Fixed regression in ``DataFrame.quantile`` when missing values were present in some columns (:issue:`14357`).
+- Fixed regression in ``Index.difference`` where the ``freq`` of a ``DatetimeIndex`` was incorrectly set (:issue:`14323`)
 - Added back ``pandas.core.common.array_equivalent`` with a deprecation warning (:issue:`14555`).
-
 - Bug in ``pd.read_csv`` for the C engine in which quotation marks were improperly parsed in skipped rows (:issue:`14459`)
 - Bug in ``pd.read_csv`` for Python 2.x in which Unicode quote characters were no longer being respected (:issue:`14477`)
-- Fixed regression where user-provided file handles were closed in ``read_csv`` (c engine) (:issue:`14418`).
+- Fixed regression in ``Index.append`` when categorical indices were appended (:issue:`14545`).
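As an illustration of the ``Index.append`` bullet just above, a small sketch (example data made up; behaviour as exercised by the new tests earlier in this series)::

    import pandas as pd

    ci = pd.CategoricalIndex(['a', 'a', 'b'])

    # a non-categorical caller now coerces the result to object dtype
    pd.Index(['c', 'a']).append(ci)   # Index(['c', 'a', 'a', 'a', 'b'], dtype='object')

    # a categorical caller keeps its categorical dtype, as before
    ci.append(pd.Index(['a', 'b']))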
+- Fixed regression in ``pd.DataFrame`` where constructor fails when given dict with ``None`` value (:issue:`14381`)
+- Fixed regression in ``DatetimeIndex._maybe_cast_slice_bound`` when index is empty (:issue:`14354`).
 - Bug in localizing an ambiguous timezone when a boolean is passed (:issue:`14402`)
 - Bug in ``TimedeltaIndex`` addition with a Datetime-like object where addition overflow in the negative direction was not being caught (:issue:`14068`, :issue:`14453`)
-
-- Bug in ``pd.DataFrame`` where constructor fails when given dict with ``None`` value (:issue:`14381`)
-
-
 - Bug in string indexing against data with ``object`` ``Index`` may raise ``AttributeError`` (:issue:`14424`)
 - Correctly raise ``ValueError`` on empty input to ``pd.eval()`` and ``df.query()`` (:issue:`13139`)
-
-
 - Bug in ``RangeIndex.intersection`` when result is an empty set (:issue:`14364`).
-- Bug in union of differences from a ``DatetimeIndex``; this is a regression in 0.19.0 from 0.18.1 (:issue:`14323`)
-- Regression in ``DatetimeIndex._maybe_cast_slice_bound`` when index is empty (:issue:`14354`).
-
 - Bug in groupby-transform broadcasting that could cause incorrect dtype coercion (:issue:`14457`)
-
-
 - Bug in ``Series.__setitem__`` which allowed mutating read-only arrays (:issue:`14359`).
-
-
-- Source installs from PyPI will now work without ``cython`` installed, as in previous versions (:issue:`14204`)
 - Bug in ``DataFrame.insert`` where multiple calls with duplicate columns can fail (:issue:`14291`)
-
 - ``pd.merge()`` will raise ``ValueError`` with non-boolean parameters in passed boolean type arguments (:issue:`14434`)
-
-
 - Bug in ``Timestamp`` where dates very near the minimum (1677-09) could underflow on creation (:issue:`14415`)
-- Regression in ``DataFrame.quantile`` when missing values where present in some columns (:issue:`14357`).
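For the ``DataFrame.quantile`` regression referenced just above, a quick sketch of the repaired behaviour (mirrors the new ``test_quantile_nan`` test from earlier in this series)::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
    df.iloc[-1, 1] = np.nan

    # NaNs are again skipped per column rather than distorting the result
    df.quantile(0.5)   # a    3.0
                       # b    2.5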
 - Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`)
 - Bug in ``pd.concat`` where ``axis`` cannot take string parameters ``'rows'`` or ``'columns'`` (:issue:`14369`)
 - Bug in ``pd.concat`` with dataframes heterogeneous in length and tuple ``keys`` (:issue:`14438`)
@@ -74,13 +57,5 @@ Bug Fixes
 - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`)
 - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)
 - Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`)
-
-
-
-
-
-
-
-
 - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns``
   is not scalar and ``values`` is not specified (:issue:`14380`)
\ No newline at end of file

From 5aaf8fe13df43d0bab7312e84af4e75ec3ae6d1f Mon Sep 17 00:00:00 2001
From: dickreuter
Date: Wed, 26 Oct 2016 21:14:55 +0100
Subject: [PATCH 26/29] Avoids exception when pandas.io.json.json_normalize
 contains items in meta parameter that don't always occur in every item of
 the list
---
 pandas/io/json.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/io/json.py b/pandas/io/json.py
index 878506a6ddc05..be69f7832e518 100644
--- a/pandas/io/json.py
+++ b/pandas/io/json.py
@@ -839,7 +839,10 @@ def _recursive_extract(data, path, seen_meta, level=0):
                 if level + 1 > len(val):
                     meta_val = seen_meta[key]
                 else:
-                    meta_val = _pull_field(obj, val[level:])
+                    try:
+                        meta_val = _pull_field(obj, val[level:])
+                    except:
+                        meta_val = np.nan
                 meta_vals[key].append(meta_val)
 
         records.extend(recs)

From 8928270af772ecb476466066f945b32842e1bc1e Mon Sep 17 00:00:00 2001
From: dickreuter
Date: Sun, 30 Oct 2016 20:06:24 +0000
Subject: [PATCH 27/29] Added documentation and test for issue #14505
---
 pandas/io/tests/json/test_json_norm.py | 45 ++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py
index 4848db97194d9..ef0e3c620ef72 100644
--- a/pandas/io/tests/json/test_json_norm.py
+++ b/pandas/io/tests/json/test_json_norm.py
@@ -225,6 +225,51 @@ def test_nested_flattens(self):
 
         self.assertEqual(result, expected)
 
+    def test_json_normalise_fix(self):
+        j = {
+            "Trades": [{
+                "general": {
+                    "tradeid": 100,
+                    "trade_version": 1,
+                    "stocks": [{
+
+                        "symbol": "AAPL",
+                        "name": "Apple",
+                        "price": "0"
+
+                    }, {
+
+                        "symbol": "GOOG",
+                        "name": "Google",
+                        "price": "0"
+
+                    }
+                    ]
+                },
+            }, {
+                "general": {
+                    "tradeid": 100,
+                    "stocks": [{
+
+                        "symbol": "AAPL",
+                        "name": "Apple",
+                        "price": "0"
+
+                    }, {
+                        "symbol": "GOOG",
+                        "name": "Google",
+                        "price": "0"
+
+                    }
+                    ]
+                },
+            }
+            ]
+        }
+        j = json_normalize(data=j['Trades'], record_path=[['general', 'stocks']],
+                           meta=[['general', 'tradeid'], ['general', 'trade_version']])
+        self.assertEqual(len(j), 4)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure',
                          '-s'], exit=False)

From 984883796e0ee6e7355c2d5a746273c9551ead91 Mon Sep 17 00:00:00 2001
From: dickreuter
Date: Tue, 1 Nov 2016 20:56:06 +0000
Subject: [PATCH 28/29] Added keyword errors {'raise'|'ignore'}

Added documentation
Shortened what's new
Removed commas in dictionary for linting compatibility
---
 pandas/io/json.py                      | 35 ++++++++++++++------------
 pandas/io/tests/json/test_json_norm.py | 16 +++++++++---
 2 files changed, 31 insertions(+), 20
deletions(-) diff --git a/pandas/io/json.py b/pandas/io/json.py index be69f7832e518..3b0930a36e199 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -22,10 +22,9 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', default_handler=None, lines=False): - if lines and orient != 'records': - raise ValueError( - "'lines' keyword only valid when 'orient' is records") + raise ValueError( + "'lines' keyword only valid when 'orient' is records") if isinstance(obj, Series): s = SeriesWriter( @@ -53,7 +52,6 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', class Writer(object): - def __init__(self, obj, orient, date_format, double_precision, ensure_ascii, date_unit, default_handler=None): self.obj = obj @@ -291,7 +289,6 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, class Parser(object): - _STAMP_UNITS = ('s', 'ms', 'us', 'ns') _MIN_STAMPS = { 's': long(31536000), @@ -492,8 +489,8 @@ def _parse_no_numpy(self): if orient == "split": decoded = dict((str(k), v) for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) + json, + precise_float=self.precise_float))) self.check_keys_split(decoded) self.obj = Series(dtype=None, **decoded) else: @@ -567,8 +564,8 @@ def _parse_no_numpy(self): elif orient == "split": decoded = dict((str(k), v) for k, v in compat.iteritems(loads( - json, - precise_float=self.precise_float))) + json, + precise_float=self.precise_float))) self.check_keys_split(decoded) self.obj = DataFrame(dtype=None, **decoded) elif orient == "index": @@ -595,7 +592,6 @@ def _process_converter(self, f, filt=None): new_obj[i] = c if needs_new_obj: - # possibly handle dup columns new_obj = DataFrame(new_obj, index=self.obj.index) new_obj.columns = self.obj.columns @@ -628,9 +624,9 @@ def is_ok(col): col_lower = col.lower() if (col_lower.endswith('_at') or col_lower.endswith('_time') or - col_lower == 'modified' or - col_lower == 'date' or - col_lower == 'datetime' or + col_lower == 'modified' or + col_lower == 'date' or + col_lower == 'datetime' or col_lower.startswith('timestamp')): return True return False @@ -640,6 +636,7 @@ def is_ok(col): lambda col, c: ((self.keep_default_dates and is_ok(col)) or col in convert_dates)) + # --------------------------------------------------------------------- # JSON normalization routines @@ -723,7 +720,7 @@ def nested_to_record(ds, prefix="", level=0): def json_normalize(data, record_path=None, meta=None, meta_prefix=None, - record_prefix=None): + record_prefix=None, errors='raise'): """ "Normalize" semi-structured JSON data into a flat table @@ -740,6 +737,8 @@ def json_normalize(data, record_path=None, meta=None, If True, prefix records with dotted (?) path, e.g. 
foo.bar.field if path to records is ['foo', 'bar']
     meta_prefix : string, default None
+    errors : {'raise', 'ignore'}, default 'raise'
+        * ignore : will ignore KeyErrors if keys listed in meta are not always present
 
     Returns
     -------
@@ -775,6 +774,7 @@ def json_normalize(data, record_path=None, meta=None,
     4    Cuyahoga     1337   John Kasich    Ohio    OH
     """
+
     def _pull_field(js, spec):
         result = js
         if isinstance(spec, list):
@@ -841,8 +841,11 @@ def _recursive_extract(data, path, seen_meta, level=0):
                 else:
                     try:
                         meta_val = _pull_field(obj, val[level:])
-                    except:
-                        meta_val = np.nan
+                    except KeyError as e:
+                        if errors == 'ignore':
+                            meta_val = np.nan
+                        else:
+                            raise KeyError("Try running with errors='ignore' as the following key may not always be present: " + str(e))
                 meta_vals[key].append(meta_val)
 
         records.extend(recs)
diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py
index ef0e3c620ef72..4877728d9ec52 100644
--- a/pandas/io/tests/json/test_json_norm.py
+++ b/pandas/io/tests/json/test_json_norm.py
@@ -225,7 +225,9 @@ def test_nested_flattens(self):
 
         self.assertEqual(result, expected)
 
+
     def test_json_normalise_fix(self):
+        # issue 14505
         j = {
             "Trades": [{
                 "general": {
@@ -245,7 +247,7 @@ def test_json_normalise_fix(self):
 
                     }
                     ]
-                },
+                }
             }, {
                 "general": {
                     "tradeid": 100,
@@ -262,13 +264,19 @@ def test_json_normalise_fix(self):
 
                     }
                     ]
-                },
+                }
             }
             ]
         }
         j = json_normalize(data=j['Trades'], record_path=[['general', 'stocks']],
-                           meta=[['general', 'tradeid'], ['general', 'trade_version']])
-        self.assertEqual(len(j), 4)
+                           meta=[['general', 'tradeid'], ['general', 'trade_version']], errors='ignore')
+        expected = {'general.trade_version': {0: 1.0, 1: 1.0, 2: '', 3: ''},
+                    'general.tradeid': {0: 100, 1: 100, 2: 100, 3: 100},
+                    'name': {0: 'Apple', 1: 'Google', 2: 'Apple', 3: 'Google'},
+                    'price': {0: '0', 1: '0', 2: '0', 3: '0'},
+                    'symbol': {0: 'AAPL', 1: 'GOOG', 2: 'AAPL', 3: 'GOOG'}}
+
+        self.assertEqual(j.fillna('').to_dict(), expected)
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb',

From cc2fdc2806c649fc4e1df49d58cddecdb78f2abd Mon Sep 17 00:00:00 2001
From: dickreuter
Date: Sun, 30 Oct 2016 20:06:24 +0000
Subject: [PATCH 29/29] Added documentation and test for issue #14505
---
 doc/source/whatsnew/v0.19.1.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
index db5bd22393e64..5022d9c12dd90 100644
--- a/doc/source/whatsnew/v0.19.1.txt
+++ b/doc/source/whatsnew/v0.19.1.txt
@@ -58,4 +58,5 @@ Bug Fixes
 - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue:`14327`)
 - Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`)
 - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns``
-  is not scalar and ``values`` is not specified (:issue:`14380`)
\ No newline at end of file
+  is not scalar and ``values`` is not specified (:issue:`14380`)
+- ``pandas.io.json.json_normalize`` gained an ``errors`` keyword (``'raise'`` or ``'ignore'``) for meta keys that are not always present (:issue:`14505`)
\ No newline at end of file
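To close out the series, a short usage sketch of the new ``errors`` keyword in ``json_normalize`` (record layout made up, modelled on the test above)::

    from pandas.io.json import json_normalize

    data = [{'id': 1, 'owner': 'alice', 'values': [{'v': 10}, {'v': 20}]},
            {'id': 2, 'values': [{'v': 30}]}]   # no 'owner' key here

    # errors='ignore' fills missing meta keys with NaN instead of raising
    json_normalize(data, record_path='values', meta=['id', 'owner'],
                   errors='ignore')
    #     v  id  owner
    # 0  10   1  alice
    # 1  20   1  alice
    # 2  30   2    NaN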