From 6faa5a66c005343c71b1ce623bb3ff89affd72b1 Mon Sep 17 00:00:00 2001 From: Chris MacLeod Date: Fri, 26 May 2017 12:06:55 -0300 Subject: [PATCH 01/55] PERF: HDFStore has faster __unicode__, new info() method with old behavior. __unicode__ now only returns file path info, not (expensive) details on all existing keys. --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.21.0.txt | 3 +- pandas/io/pytables.py | 59 ++++++++++++++++++-------------- pandas/tests/io/test_pytables.py | 37 +++++++++++--------- 4 files changed, 56 insertions(+), 44 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 888bb6d67e94b..350abb00f0849 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -99,6 +99,7 @@ HDFStore: PyTables (HDF5) HDFStore.append HDFStore.get HDFStore.select + HDFStore.info Feather ~~~~~~~ diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index b4ca3f011a81d..246cf2e4bc8f6 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -49,6 +49,8 @@ Backwards incompatible API changes - Accessing a non-existent attribute on a closed :class:`HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) +- :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). + .. _whatsnew_0210.api: Other API Changes @@ -77,7 +79,6 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - .. _whatsnew_0210.bug_fixes: Bug Fixes diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6665ccf8ce4c5..625b407dd43be 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -494,32 +494,7 @@ def __len__(self): return len(self.groups()) def __unicode__(self): - output = '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path)) - if self.is_open: - lkeys = sorted(list(self.keys())) - if len(lkeys): - keys = [] - values = [] - - for k in lkeys: - try: - s = self.get_storer(k) - if s is not None: - keys.append(pprint_thing(s.pathname or k)) - values.append( - pprint_thing(s or 'invalid_HDFStore node')) - except Exception as detail: - keys.append(k) - values.append("[invalid_HDFStore node: %s]" - % pprint_thing(detail)) - - output += adjoin(12, keys, values) - else: - output += 'Empty' - else: - output += "File is CLOSED" - - return output + return '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path)) def __enter__(self): return self @@ -1161,6 +1136,38 @@ def copy(self, file, mode='w', propindexes=True, keys=None, complib=None, return new_store + def info(self): + """return detailed information on the store + + .. 
versionadded:: 0.21.0 + """ + output = '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path)) + if self.is_open: + lkeys = sorted(list(self.keys())) + if len(lkeys): + keys = [] + values = [] + + for k in lkeys: + try: + s = self.get_storer(k) + if s is not None: + keys.append(pprint_thing(s.pathname or k)) + values.append( + pprint_thing(s or 'invalid_HDFStore node')) + except Exception as detail: + keys.append(k) + values.append("[invalid_HDFStore node: %s]" + % pprint_thing(detail)) + + output += adjoin(12, keys, values) + else: + output += 'Empty' + else: + output += "File is CLOSED" + + return output + # private methods ###### def _check_if_open(self): if not self.is_open: diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 17f524cc279c0..06a4a67964b96 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -387,6 +387,7 @@ def test_repr(self): with ensure_clean_store(self.path) as store: repr(store) + store.info() store['a'] = tm.makeTimeSeries() store['b'] = tm.makeStringSeries() store['c'] = tm.makeDataFrame() @@ -418,8 +419,9 @@ def test_repr(self): # make a random group in hdf space store._handle.create_group(store._handle.root, 'bah') - repr(store) - str(store) + assert store.filename in repr(store) + assert store.filename in str(store) + store.info() # storers with ensure_clean_store(self.path) as store: @@ -4371,11 +4373,11 @@ def test_multiple_open_close(self): # single store = HDFStore(path) - assert 'CLOSED' not in str(store) + assert 'CLOSED' not in store.info() assert store.is_open store.close() - assert 'CLOSED' in str(store) + assert 'CLOSED' in store.info() assert not store.is_open with ensure_clean_path(self.path) as path: @@ -4396,20 +4398,20 @@ def f(): store1 = HDFStore(path) store2 = HDFStore(path) - assert 'CLOSED' not in str(store1) - assert 'CLOSED' not in str(store2) + assert 'CLOSED' not in store1.info() + assert 'CLOSED' not in store2.info() assert store1.is_open assert store2.is_open store1.close() - assert 'CLOSED' in str(store1) + assert 'CLOSED' in store1.info() assert not store1.is_open - assert 'CLOSED' not in str(store2) + assert 'CLOSED' not in store2.info() assert store2.is_open store2.close() - assert 'CLOSED' in str(store1) - assert 'CLOSED' in str(store2) + assert 'CLOSED' in store1.info() + assert 'CLOSED' in store2.info() assert not store1.is_open assert not store2.is_open @@ -4420,11 +4422,11 @@ def f(): store2 = HDFStore(path) store2.append('df2', df) store2.close() - assert 'CLOSED' in str(store2) + assert 'CLOSED' in store2.info() assert not store2.is_open store.close() - assert 'CLOSED' in str(store) + assert 'CLOSED' in store.info() assert not store.is_open # double closing @@ -4433,11 +4435,11 @@ def f(): store2 = HDFStore(path) store.close() - assert 'CLOSED' in str(store) + assert 'CLOSED' in store.info() assert not store.is_open store2.close() - assert 'CLOSED' in str(store2) + assert 'CLOSED' in store2.info() assert not store2.is_open # ops on a closed store @@ -4784,9 +4786,10 @@ def test_categorical(self): tm.assert_frame_equal(result, df2) # Make sure the metadata is OK - assert '/df2 ' in str(store) - assert '/df2/meta/values_block_0/meta' in str(store) - assert '/df2/meta/values_block_1/meta' in str(store) + info = store.info() + assert '/df2 ' in info + assert '/df2/meta/values_block_0/meta' in info + assert '/df2/meta/values_block_1/meta' in info # unordered s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[ From 
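The practical effect of PATCH 01 is easiest to see in use: the string representation stays cheap no matter how many nodes the file holds, while the full per-node listing now requires an explicit call. A minimal sketch of the new split, assuming a writable scratch file (the file name and frame contents are illustrative only):

    import pandas as pd

    store = pd.HDFStore('scratch.h5')
    store['df'] = pd.DataFrame({'a': [1, 2, 3]})

    # repr/str now report only the class and file path, with no walk
    # over the stored keys:
    print(store)

    # The old detailed output, one row per stored node, moved to info():
    print(store.info())

    store.close()
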
c8af4cf79fcdbb02e009f63f95a93b07af3dee87 Mon Sep 17 00:00:00 2001 From: Christoph Moehl Date: Fri, 26 May 2017 16:43:12 +0200 Subject: [PATCH 02/55] ENH: added margins_name parameter for crosstab (#16489) * ENH #15972 added margins_name parameter for crosstab * ENH 15972 minor changes as suggested by reviewers * ENH 15972 correction in whatsnew * ENH 15972 style changes in whatsnew --- doc/source/whatsnew/v0.20.0.txt | 1 - doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/reshape/pivot.py | 28 ++++++++++++++-------- pandas/tests/reshape/test_pivot.py | 37 ++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index a0bf2f9b3758a..9d475390175b2 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -515,7 +515,6 @@ Other Enhancements - Options added to allow one to turn on/off using ``bottleneck`` and ``numexpr``, see :ref:`here ` (:issue:`16157`) - ``DataFrame.style.bar()`` now accepts two more options to further customize the bar chart. Bar alignment is set with ``align='left'|'mid'|'zero'``, the default is "left", which is backward compatible; You can now pass a list of ``color=[color_negative, color_positive]``. (:issue:`14757`) - .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 246cf2e4bc8f6..2a38fad37584b 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -37,6 +37,7 @@ Other Enhancements - :func:`api.types.infer_dtype` now infers decimals. (:issue: `15690`) - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func: `Series.cip()` have gained an inplace argument. (:issue: `15388`) +- :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when margins=True. (:issue:`15972`) .. _whatsnew_0210.api_breaking: diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 74dbbfc00cb11..b562f8a32f5c9 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -388,7 +388,8 @@ def _convert_by(by): def crosstab(index, columns, values=None, rownames=None, colnames=None, - aggfunc=None, margins=False, dropna=True, normalize=False): + aggfunc=None, margins=False, margins_name='All', dropna=True, + normalize=False): """ Compute a simple cross-tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an @@ -411,6 +412,12 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, If passed, must match number of column arrays passed margins : boolean, default False Add row/column margins (subtotals) + margins_name : string, default 'All' + Name of the row / column that will contain the totals + when margins is True. + + .. 
versionadded:: 0.21.0 + dropna : boolean, default True Do not include columns whose entries are all NaN normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False @@ -490,23 +497,26 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, df = DataFrame(data) df['__dummy__'] = 0 table = df.pivot_table('__dummy__', index=rownames, columns=colnames, - aggfunc=len, margins=margins, dropna=dropna) + aggfunc=len, margins=margins, + margins_name=margins_name, dropna=dropna) table = table.fillna(0).astype(np.int64) else: data['__dummy__'] = values df = DataFrame(data) table = df.pivot_table('__dummy__', index=rownames, columns=colnames, - aggfunc=aggfunc, margins=margins, dropna=dropna) + aggfunc=aggfunc, margins=margins, + margins_name=margins_name, dropna=dropna) # Post-process if normalize is not False: - table = _normalize(table, normalize=normalize, margins=margins) + table = _normalize(table, normalize=normalize, margins=margins, + margins_name=margins_name) return table -def _normalize(table, normalize, margins): +def _normalize(table, normalize, margins, margins_name='All'): if not isinstance(normalize, bool) and not isinstance(normalize, compat.string_types): @@ -537,9 +547,9 @@ def _normalize(table, normalize, margins): elif margins is True: - column_margin = table.loc[:, 'All'].drop('All') - index_margin = table.loc['All', :].drop('All') - table = table.drop('All', axis=1).drop('All') + column_margin = table.loc[:, margins_name].drop(margins_name) + index_margin = table.loc[margins_name, :].drop(margins_name) + table = table.drop(margins_name, axis=1).drop(margins_name) # to keep index and columns names table_index_names = table.index.names table_columns_names = table.columns.names @@ -561,7 +571,7 @@ def _normalize(table, normalize, margins): elif normalize == "all" or normalize is True: column_margin = column_margin / column_margin.sum() index_margin = index_margin / index_margin.sum() - index_margin.loc['All'] = 1 + index_margin.loc[margins_name] = 1 table = concat([table, column_margin], axis=1) table = table.append(index_margin) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 270a93e4ae382..fc5a2eb468d4f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1071,6 +1071,43 @@ def test_crosstab_margins(self): exp_rows = exp_rows.fillna(0).astype(np.int64) tm.assert_series_equal(all_rows, exp_rows) + def test_crosstab_margins_set_margin_name(self): + # GH 15972 + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + + df = DataFrame({'a': a, 'b': b, 'c': c}) + + result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), + margins=True, margins_name='TOTAL') + + assert result.index.names == ('a',) + assert result.columns.names == ['b', 'c'] + + all_cols = result['TOTAL', ''] + exp_cols = df.groupby(['a']).size().astype('i8') + # to keep index.name + exp_margin = Series([len(df)], index=Index(['TOTAL'], name='a')) + exp_cols = exp_cols.append(exp_margin) + exp_cols.name = ('TOTAL', '') + + tm.assert_series_equal(all_cols, exp_cols) + + all_rows = result.loc['TOTAL'] + exp_rows = df.groupby(['b', 'c']).size().astype('i8') + exp_rows = exp_rows.append(Series([len(df)], index=[('TOTAL', '')])) + exp_rows.name = 'TOTAL' + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + for margins_name in [666, None, ['a', 
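In use, the new keyword only relabels the totals row and column that ``margins=True`` adds; the rest of the table, including normalization, is unchanged. A small sketch (the input data is illustrative):

    import pandas as pd

    df = pd.DataFrame({'a': ['x', 'x', 'y'], 'b': [1, 2, 2]})

    # The totals row/column is labelled 'All' by default:
    pd.crosstab(df['a'], df['b'], margins=True)

    # New in 0.21.0: give it another name; non-string values are
    # rejected with ValueError, as the surrounding test asserts:
    pd.crosstab(df['a'], df['b'], margins=True, margins_name='TOTAL')
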
'b']]: + with pytest.raises(ValueError): + crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), + margins=True, margins_name=margins_name) + def test_crosstab_pass_values(self): a = np.random.randint(0, 7, size=100) b = np.random.randint(0, 3, size=100) From 840de2fffad125c07bf787f8d917f115545e5a46 Mon Sep 17 00:00:00 2001 From: Aaron Barber Date: Fri, 26 May 2017 12:11:55 -0700 Subject: [PATCH 03/55] TST: ujson tests are not being run (#16499) (#16500) closes #16499 --- pandas/tests/io/json/test_ujson.py | 44 +++++++++++++++--------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 662f06dbb725e..76fb6d442a25a 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -27,7 +27,7 @@ else partial(json.dumps, encoding="utf-8")) -class UltraJSONTests(object): +class TestUltraJSONTests(object): @pytest.mark.skipif(compat.is_platform_32bit(), reason="not compliant on 32-bit, xref #15865") @@ -944,19 +944,19 @@ def my_obj_handler(obj): ujson.decode(ujson.encode(l, default_handler=str))) -class NumpyJSONTests(object): +class TestNumpyJSONTests(object): - def testBool(self): + def test_Bool(self): b = np.bool(True) assert ujson.decode(ujson.encode(b)) == b - def testBoolArray(self): + def test_BoolArray(self): inpt = np.array([True, False, True, True, False, True, False, False], dtype=np.bool) outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=np.bool) tm.assert_numpy_array_equal(inpt, outp) - def testInt(self): + def test_Int(self): num = np.int(2562010) assert np.int(ujson.decode(ujson.encode(num))) == num @@ -984,7 +984,7 @@ def testInt(self): num = np.uint64(2562010) assert np.uint64(ujson.decode(ujson.encode(num))) == num - def testIntArray(self): + def test_IntArray(self): arr = np.arange(100, dtype=np.int) dtypes = (np.int, np.int8, np.int16, np.int32, np.int64, np.uint, np.uint8, np.uint16, np.uint32, np.uint64) @@ -993,7 +993,7 @@ def testIntArray(self): outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=dtype) tm.assert_numpy_array_equal(inpt, outp) - def testIntMax(self): + def test_IntMax(self): num = np.int(np.iinfo(np.int).max) assert np.int(ujson.decode(ujson.encode(num))) == num @@ -1023,7 +1023,7 @@ def testIntMax(self): num = np.uint64(np.iinfo(np.int64).max) assert np.uint64(ujson.decode(ujson.encode(num))) == num - def testFloat(self): + def test_Float(self): num = np.float(256.2013) assert np.float(ujson.decode(ujson.encode(num))) == num @@ -1033,7 +1033,7 @@ def testFloat(self): num = np.float64(256.2013) assert np.float64(ujson.decode(ujson.encode(num))) == num - def testFloatArray(self): + def test_FloatArray(self): arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float) dtypes = (np.float, np.float32, np.float64) @@ -1043,7 +1043,7 @@ def testFloatArray(self): inpt, double_precision=15)), dtype=dtype) tm.assert_almost_equal(inpt, outp) - def testFloatMax(self): + def test_FloatMax(self): num = np.float(np.finfo(np.float).max / 10) tm.assert_almost_equal(np.float(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) @@ -1056,7 +1056,7 @@ def testFloatMax(self): tm.assert_almost_equal(np.float64(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) - def testArrays(self): + def test_Arrays(self): arr = np.arange(100) arr = arr.reshape((10, 10)) @@ -1097,13 +1097,13 @@ def testArrays(self): outp = ujson.decode(ujson.encode(arr), numpy=True, dtype=np.float32) tm.assert_almost_equal(arr, outp) - def 
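These renames are mechanical, but they are the whole fix for #16499: by default pytest only collects test classes whose names match ``Test*``, so classes named in the ``UltraJSONTests`` style were silently skipped. A toy illustration of the collection rule (the class and method names here are hypothetical):

    # Collected: the class name matches pytest's default ``Test*`` pattern.
    class TestUltraJSON(object):
        def test_roundtrip(self):
            assert 1 + 1 == 2

    # Not collected: the name does not start with ``Test``, so every test
    # inside is skipped without any error or warning.
    class UltraJSONTests(object):
        def test_roundtrip(self):
            assert 1 + 1 == 2
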
testOdArray(self): + def test_OdArray(self): def will_raise(): ujson.encode(np.array(1)) pytest.raises(TypeError, will_raise) - def testArrayNumpyExcept(self): + def test_ArrayNumpyExcept(self): input = ujson.dumps([42, {}, 'a']) try: @@ -1186,7 +1186,7 @@ def testArrayNumpyExcept(self): except: assert False, "Wrong exception" - def testArrayNumpyLabelled(self): + def test_ArrayNumpyLabelled(self): input = {'a': []} output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) assert (np.empty((1, 0)) == output[0]).all() @@ -1220,9 +1220,9 @@ def testArrayNumpyLabelled(self): assert (np.array(['a', 'b']) == output[2]).all() -class PandasJSONTests(object): +class TestPandasJSONTests(object): - def testDataFrame(self): + def test_DataFrame(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1252,7 +1252,7 @@ def testDataFrame(self): tm.assert_index_equal(df.transpose().columns, outp.columns) tm.assert_index_equal(df.transpose().index, outp.index) - def testDataFrameNumpy(self): + def test_DataFrameNumpy(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1275,7 +1275,7 @@ def testDataFrameNumpy(self): tm.assert_index_equal(df.transpose().columns, outp.columns) tm.assert_index_equal(df.transpose().index, outp.index) - def testDataFrameNested(self): + def test_DataFrameNested(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1301,7 +1301,7 @@ def testDataFrameNested(self): 'df2': ujson.decode(ujson.encode(df, orient="split"))} assert ujson.decode(ujson.encode(nested, orient="split")) == exp - def testDataFrameNumpyLabelled(self): + def test_DataFrameNumpyLabelled(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1324,7 +1324,7 @@ def testDataFrameNumpyLabelled(self): tm.assert_index_equal(df.columns, outp.columns) tm.assert_index_equal(df.index, outp.index) - def testSeries(self): + def test_Series(self): s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15]).sort_values() @@ -1372,7 +1372,7 @@ def testSeries(self): s, orient="index"), numpy=True)).sort_values() tm.assert_series_equal(outp, exp) - def testSeriesNested(self): + def test_SeriesNested(self): s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15]).sort_values() @@ -1398,7 +1398,7 @@ def testSeriesNested(self): 's2': ujson.decode(ujson.encode(s, orient="index"))} assert ujson.decode(ujson.encode(nested, orient="index")) == exp - def testIndex(self): + def test_Index(self): i = Index([23, 45, 18, 98, 43, 11], name="index") # column indexed From 4ed801b1e0c8bace0365a488b3d1692462966145 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 26 May 2017 21:35:11 -0400 Subject: [PATCH 04/55] DOC: Remove preference for pytest paradigm in assert_raises_regex (#16518) --- pandas/util/testing.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index f987045c27d5f..17e09b38b20e0 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -2424,15 +2424,8 @@ def assert_raises_regex(_exception, _regexp, _callable=None, Check that the specified Exception is raised and that the error message matches a given regular expression pattern. This may be a regular expression object or a string containing a regular expression suitable - for use by `re.search()`. - - This is a port of the `assertRaisesRegexp` function from unittest in - Python 2.7. 
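The helper itself is unchanged and keeps working both as a context manager and as a callable wrapper; only the recommendation against it is dropped. A minimal usage sketch:

    import pandas.util.testing as tm

    # Context-manager form: passes because the raised message matches
    # the pattern via re.search().
    with tm.assert_raises_regex(ValueError, 'invalid literal'):
        int('not a number')

    # Callable form: the callable and its arguments are passed through.
    tm.assert_raises_regex(ValueError, 'invalid literal',
                           int, 'not a number')
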
However, with our migration to `pytest`, please refrain - from using this. Instead, use the following paradigm: - - with pytest.raises(_exception) as exc_info: - func(*args, **kwargs) - exc_info.matches(reg_exp) + for use by `re.search()`. This is a port of the `assertRaisesRegexp` + function from unittest in Python 2.7. Examples -------- From c570eafff06c1518ea59da74f97908d9d5135c07 Mon Sep 17 00:00:00 2001 From: "John W. O'Brien" Date: Mon, 29 May 2017 12:00:42 -0400 Subject: [PATCH 05/55] TST: Specify HTML file encoding on PY3 (#16526) --- pandas/tests/io/test_html.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 6da77bf423609..1e1d653cf94d1 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -20,7 +20,7 @@ from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) from pandas.compat import (map, zip, StringIO, string_types, BytesIO, - is_platform_windows) + is_platform_windows, PY3) from pandas.io.common import URLError, urlopen, file_path_to_url from pandas.io.html import read_html from pandas._libs.parsers import ParserError @@ -96,6 +96,9 @@ def read_html(self, *args, **kwargs): class TestReadHtml(ReadHtmlMixin): flavor = 'bs4' spam_data = os.path.join(DATA_PATH, 'spam.html') + spam_data_kwargs = {} + if PY3: + spam_data_kwargs['encoding'] = 'UTF-8' banklist_data = os.path.join(DATA_PATH, 'banklist.html') @classmethod @@ -247,10 +250,10 @@ def test_infer_types(self): assert_framelist_equal(df1, df2) def test_string_io(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data1 = StringIO(f.read()) - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data2 = StringIO(f.read()) df1 = self.read_html(data1, '.*Water.*') @@ -258,7 +261,7 @@ def test_string_io(self): assert_framelist_equal(df1, df2) def test_string(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data = f.read() df1 = self.read_html(data, '.*Water.*') @@ -267,10 +270,10 @@ def test_string(self): assert_framelist_equal(df1, df2) def test_file_like(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: df1 = self.read_html(f, '.*Water.*') - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: df2 = self.read_html(f, 'Unit') assert_framelist_equal(df1, df2) From 44d2a1232c21b0426b0ff9f866fd65e2cda71250 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 May 2017 17:52:55 -0500 Subject: [PATCH 06/55] BUG: Fixed tput output on windows (#16496) --- doc/source/whatsnew/v0.20.2.txt | 2 ++ pandas/io/formats/terminal.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 13365401f1d1c..7b7f9e8745809 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -37,6 +37,8 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Silenced a warning on some Windows environments about "tput: terminal attributes: No such device or address" when + detecting the terminal size. 
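The mechanism of the fix: on Python 3, pandas now defers terminal-size detection to the standard library instead of shelling out to ``tput``, which is what produced the stray warning on some Windows environments. The stdlib call it delegates to:

    import shutil

    # Reads the COLUMNS/LINES environment variables if set, otherwise
    # queries the OS via os.get_terminal_size(); no subprocess involved.
    columns, lines = shutil.get_terminal_size()
    print(columns, lines)
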
This fix only applies to python 3 (:issue:`16496`) - Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`) - Bug in ``DataFrame.update()`` with ``overwrite=False`` and ``NaN values`` (:issue:`15593`) diff --git a/pandas/io/formats/terminal.py b/pandas/io/formats/terminal.py index dadd09ae74ea4..30bd1d16b538a 100644 --- a/pandas/io/formats/terminal.py +++ b/pandas/io/formats/terminal.py @@ -14,6 +14,8 @@ from __future__ import print_function import os +import sys +import shutil __all__ = ['get_terminal_size'] @@ -26,6 +28,10 @@ def get_terminal_size(): IPython zmq frontends, or IDLE do not run in a terminal, """ import platform + + if sys.version_info[0] >= 3: + return shutil.get_terminal_size() + current_os = platform.system() tuple_xy = None if current_os == 'Windows': From 1a9cb5bcfa98876687a4001987e420b06db06aa5 Mon Sep 17 00:00:00 2001 From: keitakurita Date: Wed, 31 May 2017 08:12:50 +0900 Subject: [PATCH 07/55] BUG: Incorrect handling of rolling.cov with offset window (#16244) --- doc/source/whatsnew/v0.20.2.txt | 1 + pandas/core/window.py | 9 ++++++++- pandas/tests/test_window.py | 23 +++++++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 7b7f9e8745809..90146aa176b31 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -82,6 +82,7 @@ Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug creating datetime rolling window on an empty DataFrame (:issue:`15819`) +- Bug in ``rolling.cov()`` with offset window (:issue:`16058`) Sparse diff --git a/pandas/core/window.py b/pandas/core/window.py index cf1bad706ae1d..ba7e79944ab0e 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -81,6 +81,7 @@ def __init__(self, obj, window=None, min_periods=None, freq=None, self.freq = freq self.center = center self.win_type = win_type + self.win_freq = None self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() @@ -996,7 +997,12 @@ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): # only default unset pairwise = True if pairwise is None else pairwise other = self._shallow_copy(other) - window = self._get_window(other) + + # GH 16058: offset window + if self.is_freq_type: + window = self.win_freq + else: + window = self._get_window(other) def _get_cov(X, Y): # GH #12373 : rolling functions error on float32 data @@ -1088,6 +1094,7 @@ def validate(self): "based windows") # this will raise ValueError on non-fixed freqs + self.win_freq = self.window self.window = freq.nanos self.win_type = 'freq' diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 6a640d62108b3..cbb3c345a9353 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -3833,3 +3833,26 @@ def test_non_monotonic(self): df2 = df.sort_values('B') result = df2.groupby('A').rolling('4s', on='B').C.mean() tm.assert_series_equal(result, expected) + + def test_rolling_cov_offset(self): + # GH16058 + + idx = pd.date_range('2017-01-01', periods=24, freq='1h') + ss = pd.Series(np.arange(len(idx)), index=idx) + + result = ss.rolling('2h').cov() + expected = pd.Series([np.nan] + [0.5 for _ in range(len(idx) - 1)], + index=idx) + tm.assert_series_equal(result, expected) + + expected2 = ss.rolling(2, min_periods=1).cov() + tm.assert_series_equal(result, expected2) + + result = ss.rolling('3h').cov() + expected = pd.Series([np.nan, 0.5] + + [1.0 for _ in range(len(idx) - 2)], + index=idx) + 
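Before this fix, an offset window such as ``'2h'`` was collapsed to its nanosecond count when ``cov()`` rebuilt the window from ``other``, producing wrong results; keeping the original frequency string in ``win_freq`` makes the offset and count forms agree. A sketch mirroring the new test:

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2017-01-01', periods=24, freq='1h')
    ss = pd.Series(np.arange(len(idx)), index=idx)

    # The offset-based window now matches the equivalent count-based one:
    pd.testing.assert_series_equal(ss.rolling('2h').cov(),
                                   ss.rolling(2, min_periods=1).cov())
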
tm.assert_series_equal(result, expected) + + expected2 = ss.rolling(3, min_periods=1).cov() + tm.assert_series_equal(result, expected2) From 0a9f54848ec03508d85321dcf9563aac0831c9e4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 May 2017 21:39:15 -0500 Subject: [PATCH 08/55] TST: Avoid global state in matplotlib tests (#16539) Replaces most uses of implicit global state from matplotlib in test_datetimelike.py. This was potentially causing random failures where a figure expected to be on a new, blank figure would instead plot on an existing axes (that's the guess at least). --- pandas/tests/plotting/test_datetimelike.py | 379 +++++++++++---------- pandas/tests/plotting/test_series.py | 159 ++++++--- 2 files changed, 301 insertions(+), 237 deletions(-) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 0e15aaa2555f4..0cff365be3ec8 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -55,16 +55,15 @@ def test_ts_plot_with_tz(self): def test_fontsize_set_correctly(self): # For issue #8765 - import matplotlib.pyplot as plt # noqa df = DataFrame(np.random.randn(10, 9), index=range(10)) - ax = df.plot(fontsize=2) + fig, ax = self.plt.subplots() + df.plot(fontsize=2, ax=ax) for label in (ax.get_xticklabels() + ax.get_yticklabels()): assert label.get_fontsize() == 2 @slow def test_frame_inferred(self): # inferred freq - import matplotlib.pyplot as plt # noqa idx = date_range('1/1/1987', freq='MS', periods=100) idx = DatetimeIndex(idx.values, freq=None) @@ -90,26 +89,24 @@ def test_is_error_nozeroindex(self): _check_plot_works(a.plot, yerr=a) def test_nonnumeric_exclude(self): - import matplotlib.pyplot as plt - idx = date_range('1/1/1987', freq='A', periods=3) df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]}, idx) - ax = df.plot() # it works + fig, ax = self.plt.subplots() + df.plot(ax=ax) # it works assert len(ax.get_lines()) == 1 # B was plotted - plt.close(plt.gcf()) + self.plt.close(fig) pytest.raises(TypeError, df['A'].plot) @slow def test_tsplot(self): from pandas.tseries.plotting import tsplot - import matplotlib.pyplot as plt - ax = plt.gca() + _, ax = self.plt.subplots() ts = tm.makeTimeSeries() - f = lambda *args, **kwds: tsplot(s, plt.Axes.plot, *args, **kwds) + f = lambda *args, **kwds: tsplot(s, self.plt.Axes.plot, *args, **kwds) for s in self.period_ser: _check_plot_works(f, s.index.freq, ax=ax, series=s) @@ -123,12 +120,12 @@ def test_tsplot(self): for s in self.datetime_ser: _check_plot_works(s.plot, ax=ax) - ax = ts.plot(style='k') + _, ax = self.plt.subplots() + ts.plot(style='k', ax=ax) color = (0., 0., 0., 1) if self.mpl_ge_2_0_0 else (0., 0., 0.) 
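The refactoring pattern applied throughout the two test modules is the same in every hunk: instead of letting ``plot()`` draw on whatever axes pyplot currently considers active, each test creates its figure and axes explicitly and threads ``ax`` through. A stripped-down sketch of the before and after (the Series is illustrative):

    import matplotlib.pyplot as plt
    import pandas as pd

    s = pd.Series(range(5))

    # Fragile: draws on the implicit "current" axes, so a figure leaked
    # by an earlier test can change what this plots onto.
    s.plot()

    # Isolated: the axes are created locally and passed in explicitly,
    # then closed, leaving no global state behind.
    fig, ax = plt.subplots()
    s.plot(ax=ax)
    plt.close(fig)
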
assert color == ax.get_lines()[0].get_color() def test_both_style_and_color(self): - import matplotlib.pyplot as plt # noqa ts = tm.makeTimeSeries() pytest.raises(ValueError, ts.plot, style='b-', color='#000099') @@ -140,9 +137,10 @@ def test_both_style_and_color(self): def test_high_freq(self): freaks = ['ms', 'us'] for freq in freaks: + _, ax = self.plt.subplots() rng = date_range('1/1/2012', periods=100000, freq=freq) ser = Series(np.random.randn(len(rng)), rng) - _check_plot_works(ser.plot) + _check_plot_works(ser.plot, ax=ax) def test_get_datevalue(self): from pandas.plotting._converter import get_datevalue @@ -167,22 +165,25 @@ def check_format_of_first_point(ax, expected_string): annual = Series(1, index=date_range('2014-01-01', periods=3, freq='A-DEC')) - check_format_of_first_point(annual.plot(), 't = 2014 y = 1.000000') + _, ax = self.plt.subplots() + annual.plot(ax=ax) + check_format_of_first_point(ax, 't = 2014 y = 1.000000') # note this is added to the annual plot already in existence, and # changes its freq field daily = Series(1, index=date_range('2014-01-01', periods=3, freq='D')) - check_format_of_first_point(daily.plot(), + daily.plot(ax=ax) + check_format_of_first_point(ax, 't = 2014-01-01 y = 1.000000') tm.close() # tsplot - import matplotlib.pyplot as plt + _, ax = self.plt.subplots() from pandas.tseries.plotting import tsplot - tsplot(annual, plt.Axes.plot) - check_format_of_first_point(plt.gca(), 't = 2014 y = 1.000000') - tsplot(daily, plt.Axes.plot) - check_format_of_first_point(plt.gca(), 't = 2014-01-01 y = 1.000000') + tsplot(annual, self.plt.Axes.plot, ax=ax) + check_format_of_first_point(ax, 't = 2014 y = 1.000000') + tsplot(daily, self.plt.Axes.plot, ax=ax) + check_format_of_first_point(ax, 't = 2014-01-01 y = 1.000000') @slow def test_line_plot_period_series(self): @@ -215,14 +216,11 @@ def test_line_plot_inferred_freq(self): _check_plot_works(ser.plot) def test_fake_inferred_business(self): - import matplotlib.pyplot as plt - fig = plt.gcf() - plt.clf() - fig.add_subplot(111) + _, ax = self.plt.subplots() rng = date_range('2001-1-1', '2001-1-10') ts = Series(lrange(len(rng)), rng) ts = ts[:3].append(ts[5:]) - ax = ts.plot() + ts.plot(ax=ax) assert not hasattr(ax, 'freq') @slow @@ -244,15 +242,11 @@ def test_plot_multiple_inferred_freq(self): @slow def test_uhf(self): import pandas.plotting._converter as conv - import matplotlib.pyplot as plt - fig = plt.gcf() - plt.clf() - fig.add_subplot(111) - idx = date_range('2012-6-22 21:59:51.960928', freq='L', periods=500) df = DataFrame(np.random.randn(len(idx), 2), idx) - ax = df.plot() + _, ax = self.plt.subplots() + df.plot(ax=ax) axis = ax.get_xaxis() tlocs = axis.get_ticklocs() @@ -265,49 +259,40 @@ def test_uhf(self): @slow def test_irreg_hf(self): - import matplotlib.pyplot as plt - fig = plt.gcf() - plt.clf() - fig.add_subplot(111) - idx = date_range('2012-6-22 21:59:51', freq='S', periods=100) df = DataFrame(np.random.randn(len(idx), 2), idx) irreg = df.iloc[[0, 1, 3, 4]] - ax = irreg.plot() + _, ax = self.plt.subplots() + irreg.plot(ax=ax) diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() sec = 1. 
/ 24 / 60 / 60 assert (np.fabs(diffs[1:] - [sec, sec * 2, sec]) < 1e-8).all() - plt.clf() - fig.add_subplot(111) + _, ax = self.plt.subplots() df2 = df.copy() df2.index = df.index.asobject - ax = df2.plot() + df2.plot(ax=ax) diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() assert (np.fabs(diffs[1:] - sec) < 1e-8).all() def test_irregular_datetime64_repr_bug(self): - import matplotlib.pyplot as plt ser = tm.makeTimeSeries() ser = ser[[0, 1, 2, 7]] - fig = plt.gcf() - plt.clf() + _, ax = self.plt.subplots() - ax = fig.add_subplot(211) - - ret = ser.plot() + ret = ser.plot(ax=ax) assert ret is not None for rs, xp in zip(ax.get_lines()[0].get_xdata(), ser.index): assert rs == xp def test_business_freq(self): - import matplotlib.pyplot as plt # noqa bts = tm.makePeriodSeries() - ax = bts.plot() + _, ax = self.plt.subplots() + bts.plot(ax=ax) assert ax.get_lines()[0].get_xydata()[0, 0] == bts.index[0].ordinal idx = ax.get_lines()[0].get_xdata() assert PeriodIndex(data=idx).freqstr == 'B' @@ -319,7 +304,8 @@ def test_business_freq_convert(self): bts = tm.makeTimeSeries().asfreq('BM') tm.N = n ts = bts.to_period('M') - ax = bts.plot() + _, ax = self.plt.subplots() + bts.plot(ax=ax) assert ax.get_lines()[0].get_xydata()[0, 0] == ts.index[0].ordinal idx = ax.get_lines()[0].get_xdata() assert PeriodIndex(data=idx).freqstr == 'M' @@ -329,19 +315,20 @@ def test_nonzero_base(self): idx = (date_range('2012-12-20', periods=24, freq='H') + timedelta( minutes=30)) df = DataFrame(np.arange(24), index=idx) - ax = df.plot() + _, ax = self.plt.subplots() + df.plot(ax=ax) rs = ax.get_lines()[0].get_xdata() assert not Index(rs).is_normalized def test_dataframe(self): bts = DataFrame({'a': tm.makeTimeSeries()}) - ax = bts.plot() + _, ax = self.plt.subplots() + bts.plot(ax=ax) idx = ax.get_lines()[0].get_xdata() tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx)) @slow def test_axis_limits(self): - import matplotlib.pyplot as plt def _test(ax): xlim = ax.get_xlim() @@ -369,14 +356,16 @@ def _test(ax): assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal fig = ax.get_figure() - plt.close(fig) + self.plt.close(fig) ser = tm.makeTimeSeries() - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) _test(ax) + _, ax = self.plt.subplots() df = DataFrame({'a': ser, 'b': ser + 1}) - ax = df.plot() + df.plot(ax=ax) _test(ax) df = DataFrame({'a': ser, 'b': ser + 1}) @@ -397,13 +386,13 @@ def test_get_finder(self): @slow def test_finder_daily(self): - import matplotlib.pyplot as plt xp = Period('1999-1-1', freq='B').ordinal day_lst = [10, 40, 252, 400, 950, 2750, 10000] for n in day_lst: rng = bdate_range('1999-1-1', periods=n) ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] assert xp == rs @@ -411,17 +400,17 @@ def test_finder_daily(self): ax.set_xlim(vmin + 0.9, vmax) rs = xaxis.get_majorticklocs()[0] assert xp == rs - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) @slow def test_finder_quarterly(self): - import matplotlib.pyplot as plt xp = Period('1988Q1').ordinal yrs = [3.5, 11] for n in yrs: rng = period_range('1987Q2', periods=int(n * 4), freq='Q') ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] assert rs == xp @@ -429,17 +418,17 @@ def test_finder_quarterly(self): ax.set_xlim(vmin + 0.9, vmax) rs = 
xaxis.get_majorticklocs()[0] assert xp == rs - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) @slow def test_finder_monthly(self): - import matplotlib.pyplot as plt xp = Period('Jan 1988').ordinal yrs = [1.15, 2.5, 4, 11] for n in yrs: rng = period_range('1987Q2', periods=int(n * 12), freq='M') ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] assert rs == xp @@ -447,12 +436,13 @@ def test_finder_monthly(self): ax.set_xlim(vmin + 0.9, vmax) rs = xaxis.get_majorticklocs()[0] assert xp == rs - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) def test_finder_monthly_long(self): rng = period_range('1988Q1', periods=24 * 12, freq='M') ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] xp = Period('1989Q1', 'M').ordinal @@ -460,23 +450,24 @@ def test_finder_monthly_long(self): @slow def test_finder_annual(self): - import matplotlib.pyplot as plt xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): rng = period_range('1987', periods=nyears, freq='A') ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] assert rs == Period(xp[i], freq='A').ordinal - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) @slow def test_finder_minutely(self): nminutes = 50 * 24 * 60 rng = date_range('1/1/1999', freq='Min', periods=nminutes) ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] xp = Period('1/1/1999', freq='Min').ordinal @@ -486,7 +477,8 @@ def test_finder_hourly(self): nhours = 23 rng = date_range('1/1/1999', freq='H', periods=nhours) ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] xp = Period('1/1/1999', freq='H').ordinal @@ -494,11 +486,10 @@ def test_finder_hourly(self): @slow def test_gaps(self): - import matplotlib.pyplot as plt - ts = tm.makeTimeSeries() ts[5:25] = np.nan - ax = ts.plot() + _, ax = self.plt.subplots() + ts.plot(ax=ax) lines = ax.get_lines() tm._skip_if_mpl_1_5() assert len(lines) == 1 @@ -507,13 +498,14 @@ def test_gaps(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask assert mask[5:25, 1].all() - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) # irregular ts = tm.makeTimeSeries() ts = ts[[0, 1, 2, 5, 7, 9, 12, 15, 20]] ts[2:5] = np.nan - ax = ts.plot() + _, ax = self.plt.subplots() + ax = ts.plot(ax=ax) lines = ax.get_lines() assert len(lines) == 1 l = lines[0] @@ -521,13 +513,14 @@ def test_gaps(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask assert mask[2:5, 1].all() - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) # non-ts idx = [0, 1, 2, 5, 7, 9, 12, 15, 20] ser = Series(np.random.randn(len(idx)), idx) ser[2:5] = np.nan - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) lines = ax.get_lines() assert len(lines) == 1 l = lines[0] @@ -540,7 +533,8 @@ def test_gaps(self): def test_gap_upsample(self): low = tm.makeTimeSeries() low[5:25] = np.nan - ax = low.plot() + _, ax = self.plt.subplots() + low.plot(ax=ax) idxh = 
date_range(low.index[0], low.index[-1], freq='12h') s = Series(np.random.randn(len(idxh)), idxh) @@ -559,26 +553,25 @@ def test_gap_upsample(self): @slow def test_secondary_y(self): - import matplotlib.pyplot as plt - ser = Series(np.random.randn(10)) ser2 = Series(np.random.randn(10)) + fig, _ = self.plt.subplots() ax = ser.plot(secondary_y=True) assert hasattr(ax, 'left_ax') assert not hasattr(ax, 'right_ax') - fig = ax.get_figure() axes = fig.get_axes() l = ax.get_lines()[0] xp = Series(l.get_ydata(), l.get_xdata()) assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == 'right' assert not axes[0].get_yaxis().get_visible() - plt.close(fig) + self.plt.close(fig) - ax2 = ser2.plot() + _, ax2 = self.plt.subplots() + ser2.plot(ax=ax2) assert (ax2.get_yaxis().get_ticks_position() == self.default_tick_position) - plt.close(ax2.get_figure()) + self.plt.close(ax2.get_figure()) ax = ser2.plot() ax2 = ser.plot(secondary_y=True) @@ -590,26 +583,26 @@ def test_secondary_y(self): @slow def test_secondary_y_ts(self): - import matplotlib.pyplot as plt idx = date_range('1/1/2000', periods=10) ser = Series(np.random.randn(10), idx) ser2 = Series(np.random.randn(10), idx) + fig, _ = self.plt.subplots() ax = ser.plot(secondary_y=True) assert hasattr(ax, 'left_ax') assert not hasattr(ax, 'right_ax') - fig = ax.get_figure() axes = fig.get_axes() l = ax.get_lines()[0] xp = Series(l.get_ydata(), l.get_xdata()).to_timestamp() assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == 'right' assert not axes[0].get_yaxis().get_visible() - plt.close(fig) + self.plt.close(fig) - ax2 = ser2.plot() + _, ax2 = self.plt.subplots() + ser2.plot(ax=ax2) assert (ax2.get_yaxis().get_ticks_position() == self.default_tick_position) - plt.close(ax2.get_figure()) + self.plt.close(ax2.get_figure()) ax = ser2.plot() ax2 = ser.plot(secondary_y=True) @@ -620,20 +613,19 @@ def test_secondary_kde(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() - import matplotlib.pyplot as plt # noqa ser = Series(np.random.randn(10)) - ax = ser.plot(secondary_y=True, kind='density') + fig, ax = self.plt.subplots() + ax = ser.plot(secondary_y=True, kind='density', ax=ax) assert hasattr(ax, 'left_ax') assert not hasattr(ax, 'right_ax') - fig = ax.get_figure() axes = fig.get_axes() assert axes[1].get_yaxis().get_ticks_position() == 'right' @slow def test_secondary_bar(self): ser = Series(np.random.randn(10)) - ax = ser.plot(secondary_y=True, kind='bar') - fig = ax.get_figure() + fig, ax = self.plt.subplots() + ser.plot(secondary_y=True, kind='bar', ax=ax) axes = fig.get_axes() assert axes[1].get_yaxis().get_ticks_position() == 'right' @@ -656,7 +648,7 @@ def test_secondary_bar_frame(self): assert axes[2].get_yaxis().get_ticks_position() == 'right' def test_mixed_freq_regular_first(self): - import matplotlib.pyplot as plt # noqa + # TODO s1 = tm.makeTimeSeries() s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] @@ -676,11 +668,11 @@ def test_mixed_freq_regular_first(self): @slow def test_mixed_freq_irregular_first(self): - import matplotlib.pyplot as plt # noqa s1 = tm.makeTimeSeries() s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] - s2.plot(style='g') - ax = s1.plot() + _, ax = self.plt.subplots() + s2.plot(style='g', ax=ax) + s1.plot(ax=ax) assert not hasattr(ax, 'freq') lines = ax.get_lines() x1 = lines[0].get_xdata() @@ -690,10 +682,10 @@ def test_mixed_freq_irregular_first(self): def test_mixed_freq_regular_first_df(self): # GH 9852 - import matplotlib.pyplot as plt # noqa s1 = tm.makeTimeSeries().to_frame() s2 = 
s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] - ax = s1.plot() + _, ax = self.plt.subplots() + s1.plot(ax=ax) ax2 = s2.plot(style='g', ax=ax) lines = ax2.get_lines() idx1 = PeriodIndex(lines[0].get_xdata()) @@ -708,11 +700,11 @@ def test_mixed_freq_regular_first_df(self): @slow def test_mixed_freq_irregular_first_df(self): # GH 9852 - import matplotlib.pyplot as plt # noqa s1 = tm.makeTimeSeries().to_frame() s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] - ax = s2.plot(style='g') - ax = s1.plot(ax=ax) + _, ax = self.plt.subplots() + s2.plot(style='g', ax=ax) + s1.plot(ax=ax) assert not hasattr(ax, 'freq') lines = ax.get_lines() x1 = lines[0].get_xdata() @@ -725,8 +717,9 @@ def test_mixed_freq_hf_first(self): idxl = date_range('1/1/1999', periods=12, freq='M') high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - high.plot() - ax = low.plot() + _, ax = self.plt.subplots() + high.plot(ax=ax) + low.plot(ax=ax) for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'D' @@ -738,33 +731,35 @@ def test_mixed_freq_alignment(self): ts = Series(ts_data, index=ts_ind) ts2 = ts.asfreq('T').interpolate() - ax = ts.plot() - ts2.plot(style='r') + _, ax = self.plt.subplots() + ax = ts.plot(ax=ax) + ts2.plot(style='r', ax=ax) assert ax.lines[0].get_xdata()[0] == ax.lines[1].get_xdata()[0] @slow def test_mixed_freq_lf_first(self): - import matplotlib.pyplot as plt idxh = date_range('1/1/1999', periods=365, freq='D') idxl = date_range('1/1/1999', periods=12, freq='M') high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - low.plot(legend=True) - ax = high.plot(legend=True) + _, ax = self.plt.subplots() + low.plot(legend=True, ax=ax) + high.plot(legend=True, ax=ax) for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'D' leg = ax.get_legend() assert len(leg.texts) == 2 - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) idxh = date_range('1/1/1999', periods=240, freq='T') idxl = date_range('1/1/1999', periods=4, freq='H') high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - low.plot() - ax = high.plot() + _, ax = self.plt.subplots() + low.plot(ax=ax) + high.plot(ax=ax) for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'T' @@ -773,8 +768,9 @@ def test_mixed_freq_irreg_period(self): irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]] rng = period_range('1/3/2000', periods=30, freq='B') ps = Series(np.random.randn(len(rng)), rng) - irreg.plot() - ps.plot() + _, ax = self.plt.subplots() + irreg.plot(ax=ax) + ps.plot(ax=ax) def test_mixed_freq_shared_ax(self): @@ -813,9 +809,7 @@ def test_mixed_freq_shared_ax(self): def test_nat_handling(self): - fig = self.plt.gcf() - # self.plt.clf() - ax = fig.add_subplot(111) + _, ax = self.plt.subplots() dti = DatetimeIndex(['2015-01-01', NaT, '2015-01-03']) s = Series(range(len(dti)), dti) @@ -831,17 +825,18 @@ def test_to_weekly_resampling(self): idxl = date_range('1/1/1999', periods=12, freq='M') high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - high.plot() - ax = low.plot() + _, ax = self.plt.subplots() + high.plot(ax=ax) + low.plot(ax=ax) for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq # tsplot from pandas.tseries.plotting import tsplot - import matplotlib.pyplot as plt - tsplot(high, plt.Axes.plot) - lines = tsplot(low, plt.Axes.plot) + _, ax = self.plt.subplots() + tsplot(high, 
self.plt.Axes.plot, ax=ax) + lines = tsplot(low, self.plt.Axes.plot, ax=ax) for l in lines: assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq @@ -851,8 +846,9 @@ def test_from_weekly_resampling(self): idxl = date_range('1/1/1999', periods=12, freq='M') high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - low.plot() - ax = high.plot() + _, ax = self.plt.subplots() + low.plot(ax=ax) + high.plot(ax=ax) expected_h = idxh.to_period().asi8.astype(np.float64) expected_l = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544, @@ -868,10 +864,10 @@ def test_from_weekly_resampling(self): # tsplot from pandas.tseries.plotting import tsplot - import matplotlib.pyplot as plt - tsplot(low, plt.Axes.plot) - lines = tsplot(high, plt.Axes.plot) + _, ax = self.plt.subplots() + tsplot(low, self.plt.Axes.plot, ax=ax) + lines = tsplot(high, self.plt.Axes.plot, ax=ax) for l in lines: assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq xdata = l.get_xdata(orig=False) @@ -891,8 +887,9 @@ def test_from_resampling_area_line_mixed(self): # low to high for kind1, kind2 in [('line', 'area'), ('area', 'line')]: - ax = low.plot(kind=kind1, stacked=True) - ax = high.plot(kind=kind2, stacked=True, ax=ax) + _, ax = self.plt.subplots() + low.plot(kind=kind1, stacked=True, ax=ax) + high.plot(kind=kind2, stacked=True, ax=ax) # check low dataframe result expected_x = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, @@ -923,8 +920,9 @@ def test_from_resampling_area_line_mixed(self): # high to low for kind1, kind2 in [('line', 'area'), ('area', 'line')]: - ax = high.plot(kind=kind1, stacked=True) - ax = low.plot(kind=kind2, stacked=True, ax=ax) + _, ax = self.plt.subplots() + high.plot(kind=kind1, stacked=True, ax=ax) + low.plot(kind=kind2, stacked=True, ax=ax) # check high dataframe result expected_x = idxh.to_period().asi8.astype(np.float64) @@ -960,16 +958,18 @@ def test_mixed_freq_second_millisecond(self): high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) # high to low - high.plot() - ax = low.plot() + _, ax = self.plt.subplots() + high.plot(ax=ax) + low.plot(ax=ax) assert len(ax.get_lines()) == 2 for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'L' tm.close() # low to high - low.plot() - ax = high.plot() + _, ax = self.plt.subplots() + low.plot(ax=ax) + high.plot(ax=ax) assert len(ax.get_lines()) == 2 for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'L' @@ -985,7 +985,8 @@ def test_irreg_dtypes(self): idx = date_range('1/1/2000', periods=10) idx = idx[[0, 2, 5, 9]].asobject df = DataFrame(np.random.randn(len(idx), 3), idx) - _check_plot_works(df.plot) + _, ax = self.plt.subplots() + _check_plot_works(df.plot, ax=ax) @slow def test_time(self): @@ -995,7 +996,8 @@ def test_time(self): df = DataFrame({'a': np.random.randn(len(ts)), 'b': np.random.randn(len(ts))}, index=ts) - ax = df.plot() + _, ax = self.plt.subplots() + df.plot(ax=ax) # verify tick labels ticks = ax.get_xticks() @@ -1031,7 +1033,8 @@ def test_time_musec(self): df = DataFrame({'a': np.random.randn(len(ts)), 'b': np.random.randn(len(ts))}, index=ts) - ax = df.plot() + _, ax = self.plt.subplots() + ax = df.plot(ax=ax) # verify tick labels ticks = ax.get_xticks() @@ -1054,8 +1057,9 @@ def test_secondary_upsample(self): idxl = date_range('1/1/1999', periods=12, freq='M') high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - low.plot() - ax = high.plot(secondary_y=True) + 
_, ax = self.plt.subplots() + low.plot(ax=ax) + ax = high.plot(secondary_y=True, ax=ax) for l in ax.get_lines(): assert PeriodIndex(l.get_xdata()).freq == 'D' assert hasattr(ax, 'left_ax') @@ -1065,14 +1069,12 @@ def test_secondary_upsample(self): @slow def test_secondary_legend(self): - import matplotlib.pyplot as plt - fig = plt.gcf() - plt.clf() + fig = self.plt.figure() ax = fig.add_subplot(211) # ts df = tm.makeTimeDataFrame() - ax = df.plot(secondary_y=['A', 'B']) + df.plot(secondary_y=['A', 'B'], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert leg.get_texts()[0].get_text() == 'A (right)' @@ -1086,33 +1088,37 @@ def test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 + self.plt.close(fig) - plt.clf() + fig = self.plt.figure() ax = fig.add_subplot(211) - ax = df.plot(secondary_y=['A', 'C'], mark_right=False) + df.plot(secondary_y=['A', 'C'], mark_right=False, ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert leg.get_texts()[0].get_text() == 'A' assert leg.get_texts()[1].get_text() == 'B' assert leg.get_texts()[2].get_text() == 'C' assert leg.get_texts()[3].get_text() == 'D' + self.plt.close(fig) - plt.clf() - ax = df.plot(kind='bar', secondary_y=['A']) + fig, ax = self.plt.subplots() + df.plot(kind='bar', secondary_y=['A'], ax=ax) leg = ax.get_legend() assert leg.get_texts()[0].get_text() == 'A (right)' assert leg.get_texts()[1].get_text() == 'B' + self.plt.close(fig) - plt.clf() - ax = df.plot(kind='bar', secondary_y=['A'], mark_right=False) + fig, ax = self.plt.subplots() + df.plot(kind='bar', secondary_y=['A'], mark_right=False, ax=ax) leg = ax.get_legend() assert leg.get_texts()[0].get_text() == 'A' assert leg.get_texts()[1].get_text() == 'B' + self.plt.close(fig) - plt.clf() + fig = self.plt.figure() ax = fig.add_subplot(211) df = tm.makeTimeDataFrame() - ax = df.plot(secondary_y=['C', 'D']) + ax = df.plot(secondary_y=['C', 'D'], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1122,12 +1128,13 @@ def test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 + self.plt.close(fig) # non-ts df = tm.makeDataFrame() - plt.clf() + fig = self.plt.figure() ax = fig.add_subplot(211) - ax = df.plot(secondary_y=['A', 'B']) + ax = df.plot(secondary_y=['A', 'B'], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1137,10 +1144,11 @@ def test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 + self.plt.close() - plt.clf() + fig = self.plt.figure() ax = fig.add_subplot(211) - ax = df.plot(secondary_y=['C', 'D']) + ax = df.plot(secondary_y=['C', 'D'], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1154,7 +1162,8 @@ def test_secondary_legend(self): def test_format_date_axis(self): rng = date_range('1/1/2012', periods=12, freq='M') df = DataFrame(np.random.randn(len(rng), 3), rng) - ax = df.plot() + _, ax = self.plt.subplots() + ax = df.plot(ax=ax) xaxis = ax.get_xaxis() for l in xaxis.get_ticklabels(): if len(l.get_text()) > 0: @@ -1162,28 +1171,21 @@ def test_format_date_axis(self): @slow def test_ax_plot(self): - import matplotlib.pyplot as plt - x = DatetimeIndex(start='2012-01-02', periods=10, freq='D') y = lrange(len(x)) - fig = plt.figure() - ax = fig.add_subplot(111) + _, ax = self.plt.subplots() lines = ax.plot(x, y, label='Y') tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x) @slow def 
test_mpl_nopandas(self): - import matplotlib.pyplot as plt - dates = [date(2008, 12, 31), date(2009, 1, 31)] values1 = np.arange(10.0, 11.0, 0.5) values2 = np.arange(11.0, 12.0, 0.5) kw = dict(fmt='-', lw=4) - plt.close('all') - fig = plt.figure() - ax = fig.add_subplot(111) + _, ax = self.plt.subplots() ax.plot_date([x.toordinal() for x in dates], values1, **kw) ax.plot_date([x.toordinal() for x in dates], values2, **kw) @@ -1201,7 +1203,8 @@ def test_irregular_ts_shared_ax_xlim(self): ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] # plot the left section of the irregular series, then the right section - ax = ts_irregular[:5].plot() + _, ax = self.plt.subplots() + ts_irregular[:5].plot(ax=ax) ts_irregular[5:].plot(ax=ax) # check that axis limits are correct @@ -1217,7 +1220,8 @@ def test_secondary_y_non_ts_xlim(self): s1 = Series(1, index=index_1) s2 = Series(2, index=index_2) - ax = s1.plot() + _, ax = self.plt.subplots() + s1.plot(ax=ax) left_before, right_before = ax.get_xlim() s2.plot(secondary_y=True, ax=ax) left_after, right_after = ax.get_xlim() @@ -1233,7 +1237,8 @@ def test_secondary_y_regular_ts_xlim(self): s1 = Series(1, index=index_1) s2 = Series(2, index=index_2) - ax = s1.plot() + _, ax = self.plt.subplots() + s1.plot(ax=ax) left_before, right_before = ax.get_xlim() s2.plot(secondary_y=True, ax=ax) left_after, right_after = ax.get_xlim() @@ -1247,7 +1252,8 @@ def test_secondary_y_mixed_freq_ts_xlim(self): rng = date_range('2000-01-01', periods=10000, freq='min') ts = Series(1, index=rng) - ax = ts.plot() + _, ax = self.plt.subplots() + ts.plot(ax=ax) left_before, right_before = ax.get_xlim() ts.resample('D').mean().plot(secondary_y=True, ax=ax) left_after, right_after = ax.get_xlim() @@ -1262,7 +1268,8 @@ def test_secondary_y_irregular_ts_xlim(self): ts = tm.makeTimeSeries()[:20] ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] - ax = ts_irregular[:5].plot() + _, ax = self.plt.subplots() + ts_irregular[:5].plot(ax=ax) # plot higher-x values on secondary axis ts_irregular[5:].plot(secondary_y=True, ax=ax) # ensure secondary limits aren't overwritten by plot on primary @@ -1275,10 +1282,11 @@ def test_secondary_y_irregular_ts_xlim(self): def test_plot_outofbounds_datetime(self): # 2579 - checking this does not raise values = [date(1677, 1, 1), date(1677, 1, 2)] - self.plt.plot(values) + _, ax = self.plt.subplots() + ax.plot(values) values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)] - self.plt.plot(values) + ax.plot(values) def test_format_timedelta_ticks_narrow(self): if is_platform_mac(): @@ -1290,8 +1298,8 @@ def test_format_timedelta_ticks_narrow(self): rng = timedelta_range('0', periods=10, freq='ns') df = DataFrame(np.random.randn(len(rng), 3), rng) - ax = df.plot(fontsize=2) - fig = ax.get_figure() + fig, ax = self.plt.subplots() + df.plot(fontsize=2, ax=ax) fig.canvas.draw() labels = ax.get_xticklabels() assert len(labels) == len(expected_labels) @@ -1316,8 +1324,8 @@ def test_format_timedelta_ticks_wide(self): rng = timedelta_range('0', periods=10, freq='1 d') df = DataFrame(np.random.randn(len(rng), 3), rng) - ax = df.plot(fontsize=2) - fig = ax.get_figure() + fig, ax = self.plt.subplots() + ax = df.plot(fontsize=2, ax=ax) fig.canvas.draw() labels = ax.get_xticklabels() assert len(labels) == len(expected_labels) @@ -1327,19 +1335,22 @@ def test_format_timedelta_ticks_wide(self): def test_timedelta_plot(self): # test issue #8711 s = Series(range(5), timedelta_range('1day', periods=5)) - _check_plot_works(s.plot) + _, ax = 
self.plt.subplots() + _check_plot_works(s.plot, ax=ax) # test long period index = timedelta_range('1 day 2 hr 30 min 10 s', periods=10, freq='1 d') s = Series(np.random.randn(len(index)), index) - _check_plot_works(s.plot) + _, ax = self.plt.subplots() + _check_plot_works(s.plot, ax=ax) # test short period index = timedelta_range('1 day 2 hr 30 min 10 s', periods=10, freq='1 ns') s = Series(np.random.randn(len(index)), index) - _check_plot_works(s.plot) + _, ax = self.plt.subplots() + _check_plot_works(s.plot, ax=ax) def test_hist(self): # https://github.com/matplotlib/matplotlib/issues/8459 @@ -1347,7 +1358,8 @@ def test_hist(self): x = rng w1 = np.arange(0, 1, .1) w2 = np.arange(0, 1, .1)[::-1] - self.plt.hist([x, x], weights=[w1, w2]) + _, ax = self.plt.subplots() + ax.hist([x, x], weights=[w1, w2]) @slow def test_overlapping_datetime(self): @@ -1361,7 +1373,8 @@ def test_overlapping_datetime(self): # plot first series, then add the second series to those axes, # then try adding the first series again - ax = s1.plot() + _, ax = self.plt.subplots() + s1.plot(ax=ax) s2.plot(ax=ax) s1.plot(ax=ax) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 340a98484480f..7c66b5dafb9c7 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -82,7 +82,8 @@ def test_plot(self): @slow def test_plot_figsize_and_title(self): # figsize and title - ax = self.series.plot(title='Test', figsize=(16, 8)) + _, ax = self.plt.subplots() + ax = self.series.plot(title='Test', figsize=(16, 8), ax=ax) self._check_text_labels(ax.title, 'Test') self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8)) @@ -93,25 +94,28 @@ def test_dont_modify_rcParams(self): else: key = 'axes.color_cycle' colors = self.plt.rcParams[key] - Series([1, 2, 3]).plot() + _, ax = self.plt.subplots() + Series([1, 2, 3]).plot(ax=ax) assert colors == self.plt.rcParams[key] def test_ts_line_lim(self): - ax = self.ts.plot() + fig, ax = self.plt.subplots() + ax = self.ts.plot(ax=ax) xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin == lines[0].get_data(orig=False)[0][0] assert xmax == lines[0].get_data(orig=False)[0][-1] tm.close() - ax = self.ts.plot(secondary_y=True) + ax = self.ts.plot(secondary_y=True, ax=ax) xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin == lines[0].get_data(orig=False)[0][0] assert xmax == lines[0].get_data(orig=False)[0][-1] def test_ts_area_lim(self): - ax = self.ts.plot.area(stacked=False) + _, ax = self.plt.subplots() + ax = self.ts.plot.area(stacked=False, ax=ax) xmin, xmax = ax.get_xlim() line = ax.get_lines()[0].get_data(orig=False)[0] assert xmin == line[0] @@ -119,7 +123,8 @@ def test_ts_area_lim(self): tm.close() # GH 7471 - ax = self.ts.plot.area(stacked=False, x_compat=True) + _, ax = self.plt.subplots() + ax = self.ts.plot.area(stacked=False, x_compat=True, ax=ax) xmin, xmax = ax.get_xlim() line = ax.get_lines()[0].get_data(orig=False)[0] assert xmin == line[0] @@ -128,14 +133,16 @@ def test_ts_area_lim(self): tz_ts = self.ts.copy() tz_ts.index = tz_ts.tz_localize('GMT').tz_convert('CET') - ax = tz_ts.plot.area(stacked=False, x_compat=True) + _, ax = self.plt.subplots() + ax = tz_ts.plot.area(stacked=False, x_compat=True, ax=ax) xmin, xmax = ax.get_xlim() line = ax.get_lines()[0].get_data(orig=False)[0] assert xmin == line[0] assert xmax == line[-1] tm.close() - ax = tz_ts.plot.area(stacked=False, secondary_y=True) + _, ax = self.plt.subplots() + ax = tz_ts.plot.area(stacked=False, 
secondary_y=True, ax=ax)
 xmin, xmax = ax.get_xlim()
 line = ax.get_lines()[0].get_data(orig=False)[0]
 assert xmin == line[0]
@@ -143,23 +150,28 @@ def test_ts_area_lim(self):

 def test_label(self):
 s = Series([1, 2])
- ax = s.plot(label='LABEL', legend=True)
+ _, ax = self.plt.subplots()
+ ax = s.plot(label='LABEL', legend=True, ax=ax)
 self._check_legend_labels(ax, labels=['LABEL'])
 self.plt.close()
- ax = s.plot(legend=True)
+ _, ax = self.plt.subplots()
+ ax = s.plot(legend=True, ax=ax)
 self._check_legend_labels(ax, labels=['None'])
 self.plt.close()

 # get name from index
 s.name = 'NAME'
- ax = s.plot(legend=True)
+ _, ax = self.plt.subplots()
+ ax = s.plot(legend=True, ax=ax)
 self._check_legend_labels(ax, labels=['NAME'])
 self.plt.close()

 # override the default
- ax = s.plot(legend=True, label='LABEL')
+ _, ax = self.plt.subplots()
+ ax = s.plot(legend=True, label='LABEL', ax=ax)
 self._check_legend_labels(ax, labels=['LABEL'])
 self.plt.close()

 # Add label info, but don't draw
- ax = s.plot(legend=False, label='LABEL')
+ _, ax = self.plt.subplots()
+ ax = s.plot(legend=False, label='LABEL', ax=ax)
 assert ax.get_legend() is None # Hasn't been drawn
 ax.legend() # draw it
 self._check_legend_labels(ax, labels=['LABEL'])
@@ -189,10 +201,12 @@ def test_line_area_nan_series(self):
 def test_line_use_index_false(self):
 s = Series([1, 2, 3], index=['a', 'b', 'c'])
 s.index.name = 'The Index'
- ax = s.plot(use_index=False)
+ _, ax = self.plt.subplots()
+ ax = s.plot(use_index=False, ax=ax)
 label = ax.get_xlabel()
 assert label == ''
- ax2 = s.plot.bar(use_index=False)
+ _, ax = self.plt.subplots()
+ ax2 = s.plot.bar(use_index=False, ax=ax)
 label2 = ax2.get_xlabel()
 assert label2 == ''
@@ -203,11 +217,13 @@ def test_bar_log(self):
 if not self.mpl_le_1_2_1:
 expected = np.hstack((.1, expected, 1e4))

- ax = Series([200, 500]).plot.bar(log=True)
+ _, ax = self.plt.subplots()
+ ax = Series([200, 500]).plot.bar(log=True, ax=ax)
 tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected)
 tm.close()

- ax = Series([200, 500]).plot.barh(log=True)
+ _, ax = self.plt.subplots()
+ ax = Series([200, 500]).plot.barh(log=True, ax=ax)
 tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected)
 tm.close()
@@ -219,7 +235,8 @@ def test_bar_log(self):
 if self.mpl_ge_2_0_0:
 expected = np.hstack((1.0e-05, expected))

- ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar')
+ _, ax = self.plt.subplots()
+ ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar', ax=ax)
 ymin = 0.0007943282347242822 if self.mpl_ge_2_0_0 else 0.001
 ymax = 0.12589254117941673 if self.mpl_ge_2_0_0 else .10000000000000001
 res = ax.get_ylim()
@@ -228,7 +245,8 @@ def test_bar_log(self):
 tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected)
 tm.close()

- ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh')
+ _, ax = self.plt.subplots()
+ ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh', ax=ax)
 res = ax.get_xlim()
 tm.assert_almost_equal(res[0], ymin)
 tm.assert_almost_equal(res[1], ymax)
@@ -237,23 +255,27 @@ def test_bar_log(self):
 @slow
 def test_bar_ignore_index(self):
 df = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
- ax = df.plot.bar(use_index=False)
+ _, ax = self.plt.subplots()
+ ax = df.plot.bar(use_index=False, ax=ax)
 self._check_text_labels(ax.get_xticklabels(), ['0', '1', '2', '3'])

 def test_rotation(self):
 df = DataFrame(randn(5, 5))
 # Default rot 0
- axes = df.plot()
+ _, ax = self.plt.subplots()
+ axes = df.plot(ax=ax)
 self._check_ticks_props(axes, xrot=0)

- axes = df.plot(rot=30)
+ _, ax = 
self.plt.subplots()
+ axes = df.plot(rot=30, ax=ax)
 self._check_ticks_props(axes, xrot=30)

 def test_irregular_datetime(self):
 rng = date_range('1/1/2000', '3/1/2000')
 rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]]
 ser = Series(randn(len(rng)), rng)
- ax = ser.plot()
+ _, ax = self.plt.subplots()
+ ax = ser.plot(ax=ax)
 xp = datetime(1999, 1, 1).toordinal()
 ax.set_xlim('1/1/1999', '1/1/2001')
 assert xp == ax.get_xlim()[0]
@@ -311,7 +333,8 @@ def test_pie_series(self):

 def test_pie_nan(self):
 s = Series([1, np.nan, 1, 1])
- ax = s.plot.pie(legend=True)
+ _, ax = self.plt.subplots()
+ ax = s.plot.pie(legend=True, ax=ax)
 expected = ['0', '', '2', '3']
 result = [x.get_text() for x in ax.texts]
 assert result == expected
@@ -319,7 +342,8 @@ def test_pie_nan(self):
 @slow
 def test_hist_df_kwargs(self):
 df = DataFrame(np.random.randn(10, 2))
- ax = df.plot.hist(bins=5)
+ _, ax = self.plt.subplots()
+ ax = df.plot.hist(bins=5, ax=ax)
 assert len(ax.patches) == 10

 @slow
@@ -329,10 +353,12 @@ def test_hist_df_with_nonnumerics(self):
 df = DataFrame(
 np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
 df['E'] = ['x', 'y'] * 5
- ax = df.plot.hist(bins=5)
+ _, ax = self.plt.subplots()
+ ax = df.plot.hist(bins=5, ax=ax)
 assert len(ax.patches) == 20

- ax = df.plot.hist() # bins=10
+ _, ax = self.plt.subplots()
+ ax = df.plot.hist(ax=ax) # bins=10
 assert len(ax.patches) == 40

 @slow
@@ -439,7 +465,8 @@ def test_hist_secondary_legend(self):
 df = DataFrame(np.random.randn(30, 4), columns=list('abcd'))

 # primary -> secondary
- ax = df['a'].plot.hist(legend=True)
+ _, ax = self.plt.subplots()
+ ax = df['a'].plot.hist(legend=True, ax=ax)
 df['b'].plot.hist(ax=ax, legend=True, secondary_y=True)
 # both legends are drawn on left ax
 # left and right axis must be visible
@@ -449,7 +476,8 @@ def test_hist_secondary_legend(self):
 tm.close()

 # secondary -> secondary
- ax = df['a'].plot.hist(legend=True, secondary_y=True)
+ _, ax = self.plt.subplots()
+ ax = df['a'].plot.hist(legend=True, secondary_y=True, ax=ax)
 df['b'].plot.hist(ax=ax, legend=True, secondary_y=True)
 # both legends are drawn on left ax
 # left axis must be invisible, right axis must be visible
@@ -460,7 +488,8 @@ def test_hist_secondary_legend(self):
 tm.close()

 # secondary -> primary
- ax = df['a'].plot.hist(legend=True, secondary_y=True)
+ _, ax = self.plt.subplots()
+ ax = df['a'].plot.hist(legend=True, secondary_y=True, ax=ax)
 # right axes is returned
 df['b'].plot.hist(ax=ax, legend=True)
 # both legends are drawn on left ax
@@ -477,8 +506,9 @@ def test_df_series_secondary_legend(self):
 s = Series(np.random.randn(30), name='x')

 # primary -> secondary (without passing ax)
- ax = df.plot()
- s.plot(legend=True, secondary_y=True)
+ _, ax = self.plt.subplots()
+ ax = df.plot(ax=ax)
+ s.plot(legend=True, secondary_y=True, ax=ax)
 # both legends are drawn on left ax
 # left and right axis must be visible
 self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)'])
@@ -487,7 +517,8 @@ def test_df_series_secondary_legend(self):
 tm.close()

 # primary -> secondary (with passing ax)
- ax = df.plot()
+ _, ax = self.plt.subplots()
+ ax = df.plot(ax=ax)
 s.plot(ax=ax, legend=True, secondary_y=True)
 # both legends are drawn on left ax
 # left and right axis must be visible
@@ -497,8 +528,9 @@ def test_df_series_secondary_legend(self):
 tm.close()

 # secondary -> secondary (without passing ax)
- ax = df.plot(secondary_y=True)
- s.plot(legend=True, secondary_y=True)
+ _, ax = self.plt.subplots()
+ ax = df.plot(secondary_y=True, ax=ax)
+ s.plot(legend=True, secondary_y=True, ax=ax)
 # 
both legends are drawn on left ax
 # left axis must be invisible and right axis must be visible
 expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)']
@@ -508,7 +540,8 @@ def test_df_series_secondary_legend(self):
 tm.close()

 # secondary -> secondary (with passing ax)
- ax = df.plot(secondary_y=True)
+ _, ax = self.plt.subplots()
+ ax = df.plot(secondary_y=True, ax=ax)
 s.plot(ax=ax, legend=True, secondary_y=True)
 # both legends are drawn on left ax
 # left axis must be invisible and right axis must be visible
@@ -519,7 +552,8 @@ def test_df_series_secondary_legend(self):
 tm.close()

 # secondary -> secondary (with passing ax)
- ax = df.plot(secondary_y=True, mark_right=False)
+ _, ax = self.plt.subplots()
+ ax = df.plot(secondary_y=True, mark_right=False, ax=ax)
 s.plot(ax=ax, legend=True, secondary_y=True)
 # both legends are drawn on left ax
 # left axis must be invisible and right axis must be visible
@@ -533,11 +567,13 @@ def test_df_series_secondary_legend(self):
 def test_plot_fails_with_dupe_color_and_style(self):
 x = Series(randn(2))
 with pytest.raises(ValueError):
- x.plot(style='k--', color='k')
+ _, ax = self.plt.subplots()
+ x.plot(style='k--', color='k', ax=ax)

 @slow
 def test_hist_kde(self):
- ax = self.ts.plot.hist(logy=True)
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.hist(logy=True, ax=ax)
 self._check_ax_scales(ax, yaxis='log')
 xlabels = ax.get_xticklabels()
 # ticks are values, thus ticklabels are blank
@@ -549,7 +585,8 @@ def test_hist_kde(self):
 _skip_if_no_scipy_gaussian_kde()
 _check_plot_works(self.ts.plot.kde)
 _check_plot_works(self.ts.plot.density)
- ax = self.ts.plot.kde(logy=True)
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.kde(logy=True, ax=ax)
 self._check_ax_scales(ax, yaxis='log')
 xlabels = ax.get_xticklabels()
 self._check_text_labels(xlabels, [''] * len(xlabels))
@@ -565,8 +602,9 @@ def test_kde_kwargs(self):
 ind=linspace(-100, 100, 20))
 _check_plot_works(self.ts.plot.density, bw_method=.5,
 ind=linspace(-100, 100, 20))
+ _, ax = self.plt.subplots()
 ax = self.ts.plot.kde(logy=True, bw_method=.5,
- ind=linspace(-100, 100, 20))
+ ind=linspace(-100, 100, 20), ax=ax)
 self._check_ax_scales(ax, yaxis='log')
 self._check_text_labels(ax.yaxis.get_label(), 'Density')
@@ -583,29 +621,34 @@ def test_kde_missing_vals(self):

 @slow
 def test_hist_kwargs(self):
- ax = self.ts.plot.hist(bins=5)
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.hist(bins=5, ax=ax)
 assert len(ax.patches) == 5
 self._check_text_labels(ax.yaxis.get_label(), 'Frequency')
 tm.close()

 if self.mpl_ge_1_3_1:
- ax = self.ts.plot.hist(orientation='horizontal')
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.hist(orientation='horizontal', ax=ax)
 self._check_text_labels(ax.xaxis.get_label(), 'Frequency')
 tm.close()

- ax = self.ts.plot.hist(align='left', stacked=True)
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.hist(align='left', stacked=True, ax=ax)
 tm.close()

 @slow
 def test_hist_kde_color(self):
- ax = self.ts.plot.hist(logy=True, bins=10, color='b')
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.hist(logy=True, bins=10, color='b', ax=ax)
 self._check_ax_scales(ax, yaxis='log')
 assert len(ax.patches) == 10
 self._check_colors(ax.patches, facecolors=['b'] * 10)

 tm._skip_if_no_scipy()
 _skip_if_no_scipy_gaussian_kde()
- ax = self.ts.plot.kde(logy=True, color='r')
+ _, ax = self.plt.subplots()
+ ax = self.ts.plot.kde(logy=True, color='r', ax=ax)
 self._check_ax_scales(ax, yaxis='log')
 lines = ax.get_lines()
 assert len(lines) == 1
@@ -613,7 +656,8 @@ def test_hist_kde_color(self):

 @slow
 def 
test_boxplot_series(self): - ax = self.ts.plot.box(logy=True) + _, ax = self.plt.subplots() + ax = self.ts.plot.box(logy=True, ax=ax) self._check_ax_scales(ax, yaxis='log') xlabels = ax.get_xticklabels() self._check_text_labels(xlabels, [self.ts.name]) @@ -625,20 +669,22 @@ def test_kind_both_ways(self): s = Series(range(3)) kinds = (plotting._core._common_kinds + plotting._core._series_kinds) + _, ax = self.plt.subplots() for kind in kinds: if not _ok_for_gaussian_kde(kind): continue - s.plot(kind=kind) + s.plot(kind=kind, ax=ax) getattr(s.plot, kind)() @slow def test_invalid_plot_data(self): s = Series(list('abcd')) + _, ax = self.plt.subplots() for kind in plotting._core._common_kinds: if not _ok_for_gaussian_kde(kind): continue with pytest.raises(TypeError): - s.plot(kind=kind) + s.plot(kind=kind, ax=ax) @slow def test_valid_object_plot(self): @@ -650,11 +696,12 @@ def test_valid_object_plot(self): def test_partially_invalid_plot_data(self): s = Series(['a', 'b', 1.0, 2]) + _, ax = self.plt.subplots() for kind in plotting._core._common_kinds: if not _ok_for_gaussian_kde(kind): continue with pytest.raises(TypeError): - s.plot(kind=kind) + s.plot(kind=kind, ax=ax) def test_invalid_kind(self): s = Series([1, 2]) @@ -776,13 +823,15 @@ def test_standard_colors_all(self): def test_series_plot_color_kwargs(self): # GH1890 - ax = Series(np.arange(12) + 1).plot(color='green') + _, ax = self.plt.subplots() + ax = Series(np.arange(12) + 1).plot(color='green', ax=ax) self._check_colors(ax.get_lines(), linecolors=['green']) def test_time_series_plot_color_kwargs(self): # #1890 + _, ax = self.plt.subplots() ax = Series(np.arange(12) + 1, index=date_range( - '1/1/2000', periods=12)).plot(color='green') + '1/1/2000', periods=12)).plot(color='green', ax=ax) self._check_colors(ax.get_lines(), linecolors=['green']) def test_time_series_plot_color_with_empty_kwargs(self): @@ -797,14 +846,16 @@ def test_time_series_plot_color_with_empty_kwargs(self): ncolors = 3 + _, ax = self.plt.subplots() for i in range(ncolors): - ax = s.plot() + ax = s.plot(ax=ax) self._check_colors(ax.get_lines(), linecolors=def_colors[:ncolors]) def test_xticklabels(self): # GH11529 s = Series(np.arange(10), index=['P%02d' % i for i in range(10)]) - ax = s.plot(xticks=[0, 3, 5, 9]) + _, ax = self.plt.subplots() + ax = s.plot(xticks=[0, 3, 5, 9], ax=ax) exp = ['P%02d' % i for i in [0, 3, 5, 9]] self._check_text_labels(ax.get_xticklabels(), exp) From f7149a22b74e00fa584e94449e3bed34cc995a82 Mon Sep 17 00:00:00 2001 From: Vincent La Date: Wed, 31 May 2017 01:54:26 -0700 Subject: [PATCH 09/55] DOC: Update to docstring of DataFrame(dtype) (#14764) (#16487) * Adding some more documentation on dataframe with regards to dtype * Making example for creating dataframe from np matrix easier --- pandas/core/frame.py | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 743d623ee5e44..907959c42323e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -241,17 +241,47 @@ class DataFrame(NDFrame): Column labels to use for resulting frame. Will default to np.arange(n) if no column labels are provided dtype : dtype, default None - Data type to force, otherwise infer + Data type to force. Only a single dtype is allowed. If None, infer copy : boolean, default False Copy data from inputs. 
Only affects DataFrame / 2d ndarray input Examples -------- - >>> d = {'col1': ts1, 'col2': ts2} - >>> df = DataFrame(data=d, index=index) - >>> df2 = DataFrame(np.random.randn(10, 5)) - >>> df3 = DataFrame(np.random.randn(10, 5), - ... columns=['a', 'b', 'c', 'd', 'e']) + Constructing DataFrame from a dictionary. + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> df = pd.DataFrame(data=d) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + Notice that the inferred dtype is int64. + + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + To enforce a single dtype: + + >>> df = pd.DataFrame(data=d, dtype=np.int8) + >>> df.dtypes + col1 int8 + col2 int8 + dtype: object + + Constructing DataFrame from numpy ndarray: + + >>> df2 = pd.DataFrame(np.random.randint(low=0, high=10, size=(5, 5)), + ... columns=['a', 'b', 'c', 'd', 'e']) + >>> df2 + a b c d e + 0 2 8 8 3 4 + 1 4 2 9 0 9 + 2 1 0 7 8 0 + 3 5 1 7 1 3 + 4 6 0 2 4 2 See also -------- From 36d617138f2c1544ea9a05442882e8a539bd64e7 Mon Sep 17 00:00:00 2001 From: Kassandra Keeton Date: Wed, 31 May 2017 04:37:49 -0500 Subject: [PATCH 10/55] DOC: correct docstring examples (#3439) (#16432) --- ci/build_docs.sh | 9 ++++ pandas/core/reshape/concat.py | 2 + pandas/core/reshape/pivot.py | 72 +++++++++++++++++------------- pandas/core/reshape/reshape.py | 81 ++++++++++++++++++---------------- pandas/core/reshape/tile.py | 24 +++++----- 5 files changed, 108 insertions(+), 80 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 26917b8f9b792..a038304fe0f7a 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -59,6 +59,15 @@ if [ "$DOC" ]; then git remote -v git push origin gh-pages -f + + echo "Running doctests" + cd "$TRAVIS_BUILD_DIR" + pytest --doctest-modules \ + pandas/core/reshape/concat.py \ + pandas/core/reshape/pivot.py \ + pandas/core/reshape/reshape.py \ + pandas/core/reshape/tile.py + fi exit 0 diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index af2eb734a02f6..96603b6adc3b0 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -197,6 +197,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, 0 a 2 >>> pd.concat([df5, df6], verify_integrity=True) + Traceback (most recent call last): + ... ValueError: Indexes have overlapping values: ['a'] """ op = _Concatenator(objs, axis=axis, join_axes=join_axes, diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b562f8a32f5c9..0581ec7484c49 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -50,26 +50,36 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', Examples -------- + >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", + ... "bar", "bar", "bar", "bar"], + ... "B": ["one", "one", "one", "two", "two", + ... "one", "one", "two", "two"], + ... "C": ["small", "large", "large", "small", + ... "small", "large", "small", "small", + ... "large"], + ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]}) >>> df - A B C D - 0 foo one small 1 - 1 foo one large 2 - 2 foo one large 2 - 3 foo two small 3 - 4 foo two small 3 - 5 bar one large 4 - 6 bar one small 5 - 7 bar two small 6 - 8 bar two large 7 + A B C D + 0 foo one small 1 + 1 foo one large 2 + 2 foo one large 2 + 3 foo two small 3 + 4 foo two small 3 + 5 bar one large 4 + 6 bar one small 5 + 7 bar two small 6 + 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... 
columns=['C'], aggfunc=np.sum) >>> table - small large - foo one 1 4 - two 6 NaN - bar one 5 4 - two 6 7 + ... # doctest: +NORMALIZE_WHITESPACE + C large small + A B + bar one 4.0 5.0 + two 7.0 6.0 + foo one 4.0 1.0 + two NaN 6.0 Returns ------- @@ -445,27 +455,27 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, Examples -------- - >>> a - array([foo, foo, foo, foo, bar, bar, - bar, bar, foo, foo, foo], dtype=object) - >>> b - array([one, one, one, two, one, one, - one, two, two, two, one], dtype=object) - >>> c - array([dull, dull, shiny, dull, dull, shiny, - shiny, dull, shiny, shiny, shiny], dtype=object) - - >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) - b one two - c dull shiny dull shiny + >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", + ... "bar", "bar", "foo", "foo", "foo"], dtype=object) + >>> b = np.array(["one", "one", "one", "two", "one", "one", + ... "one", "two", "two", "two", "one"], dtype=object) + >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", + ... "shiny", "dull", "shiny", "shiny", "shiny"], + ... dtype=object) + + >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + ... # doctest: +NORMALIZE_WHITESPACE + b one two + c dull shiny dull shiny a - bar 1 2 1 0 - foo 2 2 1 2 + bar 1 2 1 0 + foo 2 2 1 2 >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data, - # but they still will be counted in the output + ... # but they still will be counted in the output + ... # doctest: +SKIP col_0 d e f row_0 a 1 0 0 diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f944dfe22361a..dcb83d225699d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -48,23 +48,23 @@ class _Unstacker(object): >>> import pandas as pd >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ... ('two', 'a'), ('two', 'b')]) - >>> s = pd.Series(np.arange(1.0, 5.0), index=index) + >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index) >>> s - one a 1 - b 2 - two a 3 - b 4 - dtype: float64 + one a 1 + b 2 + two a 3 + b 4 + dtype: int64 >>> s.unstack(level=-1) - a b + a b one 1 2 two 3 4 >>> s.unstack(level=0) one two - a 1 2 - b 3 4 + a 1 3 + b 2 4 Returns ------- @@ -789,18 +789,18 @@ def lreshape(data, groups, dropna=True, label=None): >>> import pandas as pd >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526], ... 'team': ['Red Sox', 'Yankees'], - ... 'year1': [2007, 2008], 'year2': [2008, 2008]}) + ... 'year1': [2007, 2007], 'year2': [2008, 2008]}) >>> data hr1 hr2 team year1 year2 0 514 545 Red Sox 2007 2008 1 573 526 Yankees 2007 2008 >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']}) - team hr year - 0 Red Sox 514 2007 - 1 Yankees 573 2007 - 2 Red Sox 545 2008 - 3 Yankees 526 2008 + team year hr + 0 Red Sox 2007 514 + 1 Yankees 2007 573 + 2 Red Sox 2008 545 + 3 Yankees 2008 526 Returns ------- @@ -905,11 +905,12 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): ... }) >>> df["id"] = df.index >>> df - A1970 A1980 B1970 B1980 X id + A1970 A1980 B1970 B1980 X id 0 a d 2.5 3.2 -1.085631 0 1 b e 1.2 1.3 0.997345 1 2 c f 0.7 0.1 0.282978 2 >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") + ... 
# doctest: +NORMALIZE_WHITESPACE X A B id year 0 1970 -1.085631 a 2.5 @@ -940,6 +941,7 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): 8 3 3 2.1 2.9 >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') >>> l + ... # doctest: +NORMALIZE_WHITESPACE ht famid birth age 1 1 1 2.8 @@ -979,41 +981,44 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): Less wieldy column names are also handled + >>> np.random.seed(0) >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3), ... 'A(quarterly)-2011': np.random.rand(3), ... 'B(quarterly)-2010': np.random.rand(3), ... 'B(quarterly)-2011': np.random.rand(3), ... 'X' : np.random.randint(3, size=3)}) >>> df['id'] = df.index - >>> df - A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011 - 0 0.531828 0.724455 0.322959 0.293714 - 1 0.634401 0.611024 0.361789 0.630976 - 2 0.849432 0.722443 0.228263 0.092105 - \ + >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 ... + 0 0.548814 0.544883 0.437587 ... + 1 0.715189 0.423655 0.891773 ... + 2 0.602763 0.645894 0.963663 ... X id 0 0 0 1 1 1 - 2 2 2 - >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], - i='id', j='year', sep='-') - X A(quarterly) B(quarterly) + 2 1 2 + + >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id', + ... j='year', sep='-') + ... # doctest: +NORMALIZE_WHITESPACE + X A(quarterly) B(quarterly) id year - 0 2010 0 0.531828 0.322959 - 1 2010 2 0.634401 0.361789 - 2 2010 2 0.849432 0.228263 - 0 2011 0 0.724455 0.293714 - 1 2011 2 0.611024 0.630976 - 2 2011 2 0.722443 0.092105 + 0 2010 0 0.548814 0.437587 + 1 2010 1 0.715189 0.891773 + 2 2010 1 0.602763 0.963663 + 0 2011 0 0.544883 0.383442 + 1 2011 1 0.423655 0.791725 + 2 2011 1 0.645894 0.528895 If we have many columns, we could also use a regex to find our stubnames and pass that list on to wide_to_long - >>> stubnames = set([match[0] for match in - df.columns.str.findall('[A-B]\(.*\)').values - if match != [] ]) + >>> stubnames = sorted( + ... set([match[0] for match in df.columns.str.findall( + ... r'[A-B]\(.*\)').values if match != [] ]) + ... ) >>> list(stubnames) - ['B(quarterly)', 'A(quarterly)'] + ['A(quarterly)', 'B(quarterly)'] Notes ----- @@ -1133,7 +1138,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], - 'C': [1, 2, 3]}) + ... 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c @@ -1149,7 +1154,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 3 1 0 0 4 1 0 0 - >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)) + >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) b c 0 0 0 1 1 0 diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 746742f47f2aa..866f229bec418 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -75,18 +75,18 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Examples -------- >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) - ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], - (6.533, 9.7], (0.191, 3.367]] - Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]], - array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) + ... # doctest: +ELLIPSIS + ([(0.19, 3.367], (0.19, 3.367], (0.19, 3.367], (3.367, 6.533], ... 
+ Categories (3, interval[float64]): [(0.19, 3.367] < (3.367, 6.533] ... - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, - labels=["good","medium","bad"]) + >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), + ... 3, labels=["good", "medium", "bad"]) + ... # doctest: +SKIP [good, good, good, medium, bad, good] Categories (3, object): [good < medium < bad] >>> pd.cut(np.ones(5), 4, labels=False) - array([1, 1, 1, 1, 1], dtype=int64) + array([1, 1, 1, 1, 1]) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 @@ -182,15 +182,17 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): Examples -------- >>> pd.qcut(range(5), 4) - [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]] - Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]] + ... # doctest: +ELLIPSIS + [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] + Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ... - >>> pd.qcut(range(5), 3, labels=["good","medium","bad"]) + >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"]) + ... # doctest: +SKIP [good, good, medium, bad, bad] Categories (3, object): [good < medium < bad] >>> pd.qcut(range(5), 4, labels=False) - array([0, 0, 1, 2, 3], dtype=int64) + array([0, 0, 1, 2, 3]) """ x_is_series, series_index, name, x = _preprocess_for_cut(x) From ab9bc9a2237c3b2c80ce55acc1c09c81f411476f Mon Sep 17 00:00:00 2001 From: Jeff Tratner Date: Wed, 31 May 2017 03:39:46 -0700 Subject: [PATCH 11/55] Fix unbound local with bad engine (#16511) --- doc/source/whatsnew/v0.20.2.txt | 3 +++ pandas/io/parsers.py | 4 ++++ pandas/tests/io/test_common.py | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 90146aa176b31..1517327ab7133 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -41,6 +41,9 @@ Bug Fixes detecting the terminal size. This fix only applies to python 3 (:issue:`16496`) - Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`) - Bug in ``DataFrame.update()`` with ``overwrite=False`` and ``NaN values`` (:issue:`15593`) +- Passing an invalid engine to :func:`read_csv` now raises an informative + ``ValueError`` rather than ``UnboundLocalError``. 
(:issue:`16511`)
+
+
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index e287d92f67ef6..12b606d969c7d 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -969,6 +969,10 @@ def _make_engine(self, engine='c'):
 klass = PythonParser
 elif engine == 'python-fwf':
 klass = FixedWidthFieldParser
+ else:
+ raise ValueError('Unknown engine: {engine} (valid options are'
+ ' "c", "python", or' ' "python-fwf")'.format(
+ engine=engine))
 self._engine = klass(self.f, **self.options)

 def _failover_to_python(self):
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index b7d158dd75960..289f86eb2dc53 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -223,3 +223,10 @@ def test_next(self):
 assert next_line.strip() == line.strip()

 pytest.raises(StopIteration, next, wrapper)
+
+ def test_unknown_engine(self):
+ with tm.ensure_clean() as path:
+ df = tm.makeDataFrame()
+ df.to_csv(path)
+ with tm.assert_raises_regex(ValueError, 'Unknown engine'):
+ read_csv(path, engine='pyt')
From 9a9c315f93fb1b5a497298d9f779e935ce4e984e Mon Sep 17 00:00:00 2001
From: Christian Stade-Schuldt 
Date: Wed, 31 May 2017 13:56:52 +0200
Subject: [PATCH 12/55] return empty MultiIndex for symmetrical difference on equal MultiIndexes (#16486)

---
 doc/source/whatsnew/v0.20.2.txt | 1 +
 pandas/core/indexes/multi.py | 6 ++++++
 pandas/tests/indexes/test_base.py | 2 --
 pandas/tests/indexing/test_multiindex.py | 11 +++++++++++
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 1517327ab7133..38cf683208b3d 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -40,6 +40,7 @@ Bug Fixes
 - Silenced a warning on some Windows environments about "tput: terminal attributes: No such device or address" when
 detecting the terminal size. This fix only applies to python 3 (:issue:`16496`)
 - Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`)
+- Bug in ``Index.symmetric_difference()`` on two equal ``MultiIndex`` objects, which raised a ``TypeError`` (:issue:`13490`)
 - Bug in ``DataFrame.update()`` with ``overwrite=False`` and ``NaN values`` (:issue:`15593`)
 - Passing an invalid engine to :func:`read_csv` now raises an informative
 ``ValueError`` rather than ``UnboundLocalError``. (:issue:`16511`)

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 569e16f2141ae..981a6a696a618 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -414,6 +414,12 @@ def view(self, cls=None):
 return result

 def _shallow_copy_with_infer(self, values=None, **kwargs):
+ # On equal MultiIndexes the difference is empty. 
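+ # (for example, idx.symmetric_difference(idx) reaches this method
+ # with an empty array of values to infer a dtype from).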
+ # Therefore, an empty MultiIndex is returned GH13490 + if len(values) == 0: + return MultiIndex(levels=[[] for _ in range(self.nlevels)], + labels=[[] for _ in range(self.nlevels)], + **kwargs) return self._shallow_copy(values, **kwargs) @Appender(_index_shared_docs['_shallow_copy']) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 6a2087b37631e..02561cba784b8 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -188,7 +188,6 @@ def test_constructor_ndarray_like(self): # it should be possible to convert any object that satisfies the numpy # ndarray interface directly into an Index class ArrayLike(object): - def __init__(self, array): self.array = array @@ -246,7 +245,6 @@ def test_index_ctor_infer_nan_nat(self): [np.timedelta64('nat'), np.nan], [pd.NaT, np.timedelta64('nat')], [np.timedelta64('nat'), pd.NaT]]: - tm.assert_index_equal(Index(data), exp) tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index 483c39ed8694e..fc6c627075c96 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -697,6 +697,17 @@ def test_multiindex_slice_first_level(self): index=range(30, 71)) tm.assert_frame_equal(result, expected) + def test_multiindex_symmetric_difference(self): + # GH 13490 + idx = MultiIndex.from_product([['a', 'b'], ['A', 'B']], + names=['a', 'b']) + result = idx ^ idx + assert result.names == idx.names + + idx2 = idx.copy().rename(['A', 'B']) + result = idx ^ idx2 + assert result.names == [None, None] + class TestMultiIndexSlicers(object): From 79cc4a978ade10e04f1d10dce850852e56876223 Mon Sep 17 00:00:00 2001 From: JosephWagner Date: Wed, 31 May 2017 04:57:57 -0700 Subject: [PATCH 13/55] BUG: select_as_multiple doesn't respect start/stop kwargs GH16209 (#16317) --- doc/source/whatsnew/v0.20.2.txt | 1 + pandas/io/pytables.py | 7 ++++--- pandas/tests/io/test_pytables.py | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 38cf683208b3d..676da5c370041 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -71,6 +71,7 @@ I/O - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`) +- Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`) Plotting ^^^^^^^^ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 625b407dd43be..2940d1f958776 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -806,8 +806,8 @@ def func(_start, _stop, _where): # retrieve the objs, _where is always passed as a set of # coordinates here - objs = [t.read(where=_where, columns=columns, **kwargs) - for t in tbls] + objs = [t.read(where=_where, columns=columns, start=_start, + stop=_stop, **kwargs) for t in tbls] # concat and return return concat(objs, axis=axis, @@ -1432,7 +1432,8 @@ def get_result(self, coordinates=False): # if specified read via coordinates (necessary for multiple selections if coordinates: - where = self.s.read_coordinates(where=self.where) + where = self.s.read_coordinates(where=self.where, start=self.start, + stop=self.stop) else: where = self.where diff --git a/pandas/tests/io/test_pytables.py 
b/pandas/tests/io/test_pytables.py
index 06a4a67964b96..e68de93c3e8ce 100644
--- a/pandas/tests/io/test_pytables.py
+++ b/pandas/tests/io/test_pytables.py
@@ -4221,6 +4221,21 @@ def test_start_stop_table(self):
 expected = df.loc[30:40, ['A']]
 tm.assert_frame_equal(result, expected)

+ def test_start_stop_multiple(self):
+
+ # GH 16209
+ with ensure_clean_store(self.path) as store:
+
+ df = DataFrame({"foo": [1, 2], "bar": [1, 2]})
+
+ store.append_to_multiple({'selector': ['foo'], 'data': None}, df,
+ selector='selector')
+ result = store.select_as_multiple(['selector', 'data'],
+ selector='selector', start=0,
+ stop=1)
+ expected = df.loc[[0], ['foo', 'bar']]
+ tm.assert_frame_equal(result, expected)
+
 def test_start_stop_fixed(self):
 with ensure_clean_store(self.path) as store:
From 0db4de5033548f615505b9a9426d518c8776848e Mon Sep 17 00:00:00 2001
From: Jeff Reback 
Date: Wed, 31 May 2017 18:44:40 -0400
Subject: [PATCH 14/55] BUG: Bug in .resample() and .groupby() when aggregating on integers (#16549)

closes #16361
---
 doc/source/whatsnew/v0.20.2.txt | 1 +
 pandas/core/groupby.py | 10 ++++++----
 pandas/tests/test_resample.py | 22 ++++++++++++++++++++++
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 676da5c370041..9f88d629880ed 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -88,6 +88,7 @@ Groupby/Resample/Rolling

 - Bug creating datetime rolling window on an empty DataFrame (:issue:`15819`)
 - Bug in ``rolling.cov()`` with offset window (:issue:`16058`)
+- Bug in ``.resample()`` and ``.groupby()`` when aggregating on integers (:issue:`16361`)

 Sparse
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 91b55c414b507..286677d613484 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -3337,13 +3337,15 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True):
 obj = self.obj[data.items[locs]]
 s = groupby(obj, self.grouper)
 result = s.aggregate(lambda x: alt(x, axis=self.axis))
- result = result._data.blocks[0]
+ newb = result._data.blocks[0]

- # see if we can cast the block back to the original dtype
- result = block._try_coerce_and_cast_result(result)
+ finally:
+
+ # see if we can cast the block back to the original dtype
+ result = block._try_coerce_and_cast_result(result)
+ newb = block.make_block(result)

 new_items.append(locs)
- newb = block.make_block_same_class(result)
 new_blocks.append(newb)

 if len(new_blocks) == 0:
diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py
index 170cab4947a5a..959e3d2f459ce 100644
--- a/pandas/tests/test_resample.py
+++ b/pandas/tests/test_resample.py
@@ -1672,6 +1672,28 @@ def test_resample_dtype_preservation(self):
 result = df.groupby('group').resample('1D').ffill()
 assert result.val.dtype == np.int32

+ def test_resample_dtype_coercion(self):
+
+ pytest.importorskip('scipy')
+
+ # GH 16361
+ df = {"a": [1, 3, 1, 4]}
+ df = pd.DataFrame(
+ df, index=pd.date_range("2017-01-01", "2017-01-04"))
+
+ expected = (df.astype("float64")
+ .resample("H")
+ .mean()
+ ["a"]
+ .interpolate("cubic")
+ )
+
+ result = df.resample("H")["a"].mean().interpolate("cubic")
+ tm.assert_series_equal(result, expected)
+
+ result = df.resample("H").mean()["a"].interpolate("cubic")
+ tm.assert_series_equal(result, expected)
+
 def test_weekly_resample_buglet(self):
 # #1327
 rng = date_range('1/1/2000', freq='B', periods=20)
From c193235a10ff399411027cd8496df5c4457b2604 Mon Sep 17 00:00:00 2001
From: 
Matti Picus Date: Thu, 1 Jun 2017 02:04:55 +0300 Subject: [PATCH 15/55] COMPAT: cython str-to-int can raise a ValueError on non-CPython (#16563) --- pandas/_libs/index.pyx | 4 ++-- pandas/core/frame.py | 2 +- pandas/core/indexes/base.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 21680fb0b3921..5e92c506b5d0c 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -152,7 +152,7 @@ cdef class IndexEngine: try: return self.mapping.get_item(val) - except TypeError: + except (TypeError, ValueError): raise KeyError(val) cdef inline _get_loc_duplicates(self, object val): @@ -470,7 +470,7 @@ cdef class DatetimeEngine(Int64Engine): try: val = _to_i8(val) return self.mapping.get_item(val) - except TypeError: + except (TypeError, ValueError): self._date_check_type(val) raise KeyError(val) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 907959c42323e..25c3c3fe4e48e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1918,7 +1918,7 @@ def get_value(self, index, col, takeable=False): try: return engine.get_value(series._values, index) - except TypeError: + except (TypeError, ValueError): # we cannot handle direct indexing # use positional diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2af4f112ca941..e1e08e008f782 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1590,7 +1590,7 @@ def __contains__(self, key): hash(key) try: return key in self._engine - except TypeError: + except (TypeError, ValueError): return False _index_shared_docs['contains'] = """ @@ -1610,7 +1610,7 @@ def contains(self, key): hash(key) try: return key in self._engine - except TypeError: + except (TypeError, ValueError): return False def __hash__(self): From b9febe04b2ca3c44bdf8ce74c1cfb74d6b50152e Mon Sep 17 00:00:00 2001 From: Giulio Pepe Date: Thu, 1 Jun 2017 00:10:24 +0100 Subject: [PATCH 16/55] CLN: raise correct error for Panel sort_values (#16532) --- pandas/core/generic.py | 9 +++++++-- pandas/tests/test_panel.py | 5 +++++ pandas/tests/test_panel4d.py | 5 +++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e541f1532d0a0..98999ec267c82 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2366,9 +2366,14 @@ def add_suffix(self, suffix): 1 A 1 1 """ - def sort_values(self, by, axis=0, ascending=True, inplace=False, + def sort_values(self, by=None, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'): - raise AbstractMethodError(self) + """ + NOT IMPLEMENTED: do not call this method, as sorting values is not + supported for Panel objects and will raise an error. 
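+
+ A sketch of the resulting behavior (illustrative example only; any
+ call raises):
+
+ >>> pd.Panel(np.zeros((1, 2, 3))).sort_values() # doctest: +SKIP
+ Traceback (most recent call last):
+ ...
+ NotImplementedError: sort_values has not been implemented on Panel or Panel4D objects.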
+ """ + raise NotImplementedError("sort_values has not been implemented " + "on Panel or Panel4D objects.") _shared_docs['sort_index'] = """ Sort object by labels (along an axis) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 3243b69a25acd..e19e42e062932 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2429,6 +2429,11 @@ def test_all_any_unhandled(self): pytest.raises(NotImplementedError, self.panel.all, bool_only=True) pytest.raises(NotImplementedError, self.panel.any, bool_only=True) + # GH issue 15960 + def test_sort_values(self): + pytest.raises(NotImplementedError, self.panel.sort_values) + pytest.raises(NotImplementedError, self.panel.sort_values, 'ItemA') + class TestLongPanel(object): """ diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 96f02d63712fc..e1995316e7b7c 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -939,3 +939,8 @@ def test_rename(self): def test_get_attr(self): tm.assert_panel_equal(self.panel4d['l1'], self.panel4d.l1) + + # GH issue 15960 + def test_sort_values(self): + pytest.raises(NotImplementedError, self.panel4d.sort_values) + pytest.raises(NotImplementedError, self.panel4d.sort_values, 'ItemA') From 98ed54d0cc7a12653a9c07f00a222e8f7e6f3e62 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 1 Jun 2017 05:30:57 -0500 Subject: [PATCH 17/55] BUG: Fixed pd.unique on array of tuples (#16543) --- doc/source/whatsnew/v0.20.2.txt | 3 +-- pandas/core/algorithms.py | 7 ++++++- pandas/tests/test_algos.py | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 9f88d629880ed..31df5899f0fc3 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -44,8 +44,7 @@ Bug Fixes - Bug in ``DataFrame.update()`` with ``overwrite=False`` and ``NaN values`` (:issue:`15593`) - Passing an invalid engine to :func:`read_csv` now raises an informative ``ValueError`` rather than ``UnboundLocalError``. 
(:issue:`16511`) - - +- Bug in :func:`unique` on an array of tuples (:issue:`16519`) - Fixed a compatibility issue with IPython 6.0's tab completion showing deprecation warnings on Categoricals (:issue:`16409`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 77d79c9585e57..d74c5e66ea1a9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -163,7 +163,7 @@ def _ensure_arraylike(values): ABCIndexClass, ABCSeries)): inferred = lib.infer_dtype(values) if inferred in ['mixed', 'string', 'unicode']: - values = np.asarray(values, dtype=object) + values = lib.list_to_object_array(values) else: values = np.asarray(values) return values @@ -328,6 +328,11 @@ def unique(values): [b, a, c] Categories (3, object): [a < b < c] + An array of tuples + + >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')]) + array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) + See Also -------- pandas.Index.unique diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 351e646cbb0b2..063dcea5c76d6 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -929,6 +929,22 @@ def test_unique_index(self): tm.assert_numpy_array_equal(case.duplicated(), np.array([False, False, False])) + @pytest.mark.parametrize('arr, unique', [ + ([(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)], + [(0, 0), (0, 1), (1, 0), (1, 1)]), + ([('b', 'c'), ('a', 'b'), ('a', 'b'), ('b', 'c')], + [('b', 'c'), ('a', 'b')]), + ([('a', 1), ('b', 2), ('a', 3), ('a', 1)], + [('a', 1), ('b', 2), ('a', 3)]), + ]) + def test_unique_tuples(self, arr, unique): + # https://github.com/pandas-dev/pandas/issues/16519 + expected = np.empty(len(unique), dtype=object) + expected[:] = unique + + result = pd.unique(arr) + tm.assert_numpy_array_equal(result, expected) + class GroupVarTestMixin(object): From 6d761b4ed5fdbc5171fec5eb81d7b29b2bc658dc Mon Sep 17 00:00:00 2001 From: Patrick O'Melveny Date: Thu, 1 Jun 2017 03:35:18 -0700 Subject: [PATCH 18/55] BUG: Allow non-callable attributes in aggregate function. Fixes GH16405 (#16458) --- doc/source/whatsnew/v0.20.2.txt | 1 + pandas/core/base.py | 12 +++++++-- pandas/tests/frame/test_apply.py | 45 +++++++++++++++++++++++++++++++ pandas/tests/series/test_apply.py | 16 +++++++++++ 4 files changed, 72 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 31df5899f0fc3..c8b6dfa134120 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -102,6 +102,7 @@ Reshaping - Bug in ``pd.wide_to_long()`` where no error was raised when ``i`` was not a unique identifier (:issue:`16382`) - Bug in ``Series.isin(..)`` with a list of tuples (:issue:`16394`) - Bug in construction of a ``DataFrame`` with mixed dtypes including an all-NaT column. 
(:issue:`16395`)
+- Bug in ``DataFrame.agg()`` and ``Series.agg()`` when aggregating on non-callable attributes (:issue:`16405`)


 Numeric
diff --git a/pandas/core/base.py b/pandas/core/base.py
index a3ef24c80f883..97c4c8626dcbb 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -378,7 +378,7 @@ def aggregate(self, func, *args, **kwargs):
 def _try_aggregate_string_function(self, arg, *args, **kwargs):
 """
 if arg is a string, then try to operate on it:
- - try to find a function on ourselves
+ - try to find a function (or attribute) on ourselves
 - try to find a numpy function
 - raise
@@ -387,7 +387,15 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs):
 f = getattr(self, arg, None)
 if f is not None:
- return f(*args, **kwargs)
+ if callable(f):
+ return f(*args, **kwargs)
+
+ # people may try to aggregate on a non-callable attribute
+ # but don't let them think they can pass args to it
+ assert len(args) == 0
+ assert len([kwarg for kwarg in kwargs
+ if kwarg not in ['axis', '_level']]) == 0
+ return f

 f = getattr(np, arg, None)
 if f is not None:
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
index aa7c7a7120c1b..a6f39cabb60ed 100644
--- a/pandas/tests/frame/test_apply.py
+++ b/pandas/tests/frame/test_apply.py
@@ -635,3 +635,48 @@ def test_nuiscance_columns(self):
 expected = DataFrame([[6, 6., 'foobarbaz']],
 index=['sum'],
 columns=['A', 'B', 'C'])
 assert_frame_equal(result, expected)
+
+ def test_non_callable_aggregates(self):
+
+ # GH 16405
+ # 'size' is a property of frame/series
+ # validate that this is working
+ df = DataFrame({'A': [None, 2, 3],
+ 'B': [1.0, np.nan, 3.0],
+ 'C': ['foo', None, 'bar']})
+
+ # Function aggregate
+ result = df.agg({'A': 'count'})
+ expected = pd.Series({'A': 2})
+
+ assert_series_equal(result, expected)
+
+ # Non-function aggregate
+ result = df.agg({'A': 'size'})
+ expected = pd.Series({'A': 3})
+
+ assert_series_equal(result, expected)
+
+ # Mix function and non-function aggs
+ result1 = df.agg(['count', 'size'])
+ result2 = df.agg({'A': ['count', 'size'],
+ 'B': ['count', 'size'],
+ 'C': ['count', 'size']})
+ expected = pd.DataFrame({'A': {'count': 2, 'size': 3},
+ 'B': {'count': 2, 'size': 3},
+ 'C': {'count': 2, 'size': 3}})
+
+ assert_frame_equal(result1, result2, check_like=True)
+ assert_frame_equal(result2, expected, check_like=True)
+
+ # Just a function-name string arg is the same as calling df.arg()
+ result = df.agg('count')
+ expected = df.count()
+
+ assert_series_equal(result, expected)
+
+ # Just an attribute string arg is the same as accessing df.arg
+ result = df.agg('size')
+ expected = df.size
+
+ assert result == expected
diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py
index c273d3161fff5..2c5f0d7772cc2 100644
--- a/pandas/tests/series/test_apply.py
+++ b/pandas/tests/series/test_apply.py
@@ -306,6 +306,22 @@ def test_reduce(self):
 name=self.series.name)
 assert_series_equal(result, expected)

+ def test_non_callable_aggregates(self):
+ # test agg using non-callable series attributes
+ s = Series([1, 2, None])
+
+ # Calling agg w/ just a string arg is the same as calling s.arg
+ result = s.agg('size')
+ expected = s.size
+ assert result == expected
+
+ # test when mixed w/ callable reducers
+ result = s.agg(['size', 'count', 'mean'])
+ expected = Series(OrderedDict([('size', 3.0),
+ ('count', 2.0),
+ ('mean', 1.5)]))
+ assert_series_equal(result[expected.index], expected)
+

class TestSeriesMap(TestData):

From ed542ee91547eea11e6c8fd1c511fa4b67088543 Mon Sep 17 00:00:00 2001 
From: Tom Augspurger Date: Thu, 1 Jun 2017 05:37:00 -0500 Subject: [PATCH 19/55] Strictly monotonic (#16555) --- doc/source/api.rst | 2 + doc/source/whatsnew/v0.20.2.txt | 3 +- pandas/core/indexes/base.py | 50 +++++++++++++++++++ pandas/core/indexes/datetimes.py | 2 +- .../tests/indexes/datetimes/test_datetime.py | 7 +++ pandas/tests/indexes/test_base.py | 6 ++- pandas/tests/indexes/test_multi.py | 26 ++++++++++ pandas/tests/indexes/test_numeric.py | 22 +++++++- pandas/tests/indexes/test_range.py | 10 ++++ 9 files changed, 124 insertions(+), 4 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 350abb00f0849..cdb6e36870f24 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1287,6 +1287,8 @@ Attributes Index.is_monotonic Index.is_monotonic_increasing Index.is_monotonic_decreasing + Index.is_strictly_monotonic_increasing + Index.is_strictly_monotonic_decreasing Index.is_unique Index.has_duplicates Index.dtype diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index c8b6dfa134120..e3328e2d01dc7 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -21,6 +21,7 @@ Enhancements - Unblocked access to additional compression types supported in pytables: 'blosc:blosclz, 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`) - ``Series`` provides a ``to_latex`` method (:issue:`16180`) +- Added :attr:`Index.is_strictly_monotonic_increasing` and :attr:`Index.is_strictly_monotonic_decreasing` properties (:issue:`16515`) .. _whatsnew_0202.performance: @@ -61,7 +62,7 @@ Indexing ^^^^^^^^ - Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`) - +- Bug in partial string indexing with a monotonic, but not strictly-monotonic, index incorrectly reversing the slice bounds (:issue:`16515`) I/O ^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e1e08e008f782..e8c2043138edb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1191,6 +1191,15 @@ def is_monotonic_increasing(self): """ return if the index is monotonic increasing (only equal or increasing) values. + + Examples + -------- + >>> Index([1, 2, 3]).is_monotonic_increasing + True + >>> Index([1, 2, 2]).is_monotonic_increasing + True + >>> Index([1, 3, 2]).is_monotonic_increasing + False """ return self._engine.is_monotonic_increasing @@ -1199,9 +1208,50 @@ def is_monotonic_decreasing(self): """ return if the index is monotonic decreasing (only equal or decreasing) values. 
+ + Examples + -------- + >>> Index([3, 2, 1]).is_monotonic_decreasing + True + >>> Index([3, 2, 2]).is_monotonic_decreasing + True + >>> Index([3, 1, 2]).is_monotonic_decreasing + False """ return self._engine.is_monotonic_decreasing + @property + def is_strictly_monotonic_increasing(self): + """return if the index is strictly monotonic increasing + (only increasing) values + + Examples + -------- + >>> Index([1, 2, 3]).is_strictly_monotonic_increasing + True + >>> Index([1, 2, 2]).is_strictly_monotonic_increasing + False + >>> Index([1, 3, 2]).is_strictly_monotonic_increasing + False + """ + return self.is_unique and self.is_monotonic_increasing + + @property + def is_strictly_monotonic_decreasing(self): + """return if the index is strictly monotonic decreasing + (only decreasing) values + + Examples + -------- + >>> Index([3, 2, 1]).is_strictly_monotonic_decreasing + True + >>> Index([3, 2, 2]).is_strictly_monotonic_decreasing + False + >>> Index([3, 1, 2]).is_strictly_monotonic_decreasing + False + """ + return self.is_unique and self.is_monotonic_decreasing + def is_lexsorted_for_tuple(self, tup): return True diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ec678b1577d81..60560374cd420 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1472,7 +1472,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): # the bounds need swapped if index is reverse sorted and has a # length > 1 (is_monotonic_decreasing gives True for empty # and length 1 index) - if self.is_monotonic_decreasing and len(self) > 1: + if self.is_strictly_monotonic_decreasing and len(self) > 1: return upper if side == 'left' else lower return lower if side == 'left' else upper else: diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 6cba7e17abf8e..f99dcee9e5c8a 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -771,3 +771,10 @@ def test_slice_bounds_empty(self): left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc') exp = Timestamp('2015-01-02 00:00:00') assert left == exp + + def test_slice_duplicate_monotonic(self): + # https://github.com/pandas-dev/pandas/issues/16515 + idx = pd.DatetimeIndex(['2017', '2017']) + result = idx._maybe_cast_slice_bound('2017-01-01', 'left', 'loc') + expected = Timestamp('2017-01-01') + assert result == expected diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 02561cba784b8..a6933316e4291 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1328,8 +1328,10 @@ def test_tuple_union_bug(self): def test_is_monotonic_incomparable(self): index = Index([5, datetime.now(), 7]) - assert not index.is_monotonic + assert not index.is_monotonic_increasing assert not index.is_monotonic_decreasing + assert not index.is_strictly_monotonic_increasing + assert not index.is_strictly_monotonic_decreasing def test_get_set_value(self): values = np.random.randn(100) @@ -2028,6 +2030,8 @@ def test_is_monotonic_na(self): for index in examples: assert not index.is_monotonic_increasing assert not index.is_monotonic_decreasing + assert not index.is_strictly_monotonic_increasing + assert not index.is_strictly_monotonic_decreasing def test_repr_summary(self): with cf.option_context('display.max_seq_items', 10): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 
1fe4d85815c4b..388a49d25cb82 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -2373,22 +2373,30 @@ def test_is_monotonic(self):
        i = MultiIndex.from_product([np.arange(10),
                                     np.arange(10)], names=['one', 'two'])
        assert i.is_monotonic
+       assert i.is_strictly_monotonic_increasing
        assert Index(i.values).is_monotonic
+       assert Index(i.values).is_strictly_monotonic_increasing

        i = MultiIndex.from_product([np.arange(10, 0, -1),
                                     np.arange(10)], names=['one', 'two'])
        assert not i.is_monotonic
+       assert not i.is_strictly_monotonic_increasing
        assert not Index(i.values).is_monotonic
+       assert not Index(i.values).is_strictly_monotonic_increasing

        i = MultiIndex.from_product([np.arange(10),
                                     np.arange(10, 0, -1)],
                                    names=['one', 'two'])
        assert not i.is_monotonic
+       assert not i.is_strictly_monotonic_increasing
        assert not Index(i.values).is_monotonic
+       assert not Index(i.values).is_strictly_monotonic_increasing

        i = MultiIndex.from_product([[1.0, np.nan, 2.0], ['a', 'b', 'c']])
        assert not i.is_monotonic
+       assert not i.is_strictly_monotonic_increasing
        assert not Index(i.values).is_monotonic
+       assert not Index(i.values).is_strictly_monotonic_increasing

        # string ordering
        i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
@@ -2398,6 +2406,8 @@ def test_is_monotonic(self):
                       names=['first', 'second'])
        assert not i.is_monotonic
        assert not Index(i.values).is_monotonic
+       assert not i.is_strictly_monotonic_increasing
+       assert not Index(i.values).is_strictly_monotonic_increasing

        i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'],
                               ['mom', 'next', 'zenith']],
@@ -2406,6 +2416,8 @@ def test_is_monotonic(self):
                       names=['first', 'second'])
        assert i.is_monotonic
        assert Index(i.values).is_monotonic
+       assert i.is_strictly_monotonic_increasing
+       assert Index(i.values).is_strictly_monotonic_increasing

        # mixed levels, hits the TypeError
        i = MultiIndex(
@@ -2416,6 +2428,20 @@ def test_is_monotonic(self):
            names=['household_id', 'asset_id'])

        assert not i.is_monotonic
+       assert not i.is_strictly_monotonic_increasing
+
+   def test_is_strictly_monotonic(self):
+       idx = pd.MultiIndex(levels=[['bar', 'baz'], ['mom', 'next']],
+                           labels=[[0, 0, 1, 1], [0, 0, 0, 1]])
+       assert idx.is_monotonic_increasing
+       assert not idx.is_strictly_monotonic_increasing
+
+   @pytest.mark.xfail(reason="buggy MultiIndex.is_monotonic_decreasing.")
+   def test_is_strictly_monotonic_decreasing(self):
+       idx = pd.MultiIndex(levels=[['baz', 'bar'], ['next', 'mom']],
+                           labels=[[0, 0, 1, 1], [0, 0, 0, 1]])
+       assert idx.is_monotonic_decreasing
+       assert not idx.is_strictly_monotonic_decreasing

    def test_reconstruct_sort(self):

diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index 3d06f1672ae32..77f34dbf210e0 100644
--- a/pandas/tests/indexes/test_numeric.py
+++ b/pandas/tests/indexes/test_numeric.py
@@ -465,16 +465,36 @@ def test_view(self):

    def test_is_monotonic(self):
        assert self.index.is_monotonic
        assert self.index.is_monotonic_increasing
+       assert self.index.is_strictly_monotonic_increasing
        assert not self.index.is_monotonic_decreasing
+       assert not self.index.is_strictly_monotonic_decreasing

        index = self._holder([4, 3, 2, 1])
        assert not index.is_monotonic
-       assert index.is_monotonic_decreasing
+       assert not index.is_strictly_monotonic_increasing
+       assert index.is_strictly_monotonic_decreasing

        index = self._holder([1])
        assert index.is_monotonic
        assert index.is_monotonic_increasing
        assert index.is_monotonic_decreasing
+       assert index.is_strictly_monotonic_increasing
+       assert index.is_strictly_monotonic_decreasing
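
Note: the strict variants exercised in these tests are defined in the base.py
hunk of this patch as uniqueness plus weak monotonicity, so the two families
of properties only diverge on an index that contains repeated values. A
minimal illustration against this branch's API (an editorial sketch, not part
of the patch itself):

    >>> from pandas import Index
    >>> Index([1, 2, 2]).is_monotonic_increasing
    True
    >>> Index([1, 2, 2]).is_strictly_monotonic_increasing
    False
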
+ + def test_is_strictly_monotonic(self): + index = self._holder([1, 1, 2, 3]) + assert index.is_monotonic_increasing + assert not index.is_strictly_monotonic_increasing + + index = self._holder([3, 2, 1, 1]) + assert index.is_monotonic_decreasing + assert not index.is_strictly_monotonic_decreasing + + index = self._holder([1, 1]) + assert index.is_monotonic_increasing + assert index.is_monotonic_decreasing + assert not index.is_strictly_monotonic_increasing + assert not index.is_strictly_monotonic_decreasing def test_logical_compat(self): idx = self.create_index() diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index c7af0954cf483..db8180cb736c4 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -331,25 +331,35 @@ def test_is_monotonic(self): assert self.index.is_monotonic assert self.index.is_monotonic_increasing assert not self.index.is_monotonic_decreasing + assert self.index.is_strictly_monotonic_increasing + assert not self.index.is_strictly_monotonic_decreasing index = RangeIndex(4, 0, -1) assert not index.is_monotonic + assert not index.is_strictly_monotonic_increasing assert index.is_monotonic_decreasing + assert index.is_strictly_monotonic_decreasing index = RangeIndex(1, 2) assert index.is_monotonic assert index.is_monotonic_increasing assert index.is_monotonic_decreasing + assert index.is_strictly_monotonic_increasing + assert index.is_strictly_monotonic_decreasing index = RangeIndex(2, 1) assert index.is_monotonic assert index.is_monotonic_increasing assert index.is_monotonic_decreasing + assert index.is_strictly_monotonic_increasing + assert index.is_strictly_monotonic_decreasing index = RangeIndex(1, 1) assert index.is_monotonic assert index.is_monotonic_increasing assert index.is_monotonic_decreasing + assert index.is_strictly_monotonic_increasing + assert index.is_strictly_monotonic_decreasing def test_equals_range(self): equiv_pairs = [(RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)), From f92ec386ea9784177598c937da47bee4cf4c5204 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 1 Jun 2017 06:38:50 -0400 Subject: [PATCH 20/55] COMPAT: Consider Python 2.x tarfiles file-like (#16533) --- doc/source/whatsnew/v0.20.2.txt | 1 + pandas/core/dtypes/inference.py | 2 +- pandas/io/parsers.py | 29 ++++++++++++--- pandas/tests/dtypes/test_inference.py | 4 +- pandas/tests/io/parser/c_parser_only.py | 36 ++++++++++++++++++ pandas/tests/io/parser/data/tar_csv.tar | Bin 0 -> 10240 bytes pandas/tests/io/parser/data/tar_csv.tar.gz | Bin 0 -> 10240 bytes pandas/tests/io/parser/test_unsupported.py | 41 ++++++++++++++++----- setup.py | 2 + 9 files changed, 98 insertions(+), 17 deletions(-) create mode 100644 pandas/tests/io/parser/data/tar_csv.tar create mode 100644 pandas/tests/io/parser/data/tar_csv.tar.gz diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index e3328e2d01dc7..e309ac0a79e4b 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -70,6 +70,7 @@ I/O - Bug in pd.read_csv() when comment is passed in space deliminted text files (:issue:`16472`) - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`) +- Bug in ``pd.read_csv()`` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`) - Bug in ``HDFStore.select_as_multiple()`` where start/stop 
arguments were not respected (:issue:`16209`) diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index a5316a83612cb..ff7e215951a1f 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -171,7 +171,7 @@ def is_file_like(obj): if not (hasattr(obj, 'read') or hasattr(obj, 'write')): return False - if not is_iterator(obj): + if not hasattr(obj, "__iter__"): return False return True diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 12b606d969c7d..aab70c8ce2cd4 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -13,7 +13,7 @@ import numpy as np from pandas import compat -from pandas.compat import (range, lrange, StringIO, lzip, +from pandas.compat import (range, lrange, PY3, StringIO, lzip, zip, string_types, map, u) from pandas.core.dtypes.common import ( is_integer, _ensure_object, @@ -31,10 +31,10 @@ from pandas.core.common import AbstractMethodError from pandas.io.date_converters import generic_parser from pandas.errors import ParserWarning, ParserError, EmptyDataError -from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg, - _get_handle, UnicodeReader, UTF8Recoder, - BaseIterator, - _NA_VALUES, _infer_compression) +from pandas.io.common import (get_filepath_or_buffer, is_file_like, + _validate_header_arg, _get_handle, + UnicodeReader, UTF8Recoder, _NA_VALUES, + BaseIterator, _infer_compression) from pandas.core.tools import datetimes as tools from pandas.util._decorators import Appender @@ -755,7 +755,9 @@ def __init__(self, f, engine=None, **kwds): self.squeeze = options.pop('squeeze', False) # might mutate self.engine + self.engine = self._check_file_or_buffer(f, engine) self.options, self.engine = self._clean_options(options, engine) + if 'has_index_names' in kwds: self.options['has_index_names'] = kwds['has_index_names'] @@ -801,6 +803,23 @@ def _get_options_with_defaults(self, engine): return options + def _check_file_or_buffer(self, f, engine): + # see gh-16530 + if is_file_like(f): + next_attr = "__next__" if PY3 else "next" + + # The C engine doesn't need the file-like to have the "next" or + # "__next__" attribute. However, the Python engine explicitly calls + # "next(...)" when iterating through such an object, meaning it + # needs to have that attribute ("next" for Python 2.x, "__next__" + # for Python 3.x) + if engine != "c" and not hasattr(f, next_attr): + msg = ("The 'python' engine cannot iterate " + "through this file buffer.") + raise ValueError(msg) + + return engine + def _clean_options(self, options, engine): result = options.copy() diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b88481abcb2ec..ec5fe45d7f610 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -120,9 +120,9 @@ class MockFile(object): m = MockFile() assert not is_file(m) + # gh-16530: Valid iterator just means we have the + # __iter__ attribute for our purposes. MockFile.__iter__ = lambda self: self - MockFile.__next__ = lambda self: 0 - MockFile.next = MockFile.__next__ # Valid write-only file m = MockFile() diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index 56ac10404b7b2..48812c04e3b55 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -7,7 +7,9 @@ further arguments when parsing. 
""" +import os import sys +import tarfile import pytest import numpy as np @@ -446,3 +448,37 @@ def test_comment_whitespace_delimited(self): [7, np.nan], [8, np.nan]]) tm.assert_frame_equal(df, expected) + + def test_file_like_no_next(self): + # gh-16530: the file-like need not have a "next" or "__next__" + # attribute despite having an "__iter__" attribute. + # + # NOTE: This is only true for the C engine, not Python engine. + class NoNextBuffer(StringIO): + def __next__(self): + raise AttributeError("No next method") + + next = __next__ + + data = "a\n1" + + expected = pd.DataFrame({"a": [1]}) + result = self.read_csv(NoNextBuffer(data)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"]) + def test_read_tarfile(self, tar_suffix): + # see gh-16530 + # + # Unfortunately, Python's CSV library can't handle + # tarfile objects (expects string, not bytes when + # iterating through a file-like). + tar_path = os.path.join(self.dirpath, "tar_csv" + tar_suffix) + + tar = tarfile.open(tar_path, "r") + data_file = tar.extractfile("tar_data.csv") + + out = self.read_csv(data_file) + expected = pd.DataFrame({"a": [1]}) + tm.assert_frame_equal(out, expected) diff --git a/pandas/tests/io/parser/data/tar_csv.tar b/pandas/tests/io/parser/data/tar_csv.tar new file mode 100644 index 0000000000000000000000000000000000000000..d1819550e0a0064b4d9ad829f120e49760c3ffe2 GIT binary patch literal 10240 zcmeIuK?;O03_#JW1@F)k3{BNsM}nR}J9B-TY7p4K!(9_GO$s`)68z@>0YS zW+wk!;+jjzL^~}framQ!s=W;o;!FQIwf(Nymk>_1JC}X6!*X|eRCwcUqis`RFe4E_ x009ILKmY**5I_I{1Q0*~0R#|0009ILKmY**5I_I{1Q0*~0R#|0009IZ32f(E6h{C6 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/parser/data/tar_csv.tar.gz b/pandas/tests/io/parser/data/tar_csv.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..b5a0f3e1b580535a3fbdc2ff943b79d8c585df9f GIT binary patch literal 10240 zcmeIu%?W@o41m#`1$XEK(ok*3k)RW3b$+WS^{9xKFPG3j^YgMz{b<>mVP55<@Fil5 zvgZ=_TuM;(Z{IR;yy82--BN0FV w0R#|0009ILKmY**5I_I{1Q0*~0R#|0009ILKmY**5I_I{1Q0*~0R&zNY&Ss^RsaA1 literal 0 HcmV?d00001 diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 3f62ff44531fb..5d248f2fef59c 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -16,6 +16,13 @@ from pandas.errors import ParserError from pandas.io.parsers import read_csv, read_table +import pytest + + +@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val) +def python_engine(request): + return request.param + class TestUnsupportedFeatures(object): @@ -82,7 +89,7 @@ def test_c_engine(self): with tm.assert_raises_regex(ValueError, msg): read_csv(StringIO(data), lineterminator='~~') - def test_python_engine(self): + def test_python_engine(self, python_engine): from pandas.io.parsers import _python_unsupported as py_unsupported data = """1,2,3,, @@ -90,16 +97,32 @@ def test_python_engine(self): 1,2,3,4,5 1,2,,, 1,2,3,4,""" - engines = 'python', 'python-fwf' - for engine in engines: - for default in py_unsupported: - msg = ('The %r option is not supported ' - 'with the %r engine' % (default, engine)) + for default in py_unsupported: + msg = ('The %r option is not supported ' + 'with the %r engine' % (default, python_engine)) + + kwargs = {default: object()} + with tm.assert_raises_regex(ValueError, msg): + read_csv(StringIO(data), engine=python_engine, **kwargs) - kwargs = {default: object()} - with tm.assert_raises_regex(ValueError, msg): - 
read_csv(StringIO(data), engine=engine, **kwargs) + def test_python_engine_file_no_next(self, python_engine): + # see gh-16530 + class NoNextBuffer(object): + def __init__(self, csv_data): + self.data = csv_data + + def __iter__(self): + return self + + def read(self): + return self.data + + data = "a\n1" + msg = "The 'python' engine cannot iterate" + + with tm.assert_raises_regex(ValueError, msg): + read_csv(NoNextBuffer(data), engine=python_engine) class TestDeprecatedFeatures(object): diff --git a/setup.py b/setup.py index 82d5f407228a9..31a3cddc3f9fd 100755 --- a/setup.py +++ b/setup.py @@ -702,6 +702,8 @@ def pxd(name): 'parser/data/*.gz', 'parser/data/*.bz2', 'parser/data/*.txt', + 'parser/data/*.tar', + 'parser/data/*.tar.gz', 'sas/data/*.csv', 'sas/data/*.xpt', 'sas/data/*.sas7bdat', From 3f70fda0183f860955d098dd637fbc9125dc92ec Mon Sep 17 00:00:00 2001 From: Christian Prinoth Date: Thu, 1 Jun 2017 06:50:27 -0400 Subject: [PATCH 21/55] BUG: Fixed to_html ignoring index_names parameter closes #16493 Author: Christian Prinoth Author: Tom Augspurger Author: Christian Prinoth Author: Jeff Reback This patch had conflicts when merged, resolved by Committer: Jeff Reback Closes #16495 from CRP/bugfix_16493 and squashes the following commits: 567ae69 [Jeff Reback] doc corrections 8429f9a [Tom Augspurger] Fixed lint error 469a0e6 [Christian Prinoth] BUG: fix for bug 16493 20d512f [Christian Prinoth] BUG: fix for bug 16493 6bef829 [Christian Prinoth] BUG: fix for bug 16493 426565e [Christian Prinoth] BUG: fix for bug 16493 a40820d [Christian Prinoth] BUG: fix for bug 16493 --- doc/source/whatsnew/v0.20.2.txt | 1 + pandas/io/formats/format.py | 4 +++- pandas/tests/io/formats/test_to_html.py | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index e309ac0a79e4b..e918bc4fccfca 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -71,6 +71,7 @@ I/O - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`) - Bug in ``pd.read_csv()`` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`) +- Bug where ``DataFrame.to_html()`` ignored the ``index_names`` parameter (:issue:`16493`) - Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 054db769c56dd..3deaec2dfbbc5 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1292,7 +1292,9 @@ def _column_header(): self.write_tr(col_row, indent, self.indent_delta, header=True, align=align) - if self.fmt.has_index_names and self.fmt.index: + if all((self.fmt.has_index_names, + self.fmt.index, + self.fmt.show_index_names)): row = ([x if x is not None else '' for x in self.frame.index.names] + [''] * min(len(self.columns), self.max_cols)) diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index cde920b1511d2..9f4e532ec2287 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -1869,3 +1869,10 @@ def test_to_html_notebook_has_no_style(self): df = pd.DataFrame({"A": [1, 2, 3]}) result = df.to_html() assert "thead tr:only-child" not in result + + def test_to_html_with_index_names_false(self): + # gh-16493 
+        df = pd.DataFrame({"A": [1, 2]}, index=pd.Index(['a', 'b'],
+                                                        name='myindexname'))
+        result = df.to_html(index_names=False)
+        assert 'myindexname' not in result

From 785887a2e27b92d48b5060828ed2f49b7992024d Mon Sep 17 00:00:00 2001
From: economy 
Date: Thu, 1 Jun 2017 06:56:20 -0400
Subject: [PATCH 22/55] BUG: fixed wrong order of ordered labels in pd.cut()

closes #16459

Author: economy

This patch had conflicts when merged, resolved by
Committer: Jeff Reback 

Closes #16466 from economy/fix_cut and squashes the following commits:

29128b3 [economy] comments and whatsnew edits
3898b72 [economy] BUG: fixed wrong order of ordered labels in pd.cut()
---
 doc/source/whatsnew/v0.20.2.txt   | 4 +---
 pandas/core/reshape/tile.py       | 2 +-
 pandas/tests/reshape/test_tile.py | 8 ++++++++
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index e918bc4fccfca..379249b6e55d6 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -46,11 +46,9 @@ Bug Fixes
 - Passing an invalid engine to :func:`read_csv` now raises an informative
   ``ValueError`` rather than ``UnboundLocalError``. (:issue:`16511`)
 - Bug in :func:`unique` on an array of tuples (:issue:`16519`)
-
-
+- Bug in :func:`cut` when ``labels`` are set, resulting in incorrect label ordering (:issue:`16459`)
 - Fixed a compatibility issue with IPython 6.0's tab completion showing deprecation warnings on Categoricals (:issue:`16409`)
-
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
index 866f229bec418..d8398023a5083 100644
--- a/pandas/core/reshape/tile.py
+++ b/pandas/core/reshape/tile.py
@@ -254,7 +254,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None,
             raise ValueError('Bin labels must be one fewer than '
                              'the number of bin edges')
         if not is_categorical_dtype(labels):
-            labels = Categorical(labels, ordered=True)
+            labels = Categorical(labels, categories=labels, ordered=True)
 
         np.putmask(ids, na_mask, 0)
         result = algos.take_nd(labels, ids - 1)
diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py
index 8602b33856fea..542af321632cf 100644
--- a/pandas/tests/reshape/test_tile.py
+++ b/pandas/tests/reshape/test_tile.py
@@ -211,6 +211,7 @@ def test_cut_pass_labels(self):
 
         result = cut(arr, bins, labels=labels)
         exp = Categorical(['Medium'] + 4 * ['Small'] + ['Medium', 'Large'],
+                          categories=labels,
                           ordered=True)
         tm.assert_categorical_equal(result, exp)
 
@@ -219,6 +220,13 @@ def test_cut_pass_labels(self):
         exp = Categorical.from_codes([1] + 4 * [0] + [1, 2], labels)
         tm.assert_categorical_equal(result, exp)
 
+        # issue 16459
+        labels = ['Good', 'Medium', 'Bad']
+        result = cut(arr, 3, labels=labels)
+        exp = cut(arr, 3, labels=Categorical(labels, categories=labels,
+                                             ordered=True))
+        tm.assert_categorical_equal(result, exp)
+
     def test_qcut_include_lowest(self):
         values = np.arange(10)
 
From 746c3cbe34330667f3ca85320796a72161bcde37 Mon Sep 17 00:00:00 2001
From: Jeff Reback 
Date: Thu, 1 Jun 2017 07:37:15 -0400
Subject: [PATCH 23/55] fix linting

---
 pandas/tests/reshape/test_tile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py
index 542af321632cf..2523f8ab9f776 100644
--- a/pandas/tests/reshape/test_tile.py
+++ b/pandas/tests/reshape/test_tile.py
@@ -220,7 +220,7 @@ def test_cut_pass_labels(self):
         exp = Categorical.from_codes([1] + 4 * [0] + [1, 2], labels)
         tm.assert_categorical_equal(result, exp)
 
-        # 
issue 16459 + # issue 16459 labels = ['Good', 'Medium', 'Bad'] result = cut(arr, 3, labels=labels) exp = cut(arr, 3, labels=Categorical(labels, categories=labels, From 885522aea70ff73418dab592fbd00d3eaecf36ee Mon Sep 17 00:00:00 2001 From: Hugues Valois Date: Thu, 1 Jun 2017 12:31:52 -0700 Subject: [PATCH 24/55] TST: writing invalid table names to sqlite (#16464) * Add test for bug #13206. * Improve test by reading back the values from sql and comparing. Also fixes coding style violation. --- pandas/tests/io/test_sql.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 7b3717281bf89..a6ad44ba31422 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -816,6 +816,16 @@ def test_unicode_column_name(self): df = DataFrame([[1, 2], [3, 4]], columns=[u'\xe9', u'b']) df.to_sql('test_unicode', self.conn, index=False) + def test_escaped_table_name(self): + # GH 13206 + df = DataFrame({'A': [0, 1, 2], 'B': [0.2, np.nan, 5.6]}) + df.to_sql('d1187b08-4943-4c8d-a7f6', self.conn, index=False) + + res = sql.read_sql_query('SELECT * FROM `d1187b08-4943-4c8d-a7f6`', + self.conn) + + tm.assert_frame_equal(res, df) + @pytest.mark.single class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): From 79beeb6bb8618602ce712f7a6994e283de0e0ade Mon Sep 17 00:00:00 2001 From: "John W. O'Brien" Date: Thu, 1 Jun 2017 15:50:37 -0400 Subject: [PATCH 25/55] TST: Skip test_database_uri_string if pg8000 importable (#16528) --- pandas/tests/io/test_sql.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a6ad44ba31422..deeb8cba2b228 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -948,6 +948,13 @@ def test_database_uri_string(self): # using driver that will not be installed on Travis to trigger error # in sqlalchemy.create_engine -> test passing of this error to user + try: + # the rest of this test depends on pg8000's being absent + import pg8000 # noqa + pytest.skip("pg8000 is installed") + except ImportError: + pass + db_uri = "postgresql+pg8000://user:pass@host/dbname" with tm.assert_raises_regex(ImportError, "pg8000"): sql.read_sql("select * from table", db_uri) From a7c95f2bc8618c0a07d3f8d593e170017791b5de Mon Sep 17 00:00:00 2001 From: kiwirob Date: Thu, 1 Jun 2017 20:59:44 +0100 Subject: [PATCH 26/55] DOC: Remove incorrect elements of PeriodIndex docstring (#16553) * DOC: Remove incorrect elements of PeriodIndex docstring See #9056. * Removed trailing space --- pandas/core/indexes/period.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 15fd9b7dc2b6a..f8af6c8303d99 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -125,15 +125,7 @@ def _new_PeriodIndex(cls, **d): class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray holding ordinal values indicating regular periods in - time such as particular years, quarters, months, etc. A value of 1 is the - period containing the Gregorian proleptic datetime Jan 1, 0001 00:00:00. - This ordinal representation is from the scikits.timeseries project. - - For instance, - # construct period for day 1/1/1 and get the first second - i = Period(year=1,month=1,day=1,freq='D').asfreq('S', 'S') - i.ordinal - ===> 1 + time such as particular years, quarters, months, etc. 
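
Note: the example removed above was incorrect (see the discussion in #9056),
because Period ordinals in pandas are anchored at the Unix epoch rather than
at year 1. A quick check of that claim (an editorial sketch, assuming the
pandas API of this era, not part of the patch itself):

    >>> import pandas as pd
    >>> pd.Period(year=1970, month=1, day=1, freq='D').ordinal
    0
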
Index keys are boxed to Period objects which carries the metadata (eg, frequency information). From 50479ae90a92edf89da4589bd5402660dfc3a69c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 1 Jun 2017 17:09:52 -0500 Subject: [PATCH 27/55] TST: Make HDF5 fspath write test robust (#16575) The test_write_fspath_all test would fail on the HDF5 example occasionally (about 1/100 in my experience). Apparently you don't get an identical HDF5 every single time. This refactors that test out to its own where we write and read both versions, and compare equality that way. --- pandas/tests/io/test_common.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 289f86eb2dc53..b527e3c5dc254 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -143,7 +143,6 @@ def test_read_fspath_all(self, reader, module, path): ('to_csv', {}, 'os'), ('to_excel', {'engine': 'xlwt'}, 'xlwt'), ('to_feather', {}, 'feather'), - ('to_hdf', {'key': 'bar', 'mode': 'w'}, 'tables'), ('to_html', {}, 'os'), ('to_json', {}, 'os'), ('to_latex', {}, 'os'), @@ -171,6 +170,26 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): assert result == expected + def test_write_fspath_hdf5(self): + # Same test as write_fspath_all, except HDF5 files aren't + # necessarily byte-for-byte identical for a given dataframe, so we'll + # have to read and compare equality + pytest.importorskip('tables') + + df = pd.DataFrame({"A": [1, 2]}) + p1 = tm.ensure_clean('string') + p2 = tm.ensure_clean('fspath') + + with p1 as string, p2 as fspath: + mypath = CustomFSPath(fspath) + df.to_hdf(mypath, key='bar') + df.to_hdf(string, key='bar') + + result = pd.read_hdf(fspath, key='bar') + expected = pd.read_hdf(string, key='bar') + + tm.assert_frame_equal(result, expected) + class TestMMapWrapper(object): From b8ca9fcdcbccac2fe41d144134977d8ae95ce1ba Mon Sep 17 00:00:00 2001 From: DSM Date: Thu, 1 Jun 2017 18:12:14 -0400 Subject: [PATCH 28/55] ENH: add .ngroup() method to groupby objects (#14026) (#14026) --- doc/source/api.rst | 1 + doc/source/groupby.rst | 63 +++++++- doc/source/reshaping.rst | 2 +- doc/source/whatsnew/v0.20.2.txt | 5 + pandas/core/groupby.py | 75 +++++++++- pandas/tests/groupby/test_counting.py | 197 +++++++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 54 ------- pandas/tests/groupby/test_whitelist.py | 4 +- 8 files changed, 338 insertions(+), 63 deletions(-) create mode 100644 pandas/tests/groupby/test_counting.py diff --git a/doc/source/api.rst b/doc/source/api.rst index cdb6e36870f24..04b952a99e8f7 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1708,6 +1708,7 @@ Computations / Descriptive Stats GroupBy.mean GroupBy.median GroupBy.min + GroupBy.ngroup GroupBy.nth GroupBy.ohlc GroupBy.prod diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index cf4f1059ae17a..865f1ccae2c04 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1122,12 +1122,36 @@ To see the order in which each row appears within its group, use the .. ipython:: python - df = pd.DataFrame(list('aaabba'), columns=['A']) - df + dfg = pd.DataFrame(list('aaabba'), columns=['A']) + dfg + + dfg.groupby('A').cumcount() + + dfg.groupby('A').cumcount(ascending=False) + +.. _groupby.ngroup: + +Enumerate groups +~~~~~~~~~~~~~~~~ + +.. 
versionadded:: 0.20.2
+
+To see the ordering of the groups (as opposed to the order of rows
+within a group given by ``cumcount``) you can use the ``ngroup``
+method.
+
+Note that the numbers given to the groups match the order in which the
+groups would be seen when iterating over the groupby object, not the
+order they are first observed.
+
 .. ipython:: python
 
-   df.groupby('A').cumcount()
+   dfg = pd.DataFrame(list('aaabba'), columns=['A'])
+   dfg
 
-   df.groupby('A').cumcount(ascending=False)  # kwarg only
+   dfg.groupby('A').ngroup()
+
+   dfg.groupby('A').ngroup(ascending=False)
 
 Plotting
 ~~~~~~~~
@@ -1176,14 +1200,41 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
 
    df
    df.groupby(df.sum(), axis=1).sum()
 
+.. _groupby.multicolumn_factorization:
+
+Multi-column factorization
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By using ``.ngroup()``, we can extract information about the groups in
+a way similar to :func:`factorize` (as described further in the
+:ref:`reshaping API <reshaping.factorize>`) but which applies
+naturally to multiple columns of mixed type and different
+sources. This can be useful as an intermediate categorical-like step
+in processing, when the relationships between the group rows are more
+important than their content, or as input to an algorithm which only
+accepts the integer encoding. (For more information about support in
+pandas for full categorical data, see the :ref:`Categorical
+introduction <categorical>` and the
+:ref:`API documentation <api.categorical>`.)
+
+.. ipython:: python
+
+   dfg = pd.DataFrame({"A": [1, 1, 2, 3, 2], "B": list("aaaba")})
+
+   dfg
+
+   dfg.groupby(["A", "B"]).ngroup()
+
+   dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup()
+
 Groupby by Indexer to 'resample' data
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Resampling produces new hypothetical samples(resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples.
+Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples.
 
 In order to resample to work on indices that are non-datetimelike, the following procedure can be utilized.
 
-In the following examples, **df.index // 5** returns a binary array which is used to determine what get's selected for the groupby operation.
+In the following examples, **df.index // 5** returns a binary array which is used to determine what gets selected for the groupby operation.
 
 .. note:: The below example shows how we can downsample by consolidation of samples into fewer samples. Here by using **df.index // 5**, we are aggregating the samples in bins. By applying **std()** function, we aggregate the information contained in many samples into a small subset of values which is their standard deviation thereby reducing the number of samples.
 
diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
index b93749922c8ea..5f125e329f6f1 100644
--- a/doc/source/reshaping.rst
+++ b/doc/source/reshaping.rst
@@ -636,7 +636,7 @@ When a column contains only one level, it will be omitted in the result.
 
    pd.get_dummies(df, drop_first=True)
 
-
+.. 
_reshaping.factorize: Factorizing values ------------------ diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 379249b6e55d6..4028d594d954f 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -23,6 +23,11 @@ Enhancements - ``Series`` provides a ``to_latex`` method (:issue:`16180`) - Added :attr:`Index.is_strictly_monotonic_increasing` and :attr:`Index.is_strictly_monotonic_decreasing` properties (:issue:`16515`) +- A new groupby method :meth:`~pandas.core.groupby.GroupBy.ngroup`, + parallel to the existing :meth:`~pandas.core.groupby.GroupBy.cumcount`, + has been added to return the group order (:issue:`11642`); see + :ref:`here `. + .. _whatsnew_0202.performance: Performance Improvements diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 286677d613484..9d6d2297f6ea0 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -150,7 +150,7 @@ 'last', 'first', 'head', 'tail', 'median', 'mean', 'sum', 'min', 'max', - 'cumcount', + 'cumcount', 'ngroup', 'resample', 'rank', 'quantile', 'fillna', @@ -1437,6 +1437,75 @@ def nth(self, n, dropna=None): return result + @Substitution(name='groupby') + @Appender(_doc_template) + def ngroup(self, ascending=True): + """ + Number each group from 0 to the number of groups - 1. + + This is the enumerative complement of cumcount. Note that the + numbers given to the groups match the order in which the groups + would be seen when iterating over the groupby object, not the + order they are first observed. + + .. versionadded:: 0.20.2 + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from number of group - 1 to 0. + + Examples + -------- + + >>> df = pd.DataFrame({"A": list("aaabba")}) + >>> df + A + 0 a + 1 a + 2 a + 3 b + 4 b + 5 a + >>> df.groupby('A').ngroup() + 0 0 + 1 0 + 2 0 + 3 1 + 4 1 + 5 0 + dtype: int64 + >>> df.groupby('A').ngroup(ascending=False) + 0 1 + 1 1 + 2 1 + 3 0 + 4 0 + 5 1 + dtype: int64 + >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup() + 0 0 + 1 0 + 2 1 + 3 3 + 4 2 + 5 0 + dtype: int64 + + See also + -------- + .cumcount : Number the rows in each group. + + """ + + self._set_group_selection() + + index = self._selected_obj.index + result = Series(self.grouper.group_info[0], index) + if not ascending: + result = self.ngroups - 1 - result + return result + @Substitution(name='groupby') @Appender(_doc_template) def cumcount(self, ascending=True): @@ -1481,6 +1550,10 @@ def cumcount(self, ascending=True): 4 0 5 0 dtype: int64 + + See also + -------- + .ngroup : Number the groups themselves. 
""" self._set_group_selection() diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py new file mode 100644 index 0000000000000..485241d593d4f --- /dev/null +++ b/pandas/tests/groupby/test_counting.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function + +import numpy as np + +from pandas import (DataFrame, Series, MultiIndex) +from pandas.util.testing import assert_series_equal +from pandas.compat import (range, product as cart_product) + + +class TestCounting(object): + + def test_cumcount(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3]) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_empty(self): + ge = DataFrame().groupby(level=0) + se = Series().groupby(level=0) + + # edge case, as this is usually considered float + e = Series(dtype='int64') + + assert_series_equal(e, ge.cumcount()) + assert_series_equal(e, se.cumcount()) + + def test_cumcount_dupe_index(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=[0] * 5) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_mi(self): + mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=mi) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=mi) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_groupby_not_col(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=[0] * 5) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_ngroup(self): + df = DataFrame({'A': list('aaaba')}) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 0, 0, 1, 0]) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_distinct(self): + df = DataFrame({'A': list('abcde')}) + g = df.groupby('A') + sg = g.A + + expected = Series(range(5), dtype='int64') + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_one_group(self): + df = DataFrame({'A': [0] * 5}) + g = df.groupby('A') + sg = g.A + + expected = Series([0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_empty(self): + ge = DataFrame().groupby(level=0) + se = Series().groupby(level=0) + + # edge case, as this is usually considered float + e = Series(dtype='int64') + + assert_series_equal(e, ge.ngroup()) + assert_series_equal(e, se.ngroup()) + + def test_ngroup_series_matches_frame(self): + df = DataFrame({'A': list('aaaba')}) + s = Series(list('aaaba')) + + assert_series_equal(df.groupby(s).ngroup(), + s.groupby(s).ngroup()) + + def test_ngroup_dupe_index(self): + df = DataFrame({'A': list('aaaba')}, index=[0] * 5) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 0, 0, 1, 0], index=[0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_mi(self): + mi = MultiIndex.from_tuples([[0, 
1], [1, 2], [2, 2], [2, 2], [1, 0]]) + df = DataFrame({'A': list('aaaba')}, index=mi) + g = df.groupby('A') + sg = g.A + expected = Series([0, 0, 0, 1, 0], index=mi) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_groupby_not_col(self): + df = DataFrame({'A': list('aaaba')}, index=[0] * 5) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 0, 0, 1, 0], index=[0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_descending(self): + df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A']) + g = df.groupby(['A']) + + ascending = Series([0, 0, 1, 0, 1]) + descending = Series([1, 1, 0, 1, 0]) + + assert_series_equal(descending, (g.ngroups - 1) - ascending) + assert_series_equal(ascending, g.ngroup(ascending=True)) + assert_series_equal(descending, g.ngroup(ascending=False)) + + def test_ngroup_matches_cumcount(self): + # verify one manually-worked out case works + df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'], + ['a', 'x'], ['b', 'y']], columns=['A', 'X']) + g = df.groupby(['A', 'X']) + g_ngroup = g.ngroup() + g_cumcount = g.cumcount() + expected_ngroup = Series([0, 1, 2, 0, 3]) + expected_cumcount = Series([0, 0, 0, 1, 0]) + + assert_series_equal(g_ngroup, expected_ngroup) + assert_series_equal(g_cumcount, expected_cumcount) + + def test_ngroup_cumcount_pair(self): + # brute force comparison for all small series + for p in cart_product(range(3), repeat=4): + df = DataFrame({'a': p}) + g = df.groupby(['a']) + + order = sorted(set(p)) + ngroupd = [order.index(val) for val in p] + cumcounted = [p[:i].count(val) for i, val in enumerate(p)] + + assert_series_equal(g.ngroup(), Series(ngroupd)) + assert_series_equal(g.cumcount(), Series(cumcounted)) + + def test_ngroup_respects_groupby_order(self): + np.random.seed(0) + df = DataFrame({'a': np.random.choice(list('abcdef'), 100)}) + for sort_flag in (False, True): + g = df.groupby(['a'], sort=sort_flag) + df['group_id'] = -1 + df['group_index'] = -1 + + for i, (_, group) in enumerate(g): + df.loc[group.index, 'group_id'] = i + for j, ind in enumerate(group.index): + df.loc[ind, 'group_index'] = j + + assert_series_equal(Series(df['group_id'].values), + g.ngroup()) + assert_series_equal(Series(df['group_index'].values), + g.cumcount()) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 88afa51e46b6c..19124a33bdbcb 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3399,60 +3399,6 @@ def test_groupby_with_small_elem(self): res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start')) tm.assert_frame_equal(res, df.iloc[[2], :]) - def test_cumcount(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3]) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_empty(self): - ge = DataFrame().groupby(level=0) - se = Series().groupby(level=0) - - # edge case, as this is usually considered float - e = Series(dtype='int64') - - assert_series_equal(e, ge.cumcount()) - assert_series_equal(e, se.cumcount()) - - def test_cumcount_dupe_index(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=[0] * 5) - - assert_series_equal(expected, g.cumcount()) - 
assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_mi(self): - mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=mi) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=mi) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_groupby_not_col(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) - g = df.groupby([0, 0, 0, 1, 0]) - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=[0] * 5) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - def test_fill_constistency(self): # GH9221 diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 5d131717f8345..2c8bf57f20fae 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -24,6 +24,7 @@ 'head', 'tail', 'cumcount', + 'ngroup', 'resample', 'rank', 'quantile', @@ -61,6 +62,7 @@ 'head', 'tail', 'cumcount', + 'ngroup', 'resample', 'rank', 'quantile', @@ -237,7 +239,7 @@ def test_tab_completion(mframe): 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'nunique', 'head', 'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', - 'cumsum', 'cumcount', 'all', 'shift', 'skew', + 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding']) From e331c783e800954aaa0fd98f23ff500f13a2aea1 Mon Sep 17 00:00:00 2001 From: Oleg Shteynbuk Date: Thu, 1 Jun 2017 18:19:10 -0400 Subject: [PATCH 29/55] make null lowercase a missing value (#16534) --- doc/source/io.rst | 2 +- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/_libs/parsers.pyx | 2 +- pandas/io/common.py | 2 +- pandas/tests/io/parser/na_values.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index bca23dd18a0e3..82cb7abde4b38 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -227,7 +227,7 @@ na_values : scalar, str, list-like, or dict, default ``None`` Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'NA', - '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''``. + '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''``. keep_default_na : boolean, default ``True`` If na_values are specified and keep_default_na is ``False`` the default NaN values are overridden, otherwise they're appended to. diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 2a38fad37584b..9460039449ebc 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -38,7 +38,7 @@ Other Enhancements - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func: `Series.cip()` have gained an inplace argument. (:issue: `15388`) - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when margins=True. 
(:issue:`15972`) - +- :func:`read_csv` has gained 'null' as an additional default missing value.(:issue:`16471`) .. _whatsnew_0210.api_breaking: Backwards incompatible API changes diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 2def4dc9dcf24..7a6f366d5b1a9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -277,7 +277,7 @@ DEFAULT_CHUNKSIZE = 256 * 1024 # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', _NA_VALUES = [b'-1.#IND', b'1.#QNAN', b'1.#IND', b'-1.#QNAN', - b'#N/A N/A', b'NA', b'#NA', b'NULL', b'NaN', + b'#N/A N/A', b'NA', b'#NA', b'NULL', b'null', b'NaN', b'nan', b''] diff --git a/pandas/io/common.py b/pandas/io/common.py index f4e12ea3fb173..1c987f6a9dfc3 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -31,7 +31,7 @@ # '1.#INF','-1.#INF', '1.#INF000000', _NA_VALUES = set([ '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', - 'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' + 'N/A', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '' ]) try: diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index 362837a46f838..6f72ed51d76c6 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -70,7 +70,7 @@ def test_non_string_na_values(self): def test_default_na_values(self): _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', - '#N/A', 'N/A', 'NA', '#NA', 'NULL', 'NaN', + '#N/A', 'N/A', 'NA', '#NA', 'NULL', 'null', 'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', '']) assert _NA_VALUES == parsers._NA_VALUES nv = len(_NA_VALUES) From e24e57c301b3bc757e2d4590755062ba2d4b9bfa Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 1 Jun 2017 18:24:19 -0400 Subject: [PATCH 30/55] MAINT: Drop has_index_names input from read_excel (#16522) --- doc/source/io.rst | 5 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/io/excel.py | 40 +++++++-------------- pandas/tests/io/test_excel.py | 63 ++++++++++++++++++++------------- 4 files changed, 52 insertions(+), 57 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 82cb7abde4b38..0c31bfe014a12 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2739,11 +2739,6 @@ should be passed to ``index_col`` and ``header`` import os os.remove('path_to_file.xlsx') -.. warning:: - - Excel files saved in version 0.16.2 or prior that had index names will still able to be read in, - but the ``has_index_names`` argument must specified to ``True``. - Parsing Specific Columns ++++++++++++++++++++++++ diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 9460039449ebc..c2468917013f4 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -72,6 +72,7 @@ Deprecations Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- ``pd.read_excel()`` has dropped the ``has_index_names`` parameter (:issue:`10967`) .. _whatsnew_0210.performance: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index aa08e5fd378f0..a4d2fabf76a41 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -141,10 +141,6 @@ convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats internally -has_index_names : boolean, default None - DEPRECATED: for version 0.17+ index names will be automatically - inferred based on index_col. To read Excel output from 0.16.2 and - prior that had saved index names, use True. 
Returns ------- @@ -198,8 +194,8 @@ def get_writer(engine_name): def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, index_col=None, names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, - convert_float=True, has_index_names=None, converters=None, - dtype=None, true_values=None, false_values=None, engine=None, + convert_float=True, converters=None, dtype=None, + true_values=None, false_values=None, engine=None, squeeze=False, **kwds): # Can't use _deprecate_kwarg since sheetname=None has a special meaning @@ -218,10 +214,9 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, sheetname=sheet_name, header=header, skiprows=skiprows, names=names, index_col=index_col, parse_cols=parse_cols, parse_dates=parse_dates, date_parser=date_parser, na_values=na_values, thousands=thousands, - convert_float=convert_float, has_index_names=has_index_names, - skip_footer=skip_footer, converters=converters, dtype=dtype, - true_values=true_values, false_values=false_values, squeeze=squeeze, - **kwds) + convert_float=convert_float, skip_footer=skip_footer, + converters=converters, dtype=dtype, true_values=true_values, + false_values=false_values, squeeze=squeeze, **kwds) class ExcelFile(object): @@ -283,9 +278,8 @@ def __fspath__(self): def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, names=None, index_col=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, - convert_float=True, has_index_names=None, - converters=None, true_values=None, false_values=None, - squeeze=False, **kwds): + convert_float=True, converters=None, true_values=None, + false_values=None, squeeze=False, **kwds): """ Parse specified sheet(s) into a DataFrame @@ -296,7 +290,6 @@ def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, return self._parse_excel(sheetname=sheet_name, header=header, skiprows=skiprows, names=names, index_col=index_col, - has_index_names=has_index_names, parse_cols=parse_cols, parse_dates=parse_dates, date_parser=date_parser, na_values=na_values, @@ -343,23 +336,17 @@ def _excel2num(x): return i in parse_cols def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, - skip_footer=0, index_col=None, has_index_names=None, - parse_cols=None, parse_dates=False, date_parser=None, - na_values=None, thousands=None, convert_float=True, - true_values=None, false_values=None, verbose=False, - dtype=None, squeeze=False, **kwds): + skip_footer=0, index_col=None, parse_cols=None, + parse_dates=False, date_parser=None, na_values=None, + thousands=None, convert_float=True, true_values=None, + false_values=None, verbose=False, dtype=None, + squeeze=False, **kwds): skipfooter = kwds.pop('skipfooter', None) if skipfooter is not None: skip_footer = skipfooter _validate_header_arg(header) - if has_index_names is not None: - warn("\nThe has_index_names argument is deprecated; index names " - "will be automatically inferred based on index_col.\n" - "This argmument is still necessary if reading Excel output " - "from 0.16.2 or prior with index names.", FutureWarning, - stacklevel=3) if 'chunksize' in kwds: raise NotImplementedError("chunksize keyword of read_excel " @@ -511,8 +498,7 @@ def _parse_cell(cell_contents, cell_typ): else: last = data[row][col] - if is_list_like(header) and len(header) > 1: - has_index_names = True + has_index_names = is_list_like(header) and len(header) > 1 # GH 12292 : error when read one empty column from excel file try: diff --git 
a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 4441ed815370b..abe3757ec64f3 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -881,8 +881,42 @@ def test_excel_multindex_roundtrip(self): tm.assert_frame_equal( df, act, check_names=check_names) - def test_excel_oldindex_format(self): - # GH 4679 + def test_excel_old_index_format(self): + # see gh-4679 + filename = 'test_index_name_pre17' + self.ext + in_file = os.path.join(self.dirpath, filename) + + # We detect headers to determine if index names exist, so + # that "index" name in the "names" version of the data will + # now be interpreted as rows that include null data. + data = np.array([[None, None, None, None, None], + ['R0C0', 'R0C1', 'R0C2', 'R0C3', 'R0C4'], + ['R1C0', 'R1C1', 'R1C2', 'R1C3', 'R1C4'], + ['R2C0', 'R2C1', 'R2C2', 'R2C3', 'R2C4'], + ['R3C0', 'R3C1', 'R3C2', 'R3C3', 'R3C4'], + ['R4C0', 'R4C1', 'R4C2', 'R4C3', 'R4C4']]) + columns = ['C_l0_g0', 'C_l0_g1', 'C_l0_g2', 'C_l0_g3', 'C_l0_g4'] + mi = MultiIndex(levels=[['R0', 'R_l0_g0', 'R_l0_g1', + 'R_l0_g2', 'R_l0_g3', 'R_l0_g4'], + ['R1', 'R_l1_g0', 'R_l1_g1', + 'R_l1_g2', 'R_l1_g3', 'R_l1_g4']], + labels=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], + names=[None, None]) + si = Index(['R0', 'R_l0_g0', 'R_l0_g1', 'R_l0_g2', + 'R_l0_g3', 'R_l0_g4'], name=None) + + expected = pd.DataFrame(data, index=si, columns=columns) + + actual = pd.read_excel(in_file, 'single_names') + tm.assert_frame_equal(actual, expected) + + expected.index = mi + + actual = pd.read_excel(in_file, 'multi_names') + tm.assert_frame_equal(actual, expected) + + # The analogous versions of the "names" version data + # where there are explicitly no names for the indices. data = np.array([['R0C0', 'R0C1', 'R0C2', 'R0C3', 'R0C4'], ['R1C0', 'R1C1', 'R1C2', 'R1C3', 'R1C4'], ['R2C0', 'R2C1', 'R2C2', 'R2C3', 'R2C4'], @@ -894,40 +928,19 @@ def test_excel_oldindex_format(self): ['R_l1_g0', 'R_l1_g1', 'R_l1_g2', 'R_l1_g3', 'R_l1_g4']], labels=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], - names=['R0', 'R1']) + names=[None, None]) si = Index(['R_l0_g0', 'R_l0_g1', 'R_l0_g2', - 'R_l0_g3', 'R_l0_g4'], name='R0') - - in_file = os.path.join( - self.dirpath, 'test_index_name_pre17' + self.ext) + 'R_l0_g3', 'R_l0_g4'], name=None) expected = pd.DataFrame(data, index=si, columns=columns) - with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel( - in_file, 'single_names', has_index_names=True) - tm.assert_frame_equal(actual, expected) - expected.index.name = None actual = pd.read_excel(in_file, 'single_no_names') tm.assert_frame_equal(actual, expected) - with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel( - in_file, 'single_no_names', has_index_names=False) - tm.assert_frame_equal(actual, expected) expected.index = mi - with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel( - in_file, 'multi_names', has_index_names=True) - tm.assert_frame_equal(actual, expected) - expected.index.names = [None, None] actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) - with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0, 1], - has_index_names=False) - tm.assert_frame_equal(actual, expected, check_names=False) def test_read_excel_bool_header_arg(self): # GH 6114 From ec535e91583dd35356becc60d25ca6c8c7453b4d Mon Sep 17 00:00:00 2001 From: Ryan Hendrickson Date: Thu, 1 Jun 2017 20:23:10 -0400 Subject: [PATCH 31/55] 
BUG: reimplement MultiIndex.remove_unused_levels (#16565) --- asv_bench/benchmarks/indexing.py | 9 ++++++++ doc/source/whatsnew/v0.20.2.txt | 2 ++ pandas/core/indexes/multi.py | 34 +++++++++++++----------------- pandas/tests/indexes/test_multi.py | 29 ++++++++++++++++++++++++- 4 files changed, 54 insertions(+), 20 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 6a2c9d48c4a28..d941ef20dc7ac 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -204,6 +204,12 @@ def setup(self): [np.arange(100), list('A'), list('A')], names=['one', 'two', 'three']) + rng = np.random.RandomState(4) + size = 1 << 16 + self.mi_unused_levels = pd.MultiIndex.from_arrays([ + rng.randint(0, 1 << 13, size), + rng.randint(0, 1 << 10, size)])[rng.rand(size) < 0.1] + def time_series_xs_mi_ix(self): self.s.ix[999] @@ -248,6 +254,9 @@ def time_multiindex_small_get_loc_warm(self): def time_is_monotonic(self): self.miint.is_monotonic + def time_remove_unused_levels(self): + self.mi_unused_levels.remove_unused_levels() + class IntervalIndexing(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 4028d594d954f..87a790d43577f 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -37,6 +37,7 @@ Performance Improvements - Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`) - Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`) - Improved performance of groupby with categorical groupers (:issue:`16413`) +- Improved performance of ``MultiIndex.remove_unused_levels()`` (:issue:`16556`) .. _whatsnew_0202.bug_fixes: @@ -66,6 +67,7 @@ Indexing - Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`) - Bug in partial string indexing with a monotonic, but not strictly-monotonic, index incorrectly reversing the slice bounds (:issue:`16515`) +- Bug in ``MultiIndex.remove_unused_levels()`` (:issue:`16556`) I/O ^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 981a6a696a618..f30da5b05f8ae 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1290,8 +1290,8 @@ def remove_unused_levels(self): new_levels = [] new_labels = [] - changed = np.ones(self.nlevels, dtype=bool) - for i, (lev, lab) in enumerate(zip(self.levels, self.labels)): + changed = False + for lev, lab in zip(self.levels, self.labels): uniques = algos.unique(lab) @@ -1299,33 +1299,29 @@ def remove_unused_levels(self): if len(uniques) == len(lev): new_levels.append(lev) new_labels.append(lab) - changed[i] = False continue - # set difference, then reverse sort - diff = Index(np.arange(len(lev))).difference(uniques) - unused = diff.sort_values(ascending=False) + changed = True + + # labels get mapped from uniques to 0:len(uniques) + label_mapping = np.zeros(len(lev)) + label_mapping[uniques] = np.arange(len(uniques)) + lab = label_mapping[lab] # new levels are simple lev = lev.take(uniques) - # new labels, we remove the unsued - # by decrementing the labels for that value - # prob a better way - for u in unused: - - lab = np.where(lab > u, lab - 1, lab) - new_levels.append(lev) new_labels.append(lab) - # nothing changed - if not changed.any(): - return self + result = self._shallow_copy() - return MultiIndex(new_levels, new_labels, - names=self.names, sortorder=self.sortorder, - verify_integrity=False) + if changed: + result._reset_identity() + result._set_levels(new_levels, 
validate=False)
+            result._set_labels(new_labels, validate=False)
+
+        return result
 
     @property
     def nlevels(self):
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index 388a49d25cb82..242a9d63eac63 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -2515,7 +2515,34 @@ def test_reconstruct_remove_unused(self):
         # idempotent
         result2 = result.remove_unused_levels()
         tm.assert_index_equal(result2, expected)
-        assert result2 is result
+        assert result2.is_(result)
+
+    @pytest.mark.parametrize('first_type,second_type', [
+        ('int64', 'int64'),
+        ('datetime64[D]', 'str')])
+    def test_remove_unused_levels_large(self, first_type, second_type):
+        # GH16556
+
+        # because tests should be deterministic (and this test in particular
+        # checks that levels are removed, which is not the case for every
+        # random input):
+        rng = np.random.RandomState(4)  # seed is arbitrary value that works
+
+        size = 1 << 16
+        df = DataFrame(dict(
+            first=rng.randint(0, 1 << 13, size).astype(first_type),
+            second=rng.randint(0, 1 << 10, size).astype(second_type),
+            third=rng.rand(size)))
+        df = df.groupby(['first', 'second']).sum()
+        df = df[df.third < 0.1]
+
+        result = df.index.remove_unused_levels()
+        assert len(result.levels[0]) < len(df.index.levels[0])
+        assert len(result.levels[1]) < len(df.index.levels[1])
+        assert result.equals(df.index)
+
+        expected = df.reset_index().set_index(['first', 'second']).index
+        tm.assert_index_equal(result, expected)
 
     def test_isin(self):
         values = [('foo', 2), ('bar', 3), ('quux', 4)]
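The vectorized remapping in the patch above replaces the removed per-value ``np.where``
loop with a single fancy-indexing pass. A minimal standalone NumPy sketch of the idea,
with made-up label codes (plain ``np.unique`` stands in for the internal ``algos.unique``)::

    import numpy as np

    lab = np.array([0, 3, 3, 5])   # codes into a level of length 6
    uniques = np.unique(lab)       # array([0, 3, 5]): only these codes are used

    # map each used code to its new position 0..len(uniques)-1
    label_mapping = np.zeros(6)
    label_mapping[uniques] = np.arange(len(uniques))

    label_mapping[lab]             # array([0., 1., 1., 2.]): remapped in one pass

The new level itself is just ``lev.take(uniques)``, so the rewrite costs one pass over the
labels instead of one ``np.where`` scan per unused level value.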
From 9e71f08038260629017b4a4d515c9c3edc2b4cf8 Mon Sep 17 00:00:00 2001
From: Chris Filo Gorgolewski
Date: Thu, 1 Jun 2017 17:28:09 -0700
Subject: [PATCH 32/55] Adding 'n/a' to list of strings denoting missing
 values (#16079)

---
 doc/source/io.rst | 2 +-
 doc/source/whatsnew/v0.21.0.txt | 4 +++-
 pandas/_libs/parsers.pyx | 2 +-
 pandas/io/common.py | 2 +-
 pandas/tests/io/parser/na_values.py | 4 ++--
 5 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 0c31bfe014a12..bd81b478b5326 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -226,7 +226,7 @@ NA and Missing Data Handling
 na_values : scalar, str, list-like, or dict, default ``None``
     Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA
     values. By default the following values are interpreted as NaN:
-    ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'NA',
+    ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', 'NA',
     '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''``.
 keep_default_na : boolean, default ``True``
     If na_values are specified and keep_default_na is ``False`` the default NaN
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index c2468917013f4..fe3eb291d06ff 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -38,7 +38,7 @@ Other Enhancements
 - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`)
 - :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an inplace argument. (:issue:`15388`)
 - :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when margins=True. (:issue:`15972`)
-- :func:`read_csv` has gained 'null' as an additional default missing value.(:issue:`16471`)
+
 
 .. _whatsnew_0210.api_breaking:
 
 Backwards incompatible API changes
@@ -49,6 +49,8 @@ Backwards incompatible API changes
 
 - Accessing a non-existent attribute on a closed :class:`HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)
+- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`)
+- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`)
 - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).
 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 7a6f366d5b1a9..2549c8545908d 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -277,7 +277,7 @@ DEFAULT_CHUNKSIZE = 256 * 1024
 # no longer excluding inf representations
 # '1.#INF','-1.#INF', '1.#INF000000',
 _NA_VALUES = [b'-1.#IND', b'1.#QNAN', b'1.#IND', b'-1.#QNAN',
-              b'#N/A N/A', b'NA', b'#NA', b'NULL', b'null', b'NaN',
+              b'#N/A N/A', b'n/a', b'NA', b'#NA', b'NULL', b'null', b'NaN',
               b'nan', b'']
 
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 1c987f6a9dfc3..cbfc33dbebb81 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -31,7 +31,7 @@
 # '1.#INF','-1.#INF', '1.#INF000000',
 _NA_VALUES = set([
     '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A',
-    'N/A', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''
+    'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''
 ])
 
 try:
diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py
index 6f72ed51d76c6..170f9d428c9cc 100644
--- a/pandas/tests/io/parser/na_values.py
+++ b/pandas/tests/io/parser/na_values.py
@@ -70,8 +70,8 @@ def test_non_string_na_values(self):
 
     def test_default_na_values(self):
         _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN',
-                          '#N/A', 'N/A', 'NA', '#NA', 'NULL', 'null', 'NaN',
-                          'nan', '-NaN', '-nan', '#N/A N/A', ''])
+                          '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null',
+                          'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', ''])
         assert _NA_VALUES == parsers._NA_VALUES
         nv = len(_NA_VALUES)
 
From 32512b938b6cce3cd2a5f1847aeb4ae7fcfb3b04 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 2 Jun 2017 04:54:40 -0500
Subject: [PATCH 33/55] API: Make is_strictly_monotonic_* private (#16576)

---
 doc/source/advanced.rst | 10 +++++++++
 doc/source/api.rst | 2 --
 doc/source/whatsnew/v0.20.2.txt | 1 -
 pandas/core/indexes/base.py | 16 +++++++-------
 pandas/core/indexes/datetimes.py | 2 +-
 pandas/tests/indexes/test_base.py | 8 +++----
 pandas/tests/indexes/test_multi.py | 32 ++++++++++++++--------------
 pandas/tests/indexes/test_numeric.py | 20 ++++++++---------
 pandas/tests/indexes/test_range.py | 20 ++++++++---------
 9 files changed, 59 insertions(+), 52 deletions(-)

diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index ea00588ba156f..711c3e9a95d05 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -948,6 +948,16 @@ On the other hand, if the index is not monotonic, then both slice bounds must be
    In [11]: df.loc[2:3, :]
    KeyError: 'Cannot get right slice bound for non-unique label: 3'
 
+:meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` only check that
+an index is weakly monotonic. To check for strict monotonicity, you can combine one of those with
+:meth:`Index.is_unique`.
+
+.. 
ipython:: python + + weakly_monotonic = pd.Index(['a', 'b', 'c', 'c']) + weakly_monotonic + weakly_monotonic.is_monotonic_increasing + weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique Endpoints are inclusive ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/api.rst b/doc/source/api.rst index 04b952a99e8f7..d6053791d6f4b 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1287,8 +1287,6 @@ Attributes Index.is_monotonic Index.is_monotonic_increasing Index.is_monotonic_decreasing - Index.is_strictly_monotonic_increasing - Index.is_strictly_monotonic_decreasing Index.is_unique Index.has_duplicates Index.dtype diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 87a790d43577f..d58a98703f22a 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -21,7 +21,6 @@ Enhancements - Unblocked access to additional compression types supported in pytables: 'blosc:blosclz, 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`) - ``Series`` provides a ``to_latex`` method (:issue:`16180`) -- Added :attr:`Index.is_strictly_monotonic_increasing` and :attr:`Index.is_strictly_monotonic_decreasing` properties (:issue:`16515`) - A new groupby method :meth:`~pandas.core.groupby.GroupBy.ngroup`, parallel to the existing :meth:`~pandas.core.groupby.GroupBy.cumcount`, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e8c2043138edb..028464ad5cd89 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1221,33 +1221,33 @@ def is_monotonic_decreasing(self): return self._engine.is_monotonic_decreasing @property - def is_strictly_monotonic_increasing(self): + def _is_strictly_monotonic_increasing(self): """return if the index is strictly monotonic increasing (only increasing) values Examples -------- - >>> Index([1, 2, 3]).is_strictly_monotonic_increasing + >>> Index([1, 2, 3])._is_strictly_monotonic_increasing True - >>> Index([1, 2, 2]).is_strictly_monotonic_increasing + >>> Index([1, 2, 2])._is_strictly_monotonic_increasing False - >>> Index([1, 3, 2]).is_strictly_monotonic_increasing + >>> Index([1, 3, 2])._is_strictly_monotonic_increasing False """ return self.is_unique and self.is_monotonic_increasing @property - def is_strictly_monotonic_decreasing(self): + def _is_strictly_monotonic_decreasing(self): """return if the index is strictly monotonic decreasing (only decreasing) values Examples -------- - >>> Index([3, 2, 1]).is_strictly_monotonic_decreasing + >>> Index([3, 2, 1])._is_strictly_monotonic_decreasing True - >>> Index([3, 2, 2]).is_strictly_monotonic_decreasing + >>> Index([3, 2, 2])._is_strictly_monotonic_decreasing False - >>> Index([3, 1, 2]).is_strictly_monotonic_decreasing + >>> Index([3, 1, 2])._is_strictly_monotonic_decreasing False """ return self.is_unique and self.is_monotonic_decreasing diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 60560374cd420..239894cff3874 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1472,7 +1472,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): # the bounds need swapped if index is reverse sorted and has a # length > 1 (is_monotonic_decreasing gives True for empty # and length 1 index) - if self.is_strictly_monotonic_decreasing and len(self) > 1: + if self._is_strictly_monotonic_decreasing and len(self) > 1: return upper if side == 'left' else lower return lower if side == 'left' else upper else: diff --git 
a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index a6933316e4291..d9f8e5e7f382b 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1330,8 +1330,8 @@ def test_is_monotonic_incomparable(self): index = Index([5, datetime.now(), 7]) assert not index.is_monotonic_increasing assert not index.is_monotonic_decreasing - assert not index.is_strictly_monotonic_increasing - assert not index.is_strictly_monotonic_decreasing + assert not index._is_strictly_monotonic_increasing + assert not index._is_strictly_monotonic_decreasing def test_get_set_value(self): values = np.random.randn(100) @@ -2030,8 +2030,8 @@ def test_is_monotonic_na(self): for index in examples: assert not index.is_monotonic_increasing assert not index.is_monotonic_decreasing - assert not index.is_strictly_monotonic_increasing - assert not index.is_strictly_monotonic_decreasing + assert not index._is_strictly_monotonic_increasing + assert not index._is_strictly_monotonic_decreasing def test_repr_summary(self): with cf.option_context('display.max_seq_items', 10): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 242a9d63eac63..ba917f33d8595 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2373,30 +2373,30 @@ def test_is_monotonic(self): i = MultiIndex.from_product([np.arange(10), np.arange(10)], names=['one', 'two']) assert i.is_monotonic - assert i.is_strictly_monotonic_increasing + assert i._is_strictly_monotonic_increasing assert Index(i.values).is_monotonic - assert i.is_strictly_monotonic_increasing + assert i._is_strictly_monotonic_increasing i = MultiIndex.from_product([np.arange(10, 0, -1), np.arange(10)], names=['one', 'two']) assert not i.is_monotonic - assert not i.is_strictly_monotonic_increasing + assert not i._is_strictly_monotonic_increasing assert not Index(i.values).is_monotonic - assert not Index(i.values).is_strictly_monotonic_increasing + assert not Index(i.values)._is_strictly_monotonic_increasing i = MultiIndex.from_product([np.arange(10), np.arange(10, 0, -1)], names=['one', 'two']) assert not i.is_monotonic - assert not i.is_strictly_monotonic_increasing + assert not i._is_strictly_monotonic_increasing assert not Index(i.values).is_monotonic - assert not Index(i.values).is_strictly_monotonic_increasing + assert not Index(i.values)._is_strictly_monotonic_increasing i = MultiIndex.from_product([[1.0, np.nan, 2.0], ['a', 'b', 'c']]) assert not i.is_monotonic - assert not i.is_strictly_monotonic_increasing + assert not i._is_strictly_monotonic_increasing assert not Index(i.values).is_monotonic - assert not Index(i.values).is_strictly_monotonic_increasing + assert not Index(i.values)._is_strictly_monotonic_increasing # string ordering i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], @@ -2406,8 +2406,8 @@ def test_is_monotonic(self): names=['first', 'second']) assert not i.is_monotonic assert not Index(i.values).is_monotonic - assert not i.is_strictly_monotonic_increasing - assert not Index(i.values).is_strictly_monotonic_increasing + assert not i._is_strictly_monotonic_increasing + assert not Index(i.values)._is_strictly_monotonic_increasing i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['mom', 'next', 'zenith']], @@ -2416,8 +2416,8 @@ def test_is_monotonic(self): names=['first', 'second']) assert i.is_monotonic assert Index(i.values).is_monotonic - assert i.is_strictly_monotonic_increasing - assert Index(i.values).is_strictly_monotonic_increasing + assert 
i._is_strictly_monotonic_increasing + assert Index(i.values)._is_strictly_monotonic_increasing # mixed levels, hits the TypeError i = MultiIndex( @@ -2428,20 +2428,20 @@ def test_is_monotonic(self): names=['household_id', 'asset_id']) assert not i.is_monotonic - assert not i.is_strictly_monotonic_increasing + assert not i._is_strictly_monotonic_increasing def test_is_strictly_monotonic(self): idx = pd.MultiIndex(levels=[['bar', 'baz'], ['mom', 'next']], labels=[[0, 0, 1, 1], [0, 0, 0, 1]]) assert idx.is_monotonic_increasing - assert not idx.is_strictly_monotonic_increasing + assert not idx._is_strictly_monotonic_increasing @pytest.mark.xfail(reason="buggy MultiIndex.is_monotonic_decresaing.") - def test_is_strictly_monotonic_decreasing(self): + def test__is_strictly_monotonic_decreasing(self): idx = pd.MultiIndex(levels=[['baz', 'bar'], ['next', 'mom']], labels=[[0, 0, 1, 1], [0, 0, 0, 1]]) assert idx.is_monotonic_decreasing - assert not idx.is_strictly_monotonic_decreasing + assert not idx._is_strictly_monotonic_decreasing def test_reconstruct_sort(self): diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 77f34dbf210e0..29d4214fd549b 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -465,36 +465,36 @@ def test_view(self): def test_is_monotonic(self): assert self.index.is_monotonic assert self.index.is_monotonic_increasing - assert self.index.is_strictly_monotonic_increasing + assert self.index._is_strictly_monotonic_increasing assert not self.index.is_monotonic_decreasing - assert not self.index.is_strictly_monotonic_decreasing + assert not self.index._is_strictly_monotonic_decreasing index = self._holder([4, 3, 2, 1]) assert not index.is_monotonic - assert not index.is_strictly_monotonic_increasing - assert index.is_strictly_monotonic_decreasing + assert not index._is_strictly_monotonic_increasing + assert index._is_strictly_monotonic_decreasing index = self._holder([1]) assert index.is_monotonic assert index.is_monotonic_increasing assert index.is_monotonic_decreasing - assert index.is_strictly_monotonic_increasing - assert index.is_strictly_monotonic_decreasing + assert index._is_strictly_monotonic_increasing + assert index._is_strictly_monotonic_decreasing def test_is_strictly_monotonic(self): index = self._holder([1, 1, 2, 3]) assert index.is_monotonic_increasing - assert not index.is_strictly_monotonic_increasing + assert not index._is_strictly_monotonic_increasing index = self._holder([3, 2, 1, 1]) assert index.is_monotonic_decreasing - assert not index.is_strictly_monotonic_decreasing + assert not index._is_strictly_monotonic_decreasing index = self._holder([1, 1]) assert index.is_monotonic_increasing assert index.is_monotonic_decreasing - assert not index.is_strictly_monotonic_increasing - assert not index.is_strictly_monotonic_decreasing + assert not index._is_strictly_monotonic_increasing + assert not index._is_strictly_monotonic_decreasing def test_logical_compat(self): idx = self.create_index() diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index db8180cb736c4..0d88e88030604 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -331,35 +331,35 @@ def test_is_monotonic(self): assert self.index.is_monotonic assert self.index.is_monotonic_increasing assert not self.index.is_monotonic_decreasing - assert self.index.is_strictly_monotonic_increasing - assert not self.index.is_strictly_monotonic_decreasing + assert 
self.index._is_strictly_monotonic_increasing + assert not self.index._is_strictly_monotonic_decreasing index = RangeIndex(4, 0, -1) assert not index.is_monotonic - assert not index.is_strictly_monotonic_increasing + assert not index._is_strictly_monotonic_increasing assert index.is_monotonic_decreasing - assert index.is_strictly_monotonic_decreasing + assert index._is_strictly_monotonic_decreasing index = RangeIndex(1, 2) assert index.is_monotonic assert index.is_monotonic_increasing assert index.is_monotonic_decreasing - assert index.is_strictly_monotonic_increasing - assert index.is_strictly_monotonic_decreasing + assert index._is_strictly_monotonic_increasing + assert index._is_strictly_monotonic_decreasing index = RangeIndex(2, 1) assert index.is_monotonic assert index.is_monotonic_increasing assert index.is_monotonic_decreasing - assert index.is_strictly_monotonic_increasing - assert index.is_strictly_monotonic_decreasing + assert index._is_strictly_monotonic_increasing + assert index._is_strictly_monotonic_decreasing index = RangeIndex(1, 1) assert index.is_monotonic assert index.is_monotonic_increasing assert index.is_monotonic_decreasing - assert index.is_strictly_monotonic_increasing - assert index.is_strictly_monotonic_decreasing + assert index._is_strictly_monotonic_increasing + assert index._is_strictly_monotonic_decreasing def test_equals_range(self): equiv_pairs = [(RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)), From 36670fce472436419efd5666cd8189c0b56fdc8c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 2 Jun 2017 14:26:44 +0200 Subject: [PATCH 34/55] DOC: change doc build to python 3.6 (#16545) * DOC: change doc build to python 3.6 * Remove pinning of pyqt to 4.x * Remove pinning of openpyxl * Add xsel to doc build for clipboard --- .travis.yml | 8 ++++++-- ...uirements-3.5_DOC.build => requirements-3.6_DOC.build} | 2 +- ci/{requirements-3.5_DOC.run => requirements-3.6_DOC.run} | 4 ++-- ci/{requirements-3.5_DOC.sh => requirements-3.6_DOC.sh} | 0 4 files changed, 9 insertions(+), 5 deletions(-) rename ci/{requirements-3.5_DOC.build => requirements-3.6_DOC.build} (73%) rename ci/{requirements-3.5_DOC.run => requirements-3.6_DOC.run} (87%) rename ci/{requirements-3.5_DOC.sh => requirements-3.6_DOC.sh} (100%) diff --git a/.travis.yml b/.travis.yml index 8b6700e11d2c5..5dc4256a268ad 100644 --- a/.travis.yml +++ b/.travis.yml @@ -74,7 +74,11 @@ matrix: # In allow_failures - os: linux env: - - JOB="3.5_DOC" DOC=true + - JOB="3.6_DOC" DOC=true + addons: + apt: + packages: + - xsel allow_failures: - os: linux env: @@ -87,7 +91,7 @@ matrix: - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" - os: linux env: - - JOB="3.5_DOC" DOC=true + - JOB="3.6_DOC" DOC=true before_install: - echo "before_install" diff --git a/ci/requirements-3.5_DOC.build b/ci/requirements-3.6_DOC.build similarity index 73% rename from ci/requirements-3.5_DOC.build rename to ci/requirements-3.6_DOC.build index 73aeb3192242f..bdcfe28105866 100644 --- a/ci/requirements-3.5_DOC.build +++ b/ci/requirements-3.6_DOC.build @@ -1,4 +1,4 @@ -python=3.5* +python=3.6* python-dateutil pytz numpy diff --git a/ci/requirements-3.5_DOC.run b/ci/requirements-3.6_DOC.run similarity index 87% rename from ci/requirements-3.5_DOC.run rename to ci/requirements-3.6_DOC.run index 9647ab53ab835..df8087f62ef16 100644 --- a/ci/requirements-3.5_DOC.run +++ b/ci/requirements-3.6_DOC.run @@ -12,7 +12,7 @@ lxml beautifulsoup4 html5lib pytables -openpyxl=1.8.5 +openpyxl xlrd xlwt xlsxwriter 
@@ -21,4 +21,4 @@ numexpr
 bottleneck
 statsmodels
 xarray
-pyqt=4.11.4
+pyqt
diff --git a/ci/requirements-3.5_DOC.sh b/ci/requirements-3.6_DOC.sh
similarity index 100%
rename from ci/requirements-3.5_DOC.sh
rename to ci/requirements-3.6_DOC.sh
From 5d7a02079087c55b497155389a2d40a0fb76c542 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 2 Jun 2017 18:30:10 -0400
Subject: [PATCH 35/55] DOC: whatsnew 0.20.2 edits (#16587)

---
 doc/source/whatsnew/v0.20.2.txt | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index d58a98703f22a..c9486954258c8 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -1,7 +1,7 @@
 .. _whatsnew_0202:
 
-v0.20.2 (???)
--------------
+v0.20.2 (June 3, 2017)
+----------------------
 
 This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes,
 bug fixes and performance improvements.
 We recommend that all users upgrade to this version.
@@ -46,19 +46,19 @@ Bug Fixes
 
 - Silenced a warning on some Windows environments about "tput: terminal attributes: No such device or address" when detecting the terminal size. This fix only applies to python 3 (:issue:`16496`)
 - Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`)
-- Bug in ``Index.symmetric_difference()`` on two equal MultiIndex's, results in a TypeError (:issue `13490`)
+- Bug in ``Index.symmetric_difference()`` on two equal MultiIndex's, results in a ``TypeError`` (:issue:`13490`)
 - Bug in ``DataFrame.update()`` with ``overwrite=False`` and ``NaN values`` (:issue:`15593`)
 - Passing an invalid engine to :func:`read_csv` now raises an informative ``ValueError`` rather than ``UnboundLocalError``. (:issue:`16511`)
 - Bug in :func:`unique` on an array of tuples (:issue:`16519`)
-- Bug in :func:`cut`` when ``labels`` are set, resulting in incorrect label ordering (:issue:`16459`)
-- Fixed a compatibility issue with IPython 6.0's tab completion showing deprecation warnings on Categoricals (:issue:`16409`)
+- Bug in :func:`cut` when ``labels`` are set, resulting in incorrect label ordering (:issue:`16459`)
+- Fixed a compatibility issue with IPython 6.0's tab completion showing deprecation warnings on ``Categoricals`` (:issue:`16409`)
 
 Conversion
 ^^^^^^^^^^
 
-- Bug in ``pd.to_numeric()`` in which empty data inputs were causing Python to crash (:issue:`16302`)
-- Silence numpy warnings when broadcasting DataFrame to Series with comparison ops (:issue:`16378`, :issue:`16306`)
+- Bug in :func:`to_numeric` in which empty data inputs were causing a segfault of the interpreter (:issue:`16302`)
+- Silence numpy warnings when broadcasting ``DataFrame`` to ``Series`` with comparison ops (:issue:`16378`, :issue:`16306`)
 
 
 Indexing
 ^^^^^^^^
 
 - Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`)
 - Bug in partial string indexing with a monotonic, but not strictly-monotonic, index incorrectly reversing the slice bounds (:issue:`16515`)
-- Bug in ``MultiIndex.remove_unused_levels()`` (:issue:`16556`)
+- Bug in ``MultiIndex.remove_unused_levels()`` that would not return a ``MultiIndex`` equal to the original. 
(:issue:`16556`) I/O ^^^ -- Bug in pd.read_csv() when comment is passed in space deliminted text files (:issue:`16472`) +- Bug in :func:`read_csv` when ``comment`` is passed in a space delimited text file (:issue:`16472`) - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) -- Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`) -- Bug in ``pd.read_csv()`` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`) +- Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`) +- Bug in :func:`read_csv` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`) - Bug where ``DataFrame.to_html()`` ignored the ``index_names`` parameter (:issue:`16493`) - Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`) @@ -92,7 +92,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug creating datetime rolling window on an empty DataFrame (:issue:`15819`) +- Bug in creating a time-based rolling window on an empty ``DataFrame`` (:issue:`15819`) - Bug in ``rolling.cov()`` with offset window (:issue:`16058`) - Bug in ``.resample()`` and ``.groupby()`` when aggregating on integers (:issue:`16361`) @@ -100,12 +100,12 @@ Groupby/Resample/Rolling Sparse ^^^^^^ -- Bug in construction of SparseDataFrame from ``scipy.sparse.dok_matrix`` (:issue:`16179`) +- Bug in construction of ``SparseDataFrame`` from ``scipy.sparse.dok_matrix`` (:issue:`16179`) Reshaping ^^^^^^^^^ -- Bug in ``DataFrame.stack`` with unsorted levels in MultiIndex columns (:issue:`16323`) +- Bug in ``DataFrame.stack`` with unsorted levels in ``MultiIndex`` columns (:issue:`16323`) - Bug in ``pd.wide_to_long()`` where no error was raised when ``i`` was not a unique identifier (:issue:`16382`) - Bug in ``Series.isin(..)`` with a list of tuples (:issue:`16394`) - Bug in construction of a ``DataFrame`` with mixed dtypes including an all-NaT column. (:issue:`16395`) @@ -114,7 +114,7 @@ Reshaping Numeric ^^^^^^^ -- Bug in .interpolate(), where limit_direction was not respected when limit=None (default) was passed (:issue:16282) +- Bug in ``.interpolate()``, where ``limit_direction`` was not respected when ``limit=None`` (default) was passed (:issue:`16282`) Categorical ^^^^^^^^^^^ @@ -124,4 +124,4 @@ Categorical Other ^^^^^ -- Bug in ``pd.drop([])`` for DataFrame with non-unique indices (:issue:`16270`) +- Bug in ``DataFrame.drop()`` with an empty-list with non-unique indices (:issue:`16270`) From 882ea0f3ff7741f7f613f1bac9f63f7fb2afb780 Mon Sep 17 00:00:00 2001 From: Tong SHEN Date: Sun, 4 Jun 2017 08:52:50 +0800 Subject: [PATCH 36/55] DOC: Fix typo in timeseries.rst (#16590) --- doc/source/timeseries.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 71d85f9b3995b..1dd80aec4fd6c 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -1922,7 +1922,7 @@ then you can use a ``PeriodIndex`` and/or ``Series`` of ``Periods`` to do comput span = pd.period_range('1215-01-01', '1381-01-01', freq='D') span -To convert from a ``int64`` based YYYYMMDD representation. +To convert from an ``int64`` based YYYYMMDD representation. .. 
ipython:: python From 9d0be9d92f4d967c1d154fa1623f52f1b3abb422 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 4 Jun 2017 05:39:31 -0500 Subject: [PATCH 37/55] PERF: vectorize _interp_limit (#16592) * PERF: vectorize _interp_limit * CLN: remove old implementation * fixup! CLN: remove old implementation --- pandas/core/missing.py | 77 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 10 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 51778684d68f5..5aabc9d8730dd 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -143,12 +143,6 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, 'DatetimeIndex') method = 'values' - def _interp_limit(invalid, fw_limit, bw_limit): - "Get idx of values that won't be filled b/c they exceed the limits." - for x in np.where(invalid)[0]: - if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): - yield x - valid_limit_directions = ['forward', 'backward', 'both'] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: @@ -180,21 +174,29 @@ def _interp_limit(invalid, fw_limit, bw_limit): # default limit is unlimited GH #16282 if limit is None: - limit = len(xvalues) + # limit = len(xvalues) + pass elif not is_integer(limit): raise ValueError('Limit must be an integer') elif limit < 1: raise ValueError('Limit must be greater than 0') # each possible limit_direction - if limit_direction == 'forward': + # TODO: do we need sorted? + if limit_direction == 'forward' and limit is not None: violate_limit = sorted(start_nans | set(_interp_limit(invalid, limit, 0))) - elif limit_direction == 'backward': + elif limit_direction == 'forward': + violate_limit = sorted(start_nans) + elif limit_direction == 'backward' and limit is not None: violate_limit = sorted(end_nans | set(_interp_limit(invalid, 0, limit))) - elif limit_direction == 'both': + elif limit_direction == 'backward': + violate_limit = sorted(end_nans) + elif limit_direction == 'both' and limit is not None: violate_limit = sorted(_interp_limit(invalid, limit, limit)) + else: + violate_limit = [] xvalues = getattr(xvalues, 'values', xvalues) yvalues = getattr(yvalues, 'values', yvalues) @@ -630,3 +632,58 @@ def fill_zeros(result, x, y, name, fill): result = result.reshape(shape) return result + + +def _interp_limit(invalid, fw_limit, bw_limit): + """Get idx of values that won't be filled b/c they exceed the limits. + + This is equivalent to the more readable, but slower + + .. code-block:: python + + for x in np.where(invalid)[0]: + if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): + yield x + """ + # handle forward first; the backward direction is the same except + # 1. operate on the reversed array + # 2. 
subtract the returned indices from N - 1
+    N = len(invalid)
+
+    def inner(invalid, limit):
+        limit = min(limit, N)
+        windowed = _rolling_window(invalid, limit + 1).all(1)
+        idx = (set(np.where(windowed)[0] + limit) |
+               set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0]))
+        return idx
+
+    if fw_limit == 0:
+        f_idx = set(np.where(invalid)[0])
+    else:
+        f_idx = inner(invalid, fw_limit)
+
+    if bw_limit == 0:
+        # then we don't even need to care about backwards, just use forwards
+        return f_idx
+    else:
+        b_idx = set(N - 1 - np.asarray(list(inner(invalid[::-1], bw_limit))))
+        if fw_limit == 0:
+            return b_idx
+    return f_idx & b_idx
+
+
+def _rolling_window(a, window):
+    """
+    [True, True, False, True, False], 2 ->
+
+    [
+        [True, True],
+        [True, False],
+        [False, True],
+        [True, False],
+    ]
+    """
+    # https://stackoverflow.com/a/6811241
+    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
+    strides = a.strides + (a.strides[-1],)
+    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
From b3769f16f308550c9f7e5585c12560be7ca843b2 Mon Sep 17 00:00:00 2001
From: Mahdi Ben Jelloul
Date: Sun, 4 Jun 2017 12:42:47 +0200
Subject: [PATCH 38/55] DOC: Fix typo in merge doc for validate kwarg (#16595)

---
 pandas/core/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 25c3c3fe4e48e..2b2e7be62427b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -184,7 +184,7 @@ dataset.
     * "many_to_one" or "m:1": check if merge keys are unique in right
       dataset.
-    * "many_to_may" or "m:m": allowed, but does not result in checks.
+    * "many_to_many" or "m:m": allowed, but does not result in checks.
 
     .. versionadded:: 0.21.0
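For context on the docstring entry fixed above, a short sketch of how the ``validate``
argument behaves (it is new in 0.21.0, per the ``versionadded`` note; the frames here are
invented for illustration)::

    import pandas as pd

    left = pd.DataFrame({'key': [1, 2, 2], 'lval': ['a', 'b', 'c']})
    right = pd.DataFrame({'key': [1, 2, 2], 'rval': ['x', 'y', 'z']})

    # "many_to_many" / "m:m" is accepted but performs no uniqueness checks
    pd.merge(left, right, on='key', validate='many_to_many')

    # "one_to_many" checks that the left keys are unique; key 2 is duplicated
    # in left, so this raises pandas.errors.MergeError
    pd.merge(left, right, on='key', validate='one_to_many')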
From a0174eb1e79a9157aa14a0ff02d684c2ede933ad Mon Sep 17 00:00:00 2001
From: "Mehmet Ali "Mali" Akmanalp"
Date: Sun, 4 Jun 2017 06:44:25 -0400
Subject: [PATCH 39/55] BUG: convert numpy strings in index names in HDF
 #13492 (#16444)

* BUG: Handle numpy strings in index names in HDF5 #13492

* REF: refactor to _ensure_str

---
 doc/source/whatsnew/v0.20.2.txt | 1 +
 pandas/io/pytables.py | 14 +++++++++++++-
 pandas/tests/io/test_pytables.py | 23 ++++++++++++++++++++++-
 3 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index c9486954258c8..362a80c10694a 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -76,6 +76,7 @@ I/O
 - Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`)
 - Bug in :func:`read_csv` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`)
 - Bug where ``DataFrame.to_html()`` ignored the ``index_names`` parameter (:issue:`16493`)
+- Bug where ``pd.read_hdf()`` returns numpy strings for index names (:issue:`13492`)
 - Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`)
 
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 2940d1f958776..ddd25aafa060c 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -73,6 +73,18 @@ def _ensure_encoding(encoding):
     return encoding
 
 
+def _ensure_str(name):
+    """Ensure that an index / column name is a str (python 3) or
+    unicode (python 2); otherwise they may be np.string dtype.
+    Non-string dtypes are passed through unchanged.
+
+    https://github.com/pandas-dev/pandas/issues/13492
+    """
+    if isinstance(name, compat.string_types):
+        name = compat.text_type(name)
+    return name
+
+
 Term = Expr
 
@@ -2574,7 +2586,7 @@ def read_index_node(self, node, start=None, stop=None):
 
         name = None
         if 'name' in node._v_attrs:
-            name = node._v_attrs.name
+            name = _ensure_str(node._v_attrs.name)
 
         index_class = self._alias_to_class(getattr(node._v_attrs,
                                                    'index_class', ''))
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
index e68de93c3e8ce..efec778e12b50 100644
--- a/pandas/tests/io/test_pytables.py
+++ b/pandas/tests/io/test_pytables.py
@@ -16,7 +16,7 @@
                     date_range, timedelta_range, Index, DatetimeIndex,
                     isnull)
 
-from pandas.compat import is_platform_windows, PY3, PY35, BytesIO
+from pandas.compat import is_platform_windows, PY3, PY35, BytesIO, text_type
 from pandas.io.formats.printing import pprint_thing
 
 tables = pytest.importorskip('tables')
@@ -2922,6 +2922,27 @@ def test_store_index_name_with_tz(self):
             recons = store['frame']
             tm.assert_frame_equal(recons, df)
 
+    @pytest.mark.parametrize('table_format', ['table', 'fixed'])
+    def test_store_index_name_numpy_str(self, table_format):
+        # GH #13492
+        idx = pd.Index(pd.to_datetime([datetime.date(2000, 1, 1),
+                                       datetime.date(2000, 1, 2)]),
+                       name=u('cols\u05d2'))
+        idx1 = pd.Index(pd.to_datetime([datetime.date(2010, 1, 1),
+                                        datetime.date(2010, 1, 2)]),
+                        name=u('rows\u05d0'))
+        df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1)
+
+        # This used to fail, returning numpy strings instead of python strings.
+        with ensure_clean_path(self.path) as path:
+            df.to_hdf(path, 'df', format=table_format)
+            df2 = read_hdf(path, 'df')
+
+        assert_frame_equal(df, df2, check_names=True)
+
+        assert type(df2.index.name) == text_type
+        assert type(df2.columns.name) == text_type
+
     def test_store_series_name(self):
         df = tm.makeDataFrame()
         series = df['A']
From 977151477395c1a20ab108f9956524cd225ddfe0 Mon Sep 17 00:00:00 2001
From: bpraggastis
Date: Sun, 4 Jun 2017 03:47:14 -0700
Subject: [PATCH 40/55] ERR: Raise error in usecols when column doesn't exist
 but length matches (#16460)

* gh-14671 Check if usecols with type string contains a subset of names, if
  not, throws an error

* tests added for gh-14671, expected behavior of simultaneous use of usecols
  and names unclear so these tests are commented out

* Review comments

---
 doc/source/whatsnew/v0.20.2.txt | 1 +
 pandas/io/parsers.py | 6 ++++
 pandas/tests/io/parser/usecols.py | 51 +++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 362a80c10694a..e1469cf15e20c 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -72,6 +72,7 @@ I/O
 ^^^
 
 - Bug in :func:`read_csv` when ``comment`` is passed in a space delimited text file (:issue:`16472`)
+- Bug in :func:`read_csv` not raising an exception with nonexistent columns in ``usecols`` when it had the correct length (:issue:`14671`)
 - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index aab70c8ce2cd4..055d6d045d2f2 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1649,6 +1649,12 @@ def __init__(self, src, **kwds):
 
         if self.usecols:
             usecols = _evaluate_usecols(self.usecols, self.orig_names)
+
+            # GH 14671
+            if (self.usecols_dtype == 'string' and
+                    not set(usecols).issubset(self.orig_names)):
+                raise ValueError("Usecols do not match names.")
+
             if len(self.names) > len(usecols):
                 self.names = [n for i, n in enumerate(self.names)
                               if (i in usecols or n in usecols)]
diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py
index 8761d1ccd3da4..f582e5037ca07 100644
--- a/pandas/tests/io/parser/usecols.py
+++ b/pandas/tests/io/parser/usecols.py
@@ -475,3 +475,54 @@ def test_uneven_length_cols(self):
                               'C': [3, 5, 4, 3, 3, 7]})
         df = self.read_csv(StringIO(data), usecols=usecols)
         tm.assert_frame_equal(df, expected)
+
+    def test_raise_on_usecols_names_mismatch(self):
+        # GH 14671
+        data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'
+
+        if self.engine == 'c':
+            msg = 'Usecols do not match names'
+        else:
+            msg = 'is not in list'
+
+        usecols = ['a', 'b', 'c', 'd']
+        df = self.read_csv(StringIO(data), usecols=usecols)
+        expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7],
+                              'd': [4, 8]})
+        tm.assert_frame_equal(df, expected)
+
+        usecols = ['a', 'b', 'c', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), usecols=usecols)
+
+        usecols = ['a', 'b', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), usecols=usecols)
+
+        names = ['A', 'B', 'C', 'D']
+
+        df = self.read_csv(StringIO(data), header=0, names=names)
+        expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7],
+                              'D': [4, 8]})
+        tm.assert_frame_equal(df, expected)
+
+        # TODO: https://github.com/pandas-dev/pandas/issues/16469
+        # usecols = ['A','C']
+        # df = self.read_csv(StringIO(data), header=0, names=names,
+        #                    usecols=usecols)
+        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
+        # tm.assert_frame_equal(df, expected)
+        #
+        # usecols = [0,2]
+        # df = self.read_csv(StringIO(data), header=0, names=names,
+        #                    usecols=usecols)
+        # expected = DataFrame({'A': [1,5], 'C': [3,7]})
+        # tm.assert_frame_equal(df, expected)
+
+        usecols = ['A', 'B', 'C', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), header=0, names=names,
+                          usecols=usecols)
+        usecols = ['A', 'B', 'f']
+        with tm.assert_raises_regex(ValueError, msg):
+            self.read_csv(StringIO(data), names=names, usecols=usecols)
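The user-visible effect of the check added in this patch, sketched against the default C
engine (the data is the same shape as in the new test above)::

    from io import StringIO
    import pandas as pd

    data = 'a,b,c,d\n1,2,3,4\n5,6,7,8'

    # a subset of the parsed header is fine
    pd.read_csv(StringIO(data), usecols=['a', 'c'])

    # 'f' is not among the names, so this now raises
    # ValueError: Usecols do not match names.
    pd.read_csv(StringIO(data), usecols=['a', 'b', 'f'])

The Python engine reports the same mistake with its own ``is not in list`` message, as the
test distinguishes.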
From cf5f2d899a24d4a1406de6495b7263e1ff9f6eee Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sun, 4 Jun 2017 06:29:15 -0500
Subject: [PATCH 41/55] DOC: Whatsnew fixups (#16596)

---
 doc/source/whatsnew/v0.20.2.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index e1469cf15e20c..31125db0f34d4 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -1,14 +1,12 @@
 .. _whatsnew_0202:
 
-v0.20.2 (June 3, 2017)
+v0.20.2 (June 4, 2017)
 ----------------------
 
 This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes,
 bug fixes and performance improvements.
 We recommend that all users upgrade to this version.
 
-Highlights include:
-
 .. contents:: What's new in v0.20.2
     :local:
     :backlinks: none
 
From 1415b95f965026639563b2c572c79c321727ee10 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sun, 4 Jun 2017 15:52:53 -0500
Subject: [PATCH 42/55] DOC: Update release.rst

---
 doc/source/release.rst | 50 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index 2587962299569..bf272e243e0dd 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -37,6 +37,56 @@ analysis / manipulation tool available in any language.
 * Binary installers on PyPI: http://pypi.python.org/pypi/pandas
 * Documentation: http://pandas.pydata.org
 
+pandas 0.20.2
+-------------
+
+**Release date:** June 4, 2017
+
+This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes,
+bug fixes and performance improvements.
+We recommend that all users upgrade to this version.
+
+See the :ref:`v0.20.2 Whatsnew <whatsnew_0202>` overview for an extensive list
+of all enhancements and bugs that have been fixed in 0.20.2.
+
+Thanks
+~~~~~~
+
+- Aaron Barber
+- Andrew 亮
+- Becky Sweger
+- Christian Prinoth
+- Christian Stade-Schuldt
+- DSM
+- Erik Fredriksen
+- Hugues Valois
+- Jeff Reback
+- Jeff Tratner
+- JimStearns206
+- John W. O'Brien
+- Joris Van den Bossche
+- JosephWagner
+- Keith Webber
+- Mehmet Ali "Mali" Akmanalp
+- Pankaj Pandey
+- Patrick Luo
+- Patrick O'Melveny
+- Pietro Battiston
+- RobinFiveWords
+- Ryan Hendrickson
+- SimonBaron
+- Tom Augspurger
+- WBare
+- bpraggastis
+- chernrick
+- chris-b1
+- economy
+- gfyoung
+- jaredsnyder
+- keitakurita
+- linebp
+- lloydkirk
+
 pandas 0.20.0 / 0.20.1
 ----------------------
 
From 93aabe746b13d99ec8099738dd35b4040e37a249 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Tue, 6 Jun 2017 09:10:00 -0400
Subject: [PATCH 43/55] BUG: pickle compat with UTC tz's (#16611)

closes #16608

---
 doc/source/whatsnew.rst | 2 +
 doc/source/whatsnew/v0.20.3.txt | 89 ++++++++++++++++++
 pandas/compat/pickle_compat.py | 2 +-
 .../0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle | Bin 125349 -> 126076 bytes
 pandas/tests/io/generate_legacy_storage_files.py | 8 +-
 5 files changed, 99 insertions(+), 2 deletions(-)
 create mode 100644 doc/source/whatsnew/v0.20.3.txt
 mode change 100644 => 100755 pandas/tests/io/generate_legacy_storage_files.py

diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst
index b1f9990a3e6af..3385bafc26467 100644
--- a/doc/source/whatsnew.rst
+++ b/doc/source/whatsnew.rst
@@ -20,6 +20,8 @@ These are new features and improvements of note in each release.
 
 .. include:: whatsnew/v0.21.0.txt
 
+.. include:: whatsnew/v0.20.3.txt
+
 .. include:: whatsnew/v0.20.2.txt
 
 .. include:: whatsnew/v0.20.0.txt
 
diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt
new file mode 100644
index 0000000000000..2032209c4aa23
--- /dev/null
+++ b/doc/source/whatsnew/v0.20.3.txt
@@ -0,0 +1,89 @@
+.. _whatsnew_0203:
+
+v0.20.3 (June ??, 2017)
+-----------------------
+
+This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes,
+bug fixes and performance improvements.
+We recommend that all users upgrade to this version.
+
+.. contents:: What's new in v0.20.3
+    :local:
+    :backlinks: none
+
+
+.. _whatsnew_0203.enhancements:
+
+Enhancements
+~~~~~~~~~~~~
+
+
+
+
+
+
+.. _whatsnew_0203.performance:
+
+Performance Improvements
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+
+
+
+
+.. 
_whatsnew_0203.bug_fixes:
+
+Bug Fixes
+~~~~~~~~~
+
+
+
+
+Conversion
+^^^^^^^^^^
+
+- Bug in pickle compat prior to the v0.20.x series, when ``UTC`` is a timezone in a Series/DataFrame/Index (:issue:`16608`)
+
+Indexing
+^^^^^^^^
+
+
+
+I/O
+^^^
+
+
+
+Plotting
+^^^^^^^^
+
+
+
+
+Groupby/Resample/Rolling
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+
+Sparse
+^^^^^^
+
+
+
+
+Reshaping
+^^^^^^^^^
+
+
+
+Numeric
+^^^^^^^
+
+
+Categorical
+^^^^^^^^^^^
+
+
+Other
+^^^^^
diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py
index b875bbb0d63c0..f6223c48994ae 100644
--- a/pandas/compat/pickle_compat.py
+++ b/pandas/compat/pickle_compat.py
@@ -15,7 +15,7 @@ def load_reduce(self):
     args = stack.pop()
     func = stack[-1]
 
-    if type(args[0]) is type:
+    if len(args) and type(args[0]) is type:
         n = args[0].__name__  # noqa
 
     try:
diff --git a/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle b/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle
index 6bb02672a4151c8d6536127fc94e68634e56c86d..75ea95ff402c4e9f0c93ef80e6c04baf7f0a70d7 100644
GIT binary patch
delta 785
[base85-encoded binary delta omitted]

delta 603
[base85-encoded binary delta omitted]

diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py
old mode 100644
new mode 100755
index 22c62b738e6a2..996965999724e
--- a/pandas/tests/io/generate_legacy_storage_files.py
+++ b/pandas/tests/io/generate_legacy_storage_files.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 """ self-contained to write legacy storage (pickle/msgpack) files """
 from __future__ import print_function
 from warnings import catch_warnings
@@ -125,7 +127,11 @@ def create_data():
         mixed_dup=mixed_dup_df,
         dt_mixed_tzs=DataFrame({
             u'A': Timestamp('20130102', tz='US/Eastern'),
-            u'B': Timestamp('20130603', tz='CET')}, index=range(5))
+            u'B': Timestamp('20130603', tz='CET')}, index=range(5)),
+        dt_mixed2_tzs=DataFrame({
+            u'A': Timestamp('20130102', tz='US/Eastern'),
+            u'B': Timestamp('20130603', tz='CET'),
+            u'C': Timestamp('20130603', tz='UTC')}, index=range(5))
     )
 
     with catch_warnings(record=True):
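A quick illustration of the frame shape behind the new ``dt_mixed2_tzs`` fixture; the
pickle path is a placeholder, and a same-version round-trip is shown only to exercise the
construction (the regression itself concerned pre-0.20.x pickles whose ``UTC`` timezone hit
the unguarded ``args[0]`` in ``load_reduce`` when read back)::

    import pandas as pd

    df = pd.DataFrame({'A': pd.Timestamp('20130102', tz='US/Eastern'),
                       'B': pd.Timestamp('20130603', tz='CET'),
                       'C': pd.Timestamp('20130603', tz='UTC')}, index=range(5))

    df.to_pickle('dt_mixed2_tzs.pkl')   # placeholder path
    pd.read_pickle('dt_mixed2_tzs.pkl')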
From 3ebd7191fd3d85af3ba5c1319c4d9ada5d8917fd Mon Sep 17 00:00:00 2001
From: Jean Helie
Date: Wed, 7 Jun 2017 01:41:31 +0100
Subject: [PATCH 44/55] Fix some lgtm alerts (#16613)

---
 pandas/core/dtypes/cast.py | 2 +-
 pandas/core/generic.py | 8 ++++----
 pandas/core/indexes/interval.py | 4 ++--
 pandas/core/internals.py | 3 ---
 pandas/core/sparse/array.py | 2 +-
 pandas/io/parsers.py | 2 +-
 pandas/tseries/offsets.py | 1 -
 7 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index fd61813a57c98..16b0a5c8a74ca 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -668,7 +668,7 @@ def maybe_convert_objects(values, convert_dates=True, convert_numeric=True,
             if convert_timedeltas == 'coerce':
                 from pandas.core.tools.timedeltas import to_timedelta
-                new_values = to_timedelta(values, coerce=True)
+                new_values = to_timedelta(values, errors='coerce')
 
                 # if we are all nans then leave me alone
                 if not isnull(new_values).all():
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 98999ec267c82..accb7d0db1d2c 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4285,7 +4285,7 @@ def asof(self, where, subset=None):
                 raise ValueError("subset is not valid for Series")
         elif self.ndim > 2:
             raise NotImplementedError("asof is not implemented "
-                                      "for {type}".format(type(self)))
+                                      "for {type}".format(type=type(self)))
         else:
             if subset is None:
                 subset = self.columns
@@ -4980,7 +4980,7 @@ def last(self, offset):
 
         offset = to_offset(offset)
 
-        start_date = start = self.index[-1] - offset
+        start_date = self.index[-1] - offset
         start = self.index.searchsorted(start_date, side='right')
         return self.iloc[start:]
 
@@ -5303,8 +5303,8 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None,
 
                     # slice me out of the other
                     else:
-                        raise NotImplemented("cannot align with a higher dimensional "
-                                             "NDFrame")
+                        raise NotImplementedError("cannot align with a higher "
+                                                  "dimensional NDFrame")
 
             elif is_list_like(other):
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index b1523cd6c0d0c..e6b2bc0953680 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -1053,11 +1053,11 @@ def interval_range(start=None, end=None, freq=None, periods=None,
         if periods is None or end is None:
             raise ValueError("must specify 2 of start, end, periods")
         start = end - periods * freq
-    elif end is None:
+    if end is None:
         if periods is None or start is None:
             raise ValueError("must specify 2 of start, end, periods")
        end = start + periods * freq
-    elif periods is None:
+    if periods is None:
        if start is None or end is None:
            raise ValueError("must specify 2 of start, end, periods")
        pass
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 15851a17274ca..58690ad632152 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -4645,7 +4645,6 @@ def _block2d_to_blocknd(values, placement, shape, labels, ref_items):
     pvalues = np.empty(panel_shape, dtype=dtype)
     pvalues.fill(fill_value)
 
-    values = values
     for i in range(len(placement)):
         pvalues[i].flat[mask] = values[:, i]
 
@@ -5154,8 +5153,6 @@ def dtype(self):
             return _get_dtype(maybe_promote(self.block.dtype,
                                             self.block.fill_value)[0])
 
-        return self._dtype
-
     @cache_readonly
     def is_null(self):
         if self.block is None:
diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py
index 8ac9d3916573e..c75de01b98e4e 100644
--- a/pandas/core/sparse/array.py
+++ b/pandas/core/sparse/array.py
@@ -125,7 +125,7 @@ def _sparse_array_op(left, right, op, name, series=False):
         name = name[1:]
 
     if name in ('and', 'or') and dtype == 'bool':
-        opname = 'sparse_{name}_uint8'.format(name=name, dtype=dtype)
+        opname = 'sparse_{name}_uint8'.format(name=name)
         # to make template simple, cast here
         left_sp_values = left.sp_values.view(np.uint8)
         right_sp_values = right.sp_values.view(np.uint8)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 055d6d045d2f2..c2d5a629b03a3 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -2211,7 +2211,7 @@ def _exclude_implicit_index(self, alldata):
     def get_chunk(self, size=None):
         if size is None:
             size = self.chunksize
-        return self.read(nrows=size)
+        return self.read(nrows=size)
 
    def 
_convert_data(self, data): # apply converters diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index f9f4adc1b2c81..2a120a0696836 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1596,7 +1596,6 @@ def apply(self, other): if otherDay != self.weekday: other = other + timedelta((self.weekday - otherDay) % 7) k = k - 1 - other = other for i in range(k): other = other + self._inc else: From fd171ebeda446a0d7abfe0d61e5bfc0897d4937e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 8 Jun 2017 06:47:18 -0400 Subject: [PATCH 45/55] BLD: fix numpy on 3.6 build as 1.13 was released but no deps are built for it (#16633) --- ci/requirements-3.6.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-3.6.build b/ci/requirements-3.6.build index 1c4b46aea3865..8d09e0ee93070 100644 --- a/ci/requirements-3.6.build +++ b/ci/requirements-3.6.build @@ -2,5 +2,5 @@ python=3.6* python-dateutil pytz nomkl -numpy +numpy=1.12* cython From 4b0ef03c30102a5516c7345b92fa4906bf5bd87f Mon Sep 17 00:00:00 2001 From: DSM Date: Thu, 8 Jun 2017 06:47:32 -0400 Subject: [PATCH 46/55] BUG: Fix Series.get failure on missing NaN (#8569) (#16619) --- doc/source/whatsnew/v0.20.3.txt | 2 +- pandas/core/indexes/numeric.py | 2 ++ pandas/tests/indexes/test_multi.py | 8 ++++++++ pandas/tests/indexes/test_numeric.py | 8 ++++++++ pandas/tests/series/test_indexing.py | 15 +++++++++++++++ 5 files changed, 34 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index 2032209c4aa23..049737f948e17 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -48,7 +48,7 @@ Conversion Indexing ^^^^^^^^ - +- Bug in ``Float64Index`` causing an empty array instead of None to be returned from ``.get(np.nan)`` on a Series whose index did not contain any NaNs (:issue:`8569`) I/O ^^^ diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index bdae0ac7ac5e9..72d521cbe2d60 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -369,6 +369,8 @@ def get_loc(self, key, method=None, tolerance=None): except (ValueError, IndexError): # should only need to catch ValueError here but on numpy # 1.7 .item() can raise IndexError when NaNs are present + if not len(nan_idxs): + raise KeyError(key) return nan_idxs except (TypeError, NotImplementedError): pass diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index ba917f33d8595..7d2e6f495311f 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1172,6 +1172,14 @@ def test_get_loc_level(self): assert result == expected assert new_index.equals(index.droplevel(0)) + def test_get_loc_missing_nan(self): + # GH 8569 + idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]]) + assert isinstance(idx.get_loc(1), slice) + pytest.raises(KeyError, idx.get_loc, 3) + pytest.raises(KeyError, idx.get_loc, np.nan) + pytest.raises(KeyError, idx.get_loc, [np.nan]) + def test_slice_locs(self): df = tm.makeTimeDataFrame() stacked = df.stack() diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 29d4214fd549b..62ac337d02727 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -371,6 +371,14 @@ def test_get_loc_na(self): assert idx.get_loc(1) == 1 pytest.raises(KeyError, idx.slice_locs, np.nan) + def test_get_loc_missing_nan(self): + # GH 8569 + idx = Float64Index([1, 2]) + 
assert idx.get_loc(1) == 0 + pytest.raises(KeyError, idx.get_loc, 3) + pytest.raises(KeyError, idx.get_loc, np.nan) + pytest.raises(KeyError, idx.get_loc, [np.nan]) + def test_contains_nans(self): i = Float64Index([1.0, 2.0, np.nan]) assert np.nan in i diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 6ded4d593a571..7774d10c5eaf8 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -70,6 +70,21 @@ def test_get(self): result = vc.get(True, default='Missing') assert result == 'Missing' + def test_get_nan(self): + # GH 8569 + s = pd.Float64Index(range(10)).to_series() + assert s.get(np.nan) is None + assert s.get(np.nan, default='Missing') == 'Missing' + + # ensure that fixing the above hasn't broken get + # with multiple elements + idx = [20, 30] + assert_series_equal(s.get(idx), + Series([np.nan] * 2, index=idx)) + idx = [np.nan, np.nan] + assert_series_equal(s.get(idx), + Series([np.nan] * 2, index=idx)) + def test_delitem(self): # GH 5542 From 1b159af6879a46d54811891ac394cb807264209a Mon Sep 17 00:00:00 2001 From: DSM Date: Thu, 8 Jun 2017 06:48:00 -0400 Subject: [PATCH 47/55] TST: NaN in MultiIndex should not become a string (#7031) (#16625) --- pandas/tests/indexes/test_multi.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 7d2e6f495311f..3f6fd8c8aa827 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2866,3 +2866,24 @@ def test_tuples_with_name_string(self): pd.Index(li, name='abc') with pytest.raises(ValueError): pd.Index(li, name='a') + + def test_nan_stays_float(self): + + # GH 7031 + idx0 = pd.MultiIndex(levels=[["A", "B"], []], + labels=[[1, 0], [-1, -1]], + names=[0, 1]) + idx1 = pd.MultiIndex(levels=[["C"], ["D"]], + labels=[[0], [0]], + names=[0, 1]) + idxm = idx0.join(idx1, how='outer') + assert pd.isnull(idx0.get_level_values(1)).all() + # the following failed in 0.14.1 + assert pd.isnull(idxm.get_level_values(1)[:-1]).all() + + df0 = pd.DataFrame([[1, 2]], index=idx0) + df1 = pd.DataFrame([[3, 4]], index=idx1) + dfm = df0 - df1 + assert pd.isnull(df0.index.get_level_values(1)).all() + # the following failed in 0.14.1 + assert pd.isnull(dfm.index.get_level_values(1)[:-1]).all() From 8eb0c7fdc6a9fde19815a1d50e3c54ec195ddfcc Mon Sep 17 00:00:00 2001 From: DSM Date: Thu, 8 Jun 2017 06:48:22 -0400 Subject: [PATCH 48/55] TST: verify we can add and subtract from indices (#8142) (#16629) --- pandas/tests/indexes/test_base.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index d9f8e5e7f382b..18dbe6624008a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1800,6 +1800,25 @@ def test_string_index_repr(self): assert coerce(idx) == expected + @pytest.mark.parametrize('dtype', [np.int64, np.float64]) + @pytest.mark.parametrize('delta', [1, 0, -1]) + def test_addsub_arithmetic(self, dtype, delta): + # GH 8142 + delta = dtype(delta) + idx = pd.Index([10, 11, 12], dtype=dtype) + result = idx + delta + expected = pd.Index(idx.values + delta, dtype=dtype) + tm.assert_index_equal(result, expected) + + # this subtraction used to fail + result = idx - delta + expected = pd.Index(idx.values - delta, dtype=dtype) + tm.assert_index_equal(result, expected) + + tm.assert_index_equal(idx + idx, 2 * idx) + tm.assert_index_equal(idx - idx, 
From 6fa83d3680100c5114d252b4908c0182b963006c Mon Sep 17 00:00:00 2001
From: Pradyumna Reddy Chinthala
Date: Fri, 9 Jun 2017 21:21:11 +0530
Subject: [PATCH 49/55] BUG: conversion of Series to Categorical (#16557)

fix #16524
---
 doc/source/whatsnew/v0.20.3.txt    | 1 +
 pandas/core/internals.py           | 2 +-
 pandas/tests/series/test_dtypes.py | 9 +++++++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt
index 049737f948e17..52f7701724f18 100644
--- a/doc/source/whatsnew/v0.20.3.txt
+++ b/doc/source/whatsnew/v0.20.3.txt
@@ -44,6 +44,7 @@ Conversion
 ^^^^^^^^^^
 
 - Bug in pickle compat prior to the v0.20.x series, when ``UTC`` is a timezone in a Series/DataFrame/Index (:issue:`16608`)
+- Bug in Series construction when passing a Series with ``dtype='category'`` (:issue:`16524`).
 
 Indexing
 ^^^^^^^^
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 58690ad632152..f2a7ac76481d4 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -471,7 +471,7 @@ def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
                             **kwargs)
 
     def _astype(self, dtype, copy=False, errors='raise', values=None,
-                klass=None, mgr=None, **kwargs):
+                klass=None, mgr=None, raise_on_error=False, **kwargs):
         """ Coerce to the new type (if copy=True, return a new copy)
         raise on an except if raise == True
         """
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index e084fa58d6c51..9ab02a8c2aad7 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -248,3 +248,12 @@ def test_intercept_astype_object(self):
 
         result = df.values.squeeze()
         assert (result[:, 0] == expected.values).all()
+
+    def test_series_to_categorical(self):
+        # see gh-16524: test conversion of Series to Categorical
+        series = Series(['a', 'b', 'c'])
+
+        result = Series(series, dtype='category')
+        expected = Series(['a', 'b', 'c'], dtype='category')
+
+        tm.assert_series_equal(result, expected)
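The bug fixed above was user-visible in plain construction: passing an existing ``Series`` back into the ``Series`` constructor with ``dtype='category'`` raised instead of converting. A short sketch of the round trip the new test covers (illustrative only):

```
import pandas as pd

s = pd.Series(['a', 'b', 'c'])
cat = pd.Series(s, dtype='category')  # raised before this fix (GH 16524)

assert str(cat.dtype) == 'category'
assert cat.tolist() == ['a', 'b', 'c']
```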
From aba51b6afa011a9614411bb2cd966a0903dbef74 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 9 Jun 2017 18:45:29 -0400
Subject: [PATCH 50/55] BLD: fix numpy on 2.7 build as 1.13 was released but no deps are built for it (#16633) (#16650)

---
 ci/requirements-2.7.build | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/requirements-2.7.build b/ci/requirements-2.7.build
index 415df13179fcf..a7b950e615464 100644
--- a/ci/requirements-2.7.build
+++ b/ci/requirements-2.7.build
@@ -2,5 +2,5 @@ python=2.7*
 python-dateutil=2.4.1
 pytz=2013b
 nomkl
-numpy
+numpy=1.12*
 cython=0.23

From fdb54dfbc782890887e02230d62850f6512c6233 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 9 Jun 2017 15:46:33 -0700
Subject: [PATCH 51/55] CLN: make license file machine readable (#16649)

Splits extra information about the license and copyright holders to
AUTHORS.md.
---
 AUTHORS.md |  57 ++++++++++++++++++++++++++++
 LICENSE    | 106 ++++++++++++-----------------------------------------
 2 files changed, 81 insertions(+), 82 deletions(-)
 create mode 100644 AUTHORS.md

diff --git a/AUTHORS.md b/AUTHORS.md
new file mode 100644
index 0000000000000..dcaaea101f4c8
--- /dev/null
+++ b/AUTHORS.md
@@ -0,0 +1,57 @@
+About the Copyright Holders
+===========================
+
+* Copyright (c) 2008-2011 AQR Capital Management, LLC
+
+  AQR Capital Management began pandas development in 2008. Development was
+  led by Wes McKinney. AQR released the source under this license in 2009.
+* Copyright (c) 2011-2012, Lambda Foundry, Inc.
+
+  Wes is now an employee of Lambda Foundry, and remains the pandas project
+  lead.
+* Copyright (c) 2011-2012, PyData Development Team
+
+  The PyData Development Team is the collection of developers of the PyData
+  project. This includes all of the PyData sub-projects, including pandas. The
+  core team that coordinates development on GitHub can be found here:
+  http://github.com/pydata.
+
+Full credits for pandas contributors can be found in the documentation.
+
+Our Copyright Policy
+====================
+
+PyData uses a shared copyright model. Each contributor maintains copyright
+over their contributions to PyData. However, it is important to note that
+these contributions are typically only changes to the repositories. Thus,
+the PyData source code, in its entirety, is not the copyright of any single
+person or institution. Instead, it is the collective copyright of the
+entire PyData Development Team. If individual contributors want to maintain
+a record of what changes/contributions they have specific copyright on,
+they should indicate their copyright in the commit message of the change
+when they commit the change to one of the PyData repositories.
+
+With this in mind, the following banner should be used in any source code
+file to indicate the copyright and license terms:
+
+```
+#-----------------------------------------------------------------------------
+# Copyright (c) 2012, PyData Development Team
+# All rights reserved.
+#
+# Distributed under the terms of the BSD Simplified License.
+#
+# The full license is in the LICENSE file, distributed with this software.
+#-----------------------------------------------------------------------------
+```
+
+Other licenses can be found in the LICENSES directory.
+
+License
+=======
+
+pandas is distributed under a 3-clause ("Simplified" or "New") BSD
+license. Parts of NumPy, SciPy, numpydoc, bottleneck, which all have
+BSD-compatible licenses, are included. Their licenses follow the pandas
+license.
+
diff --git a/LICENSE b/LICENSE
index c9b8834e8774b..924de26253bf4 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,87 +1,29 @@
-=======
-License
-=======
+BSD 3-Clause License
 
-pandas is distributed under a 3-clause ("Simplified" or "New") BSD
-license. Parts of NumPy, SciPy, numpydoc, bottleneck, which all have
-BSD-compatible licenses, are included. Their licenses follow the pandas
-license.
-
-pandas license
-==============
-
-Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team
-All rights reserved.
-
-Copyright (c) 2008-2011 AQR Capital Management, LLC
+Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-  * Redistributions of source code must retain the above copyright
-    notice, this list of conditions and the following disclaimer.
-
-  * Redistributions in binary form must reproduce the above
-    copyright notice, this list of conditions and the following
-    disclaimer in the documentation and/or other materials provided
-    with the distribution.
-
-  * Neither the name of the copyright holder nor the names of any
-    contributors may be used to endorse or promote products derived
-    from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-About the Copyright Holders
-===========================
-
-AQR Capital Management began pandas development in 2008. Development was
-led by Wes McKinney. AQR released the source under this license in 2009.
-Wes is now an employee of Lambda Foundry, and remains the pandas project
-lead.
-
-The PyData Development Team is the collection of developers of the PyData
-project. This includes all of the PyData sub-projects, including pandas. The
-core team that coordinates development on GitHub can be found here:
-http://github.com/pydata.
-
-Full credits for pandas contributors can be found in the documentation.
-
-Our Copyright Policy
-====================
-
-PyData uses a shared copyright model. Each contributor maintains copyright
-over their contributions to PyData. However, it is important to note that
-these contributions are typically only changes to the repositories. Thus,
-the PyData source code, in its entirety, is not the copyright of any single
-person or institution. Instead, it is the collective copyright of the
-entire PyData Development Team. If individual contributors want to maintain
-a record of what changes/contributions they have specific copyright on,
-they should indicate their copyright in the commit message of the change
-when they commit the change to one of the PyData repositories.
-
-With this in mind, the following banner should be used in any source code
-file to indicate the copyright and license terms:
-
-#-----------------------------------------------------------------------------
-# Copyright (c) 2012, PyData Development Team
-# All rights reserved.
-#
-# Distributed under the terms of the BSD Simplified License.
-#
-# The full license is in the LICENSE file, distributed with this software.
-#-----------------------------------------------------------------------------
-
-Other licenses can be found in the LICENSES directory.
\ No newline at end of file

From 41b3968e57a72ca5d80e02f8b87f2566fa336444 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 9 Jun 2017 21:28:41 -0400
Subject: [PATCH 52/55] fix pytest-xdist version as 1.17 appears buggy (#16652)

---
 ci/install_travis.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/install_travis.sh b/ci/install_travis.sh
index 8cf6f2ce636da..f4e6c979f28a4 100755
--- a/ci/install_travis.sh
+++ b/ci/install_travis.sh
@@ -107,7 +107,7 @@ if [ -e ${REQ} ]; then
 fi
 
 time conda install -n pandas pytest
-time pip install pytest-xdist
+time pip install pytest-xdist==1.16.0
 
 if [ "$LINT" ]; then
     conda install flake8

From 9d4c88d37b4e093ca06372124ac5c8781800ab6d Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 9 Jun 2017 22:09:24 -0400
Subject: [PATCH 53/55] COMPAT: numpy 1.13 test compat (#16654)

* COMPAT: numpy 1.13 test compat

* CI: fix doc build to 1.12
---
 ci/requirements-3.6_DOC.build    | 2 +-
 pandas/compat/numpy/__init__.py  | 3 +++
 pandas/tests/test_expressions.py | 6 +++++-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/ci/requirements-3.6_DOC.build b/ci/requirements-3.6_DOC.build
index bdcfe28105866..37faaa7e4db88 100644
--- a/ci/requirements-3.6_DOC.build
+++ b/ci/requirements-3.6_DOC.build
@@ -1,5 +1,5 @@
 python=3.6*
 python-dateutil
 pytz
-numpy
+numpy=1.12*
 cython
diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py
index 4a9a2647ece0f..2c5a18973afa8 100644
--- a/pandas/compat/numpy/__init__.py
+++ b/pandas/compat/numpy/__init__.py
@@ -15,6 +15,7 @@
 _np_version_under1p11 = _nlv < '1.11'
 _np_version_under1p12 = _nlv < '1.12'
 _np_version_under1p13 = _nlv < '1.13'
+_np_version_under1p14 = _nlv < '1.14'
 
 if _nlv < '1.7.0':
     raise ImportError('this version of pandas is incompatible with '
@@ -74,4 +75,6 @@ def np_array_datetime64_compat(arr, *args, **kwargs):
                    '_np_version_under1p10',
                    '_np_version_under1p11',
                    '_np_version_under1p12',
+                   '_np_version_under1p13',
+                   '_np_version_under1p14'
                    ]
diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
index fae7bfa513dcd..08c3a25e66b0e 100644
--- a/pandas/tests/test_expressions.py
+++ b/pandas/tests/test_expressions.py
@@ -13,7 +13,7 @@
 
 from pandas.core.api import DataFrame, Panel
 from pandas.core.computation import expressions as expr
-from pandas import compat, _np_version_under1p11
+from pandas import compat, _np_version_under1p11, _np_version_under1p13
 from pandas.util.testing import (assert_almost_equal, assert_series_equal,
                                  assert_frame_equal, assert_panel_equal,
                                  assert_panel4d_equal, slow)
@@ -420,6 +420,10 @@ def test_bool_ops_warn_on_arithmetic(self):
             f = getattr(operator, name)
             fe = getattr(operator, sub_funcs[subs[op]])
 
+            # >= 1.13.0 these are now TypeErrors
+            if op == '-' and not _np_version_under1p13:
+                continue
+
             with tm.use_numexpr(True, min_elements=5):
                 with tm.assert_produces_warning(check_stacklevel=False):
                     r = f(df, df)
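The ``_np_version_under1p13`` flag used in the test skip above follows the version-gate pattern of ``pandas/compat/numpy``. A self-contained sketch of that pattern, and of why the ``-`` case is skipped on newer numpy (the arrays here are illustrative):

```
from distutils.version import LooseVersion
import numpy as np

_np_version_under1p13 = LooseVersion(np.__version__) < LooseVersion('1.13')

a = np.array([True, False, True])
if _np_version_under1p13:
    diff = a - a                 # older numpy: allowed (deprecated)
else:
    diff = np.logical_xor(a, a)  # numpy >= 1.13: boolean `-` raises TypeError
```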
From 8f6e50ae280c8c99bc796f290f765eb65edb6515 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 10 Jun 2017 11:09:30 -0400
Subject: [PATCH 54/55] Revert "fix pytest-xdist version as 1.17 appears buggy (#16652)" (#16657)

This reverts commit ec6bf6deaf502ac05a7120df13bd9b13cb3083f6.

pytest-xdist 1.17.1 was released, which fixes the bug.
---
 ci/install_travis.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/install_travis.sh b/ci/install_travis.sh
index f4e6c979f28a4..8cf6f2ce636da 100755
--- a/ci/install_travis.sh
+++ b/ci/install_travis.sh
@@ -107,7 +107,7 @@ if [ -e ${REQ} ]; then
 fi
 
 time conda install -n pandas pytest
-time pip install pytest-xdist==1.16.0
+time pip install pytest-xdist
 
 if [ "$LINT" ]; then
     conda install flake8

From 1de16b6fe61222aabad176a85f7b5c6fc688d984 Mon Sep 17 00:00:00 2001
From: Chris MacLeod
Date: Sun, 11 Jun 2017 07:56:59 -0300
Subject: [PATCH 55/55] Add ASV benchmark.

---
 asv_bench/benchmarks/hdfstore_bench.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py
index dc72f3d548aaf..9da7aea2e7b5e 100644
--- a/asv_bench/benchmarks/hdfstore_bench.py
+++ b/asv_bench/benchmarks/hdfstore_bench.py
@@ -90,6 +90,14 @@ def time_query_store_table(self):
         stop = self.df2.index[15000]
         self.store.select('table', where="index > start and index < stop")
 
+    def time_store_tostring(self):
+        repr(self.store)
+        str(self.store)
+
+    def time_store_info(self):
+        self.store.info()
+
+
 class HDF5Panel(object):

    goal_time = 0.2
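The two timings added above separate the cheap string-representation path from the detailed ``info()`` path. A rough standalone sketch of what each benchmark exercises (the scratch file name is hypothetical; requires PyTables):

```
import numpy as np
import pandas as pd

with pd.HDFStore('scratch.h5', mode='w') as store:
    store['df'] = pd.DataFrame(np.random.randn(10, 3))
    repr(store)   # exercised by time_store_tostring
    str(store)
    store.info()  # exercised by time_store_info
```

Once merged, these can be run through asv in the usual way (for example via its ``-b`` benchmark filter pointed at ``hdfstore_bench``); the exact invocation depends on the local asv setup.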