From 6a4567ecc59099dc9330687f4ed4b52a4a826ae1 Mon Sep 17 00:00:00 2001 From: Alex Rychyk Date: Fri, 27 Oct 2017 23:32:19 +0300 Subject: [PATCH 01/85] Added applying of multiple columns to resample (#17950) (cherry picked from commit bdeadb98fc006f39fcefe048dd2452c24849e86f) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/core/resample.py | 6 +++++- pandas/tests/test_resample.py | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 56412651f13f0..d89992faf6f53 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -56,6 +56,7 @@ Documentation Changes Bug Fixes ~~~~~~~~~ +- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6edbb99641542..5a571f9077999 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -395,7 +395,11 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): grouped = PanelGroupBy(obj, grouper=grouper, axis=self.axis) try: - result = grouped.aggregate(how, *args, **kwargs) + if isinstance(obj, ABCDataFrame) and compat.callable(how): + # Check if the function is reducing or not. + result = grouped._aggregate_item_by_item(how, *args, **kwargs) + else: + result = grouped.aggregate(how, *args, **kwargs) except Exception: # we have a non-reducing function diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index ac8297a53de37..ba1a2ad1f42e2 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3103,6 +3103,26 @@ def f(x): result = g.apply(f) assert_frame_equal(result, expected) + def test_apply_with_mutated_index(self): + # GH 15169 + index = pd.date_range('1-1-2015', '12-31-15', freq='D') + df = pd.DataFrame(data={'col1': np.random.rand(len(index))}, + index=index) + + def f(x): + s = pd.Series([1, 2], index=['a', 'b']) + return s + + expected = df.groupby(pd.Grouper(freq='M')).apply(f) + + result = df.resample('M').apply(f) + assert_frame_equal(result, expected) + + # A case for series + expected = df['col1'].groupby(pd.Grouper(freq='M')).apply(f) + result = df['col1'].resample('M').apply(f) + assert_series_equal(result, expected) + def test_resample_groupby_with_label(self): # GH 13235 index = date_range('2000-01-01', freq='2D', periods=5) From ba36a14b490a10131355b3a4893923c8ab37c154 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 27 Oct 2017 15:41:31 -0500 Subject: [PATCH 02/85] COMPAT: Update for NumPy dev (#17987) Closes https://github.com/pandas-dev/pandas/issues/17986 xref https://github.com/numpy/numpy/pull/9487 (cherry picked from commit dff5109abfd1e29dcd349d04d535a9c8735219b3) --- pandas/io/packers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 92270b39f56ef..abd258034af99 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -350,8 +350,11 @@ def unconvert(values, dtype, compress=None): ) # fall through to copying `np.fromstring` - # Copy the string into a numpy array. - return np.fromstring(values, dtype=dtype) + # Copy the bytes into a numpy array. 
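A note on the replacement lines that follow: ``np.fromstring`` always copied the
data but is deprecated on NumPy dev (see numpy/numpy#9487, cited above), while
``np.frombuffer`` is zero-copy and returns a read-only view when the source is
an immutable ``bytes`` object, so an explicit copy is needed before downstream
unpacking code may mutate the result. A minimal standalone sketch of the
behaviour being worked around, using a made-up packed payload::

    import numpy as np

    values = b"\x01\x00\x00\x00\x02\x00\x00\x00"  # hypothetical packed bytes

    view = np.frombuffer(values, dtype=np.int32)  # zero-copy, read-only view
    assert not view.flags.writeable               # in-place edits would raise

    buf = view.copy()           # owns its memory, detached from ``values``
    buf.flags.writeable = True  # redundant after copy(); mirrors the patch
    buf[0] = 99                 # safe: ``values`` is untouched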
+ buf = np.frombuffer(values, dtype=dtype) + buf = buf.copy() # required to not mutate the original data + buf.flags.writeable = True + return buf def encode(obj): From 5c23ef408f193688d2e56b6c808874b79ce4d974 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 27 Oct 2017 15:03:55 -0700 Subject: [PATCH 03/85] Add timestamp method+test; closes #17329 (#17906) (cherry picked from commit 5dd2ea0b3211528ffcfe9b231ce6c00f02918153) --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/_libs/tslib.pyx | 10 +++++++--- pandas/tests/scalar/test_nat.py | 3 ++- pandas/tests/scalar/test_timestamp.py | 19 ++++++++++++++++++- 5 files changed, 29 insertions(+), 6 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 80f8d42be8ed6..e8b8b3624740d 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1794,6 +1794,7 @@ Methods Timestamp.strftime Timestamp.strptime Timestamp.time + Timestamp.timestamp Timestamp.timetuple Timestamp.timetz Timestamp.to_datetime64 diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index d89992faf6f53..b6b82546c209c 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -21,7 +21,7 @@ New features Other Enhancements ^^^^^^^^^^^^^^^^^^ -- +- :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`) - - diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index a0aae6a5de707..20b974ce5a659 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -961,8 +961,7 @@ class NaTType(_NaT): combine = _make_error_func('combine', None) utcnow = _make_error_func('utcnow', None) - if PY3: - timestamp = _make_error_func('timestamp', datetime) + timestamp = _make_error_func('timestamp', Timestamp) # GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or # return NaT create functions that raise, for binding to NaTType @@ -1409,6 +1408,11 @@ cdef class _Timestamp(datetime): def __get__(self): return np.datetime64(self.value, 'ns') + def timestamp(self): + """Return POSIX timestamp as float.""" + # py27 compat, see GH#17329 + return round(self.value / 1e9, 6) + cdef PyTypeObject* ts_type = Timestamp @@ -3366,7 +3370,7 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): """ Convert the val (in i8) from timezone1 to timezone2 - This is a single timezone versoin of tz_convert + This is a single timezone version of tz_convert Parameters ---------- diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 135e4c544de41..0e69371511294 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -125,12 +125,13 @@ def test_round_nat(klass): def test_NaT_methods(): # GH 9513 + # GH 17329 for `timestamp` raise_methods = ['astimezone', 'combine', 'ctime', 'dst', 'fromordinal', 'fromtimestamp', 'isocalendar', 'strftime', 'strptime', 'time', 'timestamp', 'timetuple', 'timetz', 'toordinal', 'tzname', 'utcfromtimestamp', 'utcnow', 'utcoffset', - 'utctimetuple'] + 'utctimetuple', 'timestamp'] nat_methods = ['date', 'now', 'replace', 'to_datetime', 'today', 'tz_convert', 'tz_localize'] nan_methods = ['weekday', 'isoweekday'] diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index c1b9f858a08de..c160471bd0981 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -19,7 +19,7 @@ from pandas._libs import tslib, period from pandas._libs.tslibs.timezones import get_timezone -from pandas.compat import 
lrange, long +from pandas.compat import lrange, long, PY3 from pandas.util.testing import assert_series_equal from pandas.compat.numpy import np_datetime64_compat from pandas import (Timestamp, date_range, Period, Timedelta, compat, @@ -1079,6 +1079,23 @@ def test_is_leap_year(self): dt = Timestamp('2100-01-01 00:00:00', tz=tz) assert not dt.is_leap_year + def test_timestamp(self): + # GH#17329 + # tz-naive --> treat it as if it were UTC for purposes of timestamp() + ts = Timestamp.now() + uts = ts.replace(tzinfo=utc) + assert ts.timestamp() == uts.timestamp() + + tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central') + utsc = tsc.tz_convert('UTC') + # utsc is a different representation of the same time + assert tsc.timestamp() == utsc.timestamp() + + if PY3: + # should agree with datetime.timestamp method + dt = ts.to_pydatetime() + assert dt.timestamp() == ts.timestamp() + class TestTimestampNsOperations(object): From 679d3b94e2bfed032c79d583f261759c1ab489ef Mon Sep 17 00:00:00 2001 From: cgohlke Date: Sat, 28 Oct 2017 11:49:36 -0700 Subject: [PATCH 04/85] BUG: Fix memory access violations in is_lexsorted (#18005) (cherry picked from commit 34abef282cc7f27257ed873c60a91f69f67b36b5) --- pandas/_libs/algos.pyx | 5 +++-- pandas/tests/test_algos.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d159761c3f5e6..a44a7288bda45 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -87,7 +87,7 @@ class NegInfinity(object): @cython.boundscheck(False) def is_lexsorted(list list_of_arrays): cdef: - int i + Py_ssize_t i Py_ssize_t n, nlevels int64_t k, cur, pre ndarray arr @@ -99,11 +99,12 @@ def is_lexsorted(list list_of_arrays): cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) for i in range(nlevels): arr = list_of_arrays[i] + assert arr.dtype.name == 'int64' vecs[i] = arr.data # Assume uniqueness?? 
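Two fixes land in the loop below: the index ``i`` becomes ``Py_ssize_t`` so it
cannot be truncated for very long arrays, and iteration now starts at 1,
because with ``i = 0`` the comparison against ``vecs[k][i - 1]`` dereferenced
memory one element before the start of the buffer, which is the access
violation this patch addresses. A rough pure-Python rendering of the corrected
algorithm, for illustration only (not the actual Cython implementation)::

    def is_lexsorted_py(list_of_arrays):
        # Sketch of the fixed loop over parallel label arrays.
        n = len(list_of_arrays[0])
        for i in range(1, n):      # start at 1: row 0 has no predecessor
            for col in list_of_arrays:
                if col[i] > col[i - 1]:
                    break          # strictly greater: this row pair is ordered
                if col[i] < col[i - 1]:
                    return False   # out of order at this level
                # equal: tie, compare the next level
        return True

    assert is_lexsorted_py([[0, 0, 1], [0, 1, 0]])
    assert not is_lexsorted_py([[0, 1, 0], [0, 0, 1]])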
with nogil: - for i in range(n): + for i in range(1, n): for k in range(nlevels): cur = vecs[k][i] pre = vecs[k][i -1] diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 38625bfb29917..df9297312a6f3 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1219,7 +1219,7 @@ def test_is_lexsorted(): 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0]), + 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int64'), np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, @@ -1231,7 +1231,7 @@ def test_is_lexsorted(): 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, - 4, 3, 2, 1, 0])] + 4, 3, 2, 1, 0], dtype='int64')] assert (not libalgos.is_lexsorted(failure)) From aa5ddb3367049b6e3f21b12cb2b51a642ce324a4 Mon Sep 17 00:00:00 2001 From: jschendel Date: Sat, 28 Oct 2017 18:12:08 -0600 Subject: [PATCH 05/85] ERR: Fix segfault with .astype('category') on empty DataFrame (#18015) (cherry picked from commit 5959ee3e133723136d4862864988a63ef3cc2a2f) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/_libs/src/inference.pyx | 6 +++--- pandas/tests/dtypes/test_inference.py | 6 ++++++ pandas/tests/test_categorical.py | 7 +++++++ 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index b6b82546c209c..786f06cf19b8e 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -116,7 +116,7 @@ Numeric Categorical ^^^^^^^^^^^ -- +- Bug in :meth:`DataFrame.astype` where casting to 'category' on an empty ``DataFrame`` causes a segmentation fault (:issue:`18004`) - - diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index b0a64e1ccc225..c340e870e9722 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -349,13 +349,13 @@ def infer_dtype(object value, bint skipna=False): if values.dtype != np.object_: values = values.astype('O') + # make contiguous + values = values.ravel() + n = len(values) if n == 0: return 'empty' - # make contiguous - values = values.ravel() - # try to use a valid value for i in range(n): val = util.get_value_1d(values, i) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 70273f9e999cf..7195cb43a70dc 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -416,6 +416,12 @@ def test_length_zero(self): result = lib.infer_dtype([]) assert result == 'empty' + # GH 18004 + arr = np.array([np.array([], dtype=object), + np.array([], dtype=object)]) + result = lib.infer_dtype(arr) + assert result == 'empty' + def test_integers(self): arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') result = lib.infer_dtype(arr) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 272ba25bf8f8a..6366aae8ccdf6 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -2124,6 +2124,13 @@ def test_creation_astype(self): res = s.astype(CategoricalDtype(list('abcdef'), ordered=True)) tm.assert_series_equal(res, exp) + @pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']]) + def test_empty_astype(self, columns): + # GH 18004 + msg = '> 1 ndim Categorical are not supported at this time' + with tm.assert_raises_regex(NotImplementedError, 
msg): + DataFrame(columns=columns).astype('category') + def test_construction_series(self): l = [1, 2, 3, 1] From 6912d253e4ce5c1158dc61e84e480c194a049cef Mon Sep 17 00:00:00 2001 From: miker985 Date: Mon, 30 Oct 2017 17:24:52 -0700 Subject: [PATCH 06/85] BUG: Fix parsing of stata dates (#17797) (#17990) (cherry picked from commit e886af53721b1bd376e21e57552aa38a30d0d021) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/io/stata.py | 26 ++++++++++++------------- pandas/tests/io/data/stata13_dates.dta | Bin 0 -> 3386 bytes pandas/tests/io/test_stata.py | 21 ++++++++++++++++++++ 4 files changed, 35 insertions(+), 13 deletions(-) create mode 100644 pandas/tests/io/data/stata13_dates.dta diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 786f06cf19b8e..0369d6608076f 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -78,6 +78,7 @@ I/O - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`) + Plotting ^^^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index afc1631a947c8..aafe5f2ce76bd 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -306,11 +306,11 @@ def convert_delta_safe(base, deltas, unit): data_col[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) - if fmt in ["%tc", "tc"]: # Delta ms relative to base + if fmt.startswith(("%tc", "tc")): # Delta ms relative to base base = stata_epoch ms = dates conv_dates = convert_delta_safe(base, ms, 'ms') - elif fmt in ["%tC", "tC"]: + elif fmt.startswith(("%tC", "tC")): from warnings import warn warn("Encountered %tC format. Leaving in Stata Internal Format.") @@ -318,27 +318,30 @@ def convert_delta_safe(base, deltas, unit): if has_bad_values: conv_dates[bad_locs] = pd.NaT return conv_dates - elif fmt in ["%td", "td", "%d", "d"]: # Delta days relative to base + # Delta days relative to base + elif fmt.startswith(("%td", "td", "%d", "d")): base = stata_epoch days = dates conv_dates = convert_delta_safe(base, days, 'd') - elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week + # does not count leap days - 7 days is a week. 
+    # 52nd week may have more than 7 days
+    elif fmt.startswith(("%tw", "tw")):
         year = stata_epoch.year + dates // 52
         days = (dates % 52) * 7
         conv_dates = convert_year_days_safe(year, days)
-    elif fmt in ["%tm", "tm"]:  # Delta months relative to base
+    elif fmt.startswith(("%tm", "tm")):  # Delta months relative to base
         year = stata_epoch.year + dates // 12
         month = (dates % 12) + 1
         conv_dates = convert_year_month_safe(year, month)
-    elif fmt in ["%tq", "tq"]:  # Delta quarters relative to base
+    elif fmt.startswith(("%tq", "tq")):  # Delta quarters relative to base
         year = stata_epoch.year + dates // 4
         month = (dates % 4) * 3 + 1
         conv_dates = convert_year_month_safe(year, month)
-    elif fmt in ["%th", "th"]:  # Delta half-years relative to base
+    elif fmt.startswith(("%th", "th")):  # Delta half-years relative to base
         year = stata_epoch.year + dates // 2
         month = (dates % 2) * 6 + 1
         conv_dates = convert_year_month_safe(year, month)
-    elif fmt in ["%ty", "ty"]:  # Years -- not delta
+    elif fmt.startswith(("%ty", "ty")):  # Years -- not delta
         year = dates
         month = np.ones_like(dates)
         conv_dates = convert_year_month_safe(year, month)
@@ -1029,10 +1032,6 @@ def _read_header(self):
         # calculate size of a data record
         self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist)

-        # remove format details from %td
-        self.fmtlist = ["%td" if x.startswith("%td") else x
-                        for x in self.fmtlist]
-
     def _read_new_header(self, first_char):
         # The first part of the header is common to 117 and 118.
         self.path_or_buf.read(27)  # stata_dta><header><release>
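The behavioural change in this patch: Stata appends optional display details to
a base format, so a daily date column may carry a format such as
``%tdCCYY-NN-DD``. The old exact-match lookups skipped those columns, leaving
them as ordinal numbers; prefix matching catches them. A small standalone
illustration (the format string is made up, and the list is an abbreviated
stand-in for the module's ``_date_formats``)::

    date_formats = ["%tc", "%tC", "%td", "%d", "%tw",
                    "%tm", "%tq", "%th", "%ty"]

    fmt = "%tdCCYY-NN-DD"  # hypothetical daily format with display details

    assert fmt not in date_formats                       # old check: missed
    assert any(fmt.startswith(f) for f in date_formats)  # new check: caught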
@@ -1578,7 +1577,8 @@ def read(self, nrows=None, convert_dates=None, self._do_convert_missing(data, convert_missing) if convert_dates: - cols = np.where(lmap(lambda x: x in _date_formats, + cols = np.where(lmap(lambda x: any(x.startswith(fmt) + for fmt in _date_formats), self.fmtlist))[0] for i in cols: col = data.columns[i] diff --git a/pandas/tests/io/data/stata13_dates.dta b/pandas/tests/io/data/stata13_dates.dta new file mode 100644 index 0000000000000000000000000000000000000000..87b857559e501e555b74131bce5abc7b2b1d98ff GIT binary patch literal 3386 zcmeHKO=uHA6n?g1#Xl&Bco706;HBM75vy@nAq`TfjY{jGc-T&}lkVbfwwax@1`qAg zqZbd_i+F1f;=zMntXFS}XQ3bz5%Hi#4y9Q z!?1;1f*nFFM2SyuM666^2pM$)P1hI+sUTb(zk-a0iAJpxfQ(TK6|4gQyj!r4kEe-m z@xJOH<^_aH1+!M~;Q7?GB1;XLnW2<LIc&PED$j^@(8qakr zdq(5gL&`pLSmD!S3g2n$I0bzJIf=t2MXQ(g=cCi|< zFn~(6jRYsicxqG9mf&K)1em9{=dkZNn0ev9qYwm{!T~S@WyJmet%=uYueF|95%8_u4DozI6c{JqU1%w+Fm^;Oz_d!JdSLIR5@4 z{xc_@V>vc(JQd%O&!p;G3>|`~^Lx*OsBf(+-1s4s>mD`g-KXc>qD;JfvucZa2aoP| z342l7@5d^#jMVK*eaha(mrErsorIk^gwD2s&YRLGCQaYgy8fV|izzNLLiyL1SBxxV zZ1eEXL(B`+;^*AGYOap!x%J#8Y{C Date: Thu, 2 Nov 2017 06:58:52 -0500 Subject: [PATCH 07/85] Update Contributing Environment section (#18052) (cherry picked from commit 0fd3bd73e17e4123f1d334f4ebd305e33cd75fe1) --- ci/environment-dev.yaml | 14 + ...ll.txt => requirements-optional-conda.txt} | 45 ++- ci/requirements-optional-pip.txt | 27 ++ ci/requirements_dev.txt | 14 +- doc/source/contributing.rst | 282 ++++++++---------- scripts/convert_deps.py | 29 ++ 6 files changed, 232 insertions(+), 179 deletions(-) create mode 100644 ci/environment-dev.yaml rename ci/{requirements_all.txt => requirements-optional-conda.txt} (68%) create mode 100644 ci/requirements-optional-pip.txt create mode 100644 scripts/convert_deps.py diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml new file mode 100644 index 0000000000000..c3d3d59f895c6 --- /dev/null +++ b/ci/environment-dev.yaml @@ -0,0 +1,14 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - Cython + - NumPy + - moto + - pytest + - python-dateutil + - python=3 + - pytz + - setuptools + - sphinx diff --git a/ci/requirements_all.txt b/ci/requirements-optional-conda.txt similarity index 68% rename from ci/requirements_all.txt rename to ci/requirements-optional-conda.txt index e13afd619f105..6edb8d17337e4 100644 --- a/ci/requirements_all.txt +++ b/ci/requirements-optional-conda.txt @@ -1,28 +1,27 @@ -pytest>=3.1.0 -pytest-cov -pytest-xdist -flake8 -sphinx=1.5* -nbsphinx -ipython -python-dateutil -pytz -openpyxl -xlsxwriter -xlrd -xlwt -html5lib -patsy beautifulsoup4 -numpy -cython -scipy +blosc +bottleneck +fastparquet +feather-format +html5lib +ipython +ipykernel +jinja2 +lxml +matplotlib +nbsphinx numexpr +openpyxl +pyarrow +pymysql pytables -matplotlib +pytest-cov +pytest-xdist +s3fs +scipy seaborn -lxml sqlalchemy -bottleneck -pymysql -Jinja2 +xarray +xlrd +xlsxwriter +xlwt diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt new file mode 100644 index 0000000000000..06b22bd8f2c63 --- /dev/null +++ b/ci/requirements-optional-pip.txt @@ -0,0 +1,27 @@ +# This file was autogenerated by scripts/convert_deps.py +# Do not modify directlybeautifulsoup4 +blosc +bottleneck +fastparquet +feather-format +html5lib +ipython +jinja2 +lxml +matplotlib +nbsphinx +numexpr +openpyxl +pyarrow +pymysql +tables +pytest-cov +pytest-xdist +s3fs +scipy +seaborn +sqlalchemy +xarray +xlrd +xlsxwriter +xlwt \ No newline at end of file diff --git a/ci/requirements_dev.txt 
b/ci/requirements_dev.txt index dbc4f6cbd6509..2fb36b7cd70d8 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -1,8 +1,10 @@ +# This file was autogenerated by scripts/convert_deps.py +# Do not modify directly +Cython +NumPy +moto +pytest python-dateutil pytz -numpy -cython -pytest>=3.1.0 -pytest-cov -flake8 -moto +setuptools +sphinx \ No newline at end of file diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index d8d57a8bfffdd..afde8d8374fb5 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -11,32 +11,32 @@ Where to start? =============== All contributions, bug reports, bug fixes, documentation improvements, -enhancements and ideas are welcome. +enhancements, and ideas are welcome. -If you are simply looking to start working with the *pandas* codebase, navigate to the -`GitHub "issues" tab `_ and start looking through -interesting issues. There are a number of issues listed under `Docs +If you are brand new to pandas or open-source development, we recommend going +through the `GitHub "issues" tab `_ +to find issues that interest you. There are a number of issues listed under `Docs `_ and `Difficulty Novice `_ -where you could start out. - -Or maybe through using *pandas* you have an idea of your own or are looking for something -in the documentation and thinking 'this can be improved'...you can do something -about it! +where you could start out. Once you've found an interesting issue, you can +return here to get your development environment setup. Feel free to ask questions on the `mailing list -`_ or on `Gitter -`_. +`_ or on `Gitter`_. + +.. _contributing.bug_reports: Bug reports and enhancement requests ==================================== -Bug reports are an important part of making *pandas* more stable. Having a complete bug report -will allow others to reproduce the bug and provide insight into fixing. Because many versions of -*pandas* are supported, knowing version information will also identify improvements made since -previous versions. Trying the bug-producing code out on the *master* branch is often a worthwhile exercise -to confirm the bug still exists. It is also worth searching existing bug reports and pull requests +Bug reports are an important part of making *pandas* more stable. Having a complete bug report +will allow others to reproduce the bug and provide insight into fixing. See +`this stackoverflow article `_ for tips on +writing a good bug report. + +Trying the bug-producing code out on the *master* branch is often a worthwhile exercise +to confirm the bug still exists. It is also worth searching existing bug reports and pull requests to see if the issue has already been reported and/or fixed. Bug reports must: @@ -60,12 +60,16 @@ Bug reports must: The issue will then show up to the *pandas* community and be open to comments/ideas from others. +.. _contributing.github + Working with the code ===================== Now that you have an issue you want to fix, enhancement to add, or documentation to improve, you need to learn how to work with GitHub and the *pandas* code base. +.. 
_contributing.version_control: + Version control, Git, and GitHub -------------------------------- @@ -103,167 +107,164 @@ want to clone your fork to your machine:: git clone https://github.com/your-user-name/pandas.git pandas-yourname cd pandas-yourname - git remote add upstream git://github.com/pandas-dev/pandas.git + git remote add upstream https://github.com/pandas-dev/pandas.git This creates the directory `pandas-yourname` and connects your repository to the upstream (main project) *pandas* repository. -Creating a branch ------------------ +.. _contributing.dev_env: -You want your master branch to reflect only production-ready code, so create a -feature branch for making your changes. For example:: +Creating a development environment +---------------------------------- - git branch shiny-new-feature - git checkout shiny-new-feature +To test out code changes, you'll need to build pandas from source, which +requires a C compiler and python environment. If you're making documentation +changes, you can skip to :ref:`contributing.documentation` but you won't be able +to build the documentation locally before pushing your changes. -The above can be simplified to:: +.. _contributiong.dev_c: - git checkout -b shiny-new-feature +Installing a C Complier +~~~~~~~~~~~~~~~~~~~~~~~ -This changes your working directory to the shiny-new-feature branch. Keep any -changes in this branch specific to one bug or feature so it is clear -what the branch brings to *pandas*. You can have many shiny-new-features -and switch in between them using the git checkout command. +Pandas uses C extensions (mostly written using Cython) to speed up certain +operations. To install pandas from source, you need to compile these C +extensions, which means you need a C complier. This process depends on which +platform you're using. Follow the `CPython contributing guidelines +`_ for getting a +complier installed. You don't need to do any of the ``./configure`` or ``make`` +steps; you only need to install the complier. -To update this branch, you need to retrieve the changes from the master branch:: +For Windows developers, the following links may be helpful. - git fetch upstream - git rebase upstream/master +- https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/ +- https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit +- https://cowboyprogrammer.org/building-python-wheels-for-windows/ +- https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/ +- https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy -This will replay your commits on top of the latest pandas git master. If this -leads to merge conflicts, you must resolve these before submitting your pull -request. If you have uncommitted changes, you will need to ``stash`` them prior -to updating. This will effectively store your changes and they can be reapplied -after updating. +Let us know if you have any difficulties by opening an issue or reaching out on +`Gitter`_. -.. _contributing.dev_env: +.. _contributiong.dev_python: -Creating a development environment ----------------------------------- +Creating a Python Environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -An easy way to create a *pandas* development environment is as follows. 
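The rewritten instructions below settle on a three-step conda workflow:
install the build dependencies, build pandas in place, then install the
optional dependencies. Once ``python setup.py build_ext --inplace`` has run, a
quick way to confirm that the compiler step actually produced importable
extensions is a sketch along these lines (the version string is only an
example)::

    import pandas
    print(pandas.__version__)  # dev builds carry a suffix, e.g. 0.22.0.dev0+29.g4ad6d4d74

    # Importing a compiled submodule directly fails loudly if the C
    # extensions did not build:
    from pandas._libs import algos  # noqa: F401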
+Now that you have a C complier, create an isolated pandas development +environment: -- Install either :ref:`Anaconda ` or :ref:`miniconda ` +- Install either `Anaconda `_ or `miniconda + `_ +- Make sure your conda is up to date (``conda update conda``) - Make sure that you have :ref:`cloned the repository ` - ``cd`` to the *pandas* source directory -Tell conda to create a new environment, named ``pandas_dev``, or any other name you would like -for this environment, by running:: - - conda create -n pandas_dev --file ci/requirements_dev.txt - - -For a python 3 environment:: - - conda create -n pandas_dev python=3 --file ci/requirements_dev.txt - -.. warning:: - - If you are on Windows, see :ref:`here for a fully compliant Windows environment `. - -This will create the new environment, and not touch any of your existing environments, -nor any existing python installation. It will install all of the basic dependencies of -*pandas*, as well as the development and testing tools. If you would like to install -other dependencies, you can install them as follows:: +We'll now kick off a three-step process: - conda install -n pandas_dev -c pandas pytables scipy +1. Install the build dependencies +2. Build and install pandas +3. Install the optional dependencies -To install *all* pandas dependencies you can do the following:: +.. code-block:: none - conda install -n pandas_dev -c conda-forge --file ci/requirements_all.txt + # Create and activate the build environment + conda env create -f ci/environment-dev.yaml + conda activate pandas-dev -To work in this environment, Windows users should ``activate`` it as follows:: + # Build and install pandas + python setup.py build_ext --inplace -j 4 + python -m pip install -e . - activate pandas_dev + # Install the rest of the optional dependencies + conda install -c defaults -c conda-forge --file=ci/requirements-optional-conda.txt -Mac OSX / Linux users should use:: +At this point you should be able to import pandas from your locally built version:: - source activate pandas_dev + $ python # start an interpreter + >>> import pandas + >>> print(pandas.__version__) + 0.22.0.dev0+29.g4ad6d4d74 -You will then see a confirmation message to indicate you are in the new development environment. +This will create the new environment, and not touch any of your existing environments, +nor any existing python installation. To view your environments:: conda info -e -To return to your home root environment in Windows:: - - deactivate +To return to your root environment:: -To return to your home root environment in OSX / Linux:: - - source deactivate + conda deactivate See the full conda docs `here `__. -At this point you can easily do an *in-place* install, as detailed in the next section. - -.. _contributing.windows: - -Creating a Windows development environment ------------------------------------------- +.. _contributing.pip: -To build on Windows, you need to have compilers installed to build the extensions. You will need to install the appropriate Visual Studio compilers, VS 2008 for Python 2.7, VS 2010 for 3.4, and VS 2015 for Python 3.5 and 3.6. +Creating a Python Environment (pip) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For Python 2.7, you can install the ``mingw`` compiler which will work equivalently to VS 2008:: +If you aren't using conda for you development environment, follow these instructions. +You'll need to have at least python3.5 installed on your system. - conda install -n pandas_dev libpython +.. 
code-block:: none -or use the `Microsoft Visual Studio VC++ compiler for Python `__. Note that you have to check the ``x64`` box to install the ``x64`` extension building capability as this is not installed by default. + # Create a virtual environment + # Use an ENV_DIR of your choice. We'll use ~/virtualenvs/pandas-dev + # Any parent directories should already exist + python3 -m venv ~/virtualenvs/pandas-dev + # Activate the virtulaenv + . ~/virtualenvs/pandas-dev/bin/activate -For Python 3.4, you can download and install the `Windows 7.1 SDK `__. Read the references below as there may be various gotchas during the installation. - -For Python 3.5 and 3.6, you can download and install the `Visual Studio 2015 Community Edition `__. - -Here are some references and blogs: - -- https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/ -- https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit -- https://cowboyprogrammer.org/building-python-wheels-for-windows/ -- https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/ -- https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy + # Install the build dependencies + python -m pip install -r ci/requirements_dev.txt + # Build and install pandas + python setup.py build_ext --inplace -j 4 + python -m pip install -e . -.. _contributing.getting_source: + # Install additional dependencies + python -m pip install -r ci/requirements-optional-pip.txt -Making changes --------------- +Creating a branch +----------------- -Before making your code changes, it is often necessary to build the code that was -just checked out. There are two primary methods of doing this. +You want your master branch to reflect only production-ready code, so create a +feature branch for making your changes. For example:: -#. The best way to develop *pandas* is to build the C extensions in-place by - running:: + git branch shiny-new-feature + git checkout shiny-new-feature - python setup.py build_ext --inplace +The above can be simplified to:: - If you startup the Python interpreter in the *pandas* source directory you - will call the built C extensions + git checkout -b shiny-new-feature -#. Another very common option is to do a ``develop`` install of *pandas*:: +This changes your working directory to the shiny-new-feature branch. Keep any +changes in this branch specific to one bug or feature so it is clear +what the branch brings to *pandas*. You can have many shiny-new-features +and switch in between them using the git checkout command. - python setup.py develop +To update this branch, you need to retrieve the changes from the master branch:: - This makes a symbolic link that tells the Python interpreter to import *pandas* - from your development directory. Thus, you can always be using the development - version on your system without being inside the clone directory. + git fetch upstream + git rebase upstream/master +This will replay your commits on top of the latest pandas git master. If this +leads to merge conflicts, you must resolve these before submitting your pull +request. If you have uncommitted changes, you will need to ``stash`` them prior +to updating. This will effectively store your changes and they can be reapplied +after updating. .. _contributing.documentation: Contributing to the documentation ================================= -If you're not the developer type, contributing to the documentation is still -of huge value. 
You don't even have to be an expert on -*pandas* to do so! Something as simple as rewriting small passages for clarity -as you reference the docs is a simple but effective way to contribute. The -next person to read that passage will be in your debt! - -In fact, there are sections of the docs that are worse off after being written -by experts. If something in the docs doesn't make sense to you, updating the -relevant section after you figure it out is a simple way to ensure it will -help the next person. +If you're not the developer type, contributing to the documentation is still of +huge value. You don't even have to be an expert on *pandas* to do so! In fact, +there are sections of the docs that are worse off after being written by +experts. If something in the docs doesn't make sense to you, updating the +relevant section after you figure it out is a great way to ensure it will help +the next person. .. contents:: Documentation: :local: @@ -342,30 +343,6 @@ Requirements First, you need to have a development environment to be able to build pandas (see the docs on :ref:`creating a development environment above `). -Further, to build the docs, there are some extra requirements: you will need to -have ``sphinx`` and ``ipython`` installed. `numpydoc -`_ is used to parse the docstrings that -follow the Numpy Docstring Standard (see above), but you don't need to install -this because a local copy of numpydoc is included in the *pandas* source -code. `nbsphinx `_ is required to build -the Jupyter notebooks included in the documentation. - -If you have a conda environment named ``pandas_dev``, you can install the extra -requirements with:: - - conda install -n pandas_dev sphinx ipython nbconvert nbformat - conda install -n pandas_dev -c conda-forge nbsphinx - -Furthermore, it is recommended to have all :ref:`optional dependencies `. -installed. This is not strictly necessary, but be aware that you will see some error -messages when building the docs. This happens because all the code in the documentation -is executed during the doc build, and so code examples using optional dependencies -will generate errors. Run ``pd.show_versions()`` to get an overview of the installed -version of all dependencies. - -.. warning:: - - You need to have ``sphinx`` version >= 1.3.2. Building the documentation ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -386,10 +363,10 @@ If you want to do a full clean build, do:: python make.py clean python make.py html -Starting with *pandas* 0.13.1 you can tell ``make.py`` to compile only a single section -of the docs, greatly reducing the turn-around time for checking your changes. -You will be prompted to delete ``.rst`` files that aren't required. This is okay because -the prior versions of these files can be checked out from git. However, you must make sure +You can tell ``make.py`` to compile only a single section of the docs, greatly +reducing the turn-around time for checking your changes. You will be prompted to +delete ``.rst`` files that aren't required. This is okay because the prior +versions of these files can be checked out from git. However, you must make sure not to commit the file deletions to your Git repository! :: @@ -422,6 +399,8 @@ the documentation are also built by Travis-CI. These docs are then hosted `here `__, see also the :ref:`Continuous Integration ` section. +.. 
_contributing.code: + Contributing to the code base ============================= @@ -480,7 +459,7 @@ Once configured, you can run the tool as follows:: clang-format modified-c-file This will output what your file will look like if the changes are made, and to apply -them, just run the following command:: +them, run the following command:: clang-format -i modified-c-file @@ -1033,7 +1012,7 @@ delete your branch:: git checkout master git merge upstream/master -Then you can just do:: +Then you can do:: git branch -d shiny-new-feature @@ -1043,3 +1022,6 @@ branch has not actually been merged. The branch will still exist on GitHub, so to delete it there do:: git push origin --delete shiny-new-feature + + +.. _Gitter: https://gitter.im/pydata/pandas diff --git a/scripts/convert_deps.py b/scripts/convert_deps.py new file mode 100644 index 0000000000000..aabeb24a0c3c8 --- /dev/null +++ b/scripts/convert_deps.py @@ -0,0 +1,29 @@ +""" +Convert the conda environment.yaml to a pip requirements.txt +""" +import yaml + +exclude = {'python=3'} +rename = {'pytables': 'tables'} + +with open("ci/environment-dev.yaml") as f: + dev = yaml.load(f) + +with open("ci/requirements-optional-conda.txt") as f: + optional = [x.strip() for x in f.readlines()] + +required = dev['dependencies'] +required = [rename.get(dep, dep) for dep in required if dep not in exclude] +optional = [rename.get(dep, dep) for dep in optional if dep not in exclude] + + +with open("ci/requirements_dev.txt", 'wt') as f: + f.write("# This file was autogenerated by scripts/convert_deps.py\n") + f.write("# Do not modify directly\n") + f.write('\n'.join(required)) + + +with open("ci/requirements-optional-pip.txt", 'wt') as f: + f.write("# This file was autogenerated by scripts/convert_deps.py\n") + f.write("# Do not modify directly\n") + f.write("\n".join(optional)) From 992b0f93d808bf12b38f8fa61de198192a6c504b Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 2 Nov 2017 04:28:09 -0700 Subject: [PATCH 08/85] MAINT: Remove np.array_equal calls in tests (#18047) (cherry picked from commit 194dbff2c3e986dc385237b7e56fe9c6d27358bc) --- pandas/tests/dtypes/test_cast.py | 10 +++---- pandas/tests/frame/test_constructors.py | 3 +- pandas/tests/frame/test_nonunique_indexes.py | 2 +- .../indexes/datetimes/test_date_range.py | 28 +++++++++---------- pandas/tests/io/parser/common.py | 2 +- pandas/tests/io/parser/test_textreader.py | 27 ++++++++++-------- pandas/tests/reshape/test_concat.py | 4 ++- pandas/tests/series/test_missing.py | 16 +++++++---- pandas/tests/test_algos.py | 21 ++++---------- pandas/tests/test_join.py | 11 ++++---- pandas/tests/test_lib.py | 2 +- pandas/tests/test_sorting.py | 11 ++++---- pandas/tests/tseries/test_timezones.py | 4 +-- 13 files changed, 72 insertions(+), 69 deletions(-) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index d9fb458c83529..82a35fa711e8c 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -38,17 +38,17 @@ def test_downcast_conv(self): arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') - assert (np.array_equal(result, arr)) + tm.assert_numpy_array_equal(result, arr) arr = np.array([8., 8., 8., 8., 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) + expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) arr = np.array([8., 8., 8., 8., 9.0000000000005]) 
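Stepping outside the hunk for a moment, the pattern driving this whole patch:
``np.array_equal`` compares values only and returns a bare boolean, while
``tm.assert_numpy_array_equal`` also validates dtype (which is exactly what the
32-bit follow-up in the next patch relies on) and raises with a readable
message on failure. A minimal sketch of the difference, illustrative only::

    import numpy as np
    import pandas.util.testing as tm

    left = np.array([1, 2, 3], dtype=np.int32)
    right = np.array([1, 2, 3], dtype=np.int64)

    assert np.array_equal(left, right)  # passes: values match, dtype ignored

    try:
        tm.assert_numpy_array_equal(left, right)
    except AssertionError as err:
        print(err)  # reports the int32 vs. int64 dtype mismatch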
result = maybe_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) + expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) # GH16875 coercing of bools ser = Series([True, True, False]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c55c79ef18602..8291e9d452348 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1913,10 +1913,11 @@ def test_from_records_len0_with_columns(self): # #2633 result = DataFrame.from_records([], index='foo', columns=['foo', 'bar']) + expected = Index(['bar']) - assert np.array_equal(result.columns, ['bar']) assert len(result) == 0 assert result.index.name == 'foo' + tm.assert_index_equal(result.columns, expected) def test_to_frame_with_falsey_names(self): # GH 16114 diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 4f77ba0ae1f5a..5b903c5a1eaf6 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -448,7 +448,7 @@ def test_as_matrix_duplicates(self): expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']], dtype=object) - assert np.array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) def test_set_value_by_index(self): # See gh-12344 diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 3b40ef092f364..1fca0445de5c4 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -20,11 +20,6 @@ START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) -def eq_gen_range(kwargs, expected): - rng = generate_range(**kwargs) - assert (np.array_equal(list(rng), expected)) - - class TestDateRanges(TestData): def test_date_range_gen_error(self): @@ -201,20 +196,23 @@ def test_generate_cday(self): assert rng1 == rng2 def test_1(self): - eq_gen_range(dict(start=datetime(2009, 3, 25), periods=2), - [datetime(2009, 3, 25), datetime(2009, 3, 26)]) + rng = list(generate_range(start=datetime(2009, 3, 25), periods=2)) + expected = [datetime(2009, 3, 25), datetime(2009, 3, 26)] + assert rng == expected def test_2(self): - eq_gen_range(dict(start=datetime(2008, 1, 1), - end=datetime(2008, 1, 3)), - [datetime(2008, 1, 1), - datetime(2008, 1, 2), - datetime(2008, 1, 3)]) + rng = list(generate_range(start=datetime(2008, 1, 1), + end=datetime(2008, 1, 3))) + expected = [datetime(2008, 1, 1), + datetime(2008, 1, 2), + datetime(2008, 1, 3)] + assert rng == expected def test_3(self): - eq_gen_range(dict(start=datetime(2008, 1, 5), - end=datetime(2008, 1, 6)), - []) + rng = list(generate_range(start=datetime(2008, 1, 5), + end=datetime(2008, 1, 6))) + expected = [] + assert rng == expected def test_precision_finer_than_offset(self): # GH 9907 diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index e85d3ad294655..6a996213b28bb 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -823,7 +823,7 @@ def test_parse_integers_above_fp_precision(self): 17007000002000192, 17007000002000194]}) - assert np.array_equal(result['Numbers'], expected['Numbers']) + tm.assert_series_equal(result['Numbers'], expected['Numbers']) def test_chunks_have_consistent_numerical_type(self): integers = [str(i) for i in range(499999)] diff --git 
a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index c9088d2ecc5e7..f66f9ccf065f7 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -161,9 +161,9 @@ def test_skip_bad_lines(self): error_bad_lines=False, warn_bad_lines=False) result = reader.read() - expected = {0: ['a', 'd', 'g', 'l'], - 1: ['b', 'e', 'h', 'm'], - 2: ['c', 'f', 'i', 'n']} + expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object), + 1: np.array(['b', 'e', 'h', 'm'], dtype=object), + 2: np.array(['c', 'f', 'i', 'n'], dtype=object)} assert_array_dicts_equal(result, expected) reader = TextReader(StringIO(data), delimiter=':', @@ -189,8 +189,10 @@ def test_header_not_enough_lines(self): assert header == expected recs = reader.read() - expected = {0: [1, 4], 1: [2, 5], 2: [3, 6]} - assert_array_dicts_equal(expected, recs) + expected = {0: np.array([1, 4], dtype=np.int64), + 1: np.array([2, 5], dtype=np.int64), + 2: np.array([3, 6], dtype=np.int64)} + assert_array_dicts_equal(recs, expected) # not enough rows pytest.raises(parser.ParserError, TextReader, StringIO(data), @@ -203,14 +205,16 @@ def test_header_not_enough_lines_as_recarray(self): '1,2,3\n' '4,5,6') - reader = TextReader(StringIO(data), delimiter=',', header=2, - as_recarray=True) + reader = TextReader(StringIO(data), delimiter=',', + header=2, as_recarray=True) header = reader.header expected = [['a', 'b', 'c']] assert header == expected recs = reader.read() - expected = {'a': [1, 4], 'b': [2, 5], 'c': [3, 6]} + expected = {'a': np.array([1, 4], dtype=np.int64), + 'b': np.array([2, 5], dtype=np.int64), + 'c': np.array([3, 6], dtype=np.int64)} assert_array_dicts_equal(expected, recs) # not enough rows @@ -225,7 +229,7 @@ def test_escapechar(self): reader = TextReader(StringIO(data), delimiter=',', header=None, escapechar='\\') result = reader.read() - expected = {0: ['"hello world"'] * 3} + expected = {0: np.array(['"hello world"'] * 3, dtype=object)} assert_array_dicts_equal(result, expected) def test_eof_has_eol(self): @@ -360,7 +364,7 @@ def test_empty_field_eof(self): result = TextReader(StringIO(data), delimiter=',').read() - expected = {0: np.array([1, 4]), + expected = {0: np.array([1, 4], dtype=np.int64), 1: np.array(['2', ''], dtype=object), 2: np.array(['3', ''], dtype=object)} assert_array_dicts_equal(result, expected) @@ -397,4 +401,5 @@ def test_empty_csv_input(self): def assert_array_dicts_equal(left, right): for k, v in compat.iteritems(left): - assert(np.array_equal(v, right[k])) + assert tm.assert_numpy_array_equal(np.asarray(v), + np.asarray(right[k])) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 84a15cab34cd0..bd2af6859dc46 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1594,7 +1594,9 @@ def test_concat_series_axis1_same_names_ignore_index(self): s2 = Series(randn(len(dates)), index=dates, name='value') result = concat([s1, s2], axis=1, ignore_index=True) - assert np.array_equal(result.columns, [0, 1]) + expected = Index([0, 1]) + + tm.assert_index_equal(result.columns, expected) def test_concat_iterables(self): from collections import deque, Iterable diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index bd4e8b23f31b4..5ca4eba4da13b 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -636,17 +636,21 @@ def test_valid(self): def test_isna(self): ser = Series([0, 5.4, 3, nan, 
-0.001]) - np.array_equal(ser.isna(), - Series([False, False, False, True, False]).values) + expected = Series([False, False, False, True, False]) + tm.assert_series_equal(ser.isna(), expected) + ser = Series(["hi", "", nan]) - np.array_equal(ser.isna(), Series([False, False, True]).values) + expected = Series([False, False, True]) + tm.assert_series_equal(ser.isna(), expected) def test_notna(self): ser = Series([0, 5.4, 3, nan, -0.001]) - np.array_equal(ser.notna(), - Series([True, True, True, False, True]).values) + expected = Series([True, True, True, False, True]) + tm.assert_series_equal(ser.notna(), expected) + ser = Series(["hi", "", nan]) - np.array_equal(ser.notna(), Series([True, True, False]).values) + expected = Series([True, True, False]) + tm.assert_series_equal(ser.notna(), expected) def test_pad_nan(self): x = Series([np.nan, 1., np.nan, 3., np.nan], ['z', 'a', 'b', 'c', 'd'], diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index df9297312a6f3..6a5c0ae11abb7 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1132,19 +1132,19 @@ def test_pad_backfill_object_segfault(): result = libalgos.pad_object(old, new) expected = np.array([-1], dtype=np.int64) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) result = libalgos.pad_object(new, old) expected = np.array([], dtype=np.int64) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) result = libalgos.backfill_object(old, new) expected = np.array([-1], dtype=np.int64) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) result = libalgos.backfill_object(new, old) expected = np.array([], dtype=np.int64) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) def test_arrmap(): @@ -1235,15 +1235,6 @@ def test_is_lexsorted(): assert (not libalgos.is_lexsorted(failure)) -# def test_get_group_index(): -# a = np.array([0, 1, 2, 0, 2, 1, 0, 0], dtype=np.int64) -# b = np.array([1, 0, 3, 2, 0, 2, 3, 0], dtype=np.int64) -# expected = np.array([1, 4, 11, 2, 8, 6, 3, 0], dtype=np.int64) - -# result = lib.get_group_index([a, b], (3, 4)) - -# assert(np.array_equal(result, expected)) - def test_groupsort_indexer(): a = np.random.randint(0, 1000, 100).astype(np.int64) @@ -1253,13 +1244,13 @@ def test_groupsort_indexer(): # need to use a stable sort expected = np.argsort(a, kind='mergesort') - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) # compare with lexsort key = a * 1000 + b result = libalgos.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) def test_infinity_sort(): diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index cde1cab37d09c..af946436b55c7 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -53,7 +53,7 @@ def test_left_join_indexer_unique(): result = _join.left_join_indexer_unique_int64(b, a) expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) - assert (np.array_equal(result, expected)) + tm.assert_numpy_array_equal(result, expected) def test_left_outer_join_bug(): @@ -69,13 +69,14 @@ def test_left_outer_join_bug(): lidx, ridx = _join.left_outer_join(left, right, max_groups, sort=False) - exp_lidx = np.arange(len(left)) - exp_ridx = -np.ones(len(left)) + exp_lidx = np.arange(len(left), dtype=np.int64) + exp_ridx = -np.ones(len(left), 
dtype=np.int64) + exp_ridx[left == 1] = 1 exp_ridx[left == 3] = 0 - assert (np.array_equal(lidx, exp_lidx)) - assert (np.array_equal(ridx, exp_ridx)) + tm.assert_numpy_array_equal(lidx, exp_lidx) + tm.assert_numpy_array_equal(ridx, exp_ridx) def test_inner_join_indexer(): diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 2662720bb436d..75aa9aa4e8198 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -198,7 +198,7 @@ def test_get_reverse_indexer(self): indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64) result = lib.get_reverse_indexer(indexer, 5) expected = np.array([4, 2, 3, 6, 7], dtype=np.int64) - assert np.array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) class TestNAObj(object): diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index a5b12bbf9608a..06c1fa1c0905a 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -332,16 +332,17 @@ def testit(label_list, shape): label_list2 = decons_group_index(group_index, shape) for a, b in zip(label_list, label_list2): - assert (np.array_equal(a, b)) + tm.assert_numpy_array_equal(a, b) shape = (4, 5, 6) - label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( - [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( - [5, 1, 0, 2, 3, 0, 5, 4], 100)] + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64), + np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64)] testit(label_list, shape) shape = (10000, 10000) - label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] + label_list = [np.tile(np.arange(10000, dtype=np.int64), 5), + np.tile(np.arange(10000, dtype=np.int64), 5)] testit(label_list, shape) diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index aa8fe90ea6500..ddcf1bb7d8b7b 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -70,7 +70,7 @@ def test_utc_to_local_no_modify(self): rng_eastern = rng.tz_convert(self.tzstr('US/Eastern')) # Values are unmodified - assert np.array_equal(rng.asi8, rng_eastern.asi8) + tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8) assert self.cmptz(rng_eastern.tz, self.tz('US/Eastern')) @@ -108,7 +108,7 @@ def test_localize_utc_conversion_explicit(self): rng = date_range('3/10/2012', '3/11/2012', freq='30T') converted = rng.tz_localize(self.tz('US/Eastern')) expected_naive = rng + offsets.Hour(5) - assert np.array_equal(converted.asi8, expected_naive.asi8) + tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8) # DST ambiguity, this should fail rng = date_range('3/11/2012', '3/12/2012', freq='30T') From e2c596de7e99452d8508330a5ddc9c71bf911df0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 3 Nov 2017 08:48:23 -0400 Subject: [PATCH 09/85] COMPAT: compare platform return on 32-bit (#18090) xref #18047 (cherry picked from commit 27bbea7ee125f4dc19dca2a7703c9a13ca754f9b) --- pandas/tests/test_algos.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6a5c0ae11abb7..240a7ad4b22f9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1243,13 +1243,21 @@ def test_groupsort_indexer(): result = libalgos.groupsort_indexer(a, 1000)[0] # need to use a stable sort + # np.argsort returns int, groupsort_indexer + # always returns int64 expected = np.argsort(a, kind='mergesort') + expected = 
expected.astype(np.int64) + tm.assert_numpy_array_equal(result, expected) # compare with lexsort + # np.lexsort returns int, groupsort_indexer + # always returns int64 key = a * 1000 + b result = libalgos.groupsort_indexer(key, 1000000)[0] expected = np.lexsort((b, a)) + expected = expected.astype(np.int64) + tm.assert_numpy_array_equal(result, expected) From c8206990a22e84018e262809a845b8934755b372 Mon Sep 17 00:00:00 2001 From: Manraj Singh Date: Sat, 4 Nov 2017 04:33:51 +0530 Subject: [PATCH 10/85] Fix 18068: Updates merge_asof error, now outputs datatypes (#18082) (cherry picked from commit 86e9dcc164760c5197438968151b2b852647de84) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/core/reshape/merge.py | 8 +++++--- pandas/tests/reshape/test_merge_asof.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 0369d6608076f..4259d8ca26043 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -103,7 +103,7 @@ Sparse Reshaping ^^^^^^^^^ -- +- Error message in ``pd.merge_asof()`` for key datatype mismatch now includes datatype of left and right key (:issue:`18068`) - - diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e409090e76944..0234a5563326c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1253,10 +1253,12 @@ def _get_merge_keys(self): join_names) = super(_AsOfMerge, self)._get_merge_keys() # validate index types are the same - for lk, rk in zip(left_join_keys, right_join_keys): + for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): if not is_dtype_equal(lk.dtype, rk.dtype): - raise MergeError("incompatible merge keys, " - "must be the same type") + raise MergeError("incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, must be the same type" + .format(i=i, lkdtype=lk.dtype, + rkdtype=rk.dtype)) # validate tolerance; must be a Timedelta if we have a DTI if self.tolerance is not None: diff --git a/pandas/tests/reshape/test_merge_asof.py b/pandas/tests/reshape/test_merge_asof.py index 78bfa2ff8597c..4b2680b9be592 100644 --- a/pandas/tests/reshape/test_merge_asof.py +++ b/pandas/tests/reshape/test_merge_asof.py @@ -973,3 +973,15 @@ def test_on_float_by_int(self): columns=['symbol', 'exch', 'price', 'mpv']) assert_frame_equal(result, expected) + + def test_merge_datatype_error(self): + """ Tests merge datatype mismatch error """ + msg = 'merge keys \[0\] object and int64, must be the same type' + + left = pd.DataFrame({'left_val': [1, 5, 10], + 'a': ['a', 'b', 'c']}) + right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], + 'a': [1, 2, 3, 6, 7]}) + + with tm.assert_raises_regex(MergeError, msg): + merge_asof(left, right, on='a') From 877917b74c7ae3884d204c2e37f86ba87dfbb9db Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Sun, 5 Nov 2017 00:19:58 +0900 Subject: [PATCH 11/85] BUG: Fix the error when reading the compressed UTF-16 file (#18091) (cherry picked from commit e0c9c67f0f81a128653e274b712b27d8618d321c) From 0d82dfb21804af9247859d94a744831aff62bcea Mon Sep 17 00:00:00 2001 From: Matt Braymer-Hayes Date: Thu, 2 Nov 2017 04:25:18 -0700 Subject: [PATCH 12/85] DOC: Remove duplicate 'in' from contributing.rst (#18040) (#18076) (cherry picked from commit 21f9e1a47b8331e2894d0bd2d401710d78850f7b) --- doc/source/contributing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 
afde8d8374fb5..4426d3fb0165e 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -331,7 +331,7 @@ The utility script ``scripts/api_rst_coverage.py`` can be used to compare the list of methods documented in ``doc/source/api.rst`` (which is used to generate the `API Reference `_ page) and the actual public methods. -This will identify methods documented in in ``doc/source/api.rst`` that are not actually +This will identify methods documented in ``doc/source/api.rst`` that are not actually class methods, and existing methods that are not documented in ``doc/source/api.rst``. From 192163620249bd0e6c15cc8d6f3f89d9ea28e29d Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 4 Nov 2017 21:36:51 -0700 Subject: [PATCH 13/85] BUG: Override mi-columns in to_csv if requested (#18110) Previously, MultiIndex columns weren't being overwritten when header was passed in for to_csv. Closes gh-5539 (cherry picked from commit e1f3a70239c636e0dc05f5ee289bbd4bfb4c1436) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/io/formats/format.py | 2 +- pandas/tests/frame/test_to_csv.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 4259d8ca26043..034c174f76005 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -77,6 +77,7 @@ I/O - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`) +- Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) Plotting diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c5d4a0ecf44ab..ab98b9c4e4f49 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1695,7 +1695,7 @@ def _save_header(self): else: encoded_labels = [] - if not has_mi_columns: + if not has_mi_columns or has_aliases: encoded_labels += list(write_cols) writer.writerow(encoded_labels) else: diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 4162a586f8063..ca8a0d8bda3ab 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -1203,3 +1203,16 @@ def test_period_index_date_overflow(self): expected = ',0\n1990-01-01,4\n,5\n3005-01-01,6\n' assert result == expected + + def test_multi_index_header(self): + # see gh-5539 + columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), + ("b", 1), ("b", 2)]) + df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) + df.columns = columns + + header = ["a", "b", "c", "d"] + result = df.to_csv(header=header) + + expected = ",a,b,c,d\n0,1,2,3,4\n1,5,6,7,8\n" + assert result == expected From 0b94cead9f35f00b1139bd49d7a9933260728d4b Mon Sep 17 00:00:00 2001 From: topper-123 Date: Sun, 5 Nov 2017 10:54:49 +0000 Subject: [PATCH 14/85] Let CategoricalIndex take CategoricalDtype as dtype (#18116) (cherry picked from commit 58c2f098ab13178f7cf3d3a61c9f4e0fa5d54ead) --- doc/source/whatsnew/v0.21.1.txt | 3 +++ pandas/core/indexes/category.py | 3 ++- pandas/tests/indexes/test_category.py | 25 +++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 
034c174f76005..d3c590b0ed6e7 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -121,6 +121,9 @@ Categorical - Bug in :meth:`DataFrame.astype` where casting to 'category' on an empty ``DataFrame`` causes a segmentation fault (:issue:`18004`) - - +- Error messages in the testing module have been improved when items have + different ``CategoricalDtype`` (:issue:`18069`) +- ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`) Other ^^^^^ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 8b680127723c3..70b531ffb0ec4 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -79,7 +79,8 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, if data is not None or categories is None: cls._scalar_data_error(data) data = [] - data = cls._create_categorical(cls, data, categories, ordered) + data = cls._create_categorical(cls, data, categories, ordered, + dtype) if copy: data = data.copy() diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d8ec23b9c7e0e..5e40e06d57413 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -4,6 +4,7 @@ import pandas.util.testing as tm from pandas.core.indexes.api import Index, CategoricalIndex +from pandas.core.dtypes.dtypes import CategoricalDtype from .common import Base from pandas.compat import range, PY3 @@ -95,6 +96,11 @@ def test_construction(self): 1, -1, 0], dtype='int8')) assert result.ordered + result = pd.CategoricalIndex(ci, categories=list('ab'), ordered=True) + expected = pd.CategoricalIndex(ci, categories=list('ab'), ordered=True, + dtype='category') + tm.assert_index_equal(result, expected, exact=True) + # turn me to an Index result = Index(np.array(ci)) assert isinstance(result, Index) @@ -125,6 +131,25 @@ def test_construction_with_dtype(self): result = CategoricalIndex(idx, categories=idx, ordered=True) tm.assert_index_equal(result, expected, exact=True) + def test_construction_with_categorical_dtype(self): + # construction with CategoricalDtype + # GH18109 + data, cats, ordered = 'a a b b'.split(), 'c b a'.split(), True + dtype = CategoricalDtype(categories=cats, ordered=ordered) + + result = pd.CategoricalIndex(data, dtype=dtype) + expected = pd.CategoricalIndex(data, categories=cats, + ordered=ordered) + tm.assert_index_equal(result, expected, exact=True) + + # error to combine categories or ordered and dtype keywords args + with pytest.raises(ValueError, match="Cannot specify both `dtype` and " + "`categories` or `ordered`."): + pd.CategoricalIndex(data, categories=cats, dtype=dtype) + with pytest.raises(ValueError, match="Cannot specify both `dtype` and " + "`categories` or `ordered`."): + pd.CategoricalIndex(data, ordered=ordered, dtype=dtype) + def test_create_categorical(self): # https://github.com/pandas-dev/pandas/pull/17513 # The public CI constructor doesn't hit this code path with From 357a7b35ed9dff53a6e88d4db197a5cd25d25bca Mon Sep 17 00:00:00 2001 From: derestle-htwg Date: Mon, 6 Nov 2017 05:39:44 +0100 Subject: [PATCH 15/85] DOC: Remove stray '}' in frame.py docs (#18131) Closes gh-18130 (cherry picked from commit 1181622fbfc9208a906ae9e70716b83fc7b3ae5a) --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a1af806e5cb9e..2c417dda6fc37 100644 --- a/pandas/core/frame.py +++ 
b/pandas/core/frame.py
@@ -5113,7 +5113,7 @@ def append(self, other, ignore_index=False, verify_integrity=False):
 
         >>> df = pd.DataFrame(columns=['A'])
         >>> for i in range(5):
-        ...     df = df.append({'A'}: i}, ignore_index=True)
+        ...     df = df.append({'A': i}, ignore_index=True)
         >>> df
            A
         0  0

From 447517626a506bcd2bc1b7a1faa3e66e3856e10c Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Mon, 6 Nov 2017 09:42:08 -0800
Subject: [PATCH 16/85] TST: Add another test for segfault in C engine (#18128)

xref gh-13833.
Closes gh-5291.

(cherry picked from commit 6f0ff1a6f5fdc2cc135ffffc6e54b471977d2659)
---
 pandas/tests/io/parser/c_parser_only.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py
index c68b2bf064d97..6d476e326213e 100644
--- a/pandas/tests/io/parser/c_parser_only.py
+++ b/pandas/tests/io/parser/c_parser_only.py
@@ -290,11 +290,11 @@ def test_empty_header_read(count):
 
         test_empty_header_read(count)
 
     def test_parse_trim_buffers(self):
-        # This test is part of a bugfix for issue #13703. It attmepts to
+        # This test is part of a bugfix for issue #13703. It attempts
         # to stress the system memory allocator, to cause it to move the
         # stream buffer and either let the OS reclaim the region, or let
         # other memory requests of parser otherwise modify the contents
-        # of memory space, where it was formely located.
+        # of memory space, where it was formerly located.
         # This test is designed to cause a `segfault` with unpatched
         # `tokenizer.c`. Sometimes the test fails on `segfault`, other
         # times it fails due to memory corruption, which causes the
@@ -346,7 +346,7 @@ def test_parse_trim_buffers(self):
 
         # Generate the expected output: manually create the dataframe
         # by splitting by comma and repeating the `n_lines` times.
-        row = tuple(val_ if val_ else float("nan")
+        row = tuple(val_ if val_ else np.nan
                     for val_ in record_.split(","))
         expected = pd.DataFrame([row for _ in range(n_lines)],
                                 dtype=object, columns=None, index=None)
@@ -359,6 +359,15 @@ def test_parse_trim_buffers(self):
 
         # Check for data corruption if there was no segfault
         tm.assert_frame_equal(result, expected)
+
+        # This extra test was added to replicate the fault in gh-5291.
+        # Force 'utf-8' encoding, so that `_string_convert` would take
+        # a different execution branch. 
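+        # Re-reading the same data in chunks, with dtype=object and an
+        # explicit encoding, should concatenate back to exactly the
+        # frame produced by the single-pass read above.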
+ chunks_ = self.read_csv(StringIO(csv_data), header=None, + dtype=object, chunksize=chunksize, + encoding='utf_8') + result = pd.concat(chunks_, axis=0, ignore_index=True) + tm.assert_frame_equal(result, expected) + def test_internal_null_byte(self): # see gh-14012 # From 08a718169769e8cffc17ba01d8b9eed3de5b5a2d Mon Sep 17 00:00:00 2001 From: topper-123 Date: Thu, 2 Nov 2017 11:26:50 +0000 Subject: [PATCH 17/85] improve test output for Categoricals (#18069) (cherry picked from commit bb4fa658483cc0f9aa714dfc5733fe7ab7945f73) --- doc/source/whatsnew/v0.21.1.txt | 3 ++- pandas/util/testing.py | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index d3c590b0ed6e7..e64753fb21f7c 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -119,7 +119,8 @@ Categorical ^^^^^^^^^^^ - Bug in :meth:`DataFrame.astype` where casting to 'category' on an empty ``DataFrame`` causes a segmentation fault (:issue:`18004`) -- +- Error messages in the testing module have been improved when items have + different ``CategoricalDtype`` (:issue:`18069`) - - Error messages in the testing module have been improved when items have different ``CategoricalDtype`` (:issue:`18069`) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 730d2782e85d2..dec67bbea854f 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1074,8 +1074,12 @@ def assert_categorical_equal(left, right, check_dtype=True, def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(left, np.ndarray): left = pprint_thing(left) + elif is_categorical_dtype(left): + left = repr(left) if isinstance(right, np.ndarray): right = pprint_thing(right) + elif is_categorical_dtype(right): + right = repr(right) msg = """{obj} are different From 91639e5e4e440bb509e4a624d6529c90d4185662 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 6 Nov 2017 12:33:43 -0800 Subject: [PATCH 18/85] BUG: Don't parse NA-values in index when requested (#18127) Closes gh-5239. (cherry picked from commit c176a3c29ccabe7e471fa0b76c6cc1ceeb9bcd77) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/io/parsers.py | 13 ++++++++----- pandas/tests/io/parser/na_values.py | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index e64753fb21f7c..22d4b1e288837 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -77,6 +77,7 @@ I/O - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. 
- Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`) +- Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`) - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7f3f5630e49f9..cf181f1de938b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1231,6 +1231,8 @@ def __init__(self, kwds): self.na_values = kwds.get('na_values') self.na_fvalues = kwds.get('na_fvalues') + self.na_filter = kwds.get('na_filter', False) + self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') self.as_recarray = kwds.get('as_recarray', False) @@ -1404,7 +1406,6 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): elif not self._has_complex_date_col: index = self._get_simple_index(alldata, columns) index = self._agg_index(index) - elif self._has_complex_date_col: if not self._name_processed: (self.index_names, _, @@ -1487,8 +1488,12 @@ def _agg_index(self, index, try_parse_dates=True): if (try_parse_dates and self._should_parse_dates(i)): arr = self._date_conv(arr) - col_na_values = self.na_values - col_na_fvalues = self.na_fvalues + if self.na_filter: + col_na_values = self.na_values + col_na_fvalues = self.na_fvalues + else: + col_na_values = set() + col_na_fvalues = set() if isinstance(self.na_values, dict): col_name = self.index_names[i] @@ -2043,8 +2048,6 @@ def __init__(self, f, **kwds): self.names_passed = kwds['names'] or None - self.na_filter = kwds['na_filter'] - self.has_index_names = False if 'has_index_names' in kwds: self.has_index_names = kwds['has_index_names'] diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index 7fbf174e19eee..8dc599b42ddc7 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -312,3 +312,21 @@ def test_empty_na_values_no_default_with_index(self): out = self.read_csv(StringIO(data), keep_default_na=False, index_col=0) tm.assert_frame_equal(out, expected) + + def test_no_na_filter_on_index(self): + # see gh-5239 + data = "a,b,c\n1,,3\n4,5,6" + + # Don't parse NA-values in index when na_filter=False. + out = self.read_csv(StringIO(data), index_col=[1], na_filter=False) + + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, + index=Index(["", "5"], name="b")) + tm.assert_frame_equal(out, expected) + + # Parse NA-values in index when na_filter=True. + out = self.read_csv(StringIO(data), index_col=[1], na_filter=True) + + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, + index=Index([np.nan, 5.0], name="b")) + tm.assert_frame_equal(out, expected) From db63a390f68d0a733981f17355620cd69ce9d7aa Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 6 Nov 2017 10:18:28 -0800 Subject: [PATCH 19/85] TST: Check lossiness of floats with parse_dates (#18136) Closes gh-2697. 
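A minimal stand-alone sketch of what the new test asserts (the data
string is taken verbatim from the first parametrized case in the diff;
the final assert is only illustrative, not part of the commit):

    import pandas as pd
    from io import StringIO

    # An integer too large to round-trip exactly through float64, plus a
    # value in scientific notation; neither parses as a date.
    data = "a\n135217135789158401\n1352171357E+5"

    # Whether or not date parsing is attempted, the failed parse must
    # leave the numeric values untouched (no lossy intermediate pass).
    result = pd.read_csv(StringIO(data), parse_dates=True)
    assert result["a"].dtype == "float64"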
(cherry picked from commit e23bd24912af10a39f86415221f293619a37e079)
---
 pandas/tests/io/parser/parse_dates.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py
index 90103e7bf26b0..4c0f67fa6876a 100644
--- a/pandas/tests/io/parser/parse_dates.py
+++ b/pandas/tests/io/parser/parse_dates.py
@@ -656,3 +656,21 @@ def test_parse_date_column_with_empty_string(self):
                          [621, ' ']]
         expected = DataFrame(expected_data, columns=['case', 'opdate'])
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("data,expected", [
+        ("a\n135217135789158401\n1352171357E+5",
+         DataFrame({"a": [135217135789158401,
+                          135217135700000]}, dtype="float64")),
+        ("a\n99999999999\n123456789012345\n1234E+0",
+         DataFrame({"a": [99999999999,
+                          123456789012345,
+                          1234]}, dtype="float64"))
+    ])
+    @pytest.mark.parametrize("parse_dates", [True, False])
+    def test_parse_date_float(self, data, expected, parse_dates):
+        # see gh-2697
+        #
+        # Date parsing should fail, so we leave the data untouched
+        # (i.e. float precision should remain unchanged).
+        result = self.read_csv(StringIO(data), parse_dates=parse_dates)
+        tm.assert_frame_equal(result, expected)

From 0b57dd879c27528ecd20c4073f39c8213414bb1b Mon Sep 17 00:00:00 2001
From: Mie~~~
Date: Wed, 8 Nov 2017 11:00:34 +0800
Subject: [PATCH 20/85] BUG: fix rolling skew/kurt floating point issue (#18065)

(cherry picked from commit 93c755e0ea64b6039c8ecd55fc637d1dd67dbd6e)
---
 doc/source/whatsnew/v0.21.1.txt |  3 +++
 pandas/_libs/window.pyx         | 23 +++++++++++++++++++--
 pandas/core/nanops.py           |  7 +++++++
 pandas/tests/test_window.py     | 10 ++++++++++
 4 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 22d4b1e288837..966a5a72f3bd4 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -57,6 +57,9 @@ Documentation Changes
 Bug Fixes
 ~~~~~~~~~
 - Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`)
+- Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`)
+- Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`)
+- Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values has floating issue (:issue:`18044`)
 
 Conversion
 ^^^^^^^^^^
diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx
index b6bd6f92f6199..27fee4de5c3a2 100644
--- a/pandas/_libs/window.pyx
+++ b/pandas/_libs/window.pyx
@@ -788,7 +788,17 @@ cdef inline double calc_skew(int64_t minp, int64_t nobs, double x, double xx,
         A = x / dnobs
         B = xx / dnobs - A * A
         C = xxx / dnobs - A * A * A - 3 * A * B
-        if B <= 0 or nobs < 3:
+
+        # #18044: with a uniform distribution, floating point error
+        # can leave B != 0. and make the result a very
+        # large number.
+        #
+        # in core/nanops.py nanskew/nankurt call the function
+        # _zero_out_fperr(m2) to fix the floating point error.
+        # if the variance is less than 1e-14, it can be
+        # treated as zero; here we follow the original
+        # skew/kurt behaviour to check B <= 1e-14
+        if B <= 1e-14 or nobs < 3:
             result = NaN
         else:
             R = sqrt(B)
@@ -915,7 +925,16 @@ cdef inline double calc_kurt(int64_t minp, int64_t nobs, double x, double xx,
             R = R * A
             D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A
 
-            if B == 0 or nobs < 4:
+            # #18044: with a uniform distribution, floating point error
+            # can leave B != 0. and make the result a very
+            # large number.
+            #
+            # in core/nanops.py nanskew/nankurt call the function
+            # _zero_out_fperr(m2) to fix the floating point error.
+            # if the variance is less than 1e-14, it can be
+            # treated as zero; here we follow the original
+            # skew/kurt behaviour to check B <= 1e-14
+            if B <= 1e-14 or nobs < 4:
                 result = NaN
             else:
                 K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index baeb869239c1e..e1c09947ac0b4 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -548,6 +548,9 @@ def nanskew(values, axis=None, skipna=True):
     m3 = adjusted3.sum(axis, dtype=np.float64)
 
     # floating point error
+    #
+    # #18044: calc_skew in _libs/window.pyx follows this behavior
+    # of fixing the fperr by treating m2 < 1e-14 as zero
     m2 = _zero_out_fperr(m2)
     m3 = _zero_out_fperr(m3)
 
@@ -609,6 +612,9 @@ def nankurt(values, axis=None, skipna=True):
     result = numer / denom - adj
 
     # floating point error
+    #
+    # #18044: calc_kurt in _libs/window.pyx follows this behavior
+    # of fixing the fperr by treating denom < 1e-14 as zero
     numer = _zero_out_fperr(numer)
     denom = _zero_out_fperr(denom)
 
@@ -699,6 +705,7 @@ def _maybe_null_out(result, axis, mask):
 
 
 def _zero_out_fperr(arg):
+    # #18044: the rolling skew/kurt fix references this behavior
     if isinstance(arg, np.ndarray):
         with np.errstate(invalid='ignore'):
             return np.where(np.abs(arg) < 1e-14, 0, arg)
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index c567613acebd1..165813a89b5db 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -2979,6 +2979,16 @@ def test_rolling_kurt_edge_cases(self):
         x = d.rolling(window=4).kurt()
         tm.assert_series_equal(expected, x)
 
+    def test_rolling_skew_eq_value_fperr(self):
+        # #18044: rolling skew over all equal values should return NaN
+        a = pd.Series([1.1] * 15).rolling(window=10).skew()
+        assert np.isnan(a).all()
+
+    def test_rolling_kurt_eq_value_fperr(self):
+        # #18044: rolling kurt over all equal values should return NaN
+        a = pd.Series([1.1] * 15).rolling(window=10).kurt()
+        assert np.isnan(a).all()
+
     def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True,
                                  has_time_rule=True, preserve_nan=True):
         result = func(self.arr)

From 62ca3d992cd5e7a5ba7236b2fdc06f25dd7e3d6a Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 8 Nov 2017 03:11:32 -0800
Subject: [PATCH 21/85] Modernize indexes.timedeltas, indexes.datetimeindex
 (#18161)

(cherry picked from commit 079f6786f90553a6fae569354273c80d7c1e44ff)
---
 pandas/core/indexes/datetimes.py  | 15 ++-----
 pandas/core/indexes/timedeltas.py | 71 ++++++++++++++-----------------
 2 files changed, 35 insertions(+), 51 deletions(-)

diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 18be6c61abdf7..70fda425e1ae3 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -2,9 +2,11 @@ from __future__ import division
 import operator
 import warnings
-from datetime import time, datetime
-from datetime import timedelta
+from datetime import time, datetime, timedelta
+
 import numpy as np
+
+from pytz import utc
+
 from pandas.core.base import _shared_docs
 
 from pandas.core.dtypes.common import (
@@ -55,10 +57,6 @@
 from pandas._libs.tslibs import timezones
 
 
-def _utc():
-    import pytz
-    return pytz.utc
-
 # -------- some conversion wrapper functions
 
 
@@ -66,7 +64,6 @@ def _field_accessor(name, field, docstring=None):
     def f(self):
         values = self.asi8
         if self.tz is not None:
-            utc = _utc()
             if self.tz is 
not utc: values = self._local_timestamps() @@ -562,8 +559,6 @@ def _convert_for_op(self, value): raise ValueError('Passed item and index have different timezone') def _local_timestamps(self): - utc = _utc() - if self.is_monotonic: return libts.tz_convert(self.asi8, utc, self.tz) else: @@ -823,7 +818,6 @@ def _add_delta(self, delta): tz = 'UTC' if self.tz is not None else None result = DatetimeIndex(new_values, tz=tz, name=name, freq='infer') - utc = _utc() if self.tz is not None and self.tz is not utc: result = result.tz_convert(self.tz) return result @@ -877,7 +871,6 @@ def astype(self, dtype, copy=True): raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) def _get_time_micros(self): - utc = _utc() values = self.asi8 if self.tz is not None and self.tz is not utc: values = self._local_timestamps() diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 6e08c32f30dcd..ca2377a03ca6b 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -36,6 +36,26 @@ join as libjoin, Timedelta, NaT, iNaT) +def _field_accessor(name, alias, docstring=None): + def f(self): + if self.hasnans: + result = np.empty(len(self), dtype='float64') + mask = self._isnan + imask = ~mask + result.flat[imask] = np.array([getattr(Timedelta(val), alias) + for val in self.asi8[imask]]) + result[mask] = np.nan + else: + result = np.array([getattr(Timedelta(val), alias) + for val in self.asi8], dtype='int64') + + return Index(result, name=self.name) + + f.__name__ = name + f.__doc__ = docstring + return property(f) + + def _td_index_cmp(opname, nat_result=False): """ Wrap comparison operations to convert timedelta-like to timedelta64 @@ -380,46 +400,17 @@ def _format_native_types(self, na_rep=u('NaT'), nat_rep=na_rep, justify='all').get_result() - def _get_field(self, m): - - values = self.asi8 - hasnans = self.hasnans - if hasnans: - result = np.empty(len(self), dtype='float64') - mask = self._isnan - imask = ~mask - result.flat[imask] = np.array( - [getattr(Timedelta(val), m) for val in values[imask]]) - result[mask] = np.nan - else: - result = np.array([getattr(Timedelta(val), m) - for val in values], dtype='int64') - return Index(result, name=self.name) - - @property - def days(self): - """ Number of days for each element. """ - return self._get_field('days') - - @property - def seconds(self): - """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return self._get_field('seconds') - - @property - def microseconds(self): - """ - Number of microseconds (>= 0 and less than 1 second) for each - element. """ - return self._get_field('microseconds') - - @property - def nanoseconds(self): - """ - Number of nanoseconds (>= 0 and less than 1 microsecond) for each - element. - """ - return self._get_field('nanoseconds') + days = _field_accessor("days", "days", + " Number of days for each element. ") + seconds = _field_accessor("seconds", "seconds", + " Number of seconds (>= 0 and less than 1 day) " + "for each element. ") + microseconds = _field_accessor("microseconds", "microseconds", + "\nNumber of microseconds (>= 0 and less " + "than 1 second) for each\nelement. 
") + nanoseconds = _field_accessor("nanoseconds", "nanoseconds", + "\nNumber of nanoseconds (>= 0 and less " + "than 1 microsecond) for each\nelement.\n") @property def components(self): From f7d0b9f1ee95362bed1e9cec947cfcccd2511d28 Mon Sep 17 00:00:00 2001 From: Peter Hoffmann Date: Wed, 8 Nov 2017 21:11:33 +0100 Subject: [PATCH 22/85] restrict columns to read for pandas.read_parquet (#18155) (cherry picked from commit 5128fe60b24e72c896ebfdb3319e28e710b44386) --- doc/source/io.rst | 10 ++++++++++ doc/source/whatsnew/v0.21.1.txt | 1 + pandas/io/parquet.py | 16 ++++++++++------ pandas/tests/io/test_parquet.py | 13 +++++++++++-- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 82cb83c168b22..de3ae2e8f4305 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4580,6 +4580,16 @@ Read from a parquet file. result.dtypes +Read only certain columns of a parquet file. + +.. ipython:: python + + result = pd.read_parquet('example_pa.parquet', engine='pyarrow', columns=['a', 'b']) + result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b']) + + result.dtypes + + .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 966a5a72f3bd4..0c5642ad52853 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -82,6 +82,7 @@ I/O - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`) - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`) - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) +- :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`) Plotting diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 4b507b7f5df6f..ef95e32cc241e 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -76,9 +76,9 @@ def write(self, df, path, compression='snappy', table, path, compression=compression, coerce_timestamps=coerce_timestamps, **kwargs) - def read(self, path): + def read(self, path, columns=None): path, _, _ = get_filepath_or_buffer(path) - return self.api.parquet.read_table(path).to_pandas() + return self.api.parquet.read_table(path, columns=columns).to_pandas() class FastParquetImpl(object): @@ -115,9 +115,9 @@ def write(self, df, path, compression='snappy', **kwargs): self.api.write(path, df, compression=compression, **kwargs) - def read(self, path): + def read(self, path, columns=None): path, _, _ = get_filepath_or_buffer(path) - return self.api.ParquetFile(path).to_pandas() + return self.api.ParquetFile(path).to_pandas(columns=columns) def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): @@ -178,7 +178,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): return impl.write(df, path, compression=compression) -def read_parquet(path, engine='auto', **kwargs): +def read_parquet(path, engine='auto', columns=None, **kwargs): """ Load a parquet object from the file path, returning a DataFrame. @@ -188,6 +188,10 @@ def read_parquet(path, engine='auto', **kwargs): ---------- path : string File path + columns: list, default=None + If not None, only these columns will be read from the file. + + .. versionadded 0.21.1 engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' Parquet reader library to use. 
If 'auto', then the option 'io.parquet.engine' is used. If 'auto', then the first @@ -201,4 +205,4 @@ def read_parquet(path, engine='auto', **kwargs): """ impl = get_engine(engine) - return impl.read(path) + return impl.read(path, columns=columns) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ecd4e8f719014..9a4edf38e2ef4 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -192,7 +192,7 @@ def check_round_trip(self, df, engine, expected=None, **kwargs): with tm.ensure_clean() as path: df.to_parquet(path, engine, **kwargs) - result = read_parquet(path, engine) + result = read_parquet(path, engine, **kwargs) if expected is None: expected = df @@ -200,7 +200,7 @@ def check_round_trip(self, df, engine, expected=None, **kwargs): # repeat to_parquet(df, path, engine, **kwargs) - result = pd.read_parquet(path, engine) + result = pd.read_parquet(path, engine, **kwargs) if expected is None: expected = df @@ -282,6 +282,15 @@ def test_compression(self, engine, compression): df = pd.DataFrame({'A': [1, 2, 3]}) self.check_round_trip(df, engine, compression=compression) + def test_read_columns(self, engine): + # GH18154 + df = pd.DataFrame({'string': list('abc'), + 'int': list(range(1, 4))}) + + expected = pd.DataFrame({'string': list('abc')}) + self.check_round_trip(df, engine, expected=expected, + compression=None, columns=["string"]) + class TestParquetPyArrow(Base): From c03a3874447305328aff5fabea0c76e163e2dce9 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 8 Nov 2017 15:46:54 -0500 Subject: [PATCH 23/85] CI: don't show miniconda output on install / numpy 1.14 compat (#18157) CI: don't show miniconda output on install COMPAT: compat with numpy >= 1.14 on str repr TST: temp disable python-dateutil from master closes #18123 (cherry picked from commit 8dac633142daa8d5bcd0cf77ad89b97628d474eb) --- ci/install_travis.sh | 4 ++-- ci/requirements-3.6_NUMPY_DEV.build.sh | 5 ++++- pandas/compat/__init__.py | 13 ++++++++----- pandas/tests/frame/test_dtypes.py | 9 ++++++++- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index b85263daa1eac..4d8a371ba2994 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -34,9 +34,9 @@ fi # install miniconda if [ "${TRAVIS_OS_NAME}" == "osx" ]; then - time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 + time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -q -O miniconda.sh || exit 1 else - time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 + time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 fi time bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 diff --git a/ci/requirements-3.6_NUMPY_DEV.build.sh b/ci/requirements-3.6_NUMPY_DEV.build.sh index fd79142c5cebb..bc92d8fca6b17 100644 --- a/ci/requirements-3.6_NUMPY_DEV.build.sh +++ b/ci/requirements-3.6_NUMPY_DEV.build.sh @@ -12,7 +12,10 @@ PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy # install dateutil from master -pip install -U git+git://github.com/dateutil/dateutil.git + +# TODO(jreback), temp disable dateutil master has changed +# pip install -U git+git://github.com/dateutil/dateutil.git +pip install python-dateutil # cython via pip pip install cython diff --git 
a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 3853ac017044c..4a201d065c0b6 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -381,17 +381,20 @@ def raise_with_traceback(exc, traceback=Ellipsis): # http://stackoverflow.com/questions/4126348 # Thanks to @martineau at SO -from dateutil import parser as _date_parser import dateutil + +if PY2 and LooseVersion(dateutil.__version__) == '2.0': + # dateutil brokenness + raise Exception('dateutil 2.0 incompatible with Python 2.x, you must ' + 'install version 1.5 or 2.1+!') + +from dateutil import parser as _date_parser if LooseVersion(dateutil.__version__) < '2.0': + @functools.wraps(_date_parser.parse) def parse_date(timestr, *args, **kwargs): timestr = bytes(timestr) return _date_parser.parse(timestr, *args, **kwargs) -elif PY2 and LooseVersion(dateutil.__version__) == '2.0': - # dateutil brokenness - raise Exception('dateutil 2.0 incompatible with Python 2.x, you must ' - 'install version 1.5 or 2.1+!') else: parse_date = _date_parser.parse diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index abb528f0d2179..5adcd3b6855ce 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -10,6 +10,8 @@ from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, compat, concat, option_context) from pandas.compat import u +from pandas import _np_version_under1p14 + from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.tests.frame.common import TestData from pandas.util.testing import (assert_series_equal, @@ -531,7 +533,12 @@ def test_astype_str(self): assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(tt) - expected = DataFrame(['1.12345678901']) + if _np_version_under1p14: + # < 1.14 truncates + expected = DataFrame(['1.12345678901']) + else: + # >= 1.14 preserves the full repr + expected = DataFrame(['1.1234567890123457']) assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) From b0bda40ec502429239ba74c93345c42bf57d937d Mon Sep 17 00:00:00 2001 From: Ingolf Becker Date: Wed, 8 Nov 2017 20:25:46 +0000 Subject: [PATCH 24/85] Fix groupby().count() for datetimelike columns (#18167) (cherry picked from commit 4054632e151c33e8e31c201a1e4ffb5f857b0652) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/core/groupby.py | 3 ++- pandas/tests/groupby/test_counting.py | 21 +++++++++++++++++++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 0c5642ad52853..29c99805b0f99 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -60,6 +60,7 @@ Bug Fixes - Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`) - Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`) - Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values has floating issue (:issue:`18044`) +- Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5c07033f5a68f..add465e066422 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -4363,7 +4363,8 @@ def count(self): ids, _, ngroups = self.grouper.group_info mask = ids != -1 - val = ((mask & ~isna(blk.get_values())) for blk in data.blocks) + val = ((mask & 
~isna(np.atleast_2d(blk.get_values()))) + for blk in data.blocks) loc = (blk.mgr_locs for blk in data.blocks) counter = partial(count_level_2d, labels=ids, max_bin=ngroups, axis=1) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 485241d593d4f..787d99086873e 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -2,9 +2,11 @@ from __future__ import print_function import numpy as np +import pytest -from pandas import (DataFrame, Series, MultiIndex) -from pandas.util.testing import assert_series_equal +from pandas import (DataFrame, Series, MultiIndex, Timestamp, Timedelta, + Period) +from pandas.util.testing import (assert_series_equal, assert_frame_equal) from pandas.compat import (range, product as cart_product) @@ -195,3 +197,18 @@ def test_ngroup_respects_groupby_order(self): g.ngroup()) assert_series_equal(Series(df['group_index'].values), g.cumcount()) + + @pytest.mark.parametrize('datetimelike', [ + [Timestamp('2016-05-%02d 20:09:25+00:00' % i) for i in range(1, 4)], + [Timestamp('2016-05-%02d 20:09:25' % i) for i in range(1, 4)], + [Timedelta(x, unit="h") for x in range(1, 4)], + [Period(freq="2W", year=2017, month=x) for x in range(1, 4)]]) + def test_count_with_datetimelike(self, datetimelike): + # test for #13393, where DataframeGroupBy.count() fails + # when counting a datetimelike column. + + df = DataFrame({'x': ['a', 'a', 'b'], 'y': datetimelike}) + res = df.groupby('x').count() + expected = DataFrame({'y': [2, 1]}, index=['a', 'b']) + expected.index.name = "x" + assert_frame_equal(expected, res) From adfdb89c28779bf78c84990f6b3ec52dea7314b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20D=C3=B6pfert?= Date: Fri, 10 Nov 2017 14:54:55 +0100 Subject: [PATCH 25/85] Add requirement for a 1-dimensional ndarray in the `pd.qcut` docstring (#18211) (cherry picked from commit a6345c7cd8c41842d61902801e7bef9cacb0c2d5) --- pandas/core/reshape/tile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index fda339aa30461..2adf17a227a59 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -148,7 +148,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): Parameters ---------- - x : ndarray or Series + x : 1d ndarray or Series q : integer or array of quantiles Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] 
for quartiles From de4be1f02641b93b1ad9753f181ee65cf27c487e Mon Sep 17 00:00:00 2001 From: Robert Meyer Date: Fri, 10 Nov 2017 14:53:42 +0100 Subject: [PATCH 26/85] Fix for #18178 and #18187 by changing the concat of empty RangeIndex (#18191) (cherry picked from commit 6b3641b48439922ce4c1225a1d338dfe0b1f8967) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/core/dtypes/concat.py | 19 ++++++++++++++----- pandas/tests/reshape/test_concat.py | 18 ++++++++++++++++++ 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 29c99805b0f99..a54b2d91ce1df 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -61,6 +61,7 @@ Bug Fixes - Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`) - Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values has floating issue (:issue:`18044`) - Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`) +- Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 93993fd0a0cab..bca5847f3a6cc 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -569,9 +569,10 @@ def _concat_rangeindex_same_dtype(indexes): start = step = next = None - for obj in indexes: - if not len(obj): - continue + # Filter the empty indexes + non_empty_indexes = [obj for obj in indexes if len(obj)] + + for obj in non_empty_indexes: if start is None: # This is set by the first non-empty index @@ -595,8 +596,16 @@ def _concat_rangeindex_same_dtype(indexes): if step is not None: next = obj[-1] + step - if start is None: + if non_empty_indexes: + # Get the stop value from "next" or alternatively + # from the last non-empty index + stop = non_empty_indexes[-1]._stop if next is None else next + else: + # Here all "indexes" had 0 length, i.e. were empty. + # Simply take start, stop, and step from the last empty index. 
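+        # The concatenated result is empty either way; picking the last
+        # index just fixes concrete (if arbitrary) start/stop/step values.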
+ obj = indexes[-1] start = obj._start step = obj._step - stop = obj._stop if next is None else next + stop = obj._stop + return indexes[0].__class__(start, stop, step) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index bd2af6859dc46..11368e44943d8 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1983,3 +1983,21 @@ def test_concat_will_upcast(dt, pdt): pdt(np.array([5], dtype=dt, ndmin=dims))] x = pd.concat(dfs) assert x.values.dtype == 'float64' + + +def test_concat_empty_and_non_empty_frame_regression(): + # GH 18178 regression test + df1 = pd.DataFrame({'foo': [1]}) + df2 = pd.DataFrame({'foo': []}) + expected = pd.DataFrame({'foo': [1.0]}) + result = pd.concat([df1, df2]) + assert_frame_equal(result, expected) + + +def test_concat_empty_and_non_empty_series_regression(): + # GH 18187 regression test + s1 = pd.Series([1]) + s2 = pd.Series([]) + expected = s1 + result = pd.concat([s1, s2]) + tm.assert_series_equal(result, expected) From b22356ac3a1b38dd502589cb32356a043f8c4c76 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 10 Nov 2017 16:41:17 -0500 Subject: [PATCH 27/85] COMPAT: re-enable dateutil install from master (#18172) (cherry picked from commit bbadc81fc93383bbcd264e2de7a1c1128a6b2c8e) --- ci/requirements-3.6_NUMPY_DEV.build.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ci/requirements-3.6_NUMPY_DEV.build.sh b/ci/requirements-3.6_NUMPY_DEV.build.sh index bc92d8fca6b17..fd79142c5cebb 100644 --- a/ci/requirements-3.6_NUMPY_DEV.build.sh +++ b/ci/requirements-3.6_NUMPY_DEV.build.sh @@ -12,10 +12,7 @@ PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy # install dateutil from master - -# TODO(jreback), temp disable dateutil master has changed -# pip install -U git+git://github.com/dateutil/dateutil.git -pip install python-dateutil +pip install -U git+git://github.com/dateutil/dateutil.git # cython via pip pip install cython From 4df0d2a7f884bff263d8fbe56364bd09ff3d6165 Mon Sep 17 00:00:00 2001 From: Leif Walsh Date: Fri, 10 Nov 2017 18:27:45 -0500 Subject: [PATCH 28/85] ENH: improve 'incompatible tolerance' error message in merge_asof (#17260) (cherry picked from commit f68bf254a5a9503bbaef64e23bd53cd85527d20e) --- pandas/core/reshape/merge.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0234a5563326c..412c00dc95ec0 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -126,7 +126,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, try: if k in merged: merged[k] = key - except: + except KeyError: pass pieces.append(merged) @@ -1268,8 +1268,10 @@ def _get_merge_keys(self): else: lt = left_join_keys[-1] - msg = "incompatible tolerance, must be compat " \ - "with type {lt}".format(lt=type(lt)) + msg = ("incompatible tolerance {tolerance}, must be compat " + "with type {lkdtype}".format( + tolerance=type(self.tolerance), + lkdtype=lt.dtype)) if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): if not isinstance(self.tolerance, Timedelta): @@ -1505,12 +1507,12 @@ def _sort_labels(uniques, left, right): # tuplesafe uniques = Index(uniques).values - l = len(left) + llength = len(left) labels = np.concatenate([left, right]) _, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1) new_labels = 
_ensure_int64(new_labels) - new_left, new_right = new_labels[:l], new_labels[l:] + new_left, new_right = new_labels[:llength], new_labels[llength:] return new_left, new_right From 7da16e30f164dcce3a294631db66fe9e095ee93f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 11 Nov 2017 22:41:11 +0100 Subject: [PATCH 29/85] DOC: clarfiy sum of empty Series case + matplotlib registering in 0.21.0 whatsnew (#18204) * DOC: clarfiy sum of empty Series case in 0.21.0 whatsnew * DOC: emphasize matplotlib changes (cherry picked from commit 85e6864177af5d90c633943ee0d86b46ea5150d9) --- doc/source/release.rst | 2 +- doc/source/whatsnew/v0.21.0.txt | 44 ++++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 6c3e7f847b485..a3289b1144863 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -52,7 +52,7 @@ Highlights include: - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` function and :meth:`DataFrame.to_parquet` method, see :ref:`here `. - New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying categoricals independent of the data, see :ref:`here `. -- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, see :ref:`here `. +- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, and ``sum`` and ``prod`` on empty Series now return NaN instead of 0, see :ref:`here `. - Compatibility fixes for pypy, see :ref:`here `. - Additions to the ``drop``, ``reindex`` and ``rename`` API to make them more consistent, see :ref:`here `. - Addition of the new methods ``DataFrame.infer_objects`` (see :ref:`here `) and ``GroupBy.pipe`` (see :ref:`here `). diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 4c460eeb85b82..89e2d3006696c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -12,7 +12,7 @@ Highlights include: - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` function and :meth:`DataFrame.to_parquet` method, see :ref:`here `. - New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying categoricals independent of the data, see :ref:`here `. -- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, see :ref:`here `. +- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, and ``sum`` and ``prod`` on empty Series now return NaN instead of 0, see :ref:`here `. - Compatibility fixes for pypy, see :ref:`here `. - Additions to the ``drop``, ``reindex`` and ``rename`` API to make them more consistent, see :ref:`here `. - Addition of the new methods ``DataFrame.infer_objects`` (see :ref:`here `) and ``GroupBy.pipe`` (see :ref:`here `). @@ -369,11 +369,11 @@ Additionally, support has been dropped for Python 3.4 (:issue:`15251`). .. 
_whatsnew_0210.api_breaking.bottleneck:
 
-Sum/Prod of all-NaN Series/DataFrames is now consistently NaN
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Sum/Prod of all-NaN or empty Series/DataFrames is now consistently NaN
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames no longer depends on
-whether `bottleneck `__ is installed. (:issue:`9422`, :issue:`15507`).
+whether `bottleneck `__ is installed, and return value of ``sum`` and ``prod`` on an empty Series has changed (:issue:`9422`, :issue:`15507`).
 
 Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, will result in ``NaN``. See the :ref:`docs `.
 
@@ -381,35 +381,35 @@ Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of
 
     s = Series([np.nan])
 
-Previously NO ``bottleneck``
+Previously WITHOUT ``bottleneck`` installed:
 
 .. code-block:: ipython
 
     In [2]: s.sum()
    Out[2]: np.nan
 
-Previously WITH ``bottleneck``
+Previously WITH ``bottleneck``:
 
 .. code-block:: ipython
 
    In [2]: s.sum()
    Out[2]: 0.0
 
-New Behavior, without regard to the bottleneck installation.
+New Behavior, without regard to the bottleneck installation:
 
 .. ipython:: python
 
    s.sum()
 
-Note that this also changes the sum of an empty ``Series``
-
-Previously regardless of ``bottlenck``
+Note that this also changes the sum of an empty ``Series``. Previously this always returned 0 regardless of a ``bottleneck`` installation:
 
 .. code-block:: ipython
 
    In [1]: pd.Series([]).sum()
    Out[1]: 0
 
+but for consistency with the all-NaN case, this was changed to return NaN as well:
+
 .. ipython:: python
 
    pd.Series([]).sum()
 
@@ -877,6 +877,28 @@ New Behavior:
 
    pd.interval_range(start=0, end=4)
 
+.. _whatsnew_0210.api.mpl_converters:
+
+No Automatic Matplotlib Converters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Pandas no longer registers our ``date``, ``time``, ``datetime``,
+``datetime64``, and ``Period`` converters with matplotlib when pandas is
+imported. Matplotlib plot methods (``plt.plot``, ``ax.plot``, ...) will not
+nicely format the x-axis for ``DatetimeIndex`` or ``PeriodIndex`` values. You
+must explicitly register these converters:
+
+.. ipython:: python
+
+   from pandas.tseries import converter
+   converter.register()
+
+   fig, ax = plt.subplots()
+   plt.plot(pd.date_range('2017', periods=6), range(6))
+
+Pandas built-in ``Series.plot`` and ``DataFrame.plot`` *will* register these
+converters on first-use (:issue:`17710`).
+
 .. _whatsnew_0210.api:
 
 Other API Changes
 ^^^^^^^^^^^^^^^^^
@@ -900,8 +922,6 @@ Other API Changes
 - Renamed non-functional ``index`` to ``index_col`` in :func:`read_stata` to improve API consistency (:issue:`16342`)
 - Bug in :func:`DataFrame.drop` caused boolean labels ``False`` and ``True`` to be treated as labels 0 and 1 respectively when dropping indices from a numeric index. This will now raise a ValueError (:issue:`16877`)
 - Restricted DateOffset keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`).
-- Pandas no longer registers matplotlib converters on import. The converters
-  will be registered and used when the first plot is draw (:issue:`17710`)
 
 .. 
_whatsnew_0210.deprecations: From 8ea84a8b35c70c72f5c336d938c87e1a44cea301 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 11 Nov 2017 18:38:27 -0500 Subject: [PATCH 30/85] TST: clean up some tests issues & style (#18232) * STYLE: clean up flake8 exceptions * TST: skip if no bs4 for some html tests * TST: fix local timezone checking with .timestamp() (cherry picked from commit 96a527434a6138b291c1b4a782bd2793cff51f74) --- pandas/core/dtypes/cast.py | 34 +++++++++++++------------- pandas/core/indexes/base.py | 10 ++++---- pandas/core/indexes/datetimes.py | 8 +++--- pandas/core/indexes/timedeltas.py | 2 +- pandas/core/series.py | 6 ++--- pandas/tests/scalar/test_timestamp.py | 11 ++++++--- pandas/tests/tseries/test_timezones.py | 16 ++++++++---- 7 files changed, 49 insertions(+), 38 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f3b11e52cdd7a..eae283e9bc00d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -136,7 +136,7 @@ def trans(x): # noqa try: if np.allclose(new_result, result, rtol=0): return new_result - except: + except Exception: # comparison of an object dtype with a number type could # hit here @@ -151,14 +151,14 @@ def trans(x): # noqa elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i', 'f']: try: result = result.astype(dtype) - except: + except Exception: if dtype.tz: # convert to datetime and change timezone from pandas import to_datetime result = to_datetime(result).tz_localize('utc') result = result.tz_convert(dtype.tz) - except: + except Exception: pass return result @@ -210,7 +210,7 @@ def changeit(): new_result[mask] = om_at result[:] = new_result return result, False - except: + except Exception: pass # we are forced to change the dtype of the result as the input @@ -243,7 +243,7 @@ def changeit(): try: np.place(result, mask, other) - except: + except Exception: return changeit() return result, False @@ -274,14 +274,14 @@ def maybe_promote(dtype, fill_value=np.nan): if issubclass(dtype.type, np.datetime64): try: fill_value = tslib.Timestamp(fill_value).value - except: + except Exception: # the proper thing to do here would probably be to upcast # to object (but numpy 1.6.1 doesn't do this properly) fill_value = iNaT elif issubclass(dtype.type, np.timedelta64): try: fill_value = lib.Timedelta(fill_value).value - except: + except Exception: # as for datetimes, cannot upcast to object fill_value = iNaT else: @@ -592,12 +592,12 @@ def maybe_convert_scalar(values): def coerce_indexer_dtype(indexer, categories): """ coerce the indexer input array to the smallest dtype possible """ - l = len(categories) - if l < _int8_max: + length = len(categories) + if length < _int8_max: return _ensure_int8(indexer) - elif l < _int16_max: + elif length < _int16_max: return _ensure_int16(indexer) - elif l < _int32_max: + elif length < _int32_max: return _ensure_int32(indexer) return _ensure_int64(indexer) @@ -629,7 +629,7 @@ def conv(r, dtype): r = float(r) elif dtype.kind == 'i': r = int(r) - except: + except Exception: pass return r @@ -756,7 +756,7 @@ def maybe_convert_objects(values, convert_dates=True, convert_numeric=True, if not isna(new_values).all(): values = new_values - except: + except Exception: pass else: # soft-conversion @@ -817,7 +817,7 @@ def soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, # If all NaNs, then do not-alter values = converted if not isna(converted).all() else values values = values.copy() if copy else values - except: + except Exception: pass return 
values @@ -888,10 +888,10 @@ def try_datetime(v): try: from pandas import to_datetime return to_datetime(v) - except: + except Exception: pass - except: + except Exception: pass return v.reshape(shape) @@ -903,7 +903,7 @@ def try_timedelta(v): from pandas import to_timedelta try: return to_timedelta(v)._values.reshape(shape) - except: + except Exception: return v.reshape(shape) inferred_type = lib.infer_datetimelike_array(_ensure_object(v)) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a995fc10a6674..9e14a3838733e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2032,7 +2032,7 @@ def equals(self, other): try: return array_equivalent(_values_from_object(self), _values_from_object(other)) - except: + except Exception: return False def identical(self, other): @@ -2315,7 +2315,7 @@ def intersection(self, other): try: indexer = Index(other._values).get_indexer(self._values) indexer = indexer.take((indexer != -1).nonzero()[0]) - except: + except Exception: # duplicates indexer = algos.unique1d( Index(other._values).get_indexer_non_unique(self._values)[0]) @@ -3024,13 +3024,13 @@ def _reindex_non_unique(self, target): new_indexer = None if len(missing): - l = np.arange(len(indexer)) + length = np.arange(len(indexer)) missing = _ensure_platform_int(missing) missing_labels = target.take(missing) - missing_indexer = _ensure_int64(l[~check]) + missing_indexer = _ensure_int64(length[~check]) cur_labels = self.take(indexer[check]).values - cur_indexer = _ensure_int64(l[check]) + cur_indexer = _ensure_int64(length[check]) new_labels = np.empty(tuple([len(indexer)]), dtype=object) new_labels[cur_indexer] = cur_labels diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 70fda425e1ae3..50085889ad88f 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -448,7 +448,7 @@ def _generate(cls, start, end, periods, name, offset, try: inferred_tz = timezones.infer_tzinfo(start, end) - except: + except Exception: raise TypeError('Start and end cannot both be tz-aware with ' 'different timezones') @@ -1176,12 +1176,12 @@ def __iter__(self): # convert in chunks of 10k for efficiency data = self.asi8 - l = len(self) + length = len(self) chunksize = 10000 - chunks = int(l / chunksize) + 1 + chunks = int(length / chunksize) + 1 for i in range(chunks): start_i = i * chunksize - end_i = min((i + 1) * chunksize, l) + end_i = min((i + 1) * chunksize, length) converted = libts.ints_to_pydatetime(data[start_i:end_i], tz=self.tz, freq=self.freq, box=True) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index ca2377a03ca6b..445adb6bd3b18 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -841,7 +841,7 @@ def insert(self, loc, item): if _is_convertible_to_td(item): try: item = Timedelta(item) - except: + except Exception: pass freq = None diff --git a/pandas/core/series.py b/pandas/core/series.py index 1c92c4b8850ee..c9a72bb688270 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -597,7 +597,7 @@ def _ixs(self, i, axis=0): return values[i] except IndexError: raise - except: + except Exception: if isinstance(i, slice): indexer = self.index._convert_slice_indexer(i, kind='iloc') return self._get_values(indexer) @@ -675,7 +675,7 @@ def _get_with(self, key): if isinstance(key, tuple): try: return self._get_values_tuple(key) - except: + except Exception: if len(key) == 1: key = key[0] if isinstance(key, slice): @@ 
-818,7 +818,7 @@ def _set_with(self, key, value): if not isinstance(key, (list, Series, np.ndarray, Series)): try: key = list(key) - except: + except Exception: key = [key] if isinstance(key, Index): diff --git a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py index c160471bd0981..4053257fbd2c8 100644 --- a/pandas/tests/scalar/test_timestamp.py +++ b/pandas/tests/scalar/test_timestamp.py @@ -1088,13 +1088,18 @@ def test_timestamp(self): tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central') utsc = tsc.tz_convert('UTC') + # utsc is a different representation of the same time assert tsc.timestamp() == utsc.timestamp() if PY3: - # should agree with datetime.timestamp method - dt = ts.to_pydatetime() - assert dt.timestamp() == ts.timestamp() + + # datetime.timestamp() converts in the local timezone + with tm.set_timezone('UTC'): + + # should agree with datetime.timestamp method + dt = ts.to_pydatetime() + assert dt.timestamp() == ts.timestamp() class TestTimestampNsOperations(object): diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index ddcf1bb7d8b7b..724628649796d 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -13,7 +13,7 @@ import pandas.util.testing as tm import pandas.tseries.offsets as offsets -from pandas.compat import lrange, zip +from pandas.compat import lrange, zip, PY3 from pandas.core.indexes.datetimes import bdate_range, date_range from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas._libs import tslib @@ -1278,16 +1278,22 @@ def test_replace_tzinfo(self): result_dt = dt.replace(tzinfo=tzinfo) result_pd = Timestamp(dt).replace(tzinfo=tzinfo) - if hasattr(result_dt, 'timestamp'): # New method in Py 3.3 - assert result_dt.timestamp() == result_pd.timestamp() + if PY3: + # datetime.timestamp() converts in the local timezone + with tm.set_timezone('UTC'): + assert result_dt.timestamp() == result_pd.timestamp() + assert result_dt == result_pd assert result_dt == result_pd.to_pydatetime() result_dt = dt.replace(tzinfo=tzinfo).replace(tzinfo=None) result_pd = Timestamp(dt).replace(tzinfo=tzinfo).replace(tzinfo=None) - if hasattr(result_dt, 'timestamp'): # New method in Py 3.3 - assert result_dt.timestamp() == result_pd.timestamp() + if PY3: + # datetime.timestamp() converts in the local timezone + with tm.set_timezone('UTC'): + assert result_dt.timestamp() == result_pd.timestamp() + assert result_dt == result_pd assert result_dt == result_pd.to_pydatetime() From 0216706b004a63b7cc74c7af72563911b0851ac7 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Nov 2017 11:15:03 -0500 Subject: [PATCH 31/85] CI: slightly more robust xfvb starting (#18239) (cherry picked from commit fbe15d0f60da77277139787838cd2441d3153146) --- .travis.yml | 4 ++-- ci/{before_install_travis.sh => before_script_travis.sh} | 1 + pandas/tests/io/test_clipboard.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) rename ci/{before_install_travis.sh => before_script_travis.sh} (93%) diff --git a/.travis.yml b/.travis.yml index fe1a2950dbf08..42b4ef0396fc8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -102,8 +102,6 @@ before_install: - uname -a - git --version - git tag - - ci/before_install_travis.sh - - export DISPLAY=":99.0" install: - echo "install start" @@ -114,6 +112,8 @@ install: before_script: - ci/install_db_travis.sh + - export DISPLAY=":99.0" + - ci/before_script_travis.sh script: - echo "script start" diff --git a/ci/before_install_travis.sh 
b/ci/before_script_travis.sh similarity index 93% rename from ci/before_install_travis.sh rename to ci/before_script_travis.sh index 2d0b4da6120dc..0b3939b1906a2 100755 --- a/ci/before_install_travis.sh +++ b/ci/before_script_travis.sh @@ -4,6 +4,7 @@ echo "inside $0" if [ "${TRAVIS_OS_NAME}" == "linux" ]; then sh -e /etc/init.d/xvfb start + sleep 3 fi # Never fail because bad things happened here. diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 940a331a9de84..b5d1435c29cb7 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -18,7 +18,7 @@ try: DataFrame({'A': [1, 2]}).to_clipboard() _DEPS_INSTALLED = 1 -except PyperclipException: +except (PyperclipException, RuntimeError): _DEPS_INSTALLED = 0 From 631333f09659f0921cbca532303f2484611c6d05 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 12 Nov 2017 11:14:06 -0500 Subject: [PATCH 32/85] BUG: MultiIndex not raising AttributeError with a million records (#18165) (#18229) (cherry picked from commit eb39b445cc5b41431078b1a03600f13910da157f) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/core/indexes/multi.py | 22 +++++++++++----------- pandas/tests/indexes/test_multi.py | 10 ++++++++++ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index a54b2d91ce1df..da93ceec2a3d9 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -73,7 +73,7 @@ Conversion Indexing ^^^^^^^^ -- +- Bug where a ``MultiIndex`` with more than a million records was not raising ``AttributeError`` when trying to access a missing attribute (:issue:`18165`) - - diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4cc59f5297058..f603a0eef36a5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -446,6 +446,17 @@ def _shallow_copy_with_infer(self, values=None, **kwargs): **kwargs) return self._shallow_copy(values, **kwargs) + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) + def __contains__(self, key): + hash(key) + try: + self.get_loc(key) + return True + except (LookupError, TypeError): + return False + + contains = __contains__ + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, **kwargs): if values is not None: @@ -1370,17 +1381,6 @@ def nlevels(self): def levshape(self): return tuple(len(x) for x in self.levels) - @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) - def __contains__(self, key): - hash(key) - try: - self.get_loc(key) - return True - except LookupError: - return False - - contains = __contains__ - def __reduce__(self): """Necessary for making this object picklable""" d = dict(levels=[lev for lev in self.levels], diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 18bfc3d0efbee..c9c4029786c64 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2980,3 +2980,13 @@ def test_nan_stays_float(self): assert pd.isna(df0.index.get_level_values(1)).all() # the following failed in 0.14.1 assert pd.isna(dfm.index.get_level_values(1)[:-1]).all() + + def test_million_record_attribute_error(self): + # GH 18165 + r = list(range(1000000)) + df = pd.DataFrame({'a': r, 'b': r}, + index=pd.MultiIndex.from_tuples([(x, x) for x in r])) + + with tm.assert_raises_regex(AttributeError, + "'Series' object has no attribute 'foo'"): + df['a'].foo() From b9ed3a2c0a9b1ba428cd82ae96fb0279176decae Mon 
Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Nov 2017 11:57:52 -0500 Subject: [PATCH 33/85] TST: xfail dateutil > 2.6.1 tests (#18240) xref #18141 (cherry picked from commit 40fd6b4045b7241d5aedc20e2427c68bd4797312) --- pandas/tests/indexes/datetimes/test_tools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 330ec9f357655..c7944c078d8c4 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -960,6 +960,7 @@ def test_guess_datetime_format_nopadding(self): for dt_string, dt_format in dt_string_to_format: assert tools._guess_datetime_format(dt_string) == dt_format + @pytest.mark.xfail(reason="GH18141 - dateutil > 2.6.1 broken") def test_guess_datetime_format_for_array(self): tm._skip_if_not_us_locale() expected_format = '%Y-%m-%d %H:%M:%S.%f' From 6427bed4d029584b0fc8a89b12f26f00b434e70c Mon Sep 17 00:00:00 2001 From: ghasemnaddaf Date: Mon, 13 Nov 2017 13:03:56 -0800 Subject: [PATCH 34/85] DOC: add docstring for MultiIndex.fillna (#18018) (#18269) (cherry picked from commit 7495e9ade5a0ca0c24611f94b6d6c294dbd5446e) --- pandas/core/indexes/multi.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f603a0eef36a5..f4acb6862addb 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -820,9 +820,10 @@ def duplicated(self, keep='first'): return duplicated_int64(ids, keep) - @Appender(ibase._index_shared_docs['fillna']) def fillna(self, value=None, downcast=None): - # isna is not implemented for MultiIndex + """ + fillna is not implemented for MultiIndex + """ raise NotImplementedError('isna is not defined for MultiIndex') @Appender(_index_shared_docs['dropna']) From 6ba8cd1e662864620a33b3f7e47516c54734cb75 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Tue, 14 Nov 2017 09:43:08 +0000 Subject: [PATCH 35/85] DOC: updated (Series/DataFrame).combine_first doc strings (#18266) (cherry picked from commit 7857c684c38c478190ff5b045f396d05a69c83a1) --- pandas/core/frame.py | 20 ++++++++++++++------ pandas/core/series.py | 16 +++++++++++++++- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2c417dda6fc37..6a013bf203431 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4131,16 +4131,24 @@ def combine_first(self, other): ---------- other : DataFrame + Returns + ------- + combined : DataFrame + Examples -------- - a's values prioritized, use values from b to fill holes: - - >>> a.combine_first(b) + df1's values prioritized, use values from df2 to fill holes: + >>> df1 = pd.DataFrame([[1, np.nan]]) + >>> df2 = pd.DataFrame([[3, 4]]) + >>> df1.combine_first(df2) + 0 1 + 0 1 4.0 - Returns - ------- - combined : DataFrame + See Also + -------- + DataFrame.combine : Perform series-wise operation on two DataFrames + using a given function """ import pandas.core.computation.expressions as expressions diff --git a/pandas/core/series.py b/pandas/core/series.py index c9a72bb688270..acebec7733c0b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1764,7 +1764,21 @@ def combine_first(self, other): Returns ------- - y : Series + combined : Series + + Examples + -------- + >>> s1 = pd.Series([1, np.nan]) + >>> s2 = pd.Series([3, 4]) + >>> s1.combine_first(s2) + 0 1.0 + 1 4.0 + dtype: float64 + + See Also + -------- + Series.combine : Perform elementwise 
operation on two Series + using a given function """ new_index = self.index.union(other.index) this = self.reindex(new_index, copy=False) From de7b14211b07cc7a91a8f875dfee45f4028c665c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 4 Nov 2017 17:14:10 -0400 Subject: [PATCH 36/85] Revert "CI: temp disable scipy on windows 3.6 build (#18078)" (#18105) * Revert "CI: temp disable scipy on windows 3.6 build (#18078)" This reverts commit cd6dc87466e119aabb76d8439df8289d082ea948. * use numpy=1.13 (cherry picked from commit 2d6f83654ae62315151466507ae0a3d1e8212104) --- appveyor.yml | 2 +- ci/requirements-3.6_WIN.run | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index a1f8886f6d068..44af73b498aa8 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -22,7 +22,7 @@ environment: PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" CONDA_PY: "36" - CONDA_NPY: "112" + CONDA_NPY: "113" - CONDA_ROOT: "C:\\Miniconda3_64" PYTHON_VERSION: "2.7" diff --git a/ci/requirements-3.6_WIN.run b/ci/requirements-3.6_WIN.run index af7a90b126f22..db2d429a2a4ff 100644 --- a/ci/requirements-3.6_WIN.run +++ b/ci/requirements-3.6_WIN.run @@ -1,6 +1,6 @@ python-dateutil pytz -numpy=1.12* +numpy=1.13* bottleneck openpyxl xlsxwriter From 754ec083f685c1d7d41bb6487ed657baecd00ee1 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Tue, 14 Nov 2017 11:32:44 +0000 Subject: [PATCH 37/85] updated (Series/DataFrame).combine doc strings (#18268) (cherry picked from commit 1e30886c09b63ea33236d02f9fb3ae854387ae67) --- pandas/core/frame.py | 17 ++++++++++++++++- pandas/core/series.py | 15 +++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6a013bf203431..220f639649574 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4035,6 +4035,8 @@ def combine(self, other, func, fill_value=None, overwrite=True): ---------- other : DataFrame func : function + Function that takes two series as inputs and return a Series or a + scalar fill_value : scalar value overwrite : boolean, default True If True then overwrite values for common keys in the calling frame @@ -4042,8 +4044,21 @@ def combine(self, other, func, fill_value=None, overwrite=True): Returns ------- result : DataFrame - """ + Examples + -------- + >>> df1 = DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine(df2, lambda s1, s2: s1 if s1.sum() < s2.sum() else s2) + A B + 0 0 3 + 1 0 3 + + See Also + -------- + DataFrame.combine_first : Combine two DataFrame objects and default to + non-null values in frame calling the method + """ other_idxlen = len(other.index) # save for compare this, other = self.align(other, copy=False) diff --git a/pandas/core/series.py b/pandas/core/series.py index acebec7733c0b..d2d7de0310b31 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1731,11 +1731,26 @@ def combine(self, other, func, fill_value=np.nan): ---------- other : Series or scalar value func : function + Function that takes two scalars as inputs and return a scalar fill_value : scalar value Returns ------- result : Series + + Examples + -------- + >>> s1 = Series([1, 2]) + >>> s2 = Series([0, 3]) + >>> s1.combine(s2, lambda x1, x2: x1 if x1 < x2 else x2) + 0 0 + 1 2 + dtype: int64 + + See Also + -------- + Series.combine_first : Combine Series values, choosing the calling + Series's values first """ if isinstance(other, Series): new_index = self.index.union(other.index) From 
ea94667e7fbf58349a12e7b14aa0541d95f94215 Mon Sep 17 00:00:00 2001 From: Ingolf Becker Date: Tue, 14 Nov 2017 13:04:34 +0000 Subject: [PATCH 38/85] Fix #17965 to allow full comparison of datetimelike objects (#18188) (cherry picked from commit 77f10f0805651493b95a462085c9a3cd98e9197a) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/_libs/index.pyx | 4 ++-- .../indexes/datetimes/test_partial_slicing.py | 21 ++++++++++++++++++- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index da93ceec2a3d9..9c05f91588e23 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -73,6 +73,7 @@ Conversion Indexing ^^^^^^^^ +- Bug in a boolean comparison of a ``datetime.datetime`` and a ``datetime64[ns]`` dtype Series (:issue:`17965`) - Bug where a ``MultiIndex`` with more than a million records was not raising ``AttributeError`` when trying to access a missing attribute (:issue:`18165`) - - diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c96251a0293d6..65e99f5f46fc2 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -19,7 +19,7 @@ from hashtable cimport HashTable from pandas._libs import algos, hashtable as _hash from pandas._libs.tslib import Timestamp, Timedelta -from datetime import datetime, timedelta +from datetime import datetime, timedelta, date from cpython cimport PyTuple_Check, PyList_Check @@ -500,7 +500,7 @@ cpdef convert_scalar(ndarray arr, object value): if arr.descr.type_num == NPY_DATETIME: if isinstance(value, np.ndarray): pass - elif isinstance(value, datetime): + elif isinstance(value, (datetime, np.datetime64, date)): return Timestamp(value).value elif value is None or value != value: return iNaT diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index e7d03aa193cbd..04c180350fb72 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -2,9 +2,10 @@ import pytest -from datetime import datetime +from datetime import datetime, date import numpy as np import pandas as pd +import operator as op from pandas import (DatetimeIndex, Series, DataFrame, date_range, Index, Timedelta, Timestamp) @@ -268,3 +269,21 @@ def test_loc_datetime_length_one(self): result = df.loc['2016-10-01T00:00:00':] tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize('datetimelike', [ + Timestamp('20130101'), datetime(2013, 1, 1), + date(2013, 1, 1), np.datetime64('2013-01-01T00:00', 'ns')]) + @pytest.mark.parametrize('op,expected', [ + (op.lt, [True, False, False, False]), + (op.le, [True, True, False, False]), + (op.eq, [False, True, False, False]), + (op.gt, [False, False, False, True])]) + def test_selection_by_datetimelike(self, datetimelike, op, expected): + # GH issue #17965, test for ability to compare datetime64[ns] columns + # to datetimelike + df = DataFrame({'A': [pd.Timestamp('20120101'), + pd.Timestamp('20130101'), + np.nan, pd.Timestamp('20130103')]}) + result = op(df.A, datetimelike) + expected = Series(expected, name='A') + tm.assert_series_equal(result, expected) From b6f536fae03fedeafedbcce76be2ffb93aafdce2 Mon Sep 17 00:00:00 2001 From: Cornelius Riemenschneider Date: Tue, 14 Nov 2017 16:40:07 +0100 Subject: [PATCH 39/85] ENH: Pass kwargs from read_parquet() to the underlying engines. (#18216) This allows e.g. to specify filters for predicate pushdown to fastparquet. 
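As a hedged illustration of the pass-through (a sketch only, mirroring the filters syntax exercised by the new test in this patch; assumes fastparquet is installed):

    import pandas as pd

    df = pd.DataFrame({'a': [0, 1, 2]})
    # one row group per row, so predicate pushdown can skip groups
    df.to_parquet('example.parquet', engine='fastparquet',
                  compression=None, row_group_offsets=1)
    # `filters` is now forwarded to the fastparquet engine on read
    result = pd.read_parquet('example.parquet', engine='fastparquet',
                             filters=[('a', '==', 0)])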
(cherry picked from commit ef4e30b85fa3058f0127a969ae7576f0ef3f7454) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/io/parquet.py | 13 ++++---- pandas/tests/io/test_parquet.py | 54 +++++++++++++++++++++------------ 3 files changed, 43 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 9c05f91588e23..7ecb672c28ed1 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -86,6 +86,7 @@ I/O - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`) - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) - :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`) +- :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`) Plotting diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index ef95e32cc241e..4a13d2c9db944 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -76,9 +76,10 @@ def write(self, df, path, compression='snappy', table, path, compression=compression, coerce_timestamps=coerce_timestamps, **kwargs) - def read(self, path, columns=None): + def read(self, path, columns=None, **kwargs): path, _, _ = get_filepath_or_buffer(path) - return self.api.parquet.read_table(path, columns=columns).to_pandas() + return self.api.parquet.read_table(path, columns=columns, + **kwargs).to_pandas() class FastParquetImpl(object): @@ -115,9 +116,9 @@ def write(self, df, path, compression='snappy', **kwargs): self.api.write(path, df, compression=compression, **kwargs) - def read(self, path, columns=None): + def read(self, path, columns=None, **kwargs): path, _, _ = get_filepath_or_buffer(path) - return self.api.ParquetFile(path).to_pandas(columns=columns) + return self.api.ParquetFile(path).to_pandas(columns=columns, **kwargs) def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): @@ -175,7 +176,7 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): if df.columns.inferred_type not in valid_types: raise ValueError("parquet must have string column names") - return impl.write(df, path, compression=compression) + return impl.write(df, path, compression=compression, **kwargs) def read_parquet(path, engine='auto', columns=None, **kwargs): @@ -205,4 +206,4 @@ def read_parquet(path, engine='auto', columns=None, **kwargs): """ impl = get_engine(engine) - return impl.read(path, columns=columns) + return impl.read(path, columns=columns, **kwargs) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 9a4edf38e2ef4..e7bcff22371b7 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -105,7 +105,7 @@ def test_options_py(df_compat, pa): with pd.option_context('io.parquet.engine', 'pyarrow'): df.to_parquet(path) - result = read_parquet(path, compression=None) + result = read_parquet(path) tm.assert_frame_equal(result, df) @@ -118,7 +118,7 @@ def test_options_fp(df_compat, fp): with pd.option_context('io.parquet.engine', 'fastparquet'): df.to_parquet(path, compression=None) - result = read_parquet(path, compression=None) + result = read_parquet(path) tm.assert_frame_equal(result, df) @@ -130,7 +130,7 @@ def test_options_auto(df_compat, fp, pa): with pd.option_context('io.parquet.engine', 'auto'): df.to_parquet(path) - result = read_parquet(path, compression=None) + 
result = read_parquet(path) tm.assert_frame_equal(result, df) @@ -162,7 +162,7 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp): with tm.ensure_clean() as path: df.to_parquet(path, engine=pa, compression=None) - result = read_parquet(path, engine=fp, compression=None) + result = read_parquet(path, engine=fp) tm.assert_frame_equal(result, df) @@ -174,7 +174,7 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp): with tm.ensure_clean() as path: df.to_parquet(path, engine=fp, compression=None) - result = read_parquet(path, engine=pa, compression=None) + result = read_parquet(path, engine=pa) tm.assert_frame_equal(result, df) @@ -188,19 +188,23 @@ def check_error_on_write(self, df, engine, exc): with tm.ensure_clean() as path: to_parquet(df, path, engine, compression=None) - def check_round_trip(self, df, engine, expected=None, **kwargs): - + def check_round_trip(self, df, engine, expected=None, + write_kwargs=None, read_kwargs=None): + if write_kwargs is None: + write_kwargs = {} + if read_kwargs is None: + read_kwargs = {} with tm.ensure_clean() as path: - df.to_parquet(path, engine, **kwargs) - result = read_parquet(path, engine, **kwargs) + df.to_parquet(path, engine, **write_kwargs) + result = read_parquet(path, engine, **read_kwargs) if expected is None: expected = df tm.assert_frame_equal(result, expected) # repeat - to_parquet(df, path, engine, **kwargs) - result = pd.read_parquet(path, engine, **kwargs) + to_parquet(df, path, engine, **write_kwargs) + result = pd.read_parquet(path, engine, **read_kwargs) if expected is None: expected = df @@ -222,7 +226,7 @@ def test_columns_dtypes(self, engine): # unicode df.columns = [u'foo', u'bar'] - self.check_round_trip(df, engine, compression=None) + self.check_round_trip(df, engine, write_kwargs={'compression': None}) def test_columns_dtypes_invalid(self, engine): @@ -246,7 +250,7 @@ def test_columns_dtypes_invalid(self, engine): def test_write_with_index(self, engine): df = pd.DataFrame({'A': [1, 2, 3]}) - self.check_round_trip(df, engine, compression=None) + self.check_round_trip(df, engine, write_kwargs={'compression': None}) # non-default index for index in [[2, 3, 4], @@ -280,7 +284,8 @@ def test_compression(self, engine, compression): pytest.importorskip('brotli') df = pd.DataFrame({'A': [1, 2, 3]}) - self.check_round_trip(df, engine, compression=compression) + self.check_round_trip(df, engine, + write_kwargs={'compression': compression}) def test_read_columns(self, engine): # GH18154 @@ -289,7 +294,8 @@ def test_read_columns(self, engine): expected = pd.DataFrame({'string': list('abc')}) self.check_round_trip(df, engine, expected=expected, - compression=None, columns=["string"]) + write_kwargs={'compression': None}, + read_kwargs={'columns': ['string']}) class TestParquetPyArrow(Base): @@ -377,7 +383,7 @@ def test_basic(self, fp): 'timedelta': pd.timedelta_range('1 day', periods=3), }) - self.check_round_trip(df, fp, compression=None) + self.check_round_trip(df, fp, write_kwargs={'compression': None}) @pytest.mark.skip(reason="not supported") def test_duplicate_columns(self, fp): @@ -390,7 +396,8 @@ def test_duplicate_columns(self, fp): def test_bool_with_none(self, fp): df = pd.DataFrame({'a': [True, None, False]}) expected = pd.DataFrame({'a': [1.0, np.nan, 0.0]}, dtype='float16') - self.check_round_trip(df, fp, expected=expected, compression=None) + self.check_round_trip(df, fp, expected=expected, + write_kwargs={'compression': None}) def test_unsupported(self, fp): @@ -406,7 +413,7 @@ def test_categorical(self, fp): if 
LooseVersion(fastparquet.__version__) < LooseVersion("0.1.3"): pytest.skip("CategoricalDtype not supported for older fp") df = pd.DataFrame({'a': pd.Categorical(list('abc'))}) - self.check_round_trip(df, fp, compression=None) + self.check_round_trip(df, fp, write_kwargs={'compression': None}) def test_datetime_tz(self, fp): # doesn't preserve tz @@ -416,4 +423,13 @@ def test_datetime_tz(self, fp): # warns on the coercion with catch_warnings(record=True): self.check_round_trip(df, fp, df.astype('datetime64[ns]'), - compression=None) + write_kwargs={'compression': None}) + + def test_filter_row_groups(self, fp): + d = {'a': list(range(0, 3))} + df = pd.DataFrame(d) + with tm.ensure_clean() as path: + df.to_parquet(path, fp, compression=None, + row_group_offsets=1) + result = read_parquet(path, fp, filters=[('a', '==', 0)]) + assert len(result) == 1 From 89b3c9185438b09e6348f9cb1d8a3bca36d02fa9 Mon Sep 17 00:00:00 2001 From: ghasemnaddaf Date: Tue, 14 Nov 2017 08:03:18 -0800 Subject: [PATCH 40/85] DOC: clarify idxmax behaviour issue #18206 (#18209) (cherry picked from commit 63e8527d32aaf6afe1cd4b2a7b3bfadb088c9a72) --- pandas/core/frame.py | 14 ++++++++++++-- pandas/core/series.py | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 220f639649574..cb4b5c9f4b082 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5813,7 +5813,12 @@ def idxmin(self, axis=0, skipna=True): 0 or 'index' for row-wise, 1 or 'columns' for column-wise skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result - will be NA + will be NA. + + Raises + ------ + ValueError + * If the row/column is empty Returns ------- @@ -5844,7 +5849,12 @@ def idxmax(self, axis=0, skipna=True): 0 or 'index' for row-wise, 1 or 'columns' for column-wise skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result - will be first index. + will be NA. + + Raises + ------ + ValueError + * If the row/column is empty Returns ------- diff --git a/pandas/core/series.py b/pandas/core/series.py index d2d7de0310b31..59606e86465c5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1306,7 +1306,13 @@ def idxmin(self, axis=None, skipna=True, *args, **kwargs): Parameters ---------- skipna : boolean, default True - Exclude NA/null values + Exclude NA/null values. If the entire Series is NA, the result + will be NA. + + Raises + ------ + ValueError + * If the Series is empty Returns ------- @@ -1336,7 +1342,13 @@ def idxmax(self, axis=None, skipna=True, *args, **kwargs): Parameters ---------- skipna : boolean, default True - Exclude NA/null values + Exclude NA/null values. If the entire Series is NA, the result + will be NA. + + Raises + ------ + ValueError + * If the Series is empty Returns ------- From 4df8ee27de56831fab8b4119398c45b49a445084 Mon Sep 17 00:00:00 2001 From: Matt Braymer-Hayes Date: Tue, 14 Nov 2017 14:50:57 -0800 Subject: [PATCH 41/85] DOC: Fix "drop=True" reset_index() reference in Feather and Parquet IO index caveats (#18285) (#18292) (cherry picked from commit 148ed63f9287cc55f7a2802da300b717d01cabe6) --- doc/source/io.rst | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index de3ae2e8f4305..7418617ae9004 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4469,8 +4469,10 @@ Several caveats. 
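A minimal sketch of the ``Raises: ValueError`` behaviour documented in the idxmin/idxmax docstring patch above (illustrative only; the exact message comes from numpy):

    import pandas as pd

    s = pd.Series([])
    try:
        s.idxmax()          # empty Series -> ValueError, per the docstring
    except ValueError as err:
        print('ValueError:', err)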
- This is a newer library, and the format, though stable, is not guaranteed to be backward compatible to the earlier versions. -- The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an - error if a non-default one is provided. You can simply ``.reset_index()`` in order to store the index. +- The format will NOT write an ``Index``, or ``MultiIndex`` for the + ``DataFrame`` and will raise an error if a non-default one is provided. You + can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to + ignore it. - Duplicate column names and non-string columns names are not supported - Non supported types include ``Period`` and actual python object types. These will raise a helpful error message on an attempt at serialization. @@ -4533,8 +4535,10 @@ dtypes, including extension dtypes such as datetime with tz. Several caveats. -- The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an - error if a non-default one is provided. You can simply ``.reset_index(drop=True)`` in order to store the index. +- The format will NOT write an ``Index``, or ``MultiIndex`` for the + ``DataFrame`` and will raise an error if a non-default one is provided. You + can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to + ignore it. - Duplicate column names and non-string columns names are not supported - Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype. - Non supported types include ``Period`` and actual python object types. These will raise a helpful error message @@ -4580,7 +4584,7 @@ Read from a parquet file. result.dtypes -Read only certain columns of a parquet file. +Read only certain columns of a parquet file. .. ipython:: python From 9d53574d0e100aecc1950ece24c2e3b9854c10cc Mon Sep 17 00:00:00 2001 From: Wes Turner Date: Thu, 16 Nov 2017 05:15:41 -0600 Subject: [PATCH 42/85] DOC: ecosystem: dask-ml (#18317) (cherry picked from commit 7c4ae124c2cf9a2087b0f08828c71ff1e8e4fb75) --- doc/source/ecosystem.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 2348a3d10c54f..991ed3bfd98dd 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -222,7 +222,13 @@ Out-of-core ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Dask is a flexible parallel computing library for analytics. Dask -allow a familiar ``DataFrame`` interface to out-of-core, parallel and distributed computing. +provides a familiar ``DataFrame`` interface for out-of-core, parallel and distributed computing. + +`Dask-ML `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Dask-ML enables parallel and distributed machine learning using Dask alongside existing machine learning libraries like Scikit-Learn, XGBoost, and TensorFlow. 
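An illustrative sketch of the familiar ``DataFrame`` interface Dask provides, as mentioned above (a hedged example; assumes dask is installed, names are for illustration):

    import pandas as pd
    import dask.dataframe as dd

    # partition a pandas DataFrame into a parallel dask DataFrame
    ddf = dd.from_pandas(pd.DataFrame({'x': range(10)}), npartitions=2)
    print(ddf['x'].sum().compute())  # lazy graph, evaluated by .compute()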
+ `Blaze `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 3a109eac69bd4379e3d70add8cf06b30d22c4c8c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 17 Nov 2017 06:28:36 -0500 Subject: [PATCH 43/85] TST: add downstream deps in 3.6 build (#18333) (cherry picked from commit a39f967b3d6e43edc62e3feae040e22eb69f20b4) --- ci/requirements-3.6.sh | 7 +++++++ ci/script_multi.sh | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 ci/requirements-3.6.sh diff --git a/ci/requirements-3.6.sh b/ci/requirements-3.6.sh new file mode 100644 index 0000000000000..dfc123c88f24b --- /dev/null +++ b/ci/requirements-3.6.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +source activate pandas + +echo "[install 3.6 downstream deps]" + +conda install -n pandas -c conda-forge pandas-gbq pandas-datareader xarray geopandas seaborn statsmodels scikit-learn dask diff --git a/ci/script_multi.sh b/ci/script_multi.sh index ee9fbcaad5ef5..ae8f030b92d66 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -e echo "[script multi]" From 3b802bdaade7995dbf43b4dc97c9ac62454c292c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 17 Nov 2017 07:33:54 -0500 Subject: [PATCH 44/85] CI: remove pandas-gbqg (#18343) (cherry picked from commit cfad581e9a6d35c7d05d2b1f34e4a19f7ee15cc6) --- ci/requirements-3.6.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-3.6.sh b/ci/requirements-3.6.sh index dfc123c88f24b..f5c3dbf59a29d 100644 --- a/ci/requirements-3.6.sh +++ b/ci/requirements-3.6.sh @@ -4,4 +4,4 @@ source activate pandas echo "[install 3.6 downstream deps]" -conda install -n pandas -c conda-forge pandas-gbq pandas-datareader xarray geopandas seaborn statsmodels scikit-learn dask +conda install -n pandas -c conda-forge pandas-datareader xarray geopandas seaborn statsmodels scikit-learn dask From e96375d1b9a3208f4dd84b0b791b328e8118ae0c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 18 Nov 2017 20:26:29 -0500 Subject: [PATCH 45/85] TST: don't skip statsmodels tests on network builds (#18362) (cherry picked from commit 1798c9df8144890f9d9b74cca9f3134ea523b201) --- pandas/tests/test_downstream.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 61f0c992225c6..b8e9191002640 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -52,7 +52,6 @@ def test_xarray(df): assert df.to_xarray() is not None -@tm.network def test_statsmodels(): statsmodels = import_module('statsmodels') # noqa From 3a1eeefd5ee077910ca50a415e9f13a8ba4c3c97 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 20 Nov 2017 19:21:03 +0100 Subject: [PATCH 46/85] PERF: improve plotting performance by not stringifying all x data (#18373) * add benchmark with basic default plotting (cherry picked from commit 8d04dafd48d541042613548183b62288ef7e97b3) --- asv_bench/benchmarks/plotting.py | 31 ++++++++++++++++++++++++++++--- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/plotting/_core.py | 12 ++++++++---- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index dda684b35e301..16889b2f19e89 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -10,15 +10,37 @@ def date_range(start=None, end=None, periods=None, freq=None): from pandas.tools.plotting import andrews_curves +class Plotting(object): + goal_time = 0.2 + + def setup(self): + import matplotlib + 
matplotlib.use('Agg')
+        self.s = Series(np.random.randn(1000000))
+        self.df = DataFrame({'col': self.s})
+
+    def time_series_plot(self):
+        self.s.plot()
+
+    def time_frame_plot(self):
+        self.df.plot()
+
+
 class TimeseriesPlotting(object):
     goal_time = 0.2

     def setup(self):
         import matplotlib
         matplotlib.use('Agg')
-        self.N = 2000
-        self.M = 5
-        self.df = DataFrame(np.random.randn(self.N, self.M), index=date_range('1/1/1975', periods=self.N))
+        N = 2000
+        M = 5
+        idx = date_range('1/1/1975', periods=N)
+        self.df = DataFrame(np.random.randn(N, M), index=idx)
+
+        idx_irregular = pd.DatetimeIndex(np.concatenate((idx.values[0:10],
+                                                         idx.values[12:])))
+        self.df2 = DataFrame(np.random.randn(len(idx_irregular), M),
+                             index=idx_irregular)

     def time_plot_regular(self):
         self.df.plot()
@@ -26,6 +48,9 @@ def time_plot_regular(self):
     def time_plot_regular_compat(self):
         self.df.plot(x_compat=True)

+    def time_plot_irregular(self):
+        self.df2.plot()
+

 class Misc(object):
     goal_time = 0.6
diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -39,7 +39,7 @@ Deprecations
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~

--
+- Improved performance of plotting large series/dataframes (:issue:`18236`).
 -
 -

diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
index 43f33cf30dea1..4a8bef69e4a4b 100644
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -383,12 +383,16 @@ def _add_table(self):

     def _post_plot_logic_common(self, ax, data):
         """Common post process for each axes"""
-        labels = [pprint_thing(key) for key in data.index]
-        labels = dict(zip(range(len(data.index)), labels))
+
+        def get_label(i):
+            try:
+                return pprint_thing(data.index[i])
+            except Exception:
+                return ''

         if self.orientation == 'vertical' or self.orientation is None:
             if self._need_to_set_index:
-                xticklabels = [labels.get(x, '') for x in ax.get_xticks()]
+                xticklabels = [get_label(x) for x in ax.get_xticks()]
                 ax.set_xticklabels(xticklabels)
             self._apply_axis_properties(ax.xaxis, rot=self.rot,
                                         fontsize=self.fontsize)
@@ -400,7 +404,7 @@ def _post_plot_logic_common(self, ax, data):

         elif self.orientation == 'horizontal':
             if self._need_to_set_index:
-                yticklabels = [labels.get(y, '') for y in ax.get_yticks()]
+                yticklabels = [get_label(y) for y in ax.get_yticks()]
                 ax.set_yticklabels(yticklabels)
             self._apply_axis_properties(ax.yaxis, rot=self.rot,
                                         fontsize=self.fontsize)
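For illustration, a minimal sketch of the workload this change speeds up, matching the benchmark added above (assumes matplotlib is installed):

    import numpy as np
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')  # headless backend, as in the benchmark setup

    # tick labels are now rendered lazily per tick instead of
    # stringifying the entire one-million-row index up front
    pd.Series(np.random.randn(1000000)).plot()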
From 57c24a3cff722c321759316123424e7f14085b6b Mon Sep 17 00:00:00 2001
From: Alexander Buchkovsky
Date: Tue, 21 Nov 2017 05:14:13 -0800
Subject: [PATCH 47/85] fix for BUG: resample with tz-aware: Values falls after last bin #15549 (#18337)

(cherry picked from commit 8efd1a0162a643c06af998bb1bc60a2cc9f5dbf6)
---
 doc/source/whatsnew/v0.21.1.txt |  3 +++
 pandas/core/resample.py         | 10 ++++++++++
 pandas/tests/test_resample.py   | 28 ++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 5231bc47571b0..3d67d29797841 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -62,6 +62,9 @@ Bug Fixes
 - Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values have a floating point issue (:issue:`18044`)
 - Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`)
 - Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`)
+- Bug in ``DataFrame.resample(...)`` when there is a time change (DST) and resampling frequency is 12h or higher (:issue:`15549`)
+- Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`)
+- Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`)

 Conversion
 ^^^^^^^^^^
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 5a571f9077999..40946318aa33a 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -1141,6 +1141,16 @@ def _get_time_bins(self, ax):
                                 tz=tz,
                                 name=ax.name)

+        # GH 15549
+        # In the edge case of tz-aware resampling, the binner's last index
+        # can be less than the last value in the data object; this happens
+        # because of a DST time change
+        if len(binner) > 1 and binner[-1] < last:
+            extra_date_range = pd.date_range(binner[-1], last + self.freq,
+                                             freq=self.freq, tz=tz,
+                                             name=ax.name)
+            binner = labels = binner.append(extra_date_range[1:])
+
         # a little hack
         trimmed = False
         if (len(binner) > 2 and binner[-2] == last and
diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py
index ba1a2ad1f42e2..a703d24952af1 100644
--- a/pandas/tests/test_resample.py
+++ b/pandas/tests/test_resample.py
@@ -2729,6 +2729,34 @@ def test_resample_weekly_bug_1726(self):
         # it works!
         df.resample('W-MON', closed='left', label='left').first()

+    def test_resample_with_dst_time_change(self):
+        # GH 15549
+        index = pd.DatetimeIndex([1457537600000000000, 1458059600000000000],
+                                 tz='UTC').tz_convert('America/Chicago')
+        df = pd.DataFrame([1, 2], index=index)
+        result = df.resample('12h', closed='right',
+                             label='right').last().ffill()
+
+        expected_index_values = ['2016-03-09 12:00:00-06:00',
+                                 '2016-03-10 00:00:00-06:00',
+                                 '2016-03-10 12:00:00-06:00',
+                                 '2016-03-11 00:00:00-06:00',
+                                 '2016-03-11 12:00:00-06:00',
+                                 '2016-03-12 00:00:00-06:00',
+                                 '2016-03-12 12:00:00-06:00',
+                                 '2016-03-13 00:00:00-06:00',
+                                 '2016-03-13 13:00:00-05:00',
+                                 '2016-03-14 01:00:00-05:00',
+                                 '2016-03-14 13:00:00-05:00',
+                                 '2016-03-15 01:00:00-05:00',
+                                 '2016-03-15 13:00:00-05:00']
+        index = pd.DatetimeIndex(expected_index_values,
+                                 tz='UTC').tz_convert('America/Chicago')
+        expected = pd.DataFrame([1.0, 1.0, 1.0, 1.0, 1.0,
+                                 1.0, 1.0, 1.0, 1.0, 1.0,
+                                 1.0, 1.0, 2.0], index=index)
+        assert_frame_equal(result, expected)
+
     def test_resample_bms_2752(self):
         # GH2753
         foo = pd.Series(index=pd.bdate_range('20000101', '20000201'))
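An illustrative sketch of the previously failing case, taken from the test above (assumes pytz timezone data is available):

    import pandas as pd

    index = pd.DatetimeIndex([1457537600000000000, 1458059600000000000],
                             tz='UTC').tz_convert('America/Chicago')
    df = pd.DataFrame([1, 2], index=index)
    # spans the US DST transition; previously this raised
    # "Values falls after last bin", now the bins are extended
    result = df.resample('12h', closed='right', label='right').last().ffill()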
From 5ab9595e4885f9de025f4f28855552d988108e6d Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Tue, 21 Nov 2017 08:18:18 -0500
Subject: [PATCH 48/85] DOC: clean up whatsnew 0.21.1 (#18408)

(cherry picked from commit c4a2cd362afdfbc21cb62a8e09758226e86735c4)
---
 doc/source/whatsnew/v0.21.1.txt | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 3d67d29797841..963154646c4a9 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -56,19 +56,13 @@ Bug Fixes
 ~~~~~~~~~

-- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`)
-- Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`)
-- Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`)
-- Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values have a floating point issue (:issue:`18044`)
-- Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`)
-- Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`)
-- Bug in ``DataFrame.resample(...)`` when there is a time change (DST) and resampling frequency is 12h or higher (:issue:`15549`)
-- Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`)
-- Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`)

 Conversion
 ^^^^^^^^^^

+- Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`)
+- Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`)
+- Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`)
 -
 -
 -
@@ -78,6 +72,7 @@ Indexing

 - Bug in a boolean comparison of a ``datetime.datetime`` and a ``datetime64[ns]`` dtype Series (:issue:`17965`)
 - Bug where a ``MultiIndex`` with more than a million records was not raising ``AttributeError`` when trying to access a missing attribute (:issue:`18165`)
+- Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`)
 -
 -
@@ -102,6 +97,9 @@ Plotting
 Groupby/Resample/Rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^

+- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`)
+- Bug in ``DataFrame.resample(...)`` when there is a time change (DST) and resampling frequency is 12h or higher (:issue:`15549`)
+- Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`)
 -
 -
 -
@@ -117,12 +115,14 @@ Reshaping
 ^^^^^^^^^

 - Error message in ``pd.merge_asof()`` for key datatype mismatch now includes datatype of left and right key (:issue:`18068`)
+- Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`)
 -
 -

 Numeric
 ^^^^^^^

+- Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` with all equal values have a floating point issue (:issue:`18044`)
 -
 -
 -
@@ -131,9 +131,6 @@ Categorical
 ^^^^^^^^^^^

 - Bug in :meth:`DataFrame.astype` where casting to 'category' on an empty ``DataFrame`` causes a segmentation fault (:issue:`18004`)
-- Error messages in the testing module have been improved when items have
-  different ``CategoricalDtype`` (:issue:`18069`)
--
 - Error messages in the testing module have been improved when items have different ``CategoricalDtype`` (:issue:`18069`)
 - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`)

From b46e96d4f9c1a0a7ddb90d7701e2f81b8767a3fe Mon Sep 17 00:00:00 2001
From: Licht Takeuchi
Date: Wed, 22 Nov 2017 11:03:51 +0900
Subject: [PATCH 49/85] BUG: Fix filter method so that it accepts byte and unicode column names (#18238)

(cherry picked from commit ec065b277ac09a4879828366cb99557ddb5eaa0a)
---
 doc/source/whatsnew/v0.21.1.txt               |  2 +-
 pandas/compat/__init__.py                     | 18 ++++++++++++++++
 pandas/core/generic.py                        | 10 ++++-----
 .../tests/frame/test_axis_select_reindex.py   | 21 +++++++++++++++++++
 4 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 963154646c4a9..06ae7870ea917 100644
---
a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -116,7 +116,7 @@ Reshaping - Error message in ``pd.merge_asof()`` for key datatype mismatch now includes datatype of left and right key (:issue:`18068`) - Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`) -- +- Bug in ``DataFrame.filter(...)`` when :class:`unicode` is passed as a condition in Python 2 (:issue:`13101`) - Numeric diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 4a201d065c0b6..288d9d7742daf 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -257,6 +257,16 @@ def u(s): def u_safe(s): return s + def to_str(s): + """ + Convert bytes and non-string into Python 3 str + """ + if isinstance(s, binary_type): + s = bytes_to_str(s) + elif not isinstance(s, string_types): + s = str(s) + return s + def strlen(data, encoding=None): # encoding is for compat with PY2 return len(data) @@ -302,6 +312,14 @@ def u_safe(s): except: return s + def to_str(s): + """ + Convert unicode and non-string into Python 2 str + """ + if not isinstance(s, string_types): + s = str(s) + return s + def strlen(data, encoding=None): try: data = data.decode(encoding) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 118e7d5cd437b..58d86251a4a62 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -49,7 +49,7 @@ from pandas.tseries.frequencies import to_offset from pandas import compat from pandas.compat.numpy import function as nv -from pandas.compat import (map, zip, lzip, lrange, string_types, +from pandas.compat import (map, zip, lzip, lrange, string_types, to_str, isidentifier, set_function_name, cPickle as pkl) from pandas.core.ops import _align_method_FRAME import pandas.core.nanops as nanops @@ -3235,14 +3235,14 @@ def filter(self, items=None, like=None, regex=None, axis=None): **{name: [r for r in items if r in labels]}) elif like: def f(x): - if not isinstance(x, string_types): - x = str(x) - return like in x + return like in to_str(x) values = labels.map(f) return self.loc(axis=axis)[values] elif regex: + def f(x): + return matcher.search(to_str(x)) is not None matcher = re.compile(regex) - values = labels.map(lambda x: matcher.search(str(x)) is not None) + values = labels.map(f) return self.loc(axis=axis)[values] else: raise TypeError('Must pass either `items`, `like`, or `regex`') diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 1e2f630401c89..343e235fb741c 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -884,6 +884,27 @@ def test_filter_regex_search(self): exp = df[[x for x in df.columns if 'BB' in x]] assert_frame_equal(result, exp) + @pytest.mark.parametrize('name,expected', [ + ('a', DataFrame({u'a': [1, 2]})), + (u'a', DataFrame({u'a': [1, 2]})), + (u'あ', DataFrame({u'あ': [3, 4]})) + ]) + def test_filter_unicode(self, name, expected): + # GH13101 + df = DataFrame({u'a': [1, 2], u'あ': [3, 4]}) + + assert_frame_equal(df.filter(like=name), expected) + assert_frame_equal(df.filter(regex=name), expected) + + @pytest.mark.parametrize('name', ['a', u'a']) + def test_filter_bytestring(self, name): + # GH13101 + df = DataFrame({b'a': [1, 2], b'b': [3, 4]}) + expected = DataFrame({b'a': [1, 2]}) + + assert_frame_equal(df.filter(like=name), expected) + assert_frame_equal(df.filter(regex=name), expected) + def test_filter_corner(self): empty = DataFrame() From 
b33bf4aae45479e31c8c89a7a2226fd257072852 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 21 Nov 2017 20:54:50 -0500 Subject: [PATCH 50/85] STYLE: fix flake8=3.4.1 (#18418) (cherry picked from commit bdbd6549f74174cd05edcab20c713c332ab548cb) --- ci/install_travis.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 4d8a371ba2994..dac3625cba4ba 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -107,7 +107,7 @@ time conda install -n pandas pytest>=3.1.0 time pip install pytest-xdist moto if [ "$LINT" ]; then - conda install flake8 + conda install flake8=3.4.1 pip install cpplint fi From 3113f4020e328e2dc7d2604b619881cb50809570 Mon Sep 17 00:00:00 2001 From: Sam Cohan Date: Wed, 22 Nov 2017 03:28:41 -0800 Subject: [PATCH 51/85] Read csv category fix (#18402) (cherry picked from commit d421a09e382109c1bbe064107c4024b065839de2) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/_libs/parsers.pyx | 7 ++++--- pandas/tests/io/parser/dtypes.py | 11 +++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 06ae7870ea917..b9bf19147c487 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -82,6 +82,7 @@ I/O - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`) - Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`) +- Bug in :func:`read_csv` when reading numeric category fields with high cardinality (:issue:`18186`) - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) - :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`) - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 44fad899ff099..a90039d789972 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2221,9 +2221,10 @@ def _concatenate_chunks(list chunks): for name in names: arrs = [chunk.pop(name) for chunk in chunks] # Check each arr for consistent types. 
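        # Illustrative aside (not from the original source): the change
        # below stops passing CategoricalDtype into np.find_common_type,
        # which fixes chunked reads of high-cardinality 'category'
        # columns, roughly:
        #     from pandas.compat import StringIO
        #     data = '\n'.join(str(i) for i in range(524289))
        #     pd.read_csv(StringIO('a\n' + data), dtype='category')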
- dtypes = set([a.dtype for a in arrs]) - if len(dtypes) > 1: - common_type = np.find_common_type(dtypes, []) + dtypes = {a.dtype for a in arrs} + numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)} + if len(numpy_dtypes) > 1: + common_type = np.find_common_type(numpy_dtypes, []) if common_type == np.object: warning_columns.append(str(name)) diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 7d3df6201a390..b91ce04673e29 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -114,6 +114,17 @@ def test_categorical_dtype(self): actual = self.read_csv(StringIO(data), dtype='category') tm.assert_frame_equal(actual, expected) + @pytest.mark.slow + def test_categorical_dtype_high_cardinality_numeric(self): + # GH 18186 + data = np.sort([str(i) for i in range(524289)]) + expected = DataFrame({'a': Categorical(data, ordered=True)}) + actual = self.read_csv(StringIO('a\n' + '\n'.join(data)), + dtype='category') + actual["a"] = actual["a"].cat.reorder_categories( + np.sort(actual.a.cat.categories), ordered=True) + tm.assert_frame_equal(actual, expected) + def test_categorical_dtype_encoding(self): # GH 10153 pth = tm.get_data_path('unicode_series.csv') From 542533aaeb76aca3a8d8019f756f188bc0c8d499 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 22 Nov 2017 20:32:02 +0100 Subject: [PATCH 52/85] DOC: add whatsnew for #17882 (#18433) (cherry picked from commit cf909957dae0a07fcb00379aeb2ed9323cd888ff) --- doc/source/whatsnew/v0.21.1.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index b9bf19147c487..6da206a61b193 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -86,6 +86,8 @@ I/O - Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) - :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`) - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`) +- Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`). 
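A hedged sketch of the behaviour referenced above (table and column names are illustrative; assumes SQLAlchemy with an in-memory SQLite engine):

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine('sqlite://')
    pd.DataFrame({'IntDateOnlyCol': [20101010, 20101212]}).to_sql(
        't', engine, index=False)
    # integer dates parsed with an explicit strftime format
    df = pd.read_sql('SELECT * FROM t', engine,
                     parse_dates={'IntDateOnlyCol': '%Y%m%d'})
    # -> Timestamp('2010-10-10'), Timestamp('2010-12-12')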
+ Plotting From be26b7ec862df24bcc2407f6d46dbf9cd79a9787 Mon Sep 17 00:00:00 2001 From: Dror Atariah Date: Wed, 22 Nov 2017 20:24:36 +0100 Subject: [PATCH 53/85] BUG: formating integers datetimes using sql GH17855 (#17882) (cherry picked from commit bc956290bae647e5cd4aeac964d79883df213001) --- pandas/io/sql.py | 12 ++++----- pandas/tests/io/test_sql.py | 50 +++++++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c42c19e1357bc..a9b4f504dd624 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -103,12 +103,12 @@ def _handle_date_column(col, utc=None, format=None): if isinstance(format, dict): return to_datetime(col, errors='ignore', **format) else: - if format in ['D', 's', 'ms', 'us', 'ns']: - return to_datetime(col, errors='coerce', unit=format, utc=utc) - elif (issubclass(col.dtype.type, np.floating) or - issubclass(col.dtype.type, np.integer)): - # parse dates as timestamp - format = 's' if format is None else format + # Allow passing of formatting string for integers + # GH17855 + if format is None and (issubclass(col.dtype.type, np.floating) or + issubclass(col.dtype.type, np.integer)): + format = 's' + if format in ['D', 'd', 'h', 'm', 's', 'ms', 'us', 'ns']: return to_datetime(col, errors='coerce', unit=format, utc=utc) elif is_datetime64tz_dtype(col): # coerce to UTC timezone diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 2df43158b5370..4528565eefa0c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -88,6 +88,7 @@ "TextCol" TEXT, "DateCol" TEXT, "IntDateCol" INTEGER, + "IntDateOnlyCol" INTEGER, "FloatCol" REAL, "IntCol" INTEGER, "BoolCol" INTEGER, @@ -98,6 +99,7 @@ `TextCol` TEXT, `DateCol` DATETIME, `IntDateCol` INTEGER, + `IntDateOnlyCol` INTEGER, `FloatCol` DOUBLE, `IntCol` INTEGER, `BoolCol` BOOLEAN, @@ -109,6 +111,7 @@ "DateCol" TIMESTAMP, "DateColWithTz" TIMESTAMP WITH TIME ZONE, "IntDateCol" INTEGER, + "IntDateOnlyCol" INTEGER, "FloatCol" DOUBLE PRECISION, "IntCol" INTEGER, "BoolCol" BOOLEAN, @@ -120,31 +123,33 @@ 'sqlite': { 'query': """ INSERT INTO types_test_data - VALUES(?, ?, ?, ?, ?, ?, ?, ?) + VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) 
""", 'fields': ( - 'TextCol', 'DateCol', 'IntDateCol', 'FloatCol', - 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' + 'TextCol', 'DateCol', 'IntDateCol', 'IntDateOnlyCol', + 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull', + 'BoolColWithNull' ) }, 'mysql': { 'query': """ INSERT INTO types_test_data - VALUES("%s", %s, %s, %s, %s, %s, %s, %s) + VALUES("%s", %s, %s, %s, %s, %s, %s, %s, %s) """, 'fields': ( - 'TextCol', 'DateCol', 'IntDateCol', 'FloatCol', - 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' + 'TextCol', 'DateCol', 'IntDateCol', 'IntDateOnlyCol', + 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull', + 'BoolColWithNull' ) }, 'postgresql': { 'query': """ INSERT INTO types_test_data - VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s) + VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """, 'fields': ( 'TextCol', 'DateCol', 'DateColWithTz', - 'IntDateCol', 'FloatCol', + 'IntDateCol', 'IntDateOnlyCol', 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' ) }, @@ -313,13 +318,13 @@ def _load_raw_sql(self): self.drop_table('types_test_data') self._get_exec().execute(SQL_STRINGS['create_test_types'][self.flavor]) ins = SQL_STRINGS['insert_test_types'][self.flavor] - data = [ { 'TextCol': 'first', 'DateCol': '2000-01-03 00:00:00', 'DateColWithTz': '2000-01-01 00:00:00-08:00', 'IntDateCol': 535852800, + 'IntDateOnlyCol': 20101010, 'FloatCol': 10.10, 'IntCol': 1, 'BoolCol': False, @@ -331,6 +336,7 @@ def _load_raw_sql(self): 'DateCol': '2000-01-04 00:00:00', 'DateColWithTz': '2000-06-01 00:00:00-07:00', 'IntDateCol': 1356998400, + 'IntDateOnlyCol': 20101212, 'FloatCol': 10.10, 'IntCol': 1, 'BoolCol': False, @@ -610,20 +616,42 @@ def test_date_parsing(self): df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, parse_dates=['DateCol']) assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + pd.Timestamp(2000, 1, 3, 0, 0, 0), + pd.Timestamp(2000, 1, 4, 0, 0, 0) + ] df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'}) assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + pd.Timestamp(2000, 1, 3, 0, 0, 0), + pd.Timestamp(2000, 1, 4, 0, 0, 0) + ] df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, parse_dates=['IntDateCol']) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + pd.Timestamp(1986, 12, 25, 0, 0, 0), + pd.Timestamp(2013, 1, 1, 0, 0, 0) + ] df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, parse_dates={'IntDateCol': 's'}) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + pd.Timestamp(1986, 12, 25, 0, 0, 0), + pd.Timestamp(2013, 1, 1, 0, 0, 0) + ] + + df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, + parse_dates={'IntDateOnlyCol': '%Y%m%d'}) + assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) + assert df.IntDateOnlyCol.tolist() == [ + pd.Timestamp('2010-10-10'), + pd.Timestamp('2010-12-12') + ] def test_date_and_index(self): # Test case where same column appears in parse_date and index_col From ea264b53e44bf14ed72a4e6149187ce2c1e6d5dc Mon Sep 17 00:00:00 2001 From: Jan Werkmann Date: Thu, 23 Nov 2017 16:29:01 +0100 Subject: [PATCH 54/85] Numpy bool msgpack bugfix (#18395) (cherry picked from commit 6c074d10090062f95b4510f126edd6a0bb93b163) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/io/msgpack/_packer.pyx | 3 ++- pandas/tests/io/test_packers.py | 
13 ++++++++++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 6da206a61b193..04e4413b19788 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -87,6 +87,7 @@ I/O - :func:`read_parquet` now allows to specify the columns to read from a parquet file (:issue:`18154`) - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`) - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`). +- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`) diff --git a/pandas/io/msgpack/_packer.pyx b/pandas/io/msgpack/_packer.pyx index fd3f4612fb432..f175a6743f44b 100644 --- a/pandas/io/msgpack/_packer.pyx +++ b/pandas/io/msgpack/_packer.pyx @@ -8,6 +8,7 @@ from libc.limits cimport * from pandas.io.msgpack.exceptions import PackValueError from pandas.io.msgpack import ExtType +import numpy as np cdef extern from "../../src/msgpack/pack.h": @@ -133,7 +134,7 @@ cdef class Packer(object): while True: if o is None: ret = msgpack_pack_nil(&self.pk) - elif isinstance(o, bool): + elif isinstance(o, (bool, np.bool_)): if o: ret = msgpack_pack_true(&self.pk) else: diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index a28adcf1ee771..bc58ea1c7c228 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -180,6 +180,15 @@ def test_scalar_float(self): x_rec = self.encode_decode(x) tm.assert_almost_equal(x, x_rec) + def test_scalar_bool(self): + x = np.bool_(1) + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x, x_rec) + + x = np.bool_(0) + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x, x_rec) + def test_scalar_complex(self): x = np.random.rand() + 1j * np.random.rand() x_rec = self.encode_decode(x) @@ -263,7 +272,7 @@ def test_numpy_array_complex(self): x.dtype == x_rec.dtype) def test_list_mixed(self): - x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo')] + x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo'), np.bool_(1)] x_rec = self.encode_decode(x) # current msgpack cannot distinguish list/tuple tm.assert_almost_equal(tuple(x), x_rec) @@ -401,6 +410,7 @@ def setup_method(self, method): 'G': [Timestamp('20130102', tz='US/Eastern')] * 5, 'H': Categorical([1, 2, 3, 4, 5]), 'I': Categorical([1, 2, 3, 4, 5], ordered=True), + 'J': (np.bool_(1), 2, 3, 4, 5), } self.d['float'] = Series(data['A']) @@ -410,6 +420,7 @@ def setup_method(self, method): self.d['dt_tz'] = Series(data['G']) self.d['cat_ordered'] = Series(data['H']) self.d['cat_unordered'] = Series(data['I']) + self.d['numpy_bool_mixed'] = Series(data['J']) def test_basic(self): From 7f9dac834851ce16905ea83e0d61d5bbd1b8943f Mon Sep 17 00:00:00 2001 From: bolkedebruin Date: Thu, 23 Nov 2017 16:35:40 +0100 Subject: [PATCH 55/85] [BUG-FIX] DataFrame created with tzinfo cannot use to_dict(orient="records") (#18416) Closes #18372 (cherry picked from commit 4e0948030de512b353e0a39b3d3c309b77c3f3f2) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/core/frame.py | 2 +- pandas/tests/frame/test_convert_to.py | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 04e4413b19788..c847650a3caee 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -63,7 +63,7 @@ Conversion - Bug in 
:class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`)
 - Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`)
 - Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`)
--
+- Bug in :func:`DataFrame.to_dict` where columns of datetime that are tz-aware were not converted to required arrays when used with ``orient='records'``, raising ``TypeError`` (:issue:`18372`)
 -
 -
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index cb4b5c9f4b082..eef787a1dd912 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -997,7 +997,7 @@ def to_dict(self, orient='dict', into=dict):
                           for k, v in compat.iteritems(self))
         elif orient.lower().startswith('r'):
             return [into_c((k, _maybe_box_datetimelike(v))
-                           for k, v in zip(self.columns, row))
+                           for k, v in zip(self.columns, np.atleast_1d(row)))
                     for row in self.values]
         elif orient.lower().startswith('i'):
             return into_c((k, v.to_dict(into)) for k, v in self.iterrows())
diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py
index 5bdb76494f4c8..7d2d18db8d41c 100644
--- a/pandas/tests/frame/test_convert_to.py
+++ b/pandas/tests/frame/test_convert_to.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 
+from datetime import datetime
+
 import pytest
+import pytz
 
 import collections
 import numpy as np
@@ -249,3 +252,18 @@ def test_to_dict_box_scalars(self):
 
         result = DataFrame(d).to_dict(orient='records')
         assert isinstance(result[0]['a'], (int, long))
+
+    def test_frame_to_dict_tz(self):
+        # GH18372 When converting to dict with orient='records' columns of
+        # datetime that are tz-aware were not converted to required arrays
+        data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),),
+                (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)]
+        df = DataFrame(list(data), columns=["d", ])
+
+        result = df.to_dict(orient='records')
+        expected = [
+            {'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)},
+            {'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)},
+        ]
+        tm.assert_dict_equal(result[0], expected[0])
+        tm.assert_dict_equal(result[1], expected[1])
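The one-line fix above hinges on ``np.atleast_1d`` normalizing each extracted row. A minimal standalone sketch of the failure mode, in plain NumPy rather than the pandas internals (the names below are illustrative only):

```python
import numpy as np

# With a single tz-aware column, a "row" pulled out of the values can be a
# bare scalar; zipping column names against it then raises TypeError,
# because a scalar is not iterable.
columns = ['d']
row = np.datetime64('2017-11-18T21:53:00')  # scalar stand-in for one row

# np.atleast_1d wraps the scalar in a length-1 array, so pairing names
# with values works for both scalar and array-shaped rows.
record = {k: v for k, v in zip(columns, np.atleast_1d(row))}
print(record)  # {'d': numpy.datetime64('2017-11-18T21:53:00')}
```
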
From 5aca87de2317d6c9e86ebe125a4bf304b25c9953 Mon Sep 17 00:00:00 2001
From: topper-123 
Date: Thu, 23 Nov 2017 16:13:47 +0000
Subject: [PATCH 56/85] BUG: Copy categorical codes if empty (fixes #18051)
 (#18436)

(cherry picked from commit b45325e283b16ec8869aaea407de8256fc234f33)
---
 doc/source/whatsnew/v0.21.1.txt       |  1 +
 pandas/core/categorical.py            |  2 +-
 pandas/tests/series/test_analytics.py | 14 ++++++++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index c847650a3caee..8595a4e414700 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -138,6 +138,7 @@ Categorical
 - Error messages in the testing module have been improved when items have
   different ``CategoricalDtype`` (:issue:`18069`)
 - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`)
+- Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`)
 
 Other
 ^^^^^
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index e709c771b7d18..c574e6d56916b 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -2268,7 +2268,7 @@ def _recode_for_categories(codes, old_categories, new_categories):
 
     if len(old_categories) == 0:
         # All null anyway, so just retain the nulls
-        return codes
+        return codes.copy()
     indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),
                                    new_categories)
     new_codes = take_1d(indexer, codes.copy(), fill_value=-1)
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 8cc40bb5146c5..2ee404ab5fe0d 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -848,6 +848,12 @@ def test_value_counts_nunique(self):
         result = series.nunique()
         assert result == 11
 
+        # GH 18051
+        s = pd.Series(pd.Categorical([]))
+        assert s.nunique() == 0
+        s = pd.Series(pd.Categorical([np.nan]))
+        assert s.nunique() == 0
+
     def test_unique(self):
 
         # 714 also, dtype=float
@@ -920,6 +926,14 @@ def test_drop_duplicates(self):
             sc.drop_duplicates(keep=False, inplace=True)
             assert_series_equal(sc, s[~expected])
 
+        # GH 18051
+        s = pd.Series(pd.Categorical([]))
+        tm.assert_categorical_equal(s.unique(), pd.Categorical([]),
+                                    check_dtype=False)
+        s = pd.Series(pd.Categorical([np.nan]))
+        tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]),
+                                    check_dtype=False)
+
     def test_clip(self):
         val = self.ts.median()
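Returning ``codes.copy()`` rather than ``codes`` matters because the bare return aliases the caller's buffer, so later in-place edits (or a read-only flag on the source array) leak through to the original. A small self-contained illustration of the aliasing hazard, using plain NumPy rather than the pandas code path:

```python
import numpy as np

def recode_no_copy(codes):
    return codes           # aliases the caller's array

def recode_with_copy(codes):
    return codes.copy()    # independent, writeable buffer

original = np.array([0, 1, -1], dtype=np.int8)
aliased = recode_no_copy(original)
aliased[0] = -1
print(original)            # [-1  1 -1] -- the caller's data was mutated

original = np.array([0, 1, -1], dtype=np.int8)
safe = recode_with_copy(original)
safe[0] = -1
print(original)            # [ 0  1 -1] -- untouched
```
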
From de9a7f00fee5ea85bfe7edfa354588fb5b2e48dd Mon Sep 17 00:00:00 2001
From: Licht Takeuchi 
Date: Sun, 26 Nov 2017 06:01:21 +0900
Subject: [PATCH 57/85] BUG: Fix Index.putmask makes stack overflow with an
 invalid mask (#18407)

(cherry picked from commit b69c1a26899b38adff8390236ee83ba36af0374e)
---
 doc/source/whatsnew/v0.21.1.txt |  2 +-
 pandas/core/indexes/base.py     |  5 ++++-
 pandas/tests/indexes/common.py  | 13 +++++++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 8595a4e414700..5fdf180d74e2c 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -73,7 +73,7 @@ Indexing
 - Bug in a boolean comparison of a ``datetime.datetime`` and a ``datetime64[ns]`` dtype Series (:issue:`17965`)
 - Bug where a ``MultiIndex`` with more than a million records was not raising ``AttributeError`` when trying to access a missing attribute (:issue:`18165`)
 - Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`)
--
+- Bug in ``Index.putmask`` when an invalid mask is passed (:issue:`18368`)
 -
 
 I/O
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 9e14a3838733e..83c78f084a9da 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -1934,7 +1934,10 @@ def putmask(self, mask, value):
         try:
             np.putmask(values, mask, self._convert_for_op(value))
             return self._shallow_copy(values)
-        except (ValueError, TypeError):
+        except (ValueError, TypeError) as err:
+            if is_object_dtype(self):
+                raise err
+
             # coerces to object
             return self.astype(object).putmask(mask, value)
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 456e5a9bd6439..3a57337efea6f 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -996,3 +996,16 @@ def test_searchsorted_monotonic(self, indices):
         # non-monotonic should raise.
         with pytest.raises(ValueError):
             indices._searchsorted_monotonic(value, side='left')
+
+    def test_putmask_with_wrong_mask(self):
+        # GH18368
+        index = self.create_index()
+
+        with pytest.raises(ValueError):
+            index.putmask(np.ones(len(index) + 1, np.bool), 1)
+
+        with pytest.raises(ValueError):
+            index.putmask(np.ones(len(index) - 1, np.bool), 1)
+
+        with pytest.raises(ValueError):
+            index.putmask('foo', 1)
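The ``raise err`` above matters because, for an object-dtype ``Index``, the fallback ``self.astype(object).putmask(...)`` calls straight back into the same method and recurses until the interpreter's stack overflows. A standalone sketch of the ``ValueError`` that the fast path produces on a bad mask (plain NumPy; illustrative only):

```python
import numpy as np

values = np.array(['a', 'b', 'c'], dtype=object)

# A boolean mask of the right length works in place:
np.putmask(values, np.array([True, False, True]), 'x')
print(values)  # ['x' 'b' 'x']

# A mask of the wrong length makes np.putmask raise ValueError; re-raising
# it for object dtype (instead of retrying via astype(object)) is what
# breaks the infinite recursion.
try:
    np.putmask(values, np.ones(4, dtype=bool), 'y')
except ValueError as err:
    print(err)  # e.g. "putmask: mask and data must be the same size"
```
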
""" cdef: - double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta, mean_x_old int64_t s, e bint is_variable Py_ssize_t i, j, N @@ -749,6 +753,9 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, add_var(input[i], &nobs, &mean_x, &ssqdm_x) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + # After the first window, observations can both be added and # removed for i from win <= i < N: @@ -760,10 +767,12 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # Adding one observation and removing another one delta = val - prev - prev -= mean_x + mean_x_old = mean_x + mean_x += delta / nobs - val -= mean_x - ssqdm_x += (val + prev) * delta + ssqdm_x += ((nobs - 1) * val + + (nobs + 1) * prev + - 2 * nobs * mean_x_old) * delta / nobs else: add_var(val, &nobs, &mean_x, &ssqdm_x) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 165813a89b5db..35ae4ad4d5db4 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2491,6 +2491,14 @@ def test_rolling_corr_pairwise(self): self._check_pairwise_moment('rolling', 'corr', window=10, min_periods=5) + @pytest.mark.parametrize('window', range(7)) + def test_rolling_corr_with_zero_variance(self, window): + # GH 18430 + s = pd.Series(np.zeros(20)) + other = pd.Series(np.arange(20)) + + assert s.rolling(window=window).corr(other=other).isna().all() + def _check_pairwise_moment(self, dispatch, name, **kwargs): def get_result(obj, obj2=None): return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) From 7a72e3e9cefea25d66aa6ef8ef7a0e41ba8c4667 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 25 Nov 2017 16:51:13 -0500 Subject: [PATCH 59/85] Propogating NaN values when using str.split (#18450) (#18462) (cherry picked from commit 20f65126e0de65876bf412fa4280d8725afe2260) --- doc/source/whatsnew/v0.21.1.txt | 6 +++++- pandas/core/strings.py | 4 ++++ pandas/tests/test_strings.py | 12 ++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index d003cb3a6f0d5..04bd93331ad4a 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -140,9 +140,13 @@ Categorical - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`) - Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`) +String +^^^^^^ + +- :meth:`Series.str.split()` will now propogate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) + Other ^^^^^ - - -- diff --git a/pandas/core/strings.py b/pandas/core/strings.py index abef6f6086dbd..9614641aa1abf 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1423,6 +1423,10 @@ def cons_row(x): return [x] result = [cons_row(x) for x in result] + if result: + # propogate nan values to match longest sequence (GH 18450) + max_len = max(len(x) for x in result) + result = [x * max_len if x[0] is np.nan else x for x in result] if not isinstance(expand, bool): raise ValueError("expand must be True or False") diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index f1b97081b6d93..8aa69bcbfdf7f 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -2086,6 +2086,18 @@ def 
From 7a72e3e9cefea25d66aa6ef8ef7a0e41ba8c4667 Mon Sep 17 00:00:00 2001
From: William Ayd 
Date: Sat, 25 Nov 2017 16:51:13 -0500
Subject: [PATCH 59/85] Propogating NaN values when using str.split (#18450)
 (#18462)

(cherry picked from commit 20f65126e0de65876bf412fa4280d8725afe2260)
---
 doc/source/whatsnew/v0.21.1.txt | 6 +++++-
 pandas/core/strings.py          | 4 ++++
 pandas/tests/test_strings.py    | 12 ++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index d003cb3a6f0d5..04bd93331ad4a 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -140,9 +140,13 @@ Categorical
 - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`)
 - Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`)
 
+String
+^^^^^^
+
+- :meth:`Series.str.split()` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`)
+
 Other
 ^^^^^
 
 -
 -
--
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index abef6f6086dbd..9614641aa1abf 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -1423,6 +1423,10 @@ def cons_row(x):
             return [x]
 
     result = [cons_row(x) for x in result]
+    if result:
+        # propagate nan values to match longest sequence (GH 18450)
+        max_len = max(len(x) for x in result)
+        result = [x * max_len if x[0] is np.nan else x for x in result]
 
     if not isinstance(expand, bool):
         raise ValueError("expand must be True or False")
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index f1b97081b6d93..8aa69bcbfdf7f 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -2086,6 +2086,18 @@ def test_rsplit_to_multiindex_expand(self):
         tm.assert_index_equal(result, exp)
         assert result.nlevels == 2
 
+    def test_split_nan_expand(self):
+        # gh-18450
+        s = Series(["foo,bar,baz", NA])
+        result = s.str.split(",", expand=True)
+        exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]])
+        tm.assert_frame_equal(result, exp)
+
+        # check that these are actually np.nan and not None
+        # TODO see GH 18463
+        # tm.assert_frame_equal does not differentiate
+        assert all(np.isnan(x) for x in result.iloc[1])
+
     def test_split_with_name(self):
         # GH 12617

From ea264b53e44bf14ed72a4e6149187ce2c1e6d5dc Mon Sep 17 00:00:00 2001
From: Jeff Reback 
Date: Sat, 25 Nov 2017 19:30:34 -0500
Subject: [PATCH 60/85] CI: remove pandas-gbq from 3.5 build to avoid conflicts
 with 3.6 build_test (#18492)

(cherry picked from commit 38f41e64f4b8a0479f8835022af5e7343ccf8498)
---
 ci/requirements-3.5.pip | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip
index 6e4f7b65f9728..0d9e44cf39fa4 100644
--- a/ci/requirements-3.5.pip
+++ b/ci/requirements-3.5.pip
@@ -1,2 +1 @@
 xarray==0.9.1
-pandas-gbq

From 237ed875085c68481483bb9d402d98838baf8eb1 Mon Sep 17 00:00:00 2001
From: Alexander Michael Schade <3345464+aschade@users.noreply.github.com>
Date: Sun, 26 Nov 2017 10:19:13 -0500
Subject: [PATCH 61/85] Fix tzaware dates mismatch but no exception raised
 (#18488)

(cherry picked from commit d1010643fea058ba43c2c7124af75cc462ccf242)
---
 doc/source/whatsnew/v0.21.1.txt                 |  2 +-
 pandas/_libs/tslibs/timezones.pyx               |  7 +++----
 .../tests/indexes/datetimes/test_date_range.py  | 16 ++++++++++++++++
 pandas/tests/tseries/test_timezones.py          |  2 +-
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 04bd93331ad4a..489446cd6bcef 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -64,7 +64,7 @@ Conversion
 - Bug in :class:`DatetimeIndex` subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`)
 - Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`)
 - Bug in :func:`DataFrame.to_dict` where columns of datetime that are tz-aware were not converted to required arrays when used with ``orient='records'``, raising ``TypeError`` (:issue:`18372`)
--
+- Bug in :class:`DatetimeIndex` and :meth:`date_range` where mismatching tz-aware ``start`` and ``end`` timezones would not raise an error if ``end.tzinfo`` is None (:issue:`18431`)
 -
 
 Indexing
diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx
index 7f778dde86e23..ba7031bc382b1 100644
--- a/pandas/_libs/tslibs/timezones.pyx
+++ b/pandas/_libs/tslibs/timezones.pyx
@@ -283,10 +283,9 @@ cdef object get_dst_info(object tz):
 def infer_tzinfo(start, end):
     if start is not None and end is not None:
         tz = start.tzinfo
-        if end.tzinfo:
-            if not (get_timezone(tz) == get_timezone(end.tzinfo)):
-                msg = 'Inputs must both have the same timezone, {tz1} != {tz2}'
-                raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo))
+        if not (get_timezone(tz) == get_timezone(end.tzinfo)):
+            msg = 'Inputs must both have the same timezone, {tz1} != {tz2}'
+            raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo))
     elif start is not None:
         tz = start.tzinfo
     elif end is not None:
diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
index 1fca0445de5c4..1349f2f761a2f 100644
--- 
a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -234,6 +234,22 @@ def test_precision_finer_than_offset(self): tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) + dt1, dt2 = '2017-01-01', '2017-01-01' + tz1, tz2 = 'US/Eastern', 'Europe/London' + + @pytest.mark.parametrize("start,end", [ + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2)), + (pd.Timestamp(dt1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz2), pd.Timestamp(dt2, tz=tz1)) + ]) + def test_mismatching_tz_raises_err(self, start, end): + # issue 18488 + with pytest.raises(TypeError): + pd.date_range(start, end) + with pytest.raises(TypeError): + pd.DatetimeIndex(start, end, freq=BDay()) + class TestBusinessDateRange(object): diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py index 724628649796d..823e22c4f87d1 100644 --- a/pandas/tests/tseries/test_timezones.py +++ b/pandas/tests/tseries/test_timezones.py @@ -424,7 +424,7 @@ def test_with_tz(self): # datetimes with tzinfo set dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), - '1/1/2009', tz=pytz.utc) + datetime(2009, 1, 1, tzinfo=pytz.utc)) pytest.raises(Exception, bdate_range, datetime(2005, 1, 1, tzinfo=pytz.utc), '1/1/2009', From 94121f905323bdb7dba8d4061ed65d7107b4ba18 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 26 Nov 2017 16:46:51 -0500 Subject: [PATCH 62/85] TST: move gbq back to 3.5 build and remove from BUILD_TEST (#18506) (cherry picked from commit 982ad07cf38ba4567ddf17d3cfe3e986d1adaae1) --- ci/requirements-2.7_BUILD_TEST.pip | 1 - ci/requirements-3.5.pip | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-2.7_BUILD_TEST.pip b/ci/requirements-2.7_BUILD_TEST.pip index a0fc77c40bc00..f4617133cad5b 100644 --- a/ci/requirements-2.7_BUILD_TEST.pip +++ b/ci/requirements-2.7_BUILD_TEST.pip @@ -1,7 +1,6 @@ xarray geopandas seaborn -pandas_gbq pandas_datareader statsmodels scikit-learn diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip index 0d9e44cf39fa4..c9565f2173070 100644 --- a/ci/requirements-3.5.pip +++ b/ci/requirements-3.5.pip @@ -1 +1,2 @@ xarray==0.9.1 +pandas_gbq From 80efa3e7027219b0a1b31ea6b7c54560d1116126 Mon Sep 17 00:00:00 2001 From: Kevin Kuhl Date: Mon, 27 Nov 2017 05:34:56 -0600 Subject: [PATCH 63/85] COMPAT: reading json with lines=True from s3, xref #17200 (#17201) (cherry picked from commit 4fd104a72a825914851820fee623fbcdf1a989a7) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/io/json/json.py | 20 ++++--- pandas/tests/io/conftest.py | 74 +++++++++++++++++++++++++ pandas/tests/io/json/test_pandas.py | 65 +++++++++++++++++++++- pandas/tests/io/parser/data/items.jsonl | 2 + pandas/tests/io/parser/test_network.py | 48 ---------------- 6 files changed, 152 insertions(+), 59 deletions(-) create mode 100644 pandas/tests/io/conftest.py create mode 100644 pandas/tests/io/parser/data/items.jsonl diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 489446cd6bcef..bc01ed082b620 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -88,7 +88,7 @@ I/O - :func:`read_parquet` now allows to specify kwargs which are passed to the respective engine (:issue:`18216`) - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`). 
- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`)
-
+- Bug in :func:`read_json` not decoding when reading line delimited JSON from S3 (:issue:`17200`)
 
 
 Plotting
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
index be39f4baba0fb..203b1d62fcbf3 100644
--- a/pandas/io/json/json.py
+++ b/pandas/io/json/json.py
@@ -5,7 +5,7 @@
 
 import pandas._libs.json as json
 from pandas._libs.tslib import iNaT
-from pandas.compat import StringIO, long, u
+from pandas.compat import StringIO, long, u, to_str
 from pandas import compat, isna
 from pandas import Series, DataFrame, to_datetime, MultiIndex
 from pandas.io.common import (get_filepath_or_buffer, _get_handle,
@@ -458,8 +458,10 @@ def read(self):
         if self.lines and self.chunksize:
             obj = concat(self)
         elif self.lines:
+
+            data = to_str(self.data)
             obj = self._get_object_parser(
-                self._combine_lines(self.data.split('\n'))
+                self._combine_lines(data.split('\n'))
             )
         else:
             obj = self._get_object_parser(self.data)
@@ -612,7 +614,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
             try:
                 dtype = np.dtype(dtype)
                 return data.astype(dtype), True
-            except:
+            except (TypeError, ValueError):
                 return data, False
 
         if convert_dates:
@@ -628,7 +630,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
             try:
                 data = data.astype('float64')
                 result = True
-            except:
+            except (TypeError, ValueError):
                 pass
 
         if data.dtype.kind == 'f':
@@ -639,7 +641,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
                 try:
                     data = data.astype('float64')
                     result = True
-                except:
+                except (TypeError, ValueError):
                     pass
 
         # don't coerce 0-len data
@@ -651,7 +653,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
                 if (new_data == data).all():
                     data = new_data
                     result = True
-            except:
+            except (TypeError, ValueError):
                 pass
 
         # coerce ints to 64
@@ -661,7 +663,7 @@ def _try_convert_data(self, name, data, use_dtypes=True,
             try:
                 data = data.astype('int64')
                 result = True
-            except:
+            except (TypeError, ValueError):
                 pass
 
         return data, result
@@ -680,7 +682,7 @@ def _try_convert_to_date(self, data):
         if new_data.dtype == 'object':
             try:
                 new_data = data.astype('int64')
-            except:
+            except (TypeError, ValueError):
                 pass
 
         # ignore numbers that are out of range
@@ -697,7 +699,7 @@ def _try_convert_to_date(self, data):
                                         unit=date_unit)
             except ValueError:
                 continue
-            except:
+            except Exception:
                 break
         return new_data, True
     return data, False
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
new file mode 100644
index 0000000000000..828d5d0ccd3c6
--- /dev/null
+++ b/pandas/tests/io/conftest.py
@@ -0,0 +1,74 @@
+import os
+
+import moto
+import pytest
+from pandas.io.parsers import read_table
+
+HERE = os.path.dirname(__file__)
+
+
+@pytest.fixture(scope='module')
+def tips_file():
+    """Path to the tips dataset"""
+    return os.path.join(HERE, 'parser', 'data', 'tips.csv')
+
+
+@pytest.fixture(scope='module')
+def jsonl_file():
+    """Path to a JSONL dataset"""
+    return os.path.join(HERE, 'parser', 'data', 'items.jsonl')
+
+
+@pytest.fixture(scope='module')
+def salaries_table():
+    """DataFrame with the salaries dataset"""
+    path = os.path.join(HERE, 'parser', 'data', 'salaries.csv')
+    return read_table(path)
+
+
+@pytest.fixture(scope='module')
+def s3_resource(tips_file, jsonl_file):
+    """Fixture for mocking S3 interaction.
+
+    The primary bucket name is "pandas-test". The following datasets
+    are loaded.
+
+    - tips.csv
+    - tips.csv.gz
+    - tips.csv.bz2
+    - items.jsonl
+
+    A private bucket "cant_get_it" is also created. 
The boto3 s3 resource + is yielded by the fixture. + """ + pytest.importorskip('s3fs') + moto.mock_s3().start() + + test_s3_files = [ + ('tips.csv', tips_file), + ('tips.csv.gz', tips_file + '.gz'), + ('tips.csv.bz2', tips_file + '.bz2'), + ('items.jsonl', jsonl_file), + ] + + def add_tips_files(bucket_name): + for s3_key, file_name in test_s3_files: + with open(file_name, 'rb') as f: + conn.Bucket(bucket_name).put_object( + Key=s3_key, + Body=f) + + boto3 = pytest.importorskip('boto3') + # see gh-16135 + bucket = 'pandas-test' + + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket=bucket) + add_tips_files(bucket) + + conn.create_bucket(Bucket='cant_get_it', ACL='private') + add_tips_files('cant_get_it') + + yield conn + + moto.mock_s3().stop() diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6625446bea469..78e33f8966d1f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -4,7 +4,6 @@ from pandas.compat import (range, lrange, StringIO, OrderedDict, is_platform_32bit) import os - import numpy as np from pandas import (Series, DataFrame, DatetimeIndex, Timestamp, read_json, compat) @@ -1030,6 +1029,70 @@ def test_tz_range_is_utc(self): df = DataFrame({'DT': dti}) assert dumps(df, iso_dates=True) == dfexp + def test_read_inline_jsonl(self): + # GH9180 + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_read_s3_jsonl(self, s3_resource): + pytest.importorskip('s3fs') + # GH17200 + + result = read_json('s3n://pandas-test/items.jsonl', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_read_local_jsonl(self): + # GH17200 + with ensure_clean('tmp_items.json') as path: + with open(path, 'w') as infile: + infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') + result = read_json(path, lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_read_jsonl_unicode_chars(self): + # GH15132: non-ascii unicode characters + # \u201d == RIGHT DOUBLE QUOTATION MARK + + # simulate file handle + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + json = StringIO(json) + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + # simulate string + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_to_jsonl(self): + # GH9180 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":1,"b":2}\n{"a":1,"b":2}' + assert result == expected + + df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' + assert result == expected + assert_frame_equal(pd.read_json(result, lines=True), df) + + # GH15096: escaped characters in columns and data + df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], + columns=["a\\", 'b']) + result = df.to_json(orient="records", lines=True) + expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' + 
'{"a\\\\":"foo\\"","b":"bar"}') + assert result == expected + assert_frame_equal(pd.read_json(result, lines=True), df) + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( diff --git a/pandas/tests/io/parser/data/items.jsonl b/pandas/tests/io/parser/data/items.jsonl new file mode 100644 index 0000000000000..f784d37befa82 --- /dev/null +++ b/pandas/tests/io/parser/data/items.jsonl @@ -0,0 +1,2 @@ +{"a": 1, "b": 2} +{"b":2, "a" :1} diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 27cc708889fa2..d00d3f31ce189 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -4,10 +4,7 @@ Tests parsers ability to read and parse non-local files and hence require a network connection to be read. """ -import os - import pytest -import moto import pandas.util.testing as tm from pandas import DataFrame @@ -15,51 +12,6 @@ from pandas.compat import BytesIO -@pytest.fixture(scope='module') -def tips_file(): - return os.path.join(tm.get_data_path(), 'tips.csv') - - -@pytest.fixture(scope='module') -def salaries_table(): - path = os.path.join(tm.get_data_path(), 'salaries.csv') - return read_table(path) - - -@pytest.fixture(scope='module') -def s3_resource(tips_file): - pytest.importorskip('s3fs') - moto.mock_s3().start() - - test_s3_files = [ - ('tips.csv', tips_file), - ('tips.csv.gz', tips_file + '.gz'), - ('tips.csv.bz2', tips_file + '.bz2'), - ] - - def add_tips_files(bucket_name): - for s3_key, file_name in test_s3_files: - with open(file_name, 'rb') as f: - conn.Bucket(bucket_name).put_object( - Key=s3_key, - Body=f) - - boto3 = pytest.importorskip('boto3') - # see gh-16135 - bucket = 'pandas-test' - - conn = boto3.resource("s3", region_name="us-east-1") - conn.create_bucket(Bucket=bucket) - add_tips_files(bucket) - - conn.create_bucket(Bucket='cant_get_it', ACL='private') - add_tips_files('cant_get_it') - - yield conn - - moto.mock_s3().stop() - - @pytest.mark.network @pytest.mark.parametrize( "compression,extension", From 792c868e021c3ae458a44c2adfabc94b6b12d33d Mon Sep 17 00:00:00 2001 From: Yee Mey Date: Mon, 27 Nov 2017 03:36:21 -0800 Subject: [PATCH 64/85] BUG: Ignore division by 0 when merging empty dataframes (#17776) (#17846) (cherry picked from commit 262e8ff367c9291c79c4df0c2daf4713de52abc0) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/core/reshape/merge.py | 3 ++- pandas/tests/reshape/test_merge.py | 6 ++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index bc01ed082b620..5870ff6e7fee2 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -121,7 +121,7 @@ Reshaping - Error message in ``pd.merge_asof()`` for key datatype mismatch now includes datatype of left and right key (:issue:`18068`) - Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`) - Bug in ``DataFrame.filter(...)`` when :class:`unicode` is passed as a condition in Python 2 (:issue:`13101`) -- +- Bug when merging empty DataFrames when ``np.seterr(divide='raise')`` is set (:issue:`17776`) Numeric ^^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 412c00dc95ec0..bdb7ec00a29fd 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1529,7 +1529,8 @@ def _get_join_keys(llab, rlab, shape, sort): rkey = stride * rlab[0].astype('i8', subok=False, copy=False) for i in range(1, nlev): - 
stride //= shape[i] + with np.errstate(divide='ignore'): + stride //= shape[i] lkey += llab[i] * stride rkey += rlab[i] * stride diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 172667c9a0fb8..33d91af21c723 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -861,6 +861,12 @@ def test_validation(self): result = merge(left, right, on=['a', 'b'], validate='1:1') assert_frame_equal(result, expected_multi) + def test_merge_two_empty_df_no_division_error(self): + # GH17776, PR #17846 + a = pd.DataFrame({'a': [], 'b': [], 'c': []}) + with np.errstate(divide='raise'): + merge(a, a, on=('a', 'b')) + def _check_merge(x, y): for how in ['inner', 'left', 'outer']: From 92dc7a0b40ccf84fcc42d452c4277f3e22949525 Mon Sep 17 00:00:00 2001 From: topper-123 Date: Mon, 27 Nov 2017 11:28:13 +0000 Subject: [PATCH 65/85] Added repr string for Grouper and TimeGrouper (#18203) (cherry picked from commit f7c79be4d5bc966a631c9876e272d19a54fd8edf) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/core/groupby.py | 16 +++++++++++++--- pandas/core/resample.py | 19 +++++++------------ pandas/tests/groupby/test_groupby.py | 9 +++++++++ pandas/tests/test_resample.py | 8 ++++++++ 5 files changed, 38 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 5870ff6e7fee2..55be2ec76633a 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -22,7 +22,7 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`Timestamp.timestamp` is now available in Python 2.7. (:issue:`17329`) -- +- :class:`Grouper` and :class:`TimeGrouper` now have a friendly repr output (:issue:`18203`). - .. _whatsnew_0211.deprecations: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index add465e066422..1ff4a22d1fa54 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -206,12 +206,13 @@ class Grouper(object): sort : boolean, default to False whether to sort the resulting labels - additional kwargs to control time-like groupers (when freq is passed) + additional kwargs to control time-like groupers (when ``freq`` is passed) - closed : closed end of interval; left or right - label : interval boundary to use for labeling; left or right + closed : closed end of interval; 'left' or 'right' + label : interval boundary to use for labeling; 'left' or 'right' convention : {'start', 'end', 'e', 's'} If grouper is PeriodIndex + base, loffset Returns ------- @@ -233,6 +234,7 @@ class Grouper(object): >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) """ + _attributes = ('key', 'level', 'freq', 'axis', 'sort') def __new__(cls, *args, **kwargs): if kwargs.get('freq') is not None: @@ -333,6 +335,14 @@ def _set_grouper(self, obj, sort=False): def groups(self): return self.grouper.groups + def __repr__(self): + attrs_list = ["{}={!r}".format(attr_name, getattr(self, attr_name)) + for attr_name in self._attributes + if getattr(self, attr_name) is not None] + attrs = ", ".join(attrs_list) + cls_name = self.__class__.__name__ + return "{}({})".format(cls_name, attrs) + class GroupByPlot(PandasObject): """ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 40946318aa33a..1adb3a078cca3 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1014,22 +1014,18 @@ class TimeGrouper(Grouper): Parameters ---------- freq : pandas date offset or offset alias for identifying bin edges - closed : closed end of interval; left or right - label : interval 
boundary to use for labeling; left or right
-    nperiods : optional, integer
+    closed : closed end of interval; 'left' or 'right'
+    label : interval boundary to use for labeling; 'left' or 'right'
     convention : {'start', 'end', 'e', 's'}
         If axis is PeriodIndex
-
-    Notes
-    -----
-    Use begin, end, nperiods to generate intervals that cannot be derived
-    directly from the associated object
     """
+    _attributes = Grouper._attributes + ('closed', 'label', 'how',
+                                         'loffset', 'kind', 'convention',
+                                         'base')
 
     def __init__(self, freq='Min', closed=None, label=None, how='mean',
-                 nperiods=None, axis=0,
-                 fill_method=None, limit=None, loffset=None, kind=None,
-                 convention=None, base=0, **kwargs):
+                 axis=0, fill_method=None, limit=None, loffset=None,
+                 kind=None, convention=None, base=0, **kwargs):
         freq = to_offset(freq)
 
         end_types = set(['M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W'])
@@ -1048,7 +1044,6 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean',
 
         self.closed = closed
         self.label = label
-        self.nperiods = nperiods
         self.kind = kind
 
         self.convention = convention or 'E'
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 9d25117fbd954..697b60d95bf84 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -28,6 +28,15 @@
 from .common import MixIn
 
 
+class TestGrouper(object):
+
+    def test_repr(self):
+        # GH18203
+        result = repr(pd.Grouper(key='A', level='B'))
+        expected = "Grouper(key='A', level='B', axis=0, sort=False)"
+        assert result == expected
+
+
 class TestGroupBy(MixIn):
 
     def test_basic(self):
diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py
index a703d24952af1..e64bf2217e717 100644
--- a/pandas/tests/test_resample.py
+++ b/pandas/tests/test_resample.py
@@ -3428,3 +3428,11 @@ def test_aggregate_with_nat(self):
 
         # if NaT is included, 'var', 'std', 'mean', 'first','last'
         # and 'nth' doesn't work yet
+
+    def test_repr(self):
+        # GH18203
+        result = repr(TimeGrouper(key='A', freq='H'))
+        expected = ("TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
+                    "closed='left', label='left', how='mean', "
+                    "convention='e', base=0)")
+        assert result == expected

From 75a38029de347e852afa78d795c22894ebc413fb Mon Sep 17 00:00:00 2001
From: topper-123 
Date: Tue, 28 Nov 2017 11:29:40 +0000
Subject: [PATCH 66/85] improved DataFrame/SeriesGroupBy.apply doc string
 (#18534)

(cherry picked from commit 2a0e54bc841f27164b116135ebda4b74bae2fc4a)
---
 pandas/core/groupby.py | 166 ++++++++++++++++++++++++++++++-----------
 1 file changed, 123 insertions(+), 43 deletions(-)

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 1ff4a22d1fa54..99d92e2b0f59e 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -77,6 +77,119 @@
 pandas.Panel.%(name)s
 """
 
+_apply_docs = dict(
+    template="""
+    Apply function ``func`` group-wise and combine the results together.
+
+    The function passed to ``apply`` must take a {input} as its first
+    argument and return a dataframe, a series or a scalar. ``apply`` will
+    then take care of combining the results back together into a single
+    dataframe or series. ``apply`` is therefore a highly flexible
+    grouping method.
+
+    While ``apply`` is a very flexible method, its downside is that
+    using it can be quite a bit slower than using more specific methods.
+    Pandas offers a wide range of methods that will be much faster
+    than using ``apply`` for their specific purposes, so try to use them
+    before reaching for ``apply``.
+ + Parameters + ---------- + func : function + A callable that takes a {input} as its first argument, and + returns a dataframe, a series or a scalar. In addition the + callable may take positional and keyword arguments + args, kwargs : tuple and dict + Optional positional and keyword arguments to pass to ``func`` + + Returns + ------- + applied : Series or DataFrame + + Notes + ----- + In the current implementation ``apply`` calls func twice on the + first group to decide whether it can take a fast or slow code + path. This can lead to unexpected behavior if func has + side-effects, as they will take effect twice for the first + group. + + Examples + -------- + {examples} + + See also + -------- + pipe : Apply function to the full GroupBy object instead of to each + group. + aggregate, transform + """, + dataframe_examples=""" + >>> df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1,2,3], 'C': [4,6, 5]}) + >>> g = df.groupby('A') + + From ``df`` above we can see that ``g`` has two groups, ``a``, ``b``. + Calling ``apply`` in various ways, we can get different grouping results: + + Example 1: below the function passed to ``apply`` takes a dataframe as + its argument and returns a dataframe. ``apply`` combines the result for + each group together into a new dataframe: + + >>> g.apply(lambda x: x / x.sum()) + B C + 0 0.333333 0.4 + 1 0.666667 0.6 + 2 1.000000 1.0 + + Example 2: The function passed to ``apply`` takes a dataframe as + its argument and returns a series. ``apply`` combines the result for + each group together into a new dataframe: + + >>> g.apply(lambda x: x.max() - x.min()) + B C + A + a 1 2 + b 0 0 + + Example 3: The function passed to ``apply`` takes a dataframe as + its argument and returns a scalar. ``apply`` combines the result for + each group together into a series, including setting the index as + appropriate: + + >>> g.apply(lambda x: x.C.max() - x.B.min()) + A + a 5 + b 2 + dtype: int64 + """, + series_examples=""" + >>> ser = pd.Series([0, 1, 2], index='a a b'.split()) + >>> g = ser.groupby(ser.index) + + From ``ser`` above we can see that ``g`` has two groups, ``a``, ``b``. + Calling ``apply`` in various ways, we can get different grouping results: + + Example 1: The function passed to ``apply`` takes a series as + its argument and returns a series. ``apply`` combines the result for + each group together into a new series: + + >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2) + 0 0.0 + 1 0.5 + 2 4.0 + dtype: float64 + + Example 2: The function passed to ``apply`` takes a series as + its argument and returns a scalar. ``apply`` combines the result for + each group together into a series, including setting the index as + appropriate: + + >>> g.apply(lambda x: x.max() - x.min()) + a 1 + b 0 + dtype: int64 + """) + _transform_template = """ Call function producing a like-indexed %(klass)s on each group and return a %(klass)s having the same indexes as the original object @@ -144,6 +257,7 @@ """ + # special case to prevent duplicate plots when catching exceptions when # forwarding methods from NDFrames _plotting_methods = frozenset(['plot', 'boxplot', 'hist']) @@ -663,50 +777,10 @@ def __iter__(self): """ return self.grouper.get_iterator(self.obj, axis=self.axis) - @Substitution(name='groupby') + @Appender(_apply_docs['template'] + .format(input="dataframe", + examples=_apply_docs['dataframe_examples'])) def apply(self, func, *args, **kwargs): - """ - Apply function and combine results together in an intelligent way. 
-
-        The split-apply-combine combination rules attempt to be as common
-        sense based as possible. For example:
-
-        case 1:
-        group DataFrame
-        apply aggregation function (f(chunk) -> Series)
-        yield DataFrame, with group axis having group labels
-
-        case 2:
-        group DataFrame
-        apply transform function ((f(chunk) -> DataFrame with same indexes)
-        yield DataFrame with resulting chunks glued together
-
-        case 3:
-        group Series
-        apply function with f(chunk) -> DataFrame
-        yield DataFrame with result of chunks glued together
-
-        Parameters
-        ----------
-        func : function
-
-        Notes
-        -----
-        See online documentation for full exposition on how to use apply.
-
-        In the current implementation apply calls func twice on the
-        first group to decide whether it can take a fast or slow code
-        path. This can lead to unexpected behavior if func has
-        side-effects, as they will take effect twice for the first
-        group.
-
-
-        See also
-        --------
-        pipe : Apply function to the full GroupBy object instead of to each
-            group.
-        aggregate, transform
-        """
         func = self._is_builtin_func(func)
@@ -3021,6 +3095,12 @@ def _selection_name(self):
     """)
 
+    @Appender(_apply_docs['template']
+              .format(input='series',
+                      examples=_apply_docs['series_examples']))
+    def apply(self, func, *args, **kwargs):
+        return super(SeriesGroupBy, self).apply(func, *args, **kwargs)
+
     @Appender(_agg_doc)
     @Appender(_shared_docs['aggregate'] % dict(
         klass='Series',

From 5348c6e77a6c4ff8c7f3d4fb90197f907508b1ff Mon Sep 17 00:00:00 2001
From: David Stansby 
Date: Thu, 30 Nov 2017 13:55:59 +0000
Subject: [PATCH 67/85] DOC: clarify default window in rolling method (#18177)

(cherry picked from commit c40c8f8b3baccbd658d078816698f85e3268a781)
---
 doc/source/computation.rst | 4 +++-
 pandas/core/window.py      | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/doc/source/computation.rst b/doc/source/computation.rst
index 466ac3c9cbf51..0cdfec63fd696 100644
--- a/doc/source/computation.rst
+++ b/doc/source/computation.rst
@@ -346,7 +346,9 @@ The following methods are available:
     :meth:`~Window.sum`, Sum of values
     :meth:`~Window.mean`, Mean of values
 
-The weights used in the window are specified by the ``win_type`` keyword. The list of recognized types are:
+The weights used in the window are specified by the ``win_type`` keyword.
+The list of recognized types are the `scipy.signal window functions
+<https://docs.scipy.org/doc/scipy/reference/signal.html#window-functions>`__:
 
 - ``boxcar``
 - ``triang``
diff --git a/pandas/core/window.py b/pandas/core/window.py
index 5143dddc5e866..345f9b035a36b 100644
--- a/pandas/core/window.py
+++ b/pandas/core/window.py
@@ -503,6 +503,9 @@ class Window(_Window):
     * ``general_gaussian`` (needs power, width)
     * ``slepian`` (needs width).
 
+    If ``win_type=None`` all points are evenly weighted. To learn more about
+    different window types see `scipy.signal window functions
+    <https://docs.scipy.org/doc/scipy/reference/signal.html#window-functions>`__.
    
""" def validate(self): From 68fd85bf29aa2c5e945d9e00dc46de0db8ddcd0a Mon Sep 17 00:00:00 2001 From: Eric Kisslinger <33908309+ekisslinger@users.noreply.github.com> Date: Thu, 30 Nov 2017 07:11:42 -0800 Subject: [PATCH 68/85] BUG: Fix groupby over a CategoricalIndex in axis=1 (#18525) (cherry picked from commit 5da3759b30167cd5ef5cb02f5bbfb98ac1be1103) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/core/groupby.py | 8 +++++--- pandas/tests/groupby/test_groupby.py | 25 ++++++++++++++++++++++++- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 55be2ec76633a..2e640df53ae27 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -139,6 +139,7 @@ Categorical different ``CategoricalDtype`` (:issue:`18069`) - ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`) - Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`) +- Bug in ``DataFrame.groupby(axis=1)`` with a ``CategoricalIndex`` (:issue:`18432`) String ^^^^^^ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 99d92e2b0f59e..5931f6e009dab 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2931,9 +2931,11 @@ def is_in_obj(gpr): else: in_axis, name = False, None - if is_categorical_dtype(gpr) and len(gpr) != len(obj): - raise ValueError("Categorical dtype grouper must " - "have len(grouper) == len(data)") + if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: + raise ValueError( + ("Length of grouper ({len_gpr}) and axis ({len_axis})" + " must be same length" + .format(len_gpr=len(gpr), len_axis=obj.shape[axis]))) # create the Grouping # allow us to passing the actual Grouping as the gpr diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 697b60d95bf84..675f8d6413b2a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,7 +10,7 @@ from pandas import (date_range, bdate_range, Timestamp, Index, MultiIndex, DataFrame, Series, - concat, Panel, DatetimeIndex) + concat, Panel, DatetimeIndex, CategoricalIndex) from pandas.errors import UnsupportedFunctionCall, PerformanceWarning from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, @@ -262,6 +262,29 @@ def test_grouper_column_and_index(self): expected = df_single.reset_index().groupby(['inner', 'B']).mean() assert_frame_equal(result, expected) + def test_groupby_categorical_index_and_columns(self): + # GH18432 + columns = ['A', 'B', 'A', 'B'] + categories = ['B', 'A'] + data = np.ones((5, 4), int) + cat_columns = CategoricalIndex(columns, + categories=categories, + ordered=True) + df = DataFrame(data=data, columns=cat_columns) + result = df.groupby(axis=1, level=0).sum() + expected_data = 2 * np.ones((5, 2), int) + expected_columns = CategoricalIndex(categories, + categories=categories, + ordered=True) + expected = DataFrame(data=expected_data, columns=expected_columns) + assert_frame_equal(result, expected) + + # test transposed version + df = DataFrame(data.T, index=cat_columns) + result = df.groupby(axis=0, level=0).sum() + expected = DataFrame(data=expected_data.T, index=expected_columns) + assert_frame_equal(result, expected) + def test_grouper_getting_correct_binner(self): # GH 10063 From 309c9c1ca71fefadf9313bdcad37bce25c94e461 Mon Sep 17 00:00:00 2001 From: Tim Swast 
Date: Thu, 30 Nov 2017 17:05:15 -0800
Subject: [PATCH 69/85] Update pandas.read_gbq docs to point to pandas-gbq
 (#18548)

(cherry picked from commit 5cd5e3b81fc3850367bb3e25644cbe3197cdea5a)
---
 doc/source/install.rst | 3 ++-
 pandas/io/gbq.py       | 7 +++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/source/install.rst b/doc/source/install.rst
index c805f84d0faaa..7a9a3b6c177d1 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -258,7 +258,8 @@ Optional Dependencies
   `__, or `xclip <https://github.com/astrand/xclip/>`__: necessary to use
   :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation.
-* For Google BigQuery I/O - see `here `__
+* `pandas-gbq <https://pandas-gbq.readthedocs.io/en/latest/>`__: for Google BigQuery I/O.
+
 * `Backports.lzma <https://pypi.python.org/pypi/backports.lzma/>`__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library.
 * One of the following combinations of libraries is needed to use the
diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py
index b4dc9173f11ba..caa67d1ce6bce 100644
--- a/pandas/io/gbq.py
+++ b/pandas/io/gbq.py
@@ -29,9 +29,8 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
     The main method a user calls to execute a Query in Google BigQuery
     and read results into a pandas DataFrame.
 
-    Google BigQuery API Client Library v2 for Python is used.
-    Documentation is available `here
-    `__
+    This function requires the `pandas-gbq package
+    <https://pandas-gbq.readthedocs.io/en/latest/>`__.
 
     Authentication to the Google BigQuery service is via OAuth 2.0.
 
@@ -70,7 +69,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None,
     dialect : {'legacy', 'standard'}, default 'legacy'
         'legacy' : Use BigQuery's legacy SQL dialect.
-        'standard' : Use BigQuery's standard SQL, which is
+        'standard' : Use BigQuery's standard SQL, which is
         compliant with the SQL 2011 standard. For more information
         see `BigQuery SQL Reference
         <https://cloud.google.com/bigquery/sql-reference/>`__

From aa878765d6cac7703b55819b5756550c4ae60081 Mon Sep 17 00:00:00 2001
From: Chris Mazzullo 
Date: Thu, 30 Nov 2017 09:26:21 -0500
Subject: [PATCH 70/85] DOC: header='infer' is not working when there is no
 header, closes #17473 (#18042)

(cherry picked from commit 67c4d0f4f9f45b981d3e6cb07521f9c0bbb459d7)
---
 doc/source/io.rst    | 31 ++++++++++++++++++++++---------
 pandas/io/parsers.py | 22 +++++++++++++---------
 2 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/doc/source/io.rst b/doc/source/io.rst
index 7418617ae9004..8269c1a69c95b 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -103,15 +103,20 @@ Column and Index Locations and Names
 ++++++++++++++++++++++++++++++++++++
 
 header : int or list of ints, default ``'infer'``
-  Row number(s) to use as the column names, and the start of the data. Default
-  behavior is as if ``header=0`` if no ``names`` passed, otherwise as if
-  ``header=None``. Explicitly pass ``header=0`` to be able to replace existing
-  names. The header can be a list of ints that specify row locations for a
-  multi-index on the columns e.g. ``[0,1,3]``. Intervening rows that are not
-  specified will be skipped (e.g. 2 in this example is skipped). Note that
-  this parameter ignores commented lines and empty lines if
-  ``skip_blank_lines=True``, so header=0 denotes the first line of data
-  rather than the first line of the file.
+  Row number(s) to use as the column names, and the start of the
+  data. 
Default behavior is to infer the column names: if no names are + passed the behavior is identical to ``header=0`` and column names + are inferred from the first line of the file, if column names are + passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to replace + existing names. + + The header can be a list of ints that specify row locations + for a multi-index on the columns e.g. ``[0,1,3]``. Intervening rows + that are not specified will be skipped (e.g. 2 in this example is + skipped). Note that this parameter ignores commented lines and empty + lines if ``skip_blank_lines=True``, so header=0 denotes the first + line of data rather than the first line of the file. names : array-like, default ``None`` List of column names to use. If file contains no header row, then you should explicitly pass ``header=None``. Duplicates in this list will cause @@ -553,6 +558,14 @@ If the header is in a row other than the first, pass the row number to data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' pd.read_csv(StringIO(data), header=1) +.. note:: + + Default behavior is to infer the column names: if no names are + passed the behavior is identical to ``header=0`` and column names + are inferred from the first nonblank line of the file, if column + names are passed explicitly then the behavior is identical to + ``header=None``. + .. _io.dupe_names: Duplicate names parsing diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index cf181f1de938b..e4b221b1768ec 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -74,15 +74,19 @@ .. versionadded:: 0.18.1 support for the Python parser. header : int or list of ints, default 'infer' - Row number(s) to use as the column names, and the start of the data. - Default behavior is as if set to 0 if no ``names`` passed, otherwise - ``None``. Explicitly pass ``header=0`` to be able to replace existing - names. The header can be a list of integers that specify row locations for - a multi-index on the columns e.g. [0,1,3]. Intervening rows that are not - specified will be skipped (e.g. 2 in this example is skipped). Note that - this parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so header=0 denotes the first line of data - rather than the first line of the file. + Row number(s) to use as the column names, and the start of the + data. Default behavior is to infer the column names: if no names + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a multi-index on the columns + e.g. [0,1,3]. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so header=0 denotes the first line of + data rather than the first line of the file. names : array-like, default None List of column names to use. If file contains no header row, then you should explicitly pass header=None. 
Duplicates in this list will cause From 17cfc2d8cea47b1d61c0b94cc3d5dbf98a83aec3 Mon Sep 17 00:00:00 2001 From: fjdiod Date: Sat, 2 Dec 2017 20:43:01 +0300 Subject: [PATCH 71/85] BUG: Unwanted conversion from timedelta to float (#18493) (#18586) (cherry picked from commit 0e168188811677f9de72a6a5b97253e551b6b04a) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/core/internals.py | 3 ++- pandas/tests/indexing/test_timedelta.py | 21 +++++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 2e640df53ae27..e31396b0cb8ff 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -74,6 +74,7 @@ Indexing - Bug where a ``MultiIndex`` with more than a million records was not raising ``AttributeError`` when trying to access a missing attribute (:issue:`18165`) - Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`) - Bug in ``Index.putmask`` when an invalid mask passed (:issue:`18368`) +- Bug in masked assignment of a ``timedelta64[ns]`` dtype ``Series``, incorrectly coerced to float (:issue:`18493`) - I/O diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 045580d393b26..b929dfd5a9d0b 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1946,7 +1946,8 @@ def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.timedelta64) - return isinstance(element, (timedelta, np.timedelta64)) + return is_integer(element) or isinstance( + element, (timedelta, np.timedelta64)) def fillna(self, value, **kwargs): diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index 32609362e49af..3ad3b771b2ab2 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -2,6 +2,7 @@ import pandas as pd from pandas.util import testing as tm +import numpy as np class TestTimedeltaIndexing(object): @@ -47,3 +48,23 @@ def test_string_indexing(self): expected = df.iloc[0] sliced = df.loc['0 days'] tm.assert_series_equal(sliced, expected) + + @pytest.mark.parametrize( + "value", + [None, pd.NaT, np.nan]) + def test_masked_setitem(self, value): + # issue (#18586) + series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series[series == series[0]] = value + expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + tm.assert_series_equal(series, expected) + + @pytest.mark.parametrize( + "value", + [None, pd.NaT, np.nan]) + def test_listlike_setitem(self, value): + # issue (#18586) + series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series.iloc[0] = value + expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + tm.assert_series_equal(series, expected) From 53e0c93a8bf4197c89e36d7ef00dcc2d2b85434f Mon Sep 17 00:00:00 2001 From: Aaron Critchley Date: Mon, 4 Dec 2017 11:27:29 +0000 Subject: [PATCH 72/85] DOC: Remove keep=False docs on nlargest/nsmallest (#18617) (cherry picked from commit 73ed6de17ca390418d23a5698cf4db78aa8b7b80) --- pandas/core/frame.py | 4 ++-- pandas/core/series.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index eef787a1dd912..cdc11cc7dc9df 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3751,7 +3751,7 @@ def nlargest(self, n, columns, keep='first'): Number of items to retrieve columns : list or str Column name or names to 
order by - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. @@ -3788,7 +3788,7 @@ def nsmallest(self, n, columns, keep='first'): Number of items to retrieve columns : list or str Column name or names to order by - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. diff --git a/pandas/core/series.py b/pandas/core/series.py index 59606e86465c5..2b4f9c4c6f7e3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2023,7 +2023,7 @@ def nlargest(self, n=5, keep='first'): ---------- n : int Return this many descending sorted values - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. @@ -2070,7 +2070,7 @@ def nsmallest(self, n=5, keep='first'): ---------- n : int Return this many ascending sorted values - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. From 01dd5a7f424cba24d1228f8e62fe6ca6fa155f17 Mon Sep 17 00:00:00 2001 From: David Fischer Date: Mon, 4 Dec 2017 13:55:54 +0100 Subject: [PATCH 73/85] json_normalize: Make code more pythonic and avoid modification of meta if mutable (#18610) (cherry picked from commit 2c903d594299b2441d4742e777a10e8c76557386) --- doc/source/whatsnew/v0.21.1.txt | 1 + pandas/io/json/normalize.py | 6 ++---- pandas/tests/io/json/test_normalize.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index e31396b0cb8ff..e323dd613b3c7 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -90,6 +90,7 @@ I/O - Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`). 
- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`) - Bug in :func:`read_json` not decoding when reading line deliminted JSON from S3 (:issue:`17200`) +- Bug in :func:`pandas.io.json.json_normalize` to avoid modification of ``meta`` (:issue:`18610`) Plotting diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index e811dd1eab142..23d2f730d070c 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -181,7 +181,7 @@ def _pull_field(js, spec): return result - if isinstance(data, list) and len(data) is 0: + if isinstance(data, list) and not data: return DataFrame() # A bit of a hackjob @@ -207,9 +207,7 @@ def _pull_field(js, spec): elif not isinstance(meta, list): meta = [meta] - for i, x in enumerate(meta): - if not isinstance(x, list): - meta[i] = [x] + meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now records = [] diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 49b765b18d623..1cceae32cd748 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -173,6 +173,21 @@ def test_meta_name_conflict(self): for val in ['metafoo', 'metabar', 'foo', 'bar']: assert val in result + def test_meta_parameter_not_modified(self): + # GH 18610 + data = [{'foo': 'hello', + 'bar': 'there', + 'data': [{'foo': 'something', 'bar': 'else'}, + {'foo': 'something2', 'bar': 'else2'}]}] + + COLUMNS = ['foo', 'bar'] + result = json_normalize(data, 'data', meta=COLUMNS, + meta_prefix='meta') + + assert COLUMNS == ['foo', 'bar'] + for val in ['metafoo', 'metabar', 'foo', 'bar']: + assert val in result + def test_record_prefix(self, state_data): result = json_normalize(state_data[0], 'counties') expected = DataFrame(state_data[0]['counties']) From 02eac02c0f3f50118bfbe72c1b5c2200f9e9e0b8 Mon Sep 17 00:00:00 2001 From: jschendel Date: Sun, 19 Nov 2017 17:25:28 -0700 Subject: [PATCH 74/85] BUG: Fix IntervalIndex constructor and copy with non-default closed (#18340) (cherry picked from commit 1915ffc53ea60494f24d83844bbff00efa392c82) --- doc/source/whatsnew/v0.21.1.txt | 3 - pandas/core/indexes/interval.py | 7 +- pandas/tests/indexes/test_interval.py | 440 +++++++++++++++----------- 3 files changed, 265 insertions(+), 185 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index e323dd613b3c7..f690e6f95aba6 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -54,9 +54,6 @@ Documentation Changes .. 
_whatsnew_0211.bug_fixes: -Bug Fixes -~~~~~~~~~ - Conversion ^^^^^^^^^^ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 7bf7cfce515a1..9619f5403b761 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -179,7 +179,7 @@ def __new__(cls, data, closed='right', if isinstance(data, IntervalIndex): left = data.left right = data.right - + closed = data.closed else: # don't allow scalars @@ -187,7 +187,7 @@ def __new__(cls, data, closed='right', cls._scalar_data_error(data) data = IntervalIndex.from_intervals(data, name=name) - left, right = data.left, data.right + left, right, closed = data.left, data.right, data.closed return cls._simple_new(left, right, closed, name, copy=copy, verify_integrity=verify_integrity) @@ -569,7 +569,8 @@ def copy(self, deep=False, name=None): left = self.left.copy(deep=True) if deep else self.left right = self.right.copy(deep=True) if deep else self.right name = name if name is not None else self.name - return type(self).from_arrays(left, right, name=name) + closed = self.closed + return type(self).from_arrays(left, right, closed=closed, name=name) @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index b55bab3a210cc..399d88309072e 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -6,6 +6,7 @@ from pandas import (Interval, IntervalIndex, Index, isna, interval_range, Timestamp, Timedelta, compat, date_range, timedelta_range, DateOffset) +from pandas.compat import zip from pandas.tseries.offsets import Day from pandas._libs.interval import IntervalTree from pandas.tests.indexes.common import Base @@ -13,6 +14,11 @@ import pandas as pd +@pytest.fixture(scope='class', params=['left', 'right', 'both', 'neither']) +def closed(request): + return request.param + + class TestIntervalIndex(Base): _holder = IntervalIndex @@ -22,34 +28,63 @@ def setup_method(self, method): [(0, 1), np.nan, (1, 2)]) self.indices = dict(intervalIndex=tm.makeIntervalIndex(10)) - def create_index(self): - return IntervalIndex.from_breaks(np.arange(10)) + def create_index(self, closed='right'): + return IntervalIndex.from_breaks(np.arange(3), closed=closed) - def test_constructors(self): - expected = self.index - actual = IntervalIndex.from_breaks(np.arange(3), closed='right') - assert expected.equals(actual) + def create_index_with_nan(self, closed='right'): + return IntervalIndex.from_tuples( + [(0, 1), np.nan, (1, 2)], closed=closed) - alternate = IntervalIndex.from_breaks(np.arange(3), closed='left') - assert not expected.equals(alternate) + @pytest.mark.parametrize('name', [None, 'foo']) + def test_constructors(self, closed, name): + left, right = Index([0, 1, 2, 3]), Index([1, 2, 3, 4]) + ivs = [Interval(l, r, closed=closed) for l, r in zip(left, right)] + expected = IntervalIndex._simple_new( + left=left, right=right, closed=closed, name=name) - actual = IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) - assert expected.equals(actual) + result = IntervalIndex(ivs, name=name) + tm.assert_index_equal(result, expected) - actual = IntervalIndex([Interval(0, 1), Interval(1, 2)]) - assert expected.equals(actual) + result = IntervalIndex.from_intervals(ivs, name=name) + tm.assert_index_equal(result, expected) - actual = IntervalIndex.from_arrays(np.arange(2), np.arange(2) + 1, - closed='right') - assert expected.equals(actual) + result = 
IntervalIndex.from_breaks( + np.arange(5), closed=closed, name=name) + tm.assert_index_equal(result, expected) - actual = Index([Interval(0, 1), Interval(1, 2)]) - assert isinstance(actual, IntervalIndex) - assert expected.equals(actual) + result = IntervalIndex.from_arrays( + left.values, right.values, closed=closed, name=name) + tm.assert_index_equal(result, expected) - actual = Index(expected) - assert isinstance(actual, IntervalIndex) - assert expected.equals(actual) + result = IntervalIndex.from_tuples( + zip(left, right), closed=closed, name=name) + tm.assert_index_equal(result, expected) + + result = Index(ivs, name=name) + assert isinstance(result, IntervalIndex) + tm.assert_index_equal(result, expected) + + # idempotent + tm.assert_index_equal(Index(expected), expected) + tm.assert_index_equal(IntervalIndex(expected), expected) + + result = IntervalIndex.from_intervals( + expected.values, name=expected.name) + tm.assert_index_equal(result, expected) + + left, right = expected.left, expected.right + result = IntervalIndex.from_arrays( + left, right, closed=expected.closed, name=expected.name) + tm.assert_index_equal(result, expected) + + result = IntervalIndex.from_tuples( + expected.to_tuples(), closed=expected.closed, name=expected.name) + tm.assert_index_equal(result, expected) + + breaks = expected.left.tolist() + [expected.right[-1]] + result = IntervalIndex.from_breaks( + breaks, closed=expected.closed, name=expected.name) + tm.assert_index_equal(result, expected) def test_constructors_other(self): @@ -66,43 +101,57 @@ def test_constructors_other(self): def test_constructors_errors(self): # scalar - with pytest.raises(TypeError): + msg = ('IntervalIndex(...) must be called with a collection of ' + 'some kind, 5 was passed') + with pytest.raises(TypeError, message=msg): IntervalIndex(5) # not an interval - with pytest.raises(TypeError): + msg = "type with value 0 is not an interval" + with pytest.raises(TypeError, message=msg): IntervalIndex([0, 1]) - with pytest.raises(TypeError): + with pytest.raises(TypeError, message=msg): IntervalIndex.from_intervals([0, 1]) # invalid closed - with pytest.raises(ValueError): + msg = "invalid options for 'closed': invalid" + with pytest.raises(ValueError, message=msg): IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid') # mismatched closed - with pytest.raises(ValueError): + msg = 'intervals must all be closed on the same side' + with pytest.raises(ValueError, message=msg): IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2, closed='left')]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, message=msg): IntervalIndex.from_arrays([0, 10], [3, 5]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, message=msg): Index([Interval(0, 1), Interval(2, 3, closed='left')]) # no point in nesting periods in an IntervalIndex - with pytest.raises(ValueError): + msg = 'Period dtypes are not supported, use a PeriodIndex instead' + with pytest.raises(ValueError, message=msg): IntervalIndex.from_breaks( pd.period_range('2000-01-01', periods=3)) - def test_constructors_datetimelike(self): + # decreasing breaks/arrays + msg = 'left side of interval must be <= right side' + with pytest.raises(ValueError, message=msg): + IntervalIndex.from_breaks(range(10, -1, -1)) + + with pytest.raises(ValueError, message=msg): + IntervalIndex.from_arrays(range(10, -1, -1), range(9, -2, -1)) + + def test_constructors_datetimelike(self, closed): # DTI / TDI for idx in [pd.date_range('20130101', periods=5), 
pd.timedelta_range('1 day', periods=5)]: - result = IntervalIndex.from_breaks(idx) - expected = IntervalIndex.from_breaks(idx.values) + result = IntervalIndex.from_breaks(idx, closed=closed) + expected = IntervalIndex.from_breaks(idx.values, closed=closed) tm.assert_index_equal(result, expected) expected_scalar_type = type(idx[0]) @@ -117,8 +166,8 @@ def f(): IntervalIndex.from_intervals([0.997, 4.0]) pytest.raises(TypeError, f) - def test_properties(self): - index = self.index + def test_properties(self, closed): + index = self.create_index(closed=closed) assert len(index) == 2 assert index.size == 2 assert index.shape == (2, ) @@ -127,14 +176,15 @@ def test_properties(self): tm.assert_index_equal(index.right, Index([1, 2])) tm.assert_index_equal(index.mid, Index([0.5, 1.5])) - assert index.closed == 'right' + assert index.closed == closed - expected = np.array([Interval(0, 1), Interval(1, 2)], dtype=object) + expected = np.array([Interval(0, 1, closed=closed), + Interval(1, 2, closed=closed)], dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) tm.assert_numpy_array_equal(index.values, expected) # with nans - index = self.index_with_nan + index = self.create_index_with_nan(closed=closed) assert len(index) == 3 assert index.size == 3 assert index.shape == (3, ) @@ -143,41 +193,43 @@ def test_properties(self): tm.assert_index_equal(index.right, Index([1, np.nan, 2])) tm.assert_index_equal(index.mid, Index([0.5, np.nan, 1.5])) - assert index.closed == 'right' + assert index.closed == closed - expected = np.array([Interval(0, 1), np.nan, - Interval(1, 2)], dtype=object) + expected = np.array([Interval(0, 1, closed=closed), np.nan, + Interval(1, 2, closed=closed)], dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) tm.assert_numpy_array_equal(index.values, expected) - def test_with_nans(self): - index = self.index + def test_with_nans(self, closed): + index = self.create_index(closed=closed) assert not index.hasnans tm.assert_numpy_array_equal(index.isna(), np.array([False, False])) tm.assert_numpy_array_equal(index.notna(), np.array([True, True])) - index = self.index_with_nan + index = self.create_index_with_nan(closed=closed) assert index.hasnans tm.assert_numpy_array_equal(index.notna(), np.array([True, False, True])) tm.assert_numpy_array_equal(index.isna(), np.array([False, True, False])) - def test_copy(self): - actual = self.index.copy() - assert actual.equals(self.index) + def test_copy(self, closed): + expected = IntervalIndex.from_breaks(np.arange(5), closed=closed) + + result = expected.copy() + assert result.equals(expected) - actual = self.index.copy(deep=True) - assert actual.equals(self.index) - assert actual.left is not self.index.left + result = expected.copy(deep=True) + assert result.equals(expected) + assert result.left is not expected.left - def test_ensure_copied_data(self): + def test_ensure_copied_data(self, closed): # exercise the copy flag in the constructor # not copying - index = self.index + index = self.create_index(closed=closed) result = IntervalIndex(index, copy=False) tm.assert_numpy_array_equal(index.left.values, result.left.values, check_same='same') @@ -191,23 +243,34 @@ def test_ensure_copied_data(self): tm.assert_numpy_array_equal(index.right.values, result.right.values, check_same='copy') - def test_equals(self): + def test_equals(self, closed): + expected = IntervalIndex.from_breaks(np.arange(5), closed=closed) + assert expected.equals(expected) + assert expected.equals(expected.copy()) - idx = self.index - 
assert idx.equals(idx) - assert idx.equals(idx.copy()) + assert not expected.equals(expected.astype(object)) + assert not expected.equals(np.array(expected)) + assert not expected.equals(list(expected)) - assert not idx.equals(idx.astype(object)) - assert not idx.equals(np.array(idx)) - assert not idx.equals(list(idx)) + assert not expected.equals([1, 2]) + assert not expected.equals(np.array([1, 2])) + assert not expected.equals(pd.date_range('20130101', periods=2)) - assert not idx.equals([1, 2]) - assert not idx.equals(np.array([1, 2])) - assert not idx.equals(pd.date_range('20130101', periods=2)) + expected_name1 = IntervalIndex.from_breaks( + np.arange(5), closed=closed, name='foo') + expected_name2 = IntervalIndex.from_breaks( + np.arange(5), closed=closed, name='bar') + assert expected.equals(expected_name1) + assert expected_name1.equals(expected_name2) - def test_astype(self): + for other_closed in {'left', 'right', 'both', 'neither'} - {closed}: + expected_other_closed = IntervalIndex.from_breaks( + np.arange(5), closed=other_closed) + assert not expected.equals(expected_other_closed) - idx = self.index + def test_astype(self, closed): + + idx = self.create_index(closed=closed) for dtype in [np.int64, np.float64, 'datetime64[ns]', 'datetime64[ns, US/Eastern]', 'timedelta64', @@ -227,24 +290,24 @@ def test_astype(self): expected = pd.Categorical(idx, ordered=True) tm.assert_categorical_equal(result, expected) - def test_where(self): - expected = self.index - result = self.index.where(self.index.notna()) + def test_where(self, closed): + expected = self.create_index(closed=closed) + result = expected.where(expected.notna()) tm.assert_index_equal(result, expected) - idx = IntervalIndex.from_breaks([1, 2]) + idx = IntervalIndex.from_breaks([1, 2], closed=closed) result = idx.where([True, False]) expected = IntervalIndex.from_intervals( - [Interval(1.0, 2.0, closed='right'), np.nan]) + [Interval(1.0, 2.0, closed=closed), np.nan]) tm.assert_index_equal(result, expected) def test_where_array_like(self): pass - def test_delete(self): - expected = IntervalIndex.from_breaks([1, 2]) - actual = self.index.delete(0) - assert expected.equals(actual) + def test_delete(self, closed): + expected = IntervalIndex.from_breaks([1, 2], closed=closed) + result = self.create_index(closed=closed).delete(0) + tm.assert_index_equal(result, expected) def test_insert(self): expected = IntervalIndex.from_breaks(range(4)) @@ -255,113 +318,128 @@ def test_insert(self): pytest.raises(ValueError, self.index.insert, 0, Interval(2, 3, closed='left')) - def test_take(self): - actual = self.index.take([0, 1]) - assert self.index.equals(actual) + def test_take(self, closed): + index = self.create_index(closed=closed) - expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2]) - actual = self.index.take([0, 0, 1]) - assert expected.equals(actual) + actual = index.take([0, 1]) + tm.assert_index_equal(actual, index) + + expected = IntervalIndex.from_arrays( + [0, 0, 1], [1, 1, 2], closed=closed) + actual = index.take([0, 0, 1]) + tm.assert_index_equal(actual, expected) - def test_unique(self): + def test_unique(self, closed): # unique non-overlapping - idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + idx = IntervalIndex.from_tuples( + [(0, 1), (2, 3), (4, 5)], closed=closed) assert idx.is_unique # unique overlapping - distinct endpoints - idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)]) + idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)], closed=closed) assert idx.is_unique # unique overlapping - 
shared endpoints - idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)]) + idx = pd.IntervalIndex.from_tuples( + [(1, 2), (1, 3), (2, 3)], closed=closed) assert idx.is_unique # unique nested - idx = IntervalIndex.from_tuples([(-1, 1), (-2, 2)]) + idx = IntervalIndex.from_tuples([(-1, 1), (-2, 2)], closed=closed) assert idx.is_unique # duplicate - idx = IntervalIndex.from_tuples([(0, 1), (0, 1), (2, 3)]) + idx = IntervalIndex.from_tuples( + [(0, 1), (0, 1), (2, 3)], closed=closed) assert not idx.is_unique # unique mixed - idx = IntervalIndex.from_tuples([(0, 1), ('a', 'b')]) + idx = IntervalIndex.from_tuples([(0, 1), ('a', 'b')], closed=closed) assert idx.is_unique # duplicate mixed - idx = IntervalIndex.from_tuples([(0, 1), ('a', 'b'), (0, 1)]) + idx = IntervalIndex.from_tuples( + [(0, 1), ('a', 'b'), (0, 1)], closed=closed) assert not idx.is_unique # empty - idx = IntervalIndex([]) + idx = IntervalIndex([], closed=closed) assert idx.is_unique - def test_monotonic(self): + def test_monotonic(self, closed): # increasing non-overlapping - idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + idx = IntervalIndex.from_tuples( + [(0, 1), (2, 3), (4, 5)], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # decreasing non-overlapping - idx = IntervalIndex.from_tuples([(4, 5), (2, 3), (1, 2)]) + idx = IntervalIndex.from_tuples( + [(4, 5), (2, 3), (1, 2)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert idx._is_strictly_monotonic_decreasing # unordered non-overlapping - idx = IntervalIndex.from_tuples([(0, 1), (4, 5), (2, 3)]) + idx = IntervalIndex.from_tuples( + [(0, 1), (4, 5), (2, 3)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # increasing overlapping - idx = IntervalIndex.from_tuples([(0, 2), (0.5, 2.5), (1, 3)]) + idx = IntervalIndex.from_tuples( + [(0, 2), (0.5, 2.5), (1, 3)], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # decreasing overlapping - idx = IntervalIndex.from_tuples([(1, 3), (0.5, 2.5), (0, 2)]) + idx = IntervalIndex.from_tuples( + [(1, 3), (0.5, 2.5), (0, 2)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert idx._is_strictly_monotonic_decreasing # unordered overlapping - idx = IntervalIndex.from_tuples([(0.5, 2.5), (0, 2), (1, 3)]) + idx = IntervalIndex.from_tuples( + [(0.5, 2.5), (0, 2), (1, 3)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # increasing overlapping shared endpoints - idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)]) + idx = pd.IntervalIndex.from_tuples( + [(1, 2), (1, 3), (2, 3)], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # decreasing overlapping shared endpoints - idx = pd.IntervalIndex.from_tuples([(2, 3), (1, 3), (1, 2)]) + idx = pd.IntervalIndex.from_tuples( + [(2, 3), (1, 3), (1, 2)], closed=closed) assert not 
idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert idx._is_strictly_monotonic_decreasing # stationary - idx = IntervalIndex.from_tuples([(0, 1), (0, 1)]) + idx = IntervalIndex.from_tuples([(0, 1), (0, 1)], closed=closed) assert idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # empty - idx = IntervalIndex([]) + idx = IntervalIndex([], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing @@ -395,24 +473,24 @@ def test_repr_max_seq_item_setting(self): def test_repr_roundtrip(self): super(TestIntervalIndex, self).test_repr_roundtrip() - def test_get_item(self): + def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), - closed='right') - assert i[0] == Interval(0.0, 1.0) - assert i[1] == Interval(1.0, 2.0) + closed=closed) + assert i[0] == Interval(0.0, 1.0, closed=closed) + assert i[1] == Interval(1.0, 2.0, closed=closed) assert isna(i[2]) result = i[0:1] - expected = IntervalIndex.from_arrays((0.,), (1.,), closed='right') + expected = IntervalIndex.from_arrays((0.,), (1.,), closed=closed) tm.assert_index_equal(result, expected) result = i[0:2] - expected = IntervalIndex.from_arrays((0., 1), (1., 2.), closed='right') + expected = IntervalIndex.from_arrays((0., 1), (1., 2.), closed=closed) tm.assert_index_equal(result, expected) result = i[1:3] expected = IntervalIndex.from_arrays((1., np.nan), (2., np.nan), - closed='right') + closed=closed) tm.assert_index_equal(result, expected) def test_get_loc_value(self): @@ -581,20 +659,22 @@ def testcontains(self): assert not i.contains(20) assert not i.contains(-20) - def test_dropna(self): + def test_dropna(self, closed): - expected = IntervalIndex.from_tuples([(0.0, 1.0), (1.0, 2.0)]) + expected = IntervalIndex.from_tuples( + [(0.0, 1.0), (1.0, 2.0)], closed=closed) - ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan]) + ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan], closed=closed) result = ii.dropna() tm.assert_index_equal(result, expected) - ii = IntervalIndex.from_arrays([0, 1, np.nan], [1, 2, np.nan]) + ii = IntervalIndex.from_arrays( + [0, 1, np.nan], [1, 2, np.nan], closed=closed) result = ii.dropna() tm.assert_index_equal(result, expected) - def test_non_contiguous(self): - index = IntervalIndex.from_tuples([(0, 1), (2, 3)]) + def test_non_contiguous(self, closed): + index = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed) target = [0.5, 1.5, 2.5] actual = index.get_indexer(target) expected = np.array([0, -1, 1], dtype='intp') @@ -602,31 +682,32 @@ def test_non_contiguous(self): assert 1.5 not in index - def test_union(self): - other = IntervalIndex.from_arrays([2], [3]) - expected = IntervalIndex.from_arrays(range(3), range(1, 4)) - actual = self.index.union(other) + def test_union(self, closed): + idx = self.create_index(closed=closed) + other = IntervalIndex.from_arrays([2], [3], closed=closed) + expected = IntervalIndex.from_arrays( + range(3), range(1, 4), closed=closed) + actual = idx.union(other) assert expected.equals(actual) - actual = other.union(self.index) + actual = other.union(idx) assert expected.equals(actual) - tm.assert_index_equal(self.index.union(self.index), self.index) - tm.assert_index_equal(self.index.union(self.index[:1]), - self.index) + tm.assert_index_equal(idx.union(idx), idx) + tm.assert_index_equal(idx.union(idx[:1]), idx) - def 
test_intersection(self): - other = IntervalIndex.from_breaks([1, 2, 3]) - expected = IntervalIndex.from_breaks([1, 2]) - actual = self.index.intersection(other) + def test_intersection(self, closed): + idx = self.create_index(closed=closed) + other = IntervalIndex.from_breaks([1, 2, 3], closed=closed) + expected = IntervalIndex.from_breaks([1, 2], closed=closed) + actual = idx.intersection(other) assert expected.equals(actual) - tm.assert_index_equal(self.index.intersection(self.index), - self.index) + tm.assert_index_equal(idx.intersection(idx), idx) - def test_difference(self): - tm.assert_index_equal(self.index.difference(self.index[:1]), - self.index[1:]) + def test_difference(self, closed): + idx = self.create_index(closed=closed) + tm.assert_index_equal(idx.difference(idx[:1]), idx[1:]) def test_symmetric_difference(self): result = self.index[:1].symmetric_difference(self.index[1:]) @@ -639,11 +720,12 @@ def test_set_operation_errors(self): other = IntervalIndex.from_breaks([0, 1, 2], closed='neither') pytest.raises(ValueError, self.index.union, other) - def test_isin(self): - actual = self.index.isin(self.index) + def test_isin(self, closed): + idx = self.create_index(closed=closed) + actual = idx.isin(idx) tm.assert_numpy_array_equal(np.array([True, True]), actual) - actual = self.index.isin(self.index[:1]) + actual = idx.isin(idx[:1]) tm.assert_numpy_array_equal(np.array([True, False]), actual) def test_comparison(self): @@ -702,25 +784,28 @@ def test_comparison(self): with pytest.raises(ValueError): self.index > np.arange(3) - def test_missing_values(self): - idx = pd.Index([np.nan, pd.Interval(0, 1), pd.Interval(1, 2)]) - idx2 = pd.IntervalIndex.from_arrays([np.nan, 0, 1], [np.nan, 1, 2]) + def test_missing_values(self, closed): + idx = Index([np.nan, Interval(0, 1, closed=closed), + Interval(1, 2, closed=closed)]) + idx2 = IntervalIndex.from_arrays( + [np.nan, 0, 1], [np.nan, 1, 2], closed=closed) assert idx.equals(idx2) with pytest.raises(ValueError): - IntervalIndex.from_arrays([np.nan, 0, 1], np.array([0, 1, 2])) + IntervalIndex.from_arrays( + [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed) tm.assert_numpy_array_equal(isna(idx), np.array([True, False, False])) - def test_sort_values(self): - expected = IntervalIndex.from_breaks([1, 2, 3, 4]) - actual = IntervalIndex.from_tuples([(3, 4), (1, 2), - (2, 3)]).sort_values() + def test_sort_values(self, closed): + expected = IntervalIndex.from_breaks([1, 2, 3, 4], closed=closed) + actual = IntervalIndex.from_tuples( + [(3, 4), (1, 2), (2, 3)], closed=closed).sort_values() tm.assert_index_equal(expected, actual) # nan - idx = self.index_with_nan + idx = self.create_index_with_nan(closed=closed) mask = idx.isna() tm.assert_numpy_array_equal(mask, np.array([False, True, False])) @@ -733,84 +818,83 @@ def test_sort_values(self): tm.assert_numpy_array_equal(mask, np.array([True, False, False])) def test_datetime(self): - dates = pd.date_range('2000', periods=3) + dates = date_range('2000', periods=3) idx = IntervalIndex.from_breaks(dates) tm.assert_index_equal(idx.left, dates[:2]) tm.assert_index_equal(idx.right, dates[-2:]) - expected = pd.date_range('2000-01-01T12:00', periods=2) + expected = date_range('2000-01-01T12:00', periods=2) tm.assert_index_equal(idx.mid, expected) - assert pd.Timestamp('2000-01-01T12') not in idx - assert pd.Timestamp('2000-01-01T12') not in idx + assert Timestamp('2000-01-01T12') not in idx + assert Timestamp('2000-01-01T12') not in idx - target = pd.date_range('1999-12-31T12:00', periods=7, 
freq='12H') + target = date_range('1999-12-31T12:00', periods=7, freq='12H') actual = idx.get_indexer(target) expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='intp') tm.assert_numpy_array_equal(actual, expected) - def test_append(self): + def test_append(self, closed): - index1 = IntervalIndex.from_arrays([0, 1], [1, 2]) - index2 = IntervalIndex.from_arrays([1, 2], [2, 3]) + index1 = IntervalIndex.from_arrays([0, 1], [1, 2], closed=closed) + index2 = IntervalIndex.from_arrays([1, 2], [2, 3], closed=closed) result = index1.append(index2) - expected = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3]) + expected = IntervalIndex.from_arrays( + [0, 1, 1, 2], [1, 2, 2, 3], closed=closed) tm.assert_index_equal(result, expected) result = index1.append([index1, index2]) - expected = IntervalIndex.from_arrays([0, 1, 0, 1, 1, 2], - [1, 2, 1, 2, 2, 3]) + expected = IntervalIndex.from_arrays( + [0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed) tm.assert_index_equal(result, expected) - def f(): - index1.append(IntervalIndex.from_arrays([0, 1], [1, 2], - closed='both')) - - pytest.raises(ValueError, f) + msg = ('can only append two IntervalIndex objects that are closed ' + 'on the same side') + for other_closed in {'left', 'right', 'both', 'neither'} - {closed}: + index_other_closed = IntervalIndex.from_arrays( + [0, 1], [1, 2], closed=other_closed) + with tm.assert_raises_regex(ValueError, msg): + index1.append(index_other_closed) - def test_is_non_overlapping_monotonic(self): + def test_is_non_overlapping_monotonic(self, closed): # Should be True in all cases tpls = [(0, 1), (2, 3), (4, 5), (6, 7)] - for closed in ('left', 'right', 'neither', 'both'): - idx = IntervalIndex.from_tuples(tpls, closed=closed) - assert idx.is_non_overlapping_monotonic is True + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is True - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) - assert idx.is_non_overlapping_monotonic is True + idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + assert idx.is_non_overlapping_monotonic is True # Should be False in all cases (overlapping) tpls = [(0, 2), (1, 3), (4, 5), (6, 7)] - for closed in ('left', 'right', 'neither', 'both'): - idx = IntervalIndex.from_tuples(tpls, closed=closed) - assert idx.is_non_overlapping_monotonic is False + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is False - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) - assert idx.is_non_overlapping_monotonic is False + idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + assert idx.is_non_overlapping_monotonic is False # Should be False in all cases (non-monotonic) tpls = [(0, 1), (2, 3), (6, 7), (4, 5)] - for closed in ('left', 'right', 'neither', 'both'): - idx = IntervalIndex.from_tuples(tpls, closed=closed) - assert idx.is_non_overlapping_monotonic is False - - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) - assert idx.is_non_overlapping_monotonic is False + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is False - # Should be False for closed='both', overwise True (GH16560) - idx = IntervalIndex.from_breaks(range(4), closed='both') + idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) assert idx.is_non_overlapping_monotonic is False - for closed in ('left', 'right', 'neither'): + # Should be False for closed='both', overwise True (GH16560) + if closed == 'both': + idx = 
IntervalIndex.from_breaks(range(4), closed=closed) + assert idx.is_non_overlapping_monotonic is False + else: idx = IntervalIndex.from_breaks(range(4), closed=closed) assert idx.is_non_overlapping_monotonic is True class TestIntervalRange(object): - @pytest.mark.parametrize('closed', ['left', 'right', 'neither', 'both']) def test_construction_from_numeric(self, closed): # combinations of start/end/periods without freq expected = IntervalIndex.from_breaks( @@ -848,7 +932,6 @@ def test_construction_from_numeric(self, closed): closed=closed) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('closed', ['left', 'right', 'neither', 'both']) def test_construction_from_timestamp(self, closed): # combinations of start/end/periods without freq start, end = Timestamp('2017-01-01'), Timestamp('2017-01-06') @@ -915,7 +998,6 @@ def test_construction_from_timestamp(self, closed): closed=closed) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('closed', ['left', 'right', 'neither', 'both']) def test_construction_from_timedelta(self, closed): # combinations of start/end/periods without freq start, end = Timedelta('1 day'), Timedelta('6 days') From 9291ba396b174578b548993b25ccb8442ba423db Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Wed, 6 Dec 2017 20:26:46 +0900 Subject: [PATCH 75/85] BUG: Fix the un-pickleable plot with DatetimeIndex (#18486) --- doc/source/whatsnew/v0.21.1.txt | 2 +- pandas/plotting/_timeseries.py | 9 +++++++-- pandas/tests/plotting/test_datetimelike.py | 10 +++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index f690e6f95aba6..017eb5ca155e3 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -93,7 +93,7 @@ I/O Plotting ^^^^^^^^ -- +- Bug in ``DataFrame.plot()`` and ``Series.plot()`` with :class:`DatetimeIndex` where a figure generated by them is not pickleable in Python 3 (:issue:`18439`) - - diff --git a/pandas/plotting/_timeseries.py b/pandas/plotting/_timeseries.py index 3d04973ed0009..56b5311326e98 100644 --- a/pandas/plotting/_timeseries.py +++ b/pandas/plotting/_timeseries.py @@ -1,5 +1,7 @@ # TODO: Use the fact that axis can have units to simplify the process +import functools + import numpy as np from matplotlib import pylab @@ -293,6 +295,10 @@ def format_timedelta_ticks(x, pos, n_decimals): return s +def _format_coord(freq, t, y): + return "t = {0} y = {1:8f}".format(Period(ordinal=int(t), freq=freq), y) + + def format_dateaxis(subplot, freq, index): """ Pretty-formats the date axis (x-axis). 
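Why the hunk below fixes pickling: the lambda it removes closes over ``freq`` and has no importable qualified name, so ``pickle`` cannot serialize it by reference, and any figure storing it in ``format_coord`` becomes unpicklable. A ``functools.partial`` over the module-level ``_format_coord`` added above carries only a reference to that named function plus its bound argument, both of which pickle on Python 3 (the figure-pickling test added by this patch is Python-3-only for exactly such pickle-support reasons). A minimal standalone sketch of the difference, illustrative only and not part of the patch, with the formatter body simplified:

import functools
import pickle

def _format_coord(freq, t, y):
    # simplified stand-in for the formatter defined in the hunk above
    return "t = {0} y = {1:8f}".format(t, y)

picklable = functools.partial(_format_coord, 'M')
pickle.dumps(picklable)  # works: a reference to _format_coord plus 'M' both pickle

unpicklable = lambda t, y: _format_coord('M', t, y)
try:
    pickle.dumps(unpicklable)
except (pickle.PicklingError, AttributeError):
    print("lambdas cannot be pickled by reference")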
@@ -327,8 +333,7 @@ def format_dateaxis(subplot, freq, index): subplot.xaxis.set_minor_formatter(minformatter) # x and y coord info - subplot.format_coord = lambda t, y: ( - "t = {0} y = {1:8f}".format(Period(ordinal=int(t), freq=freq), y)) + subplot.format_coord = functools.partial(_format_coord, freq) elif isinstance(index, TimedeltaIndex): subplot.xaxis.set_major_formatter( diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index d66012e2a56a0..d6cedac747f25 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1,13 +1,14 @@ """ Test cases for time series specific (freq conversion, etc) """ from datetime import datetime, timedelta, date, time +import pickle import pytest from pandas.compat import lrange, zip import numpy as np from pandas import Index, Series, DataFrame, NaT -from pandas.compat import is_platform_mac +from pandas.compat import is_platform_mac, PY3 from pandas.core.indexes.datetimes import date_range, bdate_range from pandas.core.indexes.timedeltas import timedelta_range from pandas.tseries.offsets import DateOffset @@ -1470,5 +1471,12 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): with ensure_clean(return_filelike=True) as path: plt.savefig(path) + + # GH18439 + # this is supported only in Python 3 pickle since + # pickle in Python2 doesn't support instancemethod pickling + if PY3: + with ensure_clean(return_filelike=True) as path: + pickle.dump(fig, path) finally: plt.close(fig) From 0cc9377fc237ed5f063d17a58e3c0730caa040ae Mon Sep 17 00:00:00 2001 From: topper-123 Date: Wed, 6 Dec 2017 23:20:36 +0000 Subject: [PATCH 76/85] DOC: Give python3 precedence over py2 in the install notes (#18603) --- doc/source/install.rst | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index 7a9a3b6c177d1..27dde005e5a87 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -141,28 +141,24 @@ and can take a few minutes to complete. Installing using your Linux distribution's package manager. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The commands in this table will install pandas for Python 2 from your distribution. -To install pandas for Python 3 you may need to use the package ``python3-pandas``. +The commands in this table will install pandas for Python 3 from your distribution. +To install pandas for Python 2 you may need to use the package ``python-pandas``. .. 
csv-table:: :header: "Distribution", "Status", "Download / Repository Link", "Install method" :widths: 10, 10, 20, 50 - Debian, stable, `official Debian repository `__ , ``sudo apt-get install python-pandas`` - Debian & Ubuntu, unstable (latest packages), `NeuroDebian `__ , ``sudo apt-get install python-pandas`` - Ubuntu, stable, `official Ubuntu repository `__ , ``sudo apt-get install python-pandas`` - Ubuntu, unstable (daily builds), `PythonXY PPA `__; activate by: ``sudo add-apt-repository ppa:pythonxy/pythonxy-devel && sudo apt-get update``, ``sudo apt-get install python-pandas`` - OpenSuse, stable, `OpenSuse Repository `__ , ``zypper in python-pandas`` - Fedora, stable, `official Fedora repository `__ , ``dnf install python-pandas`` - Centos/RHEL, stable, `EPEL repository `__ , ``yum install python-pandas`` - - - - - - + Debian, stable, `official Debian repository `__ , ``sudo apt-get install python3-pandas`` + Debian & Ubuntu, unstable (latest packages), `NeuroDebian `__ , ``sudo apt-get install python3-pandas`` + Ubuntu, stable, `official Ubuntu repository `__ , ``sudo apt-get install python3-pandas`` + OpenSuse, stable, `OpenSuse Repository `__ , ``zypper in python3-pandas`` + Fedora, stable, `official Fedora repository `__ , ``dnf install python3-pandas`` + Centos/RHEL, stable, `EPEL repository `__ , ``yum install python3-pandas`` +**However**, the packages in the linux package managers are often a few versions behind, so +to get the newest version of pandas, it's recommended to install using the ``pip`` or ``conda`` +methods described above. Installing from source From f5fb09e97e17af778616cc577569ac8a8d0b53d2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Dec 2017 02:20:48 +0100 Subject: [PATCH 77/85] DOC: temporary remove pyarrow example of reading subset columns (#18661) --- doc/source/io.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 8269c1a69c95b..4024414610a82 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4601,7 +4601,6 @@ Read only certain columns of a parquet file. .. ipython:: python - result = pd.read_parquet('example_pa.parquet', engine='pyarrow', columns=['a', 'b']) result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b']) result.dtypes From 2d1b85ad62ecc73e19d764144a77f76cb366043b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 7 Dec 2017 09:04:35 -0600 Subject: [PATCH 78/85] API: Restore implicit converter registration (#18307) * API: Restore implicit converter registration * Remove matplotlib from blacklist * fixup! 
Remove matplotlib from blacklist * Add option for toggling formatters * Remove move * Handle no matplotlib * Cleanup * Test no register * Restore original state * Added deregister * Doc, naming * Naming * Added deprecation * PEP8 * Fix typos * Rename it all * Missed one * Check version * No warnings by default * Update release notes * Test fixup - actually switch the default to not warn - We do overwrite matplotlib's formatters * Doc update * Fix deprecation message * Test added by default --- ci/check_imports.py | 1 - doc/source/api.rst | 11 + doc/source/options.rst | 318 ++++++++++++------------ doc/source/whatsnew/v0.21.1.txt | 35 ++- pandas/core/config_init.py | 26 ++ pandas/plotting/__init__.py | 7 + pandas/plotting/_converter.py | 107 +++++++- pandas/plotting/_core.py | 28 +-- pandas/tests/plotting/test_converter.py | 126 +++++++++- pandas/tseries/converter.py | 11 +- 10 files changed, 485 insertions(+), 185 deletions(-) diff --git a/ci/check_imports.py b/ci/check_imports.py index a83436e7d258c..d6f24ebcc4d3e 100644 --- a/ci/check_imports.py +++ b/ci/check_imports.py @@ -9,7 +9,6 @@ 'ipython', 'jinja2' 'lxml', - 'matplotlib', 'numexpr', 'openpyxl', 'py', diff --git a/doc/source/api.rst b/doc/source/api.rst index e8b8b3624740d..a9766b5c04496 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -2174,6 +2174,17 @@ Style Export and Import Styler.export Styler.use +Plotting +~~~~~~~~ + +.. currentmodule:: pandas + +.. autosummary:: + :toctree: generated/ + + plotting.register_matplotlib_converters + plotting.deregister_matplotlib_converters + .. currentmodule:: pandas General utility functions diff --git a/doc/source/options.rst b/doc/source/options.rst index 2da55a5a658a4..be3a3d9a55534 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -273,164 +273,166 @@ Options are 'right', and 'left'. Available Options ----------------- -=================================== ============ ================================== -Option Default Function -=================================== ============ ================================== -display.chop_threshold None If set to a float value, all float - values smaller then the given - threshold will be displayed as - exactly 0 by repr and friends. -display.colheader_justify right Controls the justification of - column headers. used by DataFrameFormatter. -display.column_space 12 No description available. -display.date_dayfirst False When True, prints and parses dates - with the day first, eg 20/01/2005 -display.date_yearfirst False When True, prints and parses dates - with the year first, eg 2005/01/20 -display.encoding UTF-8 Defaults to the detected encoding - of the console. Specifies the encoding - to be used for strings returned by - to_string, these are generally strings - meant to be displayed on the console. -display.expand_frame_repr True Whether to print out the full DataFrame - repr for wide DataFrames across - multiple lines, `max_columns` is - still respected, but the output will - wrap-around across multiple "pages" - if its width exceeds `display.width`. -display.float_format None The callable should accept a floating - point number and return a string with - the desired format of the number. - This is used in some places like - SeriesFormatter. - See core.format.EngFormatter for an example. -display.large_repr truncate For DataFrames exceeding max_rows/max_cols, - the repr (and HTML repr) can show - a truncated table (the default), - or switch to the view from df.info() - (the behaviour in earlier versions of pandas). 
- allowable settings, ['truncate', 'info'] -display.latex.repr False Whether to produce a latex DataFrame - representation for jupyter frontends - that support it. -display.latex.escape True Escapes special characters in DataFrames, when - using the to_latex method. -display.latex.longtable False Specifies if the to_latex method of a DataFrame - uses the longtable format. -display.latex.multicolumn True Combines columns when using a MultiIndex -display.latex.multicolumn_format 'l' Alignment of multicolumn labels -display.latex.multirow False Combines rows when using a MultiIndex. - Centered instead of top-aligned, - separated by clines. -display.max_columns 20 max_rows and max_columns are used - in __repr__() methods to decide if - to_string() or info() is used to - render an object to a string. In - case python/IPython is running in - a terminal this can be set to 0 and - pandas will correctly auto-detect - the width the terminal and swap to - a smaller format in case all columns - would not fit vertically. The IPython - notebook, IPython qtconsole, or IDLE - do not run in a terminal and hence - it is not possible to do correct - auto-detection. 'None' value means - unlimited. -display.max_colwidth 50 The maximum width in characters of - a column in the repr of a pandas - data structure. When the column overflows, - a "..." placeholder is embedded in - the output. -display.max_info_columns 100 max_info_columns is used in DataFrame.info - method to decide if per column information - will be printed. -display.max_info_rows 1690785 df.info() will usually show null-counts - for each column. For large frames - this can be quite slow. max_info_rows - and max_info_cols limit this null - check only to frames with smaller - dimensions then specified. -display.max_rows 60 This sets the maximum number of rows - pandas should output when printing - out various output. For example, - this value determines whether the - repr() for a dataframe prints out - fully or just a summary repr. - 'None' value means unlimited. -display.max_seq_items 100 when pretty-printing a long sequence, - no more then `max_seq_items` will - be printed. If items are omitted, - they will be denoted by the addition - of "..." to the resulting string. - If set to None, the number of items - to be printed is unlimited. -display.memory_usage True This specifies if the memory usage of - a DataFrame should be displayed when the - df.info() method is invoked. -display.multi_sparse True "Sparsify" MultiIndex display (don't - display repeated elements in outer - levels within groups) -display.notebook_repr_html True When True, IPython notebook will - use html representation for - pandas objects (if it is available). -display.pprint_nest_depth 3 Controls the number of nested levels - to process when pretty-printing -display.precision 6 Floating point output precision in - terms of number of places after the - decimal, for regular formatting as well - as scientific notation. Similar to - numpy's ``precision`` print option -display.show_dimensions truncate Whether to print out dimensions - at the end of DataFrame repr. - If 'truncate' is specified, only - print out the dimensions if the - frame is truncated (e.g. not display - all rows and/or columns) -display.width 80 Width of the display in characters. - In case python/IPython is running in - a terminal this can be set to None - and pandas will correctly auto-detect - the width. 
Note that the IPython notebook, - IPython qtconsole, or IDLE do not run in a - terminal and hence it is not possible - to correctly detect the width. -display.html.table_schema False Whether to publish a Table Schema - representation for frontends that - support it. -display.html.border 1 A ``border=value`` attribute is - inserted in the ```` tag - for the DataFrame HTML repr. -io.excel.xls.writer xlwt The default Excel writer engine for - 'xls' files. -io.excel.xlsm.writer openpyxl The default Excel writer engine for - 'xlsm' files. Available options: - 'openpyxl' (the default). -io.excel.xlsx.writer openpyxl The default Excel writer engine for - 'xlsx' files. -io.hdf.default_format None default format writing format, if - None, then put will default to - 'fixed' and append will default to - 'table' -io.hdf.dropna_table True drop ALL nan rows when appending - to a table -io.parquet.engine None The engine to use as a default for - parquet reading and writing. If None - then try 'pyarrow' and 'fastparquet' -mode.chained_assignment warn Raise an exception, warn, or no - action if trying to use chained - assignment, The default is warn -mode.sim_interactive False Whether to simulate interactive mode - for purposes of testing. -mode.use_inf_as_na False True means treat None, NaN, -INF, - INF as NA (old way), False means - None and NaN are null, but INF, -INF - are not NA (new way). -compute.use_bottleneck True Use the bottleneck library to accelerate - computation if it is installed. -compute.use_numexpr True Use the numexpr library to accelerate - computation if it is installed. -=================================== ============ ================================== +======================================= ============ ================================== +Option Default Function +======================================= ============ ================================== +display.chop_threshold None If set to a float value, all float + values smaller then the given + threshold will be displayed as + exactly 0 by repr and friends. +display.colheader_justify right Controls the justification of + column headers. used by DataFrameFormatter. +display.column_space 12 No description available. +display.date_dayfirst False When True, prints and parses dates + with the day first, eg 20/01/2005 +display.date_yearfirst False When True, prints and parses dates + with the year first, eg 2005/01/20 +display.encoding UTF-8 Defaults to the detected encoding + of the console. Specifies the encoding + to be used for strings returned by + to_string, these are generally strings + meant to be displayed on the console. +display.expand_frame_repr True Whether to print out the full DataFrame + repr for wide DataFrames across + multiple lines, `max_columns` is + still respected, but the output will + wrap-around across multiple "pages" + if its width exceeds `display.width`. +display.float_format None The callable should accept a floating + point number and return a string with + the desired format of the number. + This is used in some places like + SeriesFormatter. + See core.format.EngFormatter for an example. +display.large_repr truncate For DataFrames exceeding max_rows/max_cols, + the repr (and HTML repr) can show + a truncated table (the default), + or switch to the view from df.info() + (the behaviour in earlier versions of pandas). + allowable settings, ['truncate', 'info'] +display.latex.repr False Whether to produce a latex DataFrame + representation for jupyter frontends + that support it. 
+display.latex.escape True Escapes special characters in DataFrames, when + using the to_latex method. +display.latex.longtable False Specifies if the to_latex method of a DataFrame + uses the longtable format. +display.latex.multicolumn True Combines columns when using a MultiIndex +display.latex.multicolumn_format 'l' Alignment of multicolumn labels +display.latex.multirow False Combines rows when using a MultiIndex. + Centered instead of top-aligned, + separated by clines. +display.max_columns 20 max_rows and max_columns are used + in __repr__() methods to decide if + to_string() or info() is used to + render an object to a string. In + case python/IPython is running in + a terminal this can be set to 0 and + pandas will correctly auto-detect + the width the terminal and swap to + a smaller format in case all columns + would not fit vertically. The IPython + notebook, IPython qtconsole, or IDLE + do not run in a terminal and hence + it is not possible to do correct + auto-detection. 'None' value means + unlimited. +display.max_colwidth 50 The maximum width in characters of + a column in the repr of a pandas + data structure. When the column overflows, + a "..." placeholder is embedded in + the output. +display.max_info_columns 100 max_info_columns is used in DataFrame.info + method to decide if per column information + will be printed. +display.max_info_rows 1690785 df.info() will usually show null-counts + for each column. For large frames + this can be quite slow. max_info_rows + and max_info_cols limit this null + check only to frames with smaller + dimensions then specified. +display.max_rows 60 This sets the maximum number of rows + pandas should output when printing + out various output. For example, + this value determines whether the + repr() for a dataframe prints out + fully or just a summary repr. + 'None' value means unlimited. +display.max_seq_items 100 when pretty-printing a long sequence, + no more then `max_seq_items` will + be printed. If items are omitted, + they will be denoted by the addition + of "..." to the resulting string. + If set to None, the number of items + to be printed is unlimited. +display.memory_usage True This specifies if the memory usage of + a DataFrame should be displayed when the + df.info() method is invoked. +display.multi_sparse True "Sparsify" MultiIndex display (don't + display repeated elements in outer + levels within groups) +display.notebook_repr_html True When True, IPython notebook will + use html representation for + pandas objects (if it is available). +display.pprint_nest_depth 3 Controls the number of nested levels + to process when pretty-printing +display.precision 6 Floating point output precision in + terms of number of places after the + decimal, for regular formatting as well + as scientific notation. Similar to + numpy's ``precision`` print option +display.show_dimensions truncate Whether to print out dimensions + at the end of DataFrame repr. + If 'truncate' is specified, only + print out the dimensions if the + frame is truncated (e.g. not display + all rows and/or columns) +display.width 80 Width of the display in characters. + In case python/IPython is running in + a terminal this can be set to None + and pandas will correctly auto-detect + the width. Note that the IPython notebook, + IPython qtconsole, or IDLE do not run in a + terminal and hence it is not possible + to correctly detect the width. +display.html.table_schema False Whether to publish a Table Schema + representation for frontends that + support it. 
+display.html.border                     1            A ``border=value`` attribute is
+                                                     inserted in the ``<table>`` tag
+                                                     for the DataFrame HTML repr.
+io.excel.xls.writer                     xlwt         The default Excel writer engine for
+                                                     'xls' files.
+io.excel.xlsm.writer                    openpyxl     The default Excel writer engine for
+                                                     'xlsm' files. Available options:
+                                                     'openpyxl' (the default).
+io.excel.xlsx.writer                    openpyxl     The default Excel writer engine for
+                                                     'xlsx' files.
+io.hdf.default_format                   None         default format writing format, if
+                                                     None, then put will default to
+                                                     'fixed' and append will default to
+                                                     'table'
+io.hdf.dropna_table                     True         drop ALL nan rows when appending
+                                                     to a table
+io.parquet.engine                       None         The engine to use as a default for
+                                                     parquet reading and writing. If None
+                                                     then try 'pyarrow' and 'fastparquet'
+mode.chained_assignment                 warn         Raise an exception, warn, or no
+                                                     action if trying to use chained
+                                                     assignment, The default is warn
+mode.sim_interactive                    False        Whether to simulate interactive mode
+                                                     for purposes of testing.
+mode.use_inf_as_na                      False        True means treat None, NaN, -INF,
+                                                     INF as NA (old way), False means
+                                                     None and NaN are null, but INF, -INF
+                                                     are not NA (new way).
+compute.use_bottleneck                  True         Use the bottleneck library to accelerate
+                                                     computation if it is installed.
+compute.use_numexpr                     True         Use the numexpr library to accelerate
+                                                     computation if it is installed.
+plotting.matplotlib.register_converters True         Register custom converters with
+                                                     matplotlib. Set to False to de-register.
+======================================= ============ ==================================
 
 .. _basics.console_output:
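For orientation, every option in the table above is read and written through the
same small API. A minimal sketch (the option names come from the table; the
values here are arbitrary):

    import pandas as pd

    pd.get_option('display.max_rows')       # inspect the current value (60)
    pd.set_option('display.max_rows', 100)  # change it globally

    # or change values only within a block
    with pd.option_context('display.precision', 4, 'display.max_columns', 10):
        pass  # code here sees the temporary values

    pd.reset_option('display.max_rows')     # back to the default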
 
diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 017eb5ca155e3..215800488c326 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -7,6 +7,36 @@
 This is a minor release from 0.21.1 and includes a number of deprecations,
 new features, enhancements, and performance improvements along with a large
 number of bug fixes. We recommend that all users upgrade to this version.
 
+.. _whatsnew_0211.special:
+
+Restore Matplotlib datetime Converter Registration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pandas implements some matplotlib converters for nicely formatting the axis
+labels on plots with ``datetime`` or ``Period`` values. Prior to pandas 0.21.0,
+these were implicitly registered with matplotlib, as a side effect of ``import
+pandas``.
+
+In pandas 0.21.0, we required users to explicitly register the
+converters. This caused problems for some users who relied on those converters
+being present for regular ``matplotlib.pyplot`` plotting methods, so we're
+temporarily reverting that change; pandas will again register the converters on
+import.
+
+We've added a new option to control the converters:
+``pd.options.plotting.matplotlib.register_converters``. By default, they are
+registered. Toggling this to ``False`` removes pandas' formatters and restores
+any converters we overwrote when registering them (:issue:`18301`).
+
+We're working with the matplotlib developers to make this easier. We're trying
+to balance user convenience (automatically registering the converters) with
+import performance and best practices (importing pandas shouldn't have the side
+effect of overwriting any custom converters you've already set). In the future
+we hope to have most of the datetime formatting functionality in matplotlib,
+with just the pandas-specific converters in pandas. We'll then gracefully
+deprecate the automatic registration of converters in favor of users explicitly
+registering them when they want them.
+
+.. _whatsnew_0211.enhancements:
 
 New features
 
@@ -30,9 +60,8 @@ Other Enhancements
 Deprecations
 ~~~~~~~~~~~~
 
--
--
--
+- ``pandas.tseries.converter.register`` has been renamed to
+  :func:`pandas.plotting.register_matplotlib_converters` (:issue:`18301`)
 
 .. _whatsnew_0211.performance:
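A minimal sketch of the workflow the whatsnew entry above describes, assuming
pandas 0.21.1 and matplotlib are both installed:

    import matplotlib.pyplot as plt
    import pandas as pd
    from pandas.plotting import (register_matplotlib_converters,
                                 deregister_matplotlib_converters)

    # Explicit registration: the forward-compatible spelling, and it
    # silences the FutureWarning about implicitly registered converters.
    register_matplotlib_converters()

    s = pd.Series(range(12), index=pd.date_range('2017', periods=12))
    fig, ax = plt.subplots()
    ax.plot(s.index, s.values)  # the datetime axis is formatted by pandas

    # Opting out restores whatever converters pandas had overwritten.
    deregister_matplotlib_converters()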
 
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 33531e80449d8..94208a61a4377 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -479,3 +479,29 @@ def use_inf_as_na_cb(key):
     cf.register_option(
         'engine', 'auto', parquet_engine_doc,
         validator=is_one_of_factory(['auto', 'pyarrow', 'fastparquet']))
+
+# ---------
+# Plotting
+# ---------
+
+register_converter_doc = """
+: bool
+    Whether to register converters with matplotlib's units registry for
+    dates, times, datetimes, and Periods. Toggling to False will remove
+    the converters, restoring any converters that pandas overwrote.
+"""
+
+
+def register_converter_cb(key):
+    from pandas.plotting import register_matplotlib_converters
+    from pandas.plotting import deregister_matplotlib_converters
+
+    if cf.get_option(key):
+        register_matplotlib_converters()
+    else:
+        deregister_matplotlib_converters()
+
+
+with cf.config_prefix("plotting.matplotlib"):
+    cf.register_option("register_converters", True, register_converter_doc,
+                       validator=bool, cb=register_converter_cb)
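Because ``register_converter_cb`` is wired in as the option's callback, toggling
the option takes effect immediately. A sketch (requires matplotlib):

    import matplotlib.units as munits
    import pandas as pd

    pd.set_option('plotting.matplotlib.register_converters', False)
    assert pd.Timestamp not in munits.registry  # callback ran deregister()

    pd.set_option('plotting.matplotlib.register_converters', True)
    assert pd.Timestamp in munits.registry      # callback ran register()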
diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py
index 8f98e297e3e66..385d4d7f047c7 100644
--- a/pandas/plotting/__init__.py
+++ b/pandas/plotting/__init__.py
@@ -11,3 +11,10 @@
 from pandas.plotting._core import boxplot
 from pandas.plotting._style import plot_params
 from pandas.plotting._tools import table
+try:
+    from pandas.plotting._converter import \
+        register as register_matplotlib_converters
+    from pandas.plotting._converter import \
+        deregister as deregister_matplotlib_converters
+except ImportError:
+    pass
diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py
index 47d15195315ba..357e84d1f17ea 100644
--- a/pandas/plotting/_converter.py
+++ b/pandas/plotting/_converter.py
@@ -1,3 +1,4 @@
+import warnings
 from datetime import datetime, timedelta
 import datetime as pydt
 import numpy as np
@@ -44,14 +45,96 @@
 MUSEC_PER_DAY = 1e6 * SEC_PER_DAY
 
+_WARN = True  # Global for whether pandas has registered the units explicitly
+_mpl_units = {}  # Cache for units overwritten by us
 
-def register():
-    units.registry[lib.Timestamp] = DatetimeConverter()
-    units.registry[Period] = PeriodConverter()
-    units.registry[pydt.datetime] = DatetimeConverter()
-    units.registry[pydt.date] = DatetimeConverter()
-    units.registry[pydt.time] = TimeConverter()
-    units.registry[np.datetime64] = DatetimeConverter()
+
+def get_pairs():
+    pairs = [
+        (lib.Timestamp, DatetimeConverter),
+        (Period, PeriodConverter),
+        (pydt.datetime, DatetimeConverter),
+        (pydt.date, DatetimeConverter),
+        (pydt.time, TimeConverter),
+        (np.datetime64, DatetimeConverter),
+    ]
+    return pairs
+
+
+def register(explicit=True):
+    """Register Pandas Formatters and Converters with matplotlib
+
+    This function modifies the global ``matplotlib.units.registry``
+    dictionary. Pandas adds custom converters for
+
+    * pd.Timestamp
+    * pd.Period
+    * np.datetime64
+    * datetime.datetime
+    * datetime.date
+    * datetime.time
+
+    See Also
+    --------
+    deregister_matplotlib_converters
+    """
+    # Renamed in pandas.plotting.__init__
+    global _WARN
+
+    if explicit:
+        _WARN = False
+
+    pairs = get_pairs()
+    for type_, cls in pairs:
+        converter = cls()
+        if type_ in units.registry:
+            previous = units.registry[type_]
+            _mpl_units[type_] = previous
+        units.registry[type_] = converter
+
+
+def deregister():
+    """Remove pandas' formatters and converters
+
+    Removes the custom converters added by :func:`register`. This
+    attempts to set the state of the registry back to the state before
+    pandas registered its own units. Converters for pandas' own types like
+    Timestamp and Period are removed completely. Converters for types
+    pandas overwrites, like ``datetime.datetime``, are restored to their
+    original value.
+
+    See Also
+    --------
+    register_matplotlib_converters
+    """
+    # Renamed in pandas.plotting.__init__
+    for type_, cls in get_pairs():
+        # We use type to catch our classes directly, no inheritance
+        if type(units.registry.get(type_)) is cls:
+            units.registry.pop(type_)
+
+    # restore the old keys
+    for unit, formatter in _mpl_units.items():
+        if type(formatter) not in {DatetimeConverter, PeriodConverter,
+                                   TimeConverter}:
+            # make it idempotent by excluding ours.
+            units.registry[unit] = formatter
+
+
+def _check_implicitly_registered():
+    global _WARN
+
+    if _WARN:
+        msg = ("Using an implicitly registered datetime converter for a "
+               "matplotlib plotting method. The converter was registered "
+               "by pandas on import. Future versions of pandas will require "
+               "you to explicitly register matplotlib converters.\n\n"
+               "To register the converters:\n\t"
+               ">>> from pandas.plotting import register_matplotlib_converters"
+               "\n\t"
+               ">>> register_matplotlib_converters()")
+        warnings.warn(msg, FutureWarning)
+        _WARN = False
 
 
 def _to_ordinalf(tm):
@@ -189,6 +272,7 @@ class DatetimeConverter(dates.DateConverter):
     @staticmethod
     def convert(values, unit, axis):
         # values might be a 1-d array, or a list-like of arrays.
+        _check_implicitly_registered()
         if is_nested_list_like(values):
             values = [DatetimeConverter._convert_1d(v, unit, axis)
                       for v in values]
@@ -273,6 +357,7 @@ class PandasAutoDateLocator(dates.AutoDateLocator):
 
     def get_locator(self, dmin, dmax):
         'Pick the best locator based on a distance.'
+        _check_implicitly_registered()
         delta = relativedelta(dmax, dmin)
 
         num_days = (delta.years * 12.0 + delta.months) * 31.0 + delta.days
@@ -314,6 +399,7 @@ def get_unit_generic(freq):
 
     def __call__(self):
         # if no data have been set, this will tank with a ValueError
+        _check_implicitly_registered()
         try:
             dmin, dmax = self.viewlim_to_dt()
         except ValueError:
@@ -914,6 +1000,8 @@ def _get_default_locs(self, vmin, vmax):
     def __call__(self):
         'Return the locations of the ticks.'
         # axis calls Locator.set_axis inside set_m<ajor/inor>_formatter
+        _check_implicitly_registered()
+
        vi = tuple(self.axis.get_view_interval())
         if vi != self.plot_obj.view_interval:
             self.plot_obj.date_axis_info = None
@@ -998,6 +1086,8 @@ def set_locs(self, locs):
         'Sets the locations of the ticks'
         # don't actually use the locs. This is just needed to work with
         # matplotlib. Force to use vmin, vmax
+        _check_implicitly_registered()
+
         self.locs = locs
 
         (vmin, vmax) = vi = tuple(self.axis.get_view_interval())
@@ -1009,6 +1099,8 @@ def set_locs(self, locs):
         self._set_default_format(vmin, vmax)
 
     def __call__(self, x, pos=0):
+        _check_implicitly_registered()
+
         if self.formatdict is None:
             return ''
         else:
@@ -1039,6 +1131,7 @@ def format_timedelta_ticks(x, pos, n_decimals):
         return s
 
     def __call__(self, x, pos=0):
+        _check_implicitly_registered()
         (vmin, vmax) = tuple(self.axis.get_view_interval())
         n_decimals = int(np.ceil(np.log10(100 * 1e9 / (vmax - vmin))))
         if n_decimals > 9:
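The save/restore contract of ``register``/``deregister`` above, sketched
directly against matplotlib's registry (this mirrors ``test_registry_resets``
further below; requires matplotlib):

    import datetime

    import matplotlib.dates as mdates
    import matplotlib.units as munits
    from pandas.plotting import (register_matplotlib_converters,
                                 deregister_matplotlib_converters)

    original = mdates.DateConverter()
    munits.registry[datetime.date] = original

    register_matplotlib_converters()
    assert munits.registry[datetime.date] is not original  # pandas overwrote it

    deregister_matplotlib_converters()
    assert munits.registry[datetime.date] is original      # and restored it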
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
index 4a8bef69e4a4b..e1380953e4519 100644
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -11,6 +11,7 @@
 from pandas.util._decorators import cache_readonly
 
 from pandas.core.base import PandasObject
+from pandas.core.config import get_option
 from pandas.core.dtypes.missing import isna, notna, remove_na_arraylike
 from pandas.core.dtypes.common import (
     is_list_like,
@@ -40,16 +41,13 @@
     _get_xlim, _set_ticks_props, format_date_labels)
 
-_registered = False
-
-
-def _setup():
-    # delay the import of matplotlib until nescessary
-    global _registered
-    if not _registered:
-        from pandas.plotting import _converter
-        _converter.register()
-        _registered = True
+try:
+    from pandas.plotting import _converter
+except ImportError:
+    pass
+else:
+    if get_option('plotting.matplotlib.register_converters'):
+        _converter.register(explicit=False)
 
 
 def _get_standard_kind(kind):
@@ -99,7 +97,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None,
                  secondary_y=False, colormap=None,
                  table=False, layout=None, **kwds):
 
-        _setup()
+        _converter._WARN = False
         self.data = data
         self.by = by
@@ -2063,7 +2061,7 @@ def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0,
                   grid=True, figsize=None, layout=None,
                   return_type=None, **kwds):
     import matplotlib.pyplot as plt
-    _setup()
+    _converter._WARN = False
     ax = boxplot(self, column=column, by=by, ax=ax, fontsize=fontsize,
                  grid=grid, rot=rot, figsize=figsize, layout=layout,
                  return_type=return_type, **kwds)
@@ -2159,7 +2157,7 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
     kwds : other plotting keyword arguments
         To be passed to hist function
     """
-    _setup()
+    _converter._WARN = False
     if by is not None:
         axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid,
                             figsize=figsize, sharex=sharex, sharey=sharey,
@@ -2293,6 +2291,8 @@ def grouped_hist(data, column=None, by=None, ax=None, bins=50, figsize=None,
     -------
     axes: collection of Matplotlib Axes
     """
+    _converter._WARN = False
+
     def plot_group(group, ax):
         ax.hist(group.dropna().values, bins=bins, **kwargs)
 
@@ -2356,7 +2356,7 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None,
     >>> grouped = df.unstack(level='lvl1').groupby(level=0, axis=1)
     >>> boxplot_frame_groupby(grouped, subplots=False)
     """
-    _setup()
+    _converter._WARN = False
    if subplots is True:
         naxes = len(grouped)
         fig, axes = _subplots(naxes=naxes, squeeze=False,
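With ``register(explicit=False)`` at import time, ``_WARN`` stays ``True``, so
the first raw matplotlib call that relies on the implicitly registered
converters warns once. A sketch of what a user would see in a fresh session
(behaviour specific to this 0.21.1 code path):

    import warnings

    import matplotlib.pyplot as plt
    import pandas as pd

    s = pd.Series(range(3), index=pd.date_range('2017', periods=3))
    fig, ax = plt.subplots()

    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always')
        ax.plot(s.index, s.values)  # implicit converter -> FutureWarning

    assert any(issubclass(x.category, FutureWarning) for x in w)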
diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py
index e1f64bed5598d..3818c04649366 100644
--- a/pandas/tests/plotting/test_converter.py
+++ b/pandas/tests/plotting/test_converter.py
@@ -1,20 +1,144 @@
+import subprocess
 import pytest
 from datetime import datetime, date
 
 import numpy as np
-from pandas import Timestamp, Period, Index
+from pandas import Timestamp, Period, Index, date_range, Series
 from pandas.compat import u
+import pandas.core.config as cf
 import pandas.util.testing as tm
 from pandas.tseries.offsets import Second, Milli, Micro, Day
 from pandas.compat.numpy import np_datetime64_compat
 
 converter = pytest.importorskip('pandas.plotting._converter')
+from pandas.plotting import (register_matplotlib_converters,
+                             deregister_matplotlib_converters)
 
 
 def test_timtetonum_accepts_unicode():
     assert (converter.time2num("00:01") == converter.time2num(u("00:01")))
 
 
+class TestRegistration(object):
+
+    def test_register_by_default(self):
+        # Run in subprocess to ensure a clean state
+        code = ("import matplotlib.units; "
+                "import pandas as pd; "
+                "units = dict(matplotlib.units.registry); "
+                "assert pd.Timestamp in units")
+        call = ['python', '-c', code]
+        assert subprocess.check_call(call) == 0
+
+    def test_warns(self):
+        plt = pytest.importorskip("matplotlib.pyplot")
+        s = Series(range(12), index=date_range('2017', periods=12))
+        _, ax = plt.subplots()
+
+        # Set to the "warning" state, in case this isn't the first test run
+        converter._WARN = True
+        with tm.assert_produces_warning(FutureWarning,
+                                        check_stacklevel=False) as w:
+            ax.plot(s.index, s.values)
+            plt.close()
+
+        assert len(w) == 1
+        assert "Using an implicitly registered datetime converter" in str(w[0])
+
+    def test_registering_no_warning(self):
+        plt = pytest.importorskip("matplotlib.pyplot")
+        s = Series(range(12), index=date_range('2017', periods=12))
+        _, ax = plt.subplots()
+
+        # Set to the "warn" state, in case this isn't the first test run
+        converter._WARN = True
+        register_matplotlib_converters()
+        with tm.assert_produces_warning(None) as w:
+            ax.plot(s.index, s.values)
+
+        assert len(w) == 0
+
+    def test_pandas_plots_register(self):
+        pytest.importorskip("matplotlib.pyplot")
+        s = Series(range(12), index=date_range('2017', periods=12))
+        # Set to the "warn" state, in case this isn't the first test run
+        converter._WARN = True
+        with tm.assert_produces_warning(None) as w:
+            s.plot()
+
+        assert len(w) == 0
+
+    def test_matplotlib_formatters(self):
+        units = pytest.importorskip("matplotlib.units")
+        assert Timestamp in units.registry
+
+        ctx = cf.option_context("plotting.matplotlib.register_converters",
+                                False)
+        with ctx:
+            assert Timestamp not in units.registry
+
+        assert Timestamp in units.registry
+
+    def test_option_no_warning(self):
+        pytest.importorskip("matplotlib.pyplot")
+        ctx = cf.option_context("plotting.matplotlib.register_converters",
+                                False)
+        plt = pytest.importorskip("matplotlib.pyplot")
+        s = Series(range(12), index=date_range('2017', periods=12))
+        _, ax = plt.subplots()
+
+        converter._WARN = True
+        # Test without registering first, no warning
+        with ctx:
+            with tm.assert_produces_warning(None) as w:
+                ax.plot(s.index, s.values)
+
+        assert len(w) == 0
+
+        # Now test with registering
+        converter._WARN = True
+        register_matplotlib_converters()
+        with ctx:
+            with tm.assert_produces_warning(None) as w:
+                ax.plot(s.index, s.values)
+
+        assert len(w) == 0
+
+    def test_registry_resets(self):
+        units = pytest.importorskip("matplotlib.units")
+        dates = pytest.importorskip("matplotlib.dates")
+
+        # make a copy, to reset to
+        original = dict(units.registry)
+
+        try:
+            # get to a known state
+            units.registry.clear()
+            date_converter = dates.DateConverter()
+            units.registry[datetime] = date_converter
+            units.registry[date] = date_converter
+
+            register_matplotlib_converters()
+            assert units.registry[date] is not date_converter
+            deregister_matplotlib_converters()
+            assert units.registry[date] is date_converter
+
+        finally:
+            # restore original state
+            units.registry.clear()
+            for k, v in original.items():
+                units.registry[k] = v
+
+    def test_old_import_warns(self):
+        with tm.assert_produces_warning(FutureWarning) as w:
+            from pandas.tseries import converter
+            converter.register()
+
+        assert len(w)
+        assert ('pandas.plotting.register_matplotlib_converters' in
+                str(w[0].message))
+
+
 class TestDateTimeConverter(object):
 
     def setup_method(self, method):
diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py
index df603c4d880d8..26d3f3cb85edc 100644
--- a/pandas/tseries/converter.py
+++ b/pandas/tseries/converter.py
@@ -1,6 +1,7 @@
 # flake8: noqa
+import warnings
 
-from pandas.plotting._converter import (register, time2num,
+from pandas.plotting._converter import (time2num,
                                         TimeConverter, TimeFormatter,
                                         PeriodConverter, get_datevalue,
                                         DatetimeConverter,
@@ -9,3 +10,11 @@
                                         MilliSecondLocator, get_finder,
                                         TimeSeries_DateLocator,
                                         TimeSeries_DateFormatter)
+
+
+def register():
+    from pandas.plotting._converter import register as register_
+    msg = ("'pandas.tseries.converter.register' has been moved and renamed to "
+           "'pandas.plotting.register_matplotlib_converters'. ")
+    warnings.warn(msg, FutureWarning, stacklevel=2)
+    register_()
From 469813da457cb99cd8610a46842d7efd847c7671 Mon Sep 17 00:00:00 2001
From: Sietse Brouwer
Date: Thu, 7 Dec 2017 16:46:08 +0100
Subject: [PATCH 79/85] DOC: explain the `mode.chained_assignment` option
 (#18635)

---
 doc/source/indexing.rst | 22 +++++++++++++++++-----
 doc/source/options.rst  |  7 ++++---
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index fdb002a642d62..b329fac969343 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -1835,15 +1835,27 @@ that you've done this:
 
 Yikes!
 
+.. _indexing.evaluation_order:
+
 Evaluation order matters
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-Furthermore, in chained expressions, the order may determine whether a copy is returned or not.
-If an expression will set values on a copy of a slice, then a ``SettingWithCopy``
-warning will be issued.
+When you use chained indexing, the order and type of the indexing operation
+partially determine whether the result is a slice into the original object, or
+a copy of the slice.
+
+Pandas has the ``SettingWithCopyWarning`` because assigning to a copy of a
+slice is frequently not intentional, but a mistake caused by chained indexing
+returning a copy where a slice was expected.
+
+If you would like pandas to be more or less trusting about assignment to a
+chained indexing expression, you can set the :ref:`option <options>`
+``mode.chained_assignment`` to one of these values:
 
-You can control the action of a chained assignment via the option ``mode.chained_assignment``,
-which can take the values ``['raise','warn',None]``, where showing a warning is the default.
+* ``'warn'``, the default, means a ``SettingWithCopyWarning`` is printed.
+* ``'raise'`` means pandas will raise a ``SettingWithCopyError``
+  that you have to deal with.
+* ``None`` will suppress the warnings entirely.
 
 .. ipython:: python
    :okwarning:
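The three settings in action, as a minimal sketch (at this point in pandas'
history the exception class lives in ``pandas.core.common``; later versions
expose it as ``pandas.errors.SettingWithCopyError``):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

    # 'warn' (the default): chained assignment prints a SettingWithCopyWarning
    df[df['a'] > 1]['b'] = 10

    # 'raise': the same statement becomes a hard error
    with pd.option_context('mode.chained_assignment', 'raise'):
        try:
            df[df['a'] > 1]['b'] = 10
        except pd.core.common.SettingWithCopyError:
            pass

    # None: no warning, no error
    with pd.option_context('mode.chained_assignment', None):
        df[df['a'] > 1]['b'] = 10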
diff --git a/doc/source/options.rst b/doc/source/options.rst
index be3a3d9a55534..db3380bd4a3e7 100644
--- a/doc/source/options.rst
+++ b/doc/source/options.rst
@@ -417,9 +417,10 @@ io.hdf.dropna_table        True        drop ALL nan rows when appe
 io.parquet.engine          None        The engine to use as a default for
                                        parquet reading and writing. If None
                                        then try 'pyarrow' and 'fastparquet'
-mode.chained_assignment    warn        Raise an exception, warn, or no
-                                       action if trying to use chained
-                                       assignment, The default is warn
+mode.chained_assignment    warn        Controls ``SettingWithCopyWarning``:
+                                       'raise', 'warn', or None. Raise an
+                                       exception, warn, or no action if
+                                       trying to use :ref:`chained assignment <indexing.evaluation_order>`.
 mode.sim_interactive       False       Whether to simulate interactive mode
                                        for purposes of testing.
 mode.use_inf_as_na         False       True means treat None, NaN, -INF,
From 11a4100a4eb3f5defade59a188f6ad8863358fcd Mon Sep 17 00:00:00 2001
From: gabrielclow
Date: Fri, 8 Dec 2017 00:11:14 -0200
Subject: [PATCH 80/85] Documentation fix for method last_valid_index (#18681)

---
 pandas/core/frame.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index cdc11cc7dc9df..ad79001e45b86 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4306,7 +4306,7 @@ def first_valid_index(self):
         return valid_indices[0] if len(valid_indices) else None
 
     @Appender(_shared_docs['valid_index'] % {
-        'position': 'first', 'klass': 'DataFrame'})
+        'position': 'last', 'klass': 'DataFrame'})
     def last_valid_index(self):
         if len(self) == 0:
             return None
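The behaviour the corrected docstring describes, for both methods (a quick
sketch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'x': [np.nan, 2.0, 3.0, np.nan]}, index=list('abcd'))

    df.first_valid_index()  # 'b' -- first label with a non-NA entry
    df.last_valid_index()   # 'c' -- last such label, as the docstring now says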
From 2524ff46537a04b2bde168a66d09c6045bee5689 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Fri, 8 Dec 2017 03:11:43 -0800
Subject: [PATCH 81/85] BUG: LatexFormatter.write_result multi-index (#18685)

* BUG: LatexFormatter.write_result multi-index

Fixed GH issue 14484: ``LatexFormatter.write_result`` now does not print
blanks if a higher-order index differs from the previous row. Also added
a test case for this.

* MAINT: Address reviewer comments

Closes gh-14484
Closes gh-17499
---
 doc/source/whatsnew/v0.21.1.txt          |  1 +
 pandas/io/formats/format.py              | 18 +++++++++++++-----
 pandas/tests/io/formats/test_to_latex.py | 22 ++++++++++++++++++++++
 3 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 215800488c326..2cf36cbbcaefd 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -117,6 +117,7 @@ I/O
 - Bug in :meth:`DataFrame.to_msgpack` when serializing data of the numpy.bool_ datatype (:issue:`18390`)
 - Bug in :func:`read_json` not decoding when reading line delimited JSON from S3 (:issue:`17200`)
 - Bug in :func:`pandas.io.json.json_normalize` to avoid modification of ``meta`` (:issue:`18610`)
+- Bug in :func:`to_latex` where repeated multi-index values were not printed even though a higher level index differed from the previous row (:issue:`14484`)
 
 
 Plotting
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index ab98b9c4e4f49..24eeb1dd94c18 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -45,7 +45,6 @@
 import pandas as pd
 import numpy as np
 
-import itertools
 import csv
 from functools import partial
 
@@ -891,6 +890,7 @@ def get_col_type(dtype):
             name = any(self.frame.index.names)
             cname = any(self.frame.columns.names)
             lastcol = self.frame.index.nlevels - 1
+            previous_lev3 = None
             for i, lev in enumerate(self.frame.index.levels):
                 lev2 = lev.format()
                 blank = ' ' * len(lev2[0])
@@ -901,11 +901,19 @@ def get_col_type(dtype):
                     lev3 = [blank] * clevels
                     if name:
                         lev3.append(lev.name)
-                for level_idx, group in itertools.groupby(
-                        self.frame.index.labels[i]):
-                    count = len(list(group))
-                    lev3.extend([lev2[level_idx]] + [blank] * (count - 1))
+                current_idx_val = None
+                for level_idx in self.frame.index.labels[i]:
+                    if ((previous_lev3 is None or
+                            previous_lev3[len(lev3)].isspace()) and
+                            lev2[level_idx] == current_idx_val):
+                        # same index as above row and left index was the same
+                        lev3.append(blank)
+                    else:
+                        # different value than above or left index different
+                        lev3.append(lev2[level_idx])
+                        current_idx_val = lev2[level_idx]
                 strcols.insert(i, lev3)
+                previous_lev3 = lev3
 
         column_format = self.column_format
         if column_format is None:
diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py
index aa86d1d9231fb..c0b7d4cee384a 100644
--- a/pandas/tests/io/formats/test_to_latex.py
+++ b/pandas/tests/io/formats/test_to_latex.py
@@ -221,6 +221,28 @@ def test_to_latex_multiindex(self):
 
         assert result == expected
 
+    def test_to_latex_multiindex_dupe_level(self):
+        # see gh-14484
+        #
+        # If an index is repeated in subsequent rows, it should be
+        # replaced with a blank in the created table. This should
+        # ONLY happen if all higher order indices (to the left) are
+        # equal too. In this test, 'c' has to be printed both times
+        # because the higher order index 'A' != 'B'.
+        df = pd.DataFrame(index=pd.MultiIndex.from_tuples(
+            [('A', 'c'), ('B', 'c')]), columns=['col'])
+        result = df.to_latex()
+        expected = r"""\begin{tabular}{lll}
+\toprule
+  &   & col \\
+\midrule
+A & c & NaN \\
+B & c & NaN \\
+\bottomrule
+\end{tabular}
+"""
+        assert result == expected
+
     def test_to_latex_multicolumnrow(self):
         df = pd.DataFrame({
             ('c1', 0): dict((x, x) for x in range(5)),
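What the fix changes in practice, sketched (with the patch applied, both 'c'
labels are printed because the outer level differs between the rows):

    import pandas as pd

    idx = pd.MultiIndex.from_tuples([('A', 'c'), ('B', 'c')])
    df = pd.DataFrame({'col': [1.0, 2.0]}, index=idx)
    print(df.to_latex())
    # A & c & 1.0 \\
    # B & c & 2.0 \\   <- the second 'c' is no longer blanked out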
From 966c80ad1593f0923fa061c942ba491dbcb2a273 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 8 Dec 2017 07:40:57 -0500
Subject: [PATCH 82/85] TST: xfail unreliable parallel coordinates sorted label
 test (#18688)

---
 pandas/tests/plotting/test_misc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py
index 6f476553091d9..54a512d14fef4 100644
--- a/pandas/tests/plotting/test_misc.py
+++ b/pandas/tests/plotting/test_misc.py
@@ -201,6 +201,7 @@ def test_parallel_coordinates(self):
         with tm.assert_produces_warning(FutureWarning):
             parallel_coordinates(df, 'Name', colors=colors)
 
+    @pytest.mark.xfail(reason="unreliable test")
     def test_parallel_coordinates_with_sorted_labels(self):
         """ For #15908 """
         from pandas.plotting import parallel_coordinates
From e1b4c05186a51971c57e106476d56cc40e43ea5d Mon Sep 17 00:00:00 2001
From: Michael Waskom
Date: Sun, 26 Nov 2017 10:13:24 -0500
Subject: [PATCH 83/85] Improved description of seaborn (#18495)

---
 doc/source/ecosystem.rst | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index 991ed3bfd98dd..69913b2c1fbd8 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -53,6 +53,18 @@ the latest web technologies. Its goal is to provide elegant, concise constructio
 graphics in the style of Protovis/D3, while delivering high-performance interactivity over
 large data to thin clients.
 
+`seaborn <https://seaborn.pydata.org>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Seaborn is a Python visualization library based on `matplotlib
+<https://matplotlib.org>`__. It provides a high-level, dataset-oriented
+interface for creating attractive statistical graphics. The plotting functions
+in seaborn understand pandas objects and leverage pandas grouping operations
+internally to support concise specification of complex visualizations. Seaborn
+also goes beyond matplotlib and pandas with the option to perform statistical
+estimation while plotting, aggregating across observations and visualizing the
+fit of statistical models to emphasize patterns in a dataset.
+
 `yhat/ggplot <https://github.com/yhat/ggplot>`__
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -64,15 +76,6 @@ but a faithful implementation for python users has long been missing. Although
 (as of Jan-2014), the `yhat/ggplot <https://github.com/yhat/ggplot>`__ project has been
 progressing quickly in that direction.
 
-`Seaborn <https://seaborn.pydata.org>`__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Although pandas has quite a bit of "just plot it" functionality built-in, visualization and
-in particular statistical graphics is a vast field with a long tradition and lots of ground
-to cover. The `Seaborn <https://seaborn.pydata.org>`__ project builds on top of pandas
-and `matplotlib <https://matplotlib.org>`__ to provide easy plotting of data which extends to
-more advanced types of plots then those offered by pandas.
-
 `Vincent <https://github.com/wrobstory/vincent>`__
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
From 9e36f21a3a06479290f6b5a20cfa22635a41ae6a Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 8 Dec 2017 16:51:41 -0600
Subject: [PATCH 84/85] CLN: pep8 error

---
 pandas/io/parsers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index e4b221b1768ec..df8b1b5cca1d3 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -1680,8 +1680,8 @@ def __init__(self, src, **kwds):
 
         ParserBase.__init__(self, kwds)
 
-        if (kwds.get('compression') is None
-           and 'utf-16' in (kwds.get('encoding') or '')):
+        if (kwds.get('compression') is None and
+                'utf-16' in (kwds.get('encoding') or '')):
             # if source is utf-16 plain text, convert source to utf-8
             if isinstance(src, compat.string_types):
                 src = open(src, 'rb')
From 24e6fa8812cbd0d72ea61e0c3a7cbca15a3ae525 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sat, 9 Dec 2017 14:21:45 -0600
Subject: [PATCH 85/85] Fix merge error

---
 doc/source/whatsnew/v0.21.1.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
index 2cf36cbbcaefd..00726a4606cf7 100644
--- a/doc/source/whatsnew/v0.21.1.txt
+++ b/doc/source/whatsnew/v0.21.1.txt
@@ -83,6 +83,9 @@ Documentation Changes
 
 .. _whatsnew_0211.bug_fixes:
 
+Bug Fixes
+~~~~~~~~~
+
 Conversion
 ^^^^^^^^^^