diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
index 30cdb06b28487..6714398084186 100644
--- a/doc/source/ecosystem.rst
+++ b/doc/source/ecosystem.rst
@@ -38,7 +38,10 @@ Statsmodels leverages pandas objects as the underlying data container for comput
Use pandas DataFrames in your `scikit-learn `__
ML pipeline.
+`Featuretools `__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community.
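+
+A minimal sketch of that workflow (assuming featuretools' ``dfs`` entry point
+and its bundled demo data; see the featuretools documentation for the
+authoritative API):
+
+.. code-block:: python
+
+ import featuretools as ft
+
+ # build an EntitySet from featuretools' demo relational data
+ es = ft.demo.load_mock_customer(return_entityset=True)
+
+ # run Deep Feature Synthesis to stack primitives into a feature matrix
+ feature_matrix, feature_defs = ft.dfs(entityset=es, target_entity="customers")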
.. _ecosystem.visualization:
diff --git a/doc/source/io.rst b/doc/source/io.rst
index aa2484b0cb5c3..d818f486ad62d 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -4719,14 +4719,8 @@ writes ``data`` to the database in batches of 1000 rows at a time:
data.to_sql('data_chunked', engine, chunksize=1000)
-.. note::
-
- The function :func:`~pandas.DataFrame.to_sql` will perform a multivalue
- insert if the engine dialect ``supports_multivalues_insert``. This will
- greatly speed up the insert in some cases.
-
SQL data types
++++++++++++++
:func:`~pandas.DataFrame.to_sql` will try to map your data to an appropriate
SQL data type based on the dtype of the data. When you have columns of dtype
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index b3c1dbc86525d..db25bcf8113f5 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -10,19 +10,69 @@ and bug fixes. We recommend that all users upgrade to this version.
:local:
:backlinks: none
-.. _whatsnew_0231.enhancements:
-
-New features
-~~~~~~~~~~~~
-
-
-.. _whatsnew_0231.deprecations:
-
-Deprecations
-~~~~~~~~~~~~
-
--
--
+.. _whatsnew_0231.fixed_regressions:
+
+Fixed Regressions
+~~~~~~~~~~~~~~~~~
+
+**Comparing Series with datetime.date**
+
+We've reverted a 0.23.0 change to comparing a :class:`Series` holding datetimes and a ``datetime.date`` object (:issue:`21152`).
+In pandas 0.22 and earlier, comparing a Series holding datetimes and ``datetime.date`` objects would coerce the ``datetime.date`` to a datetime before comparing.
+This was inconsistent with Python, NumPy, and :class:`DatetimeIndex`, which never consider a datetime and ``datetime.date`` equal.
+
+In 0.23.0, we unified operations between DatetimeIndex and Series, and in the process changed comparisons between a Series of datetimes and ``datetime.date`` without warning.
+
+We've temporarily restored the 0.22.0 behavior, so datetimes and dates may again compare equal, but will restore the 0.23.0 behavior in a future release.
+
+To summarize, here's the behavior in 0.22.0, 0.23.0, and 0.23.1:
+
+.. code-block:: python
+
+ # 0.22.0... Silently coerce the datetime.date
+ >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1)
+ 0 True
+ 1 False
+ dtype: bool
+
+ # 0.23.0... Do not coerce the datetime.date
+ >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1)
+ 0 False
+ 1 False
+ dtype: bool
+
+ # 0.23.1... Coerce the datetime.date with a warning
+ >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1)
+ /bin/python:1: FutureWarning: Comparing Series of datetimes with 'datetime.date'. Currently, the
+ 'datetime.date' is coerced to a datetime. In the future pandas will
+ not coerce, and the values will not compare equal to the 'datetime.date'.
+ To retain the current behavior, convert the 'datetime.date' to a
+ datetime with 'pd.Timestamp'.
+ 0 True
+ 1 False
+ dtype: bool
+
+In addition, ordering comparisons will raise a ``TypeError`` in the future.
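+
+To keep the current result and silence the warning, convert the
+``datetime.date`` to a datetime first, as the warning suggests. A small
+illustration of that workaround (an added sketch, not part of the original
+release note):
+
+.. code-block:: python
+
+ >>> Series(pd.date_range('2017', periods=2)) == pd.Timestamp(datetime.date(2017, 1, 1))
+ 0 True
+ 1 False
+ dtype: bool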
+
+**Other Fixes**
+
+- Reverted the ability of :func:`~DataFrame.to_sql` to perform multivalue
inserts, as this caused a regression in certain cases (:issue:`21103`).
In the future this will be made configurable.
+- Fixed regression in the :attr:`DatetimeIndex.date` and :attr:`DatetimeIndex.time`
attributes in case of timezone-aware data: :attr:`DatetimeIndex.time` returned
a tz-aware time instead of a tz-naive one (:issue:`21267`), and :attr:`DatetimeIndex.date`
returned an incorrect date when the input datetime had a non-UTC timezone
(:issue:`21230`); see the example following this list.
+- Fixed regression in :meth:`pandas.io.json.json_normalize` when called with ``None`` values
in nested levels in JSON; keys whose value is ``None`` are no longer dropped (:issue:`21158`, :issue:`21356`).
+- Bug in :meth:`~DataFrame.to_csv` that caused an encoding error when both compression and encoding were specified (:issue:`21241`, :issue:`21118`)
+- Bug preventing pandas from being importable with ``-OO`` optimization (:issue:`21071`)
+- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when the individual categories are iterable and ``value`` is an iterable (:issue:`21097`, :issue:`19788`)
+- Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`)
+- Fixed regression in :func:`pivot_table` where an ordered ``Categorical`` with missing
values for the pivot's ``index`` would give a misaligned result (:issue:`21133`)
+- Fixed regression in merging on boolean index/columns (:issue:`21119`).
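+
+As an illustration of the restored :attr:`DatetimeIndex.date` and
+:attr:`DatetimeIndex.time` behavior referenced above (a sketch mirroring the
+regression tests for these issues, not an excerpt from the original release
+note):
+
+.. code-block:: python
+
+ >>> idx = pd.DatetimeIndex(['2018-06-04 10:20:30'], tz='US/Eastern')
+ >>> idx.time # returns tz-naive wall times again
+ array([datetime.time(10, 20, 30)], dtype=object)
+ >>> idx.date # the local date rather than the UTC date
+ array([datetime.date(2018, 6, 4)], dtype=object)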
.. _whatsnew_0231.performance:
@@ -30,82 +80,56 @@ Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`)
--
--
-
-Documentation Changes
-~~~~~~~~~~~~~~~~~~~~~
+- Improved performance of :meth:`CategoricalIndex.is_unique` (:issue:`21107`)
--
--
.. _whatsnew_0231.bug_fixes:
Bug Fixes
~~~~~~~~~
-Groupby/Resample/Rolling
-^^^^^^^^^^^^^^^^^^^^^^^^
+**Groupby/Resample/Rolling**
- Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`)
- Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`)
- Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True``
+- Bug in :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling` which incorrectly accepted a 0 window size rather than raising (:issue:`21286`)
-Strings
-^^^^^^^
+**Data-type specific**
- Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue: `21078`)
-
-Timedelta
-^^^^^^^^^
- Bug in :class:`Timedelta`: where passing a float with a unit would prematurely round the float precision (:issue: `14156`)
+- Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`)
-Categorical
-^^^^^^^^^^^
-
-- Bug in :func:`pandas.util.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`)
-- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`)
-
-Sparse
-^^^^^^
+**Sparse**
- Bug in :attr:`SparseArray.shape` which previously only returned the shape :attr:`SparseArray.sp_values` (:issue:`21126`)
-Conversion
-^^^^^^^^^^
-
--
--
-
-Indexing
-^^^^^^^^
+**Indexing**
- Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`)
- Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`)
- Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`)
--
+- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, :issue:`21253`)
+- Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`)
-I/O
-^^^
+**Plotting**
-- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
-- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
--
+- New keywords ``sharex`` and ``sharey`` to turn on/off sharing of x/y-axes for subplots generated with ``DataFrame.groupby().boxplot()`` (:issue:`20968`)
-Plotting
-^^^^^^^^
+**I/O**
--
--
+- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
+- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
+- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
+- Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`)
-Reshaping
-^^^^^^^^^
+**Reshaping**
- Bug in :func:`concat` where error was raised in concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`)
--
+- Bug in :func:`concat` warning message providing the wrong guidance for future behavior (:issue:`21101`)
-Other
-^^^^^
+**Other**
- Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`)
-- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`)
+- Bug preventing pandas from being used on Windows without the C++ redistributable installed (:issue:`21106`)
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index 17453d8af1297..0f58cfa761f21 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -77,7 +77,7 @@ cdef inline object create_time_from_ts(
int64_t value, pandas_datetimestruct dts,
object tz, object freq):
""" convenience routine to construct a datetime.time from its parts """
- return time(dts.hour, dts.min, dts.sec, dts.us, tz)
+ return time(dts.hour, dts.min, dts.sec, dts.us)
def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None,
diff --git a/pandas/conftest.py b/pandas/conftest.py
index b09cb872a12fb..d5f399c7cd63d 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -105,6 +105,16 @@ def compression(request):
return request.param
+@pytest.fixture(params=['gzip', 'bz2', 'zip',
+ pytest.param('xz', marks=td.skip_if_no_lzma)])
+def compression_only(request):
+ """
+ Fixture for trying common compression types in compression tests,
+ excluding the uncompressed case
+ """
+ return request.param
+
+
@pytest.fixture(scope='module')
def datetime_tz_utc():
from datetime import timezone
@@ -149,3 +159,14 @@ def tz_aware_fixture(request):
Fixture for trying explicit timezones: {0}
"""
return request.param
+
+
+@pytest.fixture(params=[str, 'str', 'U'])
+def string_dtype(request):
+ """Parametrized fixture for string dtypes.
+
+ * str
+ * 'str'
+ * 'U'
+ """
+ return request.param
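+
+
+# Illustrative usage only (hypothetical test, not part of this change): a
+# test that takes ``string_dtype`` as an argument runs once per parameter
+# listed above, e.g.
+#
+#     def test_example(string_dtype):
+#         assert pd.Series(['a'], dtype=string_dtype).dtype == object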
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index e4ed6d544d42e..ebc7a13234a98 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1227,3 +1227,45 @@ def construct_1d_object_array_from_listlike(values):
result = np.empty(len(values), dtype='object')
result[:] = values
return result
+
+
+def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):
+ """
+ Construct a new ndarray, coercing `values` to `dtype`, preserving NA.
+
+ Parameters
+ ----------
+ values : Sequence
+ dtype : numpy.dtype, optional
+ copy : bool, default False
+ Note that copies may still be made with ``copy=False`` if casting
+ is required.
+
+ Returns
+ -------
+ arr : ndarray[dtype]
+
+ Examples
+ --------
+ >>> np.array([1.0, 2.0, None], dtype='str')
+ array(['1.0', '2.0', 'None'], dtype='<U4')
+
+ >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype='str')
+ array(['1.0', '2.0', None], dtype=object)
+ """
+ subarr = np.array(values, dtype=dtype, copy=copy)
+
+ if dtype is not None and dtype.kind in ("U", "S"):
+ # GH-21083
+ # We can't just return np.array(subarr, dtype='str') since
+ # NumPy will convert the non-string objects into strings,
+ # including NA values. So we have to go
+ # string -> object -> update NA, which requires an
+ # additional pass over the data.
+ na_values = isna(values)
+ subarr2 = subarr.astype(object)
+ subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
+ subarr = subarr2
+
+ return subarr
diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
index f9501cd2f9ddf..6f4fdfe5bf5cd 100644
--- a/pandas/core/indexes/api.py
+++ b/pandas/core/indexes/api.py
@@ -24,9 +24,9 @@
Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
-To accept the future behavior, pass 'sort=True'.
+To accept the future behavior, pass 'sort=False'.
-To retain the current behavior and silence the warning, pass sort=False
+To retain the current behavior and silence the warning, pass 'sort=True'.
""")
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 78b7ae7054248..150eca32e229d 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -378,7 +378,7 @@ def _engine(self):
# introspection
@cache_readonly
def is_unique(self):
- return not self.duplicated().any()
+ return self._engine.is_unique
@property
def is_monotonic_increasing(self):
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 83950f1d71633..0ddf33cdcae73 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -2032,7 +2032,16 @@ def time(self):
"""
Returns numpy array of datetime.time. The time part of the Timestamps.
"""
- return libts.ints_to_pydatetime(self.asi8, self.tz, box="time")
+
+ # If the Timestamps have a timezone that is not UTC,
+ # convert them into their i8 representation while
+ # keeping their timezone and not using UTC
+ if (self.tz is not None and self.tz is not utc):
+ timestamps = self._local_timestamps()
+ else:
+ timestamps = self.asi8
+
+ return libts.ints_to_pydatetime(timestamps, box="time")
@property
def date(self):
@@ -2040,7 +2049,16 @@ def date(self):
Returns numpy array of python datetime.date objects (namely, the date
part of Timestamps without timezone information).
"""
- return libts.ints_to_pydatetime(self.normalize().asi8, box="date")
+
+ # If the Timestamps have a timezone that is not UTC,
+ # convert them into their i8 representation while
+ # keeping their timezone and not using UTC
+ if (self.tz is not None and self.tz is not utc):
+ timestamps = self._local_timestamps()
+ else:
+ timestamps = self.asi8
+
+ return libts.ints_to_pydatetime(timestamps, box="date")
def normalize(self):
"""
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 8f8d8760583ce..eb9d7efc06c27 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -112,6 +112,10 @@ def maybe_convert_platform_interval(values):
-------
array
"""
+ if is_categorical_dtype(values):
+ # GH 21243/21253
+ values = np.array(values)
+
if isinstance(values, (list, tuple)) and len(values) == 0:
# GH 19016
# empty lists/tuples get object dtype by default, but this is not
diff --git a/pandas/core/ops.py b/pandas/core/ops.py
index e14f82906cd06..540ebeee438f6 100644
--- a/pandas/core/ops.py
+++ b/pandas/core/ops.py
@@ -5,7 +5,10 @@
"""
# necessary to enforce truediv in Python 2.X
from __future__ import division
+import datetime
import operator
+import textwrap
+import warnings
import numpy as np
import pandas as pd
@@ -1197,8 +1200,35 @@ def wrapper(self, other, axis=None):
if is_datetime64_dtype(self) or is_datetime64tz_dtype(self):
# Dispatch to DatetimeIndex to ensure identical
# Series/Index behavior
+ if (isinstance(other, datetime.date) and
+ not isinstance(other, datetime.datetime)):
+ # https://github.com/pandas-dev/pandas/issues/21152
+ # Compatibility for difference between Series comparison w/
+ # datetime and date
+ msg = (
+ "Comparing Series of datetimes with 'datetime.date'. "
+ "Currently, the 'datetime.date' is coerced to a "
+ "datetime. In the future pandas will not coerce, "
+ "and {future}. "
+ "To retain the current behavior, "
+ "convert the 'datetime.date' to a datetime with "
+ "'pd.Timestamp'."
+ )
+
+ if op in {operator.lt, operator.le, operator.gt, operator.ge}:
+ future = "a TypeError will be raised"
+ else:
+ future = (
+ "'the values will not compare equal to the "
+ "'datetime.date'"
+ )
+ msg = '\n'.join(textwrap.wrap(msg.format(future=future)))
+ warnings.warn(msg, FutureWarning, stacklevel=2)
+ other = pd.Timestamp(other)
+
res_values = dispatch_to_index_op(op, self, other,
pd.DatetimeIndex)
+
return self._constructor(res_values, index=self.index,
name=res_name)
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 4d8897fb7c811..d69d79ca9b098 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -28,6 +28,7 @@
is_int_or_datetime_dtype,
is_dtype_equal,
is_bool,
+ is_bool_dtype,
is_list_like,
is_datetimelike,
_ensure_int64,
@@ -974,9 +975,14 @@ def _maybe_coerce_merge_keys(self):
# Check if we are trying to merge on obviously
# incompatible dtypes GH 9780, GH 15800
- elif is_numeric_dtype(lk) and not is_numeric_dtype(rk):
+
+ # boolean values are considered numeric, but merging a bool
+ # column with an object column of booleans is still allowed
+ elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk))
+ and not is_numeric_dtype(rk)):
raise ValueError(msg)
- elif not is_numeric_dtype(lk) and is_numeric_dtype(rk):
+ elif (not is_numeric_dtype(lk)
+ and (is_numeric_dtype(rk) and not is_bool_dtype(rk))):
raise ValueError(msg)
elif is_datetimelike(lk) and not is_datetimelike(rk):
raise ValueError(msg)
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index e02420323704e..9a2ad5d13d77a 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -1,8 +1,10 @@
# pylint: disable=E1103
-from pandas.core.dtypes.common import is_list_like, is_scalar
+from pandas.core.dtypes.common import (
+ is_list_like, is_scalar, is_integer_dtype)
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
+from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.reshape.concat import concat
from pandas.core.series import Series
@@ -79,8 +81,22 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
pass
values = list(values)
- grouped = data.groupby(keys, observed=dropna)
+ # group by the cartesian product of the grouper
+ # if we have a categorical
+ grouped = data.groupby(keys, observed=False)
agged = grouped.agg(aggfunc)
+ if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
+ agged = agged.dropna(how='all')
+
+ # gh-21133
+ # we want to downcast back to the original dtype if the values
+ # were ints: grouping with observed=False introduces NaN entries,
+ # and dropping them coerces the ints to floats
+ for v in [v for v in values if v in data and v in agged]:
+ if (is_integer_dtype(data[v]) and
+ not is_integer_dtype(agged[v])):
+ agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype)
table = agged
if table.index.nlevels > 1:
diff --git a/pandas/core/series.py b/pandas/core/series.py
index c5caafa07fb8e..6975dd8fc918e 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -40,6 +40,7 @@
maybe_convert_platform,
maybe_cast_to_datetime, maybe_castable,
construct_1d_arraylike_from_scalar,
+ construct_1d_ndarray_preserving_na,
construct_1d_object_array_from_listlike)
from pandas.core.dtypes.missing import (
isna,
@@ -4047,7 +4048,8 @@ def _try_cast(arr, take_fast_path):
isinstance(subarr, np.ndarray))):
subarr = construct_1d_object_array_from_listlike(subarr)
elif not is_extension_type(subarr):
- subarr = np.array(subarr, dtype=dtype, copy=copy)
+ subarr = construct_1d_ndarray_preserving_na(subarr, dtype,
+ copy=copy)
except (ValueError, TypeError):
if is_categorical_dtype(dtype):
# We *do* allow casting to categorical, since we know
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 5d50c45fe7eca..44811781837bc 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -2172,9 +2172,9 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
Returns
-------
- concat : str if `other is None`, Series/Index of objects if `others is
- not None`. In the latter case, the result will remain categorical
- if the calling Series/Index is categorical.
+ concat : str or Series/Index of objects
+ If `others` is None, `str` is returned, otherwise a `Series/Index`
+ (same type as caller) of objects is returned.
See Also
--------
diff --git a/pandas/core/window.py b/pandas/core/window.py
index 015e7f7913ed0..9d0f9dc4f75f9 100644
--- a/pandas/core/window.py
+++ b/pandas/core/window.py
@@ -602,8 +602,8 @@ def validate(self):
if isinstance(window, (list, tuple, np.ndarray)):
pass
elif is_integer(window):
- if window < 0:
- raise ValueError("window must be non-negative")
+ if window <= 0:
+ raise ValueError("window must be > 0 ")
try:
import scipy.signal as sig
except ImportError:
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 29b8d29af0808..7f660e2644fa4 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -9,6 +9,7 @@
import numpy as np
from pandas.core.dtypes.missing import notna
+from pandas.core.dtypes.inference import is_file_like
from pandas.core.index import Index, MultiIndex
from pandas import compat
from pandas.compat import (StringIO, range, zip)
@@ -127,14 +128,19 @@ def save(self):
else:
encoding = self.encoding
- if hasattr(self.path_or_buf, 'write'):
- f = self.path_or_buf
- close = False
+ # GH 21241, 21118: write the csv into a string buffer first, then
+ # dump it to the path or file-like output, optionally with
+ # compression (see PR 21300)
+ f = StringIO()
+ if not is_file_like(self.path_or_buf):
+ # path_or_buf is path
+ path_or_buf = self.path_or_buf
+ elif hasattr(self.path_or_buf, 'name'):
+ # path_or_buf is a file handle
+ path_or_buf = self.path_or_buf.name
else:
- f, handles = _get_handle(self.path_or_buf, self.mode,
- encoding=encoding,
- compression=None)
- close = True if self.compression is None else False
+ # path_or_buf is a file-like IO object
+ f = self.path_or_buf
+ path_or_buf = None
try:
writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,18 +157,16 @@ def save(self):
self._save()
finally:
- # GH 17778 handles compression for byte strings.
- if not close and self.compression:
- f.close()
- with open(self.path_or_buf, 'r') as f:
- data = f.read()
- f, handles = _get_handle(self.path_or_buf, self.mode,
+ # GH 17778: zip compression for byte strings is handled separately
+ buf = f.getvalue()
+ if path_or_buf:
+ f, handles = _get_handle(path_or_buf, self.mode,
encoding=encoding,
compression=self.compression)
- f.write(data)
- close = True
- if close:
+ f.write(buf)
f.close()
+ for _fh in handles:
+ _fh.close()
def _save_header(self):
diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
index 549204abd3caf..b845a43b9ca9e 100644
--- a/pandas/io/json/normalize.py
+++ b/pandas/io/json/normalize.py
@@ -80,8 +80,6 @@ def nested_to_record(ds, prefix="", sep=".", level=0):
if level != 0: # so we skip copying for top level, common case
v = new_d.pop(k)
new_d[newkey] = v
- if v is None: # pop the key if the value is None
- new_d.pop(k)
continue
else:
v = new_d.pop(k)
diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py
index 01f7db7d68664..5cea64388bdd7 100644
--- a/pandas/io/json/table_schema.py
+++ b/pandas/io/json/table_schema.py
@@ -296,7 +296,7 @@ def parse_table_schema(json, precise_float):
"""
table = loads(json, precise_float=precise_float)
col_order = [field['name'] for field in table['schema']['fields']]
- df = DataFrame(table['data'])[col_order]
+ df = DataFrame(table['data'], columns=col_order)[col_order]
dtypes = {field['name']: convert_json_field_to_pandas_type(field)
for field in table['schema']['fields']}
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index ccb8d2d99d734..a582d32741ae9 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -572,29 +572,8 @@ def create(self):
else:
self._execute_create()
- def insert_statement(self, data, conn):
- """
- Generate tuple of SQLAlchemy insert statement and any arguments
- to be executed by connection (via `_execute_insert`).
-
- Parameters
- ----------
- conn : SQLAlchemy connectable(engine/connection)
- Connection to recieve the data
- data : list of dict
- The data to be inserted
-
- Returns
- -------
- SQLAlchemy statement
- insert statement
- *, optional
- Additional parameters to be passed when executing insert statement
- """
- dialect = getattr(conn, 'dialect', None)
- if dialect and getattr(dialect, 'supports_multivalues_insert', False):
- return self.table.insert(data),
- return self.table.insert(), data
+ def insert_statement(self):
+ return self.table.insert()
def insert_data(self):
if self.index is not None:
@@ -633,9 +612,8 @@ def insert_data(self):
return column_names, data_list
def _execute_insert(self, conn, keys, data_iter):
- """Insert data into this table with database connection"""
data = [{k: v for k, v in zip(keys, row)} for row in data_iter]
- conn.execute(*self.insert_statement(data, conn))
+ conn.execute(self.insert_statement(), data)
def insert(self, chunksize=None):
keys, data_list = self.insert_data()
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
index 87b7d13251f28..d1a2121597dd6 100644
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -811,7 +811,7 @@ class PlanePlot(MPLPlot):
def __init__(self, data, x, y, **kwargs):
MPLPlot.__init__(self, data, **kwargs)
if x is None or y is None:
- raise ValueError(self._kind + ' requires and x and y column')
+ raise ValueError(self._kind + ' requires an x and y column')
if is_integer(x) and not self.data.columns.holds_integer():
x = self.data.columns[x]
if is_integer(y) and not self.data.columns.holds_integer():
diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py
index 20cd8b43478d2..4a19682e2c558 100644
--- a/pandas/tests/dtypes/test_cast.py
+++ b/pandas/tests/dtypes/test_cast.py
@@ -23,6 +23,7 @@
maybe_convert_scalar,
find_common_type,
construct_1d_object_array_from_listlike,
+ construct_1d_ndarray_preserving_na,
construct_1d_arraylike_from_scalar)
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
@@ -440,3 +441,15 @@ def test_cast_1d_arraylike_from_scalar_categorical(self):
tm.assert_categorical_equal(result, expected,
check_category_order=True,
check_dtype=True)
+
+
+@pytest.mark.parametrize('values, dtype, expected', [
+ ([1, 2, 3], None, np.array([1, 2, 3])),
+ (np.array([1, 2, 3]), None, np.array([1, 2, 3])),
+ (['1', '2', None], None, np.array(['1', '2', None])),
+ (['1', '2', None], np.dtype('str'), np.array(['1', '2', None])),
+ ([1, 2, None], np.dtype('str'), np.array(['1', '2', None])),
+])
+def test_construct_1d_ndarray_preserving_na(values, dtype, expected):
+ result = construct_1d_ndarray_preserving_na(values, dtype=dtype)
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 6dd38187f7277..70dd358248bc4 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -151,6 +151,17 @@ def test_constructor_complex_dtypes(self):
assert a.dtype == df.a.dtype
assert b.dtype == df.b.dtype
+ def test_constructor_dtype_str_na_values(self, string_dtype):
+ # https://github.com/pandas-dev/pandas/issues/21083
+ df = DataFrame({'A': ['x', None]}, dtype=string_dtype)
+ result = df.isna()
+ expected = DataFrame({"A": [False, True]})
+ tm.assert_frame_equal(result, expected)
+ assert df.iloc[1, 0] is None
+
+ df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype)
+ assert np.isnan(df.iloc[1, 0])
+
def test_constructor_rec(self):
rec = self.frame.to_records(index=False)
diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
index 4c9f8c2ea0980..1eeeec0be3b8b 100644
--- a/pandas/tests/frame/test_dtypes.py
+++ b/pandas/tests/frame/test_dtypes.py
@@ -794,22 +794,26 @@ def test_arg_for_errors_in_astype(self):
@pytest.mark.parametrize('input_vals', [
([1, 2]),
- ([1.0, 2.0, np.nan]),
(['1', '2']),
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
(list(pd.date_range('1/1/2011', periods=2, freq='H',
tz='US/Eastern'))),
([pd.Interval(left=0, right=5)]),
])
- def test_constructor_list_str(self, input_vals):
+ def test_constructor_list_str(self, input_vals, string_dtype):
# GH 16605
# Ensure that data elements are converted to strings when
# dtype is str, 'str', or 'U'
- for dtype in ['str', str, 'U']:
- result = DataFrame({'A': input_vals}, dtype=dtype)
- expected = DataFrame({'A': input_vals}).astype({'A': dtype})
- assert_frame_equal(result, expected)
+ result = DataFrame({'A': input_vals}, dtype=string_dtype)
+ expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
+ assert_frame_equal(result, expected)
+
+ def test_constructor_list_str_na(self, string_dtype):
+
+ result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
+ expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
+ assert_frame_equal(result, expected)
class TestDataFrameDatetimeWithTZ(TestData):
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index e4829ebf48561..60dc336a85388 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -919,29 +919,45 @@ def test_to_csv_path_is_none(self):
recons = pd.read_csv(StringIO(csv_str), index_col=0)
assert_frame_equal(self.frame, recons)
- def test_to_csv_compression(self, compression):
-
- df = DataFrame([[0.123456, 0.234567, 0.567567],
- [12.32112, 123123.2, 321321.2]],
- index=['A', 'B'], columns=['X', 'Y', 'Z'])
+ @pytest.mark.parametrize('df,encoding', [
+ (DataFrame([[0.123456, 0.234567, 0.567567],
+ [12.32112, 123123.2, 321321.2]],
+ index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
+ # GH 21241, 21118
+ (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
+ (DataFrame(5 * [[123, u"你好", u"世界"]],
+ columns=['X', 'Y', 'Z']), 'gb2312'),
+ (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
+ columns=['X', 'Y', 'Z']), 'cp737')
+ ])
+ def test_to_csv_compression(self, df, encoding, compression):
with ensure_clean() as filename:
- df.to_csv(filename, compression=compression)
+ df.to_csv(filename, compression=compression, encoding=encoding)
# test the round trip - to_csv -> read_csv
- rs = read_csv(filename, compression=compression,
- index_col=0)
- assert_frame_equal(df, rs)
+ result = read_csv(filename, compression=compression,
+ index_col=0, encoding=encoding)
+
+ with open(filename, 'w') as fh:
+ df.to_csv(fh, compression=compression, encoding=encoding)
+
+ result_fh = read_csv(filename, compression=compression,
+ index_col=0, encoding=encoding)
+ assert_frame_equal(df, result)
+ assert_frame_equal(df, result_fh)
# explicitly make sure file is compressed
with tm.decompress_file(filename, compression) as fh:
- text = fh.read().decode('utf8')
+ text = fh.read().decode(encoding or 'utf8')
for col in df.columns:
assert col in text
with tm.decompress_file(filename, compression) as fh:
- assert_frame_equal(df, read_csv(fh, index_col=0))
+ assert_frame_equal(df, read_csv(fh,
+ index_col=0,
+ encoding=encoding))
def test_to_csv_date_format(self):
with ensure_clean('__tmp_to_csv_date_format__') as path:
diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py
index 09210d8b64d1b..573940edaa08f 100644
--- a/pandas/tests/indexes/datetimes/test_timezones.py
+++ b/pandas/tests/indexes/datetimes/test_timezones.py
@@ -2,7 +2,7 @@
"""
Tests for DatetimeIndex timezone-related methods
"""
-from datetime import datetime, timedelta, tzinfo
+from datetime import datetime, timedelta, tzinfo, date, time
from distutils.version import LooseVersion
import pytest
@@ -706,6 +706,32 @@ def test_join_utc_convert(self, join_type):
assert isinstance(result, DatetimeIndex)
assert result.tz.zone == 'UTC'
+ @pytest.mark.parametrize("dtype", [
+ None, 'datetime64[ns, CET]',
+ 'datetime64[ns, EST]', 'datetime64[ns, UTC]'
+ ])
+ def test_date_accessor(self, dtype):
+ # Regression test for GH#21230
+ expected = np.array([date(2018, 6, 4), pd.NaT])
+
+ index = DatetimeIndex(['2018-06-04 10:00:00', pd.NaT], dtype=dtype)
+ result = index.date
+
+ tm.assert_numpy_array_equal(result, expected)
+
+ @pytest.mark.parametrize("dtype", [
+ None, 'datetime64[ns, CET]',
+ 'datetime64[ns, EST]', 'datetime64[ns, UTC]'
+ ])
+ def test_time_accessor(self, dtype):
+ # Regression test for GH#21267
+ expected = np.array([time(10, 20, 30), pd.NaT])
+
+ index = DatetimeIndex(['2018-06-04 10:20:30', pd.NaT], dtype=dtype)
+ result = index.time
+
+ tm.assert_numpy_array_equal(result, expected)
+
def test_dti_drop_dont_lose_tz(self):
# GH#2621
ind = date_range("2012-12-01", periods=10, tz="utc")
diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py
index 5fdf92dcb2044..b1711c3444586 100644
--- a/pandas/tests/indexes/interval/test_construction.py
+++ b/pandas/tests/indexes/interval/test_construction.py
@@ -6,8 +6,9 @@
from pandas import (
Interval, IntervalIndex, Index, Int64Index, Float64Index, Categorical,
- date_range, timedelta_range, period_range, notna)
+ CategoricalIndex, date_range, timedelta_range, period_range, notna)
from pandas.compat import lzip
+from pandas.core.dtypes.common import is_categorical_dtype
from pandas.core.dtypes.dtypes import IntervalDtype
import pandas.core.common as com
import pandas.util.testing as tm
@@ -111,6 +112,22 @@ def test_constructor_string(self, constructor, breaks):
with tm.assert_raises_regex(TypeError, msg):
constructor(**self.get_kwargs_from_breaks(breaks))
+ @pytest.mark.parametrize('cat_constructor', [
+ Categorical, CategoricalIndex])
+ def test_constructor_categorical_valid(self, constructor, cat_constructor):
+ # GH 21243/21253
+ if isinstance(constructor, partial) and constructor.func is Index:
+ # Index is defined to create CategoricalIndex from categorical data
+ pytest.skip()
+
+ breaks = np.arange(10, dtype='int64')
+ expected = IntervalIndex.from_breaks(breaks)
+
+ cat_breaks = cat_constructor(breaks)
+ result_kwargs = self.get_kwargs_from_breaks(cat_breaks)
+ result = constructor(**result_kwargs)
+ tm.assert_index_equal(result, expected)
+
def test_generic_errors(self, constructor):
# filler input data to be used when supplying invalid kwargs
filler = self.get_kwargs_from_breaks(range(10))
@@ -238,6 +255,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'):
tuples = lzip(breaks[:-1], breaks[1:])
if isinstance(breaks, (list, tuple)):
return {'data': tuples}
+ elif is_categorical_dtype(breaks):
+ return {'data': breaks._constructor(tuples)}
return {'data': com._asarray_tuplesafe(tuples)}
def test_constructor_errors(self):
@@ -286,6 +305,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'):
if isinstance(breaks, list):
return {'data': ivs}
+ elif is_categorical_dtype(breaks):
+ return {'data': breaks._constructor(ivs)}
return {'data': np.array(ivs, dtype=object)}
def test_generic_errors(self, constructor):
diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py
index 0e630f69b1a32..a2a4170256088 100644
--- a/pandas/tests/indexes/test_category.py
+++ b/pandas/tests/indexes/test_category.py
@@ -581,6 +581,15 @@ def test_is_monotonic(self, data, non_lexsorted_data):
assert c.is_monotonic_increasing
assert not c.is_monotonic_decreasing
+ @pytest.mark.parametrize('values, expected', [
+ ([1, 2, 3], True),
+ ([1, 3, 1], False),
+ (list('abc'), True),
+ (list('aba'), False)])
+ def test_is_unique(self, values, expected):
+ ci = CategoricalIndex(values)
+ assert ci.is_unique is expected
+
def test_duplicates(self):
idx = CategoricalIndex([0, 0, 0], name='foo')
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
index 49b39c17238ae..b6483d0e978ba 100644
--- a/pandas/tests/io/json/test_json_table_schema.py
+++ b/pandas/tests/io/json/test_json_table_schema.py
@@ -560,3 +560,16 @@ def test_multiindex(self, index_names):
out = df.to_json(orient="table")
result = pd.read_json(out, orient="table")
tm.assert_frame_equal(df, result)
+
+ @pytest.mark.parametrize("strict_check", [
+ pytest.param(True, marks=pytest.mark.xfail), False])
+ def test_empty_frame_roundtrip(self, strict_check):
+ # GH 21287
+ df = pd.DataFrame([], columns=['a', 'b', 'c'])
+ expected = df.copy()
+ out = df.to_json(orient='table')
+ result = pd.read_json(out, orient='table')
+ # TODO: When DF coercion issue (#21345) is resolved, tighten type checks
+ tm.assert_frame_equal(expected, result,
+ check_dtype=strict_check,
+ check_index_type=strict_check)
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
index 0fabaf747b6de..395c2c90767d3 100644
--- a/pandas/tests/io/json/test_normalize.py
+++ b/pandas/tests/io/json/test_normalize.py
@@ -238,15 +238,16 @@ def test_non_ascii_key(self):
tm.assert_frame_equal(result, expected)
def test_missing_field(self, author_missing_data):
- # GH20030: Checks for robustness of json_normalize - should
- # unnest records where only the first record has a None value
+ # GH20030:
result = json_normalize(author_missing_data)
ex_data = [
- {'author_name.first': np.nan,
+ {'info': np.nan,
+ 'author_name.first': np.nan,
'author_name.last_name': np.nan,
'info.created_at': np.nan,
'info.last_updated': np.nan},
- {'author_name.first': 'Jane',
+ {'info': None,
+ 'author_name.first': 'Jane',
'author_name.last_name': 'Doe',
'info.created_at': '11/08/1993',
'info.last_updated': '26/05/2012'}
@@ -351,9 +352,8 @@ def test_json_normalize_errors(self):
errors='raise'
)
- def test_nonetype_dropping(self):
- # GH20030: Checks that None values are dropped in nested_to_record
- # to prevent additional columns of nans when passed to DataFrame
+ def test_donot_drop_nonevalues(self):
+ # GH21356
data = [
{'info': None,
'author_name':
@@ -367,7 +367,8 @@ def test_nonetype_dropping(self):
]
result = nested_to_record(data)
expected = [
- {'author_name.first': 'Smith',
+ {'info': None,
+ 'author_name.first': 'Smith',
'author_name.last_name': 'Appleseed'},
{'author_name.first': 'Jane',
'author_name.last_name': 'Doe',
@@ -375,3 +376,61 @@ def test_nonetype_dropping(self):
'info.last_updated': '26/05/2012'}]
assert result == expected
+
+ def test_nonetype_top_level_bottom_level(self):
+ # GH21158: if an inner-level json key has a null value, make sure
+ # nested_to_record doesn't call new_d.pop twice and raise a KeyError
+ data = {
+ "id": None,
+ "location": {
+ "country": {
+ "state": {
+ "id": None,
+ "town.info": {
+ "id": None,
+ "region": None,
+ "x": 49.151580810546875,
+ "y": -33.148521423339844,
+ "z": 27.572303771972656}}}
+ }
+ }
+ result = nested_to_record(data)
+ expected = {
+ 'id': None,
+ 'location.country.state.id': None,
+ 'location.country.state.town.info.id': None,
+ 'location.country.state.town.info.region': None,
+ 'location.country.state.town.info.x': 49.151580810546875,
+ 'location.country.state.town.info.y': -33.148521423339844,
+ 'location.country.state.town.info.z': 27.572303771972656}
+ assert result == expected
+
+ def test_nonetype_multiple_levels(self):
+ # GH21158: if an inner-level json key has a null value, make sure
+ # nested_to_record doesn't call new_d.pop twice and raise a KeyError
+ data = {
+ "id": None,
+ "location": {
+ "id": None,
+ "country": {
+ "id": None,
+ "state": {
+ "id": None,
+ "town.info": {
+ "region": None,
+ "x": 49.151580810546875,
+ "y": -33.148521423339844,
+ "z": 27.572303771972656}}}
+ }
+ }
+ result = nested_to_record(data)
+ expected = {
+ 'id': None,
+ 'location.id': None,
+ 'location.country.id': None,
+ 'location.country.state.id': None,
+ 'location.country.state.town.info.region': None,
+ 'location.country.state.town.info.x': 49.151580810546875,
+ 'location.country.state.town.info.y': -33.148521423339844,
+ 'location.country.state.town.info.z': 27.572303771972656}
+ assert result == expected
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index 4530cc9d2fba9..f3ab74d37a2bc 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -1665,29 +1665,6 @@ class Temporary(Base):
tm.assert_frame_equal(df, expected)
- def test_insert_multivalues(self):
- # issues addressed
- # https://github.com/pandas-dev/pandas/issues/14315
- # https://github.com/pandas-dev/pandas/issues/8953
-
- db = sql.SQLDatabase(self.conn)
- df = DataFrame({'A': [1, 0, 0], 'B': [1.1, 0.2, 4.3]})
- table = sql.SQLTable("test_table", db, frame=df)
- data = [
- {'A': 1, 'B': 0.46},
- {'A': 0, 'B': -2.06}
- ]
- statement = table.insert_statement(data, conn=self.conn)[0]
-
- if self.supports_multivalues_insert:
- assert statement.parameters == data, (
- 'insert statement should be multivalues'
- )
- else:
- assert statement.parameters is None, (
- 'insert statement should not be multivalues'
- )
-
class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy):
@@ -1702,7 +1679,6 @@ class _TestSQLiteAlchemy(object):
"""
flavor = 'sqlite'
- supports_multivalues_insert = True
@classmethod
def connect(cls):
@@ -1751,7 +1727,6 @@ class _TestMySQLAlchemy(object):
"""
flavor = 'mysql'
- supports_multivalues_insert = True
@classmethod
def connect(cls):
@@ -1821,7 +1796,6 @@ class _TestPostgreSQLAlchemy(object):
"""
flavor = 'postgresql'
- supports_multivalues_insert = True
@classmethod
def connect(cls):
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 8e639edd34b18..037bd9cc7cd18 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -1526,6 +1526,27 @@ def test_merge_on_ints_floats_warning(self):
result = B.merge(A, left_on='Y', right_on='X')
assert_frame_equal(result, expected[['Y', 'X']])
+ def test_merge_incompat_infer_boolean_object(self):
+ # GH21119: bool + object bool merge OK
+ df1 = DataFrame({'key': Series([True, False], dtype=object)})
+ df2 = DataFrame({'key': [True, False]})
+
+ expected = DataFrame({'key': [True, False]}, dtype=object)
+ result = pd.merge(df1, df2, on='key')
+ assert_frame_equal(result, expected)
+ result = pd.merge(df2, df1, on='key')
+ assert_frame_equal(result, expected)
+
+ # with missing value
+ df1 = DataFrame({'key': Series([True, False, np.nan], dtype=object)})
+ df2 = DataFrame({'key': [True, False]})
+
+ expected = DataFrame({'key': [True, False]}, dtype=object)
+ result = pd.merge(df1, df2, on='key')
+ assert_frame_equal(result, expected)
+ result = pd.merge(df2, df1, on='key')
+ assert_frame_equal(result, expected)
+
@pytest.mark.parametrize('df1_vals, df2_vals', [
([0, 1, 2], ["0", "1", "2"]),
([0.0, 1.0, 2.0], ["0", "1", "2"]),
@@ -1538,6 +1559,8 @@ def test_merge_on_ints_floats_warning(self):
pd.date_range('20130101', periods=3, tz='US/Eastern')),
([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
+ # TODO ([0, 1], pd.Series([False, True], dtype=bool)),
+ ([0, 1], pd.Series([False, True], dtype=object))
])
def test_merge_incompat_dtypes(self, df1_vals, df2_vals):
# GH 9780, GH 15800
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index d2cf3fc11e165..3ec60d50f2792 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
from datetime import datetime, date, timedelta
@@ -16,6 +17,11 @@
from pandas.api.types import CategoricalDtype as CDT
+@pytest.fixture(params=[True, False])
+def dropna(request):
+ return request.param
+
+
class TestPivotTable(object):
def setup_method(self, method):
@@ -109,7 +115,6 @@ def test_pivot_table_categorical(self):
index=exp_index)
tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize('dropna', [True, False])
def test_pivot_table_dropna_categoricals(self, dropna):
# GH 15193
categories = ['a', 'b', 'c', 'd']
@@ -137,6 +142,25 @@ def test_pivot_table_dropna_categoricals(self, dropna):
tm.assert_frame_equal(result, expected)
+ def test_pivot_with_non_observable_dropna(self, dropna):
+ # gh-21133
+ df = pd.DataFrame(
+ {'A': pd.Categorical([np.nan, 'low', 'high', 'low', 'high'],
+ categories=['low', 'high'],
+ ordered=True),
+ 'B': range(5)})
+
+ result = df.pivot_table(index='A', values='B', dropna=dropna)
+ expected = pd.DataFrame(
+ {'B': [2, 3]},
+ index=pd.Index(
+ pd.Categorical.from_codes([0, 1],
+ categories=['low', 'high'],
+ ordered=True),
+ name='A'))
+
+ tm.assert_frame_equal(result, expected)
+
def test_pass_array(self):
result = self.data.pivot_table(
'D', index=self.data.A, columns=self.data.C)
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index ec0d7296e540e..95836f046195a 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -88,6 +88,46 @@ def test_ser_cmp_result_names(self, names, op):
class TestTimestampSeriesComparison(object):
+ def test_dt64_ser_cmp_date_warning(self):
+ # https://github.com/pandas-dev/pandas/issues/21359
+ # Remove this test and enable the invalid test below
+ ser = pd.Series(pd.date_range('20010101', periods=10), name='dates')
+ date = ser.iloc[0].to_pydatetime().date()
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser == date
+ expected = pd.Series([True] + [False] * 9, name='dates')
+ tm.assert_series_equal(result, expected)
+ assert "Comparing Series of datetimes " in str(m[0].message)
+ assert "will not compare equal" in str(m[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser != date
+ tm.assert_series_equal(result, ~expected)
+ assert "will not compare equal" in str(m[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser <= date
+ tm.assert_series_equal(result, expected)
+ assert "a TypeError will be raised" in str(m[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser < date
+ tm.assert_series_equal(result, pd.Series([False] * 10, name='dates'))
+ assert "a TypeError will be raised" in str(m[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser >= date
+ tm.assert_series_equal(result, pd.Series([True] * 10, name='dates'))
+ assert "a TypeError will be raised" in str(m[0].message)
+
+ with tm.assert_produces_warning(FutureWarning) as m:
+ result = ser > date
+ tm.assert_series_equal(result, pd.Series([False] + [True] * 9,
+ name='dates'))
+ assert "a TypeError will be raised" in str(m[0].message)
+
+ @pytest.mark.skip(reason="GH-21359")
def test_dt64ser_cmp_date_invalid(self):
# GH#19800 datetime.date comparison raises to
# match DatetimeIndex/Timestamp. This also matches the behavior
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 7e59325c32ddc..906d2aacd5586 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -137,6 +137,17 @@ def test_constructor_no_data_index_order(self):
result = pd.Series(index=['b', 'a', 'c'])
assert result.index.tolist() == ['b', 'a', 'c']
+ def test_constructor_dtype_str_na_values(self, string_dtype):
+ # https://github.com/pandas-dev/pandas/issues/21083
+ ser = Series(['x', None], dtype=string_dtype)
+ result = ser.isna()
+ expected = Series([False, True])
+ tm.assert_series_equal(result, expected)
+ assert ser.iloc[1] is None
+
+ ser = Series(['x', np.nan], dtype=string_dtype)
+ assert np.isnan(ser.iloc[1])
+
def test_constructor_series(self):
index1 = ['d', 'b', 'a', 'c']
index2 = sorted(index1)
@@ -164,22 +175,25 @@ def test_constructor_list_like(self):
@pytest.mark.parametrize('input_vals', [
([1, 2]),
- ([1.0, 2.0, np.nan]),
(['1', '2']),
(list(pd.date_range('1/1/2011', periods=2, freq='H'))),
(list(pd.date_range('1/1/2011', periods=2, freq='H',
tz='US/Eastern'))),
([pd.Interval(left=0, right=5)]),
])
- def test_constructor_list_str(self, input_vals):
+ def test_constructor_list_str(self, input_vals, string_dtype):
# GH 16605
# Ensure that data elements from a list are converted to strings
# when dtype is str, 'str', or 'U'
+ result = Series(input_vals, dtype=string_dtype)
+ expected = Series(input_vals).astype(string_dtype)
+ assert_series_equal(result, expected)
- for dtype in ['str', str, 'U']:
- result = Series(input_vals, dtype=dtype)
- expected = Series(input_vals).astype(dtype)
- assert_series_equal(result, expected)
+ def test_constructor_list_str_na(self, string_dtype):
+ result = Series([1.0, 2.0, np.nan], dtype=string_dtype)
+ expected = Series(['1.0', '2.0', np.nan], dtype=object)
+ assert_series_equal(result, expected)
+ assert np.isnan(result[2])
def test_constructor_generator(self):
gen = (i for i in range(10))
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index 0b0d4334c86a3..76dd4bc1f3d4a 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -138,29 +138,45 @@ def test_to_csv_path_is_none(self):
csv_str = s.to_csv(path=None)
assert isinstance(csv_str, str)
- def test_to_csv_compression(self, compression):
-
- s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
- name='X')
+ @pytest.mark.parametrize('s,encoding', [
+ (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
+ name='X'), None),
+ # GH 21241, 21118
+ (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
+ (Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
+ (Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')
+ ])
+ def test_to_csv_compression(self, s, encoding, compression):
with ensure_clean() as filename:
- s.to_csv(filename, compression=compression, header=True)
+ s.to_csv(filename, compression=compression, encoding=encoding,
+ header=True)
# test the round trip - to_csv -> read_csv
- rs = pd.read_csv(filename, compression=compression,
- index_col=0, squeeze=True)
- assert_series_equal(s, rs)
+ result = pd.read_csv(filename, compression=compression,
+ encoding=encoding, index_col=0, squeeze=True)
+
+ with open(filename, 'w') as fh:
+ s.to_csv(fh, compression=compression, encoding=encoding,
+ header=True)
+
+ result_fh = pd.read_csv(filename, compression=compression,
+ encoding=encoding, index_col=0,
+ squeeze=True)
+ assert_series_equal(s, result)
+ assert_series_equal(s, result_fh)
# explicitly ensure file was compressed
with tm.decompress_file(filename, compression) as fh:
- text = fh.read().decode('utf8')
+ text = fh.read().decode(encoding or 'utf8')
assert s.name in text
with tm.decompress_file(filename, compression) as fh:
assert_series_equal(s, pd.read_csv(fh,
index_col=0,
- squeeze=True))
+ squeeze=True,
+ encoding=encoding))
class TestSeriesIO(TestData):
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index bb7ee1b911fee..3443331e3d4ba 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -241,3 +241,26 @@ def test_compression_size(obj, method, compression):
getattr(obj, method)(filename, compression=None)
uncompressed = os.path.getsize(filename)
assert uncompressed > compressed
+
+
+@pytest.mark.parametrize('obj', [
+ DataFrame(100 * [[0.123456, 0.234567, 0.567567],
+ [12.32112, 123123.2, 321321.2]],
+ columns=['X', 'Y', 'Z']),
+ Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
+@pytest.mark.parametrize('method', ['to_csv'])
+def test_compression_size_fh(obj, method, compression_only):
+
+ with tm.ensure_clean() as filename:
+ with open(filename, 'w') as fh:
+ getattr(obj, method)(fh, compression=compression_only)
+ assert not fh.closed
+ assert fh.closed
+ compressed = os.path.getsize(filename)
+ with tm.ensure_clean() as filename:
+ with open(filename, 'w') as fh:
+ getattr(obj, method)(fh, compression=None)
+ assert not fh.closed
+ assert fh.closed
+ uncompressed = os.path.getsize(filename)
+ assert uncompressed > compressed
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index c2d09c6d49e86..afd7993fefc70 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -103,7 +103,6 @@ def test_pandas_datareader():
'F', 'quandl', '2017-01-01', '2017-02-01')
-@pytest.mark.xfail(reaason="downstream install issue")
def test_geopandas():
geopandas = import_module('geopandas') # noqa
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index 74f2c977e0db2..cfd88f41f855e 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -389,8 +389,8 @@ def test_constructor(self, which):
c(window=2, min_periods=1, center=False)
# GH 13383
- c(0)
with pytest.raises(ValueError):
+ c(0)
c(-1)
# not valid
@@ -409,7 +409,6 @@ def test_constructor_with_win_type(self, which):
# GH 13383
o = getattr(self, which)
c = o.rolling
- c(0, win_type='boxcar')
with pytest.raises(ValueError):
c(-1, win_type='boxcar')
diff --git a/setup.py b/setup.py
index 6febe674fb2a1..90ec8e91a0700 100755
--- a/setup.py
+++ b/setup.py
@@ -453,10 +453,10 @@ def pxd(name):
return pjoin('pandas', name + '.pxd')
-# args to ignore warnings
if is_platform_windows():
extra_compile_args = []
else:
+ # args to ignore warnings
extra_compile_args = ['-Wno-unused-function']
lib_depends = lib_depends + ['pandas/_libs/src/numpy_helper.h',
@@ -733,7 +733,7 @@ def pxd(name):
maintainer=AUTHOR,
version=versioneer.get_version(),
packages=find_packages(include=['pandas', 'pandas.*']),
- package_data={'': ['data/*', 'templates/*'],
+ package_data={'': ['data/*', 'templates/*', '_libs/*.dll'],
'pandas.tests.io': ['data/legacy_hdf/*.h5',
'data/legacy_pickle/*/*.pickle',
'data/legacy_msgpack/*/*.msgpack',