Skip to content

Commit 833afea

Browse files
author
Pyry Kovanen
committed
Merge remote-tracking branch 'upstream/master' into empty-json-empty-df-fix
2 parents 743c08f + 0c65c57 commit 833afea

File tree

18 files changed

+192
-94
lines changed

18 files changed

+192
-94
lines changed

asv_bench/benchmarks/frame_methods.py

+18
Original file line numberDiff line numberDiff line change
@@ -512,3 +512,21 @@ def time_nlargest(self, keep):
512512

513513
def time_nsmallest(self, keep):
514514
self.df.nsmallest(100, 'A', keep=keep)
515+
516+
517+
class Describe(object):
518+
519+
goal_time = 0.2
520+
521+
def setup(self):
522+
self.df = DataFrame({
523+
'a': np.random.randint(0, 100, int(1e6)),
524+
'b': np.random.randint(0, 100, int(1e6)),
525+
'c': np.random.randint(0, 100, int(1e6))
526+
})
527+
528+
def time_series_describe(self):
529+
self.df['a'].describe()
530+
531+
def time_dataframe_describe(self):
532+
self.df.describe()

doc/source/ecosystem.rst

+3
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,10 @@ Statsmodels leverages pandas objects as the underlying data container for comput
3838
Use pandas DataFrames in your `scikit-learn <http://scikit-learn.org/>`__
3939
ML pipeline.
4040

41+
`Featuretools <https://github.com/featuretools/featuretools/>`__
42+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4143

44+
Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community.
4245

4346
.. _ecosystem.visualization:
4447

doc/source/whatsnew/v0.23.1.txt

+6-1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Performance Improvements
3232
~~~~~~~~~~~~~~~~~~~~~~~~
3333

3434
- Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`)
35+
- Improved performance of :meth:`CategoricalIndex.is_unique` (:issue:`21107`)
3536
-
3637
-
3738

@@ -85,14 +86,18 @@ Indexing
8586
- Bug in :meth:`Series.reset_index` where appropriate error was not raised with an invalid level name (:issue:`20925`)
8687
- Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`)
8788
- Bug in :meth:`MultiIndex.set_names` where error raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`)
89+
- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, :issue:`21253`)
90+
- Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`)
8891
-
8992

9093
I/O
9194
^^^
9295

9396
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
9497
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
95-
- Bug in IO JSON :func:`read_json`reading empty JSON schema with ``orient='table'`` back to :class:DataFrame caused an error (:issue:`21287`)
98+
- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` caused an encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
99+
- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
100+
- Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`)
96101

97102
Plotting
98103
^^^^^^^^

doc/source/whatsnew/v0.24.0.txt

+1-2
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,7 @@ Removal of prior version deprecations/changes
6363
Performance Improvements
6464
~~~~~~~~~~~~~~~~~~~~~~~~
6565

66-
-
67-
-
66+
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
6867
-
6968

7069
.. _whatsnew_0240.docs:

pandas/core/generic.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8519,7 +8519,7 @@ def describe_numeric_1d(series):
85198519
stat_index = (['count', 'mean', 'std', 'min'] +
85208520
formatted_percentiles + ['max'])
85218521
d = ([series.count(), series.mean(), series.std(), series.min()] +
8522-
[series.quantile(x) for x in percentiles] + [series.max()])
8522+
series.quantile(percentiles).tolist() + [series.max()])
85238523
return pd.Series(d, index=stat_index, name=series.name)
85248524

85258525
def describe_categorical_1d(data):

pandas/core/indexes/category.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ def _engine(self):
378378
# introspection
379379
@cache_readonly
380380
def is_unique(self):
381-
return not self.duplicated().any()
381+
return self._engine.is_unique
382382

383383
@property
384384
def is_monotonic_increasing(self):

pandas/core/indexes/interval.py

+4
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@ def maybe_convert_platform_interval(values):
112112
-------
113113
array
114114
"""
115+
if is_categorical_dtype(values):
116+
# GH 21243/21253
117+
values = np.array(values)
118+
115119
if isinstance(values, (list, tuple)) and len(values) == 0:
116120
# GH 19016
117121
# empty lists/tuples get object dtype by default, but this is not

pandas/core/series.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -1837,7 +1837,7 @@ def round(self, decimals=0, *args, **kwargs):
18371837

18381838
def quantile(self, q=0.5, interpolation='linear'):
18391839
"""
1840-
Return value at the given quantile, a la numpy.percentile.
1840+
Return value at the given quantile.
18411841
18421842
Parameters
18431843
----------
@@ -1876,6 +1876,7 @@ def quantile(self, q=0.5, interpolation='linear'):
18761876
See Also
18771877
--------
18781878
pandas.core.window.Rolling.quantile
1879+
numpy.percentile
18791880
"""
18801881

18811882
self._check_percentile(q)

pandas/core/strings.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -2172,9 +2172,9 @@ def cat(self, others=None, sep=None, na_rep=None, join=None):
21722172
21732173
Returns
21742174
-------
2175-
concat : str if `other is None`, Series/Index of objects if `others is
2176-
not None`. In the latter case, the result will remain categorical
2177-
if the calling Series/Index is categorical.
2175+
concat : str or Series/Index of objects
2176+
If `others` is None, `str` is returned, otherwise a `Series/Index`
2177+
(same type as caller) of objects is returned.
21782178
21792179
See Also
21802180
--------

pandas/io/formats/csvs.py

+20-16
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import numpy as np
1010

1111
from pandas.core.dtypes.missing import notna
12+
from pandas.core.dtypes.inference import is_file_like
1213
from pandas.core.index import Index, MultiIndex
1314
from pandas import compat
1415
from pandas.compat import (StringIO, range, zip)
@@ -127,14 +128,19 @@ def save(self):
127128
else:
128129
encoding = self.encoding
129130

130-
if hasattr(self.path_or_buf, 'write'):
131-
f = self.path_or_buf
132-
close = False
131+
# PR 21300 uses string buffer to receive csv writing and dump into
132+
# file-like output with compression as option. GH 21241, 21118
133+
f = StringIO()
134+
if not is_file_like(self.path_or_buf):
135+
# path_or_buf is path
136+
path_or_buf = self.path_or_buf
137+
elif hasattr(self.path_or_buf, 'name'):
138+
# path_or_buf is file handle
139+
path_or_buf = self.path_or_buf.name
133140
else:
134-
f, handles = _get_handle(self.path_or_buf, self.mode,
135-
encoding=encoding,
136-
compression=None)
137-
close = True if self.compression is None else False
141+
# path_or_buf is file-like IO objects.
142+
f = self.path_or_buf
143+
path_or_buf = None
138144

139145
try:
140146
writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,18 +157,16 @@ def save(self):
151157
self._save()
152158

153159
finally:
154-
# GH 17778 handles compression for byte strings.
155-
if not close and self.compression:
156-
f.close()
157-
with open(f.name, 'r') as f:
158-
data = f.read()
159-
f, handles = _get_handle(f.name, self.mode,
160+
# GH 17778 handles zip compression for byte strings separately.
161+
buf = f.getvalue()
162+
if path_or_buf:
163+
f, handles = _get_handle(path_or_buf, self.mode,
160164
encoding=encoding,
161165
compression=self.compression)
162-
f.write(data)
163-
close = True
164-
if close:
166+
f.write(buf)
165167
f.close()
168+
for _fh in handles:
169+
_fh.close()
166170

167171
def _save_header(self):
168172

pandas/io/stata.py

+26-36
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def read_stata(filepath_or_buffer, convert_dates=True,
182182
preserve_dtypes=preserve_dtypes,
183183
columns=columns,
184184
order_categoricals=order_categoricals,
185-
chunksize=chunksize, encoding=encoding)
185+
chunksize=chunksize)
186186

187187
if iterator or chunksize:
188188
data = reader
@@ -838,15 +838,8 @@ def get_base_missing_value(cls, dtype):
838838

839839

840840
class StataParser(object):
841-
_default_encoding = 'latin-1'
842841

843-
def __init__(self, encoding):
844-
if encoding is not None:
845-
if encoding not in VALID_ENCODINGS:
846-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
847-
'supported.')
848-
849-
self._encoding = encoding
842+
def __init__(self):
850843

851844
# type code.
852845
# --------------------
@@ -964,8 +957,8 @@ def __init__(self, path_or_buf, convert_dates=True,
964957
convert_categoricals=True, index_col=None,
965958
convert_missing=False, preserve_dtypes=True,
966959
columns=None, order_categoricals=True,
967-
encoding='latin-1', chunksize=None):
968-
super(StataReader, self).__init__(encoding)
960+
encoding=None, chunksize=None):
961+
super(StataReader, self).__init__()
969962
self.col_sizes = ()
970963

971964
# Arguments to the reader (can be temporarily overridden in
@@ -977,10 +970,6 @@ def __init__(self, path_or_buf, convert_dates=True,
977970
self._preserve_dtypes = preserve_dtypes
978971
self._columns = columns
979972
self._order_categoricals = order_categoricals
980-
if encoding is not None:
981-
if encoding not in VALID_ENCODINGS:
982-
raise ValueError('Unknown encoding. Only latin-1 and ascii '
983-
'supported.')
984973
self._encoding = encoding
985974
self._chunksize = chunksize
986975

@@ -998,18 +987,13 @@ def __init__(self, path_or_buf, convert_dates=True,
998987
path_or_buf = _stringify_path(path_or_buf)
999988
if isinstance(path_or_buf, str):
1000989
path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
1001-
path_or_buf, encoding=self._default_encoding
1002-
)
990+
path_or_buf)
1003991

1004992
if isinstance(path_or_buf, (str, text_type, bytes)):
1005993
self.path_or_buf = open(path_or_buf, 'rb')
1006994
else:
1007995
# Copy to BytesIO, and ensure no encoding
1008996
contents = path_or_buf.read()
1009-
try:
1010-
contents = contents.encode(self._default_encoding)
1011-
except:
1012-
pass
1013997
self.path_or_buf = BytesIO(contents)
1014998

1015999
self._read_header()
@@ -1030,6 +1014,15 @@ def close(self):
10301014
except IOError:
10311015
pass
10321016

1017+
def _set_encoding(self):
1018+
"""
1019+
Set string encoding which depends on file version
1020+
"""
1021+
if self.format_version < 118:
1022+
self._encoding = 'latin-1'
1023+
else:
1024+
self._encoding = 'utf-8'
1025+
10331026
def _read_header(self):
10341027
first_char = self.path_or_buf.read(1)
10351028
if struct.unpack('c', first_char)[0] == b'<':
@@ -1049,6 +1042,7 @@ def _read_new_header(self, first_char):
10491042
self.format_version = int(self.path_or_buf.read(3))
10501043
if self.format_version not in [117, 118]:
10511044
raise ValueError(_version_error)
1045+
self._set_encoding()
10521046
self.path_or_buf.read(21) # </release><byteorder>
10531047
self.byteorder = self.path_or_buf.read(3) == b'MSF' and '>' or '<'
10541048
self.path_or_buf.read(15) # </byteorder><K>
@@ -1235,6 +1229,7 @@ def _read_old_header(self, first_char):
12351229
self.format_version = struct.unpack('b', first_char)[0]
12361230
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
12371231
raise ValueError(_version_error)
1232+
self._set_encoding()
12381233
self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[
12391234
0] == 0x1 and '>' or '<'
12401235
self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0]
@@ -1338,16 +1333,9 @@ def _decode(self, s):
13381333
return s.decode('utf-8')
13391334

13401335
def _null_terminate(self, s):
1341-
if compat.PY3 or self._encoding is not None:
1342-
# have bytes not strings, so must decode
1343-
s = s.partition(b"\0")[0]
1344-
return s.decode(self._encoding or self._default_encoding)
1345-
else:
1346-
null_byte = "\0"
1347-
try:
1348-
return s.lstrip(null_byte)[:s.index(null_byte)]
1349-
except:
1350-
return s
1336+
# have bytes not strings, so must decode
1337+
s = s.partition(b"\0")[0]
1338+
return s.decode(self._encoding)
13511339

13521340
def _read_value_labels(self):
13531341
if self._value_labels_read:
@@ -1433,10 +1421,7 @@ def _read_strls(self):
14331421
self.path_or_buf.read(4))[0]
14341422
va = self.path_or_buf.read(length)
14351423
if typ == 130:
1436-
encoding = 'utf-8'
1437-
if self.format_version == 117:
1438-
encoding = self._encoding or self._default_encoding
1439-
va = va[0:-1].decode(encoding)
1424+
va = va[0:-1].decode(self._encoding)
14401425
# Wrap v_o in a string to allow uint64 values as keys on 32bit OS
14411426
self.GSO[str(v_o)] = va
14421427

@@ -1980,9 +1965,14 @@ class StataWriter(StataParser):
19801965
def __init__(self, fname, data, convert_dates=None, write_index=True,
19811966
encoding="latin-1", byteorder=None, time_stamp=None,
19821967
data_label=None, variable_labels=None):
1983-
super(StataWriter, self).__init__(encoding)
1968+
super(StataWriter, self).__init__()
19841969
self._convert_dates = {} if convert_dates is None else convert_dates
19851970
self._write_index = write_index
1971+
if encoding is not None:
1972+
if encoding not in VALID_ENCODINGS:
1973+
raise ValueError('Unknown encoding. Only latin-1 and ascii '
1974+
'supported.')
1975+
self._encoding = encoding
19861976
self._time_stamp = time_stamp
19871977
self._data_label = data_label
19881978
self._variable_labels = variable_labels

pandas/tests/frame/test_to_csv.py

+27-11
Original file line numberDiff line numberDiff line change
@@ -919,29 +919,45 @@ def test_to_csv_path_is_none(self):
919919
recons = pd.read_csv(StringIO(csv_str), index_col=0)
920920
assert_frame_equal(self.frame, recons)
921921

922-
def test_to_csv_compression(self, compression):
923-
924-
df = DataFrame([[0.123456, 0.234567, 0.567567],
925-
[12.32112, 123123.2, 321321.2]],
926-
index=['A', 'B'], columns=['X', 'Y', 'Z'])
922+
@pytest.mark.parametrize('df,encoding', [
923+
(DataFrame([[0.123456, 0.234567, 0.567567],
924+
[12.32112, 123123.2, 321321.2]],
925+
index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
926+
# GH 21241, 21118
927+
(DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
928+
(DataFrame(5 * [[123, u"你好", u"世界"]],
929+
columns=['X', 'Y', 'Z']), 'gb2312'),
930+
(DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
931+
columns=['X', 'Y', 'Z']), 'cp737')
932+
])
933+
def test_to_csv_compression(self, df, encoding, compression):
927934

928935
with ensure_clean() as filename:
929936

930-
df.to_csv(filename, compression=compression)
937+
df.to_csv(filename, compression=compression, encoding=encoding)
931938

932939
# test the round trip - to_csv -> read_csv
933-
rs = read_csv(filename, compression=compression,
934-
index_col=0)
935-
assert_frame_equal(df, rs)
940+
result = read_csv(filename, compression=compression,
941+
index_col=0, encoding=encoding)
942+
943+
with open(filename, 'w') as fh:
944+
df.to_csv(fh, compression=compression, encoding=encoding)
945+
946+
result_fh = read_csv(filename, compression=compression,
947+
index_col=0, encoding=encoding)
948+
assert_frame_equal(df, result)
949+
assert_frame_equal(df, result_fh)
936950

937951
# explicitly make sure file is compressed
938952
with tm.decompress_file(filename, compression) as fh:
939-
text = fh.read().decode('utf8')
953+
text = fh.read().decode(encoding or 'utf8')
940954
for col in df.columns:
941955
assert col in text
942956

943957
with tm.decompress_file(filename, compression) as fh:
944-
assert_frame_equal(df, read_csv(fh, index_col=0))
958+
assert_frame_equal(df, read_csv(fh,
959+
index_col=0,
960+
encoding=encoding))
945961

946962
def test_to_csv_date_format(self):
947963
with ensure_clean('__tmp_to_csv_date_format__') as path:

0 commit comments

Comments
 (0)