
Commit 5621418

Merge branch 'master' of https://github.com/pandas-dev/pandas into ref-cat-astype-2
2 parents c78aa77 + d4b6233 commit 5621418

7 files changed: +144 additions, -45 deletions


asv_bench/benchmarks/io/csv.py

+63-32
@@ -1,4 +1,4 @@
-from io import StringIO
+from io import BytesIO, StringIO
 import random
 import string
 
@@ -146,10 +146,10 @@ def time_read_csv(self, bad_date_value):
 class ReadCSVSkipRows(BaseIO):
 
     fname = "__test__.csv"
-    params = [None, 10000]
-    param_names = ["skiprows"]
+    params = ([None, 10000], ["c", "python"])
+    param_names = ["skiprows", "engine"]
 
-    def setup(self, skiprows):
+    def setup(self, skiprows, engine):
         N = 20000
         index = tm.makeStringIndex(N)
         df = DataFrame(
@@ -164,8 +164,8 @@ def setup(self, skiprows):
         )
         df.to_csv(self.fname)
 
-    def time_skipprows(self, skiprows):
-        read_csv(self.fname, skiprows=skiprows)
+    def time_skipprows(self, skiprows, engine):
+        read_csv(self.fname, skiprows=skiprows, engine=engine)
 
 
 class ReadUint64Integers(StringIORewind):
@@ -192,10 +192,10 @@ def time_read_uint64_na_values(self):
 class ReadCSVThousands(BaseIO):
 
     fname = "__test__.csv"
-    params = ([",", "|"], [None, ","])
-    param_names = ["sep", "thousands"]
+    params = ([",", "|"], [None, ","], ["c", "python"])
+    param_names = ["sep", "thousands", "engine"]
 
-    def setup(self, sep, thousands):
+    def setup(self, sep, thousands, engine):
         N = 10000
         K = 8
         data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))
@@ -206,16 +206,19 @@ def setup(self, sep, thousands):
         df = df.applymap(lambda x: fmt.format(x))
         df.to_csv(self.fname, sep=sep)
 
-    def time_thousands(self, sep, thousands):
-        read_csv(self.fname, sep=sep, thousands=thousands)
+    def time_thousands(self, sep, thousands, engine):
+        read_csv(self.fname, sep=sep, thousands=thousands, engine=engine)
 
 
 class ReadCSVComment(StringIORewind):
-    def setup(self):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
         data = ["A,B,C"] + (["1,2,3 # comment"] * 100000)
         self.StringIO_input = StringIO("\n".join(data))
 
-    def time_comment(self):
+    def time_comment(self, engine):
         read_csv(
             self.data(self.StringIO_input), comment="#", header=None, names=list("abc")
         )
@@ -255,25 +258,47 @@ def time_read_csv_python_engine(self, sep, decimal, float_precision):
         )
 
 
+class ReadCSVEngine(StringIORewind):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
+        data = ["A,B,C,D,E"] + (["1,2,3,4,5"] * 100000)
+        self.StringIO_input = StringIO("\n".join(data))
+        # simulate reading from file
+        self.BytesIO_input = BytesIO(self.StringIO_input.read().encode("utf-8"))
+
+    def time_read_stringcsv(self, engine):
+        read_csv(self.data(self.StringIO_input), engine=engine)
+
+    def time_read_bytescsv(self, engine):
+        read_csv(self.data(self.BytesIO_input), engine=engine)
+
+
 class ReadCSVCategorical(BaseIO):
 
     fname = "__test__.csv"
+    params = ["c", "python"]
+    param_names = ["engine"]
 
-    def setup(self):
+    def setup(self, engine):
         N = 100000
         group1 = ["aaaaaaaa", "bbbbbbb", "cccccccc", "dddddddd", "eeeeeeee"]
         df = DataFrame(np.random.choice(group1, (N, 3)), columns=list("abc"))
         df.to_csv(self.fname, index=False)
 
-    def time_convert_post(self):
-        read_csv(self.fname).apply(Categorical)
+    def time_convert_post(self, engine):
+        read_csv(self.fname, engine=engine).apply(Categorical)
 
-    def time_convert_direct(self):
-        read_csv(self.fname, dtype="category")
+    def time_convert_direct(self, engine):
+        read_csv(self.fname, engine=engine, dtype="category")
 
 
 class ReadCSVParseDates(StringIORewind):
-    def setup(self):
+    params = ["c", "python"]
+    param_names = ["engine"]
+
+    def setup(self, engine):
         data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n
         {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n
         {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n
@@ -284,18 +309,20 @@ def setup(self):
         data = data.format(*two_cols)
         self.StringIO_input = StringIO(data)
 
-    def time_multiple_date(self):
+    def time_multiple_date(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             names=list(string.digits[:9]),
             parse_dates=[[1, 2], [1, 3]],
         )
 
-    def time_baseline(self):
+    def time_baseline(self, engine):
         read_csv(
             self.data(self.StringIO_input),
+            engine=engine,
             sep=",",
             header=None,
             parse_dates=[1],
@@ -304,17 +331,18 @@ def time_baseline(self):
 
 
 class ReadCSVCachedParseDates(StringIORewind):
-    params = ([True, False],)
-    param_names = ["do_cache"]
+    params = ([True, False], ["c", "python"])
+    param_names = ["do_cache", "engine"]
 
-    def setup(self, do_cache):
+    def setup(self, do_cache, engine):
         data = ("\n".join(f"10/{year}" for year in range(2000, 2100)) + "\n") * 10
         self.StringIO_input = StringIO(data)
 
-    def time_read_csv_cached(self, do_cache):
+    def time_read_csv_cached(self, do_cache, engine):
         try:
             read_csv(
                 self.data(self.StringIO_input),
+                engine=engine,
                 header=None,
                 parse_dates=[0],
                 cache_dates=do_cache,
@@ -329,37 +357,40 @@ class ReadCSVMemoryGrowth(BaseIO):
     chunksize = 20
     num_rows = 1000
     fname = "__test__.csv"
+    params = ["c", "python"]
+    param_names = ["engine"]
 
-    def setup(self):
+    def setup(self, engine):
         with open(self.fname, "w") as f:
             for i in range(self.num_rows):
                 f.write(f"{i}\n")
 
-    def mem_parser_chunks(self):
+    def mem_parser_chunks(self, engine):
         # see gh-24805.
-        result = read_csv(self.fname, chunksize=self.chunksize)
+        result = read_csv(self.fname, chunksize=self.chunksize, engine=engine)
 
         for _ in result:
             pass
 
 
 class ReadCSVParseSpecialDate(StringIORewind):
-    params = (["mY", "mdY", "hm"],)
-    param_names = ["value"]
+    params = (["mY", "mdY", "hm"], ["c", "python"])
+    param_names = ["value", "engine"]
     objects = {
         "mY": "01-2019\n10-2019\n02/2000\n",
         "mdY": "12/02/2010\n",
         "hm": "21:34\n",
     }
 
-    def setup(self, value):
+    def setup(self, value, engine):
        count_elem = 10000
        data = self.objects[value] * count_elem
        self.StringIO_input = StringIO(data)
 
-    def time_read_special_date(self, value):
+    def time_read_special_date(self, value, engine):
        read_csv(
            self.data(self.StringIO_input),
+           engine=engine,
            sep=",",
            header=None,
            names=["Date"],
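For context on the new parameter axis: asv runs setup and each time_*/mem_* method once per combination of the values in params, passed in the order given by param_names, so adding ["c", "python"] times every case under both parser engines. A minimal, standalone sketch of the operation these benchmarks now exercise (the sample data and names below are illustrative, not taken from the suite):

from io import StringIO

import pandas as pd

data = "A,B,C\n" + "1,2,3\n" * 1000

for engine in ["c", "python"]:
    # roughly what each new (..., engine) parameter combination measures
    df = pd.read_csv(StringIO(data), engine=engine)
    assert df.shape == (1000, 3)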

ci/code_checks.sh

+4
@@ -178,6 +178,10 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then
     pytest -q --doctest-modules pandas/core/strings/
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
+    MSG='Doctests sql.py' ; echo $MSG
+    pytest -q --doctest-modules pandas/io/sql.py
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     # Directories
 
     MSG='Doctests arrays'; echo $MSG

doc/source/whatsnew/v1.3.0.rst

+1-1
@@ -232,7 +232,7 @@ MultiIndex
 ^^^^^^^^^^
 
 - Bug in :meth:`DataFrame.drop` raising ``TypeError`` when :class:`MultiIndex` is non-unique and no level is provided (:issue:`36293`)
--
+- Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when the :class:`MultiIndex` contains ``NaN`` values, even when they are differently ordered (:issue:`38439`)
 
 I/O
 ^^^

pandas/core/indexes/category.py

+1-9
@@ -8,7 +8,7 @@
 from pandas._libs import index as libindex
 from pandas._libs.lib import no_default
 from pandas._typing import ArrayLike, Label
-from pandas.util._decorators import Appender, cache_readonly, doc
+from pandas.util._decorators import Appender, doc
 
 from pandas.core.dtypes.common import (
     ensure_platform_int,
@@ -381,14 +381,6 @@ def fillna(self, value, downcast=None):
         cat = self._data.fillna(value)
         return type(self)._simple_new(cat, name=self.name)
 
-    @cache_readonly
-    def _engine(self):
-        # we are going to look things up with the codes themselves.
-        # To avoid a reference cycle, bind `codes` to a local variable, so
-        # `self` is not passed into the lambda.
-        codes = self.codes
-        return self._engine_type(lambda: codes, len(self))
-
     @doc(Index.unique)
     def unique(self, level=None):
         if level is not None:

pandas/core/indexes/multi.py

+7-3
@@ -3454,13 +3454,17 @@ def equals(self, other: object) -> bool:
 
         for i in range(self.nlevels):
             self_codes = self.codes[i]
-            self_codes = self_codes[self_codes != -1]
+            other_codes = other.codes[i]
+            self_mask = self_codes == -1
+            other_mask = other_codes == -1
+            if not np.array_equal(self_mask, other_mask):
+                return False
+            self_codes = self_codes[~self_mask]
             self_values = algos.take_nd(
                 np.asarray(self.levels[i]._values), self_codes, allow_fill=False
             )
 
-            other_codes = other.codes[i]
-            other_codes = other_codes[other_codes != -1]
+            other_codes = other_codes[~other_mask]
             other_values = algos.take_nd(
                 np.asarray(other.levels[i]._values), other_codes, allow_fill=False
             )
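The fix compares where the missing codes (sentinel -1, i.e. NaN) sit in each level before dropping them; previously each side filtered out its own -1 codes independently, so NaN entries in different rows vanished before the value comparison and the levels could still match. A standalone sketch of the idea in plain NumPy (illustrative only, not the pandas internals):

import numpy as np

self_codes = np.array([0, -1])   # level codes for (81.0, NaN)
other_codes = np.array([-1, 0])  # same values, NaN in the other row

self_mask = self_codes == -1
other_mask = other_codes == -1

# new check: differently placed NaNs now make the levels unequal
assert not np.array_equal(self_mask, other_mask)

# old behaviour: dropping the -1 codes first left identical arrays,
# so the comparison incorrectly saw the levels as equal
assert np.array_equal(self_codes[self_codes != -1],
                      other_codes[other_codes != -1])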

pandas/io/sql.py

+58
@@ -482,6 +482,64 @@ def read_sql(
     --------
     read_sql_table : Read SQL database table into a DataFrame.
     read_sql_query : Read SQL query into a DataFrame.
+
+    Examples
+    --------
+    Read data from SQL via either a SQL query or a SQL tablename.
+    When using a SQLite database only SQL queries are accepted,
+    providing only the SQL tablename will result in an error.
+
+    >>> from sqlite3 import connect
+    >>> conn = connect(':memory:')
+    >>> df = pd.DataFrame(data=[[0, '10/11/12'], [1, '12/11/10']],
+    ...                   columns=['int_column', 'date_column'])
+    >>> df.to_sql('test_data', conn)
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data', conn)
+       int_column date_column
+    0           0    10/11/12
+    1           1    12/11/10
+
+    >>> pd.read_sql('test_data', 'postgres:///db_name')  # doctest:+SKIP
+
+    Apply date parsing to columns through the ``parse_dates`` argument
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates=["date_column"])
+       int_column date_column
+    0           0  2012-10-11
+    1           1  2010-12-11
+
+    The ``parse_dates`` argument calls ``pd.to_datetime`` on the provided columns.
+    Custom argument values for applying ``pd.to_datetime`` on a column are specified
+    via a dictionary format:
+    1. Ignore errors while parsing the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"errors": "ignore"}})
+       int_column date_column
+    0           0  2012-10-11
+    1           1  2010-12-11
+
+    2. Apply a dayfirst date parsing order on the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"dayfirst": True}})
+       int_column date_column
+    0           0  2012-11-10
+    1           1  2010-11-12
+
+    3. Apply custom formatting when date parsing the values of "date_column"
+
+    >>> pd.read_sql('SELECT int_column, date_column FROM test_data',
+    ...             conn,
+    ...             parse_dates={"date_column": {"format": "%d/%m/%y"}})
+       int_column date_column
+    0           0  2012-11-10
+    1           1  2010-11-12
     """
     pandas_sql = pandasSQL_builder(con)
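The dictionary form of parse_dates shown above forwards each inner dict as keyword arguments to pd.to_datetime for that column. A rough sketch of that equivalence, assuming the same in-memory SQLite table the docstring builds (the comparison itself is illustrative, not part of the change):

from sqlite3 import connect

import pandas as pd

conn = connect(":memory:")
pd.DataFrame(
    {"int_column": [0, 1], "date_column": ["10/11/12", "12/11/10"]}
).to_sql("test_data", conn, index=False)

# letting read_sql parse the dates ...
parsed = pd.read_sql(
    "SELECT int_column, date_column FROM test_data",
    conn,
    parse_dates={"date_column": {"format": "%d/%m/%y"}},
)

# ... should match reading the raw strings and calling to_datetime yourself
raw = pd.read_sql("SELECT int_column, date_column FROM test_data", conn)
raw["date_column"] = pd.to_datetime(raw["date_column"], format="%d/%m/%y")

assert parsed.equals(raw)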
487545

pandas/tests/indexes/multi/test_equivalence.py

+10
@@ -209,6 +209,16 @@ def test_equals_missing_values():
     assert not result
 
 
+def test_equals_missing_values_differently_sorted():
+    # GH#38439
+    mi1 = pd.MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)])
+    mi2 = pd.MultiIndex.from_tuples([(np.nan, np.nan), (81.0, np.nan)])
+    assert not mi1.equals(mi2)
+
+    mi2 = pd.MultiIndex.from_tuples([(81.0, np.nan), (np.nan, np.nan)])
+    assert mi1.equals(mi2)
+
+
 def test_is_():
     mi = MultiIndex.from_tuples(zip(range(10), range(10)))
     assert mi.is_(mi)
