Merge remote-tracking branch 'upstream/master' into typing

simonjayhawkins · simonjayhawkins · commit 523f9c86f437 · 2019-09-06T07:07:47.000+01:00
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
@@ -50,12 +50,13 @@
         "xlsxwriter": [],
         "xlrd": [],
         "xlwt": [],
+        "odfpy": [],
         "pytest": [],
         // If using Windows with python 2.7 and want to build using the
         // mingw toolchain (rather than MSVC), uncomment the following line.
         // "libpython": [],
     },
-
+    "conda_channels": ["defaults", "conda-forge"],
     // Combinations of libraries/python versions can be excluded/included
     // from the set to test. Each entry is a dictionary containing additional
     // key-value pairs to include/exclude.
diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py
@@ -1,40 +1,72 @@
 from io import BytesIO
 
 import numpy as np
+from odf.opendocument import OpenDocumentSpreadsheet
+from odf.table import Table, TableCell, TableRow
+from odf.text import P
 
 from pandas import DataFrame, ExcelWriter, date_range, read_excel
 import pandas.util.testing as tm
 
 
-class Excel:
+def _generate_dataframe():
+    N = 2000
+    C = 5
+    df = DataFrame(
+        np.random.randn(N, C),
+        columns=["float{}".format(i) for i in range(C)],
+        index=date_range("20000101", periods=N, freq="H"),
+    )
+    df["object"] = tm.makeStringIndex(N)
+    return df
+
+
+class WriteExcel:
 
     params = ["openpyxl", "xlsxwriter", "xlwt"]
     param_names = ["engine"]
 
     def setup(self, engine):
-        N = 2000
-        C = 5
-        self.df = DataFrame(
-            np.random.randn(N, C),
-            columns=["float{}".format(i) for i in range(C)],
-            index=date_range("20000101", periods=N, freq="H"),
-        )
-        self.df["object"] = tm.makeStringIndex(N)
-        self.bio_read = BytesIO()
-        self.writer_read = ExcelWriter(self.bio_read, engine=engine)
-        self.df.to_excel(self.writer_read, sheet_name="Sheet1")
-        self.writer_read.save()
-        self.bio_read.seek(0)
-
-    def time_read_excel(self, engine):
-        read_excel(self.bio_read)
+        self.df = _generate_dataframe()
 
     def time_write_excel(self, engine):
-        bio_write = BytesIO()
-        bio_write.seek(0)
-        writer_write = ExcelWriter(bio_write, engine=engine)
-        self.df.to_excel(writer_write, sheet_name="Sheet1")
-        writer_write.save()
+        bio = BytesIO()
+        bio.seek(0)
+        writer = ExcelWriter(bio, engine=engine)
+        self.df.to_excel(writer, sheet_name="Sheet1")
+        writer.save()
+
+
+class ReadExcel:
+
+    params = ["xlrd", "openpyxl", "odf"]
+    param_names = ["engine"]
+    fname_excel = "spreadsheet.xlsx"
+    fname_odf = "spreadsheet.ods"
+
+    def _create_odf(self):
+        doc = OpenDocumentSpreadsheet()
+        table = Table(name="Table1")
+        for row in self.df.values:
+            tr = TableRow()
+            for val in row:
+                tc = TableCell(valuetype="string")
+                tc.addElement(P(text=val))
+                tr.addElement(tc)
+            table.addElement(tr)
+
+        doc.spreadsheet.addElement(table)
+        doc.save(self.fname_odf)
+
+    def setup_cache(self):
+        self.df = _generate_dataframe()
+
+        self.df.to_excel(self.fname_excel, sheet_name="Sheet1")
+        self._create_odf()
+
+    def time_read_excel(self, engine):
+        fname = self.fname_odf if engine == "odf" else self.fname_excel
+        read_excel(fname, engine=engine)
 
 
 from ..pandas_vb_common import setup  # noqa: F401 isort:skip
diff --git a/asv_bench/benchmarks/package.py b/asv_bench/benchmarks/package.py
@@ -0,0 +1,25 @@
+"""
+Benchmarks for pandas at the package-level.
+"""
+import subprocess
+import sys
+
+from pandas.compat import PY37
+
+
+class TimeImport:
+    def time_import(self):
+        if PY37:
+            # on py37+ the "-X importtime" option gives us a more precise
+            #  measurement of the import time we actually care about,
+            #  without the subprocess or interpreter overhead
+            cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"]
+            p = subprocess.run(cmd, stderr=subprocess.PIPE)
+
+            line = p.stderr.splitlines()[-1]
+            field = line.split(b"|")[-2].strip()
+            total = int(field)  # microseconds
+            return total
+
+        cmd = [sys.executable, "-c", "import pandas as pd"]
+        subprocess.run(cmd, stderr=subprocess.PIPE)
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -203,10 +203,14 @@ if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then
 import sys
 import pandas
 
-blacklist = {'bs4', 'gcsfs', 'html5lib', 'ipython', 'jinja2', 'hypothesis',
+blacklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis',
              'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy',
-             'tables', 'xlrd', 'xlsxwriter', 'xlwt'}
-mods = blacklist & set(m.split('.')[0] for m in sys.modules)
+             'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'}
+
+# GH#28227 for some of these we check for top-level modules, while others are
+#  more specific (e.g. urllib.request)
+import_mods = set(m.split('.')[0] for m in sys.modules) | set(sys.modules)
+mods = blacklist & import_mods
 if mods:
     sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods)))
     sys.exit(len(mods))
diff --git a/environment.yml b/environment.yml
@@ -80,4 +80,5 @@ dependencies:
   - xlrd  # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
   - xlsxwriter  # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
   - xlwt  # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile
+  - odfpy  # pandas.read_excel
   - pyreadstat  # pandas.read_spss
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -24,7 +24,7 @@
 ordered_sentinel = object()  # type: object
 
 
-def register_extension_dtype(cls: Type[ExtensionDtype],) -> Type[ExtensionDtype]:
+def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]:
     """
     Register an ExtensionType with pandas as class decorator.
 
diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py
@@ -807,7 +807,13 @@ def na_op(x, y):
         return result
 
     fill_int = lambda x: x.fillna(0)
-    fill_bool = lambda x: x.fillna(False).astype(bool)
+
+    def fill_bool(x, left=None):
+        # if `left` is specifically not-boolean, we do not cast to bool
+        x = x.fillna(False)
+        if left is None or is_bool_dtype(left.dtype):
+            x = x.astype(bool)
+        return x
 
     def wrapper(self, other):
         is_self_int_dtype = is_integer_dtype(self.dtype)
@@ -836,7 +842,7 @@ def wrapper(self, other):
 
         elif isinstance(other, (ABCSeries, ABCIndexClass)):
             is_other_int_dtype = is_integer_dtype(other.dtype)
-            other = other if is_other_int_dtype else fill_bool(other)
+            other = other if is_other_int_dtype else fill_bool(other, self)
 
         else:
             # scalars, list, tuple, np.array
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -4,7 +4,6 @@
 import codecs
 import csv
 import gzip
-from http.client import HTTPException  # noqa
 from io import BufferedIOBase, BytesIO
 import mmap
 import os
@@ -22,7 +21,6 @@
     Type,
     Union,
 )
-from urllib.error import URLError  # noqa
 from urllib.parse import (  # noqa
     urlencode,
     urljoin,
@@ -31,7 +29,6 @@
     uses_params,
     uses_relative,
 )
-from urllib.request import pathname2url, urlopen
 import zipfile
 
 from pandas.compat import _get_lzma_file, _import_lzma
@@ -188,6 +185,16 @@ def is_gcs_url(url) -> bool:
         return False
 
 
+def urlopen(*args, **kwargs):
+    """
+    Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
+    the stdlib.
+    """
+    import urllib.request
+
+    return urllib.request.urlopen(*args, **kwargs)
+
+
 def get_filepath_or_buffer(
     filepath_or_buffer: FilePathOrBuffer,
     encoding: Optional[str] = None,
@@ -261,6 +268,9 @@ def file_path_to_url(path: str) -> str:
     -------
     a valid FILE URL
     """
+    # lazify expensive import (~30ms)
+    from urllib.request import pathname2url
+
     return urljoin("file:", pathname2url(path))
 
 
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -4,7 +4,6 @@
 from io import BytesIO
 import os
 from textwrap import fill
-from urllib.request import urlopen
 
 from pandas._config import config
 
@@ -21,6 +20,7 @@
     _stringify_path,
     _validate_header_arg,
     get_filepath_or_buffer,
+    urlopen,
 )
 from pandas.io.excel._util import (
     _fill_mi_header,
@@ -112,7 +112,7 @@
 
 engine : str, default None
     If io is not a buffer or path, this must be set to identify io.
-    Acceptable values are None or xlrd.
+    Acceptable values are None, "xlrd", "openpyxl" or "odf".
 converters : dict, default None
     Dict of functions for converting values in certain columns. Keys can
     either be integers or column labels, values are functions that take one
@@ -783,11 +783,12 @@ class ExcelFile:
     Parameters
     ----------
     io : string, path object (pathlib.Path or py._path.local.LocalPath),
-        file-like object or xlrd workbook
-        If a string or path object, expected to be a path to xls or xlsx file.
+        a file-like object, xlrd workbook or openpyxl workbook.
+        If a string or path object, expected to be a path to an xls, xlsx or odf file.
     engine : string, default None
         If io is not a buffer or path, this must be set to identify io.
-        Acceptable values are None or ``xlrd``.
+        Acceptable values are None, ``xlrd``, ``openpyxl`` or ``odf``.
+        Note that ``odf`` reads tables out of OpenDocument formatted files.
     """
 
     from pandas.io.excel._odfreader import _ODFReader
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -3,6 +3,7 @@
 from datetime import datetime, time
 from functools import partial
 import os
+from urllib.error import URLError
 import warnings
 
 import numpy as np
@@ -14,8 +15,6 @@
 from pandas import DataFrame, Index, MultiIndex, Series
 import pandas.util.testing as tm
 
-from pandas.io.common import URLError
-
 
 @contextlib.contextmanager
 def ignore_xlrd_time_clock_warning():
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
@@ -11,6 +11,7 @@
 import os
 import platform
 from tempfile import TemporaryFile
+from urllib.error import URLError
 
 import numpy as np
 import pytest
@@ -21,7 +22,6 @@
 from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
 import pandas.util.testing as tm
 
-from pandas.io.common import URLError
 from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser
 
 
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -4,6 +4,7 @@
 import os
 import re
 import threading
+from urllib.error import URLError
 
 import numpy as np
 from numpy.random import rand
@@ -17,7 +18,7 @@
 import pandas.util.testing as tm
 from pandas.util.testing import makeCustomDataframe as mkdf, network
 
-from pandas.io.common import URLError, file_path_to_url
+from pandas.io.common import file_path_to_url
 import pandas.io.html
 from pandas.io.html import read_html
 
diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py
@@ -103,11 +103,8 @@ def test_logical_operators_int_dtype_with_float(self):
             s_0123 & [0.1, 4, 3.14, 2]
         with pytest.raises(TypeError):
             s_0123 & np.array([0.1, 4, 3.14, 2])
-
-        # FIXME: this should be consistent with the list case above
-        expected = Series([False, True, False, True])
-        result = s_0123 & Series([0.1, 4, -3.14, 2])
-        assert_series_equal(result, expected)
+        with pytest.raises(TypeError):
+            s_0123 & Series([0.1, 4, -3.14, 2])
 
     def test_logical_operators_int_dtype_with_str(self):
         s_1111 = Series([1] * 4, dtype="int8")
@@ -145,9 +142,8 @@ def test_logical_operators_int_dtype_with_object(self):
         assert_series_equal(result, expected)
 
         s_abNd = Series(["a", "b", np.NaN, "d"])
-        result = s_0123 & s_abNd
-        expected = Series([False, True, False, True])
-        assert_series_equal(result, expected)
+        with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"):
+            s_0123 & s_abNd
 
     def test_logical_operators_bool_dtype_with_int(self):
         index = list("bca")
diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py
@@ -103,7 +103,7 @@ def _skip_if_no_scipy():
     )
 
 
-def skip_if_installed(package: str,) -> Callable:
+def skip_if_installed(package: str) -> Callable:
     """
     Skip a test if a package is installed.
 
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
diff --git a/requirements-dev.txt b/requirements-dev.txt

Original file line number	Diff line number	Diff line change
`@@ -103,7 +103,7 @@ def _skip_if_no_scipy():`
`103`	`103`	`)`
`104`	`104`
`105`	`105`
`106`		`-def skip_if_installed(package: str,) -> Callable:`
	`106`	`+def skip_if_installed(package: str) -> Callable:`
`107`	`107`	`"""`
`108`	`108`	`Skip a test if a package is installed.`
`109`	`109`