Commit b4d00f7

Merge remote-tracking branch 'upstream/master' into 24893-pivot_table

2 parents: 45ffc77 + 261c3a6

112 files changed, +1963 -1049 lines changed

.pre-commit-config.yaml (-4)

@@ -15,7 +15,3 @@ repos:
     hooks:
     - id: isort
       language: python_venv
--   repo: https://github.com/asottile/seed-isort-config
-    rev: v1.9.2
-    hooks:
-    - id: seed-isort-config

asv_bench/asv.conf.json (+2 -1)

@@ -50,12 +50,13 @@
     "xlsxwriter": [],
     "xlrd": [],
     "xlwt": [],
+    "odfpy": [],
     "pytest": [],
     // If using Windows with python 2.7 and want to build using the
     // mingw toolchain (rather than MSVC), uncomment the following line.
     // "libpython": [],
     },
-
+    "conda_channels": ["defaults", "conda-forge"],
     // Combinations of libraries/python versions can be excluded/included
     // from the set to test. Each entry is a dictionary containing additional
     // key-value pairs to include/exclude.

asv_bench/benchmarks/io/excel.py (+54 -22)

@@ -1,40 +1,72 @@
 from io import BytesIO
 
 import numpy as np
+from odf.opendocument import OpenDocumentSpreadsheet
+from odf.table import Table, TableCell, TableRow
+from odf.text import P
 
 from pandas import DataFrame, ExcelWriter, date_range, read_excel
 import pandas.util.testing as tm
 
 
-class Excel:
+def _generate_dataframe():
+    N = 2000
+    C = 5
+    df = DataFrame(
+        np.random.randn(N, C),
+        columns=["float{}".format(i) for i in range(C)],
+        index=date_range("20000101", periods=N, freq="H"),
+    )
+    df["object"] = tm.makeStringIndex(N)
+    return df
+
+
+class WriteExcel:
 
     params = ["openpyxl", "xlsxwriter", "xlwt"]
     param_names = ["engine"]
 
     def setup(self, engine):
-        N = 2000
-        C = 5
-        self.df = DataFrame(
-            np.random.randn(N, C),
-            columns=["float{}".format(i) for i in range(C)],
-            index=date_range("20000101", periods=N, freq="H"),
-        )
-        self.df["object"] = tm.makeStringIndex(N)
-        self.bio_read = BytesIO()
-        self.writer_read = ExcelWriter(self.bio_read, engine=engine)
-        self.df.to_excel(self.writer_read, sheet_name="Sheet1")
-        self.writer_read.save()
-        self.bio_read.seek(0)
-
-    def time_read_excel(self, engine):
-        read_excel(self.bio_read)
+        self.df = _generate_dataframe()
 
     def time_write_excel(self, engine):
-        bio_write = BytesIO()
-        bio_write.seek(0)
-        writer_write = ExcelWriter(bio_write, engine=engine)
-        self.df.to_excel(writer_write, sheet_name="Sheet1")
-        writer_write.save()
+        bio = BytesIO()
+        bio.seek(0)
+        writer = ExcelWriter(bio, engine=engine)
+        self.df.to_excel(writer, sheet_name="Sheet1")
+        writer.save()
+
+
+class ReadExcel:
+
+    params = ["xlrd", "openpyxl", "odf"]
+    param_names = ["engine"]
+    fname_excel = "spreadsheet.xlsx"
+    fname_odf = "spreadsheet.ods"
+
+    def _create_odf(self):
+        doc = OpenDocumentSpreadsheet()
+        table = Table(name="Table1")
+        for row in self.df.values:
+            tr = TableRow()
+            for val in row:
+                tc = TableCell(valuetype="string")
+                tc.addElement(P(text=val))
+                tr.addElement(tc)
+            table.addElement(tr)
+
+        doc.spreadsheet.addElement(table)
+        doc.save(self.fname_odf)
+
+    def setup_cache(self):
+        self.df = _generate_dataframe()
+
+        self.df.to_excel(self.fname_excel, sheet_name="Sheet1")
+        self._create_odf()
+
+    def time_read_excel(self, engine):
+        fname = self.fname_odf if engine == "odf" else self.fname_excel
+        read_excel(fname, engine=engine)
 
 
 from ..pandas_vb_common import setup  # noqa: F401 isort:skip

asv_bench/benchmarks/io/json.py (+2 -2)

@@ -118,15 +118,15 @@ def setup(self, orient, frame):
     def time_to_json(self, orient, frame):
         getattr(self, frame).to_json(self.fname, orient=orient)
 
-    def mem_to_json(self, orient, frame):
+    def peakmem_to_json(self, orient, frame):
         getattr(self, frame).to_json(self.fname, orient=orient)
 
     def time_to_json_wide(self, orient, frame):
         base_df = getattr(self, frame).copy()
         df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1)
         df.to_json(self.fname, orient=orient)
 
-    def mem_to_json_wide(self, orient, frame):
+    def peakmem_to_json_wide(self, orient, frame):
         base_df = getattr(self, frame).copy()
         df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1)
         df.to_json(self.fname, orient=orient)
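
The rename above follows asv's method-name conventions: a `time_` prefix measures wall-clock time, `mem_` measures the size of the object the method returns, and `peakmem_` measures the peak resident memory of the process while the method runs, which is the more useful number for I/O benchmarks like these. A minimal sketch of the convention (the class and data here are illustrative, not part of this commit):

    class SumConvention:
        # asv discovers benchmarks by prefix, not by registration:
        #   time_*    -> wall-clock time of the call
        #   mem_*     -> size of the object the method returns
        #   peakmem_* -> peak resident memory of the process during the call
        def setup(self):
            self.data = list(range(1_000_000))

        def time_sum(self):
            sum(self.data)

        def peakmem_sum(self):
            # identical body; only the prefix changes what asv records
            sum(self.data)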

asv_bench/benchmarks/package.py (+25)

@@ -0,0 +1,25 @@
+"""
+Benchmarks for pandas at the package-level.
+"""
+import subprocess
+import sys
+
+from pandas.compat import PY37
+
+
+class TimeImport:
+    def time_import(self):
+        if PY37:
+            # on py37+ the "-X importtime" usage gives us a more precise
+            # measurement of the import time we actually care about,
+            # without the subprocess or interpreter overhead
+            cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"]
+            p = subprocess.run(cmd, stderr=subprocess.PIPE)
+
+            line = p.stderr.splitlines()[-1]
+            field = line.split(b"|")[-2].strip()
+            total = int(field)  # microseconds
+            return total
+
+        cmd = [sys.executable, "-c", "import pandas as pd"]
+        subprocess.run(cmd, stderr=subprocess.PIPE)
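
For reference, `-X importtime` writes one line per imported module to stderr in the form `import time: <self us> | <cumulative us> | <module>`, with the module requested by `-c` reported last, so the second-to-last `|`-separated field of the final line is the total import time in microseconds. A small sketch of that parse on a made-up sample line (the values are illustrative):

    # Sample stderr line from "python -X importtime" (numbers are made up):
    line = b"import time:      1523 |     834467 | pandas"

    # The cumulative-microseconds column sits just before the module name.
    field = line.split(b"|")[-2].strip()
    total_us = int(field)
    assert total_us == 834467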

asv_bench/benchmarks/rolling.py (+3)

@@ -21,6 +21,9 @@ def setup(self, constructor, window, dtype, method):
     def time_rolling(self, constructor, window, dtype, method):
         getattr(self.roll, method)()
 
+    def peakmem_rolling(self, constructor, window, dtype, method):
+        getattr(self.roll, method)()
+
 
 class ExpandingMethods:

asv_bench/benchmarks/stat_ops.py (+11)

@@ -113,12 +113,23 @@ def setup(self, method, use_bottleneck):
         nanops._USE_BOTTLENECK = use_bottleneck
         self.df = pd.DataFrame(np.random.randn(1000, 30))
         self.df2 = pd.DataFrame(np.random.randn(1000, 30))
+        self.df_wide = pd.DataFrame(np.random.randn(1000, 200))
+        self.df_wide_nans = self.df_wide.where(np.random.random((1000, 200)) < 0.9)
         self.s = pd.Series(np.random.randn(1000))
         self.s2 = pd.Series(np.random.randn(1000))
 
     def time_corr(self, method, use_bottleneck):
         self.df.corr(method=method)
 
+    def time_corr_wide(self, method, use_bottleneck):
+        self.df_wide.corr(method=method)
+
+    def time_corr_wide_nans(self, method, use_bottleneck):
+        self.df_wide_nans.corr(method=method)
+
+    def peakmem_corr_wide(self, method, use_bottleneck):
+        self.df_wide.corr(method=method)
+
     def time_corr_series(self, method, use_bottleneck):
         self.s.corr(self.s2, method=method)

ci/code_checks.sh (+7 -3)

@@ -203,10 +203,14 @@ if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then
     import sys
     import pandas
 
-    blacklist = {'bs4', 'gcsfs', 'html5lib', 'ipython', 'jinja2' 'hypothesis',
+    blacklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis',
                  'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy',
-                 'tables', 'xlrd', 'xlsxwriter', 'xlwt'}
-    mods = blacklist & set(m.split('.')[0] for m in sys.modules)
+                 'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'}
+
+    # GH#28227 for some of these check for top-level modules, while others are
+    # more specific (e.g. urllib.request)
+    import_mods = set(m.split('.')[0] for m in sys.modules) | set(sys.modules)
+    mods = blacklist & import_mods
     if mods:
         sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods)))
         sys.exit(len(mods))
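
The union in the new check matters because `sys.modules` keys are full dotted paths: splitting on `'.'` reduces `urllib.request` to `urllib`, which the interpreter imports anyway, so only the unsplit set can flag it, while the split set still catches any submodule of a blacklisted top-level package. A small sketch of both cases (the `loaded` set is illustrative):

    blacklist = {'http', 'urllib.request'}

    # Suppose these ended up in sys.modules after "import pandas":
    loaded = {'urllib', 'urllib.request', 'http', 'http.client', 'pandas.core'}

    top_level = {m.split('.')[0] for m in loaded}  # {'urllib', 'http', 'pandas'}
    import_mods = top_level | loaded               # union keeps the dotted names too

    print(sorted(blacklist & import_mods))         # ['http', 'urllib.request']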

ci/deps/azure-windows-36.yaml (+1 -3)

@@ -1,17 +1,15 @@
 name: pandas-dev
 channels:
-  - defaults
   - conda-forge
+  - defaults
 dependencies:
   - blosc
   - bottleneck
-  - boost-cpp<1.67
   - fastparquet>=0.2.1
   - matplotlib=3.0.2
   - numexpr
   - numpy=1.15.*
   - openpyxl
-  - parquet-cpp
   - pyarrow
   - pytables
   - python-dateutil

doc/source/development/contributing.rst (+38 -3)

@@ -135,9 +135,44 @@ operations. To install pandas from source, you need to compile these C
 extensions, which means you need a C compiler. This process depends on which
 platform you're using.
 
-* Windows: https://devguide.python.org/setup/#windows-compiling
-* Mac: https://devguide.python.org/setup/#macos
-* Unix: https://devguide.python.org/setup/#unix-compiling
+**Windows**
+
+You will need `Build Tools for Visual Studio 2017
+<https://visualstudio.microsoft.com/downloads/>`_.
+
+.. warning::
+    You DO NOT need to install Visual Studio 2019.
+    You only need "Build Tools for Visual Studio 2019" found by
+    scrolling down to "All downloads" -> "Tools for Visual Studio 2019".
+
+**Mac OS**
+
+Information about compiler installation can be found here:
+https://devguide.python.org/setup/#macos
+
+**Unix**
+
+Some Linux distributions will come with a pre-installed C compiler. To find out
+which compilers (and versions) are installed on your system::
+
+    # for Debian/Ubuntu:
+    dpkg --list | grep compiler
+    # for Red Hat/RHEL/CentOS/Fedora:
+    yum list installed | grep -i --color compiler
+
+`GCC (GNU Compiler Collection) <https://gcc.gnu.org/>`_ is a widely used
+compiler, which supports C and a number of other languages. If GCC is listed
+as an installed compiler nothing more is required. If no C compiler is
+installed (or you wish to install a newer version) you can install a compiler
+(GCC in the example code below) with::
+
+    # for recent Debian/Ubuntu:
+    sudo apt install build-essential
+    # for Red Hat/RHEL/CentOS/Fedora:
+    yum groupinstall "Development Tools"
+
+For other Linux distributions, consult your favourite search engine for
+compiler installation instructions.
 
 Let us know if you have any difficulties by opening an issue or reaching out on
 `Gitter`_.

doc/source/ecosystem.rst (+15 -12)

@@ -23,6 +23,21 @@ or `search pypi for pandas <https://pypi.org/search/?q=pandas>`_.
 We'd like to make it easier for users to find these projects, if you know of other
 substantial projects that you feel should be on this list, please let us know.
 
+.. _ecosystem.data_cleaning_and_validation:
+
+Data cleaning and validation
+----------------------------
+
+`pyjanitor <https://github.com/ericmjl/pyjanitor/>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pyjanitor provides a clean API for cleaning data, using method chaining.
+
+`Engarde <https://engarde.readthedocs.io/en/latest/>`__
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Engarde is a lightweight library used to explicitly state assumptions about your datasets
+and check that they're *actually* true.
 
 .. _ecosystem.stats:
 
@@ -329,18 +344,6 @@ Increasingly, packages are being built on top of pandas to address specific need
 * vaex.from_pandas
 * vaex.to_pandas_df
 
-
-.. _ecosystem.data_validation:
-
-Data validation
----------------
-
-`Engarde <https://engarde.readthedocs.io/en/latest/>`__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Engarde is a lightweight library used to explicitly state your assumptions about your datasets
-and check that they're *actually* true.
-
 .. _ecosystem.extensions:
 
 Extension data types

doc/source/reference/plotting.rst (+4)

@@ -13,10 +13,14 @@ The following functions are contained in the `pandas.plotting` module.
    :toctree: api/
 
    andrews_curves
+   autocorrelation_plot
    bootstrap_plot
+   boxplot
    deregister_matplotlib_converters
    lag_plot
    parallel_coordinates
+   plot_params
    radviz
    register_matplotlib_converters
    scatter_matrix
+   table

doc/source/user_guide/io.rst (+23)

@@ -5047,6 +5047,17 @@ Example of a callable using PostgreSQL `COPY clause
     from io import StringIO
 
     def psql_insert_copy(table, conn, keys, data_iter):
+        """
+        Execute SQL statement inserting data
+
+        Parameters
+        ----------
+        table : pandas.io.sql.SQLTable
+        conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
+        keys : list of str
+            Column names
+        data_iter : Iterable that iterates the values to be inserted
+        """
         # gets a DBAPI connection that can provide a cursor
         dbapi_conn = conn.connection
         with dbapi_conn.cursor() as cur:
 
@@ -5080,6 +5091,18 @@ table name and optionally a subset of columns to read.
 
     pd.read_sql_table('data', engine)
 
+.. note::
+
+    Note that pandas infers column dtypes from query outputs, and not by looking
+    up data types in the physical database schema. For example, assume ``userid``
+    is an integer column in a table. Then, intuitively, ``select userid ...`` will
+    return integer-valued series, while ``select cast(userid as text) ...`` will
+    return object-valued (str) series. Accordingly, if the query output is empty,
+    then all resulting columns will be returned as object-valued (since they are
+    most general). If you foresee that your query will sometimes generate an empty
+    result, you may want to explicitly typecast afterwards to ensure dtype
+    integrity.
+
 You can also specify the name of the column as the ``DataFrame`` index,
 and specify a subset of columns to be read.
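
A minimal sketch of the typecast the new note recommends, using an in-memory SQLite database (the table and column names are illustrative):

    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine("sqlite:///:memory:")
    pd.DataFrame({"userid": [1, 2, 3]}).to_sql("data", engine, index=False)

    # A query that happens to match no rows comes back with object dtype.
    df = pd.read_sql("SELECT userid FROM data WHERE userid < 0", engine)

    # Casting afterwards pins the dtype whether or not any rows were returned.
    df["userid"] = df["userid"].astype("int64")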

doc/source/user_guide/options.rst (+1 -1)

@@ -163,7 +163,7 @@ determines how many rows are shown in the truncated repr.
 .. ipython:: python
 
    pd.set_option('max_rows', 8)
-   pd.set_option('max_rows', 4)
+   pd.set_option('min_rows', 4)
    # below max_rows -> all rows shown
    df = pd.DataFrame(np.random.randn(7, 2))
    df

doc/source/whatsnew/v0.25.2.rst (+1)

@@ -78,6 +78,7 @@ Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
 - Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`).
+- Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`)
 -
 -
 -
