diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 0000000000000..e947f30d285cd --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,147 @@ +version: 2 +jobs: + + # -------------------------------------------------------------------------- + # 0. py27_compat + # -------------------------------------------------------------------------- + py27_compat: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + environment: + JOB: "2.7_COMPAT" + ENV_FILE: "ci/circle-27-compat.yaml" + LOCALE_OVERRIDE: "it_IT.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + # -------------------------------------------------------------------------- + # 1. py36_locale + # -------------------------------------------------------------------------- + py36_locale: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.6_LOCALE" + ENV_FILE: "ci/circle-36-locale.yaml" + LOCALE_OVERRIDE: "zh_CN.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + # -------------------------------------------------------------------------- + # 2. py36_locale_slow + # -------------------------------------------------------------------------- + py36_locale_slow: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.6_LOCALE_SLOW" + ENV_FILE: "ci/circle-36-locale_slow.yaml" + LOCALE_OVERRIDE: "zh_CN.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --only-slow --skip-network + + # -------------------------------------------------------------------------- + # 3. 
py35_ascii + # -------------------------------------------------------------------------- + py35_ascii: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.5_ASCII" + ENV_FILE: "ci/circle-35-ascii.yaml" + LOCALE_OVERRIDE: "C" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + +workflows: + version: 2 + build_and_test: + jobs: + - py27_compat + - py36_locale + - py36_locale_slow + - py35_ascii diff --git a/.travis.yml b/.travis.yml index 4e25380a7d941..2d2a0bc019c80 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,6 +35,11 @@ matrix: language: generic env: - JOB="3.5, OSX" ENV_FILE="ci/travis-35-osx.yaml" TEST_ARGS="--skip-slow --skip-network" + + - dist: trusty + env: + - JOB="3.7" ENV_FILE="ci/travis-37.yaml" TEST_ARGS="--skip-slow --skip-network" + - dist: trusty env: - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" SLOW=true diff --git a/MANIFEST.in b/MANIFEST.in index 9773019c6e6e0..b417b8890fa24 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,27 +3,39 @@ include LICENSE include RELEASE.md include README.md include setup.py -include pyproject.toml graft doc prune doc/build +graft LICENSES + graft pandas -global-exclude *.so -global-exclude *.pyd +global-exclude *.bz2 +global-exclude *.csv +global-exclude *.dta +global-exclude *.gz +global-exclude *.h5 +global-exclude *.html +global-exclude *.json +global-exclude *.msgpack +global-exclude *.pickle +global-exclude *.png global-exclude *.pyc +global-exclude *.pyd +global-exclude *.sas7bdat +global-exclude *.so +global-exclude *.xls +global-exclude *.xlsm +global-exclude *.xlsx +global-exclude *.xpt +global-exclude *.xz +global-exclude *.zip global-exclude *~ -global-exclude \#* -global-exclude .git* global-exclude .DS_Store -global-exclude *.png +global-exclude .git* +global-exclude \#* -# include examples/data/* -# recursive-include examples *.py -# recursive-include doc/source * -# recursive-include doc/sphinxext * -# recursive-include LICENSES * include versioneer.py include pandas/_version.py include pandas/io/formats/templates/*.tpl diff --git a/appveyor.yml b/appveyor.yml index f70fc829ec971..c6199c1493f22 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -20,12 +20,14 @@ environment: matrix: - CONDA_ROOT: "C:\\Miniconda3_64" + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" CONDA_PY: "36" CONDA_NPY: "113" - CONDA_ROOT: "C:\\Miniconda3_64" + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 PYTHON_VERSION: "2.7" PYTHON_ARCH: "64" CONDA_PY: "27" diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index ae1d7029217a4..5464e7cba22c3 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -173,3 +173,23 @@ def setup(self, dtype): def time_isin_categorical(self, dtype): self.series.isin(self.sample) + + +class IsMonotonic(object): + + def setup(self): + N = 1000 + self.c = pd.CategoricalIndex(list('a' * N + 'b' * N + 'c' * N)) + self.s = pd.Series(self.c) + + def 
time_categorical_index_is_monotonic_increasing(self): + self.c.is_monotonic_increasing + + def time_categorical_index_is_monotonic_decreasing(self): + self.c.is_monotonic_decreasing + + def time_categorical_series_is_monotonic_increasing(self): + self.s.is_monotonic_increasing + + def time_categorical_series_is_monotonic_decreasing(self): + self.s.is_monotonic_decreasing diff --git a/ci/appveyor-27.yaml b/ci/appveyor-27.yaml index 84107c605b14f..e47ebf75344fa 100644 --- a/ci/appveyor-27.yaml +++ b/ci/appveyor-27.yaml @@ -12,7 +12,7 @@ dependencies: - matplotlib - numexpr - numpy=1.10* - - openpyxl + - openpyxl=2.5.5 - pytables==3.2.2 - python=2.7.* - pytz diff --git a/ci/appveyor-36.yaml b/ci/appveyor-36.yaml index 5e370de39958a..d007f04ca0720 100644 --- a/ci/appveyor-36.yaml +++ b/ci/appveyor-36.yaml @@ -10,7 +10,7 @@ dependencies: - matplotlib - numexpr - numpy=1.13* - - openpyxl + - openpyxl=2.5.5 - pyarrow - pytables - python-dateutil diff --git a/ci/circle-27-compat.yaml b/ci/circle-27-compat.yaml index 81a48d4edf11c..e037877819b14 100644 --- a/ci/circle-27-compat.yaml +++ b/ci/circle-27-compat.yaml @@ -4,11 +4,11 @@ channels: - conda-forge dependencies: - bottleneck=1.0.0 - - cython=0.24 + - cython=0.28.2 - jinja2=2.8 - numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr - - numpy=1.9.2 - - openpyxl + - numpy=1.9.3 + - openpyxl=2.5.5 - psycopg2 - pytables=3.2.2 - python-dateutil=2.5.0 diff --git a/ci/circle-35-ascii.yaml b/ci/circle-35-ascii.yaml index 602c414b49bb2..745678791458d 100644 --- a/ci/circle-35-ascii.yaml +++ b/ci/circle-35-ascii.yaml @@ -2,7 +2,7 @@ name: pandas channels: - defaults dependencies: - - cython + - cython>=0.28.2 - nomkl - numpy - python-dateutil diff --git a/ci/circle-36-locale.yaml b/ci/circle-36-locale.yaml index cc852c1e2aeeb..a85e0b58f5e33 100644 --- a/ci/circle-36-locale.yaml +++ b/ci/circle-36-locale.yaml @@ -13,7 +13,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl + - openpyxl=2.5.5 - psycopg2 - pymysql - pytables diff --git a/ci/circle-36-locale_slow.yaml b/ci/circle-36-locale_slow.yaml index cc852c1e2aeeb..a85e0b58f5e33 100644 --- a/ci/circle-36-locale_slow.yaml +++ b/ci/circle-36-locale_slow.yaml @@ -13,7 +13,7 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl + - openpyxl=2.5.5 - psycopg2 - pymysql - pytables diff --git a/ci/install_circle.sh b/ci/install_circle.sh index 5ffff84c88488..f8bcf6bcffc99 100755 --- a/ci/install_circle.sh +++ b/ci/install_circle.sh @@ -6,14 +6,7 @@ echo "[home_dir: $home_dir]" echo "[ls -ltr]" ls -ltr -echo "[Using clean Miniconda install]" -rm -rf "$MINICONDA_DIR" - -# install miniconda -wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 -bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 - -export PATH="$MINICONDA_DIR/bin:$PATH" +apt-get update -y && apt-get install -y build-essential postgresql-client-9.6 echo "[update conda]" conda config --set ssl_verify false || exit 1 @@ -48,9 +41,17 @@ source $ENVS_FILE # edit the locale override if needed if [ -n "$LOCALE_OVERRIDE" ]; then + + apt-get update && apt-get -y install locales locales-all + + export LANG=$LOCALE_OVERRIDE + export LC_ALL=$LOCALE_OVERRIDE + + python -c "import locale; locale.setlocale(locale.LC_ALL, \"$LOCALE_OVERRIDE\")" || exit 1; + echo "[Adding locale to the first line of pandas/__init__.py]" rm -f pandas/__init__.pyc - sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, 
\"$LOCALE_OVERRIDE\")\n" sed -i "$sedc" pandas/__init__.py echo "[head -4 pandas/__init__.py]" head -4 pandas/__init__.py diff --git a/ci/install_db_circle.sh b/ci/install_db_circle.sh deleted file mode 100755 index a00f74f009f54..0000000000000 --- a/ci/install_db_circle.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -echo "installing dbs" -mysql -e 'create database pandas_nosetest;' -psql -c 'create database pandas_nosetest;' -U postgres - -echo "done" -exit 0 diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt index e8cfcdf80f2e8..ca60c772392e7 100644 --- a/ci/requirements-optional-conda.txt +++ b/ci/requirements-optional-conda.txt @@ -11,7 +11,7 @@ lxml matplotlib nbsphinx numexpr -openpyxl +openpyxl=2.5.5 pyarrow pymysql pytables diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index 877c52fa0b4fd..a6009c270c2a6 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -13,7 +13,7 @@ lxml matplotlib nbsphinx numexpr -openpyxl +openpyxl=2.5.5 pyarrow pymysql tables @@ -26,4 +26,4 @@ sqlalchemy xarray xlrd xlsxwriter -xlwt \ No newline at end of file +xlwt diff --git a/ci/run_circle.sh b/ci/run_circle.sh index 435985bd42148..fc2a8b849a354 100755 --- a/ci/run_circle.sh +++ b/ci/run_circle.sh @@ -6,4 +6,4 @@ export PATH="$MINICONDA_DIR/bin:$PATH" source activate pandas echo "pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" -pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas +pytest --strict --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas diff --git a/ci/script_single.sh b/ci/script_single.sh index f376c920ac71b..60e2fbb33ee5d 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -25,12 +25,12 @@ if [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas else - echo pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas - pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas + pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/ci/travis-27-locale.yaml b/ci/travis-27-locale.yaml index 1312c1296d46a..eacae4630edeb 100644 --- a/ci/travis-27-locale.yaml +++ b/ci/travis-27-locale.yaml @@ -7,7 +7,7 @@ dependencies: - cython=0.24 - lxml - matplotlib=1.4.3 - - numpy=1.9.2 + - numpy=1.9.3 - openpyxl=2.4.0 - python-dateutil - python-blosc diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml index 22b993a2da886..26a520a16a4cc 100644 --- a/ci/travis-27.yaml +++ b/ci/travis-27.yaml @@ -27,6 +27,7 @@ dependencies: - PyCrypto - pymysql=0.6.3 - pytables + - blosc=1.14.3 - python-blosc - python-dateutil=2.5.0 - python=2.7* diff --git a/ci/travis-35-osx.yaml b/ci/travis-35-osx.yaml index e74abac4c9775..5722d91781999 100644 --- a/ci/travis-35-osx.yaml +++ 
b/ci/travis-35-osx.yaml @@ -12,7 +12,7 @@ dependencies: - nomkl - numexpr - numpy=1.10.4 - - openpyxl + - openpyxl=2.5.5 - pytables - python=3.5* - pytz diff --git a/ci/travis-36-doc.yaml b/ci/travis-36-doc.yaml index c22dddbe0ba3f..05ff26020ac7d 100644 --- a/ci/travis-36-doc.yaml +++ b/ci/travis-36-doc.yaml @@ -21,7 +21,7 @@ dependencies: - notebook - numexpr - numpy=1.13* - - openpyxl + - openpyxl=2.5.5 - pandoc - pyqt - pytables @@ -36,6 +36,7 @@ dependencies: - sphinx - sqlalchemy - statsmodels + - tzlocal - xarray - xlrd - xlsxwriter diff --git a/ci/travis-36-slow.yaml b/ci/travis-36-slow.yaml index 6c475dc48723c..ae6353216cc2d 100644 --- a/ci/travis-36-slow.yaml +++ b/ci/travis-36-slow.yaml @@ -10,7 +10,7 @@ dependencies: - matplotlib - numexpr - numpy - - openpyxl + - openpyxl=2.5.5 - patsy - psycopg2 - pymysql diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml index fe057e714761e..83f963b9d9b6d 100644 --- a/ci/travis-36.yaml +++ b/ci/travis-36.yaml @@ -17,13 +17,11 @@ dependencies: - nomkl - numexpr - numpy - - openpyxl - - pandas-datareader + - openpyxl=2.5.5 - psycopg2 - pyarrow - pymysql - pytables - - python-dateutil - python-snappy - python=3.6* - pytz @@ -45,3 +43,5 @@ dependencies: - pip: - brotlipy - coverage + - pandas-datareader + - python-dateutil diff --git a/ci/travis-37.yaml b/ci/travis-37.yaml new file mode 100644 index 0000000000000..8b255c9e6ec72 --- /dev/null +++ b/ci/travis-37.yaml @@ -0,0 +1,14 @@ +name: pandas +channels: + - defaults + - conda-forge + - c3i_test +dependencies: + - python=3.7 + - cython + - numpy + - python-dateutil + - nomkl + - pytz + - pytest + - pytest-xdist diff --git a/circle.yml b/circle.yml deleted file mode 100644 index 66415defba6fe..0000000000000 --- a/circle.yml +++ /dev/null @@ -1,38 +0,0 @@ -machine: - environment: - # these are globally set - MINICONDA_DIR: /home/ubuntu/miniconda3 - - -database: - override: - - ./ci/install_db_circle.sh - - -checkout: - post: - # since circleci does a shallow fetch - # we need to populate our tags - - git fetch --depth=1000 - - -dependencies: - override: - - > - case $CIRCLE_NODE_INDEX in - 0) - sudo apt-get install language-pack-it && ./ci/install_circle.sh JOB="2.7_COMPAT" ENV_FILE="ci/circle-27-compat.yaml" LOCALE_OVERRIDE="it_IT.UTF-8" ;; - 1) - sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh JOB="3.6_LOCALE" ENV_FILE="ci/circle-36-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; - 2) - sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh JOB="3.6_LOCALE_SLOW" ENV_FILE="ci/circle-36-locale_slow.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; - 3) - ./ci/install_circle.sh JOB="3.5_ASCII" ENV_FILE="ci/circle-35-ascii.yaml" LOCALE_OVERRIDE="C" ;; - esac - - ./ci/show_circle.sh - - -test: - override: - - case $CIRCLE_NODE_INDEX in 0) ./ci/run_circle.sh --skip-slow --skip-network ;; 1) ./ci/run_circle.sh --only-slow --skip-network ;; 2) ./ci/run_circle.sh --skip-slow --skip-network ;; 3) ./ci/run_circle.sh --skip-slow --skip-network ;; esac: - parallel: true diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index c81842d3d9212..ec517d3e07bdf 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -924,6 +924,55 @@ bins, with ``NaN`` representing a missing value similar to other dtypes. 
pd.cut([0, 3, 5, 1], bins=c.categories) + +Generating Ranges of Intervals +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If we need intervals on a regular frequency, we can use the :func:`interval_range` function +to create an ``IntervalIndex`` using various combinations of ``start``, ``end``, and ``periods``. +The default frequency for ``interval_range`` is 1 for numeric intervals, and calendar day for +datetime-like intervals: + +.. ipython:: python + + pd.interval_range(start=0, end=5) + + pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4) + + pd.interval_range(end=pd.Timedelta('3 days'), periods=3) + +The ``freq`` parameter can be used to specify non-default frequencies, and can utilize a variety +of :ref:`frequency aliases ` with datetime-like intervals: + +.. ipython:: python + + pd.interval_range(start=0, periods=5, freq=1.5) + + pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4, freq='W') + + pd.interval_range(start=pd.Timedelta('0 days'), periods=3, freq='9H') + +Additionally, the ``closed`` parameter can be used to specify which side(s) the intervals +are closed on. Intervals are closed on the right side by default. + +.. ipython:: python + + pd.interval_range(start=0, end=4, closed='both') + + pd.interval_range(start=0, end=4, closed='neither') + +.. versionadded:: 0.23.0 + +Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced +intervals from ``start`` to ``end`` inclusively, with ``periods`` number of elements +in the resulting ``IntervalIndex``: + +.. ipython:: python + + pd.interval_range(start=0, end=6, periods=4) + + pd.interval_range(pd.Timestamp('2018-01-01'), pd.Timestamp('2018-02-28'), periods=3) + Miscellaneous indexing FAQ -------------------------- diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 30cdb06b28487..6714398084186 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -38,7 +38,10 @@ Statsmodels leverages pandas objects as the underlying data container for comput Use pandas DataFrames in your `scikit-learn `__ ML pipeline. +`Featuretools `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. .. _ecosystem.visualization: diff --git a/doc/source/install.rst b/doc/source/install.rst index 6054be112f52c..846170f9f0fa5 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -43,7 +43,7 @@ For more information, see the `Python 3 statement`_ and the `Porting to Python 3 Python version support ---------------------- -Officially Python 2.7, 3.5, and 3.6. +Officially Python 2.7, 3.5, 3.6, and 3.7. Installing pandas ----------------- diff --git a/doc/source/io.rst b/doc/source/io.rst index aa2484b0cb5c3..d818f486ad62d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -4719,14 +4719,6 @@ writes ``data`` to the database in batches of 1000 rows at a time: data.to_sql('data_chunked', engine, chunksize=1000) -.. note:: - - The function :func:`~pandas.DataFrame.to_sql` will perform a multivalue - insert if the engine dialect ``supports_multivalues_insert``. This will - greatly speed up the insert in some cases. 
- -SQL data types -++++++++++++++ :func:`~pandas.DataFrame.to_sql` will try to map your data to an appropriate SQL data type based on the dtype of the data. When you have columns of dtype diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 1161656731f88..4d7cd0bdadef7 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -279,7 +279,7 @@ need to be: Ignoring indexes on the concatenation axis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For ``DataFrame``s which don't have a meaningful index, you may wish to append +For ``DataFrame`` s which don't have a meaningful index, you may wish to append them and ignore the fact that they may have overlapping indexes. To do this, use the ``ignore_index`` argument: @@ -314,7 +314,7 @@ This is also a valid argument to :meth:`DataFrame.append`: Concatenating with mixed ndims ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can concatenate a mix of ``Series`` and ``DataFrame``s. The +You can concatenate a mix of ``Series`` and ``DataFrame`` s. The ``Series`` will be transformed to ``DataFrame`` with the column name as the name of the ``Series``. diff --git a/doc/source/release.rst b/doc/source/release.rst index 32db2ff5ebb24..08200d4d276cc 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -37,10 +37,91 @@ analysis / manipulation tool available in any language. * Binary installers on PyPI: https://pypi.org/project/pandas * Documentation: http://pandas.pydata.org +pandas 0.23.2 +------------- + +**Release date**: July 5, 2018 + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. + +See the :ref:`full whatsnew ` for a list of all the changes. + +Thanks +~~~~~~ + +A total of 17 people contributed to this release. People with a "+" by their +names contributed a patch for the first time. + +* David Krych +* Jacopo Rota + +* Jeff Reback +* Jeremy Schendel +* Joris Van den Bossche +* Kalyan Gokhale +* Matthew Roeschke +* Michael Odintsov + +* Ming Li +* Pietro Battiston +* Tom Augspurger +* Uddeshya Singh +* Vu Le + +* alimcmaster1 + +* david-liu-brattle-1 + +* gfyoung +* jbrockmendel + +pandas 0.23.1 +------------- + +**Release date**: June 12, 2018 + +This is a minor release from 0.23.0 and includes a number of bug fixes and +performance improvements. + +See the :ref:`full whatsnew ` for a list of all the changes. + +Thanks +~~~~~~ + +A total of 30 people contributed to this release. People with a "+" by their +names contributed a patch for the first time. + +* Adam J. Stewart +* Adam Kim + +* Aly Sivji +* Chalmer Lowe + +* Damini Satya + +* Dr. 
Irv +* Gabe Fernando + +* Giftlin Rajaiah +* Jeff Reback +* Jeremy Schendel + +* Joris Van den Bossche +* Kalyan Gokhale + +* Kevin Sheppard +* Matthew Roeschke +* Max Kanter + +* Ming Li +* Pyry Kovanen + +* Stefano Cianciulli +* Tom Augspurger +* Uddeshya Singh + +* Wenhuan +* William Ayd +* chris-b1 +* gfyoung +* h-vetinari +* nprad + +* ssikdar1 + +* tmnhat2001 +* topper-123 +* zertrin + + pandas 0.23.0 ------------- -**Release date**: May 15, 2017 +**Release date**: May 15, 2018 This is a major release from 0.22.0 and includes a number of API changes, new features, enhancements, and performance improvements along with a large number diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index 5f3a01f0725d4..745810704f665 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -352,8 +352,8 @@ You can convert a ``Timedelta`` to an `ISO 8601 Duration`_ string with the TimedeltaIndex -------------- -To generate an index with time delta, you can use either the ``TimedeltaIndex`` or -the ``timedelta_range`` constructor. +To generate an index with time delta, you can use either the :class:`TimedeltaIndex` or +the :func:`timedelta_range` constructor. Using ``TimedeltaIndex`` you can pass string-like, ``Timedelta``, ``timedelta``, or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent missing values. @@ -363,13 +363,47 @@ or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent miss pd.TimedeltaIndex(['1 days', '1 days, 00:00:05', np.timedelta64(2,'D'), datetime.timedelta(days=2,seconds=2)]) -Similarly to ``date_range``, you can construct regular ranges of a ``TimedeltaIndex``: +Generating Ranges of Time Deltas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Similar to :func:`date_range`, you can construct regular ranges of a ``TimedeltaIndex`` +using :func:`timedelta_range`. The default frequency for ``timedelta_range`` is +calendar day: + +.. ipython:: python + + pd.timedelta_range(start='1 days', periods=5) + +Various combinations of ``start``, ``end``, and ``periods`` can be used with +``timedelta_range``: + +.. ipython:: python + + pd.timedelta_range(start='1 days', end='5 days') + + pd.timedelta_range(end='10 days', periods=4) + +The ``freq`` parameter can be passed a variety of :ref:`frequency aliases `: .. ipython:: python - pd.timedelta_range(start='1 days', periods=5, freq='D') pd.timedelta_range(start='1 days', end='2 days', freq='30T') + pd.timedelta_range(start='1 days', periods=5, freq='2D5H') + + +.. versionadded:: 0.23.0 + +Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced +timedeltas from ``start`` to ``end`` inclusively, with ``periods`` number of elements +in the resulting ``TimedeltaIndex``: + +.. ipython:: python + + pd.timedelta_range('0 days', '4 days', periods=5) + + pd.timedelta_range('0 days', '4 days', periods=10) + Using the TimedeltaIndex ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 73e3e721aad71..1b0cf86995a39 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -393,6 +393,18 @@ of those specified will not be generated: pd.bdate_range(start=start, periods=20) +.. versionadded:: 0.23.0 + +Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced +dates from ``start`` to ``end`` inclusively, with ``periods`` number of elements in the +resulting ``DatetimeIndex``: + +.. 
ipython:: python + + pd.date_range('2018-01-01', '2018-01-05', periods=5) + + pd.date_range('2018-01-01', '2018-01-05', periods=10) + .. _timeseries.custom-freq-ranges: Custom Frequency Ranges diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index d61a98fe2dae4..afd274332b3df 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,12 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.23.3.txt + +.. include:: whatsnew/v0.23.2.txt + +.. include:: whatsnew/v0.23.1.txt + .. include:: whatsnew/v0.23.0.txt .. include:: whatsnew/v0.22.0.txt diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 3f89de1dc22d8..feba9d856789b 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1,6 +1,6 @@ .. _whatsnew_0230: -v0.23.0 (May 15, 2017) +v0.23.0 (May 15, 2018) ---------------------- This is a major release from 0.22.0 and includes a number of API changes, diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt new file mode 100644 index 0000000000000..9f8635743ea6a --- /dev/null +++ b/doc/source/whatsnew/v0.23.1.txt @@ -0,0 +1,140 @@ +.. _whatsnew_0231: + +v0.23.1 +------- + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + +.. contents:: What's new in v0.23.1 + :local: + :backlinks: none + +.. _whatsnew_0231.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +**Comparing Series with datetime.date** + +We've reverted a 0.23.0 change to comparing a :class:`Series` holding datetimes and a ``datetime.date`` object (:issue:`21152`). +In pandas 0.22 and earlier, comparing a Series holding datetimes and ``datetime.date`` objects would coerce the ``datetime.date`` to a datetime before comparing. +This was inconsistent with Python, NumPy, and :class:`DatetimeIndex`, which never consider a datetime and ``datetime.date`` equal. + +In 0.23.0, we unified operations between DatetimeIndex and Series, and in the process changed comparisons between a Series of datetimes and ``datetime.date`` without warning. + +We've temporarily restored the 0.22.0 behavior, so datetimes and dates may again compare equal, but will restore the 0.23.0 behavior in a future release. + +To summarize, here's the behavior in 0.22.0, 0.23.0, 0.23.1: + +.. code-block:: python + + # 0.22.0... Silently coerce the datetime.date + >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1) + 0 True + 1 False + dtype: bool + + # 0.23.0... Do not coerce the datetime.date + >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1) + 0 False + 1 False + dtype: bool + + # 0.23.1... Coerce the datetime.date with a warning + >>> Series(pd.date_range('2017', periods=2)) == datetime.date(2017, 1, 1) + /bin/python:1: FutureWarning: Comparing Series of datetimes with 'datetime.date'. Currently, the + 'datetime.date' is coerced to a datetime. In the future pandas will + not coerce, and the values not compare equal to the 'datetime.date'. + To retain the current behavior, convert the 'datetime.date' to a + datetime with 'pd.Timestamp'. + #!/bin/python3 + 0 True + 1 False + dtype: bool + +In addition, ordering comparisons will raise a ``TypeError`` in the future. 
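+If code needs to behave the same way across 0.22.x, 0.23.0, and 0.23.1 while
+this behavior is in flux, one option (a sketch following the suggestion in the
+warning text above, not an API change in this release) is to convert the
+``datetime.date`` to a ``pd.Timestamp`` explicitly before comparing:
+
+.. code-block:: python
+
+   >>> # explicit conversion gives the same result in all three versions
+   >>> Series(pd.date_range('2017', periods=2)) == pd.Timestamp(datetime.date(2017, 1, 1))
+   0     True
+   1    False
+   dtype: bool
+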
+ +**Other Fixes** + +- Reverted the ability of :func:`~DataFrame.to_sql` to perform multivalue + inserts as this caused a regression in certain cases (:issue:`21103`). + In the future this will be made configurable. +- Fixed regression in the :attr:`DatetimeIndex.date` and :attr:`DatetimeIndex.time` + attributes in case of timezone-aware data: :attr:`DatetimeIndex.time` returned + a tz-aware time instead of tz-naive (:issue:`21267`) and :attr:`DatetimeIndex.date` + returned an incorrect date when the input date has a non-UTC timezone (:issue:`21230`). +- Fixed regression in :meth:`pandas.io.json.json_normalize` when called with ``None`` values + in nested levels in JSON, and to not drop keys whose value is ``None`` (:issue:`21158`, :issue:`21356`). +- Bug in :meth:`~DataFrame.to_csv` that caused an encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) +- Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) +- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when the individual categories are iterable and ``value`` is an iterable (:issue:`21097`, :issue:`19788`) +- Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`) +- Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing + values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`) +- Fixed regression in merging on boolean index/columns (:issue:`21119`). + +.. _whatsnew_0231.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Improved performance of :meth:`CategoricalIndex.is_monotonic_increasing`, :meth:`CategoricalIndex.is_monotonic_decreasing` and :meth:`CategoricalIndex.is_monotonic` (:issue:`21025`) +- Improved performance of :meth:`CategoricalIndex.is_unique` (:issue:`21107`) + + +.. 
_whatsnew_0231.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +**Groupby/Resample/Rolling** + +- Bug in :func:`DataFrame.agg` where applying multiple aggregation functions to a :class:`DataFrame` with duplicated column names would cause a stack overflow (:issue:`21063`) +- Bug in :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` where the fill within a grouping would not always be applied as intended due to the implementations' use of a non-stable sort (:issue:`21207`) +- Bug in :func:`pandas.core.groupby.GroupBy.rank` where results did not scale to 100% when specifying ``method='dense'`` and ``pct=True`` +- Bug in :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling` which incorrectly accepted a 0 window size rather than raising (:issue:`21286`) + +**Data-type specific** + +- Bug in :meth:`Series.str.replace()` where the method threw a `TypeError` on Python 3.5.2 (:issue:`21078`) +- Bug in :class:`Timedelta` where passing a float with a unit would prematurely round the float precision (:issue:`14156`) +- Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) + +**Sparse** + +- Bug in :attr:`SparseArray.shape` which previously only returned the shape of :attr:`SparseArray.sp_values` (:issue:`21126`) + +**Indexing** + +- Bug in :meth:`Series.reset_index` where an appropriate error was not raised with an invalid level name (:issue:`20925`) +- Bug in :func:`interval_range` when ``start``/``periods`` or ``end``/``periods`` are specified with float ``start`` or ``end`` (:issue:`21161`) +- Bug in :meth:`MultiIndex.set_names` where an error was raised for a ``MultiIndex`` with ``nlevels == 1`` (:issue:`21149`) +- Bug in :class:`IntervalIndex` constructors where creating an ``IntervalIndex`` from categorical data was not fully supported (:issue:`21243`, :issue:`21253`) +- Bug in :meth:`MultiIndex.sort_index` which was not guaranteed to sort correctly with ``level=1``; this was also causing data misalignment in particular :meth:`DataFrame.stack` operations (:issue:`20994`, :issue:`20945`, :issue:`21052`) + +**Plotting** + +- New keywords (sharex, sharey) to turn on/off sharing of x/y-axis by subplots generated with pandas.DataFrame().groupby().boxplot() (:issue:`20968`) + +**I/O** + +- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`) +- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) +- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`) +- Bug in IO JSON :func:`read_json` where reading an empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`) + +**Reshaping** + +- Bug in :func:`concat` where an error was raised when concatenating :class:`Series` with numpy scalar and tuple names (:issue:`21015`) +- Bug in :func:`concat` warning message providing the wrong guidance for future behavior (:issue:`21101`) + +**Other** + +- Tab completion on :class:`Index` in IPython no longer outputs deprecation warnings (:issue:`21125`) +- Bug preventing pandas from being used on Windows without C++ redistributable installed (:issue:`21106`) diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt new file mode 100644 index 
0000000000000..77ad860fc4e8e --- /dev/null +++ b/doc/source/whatsnew/v0.23.2.txt @@ -0,0 +1,108 @@ +.. _whatsnew_0232: + +v0.23.2 +------- + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + +.. note:: + + Pandas 0.23.2 is the first pandas release that's compatible with + Python 3.7 (:issue:`20552`). + +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + +.. contents:: What's new in v0.23.2 + :local: + :backlinks: none + +.. _whatsnew_0232.enhancements: + +Logical Reductions over Entire DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`DataFrame.all` and :meth:`DataFrame.any` now accept ``axis=None`` to reduce over all axes to a scalar (:issue:`19976`) + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2], "B": [True, False]}) + df.all(axis=None) + + +This also provides compatibility with NumPy 1.15, which now dispatches to ``DataFrame.all``. +With NumPy 1.15 and pandas 0.23.1 or earlier, :func:`numpy.all` will no longer reduce over every axis: + +.. code-block:: python + + >>> # NumPy 1.15, pandas 0.23.1 + >>> np.any(pd.DataFrame({"A": [False], "B": [False]})) + A False + B False + dtype: bool + +With pandas 0.23.2, that will correctly return False, as it did with NumPy < 1.15. + +.. ipython:: python + + np.any(pd.DataFrame({"A": [False], "B": [False]})) + + +.. _whatsnew_0232.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +- Fixed regression in :meth:`to_csv` when handling a file-like object incorrectly (:issue:`21471`) +- Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`). +- Bug in both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` which raised for a row index having duplicate values (:issue:`21441`) +- Fixed printing of DataFrames with hierarchical columns with long names (:issue:`21180`) +- Fixed regression in :meth:`~DataFrame.reindex` and :meth:`~DataFrame.groupby` + with a MultiIndex or multiple keys that contains categorical datetime-like values (:issue:`21390`). +- Fixed regression in unary negative operations with object dtype (:issue:`21380`) +- Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when the timestamp is a multiple of the rounding frequency (:issue:`21262`) +- Fixed regression in :func:`to_clipboard` that defaulted to copying dataframes as space-delimited instead of tab-delimited (:issue:`21104`) + + +Build Changes +~~~~~~~~~~~~~ + +- The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`) + +.. 
_whatsnew_0232.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +**Conversion** + +- Bug in constructing :class:`Index` with an iterator or generator (:issue:`21470`) +- Bug in :meth:`Series.nlargest` for signed and unsigned integer dtypes when the minimum value is present (:issue:`21426`) + +**Indexing** + +- Bug in :meth:`Index.get_indexer_non_unique` with a categorical key (:issue:`21448`) +- Bug in comparison operations for :class:`MultiIndex` where an error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`) +- Bug in :meth:`DataFrame.drop` where behaviour was not consistent for unique and non-unique indexes (:issue:`21494`) +- Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`). + +**I/O** + +- Bug in :func:`read_csv` that caused it to incorrectly raise an error when ``nrows=0``, ``low_memory=True``, and ``index_col`` was not ``None`` (:issue:`21141`) +- Bug in :func:`json_normalize` when formatting the ``record_prefix`` with integer columns (:issue:`21536`) + +**Categorical** + +- Bug in rendering :class:`Series` with ``Categorical`` dtype in rare conditions under Python 2.7 (:issue:`21002`) + +**Timezones** + +- Bug in :class:`Timestamp` and :class:`DatetimeIndex` where passing a :class:`Timestamp` localized after a DST transition would return a datetime before the DST transition (:issue:`20854`) +- Bug in comparing :class:`DataFrame` objects with tz-aware :class:`DatetimeIndex` columns with a DST transition that raised a ``KeyError`` (:issue:`19970`) + +**Timedelta** + +- Bug in :class:`Timedelta` where non-zero timedeltas shorter than 1 microsecond were considered False (:issue:`21484`) diff --git a/doc/source/whatsnew/v0.23.3.txt b/doc/source/whatsnew/v0.23.3.txt new file mode 100644 index 0000000000000..b8adce27d2523 --- /dev/null +++ b/doc/source/whatsnew/v0.23.3.txt @@ -0,0 +1,7 @@ +.. _whatsnew_0233: + +v0.23.3 (July 7, 2018) +---------------------- + +This release fixes a build issue with the sdist for Python 3.7 (:issue:`21785`). +There are no other changes. diff --git a/doc/source/whatsnew/v0.23.4.txt b/doc/source/whatsnew/v0.23.4.txt new file mode 100644 index 0000000000000..9a3ad3f61ee49 --- /dev/null +++ b/doc/source/whatsnew/v0.23.4.txt @@ -0,0 +1,37 @@ +.. _whatsnew_0234: + +v0.23.4 (August 3, 2018) +------------------------ + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + +.. contents:: What's new in v0.23.4 + :local: + :backlinks: none + +.. _whatsnew_0234.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +- Python 3.7 with Windows gave all missing values for rolling variance calculations (:issue:`21813`) + +.. 
_whatsnew_0234.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +**Groupby/Resample/Rolling** + +- Bug where calling :func:`DataFrameGroupBy.agg` with a list of functions including ``ohlc`` as the non-initial element would raise a ``ValueError`` (:issue:`21716`) +- Bug in ``roll_quantile`` which caused a memory leak when calling ``.rolling(...).quantile(q)`` with ``q`` in (0,1) (:issue:`21965`) + +**Missing** + +- Bug in :func:`Series.clip` and :func:`DataFrame.clip` which could not accept a list-like threshold containing ``NaN`` (:issue:`19992`) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt new file mode 100644 index 0000000000000..f69e38e7fdd50 --- /dev/null +++ b/doc/source/whatsnew/v0.23.5.txt @@ -0,0 +1,46 @@ +.. _whatsnew_0235: + +v0.23.5 (TBD 0, 2018) +--------------------- + +This is a minor bug-fix release in the 0.23.x series and includes some small regression fixes +and bug fixes. We recommend that all users upgrade to this version. + +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + +.. contents:: What's new in v0.23.5 + :local: + :backlinks: none + +.. _whatsnew_0235.fixed_regressions: + +Fixed Regressions +~~~~~~~~~~~~~~~~~ + +- Calling :meth:`DataFrameGroupBy.rank` and :meth:`SeriesGroupBy.rank` with empty groups + and ``pct=True`` was raising a ``ZeroDivisionError`` due to `c1068d9 + `_ (:issue:`22519`) +- +- + +.. _whatsnew_0235.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +**Groupby/Resample/Rolling** + +- Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). +- + +**Missing** + +- +- + +**I/O** + +- Bug in :func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 43afd1e0f5969..a6dbaff17e543 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -297,7 +297,8 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, # Make sure all arrays are the same size assert N == len(labels) == len(mask) - sorted_labels = np.argsort(labels).astype(np.int64, copy=False) + sorted_labels = np.argsort(labels, kind='mergesort').astype( + np.int64, copy=False) if direction == 'bfill': sorted_labels = sorted_labels[::-1] diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 6a33e4a09476d..d7885e112a7e0 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -418,7 +418,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, bint is_datetimelike, object ties_method, bint ascending, bint pct, object na_option): """ - Provides the rank of values within each group. + Provides the rank of values within each group. Parameters ---------- @@ -451,8 +451,8 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, """ cdef: TiebreakEnumType tiebreak - Py_ssize_t i, j, N, K, val_start=0, grp_start=0, dups=0, sum_ranks=0 - Py_ssize_t grp_vals_seen=1, grp_na_count=0 + Py_ssize_t i, j, N, K, grp_start=0, dups=0, sum_ranks=0 + Py_ssize_t grp_vals_seen=1, grp_na_count=0, grp_tie_count=0 ndarray[int64_t] _as ndarray[float64_t, ndim=2] grp_sizes ndarray[{{c_type}}] masked_vals @@ -563,6 +563,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, dups = sum_ranks = 0 val_start = i grp_vals_seen += 1 + grp_tie_count += 1 # Similar to the previous conditional, check now if we are moving # to a new group. 
If so, keep track of the index where the new @@ -571,17 +572,27 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # (used by pct calculations later). also be sure to reset any of # the items helping to calculate dups if i == N - 1 or labels[_as[i]] != labels[_as[i+1]]: - for j in range(grp_start, i + 1): - grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count + if tiebreak != TIEBREAK_DENSE: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = i - grp_start + 1 - grp_na_count + else: + for j in range(grp_start, i + 1): + grp_sizes[_as[j], 0] = (grp_tie_count - + (grp_na_count > 0)) dups = sum_ranks = 0 grp_na_count = 0 - val_start = i + 1 + grp_tie_count = 0 grp_start = i + 1 grp_vals_seen = 1 if pct: for i in range(N): - out[i, 0] = out[i, 0] / grp_sizes[i, 0] + # We don't include NaN values in percentage + # rankings, so we assign them percentages of NaN. + if out[i, 0] != out[i, 0] or out[i, 0] == NAN: + out[i, 0] = NAN + else: + out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endif}} {{endfor}} diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index c6f182ac5003f..4489847518a1d 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -8,8 +8,7 @@ import numpy as np from numpy cimport ndarray, uint8_t, uint32_t, uint64_t from util cimport _checknull -from cpython cimport (PyString_Check, - PyBytes_Check, +from cpython cimport (PyBytes_Check, PyUnicode_Check) from libc.stdlib cimport malloc, free @@ -62,9 +61,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): cdef list datas = [] for i in range(n): val = arr[i] - if PyString_Check(val): - data = val.encode(encoding) - elif PyBytes_Check(val): + if PyBytes_Check(val): data = val elif PyUnicode_Check(val): data = val.encode(encoding) diff --git a/pandas/_libs/src/headers/cmath b/pandas/_libs/src/headers/cmath index d8e2239406cae..2bccf9bb13d77 100644 --- a/pandas/_libs/src/headers/cmath +++ b/pandas/_libs/src/headers/cmath @@ -6,6 +6,7 @@ #if defined(_MSC_VER) && (_MSC_VER < 1800) #include namespace std { + __inline int isnan(double x) { return _isnan(x); } __inline int signbit(double num) { return _copysign(1.0, num) < 0; } } #else diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 17453d8af1297..0f58cfa761f21 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -77,7 +77,7 @@ cdef inline object create_time_from_ts( int64_t value, pandas_datetimestruct dts, object tz, object freq): """ convenience routine to construct a datetime.time from its parts """ - return time(dts.hour, dts.min, dts.sec, dts.us, tz) + return time(dts.hour, dts.min, dts.sec, dts.us) def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index f4841e6abb7e8..3cbef82437544 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -347,25 +347,11 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, if tz is not None: tz = maybe_get_tz(tz) - # sort of a temporary hack if ts.tzinfo is not None: - if hasattr(tz, 'normalize') and hasattr(ts.tzinfo, '_utcoffset'): - ts = tz.normalize(ts) - obj.value = pydatetime_to_dt64(ts, &obj.dts) - obj.tzinfo = ts.tzinfo - else: - # tzoffset - try: - tz = ts.astimezone(tz).tzinfo - except: - pass - obj.value = pydatetime_to_dt64(ts, &obj.dts) - ts_offset = get_utcoffset(ts.tzinfo, ts) - obj.value -= int(ts_offset.total_seconds() * 1e9) - tz_offset = get_utcoffset(tz, ts) - obj.value += 
int(tz_offset.total_seconds() * 1e9) - dt64_to_dtstruct(obj.value, &obj.dts) - obj.tzinfo = tz + # Convert the current timezone to the passed timezone + ts = ts.astimezone(tz) + obj.value = pydatetime_to_dt64(ts, &obj.dts) + obj.tzinfo = ts.tzinfo elif not is_utc(tz): ts = _localize_pydatetime(ts, tz) obj.value = pydatetime_to_dt64(ts, &obj.dts) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index d17d4e7139d72..769f3ca5fa8bf 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -202,22 +202,22 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1: if unit == 'D' or unit == 'd': m = 1000000000L * 86400 - p = 6 + p = 9 elif unit == 'h': m = 1000000000L * 3600 - p = 6 + p = 9 elif unit == 'm': m = 1000000000L * 60 - p = 6 + p = 9 elif unit == 's': m = 1000000000L - p = 6 + p = 9 elif unit == 'ms': m = 1000000L - p = 3 + p = 6 elif unit == 'us': m = 1000L - p = 0 + p = 3 elif unit == 'ns' or unit is None: m = 1L p = 0 @@ -231,10 +231,10 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1: # cast the unit, multiply base/frace separately # to avoid precision issues from float -> int base = ts - frac = ts -base + frac = ts - base if p: frac = round(frac, p) - return (base *m) + (frac *m) + return (base * m) + (frac * m) cdef inline _decode_if_necessary(object ts): @@ -760,7 +760,32 @@ cdef class _Timedelta(timedelta): @property def delta(self): - """ return out delta in ns (for internal compat) """ + """ + Return the timedelta in nanoseconds (ns), for internal compatibility. + + Returns + ------- + int + Timedelta in nanoseconds. + + Examples + -------- + >>> td = pd.Timedelta('1 days 42 ns') + >>> td.delta + 86400000000042 + + >>> td = pd.Timedelta('3 s') + >>> td.delta + 3000000000 + + >>> td = pd.Timedelta('3 ms 5 us') + >>> td.delta + 3005000 + + >>> td = pd.Timedelta(42, unit='ns') + >>> td.delta + 42 + """ return self.value @property @@ -791,9 +816,32 @@ cdef class _Timedelta(timedelta): @property def nanoseconds(self): """ - Number of nanoseconds (>= 0 and less than 1 microsecond). + Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. + + Returns + ------- + int + Number of nanoseconds. - .components will return the shown components + See Also + -------- + Timedelta.components : Return all attributes with assigned values + (i.e. days, hours, minutes, seconds, milliseconds, microseconds, + nanoseconds). + + Examples + -------- + **Using string input** + + >>> td = pd.Timedelta('1 days 2 min 3 us 42 ns') + >>> td.nanoseconds + 42 + + **Using integer input** + + >>> td = pd.Timedelta(42, unit='ns') + >>> td.nanoseconds + 42 """ self._ensure_components() return self._ns @@ -851,6 +899,9 @@ cdef class _Timedelta(timedelta): def __str__(self): return self._repr_base(format='long') + def __bool__(self): + return self.value != 0 + def isoformat(self): """ Format Timedelta as ISO 8601 Duration like @@ -1198,7 +1249,7 @@ class Timedelta(_Timedelta): deprecated. Use 'array // timedelta.value' instead. If you want to obtain epochs from an array of timestamps, you can rather use - 'array - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")'. + '(array - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")'. 
""") warnings.warn(msg, FutureWarning) return other // self.value diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index ba5ebdab82ddc..123ccebf83a56 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -59,42 +59,51 @@ cdef inline object create_timestamp_from_ts(int64_t value, def round_ns(values, rounder, freq): + """ Applies rounding function at given frequency Parameters ---------- - values : int, :obj:`ndarray` - rounder : function + values : :obj:`ndarray` + rounder : function, eg. 'ceil', 'floor', 'round' freq : str, obj Returns ------- - int or :obj:`ndarray` + :obj:`ndarray` """ + from pandas.tseries.frequencies import to_offset unit = to_offset(freq).nanos + + # GH21262 If the Timestamp is multiple of the freq str + # don't apply any rounding + mask = values % unit == 0 + if mask.all(): + return values + r = values.copy() + if unit < 1000: # for nano rounding, work with the last 6 digits separately # due to float precision buff = 1000000 - r = (buff * (values // buff) + unit * - (rounder((values % buff) * (1 / float(unit)))).astype('i8')) + r[~mask] = (buff * (values[~mask] // buff) + + unit * (rounder((values[~mask] % buff) * + (1 / float(unit)))).astype('i8')) else: if unit % 1000 != 0: msg = 'Precision will be lost using frequency: {}' warnings.warn(msg.format(freq)) - # GH19206 # to deal with round-off when unit is large if unit >= 1e9: divisor = 10 ** int(np.log10(unit / 1e7)) else: divisor = 10 - - r = (unit * rounder((values * (divisor / float(unit))) / divisor) - .astype('i8')) - + r[~mask] = (unit * rounder((values[~mask] * + (divisor / float(unit))) / divisor) + .astype('i8')) return r @@ -649,7 +658,10 @@ class Timestamp(_Timestamp): else: value = self.value - r = round_ns(value, rounder, freq) + value = np.array([value], dtype=np.int64) + + # Will only ever contain 1 element for timestamp + r = round_ns(value, rounder, freq)[0] result = Timestamp(r, unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index 5121d293efcb6..6954094b46e69 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -14,6 +14,7 @@ cnp.import_array() cdef extern from "../src/headers/cmath" namespace "std": + bint isnan(double) nogil int signbit(double) nogil double sqrt(double x) nogil @@ -654,16 +655,16 @@ cdef inline void add_var(double val, double *nobs, double *mean_x, double *ssqdm_x) nogil: """ add a value from the var calc """ cdef double delta + # `isnan` instead of equality as fix for GH-21813, msvc 2017 bug + if isnan(val): + return - # Not NaN - if val == val: - nobs[0] = nobs[0] + 1 - - # a part of Welford's method for the online variance-calculation - # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - delta = val - mean_x[0] - mean_x[0] = mean_x[0] + delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] + nobs[0] = nobs[0] + 1 + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] + mean_x[0] = mean_x[0] + delta / nobs[0] + ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] cdef inline void remove_var(double val, double *nobs, double *mean_x, @@ -1482,6 +1483,8 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, else: output[i] = NaN + skiplist_destroy(skiplist) + return output diff --git a/pandas/compat/__init__.py 
b/pandas/compat/__init__.py index 12517372fedd1..28a55133e68aa 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -40,10 +40,11 @@ from collections import namedtuple PY2 = sys.version_info[0] == 2 -PY3 = (sys.version_info[0] >= 3) -PY35 = (sys.version_info >= (3, 5)) -PY36 = (sys.version_info >= (3, 6)) -PYPY = (platform.python_implementation() == 'PyPy') +PY3 = sys.version_info[0] >= 3 +PY35 = sys.version_info >= (3, 5) +PY36 = sys.version_info >= (3, 6) +PY37 = sys.version_info >= (3, 7) +PYPY = platform.python_implementation() == 'PyPy' try: import __builtin__ as builtins @@ -425,7 +426,7 @@ def raise_with_traceback(exc, traceback=Ellipsis): # In Python 3.7, the private re._pattern_type is removed. # Python 3.5+ have typing.re.Pattern -if PY35: +if PY36: import typing re_type = typing.re.Pattern else: diff --git a/pandas/conftest.py b/pandas/conftest.py index b09cb872a12fb..ead357747666d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,5 +1,8 @@ +import os + import pytest +import pandas import numpy as np import pandas as pd from pandas.compat import PY3 @@ -15,6 +18,8 @@ def pytest_addoption(parser): help="run high memory tests") parser.addoption("--only-slow", action="store_true", help="run only slow tests") + parser.addoption("--strict-data-files", action="store_true", + help="Fail if a test is skipped for missing data file.") def pytest_runtest_setup(item): @@ -105,6 +110,16 @@ def compression(request): return request.param +@pytest.fixture(params=['gzip', 'bz2', 'zip', + pytest.param('xz', marks=td.skip_if_no_lzma)]) +def compression_only(request): + """ + Fixture for trying common compression types in compression tests excluding + uncompressed case + """ + return request.param + + @pytest.fixture(scope='module') def datetime_tz_utc(): from datetime import timezone @@ -119,6 +134,51 @@ def join_type(request): return request.param +@pytest.fixture +def datapath(request): + """Get the path to a data file. + + Parameters + ---------- + path : str + Path to the file, relative to ``pandas/tests/`` + + Returns + ------- + path : path including ``pandas/tests``. + + Raises + ------ + ValueError + If the path doesn't exist and the --strict-data-files option is set. + """ + def deco(*args): + path = os.path.join('pandas', 'tests', *args) + if not os.path.exists(path): + if request.config.getoption("--strict-data-files"): + msg = "Could not find file {} and --strict-data-files is set." + raise ValueError(msg.format(path)) + else: + msg = "Could not find {}." + pytest.skip(msg.format(path)) + return path + return deco + + +@pytest.fixture +def iris(datapath): + """The iris dataset as a DataFrame.""" + return pandas.read_csv(datapath('data', 'iris.csv')) + + +@pytest.fixture(params=['nlargest', 'nsmallest']) +def nselect_method(request): + """ + Fixture for trying all nselect methods + """ + return request.param + + @pytest.fixture(params=[None, np.nan, pd.NaT, float('nan'), np.float('NaN')]) def nulls_fixture(request): """ @@ -149,3 +209,77 @@ def tz_aware_fixture(request): Fixture for trying explicit timezones: {0} """ return request.param + + +@pytest.fixture(params=[str, 'str', 'U']) +def string_dtype(request): + """Parametrized fixture for string dtypes. + + * str + * 'str' + * 'U' + """ + return request.param + + +@pytest.fixture(params=["float32", "float64"]) +def float_dtype(request): + """ + Parameterized fixture for float dtypes. 
+ + * float32 + * float64 + """ + + return request.param + + +UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] +SIGNED_INT_DTYPES = ["int8", "int16", "int32", "int64"] +ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES + + +@pytest.fixture(params=SIGNED_INT_DTYPES) +def sint_dtype(request): + """ + Parameterized fixture for signed integer dtypes. + + * int8 + * int16 + * int32 + * int64 + """ + + return request.param + + +@pytest.fixture(params=UNSIGNED_INT_DTYPES) +def uint_dtype(request): + """ + Parameterized fixture for unsigned integer dtypes. + + * uint8 + * uint16 + * uint32 + * uint64 + """ + + return request.param + + +@pytest.fixture(params=ALL_INT_DTYPES) +def any_int_dtype(request): + """ + Parameterized fixture for any integer dtypes. + + * int8 + * uint8 + * int16 + * uint16 + * int32 + * uint32 + * int64 + * uint64 + """ + + return request.param diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index c638b9e4ea117..7a853d575aa69 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -12,7 +12,8 @@ class DirNamesMixin(object): _accessors = frozenset([]) - _deprecations = frozenset(['asobject']) + _deprecations = frozenset( + ['asobject', 'base', 'data', 'flags', 'itemsize', 'strides']) def _dir_deletions(self): """ delete unwanted __dir__ for this object """ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 88bc497f9f22d..9d8d208d2d5c1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -95,7 +95,7 @@ def _ensure_data(values, dtype=None): values = _ensure_float64(values) return values, 'float64', 'float64' - except (TypeError, ValueError): + except (TypeError, ValueError, OverflowError): # if we are trying to coerce to a dtype # and it is incompat this will fall thru to here return _ensure_object(values), 'object', 'object' @@ -429,7 +429,7 @@ def isin(comps, values): values = values.astype('int64', copy=False) comps = comps.astype('int64', copy=False) f = lambda x, y: htable.ismember_int64(x, y) - except (TypeError, ValueError): + except (TypeError, ValueError, OverflowError): values = values.astype(object) comps = comps.astype(object) @@ -1131,9 +1131,12 @@ def compute(self, method): return dropped[slc].sort_values(ascending=ascending).head(n) # fast method - arr, _, _ = _ensure_data(dropped.values) + arr, pandas_dtype, _ = _ensure_data(dropped.values) if method == 'nlargest': arr = -arr + if is_integer_dtype(pandas_dtype): + # GH 21426: ensure reverse ordering at boundaries + arr -= 1 if self.keep == 'last': arr = arr[::-1] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index abcb9ae3494b5..b587a4c0bc722 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -3,7 +3,6 @@ import numpy as np from warnings import warn import textwrap -import types from pandas import compat from pandas.compat import u, lzip @@ -12,6 +11,7 @@ from pandas.core.dtypes.generic import ( ABCSeries, ABCIndexClass, ABCCategoricalIndex) from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.cast import ( maybe_infer_to_datetimelike, coerce_indexer_dtype) @@ -27,7 +27,7 @@ is_categorical, is_categorical_dtype, is_list_like, is_sequence, - is_scalar, + is_scalar, is_iterator, is_dict_like) from pandas.core.algorithms import factorize, take_1d, unique1d, take @@ -1751,7 +1751,7 @@ def fillna(self, value=None, method=None, limit=None): values[indexer] = 
values_codes[values_codes != -1]

         # If value is not a dict or Series it should be a scalar
-        elif is_scalar(value):
+        elif is_hashable(value):
             if not isna(value) and value not in self.categories:
                 raise ValueError("fill value must be in categories")

@@ -2472,7 +2472,7 @@ def _convert_to_list_like(list_like):
     if isinstance(list_like, list):
         return list_like
     if (is_sequence(list_like) or isinstance(list_like, tuple) or
-            isinstance(list_like, types.GeneratorType)):
+            is_iterator(list_like)):
         return list(list_like)
     elif is_scalar(list_like):
         return [list_like]
diff --git a/pandas/core/base.py b/pandas/core/base.py
index fa78c89ed4ee7..c331ead8d2fef 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -114,7 +114,7 @@ def _reset_cache(self, key=None):

     def __sizeof__(self):
         """
-        Generates the total memory usage for a object that returns
+        Generates the total memory usage for an object that returns
        either a value or Series of values
        """
        if hasattr(self, 'memory_usage'):
@@ -590,9 +590,10 @@ def _aggregate_multiple_funcs(self, arg, _level, _axis):

         # multiples
         else:
-            for col in obj:
+            for index, col in enumerate(obj):
                 try:
-                    colg = self._gotitem(col, ndim=1, subset=obj[col])
+                    colg = self._gotitem(col, ndim=1,
+                                         subset=obj.iloc[:, index])
                     results.append(colg.aggregate(arg))
                     keys.append(col)
                 except (TypeError, DataError):
@@ -675,7 +676,6 @@ def _gotitem(self, key, ndim, subset=None):
         subset : object, default None
             subset to act on
         """
-        # create a new object to prevent aliasing
         if subset is None:
             subset = self.obj
diff --git a/pandas/core/common.py b/pandas/core/common.py
index b9182bfd2cbe2..1de8269c9a0c6 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -55,8 +55,11 @@ def flatten(l):
 def _consensus_name_attr(objs):
     name = objs[0].name
     for obj in objs[1:]:
-        if obj.name != name:
-            return None
+        try:
+            if obj.name != name:
+                name = None
+        except ValueError:
+            name = None
     return name
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index e4ed6d544d42e..ebc7a13234a98 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1227,3 +1227,45 @@ def construct_1d_object_array_from_listlike(values):
     result = np.empty(len(values), dtype='object')
     result[:] = values
     return result
+
+
+def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):
+    """
+    Construct a new ndarray, coercing `values` to `dtype`, preserving NA.
+
+    Parameters
+    ----------
+    values : Sequence
+    dtype : numpy.dtype, optional
+    copy : bool, default False
+        Note that copies may still be made with ``copy=False`` if casting
+        is required.
+
+    Returns
+    -------
+    arr : ndarray[dtype]
+
+    Examples
+    --------
+    >>> np.array([1.0, 2.0, None], dtype='str')
+    array(['1.0', '2.0', 'None'], dtype='<U4')
+
+    >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype='str')
+    array(['1.0', '2.0', None], dtype=object)
+
+    """
+    subarr = np.array(values, dtype=dtype, copy=copy)
+
+    if dtype is not None and dtype.kind in ("U", "S"):
+        # GH-21083
+        # We can't just return np.array(subarr, dtype='str') since
+        # NumPy will convert the non-string objects into strings,
+        # including NA values. So we have to go
+        # string -> object -> update NA, which requires an
+        # additional pass over the data.
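+        # A minimal sketch of that two-pass path, with assumed example
+        # values (illustration only, not part of the change itself):
+        #
+        #     vals = [1.0, 2.0, None]
+        #     strs = np.array(vals, dtype='str')   # None coerced to 'None'
+        #     objs = strs.astype(object)           # second pass: object dtype
+        #     objs[isna(vals)] = None              # NA restored in place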
+ na_values = isna(values) + subarr2 = subarr.astype(object) + subarr2[na_values] = np.asarray(values, dtype=object)[na_values] + subarr = subarr2 + + return subarr diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dccc840f5affd..2a40dd28a6fd7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1774,8 +1774,11 @@ def to_stata(self, fname, convert_dates=None, write_index=True, Parameters ---------- - fname : str or buffer - String path of file-like object. + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() functions. If using a buffer + then the buffer will not be automatically closed after the file + data has been written. convert_dates : dict Dictionary mapping columns containing datetime types to stata internal format to use when writing the dates. Options are 'tc', @@ -3718,7 +3721,7 @@ def rename(self, *args, **kwargs): copy : boolean, default True Also copy underlying data inplace : boolean, default False - Whether to return a new %(klass)s. If True then value of copy is + Whether to return a new DataFrame. If True then value of copy is ignored. level : int or level name, default None In case of a MultiIndex, only rename labels in the specified @@ -4454,7 +4457,10 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, axis = self._get_axis_number(axis) labels = self._get_axis(axis) - if level: + # make sure that the axis is lexsorted to start + # if not we need to reconstruct to get the correct indexer + labels = labels._sort_levels_monotonic() + if level is not None: new_axis, indexer = labels.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) @@ -4462,9 +4468,6 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(labels, MultiIndex): from pandas.core.sorting import lexsort_indexer - # make sure that the axis is lexsorted to start - # if not we need to reconstruct to get the correct indexer - labels = labels._sort_levels_monotonic() indexer = lexsort_indexer(labels._get_labels_for_sorting(), orders=ascending, na_position=na_position) @@ -5731,7 +5734,12 @@ def diff(self, periods=1, axis=0): # ---------------------------------------------------------------------- # Function application - def _gotitem(self, key, ndim, subset=None): + def _gotitem(self, + key, # type: Union[str, List[str]] + ndim, # type: int + subset=None # type: Union[Series, DataFrame, None] + ): + # type: (...) -> Union[Series, DataFrame] """ sub-classes to define return a sliced object @@ -5746,9 +5754,11 @@ def _gotitem(self, key, ndim, subset=None): """ if subset is None: subset = self + elif subset.ndim == 1: # is Series + return subset # TODO: _shallow_copy(subset)? 
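+        # Hedged illustration (frame and keys assumed): in
+        # df.agg({'a': 'sum', 'b': 'min'}) each column reaches this
+        # method with `subset` already narrowed to that column, so
+        # indexing must go through `subset` rather than the full `self`.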
- return self[key] + return subset[key] _agg_doc = dedent(""" The aggregation operations are always performed over an axis, either the @@ -6834,13 +6844,18 @@ def _count_level(self, level, axis=0, numeric_only=False): def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): - axis = self._get_axis_number(axis) + if axis is None and filter_type == 'bool': + labels = None + constructor = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + labels = self._get_agg_axis(axis) + constructor = self._constructor def f(x): return op(x, axis=axis, skipna=skipna, **kwds) - labels = self._get_agg_axis(axis) - # exclude timedelta/datetime unless we are uniform types if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type: numeric_only = True @@ -6849,6 +6864,13 @@ def f(x): try: values = self.values result = f(values) + + if (filter_type == 'bool' and is_object_dtype(values) and + axis is None): + # work around https://github.com/numpy/numpy/issues/10489 + # TODO: combine with hasattr(result, 'dtype') further down + # hard since we don't have `values` down there. + result = np.bool_(result) except Exception as e: # try by-column first @@ -6915,7 +6937,9 @@ def f(x): if axis == 0: result = coerce_to_dtypes(result, self.dtypes) - return Series(result, index=labels) + if constructor is not None: + result = Series(result, index=labels) + return result def nunique(self, axis=0, dropna=True): """ @@ -7079,6 +7103,9 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, 0 <= q <= 1, the quantile(s) to compute axis : {0, 1, 'index', 'columns'} (default 0) 0 or 'index' for row-wise, 1 or 'columns' for column-wise + numeric_only : boolean, default True + If False, the quantile of datetime and timedelta data will be + computed as well interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} .. versionadded:: 0.18.0 @@ -7106,7 +7133,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, -------- >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), - columns=['a', 'b']) + columns=['a', 'b']) >>> df.quantile(.1) a 1.3 b 3.7 @@ -7116,6 +7143,20 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, 0.1 1.3 3.7 0.5 2.5 55.0 + Specifying `numeric_only=False` will also compute the quantile of + datetime and timedelta data. 
+ + >>> df = pd.DataFrame({'A': [1, 2], + 'B': [pd.Timestamp('2010'), + pd.Timestamp('2011')], + 'C': [pd.Timedelta('1 days'), + pd.Timedelta('2 days')]}) + >>> df.quantile(0.5, numeric_only=False) + A 1.5 + B 2010-07-02 12:00:00 + C 1 days 12:00:00 + Name: 0.5, dtype: object + See Also -------- pandas.core.window.Rolling.quantile diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9e4eda1bc4dc7..facc709877285 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -27,6 +27,7 @@ is_dict_like, is_re_compilable, is_period_arraylike, + is_object_dtype, pandas_dtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.inference import is_hashable @@ -1117,7 +1118,8 @@ def __neg__(self): values = com._values_from_object(self) if is_bool_dtype(values): arr = operator.inv(values) - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)): + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) + or is_object_dtype(values)): arr = operator.neg(values) else: raise TypeError("Unary negative expects numeric dtype, not {}" @@ -1128,7 +1130,8 @@ def __pos__(self): values = com._values_from_object(self) if (is_bool_dtype(values) or is_period_arraylike(values)): arr = values - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values)): + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) + or is_object_dtype(values)): arr = operator.pos(values) else: raise TypeError("Unary plus expects numeric dtype, not {}" @@ -3129,7 +3132,7 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): """ axis = self._get_axis_number(axis) axis_name = self._get_axis_name(axis) - axis, axis_ = self._get_axis(axis), axis + axis = self._get_axis(axis) if axis.is_unique: if level is not None: @@ -3138,24 +3141,25 @@ def _drop_axis(self, labels, axis, level=None, errors='raise'): new_axis = axis.drop(labels, level=level, errors=errors) else: new_axis = axis.drop(labels, errors=errors) - dropped = self.reindex(**{axis_name: new_axis}) - try: - dropped.axes[axis_].set_names(axis.names, inplace=True) - except AttributeError: - pass - result = dropped + result = self.reindex(**{axis_name: new_axis}) + # Case for non-unique axis else: labels = _ensure_object(com._index_labels_to_array(labels)) if level is not None: if not isinstance(axis, MultiIndex): raise AssertionError('axis must be a MultiIndex') indexer = ~axis.get_level_values(level).isin(labels) + + # GH 18561 MultiIndex.drop should raise if label is absent + if errors == 'raise' and indexer.all(): + raise KeyError('{} not found in axis'.format(labels)) else: indexer = ~axis.isin(labels) - - if errors == 'raise' and indexer.all(): - raise KeyError('{} not found in axis'.format(labels)) + # Check if label doesn't exist along axis + labels_missing = (axis.get_indexer_for(labels) == -1).any() + if errors == 'raise' and labels_missing: + raise KeyError('{} not found in axis'.format(labels)) slicer = [slice(None)] * self.ndim slicer[self._get_axis_number(axis_name)] = indexer @@ -6429,9 +6433,11 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, # GH 17276 # numpy doesn't like NaN as a clip value # so ignore - if np.any(pd.isnull(lower)): + # GH 19992 + # numpy doesn't drop a list-like bound containing NaN + if not is_list_like(lower) and np.any(pd.isnull(lower)): lower = None - if np.any(pd.isnull(upper)): + if not is_list_like(upper) and np.any(pd.isnull(upper)): upper = None # GH 2747 (arguments were reversed) @@ -8728,6 +8734,8 @@ def 
pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): + if axis is None: + raise ValueError("Must specify 'axis' when aggregating by level.") grouped = self.groupby(level=level, axis=axis, sort=False) if hasattr(grouped, name) and skipna: return getattr(grouped, name)(**kwargs) @@ -8969,18 +8977,17 @@ def _find_valid_index(self, how): is_valid = is_valid.any(1) # reduce axis 1 if how == 'first': - # First valid value case - i = is_valid.idxmax() - if not is_valid[i]: - return None - return i - - elif how == 'last': - # Last valid value case - i = is_valid.values[::-1].argmax() - if not is_valid.iat[len(self) - i - 1]: - return None - return self.index[len(self) - i - 1] + idxpos = is_valid.values[::].argmax() + + if how == 'last': + idxpos = len(self) - 1 - is_valid.values[::-1].argmax() + + chk_notna = is_valid.iat[idxpos] + idx = self.index[idxpos] + + if not chk_notna: + return None + return idx @Appender(_shared_docs['valid_index'] % {'position': 'first', 'klass': 'NDFrame'}) @@ -9055,8 +9062,15 @@ def _doc_parms(cls): Parameters ---------- -axis : int, default 0 - Select the axis which can be 0 for indices and 1 for columns. +axis : {0 or 'index', 1 or 'columns', None}, default 0 + Indicate which axis or axes should be reduced. + + * 0 / 'index' : reduce the index, return a Series whose index is the + original column labels. + * 1 / 'columns' : reduce the columns, return a Series whose index is the + original index. + * None : reduce all axes, return a scalar. + skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -9078,9 +9092,9 @@ def _doc_parms(cls): %(examples)s""" _all_doc = """\ -Return whether all elements are True over series or dataframe axis. +Return whether all elements are True, potentially over an axis. -Returns True if all elements within a series or along a dataframe +Returns True if all elements within a series or along a Dataframe axis are non-zero, not-empty or not-False.""" _all_examples = """\ @@ -9093,7 +9107,7 @@ def _doc_parms(cls): >>> pd.Series([True, False]).all() False -Dataframes +DataFrames Create a dataframe from a dictionary. @@ -9110,12 +9124,17 @@ def _doc_parms(cls): col2 False dtype: bool -Adding axis=1 argument will check if row-wise values all return True. +Specify ``axis='columns'`` to check if row-wise values all return True. ->>> df.all(axis=1) +>>> df.all(axis='columns') 0 True 1 False dtype: bool + +Or ``axis=None`` for whether every value is True. + +>>> df.all(axis=None) +False """ _all_see_also = """\ @@ -9481,6 +9500,11 @@ def _doc_parms(cls): 1 False dtype: bool +Aggregating over the entire DataFrame with ``axis=None``. + +>>> df.any(axis=None) +True + `any` for an empty DataFrame is an empty Series. 
>>> pd.DataFrame([]).any() @@ -9651,22 +9675,17 @@ def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f, @Substitution(outname=name, desc=desc, name1=name1, name2=name2, axis_descr=axis_descr, examples=examples, see_also=see_also) @Appender(_bool_doc) - def logical_func(self, axis=None, bool_only=None, skipna=None, level=None, + def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): nv.validate_logical_func(tuple(), kwargs, fname=name) - if skipna is None: - skipna = True - if axis is None: - axis = self._stat_axis_number if level is not None: if bool_only is not None: raise NotImplementedError("Option bool_only is not " "implemented with option level.") return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) - return self._reduce(f, axis=axis, skipna=skipna, - numeric_only=bool_only, filter_type='bool', - name=name) + return self._reduce(f, name, axis=axis, skipna=skipna, + numeric_only=bool_only, filter_type='bool') return set_function_name(logical_func, name, cls) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index df7a5dc9dc173..9d227ef37595f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3557,13 +3557,11 @@ def _aggregate_multiple_funcs(self, arg, _level): obj._selection = name results[name] = obj.aggregate(func) - if isinstance(list(compat.itervalues(results))[0], - DataFrame): - + if any(isinstance(x, DataFrame) for x in compat.itervalues(results)): # let higher level handle if _level: return results - return list(compat.itervalues(results))[0] + return DataFrame(results, columns=columns) def _wrap_output(self, output, index, names=None): diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index f9501cd2f9ddf..6f4fdfe5bf5cd 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -24,9 +24,9 @@ Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. -To accept the future behavior, pass 'sort=True'. +To accept the future behavior, pass 'sort=False'. -To retain the current behavior and silence the warning, pass sort=False +To retain the current behavior and silence the warning, pass 'sort=True'. """) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index df39eb5fd8312..59527afe6c1f7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -31,6 +31,7 @@ is_dtype_equal, is_dtype_union_equal, is_object_dtype, + is_categorical, is_categorical_dtype, is_interval_dtype, is_period_dtype, @@ -96,7 +97,8 @@ def cmp_method(self, other): if needs_i8_conversion(self) and needs_i8_conversion(other): return self._evaluate_compare(other, op) - if is_object_dtype(self) and self.nlevels == 1: + from .multi import MultiIndex + if is_object_dtype(self) and not isinstance(self, MultiIndex): # don't pass MultiIndex with np.errstate(all='ignore'): result = ops._comp_method_OBJECT_ARRAY(op, self.values, other) @@ -187,6 +189,9 @@ class Index(IndexOpsMixin, PandasObject): ---------- data : array-like (1-dimensional) dtype : NumPy dtype (default: object) + If dtype is None, we find the dtype that best fits the data. + If an actual dtype is provided, we coerce to that dtype if it's safe. + Otherwise, an error will be raised. 
copy : bool Make a copy of input ndarray name : object @@ -312,7 +317,14 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, if is_integer_dtype(dtype): inferred = lib.infer_dtype(data) if inferred == 'integer': - data = np.array(data, copy=copy, dtype=dtype) + try: + data = np.array(data, copy=copy, dtype=dtype) + except OverflowError: + # gh-15823: a more user-friendly error message + raise OverflowError( + "the elements provided in the data cannot " + "all be casted to the dtype {dtype}" + .format(dtype=dtype)) elif inferred in ['floating', 'mixed-integer-float']: if isna(data).any(): raise ValueError('cannot convert float ' @@ -424,12 +436,14 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif data is None or is_scalar(data): cls._scalar_data_error(data) else: - if tupleize_cols and is_list_like(data) and data: + if tupleize_cols and is_list_like(data): + # GH21470: convert iterable to list before determining if empty if is_iterator(data): data = list(data) - # we must be all tuples, otherwise don't construct - # 10697 - if all(isinstance(e, tuple) for e in data): + + if data and all(isinstance(e, tuple) for e in data): + # we must be all tuples, otherwise don't construct + # 10697 from .multi import MultiIndex return MultiIndex.from_tuples( data, names=name or kwargs.get('names')) @@ -1384,7 +1398,8 @@ def set_names(self, names, level=None, inplace=False): names=[u'baz', u'bar']) """ - if level is not None and self.nlevels == 1: + from .multi import MultiIndex + if level is not None and not isinstance(self, MultiIndex): raise ValueError('Level must be None for non-MultiIndex') if level is not None and not is_list_like(level) and is_list_like( @@ -3346,6 +3361,8 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) def get_indexer_non_unique(self, target): target = _ensure_index(target) + if is_categorical(target): + target = target.astype(target.dtype.categories.dtype) pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) @@ -4375,7 +4392,7 @@ def drop(self, labels, errors='raise'): Raises ------ KeyError - If none of the labels are found in the selected axis + If not all of the labels are found in the selected axis """ arr_dtype = 'object' if self.dtype == 'object' else None labels = com._index_labels_to_array(labels, dtype=arr_dtype) @@ -4384,7 +4401,7 @@ def drop(self, labels, errors='raise'): if mask.any(): if errors != 'ignore': raise KeyError( - 'labels %s not contained in axis' % labels[mask]) + '{} not found in axis'.format(labels[mask])) indexer = indexer[~mask] return self.delete(indexer) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 3ffef5804acf7..587090fa72def 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -378,15 +378,15 @@ def _engine(self): # introspection @cache_readonly def is_unique(self): - return not self.duplicated().any() + return self._engine.is_unique @property def is_monotonic_increasing(self): - return Index(self.codes).is_monotonic_increasing + return self._engine.is_monotonic_increasing @property def is_monotonic_decreasing(self): - return Index(self.codes).is_monotonic_decreasing + return self._engine.is_monotonic_decreasing @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) def unique(self, level=None): @@ -598,7 +598,12 @@ def get_indexer_non_unique(self, 
target): target = ibase._ensure_index(target) if isinstance(target, CategoricalIndex): - target = target.categories + # Indexing on codes is more efficient if categories are the same: + if target.categories is self.categories: + target = target.codes + indexer, missing = self._engine.get_indexer_non_unique(target) + return _ensure_platform_int(indexer), missing + target = target.values codes = self.categories.get_indexer(target) indexer, missing = self._engine.get_indexer_non_unique(codes) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 83950f1d71633..0ddf33cdcae73 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -2032,7 +2032,16 @@ def time(self): """ Returns numpy array of datetime.time. The time part of the Timestamps. """ - return libts.ints_to_pydatetime(self.asi8, self.tz, box="time") + + # If the Timestamps have a timezone that is not UTC, + # convert them into their i8 representation while + # keeping their timezone and not using UTC + if (self.tz is not None and self.tz is not utc): + timestamps = self._local_timestamps() + else: + timestamps = self.asi8 + + return libts.ints_to_pydatetime(timestamps, box="time") @property def date(self): @@ -2040,7 +2049,16 @@ def date(self): Returns numpy array of python datetime.date objects (namely, the date part of Timestamps without timezone information). """ - return libts.ints_to_pydatetime(self.normalize().asi8, box="date") + + # If the Timestamps have a timezone that is not UTC, + # convert them into their i8 representation while + # keeping their timezone and not using UTC + if (self.tz is not None and self.tz is not utc): + timestamps = self._local_timestamps() + else: + timestamps = self.asi8 + + return libts.ints_to_pydatetime(timestamps, box="date") def normalize(self): """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 408a8cc435b63..23a655b9a51ee 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -112,6 +112,10 @@ def maybe_convert_platform_interval(values): ------- array """ + if is_categorical_dtype(values): + # GH 21243/21253 + values = np.array(values) + if isinstance(values, (list, tuple)) and len(values) == 0: # GH 19016 # empty lists/tuples get object dtype by default, but this is not @@ -156,7 +160,7 @@ class IntervalIndex(IntervalMixin, Index): dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Attributes ---------- @@ -434,7 +438,7 @@ def from_breaks(cls, breaks, closed='right', name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -564,7 +568,7 @@ def from_intervals(cls, data, closed=None, name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -615,7 +619,7 @@ def from_tuples(cls, data, closed='right', name=None, copy=False, dtype : dtype or None, default None If None, dtype will be inferred - ..versionadded:: 0.23.0 + .. versionadded:: 0.23.0 Examples -------- @@ -667,7 +671,7 @@ def to_tuples(self, na_tuple=True): Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA value itself if False, ``nan``. - ..versionadded:: 0.23.0 + .. 
versionadded:: 0.23.0 Examples -------- @@ -1572,6 +1576,10 @@ def interval_range(start=None, end=None, periods=None, freq=None, periods += 1 if is_number(endpoint): + # force consistency between start/end/freq (lower end if freq skips it) + if com._all_not_none(start, end, freq): + end -= (end - start) % freq + # compute the period/start/end if unspecified (at most one) if periods is None: periods = int((end - start) // freq) + 1 @@ -1580,10 +1588,6 @@ def interval_range(start=None, end=None, periods=None, freq=None, elif end is None: end = start + (periods - 1) * freq - # force end to be consistent with freq (lower if freq skips end) - if freq is not None: - end -= end % freq - breaks = np.linspace(start, end, periods) if all(is_integer(x) for x in com._not_none(start, end, freq)): # np.linspace always produces float output diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fbcf06a28c1e5..9a4aa15f4cc25 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -11,6 +11,8 @@ from pandas.compat.numpy import function as nv from pandas import compat +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, PandasExtensionDtype) from pandas.core.dtypes.common import ( _ensure_int64, _ensure_platform_int, @@ -672,30 +674,18 @@ def _set_names(self, names, level=None, validate=True): if level is None: level = range(self.nlevels) - used = {} else: level = [self._get_level_number(l) for l in level] - used = {self.levels[l].name: l - for l in set(range(self.nlevels)) - set(level)} # set the name for l, name in zip(level, names): if name is not None: - # GH 20527 # All items in 'names' need to be hashable: if not is_hashable(name): raise TypeError('{}.name must be a hashable type' .format(self.__class__.__name__)) - - if name in used: - raise ValueError( - 'Duplicated level name: "{}", assigned to ' - 'level {}, is already used for level ' - '{}.'.format(name, l, used[name])) - self.levels[l].rename(name, inplace=True) - used[name] = l names = property(fset=_set_names, fget=_get_names, doc="Names of levels in MultiIndex") @@ -820,20 +810,16 @@ def values(self): return self._tuples values = [] - for lev, lab in zip(self.levels, self.labels): - # Need to box timestamps, etc. - box = hasattr(lev, '_box_values') - # Try to minimize boxing. - if box and len(lev) > len(lab): - taken = lev._box_values(algos.take_1d(lev._ndarray_values, - lab)) - elif box: - taken = algos.take_1d(lev._box_values(lev._ndarray_values), - lab, - fill_value=lev._na_value) - else: - taken = algos.take_1d(np.asarray(lev._values), lab) - values.append(taken) + + for i in range(self.nlevels): + vals = self._get_level_values(i) + if is_categorical_dtype(vals): + vals = vals.get_values() + if (isinstance(vals.dtype, (PandasExtensionDtype, ExtensionDtype)) + or hasattr(vals, '_box_values')): + vals = vals.astype(object) + vals = np.array(vals, copy=False) + values.append(vals) self._tuples = lib.fast_zip(values) return self._tuples @@ -852,14 +838,6 @@ def _has_complex_internals(self): # to disable groupby tricks return True - @cache_readonly - def is_monotonic(self): - """ - return if the index is monotonic increasing (only equal or - increasing) values. 
- """ - return self.is_monotonic_increasing - @cache_readonly def is_monotonic_increasing(self): """ @@ -887,10 +865,6 @@ def is_monotonic_decreasing(self): # monotonic decreasing if and only if reverse is monotonic increasing return self[::-1].is_monotonic_increasing - @cache_readonly - def is_unique(self): - return not self.duplicated().any() - @cache_readonly def _have_mixed_levels(self): """ return a boolean list indicated if we have mixed levels """ @@ -1719,7 +1693,6 @@ def drop(self, labels, level=None, errors='raise'): if errors != 'ignore': raise ValueError('labels %s not contained in axis' % labels[mask]) - indexer = indexer[~mask] except Exception: pass @@ -2948,6 +2921,13 @@ def isin(self, values, level=None): else: return np.lib.arraysetops.in1d(labs, sought_labels) + def _reference_duplicate_name(self, name): + """ + Returns True if the name refered to in self.names is duplicated. + """ + # count the times name equals an element in self.names. + return sum(name == n for n in self.names) > 1 + MultiIndex._add_numeric_methods_disabled() MultiIndex._add_numeric_methods_add_sub_disabled() diff --git a/pandas/core/ops.py b/pandas/core/ops.py index e14f82906cd06..540ebeee438f6 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -5,7 +5,10 @@ """ # necessary to enforce truediv in Python 2.X from __future__ import division +import datetime import operator +import textwrap +import warnings import numpy as np import pandas as pd @@ -1197,8 +1200,35 @@ def wrapper(self, other, axis=None): if is_datetime64_dtype(self) or is_datetime64tz_dtype(self): # Dispatch to DatetimeIndex to ensure identical # Series/Index behavior + if (isinstance(other, datetime.date) and + not isinstance(other, datetime.datetime)): + # https://github.com/pandas-dev/pandas/issues/21152 + # Compatibility for difference between Series comparison w/ + # datetime and date + msg = ( + "Comparing Series of datetimes with 'datetime.date'. " + "Currently, the 'datetime.date' is coerced to a " + "datetime. In the future pandas will not coerce, " + "and {future}. " + "To retain the current behavior, " + "convert the 'datetime.date' to a datetime with " + "'pd.Timestamp'." 
+ ) + + if op in {operator.lt, operator.le, operator.gt, operator.ge}: + future = "a TypeError will be raised" + else: + future = ( + "'the values will not compare equal to the " + "'datetime.date'" + ) + msg = '\n'.join(textwrap.wrap(msg.format(future=future))) + warnings.warn(msg, FutureWarning, stacklevel=2) + other = pd.Timestamp(other) + res_values = dispatch_to_index_op(op, self, other, pd.DatetimeIndex) + return self._constructor(res_values, index=self.index, name=res_name) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 16e64192fdb20..bad0dd79aaedd 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1143,13 +1143,26 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, raise NotImplementedError('Panel.{0} does not implement ' 'numeric_only.'.format(name)) - axis_name = self._get_axis_name(axis) - axis_number = self._get_axis_number(axis_name) + if axis is None and filter_type == 'bool': + # labels = None + # constructor = None + axis_number = None + axis_name = None + else: + # TODO: Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + # labels = self._get_agg_axis(axis) + # constructor = self._constructor + axis_name = self._get_axis_name(axis) + axis_number = self._get_axis_number(axis_name) + f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds) with np.errstate(all='ignore'): result = f(self.values) + if axis is None and filter_type == 'bool': + return np.bool_(result) axes = self._get_plane_axes(axis_name) if result.ndim == 2 and axis_name != self._info_axis_name: result = result.T diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 0707cc756682e..e6b9f88c52cd7 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1383,8 +1383,7 @@ def _get_time_delta_bins(self, ax): data=[], freq=self.freq, name=ax.name) return binner, [], labels - start = ax[0] - end = ax[-1] + start, end = ax.min(), ax.max() labels = binner = TimedeltaIndex(start=start, end=end, freq=self.freq, diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4d8897fb7c811..d69d79ca9b098 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -28,6 +28,7 @@ is_int_or_datetime_dtype, is_dtype_equal, is_bool, + is_bool_dtype, is_list_like, is_datetimelike, _ensure_int64, @@ -974,9 +975,14 @@ def _maybe_coerce_merge_keys(self): # Check if we are trying to merge on obviously # incompatible dtypes GH 9780, GH 15800 - elif is_numeric_dtype(lk) and not is_numeric_dtype(rk): + + # boolean values are considered as numeric, but are still allowed + # to be merged on object boolean values + elif ((is_numeric_dtype(lk) and not is_bool_dtype(lk)) + and not is_numeric_dtype(rk)): raise ValueError(msg) - elif not is_numeric_dtype(lk) and is_numeric_dtype(rk): + elif (not is_numeric_dtype(lk) + and (is_numeric_dtype(rk) and not is_bool_dtype(rk))): raise ValueError(msg) elif is_datetimelike(lk) and not is_datetimelike(rk): raise ValueError(msg) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index e02420323704e..9a2ad5d13d77a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,8 +1,10 @@ # pylint: disable=E1103 -from pandas.core.dtypes.common import is_list_like, is_scalar +from pandas.core.dtypes.common import ( + is_list_like, is_scalar, is_integer_dtype) from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.reshape.concat import 
concat from pandas.core.series import Series @@ -79,8 +81,22 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - grouped = data.groupby(keys, observed=dropna) + # group by the cartesian product of the grouper + # if we have a categorical + grouped = data.groupby(keys, observed=False) agged = grouped.agg(aggfunc) + if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): + agged = agged.dropna(how='all') + + # gh-21133 + # we want to down cast if + # the original values are ints + # as we grouped with a NaN value + # and then dropped, coercing to floats + for v in [v for v in values if v in data and v in agged]: + if (is_integer_dtype(data[v]) and + not is_integer_dtype(agged[v])): + agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged if table.index.nlevels > 1: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 0829aa8f5a509..3d9e84954a63b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -115,6 +115,12 @@ def __init__(self, values, index, level=-1, value_columns=None, self.index = index.remove_unused_levels() + if isinstance(self.index, MultiIndex): + if index._reference_duplicate_name(level): + msg = ("Ambiguous reference to {level}. The index " + "names are not unique.".format(level=level)) + raise ValueError(msg) + self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 @@ -528,6 +534,12 @@ def factorize(index): N, K = frame.shape + if isinstance(frame.columns, MultiIndex): + if frame.columns._reference_duplicate_name(level): + msg = ("Ambiguous reference to {level}. The column " + "names are not unique.".format(level=level)) + raise ValueError(msg) + # Will also convert negative level numbers and check if out of bounds. level_num = frame.columns._get_level_number(level) @@ -725,7 +737,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, ---------- data : array-like, Series, or DataFrame prefix : string, list of strings, or dict of strings, default None - String to append DataFrame column names + String to append DataFrame column names. Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. 
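For reference, a minimal sketch of the two non-scalar `prefix` forms just
described (the frame and prefix names here are assumed values):

    import pandas as pd

    df = pd.DataFrame({'A': ['a', 'b'], 'B': ['x', 'y']})
    pd.get_dummies(df, prefix=['colA', 'colB'])             # list, one per column
    pd.get_dummies(df, prefix={'A': 'colA', 'B': 'colB'})   # dict, column -> prefix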
diff --git a/pandas/core/series.py b/pandas/core/series.py index 0e2ae22f35af7..6b005c673c7cd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -40,6 +40,7 @@ maybe_convert_platform, maybe_cast_to_datetime, maybe_castable, construct_1d_arraylike_from_scalar, + construct_1d_ndarray_preserving_na, construct_1d_object_array_from_listlike) from pandas.core.dtypes.missing import ( isna, @@ -1195,12 +1196,13 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') if drop: new_index = com._default_index(len(self)) - if level is not None and isinstance(self.index, MultiIndex): + if level is not None: if not isinstance(level, (tuple, list)): level = [level] level = [self.index._get_level_number(lev) for lev in level] - if len(level) < len(self.index.levels): - new_index = self.index.droplevel(level) + if isinstance(self.index, MultiIndex): + if len(level) < self.index.nlevels: + new_index = self.index.droplevel(level) if inplace: self.index = new_index @@ -2616,7 +2618,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, axis = self._get_axis_number(axis) index = self.index - if level: + if level is not None: new_index, indexer = index.sortlevel(level, ascending=ascending, sort_remaining=sort_remaining) elif isinstance(index, MultiIndex): @@ -3210,7 +3212,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, delegate = self._values if isinstance(delegate, np.ndarray): # Validate that 'axis' is consistent with Series's single axis. - self._get_axis_number(axis) + if axis is not None: + self._get_axis_number(axis) if numeric_only: raise NotImplementedError('Series.{0} does not implement ' 'numeric_only.'.format(name)) @@ -3268,7 +3271,7 @@ def rename(self, index=None, **kwargs): copy : boolean, default True Also copy underlying data inplace : boolean, default False - Whether to return a new %(klass)s. If True then value of copy is + Whether to return a new Series. If True then value of copy is ignored. level : int or level name, default None In case of a MultiIndex, only rename labels in the specified @@ -4046,7 +4049,8 @@ def _try_cast(arr, take_fast_path): isinstance(subarr, np.ndarray))): subarr = construct_1d_object_array_from_listlike(subarr) elif not is_extension_type(subarr): - subarr = np.array(subarr, dtype=dtype, copy=copy) + subarr = construct_1d_ndarray_preserving_na(subarr, dtype, + copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index e550976d1deeb..212f44e55c489 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -52,7 +52,21 @@ def _int64_cut_off(shape): return i return len(shape) - def loop(labels, shape): + def maybe_lift(lab, size): + # promote nan values (assigned -1 label in lab array) + # so that all output values are non-negative + return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) + + labels = map(_ensure_int64, labels) + if not xnull: + labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) + + labels = list(labels) + shape = list(shape) + + # Iteratively process all the labels in chunks sized so less + # than _INT64_MAX unique int ids will be required for each chunk + while True: # how many levels can be done without overflow: nlev = _int64_cut_off(shape) @@ -74,7 +88,7 @@ def loop(labels, shape): out[mask] = -1 if nlev == len(shape): # all levels done! 
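+                # Hedged note: the surrounding rewrite replaces the old
+                # tail call (`return loop(labels, shape)`) with this
+                # `while True` loop, avoiding Python-level recursion for
+                # deeply nested inputs; the chunked compression of the
+                # labels below is otherwise unchanged.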
- return out + break # compress what has been done so far in order to avoid overflow # to retain lexical ranks, obs_ids should be sorted @@ -83,16 +97,7 @@ def loop(labels, shape): labels = [comp_ids] + labels[nlev:] shape = [len(obs_ids)] + shape[nlev:] - return loop(labels, shape) - - def maybe_lift(lab, size): # pormote nan values - return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) - - labels = map(_ensure_int64, labels) - if not xnull: - labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) - - return loop(list(labels), list(shape)) + return out def get_compressed_ids(labels, sizes): diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 5532d7522cd2d..ff58f7d104ff9 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -290,6 +290,7 @@ def __reduce__(self): """Necessary for making this object picklable""" object_state = list(np.ndarray.__reduce__(self)) subclass_state = self.fill_value, self.sp_index + object_state[2] = self.sp_values.__reduce__()[2] object_state[2] = (object_state[2], subclass_state) return tuple(object_state) @@ -339,6 +340,10 @@ def values(self): output.put(int_index.indices, self) return output + @property + def shape(self): + return (len(self),) + @property def sp_values(self): # caching not an option, leaks memory diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 81d775157cf62..44811781837bc 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -241,7 +241,7 @@ def str_count(arr, pat, flags=0): Escape ``'$'`` to find the literal dollar sign. >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\$') + >>> s.str.count('\\$') 0 1 1 0 2 1 @@ -358,7 +358,7 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): Returning any digit using regular expression. - >>> s1.str.contains('\d', regex=True) + >>> s1.str.contains('\\d', regex=True) 0 False 1 False 2 False @@ -2172,9 +2172,9 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): Returns ------- - concat : str if `other is None`, Series/Index of objects if `others is - not None`. In the latter case, the result will remain categorical - if the calling Series/Index is categorical. + concat : str or Series/Index of objects + If `others` is None, `str` is returned, otherwise a `Series/Index` + (same type as caller) of objects is returned. See Also -------- diff --git a/pandas/core/window.py b/pandas/core/window.py index 015e7f7913ed0..9d0f9dc4f75f9 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -602,8 +602,8 @@ def validate(self): if isinstance(window, (list, tuple, np.ndarray)): pass elif is_integer(window): - if window < 0: - raise ValueError("window must be non-negative") + if window <= 0: + raise ValueError("window must be > 0 ") try: import scipy.signal as sig except ImportError: diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index dcc221ce978b3..b3f40b3a2429c 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -1,6 +1,7 @@ """ io on the clipboard """ from pandas import compat, get_option, option_context, DataFrame -from pandas.compat import StringIO, PY2 +from pandas.compat import StringIO, PY2, PY3 +import warnings def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover @@ -32,7 +33,7 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover # try to decode (if needed on PY3) # Strange. 
linux py33 doesn't complain, win py33 does - if compat.PY3: + if PY3: try: text = compat.bytes_to_str( text, encoding=(kwargs.get('encoding') or @@ -55,11 +56,27 @@ def read_clipboard(sep=r'\s+', **kwargs): # pragma: no cover counts = {x.lstrip().count('\t') for x in lines} if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: - sep = r'\t' + sep = '\t' + # Edge case where sep is specified to be None, return to default if sep is None and kwargs.get('delim_whitespace') is None: sep = r'\s+' + # Regex separator currently only works with python engine. + # Default to python if separator is multi-character (regex) + if len(sep) > 1 and kwargs.get('engine') is None: + kwargs['engine'] = 'python' + elif len(sep) > 1 and kwargs.get('engine') == 'c': + warnings.warn('read_clipboard with regex separator does not work' + ' properly with c engine') + + # In PY2, the c table reader first encodes text with UTF-8 but Python + # table reader uses the format of the passed string. For consistency, + # encode strings for python engine so that output from python and c + # engines produce consistent results + if kwargs.get('engine') == 'python' and PY2: + text = text.encode('utf-8') + return read_table(StringIO(text), sep=sep, **kwargs) @@ -99,7 +116,7 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover if excel: try: if sep is None: - sep = r'\t' + sep = '\t' buf = StringIO() # clipboard_set (pyperclip) expects unicode obj.to_csv(buf, sep=sep, encoding='utf-8', **kwargs) @@ -108,8 +125,11 @@ def to_clipboard(obj, excel=True, sep=None, **kwargs): # pragma: no cover text = text.decode('utf-8') clipboard_set(text) return - except: - pass + except TypeError: + warnings.warn('to_clipboard in excel mode requires a single ' + 'character separator.') + elif sep is not None: + warnings.warn('to_clipboard with excel=False ignores the sep argument') if isinstance(obj, DataFrame): # str(df) has various unhelpful defaults, like truncation diff --git a/pandas/io/common.py b/pandas/io/common.py index 0827216975f15..ac9077f2db50e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,7 +5,7 @@ import codecs import mmap from contextlib import contextmanager, closing -from zipfile import ZipFile +import zipfile from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat @@ -428,7 +428,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, return f, handles -class BytesZipFile(ZipFile, BytesIO): +class BytesZipFile(zipfile.ZipFile, BytesIO): """ Wrapper for standard library class ZipFile and allow the returned file-like handle to accept byte strings via `write` method. @@ -437,14 +437,18 @@ class BytesZipFile(ZipFile, BytesIO): bytes strings into a member of the archive. 
""" # GH 17778 - def __init__(self, file, mode='r', **kwargs): + def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs): if mode in ['wb', 'rb']: mode = mode.replace('b', '') - super(BytesZipFile, self).__init__(file, mode, **kwargs) + super(BytesZipFile, self).__init__(file, mode, compression, **kwargs) def write(self, data): super(BytesZipFile, self).writestr(self.filename, data) + @property + def closed(self): + return self.fp is None + class MMapWrapper(BaseIterator): """ diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 29b8d29af0808..60518f596e9af 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,7 +5,10 @@ from __future__ import print_function +import warnings + import csv as csvlib +from zipfile import ZipFile import numpy as np from pandas.core.dtypes.missing import notna @@ -127,14 +130,31 @@ def save(self): else: encoding = self.encoding - if hasattr(self.path_or_buf, 'write'): + # GH 21227 internal compression is not used when file-like passed. + if self.compression and hasattr(self.path_or_buf, 'write'): + msg = ("compression has no effect when passing file-like " + "object as input.") + warnings.warn(msg, RuntimeWarning, stacklevel=2) + + # when zip compression is called. + is_zip = isinstance(self.path_or_buf, ZipFile) or ( + not hasattr(self.path_or_buf, 'write') + and self.compression == 'zip') + + if is_zip: + # zipfile doesn't support writing string to archive. uses string + # buffer to receive csv writing and dump into zip compression + # file handle. GH 21241, 21118 + f = StringIO() + close = False + elif hasattr(self.path_or_buf, 'write'): f = self.path_or_buf close = False else: f, handles = _get_handle(self.path_or_buf, self.mode, encoding=encoding, - compression=None) - close = True if self.compression is None else False + compression=self.compression) + close = True try: writer_kwargs = dict(lineterminator=self.line_terminator, @@ -151,18 +171,21 @@ def save(self): self._save() finally: - # GH 17778 handles compression for byte strings. - if not close and self.compression: - f.close() - with open(self.path_or_buf, 'r') as f: - data = f.read() - f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=encoding, - compression=self.compression) - f.write(data) - close = True + if is_zip: + # GH 17778 handles zip compression separately. + buf = f.getvalue() + if hasattr(self.path_or_buf, 'write'): + self.path_or_buf.write(buf) + else: + f, handles = _get_handle(self.path_or_buf, self.mode, + encoding=encoding, + compression=self.compression) + f.write(buf) + close = True if close: f.close() + for _fh in handles: + _fh.close() def _save_header(self): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 12201f62946ac..c46f4b5ad9c18 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -636,10 +636,14 @@ def to_string(self): mid = int(round(n_cols / 2.)) mid_ix = col_lens.index[mid] col_len = col_lens[mid_ix] - adj_dif -= (col_len + 1) # adjoin adds one + # adjoin adds one + adj_dif -= (col_len + 1) col_lens = col_lens.drop(mid_ix) n_cols = len(col_lens) - max_cols_adj = n_cols - self.index # subtract index column + # subtract index column + max_cols_adj = n_cols - self.index + # GH-21180. Ensure that we print at least two. 
+ max_cols_adj = max(max_cols_adj, 2) self.max_cols_adj = max_cols_adj # Call again _chk_truncate to cut frame appropriately @@ -778,7 +782,7 @@ def space_format(x, y): str_columns = list(zip(*[[space_format(x, y) for y in x] for x in fmt_columns])) - if self.sparsify: + if self.sparsify and len(str_columns): str_columns = _sparsify(str_columns) str_columns = [list(x) for x in zip(*str_columns)] diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index 549204abd3caf..2004a24c2ec5a 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -80,8 +80,6 @@ def nested_to_record(ds, prefix="", sep=".", level=0): if level != 0: # so we skip copying for top level, common case v = new_d.pop(k) new_d[newkey] = v - if v is None: # pop the key if the value is None - new_d.pop(k) continue else: v = new_d.pop(k) @@ -172,6 +170,11 @@ def json_normalize(data, record_path=None, meta=None, 3 Summit 1234 John Kasich Ohio OH 4 Cuyahoga 1337 John Kasich Ohio OH + >>> data = {'A': [1, 2]} + >>> json_normalize(data, 'A', record_prefix='Prefix.') + Prefix.0 + 0 1 + 1 2 """ def _pull_field(js, spec): result = js @@ -261,7 +264,8 @@ def _recursive_extract(data, path, seen_meta, level=0): result = DataFrame(records) if record_prefix is not None: - result.rename(columns=lambda x: record_prefix + x, inplace=True) + result = result.rename( + columns=lambda x: "{p}{c}".format(p=record_prefix, c=x)) # Data types, a problem for k, v in compat.iteritems(meta_vals): diff --git a/pandas/io/json/table_schema.py b/pandas/io/json/table_schema.py index 01f7db7d68664..5cea64388bdd7 100644 --- a/pandas/io/json/table_schema.py +++ b/pandas/io/json/table_schema.py @@ -296,7 +296,7 @@ def parse_table_schema(json, precise_float): """ table = loads(json, precise_float=precise_float) col_order = [field['name'] for field in table['schema']['fields']] - df = DataFrame(table['data'])[col_order] + df = DataFrame(table['data'], columns=col_order)[col_order] dtypes = {field['name']: convert_json_field_to_pandas_type(field) for field in table['schema']['fields']} diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2c8f98732c92f..65df2bffb4abf 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3209,12 +3209,22 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): col = columns[k] if is_integer(k) else k dtype[col] = v - if index_col is None or index_col is False: + # Even though we have no data, the "index" of the empty DataFrame + # could for example still be an empty MultiIndex. Thus, we need to + # check whether we have any index columns specified, via either: + # + # 1) index_col (column indices) + # 2) index_names (column names) + # + # Both must be non-null to ensure a successful construction. Otherwise, + # we have to create a generic emtpy Index. + if (index_col is None or index_col is False) or index_names is None: index = Index([]) else: data = [Series([], dtype=dtype[name]) for name in index_names] index = _ensure_index_from_sequences(data, names=index_names) index_col.sort() + for i, n in enumerate(index_col): columns.pop(n - i) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index ccb8d2d99d734..a582d32741ae9 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -572,29 +572,8 @@ def create(self): else: self._execute_create() - def insert_statement(self, data, conn): - """ - Generate tuple of SQLAlchemy insert statement and any arguments - to be executed by connection (via `_execute_insert`). 
- - Parameters - ---------- - conn : SQLAlchemy connectable(engine/connection) - Connection to recieve the data - data : list of dict - The data to be inserted - - Returns - ------- - SQLAlchemy statement - insert statement - *, optional - Additional parameters to be passed when executing insert statement - """ - dialect = getattr(conn, 'dialect', None) - if dialect and getattr(dialect, 'supports_multivalues_insert', False): - return self.table.insert(data), - return self.table.insert(), data + def insert_statement(self): + return self.table.insert() def insert_data(self): if self.index is not None: @@ -633,9 +612,8 @@ def insert_data(self): return column_names, data_list def _execute_insert(self, conn, keys, data_iter): - """Insert data into this table with database connection""" data = [{k: v for k, v in zip(keys, row)} for row in data_iter] - conn.execute(*self.insert_statement(data, conn)) + conn.execute(self.insert_statement(), data) def insert(self, chunksize=None): keys, data_list = self.insert_data() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 8f91c7a497e2d..2797924985c70 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1758,11 +1758,25 @@ def value_labels(self): return self.value_label_dict -def _open_file_binary_write(fname, encoding): +def _open_file_binary_write(fname): + """ + Open a binary file or no-op if file-like + + Parameters + ---------- + fname : string path, path object or buffer + + Returns + ------- + file : file-like object + File object supporting write + own : bool + True if the file was created, otherwise False + """ if hasattr(fname, 'write'): # if 'b' not in fname.mode: - return fname - return open(fname, "wb") + return fname, False + return open(fname, "wb"), True def _set_endianness(endianness): @@ -1899,7 +1913,9 @@ class StataWriter(StataParser): ---------- fname : path (string), buffer or path object string, path object (pathlib.Path or py._path.local.LocalPath) or - object implementing a binary write() functions. + object implementing a binary write() functions. If using a buffer + then the buffer will not be automatically closed after the file + is written. .. versionadded:: 0.23.0 support for pathlib, py.path. @@ -1970,6 +1986,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels + self._own_file = True # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) @@ -2183,9 +2200,7 @@ def _prepare_pandas(self, data): self.fmtlist[key] = self._convert_dates[key] def write_file(self): - self._file = _open_file_binary_write( - self._fname, self._encoding or self._default_encoding - ) + self._file, self._own_file = _open_file_binary_write(self._fname) try: self._write_header(time_stamp=self._time_stamp, data_label=self._data_label) @@ -2205,6 +2220,23 @@ def write_file(self): self._write_file_close_tag() self._write_map() finally: + self._close() + + def _close(self): + """ + Close the file if it was created by the writer. + + If a buffer or file-like object was passed in, for example a GzipFile, + then leave this file open for the caller to close. 
In either case, + attempt to flush the file contents to ensure they are written to disk + (if supported). + """ + # Some file-like objects might not support flush + try: + self._file.flush() + except AttributeError: + pass + if self._own_file: self._file.close() def _write_map(self): @@ -2374,7 +2406,7 @@ def _prepare_data(self): def _write_data(self): data = self.data - data.tofile(self._file) + self._file.write(data.tobytes()) def _null_terminate(self, s, as_string=False): null_byte = '\x00' @@ -2641,7 +2673,9 @@ class StataWriter117(StataWriter): ---------- fname : path (string), buffer or path object string, path object (pathlib.Path or py._path.local.LocalPath) or - object implementing a binary write() functions. + object implementing a binary write() function. If using a buffer + then the buffer will not be automatically closed after the file + is written. data : DataFrame Input to save convert_dates : dict @@ -2879,7 +2913,7 @@ def _write_data(self): self._update_map('data') data = self.data self._file.write(b'') - data.tofile(self._file) + self._file.write(data.tobytes()) self._file.write(b'') def _write_strls(self): diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 87b7d13251f28..d1a2121597dd6 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -811,7 +811,7 @@ class PlanePlot(MPLPlot): def __init__(self, data, x, y, **kwargs): MPLPlot.__init__(self, data, **kwargs) if x is None or y is None: - raise ValueError(self._kind + ' requires and x and y column') + raise ValueError(self._kind + ' requires an x and y column') if is_integer(x) and not self.data.columns.holds_integer(): x = self.data.columns[x] if is_integer(y) and not self.data.columns.holds_integer(): diff --git a/pandas/tests/categorical/test_indexing.py b/pandas/tests/categorical/test_indexing.py index 9c27b1101e5ca..cf7b5cfa55882 100644 --- a/pandas/tests/categorical/test_indexing.py +++ b/pandas/tests/categorical/test_indexing.py @@ -5,7 +5,7 @@ import numpy as np import pandas.util.testing as tm -from pandas import Categorical, Index, PeriodIndex +from pandas import Categorical, Index, CategoricalIndex, PeriodIndex from pandas.tests.categorical.common import TestCategorical @@ -103,3 +103,21 @@ def f(): s.categories = [1, 2] pytest.raises(ValueError, f) + + # Combinations of sorted/unique: + @pytest.mark.parametrize("idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], + [1, 3, 3, 4], [1, 2, 2, 4]]) + # Combinations of missing/unique + @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]]) + @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex]) + def test_get_indexer_non_unique(self, idx_values, key_values, key_class): + # GH 21448 + key = key_class(key_values, categories=range(1, 5)) + # Test for flat index and CategoricalIndex with same/different cats: + for dtype in None, 'category', key.dtype: + idx = Index(idx_values, dtype=dtype) + expected, exp_miss = idx.get_indexer_non_unique(key_values) + result, res_miss = idx.get_indexer_non_unique(key) + + tm.assert_numpy_array_equal(expected, result) + tm.assert_numpy_array_equal(exp_miss, res_miss) diff --git a/pandas/tests/categorical/test_missing.py b/pandas/tests/categorical/test_missing.py index 5133c97d8b590..c78f02245a5b4 100644 --- a/pandas/tests/categorical/test_missing.py +++ b/pandas/tests/categorical/test_missing.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +import collections + import numpy as np import pytest @@ -68,3 +70,16 @@ def test_fillna_raises(self, fillna_kwargs, msg): with
tm.assert_raises_regex(ValueError, msg): cat.fillna(**fillna_kwargs) + + @pytest.mark.parametrize("named", [True, False]) + def test_fillna_iterable_category(self, named): + # https://github.com/pandas-dev/pandas/issues/21097 + if named: + Point = collections.namedtuple("Point", "x y") + else: + Point = lambda *args: args # tuple + cat = Categorical([Point(0, 0), Point(0, 1), None]) + result = cat.fillna(Point(0, 0)) + expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)]) + + tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index 20cd8b43478d2..4a19682e2c558 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -23,6 +23,7 @@ maybe_convert_scalar, find_common_type, construct_1d_object_array_from_listlike, + construct_1d_ndarray_preserving_na, construct_1d_arraylike_from_scalar) from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -440,3 +441,15 @@ def test_cast_1d_arraylike_from_scalar_categorical(self): tm.assert_categorical_equal(result, expected, check_category_order=True, check_dtype=True) + + +@pytest.mark.parametrize('values, dtype, expected', [ + ([1, 2, 3], None, np.array([1, 2, 3])), + (np.array([1, 2, 3]), None, np.array([1, 2, 3])), + (['1', '2', None], None, np.array(['1', '2', None])), + (['1', '2', None], np.dtype('str'), np.array(['1', '2', None])), + ([1, 2, None], np.dtype('str'), np.array(['1', '2', None])), +]) +def test_construct_1d_ndarray_preserving_na(values, dtype, expected): + result = construct_1d_ndarray_preserving_na(values, dtype=dtype) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 32cf29818e069..af26d83df3fe2 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -18,6 +18,11 @@ def test_isna(self, data_missing): expected = pd.Series(expected) self.assert_series_equal(result, expected) + # GH 21189 + result = pd.Series(data_missing).drop([0, 1]).isna() + expected = pd.Series([], dtype=bool) + self.assert_series_equal(result, expected) + def test_dropna_series(self, data_missing): ser = pd.Series(data_missing) result = ser.dropna() diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index e9431bd0c233c..90f0181beab0d 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -90,7 +90,7 @@ def nbytes(self): return 0 def isna(self): - return np.array([x.is_nan() for x in self._data]) + return np.array([x.is_nan() for x in self._data], dtype=bool) @property def _na_value(self): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 88bb66f38b35c..10be7836cb8d7 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -108,7 +108,8 @@ def nbytes(self): return sys.getsizeof(self.data) def isna(self): - return np.array([x == self.dtype.na_value for x in self.data]) + return np.array([x == self.dtype.na_value for x in self.data], + dtype=bool) def take(self, indexer, allow_fill=False, fill_value=None): # re-implement here, since NumPy has trouble setting diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 164d6746edec0..21961906c39bb 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -130,19 +130,27 @@ def test_set_index2(self): result 
= df.set_index(df.C) assert result.index.name == 'C' - @pytest.mark.parametrize('level', ['a', pd.Series(range(3), name='a')]) + @pytest.mark.parametrize( + 'level', ['a', pd.Series(range(0, 8, 2), name='a')]) def test_set_index_duplicate_names(self, level): - # GH18872 + # GH18872 - GH19029 df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['a', 'b']) # Pass an existing level name: df.index.name = 'a' - pytest.raises(ValueError, df.set_index, level, append=True) - pytest.raises(ValueError, df.set_index, [level], append=True) - - # Pass twice the same level name: - df.index.name = 'c' - pytest.raises(ValueError, df.set_index, [level, level]) + expected = pd.MultiIndex.from_tuples([(0, 0), (1, 2), (2, 4), (3, 6)], + names=['a', 'a']) + result = df.set_index(level, append=True) + tm.assert_index_equal(result.index, expected) + result = df.set_index([level], append=True) + tm.assert_index_equal(result.index, expected) + + # Pass the same level name twice (only works when passing actual data) + if isinstance(level, pd.Series): + result = df.set_index([level, level]) + expected = pd.MultiIndex.from_tuples( + [(0, 0), (2, 2), (4, 4), (6, 6)], names=['a', 'a']) + tm.assert_index_equal(result.index, expected) def test_set_index_nonuniq(self): df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'], @@ -617,6 +625,19 @@ def test_reorder_levels(self): index=e_idx) assert_frame_equal(result, expected) + result = df.reorder_levels([0, 0, 0]) + e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']], + labels=[[0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0]], + names=['L0', 'L0', 'L0']) + expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, + index=e_idx) + assert_frame_equal(result, expected) + + result = df.reorder_levels(['L0', 'L0', 'L0']) + assert_frame_equal(result, expected) + def test_reset_index(self): stacked = self.frame.stack()[::2] stacked = DataFrame({'foo': stacked, 'bar': stacked}) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index d1a4a5f615b86..415ae982673ee 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -12,10 +12,10 @@ from numpy.random import randn import numpy as np -from pandas.compat import lrange, product, PY35 +from pandas.compat import lrange, PY35 from pandas import (compat, isna, notna, DataFrame, Series, MultiIndex, date_range, Timestamp, Categorical, - _np_version_under1p12, _np_version_under1p15) + _np_version_under1p12) import pandas as pd import pandas.core.nanops as nanops import pandas.core.algorithms as algorithms @@ -1139,11 +1139,35 @@ def test_any_all(self): self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True) self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True) - df = DataFrame(randn(10, 4)) > 0 - df.any(1) - df.all(1) - df.any(1, bool_only=True) - df.all(1, bool_only=True) + def test_any_all_extra(self): + df = DataFrame({ + 'A': [True, False, False], + 'B': [True, True, False], + 'C': [True, True, True], + }, index=['a', 'b', 'c']) + result = df[['A', 'B']].any(1) + expected = Series([True, True, False], index=['a', 'b', 'c']) + tm.assert_series_equal(result, expected) + + result = df[['A', 'B']].any(1, bool_only=True) + tm.assert_series_equal(result, expected) + + result = df.all(1) + expected = Series([True, False, False], index=['a', 'b', 'c']) + tm.assert_series_equal(result, expected) + + result = df.all(1, bool_only=True) + tm.assert_series_equal(result, expected) + + # Axis is None + result =
df.all(axis=None).item() + assert result is False + + result = df.any(axis=None).item() + assert result is True + + result = df[['C']].all(axis=None).item() + assert result is True # skip pathological failure cases # class CantNonzero(object): @@ -1165,6 +1189,86 @@ def test_any_all(self): # df.any(1, bool_only=True) # df.all(1, bool_only=True) + @pytest.mark.parametrize('func, data, expected', [ + (np.any, {}, False), + (np.all, {}, True), + (np.any, {'A': []}, False), + (np.all, {'A': []}, True), + (np.any, {'A': [False, False]}, False), + (np.all, {'A': [False, False]}, False), + (np.any, {'A': [True, False]}, True), + (np.all, {'A': [True, False]}, False), + (np.any, {'A': [True, True]}, True), + (np.all, {'A': [True, True]}, True), + + (np.any, {'A': [False], 'B': [False]}, False), + (np.all, {'A': [False], 'B': [False]}, False), + + (np.any, {'A': [False, False], 'B': [False, True]}, True), + (np.all, {'A': [False, False], 'B': [False, True]}, False), + + # other types + (np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False), + (np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True), + (np.all, {'A': pd.Series([0, 1], dtype=int)}, False), + (np.any, {'A': pd.Series([0, 1], dtype=int)}, True), + pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True, + marks=[td.skip_if_np_lt_115]), + (np.all, {'A': pd.Series([0, 1], dtype='category')}, False), + (np.any, {'A': pd.Series([0, 1], dtype='category')}, True), + (np.all, {'A': pd.Series([1, 2], dtype='category')}, True), + (np.any, {'A': pd.Series([1, 2], dtype='category')}, True), + + # # Mix + # GH-21484 + # (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'), + # 'B': pd.Series([10, 20], dtype='m8[ns]')}, True), + ]) + def test_any_all_np_func(self, func, data, expected): + # https://github.com/pandas-dev/pandas/issues/19976 + data = DataFrame(data) + result = func(data) + assert isinstance(result, np.bool_) + assert result.item() is expected + + # method version + result = getattr(DataFrame(data), func.__name__)(axis=None) + assert isinstance(result, np.bool_) + assert result.item() is expected + + def test_any_all_object(self): + # https://github.com/pandas-dev/pandas/issues/19976 + result = np.all(DataFrame(columns=['a', 'b'])).item() + assert result is True + + result = np.any(DataFrame(columns=['a', 'b'])).item() + assert result is False + + @pytest.mark.parametrize('method', ['any', 'all']) + def test_any_all_level_axis_none_raises(self, method): + df = DataFrame( + {"A": 1}, + index=MultiIndex.from_product([['A', 'B'], ['a', 'b']], + names=['out', 'in']) + ) + xpr = "Must specify 'axis' when aggregating by level." 
+ with tm.assert_raises_regex(ValueError, xpr): + getattr(df, method)(axis=None, level='out') + def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, has_bool_only=False): if frame is None: @@ -1507,6 +1611,23 @@ def test_duplicated_with_misspelled_column_name(self, subset): with pytest.raises(KeyError): df.drop_duplicates(subset) + @pytest.mark.slow + def test_duplicated_do_not_fail_on_wide_dataframes(self): + # gh-21524 + # Given a wide dataframe with a lot of columns + # with different (important!) values + data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) + for i in range(100)} + df = pd.DataFrame(data).T + result = df.duplicated() + + # Then duplicated() produces a bool pd.Series as a result + # and doesn't fail during calculation. + # Actual values don't matter here, though usually + # it's all False in this case + assert isinstance(result, pd.Series) + assert result.dtype == np.bool + def test_drop_duplicates_with_duplicate_column_names(self): # GH17836 df = DataFrame([ @@ -2054,9 +2175,6 @@ def test_clip_against_list_like(self, inplace, lower, axis, res): result = original tm.assert_frame_equal(result, expected, check_exact=True) - @pytest.mark.xfail( - not _np_version_under1p15, - reason="failing under numpy-dev gh-19976") @pytest.mark.parametrize("axis", [0, 1, None]) def test_clip_against_frame(self, axis): df = DataFrame(np.random.randn(1000, 2)) @@ -2077,13 +2195,23 @@ def test_clip_with_na_args(self): """Should process np.nan argument as None """ # GH # 17276 tm.assert_frame_equal(self.frame.clip(np.nan), self.frame) - tm.assert_frame_equal(self.frame.clip(upper=[1, 2, np.nan]), - self.frame) - tm.assert_frame_equal(self.frame.clip(lower=[1, np.nan, 3]), - self.frame) tm.assert_frame_equal(self.frame.clip(upper=np.nan, lower=np.nan), self.frame) + # GH #19992 + df = DataFrame({'col_0': [1, 2, 3], 'col_1': [4, 5, 6], + 'col_2': [7, 8, 9]}) + + result = df.clip(lower=[4, 5, np.nan], axis=0) + expected = DataFrame({'col_0': [4, 5, np.nan], 'col_1': [4, 5, np.nan], + 'col_2': [7, 8, np.nan]}) + tm.assert_frame_equal(result, expected) + + result = df.clip(lower=[4, 5, np.nan], axis=1) + expected = DataFrame({'col_0': [4, 4, 4], 'col_1': [5, 5, 6], + 'col_2': [np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) + # Matrix-like def test_dot(self): a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'], columns=['p', 'q', 'r', 's']) @@ -2240,54 +2368,49 @@ class TestNLargestNSmallest(object): # ---------------------------------------------------------------------- # Top / bottom - @pytest.mark.parametrize( - 'method, n, order', - product(['nsmallest', 'nlargest'], range(1, 11), - [['a'], - ['c'], - ['a', 'b'], - ['a', 'c'], - ['b', 'a'], - ['b', 'c'], - ['a', 'b', 'c'], - ['c', 'a', 'b'], - ['c', 'b', 'a'], - ['b', 'c', 'a'], - ['b', 'a', 'c'], - - # dups! - ['b', 'c', 'c'], - - ])) - def test_n(self, df_strings, method, n, order): + @pytest.mark.parametrize('order', [ + ['a'], + ['c'], + ['a', 'b'], + ['a', 'c'], + ['b', 'a'], + ['b', 'c'], + ['a', 'b', 'c'], + ['c', 'a', 'b'], + ['c', 'b', 'a'], + ['b', 'c', 'a'], + ['b', 'a', 'c'], + + # dups!
+ ['b', 'c', 'c']]) + @pytest.mark.parametrize('n', range(1, 11)) + def test_n(self, df_strings, nselect_method, n, order): # GH10393 df = df_strings if 'b' in order: error_msg = self.dtype_error_msg_template.format( - column='b', method=method, dtype='object') + column='b', method=nselect_method, dtype='object') with tm.assert_raises_regex(TypeError, error_msg): - getattr(df, method)(n, order) + getattr(df, nselect_method)(n, order) else: - ascending = method == 'nsmallest' - result = getattr(df, method)(n, order) + ascending = nselect_method == 'nsmallest' + result = getattr(df, nselect_method)(n, order) expected = df.sort_values(order, ascending=ascending).head(n) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - 'method, columns', - product(['nsmallest', 'nlargest'], - product(['group'], ['category_string', 'string']) - )) - def test_n_error(self, df_main_dtypes, method, columns): + @pytest.mark.parametrize('columns', [ + ('group', 'category_string'), ('group', 'string')]) + def test_n_error(self, df_main_dtypes, nselect_method, columns): df = df_main_dtypes + col = columns[1] error_msg = self.dtype_error_msg_template.format( - column=columns[1], method=method, dtype=df[columns[1]].dtype) + column=col, method=nselect_method, dtype=df[col].dtype) # escape some characters that may be in the repr error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)") .replace("[", "\\[").replace("]", "\\]")) with tm.assert_raises_regex(TypeError, error_msg): - getattr(df, method)(2, columns) + getattr(df, nselect_method)(2, columns) def test_n_all_dtypes(self, df_main_dtypes): df = df_main_dtypes @@ -2308,15 +2431,14 @@ def test_n_identical_values(self): expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]}) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - 'n, order', - product([1, 2, 3, 4, 5], - [['a', 'b', 'c'], - ['c', 'b', 'a'], - ['a'], - ['b'], - ['a', 'b'], - ['c', 'b']])) + @pytest.mark.parametrize('order', [ + ['a', 'b', 'c'], + ['c', 'b', 'a'], + ['a'], + ['b'], + ['a', 'b'], + ['c', 'b']]) + @pytest.mark.parametrize('n', range(1, 6)) def test_n_duplicate_index(self, df_duplicates, n, order): # GH 13412 diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index ac46f02d00773..dfb2961befe35 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -554,6 +554,14 @@ def test_apply_non_numpy_dtype(self): result = df.apply(lambda x: x) assert_frame_equal(result, df) + def test_apply_dup_names_multi_agg(self): + # GH 21063 + df = pd.DataFrame([[0, 1], [2, 3]], columns=['a', 'a']) + expected = pd.DataFrame([[0, 1]], columns=['a', 'a'], index=['min']) + result = df.agg(['min']) + + tm.assert_frame_equal(result, expected) + class TestInferOutputShape(object): # the user has supplied an opaque UDF where diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 28e82f7585850..004fb4eb0c128 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -10,7 +10,7 @@ import numpy as np from pandas.compat import lrange, lzip, u -from pandas import (compat, DataFrame, Series, Index, MultiIndex, +from pandas import (compat, DataFrame, Series, Index, MultiIndex, Categorical, date_range, isna) import pandas as pd @@ -1129,6 +1129,19 @@ def test_reindex_multi(self): assert_frame_equal(result, expected) + def test_reindex_multi_categorical_time(self): + # 
https://github.com/pandas-dev/pandas/issues/21390 + midx = pd.MultiIndex.from_product( + [Categorical(['a', 'b', 'c']), + Categorical(date_range("2012-01-01", periods=3, freq='H'))]) + df = pd.DataFrame({'a': range(len(midx))}, index=midx) + df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]] + + result = df2.reindex(midx) + expected = pd.DataFrame( + {'a': [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) + assert_frame_equal(result, expected) + data = [[1, 2, 3], [1, 2, 3]] @pytest.mark.parametrize('actual', [ @@ -1151,3 +1164,18 @@ def test_raise_on_drop_duplicate_index(self, actual): expected_no_err = actual.T.drop('c', axis=1, level=level, errors='ignore') assert_frame_equal(expected_no_err.T, actual) + + @pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 2]]) + @pytest.mark.parametrize('drop_labels', [[], [1], [2]]) + def test_drop_empty_list(self, index, drop_labels): + # GH 21494 + expected_index = [i for i in index if i not in drop_labels] + frame = pd.DataFrame(index=index).drop(drop_labels) + tm.assert_frame_equal(frame, pd.DataFrame(index=expected_index)) + + @pytest.mark.parametrize('index', [[1, 2, 3], [1, 2, 2]]) + @pytest.mark.parametrize('drop_labels', [[1, 4], [4, 5]]) + def test_drop_non_empty_list(self, index, drop_labels): + # GH 21494 + with tm.assert_raises_regex(KeyError, 'not found in axis'): + pd.DataFrame(index=index).drop(drop_labels) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6dd38187f7277..70dd358248bc4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -151,6 +151,17 @@ def test_constructor_complex_dtypes(self): assert a.dtype == df.a.dtype assert b.dtype == df.b.dtype + def test_constructor_dtype_str_na_values(self, string_dtype): + # https://github.com/pandas-dev/pandas/issues/21083 + df = DataFrame({'A': ['x', None]}, dtype=string_dtype) + result = df.isna() + expected = DataFrame({"A": [False, True]}) + tm.assert_frame_equal(result, expected) + assert df.iloc[1, 0] is None + + df = DataFrame({'A': ['x', np.nan]}, dtype=string_dtype) + assert np.isnan(df.iloc[1, 0]) + def test_constructor_rec(self): rec = self.frame.to_records(index=False) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 4c9f8c2ea0980..1eeeec0be3b8b 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -794,22 +794,26 @@ def test_arg_for_errors_in_astype(self): @pytest.mark.parametrize('input_vals', [ ([1, 2]), - ([1.0, 2.0, np.nan]), (['1', '2']), (list(pd.date_range('1/1/2011', periods=2, freq='H'))), (list(pd.date_range('1/1/2011', periods=2, freq='H', tz='US/Eastern'))), ([pd.Interval(left=0, right=5)]), ]) - def test_constructor_list_str(self, input_vals): + def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements are converted to strings when # dtype is str, 'str', or 'U' - for dtype in ['str', str, 'U']: - result = DataFrame({'A': input_vals}, dtype=dtype) - expected = DataFrame({'A': input_vals}).astype({'A': dtype}) - assert_frame_equal(result, expected) + result = DataFrame({'A': input_vals}, dtype=string_dtype) + expected = DataFrame({'A': input_vals}).astype({'A': string_dtype}) + assert_frame_equal(result, expected) + + def test_constructor_list_str_na(self, string_dtype): + + result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype) + expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object) + assert_frame_equal(result, expected) 
class TestDataFrameDatetimeWithTZ(TestData): diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 5df50f3d7835b..fdf50805ad818 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -3,6 +3,7 @@ from __future__ import print_function from collections import deque from datetime import datetime +from decimal import Decimal import operator import pytest @@ -282,6 +283,17 @@ def test_neg_numeric(self, df, expected): assert_frame_equal(-df, expected) assert_series_equal(-df['a'], expected['a']) + @pytest.mark.parametrize('df, expected', [ + (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)), + ([Decimal('1.0'), Decimal('2.0')], [Decimal('-1.0'), Decimal('-2.0')]), + ]) + def test_neg_object(self, df, expected): + # GH 21380 + df = pd.DataFrame({'a': df}) + expected = pd.DataFrame({'a': expected}) + assert_frame_equal(-df, expected) + assert_series_equal(-df['a'], expected['a']) + @pytest.mark.parametrize('df', [ pd.DataFrame({'a': ['a', 'b']}), pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), @@ -307,6 +319,15 @@ def test_pos_numeric(self, df): @pytest.mark.parametrize('df', [ pd.DataFrame({'a': ['a', 'b']}), + pd.DataFrame({'a': np.array([-1, 2], dtype=object)}), + pd.DataFrame({'a': [Decimal('-1.0'), Decimal('2.0')]}), + ]) + def test_pos_object(self, df): + # GH 21380 + assert_frame_equal(+df, df) + assert_series_equal(+df['a'], df['a']) + + @pytest.mark.parametrize('df', [ pd.DataFrame({'a': pd.to_datetime(['2017-01-22', '1970-01-01'])}), ]) def test_pos_raises(self, df): diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index d89731dc09044..ebf6c5e37b916 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -560,6 +560,16 @@ def test_unstack_dtypes(self): assert left.shape == (3, 2) tm.assert_frame_equal(left, right) + def test_unstack_non_unique_index_names(self): + idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')], + names=['c1', 'c1']) + df = DataFrame([1, 2], index=idx) + with pytest.raises(ValueError): + df.unstack('c1') + + with pytest.raises(ValueError): + df.T.stack('c1') + def test_unstack_unused_levels(self): # GH 17845: unused labels in index make unstack() cast int to float idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1] @@ -861,6 +871,23 @@ def test_stack_preserve_categorical_dtype(self): tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("level", [0, 'baz']) + def test_unstack_swaplevel_sortlevel(self, level): + # GH 20994 + mi = pd.MultiIndex.from_product([[0], ['d', 'c']], + names=['bar', 'baz']) + df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A']) + df.columns.name = 'foo' + + expected = pd.DataFrame([ + [3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([ + ('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[ + 'baz', 'foo'])) + expected.index.name = 'bar' + + result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level) + tm.assert_frame_equal(result, expected) + def test_unstack_fill_frame_object(): # GH12815 Test unstacking with object. 
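A quick illustration of the behaviour the new test_unstack_non_unique_index_names test above pins down (a minimal sketch, assuming a pandas build that includes this patch; this snippet is not part of the diff itself):

    import pandas as pd

    # Two levels sharing the name 'c1' make a level reference ambiguous,
    # so unstacking (or stacking) by that name is expected to raise.
    idx = pd.MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
                                    names=['c1', 'c1'])
    df = pd.DataFrame([1, 2], index=idx)
    try:
        df.unstack('c1')
    except ValueError as err:
        print('unstack rejected the duplicated level name:', err)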
diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index b60eb89e87da5..599ae683f914b 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -550,18 +550,36 @@ def test_sort_index(self): expected = frame.iloc[:, ::-1] assert_frame_equal(result, expected) - def test_sort_index_multiindex(self): + @pytest.mark.parametrize("level", ['A', 0]) # GH 21052 + def test_sort_index_multiindex(self, level): # GH13496 # sort rows by specified level of multi-index - mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC')) - df = DataFrame([[1, 2], [3, 4]], mi) + mi = MultiIndex.from_tuples([ + [2, 1, 3], [2, 1, 2], [1, 1, 1]], names=list('ABC')) + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mi) + + expected_mi = MultiIndex.from_tuples([ + [1, 1, 1], + [2, 1, 2], + [2, 1, 3]], names=list('ABC')) + expected = pd.DataFrame([ + [5, 6], + [3, 4], + [1, 2]], index=expected_mi) + result = df.sort_index(level=level) + assert_frame_equal(result, expected) - # MI sort, but no level: sort_level has no effect - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) - df = DataFrame([[1, 2], [3, 4]], mi) - result = df.sort_index(sort_remaining=False) - expected = df.sort_index() + # sort_remaining=False + expected_mi = MultiIndex.from_tuples([ + [1, 1, 1], + [2, 1, 3], + [2, 1, 2]], names=list('ABC')) + expected = pd.DataFrame([ + [5, 6], + [1, 2], + [3, 4]], index=expected_mi) + result = df.sort_index(level=level, sort_remaining=False) assert_frame_equal(result, expected) def test_sort_index_intervalindex(self): diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 90fbc6e628369..fb9bd74d9876d 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -506,7 +506,15 @@ def test_asfreq_fillvalue(self): actual_series = ts.asfreq(freq='1S', fill_value=9.0) assert_series_equal(expected_series, actual_series) - def test_first_last_valid(self): + @pytest.mark.parametrize("data,idx,expected_first,expected_last", [ + ({'A': [1, 2, 3]}, [1, 1, 2], 1, 2), + ({'A': [1, 2, 3]}, [1, 2, 2], 1, 2), + ({'A': [1, 2, 3, 4]}, ['d', 'd', 'd', 'd'], 'd', 'd'), + ({'A': [1, np.nan, 3]}, [1, 1, 2], 1, 2), + ({'A': [np.nan, np.nan, 3]}, [1, 1, 2], 2, 2), + ({'A': [1, np.nan, 3]}, [1, 2, 2], 1, 2)]) + def test_first_last_valid(self, data, idx, + expected_first, expected_last): N = len(self.frame.index) mat = randn(N) mat[:5] = nan @@ -539,6 +547,11 @@ def test_first_last_valid(self): assert frame.first_valid_index().freq == frame.index.freq assert frame.last_valid_index().freq == frame.index.freq + # GH 21441 + df = DataFrame(data, index=idx) + assert expected_first == df.first_valid_index() + assert expected_last == df.last_valid_index() + def test_first_subset(self): ts = tm.makeTimeDataFrame(freq='12h') result = ts.first('10d') diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py index fa589a0aa4817..3956968173070 100644 --- a/pandas/tests/frame/test_timezones.py +++ b/pandas/tests/frame/test_timezones.py @@ -133,3 +133,13 @@ def test_frame_reset_index(self, tz): xp = df.index.tz rs = roundtripped.index.tz assert xp == rs + + @pytest.mark.parametrize('tz', [None, 'America/New_York']) + def test_boolean_compare_transpose_tzindex_with_dst(self, tz): + # GH 19970 + idx = date_range('20161101', '20161130', freq='4H', tz=tz) + df = DataFrame({'a': range(len(idx)), 'b': range(len(idx))}, + index=idx) + result = df.T == df.T 
+ expected = DataFrame(True, index=list('ab'), columns=idx) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index e4829ebf48561..3ad25ae73109e 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -9,6 +9,7 @@ import numpy as np from pandas.compat import (lmap, range, lrange, StringIO, u) +from pandas.io.common import _get_handle import pandas.core.common as com from pandas.errors import ParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, @@ -919,29 +920,46 @@ def test_to_csv_path_is_none(self): recons = pd.read_csv(StringIO(csv_str), index_col=0) assert_frame_equal(self.frame, recons) - def test_to_csv_compression(self, compression): - - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + @pytest.mark.parametrize('df,encoding', [ + (DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=['A', 'B'], columns=['X', 'Y', 'Z']), None), + # GH 21241, 21118 + (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'), + (DataFrame(5 * [[123, u"你好", u"世界"]], + columns=['X', 'Y', 'Z']), 'gb2312'), + (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]], + columns=['X', 'Y', 'Z']), 'cp737') + ]) + def test_to_csv_compression(self, df, encoding, compression): with ensure_clean() as filename: - df.to_csv(filename, compression=compression) - + df.to_csv(filename, compression=compression, encoding=encoding) # test the round trip - to_csv -> read_csv - rs = read_csv(filename, compression=compression, - index_col=0) - assert_frame_equal(df, rs) + result = read_csv(filename, compression=compression, + index_col=0, encoding=encoding) + assert_frame_equal(df, result) + + # test the round trip using file handle - to_csv -> read_csv + f, _handles = _get_handle(filename, 'w', compression=compression, + encoding=encoding) + with f: + df.to_csv(f, encoding=encoding) + result = pd.read_csv(filename, compression=compression, + encoding=encoding, index_col=0, squeeze=True) + assert_frame_equal(df, result) # explicitly make sure file is compressed with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode('utf8') + text = fh.read().decode(encoding or 'utf8') for col in df.columns: assert col in text with tm.decompress_file(filename, compression) as fh: - assert_frame_equal(df, read_csv(fh, index_col=0)) + assert_frame_equal(df, read_csv(fh, + index_col=0, + encoding=encoding)) def test_to_csv_date_format(self): with ensure_clean('__tmp_to_csv_date_format__') as path: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e0793b8e1bd64..d021396a7acb3 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from pandas.compat import PY37 from pandas import (Index, MultiIndex, CategoricalIndex, DataFrame, Categorical, Series, qcut) from pandas.util.testing import assert_frame_equal, assert_series_equal @@ -205,6 +206,7 @@ def test_level_get_group(observed): assert_frame_equal(result, expected) +@pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636") @pytest.mark.parametrize('ordered', [True, False]) def test_apply(ordered): # GH 10138 @@ -553,15 +555,11 @@ def test_as_index(): columns=['cat', 'A', 'B']) tm.assert_frame_equal(result, expected) - # another not in-axis grouper - s = Series(['a', 
'b', 'b'], name='cat2') + # another not in-axis grouper (conflicting names in index) + s = Series(['a', 'b', 'b'], name='cat') result = df.groupby(['cat', s], as_index=False, observed=True).sum() tm.assert_frame_equal(result, expected) - # GH18872: conflicting names in desired index - with pytest.raises(ValueError): - df.groupby(['cat', s.rename('cat')], observed=True).sum() - # is original index dropped? group_columns = ['cat', 'A'] expected = DataFrame( @@ -852,3 +850,23 @@ def test_empty_prod(): result = df.groupby("A", observed=False).B.prod(min_count=1) expected = pd.Series([2, 1, np.nan], expected_idx, name='B') tm.assert_series_equal(result, expected) + + +def test_groupby_multiindex_categorical_datetime(): + # https://github.com/pandas-dev/pandas/issues/21390 + + df = pd.DataFrame({ + 'key1': pd.Categorical(list('abcbabcba')), + 'key2': pd.Categorical( + list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3), + 'values': np.arange(9), + }) + result = df.groupby(['key1', 'key2']).mean() + + idx = pd.MultiIndex.from_product( + [pd.Categorical(['a', 'b', 'c']), + pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))], + names=['key1', 'key2']) + expected = pd.DataFrame( + {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) + assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e05f9de5ea7f4..66577d738dd28 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1674,3 +1674,22 @@ def test_tuple_correct_keyerror(): [3, 4]])) with tm.assert_raises_regex(KeyError, "(7, 8)"): df.groupby((7, 8)).mean() + + +def test_groupby_agg_ohlc_non_first(): + # GH 21716 + df = pd.DataFrame([[1], [1]], columns=['foo'], + index=pd.date_range('2018-01-01', periods=2, freq='D')) + + expected = pd.DataFrame([ + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 1] + ], columns=pd.MultiIndex.from_tuples(( + ('foo', 'ohlc', 'open'), ('foo', 'ohlc', 'high'), + ('foo', 'ohlc', 'low'), ('foo', 'ohlc', 'close'), + ('foo', 'sum', 'foo'))), index=pd.date_range( + '2018-01-01', periods=2, freq='D')) + + result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc']) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 6ad8b4905abff..d978e144e5013 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -1,7 +1,7 @@ import pytest import numpy as np import pandas as pd -from pandas import DataFrame, concat +from pandas import DataFrame, Series, concat from pandas.util import testing as tm @@ -59,9 +59,9 @@ def test_rank_apply(): ('first', False, False, [3., 4., 1., 5., 2.]), ('first', False, True, [.6, .8, .2, 1., .4]), ('dense', True, False, [1., 1., 3., 1., 2.]), - ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]), + ('dense', True, True, [1. / 3., 1. / 3., 3. / 3., 1. / 3., 2. / 3.]), ('dense', False, False, [3., 3., 1., 3., 2.]), - ('dense', False, True, [.6, .6, .2, .6, .4]), + ('dense', False, True, [3. / 3., 3. / 3., 1. / 3., 3. / 3., 2. 
/ 3.]), ]) def test_rank_args(grps, vals, ties_method, ascending, pct, exp): key = np.repeat(grps, len(vals)) @@ -126,7 +126,7 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): @pytest.mark.parametrize("grps", [ ['qux'], ['qux', 'quux']]) @pytest.mark.parametrize("vals", [ - [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats + [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-06'), np.nan, np.nan] @@ -167,11 +167,11 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): ('dense', True, 'keep', False, [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), ('dense', True, 'keep', True, - [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]), + [1. / 3., 1. / 3., np.nan, 3. / 3., 1. / 3., 2. / 3., np.nan, np.nan]), ('dense', False, 'keep', False, [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), ('dense', False, 'keep', True, - [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + [3. / 3., 3. / 3., np.nan, 1. / 3., 3. / 3., 2. / 3., np.nan, np.nan]), ('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]), ('average', True, 'no_na', True, [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), @@ -198,10 +198,10 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]), ('dense', True, 'no_na', True, - [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]), + [0.25, 0.25, 1., 0.75, 0.25, 0.5, 1., 1.]), ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]), ('dense', False, 'no_na', True, - [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5]) + [0.75, 0.75, 1., 0.25, 0.75, 0.5, 1., 1.]) ]) def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp): @@ -252,3 +252,20 @@ def test_rank_object_raises(ties_method, ascending, na_option, df.groupby('key').rank(method=ties_method, ascending=ascending, na_option=na_option, pct=pct) + + +def test_rank_empty_group(): + # see gh-22519 + column = "A" + df = DataFrame({ + "A": [0, 1, 0], + "B": [1., np.nan, 2.] 
+ }) + + result = df.groupby(column).B.rank(pct=True) + expected = Series([0.5, np.nan, 1.0], name="B") + tm.assert_series_equal(result, expected) + + result = df.groupby(column).rank(pct=True) + expected = DataFrame({"B": [0.5, np.nan, 1.0]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 626057c1ea760..7fccf1f57a886 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -721,6 +721,23 @@ def interweave(list_obj): assert_frame_equal(result, exp) +@pytest.mark.parametrize("fill_method", ['ffill', 'bfill']) +def test_pad_stable_sorting(fill_method): + # GH 21207 + x = [0] * 20 + y = [np.nan] * 10 + [1] * 10 + + if fill_method == 'bfill': + y = y[::-1] + + df = pd.DataFrame({'x': x, 'y': y}) + expected = df.copy() + + result = getattr(df.groupby('x'), fill_method)() + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("test_series", [True, False]) @pytest.mark.parametrize("periods,fill_method,limit", [ (1, 'ffill', None), (1, 'ffill', 1), diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py index dae69a86910af..b138b79caac76 100644 --- a/pandas/tests/indexes/datetimes/test_construction.py +++ b/pandas/tests/indexes/datetimes/test_construction.py @@ -469,6 +469,15 @@ def test_constructor_with_non_normalized_pytz(self, tz): result = DatetimeIndex(['2010'], tz=non_norm_tz) assert pytz.timezone(tz) is result.tz + def test_constructor_timestamp_near_dst(self): + # GH 20854 + ts = [Timestamp('2016-10-30 03:00:00+0300', tz='Europe/Helsinki'), + Timestamp('2016-10-30 03:00:00+0200', tz='Europe/Helsinki')] + result = DatetimeIndex(ts) + expected = DatetimeIndex([ts[0].to_pydatetime(), + ts[1].to_pydatetime()]) + tm.assert_index_equal(result, expected) + class TestTimeSeries(object): diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 193804b66395b..ec37bbbcb6c02 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -278,6 +278,20 @@ def test_wom_len(self, periods): res = date_range(start='20110101', periods=periods, freq='WOM-1MON') assert len(res) == periods + def test_construct_over_dst(self): + # GH 20854 + pre_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific', + ambiguous=True) + pst_dst = Timestamp('2010-11-07 01:00:00').tz_localize('US/Pacific', + ambiguous=False) + expect_data = [Timestamp('2010-11-07 00:00:00', tz='US/Pacific'), + pre_dst, + pst_dst] + expected = DatetimeIndex(expect_data) + result = date_range(start='2010-11-7', periods=3, + freq='H', tz='US/Pacific') + tm.assert_index_equal(result, expected) + class TestGenRangeGeneration(object): diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 056924f2c6663..743cbc107cce5 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -1,5 +1,6 @@ import locale import calendar +import unicodedata import pytest @@ -7,7 +8,7 @@ import pandas as pd import pandas.util.testing as tm from pandas import (Index, DatetimeIndex, datetime, offsets, - date_range, Timestamp) + date_range, Timestamp, compat) class TestTimeSeries(object): @@ -284,10 +285,24 @@ def test_datetime_name_accessors(self, time_locale): dti = DatetimeIndex(freq='M', start='2012', end='2013') result = 
dti.month_name(locale=time_locale) expected = Index([month.capitalize() for month in expected_months]) + + # work around different normalization schemes + # https://github.com/pandas-dev/pandas/issues/22342 + if not compat.PY2: + result = result.str.normalize("NFD") + expected = expected.str.normalize("NFD") + tm.assert_index_equal(result, expected) + for date, expected in zip(dti, expected_months): result = date.month_name(locale=time_locale) - assert result == expected.capitalize() + expected = expected.capitalize() + + if not compat.PY2: + result = unicodedata.normalize("NFD", result) + expected = unicodedata.normalize("NFD", expected) + + assert result == expected dti = dti.append(DatetimeIndex([pd.NaT])) assert np.isnan(dti.month_name(locale=time_locale)[-1]) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 9180bb0af3af3..801dcb91b124e 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -134,6 +134,21 @@ def test_round(self, tz): ts = '2016-10-17 12:00:00.001501031' DatetimeIndex([ts]).round('1010ns') + def test_no_rounding_occurs(self, tz): + # GH 21262 + rng = date_range(start='2016-01-01', periods=5, + freq='2Min', tz=tz) + + expected_rng = DatetimeIndex([ + Timestamp('2016-01-01 00:00:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:02:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:04:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:06:00', tz=tz, freq='2T'), + Timestamp('2016-01-01 00:08:00', tz=tz, freq='2T'), + ]) + + tm.assert_index_equal(rng.round(freq='2T'), expected_rng) + @pytest.mark.parametrize('test_input, rounder, freq, expected', [ (['2117-01-01 00:00:45'], 'floor', '15s', ['2117-01-01 00:00:45']), (['2117-01-01 00:00:45'], 'ceil', '15s', ['2117-01-01 00:00:45']), @@ -143,6 +158,10 @@ def test_round(self, tz): ['1823-01-01 00:00:01.000000020']), (['1823-01-01 00:00:01'], 'floor', '1s', ['1823-01-01 00:00:01']), (['1823-01-01 00:00:01'], 'ceil', '1s', ['1823-01-01 00:00:01']), + (['2018-01-01 00:15:00'], 'ceil', '15T', ['2018-01-01 00:15:00']), + (['2018-01-01 00:15:00'], 'floor', '15T', ['2018-01-01 00:15:00']), + (['1823-01-01 03:00:00'], 'ceil', '3H', ['1823-01-01 03:00:00']), + (['1823-01-01 03:00:00'], 'floor', '3H', ['1823-01-01 03:00:00']), (('NaT', '1823-01-01 00:00:01'), 'floor', '1s', ('NaT', '1823-01-01 00:00:01')), (('NaT', '1823-01-01 00:00:01'), 'ceil', '1s', diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 09210d8b64d1b..573940edaa08f 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -2,7 +2,7 @@ """ Tests for DatetimeIndex timezone-related methods """ -from datetime import datetime, timedelta, tzinfo +from datetime import datetime, timedelta, tzinfo, date, time from distutils.version import LooseVersion import pytest @@ -706,6 +706,32 @@ def test_join_utc_convert(self, join_type): assert isinstance(result, DatetimeIndex) assert result.tz.zone == 'UTC' + @pytest.mark.parametrize("dtype", [ + None, 'datetime64[ns, CET]', + 'datetime64[ns, EST]', 'datetime64[ns, UTC]' + ]) + def test_date_accessor(self, dtype): + # Regression test for GH#21230 + expected = np.array([date(2018, 6, 4), pd.NaT]) + + index = DatetimeIndex(['2018-06-04 10:00:00', pd.NaT], dtype=dtype) + result = index.date + + tm.assert_numpy_array_equal(result, expected) + +
@pytest.mark.parametrize("dtype", [ + None, 'datetime64[ns, CET]', + 'datetime64[ns, EST]', 'datetime64[ns, UTC]' + ]) + def test_time_accessor(self, dtype): + # Regression test for GH#21267 + expected = np.array([time(10, 20, 30), pd.NaT]) + + index = DatetimeIndex(['2018-06-04 10:20:30', pd.NaT], dtype=dtype) + result = index.time + + tm.assert_numpy_array_equal(result, expected) + def test_dti_drop_dont_lose_tz(self): # GH#2621 ind = date_range("2012-12-01", periods=10, tz="utc") diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 45be3974dad63..8b0514764b0c0 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -650,6 +650,14 @@ def test_unit_mixed(self, cache): with pytest.raises(ValueError): pd.to_datetime(arr, errors='raise', cache=cache) + @pytest.mark.parametrize('cache', [True, False]) + def test_unit_rounding(self, cache): + # GH 14156: argument will incur floating point errors but no + # premature rounding + result = pd.to_datetime(1434743731.8770001, unit='s', cache=cache) + expected = pd.Timestamp('2015-06-19 19:55:31.877000093') + assert result == expected + @pytest.mark.parametrize('cache', [True, False]) def test_dataframe(self, cache): diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index 5fdf92dcb2044..b1711c3444586 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -6,8 +6,9 @@ from pandas import ( Interval, IntervalIndex, Index, Int64Index, Float64Index, Categorical, - date_range, timedelta_range, period_range, notna) + CategoricalIndex, date_range, timedelta_range, period_range, notna) from pandas.compat import lzip +from pandas.core.dtypes.common import is_categorical_dtype from pandas.core.dtypes.dtypes import IntervalDtype import pandas.core.common as com import pandas.util.testing as tm @@ -111,6 +112,22 @@ def test_constructor_string(self, constructor, breaks): with tm.assert_raises_regex(TypeError, msg): constructor(**self.get_kwargs_from_breaks(breaks)) + @pytest.mark.parametrize('cat_constructor', [ + Categorical, CategoricalIndex]) + def test_constructor_categorical_valid(self, constructor, cat_constructor): + # GH 21243/21253 + if isinstance(constructor, partial) and constructor.func is Index: + # Index is defined to create CategoricalIndex from categorical data + pytest.skip() + + breaks = np.arange(10, dtype='int64') + expected = IntervalIndex.from_breaks(breaks) + + cat_breaks = cat_constructor(breaks) + result_kwargs = self.get_kwargs_from_breaks(cat_breaks) + result = constructor(**result_kwargs) + tm.assert_index_equal(result, expected) + def test_generic_errors(self, constructor): # filler input data to be used when supplying invalid kwargs filler = self.get_kwargs_from_breaks(range(10)) @@ -238,6 +255,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'): tuples = lzip(breaks[:-1], breaks[1:]) if isinstance(breaks, (list, tuple)): return {'data': tuples} + elif is_categorical_dtype(breaks): + return {'data': breaks._constructor(tuples)} return {'data': com._asarray_tuplesafe(tuples)} def test_constructor_errors(self): @@ -286,6 +305,8 @@ def get_kwargs_from_breaks(self, breaks, closed='right'): if isinstance(breaks, list): return {'data': ivs} + elif is_categorical_dtype(breaks): + return {'data': breaks._constructor(ivs)} return {'data': np.array(ivs, dtype=object)} def 
test_generic_errors(self, constructor): diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index 0fadfcf0c7f28..29fe2b0185662 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -110,6 +110,8 @@ def test_constructor_timedelta(self, closed, name, freq, periods): @pytest.mark.parametrize('start, end, freq, expected_endpoint', [ (0, 10, 3, 9), + (0, 10, 1.5, 9), + (0.5, 10, 3, 9.5), (Timedelta('0D'), Timedelta('10D'), '2D4H', Timedelta('8D16H')), (Timestamp('2018-01-01'), Timestamp('2018-02-09'), @@ -125,6 +127,22 @@ def test_early_truncation(self, start, end, freq, expected_endpoint): result_endpoint = result.right[-1] assert result_endpoint == expected_endpoint + @pytest.mark.parametrize('start, end, freq', [ + (0.5, None, None), + (None, 4.5, None), + (0.5, None, 1.5), + (None, 6.5, 1.5)]) + def test_no_invalid_float_truncation(self, start, end, freq): + # GH 21161 + if freq is None: + breaks = [0.5, 1.5, 2.5, 3.5, 4.5] + else: + breaks = [0.5, 2.0, 3.5, 5.0, 6.5] + expected = IntervalIndex.from_breaks(breaks) + + result = interval_range(start=start, end=end, periods=4, freq=freq) + tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('start, mid, end', [ (Timestamp('2018-03-10', tz='US/Eastern'), Timestamp('2018-03-10 23:30:00', tz='US/Eastern'), diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f4fa547574b9e..a0d6907055a2e 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -419,21 +419,24 @@ def test_constructor_dtypes_timedelta(self, attr, klass): result = klass(list(values), dtype=dtype) tm.assert_index_equal(result, index) - def test_constructor_empty_gen(self): - skip_index_keys = ["repeats", "periodIndex", "rangeIndex", - "tuples"] - for key, index in self.generate_index_types(skip_index_keys): - empty = index.__class__([]) - assert isinstance(empty, index.__class__) - assert not len(empty) + @pytest.mark.parametrize("value", [[], iter([]), (x for x in [])]) + @pytest.mark.parametrize("klass", + [Index, Float64Index, Int64Index, UInt64Index, + CategoricalIndex, DatetimeIndex, TimedeltaIndex]) + def test_constructor_empty(self, value, klass): + empty = klass(value) + assert isinstance(empty, klass) + assert not len(empty) @pytest.mark.parametrize("empty,klass", [ (PeriodIndex([], freq='B'), PeriodIndex), + (PeriodIndex(iter([]), freq='B'), PeriodIndex), + (PeriodIndex((x for x in []), freq='B'), PeriodIndex), (RangeIndex(step=1), pd.RangeIndex), (MultiIndex(levels=[[1, 2], ['blue', 'red']], labels=[[], []]), MultiIndex) ]) - def test_constructor_empty(self, empty, klass): + def test_constructor_empty_special(self, empty, klass): assert isinstance(empty, klass) assert not len(empty) @@ -455,6 +458,13 @@ def test_constructor_nonhashable_name(self, indices): tm.assert_raises_regex(TypeError, message, indices.set_names, names=renamed) + def test_constructor_overflow_int64(self): + # see gh-15832 + msg = ("the elements provided in the data cannot " + "all be casted to the dtype int64") + with tm.assert_raises_regex(OverflowError, msg): + Index([np.iinfo(np.uint64).max - 1], dtype="int64") + def test_view_with_args(self): restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', @@ -2088,6 +2098,17 @@ def test_get_duplicates_deprecated(self): with tm.assert_produces_warning(FutureWarning): index.get_duplicates() + def 
test_tab_complete_warning(self, ip): + # https://github.com/pandas-dev/pandas/issues/16409 + pytest.importorskip('IPython', minversion="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; idx = pd.Index([1, 2])" + ip.run_code(code) + with tm.assert_produces_warning(None): + with provisionalcompleter('ignore'): + list(ip.Completer.completions('idx.', 4)) + class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 6a1a1a5bdba4f..a2a4170256088 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -543,38 +543,53 @@ def test_reindex_empty_index(self): tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp)) - def test_is_monotonic(self): - c = CategoricalIndex([1, 2, 3]) + @pytest.mark.parametrize('data, non_lexsorted_data', [ + [[1, 2, 3], [9, 0, 1, 2, 3]], + [list('abc'), list('fabcd')], + ]) + def test_is_monotonic(self, data, non_lexsorted_data): + c = CategoricalIndex(data) assert c.is_monotonic_increasing assert not c.is_monotonic_decreasing - c = CategoricalIndex([1, 2, 3], ordered=True) + c = CategoricalIndex(data, ordered=True) assert c.is_monotonic_increasing assert not c.is_monotonic_decreasing - c = CategoricalIndex([1, 2, 3], categories=[3, 2, 1]) + c = CategoricalIndex(data, categories=reversed(data)) assert not c.is_monotonic_increasing assert c.is_monotonic_decreasing - c = CategoricalIndex([1, 3, 2], categories=[3, 2, 1]) + c = CategoricalIndex(data, categories=reversed(data), ordered=True) assert not c.is_monotonic_increasing - assert not c.is_monotonic_decreasing + assert c.is_monotonic_decreasing - c = CategoricalIndex([1, 2, 3], categories=[3, 2, 1], ordered=True) + # test when data is neither monotonic increasing nor decreasing + reordered_data = [data[0], data[2], data[1]] + c = CategoricalIndex(reordered_data, categories=reversed(data)) assert not c.is_monotonic_increasing - assert c.is_monotonic_decreasing + assert not c.is_monotonic_decreasing # non lexsorted categories - categories = [9, 0, 1, 2, 3] + categories = non_lexsorted_data - c = CategoricalIndex([9, 0], categories=categories) + c = CategoricalIndex(categories[:2], categories=categories) assert c.is_monotonic_increasing assert not c.is_monotonic_decreasing - c = CategoricalIndex([0, 1], categories=categories) + c = CategoricalIndex(categories[1:3], categories=categories) assert c.is_monotonic_increasing assert not c.is_monotonic_decreasing + @pytest.mark.parametrize('values, expected', [ + ([1, 2, 3], True), + ([1, 3, 1], False), + (list('abc'), True), + (list('aba'), False)]) + def test_is_unique(self, values, expected): + ci = CategoricalIndex(values) + assert ci.is_unique is expected + def test_duplicates(self): idx = CategoricalIndex([0, 0, 0], name='foo') diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 37f70090c179f..a7e90207c9ad7 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -12,8 +12,8 @@ import pandas as pd -from pandas import (CategoricalIndex, DataFrame, Index, MultiIndex, - compat, date_range, period_range) +from pandas import (CategoricalIndex, Categorical, DataFrame, Index, + MultiIndex, compat, date_range, period_range) from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY from pandas.errors import PerformanceWarning, UnsortedIndexError from 
@@ -164,6 +164,22 @@ def test_set_name_methods(self):
         assert res is None
         assert ind.names == new_names2

+    @pytest.mark.parametrize('inplace', [True, False])
+    def test_set_names_with_nlevel_1(self, inplace):
+        # GH 21149
+        # Ensure that .set_names for MultiIndex with
+        # nlevels == 1 does not raise any errors
+        expected = pd.MultiIndex(levels=[[0, 1]],
+                                 labels=[[0, 1]],
+                                 names=['first'])
+        m = pd.MultiIndex.from_product([[0, 1]])
+        result = m.set_names('first', level=0, inplace=inplace)
+
+        if inplace:
+            result = m
+
+        tm.assert_index_equal(result, expected)
+
     def test_set_levels_labels_directly(self):
         # setting levels/labels directly raises AttributeError

@@ -639,22 +655,27 @@ def test_constructor_nonhashable_names(self):
         # With .set_names()
         tm.assert_raises_regex(TypeError, message,
                                mi.set_names, names=renamed)

-    @pytest.mark.parametrize('names', [['a', 'b', 'a'], ['1', '1', '2'],
-                                       ['1', 'a', '1']])
+    @pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2],
+                                       [1, 'a', 1]])
     def test_duplicate_level_names(self, names):
-        # GH18872
-        pytest.raises(ValueError, pd.MultiIndex.from_product,
-                      [[0, 1]] * 3, names=names)
+        # GH18872, GH19029
+        mi = pd.MultiIndex.from_product([[0, 1]] * 3, names=names)
+        assert mi.names == names

         # With .rename()
         mi = pd.MultiIndex.from_product([[0, 1]] * 3)
-        tm.assert_raises_regex(ValueError, "Duplicated level name:",
-                               mi.rename, names)
+        mi = mi.rename(names)
+        assert mi.names == names

         # With .rename(., level=)
-        mi.rename(names[0], level=1, inplace=True)
-        tm.assert_raises_regex(ValueError, "Duplicated level name:",
-                               mi.rename, names[:2], level=[0, 2])
+        mi.rename(names[1], level=1, inplace=True)
+        mi = mi.rename([names[0], names[2]], level=[0, 2])
+        assert mi.names == names
+
+    def test_duplicate_level_names_access_raises(self):
+        self.index.names = ['foo', 'foo']
+        tm.assert_raises_regex(KeyError, 'Level foo not found',
+                               self.index._get_level_number, 'foo')

     def assert_multiindex_copied(self, copy, original):
         # Levels should be (at least, shallow copied)
@@ -1165,12 +1186,12 @@ def test_iter(self):
                     ('baz', 'two'), ('qux', 'one'), ('qux', 'two')]
         assert result == expected

-    def test_legacy_pickle(self):
+    def test_legacy_pickle(self, datapath):
         if PY3:
             pytest.skip("testing for legacy pickles not "
                         "supported on py3")

-        path = tm.get_data_path('multiindex_v1.pickle')
+        path = datapath('indexes', 'data', 'multiindex_v1.pickle')
         obj = pd.read_pickle(path)

         obj2 = MultiIndex.from_tuples(obj.values)
@@ -1186,10 +1207,10 @@ def test_legacy_pickle(self):
         assert_almost_equal(res, exp)
         assert_almost_equal(exp, exp2)

-    def test_legacy_v2_unpickle(self):
+    def test_legacy_v2_unpickle(self, datapath):

         # 0.7.3 -> 0.8.0 format manage
-        path = tm.get_data_path('mindex_073.pickle')
+        path = datapath('indexes', 'data', 'mindex_073.pickle')
         obj = pd.read_pickle(path)

         obj2 = MultiIndex.from_tuples(obj.values)
@@ -1574,6 +1595,14 @@ def test_get_indexer_nearest(self):
         with pytest.raises(NotImplementedError):
             midx.get_indexer(['a'], method='pad', tolerance=2)

+    def test_get_indexer_categorical_time(self):
+        # https://github.com/pandas-dev/pandas/issues/21390
+        midx = MultiIndex.from_product(
+            [Categorical(['a', 'b', 'c']),
+             Categorical(date_range("2012-01-01", periods=3, freq='H'))])
+        result = midx.get_indexer(midx)
+        tm.assert_numpy_array_equal(result, np.arange(9, dtype=np.intp))
+
     def test_hash_collisions(self):
         # non-smoke test that we don't get hash collisions

@@ -3279,3 +3308,20 @@ def test_duplicate_multiindex_labels(self):
         with pytest.raises(ValueError):
             ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
                            inplace=True)
+
+    def test_multiindex_compare(self):
+        # GH 21149
+        # Ensure comparison operations for MultiIndex with nlevels == 1
+        # behave consistently with those for MultiIndex with nlevels > 1
+
+        midx = pd.MultiIndex.from_product([[0, 1]])
+
+        # Equality self-test: MultiIndex object vs self
+        expected = pd.Series([True, True])
+        result = pd.Series(midx == midx)
+        tm.assert_series_equal(result, expected)
+
+        # Greater than comparison: MultiIndex object vs self
+        expected = pd.Series([False, False])
+        result = pd.Series(midx > midx)
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index 8deb51e190bab..7623587803b41 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -1,32 +1,23 @@
-import os
-
 import pytest

 from pandas.io.parsers import read_table
-from pandas.util import testing as tm
-
-
-@pytest.fixture
-def parser_data(request):
-    return os.path.join(tm.get_data_path(), '..', 'parser', 'data')


 @pytest.fixture
-def tips_file(parser_data):
+def tips_file(datapath):
     """Path to the tips dataset"""
-    return os.path.join(parser_data, 'tips.csv')
+    return datapath('io', 'parser', 'data', 'tips.csv')


 @pytest.fixture
-def jsonl_file(parser_data):
+def jsonl_file(datapath):
     """Path to a JSONL dataset"""
-    return os.path.join(parser_data, 'items.jsonl')
+    return datapath('io', 'parser', 'data', 'items.jsonl')


 @pytest.fixture
-def salaries_table(parser_data):
+def salaries_table(datapath):
     """DataFrame with the salaries dataset"""
-    path = os.path.join(parser_data, 'salaries.csv')
-    return read_table(path)
+    return read_table(datapath('io', 'parser', 'data', 'salaries.csv'))


 @pytest.fixture
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
index f221df93dd412..191e3f37f1c37 100644
--- a/pandas/tests/io/formats/test_format.py
+++ b/pandas/tests/io/formats/test_format.py
@@ -305,6 +305,44 @@ def test_repr_non_interactive(self):
         assert not has_truncated_repr(df)
         assert not has_expanded_repr(df)

+    def test_repr_truncates_terminal_size(self):
+        # https://github.com/pandas-dev/pandas/issues/21180
+        # TODO: use mock fixture.
+        # This is being backported, so doing it directly here.
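+        # unittest.mock ships with the standard library on Python 3, while
+        # Python 2 relies on the third-party "mock" backport, which exposes
+        # the same API; importorskip below keeps the test from erroring when
+        # the backport is not installed.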
+ try: + from unittest import mock + except ImportError: + mock = pytest.importorskip("mock") + + terminal_size = (118, 96) + p1 = mock.patch('pandas.io.formats.console.get_terminal_size', + return_value=terminal_size) + p2 = mock.patch('pandas.io.formats.format.get_terminal_size', + return_value=terminal_size) + + index = range(5) + columns = pd.MultiIndex.from_tuples([ + ('This is a long title with > 37 chars.', 'cat'), + ('This is a loooooonger title with > 43 chars.', 'dog'), + ]) + df = pd.DataFrame(1, index=index, columns=columns) + + with p1, p2: + result = repr(df) + + h1, h2 = result.split('\n')[:2] + assert 'long' in h1 + assert 'loooooonger' in h1 + assert 'cat' in h2 + assert 'dog' in h2 + + # regular columns + df2 = pd.DataFrame({"A" * 41: [1, 2], 'B' * 41: [1, 2]}) + with p1, p2: + result = repr(df2) + + assert df2.columns[0] in result.split('\n')[0] + def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: @@ -916,8 +954,8 @@ def test_unicode_problem_decoding_as_ascii(self): dm = DataFrame({u('c/\u03c3'): Series({'test': np.nan})}) compat.text_type(dm.to_string()) - def test_string_repr_encoding(self): - filepath = tm.get_data_path('unicode_series.csv') + def test_string_repr_encoding(self, datapath): + filepath = datapath('io', 'formats', 'data', 'unicode_series.csv') df = pd.read_csv(filepath, header=None, encoding='latin1') repr(df) repr(df[1]) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index dfa3751bff57a..36c4ae547ad4e 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -285,3 +285,18 @@ def test_to_csv_string_array_utf8(self): df.to_csv(path, encoding='utf-8') with open(path, 'r') as f: assert f.read() == expected_utf8 + + @tm.capture_stdout + def test_to_csv_stdout_file(self): + # GH 21561 + df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']], + columns=['name_1', 'name_2']) + expected_ascii = '''\ +,name_1,name_2 +0,foo,bar +1,baz,qux +''' + df.to_csv(sys.stdout, encoding='ascii') + output = sys.stdout.getvalue() + assert output == expected_ascii + assert not sys.stdout.closed diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index c9074ca49e5be..1b9cbc57865d2 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -2,6 +2,7 @@ import pandas as pd import pandas.util.testing as tm +import pandas.util._test_decorators as td from pandas.util.testing import assert_frame_equal, assert_raises_regex @@ -21,16 +22,17 @@ def test_compression_roundtrip(compression): assert_frame_equal(df, pd.read_json(result)) -def test_read_zipped_json(): - uncompressed_path = tm.get_data_path("tsframe_v012.json") +def test_read_zipped_json(datapath): + uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json") uncompressed_df = pd.read_json(uncompressed_path) - compressed_path = tm.get_data_path("tsframe_v012.json.zip") + compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip") compressed_df = pd.read_json(compressed_path, compression='zip') assert_frame_equal(uncompressed_df, compressed_df) +@td.skip_if_not_us_locale def test_with_s3_url(compression): boto3 = pytest.importorskip('boto3') pytest.importorskip('s3fs') diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 49b39c17238ae..b6483d0e978ba 100644 --- 
a/pandas/tests/io/json/test_json_table_schema.py
+++ b/pandas/tests/io/json/test_json_table_schema.py
@@ -560,3 +560,16 @@ def test_multiindex(self, index_names):
         out = df.to_json(orient="table")
         result = pd.read_json(out, orient="table")
         tm.assert_frame_equal(df, result)
+
+    @pytest.mark.parametrize("strict_check", [
+        pytest.param(True, marks=pytest.mark.xfail), False])
+    def test_empty_frame_roundtrip(self, strict_check):
+        # GH 21287
+        df = pd.DataFrame([], columns=['a', 'b', 'c'])
+        expected = df.copy()
+        out = df.to_json(orient='table')
+        result = pd.read_json(out, orient='table')
+        # TODO: When DF coercion issue (#21345) is resolved, tighten type checks
+        tm.assert_frame_equal(expected, result,
+                              check_dtype=strict_check,
+                              check_index_type=strict_check)
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
index 0fabaf747b6de..200a853c48900 100644
--- a/pandas/tests/io/json/test_normalize.py
+++ b/pandas/tests/io/json/test_normalize.py
@@ -123,6 +123,12 @@ def test_simple_normalize_with_separator(self, deep_nested):
                                'country', 'states_name']).sort_values()
         assert result.columns.sort_values().equals(expected)

+    def test_value_array_record_prefix(self):
+        # GH 21536
+        result = json_normalize({'A': [1, 2]}, 'A', record_prefix='Prefix.')
+        expected = DataFrame([[1], [2]], columns=['Prefix.0'])
+        tm.assert_frame_equal(result, expected)
+
     def test_more_deeply_nested(self, deep_nested):

         result = json_normalize(deep_nested, ['states', 'cities'],
@@ -238,15 +244,16 @@ def test_non_ascii_key(self):
         tm.assert_frame_equal(result, expected)

     def test_missing_field(self, author_missing_data):
-        # GH20030: Checks for robustness of json_normalize - should
-        # unnest records where only the first record has a None value
+        # GH20030:
         result = json_normalize(author_missing_data)
         ex_data = [
-            {'author_name.first': np.nan,
+            {'info': np.nan,
+             'author_name.first': np.nan,
              'author_name.last_name': np.nan,
              'info.created_at': np.nan,
              'info.last_updated': np.nan},
-            {'author_name.first': 'Jane',
+            {'info': None,
+             'author_name.first': 'Jane',
              'author_name.last_name': 'Doe',
              'info.created_at': '11/08/1993',
              'info.last_updated': '26/05/2012'}
@@ -351,9 +358,8 @@ def test_json_normalize_errors(self):
             errors='raise'
         )

-    def test_nonetype_dropping(self):
-        # GH20030: Checks that None values are dropped in nested_to_record
-        # to prevent additional columns of nans when passed to DataFrame
+    def test_donot_drop_nonevalues(self):
+        # GH21356
         data = [
             {'info': None,
              'author_name':
              {'first': 'Smith', 'last_name': 'Appleseed'}},
             {'author_name':
              {'first': 'Jane', 'last_name': 'Doe'},
              'info':
              {'created_at': '11/08/1993', 'last_updated': '26/05/2012'}}
         ]
         result = nested_to_record(data)
         expected = [
-            {'author_name.first': 'Smith',
+            {'info': None,
+             'author_name.first': 'Smith',
              'author_name.last_name': 'Appleseed'},
             {'author_name.first': 'Jane',
              'author_name.last_name': 'Doe',
              'info.created_at': '11/08/1993',
              'info.last_updated': '26/05/2012'}]

         assert result == expected
+
+    def test_nonetype_top_level_bottom_level(self):
+        # GH21158: If the inner-level JSON has a key with a null value,
+        # make sure it doesn't call new_d.pop twice and raise an exception
+        data = {
+            "id": None,
+            "location": {
+                "country": {
+                    "state": {
+                        "id": None,
+                        "town.info": {
+                            "id": None,
+                            "region": None,
+                            "x": 49.151580810546875,
+                            "y": -33.148521423339844,
+                            "z": 27.572303771972656}}}
+            }
+        }
+        result = nested_to_record(data)
+        expected = {
+            'id': None,
+            'location.country.state.id': None,
+            'location.country.state.town.info.id': None,
+            'location.country.state.town.info.region': None,
+            'location.country.state.town.info.x': 49.151580810546875,
+            'location.country.state.town.info.y': -33.148521423339844,
+            'location.country.state.town.info.z': 27.572303771972656}
+        assert result == expected
+
+    def test_nonetype_multiple_levels(self):
+        # GH21158: If the inner-level JSON has a key with a null value,
+        # make sure it doesn't call new_d.pop twice and raise an exception
+        data = {
+            "id": None,
+            "location": {
+                "id": None,
+                "country": {
+                    "id": None,
+                    "state": {
+                        "id": None,
+                        "town.info": {
+                            "region": None,
+                            "x": 49.151580810546875,
+                            "y": -33.148521423339844,
+                            "z": 27.572303771972656}}}
+            }
+        }
+        result = nested_to_record(data)
+        expected = {
+            'id': None,
+            'location.id': None,
+            'location.country.id': None,
+            'location.country.state.id': None,
+            'location.country.state.town.info.region': None,
+            'location.country.state.town.info.x': 49.151580810546875,
+            'location.country.state.town.info.y': -33.148521423339844,
+            'location.country.state.town.info.z': 27.572303771972656}
+        assert result == expected
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 7e497c395266f..b5a2be87de1c4 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -15,6 +15,7 @@
                                  assert_series_equal, network,
                                  ensure_clean, assert_index_equal)
 import pandas.util.testing as tm
+import pandas.util._test_decorators as td

 _seriesd = tm.getSeriesData()
 _tsd = tm.getTimeSeriesData()
@@ -37,8 +38,9 @@

 class TestPandasContainer(object):

-    def setup_method(self, method):
-        self.dirpath = tm.get_data_path()
+    @pytest.fixture(scope="function", autouse=True)
+    def setup(self, datapath):
+        self.dirpath = datapath("io", "json", "data")

         self.ts = tm.makeTimeSeries()
         self.ts.name = 'ts'
@@ -59,7 +61,8 @@ def setup_method(self, method):
         self.mixed_frame = _mixed_frame.copy()
         self.categorical = _cat_frame.copy()

-    def teardown_method(self, method):
+        yield
+
         del self.dirpath

         del self.ts
@@ -1038,6 +1041,7 @@ def test_read_inline_jsonl(self):
         expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
         assert_frame_equal(result, expected)

+    @td.skip_if_not_us_locale
     def test_read_s3_jsonl(self, s3_resource):
         # GH17200

diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
index 2423ddcd9a1a0..fb510f1a74556 100644
--- a/pandas/tests/io/parser/common.py
+++ b/pandas/tests/io/parser/common.py
@@ -54,20 +54,21 @@ def test_bad_stream_exception(self):
         # and C engine will raise UnicodeDecodeError instead of
         # c engine raising ParserError and swallowing exception
         # that caused read to fail.
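+        # Opening the raw handle and the recoder in a single with-statement
+        # below ensures both are closed even when the expected
+        # UnicodeDecodeError is raised inside the block
+        # (codecs.StreamRecoder supports the context-manager protocol).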
- handle = open(self.csv_shiftjs, "rb") codec = codecs.lookup("utf-8") utf8 = codecs.lookup('utf-8') - # stream must be binary UTF8 - stream = codecs.StreamRecoder( - handle, utf8.encode, utf8.decode, codec.streamreader, - codec.streamwriter) + if compat.PY3: msg = "'utf-8' codec can't decode byte" else: msg = "'utf8' codec can't decode byte" - with tm.assert_raises_regex(UnicodeDecodeError, msg): - self.read_csv(stream) - stream.close() + + # stream must be binary UTF8 + with open(self.csv_shiftjs, "rb") as handle, codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, + codec.streamwriter) as stream: + + with tm.assert_raises_regex(UnicodeDecodeError, msg): + self.read_csv(stream) def test_read_csv(self): if not compat.PY3: @@ -76,7 +77,7 @@ def test_read_csv(self): else: prefix = u("file://") - fname = prefix + compat.text_type(self.csv1) + fname = prefix + compat.text_type(os.path.abspath(self.csv1)) self.read_csv(fname, index_col=0, parse_dates=True) def test_1000_sep(self): @@ -237,6 +238,21 @@ def test_csv_mixed_type(self): out = self.read_csv(StringIO(data)) tm.assert_frame_equal(out, expected) + def test_read_csv_low_memory_no_rows_with_index(self): + if self.engine == "c" and not self.low_memory: + pytest.skip("This is a low-memory specific test") + + # see gh-21141 + data = """A,B,C +1,1,1,2 +2,2,3,4 +3,3,4,5 +""" + out = self.read_csv(StringIO(data), low_memory=True, + index_col=0, nrows=0) + expected = DataFrame(columns=["A", "B", "C"]) + tm.assert_frame_equal(out, expected) + def test_read_csv_dataframe(self): df = self.read_csv(self.csv1, index_col=0, parse_dates=True) df2 = self.read_table(self.csv1, sep=',', index_col=0, @@ -635,21 +651,19 @@ def test_read_csv_parse_simple_list(self): tm.assert_frame_equal(df, expected) @tm.network - def test_url(self): + def test_url(self, datapath): # HTTP(S) url = ('https://raw.github.com/pandas-dev/pandas/master/' 'pandas/tests/io/parser/data/salaries.csv') url_table = self.read_table(url) - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salaries.csv') + localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) tm.assert_frame_equal(url_table, local_table) # TODO: ftp testing @pytest.mark.slow - def test_file(self): - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salaries.csv') + def test_file(self, datapath): + localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) try: @@ -739,8 +753,8 @@ def test_utf16_bom_skiprows(self): tm.assert_frame_equal(result, expected) - def test_utf16_example(self): - path = tm.get_data_path('utf16_ex.txt') + def test_utf16_example(self, datapath): + path = datapath('io', 'parser', 'data', 'utf16_ex.txt') # it works! 
and is the right length result = self.read_table(path, encoding='utf-16') @@ -751,8 +765,8 @@ def test_utf16_example(self): result = self.read_table(buf, encoding='utf-16') assert len(result) == 50 - def test_unicode_encoding(self): - pth = tm.get_data_path('unicode_series.csv') + def test_unicode_encoding(self, datapath): + pth = datapath('io', 'parser', 'data', 'unicode_series.csv') result = self.read_csv(pth, header=None, encoding='latin-1') result = result.set_index(0) @@ -1497,10 +1511,9 @@ def test_internal_eof_byte_to_file(self): result = self.read_csv(path) tm.assert_frame_equal(result, expected) - def test_sub_character(self): + def test_sub_character(self, datapath): # see gh-16893 - dirpath = tm.get_data_path() - filename = os.path.join(dirpath, "sub_char.csv") + filename = datapath('io', 'parser', 'data', 'sub_char.csv') expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) result = self.read_csv(filename) diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index 01c6620e50d37..e4950af19ea95 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -110,20 +110,19 @@ def test_read_csv_infer_compression(self): # see gh-9770 expected = self.read_csv(self.csv1, index_col=0, parse_dates=True) - inputs = [self.csv1, self.csv1 + '.gz', - self.csv1 + '.bz2', open(self.csv1)] + with open(self.csv1) as f: + inputs = [self.csv1, self.csv1 + '.gz', + self.csv1 + '.bz2', f] - for f in inputs: - df = self.read_csv(f, index_col=0, parse_dates=True, - compression='infer') - - tm.assert_frame_equal(expected, df) + for inp in inputs: + df = self.read_csv(inp, index_col=0, parse_dates=True, + compression='infer') - inputs[3].close() + tm.assert_frame_equal(expected, df) - def test_read_csv_compressed_utf16_example(self): + def test_read_csv_compressed_utf16_example(self, datapath): # GH18071 - path = tm.get_data_path('utf16_ex_small.zip') + path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip') result = self.read_csv(path, encoding='utf-16', compression='zip', sep='\t') diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index b91ce04673e29..8060ebf2fbcd4 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -125,9 +125,9 @@ def test_categorical_dtype_high_cardinality_numeric(self): np.sort(actual.a.cat.categories), ordered=True) tm.assert_frame_equal(actual, expected) - def test_categorical_dtype_encoding(self): + def test_categorical_dtype_encoding(self, datapath): # GH 10153 - pth = tm.get_data_path('unicode_series.csv') + pth = datapath('io', 'parser', 'data', 'unicode_series.csv') encoding = 'latin-1' expected = self.read_csv(pth, header=None, encoding=encoding) expected[1] = Categorical(expected[1]) @@ -135,7 +135,7 @@ def test_categorical_dtype_encoding(self): dtype={1: 'category'}) tm.assert_frame_equal(actual, expected) - pth = tm.get_data_path('utf16_ex.txt') + pth = datapath('io', 'parser', 'data', 'utf16_ex.txt') encoding = 'utf-16' expected = self.read_table(pth, encoding=encoding) expected = expected.apply(Categorical) diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index d2c3f82e95c4d..cc224efd533b7 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -369,3 +369,14 @@ def test_no_na_filter_on_index(self): expected = DataFrame({"a": [1, 4], "c": [3, 6]}, index=Index([np.nan, 5.0], name="b")) tm.assert_frame_equal(out, expected) + + def 
test_inf_na_values_with_int_index(self):
+        # see gh-17128
+        data = "idx,col1,col2\n1,3,4\n2,inf,-inf"
+
+        # Don't fail with OverflowError with infs and integer index column
+        out = self.read_csv(StringIO(data), index_col=[0],
+                            na_values=['inf', '-inf'])
+        expected = DataFrame({"col1": [3, np.nan], "col2": [4, np.nan]},
+                             index=Index([1, 2], name="idx"))
+        tm.assert_frame_equal(out, expected)
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index fdf45f307e953..72d2c5fd8d18f 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -48,11 +48,19 @@ def check_compressed_urls(salaries_table, compression, extension, mode,
     tm.assert_frame_equal(url_table, salaries_table)

+@pytest.fixture
+def tips_df(datapath):
+    """DataFrame with the tips dataset."""
+    return read_csv(datapath('io', 'parser', 'data', 'tips.csv'))
+
+
 @pytest.mark.usefixtures("s3_resource")
+@td.skip_if_not_us_locale()
 class TestS3(object):

-    def test_parse_public_s3_bucket(self):
+    def test_parse_public_s3_bucket(self, tips_df):
         pytest.importorskip('s3fs')
+
+        # More of an integration test because of the not-public contents
+        # portion; this can probably be mocked.
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
             df = read_csv('s3://pandas-test/tips.csv' +
                           ext, compression=comp)
             assert isinstance(df, DataFrame)
             assert not df.empty
-            tm.assert_frame_equal(read_csv(
-                tm.get_data_path('tips.csv')), df)
+            tm.assert_frame_equal(df, tips_df)

         # Read public file from bucket with not-public contents
         df = read_csv('s3://cant_get_it/tips.csv')
         assert isinstance(df, DataFrame)
         assert not df.empty
-        tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df)
+        tm.assert_frame_equal(df, tips_df)

-    def test_parse_public_s3n_bucket(self):
+    def test_parse_public_s3n_bucket(self, tips_df):

         # Read from AWS s3 as "s3n" URL
         df = read_csv('s3n://pandas-test/tips.csv', nrows=10)
         assert isinstance(df, DataFrame)
         assert not df.empty
-        tm.assert_frame_equal(read_csv(
-            tm.get_data_path('tips.csv')).iloc[:10], df)
+        tm.assert_frame_equal(tips_df.iloc[:10], df)

-    def test_parse_public_s3a_bucket(self):
+    def test_parse_public_s3a_bucket(self, tips_df):
         # Read from AWS s3 as "s3a" URL
         df = read_csv('s3a://pandas-test/tips.csv', nrows=10)
         assert isinstance(df, DataFrame)
         assert not df.empty
-        tm.assert_frame_equal(read_csv(
-            tm.get_data_path('tips.csv')).iloc[:10], df)
+        tm.assert_frame_equal(tips_df.iloc[:10], df)

-    def test_parse_public_s3_bucket_nrows(self):
+    def test_parse_public_s3_bucket_nrows(self, tips_df):
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
             df = read_csv('s3://pandas-test/tips.csv' +
                           ext, nrows=10, compression=comp)
             assert isinstance(df, DataFrame)
             assert not df.empty
-            tm.assert_frame_equal(read_csv(
-                tm.get_data_path('tips.csv')).iloc[:10], df)
+            tm.assert_frame_equal(tips_df.iloc[:10], df)

-    def test_parse_public_s3_bucket_chunked(self):
+    def test_parse_public_s3_bucket_chunked(self, tips_df):
         # Read with a chunksize
         chunksize = 5
-        local_tips = read_csv(tm.get_data_path('tips.csv'))
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
             df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                                  chunksize=chunksize, compression=comp)
@@ -109,14 +112,13 @@ def test_parse_public_s3_bucket_chunked(self):
                 df = df_reader.get_chunk()
                 assert isinstance(df, DataFrame)
                 assert not df.empty
-                true_df = local_tips.iloc[
+                true_df = tips_df.iloc[
                     chunksize * i_chunk: chunksize * (i_chunk + 1)]
tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self): + def test_parse_public_s3_bucket_chunked_python(self, tips_df): # Read with a chunksize using the Python parser chunksize = 5 - local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df_reader = read_csv('s3://pandas-test/tips.csv' + ext, chunksize=chunksize, compression=comp, @@ -127,36 +129,33 @@ def test_parse_public_s3_bucket_chunked_python(self): df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = local_tips.iloc[ + true_df = tips_df.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self): + def test_parse_public_s3_bucket_python(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_infer_s3_compression(self): + def test_infer_s3_compression(self, tips_df): for ext in ['', '.gz', '.bz2']: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression='infer') assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3_bucket_nrows_python(self): + def test_parse_public_s3_bucket_nrows_python(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) def test_s3_fails(self): with pytest.raises(IOError): diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py index 7717102b64fc5..b6f13039641a2 100644 --- a/pandas/tests/io/parser/test_parsers.py +++ b/pandas/tests/io/parser/test_parsers.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os +import pytest import pandas.util.testing as tm from pandas import read_csv, read_table, DataFrame @@ -45,8 +46,9 @@ def read_table(self, *args, **kwargs): def float_precision_choices(self): raise com.AbstractMethodError(self) - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath('io', 'parser', 'data') self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index ab4c14034cd20..c7026e3e0fc88 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -28,31 +28,26 @@ class TestTextReader(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath('io', 'parser', 'data') self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') def test_file_handle(self): - try: - f = open(self.csv1, 
'rb') + with open(self.csv1, 'rb') as f: reader = TextReader(f) - result = reader.read() # noqa - finally: - f.close() + reader.read() def test_string_filename(self): reader = TextReader(self.csv1, header=None) reader.read() def test_file_handle_mmap(self): - try: - f = open(self.csv1, 'rb') + with open(self.csv1, 'rb') as f: reader = TextReader(f, memory_map=True, header=None) reader.read() - finally: - f.close() def test_StringIO(self): with open(self.csv1, 'rb') as f: diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 5da347e47957c..101ee3e619f5b 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -11,8 +11,9 @@ class TestSAS7BDAT(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") self.data = [] self.test_ix = [list(range(1, 16)), [16]] for j in 1, 2: @@ -123,9 +124,8 @@ def test_iterator_read_too_much(self): rdr.close() -def test_encoding_options(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "test1.sas7bdat") +def test_encoding_options(datapath): + fname = datapath("io", "sas", "data", "test1.sas7bdat") df1 = pd.read_sas(fname) df2 = pd.read_sas(fname, encoding='utf-8') for col in df1.columns: @@ -143,51 +143,48 @@ def test_encoding_options(): assert(x == y.decode()) -def test_productsales(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "productsales.sas7bdat") +def test_productsales(datapath): + fname = datapath("io", "sas", "data", "productsales.sas7bdat") df = pd.read_sas(fname, encoding='utf-8') - fname = os.path.join(dirpath, "productsales.csv") + fname = datapath("io", "sas", "data", "productsales.csv") df0 = pd.read_csv(fname, parse_dates=['MONTH']) vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"] df0[vn] = df0[vn].astype(np.float64) tm.assert_frame_equal(df, df0) -def test_12659(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "test_12659.sas7bdat") +def test_12659(datapath): + fname = datapath("io", "sas", "data", "test_12659.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "test_12659.csv") + fname = datapath("io", "sas", "data", "test_12659.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0) -def test_airline(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "airline.sas7bdat") +def test_airline(datapath): + fname = datapath("io", "sas", "data", "airline.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "airline.csv") + fname = datapath("io", "sas", "data", "airline.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0, check_exact=False) -def test_date_time(): +def test_date_time(datapath): # Support of different SAS date/datetime formats (PR #15871) - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "datetime.sas7bdat") + fname = datapath("io", "sas", "data", "datetime.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "datetime.csv") + fname = datapath("io", "sas", "data", "datetime.csv") df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime', 'DateTimeHi', 'Taiw']) + # GH 19732: Timestamps imported from sas will incur floating point errors + df.iloc[:, 3] = df.iloc[:, 3].dt.round('us') tm.assert_frame_equal(df, df0) -def test_zero_variables(): +def test_zero_variables(datapath): # Check if the SAS file has zero variables (PR 
#18184) - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "zero_variables.sas7bdat") + fname = datapath("io", "sas", "data", "zero_variables.sas7bdat") with pytest.raises(EmptyDataError): pd.read_sas(fname) diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index de31c3e36a8d5..6e5b2ab067aa5 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -1,3 +1,4 @@ +import pytest import pandas as pd import pandas.util.testing as tm from pandas.io.sas.sasreader import read_sas @@ -18,8 +19,9 @@ def numeric_as_float(data): class TestXport(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt") self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt") self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt") diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 98c0effabec84..a6b331685e72a 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -9,10 +9,11 @@ from pandas import DataFrame from pandas import read_clipboard from pandas import get_option +from pandas.compat import PY2 from pandas.util import testing as tm from pandas.util.testing import makeCustomDataframe as mkdf from pandas.io.clipboard.exceptions import PyperclipException -from pandas.io.clipboard import clipboard_set +from pandas.io.clipboard import clipboard_set, clipboard_get try: @@ -22,73 +23,120 @@ _DEPS_INSTALLED = 0 +def build_kwargs(sep, excel): + kwargs = {} + if excel != 'default': + kwargs['excel'] = excel + if sep != 'default': + kwargs['sep'] = sep + return kwargs + + +@pytest.fixture(params=['delims', 'utf8', 'string', 'long', 'nonascii', + 'colwidth', 'mixed', 'float', 'int']) +def df(request): + data_type = request.param + + if data_type == 'delims': + return pd.DataFrame({'a': ['"a,\t"b|c', 'd\tef´'], + 'b': ['hi\'j', 'k\'\'lm']}) + elif data_type == 'utf8': + return pd.DataFrame({'a': ['µasd', 'Ωœ∑´'], + 'b': ['øπ∆˚¬', 'œ∑´®']}) + elif data_type == 'string': + return mkdf(5, 3, c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'long': + max_rows = get_option('display.max_rows') + return mkdf(max_rows + 1, 3, + data_gen_f=lambda *args: randint(2), + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'nonascii': + return pd.DataFrame({'en': 'in English'.split(), + 'es': 'en español'.split()}) + elif data_type == 'colwidth': + _cw = get_option('display.max_colwidth') + 1 + return mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'mixed': + return DataFrame({'a': np.arange(1.0, 6.0) + 0.01, + 'b': np.arange(1, 6), + 'c': list('abcde')}) + elif data_type == 'float': + return mkdf(5, 3, data_gen_f=lambda r, c: float(r) + 0.01, + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + elif data_type == 'int': + return mkdf(5, 3, data_gen_f=lambda *args: randint(2), + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) + else: + raise ValueError + + @pytest.mark.single @pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") class TestClipboard(object): - - @classmethod - def setup_class(cls): - cls.data = {} - cls.data['string'] = mkdf(5, 3, 
c_idx_type='s', r_idx_type='i',
-                                   c_idx_names=[None], r_idx_names=[None])
-        cls.data['int'] = mkdf(5, 3, data_gen_f=lambda *args: randint(2),
-                               c_idx_type='s', r_idx_type='i',
-                               c_idx_names=[None], r_idx_names=[None])
-        cls.data['float'] = mkdf(5, 3,
-                                 data_gen_f=lambda r, c: float(r) + 0.01,
-                                 c_idx_type='s', r_idx_type='i',
-                                 c_idx_names=[None], r_idx_names=[None])
-        cls.data['mixed'] = DataFrame({'a': np.arange(1.0, 6.0) + 0.01,
-                                       'b': np.arange(1, 6),
-                                       'c': list('abcde')})
-
-        # Test columns exceeding "max_colwidth" (GH8305)
-        _cw = get_option('display.max_colwidth') + 1
-        cls.data['colwidth'] = mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw,
-                                    c_idx_type='s', r_idx_type='i',
-                                    c_idx_names=[None], r_idx_names=[None])
-        # Test GH-5346
-        max_rows = get_option('display.max_rows')
-        cls.data['longdf'] = mkdf(max_rows + 1, 3,
-                                  data_gen_f=lambda *args: randint(2),
-                                  c_idx_type='s', r_idx_type='i',
-                                  c_idx_names=[None], r_idx_names=[None])
-        # Test for non-ascii text: GH9263
-        cls.data['nonascii'] = pd.DataFrame({'en': 'in English'.split(),
-                                             'es': 'en español'.split()})
-        # unicode round trip test for GH 13747, GH 12529
-        cls.data['utf8'] = pd.DataFrame({'a': ['µasd', 'Ωœ∑´'],
-                                         'b': ['øπ∆˚¬', 'œ∑´®']})
-        cls.data_types = list(cls.data.keys())
-
-    @classmethod
-    def teardown_class(cls):
-        del cls.data_types, cls.data
-
-    def check_round_trip_frame(self, data_type, excel=None, sep=None,
+    def check_round_trip_frame(self, data, excel=None, sep=None,
                                encoding=None):
-        data = self.data[data_type]
         data.to_clipboard(excel=excel, sep=sep, encoding=encoding)
-        if sep is not None:
-            result = read_clipboard(sep=sep, index_col=0, encoding=encoding)
-        else:
-            result = read_clipboard(encoding=encoding)
+        result = read_clipboard(sep=sep or '\t', index_col=0,
+                                encoding=encoding)
         tm.assert_frame_equal(data, result, check_dtype=False)

-    def test_round_trip_frame_sep(self):
-        for dt in self.data_types:
-            self.check_round_trip_frame(dt, sep=',')
-            self.check_round_trip_frame(dt, sep=r'\s+')
-            self.check_round_trip_frame(dt, sep='|')
-
-    def test_round_trip_frame_string(self):
-        for dt in self.data_types:
-            self.check_round_trip_frame(dt, excel=False)
-
-    def test_round_trip_frame(self):
-        for dt in self.data_types:
-            self.check_round_trip_frame(dt)
+    # Test that default arguments copy as tab delimited
+    def test_round_trip_frame(self, df):
+        self.check_round_trip_frame(df)
+
+    # Test that explicit delimiters are respected
+    @pytest.mark.parametrize('sep', ['\t', ',', '|'])
+    def test_round_trip_frame_sep(self, df, sep):
+        self.check_round_trip_frame(df, sep=sep)
+
+    # Test the whitespace separator
+    def test_round_trip_frame_string(self, df):
+        df.to_clipboard(excel=False, sep=None)
+        result = read_clipboard()
+        assert df.to_string() == result.to_string()
+        assert df.shape == result.shape
+
+    # A two-character separator is not supported in to_clipboard.
+    # Test that multi-character separators are not silently passed.
+    def test_excel_sep_warning(self, df):
+        with tm.assert_produces_warning():
+            df.to_clipboard(excel=True, sep=r'\t')
+
+    # The separator is ignored when excel=False and should produce a warning
+    def test_copy_delim_warning(self, df):
+        with tm.assert_produces_warning():
+            df.to_clipboard(excel=False, sep='\t')
+
+    # Tests that the default behavior of to_clipboard is tab
+    # delimited with excel=True
+    @pytest.mark.parametrize('sep', ['\t', None, 'default'])
+    @pytest.mark.parametrize('excel', [True, None, 'default'])
+    def test_clipboard_copy_tabs_default(self, sep, excel, df):
+        kwargs =
build_kwargs(sep, excel) + df.to_clipboard(**kwargs) + if PY2: + # to_clipboard copies unicode, to_csv produces bytes. This is + # expected behavior + assert clipboard_get().encode('utf-8') == df.to_csv(sep='\t') + else: + assert clipboard_get() == df.to_csv(sep='\t') + + # Tests reading of white space separated tables + @pytest.mark.parametrize('sep', [None, 'default']) + @pytest.mark.parametrize('excel', [False]) + def test_clipboard_copy_strings(self, sep, excel, df): + kwargs = build_kwargs(sep, excel) + df.to_clipboard(**kwargs) + result = read_clipboard(sep=r'\s+') + assert result.to_string() == df.to_string() + assert df.shape == result.shape def test_read_clipboard_infer_excel(self): # gh-19010: avoid warnings @@ -124,15 +172,13 @@ def test_read_clipboard_infer_excel(self): tm.assert_frame_equal(res, exp) - def test_invalid_encoding(self): + def test_invalid_encoding(self, df): # test case for testing invalid encoding - data = self.data['string'] with pytest.raises(ValueError): - data.to_clipboard(encoding='ascii') + df.to_clipboard(encoding='ascii') with pytest.raises(NotImplementedError): pd.read_clipboard(encoding='ascii') - def test_round_trip_valid_encodings(self): - for enc in ['UTF-8', 'utf-8', 'utf8']: - for dt in self.data_types: - self.check_round_trip_frame(dt, encoding=enc) + @pytest.mark.parametrize('enc', ['UTF-8', 'utf-8', 'utf8']) + def test_round_trip_valid_encodings(self, enc, df): + self.check_round_trip_frame(df, encoding=enc) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a89156db38ae3..5c9739be73393 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -149,27 +149,22 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): reader(path) @pytest.mark.parametrize('reader, module, path', [ - (pd.read_csv, 'os', os.path.join(HERE, 'data', 'iris.csv')), - (pd.read_table, 'os', os.path.join(HERE, 'data', 'iris.csv')), - (pd.read_fwf, 'os', os.path.join(HERE, 'data', - 'fixed_width_format.txt')), - (pd.read_excel, 'xlrd', os.path.join(HERE, 'data', 'test1.xlsx')), - (pd.read_feather, 'feather', os.path.join(HERE, 'data', - 'feather-0_3_1.feather')), - (pd.read_hdf, 'tables', os.path.join(HERE, 'data', 'legacy_hdf', - 'datetimetz_object.h5')), - (pd.read_stata, 'os', os.path.join(HERE, 'data', 'stata10_115.dta')), - (pd.read_sas, 'os', os.path.join(HERE, 'sas', 'data', - 'test1.sas7bdat')), - (pd.read_json, 'os', os.path.join(HERE, 'json', 'data', - 'tsframe_v012.json')), - (pd.read_msgpack, 'os', os.path.join(HERE, 'msgpack', 'data', - 'frame.mp')), - (pd.read_pickle, 'os', os.path.join(HERE, 'data', - 'categorical_0_14_1.pickle')), + (pd.read_csv, 'os', ('io', 'data', 'iris.csv')), + (pd.read_table, 'os', ('io', 'data', 'iris.csv')), + (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')), + (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')), + (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')), + (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf', + 'datetimetz_object.h5')), + (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')), + (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')), + (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')), + (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')), + (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')), ]) - def test_read_fspath_all(self, reader, module, path): + def test_read_fspath_all(self, reader, module, path, datapath): pytest.importorskip(module) + 
path = datapath(*path) mypath = CustomFSPath(path) result = reader(mypath) @@ -232,13 +227,14 @@ def test_write_fspath_hdf5(self): tm.assert_frame_equal(result, expected) -class TestMMapWrapper(object): +@pytest.fixture +def mmap_file(datapath): + return datapath('io', 'data', 'test_mmap.csv') + - def setup_method(self, method): - self.mmap_file = os.path.join(tm.get_data_path(), - 'test_mmap.csv') +class TestMMapWrapper(object): - def test_constructor_bad_file(self): + def test_constructor_bad_file(self, mmap_file): non_file = StringIO('I am not a file') non_file.fileno = lambda: -1 @@ -252,15 +248,15 @@ def test_constructor_bad_file(self): tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file) - target = open(self.mmap_file, 'r') + target = open(mmap_file, 'r') target.close() msg = "I/O operation on closed file" tm.assert_raises_regex( ValueError, msg, common.MMapWrapper, target) - def test_get_attr(self): - with open(self.mmap_file, 'r') as target: + def test_get_attr(self, mmap_file): + with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) attrs = dir(wrapper.mmap) @@ -273,8 +269,8 @@ def test_get_attr(self): assert not hasattr(wrapper, 'foo') - def test_next(self): - with open(self.mmap_file, 'r') as target: + def test_next(self, mmap_file): + with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) lines = target.readlines() diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 05423474f330a..20f403e71fd36 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -39,8 +39,9 @@ @td.skip_if_no('xlrd', '0.9') class SharedItems(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") self.frame = _frame.copy() self.frame2 = _frame2.copy() self.tsframe = _tsframe.copy() @@ -49,7 +50,6 @@ def setup_method(self, method): def get_csv_refdf(self, basename): """ Obtain the reference data from read_csv with the Python engine. - Test data path is defined by pandas.util.testing.get_data_path() Parameters ---------- @@ -68,8 +68,7 @@ def get_csv_refdf(self, basename): def get_excelfile(self, basename, ext): """ - Return test data ExcelFile instance. Test data path is defined by - pandas.util.testing.get_data_path() + Return test data ExcelFile instance. Parameters ---------- @@ -86,8 +85,7 @@ def get_excelfile(self, basename, ext): def get_exceldf(self, basename, ext, *args, **kwds): """ - Return test data DataFrame. Test data path is defined by - pandas.util.testing.get_data_path() + Return test data DataFrame. 
Parameters ---------- @@ -578,6 +576,7 @@ def test_read_from_http_url(self, ext): tm.assert_frame_equal(url_table, local_table) @td.skip_if_no('s3fs') + @td.skip_if_not_us_locale def test_read_from_s3_url(self, ext): boto3 = pytest.importorskip('boto3') moto = pytest.importorskip('moto') diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index a56946b82b027..9c6a8de7ed446 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1,6 +1,5 @@ from __future__ import print_function -import glob import os import re import threading @@ -25,8 +24,18 @@ import pandas.util._test_decorators as td from pandas.util.testing import makeCustomDataframe as mkdf, network +HERE = os.path.dirname(__file__) -DATA_PATH = tm.get_data_path() + +@pytest.fixture(params=[ + 'chinese_utf-16.html', + 'chinese_utf-32.html', + 'chinese_utf-8.html', + 'letz_latin1.html', +]) +def html_encoding_file(request, datapath): + """Parametrized fixture for HTML encoding test filenames.""" + return datapath('io', 'data', 'html_encoding', request.param) def assert_framelist_equal(list1, list2, *args, **kwargs): @@ -44,11 +53,11 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): @td.skip_if_no('bs4') -def test_bs4_version_fails(monkeypatch): +def test_bs4_version_fails(monkeypatch, datapath): import bs4 monkeypatch.setattr(bs4, '__version__', '4.2') with tm.assert_raises_regex(ValueError, "minimum version"): - read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4') + read_html(datapath("io", "data", "spam.html"), flavor='bs4') def test_invalid_flavor(): @@ -59,8 +68,8 @@ def test_invalid_flavor(): @td.skip_if_no('bs4') @td.skip_if_no('lxml') -def test_same_ordering(): - filename = os.path.join(DATA_PATH, 'valid_markup.html') +def test_same_ordering(datapath): + filename = datapath('io', 'data', 'valid_markup.html') dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) assert_framelist_equal(dfs_lxml, dfs_bs4) @@ -72,11 +81,14 @@ def test_same_ordering(): pytest.param('lxml', marks=pytest.mark.skipif( not td.safe_import('lxml'), reason='No lxml'))], scope="class") class TestReadHtml(object): - spam_data = os.path.join(DATA_PATH, 'spam.html') - spam_data_kwargs = {} - if PY3: - spam_data_kwargs['encoding'] = 'UTF-8' - banklist_data = os.path.join(DATA_PATH, 'banklist.html') + + @pytest.fixture(autouse=True) + def set_files(self, datapath): + self.spam_data = datapath('io', 'data', 'spam.html') + self.spam_data_kwargs = {} + if PY3: + self.spam_data_kwargs['encoding'] = 'UTF-8' + self.banklist_data = datapath("io", "data", "banklist.html") @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor, request): @@ -272,7 +284,8 @@ def test_invalid_url(self): @pytest.mark.slow def test_file_url(self): url = self.banklist_data - dfs = self.read_html(file_path_to_url(url), 'First', + dfs = self.read_html(file_path_to_url(os.path.abspath(url)), + 'First', attrs={'id': 'table'}) assert isinstance(dfs, list) for df in dfs: @@ -326,7 +339,7 @@ def test_multiindex_header_index_skiprows(self): @pytest.mark.slow def test_regex_idempotency(self): url = self.banklist_data - dfs = self.read_html(file_path_to_url(url), + dfs = self.read_html(file_path_to_url(os.path.abspath(url)), match=re.compile(re.compile('Florida')), attrs={'id': 'table'}) assert isinstance(dfs, list) @@ -352,9 +365,9 @@ def test_python_docs_table(self): assert sorted(zz) == sorted(['Repo', 'What']) @pytest.mark.slow - def 
test_thousands_macau_stats(self): + def test_thousands_macau_stats(self, datapath): all_non_nan_table_index = -2 - macau_data = os.path.join(DATA_PATH, 'macau.html') + macau_data = datapath("io", "data", "macau.html") dfs = self.read_html(macau_data, index_col=0, attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] @@ -362,9 +375,9 @@ def test_thousands_macau_stats(self): assert not any(s.isna().any() for _, s in df.iteritems()) @pytest.mark.slow - def test_thousands_macau_index_col(self): + def test_thousands_macau_index_col(self, datapath): all_non_nan_table_index = -2 - macau_data = os.path.join(DATA_PATH, 'macau.html') + macau_data = datapath('io', 'data', 'macau.html') dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] @@ -518,8 +531,8 @@ def test_countries_municipalities(self): res2 = self.read_html(data2, header=0) assert_framelist_equal(res1, res2) - def test_nyse_wsj_commas_table(self): - data = os.path.join(DATA_PATH, 'nyse_wsj.html') + def test_nyse_wsj_commas_table(self, datapath): + data = datapath('io', 'data', 'nyse_wsj.html') df = self.read_html(data, index_col=0, header=0, attrs={'class': 'mdcTable'})[0] @@ -530,7 +543,7 @@ def test_nyse_wsj_commas_table(self): tm.assert_index_equal(df.columns, columns) @pytest.mark.slow - def test_banklist_header(self): + def test_banklist_header(self, datapath): from pandas.io.html import _remove_whitespace def try_remove_ws(x): @@ -541,7 +554,7 @@ def try_remove_ws(x): df = self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'})[0] - ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'), + ground_truth = read_csv(datapath('io', 'data', 'banklist.csv'), converters={'Updated Date': Timestamp, 'Closing Date': Timestamp}) assert df.shape == ground_truth.shape @@ -658,19 +671,19 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) - def test_computer_sales_page(self): - data = os.path.join(DATA_PATH, 'computer_sales_page.html') + def test_computer_sales_page(self, datapath): + data = datapath('io', 'data', 'computer_sales_page.html') with tm.assert_raises_regex(ParserError, r"Passed header=\[0,1\] are " r"too many rows for this " r"multi_index of columns"): self.read_html(data, header=[0, 1]) - data = os.path.join(DATA_PATH, 'computer_sales_page.html') + data = datapath('io', 'data', 'computer_sales_page.html') assert self.read_html(data, header=[1, 2]) - def test_wikipedia_states_table(self): - data = os.path.join(DATA_PATH, 'wikipedia_states.html') + def test_wikipedia_states_table(self, datapath): + data = datapath('io', 'data', 'wikipedia_states.html') assert os.path.isfile(data), '%r is not a file' % data assert os.path.getsize(data), '%r is an empty file' % data result = self.read_html(data, 'Arizona', header=1)[0] @@ -784,15 +797,15 @@ def test_multiple_header_rows(self): html_df = read_html(html, )[0] tm.assert_frame_equal(expected_df, html_df) - def test_works_on_valid_markup(self): - filename = os.path.join(DATA_PATH, 'valid_markup.html') + def test_works_on_valid_markup(self, datapath): + filename = datapath('io', 'data', 'valid_markup.html') dfs = self.read_html(filename, index_col=0) assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow - def test_fallback_success(self): - banklist_data = os.path.join(DATA_PATH, 'banklist.html') + def test_fallback_success(self, datapath): + banklist_data = datapath('io', 'data', 'banklist.html') self.read_html(banklist_data, 
'.*Water.*', flavor=['lxml', 'html5lib']) def test_to_html_timestamp(self): @@ -835,22 +848,23 @@ def test_displayed_only(self, displayed_only, exp0, exp1): else: assert len(dfs) == 1 # Should not parse hidden table - @pytest.mark.parametrize("f", glob.glob( - os.path.join(DATA_PATH, 'html_encoding', '*.html'))) - def test_encode(self, f): - _, encoding = os.path.splitext(os.path.basename(f))[0].split('_') + def test_encode(self, html_encoding_file): + _, encoding = os.path.splitext( + os.path.basename(html_encoding_file) + )[0].split('_') try: - with open(f, 'rb') as fobj: + with open(html_encoding_file, 'rb') as fobj: from_string = self.read_html(fobj.read(), encoding=encoding, index_col=0).pop() - with open(f, 'rb') as fobj: + with open(html_encoding_file, 'rb') as fobj: from_file_like = self.read_html(BytesIO(fobj.read()), encoding=encoding, index_col=0).pop() - from_filename = self.read_html(f, encoding=encoding, + from_filename = self.read_html(html_encoding_file, + encoding=encoding, index_col=0).pop() tm.assert_frame_equal(from_string, from_file_like) tm.assert_frame_equal(from_string, from_filename) @@ -906,7 +920,7 @@ def seekable(self): assert self.read_html(bad) @pytest.mark.slow - def test_importcheck_thread_safety(self): + def test_importcheck_thread_safety(self, datapath): # see gh-16928 class ErrorThread(threading.Thread): @@ -921,7 +935,7 @@ def run(self): # force import check by reinitalising global vars in html.py reload(pandas.io.html) - filename = os.path.join(DATA_PATH, 'valid_markup.html') + filename = datapath('io', 'data', 'valid_markup.html') helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index cfac77291803d..491d5fe33cc33 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -3,6 +3,7 @@ from warnings import catch_warnings import os import datetime +import glob import numpy as np from distutils.version import LooseVersion @@ -837,13 +838,13 @@ def test_default_encoding(self): assert_frame_equal(result, frame) -def legacy_packers_versions(): - # yield the packers versions - path = tm.get_data_path('legacy_msgpack') - for v in os.listdir(path): - p = os.path.join(path, v) - if os.path.isdir(p): - yield v +files = glob.glob(os.path.join(os.path.dirname(__file__), "data", + "legacy_msgpack", "*", "*.msgpack")) + + +@pytest.fixture(params=files) +def legacy_packer(request, datapath): + return datapath(request.param) class TestMsgpack(object): @@ -920,24 +921,20 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): else: tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('version', legacy_packers_versions()) def test_msgpacks_legacy(self, current_packers_data, all_packers_data, - version): - - pth = tm.get_data_path('legacy_msgpack/{0}'.format(version)) - n = 0 - for f in os.listdir(pth): - # GH12142 0.17 files packed in P2 can't be read in P3 - if (compat.PY3 and version.startswith('0.17.') and - f.split('.')[-4][-1] == '2'): - continue - vf = os.path.join(pth, f) - try: - with catch_warnings(record=True): - self.compare(current_packers_data, all_packers_data, - vf, version) - except ImportError: - # blosc not installed - continue - n += 1 - assert n > 0, 'Msgpack files are not tested' + legacy_packer, datapath): + + version = os.path.basename(os.path.dirname(legacy_packer)) + + # GH12142 0.17 files packed in P2 can't be read in P3 + 
if (compat.PY3 and version.startswith('0.17.') and + legacy_packer.split('.')[-4][-1] == '2'): + msg = "Files packed in Py2 can't be read in Py3 ({})" + pytest.skip(msg.format(version)) + try: + with catch_warnings(record=True): + self.compare(current_packers_data, all_packers_data, + legacy_packer, version) + except ImportError: + # blosc not installed + pass diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index fbe2174e603e2..45cbbd43cd6a8 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -12,7 +12,7 @@ 3. Move the created pickle to "data/legacy_pickle/" directory. """ - +import glob import pytest from warnings import catch_warnings @@ -184,27 +184,25 @@ def compare_sp_frame_float(result, expected, typ, version): tm.assert_sp_frame_equal(result, expected) +files = glob.glob(os.path.join(os.path.dirname(__file__), "data", + "legacy_pickle", "*", "*.pickle")) + + +@pytest.fixture(params=files) +def legacy_pickle(request, datapath): + return datapath(request.param) + + # --------------------- # tests # --------------------- -def legacy_pickle_versions(): - # yield the pickle versions - path = tm.get_data_path('legacy_pickle') - for v in os.listdir(path): - p = os.path.join(path, v) - if os.path.isdir(p): - for f in os.listdir(p): - yield (v, f) - - -@pytest.mark.parametrize('version, f', legacy_pickle_versions()) -def test_pickles(current_pickle_data, version, f): +def test_pickles(current_pickle_data, legacy_pickle): if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") - vf = tm.get_data_path('legacy_pickle/{}/{}'.format(version, f)) + version = os.path.basename(os.path.dirname(legacy_pickle)) with catch_warnings(record=True): - compare(current_pickle_data, vf, version) + compare(current_pickle_data, legacy_pickle, version) def test_round_trip_current(current_pickle_data): @@ -260,12 +258,11 @@ def python_unpickler(path): compare_element(result, expected, typ) -def test_pickle_v0_14_1(): +def test_pickle_v0_14_1(datapath): cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, categories=['a', 'b', 'c', 'd']) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_14_1.pickle') + pickle_path = datapath('io', 'data', 'categorical_0_14_1.pickle') # This code was executed once on v0.14.1 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], @@ -275,14 +272,13 @@ def test_pickle_v0_14_1(): tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) -def test_pickle_v0_15_2(): +def test_pickle_v0_15_2(datapath): # ordered -> _ordered # GH 9347 cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, categories=['a', 'b', 'c', 'd']) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_15_2.pickle') + pickle_path = datapath('io', 'data', 'categorical_0_15_2.pickle') # This code was executed once on v0.15.2 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 5ac91c15047ff..3c6b52074763e 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -14,7 +14,7 @@ from pandas import (Series, DataFrame, Panel, MultiIndex, Int64Index, RangeIndex, Categorical, bdate_range, date_range, timedelta_range, Index, DatetimeIndex, - isna, compat, concat, Timestamp) + isna, compat, concat, Timestamp, _np_version_under1p15) import pandas.util.testing as tm import 
pandas.util._test_decorators as td @@ -1842,6 +1842,12 @@ def make_index(names=None): 'a', 'b'], index=make_index(['date', 'a', 't'])) pytest.raises(ValueError, store.append, 'df', df) + # dup within level + _maybe_remove(store, 'df') + df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'], + index=make_index(['date', 'date', 'date'])) + pytest.raises(ValueError, store.append, 'df', df) + # fully names _maybe_remove(store, 'df') df = DataFrame(np.zeros((12, 2)), columns=[ @@ -2134,6 +2140,10 @@ def test_unimplemented_dtypes_table_columns(self): # this fails because we have a date in the object block...... pytest.raises(TypeError, store.append, 'df_unimplemented', df) + @pytest.mark.skipif( + not _np_version_under1p15, + reason=("pytables conda build package needs build " + "with numpy 1.15: gh-22098")) def test_calendar_roundtrip_issue(self): # 8591 @@ -4452,28 +4462,27 @@ def f(): store.select('df') tm.assert_raises_regex(ClosedFileError, 'file is not open', f) - def test_pytables_native_read(self): - + def test_pytables_native_read(self, datapath): with ensure_clean_store( - tm.get_data_path('legacy_hdf/pytables_native.h5'), + datapath('io', 'data', 'legacy_hdf/pytables_native.h5'), mode='r') as store: d2 = store['detector/readout'] assert isinstance(d2, DataFrame) @pytest.mark.skipif(PY35 and is_platform_windows(), reason="native2 read fails oddly on windows / 3.5") - def test_pytables_native2_read(self): + def test_pytables_native2_read(self, datapath): with ensure_clean_store( - tm.get_data_path('legacy_hdf/pytables_native2.h5'), + datapath('io', 'data', 'legacy_hdf', 'pytables_native2.h5'), mode='r') as store: str(store) d1 = store['detector'] assert isinstance(d1, DataFrame) - def test_legacy_table_read(self): + def test_legacy_table_read(self, datapath): # legacy table types with ensure_clean_store( - tm.get_data_path('legacy_hdf/legacy_table.h5'), + datapath('io', 'data', 'legacy_hdf', 'legacy_table.h5'), mode='r') as store: with catch_warnings(record=True): @@ -5120,7 +5129,7 @@ def test_fspath(self): with pd.HDFStore(path) as store: assert os.fspath(store) == str(path) - def test_read_py2_hdf_file_in_py3(self): + def test_read_py2_hdf_file_in_py3(self, datapath): # GH 16781 # tests reading a PeriodIndex DataFrame written in Python2 in Python3 @@ -5135,8 +5144,8 @@ def test_read_py2_hdf_file_in_py3(self): ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) with ensure_clean_store( - tm.get_data_path( - 'legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), + datapath('io', 'data', 'legacy_hdf', + 'periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), mode='r') as store: result = store['p'] assert_frame_equal(result, expected) @@ -5533,14 +5542,14 @@ def test_store_timezone(self): assert_frame_equal(result, df) - def test_legacy_datetimetz_object(self): + def test_legacy_datetimetz_object(self, datapath): # legacy from < 0.17.0 # 8260 expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), B=Timestamp('20130603', tz='CET')), index=range(5)) with ensure_clean_store( - tm.get_data_path('legacy_hdf/datetimetz_object.h5'), + datapath('io', 'data', 'legacy_hdf', 'datetimetz_object.h5'), mode='r') as store: result = store['df'] assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 4530cc9d2fba9..f8f742c5980ac 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -22,7 +22,6 @@ import pytest import sqlite3 import csv -import os import warnings import numpy as np @@ -184,9 +183,11 @@ class 
MixInBase(object): def teardown_method(self, method): - for tbl in self._get_all_tables(): - self.drop_table(tbl) - self._close_conn() + # if setup fails, there may not be a connection to close. + if hasattr(self, 'conn'): + for tbl in self._get_all_tables(): + self.drop_table(tbl) + self._close_conn() class MySQLMixIn(MixInBase): @@ -253,9 +254,9 @@ def _get_exec(self): else: return self.conn.cursor() - def _load_iris_data(self): + def _load_iris_data(self, datapath): import io - iris_csv_file = os.path.join(tm.get_data_path(), 'iris.csv') + iris_csv_file = datapath('io', 'data', 'iris.csv') self.drop_table('iris') self._get_exec().execute(SQL_STRINGS['create_iris'][self.flavor]) @@ -503,9 +504,10 @@ class _TestSQLApi(PandasSQLTest): flavor = 'sqlite' mode = None - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.conn = self.connect() - self._load_iris_data() + self._load_iris_data(datapath) self._load_iris_view() self._load_test1_data() self._load_test2_data() @@ -1025,8 +1027,9 @@ class _EngineToConnMixin(object): A mixin that causes setup_connect to create a conn rather than an engine. """ - def setup_method(self, method): - super(_EngineToConnMixin, self).setup_method(method) + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + super(_EngineToConnMixin, self).setup_method(datapath) engine = self.conn conn = engine.connect() self.__tx = conn.begin() @@ -1034,12 +1037,14 @@ def setup_method(self, method): self.__engine = engine self.conn = conn - def teardown_method(self, method): + yield + self.__tx.rollback() self.conn.close() self.conn = self.__engine self.pandasSQL = sql.SQLDatabase(self.__engine) - super(_EngineToConnMixin, self).teardown_method(method) + # XXX: + # super(_EngineToConnMixin, self).teardown_method(method) @pytest.mark.single @@ -1136,7 +1141,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): """ flavor = None - @classmethod + @pytest.fixture(autouse=True, scope='class') def setup_class(cls): cls.setup_import() cls.setup_driver() @@ -1149,10 +1154,11 @@ def setup_class(cls): msg = "{0} - can't connect to {1} server".format(cls, cls.flavor) pytest.skip(msg) - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.setup_connect() - self._load_iris_data() + self._load_iris_data(datapath) self._load_raw_sql() self._load_test1_data() @@ -1665,29 +1671,6 @@ class Temporary(Base): tm.assert_frame_equal(df, expected) - def test_insert_multivalues(self): - # issues addressed - # https://github.com/pandas-dev/pandas/issues/14315 - # https://github.com/pandas-dev/pandas/issues/8953 - - db = sql.SQLDatabase(self.conn) - df = DataFrame({'A': [1, 0, 0], 'B': [1.1, 0.2, 4.3]}) - table = sql.SQLTable("test_table", db, frame=df) - data = [ - {'A': 1, 'B': 0.46}, - {'A': 0, 'B': -2.06} - ] - statement = table.insert_statement(data, conn=self.conn)[0] - - if self.supports_multivalues_insert: - assert statement.parameters == data, ( - 'insert statement should be multivalues' - ) - else: - assert statement.parameters is None, ( - 'insert statement should not be multivalues' - ) - class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): @@ -1702,7 +1685,6 @@ class _TestSQLiteAlchemy(object): """ flavor = 'sqlite' - supports_multivalues_insert = True @classmethod def connect(cls): @@ -1751,7 +1733,6 @@ class _TestMySQLAlchemy(object): """ flavor = 'mysql' - supports_multivalues_insert = True @classmethod def connect(cls): @@ -1821,7 
+1802,6 @@ class _TestPostgreSQLAlchemy(object): """ flavor = 'postgresql' - supports_multivalues_insert = True @classmethod def connect(cls): @@ -1946,11 +1926,12 @@ class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): def connect(cls): return sqlite3.connect(':memory:') - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.conn = self.connect() self.pandasSQL = sql.SQLiteDatabase(self.conn) - self._load_iris_data() + self._load_iris_data(datapath) self._load_test1_data() @@ -2161,8 +2142,9 @@ def _skip_if_no_pymysql(): @pytest.mark.single class TestXSQLite(SQLiteMixIn): - def setup_method(self, method): - self.method = method + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): + self.method = request.function self.conn = sqlite3.connect(':memory:') def test_basic(self): @@ -2241,8 +2223,7 @@ def test_execute_fail(self): with pytest.raises(Exception): sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) - @tm.capture_stdout - def test_execute_closed_connection(self): + def test_execute_closed_connection(self, request, datapath): create_sql = """ CREATE TABLE test ( @@ -2262,7 +2243,7 @@ def test_execute_closed_connection(self): tquery("select * from test", con=self.conn) # Initialize connection again (needed for tearDown) - self.setup_method(self.method) + self.setup_method(request, datapath) def test_na_roundtrip(self): pass @@ -2367,7 +2348,7 @@ def clean_up(test_table_to_drop): "if SQLAlchemy is not installed") class TestXMySQL(MySQLMixIn): - @classmethod + @pytest.fixture(autouse=True, scope='class') def setup_class(cls): _skip_if_no_pymysql() @@ -2396,7 +2377,8 @@ def setup_class(cls): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): _skip_if_no_pymysql() import pymysql try: @@ -2422,7 +2404,7 @@ def setup_method(self, method): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. 
") - self.method = method + self.method = request.function def test_basic(self): _skip_if_no_pymysql() @@ -2527,8 +2509,7 @@ def test_execute_fail(self): with pytest.raises(Exception): sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) - @tm.capture_stdout - def test_execute_closed_connection(self): + def test_execute_closed_connection(self, request, datapath): _skip_if_no_pymysql() drop_sql = "DROP TABLE IF EXISTS test" create_sql = """ @@ -2551,7 +2532,7 @@ def test_execute_closed_connection(self): tquery("select * from test", con=self.conn) # Initialize connection again (needed for tearDown) - self.setup_method(self.method) + self.setup_method(request, datapath) def test_na_roundtrip(self): _skip_if_no_pymysql() diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 110b790a65037..cff63516f4086 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2,6 +2,8 @@ # pylint: disable=E1101 import datetime as dt +import io +import gzip import os import struct import warnings @@ -23,8 +25,8 @@ @pytest.fixture -def dirpath(): - return tm.get_data_path() +def dirpath(datapath): + return datapath("io", "data") @pytest.fixture @@ -37,8 +39,9 @@ def parsed_114(dirpath): class TestStata(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') @@ -1473,3 +1476,28 @@ def test_invalid_date_conversion(self): with pytest.raises(ValueError): original.to_stata(path, convert_dates={'wrong_name': 'tc'}) + + @pytest.mark.parametrize('version', [114, 117]) + def test_nonfile_writing(self, version): + # GH 21041 + bio = io.BytesIO() + df = tm.makeDataFrame() + df.index.name = 'index' + with tm.ensure_clean() as path: + df.to_stata(bio, version=version) + bio.seek(0) + with open(path, 'wb') as dta: + dta.write(bio.read()) + reread = pd.read_stata(path, index_col='index') + tm.assert_frame_equal(df, reread) + + def test_gzip_writing(self): + # writing version 117 requires seek and cannot be used with gzip + df = tm.makeDataFrame() + df.index.name = 'index' + with tm.ensure_clean() as path: + with gzip.GzipFile(path, 'wb') as gz: + df.to_stata(gz, version=114) + with gzip.GzipFile(path, 'rb') as gz: + reread = pd.read_stata(gz, index_col='index') + tm.assert_frame_equal(df, reread) diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index f65791329f2f1..09687dd97bd43 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -74,11 +74,6 @@ def setup_method(self, method): else: self.default_figsize = (8.0, 6.0) self.default_tick_position = 'left' if self.mpl_ge_2_0_0 else 'default' - # common test data - from pandas import read_csv - base = os.path.join(os.path.dirname(curpath()), os.pardir) - path = os.path.join(base, 'tests', 'data', 'iris.csv') - self.iris = read_csv(path) n = 100 with tm.RNGContext(42): diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 47cded19f5300..bb976a1e3e81c 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -1,4 +1,5 @@ import subprocess +import sys import pytest from datetime import datetime, date @@ -27,7 +28,7 @@ def test_register_by_default(self): "import pandas as pd; " "units = dict(matplotlib.units.registry); " 
"assert pd.Timestamp in units)'") - call = ['python', '-c', code] + call = [sys.executable, '-c', code] assert subprocess.check_call(call) == 0 def test_warns(self): diff --git a/pandas/tests/plotting/test_deprecated.py b/pandas/tests/plotting/test_deprecated.py index 2c2d371921d2f..a45b17ec98261 100644 --- a/pandas/tests/plotting/test_deprecated.py +++ b/pandas/tests/plotting/test_deprecated.py @@ -46,10 +46,9 @@ def test_boxplot_deprecated(self): by='indic') @pytest.mark.slow - def test_radviz_deprecated(self): - df = self.iris + def test_radviz_deprecated(self, iris): with tm.assert_produces_warning(FutureWarning): - plotting.radviz(frame=df, class_column='Name') + plotting.radviz(frame=iris, class_column='Name') @pytest.mark.slow def test_plot_params(self): diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index c82c939584dc7..0473610ea2f8f 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -100,11 +100,11 @@ def test_scatter_matrix_axis(self): axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow - def test_andrews_curves(self): + def test_andrews_curves(self, iris): from pandas.plotting import andrews_curves from matplotlib import cm - df = self.iris + df = iris _check_plot_works(andrews_curves, frame=df, class_column='Name') @@ -165,11 +165,11 @@ def test_andrews_curves(self): andrews_curves(data=df, class_column='Name') @pytest.mark.slow - def test_parallel_coordinates(self): + def test_parallel_coordinates(self, iris): from pandas.plotting import parallel_coordinates from matplotlib import cm - df = self.iris + df = iris ax = _check_plot_works(parallel_coordinates, frame=df, class_column='Name') @@ -234,11 +234,11 @@ def test_parallel_coordinates_with_sorted_labels(self): assert prev[1] < nxt[1] and prev[0] < nxt[0] @pytest.mark.slow - def test_radviz(self): + def test_radviz(self, iris): from pandas.plotting import radviz from matplotlib import cm - df = self.iris + df = iris _check_plot_works(radviz, frame=df, class_column='Name') rgba = ('#556270', '#4ECDC4', '#C7F464') @@ -272,8 +272,8 @@ def test_radviz(self): self._check_colors(handles, facecolors=colors) @pytest.mark.slow - def test_subplot_titles(self): - df = self.iris.drop('Name', axis=1).head() + def test_subplot_titles(self, iris): + df = iris.drop('Name', axis=1).head() # Use the column names as the subplot titles title = list(df.columns) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 8e639edd34b18..037bd9cc7cd18 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1526,6 +1526,27 @@ def test_merge_on_ints_floats_warning(self): result = B.merge(A, left_on='Y', right_on='X') assert_frame_equal(result, expected[['Y', 'X']]) + def test_merge_incompat_infer_boolean_object(self): + # GH21119: bool + object bool merge OK + df1 = DataFrame({'key': Series([True, False], dtype=object)}) + df2 = DataFrame({'key': [True, False]}) + + expected = DataFrame({'key': [True, False]}, dtype=object) + result = pd.merge(df1, df2, on='key') + assert_frame_equal(result, expected) + result = pd.merge(df2, df1, on='key') + assert_frame_equal(result, expected) + + # with missing value + df1 = DataFrame({'key': Series([True, False, np.nan], dtype=object)}) + df2 = DataFrame({'key': [True, False]}) + + expected = DataFrame({'key': [True, False]}, dtype=object) + result = pd.merge(df1, df2, on='key') + assert_frame_equal(result, expected) + result = 
pd.merge(df2, df1, on='key') + assert_frame_equal(result, expected) + @pytest.mark.parametrize('df1_vals, df2_vals', [ ([0, 1, 2], ["0", "1", "2"]), ([0.0, 1.0, 2.0], ["0", "1", "2"]), @@ -1538,6 +1559,8 @@ def test_merge_on_ints_floats_warning(self): pd.date_range('20130101', periods=3, tz='US/Eastern')), ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')), ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')), + # TODO ([0, 1], pd.Series([False, True], dtype=bool)), + ([0, 1], pd.Series([False, True], dtype=object)) ]) def test_merge_incompat_dtypes(self, df1_vals, df2_vals): # GH 9780, GH 15800 diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index cebbcc41c3e17..59b53cd23010e 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1,4 +1,3 @@ -import os import pytest import pytz @@ -13,8 +12,8 @@ class TestAsOfMerge(object): - def read_data(self, name, dedupe=False): - path = os.path.join(tm.get_data_path(), name) + def read_data(self, datapath, name, dedupe=False): + path = datapath('reshape', 'merge', 'data', name) x = read_csv(path) if dedupe: x = (x.drop_duplicates(['time', 'ticker'], keep='last') @@ -23,15 +22,17 @@ def read_data(self, name, dedupe=False): x.time = to_datetime(x.time) return x - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): - self.trades = self.read_data('trades.csv') - self.quotes = self.read_data('quotes.csv', dedupe=True) - self.asof = self.read_data('asof.csv') - self.tolerance = self.read_data('tolerance.csv') - self.allow_exact_matches = self.read_data('allow_exact_matches.csv') + self.trades = self.read_data(datapath, 'trades.csv') + self.quotes = self.read_data(datapath, 'quotes.csv', dedupe=True) + self.asof = self.read_data(datapath, 'asof.csv') + self.tolerance = self.read_data(datapath, 'tolerance.csv') + self.allow_exact_matches = self.read_data(datapath, + 'allow_exact_matches.csv') self.allow_exact_matches_and_tolerance = self.read_data( - 'allow_exact_matches_and_tolerance.csv') + datapath, 'allow_exact_matches_and_tolerance.csv') def test_examples1(self): """ doc-string examples """ @@ -423,11 +424,11 @@ def test_multiby_indexed(self): pd.merge_asof(left, right, left_index=True, right_index=True, left_by=['k1', 'k2'], right_by=['k1']) - def test_basic2(self): + def test_basic2(self, datapath): - expected = self.read_data('asof2.csv') - trades = self.read_data('trades2.csv') - quotes = self.read_data('quotes2.csv', dedupe=True) + expected = self.read_data(datapath, 'asof2.csv') + trades = self.read_data(datapath, 'trades2.csv') + quotes = self.read_data(datapath, 'quotes2.csv', dedupe=True) result = merge_asof(trades, quotes, on='time', @@ -467,14 +468,14 @@ def test_valid_join_keys(self): merge_asof(trades, quotes, by='ticker') - def test_with_duplicates(self): + def test_with_duplicates(self, datapath): q = pd.concat([self.quotes, self.quotes]).sort_values( ['time', 'ticker']).reset_index(drop=True) result = merge_asof(self.trades, q, on='time', by='ticker') - expected = self.read_data('asof.csv') + expected = self.read_data(datapath, 'asof.csv') assert_frame_equal(result, expected) def test_with_duplicates_no_on(self): diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index f5e58fa70e1c4..dea305d4b3fee 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2487,3 +2487,14 @@ def 
test_concat_aligned_sort_does_not_raise(): columns=[1, 'a']) result = pd.concat([df, df], ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("s1name,s2name", [ + (np.int64(190), (43, 0)), (190, (43, 0))]) +def test_concat_series_name_npscalar_tuple(s1name, s2name): + # GH21015 + s1 = pd.Series({'a': 1, 'b': 2}, name=s1name) + s2 = pd.Series({'c': 5, 'd': 6}, name=s2name) + result = pd.concat([s1, s2]) + expected = pd.Series({'a': 1, 'b': 2, 'c': 5, 'd': 6}) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d2cf3fc11e165..b71954163f9e1 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from datetime import datetime, date, timedelta @@ -16,6 +17,11 @@ from pandas.api.types import CategoricalDtype as CDT +@pytest.fixture(params=[True, False]) +def dropna(request): + return request.param + + class TestPivotTable(object): def setup_method(self, method): @@ -109,7 +115,6 @@ def test_pivot_table_categorical(self): index=exp_index) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('dropna', [True, False]) def test_pivot_table_dropna_categoricals(self, dropna): # GH 15193 categories = ['a', 'b', 'c', 'd'] @@ -137,6 +142,25 @@ def test_pivot_table_dropna_categoricals(self, dropna): tm.assert_frame_equal(result, expected) + def test_pivot_with_non_observable_dropna(self, dropna): + # gh-21133 + df = pd.DataFrame( + {'A': pd.Categorical([np.nan, 'low', 'high', 'low', 'high'], + categories=['low', 'high'], + ordered=True), + 'B': range(5)}) + + result = df.pivot_table(index='A', values='B', dropna=dropna) + expected = pd.DataFrame( + {'B': [2, 3]}, + index=pd.Index( + pd.Categorical.from_codes([0, 1], + categories=['low', 'high'], + ordered=True), + name='A')) + + tm.assert_frame_equal(result, expected) + def test_pass_array(self): result = self.data.pivot_table( 'D', index=self.data.A, columns=self.data.C) @@ -1705,9 +1729,15 @@ def test_crosstab_with_numpy_size(self): tm.assert_frame_equal(result, expected) def test_crosstab_dup_index_names(self): - # GH 13279, GH 18872 + # GH 13279 s = pd.Series(range(3), name='foo') - pytest.raises(ValueError, pd.crosstab, s, s) + + result = pd.crosstab(s, s) + expected_index = pd.Index(range(3), name='foo') + expected = pd.DataFrame(np.eye(3, dtype=np.int64), + index=expected_index, + columns=expected_index) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("names", [['a', ('b', 'c')], [('a', 'b'), 'c']]) diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index 5ea27f9e34e1c..807fb2530603a 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -282,10 +282,10 @@ def test_round_frac(self): result = tmod._round_frac(0.000123456, precision=2) assert result == 0.00012 - def test_qcut_binning_issues(self): + def test_qcut_binning_issues(self, datapath): # #1978, 1979 - path = os.path.join(tm.get_data_path(), 'cut_data.csv') - arr = np.loadtxt(path) + cut_file = datapath(os.path.join('reshape', 'data', 'cut_data.csv')) + arr = np.loadtxt(cut_file) result = qcut(arr, 20) diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 3fdc2aa71bfc0..6472bd4245622 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -106,6 +106,16 @@ def 
test_compare_timedelta_ndarray(self): class TestTimedeltas(object): + @pytest.mark.parametrize("unit, value, expected", [ + ('us', 9.999, 9999), ('ms', 9.999999, 9999999), + ('s', 9.999999999, 9999999999)]) + def test_rounding_on_int_unit_construction(self, unit, value, expected): + # GH 12690 + result = Timedelta(value, unit=unit) + assert result.value == expected + result = Timedelta(str(value) + unit) + assert result.value == expected + def test_total_seconds_scalar(self): # see gh-10939 rng = Timedelta('1 days, 10:11:12.100123456') @@ -578,3 +588,17 @@ def test_components(self): result = s.dt.components assert not result.iloc[0].isna().all() assert result.iloc[1].isna().all() + + +@pytest.mark.parametrize('value, expected', [ + (Timedelta('10S'), True), + (Timedelta('-10S'), True), + (Timedelta(10, unit='ns'), True), + (Timedelta(0, unit='ns'), False), + (Timedelta(-10, unit='ns'), True), + (Timedelta(None), True), + (pd.NaT, True), +]) +def test_truthiness(value, expected): + # https://github.com/pandas-dev/pandas/issues/21484 + assert bool(value) is expected diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index b022b327de57c..e829506e95b53 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -5,6 +5,7 @@ import dateutil import calendar import locale +import unicodedata import numpy as np from dateutil.tz import tzutc @@ -20,7 +21,7 @@ from pandas._libs.tslibs.timezones import get_timezone, dateutil_gettz as gettz from pandas.errors import OutOfBoundsDatetime -from pandas.compat import long, PY3 +from pandas.compat import long, PY3, PY2 from pandas.compat.numpy import np_datetime64_compat from pandas import Timestamp, Period, Timedelta, NaT @@ -116,8 +117,21 @@ def test_names(self, data, time_locale): expected_day = calendar.day_name[0].capitalize() expected_month = calendar.month_name[8].capitalize() - assert data.day_name(time_locale) == expected_day - assert data.month_name(time_locale) == expected_month + result_day = data.day_name(time_locale) + result_month = data.month_name(time_locale) + + # Work around https://github.com/pandas-dev/pandas/issues/22342 + # different normalizations + + if not PY2: + expected_day = unicodedata.normalize("NFD", expected_day) + expected_month = unicodedata.normalize("NFD", expected_month) + + result_day = unicodedata.normalize("NFD", result_day,) + result_month = unicodedata.normalize("NFD", result_month) + + assert result_day == expected_day + assert result_month == expected_month # Test NaT nan_ts = Timestamp(NaT) @@ -528,6 +542,14 @@ def test_disallow_setting_tz(self, tz): with pytest.raises(AttributeError): ts.tz = tz + @pytest.mark.parametrize('offset', ['+0300', '+0200']) + def test_construct_timestamp_near_dst(self, offset): + # GH 20854 + expected = Timestamp('2016-10-30 03:00:00{}'.format(offset), + tz='Europe/Helsinki') + result = Timestamp(expected, tz='Europe/Helsinki') + assert result == expected + class TestTimestamp(object): @@ -621,10 +643,51 @@ def test_basics_nanos(self): assert stamp.microsecond == 145224 assert stamp.nanosecond == 192 - def test_unit(self): - - def check(val, unit=None, h=1, s=1, us=0): - stamp = Timestamp(val, unit=unit) + @pytest.mark.parametrize('value, check_kwargs', [ + [946688461000000000, {}], + [946688461000000000 / long(1000), dict(unit='us')], + [946688461000000000 / long(1000000), dict(unit='ms')], + [946688461000000000 / long(1000000000), 
dict(unit='s')], + [10957, dict(unit='D', h=0)], + pytest.param((946688461000000000 + 500000) / long(1000000000), + dict(unit='s', us=499, ns=964), + marks=pytest.mark.skipif(not PY3, + reason='using truediv, so these' + ' are like floats')), + pytest.param((946688461000000000 + 500000000) / long(1000000000), + dict(unit='s', us=500000), + marks=pytest.mark.skipif(not PY3, + reason='using truediv, so these' + ' are like floats')), + pytest.param((946688461000000000 + 500000) / long(1000000), + dict(unit='ms', us=500), + marks=pytest.mark.skipif(not PY3, + reason='using truediv, so these' + ' are like floats')), + pytest.param((946688461000000000 + 500000) / long(1000000000), + dict(unit='s'), + marks=pytest.mark.skipif(PY3, + reason='get chopped in py2')), + pytest.param((946688461000000000 + 500000000) / long(1000000000), + dict(unit='s'), + marks=pytest.mark.skipif(PY3, + reason='get chopped in py2')), + pytest.param((946688461000000000 + 500000) / long(1000000), + dict(unit='ms'), + marks=pytest.mark.skipif(PY3, + reason='get chopped in py2')), + [(946688461000000000 + 500000) / long(1000), dict(unit='us', us=500)], + [(946688461000000000 + 500000000) / long(1000000), + dict(unit='ms', us=500000)], + [946688461000000000 / 1000.0 + 5, dict(unit='us', us=5)], + [946688461000000000 / 1000.0 + 5000, dict(unit='us', us=5000)], + [946688461000000000 / 1000000.0 + 0.5, dict(unit='ms', us=500)], + [946688461000000000 / 1000000.0 + 0.005, dict(unit='ms', us=5, ns=5)], + [946688461000000000 / 1000000000.0 + 0.5, dict(unit='s', us=500000)], + [10957 + 0.5, dict(unit='D', h=12)]]) + def test_unit(self, value, check_kwargs): + def check(value, unit=None, h=1, s=1, us=0, ns=0): + stamp = Timestamp(value, unit=unit) assert stamp.year == 2000 assert stamp.month == 1 assert stamp.day == 1 @@ -637,41 +700,9 @@ def check(val, unit=None, h=1, s=1, us=0): assert stamp.minute == 0 assert stamp.second == 0 assert stamp.microsecond == 0 - assert stamp.nanosecond == 0 - - ts = Timestamp('20000101 01:01:01') - val = ts.value - days = (ts - Timestamp('1970-01-01')).days - - check(val) - check(val / long(1000), unit='us') - check(val / long(1000000), unit='ms') - check(val / long(1000000000), unit='s') - check(days, unit='D', h=0) + assert stamp.nanosecond == ns - # using truediv, so these are like floats - if PY3: - check((val + 500000) / long(1000000000), unit='s', us=500) - check((val + 500000000) / long(1000000000), unit='s', us=500000) - check((val + 500000) / long(1000000), unit='ms', us=500) - - # get chopped in py2 - else: - check((val + 500000) / long(1000000000), unit='s') - check((val + 500000000) / long(1000000000), unit='s') - check((val + 500000) / long(1000000), unit='ms') - - # ok - check((val + 500000) / long(1000), unit='us', us=500) - check((val + 500000000) / long(1000000), unit='ms', us=500000) - - # floats - check(val / 1000.0 + 5, unit='us', us=5) - check(val / 1000.0 + 5000, unit='us', us=5000) - check(val / 1000000.0 + 0.5, unit='ms', us=500) - check(val / 1000000.0 + 0.005, unit='ms', us=5) - check(val / 1000000000.0 + 0.5, unit='s', us=500000) - check(days + 0.5, unit='D', h=12) + check(value, **check_kwargs) def test_roundtrip(self): diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index aecddab8477fc..dbe31ccb11114 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -118,6 +118,25 @@ def test_ceil_floor_edge(self, test_input, rounder, freq, expected): expected 
= Timestamp(expected)
         assert result == expected

+    @pytest.mark.parametrize('test_input, freq, expected', [
+        ('2018-01-01 00:02:06', '2s', '2018-01-01 00:02:06'),
+        ('2018-01-01 00:02:00', '2T', '2018-01-01 00:02:00'),
+        ('2018-01-01 00:04:00', '4T', '2018-01-01 00:04:00'),
+        ('2018-01-01 00:15:00', '15T', '2018-01-01 00:15:00'),
+        ('2018-01-01 00:20:00', '20T', '2018-01-01 00:20:00'),
+        ('2018-01-01 03:00:00', '3H', '2018-01-01 03:00:00'),
+    ])
+    @pytest.mark.parametrize('rounder', ['ceil', 'floor', 'round'])
+    def test_round_minute_freq(self, test_input, freq, expected, rounder):
+        # Ensure timestamps that shouldn't round don't!
+        # GH#21262
+
+        dt = Timestamp(test_input)
+        expected = Timestamp(expected)
+        func = getattr(dt, rounder)
+        result = func(freq)
+        assert result == expected
+
     def test_ceil(self):
         dt = Timestamp('20130101 09:10:11')
         result = dt.ceil('D')
@@ -257,7 +276,6 @@ def test_timestamp(self):
         if PY3:
             # datetime.timestamp() converts in the local timezone
             with tm.set_timezone('UTC'):
-
                 # should agree with datetime.timestamp method
                 dt = ts.to_pydatetime()
                 assert dt.timestamp() == ts.timestamp()
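# --------------------------------------------------------------------------
# The invariant test_round_minute_freq (above) pins down, in one line: a
# Timestamp already aligned to the target frequency must be a fixed point of
# round, floor and ceil alike. A minimal sketch, plain pandas, no test
# harness assumed:
import pandas as pd

ts = pd.Timestamp('2018-01-01 00:15:00')
# an aligned timestamp is returned unchanged by all three rounders
assert ts.round('15T') == ts.floor('15T') == ts.ceil('15T') == ts
# --------------------------------------------------------------------------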
diff --git a/pandas/tests/series/indexing/test_alter_index.py b/pandas/tests/series/indexing/test_alter_index.py
index 999ed5f26daee..2fdf198596ce2 100644
--- a/pandas/tests/series/indexing/test_alter_index.py
+++ b/pandas/tests/series/indexing/test_alter_index.py
@@ -463,54 +463,86 @@ def test_rename():
     assert result.name == expected.name


-def test_drop():
-    # unique
-    s = Series([1, 2], index=['one', 'two'])
-    expected = Series([1], index=['one'])
-    result = s.drop(['two'])
-    assert_series_equal(result, expected)
-    result = s.drop('two', axis='rows')
-    assert_series_equal(result, expected)
-
-    # non-unique
-    # GH 5248
-    s = Series([1, 1, 2], index=['one', 'two', 'one'])
-    expected = Series([1, 2], index=['one', 'one'])
-    result = s.drop(['two'], axis=0)
-    assert_series_equal(result, expected)
-    result = s.drop('two')
-    assert_series_equal(result, expected)
-
-    expected = Series([1], index=['two'])
-    result = s.drop(['one'])
-    assert_series_equal(result, expected)
-    result = s.drop('one')
-    assert_series_equal(result, expected)
+@pytest.mark.parametrize(
+    'data, index, drop_labels,'
+    ' axis, expected_data, expected_index',
+    [
+        # Unique Index
+        ([1, 2], ['one', 'two'], ['two'],
+         0, [1], ['one']),
+        ([1, 2], ['one', 'two'], ['two'],
+         'rows', [1], ['one']),
+        ([1, 1, 2], ['one', 'two', 'one'], ['two'],
+         0, [1, 2], ['one', 'one']),
+
+        # GH 5248 Non-Unique Index
+        ([1, 1, 2], ['one', 'two', 'one'], 'two',
+         0, [1, 2], ['one', 'one']),
+        ([1, 1, 2], ['one', 'two', 'one'], ['one'],
+         0, [1], ['two']),
+        ([1, 1, 2], ['one', 'two', 'one'], 'one',
+         0, [1], ['two'])])
+def test_drop_unique_and_non_unique_index(data, index, axis, drop_labels,
+                                          expected_data, expected_index):
+
+    s = Series(data=data, index=index)
+    result = s.drop(drop_labels, axis=axis)
+    expected = Series(data=expected_data, index=expected_index)
+    tm.assert_series_equal(result, expected)

-    # single string/tuple-like
-    s = Series(range(3), index=list('abc'))
-    pytest.raises(KeyError, s.drop, 'bc')
-    pytest.raises(KeyError, s.drop, ('a',))

+@pytest.mark.parametrize(
+    'data, index, drop_labels,'
+    ' axis, error_type, error_desc',
+    [
+        # single string/tuple-like
+        (range(3), list('abc'), 'bc',
+         0, KeyError, 'not found in axis'),
+
+        # bad axis
+        (range(3), list('abc'), ('a',),
+         0, KeyError, 'not found in axis'),
+        (range(3), list('abc'), 'one',
+         'columns', ValueError, 'No axis named columns')])
+def test_drop_exception_raised(data, index, drop_labels,
+                               axis, error_type, error_desc):
+
+    with tm.assert_raises_regex(error_type, error_desc):
+        Series(data, index=index).drop(drop_labels, axis=axis)
+
+
+def test_drop_with_ignore_errors():
     # errors='ignore'
     s = Series(range(3), index=list('abc'))
     result = s.drop('bc', errors='ignore')
-    assert_series_equal(result, s)
+    tm.assert_series_equal(result, s)
     result = s.drop(['a', 'd'], errors='ignore')
     expected = s.iloc[1:]
-    assert_series_equal(result, expected)
-
-    # bad axis
-    pytest.raises(ValueError, s.drop, 'one', axis='columns')
+    tm.assert_series_equal(result, expected)

     # GH 8522
     s = Series([2, 3], index=[True, False])
     assert s.index.is_object()
     result = s.drop(True)
     expected = Series([3], index=[False])
-    assert_series_equal(result, expected)
+    tm.assert_series_equal(result, expected)
+
-    # GH 16877
-    s = Series([2, 3], index=[0, 1])
-    with tm.assert_raises_regex(KeyError, 'not contained in axis'):
-        s.drop([False, True])
+
+@pytest.mark.parametrize('index', [[1, 2, 3], [1, 1, 3]])
+@pytest.mark.parametrize('drop_labels', [[], [1], [3]])
+def test_drop_empty_list(index, drop_labels):
+    # GH 21494
+    expected_index = [i for i in index if i not in drop_labels]
+    series = pd.Series(index=index).drop(drop_labels)
+    tm.assert_series_equal(series, pd.Series(index=expected_index))
+
+
+@pytest.mark.parametrize('data, index, drop_labels', [
+    (None, [1, 2, 3], [1, 4]),
+    (None, [1, 2, 2], [1, 4]),
+    ([2, 3], [0, 1], [False, True])
+])
+def test_drop_non_empty_list(data, index, drop_labels):
+    # GH 21494 and GH 16877
+    with tm.assert_raises_regex(KeyError, 'not found in axis'):
+        pd.Series(data=data, index=index).drop(drop_labels)

diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py
index dce4e82cbdcf1..859082a7e722d 100644
--- a/pandas/tests/series/test_alter_axes.py
+++ b/pandas/tests/series/test_alter_axes.py
@@ -188,6 +188,11 @@ def test_reset_index_level(self):
             with tm.assert_raises_regex(IndexError, 'Too many levels'):
                 s.reset_index(level=[0, 1, 2])

+        # Check that .reset_index([], drop=True) doesn't fail
+        result = pd.Series(range(4)).reset_index([], drop=True)
+        expected = pd.Series(range(4))
+        assert_series_equal(result, expected)
+
     def test_reset_index_range(self):
         # GH 12071
         s = pd.Series(range(2), name='A', dtype='int64')
@@ -275,3 +280,18 @@ def test_set_axis_prior_to_deprecation_signature(self):
         with tm.assert_produces_warning(FutureWarning):
             result = s.set_axis(0, list('abcd'), inplace=False)
         tm.assert_series_equal(result, expected)
+
+    def test_reset_index_drop_errors(self):
+        # GH 20925
+
+        # KeyError raised for series index when passed level name is missing
+        s = pd.Series(range(4))
+        with tm.assert_raises_regex(KeyError, 'must be same as name'):
+            s.reset_index('wrong', drop=True)
+        with tm.assert_raises_regex(KeyError, 'must be same as name'):
+            s.reset_index('wrong')
+
+        # KeyError raised for series when level to be dropped is missing
+        s = pd.Series(range(4), index=pd.MultiIndex.from_product([[1, 2]] * 2))
+        with tm.assert_raises_regex(KeyError, 'not found'):
+            s.reset_index('wrong', drop=True)
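# --------------------------------------------------------------------------
# The first hunk in the next file (GH 19992) asserts that NaN entries in
# list-like clip bounds now propagate instead of being silently ignored. A
# minimal sketch of exactly the behaviour the new assertions pin down:
import numpy as np
import pandas as pd

s = pd.Series([1, 2, 3])
result = s.clip(lower=[0, 4, np.nan])
# under the new behaviour: [1.0, 4.0, NaN] (the NaN bound propagates)
assert result.equals(pd.Series([1, 4, np.nan]))
# --------------------------------------------------------------------------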
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 6ea40329f4bc3..bcf209521f913 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1140,11 +1140,15 @@ def test_clip_with_na_args(self):
         s = Series([1, 2, 3])

         assert_series_equal(s.clip(np.nan), Series([1, 2, 3]))
-        assert_series_equal(s.clip(upper=[1, 1, np.nan]), Series([1, 2, 3]))
-        assert_series_equal(s.clip(lower=[1, np.nan, 1]), Series([1, 2, 3]))
         assert_series_equal(s.clip(upper=np.nan, lower=np.nan),
                             Series([1, 2, 3]))

+        # GH #19992
+        assert_series_equal(s.clip(lower=[0, 4, np.nan]),
+                            Series([1, 4, np.nan]))
+        assert_series_equal(s.clip(upper=[1, np.nan, 1]),
+                            Series([1, np.nan, 1]))
+
     def test_clip_against_series(self):
         # GH #6966

@@ -1866,6 +1870,15 @@ def s_main_dtypes():
     return df


+def assert_check_nselect_boundary(vals, dtype, method):
+    # helper function for 'test_boundary_{dtype}' tests
+    s = Series(vals, dtype=dtype)
+    result = getattr(s, method)(3)
+    expected_idxr = [0, 1, 2] if method == 'nsmallest' else [3, 2, 1]
+    expected = s.loc[expected_idxr]
+    tm.assert_series_equal(result, expected)
+
+
 class TestNLargestNSmallest(object):

     @pytest.mark.parametrize(
@@ -1950,6 +1963,32 @@ def test_n(self, n):
         expected = s.sort_values().head(n)
         assert_series_equal(result, expected)

+    def test_boundary_integer(self, nselect_method, any_int_dtype):
+        # GH 21426
+        dtype_info = np.iinfo(any_int_dtype)
+        min_val, max_val = dtype_info.min, dtype_info.max
+        vals = [min_val, min_val + 1, max_val - 1, max_val]
+        assert_check_nselect_boundary(vals, any_int_dtype, nselect_method)
+
+    def test_boundary_float(self, nselect_method, float_dtype):
+        # GH 21426
+        dtype_info = np.finfo(float_dtype)
+        min_val, max_val = dtype_info.min, dtype_info.max
+        min_2nd, max_2nd = np.nextafter(
+            [min_val, max_val], 0, dtype=float_dtype)
+        vals = [min_val, min_2nd, max_2nd, max_val]
+        assert_check_nselect_boundary(vals, float_dtype, nselect_method)
+
+    @pytest.mark.parametrize('dtype', ['datetime64[ns]', 'timedelta64[ns]'])
+    def test_boundary_datetimelike(self, nselect_method, dtype):
+        # GH 21426
+        # use int64 bounds and +1 to min_val since true minimum is NaT
+        # (include min_val/NaT at end to maintain same expected_idxr)
+        dtype_info = np.iinfo('int64')
+        min_val, max_val = dtype_info.min, dtype_info.max
+        vals = [min_val + 1, min_val + 2, max_val - 1, max_val, min_val]
+        assert_check_nselect_boundary(vals, dtype, nselect_method)
+

 class TestCategoricalSeriesAnalytics(object):

diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index ec0d7296e540e..95836f046195a 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -88,6 +88,46 @@ def test_ser_cmp_result_names(self, names, op):


 class TestTimestampSeriesComparison(object):
+    def test_dt64_ser_cmp_date_warning(self):
+        # https://github.com/pandas-dev/pandas/issues/21359
+        # Remove this test and enable invalid test below
+        ser = pd.Series(pd.date_range('20010101', periods=10), name='dates')
+        date = ser.iloc[0].to_pydatetime().date()
+
+        with tm.assert_produces_warning(FutureWarning) as m:
+            result = ser == date
+        expected = pd.Series([True] + [False] * 9, name='dates')
+        tm.assert_series_equal(result, expected)
+        assert "Comparing Series of datetimes " in str(m[0].message)
+        assert "will not compare equal" in str(m[0].message)
+
+        with tm.assert_produces_warning(FutureWarning) as m:
+            result = ser != date
+        tm.assert_series_equal(result, ~expected)
+        assert "will not compare equal" in str(m[0].message)
+
+        with tm.assert_produces_warning(FutureWarning) as m:
+            result = ser <= date
+        tm.assert_series_equal(result, expected)
+        assert "a TypeError will be raised" in str(m[0].message)
+
+        with tm.assert_produces_warning(FutureWarning) as m:
+            result = ser < date
+        tm.assert_series_equal(result, pd.Series([False] * 10,
name='dates')) + assert "a TypeError will be raised" in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser >= date + tm.assert_series_equal(result, pd.Series([True] * 10, name='dates')) + assert "a TypeError will be raised" in str(m[0].message) + + with tm.assert_produces_warning(FutureWarning) as m: + result = ser > date + tm.assert_series_equal(result, pd.Series([False] + [True] * 9, + name='dates')) + assert "a TypeError will be raised" in str(m[0].message) + + @pytest.mark.skip(reason="GH-21359") def test_dt64ser_cmp_date_invalid(self): # GH#19800 datetime.date comparison raises to # match DatetimeIndex/Timestamp. This also matches the behavior diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 7e59325c32ddc..906d2aacd5586 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -137,6 +137,17 @@ def test_constructor_no_data_index_order(self): result = pd.Series(index=['b', 'a', 'c']) assert result.index.tolist() == ['b', 'a', 'c'] + def test_constructor_dtype_str_na_values(self, string_dtype): + # https://github.com/pandas-dev/pandas/issues/21083 + ser = Series(['x', None], dtype=string_dtype) + result = ser.isna() + expected = Series([False, True]) + tm.assert_series_equal(result, expected) + assert ser.iloc[1] is None + + ser = Series(['x', np.nan], dtype=string_dtype) + assert np.isnan(ser.iloc[1]) + def test_constructor_series(self): index1 = ['d', 'b', 'a', 'c'] index2 = sorted(index1) @@ -164,22 +175,25 @@ def test_constructor_list_like(self): @pytest.mark.parametrize('input_vals', [ ([1, 2]), - ([1.0, 2.0, np.nan]), (['1', '2']), (list(pd.date_range('1/1/2011', periods=2, freq='H'))), (list(pd.date_range('1/1/2011', periods=2, freq='H', tz='US/Eastern'))), ([pd.Interval(left=0, right=5)]), ]) - def test_constructor_list_str(self, input_vals): + def test_constructor_list_str(self, input_vals, string_dtype): # GH 16605 # Ensure that data elements from a list are converted to strings # when dtype is str, 'str', or 'U' + result = Series(input_vals, dtype=string_dtype) + expected = Series(input_vals).astype(string_dtype) + assert_series_equal(result, expected) - for dtype in ['str', str, 'U']: - result = Series(input_vals, dtype=dtype) - expected = Series(input_vals).astype(dtype) - assert_series_equal(result, expected) + def test_constructor_list_str_na(self, string_dtype): + result = Series([1.0, 2.0, np.nan], dtype=string_dtype) + expected = Series(['1.0', '2.0', np.nan], dtype=object) + assert_series_equal(result, expected) + assert np.isnan(result[2]) def test_constructor_generator(self): gen = (i for i in range(10)) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 47798d0ddd7f5..5e924ac5c8894 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -3,6 +3,7 @@ import locale import calendar +import unicodedata import pytest from datetime import datetime, date @@ -13,7 +14,8 @@ from pandas.core.dtypes.common import is_integer_dtype, is_list_like from pandas import (Index, Series, DataFrame, bdate_range, date_range, period_range, timedelta_range, - PeriodIndex, DatetimeIndex, TimedeltaIndex) + PeriodIndex, DatetimeIndex, TimedeltaIndex, + compat) import pandas.core.common as com from pandas.util.testing import assert_series_equal @@ -309,10 +311,24 @@ def test_dt_accessor_datetime_name_accessors(self, time_locale): s = 
Series(DatetimeIndex(freq='M', start='2012', end='2013')) result = s.dt.month_name(locale=time_locale) expected = Series([month.capitalize() for month in expected_months]) + + # work around https://github.com/pandas-dev/pandas/issues/22342 + if not compat.PY2: + result = result.str.normalize("NFD") + expected = expected.str.normalize("NFD") + tm.assert_series_equal(result, expected) + for s_date, expected in zip(s, expected_months): result = s_date.month_name(locale=time_locale) - assert result == expected.capitalize() + expected = expected.capitalize() + + if not compat.PY2: + result = unicodedata.normalize("NFD", result) + expected = unicodedata.normalize("NFD", expected) + + assert result == expected + s = s.append(Series([pd.NaT])) assert np.isnan(s.dt.month_name(locale=time_locale).iloc[-1]) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0b0d4334c86a3..90f37053ce17e 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -11,6 +11,7 @@ from pandas import Series, DataFrame from pandas.compat import StringIO, u +from pandas.io.common import _get_handle from pandas.util.testing import (assert_series_equal, assert_almost_equal, assert_frame_equal, ensure_clean) import pandas.util.testing as tm @@ -138,29 +139,44 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) assert isinstance(csv_str, str) - def test_to_csv_compression(self, compression): - - s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], - name='X') + @pytest.mark.parametrize('s,encoding', [ + (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'], + name='X'), None), + # GH 21241, 21118 + (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'), + (Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'), + (Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737') + ]) + def test_to_csv_compression(self, s, encoding, compression): with ensure_clean() as filename: - s.to_csv(filename, compression=compression, header=True) - + s.to_csv(filename, compression=compression, encoding=encoding, + header=True) # test the round trip - to_csv -> read_csv - rs = pd.read_csv(filename, compression=compression, - index_col=0, squeeze=True) - assert_series_equal(s, rs) + result = pd.read_csv(filename, compression=compression, + encoding=encoding, index_col=0, squeeze=True) + assert_series_equal(s, result) + + # test the round trip using file handle - to_csv -> read_csv + f, _handles = _get_handle(filename, 'w', compression=compression, + encoding=encoding) + with f: + s.to_csv(f, encoding=encoding, header=True) + result = pd.read_csv(filename, compression=compression, + encoding=encoding, index_col=0, squeeze=True) + assert_series_equal(s, result) # explicitly ensure file was compressed with tm.decompress_file(filename, compression) as fh: - text = fh.read().decode('utf8') + text = fh.read().decode(encoding or 'utf8') assert s.name in text with tm.decompress_file(filename, compression) as fh: assert_series_equal(s, pd.read_csv(fh, index_col=0, - squeeze=True)) + squeeze=True, + encoding=encoding)) class TestSeriesIO(TestData): diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 97236f028b1c4..730c2b7865f1f 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -11,6 +11,7 @@ from pandas import (Index, Series, DataFrame, date_range, option_context, Categorical, period_range, timedelta_range) from pandas.core.index import MultiIndex +from pandas.core.base import 
StringMixin from pandas.compat import lrange, range, u from pandas import compat @@ -202,6 +203,35 @@ def test_latex_repr(self): class TestCategoricalRepr(object): + def test_categorical_repr_unicode(self): + # GH#21002 if len(index) > 60, sys.getdefaultencoding()=='ascii', + # and we are working in PY2, then rendering a Categorical could raise + # UnicodeDecodeError by trying to decode when it shouldn't + + class County(StringMixin): + name = u'San Sebastián' + state = u'PR' + + def __unicode__(self): + return self.name + u', ' + self.state + + cat = pd.Categorical([County() for n in range(61)]) + idx = pd.Index(cat) + ser = idx.to_series() + + if compat.PY3: + # no reloading of sys, just check that the default (utf8) works + # as expected + repr(ser) + str(ser) + + else: + # set sys.defaultencoding to ascii, then change it back after + # the test + with tm.set_defaultencoding('ascii'): + repr(ser) + str(ser) + def test_categorical_repr(self): a = Series(Categorical([1, 2, 3, 4])) exp = u("0 1\n1 2\n2 3\n3 4\n" + diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py index 01b4ea6eaa238..13e0d1b12c372 100644 --- a/pandas/tests/series/test_sorting.py +++ b/pandas/tests/series/test_sorting.py @@ -141,19 +141,20 @@ def test_sort_index_inplace(self): assert result is None tm.assert_series_equal(random_order, self.ts) - def test_sort_index_multiindex(self): + @pytest.mark.parametrize("level", ['A', 0]) # GH 21052 + def test_sort_index_multiindex(self, level): mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) s = Series([1, 2], mi) backwards = s.iloc[[1, 0]] # implicit sort_remaining=True - res = s.sort_index(level='A') + res = s.sort_index(level=level) assert_series_equal(backwards, res) # GH13496 - # rows share same level='A': sort has no effect without remaining lvls - res = s.sort_index(level='A', sort_remaining=False) + # sort has no effect without remaining lvls + res = s.sort_index(level=level, sort_remaining=False) assert_series_equal(s, res) def test_sort_index_kind(self): diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index 6c0c83cf65ff7..b3330f866ba1f 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -454,6 +454,17 @@ def test_values_asarray(self): assert_almost_equal(self.arr.to_dense(), self.arr_data) assert_almost_equal(self.arr.sp_values, np.asarray(self.arr)) + @pytest.mark.parametrize('data,shape,dtype', [ + ([0, 0, 0, 0, 0], (5,), None), + ([], (0,), None), + ([0], (1,), None), + (['A', 'A', np.nan, 'B'], (4,), np.object) + ]) + def test_shape(self, data, shape, dtype): + # GH 21126 + out = SparseArray(data, dtype=dtype) + assert out.shape == shape + def test_to_dense(self): vals = np.array([1, np.nan, np.nan, 3, np.nan]) res = SparseArray(vals).to_dense() diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 0b329f64dafa3..576239e49455e 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- import pytest +import os import collections from functools import partial import numpy as np -from pandas import Series, Timestamp +from pandas import Series, DataFrame, Timestamp from pandas.compat import range, lmap import pandas.core.common as com from pandas.core import ops +from pandas.io.common import _get_handle import pandas.util.testing as tm @@ -222,3 +224,59 @@ def test_standardize_mapping(): dd = collections.defaultdict(list) assert 
isinstance(com.standardize_mapping(dd), partial) + + +@pytest.mark.parametrize('obj', [ + DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']), + Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) +@pytest.mark.parametrize('method', ['to_pickle', 'to_json', 'to_csv']) +def test_compression_size(obj, method, compression): + if not compression: + pytest.skip("only test compression case.") + + with tm.ensure_clean() as filename: + getattr(obj, method)(filename, compression=compression) + compressed = os.path.getsize(filename) + getattr(obj, method)(filename, compression=None) + uncompressed = os.path.getsize(filename) + assert uncompressed > compressed + + +@pytest.mark.parametrize('obj', [ + DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']), + Series(100 * [0.123456, 0.234567, 0.567567], name='X')]) +@pytest.mark.parametrize('method', ['to_csv', 'to_json']) +def test_compression_size_fh(obj, method, compression_only): + + with tm.ensure_clean() as filename: + f, _handles = _get_handle(filename, 'w', compression=compression_only) + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed + compressed = os.path.getsize(filename) + with tm.ensure_clean() as filename: + f, _handles = _get_handle(filename, 'w', compression=None) + with f: + getattr(obj, method)(f) + assert not f.closed + assert f.closed + uncompressed = os.path.getsize(filename) + assert uncompressed > compressed + + +# GH 21227 +def test_compression_warning(compression_only): + df = DataFrame(100 * [[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + columns=['X', 'Y', 'Z']) + with tm.ensure_clean() as filename: + f, _handles = _get_handle(filename, 'w', compression=compression_only) + with tm.assert_produces_warning(RuntimeWarning, + check_stacklevel=False): + with f: + df.to_csv(f, compression=compression_only) diff --git a/pandas/tests/test_compat.py b/pandas/tests/test_compat.py index ead9ba1e26e2d..79d3aad493182 100644 --- a/pandas/tests/test_compat.py +++ b/pandas/tests/test_compat.py @@ -4,9 +4,10 @@ """ import pytest +import re from pandas.compat import (range, zip, map, filter, lrange, lzip, lmap, lfilter, builtins, iterkeys, itervalues, iteritems, - next, get_range_parameters, PY2) + next, get_range_parameters, PY2, re_type) class TestBuiltinIterators(object): @@ -89,3 +90,7 @@ def test_get_range_parameters(self, start, stop, step): assert start_result == start_expected assert stop_result == stop_expected assert step_result == step_expected + + +def test_re_type(): + assert isinstance(re.compile(''), re_type) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index a595d9f18d6b8..cf98cff97669a 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -2,6 +2,9 @@ """ Testing that we work in the downstream packages """ +import subprocess +import sys + import pytest import numpy as np # noqa from pandas import DataFrame @@ -53,6 +56,11 @@ def test_xarray(df): assert df.to_xarray() is not None +def test_oo_optimizable(): + # GH 21071 + subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"]) + + @tm.network def test_statsmodels(): @@ -87,6 +95,7 @@ def test_pandas_gbq(df): pandas_gbq = import_module('pandas_gbq') # noqa +@pytest.mark.xfail(reason="0.7.0 pending") @tm.network def test_pandas_datareader(): diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 
7973b27601237..128ab0572ba55 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -2717,3 +2717,10 @@ def test_panel_index(): np.repeat([1, 2, 3], 4)], names=['time', 'panel']) tm.assert_index_equal(index, expected) + + +def test_panel_np_all(): + with catch_warnings(record=True): + wp = Panel({"A": DataFrame({'b': [1, 2]})}) + result = np.all(wp) + assert result == np.bool_(True) diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index c1257cce9a9a4..bcc50a25623a1 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -2870,6 +2870,16 @@ def test_asfreq_bug(self): freq='1T')) assert_frame_equal(result, expected) + def test_resample_with_nat(self): + # GH 13223 + index = pd.to_timedelta(['0s', pd.NaT, '2s']) + result = DataFrame({'value': [2, 3, 5]}, index).resample('1s').mean() + expected = DataFrame({'value': [2.5, np.nan, 5.0]}, + index=timedelta_range('0 day', + periods=3, + freq='1S')) + assert_frame_equal(result, expected) + class TestResamplerGrouper(object): diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index d8e90ae0e1b35..cfd88f41f855e 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -41,7 +41,7 @@ def win_types(request): return request.param -@pytest.fixture(params=['kaiser', 'gaussian', 'general_gaussian', 'slepian']) +@pytest.fixture(params=['kaiser', 'gaussian', 'general_gaussian']) def win_types_special(request): return request.param @@ -389,8 +389,8 @@ def test_constructor(self, which): c(window=2, min_periods=1, center=False) # GH 13383 - c(0) with pytest.raises(ValueError): + c(0) c(-1) # not valid @@ -409,7 +409,6 @@ def test_constructor_with_win_type(self, which): # GH 13383 o = getattr(self, which) c = o.rolling - c(0, win_type='boxcar') with pytest.raises(ValueError): c(-1, win_type='boxcar') @@ -1079,8 +1078,7 @@ def test_cmov_window_special(self, win_types_special): kwds = { 'kaiser': {'beta': 1.}, 'gaussian': {'std': 1.}, - 'general_gaussian': {'power': 2., 'width': 2.}, - 'slepian': {'width': 0.5}} + 'general_gaussian': {'power': 2., 'width': 2.}} vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48]) @@ -1090,8 +1088,6 @@ def test_cmov_window_special(self, win_types_special): 13.65671, 12.01002, np.nan, np.nan], 'general_gaussian': [np.nan, np.nan, 9.85011, 10.71589, 11.73161, 13.08516, 12.95111, 12.74577, np.nan, np.nan], - 'slepian': [np.nan, np.nan, 9.81073, 10.89359, 11.70284, 12.88331, - 12.96079, 12.77008, np.nan, np.nan], 'kaiser': [np.nan, np.nan, 9.86851, 11.02969, 11.65161, 12.75129, 12.90702, 12.83757, np.nan, np.nan] } diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 5369b1a94a956..00701ca2be946 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,4 +1,3 @@ -import os from distutils.version import LooseVersion from datetime import date, datetime, timedelta @@ -455,14 +454,15 @@ def test_add(self, offset_types, tz): assert isinstance(result, Timestamp) assert result == expected_localize - def test_pickle_v0_15_2(self): + def test_pickle_v0_15_2(self, datapath): offsets = {'DateOffset': DateOffset(years=1), 'MonthBegin': MonthBegin(1), 'Day': Day(1), 'YearBegin': YearBegin(1), 'Week': Week(1)} - pickle_path = os.path.join(tm.get_data_path(), - 'dateoffset_0_15_2.pickle') + + pickle_path = datapath('tseries', 'offsets', 'data', + 'dateoffset_0_15_2.pickle') # This code was 
diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py
index 5369b1a94a956..00701ca2be946 100644
--- a/pandas/tests/tseries/offsets/test_offsets.py
+++ b/pandas/tests/tseries/offsets/test_offsets.py
@@ -1,4 +1,3 @@
-import os
 from distutils.version import LooseVersion
 
 from datetime import date, datetime, timedelta
@@ -455,14 +454,15 @@ def test_add(self, offset_types, tz):
         assert isinstance(result, Timestamp)
         assert result == expected_localize
 
-    def test_pickle_v0_15_2(self):
+    def test_pickle_v0_15_2(self, datapath):
         offsets = {'DateOffset': DateOffset(years=1),
                    'MonthBegin': MonthBegin(1),
                    'Day': Day(1),
                    'YearBegin': YearBegin(1),
                    'Week': Week(1)}
-        pickle_path = os.path.join(tm.get_data_path(),
-                                   'dateoffset_0_15_2.pickle')
+
+        pickle_path = datapath('tseries', 'offsets', 'data',
+                               'dateoffset_0_15_2.pickle')
         # This code was executed once on v0.15.2 to generate the pickle:
         # with open(pickle_path, 'wb') as f: pickle.dump(offsets, f)
         #
@@ -528,7 +528,10 @@ def test_repr(self):
         assert repr(self.offset) == '<BusinessDay>'
         assert repr(self.offset2) == '<2 * BusinessDays>'
 
-        expected = '<BusinessDay: offset=datetime.timedelta(1)>'
+        if compat.PY37:
+            expected = '<BusinessDay: offset=datetime.timedelta(days=1)>'
+        else:
+            expected = '<BusinessDay: offset=datetime.timedelta(1)>'
         assert repr(self.offset + timedelta(1)) == expected
 
     def test_with_offset(self):
@@ -1642,7 +1645,10 @@ def test_repr(self):
         assert repr(self.offset) == '<CustomBusinessDay>'
         assert repr(self.offset2) == '<2 * CustomBusinessDays>'
 
-        expected = '<CustomBusinessDay: offset=datetime.timedelta(1)>'
+        if compat.PY37:
+            expected = '<CustomBusinessDay: offset=datetime.timedelta(days=1)>'
+        else:
+            expected = '<CustomBusinessDay: offset=datetime.timedelta(1)>'
         assert repr(self.offset + timedelta(1)) == expected
 
     def test_with_offset(self):
@@ -1848,12 +1854,10 @@ def _check_roundtrip(obj):
         _check_roundtrip(self.offset2)
         _check_roundtrip(self.offset * 2)
 
-    def test_pickle_compat_0_14_1(self):
+    def test_pickle_compat_0_14_1(self, datapath):
         hdays = [datetime(2013, 1, 1) for ele in range(4)]
-
-        pth = tm.get_data_path()
-
-        cday0_14_1 = read_pickle(os.path.join(pth, 'cday-0.14.1.pickle'))
+        pth = datapath('tseries', 'offsets', 'data', 'cday-0.14.1.pickle')
+        cday0_14_1 = read_pickle(pth)
         cday = CDay(holidays=hdays)
         assert cday == cday0_14_1
diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py
index d6f58d16bcf64..4d34987e14f75 100644
--- a/pandas/tests/util/test_testing.py
+++ b/pandas/tests/util/test_testing.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import os
 import pandas as pd
 import pytest
 import numpy as np
@@ -503,6 +504,25 @@ def test_index_equal_metadata_message(self):
         with tm.assert_raises_regex(AssertionError, expected):
             assert_index_equal(idx1, idx2)
 
+    def test_categorical_index_equality(self):
+        expected = """Index are different
+
+Attribute "dtype" are different
+\\[left\\]:  CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\)
+\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \
+ordered=False\\)"""
+
+        with tm.assert_raises_regex(AssertionError, expected):
+            assert_index_equal(pd.Index(pd.Categorical(['a', 'b'])),
+                               pd.Index(pd.Categorical(['a', 'b'],
+                                        categories=['a', 'b', 'c'])))
+
+    def test_categorical_index_equality_relax_categories_check(self):
+        assert_index_equal(pd.Index(pd.Categorical(['a', 'b'])),
+                           pd.Index(pd.Categorical(['a', 'b'],
+                                    categories=['a', 'b', 'c'])),
+                           check_categorical=False)
+
 
 class TestAssertSeriesEqual(object):
 
@@ -600,6 +620,25 @@ def test_series_equal_message(self):
             assert_series_equal(pd.Series([1, 2, 3]), pd.Series([1, 2, 4]),
                                 check_less_precise=True)
 
+    def test_categorical_series_equality(self):
+        expected = """Attributes are different
+
+Attribute "dtype" are different
+\\[left\\]:  CategoricalDtype\\(categories=\\[u?'a', u?'b'\\], ordered=False\\)
+\\[right\\]: CategoricalDtype\\(categories=\\[u?'a', u?'b', u?'c'\\], \
+ordered=False\\)"""
+
+        with tm.assert_raises_regex(AssertionError, expected):
+            assert_series_equal(pd.Series(pd.Categorical(['a', 'b'])),
+                                pd.Series(pd.Categorical(['a', 'b'],
+                                          categories=['a', 'b', 'c'])))
+
+    def test_categorical_series_equality_relax_categories_check(self):
+        assert_series_equal(pd.Series(pd.Categorical(['a', 'b'])),
+                            pd.Series(pd.Categorical(['a', 'b'],
+                                      categories=['a', 'b', 'c'])),
+                            check_categorical=False)
+
 
 class TestAssertFrameEqual(object):
 
@@ -803,3 +842,15 @@ def test_locale(self):
     # GH9744
     locales = tm.get_locales()
     assert len(locales) >= 1
+
+
+def test_datapath_missing(datapath, request):
+    if not request.config.getoption("--strict-data-files"):
+        pytest.skip("Need to set '--strict-data-files'")
+
+    with pytest.raises(ValueError):
+        datapath('not_a_file')
+
+    result = datapath('data', 'iris.csv')
+    expected = os.path.join('pandas', 'tests', 'data', 'iris.csv')
+    assert result == expected
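`test_datapath_missing` exercises the new `datapath` fixture, which lives in `pandas/tests/conftest.py` and is therefore not visible in this diff. A plausible sketch consistent with the tests above: the fixture joins its arguments under `pandas/tests` and honours the `--strict-data-files` option (the exact implementation may differ):

```python
import os
import pytest


@pytest.fixture
def datapath(request):
    """Resolve a data-file path relative to pandas/tests."""
    def deco(*args):
        path = os.path.join('pandas', 'tests', *args)
        if not os.path.exists(path):
            if request.config.getoption("--strict-data-files"):
                raise ValueError("Could not find file {}.".format(path))
            # without the strict option, a missing file only skips the test
            pytest.skip("Could not find {}.".format(path))
        return path
    return deco
```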
diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py
index 145be7f85b193..c049dfc874940 100644
--- a/pandas/tests/util/test_util.py
+++ b/pandas/tests/util/test_util.py
@@ -433,6 +433,26 @@ def teardown_class(cls):
         del cls.locales
         del cls.current_locale
 
+    def test_can_set_locale_valid_set(self):
+        # Setting the default locale should return True
+        assert tm.can_set_locale('') is True
+
+    def test_can_set_locale_invalid_set(self):
+        # Setting an invalid locale should return False
+        assert tm.can_set_locale('non-existent_locale') is False
+
+    def test_can_set_locale_invalid_get(self, monkeypatch):
+        # In some cases, an invalid locale can be set,
+        # but a subsequent getlocale() raises a ValueError
+        # See GH 22129
+
+        def mockgetlocale():
+            raise ValueError()
+
+        with monkeypatch.context() as m:
+            m.setattr(locale, 'getlocale', mockgetlocale)
+            assert tm.can_set_locale('') is False
+
     def test_get_locales(self):
         # all systems should have at least a single locale
         assert len(tm.get_locales()) > 0
@@ -466,7 +486,7 @@ def test_set_locale(self):
             enc = codecs.lookup(enc).name
             new_locale = lang, enc
 
-            if not tm._can_set_locale(new_locale):
+            if not tm.can_set_locale(new_locale):
                 with pytest.raises(locale.Error):
                     with tm.set_locale(new_locale):
                         pass
diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
index 749165f894819..c294110d89ec5 100644
--- a/pandas/tseries/offsets.py
+++ b/pandas/tseries/offsets.py
@@ -1090,12 +1090,17 @@ def apply(self, other):
 
 
 class CustomBusinessMonthEnd(_CustomBusinessMonth):
-    __doc__ = _CustomBusinessMonth.__doc__.replace('[BEGIN/END]', 'end')
+    # TODO(py27): Replace condition with Substitution after dropping Py27
+    if _CustomBusinessMonth.__doc__:
+        __doc__ = _CustomBusinessMonth.__doc__.replace('[BEGIN/END]', 'end')
    _prefix = 'CBM'
 
 
 class CustomBusinessMonthBegin(_CustomBusinessMonth):
-    __doc__ = _CustomBusinessMonth.__doc__.replace('[BEGIN/END]', 'beginning')
+    # TODO(py27): Replace condition with Substitution after dropping Py27
+    if _CustomBusinessMonth.__doc__:
+        __doc__ = _CustomBusinessMonth.__doc__.replace('[BEGIN/END]',
+                                                       'beginning')
     _prefix = 'CBMS'
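The `if _CustomBusinessMonth.__doc__:` guards exist because `python -OO` strips docstrings, leaving `__doc__` as `None`; the previous unconditional `.replace(...)` then raised `AttributeError` at import time, which is exactly the failure mode `test_oo_optimizable` (GH 21071) protects against. A self-contained illustration with made-up class names:

```python
class _Month(object):
    """DateOffset for the [BEGIN/END] of the month."""


class MonthEnd(_Month):
    # Under ``python -OO`` _Month.__doc__ is None, so calling .replace()
    # unconditionally would break the import of the whole module.
    if _Month.__doc__:
        __doc__ = _Month.__doc__.replace('[BEGIN/END]', 'end')
```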
diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py
index 624fbbbd4f05e..6b55554cdc941 100644
--- a/pandas/util/_decorators.py
+++ b/pandas/util/_decorators.py
@@ -4,7 +4,7 @@
 import types
 import warnings
 from textwrap import dedent, wrap
-from functools import wraps, update_wrapper
+from functools import wraps, update_wrapper, WRAPPER_ASSIGNMENTS
 
 
 def deprecate(name, alternative, version, alt_name=None,
@@ -20,18 +20,18 @@ def deprecate(name, alternative, version, alt_name=None,
     Parameters
     ----------
     name : str
-        Name of function to deprecate
-    alternative : str
-        Name of function to use instead
+        Name of function to deprecate.
+    alternative : func
+        Function to use instead.
     version : str
-        Version of pandas in which the method has been deprecated
+        Version of pandas in which the method has been deprecated.
     alt_name : str, optional
-        Name to use in preference of alternative.__name__
+        Name to use in preference of alternative.__name__.
     klass : Warning, default FutureWarning
     stacklevel : int, default 2
     msg : str
-          The message to display in the warning.
-          Default is '{name} is deprecated. Use {alt_name} instead.'
+        The message to display in the warning.
+        Default is '{name} is deprecated. Use {alt_name} instead.'
     """
     alt_name = alt_name or alternative.__name__
@@ -39,25 +39,26 @@ def deprecate(name, alternative, version, alt_name=None,
     warning_msg = msg or '{} is deprecated, use {} instead'.format(name,
                                                                    alt_name)
 
-    @wraps(alternative)
+    # adding deprecated directive to the docstring
+    msg = msg or 'Use `{alt_name}` instead.'.format(alt_name=alt_name)
+    msg = '\n    '.join(wrap(msg, 70))
+
+    @Substitution(version=version, msg=msg)
+    @Appender(alternative.__doc__)
     def wrapper(*args, **kwargs):
+        """
+        .. deprecated:: %(version)s
+
+            %(msg)s
+
+        """
         warnings.warn(warning_msg, klass, stacklevel=stacklevel)
         return alternative(*args, **kwargs)
 
-    # adding deprecated directive to the docstring
-    msg = msg or 'Use `{alt_name}` instead.'.format(alt_name=alt_name)
-    tpl = dedent("""
-    .. deprecated:: {version}
-
-        {msg}
-
-    {rest}
-    """)
-    rest = getattr(wrapper, '__doc__', '')
-    docstring = tpl.format(version=version,
-                           msg='\n    '.join(wrap(msg, 70)),
-                           rest=dedent(rest))
-    wrapper.__doc__ = docstring
+    # Since we are using Substitution to create the required docstring,
+    # remove that from the attributes that should be assigned to the wrapper
+    assignments = tuple(x for x in WRAPPER_ASSIGNMENTS if x != '__doc__')
+    update_wrapper(wrapper, alternative, assigned=assignments)
 
     return wrapper
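A sketch of what the reworked `deprecate` produces for a hypothetical pair of functions: the wrapper's docstring opens with a `.. deprecated::` directive and then appends the replacement's docstring, while every wrapper attribute except `__doc__` is copied from the replacement. Because nothing assigns to `__doc__` by hand, the construction also degrades gracefully when docstrings are stripped:

```python
from pandas.util._decorators import deprecate


def new_func(x):
    """Return x doubled."""
    return 2 * x


old_func = deprecate('old_func', new_func, '0.23.0')

old_func(3)        # FutureWarning: old_func is deprecated, use new_func instead
old_func.__name__  # 'new_func', copied over by update_wrapper
# old_func.__doc__ starts with a ".. deprecated:: 0.23.0" directive,
# followed by new_func's own docstring.
```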
diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py
index 89d90258f58e0..c6ab24403d58d 100644
--- a/pandas/util/_test_decorators.py
+++ b/pandas/util/_test_decorators.py
@@ -23,13 +23,13 @@ def test_foo():
 For more information, refer to the ``pytest`` documentation on ``skipif``.
 """
-
 import pytest
 import locale
 from distutils.version import LooseVersion
 
 from pandas.compat import (is_platform_windows, is_platform_32bit, PY3,
                            import_lzma)
+from pandas.compat.numpy import _np_version_under1p15
 from pandas.core.computation.expressions import (_USE_NUMEXPR,
                                                  _NUMEXPR_INSTALLED)
@@ -160,6 +160,9 @@ def decorated_func(func):
 
 skip_if_no_mpl = pytest.mark.skipif(_skip_if_no_mpl(),
                                     reason="Missing matplotlib dependency")
+
+skip_if_np_lt_115 = pytest.mark.skipif(_np_version_under1p15,
+                                       reason="NumPy 1.15 or greater required")
 skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(),
                                  reason="matplotlib is present")
 skip_if_mpl_1_5 = pytest.mark.skipif(_skip_if_mpl_1_5(),
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index e1484a9c1b390..bb79c25126fab 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -6,7 +6,6 @@
 import sys
 import tempfile
 import warnings
-import inspect
 import os
 import subprocess
 import locale
@@ -479,6 +478,8 @@ def set_locale(new_locale, lc_var=locale.LC_ALL):
         A string of the form <language_country>.<encoding>. For example to set
         the current locale to US English with a UTF8 encoding, you would pass
         "en_US.UTF-8".
+    lc_var : int, default `locale.LC_ALL`
+        The category of the locale being set.
 
     Notes
     -----
@@ -490,37 +491,37 @@ def set_locale(new_locale, lc_var=locale.LC_ALL):
 
     try:
         locale.setlocale(lc_var, new_locale)
-
-        try:
-            normalized_locale = locale.getlocale()
-        except ValueError:
-            yield new_locale
+        normalized_locale = locale.getlocale()
+        if com._all_not_none(*normalized_locale):
+            yield '.'.join(normalized_locale)
         else:
-            if com._all_not_none(*normalized_locale):
-                yield '.'.join(normalized_locale)
-            else:
-                yield new_locale
+            yield new_locale
     finally:
         locale.setlocale(lc_var, current_locale)
 
 
-def _can_set_locale(lc):
-    """Check to see if we can set a locale without throwing an exception.
+def can_set_locale(lc, lc_var=locale.LC_ALL):
+    """
+    Check to see if we can set a locale, and subsequently get the locale,
+    without raising an Exception.
 
     Parameters
     ----------
     lc : str
         The locale to attempt to set.
+    lc_var : int, default `locale.LC_ALL`
+        The category of the locale being set.
 
     Returns
     -------
-    isvalid : bool
+    is_valid : bool
         Whether the passed locale can be set
     """
+
     try:
-        with set_locale(lc):
+        with set_locale(lc, lc_var=lc_var):
             pass
-    except locale.Error:  # horrible name for an Exception subclass
+    except (ValueError,
+            locale.Error):  # horrible name for an Exception subclass
         return False
     else:
         return True
@@ -547,12 +548,34 @@ def _valid_locales(locales, normalize):
     else:
         normalizer = lambda x: x.strip()
 
-    return list(filter(_can_set_locale, map(normalizer, locales)))
+    return list(filter(can_set_locale, map(normalizer, locales)))
 
 
 # -----------------------------------------------------------------------------
 # Stdout / stderr decorators
 
+
+@contextmanager
+def set_defaultencoding(encoding):
+    """
+    Set default encoding (as given by sys.getdefaultencoding()) to the given
+    encoding; restore on exit.
+
+    Parameters
+    ----------
+    encoding : str
+    """
+    if not PY2:
+        raise ValueError("set_defaultencoding context is only available "
+                         "in Python 2.")
+    orig = sys.getdefaultencoding()
+    reload(sys)  # noqa:F821
+    sys.setdefaultencoding(encoding)
+    try:
+        yield
+    finally:
+        sys.setdefaultencoding(orig)
+
+
 def capture_stdout(f):
     """
     Decorator to capture stdout in a buffer so that it can be checked
@@ -729,15 +752,6 @@ def ensure_clean(filename=None, return_filelike=False):
             print("Exception on removing file: {error}".format(error=e))
 
 
-def get_data_path(f=''):
-    """Return the path of a data file, these are relative to the current test
-    directory.
-    """
-    # get our callers file
-    _, filename, _, _, _, _ = inspect.getouterframes(inspect.currentframe())[1]
-    base_dir = os.path.abspath(os.path.dirname(filename))
-    return os.path.join(base_dir, 'data', f)
-
 # -----------------------------------------------------------------------------
 # Comparators
 
@@ -778,8 +792,12 @@ def assert_index_equal(left, right, exact='equiv', check_names=True,
 
     def _check_types(l, r, obj='Index'):
         if exact:
-            assert_class_equal(left, right, exact=exact, obj=obj)
-        assert_attr_equal('dtype', l, r, obj=obj)
+            assert_class_equal(l, r, exact=exact, obj=obj)
+
+            # Skip exact dtype checking when `check_categorical` is False
+            if check_categorical:
+                assert_attr_equal('dtype', l, r, obj=obj)
+
         # allow string-like to have different inferred_types
         if l.inferred_type in ('string', 'unicode'):
             assert r.inferred_type in ('string', 'unicode')
@@ -829,7 +847,8 @@ def _get_ilevel_values(index, level):
             # get_level_values may change dtype
             _check_types(left.levels[level], right.levels[level], obj=obj)
 
-    if check_exact:
+    # skip exact index checking when `check_categorical` is False
+    if check_exact and check_categorical:
         if not left.equals(right):
            diff = np.sum((left.values != right.values)
                          .astype(int)) * 100.0 / len(left)
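The `check_categorical` plumbing above is exactly what the new tests in `test_testing.py` pin down: with `check_categorical=False`, two categoricals holding the same values but declaring different category sets compare equal. For example:

```python
import pandas as pd
import pandas.util.testing as tm

left = pd.Series(pd.Categorical(['a', 'b']))
right = pd.Series(pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']))

tm.assert_series_equal(left, right, check_categorical=False)  # passes
tm.assert_series_equal(left, right)  # AssertionError: "dtype" are different
```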
@@ -950,23 +969,23 @@ def is_sorted(seq):
 
 
 def assert_categorical_equal(left, right, check_dtype=True,
-                             obj='Categorical', check_category_order=True):
+                             check_category_order=True, obj='Categorical'):
     """Test that Categoricals are equivalent.
 
     Parameters
     ----------
-    left, right : Categorical
-        Categoricals to compare
+    left : Categorical
+    right : Categorical
     check_dtype : bool, default True
         Check that integer dtype of the codes are the same
-    obj : str, default 'Categorical'
-        Specify object name being compared, internally used to show
-        appropriate assertion message
     check_category_order : bool, default True
         Whether the order of the categories should be compared, which
         implies identical integer codes.  If False, only the resulting
         values are compared.  The ordered attribute is checked
         regardless.
+    obj : str, default 'Categorical'
+        Specify object name being compared, internally used to show
+        appropriate assertion message
     """
     _check_isinstance(left, right, Categorical)
 
@@ -1020,7 +1039,7 @@ def raise_assert_detail(obj, message, left, right, diff=None):
 
 def assert_numpy_array_equal(left, right, strict_nan=False,
                              check_dtype=True, err_msg=None,
-                             obj='numpy array', check_same=None):
+                             check_same=None, obj='numpy array'):
     """ Checks that 'np.ndarray' is equivalent
 
     Parameters
@@ -1033,11 +1052,11 @@ def assert_numpy_array_equal(left, right, strict_nan=False,
         check dtype if both a and b are np.ndarray
     err_msg : str, default None
         If provided, used as assertion message
+    check_same : None|'copy'|'same', default None
+        Ensure left and right refer/do not refer to the same memory area
     obj : str, default 'numpy array'
         Specify object name being compared, internally used to show
         appropriate assertion message
-    check_same : None|'copy'|'same', default None
-        Ensure left and right refer/do not refer to the same memory area
     """
 
     # instance validation
diff --git a/setup.cfg b/setup.cfg
index 6d9657737a8bd..9ec967c25e225 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -32,4 +32,5 @@ markers =
     slow: mark a test as slow
     network: mark a test as network
     high_memory: mark a test as a high-memory only
-doctest_optionflags= NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL
+addopts = --strict-data-files
+doctest_optionflags= NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 6febe674fb2a1..5d6bbbcf7b862 100755
--- a/setup.py
+++ b/setup.py
@@ -217,6 +217,7 @@ def build_extensions(self):
     'Programming Language :: Python :: 2.7',
     'Programming Language :: Python :: 3.5',
     'Programming Language :: Python :: 3.6',
+    'Programming Language :: Python :: 3.7',
     'Programming Language :: Cython',
     'Topic :: Scientific/Engineering']
 
@@ -453,10 +454,10 @@ def pxd(name):
     return pjoin('pandas', name + '.pxd')
 
 
-# args to ignore warnings
 if is_platform_windows():
     extra_compile_args = []
 else:
+    # args to ignore warnings
     extra_compile_args = ['-Wno-unused-function']
 
 lib_depends = lib_depends + ['pandas/_libs/src/numpy_helper.h',
@@ -733,11 +734,7 @@ def pxd(name):
       maintainer=AUTHOR,
       version=versioneer.get_version(),
       packages=find_packages(include=['pandas', 'pandas.*']),
-      package_data={'': ['data/*', 'templates/*'],
-                    'pandas.tests.io': ['data/legacy_hdf/*.h5',
-                                        'data/legacy_pickle/*/*.pickle',
-                                        'data/legacy_msgpack/*/*.msgpack',
-                                        'data/html_encoding/*.html']},
+      package_data={'': ['templates/*', '_libs/*.dll']},
       ext_modules=extensions,
       maintainer_email=EMAIL,
       description=DESCRIPTION,