From 88bc2b72bc5dfa20950fb7633cd1ce9566bae252 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Thu, 22 Nov 2018 16:40:32 +0100 Subject: [PATCH 01/21] change pandas to pandas-dev --- ci/deps/azure-27-compat.yaml | 2 +- ci/deps/azure-36-locale_slow.yaml | 2 +- ci/deps/azure-37-locale.yaml | 2 +- ci/deps/azure-macos-35.yaml | 2 +- ci/deps/azure-windows-27.yaml | 2 +- ci/deps/azure-windows-36.yaml | 2 +- ci/deps/circle-36-locale.yaml | 2 +- ci/deps/travis-27-locale.yaml | 2 +- ci/deps/travis-27.yaml | 2 +- ci/deps/travis-36-doc.yaml | 2 +- ci/deps/travis-36-slow.yaml | 2 +- ci/deps/travis-36.yaml | 2 +- ci/deps/travis-37-numpydev.yaml | 2 +- ci/deps/travis-37.yaml | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ci/deps/azure-27-compat.yaml b/ci/deps/azure-27-compat.yaml index 44c561e9c8911..f3cc615c35243 100644 --- a/ci/deps/azure-27-compat.yaml +++ b/ci/deps/azure-27-compat.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 7e40bd1a9979e..4bbc6a2c11f1e 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 59c8818eaef1e..2b38465c04512 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 6ccdc79d11b27..7a0c3b81ac8f9 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults dependencies: diff --git a/ci/deps/azure-windows-27.yaml b/ci/deps/azure-windows-27.yaml index dc68129a5e6d3..b1533b071fa74 100644 --- a/ci/deps/azure-windows-27.yaml +++ b/ci/deps/azure-windows-27.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index af42545af7971..817aab66c65aa 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/circle-36-locale.yaml b/ci/deps/circle-36-locale.yaml index 59c8818eaef1e..2b38465c04512 100644 --- a/ci/deps/circle-36-locale.yaml +++ b/ci/deps/circle-36-locale.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-27-locale.yaml b/ci/deps/travis-27-locale.yaml index c8d17cf190e35..0846ef5e8264e 100644 --- a/ci/deps/travis-27-locale.yaml +++ b/ci/deps/travis-27-locale.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-27.yaml b/ci/deps/travis-27.yaml index 5a9e206ec2c69..8d14673ebde6d 100644 --- a/ci/deps/travis-27.yaml +++ b/ci/deps/travis-27.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml index fb54c784d6fac..ed0764fab414a 100644 --- a/ci/deps/travis-36-doc.yaml +++ b/ci/deps/travis-36-doc.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index 3157ecac3a902..a6ffdb95e5e7c 100644 --- a/ci/deps/travis-36-slow.yaml 
+++ b/ci/deps/travis-36-slow.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-36.yaml b/ci/deps/travis-36.yaml index 1880fa2501581..1781f67041f44 100644 --- a/ci/deps/travis-36.yaml +++ b/ci/deps/travis-36.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-37-numpydev.yaml b/ci/deps/travis-37-numpydev.yaml index 82c75b7c91b1f..99ae228f25de3 100644 --- a/ci/deps/travis-37-numpydev.yaml +++ b/ci/deps/travis-37-numpydev.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults dependencies: diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 7dbd85ac27df6..a297786f6b14d 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge From fc88e2b45889a7b4c4d36a2a0bcdc86dd880a5d7 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Thu, 22 Nov 2018 22:48:15 +0100 Subject: [PATCH 02/21] add source activate --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 6d31adcbf8a43..9c98286f4c20f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -104,6 +104,7 @@ before_script: - ci/before_script_travis.sh script: + - source activate $CONDA_ENV - echo "script start" - ci/run_build_docs.sh - ci/script_single.sh From fb34dcc287cb27b50c9b573c3fec44763ad73495 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Thu, 22 Nov 2018 22:49:17 +0100 Subject: [PATCH 03/21] remove source activate --- ci/build_docs.sh | 2 -- ci/incremental/build.sh | 1 - ci/incremental/setup_conda_environment.cmd | 4 ++-- ci/incremental/setup_conda_environment.sh | 5 ++--- ci/script_single.sh | 1 - ci/upload_coverage.sh | 1 - 6 files changed, 4 insertions(+), 10 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 33340a1c038dc..f89c4369dff4a 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -12,8 +12,6 @@ if [ "$DOC" ]; then echo "Will build docs" - source activate pandas - echo ############################### echo # Log file for the doc build # echo ############################### diff --git a/ci/incremental/build.sh b/ci/incremental/build.sh index 8f2301a3b7ef5..40f78e7d95d52 100755 --- a/ci/incremental/build.sh +++ b/ci/incremental/build.sh @@ -1,6 +1,5 @@ #!/bin/bash -source activate $CONDA_ENV # Make sure any error below is reported as such set -v -e diff --git a/ci/incremental/setup_conda_environment.cmd b/ci/incremental/setup_conda_environment.cmd index 35595ffb03695..bd628f3147b79 100644 --- a/ci/incremental/setup_conda_environment.cmd +++ b/ci/incremental/setup_conda_environment.cmd @@ -11,9 +11,9 @@ call deactivate @rem Display root environment (for debugging) conda list @rem Clean up any left-over from a previous build -conda remove --all -q -y -n %CONDA_ENV% +conda remove --all -q -y @rem Scipy, CFFI, jinja2 and IPython are optional dependencies, but exercised in the test suite -conda env create -n %CONDA_ENV% --file=ci\deps\azure-windows-%CONDA_PY%.yaml +conda env create --file=ci\deps\azure-windows-%CONDA_PY%.yaml call activate %CONDA_ENV% conda list diff --git a/ci/incremental/setup_conda_environment.sh b/ci/incremental/setup_conda_environment.sh index f3ac99d5e7c5a..559a9e8b802b3 100755 --- a/ci/incremental/setup_conda_environment.sh +++ b/ci/incremental/setup_conda_environment.sh @@ -12,15 +12,14 @@ conda list # Clean up any left-over from a previous build # (note workaround for 
https://github.com/conda/conda/issues/2679: # `conda env remove` issue) -conda remove --all -q -y -n $CONDA_ENV +conda remove --all -q -y echo echo "[create env]" -time conda env create -q -n "${CONDA_ENV}" --file="${ENV_FILE}" || exit 1 +time conda env create -q --file="${ENV_FILE}" || exit 1 # Activate first set +v -source activate $CONDA_ENV set -v # remove any installed pandas package diff --git a/ci/script_single.sh b/ci/script_single.sh index ea0d48bc2da8a..b57b643290c73 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -2,7 +2,6 @@ echo "[script_single]" -source activate pandas if [ -n "$LOCALE_OVERRIDE" ]; then echo "Setting LC_ALL and LANG to $LOCALE_OVERRIDE" diff --git a/ci/upload_coverage.sh b/ci/upload_coverage.sh index a7ef2fa908079..88aca20590505 100755 --- a/ci/upload_coverage.sh +++ b/ci/upload_coverage.sh @@ -5,7 +5,6 @@ if [ -z "$COVERAGE" ]; then exit 0 fi -source activate pandas echo "uploading coverage" bash <(curl -s https://codecov.io/bash) -Z -c -F single -f /tmp/cov-single.xml From 04e721cc55f0b5e4fd9dcde86ec2eb107d97f007 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Thu, 22 Nov 2018 22:53:52 +0100 Subject: [PATCH 04/21] clean the script --- .travis.yml | 2 +- ci/incremental/build.cmd | 2 +- ci/incremental/setup_conda_environment.cmd | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9c98286f4c20f..bec92204e393d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -104,7 +104,7 @@ before_script: - ci/before_script_travis.sh script: - - source activate $CONDA_ENV + - source activate pandas-dev - echo "script start" - ci/run_build_docs.sh - ci/script_single.sh diff --git a/ci/incremental/build.cmd b/ci/incremental/build.cmd index d2fd06d7d9e50..7115dea8c7eac 100644 --- a/ci/incremental/build.cmd +++ b/ci/incremental/build.cmd @@ -1,5 +1,5 @@ @rem https://github.com/numba/numba/blob/master/buildscripts/incremental/build.cmd -call activate %CONDA_ENV% + @rem Build numba extensions without silencing compile errors python setup.py build_ext -q --inplace diff --git a/ci/incremental/setup_conda_environment.cmd b/ci/incremental/setup_conda_environment.cmd index 35595ffb03695..b084e45fa3db7 100644 --- a/ci/incremental/setup_conda_environment.cmd +++ b/ci/incremental/setup_conda_environment.cmd @@ -15,7 +15,6 @@ conda remove --all -q -y @rem Scipy, CFFI, jinja2 and IPython are optional dependencies, but exercised in the test suite conda env create --file=ci\deps\azure-windows-%CONDA_PY%.yaml -call activate %CONDA_ENV% conda list if %errorlevel% neq 0 exit /b %errorlevel% From cd81131b671a8c63c55f6d557294f1dbbfeac38b Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Thu, 22 Nov 2018 23:12:42 +0100 Subject: [PATCH 05/21] debug --- ci/incremental/setup_conda_environment.cmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/incremental/setup_conda_environment.cmd b/ci/incremental/setup_conda_environment.cmd index b084e45fa3db7..6c1a484b2d2ef 100644 --- a/ci/incremental/setup_conda_environment.cmd +++ b/ci/incremental/setup_conda_environment.cmd @@ -11,7 +11,7 @@ call deactivate @rem Display root environment (for debugging) conda list @rem Clean up any left-over from a previous build -conda remove --all -q -y + @rem Scipy, CFFI, jinja2 and IPython are optional dependencies, but exercised in the test suite conda env create --file=ci\deps\azure-windows-%CONDA_PY%.yaml From 463be05f4e04b8f9c7d953f9648adcf932f478cd Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Thu, 22 Nov 2018 23:35:49
+0100 Subject: [PATCH 06/21] debug --- .travis.yml | 2 +- ci/incremental/setup_conda_environment.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index bec92204e393d..cd341a0af0c67 100644 --- a/.travis.yml +++ b/.travis.yml @@ -116,7 +116,7 @@ after_success: after_script: - echo "after_script start" - - source activate pandas && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd + - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - if [ -e test-data-single.xml ]; then ci/print_skipped.py test-data-single.xml; fi diff --git a/ci/incremental/setup_conda_environment.sh b/ci/incremental/setup_conda_environment.sh index 559a9e8b802b3..6875db39125df 100755 --- a/ci/incremental/setup_conda_environment.sh +++ b/ci/incremental/setup_conda_environment.sh @@ -20,6 +20,7 @@ time conda env create -q --file="${ENV_FILE}" || exit 1 # Activate first set +v +source activate pandas-dev set -v # remove any installed pandas package From 70b99bc91ab79f8a74c0c672eae1da3ca601ea33 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 22 Nov 2018 18:34:44 -0800 Subject: [PATCH 07/21] REF/TST: Finish pytest idiom in parser tests (#23863) Adds more idiom to the following: * parser/common --> parser/test_common * parser/quoting --> parser/test_quoting * parser/usecols --> parser/test_usecols * parser/python_parser_only --> parser/test_python_parser_only Also: * Finally delete parser/test_parsers! * Bug in capture_stdout decorator in which we were forgetting to use compat.wraps. Builds off of gh-23712. --- pandas/tests/io/parser/common.py | 1620 -------------- pandas/tests/io/parser/conftest.py | 6 +- pandas/tests/io/parser/python_parser_only.py | 270 --- pandas/tests/io/parser/quoting.py | 173 -- pandas/tests/io/parser/test_c_parser_only.py | 33 +- pandas/tests/io/parser/test_common.py | 1912 +++++++++++++++++ pandas/tests/io/parser/test_parsers.py | 143 -- .../io/parser/test_python_parser_only.py | 303 +++ pandas/tests/io/parser/test_quoting.py | 158 ++ pandas/tests/io/parser/test_usecols.py | 533 +++++ pandas/tests/io/parser/usecols.py | 550 ----- pandas/util/testing.py | 2 +- 12 files changed, 2941 insertions(+), 2762 deletions(-) delete mode 100644 pandas/tests/io/parser/common.py delete mode 100644 pandas/tests/io/parser/python_parser_only.py delete mode 100644 pandas/tests/io/parser/quoting.py create mode 100644 pandas/tests/io/parser/test_common.py delete mode 100644 pandas/tests/io/parser/test_parsers.py create mode 100644 pandas/tests/io/parser/test_python_parser_only.py create mode 100644 pandas/tests/io/parser/test_quoting.py create mode 100644 pandas/tests/io/parser/test_usecols.py delete mode 100644 pandas/tests/io/parser/usecols.py diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py deleted file mode 100644 index 6ce08f10e2d00..0000000000000 --- a/pandas/tests/io/parser/common.py +++ /dev/null @@ -1,1620 +0,0 @@ -# -*- coding: utf-8 -*- - -import codecs -from collections import OrderedDict -import csv -from datetime import datetime -import os -import platform -import re -import sys - -import numpy as np -import pytest - -from pandas._libs.tslib import Timestamp -from pandas.compat import PY3, BytesIO, StringIO, lrange, range, u -from pandas.errors import DtypeWarning, EmptyDataError, ParserError - -import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, compat -import pandas.util.testing as tm - -from pandas.io.common import URLError 
-from pandas.io.parsers import TextFileReader, TextParser - - -class ParserTests(object): - """ - Want to be able to test either C+Cython or Python+Cython parsers - """ - data1 = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - - def test_empty_decimal_marker(self): - data = """A|B|C -1|2,334|5 -10|13|10. -""" - # Parsers support only length-1 decimals - msg = 'Only length-1 decimal markers supported' - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(data), decimal='') - - def test_bad_stream_exception(self): - # Issue 13652: - # This test validates that both python engine - # and C engine will raise UnicodeDecodeError instead of - # c engine raising ParserError and swallowing exception - # that caused read to fail. - codec = codecs.lookup("utf-8") - utf8 = codecs.lookup('utf-8') - - if compat.PY3: - msg = "'utf-8' codec can't decode byte" - else: - msg = "'utf8' codec can't decode byte" - - # stream must be binary UTF8 - with open(self.csv_shiftjs, "rb") as handle, codecs.StreamRecoder( - handle, utf8.encode, utf8.decode, codec.streamreader, - codec.streamwriter) as stream: - - with pytest.raises(UnicodeDecodeError, match=msg): - self.read_csv(stream) - - def test_read_csv(self): - if not compat.PY3: - if compat.is_platform_windows(): - prefix = u("file:///") - else: - prefix = u("file://") - - fname = prefix + compat.text_type(os.path.abspath(self.csv1)) - self.read_csv(fname, index_col=0, parse_dates=True) - - def test_1000_sep(self): - data = """A|B|C -1|2,334|5 -10|13|10. -""" - expected = DataFrame({ - 'A': [1, 10], - 'B': [2334, 13], - 'C': [5, 10.] - }) - - df = self.read_csv(StringIO(data), sep='|', thousands=',') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data), sep='|', thousands=',') - tm.assert_frame_equal(df, expected) - - def test_squeeze(self): - data = """\ -a,1 -b,2 -c,3 -""" - idx = Index(['a', 'b', 'c'], name=0) - expected = Series([1, 2, 3], name=1, index=idx) - result = self.read_table(StringIO(data), sep=',', index_col=0, - header=None, squeeze=True) - assert isinstance(result, Series) - tm.assert_series_equal(result, expected) - - def test_squeeze_no_view(self): - # see gh-8217 - # Series should not be a view - data = """time,data\n0,10\n1,11\n2,12\n4,14\n5,15\n3,13""" - result = self.read_csv(StringIO(data), index_col='time', squeeze=True) - assert not result._is_view - - def test_malformed(self): - # see gh-6607 - - # all - data = """ignore -A,B,C -1,2,3 # comment -1,2,3,4,5 -2,3,4 -""" - msg = 'Expected 3 fields in line 4, saw 5' - with pytest.raises(Exception, match=msg): - self.read_table(StringIO(data), sep=',', - header=1, comment='#') - - # first chunk - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - msg = 'Expected 3 fields in line 6, saw 5' - with pytest.raises(Exception, match=msg): - it = self.read_table(StringIO(data), sep=',', - header=1, comment='#', - iterator=True, chunksize=1, - skiprows=[2]) - it.read(5) - - # middle chunk - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - msg = 'Expected 3 fields in line 6, saw 5' - with pytest.raises(Exception, match=msg): - it = self.read_table(StringIO(data), sep=',', header=1, - comment='#', iterator=True, chunksize=1, - skiprows=[2]) - it.read(3) - - # last chunk - data = """ignore -A,B,C -skip -1,2,3 -3,5,10 # comment -1,2,3,4,5 -2,3,4 -""" - msg = 'Expected 3 fields in line 6, saw 5' - with pytest.raises(Exception, 
match=msg): - it = self.read_table(StringIO(data), sep=',', header=1, - comment='#', iterator=True, chunksize=1, - skiprows=[2]) - it.read() - - # skipfooter is not supported with the C parser yet - if self.engine == 'python': - # skipfooter - data = """ignore -A,B,C -1,2,3 # comment -1,2,3,4,5 -2,3,4 -footer -""" - msg = 'Expected 3 fields in line 4, saw 5' - with pytest.raises(Exception, match=msg): - self.read_table(StringIO(data), sep=',', - header=1, comment='#', - skipfooter=1) - - def test_unnamed_columns(self): - data = """A,B,C,, -1,2,3,4,5 -6,7,8,9,10 -11,12,13,14,15 -""" - expected = np.array([[1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]], dtype=np.int64) - df = self.read_table(StringIO(data), sep=',') - tm.assert_almost_equal(df.values, expected) - tm.assert_index_equal(df.columns, - Index(['A', 'B', 'C', 'Unnamed: 3', - 'Unnamed: 4'])) - - def test_csv_mixed_type(self): - data = """A,B,C -a,1,2 -b,3,4 -c,4,5 -""" - expected = DataFrame({'A': ['a', 'b', 'c'], - 'B': [1, 3, 4], - 'C': [2, 4, 5]}) - out = self.read_csv(StringIO(data)) - tm.assert_frame_equal(out, expected) - - def test_read_csv_low_memory_no_rows_with_index(self): - if self.engine == "c" and not self.low_memory: - pytest.skip("This is a low-memory specific test") - - # see gh-21141 - data = """A,B,C -1,1,1,2 -2,2,3,4 -3,3,4,5 -""" - out = self.read_csv(StringIO(data), low_memory=True, - index_col=0, nrows=0) - expected = DataFrame(columns=["A", "B", "C"]) - tm.assert_frame_equal(out, expected) - - def test_read_csv_dataframe(self): - df = self.read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = self.read_table(self.csv1, sep=',', index_col=0, - parse_dates=True) - tm.assert_index_equal(df.columns, pd.Index(['A', 'B', 'C', 'D'])) - assert df.index.name == 'index' - assert isinstance( - df.index[0], (datetime, np.datetime64, Timestamp)) - assert df.values.dtype == np.float64 - tm.assert_frame_equal(df, df2) - - def test_read_csv_no_index_name(self): - df = self.read_csv(self.csv2, index_col=0, parse_dates=True) - df2 = self.read_table(self.csv2, sep=',', index_col=0, - parse_dates=True) - tm.assert_index_equal(df.columns, - pd.Index(['A', 'B', 'C', 'D', 'E'])) - assert isinstance(df.index[0], (datetime, np.datetime64, Timestamp)) - assert df.loc[:, ['A', 'B', 'C', 'D']].values.dtype == np.float64 - tm.assert_frame_equal(df, df2) - - def test_read_table_unicode(self): - fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8')) - df1 = self.read_table(fin, sep=";", encoding="utf-8", header=None) - assert isinstance(df1[0].values[0], compat.text_type) - - def test_read_table_wrong_num_columns(self): - # too few! 
- data = """A,B,C,D,E,F -1,2,3,4,5,6 -6,7,8,9,10,11,12 -11,12,13,14,15,16 -""" - pytest.raises(ValueError, self.read_csv, StringIO(data)) - - def test_read_duplicate_index_explicit(self): - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo,12,13,14,15 -bar,12,13,14,15 -""" - - result = self.read_csv(StringIO(data), index_col=0) - expected = self.read_csv(StringIO(data)).set_index( - 'index', verify_integrity=False) - tm.assert_frame_equal(result, expected) - - result = self.read_table(StringIO(data), sep=',', index_col=0) - expected = self.read_table(StringIO(data), sep=',', ).set_index( - 'index', verify_integrity=False) - tm.assert_frame_equal(result, expected) - - def test_read_duplicate_index_implicit(self): - data = """A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo,12,13,14,15 -bar,12,13,14,15 -""" - - # make sure an error isn't thrown - self.read_csv(StringIO(data)) - self.read_table(StringIO(data), sep=',') - - def test_parse_bools(self): - data = """A,B -True,1 -False,2 -True,3 -""" - data = self.read_csv(StringIO(data)) - assert data['A'].dtype == np.bool_ - - data = """A,B -YES,1 -no,2 -yes,3 -No,3 -Yes,3 -""" - data = self.read_csv(StringIO(data), - true_values=['yes', 'Yes', 'YES'], - false_values=['no', 'NO', 'No']) - assert data['A'].dtype == np.bool_ - - data = """A,B -TRUE,1 -FALSE,2 -TRUE,3 -""" - data = self.read_csv(StringIO(data)) - assert data['A'].dtype == np.bool_ - - data = """A,B -foo,bar -bar,foo""" - result = self.read_csv(StringIO(data), true_values=['foo'], - false_values=['bar']) - expected = DataFrame({'A': [True, False], 'B': [False, True]}) - tm.assert_frame_equal(result, expected) - - def test_int_conversion(self): - data = """A,B -1.0,1 -2.0,2 -3.0,3 -""" - data = self.read_csv(StringIO(data)) - assert data['A'].dtype == np.float64 - assert data['B'].dtype == np.int64 - - def test_read_nrows(self): - expected = self.read_csv(StringIO(self.data1))[:3] - - df = self.read_csv(StringIO(self.data1), nrows=3) - tm.assert_frame_equal(df, expected) - - # see gh-10476 - df = self.read_csv(StringIO(self.data1), nrows=3.0) - tm.assert_frame_equal(df, expected) - - msg = r"'nrows' must be an integer >=0" - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(self.data1), nrows=1.2) - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(self.data1), nrows='foo') - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(self.data1), nrows=-1) - - def test_read_chunksize(self): - reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2) - df = self.read_csv(StringIO(self.data1), index_col=0) - - chunks = list(reader) - - tm.assert_frame_equal(chunks[0], df[:2]) - tm.assert_frame_equal(chunks[1], df[2:4]) - tm.assert_frame_equal(chunks[2], df[4:]) - - # with invalid chunksize value: - msg = r"'chunksize' must be an integer >=1" - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(self.data1), chunksize=1.3) - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(self.data1), chunksize='foo') - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(self.data1), chunksize=0) - - def test_read_chunksize_and_nrows(self): - - # gh-15755 - # With nrows - reader = self.read_csv(StringIO(self.data1), index_col=0, - chunksize=2, nrows=5) - df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5) - - tm.assert_frame_equal(pd.concat(reader), df) - - # chunksize > nrows - reader = 
self.read_csv(StringIO(self.data1), index_col=0, - chunksize=8, nrows=5) - df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5) - - tm.assert_frame_equal(pd.concat(reader), df) - - # with changing "size": - reader = self.read_csv(StringIO(self.data1), index_col=0, - chunksize=8, nrows=5) - df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5) - - tm.assert_frame_equal(reader.get_chunk(size=2), df.iloc[:2]) - tm.assert_frame_equal(reader.get_chunk(size=4), df.iloc[2:5]) - with pytest.raises(StopIteration): - reader.get_chunk(size=3) - - def test_read_chunksize_named(self): - reader = self.read_csv( - StringIO(self.data1), index_col='index', chunksize=2) - df = self.read_csv(StringIO(self.data1), index_col='index') - - chunks = list(reader) - - tm.assert_frame_equal(chunks[0], df[:2]) - tm.assert_frame_equal(chunks[1], df[2:4]) - tm.assert_frame_equal(chunks[2], df[4:]) - - def test_get_chunk_passed_chunksize(self): - data = """A,B,C -1,2,3 -4,5,6 -7,8,9 -1,2,3""" - result = self.read_csv(StringIO(data), chunksize=2) - - piece = result.get_chunk() - assert len(piece) == 2 - - def test_read_chunksize_generated_index(self): - # GH 12185 - reader = self.read_csv(StringIO(self.data1), chunksize=2) - df = self.read_csv(StringIO(self.data1)) - - tm.assert_frame_equal(pd.concat(reader), df) - - reader = self.read_csv(StringIO(self.data1), chunksize=2, index_col=0) - df = self.read_csv(StringIO(self.data1), index_col=0) - - tm.assert_frame_equal(pd.concat(reader), df) - - def test_read_chunksize_jagged_names(self): - # see gh-23509 - data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) - reader = self.read_csv(StringIO(data), names=range(10), chunksize=4) - - expected = DataFrame() - - for i in range(10): - if i == 0: - expected[i] = [0] * 8 - else: - expected[i] = [np.nan] * 7 + [0] - - result = pd.concat(reader) - tm.assert_frame_equal(result, expected) - - def test_read_text_list(self): - data = """A,B,C\nfoo,1,2,3\nbar,4,5,6""" - as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar', - '4', '5', '6']] - df = self.read_csv(StringIO(data), index_col=0) - - parser = TextParser(as_list, index_col=0, chunksize=2) - chunk = parser.read(None) - - tm.assert_frame_equal(chunk, df) - - def test_iterator(self): - # See gh-6607 - reader = self.read_csv(StringIO(self.data1), index_col=0, - iterator=True) - df = self.read_csv(StringIO(self.data1), index_col=0) - - chunk = reader.read(3) - tm.assert_frame_equal(chunk, df[:3]) - - last_chunk = reader.read(5) - tm.assert_frame_equal(last_chunk, df[3:]) - - # pass list - lines = list(csv.reader(StringIO(self.data1))) - parser = TextParser(lines, index_col=0, chunksize=2) - - df = self.read_csv(StringIO(self.data1), index_col=0) - - chunks = list(parser) - tm.assert_frame_equal(chunks[0], df[:2]) - tm.assert_frame_equal(chunks[1], df[2:4]) - tm.assert_frame_equal(chunks[2], df[4:]) - - # pass skiprows - parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) - chunks = list(parser) - tm.assert_frame_equal(chunks[0], df[1:3]) - - treader = self.read_table(StringIO(self.data1), sep=',', index_col=0, - iterator=True) - assert isinstance(treader, TextFileReader) - - # gh-3967: stopping iteration when chunksize is specified - data = """A,B,C -foo,1,2,3 -bar,4,5,6 -baz,7,8,9 -""" - reader = self.read_csv(StringIO(data), iterator=True) - result = list(reader) - expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ - 3, 6, 9]), index=['foo', 'bar', 'baz']) - tm.assert_frame_equal(result[0], expected) - - # chunksize = 1 - reader = 
self.read_csv(StringIO(data), chunksize=1) - result = list(reader) - expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ - 3, 6, 9]), index=['foo', 'bar', 'baz']) - assert len(result) == 3 - tm.assert_frame_equal(pd.concat(result), expected) - - @pytest.mark.parametrize("kwargs", [ - dict(iterator=True, - chunksize=1), - dict(iterator=True), - dict(chunksize=1) - ]) - def test_iterator_skipfooter_errors(self, kwargs): - msg = "'skipfooter' not supported for 'iteration'" - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(self.data1), skipfooter=1, **kwargs) - - def test_nrows_skipfooter_errors(self): - msg = "'skipfooter' not supported with 'nrows'" - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(self.data1), skipfooter=1, nrows=5) - - def test_pass_names_with_index(self): - lines = self.data1.split('\n') - no_header = '\n'.join(lines[1:]) - - # regular index - names = ['index', 'A', 'B', 'C', 'D'] - df = self.read_csv(StringIO(no_header), index_col=0, names=names) - expected = self.read_csv(StringIO(self.data1), index_col=0) - tm.assert_frame_equal(df, expected) - - # multi index - data = """index1,index2,A,B,C,D -foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""" - lines = data.split('\n') - no_header = '\n'.join(lines[1:]) - names = ['index1', 'index2', 'A', 'B', 'C', 'D'] - df = self.read_csv(StringIO(no_header), index_col=[0, 1], - names=names) - expected = self.read_csv(StringIO(data), index_col=[0, 1]) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(data), index_col=['index1', 'index2']) - tm.assert_frame_equal(df, expected) - - def test_multi_index_no_level_names(self): - data = """index1,index2,A,B,C,D -foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""" - - data2 = """A,B,C,D -foo,one,2,3,4,5 -foo,two,7,8,9,10 -foo,three,12,13,14,15 -bar,one,12,13,14,15 -bar,two,12,13,14,15 -""" - - lines = data.split('\n') - no_header = '\n'.join(lines[1:]) - names = ['A', 'B', 'C', 'D'] - - df = self.read_csv(StringIO(no_header), index_col=[0, 1], - header=None, names=names) - expected = self.read_csv(StringIO(data), index_col=[0, 1]) - tm.assert_frame_equal(df, expected, check_names=False) - - # 2 implicit first cols - df2 = self.read_csv(StringIO(data2)) - tm.assert_frame_equal(df2, df) - - # reverse order of index - df = self.read_csv(StringIO(no_header), index_col=[1, 0], names=names, - header=None) - expected = self.read_csv(StringIO(data), index_col=[1, 0]) - tm.assert_frame_equal(df, expected, check_names=False) - - def test_multi_index_blank_df(self): - # GH 14545 - data = """a,b -""" - df = self.read_csv(StringIO(data), header=[0]) - expected = DataFrame(columns=['a', 'b']) - tm.assert_frame_equal(df, expected) - round_trip = self.read_csv(StringIO( - expected.to_csv(index=False)), header=[0]) - tm.assert_frame_equal(round_trip, expected) - - data_multiline = """a,b -c,d -""" - df2 = self.read_csv(StringIO(data_multiline), header=[0, 1]) - cols = MultiIndex.from_tuples([('a', 'c'), ('b', 'd')]) - expected2 = DataFrame(columns=cols) - tm.assert_frame_equal(df2, expected2) - round_trip = self.read_csv(StringIO( - expected2.to_csv(index=False)), header=[0, 1]) - tm.assert_frame_equal(round_trip, expected2) - - def test_no_unnamed_index(self): - data = """ id c0 c1 c2 -0 1 0 a b -1 2 0 c d -2 2 2 e f -""" - df = self.read_table(StringIO(data), sep=' ') - assert df.index.name is None - - def 
test_read_csv_parse_simple_list(self): - text = """foo -bar baz -qux foo -foo -bar""" - df = self.read_csv(StringIO(text), header=None) - expected = DataFrame({0: ['foo', 'bar baz', 'qux foo', - 'foo', 'bar']}) - tm.assert_frame_equal(df, expected) - - @tm.network - def test_url(self, datapath): - # HTTP(S) - url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/tests/io/parser/data/salaries.csv') - url_table = self.read_table(url) - localtable = datapath('io', 'parser', 'data', 'salaries.csv') - local_table = self.read_table(localtable) - tm.assert_frame_equal(url_table, local_table) - # TODO: ftp testing - - @pytest.mark.slow - def test_file(self, datapath): - localtable = datapath('io', 'parser', 'data', 'salaries.csv') - local_table = self.read_table(localtable) - - try: - url_table = self.read_table('file://localhost/' + localtable) - except URLError: - # fails on some systems - pytest.skip("failing on %s" % - ' '.join(platform.uname()).strip()) - - tm.assert_frame_equal(url_table, local_table) - - def test_path_pathlib(self): - df = tm.makeDataFrame() - result = tm.round_trip_pathlib(df.to_csv, - lambda p: self.read_csv(p, index_col=0)) - tm.assert_frame_equal(df, result) - - def test_path_localpath(self): - df = tm.makeDataFrame() - result = tm.round_trip_localpath( - df.to_csv, - lambda p: self.read_csv(p, index_col=0)) - tm.assert_frame_equal(df, result) - - def test_nonexistent_path(self): - # gh-2428: pls no segfault - # gh-14086: raise more helpful FileNotFoundError - path = '%s.csv' % tm.rands(10) - pytest.raises(compat.FileNotFoundError, self.read_csv, path) - - def test_missing_trailing_delimiters(self): - data = """A,B,C,D -1,2,3,4 -1,3,3, -1,4,5""" - result = self.read_csv(StringIO(data)) - assert result['D'].isna()[1:].all() - - def test_skipinitialspace(self): - s = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' - '1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, ' - '314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, ' - '70.06056, 344.98370, 1, 1, -0.689265, -0.692787, ' - '0.212036, 14.7674, 41.605, -9999.0, -9999.0, ' - '-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128') - - sfile = StringIO(s) - # it's 33 columns - result = self.read_csv(sfile, names=lrange(33), na_values=['-9999.0'], - header=None, skipinitialspace=True) - assert pd.isna(result.iloc[0, 29]) - - def test_utf16_bom_skiprows(self): - # #2298 - data = u("""skip this -skip this too -A\tB\tC -1\t2\t3 -4\t5\t6""") - - data2 = u("""skip this -skip this too -A,B,C -1,2,3 -4,5,6""") - - path = '__%s__.csv' % tm.rands(10) - - with tm.ensure_clean(path) as path: - for sep, dat in [('\t', data), (',', data2)]: - for enc in ['utf-16', 'utf-16le', 'utf-16be']: - bytes = dat.encode(enc) - with open(path, 'wb') as f: - f.write(bytes) - - s = BytesIO(dat.encode('utf-8')) - if compat.PY3: - # somewhat False since the code never sees bytes - from io import TextIOWrapper - s = TextIOWrapper(s, encoding='utf-8') - - result = self.read_csv(path, encoding=enc, skiprows=2, - sep=sep) - expected = self.read_csv(s, encoding='utf-8', skiprows=2, - sep=sep) - s.close() - - tm.assert_frame_equal(result, expected) - - def test_utf16_example(self, datapath): - path = datapath('io', 'parser', 'data', 'utf16_ex.txt') - - # it works! 
and is the right length - result = self.read_table(path, encoding='utf-16') - assert len(result) == 50 - - if not compat.PY3: - buf = BytesIO(open(path, 'rb').read()) - result = self.read_table(buf, encoding='utf-16') - assert len(result) == 50 - - def test_unicode_encoding(self, datapath): - pth = datapath('io', 'parser', 'data', 'unicode_series.csv') - - result = self.read_csv(pth, header=None, encoding='latin-1') - result = result.set_index(0) - - got = result[1][1632] - expected = u('\xc1 k\xf6ldum klaka (Cold Fever) (1994)') - - assert got == expected - - def test_trailing_delimiters(self): - # #2442. grumble grumble - data = """A,B,C -1,2,3, -4,5,6, -7,8,9,""" - result = self.read_csv(StringIO(data), index_col=False) - - expected = DataFrame({'A': [1, 4, 7], 'B': [2, 5, 8], - 'C': [3, 6, 9]}) - - tm.assert_frame_equal(result, expected) - - def test_escapechar(self): - # http://stackoverflow.com/questions/13824840/feature-request-for- - # pandas-read-csv - data = '''SEARCH_TERM,ACTUAL_URL -"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" -"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa - - result = self.read_csv(StringIO(data), escapechar='\\', - quotechar='"', encoding='utf-8') - assert result['SEARCH_TERM'][2] == ('SLAGBORD, "Bergslagen", ' - 'IKEA:s 1700-tals serie') - tm.assert_index_equal(result.columns, - Index(['SEARCH_TERM', 'ACTUAL_URL'])) - - def test_int64_min_issues(self): - # #2599 - data = 'A,B\n0,0\n0,' - - result = self.read_csv(StringIO(data)) - expected = DataFrame({'A': [0, 0], 'B': [0, np.nan]}) - - tm.assert_frame_equal(result, expected) - - def test_parse_integers_above_fp_precision(self): - data = """Numbers -17007000002000191 -17007000002000191 -17007000002000191 -17007000002000191 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000192 -17007000002000194""" - - result = self.read_csv(StringIO(data)) - expected = DataFrame({'Numbers': [17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000191, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000192, - 17007000002000194]}) - - tm.assert_series_equal(result['Numbers'], expected['Numbers']) - - def test_chunks_have_consistent_numerical_type(self): - integers = [str(i) for i in range(499999)] - data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) - - with tm.assert_produces_warning(False): - df = self.read_csv(StringIO(data)) - # Assert that types were coerced. - assert type(df.a[0]) is np.float64 - assert df.a.dtype == np.float - - def test_warn_if_chunks_have_mismatched_type(self): - warning_type = False - integers = [str(i) for i in range(499999)] - data = "a\n" + "\n".join(integers + ['a', 'b'] + integers) - - # see gh-3866: if chunks are different types and can't - # be coerced using numerical types, then issue warning. 
- if self.engine == 'c' and self.low_memory: - warning_type = DtypeWarning - - with tm.assert_produces_warning(warning_type): - df = self.read_csv(StringIO(data)) - assert df.a.dtype == np.object - - def test_integer_overflow_bug(self): - # see gh-2601 - data = "65248E10 11\n55555E55 22\n" - - result = self.read_csv(StringIO(data), header=None, sep=' ') - assert result[0].dtype == np.float64 - - result = self.read_csv(StringIO(data), header=None, sep=r'\s+') - assert result[0].dtype == np.float64 - - def test_catch_too_many_names(self): - # see gh-5156 - data = """\ -1,2,3 -4,,6 -7,8,9 -10,11,12\n""" - pytest.raises(ValueError, self.read_csv, StringIO(data), - header=0, names=['a', 'b', 'c', 'd']) - - def test_ignore_leading_whitespace(self): - # see gh-3374, gh-6607 - data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9' - result = self.read_table(StringIO(data), sep=r'\s+') - expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]}) - tm.assert_frame_equal(result, expected) - - def test_chunk_begins_with_newline_whitespace(self): - # see gh-10022 - data = '\n hello\nworld\n' - result = self.read_csv(StringIO(data), header=None) - assert len(result) == 2 - - # see gh-9735: this issue is C parser-specific (bug when - # parsing whitespace and characters at chunk boundary) - if self.engine == 'c': - chunk1 = 'a' * (1024 * 256 - 2) + '\na' - chunk2 = '\n a' - result = self.read_csv(StringIO(chunk1 + chunk2), header=None) - expected = DataFrame(['a' * (1024 * 256 - 2), 'a', ' a']) - tm.assert_frame_equal(result, expected) - - def test_empty_with_index(self): - # see gh-10184 - data = 'x,y' - result = self.read_csv(StringIO(data), index_col=0) - expected = DataFrame([], columns=['y'], index=Index([], name='x')) - tm.assert_frame_equal(result, expected) - - def test_empty_with_multiindex(self): - # see gh-10467 - data = 'x,y,z' - result = self.read_csv(StringIO(data), index_col=['x', 'y']) - expected = DataFrame([], columns=['z'], - index=MultiIndex.from_arrays( - [[]] * 2, names=['x', 'y'])) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_empty_with_reversed_multiindex(self): - data = 'x,y,z' - result = self.read_csv(StringIO(data), index_col=[1, 0]) - expected = DataFrame([], columns=['z'], - index=MultiIndex.from_arrays( - [[]] * 2, names=['y', 'x'])) - tm.assert_frame_equal(result, expected, check_index_type=False) - - def test_float_parser(self): - # see gh-9565 - data = '45e-1,4.5,45.,inf,-inf' - result = self.read_csv(StringIO(data), header=None) - expected = DataFrame([[float(s) for s in data.split(',')]]) - tm.assert_frame_equal(result, expected) - - def test_scientific_no_exponent(self): - # see gh-12215 - df = DataFrame.from_dict(OrderedDict([('w', ['2e']), ('x', ['3E']), - ('y', ['42e']), - ('z', ['632E'])])) - data = df.to_csv(index=False) - for prec in self.float_precision_choices: - df_roundtrip = self.read_csv( - StringIO(data), float_precision=prec) - tm.assert_frame_equal(df_roundtrip, df) - - def test_int64_overflow(self): - data = """ID -00013007854817840016671868 -00013007854817840016749251 -00013007854817840016754630 -00013007854817840016781876 -00013007854817840017028824 -00013007854817840017963235 -00013007854817840018860166""" - - # 13007854817840016671868 > UINT64_MAX, so this - # will overflow and return object as the dtype. 
- result = self.read_csv(StringIO(data)) - assert result['ID'].dtype == object - - # 13007854817840016671868 > UINT64_MAX, so attempts - # to cast to either int64 or uint64 will result in - # an OverflowError being raised. - for conv in (np.int64, np.uint64): - pytest.raises(OverflowError, self.read_csv, - StringIO(data), converters={'ID': conv}) - - # These numbers fall right inside the int64-uint64 range, - # so they should be parsed as string. - ui_max = np.iinfo(np.uint64).max - i_max = np.iinfo(np.int64).max - i_min = np.iinfo(np.int64).min - - for x in [i_max, i_min, ui_max]: - result = self.read_csv(StringIO(str(x)), header=None) - expected = DataFrame([x]) - tm.assert_frame_equal(result, expected) - - # These numbers fall just outside the int64-uint64 range, - # so they should be parsed as string. - too_big = ui_max + 1 - too_small = i_min - 1 - - for x in [too_big, too_small]: - result = self.read_csv(StringIO(str(x)), header=None) - expected = DataFrame([str(x)]) - tm.assert_frame_equal(result, expected) - - # No numerical dtype can hold both negative and uint64 values, - # so they should be cast as string. - data = '-1\n' + str(2**63) - expected = DataFrame([str(-1), str(2**63)]) - result = self.read_csv(StringIO(data), header=None) - tm.assert_frame_equal(result, expected) - - data = str(2**63) + '\n-1' - expected = DataFrame([str(2**63), str(-1)]) - result = self.read_csv(StringIO(data), header=None) - tm.assert_frame_equal(result, expected) - - def test_empty_with_nrows_chunksize(self): - # see gh-9535 - expected = DataFrame([], columns=['foo', 'bar']) - result = self.read_csv(StringIO('foo,bar\n'), nrows=10) - tm.assert_frame_equal(result, expected) - - result = next(iter(self.read_csv( - StringIO('foo,bar\n'), chunksize=10))) - tm.assert_frame_equal(result, expected) - - def test_eof_states(self): - # see gh-10728, gh-10548 - - # With skip_blank_lines = True - expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) - - # gh-10728: WHITESPACE_LINE - data = 'a,b,c\n4,5,6\n ' - result = self.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - # gh-10548: EAT_LINE_COMMENT - data = 'a,b,c\n4,5,6\n#comment' - result = self.read_csv(StringIO(data), comment='#') - tm.assert_frame_equal(result, expected) - - # EAT_CRNL_NOP - data = 'a,b,c\n4,5,6\n\r' - result = self.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - # EAT_COMMENT - data = 'a,b,c\n4,5,6#comment' - result = self.read_csv(StringIO(data), comment='#') - tm.assert_frame_equal(result, expected) - - # SKIP_LINE - data = 'a,b,c\n4,5,6\nskipme' - result = self.read_csv(StringIO(data), skiprows=[2]) - tm.assert_frame_equal(result, expected) - - # With skip_blank_lines = False - - # EAT_LINE_COMMENT - data = 'a,b,c\n4,5,6\n#comment' - result = self.read_csv( - StringIO(data), comment='#', skip_blank_lines=False) - expected = DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) - tm.assert_frame_equal(result, expected) - - # IN_FIELD - data = 'a,b,c\n4,5,6\n ' - result = self.read_csv(StringIO(data), skip_blank_lines=False) - expected = DataFrame( - [['4', 5, 6], [' ', None, None]], columns=['a', 'b', 'c']) - tm.assert_frame_equal(result, expected) - - # EAT_CRNL - data = 'a,b,c\n4,5,6\n\r' - result = self.read_csv(StringIO(data), skip_blank_lines=False) - expected = DataFrame( - [[4, 5, 6], [None, None, None]], columns=['a', 'b', 'c']) - tm.assert_frame_equal(result, expected) - - # Should produce exceptions - - # ESCAPED_CHAR - data = "a,b,c\n4,5,6\n\\" - pytest.raises(Exception, 
self.read_csv, - StringIO(data), escapechar='\\') - - # ESCAPE_IN_QUOTED_FIELD - data = 'a,b,c\n4,5,6\n"\\' - pytest.raises(Exception, self.read_csv, - StringIO(data), escapechar='\\') - - # IN_QUOTED_FIELD - data = 'a,b,c\n4,5,6\n"' - pytest.raises(Exception, self.read_csv, - StringIO(data), escapechar='\\') - - def test_uneven_lines_with_usecols(self): - # See gh-12203 - csv = r"""a,b,c - 0,1,2 - 3,4,5,6,7 - 8,9,10 - """ - - # make sure that an error is still thrown - # when the 'usecols' parameter is not provided - msg = r"Expected \d+ fields in line \d+, saw \d+" - with pytest.raises(ValueError, match=msg): - df = self.read_csv(StringIO(csv)) - - expected = DataFrame({ - 'a': [0, 3, 8], - 'b': [1, 4, 9] - }) - - usecols = [0, 1] - df = self.read_csv(StringIO(csv), usecols=usecols) - tm.assert_frame_equal(df, expected) - - usecols = ['a', 'b'] - df = self.read_csv(StringIO(csv), usecols=usecols) - tm.assert_frame_equal(df, expected) - - def test_read_empty_with_usecols(self): - # See gh-12493 - names = ['Dummy', 'X', 'Dummy_2'] - usecols = names[1:2] # ['X'] - - # first, check to see that the response of - # parser when faced with no provided columns - # throws the correct error, with or without usecols - errmsg = "No columns to parse from file" - - with pytest.raises(EmptyDataError, match=errmsg): - self.read_csv(StringIO('')) - - with pytest.raises(EmptyDataError, match=errmsg): - self.read_csv(StringIO(''), usecols=usecols) - - expected = DataFrame(columns=usecols, index=[0], dtype=np.float64) - df = self.read_csv(StringIO(',,'), names=names, usecols=usecols) - tm.assert_frame_equal(df, expected) - - expected = DataFrame(columns=usecols) - df = self.read_csv(StringIO(''), names=names, usecols=usecols) - tm.assert_frame_equal(df, expected) - - def test_trailing_spaces(self): - data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa - expected = DataFrame([[1., 2., 4.], - [5.1, np.nan, 10.]]) - - # gh-8661, gh-8679: this should ignore six lines including - # lines with trailing whitespace and blank lines - df = self.read_csv(StringIO(data.replace(',', ' ')), - header=None, delim_whitespace=True, - skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) - tm.assert_frame_equal(df, expected) - df = self.read_table(StringIO(data.replace(',', ' ')), - header=None, delim_whitespace=True, - skiprows=[0, 1, 2, 3, 5, 6], - skip_blank_lines=True) - tm.assert_frame_equal(df, expected) - - # gh-8983: test skipping set of rows after a row with trailing spaces - expected = DataFrame({"A": [1., 5.1], "B": [2., np.nan], - "C": [4., 10]}) - df = self.read_table(StringIO(data.replace(',', ' ')), - delim_whitespace=True, - skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True) - tm.assert_frame_equal(df, expected) - - def test_raise_on_sep_with_delim_whitespace(self): - # see gh-6607 - data = 'a b c\n1 2 3' - with pytest.raises(ValueError, match='you can only specify one'): - self.read_table(StringIO(data), sep=r'\s', delim_whitespace=True) - - def test_single_char_leading_whitespace(self): - # see gh-9710 - data = """\ -MyColumn - a - b - a - b\n""" - - expected = DataFrame({'MyColumn': list('abab')}) - - result = self.read_csv(StringIO(data), delim_whitespace=True, - skipinitialspace=True) - tm.assert_frame_equal(result, expected) - - result = self.read_csv(StringIO(data), skipinitialspace=True) - tm.assert_frame_equal(result, expected) - - def test_empty_lines(self): - data = """\ -A,B,C -1,2.,4. 
- - -5.,NaN,10.0 - --70,.4,1 -""" - expected = np.array([[1., 2., 4.], - [5., np.nan, 10.], - [-70., .4, 1.]]) - df = self.read_csv(StringIO(data)) - tm.assert_numpy_array_equal(df.values, expected) - df = self.read_csv(StringIO(data.replace(',', ' ')), sep=r'\s+') - tm.assert_numpy_array_equal(df.values, expected) - expected = np.array([[1., 2., 4.], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [5., np.nan, 10.], - [np.nan, np.nan, np.nan], - [-70., .4, 1.]]) - df = self.read_csv(StringIO(data), skip_blank_lines=False) - tm.assert_numpy_array_equal(df.values, expected) - - def test_whitespace_lines(self): - data = """ - -\t \t\t - \t -A,B,C - \t 1,2.,4. -5.,NaN,10.0 -""" - expected = np.array([[1, 2., 4.], - [5., np.nan, 10.]]) - df = self.read_csv(StringIO(data)) - tm.assert_numpy_array_equal(df.values, expected) - - def test_regex_separator(self): - # see gh-6607 - data = """ A B C D -a 1 2 3 4 -b 1 2 3 4 -c 1 2 3 4 -""" - df = self.read_table(StringIO(data), sep=r'\s+') - expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)), - index_col=0) - assert expected.index.name is None - tm.assert_frame_equal(df, expected) - - data = ' a b c\n1 2 3 \n4 5 6\n 7 8 9' - result = self.read_table(StringIO(data), sep=r'\s+') - expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - columns=['a', 'b', 'c']) - tm.assert_frame_equal(result, expected) - - @tm.capture_stdout - def test_verbose_import(self): - text = """a,b,c,d -one,1,2,3 -one,1,2,3 -,1,2,3 -one,1,2,3 -,1,2,3 -,1,2,3 -one,1,2,3 -two,1,2,3""" - - # Engines are verbose in different ways. - self.read_csv(StringIO(text), verbose=True) - output = sys.stdout.getvalue() - - if self.engine == 'c': - assert 'Tokenization took:' in output - assert 'Parser memory cleanup took:' in output - else: # Python engine - assert output == 'Filled 3 NA values in column a\n' - - # Reset the stdout buffer. - sys.stdout = StringIO() - - text = """a,b,c,d -one,1,2,3 -two,1,2,3 -three,1,2,3 -four,1,2,3 -five,1,2,3 -,1,2,3 -seven,1,2,3 -eight,1,2,3""" - - self.read_csv(StringIO(text), verbose=True, index_col=0) - output = sys.stdout.getvalue() - - # Engines are verbose in different ways. - if self.engine == 'c': - assert 'Tokenization took:' in output - assert 'Parser memory cleanup took:' in output - else: # Python engine - assert output == 'Filled 1 NA values in column a\n' - - @pytest.mark.skipif(PY3, reason="won't work in Python 3") - def test_iteration_open_handle(self): - - with tm.ensure_clean() as path: - with open(path, 'wb') as f: - f.write('AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG') - - with open(path, 'rb') as f: - for line in f: - if 'CCC' in line: - break - - if self.engine == 'c': - pytest.raises(Exception, self.read_table, - f, squeeze=True, header=None) - else: - result = self.read_table(f, squeeze=True, header=None) - expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0) - tm.assert_series_equal(result, expected) - - def test_1000_sep_with_decimal(self): - data = """A|B|C -1|2,334.01|5 -10|13|10. -""" - expected = DataFrame({ - 'A': [1, 10], - 'B': [2334.01, 13], - 'C': [5, 10.] 
- }) - - assert expected.A.dtype == 'int64' - assert expected.B.dtype == 'float' - assert expected.C.dtype == 'float' - - df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data), sep='|', - thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - data_with_odd_sep = """A|B|C -1|2.334,01|5 -10|13|10, -""" - df = self.read_csv(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - - def test_euro_decimal_format(self): - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - - df2 = self.read_csv(StringIO(data), sep=';', decimal=',') - assert df2['Number1'].dtype == float - assert df2['Number2'].dtype == float - assert df2['Number3'].dtype == float - - def test_inf_parsing(self): - data = """\ -,A -a,inf -b,-inf -c,+Inf -d,-Inf -e,INF -f,-INF -g,+INf -h,-INf -i,inF -j,-inF""" - inf = float('inf') - expected = Series([inf, -inf] * 5) - - df = self.read_csv(StringIO(data), index_col=0) - tm.assert_almost_equal(df['A'].values, expected.values) - - df = self.read_csv(StringIO(data), index_col=0, na_filter=False) - tm.assert_almost_equal(df['A'].values, expected.values) - - def test_raise_on_no_columns(self): - # single newline - data = "\n" - pytest.raises(EmptyDataError, self.read_csv, StringIO(data)) - - # test with more than a single newline - data = "\n\n\n" - pytest.raises(EmptyDataError, self.read_csv, StringIO(data)) - - def test_memory_map(self): - mmap_file = os.path.join(self.dirpath, 'test_mmap.csv') - expected = DataFrame({ - 'a': [1, 2, 3], - 'b': ['one', 'two', 'three'], - 'c': ['I', 'II', 'III'] - }) - - out = self.read_csv(mmap_file, memory_map=True) - tm.assert_frame_equal(out, expected) - - def test_null_byte_char(self): - # see gh-2741 - data = '\x00,foo' - cols = ['a', 'b'] - - expected = DataFrame([[np.nan, 'foo']], - columns=cols) - - if self.engine == 'c': - out = self.read_csv(StringIO(data), names=cols) - tm.assert_frame_equal(out, expected) - else: - msg = "NULL byte detected" - with pytest.raises(ParserError, match=msg): - self.read_csv(StringIO(data), names=cols) - - def test_utf8_bom(self): - # see gh-4793 - bom = u('\ufeff') - utf8 = 'utf-8' - - def _encode_data_with_bom(_data): - bom_data = (bom + _data).encode(utf8) - return BytesIO(bom_data) - - # basic test - data = 'a\n1' - expected = DataFrame({'a': [1]}) - - out = self.read_csv(_encode_data_with_bom(data), - encoding=utf8) - tm.assert_frame_equal(out, expected) - - # test with "regular" quoting - data = '"a"\n1' - expected = DataFrame({'a': [1]}) - - out = self.read_csv(_encode_data_with_bom(data), - encoding=utf8, quotechar='"') - tm.assert_frame_equal(out, expected) - - # test in a data row instead of header - data = 'b\n1' - expected = DataFrame({'a': ['b', '1']}) - - out = self.read_csv(_encode_data_with_bom(data), - encoding=utf8, names=['a']) - tm.assert_frame_equal(out, expected) - - # test in empty data row with skipping - data = '\n1' - expected = DataFrame({'a': [1]}) - - out = self.read_csv(_encode_data_with_bom(data), - encoding=utf8, names=['a'], - skip_blank_lines=True) - tm.assert_frame_equal(out, expected) - - # test in empty data row without skipping - data = '\n1' - expected = 
DataFrame({'a': [np.nan, 1.0]}) - - out = self.read_csv(_encode_data_with_bom(data), - encoding=utf8, names=['a'], - skip_blank_lines=False) - tm.assert_frame_equal(out, expected) - - def test_temporary_file(self): - # see gh-13398 - data1 = "0 0" - - from tempfile import TemporaryFile - new_file = TemporaryFile("w+") - new_file.write(data1) - new_file.flush() - new_file.seek(0) - - result = self.read_csv(new_file, sep=r'\s+', header=None) - new_file.close() - expected = DataFrame([[0, 0]]) - tm.assert_frame_equal(result, expected) - - def test_read_csv_utf_aliases(self): - # see gh issue 13549 - expected = pd.DataFrame({'mb_num': [4.8], 'multibyte': ['test']}) - for byte in [8, 16]: - for fmt in ['utf-{0}', 'utf_{0}', 'UTF-{0}', 'UTF_{0}']: - encoding = fmt.format(byte) - data = 'mb_num,multibyte\n4.8,test'.encode(encoding) - result = self.read_csv(BytesIO(data), encoding=encoding) - tm.assert_frame_equal(result, expected) - - def test_internal_eof_byte(self): - # see gh-5500 - data = "a,b\n1\x1a,2" - - expected = pd.DataFrame([["1\x1a", 2]], columns=['a', 'b']) - result = self.read_csv(StringIO(data)) - tm.assert_frame_equal(result, expected) - - def test_internal_eof_byte_to_file(self): - # see gh-16559 - data = b'c1,c2\r\n"test \x1a test", test\r\n' - expected = pd.DataFrame([["test \x1a test", " test"]], - columns=["c1", "c2"]) - - path = '__%s__.csv' % tm.rands(10) - - with tm.ensure_clean(path) as path: - with open(path, "wb") as f: - f.write(data) - - result = self.read_csv(path) - tm.assert_frame_equal(result, expected) - - def test_sub_character(self, datapath): - # see gh-16893 - filename = datapath('io', 'parser', 'data', 'sub_char.csv') - - expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) - result = self.read_csv(filename) - - tm.assert_frame_equal(result, expected) - - def test_file_handles(self): - # GH 14418 - don't close user provided file handles - - fh = StringIO('a,b\n1,2') - self.read_csv(fh) - assert not fh.closed - - with open(self.csv1, 'r') as f: - self.read_csv(f) - assert not f.closed - - # mmap not working with python engine - if self.engine != 'python': - - import mmap - with open(self.csv1, 'r') as f: - m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) - self.read_csv(m) - # closed attribute new in python 3.2 - if PY3: - assert not m.closed - m.close() - - def test_invalid_file_buffer(self, mock): - # see gh-15337 - - class InvalidBuffer(object): - pass - - msg = "Invalid file path or buffer object type" - - with pytest.raises(ValueError, match=msg): - self.read_csv(InvalidBuffer()) - - # gh-16135: we want to ensure that "tell" and "seek" - # aren't actually being used when we call `read_csv` - # - # Thus, while the object may look "invalid" (these - # methods are attributes of the `StringIO` class), - # it is still a valid file-object for our purposes. 
- class NoSeekTellBuffer(StringIO): - def tell(self): - raise AttributeError("No tell method") - - def seek(self, pos, whence=0): - raise AttributeError("No seek method") - - data = "a\n1" - - expected = pd.DataFrame({"a": [1]}) - result = self.read_csv(NoSeekTellBuffer(data)) - - tm.assert_frame_equal(result, expected) - - with pytest.raises(ValueError, match=msg): - self.read_csv(mock.Mock()) - - @tm.capture_stderr - def test_skip_bad_lines(self): - # see gh-15925 - data = 'a\n1\n1,2,3\n4\n5,6,7' - - with pytest.raises(ParserError): - self.read_csv(StringIO(data)) - - with pytest.raises(ParserError): - self.read_csv(StringIO(data), error_bad_lines=True) - - expected = DataFrame({'a': [1, 4]}) - - out = self.read_csv(StringIO(data), - error_bad_lines=False, - warn_bad_lines=False) - tm.assert_frame_equal(out, expected) - - val = sys.stderr.getvalue() - assert val == '' - - # Reset the stderr buffer. - sys.stderr = StringIO() - - out = self.read_csv(StringIO(data), - error_bad_lines=False, - warn_bad_lines=True) - tm.assert_frame_equal(out, expected) - - val = sys.stderr.getvalue() - assert 'Skipping line 3' in val - assert 'Skipping line 5' in val diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 857cdea942459..feb6c36b5178f 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -3,7 +3,6 @@ import pytest from pandas import read_csv, read_table -import pandas.util.testing as tm class BaseParser(object): @@ -24,8 +23,7 @@ def read_csv(self, *args, **kwargs): def read_table(self, *args, **kwargs): kwargs = self.update_kwargs(kwargs) - with tm.assert_produces_warning(FutureWarning): - return read_table(*args, **kwargs) + return read_table(*args, **kwargs) class CParser(BaseParser): @@ -43,7 +41,7 @@ class CParserLowMemory(CParser): class PythonParser(BaseParser): engine = "python" - float_precision_choices = [] + float_precision_choices = [None] @pytest.fixture diff --git a/pandas/tests/io/parser/python_parser_only.py b/pandas/tests/io/parser/python_parser_only.py deleted file mode 100644 index 6a41b4636e532..0000000000000 --- a/pandas/tests/io/parser/python_parser_only.py +++ /dev/null @@ -1,270 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests that apply specifically to the Python parser. Unless specifically -stated as a Python-specific issue, the goal is to eventually move as many of -these tests out of this module as soon as the C parser can accept further -arguments when parsing. -""" - -import csv -import sys - -import pytest - -import pandas.compat as compat -from pandas.compat import BytesIO, StringIO, u -from pandas.errors import ParserError - -from pandas import DataFrame, Index -import pandas.util.testing as tm - - -class PythonParserTests(object): - - def test_default_separator(self): - # GH17333 - # csv.Sniffer in Python treats 'o' as separator. 
- text = 'aob\n1o2\n3o4' - expected = DataFrame({'a': [1, 3], 'b': [2, 4]}) - - result = self.read_csv(StringIO(text), sep=None) - - tm.assert_frame_equal(result, expected) - - def test_invalid_skipfooter(self): - text = "a\n1\n2" - - # see gh-15925 (comment) - msg = "skipfooter must be an integer" - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(text), skipfooter="foo") - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(text), skipfooter=1.5) - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(text), skipfooter=True) - - msg = "skipfooter cannot be negative" - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(text), skipfooter=-1) - - def test_sniff_delimiter(self): - text = """index|A|B|C -foo|1|2|3 -bar|4|5|6 -baz|7|8|9 -""" - data = self.read_csv(StringIO(text), index_col=0, sep=None) - tm.assert_index_equal(data.index, - Index(['foo', 'bar', 'baz'], name='index')) - - data2 = self.read_csv(StringIO(text), index_col=0, delimiter='|') - tm.assert_frame_equal(data, data2) - - text = """ignore this -ignore this too -index|A|B|C -foo|1|2|3 -bar|4|5|6 -baz|7|8|9 -""" - data3 = self.read_csv(StringIO(text), index_col=0, - sep=None, skiprows=2) - tm.assert_frame_equal(data, data3) - - text = u("""ignore this -ignore this too -index|A|B|C -foo|1|2|3 -bar|4|5|6 -baz|7|8|9 -""").encode('utf-8') - - s = BytesIO(text) - if compat.PY3: - # somewhat False since the code never sees bytes - from io import TextIOWrapper - s = TextIOWrapper(s, encoding='utf-8') - - data4 = self.read_csv(s, index_col=0, sep=None, skiprows=2, - encoding='utf-8') - tm.assert_frame_equal(data, data4) - - def test_BytesIO_input(self): - if not compat.PY3: - pytest.skip( - "Bytes-related test - only needs to work on Python 3") - - data = BytesIO("שלום::1234\n562::123".encode('cp1255')) - result = self.read_table(data, sep="::", encoding='cp1255') - expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) - tm.assert_frame_equal(result, expected) - - def test_single_line(self): - # see gh-6607: sniff separator - df = self.read_csv(StringIO('1,2'), names=['a', 'b'], - header=None, sep=None) - tm.assert_frame_equal(DataFrame({'a': [1], 'b': [2]}), df) - - def test_skipfooter(self): - # see gh-6607 - data = """A,B,C -1,2,3 -4,5,6 -7,8,9 -want to skip this -also also skip this -""" - result = self.read_csv(StringIO(data), skipfooter=2) - no_footer = '\n'.join(data.split('\n')[:-3]) - expected = self.read_csv(StringIO(no_footer)) - tm.assert_frame_equal(result, expected) - - result = self.read_csv(StringIO(data), nrows=3) - tm.assert_frame_equal(result, expected) - - # skipfooter alias - result = self.read_csv(StringIO(data), skipfooter=2) - no_footer = '\n'.join(data.split('\n')[:-3]) - expected = self.read_csv(StringIO(no_footer)) - tm.assert_frame_equal(result, expected) - - def test_decompression_regex_sep(self): - # see gh-6607 - - try: - import gzip - import bz2 - except ImportError: - pytest.skip('need gzip and bz2 to run') - - with open(self.csv1, 'rb') as f: - data = f.read() - data = data.replace(b',', b'::') - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - tmp = gzip.GzipFile(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, sep='::', compression='gzip') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean() as path: - tmp = bz2.BZ2File(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, sep='::', compression='bz2') - 
tm.assert_frame_equal(result, expected)
-
-        pytest.raises(ValueError, self.read_csv,
-                      path, compression='bz3')
-
-    def test_read_table_buglet_4x_multiindex(self):
-        # see gh-6607
-        text = """                      A       B       C       D        E
-one two three   four
-a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
-a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
-x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""
-
-        df = self.read_table(StringIO(text), sep=r'\s+')
-        assert df.index.names == ('one', 'two', 'three', 'four')
-
-        # see gh-6893
-        data = '      A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9'
-        expected = DataFrame.from_records(
-            [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
-            columns=list('abcABC'), index=list('abc'))
-        actual = self.read_table(StringIO(data), sep=r'\s+')
-        tm.assert_frame_equal(actual, expected)
-
-    def test_skipfooter_with_decimal(self):
-        # see gh-6971
-        data = '1#2\n3#4'
-        expected = DataFrame({'a': [1.2, 3.4]})
-
-        result = self.read_csv(StringIO(data), names=['a'],
-                               decimal='#')
-        tm.assert_frame_equal(result, expected)
-
-        # the stray footer line should not mess with the
-        # casting of the first two lines if we skip it
-        data = data + '\nFooter'
-        result = self.read_csv(StringIO(data), names=['a'],
-                               decimal='#', skipfooter=1)
-        tm.assert_frame_equal(result, expected)
-
-    def test_encoding_non_utf8_multichar_sep(self):
-        # see gh-3404
-        expected = DataFrame({'a': [1], 'b': [2]})
-
-        for sep in ['::', '#####', '!!!', '123', '#1!c5',
-                    '%!c!d', '@@#4:2', '_!pd#_']:
-            data = '1' + sep + '2'
-
-            for encoding in ['utf-16', 'utf-16-be', 'utf-16-le',
-                             'utf-32', 'cp037']:
-                encoded_data = data.encode(encoding)
-                result = self.read_csv(BytesIO(encoded_data),
-                                       sep=sep, names=['a', 'b'],
-                                       encoding=encoding)
-                tm.assert_frame_equal(result, expected)
-
-    def test_multi_char_sep_quotes(self):
-        # see gh-13374
-
-        data = 'a,,b\n1,,a\n2,,"2,,b"'
-        msg = 'ignored when a multi-char delimiter is used'
-
-        with pytest.raises(ParserError, match=msg):
-            self.read_csv(StringIO(data), sep=',,')
-
-        # We expect no match, so there should be an assertion
-        # error out of the inner context manager.
-        with pytest.raises(AssertionError):
-            with pytest.raises(ParserError, match=msg):
-                self.read_csv(StringIO(data), sep=',,',
-                              quoting=csv.QUOTE_NONE)
-
-    @tm.capture_stderr
-    def test_none_delimiter(self):
-        # see gh-13374 and gh-17465
-
-        data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
-        expected = DataFrame({'a': [0, 7],
-                              'b': [1, 8],
-                              'c': [2, 9]})
-
-        # We expect the third line in the data to be
-        # skipped because it is malformed,
-        # but we do not expect any errors to occur.
-        result = self.read_csv(StringIO(data), header=0,
-                               sep=None,
-                               error_bad_lines=False,
-                               warn_bad_lines=True)
-        tm.assert_frame_equal(result, expected)
-
-        warning = sys.stderr.getvalue()
-        assert 'Skipping line 3' in warning
-
-    def test_skipfooter_bad_row(self):
-        # see gh-13879
-        # see gh-15910
-
-        msg = 'parsing errors in the skipped footer rows'
-
-        for data in ('a\n1\n"b"a',
-                     'a,b,c\ncat,foo,bar\ndog,foo,"baz'):
-            with pytest.raises(ParserError, match=msg):
-                self.read_csv(StringIO(data), skipfooter=1)
-
-        # We expect no match, so there should be an assertion
-        # error out of the inner context manager.
- with pytest.raises(AssertionError): - with pytest.raises(ParserError, match=msg): - self.read_csv(StringIO(data)) diff --git a/pandas/tests/io/parser/quoting.py b/pandas/tests/io/parser/quoting.py deleted file mode 100644 index a8a1cc5451f37..0000000000000 --- a/pandas/tests/io/parser/quoting.py +++ /dev/null @@ -1,173 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests that quoting specifications are properly handled -during parsing for all of the parsers defined in parsers.py -""" - -import csv - -import pytest - -from pandas.compat import PY3, StringIO, u -from pandas.errors import ParserError - -from pandas import DataFrame -import pandas.util.testing as tm - - -class QuotingTests(object): - - def test_bad_quote_char(self): - data = '1,2,3' - - # Python 2.x: "...must be an 1-character..." - # Python 3.x: "...must be a 1-character..." - msg = '"quotechar" must be a(n)? 1-character string' - with pytest.raises(TypeError, match=msg): - self.read_csv(StringIO(data), quotechar='foo') - - msg = 'quotechar must be set if quoting enabled' - with pytest.raises(TypeError, match=msg): - self.read_csv(StringIO(data), quotechar=None, - quoting=csv.QUOTE_MINIMAL) - - msg = '"quotechar" must be string, not int' - with pytest.raises(TypeError, match=msg): - self.read_csv(StringIO(data), quotechar=2) - - def test_bad_quoting(self): - data = '1,2,3' - - msg = '"quoting" must be an integer' - with pytest.raises(TypeError, match=msg): - self.read_csv(StringIO(data), quoting='foo') - - # quoting must in the range [0, 3] - msg = 'bad "quoting" value' - with pytest.raises(TypeError, match=msg): - self.read_csv(StringIO(data), quoting=5) - - def test_quote_char_basic(self): - data = 'a,b,c\n1,2,"cat"' - expected = DataFrame([[1, 2, 'cat']], - columns=['a', 'b', 'c']) - result = self.read_csv(StringIO(data), quotechar='"') - tm.assert_frame_equal(result, expected) - - def test_quote_char_various(self): - data = 'a,b,c\n1,2,"cat"' - expected = DataFrame([[1, 2, 'cat']], - columns=['a', 'b', 'c']) - quote_chars = ['~', '*', '%', '$', '@', 'P'] - - for quote_char in quote_chars: - new_data = data.replace('"', quote_char) - result = self.read_csv(StringIO(new_data), quotechar=quote_char) - tm.assert_frame_equal(result, expected) - - def test_null_quote_char(self): - data = 'a,b,c\n1,2,3' - - # sanity checks - msg = 'quotechar must be set if quoting enabled' - - with pytest.raises(TypeError, match=msg): - self.read_csv(StringIO(data), quotechar=None, - quoting=csv.QUOTE_MINIMAL) - - with pytest.raises(TypeError, match=msg): - self.read_csv(StringIO(data), quotechar='', - quoting=csv.QUOTE_MINIMAL) - - # no errors should be raised if quoting is None - expected = DataFrame([[1, 2, 3]], - columns=['a', 'b', 'c']) - - result = self.read_csv(StringIO(data), quotechar=None, - quoting=csv.QUOTE_NONE) - tm.assert_frame_equal(result, expected) - - result = self.read_csv(StringIO(data), quotechar='', - quoting=csv.QUOTE_NONE) - tm.assert_frame_equal(result, expected) - - def test_quoting_various(self): - data = '1,2,"foo"' - cols = ['a', 'b', 'c'] - - # QUOTE_MINIMAL and QUOTE_ALL apply only to - # the CSV writer, so they should have no - # special effect for the CSV reader - expected = DataFrame([[1, 2, 'foo']], columns=cols) - - # test default (afterwards, arguments are all explicit) - result = self.read_csv(StringIO(data), names=cols) - tm.assert_frame_equal(result, expected) - - result = self.read_csv(StringIO(data), quotechar='"', - quoting=csv.QUOTE_MINIMAL, names=cols) - tm.assert_frame_equal(result, expected) - - result 
= self.read_csv(StringIO(data), quotechar='"', - quoting=csv.QUOTE_ALL, names=cols) - tm.assert_frame_equal(result, expected) - - # QUOTE_NONE tells the reader to do no special handling - # of quote characters and leave them alone - expected = DataFrame([[1, 2, '"foo"']], columns=cols) - result = self.read_csv(StringIO(data), quotechar='"', - quoting=csv.QUOTE_NONE, names=cols) - tm.assert_frame_equal(result, expected) - - # QUOTE_NONNUMERIC tells the reader to cast - # all non-quoted fields to float - expected = DataFrame([[1.0, 2.0, 'foo']], columns=cols) - result = self.read_csv(StringIO(data), quotechar='"', - quoting=csv.QUOTE_NONNUMERIC, - names=cols) - tm.assert_frame_equal(result, expected) - - def test_double_quote(self): - data = 'a,b\n3,"4 "" 5"' - - expected = DataFrame([[3, '4 " 5']], - columns=['a', 'b']) - result = self.read_csv(StringIO(data), quotechar='"', - doublequote=True) - tm.assert_frame_equal(result, expected) - - expected = DataFrame([[3, '4 " 5"']], - columns=['a', 'b']) - result = self.read_csv(StringIO(data), quotechar='"', - doublequote=False) - tm.assert_frame_equal(result, expected) - - def test_quotechar_unicode(self): - # See gh-14477 - data = 'a\n1' - expected = DataFrame({'a': [1]}) - - result = self.read_csv(StringIO(data), quotechar=u('"')) - tm.assert_frame_equal(result, expected) - - # Compared to Python 3.x, Python 2.x does not handle unicode well. - if PY3: - result = self.read_csv(StringIO(data), quotechar=u('\u0001')) - tm.assert_frame_equal(result, expected) - - def test_unbalanced_quoting(self): - # see gh-22789. - data = "a,b,c\n1,2,\"3" - - if self.engine == "c": - regex = "EOF inside string starting at row 1" - else: - regex = "unexpected end of data" - - with pytest.raises(ParserError, match=regex): - self.read_csv(StringIO(data)) - - expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) - data = self.read_csv(StringIO(data + '"')) - tm.assert_frame_equal(data, expected) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 570ecd80b00c0..fcf9736110ff8 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -8,6 +8,7 @@ """ from io import TextIOWrapper +import mmap import os import sys import tarfile @@ -381,7 +382,7 @@ def test_internal_null_byte(c_parser_only): # character, only as a placeholder to indicate that # none was specified. # - # This test should be moved to common.py ONLY when + # This test should be moved to test_common.py ONLY when # Python's csv class supports parsing '\x00'. parser = c_parser_only @@ -544,3 +545,33 @@ def test_bytes_exceed_2gb(c_parser_only): ["x" * (1 << 20) for _ in range(2100)])) df = parser.read_csv(csv) assert not df.empty + + +def test_chunk_whitespace_on_boundary(c_parser_only): + # see gh-9735: this issue is C parser-specific (bug when + # parsing whitespace and characters at chunk boundary) + # + # This test case has a field too large for the Python parser / CSV library. + parser = c_parser_only + + chunk1 = "a" * (1024 * 256 - 2) + "\na" + chunk2 = "\n a" + result = parser.read_csv(StringIO(chunk1 + chunk2), header=None) + + expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"]) + tm.assert_frame_equal(result, expected) + + +def test_file_handles_mmap(c_parser_only, csv1): + # gh-14418 + # + # Don't close user provided file handles. 
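+    #
+    # Here the handle is an mmap over the CSV file: after read_csv
+    # consumes it, the mapping should still be open so the caller can
+    # keep using it and close it explicitly.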
+ parser = c_parser_only + + with open(csv1, "r") as f: + m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + parser.read_csv(m) + + if PY3: + assert not m.closed + m.close() diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py new file mode 100644 index 0000000000000..9d38fdbecdb62 --- /dev/null +++ b/pandas/tests/io/parser/test_common.py @@ -0,0 +1,1912 @@ +# -*- coding: utf-8 -*- + +""" +Tests that work on both the Python and C engines but do not have a +specific classification into the other test modules. +""" + +import codecs +from collections import OrderedDict +import csv +from datetime import datetime +import os +import platform +import sys +from tempfile import TemporaryFile + +import numpy as np +import pytest + +from pandas._libs.tslib import Timestamp +from pandas.compat import BytesIO, StringIO, lrange, range, u +from pandas.errors import DtypeWarning, EmptyDataError, ParserError + +from pandas import DataFrame, Index, MultiIndex, Series, compat, concat +import pandas.util.testing as tm + +from pandas.io.common import URLError +from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser + + +def test_override_set_noconvert_columns(): + # see gh-17351 + # + # Usecols needs to be sorted in _set_noconvert_columns based + # on the test_usecols_with_parse_dates test from test_usecols.py + class MyTextFileReader(TextFileReader): + def __init__(self): + self._currow = 0 + self.squeeze = False + + class MyCParserWrapper(CParserWrapper): + def _set_noconvert_columns(self): + if self.usecols_dtype == "integer": + # self.usecols is a set, which is documented as unordered + # but in practice, a CPython set of integers is sorted. + # In other implementations this assumption does not hold. + # The following code simulates a different order, which + # before GH 17351 would cause the wrong columns to be + # converted via the parse_dates parameter + self.usecols = list(self.usecols) + self.usecols.reverse() + return CParserWrapper._set_noconvert_columns(self) + + data = """a,b,c,d,e +0,1,20140101,0900,4 +0,1,20140102,1000,4""" + + parse_dates = [[1, 2]] + cols = { + "a": [0, 0], + "c_d": [ + Timestamp("2014-01-01 09:00:00"), + Timestamp("2014-01-02 10:00:00") + ] + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + parser = MyTextFileReader() + parser.options = {"usecols": [0, 2, 3], + "parse_dates": parse_dates, + "delimiter": ","} + parser._engine = MyCParserWrapper(StringIO(data), **parser.options) + + result = parser.read() + tm.assert_frame_equal(result, expected) + + +def test_bytes_io_input(all_parsers): + if compat.PY2: + pytest.skip("Bytes-related test does not need to work on Python 2.x") + + encoding = "cp1255" + parser = all_parsers + + data = BytesIO("שלום:1234\n562:123".encode(encoding)) + result = parser.read_csv(data, sep=":", encoding=encoding) + + expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) + tm.assert_frame_equal(result, expected) + + +def test_empty_decimal_marker(all_parsers): + data = """A|B|C +1|2,334|5 +10|13|10. 
+""" + # Parsers support only length-1 decimals + msg = "Only length-1 decimal markers supported" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), decimal="") + + +def test_bad_stream_exception(all_parsers, csv_dir_path): + # see gh-13652 + # + # This test validates that both the Python engine and C engine will + # raise UnicodeDecodeError instead of C engine raising ParserError + # and swallowing the exception that caused read to fail. + path = os.path.join(csv_dir_path, "sauron.SHIFT_JIS.csv") + codec = codecs.lookup("utf-8") + utf8 = codecs.lookup('utf-8') + parser = all_parsers + + msg = ("'utf-8' codec can't decode byte" if compat.PY3 + else "'utf8' codec can't decode byte") + + # Stream must be binary UTF8. + with open(path, "rb") as handle, codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, + codec.streamwriter) as stream: + + with pytest.raises(UnicodeDecodeError, match=msg): + parser.read_csv(stream) + + +@pytest.mark.skipif(compat.PY2, reason="PY3-only test") +def test_read_csv_local(all_parsers, csv1): + prefix = u("file:///") if compat.is_platform_windows() else u("file://") + parser = all_parsers + + fname = prefix + compat.text_type(os.path.abspath(csv1)) + result = parser.read_csv(fname, index_col=0, parse_dates=True) + + expected = DataFrame([[0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007]], + columns=["A", "B", "C", "D"], + index=Index([datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11)], name="index")) + tm.assert_frame_equal(result, expected) + + +def test_1000_sep(all_parsers): + parser = all_parsers + data = """A|B|C +1|2,334|5 +10|13|10. +""" + expected = DataFrame({ + "A": [1, 10], + "B": [2334, 13], + "C": [5, 10.] + }) + + result = parser.read_csv(StringIO(data), sep="|", thousands=",") + tm.assert_frame_equal(result, expected) + + +def test_squeeze(all_parsers): + data = """\ +a,1 +b,2 +c,3 +""" + parser = all_parsers + index = Index(["a", "b", "c"], name=0) + expected = Series([1, 2, 3], name=1, index=index) + + result = parser.read_csv(StringIO(data), index_col=0, + header=None, squeeze=True) + tm.assert_series_equal(result, expected) + + # see gh-8217 + # + # Series should not be a view. 
+ assert not result._is_view + + +def test_malformed(all_parsers): + # see gh-6607 + parser = all_parsers + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +""" + msg = "Expected 3 fields in line 4, saw 5" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), header=1, comment="#") + + +@pytest.mark.parametrize("nrows", [5, 3, None]) +def test_malformed_chunks(all_parsers, nrows): + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + parser = all_parsers + msg = 'Expected 3 fields in line 6, saw 5' + reader = parser.read_csv(StringIO(data), header=1, comment="#", + iterator=True, chunksize=1, skiprows=[2]) + + with pytest.raises(ParserError, match=msg): + reader.read(nrows) + + +def test_unnamed_columns(all_parsers): + data = """A,B,C,, +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + parser = all_parsers + expected = DataFrame([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]], + dtype=np.int64, columns=["A", "B", "C", + "Unnamed: 3", + "Unnamed: 4"]) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_csv_mixed_type(all_parsers): + data = """A,B,C +a,1,2 +b,3,4 +c,4,5 +""" + parser = all_parsers + expected = DataFrame({"A": ["a", "b", "c"], + "B": [1, 3, 4], + "C": [2, 4, 5]}) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_low_memory_no_rows_with_index(all_parsers): + # see gh-21141 + parser = all_parsers + + if not parser.low_memory: + pytest.skip("This is a low-memory specific test") + + data = """A,B,C +1,1,1,2 +2,2,3,4 +3,3,4,5 +""" + result = parser.read_csv(StringIO(data), low_memory=True, + index_col=0, nrows=0) + expected = DataFrame(columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_dataframe(all_parsers, csv1): + parser = all_parsers + result = parser.read_csv(csv1, index_col=0, parse_dates=True) + + expected = DataFrame([[0.980269, 3.685731, -0.364216805298, -1.159738], + [1.047916, -0.041232, -0.16181208307, 0.212549], + [0.498581, 0.731168, -0.537677223318, 1.346270], + [1.120202, 1.567621, 0.00364077397681, 0.675253], + [-0.487094, 0.571455, -1.6116394093, 0.103469], + [0.836649, 0.246462, 0.588542635376, 1.062782], + [-0.157161, 1.340307, 1.1957779562, -1.097007]], + columns=["A", "B", "C", "D"], + index=Index([datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7), + datetime(2000, 1, 10), + datetime(2000, 1, 11)], name="index")) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_no_index_name(all_parsers, csv_dir_path): + parser = all_parsers + csv2 = os.path.join(csv_dir_path, "test2.csv") + result = parser.read_csv(csv2, index_col=0, parse_dates=True) + + expected = DataFrame([[0.980269, 3.685731, -0.364216805298, + -1.159738, "foo"], + [1.047916, -0.041232, -0.16181208307, + 0.212549, "bar"], + [0.498581, 0.731168, -0.537677223318, + 1.346270, "baz"], + [1.120202, 1.567621, 0.00364077397681, + 0.675253, "qux"], + [-0.487094, 0.571455, -1.6116394093, + 0.103469, "foo2"]], + columns=["A", "B", "C", "D", "E"], + index=Index([datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5), + datetime(2000, 1, 6), + datetime(2000, 1, 7)])) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_unicode(all_parsers): + parser = all_parsers + data = BytesIO(u("\u0141aski, Jan;1").encode("utf-8")) + + result = parser.read_csv(data, sep=";", encoding="utf-8", header=None) 
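+    # Decoding should round-trip the original unicode value.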
+ expected = DataFrame([[u("\u0141aski, Jan"), 1]]) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_wrong_num_columns(all_parsers): + # Too few columns. + data = """A,B,C,D,E,F +1,2,3,4,5,6 +6,7,8,9,10,11,12 +11,12,13,14,15,16 +""" + parser = all_parsers + msg = "Expected 6 fields in line 3, saw 7" + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) + + +def test_read_duplicate_index_explicit(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=0) + + expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], + [12, 13, 14, 15], [12, 13, 14, 15], + [12, 13, 14, 15], [12, 13, 14, 15]], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", + "qux", "foo", "bar"], name="index")) + tm.assert_frame_equal(result, expected) + + +def test_read_duplicate_index_implicit(all_parsers): + data = """A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], + [12, 13, 14, 15], [12, 13, 14, 15], + [12, 13, 14, 15], [12, 13, 14, 15]], + columns=["A", "B", "C", "D"], + index=Index(["foo", "bar", "baz", + "qux", "foo", "bar"])) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data,kwargs,expected", [ + ("A,B\nTrue,1\nFalse,2\nTrue,3", dict(), + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"])), + ("A,B\nYES,1\nno,2\nyes,3\nNo,3\nYes,3", + dict(true_values=["yes", "Yes", "YES"], + false_values=["no", "NO", "No"]), + DataFrame([[True, 1], [False, 2], [True, 3], + [False, 3], [True, 3]], columns=["A", "B"])), + ("A,B\nTRUE,1\nFALSE,2\nTRUE,3", dict(), + DataFrame([[True, 1], [False, 2], [True, 3]], columns=["A", "B"])), + ("A,B\nfoo,bar\nbar,foo", dict(true_values=["foo"], + false_values=["bar"]), + DataFrame([[True, False], [False, True]], columns=["A", "B"])) +]) +def test_parse_bool(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_int_conversion(all_parsers): + data = """A,B +1.0,1 +2.0,2 +3.0,3 +""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["A", "B"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows", [3, 3.0]) +def test_read_nrows(all_parsers, nrows): + # see gh-10476 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + expected = DataFrame([["foo", 2, 3, 4, 5], + ["bar", 7, 8, 9, 10], + ["baz", 12, 13, 14, 15]], + columns=["index", "A", "B", "C", "D"]) + parser = all_parsers + + result = parser.read_csv(StringIO(data), nrows=nrows) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows", [1.2, "foo", -1]) +def test_read_nrows_bad(all_parsers, nrows): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + msg = r"'nrows' must be an integer >=0" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), nrows=nrows) + + +@pytest.mark.parametrize("index_col", [0, "index"]) +def test_read_chunksize_with_index(all_parsers, index_col): + 
parser = all_parsers + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + reader = parser.read_csv(StringIO(data), index_col=0, chunksize=2) + expected = DataFrame([["foo", 2, 3, 4, 5], + ["bar", 7, 8, 9, 10], + ["baz", 12, 13, 14, 15], + ["qux", 12, 13, 14, 15], + ["foo2", 12, 13, 14, 15], + ["bar2", 12, 13, 14, 15]], + columns=["index", "A", "B", "C", "D"]) + expected = expected.set_index("index") + + chunks = list(reader) + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +@pytest.mark.parametrize("chunksize", [1.3, "foo", 0]) +def test_read_chunksize_bad(all_parsers, chunksize): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + msg = r"'chunksize' must be an integer >=1" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), chunksize=chunksize) + + +@pytest.mark.parametrize("chunksize", [2, 8]) +def test_read_chunksize_and_nrows(all_parsers, chunksize): + # see gh-15755 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = dict(index_col=0, nrows=5) + + reader = parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) + expected = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(concat(reader), expected) + + +def test_read_chunksize_and_nrows_changing_size(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = dict(index_col=0, nrows=5) + + reader = parser.read_csv(StringIO(data), chunksize=8, **kwargs) + expected = parser.read_csv(StringIO(data), **kwargs) + + tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2]) + tm.assert_frame_equal(reader.get_chunk(size=4), expected.iloc[2:5]) + + with pytest.raises(StopIteration, match=""): + reader.get_chunk(size=3) + + +def test_get_chunk_passed_chunksize(all_parsers): + parser = all_parsers + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +1,2,3""" + + reader = parser.read_csv(StringIO(data), chunksize=2) + result = reader.get_chunk() + + expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [dict(), dict(index_col=0)]) +def test_read_chunksize_compat(all_parsers, kwargs): + # see gh-12185 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + reader = parser.read_csv(StringIO(data), chunksize=2, **kwargs) + + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(concat(reader), result) + + +def test_read_chunksize_jagged_names(all_parsers): + # see gh-23509 + parser = all_parsers + data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) + + expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) + reader = parser.read_csv(StringIO(data), names=range(10), chunksize=4) + + result = concat(reader) + tm.assert_frame_equal(result, expected) + + +def test_read_data_list(all_parsers): + parser = all_parsers + kwargs = dict(index_col=0) + data = "A,B,C\nfoo,1,2,3\nbar,4,5,6" + + data_list = [["A", "B", "C"], ["foo", "1", "2", "3"], + ["bar", "4", "5", "6"]] + expected = 
parser.read_csv(StringIO(data), **kwargs) + + parser = TextParser(data_list, chunksize=2, **kwargs) + result = parser.read() + + tm.assert_frame_equal(result, expected) + + +def test_iterator(all_parsers): + # see gh-6607 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = dict(index_col=0) + + expected = parser.read_csv(StringIO(data), **kwargs) + reader = parser.read_csv(StringIO(data), iterator=True, **kwargs) + + first_chunk = reader.read(3) + tm.assert_frame_equal(first_chunk, expected[:3]) + + last_chunk = reader.read(5) + tm.assert_frame_equal(last_chunk, expected[3:]) + + +def test_iterator2(all_parsers): + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + + reader = parser.read_csv(StringIO(data), iterator=True) + result = list(reader) + + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"]) + tm.assert_frame_equal(result[0], expected) + + +def test_reader_list(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = dict(index_col=0) + + lines = list(csv.reader(StringIO(data))) + reader = TextParser(lines, chunksize=2, **kwargs) + + expected = parser.read_csv(StringIO(data), **kwargs) + chunks = list(reader) + + tm.assert_frame_equal(chunks[0], expected[:2]) + tm.assert_frame_equal(chunks[1], expected[2:4]) + tm.assert_frame_equal(chunks[2], expected[4:]) + + +def test_reader_list_skiprows(all_parsers): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + parser = all_parsers + kwargs = dict(index_col=0) + + lines = list(csv.reader(StringIO(data))) + reader = TextParser(lines, chunksize=2, skiprows=[1], **kwargs) + + expected = parser.read_csv(StringIO(data), **kwargs) + chunks = list(reader) + + tm.assert_frame_equal(chunks[0], expected[1:3]) + + +def test_iterator_stop_on_chunksize(all_parsers): + # gh-3967: stopping iteration when chunksize is specified + parser = all_parsers + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + + reader = parser.read_csv(StringIO(data), chunksize=1) + result = list(reader) + + assert len(result) == 3 + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=["foo", "bar", "baz"], + columns=["A", "B", "C"]) + tm.assert_frame_equal(concat(result), expected) + + +@pytest.mark.parametrize("kwargs", [ + dict(iterator=True, + chunksize=1), + dict(iterator=True), + dict(chunksize=1) +]) +def test_iterator_skipfooter_errors(all_parsers, kwargs): + msg = "'skipfooter' not supported for 'iteration'" + parser = all_parsers + data = "a\n1\n2" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=1, **kwargs) + + +def test_nrows_skipfooter_errors(all_parsers): + msg = "'skipfooter' not supported with 'nrows'" + data = "a\n1\n2\n3\n4\n5\n6" + parser = all_parsers + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=1, nrows=5) + + +@pytest.mark.parametrize("data,kwargs,expected", [ + ("""foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""", dict(index_col=0, names=["index", "A", "B", "C", "D"]), + DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15], + [12, 13, 14, 15], [12, 13, 14, 15], [12, 13, 14, 15]], + index=Index(["foo", 
"bar", "baz", "qux", + "foo2", "bar2"], name="index"), + columns=["A", "B", "C", "D"])), + ("""foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""", dict(index_col=[0, 1], names=["index1", "index2", "A", "B", "C", "D"]), + DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15], + [12, 13, 14, 15], [12, 13, 14, 15]], + index=MultiIndex.from_tuples([ + ("foo", "one"), ("foo", "two"), ("foo", "three"), + ("bar", "one"), ("bar", "two")], + names=["index1", "index2"]), + columns=["A", "B", "C", "D"])), +]) +def test_pass_names_with_index(all_parsers, data, kwargs, expected): + parser = all_parsers + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) +def test_multi_index_no_level_names(all_parsers, index_col): + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + headless_data = '\n'.join(data.split("\n")[1:]) + + names = ["A", "B", "C", "D"] + parser = all_parsers + + result = parser.read_csv(StringIO(headless_data), + index_col=index_col, + header=None, names=names) + expected = parser.read_csv(StringIO(data), index_col=index_col) + + # No index names in headless data. + expected.index.names = [None] * 2 + tm.assert_frame_equal(result, expected) + + +def test_multi_index_no_level_names_implicit(all_parsers): + parser = all_parsers + data = """A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + result = parser.read_csv(StringIO(data)) + expected = DataFrame([[2, 3, 4, 5], [7, 8, 9, 10], [12, 13, 14, 15], + [12, 13, 14, 15], [12, 13, 14, 15]], + columns=["A", "B", "C", "D"], + index=MultiIndex.from_tuples([ + ("foo", "one"), ("foo", "two"), ("foo", "three"), + ("bar", "one"), ("bar", "two")])) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data,expected,header", [ + ("a,b", DataFrame(columns=["a", "b"]), [0]), + ("a,b\nc,d", DataFrame(columns=MultiIndex.from_tuples( + [("a", "c"), ("b", "d")])), [0, 1]), +]) +@pytest.mark.parametrize("round_trip", [True, False]) +def test_multi_index_blank_df(all_parsers, data, expected, header, round_trip): + # see gh-14545 + parser = all_parsers + data = expected.to_csv(index=False) if round_trip else data + + result = parser.read_csv(StringIO(data), header=header) + tm.assert_frame_equal(result, expected) + + +def test_no_unnamed_index(all_parsers): + parser = all_parsers + data = """ id c0 c1 c2 +0 1 0 a b +1 2 0 c d +2 2 2 e f +""" + result = parser.read_csv(StringIO(data), sep=" ") + expected = DataFrame([[0, 1, 0, "a", "b"], [1, 2, 0, "c", "d"], + [2, 2, 2, "e", "f"]], columns=["Unnamed: 0", "id", + "c0", "c1", "c2"]) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_parse_simple_list(all_parsers): + parser = all_parsers + data = """foo +bar baz +qux foo +foo +bar""" + + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame(["foo", "bar baz", "qux foo", "foo", "bar"]) + tm.assert_frame_equal(result, expected) + + +@tm.network +def test_url(all_parsers, csv_dir_path): + # TODO: FTP testing + parser = all_parsers + kwargs = dict(sep="\t") + + url = ("https://raw.github.com/pandas-dev/pandas/master/" + "pandas/tests/io/parser/data/salaries.csv") + url_result = parser.read_csv(url, **kwargs) + + local_path = os.path.join(csv_dir_path, "salaries.csv") + local_result = 
parser.read_csv(local_path, **kwargs) + tm.assert_frame_equal(url_result, local_result) + + +@pytest.mark.slow +def test_local_file(all_parsers, csv_dir_path): + parser = all_parsers + kwargs = dict(sep="\t") + + local_path = os.path.join(csv_dir_path, "salaries.csv") + local_result = parser.read_csv(local_path, **kwargs) + url = "file://localhost/" + local_path + + try: + url_result = parser.read_csv(url, **kwargs) + tm.assert_frame_equal(url_result, local_result) + except URLError: + # Fails on some systems. + pytest.skip("Failing on: " + " ".join(platform.uname())) + + +def test_path_path_lib(all_parsers): + parser = all_parsers + df = tm.makeDataFrame() + result = tm.round_trip_pathlib( + df.to_csv, lambda p: parser.read_csv(p, index_col=0)) + tm.assert_frame_equal(df, result) + + +def test_path_local_path(all_parsers): + parser = all_parsers + df = tm.makeDataFrame() + result = tm.round_trip_localpath( + df.to_csv, lambda p: parser.read_csv(p, index_col=0)) + tm.assert_frame_equal(df, result) + + +def test_nonexistent_path(all_parsers): + # gh-2428: pls no segfault + # gh-14086: raise more helpful FileNotFoundError + parser = all_parsers + path = "%s.csv" % tm.rands(10) + + msg = ("does not exist" if parser.engine == "c" + else r"\[Errno 2\]") + with pytest.raises(compat.FileNotFoundError, match=msg): + parser.read_csv(path) + + +def test_missing_trailing_delimiters(all_parsers): + parser = all_parsers + data = """A,B,C,D +1,2,3,4 +1,3,3, +1,4,5""" + + result = parser.read_csv(StringIO(data)) + expected = DataFrame([[1, 2, 3, 4], [1, 3, 3, np.nan], + [1, 4, 5, np.nan]], columns=["A", "B", "C", "D"]) + tm.assert_frame_equal(result, expected) + + +def test_skip_initial_space(all_parsers): + data = ('"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, ' + '1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, ' + '314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, ' + '70.06056, 344.98370, 1, 1, -0.689265, -0.692787, ' + '0.212036, 14.7674, 41.605, -9999.0, -9999.0, ' + '-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128') + parser = all_parsers + + result = parser.read_csv(StringIO(data), names=lrange(33), header=None, + na_values=["-9999.0"], skipinitialspace=True) + expected = DataFrame([["09-Apr-2012", "01:10:18.300", 2456026.548822908, + 12849, 1.00361, 1.12551, 330.65659, + 355626618.16711, 73.48821, 314.11625, 1917.09447, + 179.71425, 80.0, 240.0, -350, 70.06056, 344.9837, + 1, 1, -0.689265, -0.692787, 0.212036, 14.7674, + 41.605, np.nan, np.nan, np.nan, np.nan, np.nan, + np.nan, 0, 12, 128]]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("sep", [",", "\t"]) +@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) +def test_utf16_bom_skiprows(all_parsers, sep, encoding): + # see gh-2298 + parser = all_parsers + data = u("""skip this +skip this too +A,B,C +1,2,3 +4,5,6""").replace(",", sep) + path = "__%s__.csv" % tm.rands(10) + kwargs = dict(sep=sep, skiprows=2) + utf8 = "utf-8" + + with tm.ensure_clean(path) as path: + bytes_data = data.encode(encoding) + + with open(path, "wb") as f: + f.write(bytes_data) + + bytes_buffer = BytesIO(data.encode(utf8)) + + if compat.PY3: + from io import TextIOWrapper + bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8) + + result = parser.read_csv(path, encoding=encoding, **kwargs) + expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs) + + bytes_buffer.close() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("buffer", [ + False, + pytest.param(True, 
marks=pytest.mark.skipif( + compat.PY3, reason="Not supported on PY3"))]) +def test_utf16_example(all_parsers, csv_dir_path, buffer): + path = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + + src = BytesIO(open(path, "rb").read()) if buffer else path + result = parser.read_csv(src, encoding="utf-16", sep="\t") + assert len(result) == 50 + + +def test_unicode_encoding(all_parsers, csv_dir_path): + path = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + + result = parser.read_csv(path, header=None, encoding="latin-1") + result = result.set_index(0) + got = result[1][1632] + + expected = u('\xc1 k\xf6ldum klaka (Cold Fever) (1994)') + assert got == expected + + +def test_trailing_delimiters(all_parsers): + # see gh-2442 + data = """A,B,C +1,2,3, +4,5,6, +7,8,9,""" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=False) + + expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]}) + tm.assert_frame_equal(result, expected) + + +def test_escapechar(all_parsers): + # http://stackoverflow.com/questions/13824840/feature-request-for- + # pandas-read-csv + data = '''SEARCH_TERM,ACTUAL_URL +"bra tv bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" +"SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals serie","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa + + parser = all_parsers + result = parser.read_csv(StringIO(data), escapechar='\\', + quotechar='"', encoding='utf-8') + + assert result['SEARCH_TERM'][2] == ('SLAGBORD, "Bergslagen", ' + 'IKEA:s 1700-tals serie') + tm.assert_index_equal(result.columns, + Index(['SEARCH_TERM', 'ACTUAL_URL'])) + + +def test_int64_min_issues(all_parsers): + # see gh-2599 + parser = all_parsers + data = "A,B\n0,0\n0," + result = parser.read_csv(StringIO(data)) + + expected = DataFrame({"A": [0, 0], "B": [0, np.nan]}) + tm.assert_frame_equal(result, expected) + + +def test_parse_integers_above_fp_precision(all_parsers): + data = """Numbers +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000191 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000192 +17007000002000194""" + parser = all_parsers + result = parser.read_csv(StringIO(data)) + expected = DataFrame({"Numbers": [17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000191, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000192, + 17007000002000194]}) + tm.assert_frame_equal(result, expected) + + +def test_chunks_have_consistent_numerical_type(all_parsers): + parser = all_parsers + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["1.0", "2.0"] + integers) + + # Coercions should work without warnings. 
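+    # The integer chunks and the two interleaved float rows should be
+    # coerced to a single float64 column without a DtypeWarning.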
+ with tm.assert_produces_warning(None): + result = parser.read_csv(StringIO(data)) + + assert type(result.a[0]) is np.float64 + assert result.a.dtype == np.float + + +def test_warn_if_chunks_have_mismatched_type(all_parsers): + warning_type = None + parser = all_parsers + integers = [str(i) for i in range(499999)] + data = "a\n" + "\n".join(integers + ["a", "b"] + integers) + + # see gh-3866: if chunks are different types and can't + # be coerced using numerical types, then issue warning. + if parser.engine == "c" and parser.low_memory: + warning_type = DtypeWarning + + with tm.assert_produces_warning(warning_type): + df = parser.read_csv(StringIO(data)) + assert df.a.dtype == np.object + + +@pytest.mark.parametrize("sep", [" ", r"\s+"]) +def test_integer_overflow_bug(all_parsers, sep): + # see gh-2601 + data = "65248E10 11\n55555E55 22\n" + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=None, sep=sep) + expected = DataFrame([[6.5248e14, 11], [5.5555e59, 22]]) + tm.assert_frame_equal(result, expected) + + +def test_catch_too_many_names(all_parsers): + # see gh-5156 + data = """\ +1,2,3 +4,,6 +7,8,9 +10,11,12\n""" + parser = all_parsers + msg = ("Too many columns specified: " + "expected 4 and found 3" if parser.engine == "c" + else "Number of passed names did not match " + "number of header fields in the file") + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), header=0, names=["a", "b", "c", "d"]) + + +def test_ignore_leading_whitespace(all_parsers): + # see gh-3374, gh-6607 + parser = all_parsers + data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9" + result = parser.read_csv(StringIO(data), sep=r"\s+") + + expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]}) + tm.assert_frame_equal(result, expected) + + +def test_chunk_begins_with_newline_whitespace(all_parsers): + # see gh-10022 + parser = all_parsers + data = "\n hello\nworld\n" + + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([" hello", "world"]) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_index(all_parsers): + # see gh-10184 + data = "x,y" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=0) + + expected = DataFrame([], columns=["y"], index=Index([], name="x")) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_multi_index(all_parsers): + # see gh-10467 + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=["x", "y"]) + + expected = DataFrame([], columns=["z"], + index=MultiIndex.from_arrays( + [[]] * 2, names=["x", "y"])) + tm.assert_frame_equal(result, expected) + + +def test_empty_with_reversed_multi_index(all_parsers): + data = "x,y,z" + parser = all_parsers + result = parser.read_csv(StringIO(data), index_col=[1, 0]) + + expected = DataFrame([], columns=["z"], + index=MultiIndex.from_arrays( + [[]] * 2, names=["y", "x"])) + tm.assert_frame_equal(result, expected) + + +def test_float_parser(all_parsers): + # see gh-9565 + parser = all_parsers + data = "45e-1,4.5,45.,inf,-inf" + result = parser.read_csv(StringIO(data), header=None) + + expected = DataFrame([[float(s) for s in data.split(",")]]) + tm.assert_frame_equal(result, expected) + + +def test_scientific_no_exponent(all_parsers): + # see gh-12215 + df = DataFrame.from_dict(OrderedDict([("w", ["2e"]), ("x", ["3E"]), + ("y", ["42e"]), + ("z", ["632E"])])) + data = df.to_csv(index=False) + parser = all_parsers + + for precision in parser.float_precision_choices: + df_roundtrip = 
parser.read_csv(StringIO(data), + float_precision=precision) + tm.assert_frame_equal(df_roundtrip, df) + + +@pytest.mark.parametrize("conv", [None, np.int64, np.uint64]) +def test_int64_overflow(all_parsers, conv): + data = """ID +00013007854817840016671868 +00013007854817840016749251 +00013007854817840016754630 +00013007854817840016781876 +00013007854817840017028824 +00013007854817840017963235 +00013007854817840018860166""" + parser = all_parsers + + if conv is None: + # 13007854817840016671868 > UINT64_MAX, so this + # will overflow and return object as the dtype. + result = parser.read_csv(StringIO(data)) + expected = DataFrame(["00013007854817840016671868", + "00013007854817840016749251", + "00013007854817840016754630", + "00013007854817840016781876", + "00013007854817840017028824", + "00013007854817840017963235", + "00013007854817840018860166"], columns=["ID"]) + tm.assert_frame_equal(result, expected) + else: + # 13007854817840016671868 > UINT64_MAX, so attempts + # to cast to either int64 or uint64 will result in + # an OverflowError being raised. + msg = ("(Python int too large to convert to C long)|" + "(long too big to convert)|" + "(int too big to convert)") + + with pytest.raises(OverflowError, match=msg): + parser.read_csv(StringIO(data), converters={"ID": conv}) + + +@pytest.mark.parametrize("val", [ + np.iinfo(np.uint64).max, + np.iinfo(np.int64).max, + np.iinfo(np.int64).min +]) +def test_int64_uint64_range(all_parsers, val): + # These numbers fall right inside the int64-uint64 + # range, so they should be parsed as string. + parser = all_parsers + result = parser.read_csv(StringIO(str(val)), header=None) + + expected = DataFrame([val]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("val", [ + np.iinfo(np.uint64).max + 1, + np.iinfo(np.int64).min - 1 +]) +def test_outside_int64_uint64_range(all_parsers, val): + # These numbers fall just outside the int64-uint64 + # range, so they should be parsed as string. + parser = all_parsers + result = parser.read_csv(StringIO(str(val)), header=None) + + expected = DataFrame([str(val)]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("exp_data", [[str(-1), str(2**63)], + [str(2**63), str(-1)]]) +def test_numeric_range_too_wide(all_parsers, exp_data): + # No numerical dtype can hold both negative and uint64 + # values, so they should be cast as string. 
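+    # (-1 requires a signed dtype while 2**63 only fits in uint64,
+    # so the column falls back to strings in an object column.)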
+ parser = all_parsers + data = "\n".join(exp_data) + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), header=None) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("iterator", [True, False]) +def test_empty_with_nrows_chunksize(all_parsers, iterator): + # see gh-9535 + parser = all_parsers + expected = DataFrame([], columns=["foo", "bar"]) + + nrows = 10 + data = StringIO("foo,bar\n") + + if iterator: + result = next(iter(parser.read_csv(data, chunksize=nrows))) + else: + result = parser.read_csv(data, nrows=nrows) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data,kwargs,expected,msg", [ + # gh-10728: WHITESPACE_LINE + ("a,b,c\n4,5,6\n ", dict(), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), + + # gh-10548: EAT_LINE_COMMENT + ("a,b,c\n4,5,6\n#comment", dict(comment="#"), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), + + # EAT_CRNL_NOP + ("a,b,c\n4,5,6\n\r", dict(), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), + + # EAT_COMMENT + ("a,b,c\n4,5,6#comment", dict(comment="#"), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), + + # SKIP_LINE + ("a,b,c\n4,5,6\nskipme", dict(skiprows=[2]), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), + + # EAT_LINE_COMMENT + ("a,b,c\n4,5,6\n#comment", dict(comment="#", skip_blank_lines=False), + DataFrame([[4, 5, 6]], columns=["a", "b", "c"]), None), + + # IN_FIELD + ("a,b,c\n4,5,6\n ", dict(skip_blank_lines=False), + DataFrame([["4", 5, 6], [" ", None, None]], + columns=["a", "b", "c"]), None), + + # EAT_CRNL + ("a,b,c\n4,5,6\n\r", dict(skip_blank_lines=False), + DataFrame([[4, 5, 6], [None, None, None]], + columns=["a", "b", "c"]), None), + + # ESCAPED_CHAR + ("a,b,c\n4,5,6\n\\", dict(escapechar="\\"), + None, "(EOF following escape character)|(unexpected end of data)"), + + # ESCAPE_IN_QUOTED_FIELD + ('a,b,c\n4,5,6\n"\\', dict(escapechar="\\"), + None, "(EOF inside string starting at row 2)|(unexpected end of data)"), + + # IN_QUOTED_FIELD + ('a,b,c\n4,5,6\n"', dict(escapechar="\\"), + None, "(EOF inside string starting at row 2)|(unexpected end of data)"), +], ids=["whitespace-line", "eat-line-comment", "eat-crnl-nop", "eat-comment", + "skip-line", "eat-line-comment", "in-field", "eat-crnl", + "escaped-char", "escape-in-quoted-field", "in-quoted-field"]) +def test_eof_states(all_parsers, data, kwargs, expected, msg): + # see gh-10728, gh-10548 + parser = all_parsers + + if expected is None: + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]]) +def test_uneven_lines_with_usecols(all_parsers, usecols): + # see gh-12203 + parser = all_parsers + data = r"""a,b,c +0,1,2 +3,4,5,6,7 +8,9,10""" + + if usecols is None: + # Make sure that an error is still raised + # when the "usecols" parameter is not provided. + msg = r"Expected \d+ fields in line \d+, saw \d+" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) + else: + expected = DataFrame({ + "a": [0, 3, 8], + "b": [1, 4, 9] + }) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data,kwargs,expected", [ + # First, check to see that the response of parser when faced with no + # provided columns raises the correct error, with or without usecols. 
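+    # expected=None marks the cases that must raise EmptyDataError.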
+ ("", dict(), None), + ("", dict(usecols=["X"]), None), + (",,", dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), + DataFrame(columns=["X"], index=[0], dtype=np.float64)), + ("", dict(names=["Dummy", "X", "Dummy_2"], usecols=["X"]), + DataFrame(columns=["X"])), +]) +def test_read_empty_with_usecols(all_parsers, data, kwargs, expected): + # see gh-12493 + parser = all_parsers + + if expected is None: + msg = "No columns to parse from file" + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs,expected", [ + # gh-8661, gh-8679: this should ignore six lines, including + # lines with trailing whitespace and blank lines. + (dict(header=None, delim_whitespace=True, skiprows=[0, 1, 2, 3, 5, 6], + skip_blank_lines=True), DataFrame([[1., 2., 4.], + [5.1, np.nan, 10.]])), + + # gh-8983: test skipping set of rows after a row with trailing spaces. + (dict(delim_whitespace=True, skiprows=[1, 2, 3, 5, 6], + skip_blank_lines=True), DataFrame({"A": [1., 5.1], + "B": [2., np.nan], + "C": [4., 10]})), +]) +def test_trailing_spaces(all_parsers, kwargs, expected): + data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa + parser = all_parsers + + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_raise_on_sep_with_delim_whitespace(all_parsers): + # see gh-6607 + data = "a b c\n1 2 3" + parser = all_parsers + + with pytest.raises(ValueError, match="you can only specify one"): + parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) + + +@pytest.mark.parametrize("delim_whitespace", [True, False]) +def test_single_char_leading_whitespace(all_parsers, delim_whitespace): + # see gh-9710 + parser = all_parsers + data = """\ +MyColumn +a +b +a +b\n""" + + expected = DataFrame({"MyColumn": list("abab")}) + result = parser.read_csv(StringIO(data), skipinitialspace=True, + delim_whitespace=delim_whitespace) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("sep,skip_blank_lines,exp_data", [ + (",", True, [[1., 2., 4.], [5., np.nan, 10.], [-70., .4, 1.]]), + (r"\s+", True, [[1., 2., 4.], [5., np.nan, 10.], [-70., .4, 1.]]), + (",", False, [[1., 2., 4.], [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], [5., np.nan, 10.], + [np.nan, np.nan, np.nan], [-70., .4, 1.]]), +]) +def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data): + parser = all_parsers + data = """\ +A,B,C +1,2.,4. + + +5.,NaN,10.0 + +-70,.4,1 +""" + + if sep == r"\s+": + data = data.replace(",", " ") + + result = parser.read_csv(StringIO(data), sep=sep, + skip_blank_lines=skip_blank_lines) + expected = DataFrame(exp_data, columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +def test_whitespace_lines(all_parsers): + parser = all_parsers + data = """ + +\t \t\t +\t +A,B,C +\t 1,2.,4. 
+5.,NaN,10.0 +""" + expected = DataFrame([[1, 2., 4.], [5., np.nan, 10.]], + columns=["A", "B", "C"]) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data,expected", [ + (""" A B C D +a 1 2 3 4 +b 1 2 3 4 +c 1 2 3 4 +""", DataFrame([[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + columns=["A", "B", "C", "D"], index=["a", "b", "c"])), + (" a b c\n1 2 3 \n4 5 6\n 7 8 9", + DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])), +]) +def test_whitespace_regex_separator(all_parsers, data, expected): + # see gh-6607 + parser = all_parsers + result = parser.read_csv(StringIO(data), sep=r"\s+") + tm.assert_frame_equal(result, expected) + + +@tm.capture_stdout +def test_verbose_read(all_parsers): + parser = all_parsers + data = """a,b,c,d +one,1,2,3 +one,1,2,3 +,1,2,3 +one,1,2,3 +,1,2,3 +,1,2,3 +one,1,2,3 +two,1,2,3""" + + # Engines are verbose in different ways. + parser.read_csv(StringIO(data), verbose=True) + output = sys.stdout.getvalue() + + if parser.engine == "c": + assert "Tokenization took:" in output + assert "Parser memory cleanup took:" in output + else: # Python engine + assert output == "Filled 3 NA values in column a\n" + + +@tm.capture_stdout +def test_verbose_read2(all_parsers): + parser = all_parsers + data = """a,b,c,d +one,1,2,3 +two,1,2,3 +three,1,2,3 +four,1,2,3 +five,1,2,3 +,1,2,3 +seven,1,2,3 +eight,1,2,3""" + + parser.read_csv(StringIO(data), verbose=True, index_col=0) + output = sys.stdout.getvalue() + + # Engines are verbose in different ways. + if parser.engine == "c": + assert "Tokenization took:" in output + assert "Parser memory cleanup took:" in output + else: # Python engine + assert output == "Filled 1 NA values in column a\n" + + +def test_iteration_open_handle(all_parsers): + parser = all_parsers + kwargs = dict(squeeze=True, header=None) + + with tm.ensure_clean() as path: + with open(path, "wb" if compat.PY2 else "w") as f: + f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") + + with open(path, "rb" if compat.PY2 else "r") as f: + for line in f: + if "CCC" in line: + break + + if parser.engine == "c" and compat.PY2: + msg = "Mixing iteration and read methods would lose data" + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, **kwargs) + else: + result = parser.read_csv(f, **kwargs) + expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("data,thousands,decimal", [ + ("""A|B|C +1|2,334.01|5 +10|13|10. +""", ",", "."), + ("""A|B|C +1|2.334,01|5 +10|13|10, +""", ".", ","), +]) +def test_1000_sep_with_decimal(all_parsers, data, thousands, decimal): + parser = all_parsers + expected = DataFrame({ + "A": [1, 10], + "B": [2334.01, 13], + "C": [5, 10.] 
+ }) + + result = parser.read_csv(StringIO(data), sep="|", + thousands=thousands, + decimal=decimal) + tm.assert_frame_equal(result, expected) + + +def test_euro_decimal_format(all_parsers): + parser = all_parsers + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + + result = parser.read_csv(StringIO(data), sep=";", decimal=",") + expected = DataFrame([ + [1, 1521.1541, 187101.9543, "ABC", "poi", 4.738797819], + [2, 121.12, 14897.76, "DEF", "uyt", 0.377320872], + [3, 878.158, 108013.434, "GHI", "rez", 2.735694704] + ], columns=["Id", "Number1", "Number2", "Text1", "Text2", "Number3"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("na_filter", [True, False]) +def test_inf_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,inf +b,-inf +c,+Inf +d,-Inf +e,INF +f,-INF +g,+INf +h,-INf +i,inF +j,-inF""" + expected = DataFrame({"A": [float("inf"), float("-inf")] * 5}, + index=["a", "b", "c", "d", "e", + "f", "g", "h", "i", "j"]) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) +def test_raise_on_no_columns(all_parsers, nrows): + parser = all_parsers + data = "\n" * nrows + + msg = "No columns to parse from file" + with pytest.raises(EmptyDataError, match=msg): + parser.read_csv(StringIO(data)) + + +def test_memory_map(all_parsers, csv_dir_path): + mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") + parser = all_parsers + + expected = DataFrame({ + "a": [1, 2, 3], + "b": ["one", "two", "three"], + "c": ["I", "II", "III"] + }) + + result = parser.read_csv(mmap_file, memory_map=True) + tm.assert_frame_equal(result, expected) + + +def test_null_byte_char(all_parsers): + # see gh-2741 + data = "\x00,foo" + names = ["a", "b"] + parser = all_parsers + + if parser.engine == "c": + expected = DataFrame([[np.nan, "foo"]], columns=names) + out = parser.read_csv(StringIO(data), names=names) + tm.assert_frame_equal(out, expected) + else: + msg = "NULL byte detected" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), names=names) + + +@pytest.mark.parametrize("data,kwargs,expected", [ + # Basic test + ("a\n1", dict(), DataFrame({"a": [1]})), + + # "Regular" quoting + ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), + + # Test in a data row instead of header + ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), + + # Test in empty data row with skipping + ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), + + # Test in empty data row without skipping + ("\n1", dict(names=["a"], skip_blank_lines=False), + DataFrame({"a": [np.nan, 1]})), +]) +def test_utf8_bom(all_parsers, data, kwargs, expected): + # see gh-4793 + parser = all_parsers + bom = u("\ufeff") + utf8 = "utf-8" + + def _encode_data_with_bom(_data): + bom_data = (bom + _data).encode(utf8) + return BytesIO(bom_data) + + result = parser.read_csv(_encode_data_with_bom(data), + encoding=utf8, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_temporary_file(all_parsers): + # see gh-13398 + parser = all_parsers + data = "0 0" + + new_file = TemporaryFile("w+") + new_file.write(data) + new_file.flush() + new_file.seek(0) + + result = parser.read_csv(new_file, sep=r"\s+", header=None) + new_file.close() + + expected = DataFrame([[0, 0]]) + tm.assert_frame_equal(result, 
expected) + + +@pytest.mark.parametrize("byte", [8, 16]) +@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", + "UTF-{0}", "UTF_{0}"]) +def test_read_csv_utf_aliases(all_parsers, byte, fmt): + # see gh-13549 + expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) + parser = all_parsers + + encoding = fmt.format(byte) + data = "mb_num,multibyte\n4.8,test".encode(encoding) + + result = parser.read_csv(BytesIO(data), encoding=encoding) + tm.assert_frame_equal(result, expected) + + +def test_internal_eof_byte(all_parsers): + # see gh-5500 + parser = all_parsers + data = "a,b\n1\x1a,2" + + expected = DataFrame([["1\x1a", 2]], columns=["a", "b"]) + result = parser.read_csv(StringIO(data)) + tm.assert_frame_equal(result, expected) + + +def test_internal_eof_byte_to_file(all_parsers): + # see gh-16559 + parser = all_parsers + data = b'c1,c2\r\n"test \x1a test", test\r\n' + expected = DataFrame([["test \x1a test", " test"]], + columns=["c1", "c2"]) + path = "__%s__.csv" % tm.rands(10) + + with tm.ensure_clean(path) as path: + with open(path, "wb") as f: + f.write(data) + + result = parser.read_csv(path) + tm.assert_frame_equal(result, expected) + + +def test_sub_character(all_parsers, csv_dir_path): + # see gh-16893 + filename = os.path.join(csv_dir_path, "sub_char.csv") + expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) + + parser = all_parsers + result = parser.read_csv(filename) + tm.assert_frame_equal(result, expected) + + +def test_file_handle_string_io(all_parsers): + # gh-14418 + # + # Don't close user provided file handles. + parser = all_parsers + data = "a,b\n1,2" + + fh = StringIO(data) + parser.read_csv(fh) + assert not fh.closed + + +def test_file_handles_with_open(all_parsers, csv1): + # gh-14418 + # + # Don't close user provided file handles. + parser = all_parsers + + with open(csv1, "r") as f: + parser.read_csv(f) + assert not f.closed + + +def test_invalid_file_buffer_class(all_parsers): + # see gh-15337 + class InvalidBuffer(object): + pass + + parser = all_parsers + msg = "Invalid file path or buffer object type" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(InvalidBuffer()) + + +def test_invalid_file_buffer_mock(all_parsers, mock): + # see gh-15337 + parser = all_parsers + msg = "Invalid file path or buffer object type" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(mock.Mock()) + + +def test_valid_file_buffer_seems_invalid(all_parsers): + # gh-16135: we want to ensure that "tell" and "seek" + # aren't actually being used when we call `read_csv` + # + # Thus, while the object may look "invalid" (these + # methods are attributes of the `StringIO` class), + # it is still a valid file-object for our purposes. + class NoSeekTellBuffer(StringIO): + def tell(self): + raise AttributeError("No tell method") + + def seek(self, pos, whence=0): + raise AttributeError("No seek method") + + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(NoSeekTellBuffer(data)) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [ + dict(), # Default is True. + dict(error_bad_lines=True), # Explicitly pass in. 
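+    # Note: "warn_bad_lines" (parametrized below) should not suppress
+    # the ParserError when "error_bad_lines" is in effect.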
+]) +@pytest.mark.parametrize("warn_kwargs", [ + dict(), dict(warn_bad_lines=True), + dict(warn_bad_lines=False) +]) +def test_error_bad_lines(all_parsers, kwargs, warn_kwargs): + # see gh-15925 + parser = all_parsers + kwargs.update(**warn_kwargs) + data = "a\n1\n1,2,3\n4\n5,6,7" + + msg = "Expected 1 fields in line 3, saw 3" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + + +@tm.capture_stderr +def test_warn_bad_lines(all_parsers): + # see gh-15925 + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + expected = DataFrame({"a": [1, 4]}) + + result = parser.read_csv(StringIO(data), + error_bad_lines=False, + warn_bad_lines=True) + tm.assert_frame_equal(result, expected) + + val = sys.stderr.getvalue() + assert "Skipping line 3" in val + assert "Skipping line 5" in val + + +@tm.capture_stderr +def test_suppress_error_output(all_parsers): + # see gh-15925 + parser = all_parsers + data = "a\n1\n1,2,3\n4\n5,6,7" + expected = DataFrame({"a": [1, 4]}) + + result = parser.read_csv(StringIO(data), + error_bad_lines=False, + warn_bad_lines=False) + tm.assert_frame_equal(result, expected) + + val = sys.stderr.getvalue() + assert val == "" + + +def test_read_table_deprecated(all_parsers): + # see gh-21948 + parser = all_parsers + data = "a\tb\n1\t2\n3\t4" + expected = parser.read_csv(StringIO(data), sep="\t") + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = parser.read_table(StringIO(data)) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py deleted file mode 100644 index 11389a943bea2..0000000000000 --- a/pandas/tests/io/parser/test_parsers.py +++ /dev/null @@ -1,143 +0,0 @@ -# -*- coding: utf-8 -*- - -import os - -import pytest - -from pandas._libs.tslib import Timestamp -from pandas.compat import StringIO -from pandas.errors import AbstractMethodError - -from pandas import DataFrame, read_csv, read_table -import pandas.util.testing as tm - -from .common import ParserTests -from .python_parser_only import PythonParserTests -from .quoting import QuotingTests -from .usecols import UsecolsTests - - -class BaseParser(ParserTests, UsecolsTests, - QuotingTests): - - def read_csv(self, *args, **kwargs): - raise NotImplementedError - - def read_table(self, *args, **kwargs): - raise NotImplementedError - - def float_precision_choices(self): - raise AbstractMethodError(self) - - @pytest.fixture(autouse=True) - def setup_method(self, datapath): - self.dirpath = datapath('io', 'parser', 'data') - self.csv1 = os.path.join(self.dirpath, 'test1.csv') - self.csv2 = os.path.join(self.dirpath, 'test2.csv') - self.xls1 = os.path.join(self.dirpath, 'test.xls') - self.csv_shiftjs = os.path.join(self.dirpath, 'sauron.SHIFT_JIS.csv') - - -class TestCParserHighMemory(BaseParser): - engine = 'c' - low_memory = False - float_precision_choices = [None, 'high', 'round_trip'] - - def read_csv(self, *args, **kwds): - kwds = kwds.copy() - kwds['engine'] = self.engine - kwds['low_memory'] = self.low_memory - return read_csv(*args, **kwds) - - def read_table(self, *args, **kwds): - kwds = kwds.copy() - kwds['engine'] = self.engine - kwds['low_memory'] = self.low_memory - with tm.assert_produces_warning(FutureWarning): - df = read_table(*args, **kwds) - return df - - -class TestCParserLowMemory(BaseParser): - engine = 'c' - low_memory = True - float_precision_choices = [None, 'high', 'round_trip'] - - def read_csv(self, *args, **kwds): - kwds = kwds.copy() - 
kwds['engine'] = self.engine
-        kwds['low_memory'] = self.low_memory
-        return read_csv(*args, **kwds)
-
-    def read_table(self, *args, **kwds):
-        kwds = kwds.copy()
-        kwds['engine'] = self.engine
-        kwds['low_memory'] = True
-        with tm.assert_produces_warning(FutureWarning):
-            df = read_table(*args, **kwds)
-        return df
-
-
-class TestPythonParser(BaseParser, PythonParserTests):
-    engine = 'python'
-    float_precision_choices = [None]
-
-    def read_csv(self, *args, **kwds):
-        kwds = kwds.copy()
-        kwds['engine'] = self.engine
-        return read_csv(*args, **kwds)
-
-    def read_table(self, *args, **kwds):
-        kwds = kwds.copy()
-        kwds['engine'] = self.engine
-        with tm.assert_produces_warning(FutureWarning):
-            df = read_table(*args, **kwds)
-        return df
-
-
-class TestUnsortedUsecols(object):
-    def test_override__set_noconvert_columns(self):
-        # GH 17351 - usecols needs to be sorted in _set_noconvert_columns
-        # based on the test_usecols_with_parse_dates test from usecols.py
-        from pandas.io.parsers import CParserWrapper, TextFileReader
-
-        s = """a,b,c,d,e
-        0,1,20140101,0900,4
-        0,1,20140102,1000,4"""
-
-        parse_dates = [[1, 2]]
-        cols = {
-            'a': [0, 0],
-            'c_d': [
-                Timestamp('2014-01-01 09:00:00'),
-                Timestamp('2014-01-02 10:00:00')
-            ]
-        }
-        expected = DataFrame(cols, columns=['c_d', 'a'])
-
-        class MyTextFileReader(TextFileReader):
-            def __init__(self):
-                self._currow = 0
-                self.squeeze = False
-
-        class MyCParserWrapper(CParserWrapper):
-            def _set_noconvert_columns(self):
-                if self.usecols_dtype == 'integer':
-                    # self.usecols is a set, which is documented as unordered
-                    # but in practice, a CPython set of integers is sorted.
-                    # In other implementations this assumption does not hold.
-                    # The following code simulates a different order, which
-                    # before GH 17351 would cause the wrong columns to be
-                    # converted via the parse_dates parameter
-                    self.usecols = list(self.usecols)
-                    self.usecols.reverse()
-                return CParserWrapper._set_noconvert_columns(self)
-
-        parser = MyTextFileReader()
-        parser.options = {'usecols': [0, 2, 3],
-                          'parse_dates': parse_dates,
-                          'delimiter': ','}
-        parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
-        df = parser.read()
-
-        tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
new file mode 100644
index 0000000000000..d5a7e3549ef0f
--- /dev/null
+++ b/pandas/tests/io/parser/test_python_parser_only.py
@@ -0,0 +1,303 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that apply specifically to the Python parser. Unless specifically
+stated as a Python-specific issue, the goal is to eventually move as many
+of these tests as possible out of this module once the C parser can accept
+further arguments when parsing.
+"""
+
+import csv
+import sys
+
+import pytest
+
+import pandas.compat as compat
+from pandas.compat import BytesIO, StringIO, u
+from pandas.errors import ParserError
+
+from pandas import DataFrame, Index, MultiIndex
+import pandas.util.testing as tm
+
+
+def test_default_separator(python_parser_only):
+    # see gh-17333
+    #
+    # csv.Sniffer in Python treats "o" as separator.
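+    #
+    # With "sep=None", the Python engine defers to csv.Sniffer to detect
+    # the delimiter, so "aob" is split into columns "a" and "b".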
+ data = "aob\n1o2\n3o4" + parser = python_parser_only + expected = DataFrame({"a": [1, 3], "b": [2, 4]}) + + result = parser.read_csv(StringIO(data), sep=None) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("skipfooter", ["foo", 1.5, True]) +def test_invalid_skipfooter_non_int(python_parser_only, skipfooter): + # see gh-15925 (comment) + data = "a\n1\n2" + parser = python_parser_only + msg = "skipfooter must be an integer" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=skipfooter) + + +def test_invalid_skipfooter_negative(python_parser_only): + # see gh-15925 (comment) + data = "a\n1\n2" + parser = python_parser_only + msg = "skipfooter cannot be negative" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), skipfooter=-1) + + +@pytest.mark.parametrize("kwargs", [ + dict(sep=None), + dict(delimiter="|") +]) +def test_sniff_delimiter(python_parser_only, kwargs): + data = """index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + parser = python_parser_only + result = parser.read_csv(StringIO(data), index_col=0, **kwargs) + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("encoding", [None, "utf-8"]) +def test_sniff_delimiter_encoding(python_parser_only, encoding): + parser = python_parser_only + data = """ignore this +ignore this too +index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + + if encoding is not None: + data = u(data).encode(encoding) + data = BytesIO(data) + + if compat.PY3: + from io import TextIOWrapper + data = TextIOWrapper(data, encoding=encoding) + else: + data = StringIO(data) + + result = parser.read_csv(data, index_col=0, sep=None, + skiprows=2, encoding=encoding) + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"], + index=Index(["foo", "bar", "baz"], name="index")) + tm.assert_frame_equal(result, expected) + + +def test_single_line(python_parser_only): + # see gh-6607: sniff separator + parser = python_parser_only + result = parser.read_csv(StringIO("1,2"), names=["a", "b"], + header=None, sep=None) + + expected = DataFrame({"a": [1], "b": [2]}) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs", [dict(skipfooter=2), dict(nrows=3)]) +def test_skipfooter(python_parser_only, kwargs): + # see gh-6607 + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +want to skip this +also also skip this +""" + parser = python_parser_only + result = parser.read_csv(StringIO(data), **kwargs) + + expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("compression,klass", [ + ("gzip", "GzipFile"), + ("bz2", "BZ2File"), +]) +def test_decompression_regex_sep(python_parser_only, csv1, compression, klass): + # see gh-6607 + parser = python_parser_only + + with open(csv1, "rb") as f: + data = f.read() + + data = data.replace(b",", b"::") + expected = parser.read_csv(csv1) + + module = pytest.importorskip(compression) + klass = getattr(module, klass) + + with tm.ensure_clean() as path: + tmp = klass(path, mode="wb") + tmp.write(data) + tmp.close() + + result = parser.read_csv(path, sep="::", + compression=compression) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_buglet_4x_multi_index(python_parser_only): + # see gh-6607 + data = """ A B C D E +one two three four +a b 10.0032 5 
-0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + parser = python_parser_only + + expected = DataFrame([[-0.5109, -2.3358, -0.4645, 0.05076, 0.3640], + [0.4473, 1.4152, 0.2834, 1.00661, 0.1744], + [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838]], + columns=["A", "B", "C", "D", "E"], + index=MultiIndex.from_tuples([ + ("a", "b", 10.0032, 5), + ("a", "q", 20, 4), + ("x", "q", 30, 3), + ], names=["one", "two", "three", "four"])) + result = parser.read_csv(StringIO(data), sep=r"\s+") + tm.assert_frame_equal(result, expected) + + +def test_read_csv_buglet_4x_multi_index2(python_parser_only): + # see gh-6893 + data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9" + parser = python_parser_only + + expected = DataFrame.from_records( + [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)], + columns=list("abcABC"), index=list("abc")) + result = parser.read_csv(StringIO(data), sep=r"\s+") + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("add_footer", [True, False]) +def test_skipfooter_with_decimal(python_parser_only, add_footer): + # see gh-6971 + data = "1#2\n3#4" + parser = python_parser_only + expected = DataFrame({"a": [1.2, 3.4]}) + + if add_footer: + # The stray footer line should not mess with the + # casting of the first two lines if we skip it. + kwargs = dict(skipfooter=1) + data += "\nFooter" + else: + kwargs = dict() + + result = parser.read_csv(StringIO(data), names=["a"], + decimal="#", **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("sep", ["::", "#####", "!!!", "123", "#1!c5", + "%!c!d", "@@#4:2", "_!pd#_"]) +@pytest.mark.parametrize("encoding", ["utf-16", "utf-16-be", "utf-16-le", + "utf-32", "cp037"]) +def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding): + # see gh-3404 + expected = DataFrame({"a": [1], "b": [2]}) + parser = python_parser_only + + data = "1" + sep + "2" + encoded_data = data.encode(encoding) + + result = parser.read_csv(BytesIO(encoded_data), sep=sep, + names=["a", "b"], encoding=encoding) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) +def test_multi_char_sep_quotes(python_parser_only, quoting): + # see gh-13374 + kwargs = dict(sep=",,") + parser = python_parser_only + + data = 'a,,b\n1,,a\n2,,"2,,b"' + msg = "ignored when a multi-char delimiter is used" + + def fail_read(): + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), quoting=quoting, **kwargs) + + if quoting == csv.QUOTE_NONE: + # We expect no match, so there should be an assertion + # error out of the inner context manager. + with pytest.raises(AssertionError): + fail_read() + else: + fail_read() + + +@tm.capture_stderr +def test_none_delimiter(python_parser_only): + # see gh-13374 and gh-17465 + parser = python_parser_only + data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9" + expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]}) + + # We expect the third line in the data to be + # skipped because it is malformed, but we do + # not expect any errors to occur. 
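+    # The warning is emitted on stderr, which @tm.capture_stderr
+    # captures for the assertion below.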
+ result = parser.read_csv(StringIO(data), header=0, + sep=None, warn_bad_lines=True, + error_bad_lines=False) + tm.assert_frame_equal(result, expected) + + warning = sys.stderr.getvalue() + assert "Skipping line 3" in warning + + +@pytest.mark.parametrize("data", [ + 'a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) +@pytest.mark.parametrize("skipfooter", [0, 1]) +def test_skipfooter_bad_row(python_parser_only, data, skipfooter): + # see gh-13879 and gh-15910 + msg = "parsing errors in the skipped footer rows" + parser = python_parser_only + + def fail_read(): + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), skipfooter=skipfooter) + + if skipfooter: + fail_read() + else: + # We expect no match, so there should be an assertion + # error out of the inner context manager. + with pytest.raises(AssertionError): + fail_read() + + +def test_malformed_skipfooter(python_parser_only): + parser = python_parser_only + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +footer +""" + msg = "Expected 3 fields in line 4, saw 5" + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data), header=1, + comment="#", skipfooter=1) diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py new file mode 100644 index 0000000000000..b33a1b8448bea --- /dev/null +++ b/pandas/tests/io/parser/test_quoting.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- + +""" +Tests that quoting specifications are properly handled +during parsing for all of the parsers defined in parsers.py +""" + +import csv + +import pytest + +from pandas.compat import PY2, StringIO, u +from pandas.errors import ParserError + +from pandas import DataFrame +import pandas.util.testing as tm + + +@pytest.mark.parametrize("kwargs,msg", [ + (dict(quotechar="foo"), '"quotechar" must be a(n)? 
1-character string'), + (dict(quotechar=None, quoting=csv.QUOTE_MINIMAL), + "quotechar must be set if quoting enabled"), + (dict(quotechar=2), '"quotechar" must be string, not int') +]) +def test_bad_quote_char(all_parsers, kwargs, msg): + data = "1,2,3" + parser = all_parsers + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + + +@pytest.mark.parametrize("quoting,msg", [ + ("foo", '"quoting" must be an integer'), + (5, 'bad "quoting" value'), # quoting must be in the range [0, 3] +]) +def test_bad_quoting(all_parsers, quoting, msg): + data = "1,2,3" + parser = all_parsers + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), quoting=quoting) + + +def test_quote_char_basic(all_parsers): + parser = all_parsers + data = 'a,b,c\n1,2,"cat"' + expected = DataFrame([[1, 2, "cat"]], + columns=["a", "b", "c"]) + + result = parser.read_csv(StringIO(data), quotechar='"') + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"]) +def test_quote_char_various(all_parsers, quote_char): + parser = all_parsers + expected = DataFrame([[1, 2, "cat"]], + columns=["a", "b", "c"]) + + data = 'a,b,c\n1,2,"cat"' + new_data = data.replace('"', quote_char) + + result = parser.read_csv(StringIO(new_data), quotechar=quote_char) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE]) +@pytest.mark.parametrize("quote_char", ["", None]) +def test_null_quote_char(all_parsers, quoting, quote_char): + kwargs = dict(quotechar=quote_char, quoting=quoting) + data = "a,b,c\n1,2,3" + parser = all_parsers + + if quoting != csv.QUOTE_NONE: + # Sanity checking. + msg = "quotechar must be set if quoting enabled" + + with pytest.raises(TypeError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("kwargs,exp_data", [ + (dict(), [[1, 2, "foo"]]), # Test default. + + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + (dict(quotechar='"', quoting=csv.QUOTE_MINIMAL), [[1, 2, "foo"]]), + + # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading. + (dict(quotechar='"', quoting=csv.QUOTE_ALL), [[1, 2, "foo"]]), + + # QUOTE_NONE tells the reader to do no special handling + # of quote characters and leave them alone. 
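+    # The quote characters therefore survive in the parsed value.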
+ (dict(quotechar='"', quoting=csv.QUOTE_NONE), [[1, 2, '"foo"']]), + + # QUOTE_NONNUMERIC tells the reader to cast + # all non-quoted fields to float + (dict(quotechar='"', quoting=csv.QUOTE_NONNUMERIC), [[1.0, 2.0, "foo"]]) +]) +def test_quoting_various(all_parsers, kwargs, exp_data): + data = '1,2,"foo"' + parser = all_parsers + columns = ["a", "b", "c"] + + result = parser.read_csv(StringIO(data), names=columns, **kwargs) + expected = DataFrame(exp_data, columns=columns) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("doublequote,exp_data", [ + (True, [[3, '4 " 5']]), + (False, [[3, '4 " 5"']]), +]) +def test_double_quote(all_parsers, doublequote, exp_data): + parser = all_parsers + data = 'a,b\n3,"4 "" 5"' + + result = parser.read_csv(StringIO(data), quotechar='"', + doublequote=doublequote) + expected = DataFrame(exp_data, columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("quotechar", [ + u('"'), + pytest.param(u('\u0001'), marks=pytest.mark.skipif( + PY2, reason="Python 2.x does not handle unicode well."))]) +def test_quotechar_unicode(all_parsers, quotechar): + # see gh-14477 + data = "a\n1" + parser = all_parsers + expected = DataFrame({"a": [1]}) + + result = parser.read_csv(StringIO(data), quotechar=quotechar) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("balanced", [True, False]) +def test_unbalanced_quoting(all_parsers, balanced): + # see gh-22789. + parser = all_parsers + data = "a,b,c\n1,2,\"3" + + if balanced: + # Re-balance the quoting and read in without errors. + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + result = parser.read_csv(StringIO(data + '"')) + tm.assert_frame_equal(result, expected) + else: + msg = ("EOF inside string starting at row 1" if parser.engine == "c" + else "unexpected end of data") + + with pytest.raises(ParserError, match=msg): + parser.read_csv(StringIO(data)) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py new file mode 100644 index 0000000000000..d2ec1cf49445f --- /dev/null +++ b/pandas/tests/io/parser/test_usecols.py @@ -0,0 +1,533 @@ +# -*- coding: utf-8 -*- + +""" +Tests the usecols functionality during parsing +for all of the parsers defined in parsers.py +""" + +import numpy as np +import pytest + +from pandas._libs.tslib import Timestamp +from pandas.compat import PY2, StringIO + +from pandas import DataFrame, Index +import pandas.util.testing as tm + +_msg_validate_usecols_arg = ("'usecols' must either be list-like " + "of all strings, all unicode, all " + "integers or a callable.") +_msg_validate_usecols_names = ("Usecols do not match columns, columns " + "expected but not found: {0}") + + +def test_raise_on_mixed_dtype_usecols(all_parsers): + # See gh-12678 + data = """a,b,c + 1000,2000,3000 + 4000,5000,6000 + """ + usecols = [0, "b", 2] + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + +@pytest.mark.parametrize("usecols", [(1, 2), ("b", "c")]) +def test_usecols(all_parsers, usecols): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], + [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_names(all_parsers): + data = """\ +a,b,c +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + names = ["foo", 
"bar"] + result = parser.read_csv(StringIO(data), names=names, + usecols=[1, 2], header=0) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], + [11, 12]], columns=names) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("names,usecols", [ + (["b", "c"], [1, 2]), + (["a", "b", "c"], ["b", "c"]) +]) +def test_usecols_relative_to_names(all_parsers, names, usecols): + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + result = parser.read_csv(StringIO(data), names=names, + header=None, usecols=usecols) + + expected = DataFrame([[2, 3], [5, 6], [8, 9], + [11, 12]], columns=["b", "c"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_relative_to_names2(all_parsers): + # see gh-5766 + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + result = parser.read_csv(StringIO(data), names=["a", "b"], + header=None, usecols=[0, 1]) + + expected = DataFrame([[1, 2], [4, 5], [7, 8], + [10, 11]], columns=["a", "b"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_name_length_conflict(all_parsers): + data = """\ +1,2,3 +4,5,6 +7,8,9 +10,11,12""" + parser = all_parsers + msg = ("Number of passed names did not " + "match number of header fields in the file" + if parser.engine == "python" else + "Passed header names mismatches usecols") + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), names=["a", "b"], + header=None, usecols=[1]) + + +def test_usecols_single_string(all_parsers): + # see gh-20558 + parser = all_parsers + data = """foo, bar, baz +1000, 2000, 3000 +4000, 5000, 6000""" + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols="foo") + + +@pytest.mark.parametrize("data", ["a,b,c,d\n1,2,3,4\n5,6,7,8", + "a,b,c,d\n1,2,3,4,\n5,6,7,8,"]) +def test_usecols_index_col_false(all_parsers, data): + # see gh-9082 + parser = all_parsers + usecols = ["a", "c", "d"] + expected = DataFrame({"a": [1, 5], "c": [3, 7], "d": [4, 8]}) + + result = parser.read_csv(StringIO(data), usecols=usecols, index_col=False) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("index_col", ["b", 0]) +@pytest.mark.parametrize("usecols", [["b", "c"], [1, 2]]) +def test_usecols_index_col_conflict(all_parsers, usecols, index_col): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + expected = DataFrame({"c": [1, 2]}, index=Index(["a", "b"], name="b")) + + result = parser.read_csv(StringIO(data), usecols=usecols, + index_col=index_col) + tm.assert_frame_equal(result, expected) + + +def test_usecols_index_col_conflict2(all_parsers): + # see gh-4201: test that index_col as integer reflects usecols + parser = all_parsers + data = "a,b,c,d\nA,a,1,one\nB,b,2,two" + + expected = DataFrame({"b": ["a", "b"], "c": [1, 2], "d": ("one", "two")}) + expected = expected.set_index(["b", "c"]) + + result = parser.read_csv(StringIO(data), usecols=["b", "c", "d"], + index_col=["b", "c"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_implicit_index_col(all_parsers): + # see gh-2654 + parser = all_parsers + data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" + + result = parser.read_csv(StringIO(data), usecols=["a", "b"]) + expected = DataFrame({"a": ["apple", "orange"], + "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_regex_sep(all_parsers): + # see gh-2733 + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 
orange cow 10" + result = parser.read_csv(StringIO(data), sep=r"\s+", usecols=("a", "b")) + + expected = DataFrame({"a": ["apple", "orange"], + "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_whitespace(all_parsers): + parser = all_parsers + data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + + result = parser.read_csv(StringIO(data), delim_whitespace=True, + usecols=("a", "b")) + expected = DataFrame({"a": ["apple", "orange"], + "b": ["bat", "cow"]}, index=[4, 8]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols,expected", [ + # Column selection by index. + ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], + columns=["2", "0"])), + + # Column selection by name. + (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], + columns=["0", "1"])), +]) +def test_usecols_with_integer_like_header(all_parsers, usecols, expected): + parser = all_parsers + data = """2,0,1 +1000,2000,3000 +4000,5000,6000""" + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) +def test_usecols_with_parse_dates(all_parsers, usecols): + # see gh-9755 + data = """a,b,c,d,e +0,1,20140101,0900,4 +0,1,20140102,1000,4""" + parser = all_parsers + parse_dates = [[1, 2]] + + cols = { + "a": [0, 0], + "c_d": [ + Timestamp("2014-01-01 09:00:00"), + Timestamp("2014-01-02 10:00:00") + ] + } + expected = DataFrame(cols, columns=["c_d", "a"]) + result = parser.read_csv(StringIO(data), usecols=usecols, + parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates2(all_parsers): + # see gh-13604 + parser = all_parsers + data = """2008-02-07 09:40,1032.43 +2008-02-07 09:50,1042.54 +2008-02-07 10:00,1051.65""" + + names = ["date", "values"] + usecols = names[:] + parse_dates = [0] + + index = Index([Timestamp("2008-02-07 09:40"), + Timestamp("2008-02-07 09:50"), + Timestamp("2008-02-07 10:00")], + name="date") + cols = {"values": [1032.43, 1042.54, 1051.65]} + expected = DataFrame(cols, index=index) + + result = parser.read_csv(StringIO(data), parse_dates=parse_dates, + index_col=0, usecols=usecols, + header=None, names=names) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates3(all_parsers): + # see gh-14792 + parser = all_parsers + data = """a,b,c,d,e,f,g,h,i,j +2016/09/21,1,1,2,3,4,5,6,7,8""" + + usecols = list("abcdefghij") + parse_dates = [0] + + cols = {"a": Timestamp("2016-09-21"), + "b": [1], "c": [1], "d": [2], + "e": [3], "f": [4], "g": [5], + "h": [6], "i": [7], "j": [8]} + expected = DataFrame(cols, columns=usecols) + + result = parser.read_csv(StringIO(data), usecols=usecols, + parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_parse_dates4(all_parsers): + data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" + usecols = list("abcdefghij") + parse_dates = [[0, 1]] + parser = all_parsers + + cols = {"a_b": "2016/09/21 1", + "c": [1], "d": [2], "e": [3], "f": [4], + "g": [5], "h": [6], "i": [7], "j": [8]} + expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) + + result = parser.read_csv(StringIO(data), usecols=usecols, + parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) +@pytest.mark.parametrize("names", [ + list("abcde"), # Names span all columns in original data. 
+ list("acd"), # Names span only the selected columns. +]) +def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names): + # see gh-9755 + s = """0,1,20140101,0900,4 +0,1,20140102,1000,4""" + parse_dates = [[1, 2]] + parser = all_parsers + + cols = { + "a": [0, 0], + "c_d": [ + Timestamp("2014-01-01 09:00:00"), + Timestamp("2014-01-02 10:00:00") + ] + } + expected = DataFrame(cols, columns=["c_d", "a"]) + + result = parser.read_csv(StringIO(s), names=names, + parse_dates=parse_dates, + usecols=usecols) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_unicode_strings(all_parsers): + # see gh-13219 + data = """AAA,BBB,CCC,DDD +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "AAA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + "BBB": {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=[u"AAA", u"BBB"]) + tm.assert_frame_equal(result, expected) + + +def test_usecols_with_single_byte_unicode_strings(all_parsers): + # see gh-13219 + data = """A,B,C,D +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "A": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + "B": {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=[u"A", u"B"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [[u"AAA", b"BBB"], [b"AAA", u"BBB"]]) +def test_usecols_with_mixed_encoding_strings(all_parsers, usecols): + data = """AAA,BBB,CCC,DDD +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + with pytest.raises(ValueError, match=_msg_validate_usecols_arg): + parser.read_csv(StringIO(data), usecols=usecols) + + +@pytest.mark.parametrize("usecols", [ + ["あああ", "いい"], + pytest.param([u"あああ", u"いい"], marks=pytest.mark.skipif( + PY2, reason="Buggy behavior: see gh-13253")) +]) +def test_usecols_with_multi_byte_characters(all_parsers, usecols): + data = """あああ,いい,ううう,ええええ +0.056674973,8,True,a +2.613230982,2,False,b +3.568935038,7,False,a""" + parser = all_parsers + + exp_data = { + "あああ": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + "いい": {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(exp_data) + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +def test_empty_usecols(all_parsers): + data = "a,b,c\n1,2,3\n4,5,6" + expected = DataFrame() + parser = all_parsers + + result = parser.read_csv(StringIO(data), usecols=set()) + tm.assert_frame_equal(result, expected) + + +def test_np_array_usecols(all_parsers): + # see gh-12546 + parser = all_parsers + data = "a,b,c\n1,2,3" + usecols = np.array(["a", "b"]) + + expected = DataFrame([[1, 2]], columns=usecols) + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols,expected", [ + (lambda x: x.upper() in ["AAA", "BBB", "DDD"], + DataFrame({ + "AaA": { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + "bBb": {0: 8, 1: 2, 2: 7}, + "ddd": {0: "a", 1: "b", 2: "a"} + })), + (lambda x: False, DataFrame()), +]) +def test_callable_usecols(all_parsers, usecols, expected): + # see gh-14154 + data = """AaA,bBb,CCC,ddd +0.056674973,8,True,a +2.613230982,2,False,b 
+3.568935038,7,False,a""" + parser = all_parsers + + result = parser.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols", [["a", "c"], lambda x: x in ["a", "c"]]) +def test_incomplete_first_row(all_parsers, usecols): + # see gh-6710 + data = "1,2\n1,2,3" + parser = all_parsers + names = ["a", "b", "c"] + expected = DataFrame({"a": [1, 1], "c": [np.nan, 3]}) + + result = parser.read_csv(StringIO(data), names=names, usecols=usecols) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("data,usecols,kwargs,expected", [ + # see gh-8985 + ("19,29,39\n" * 2 + "10,20,30,40", [0, 1, 2], + dict(header=None), DataFrame([[19, 29, 39], [19, 29, 39], [10, 20, 30]])), + + # see gh-9549 + (("A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n" + "1,2,3,,,1,\n1,2,3\n5,6,7"), ["A", "B", "C"], + dict(), DataFrame({"A": [1, 3, 1, 1, 1, 5], + "B": [2, 4, 2, 2, 2, 6], + "C": [3, 5, 4, 3, 3, 7]})), +]) +def test_uneven_length_cols(all_parsers, data, usecols, kwargs, expected): + # see gh-8985 + parser = all_parsers + result = parser.read_csv(StringIO(data), usecols=usecols, **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("usecols,kwargs,expected,msg", [ + (["a", "b", "c", "d"], dict(), + DataFrame({"a": [1, 5], "b": [2, 6], "c": [3, 7], "d": [4, 8]}), None), + (["a", "b", "c", "f"], dict(), None, + _msg_validate_usecols_names.format(r"\['f'\]")), + (["a", "b", "f"], dict(), None, + _msg_validate_usecols_names.format(r"\['f'\]")), + (["a", "b", "f", "g"], dict(), None, + _msg_validate_usecols_names.format(r"\[('f', 'g'|'g', 'f')\]")), + + # see gh-14671 + (None, dict(header=0, names=["A", "B", "C", "D"]), + DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 7], + "D": [4, 8]}), None), + (["A", "B", "C", "f"], dict(header=0, names=["A", "B", "C", "D"]), + None, _msg_validate_usecols_names.format(r"\['f'\]")), + (["A", "B", "f"], dict(names=["A", "B", "C", "D"]), + None, _msg_validate_usecols_names.format(r"\['f'\]")), +]) +def test_raises_on_usecols_names_mismatch(all_parsers, usecols, + kwargs, expected, msg): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + kwargs.update(usecols=usecols) + parser = all_parsers + + if expected is None: + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), **kwargs) + else: + result = parser.read_csv(StringIO(data), **kwargs) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="see gh-16469: buggy behavior") +@pytest.mark.parametrize("usecols", [["A", "C"], [0, 2]]) +def test_usecols_subset_names_mismatch_orig_columns(all_parsers, usecols): + data = "a,b,c,d\n1,2,3,4\n5,6,7,8" + names = ["A", "B", "C", "D"] + parser = all_parsers + + result = parser.read_csv(StringIO(data), header=0, + names=names, usecols=usecols) + expected = DataFrame({"A": [1, 5], "C": [3, 7]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py deleted file mode 100644 index e9bb72be124d3..0000000000000 --- a/pandas/tests/io/parser/usecols.py +++ /dev/null @@ -1,550 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests the usecols functionality during parsing -for all of the parsers defined in parsers.py -""" - -import numpy as np -import pytest - -from pandas._libs.tslib import Timestamp -from pandas.compat import StringIO - -from pandas import DataFrame, Index -import pandas.util.testing as tm - - -class UsecolsTests(object): - msg_validate_usecols_arg = ("'usecols' must either be list-like of all " - 
"strings, all unicode, all integers or a " - "callable.") - msg_validate_usecols_names = ("Usecols do not match columns, columns " - "expected but not found: {0}") - - def test_raise_on_mixed_dtype_usecols(self): - # See gh-12678 - data = """a,b,c - 1000,2000,3000 - 4000,5000,6000 - """ - - usecols = [0, 'b', 2] - - with pytest.raises(ValueError, match=self.msg_validate_usecols_arg): - self.read_csv(StringIO(data), usecols=usecols) - - def test_usecols(self): - data = """\ -a,b,c -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - - result = self.read_csv(StringIO(data), usecols=(1, 2)) - result2 = self.read_csv(StringIO(data), usecols=('b', 'c')) - exp = self.read_csv(StringIO(data)) - - assert len(result.columns) == 2 - assert (result['b'] == exp['b']).all() - assert (result['c'] == exp['c']).all() - - tm.assert_frame_equal(result, result2) - - result = self.read_csv(StringIO(data), usecols=[1, 2], header=0, - names=['foo', 'bar']) - expected = self.read_csv(StringIO(data), usecols=[1, 2]) - expected.columns = ['foo', 'bar'] - tm.assert_frame_equal(result, expected) - - data = """\ -1,2,3 -4,5,6 -7,8,9 -10,11,12""" - result = self.read_csv(StringIO(data), names=['b', 'c'], - header=None, usecols=[1, 2]) - - expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None) - expected = expected[['b', 'c']] - tm.assert_frame_equal(result, expected) - - result2 = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None, usecols=['b', 'c']) - tm.assert_frame_equal(result2, result) - - # see gh-5766 - result = self.read_csv(StringIO(data), names=['a', 'b'], - header=None, usecols=[0, 1]) - - expected = self.read_csv(StringIO(data), names=['a', 'b', 'c'], - header=None) - expected = expected[['a', 'b']] - tm.assert_frame_equal(result, expected) - - # length conflict, passed names and usecols disagree - pytest.raises(ValueError, self.read_csv, StringIO(data), - names=['a', 'b'], usecols=[1], header=None) - - def test_usecols_single_string(self): - # GH 20558 - data = """foo, bar, baz - 1000, 2000, 3000 - 4000, 5000, 6000 - """ - - usecols = 'foo' - - with pytest.raises(ValueError, match=self.msg_validate_usecols_arg): - self.read_csv(StringIO(data), usecols=usecols) - - def test_usecols_index_col_False(self): - # see gh-9082 - s = "a,b,c,d\n1,2,3,4\n5,6,7,8" - s_malformed = "a,b,c,d\n1,2,3,4,\n5,6,7,8," - cols = ['a', 'c', 'd'] - expected = DataFrame({'a': [1, 5], 'c': [3, 7], 'd': [4, 8]}) - df = self.read_csv(StringIO(s), usecols=cols, index_col=False) - tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(s_malformed), - usecols=cols, index_col=False) - tm.assert_frame_equal(expected, df) - - def test_usecols_index_col_conflict(self): - # see gh-4201: test that index_col as integer reflects usecols - data = 'a,b,c,d\nA,a,1,one\nB,b,2,two' - expected = DataFrame({'c': [1, 2]}, index=Index( - ['a', 'b'], name='b')) - - df = self.read_csv(StringIO(data), usecols=['b', 'c'], - index_col=0) - tm.assert_frame_equal(expected, df) - - df = self.read_csv(StringIO(data), usecols=['b', 'c'], - index_col='b') - tm.assert_frame_equal(expected, df) - - df = self.read_csv(StringIO(data), usecols=[1, 2], - index_col='b') - tm.assert_frame_equal(expected, df) - - df = self.read_csv(StringIO(data), usecols=[1, 2], - index_col=0) - tm.assert_frame_equal(expected, df) - - expected = DataFrame( - {'b': ['a', 'b'], 'c': [1, 2], 'd': ('one', 'two')}) - expected = expected.set_index(['b', 'c']) - df = self.read_csv(StringIO(data), usecols=['b', 'c', 'd'], - index_col=['b', 'c']) - 
tm.assert_frame_equal(expected, df) - - def test_usecols_implicit_index_col(self): - # see gh-2654 - data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' - - result = self.read_csv(StringIO(data), usecols=['a', 'b']) - expected = DataFrame({'a': ['apple', 'orange'], - 'b': ['bat', 'cow']}, index=[4, 8]) - - tm.assert_frame_equal(result, expected) - - def test_usecols_regex_sep(self): - # see gh-2733 - data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' - - df = self.read_csv(StringIO(data), sep=r'\s+', usecols=('a', 'b')) - - expected = DataFrame({'a': ['apple', 'orange'], - 'b': ['bat', 'cow']}, index=[4, 8]) - tm.assert_frame_equal(df, expected) - - def test_usecols_with_whitespace(self): - data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' - - result = self.read_csv(StringIO(data), delim_whitespace=True, - usecols=('a', 'b')) - expected = DataFrame({'a': ['apple', 'orange'], - 'b': ['bat', 'cow']}, index=[4, 8]) - - tm.assert_frame_equal(result, expected) - - def test_usecols_with_integer_like_header(self): - data = """2,0,1 - 1000,2000,3000 - 4000,5000,6000 - """ - - usecols = [0, 1] # column selection by index - expected = DataFrame(data=[[1000, 2000], - [4000, 5000]], - columns=['2', '0']) - df = self.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(df, expected) - - usecols = ['0', '1'] # column selection by name - expected = DataFrame(data=[[2000, 3000], - [5000, 6000]], - columns=['0', '1']) - df = self.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(df, expected) - - def test_usecols_with_parse_dates(self): - # See gh-9755 - s = """a,b,c,d,e - 0,1,20140101,0900,4 - 0,1,20140102,1000,4""" - parse_dates = [[1, 2]] - - cols = { - 'a': [0, 0], - 'c_d': [ - Timestamp('2014-01-01 09:00:00'), - Timestamp('2014-01-02 10:00:00') - ] - } - expected = DataFrame(cols, columns=['c_d', 'a']) - - df = self.read_csv(StringIO(s), usecols=[0, 2, 3], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(s), usecols=[3, 0, 2], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - # See gh-13604 - s = """2008-02-07 09:40,1032.43 - 2008-02-07 09:50,1042.54 - 2008-02-07 10:00,1051.65 - """ - parse_dates = [0] - names = ['date', 'values'] - usecols = names[:] - - index = Index([Timestamp('2008-02-07 09:40'), - Timestamp('2008-02-07 09:50'), - Timestamp('2008-02-07 10:00')], - name='date') - cols = {'values': [1032.43, 1042.54, 1051.65]} - expected = DataFrame(cols, index=index) - - df = self.read_csv(StringIO(s), parse_dates=parse_dates, index_col=0, - usecols=usecols, header=None, names=names) - tm.assert_frame_equal(df, expected) - - # See gh-14792 - s = """a,b,c,d,e,f,g,h,i,j - 2016/09/21,1,1,2,3,4,5,6,7,8""" - parse_dates = [0] - usecols = list('abcdefghij') - cols = {'a': Timestamp('2016-09-21'), - 'b': [1], 'c': [1], 'd': [2], - 'e': [3], 'f': [4], 'g': [5], - 'h': [6], 'i': [7], 'j': [8]} - expected = DataFrame(cols, columns=usecols) - df = self.read_csv(StringIO(s), usecols=usecols, - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - s = """a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8""" - parse_dates = [[0, 1]] - usecols = list('abcdefghij') - cols = {'a_b': '2016/09/21 1', - 'c': [1], 'd': [2], 'e': [3], 'f': [4], - 'g': [5], 'h': [6], 'i': [7], 'j': [8]} - expected = DataFrame(cols, columns=['a_b'] + list('cdefghij')) - df = self.read_csv(StringIO(s), usecols=usecols, - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - def 
test_usecols_with_parse_dates_and_full_names(self): - # See gh-9755 - s = """0,1,20140101,0900,4 - 0,1,20140102,1000,4""" - parse_dates = [[1, 2]] - names = list('abcde') - - cols = { - 'a': [0, 0], - 'c_d': [ - Timestamp('2014-01-01 09:00:00'), - Timestamp('2014-01-02 10:00:00') - ] - } - expected = DataFrame(cols, columns=['c_d', 'a']) - - df = self.read_csv(StringIO(s), names=names, - usecols=[0, 2, 3], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(s), names=names, - usecols=[3, 0, 2], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - def test_usecols_with_parse_dates_and_usecol_names(self): - # See gh-9755 - s = """0,1,20140101,0900,4 - 0,1,20140102,1000,4""" - parse_dates = [[1, 2]] - names = list('acd') - - cols = { - 'a': [0, 0], - 'c_d': [ - Timestamp('2014-01-01 09:00:00'), - Timestamp('2014-01-02 10:00:00') - ] - } - expected = DataFrame(cols, columns=['c_d', 'a']) - - df = self.read_csv(StringIO(s), names=names, - usecols=[0, 2, 3], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(s), names=names, - usecols=[3, 0, 2], - parse_dates=parse_dates) - tm.assert_frame_equal(df, expected) - - def test_usecols_with_unicode_strings(self): - # see gh-13219 - - s = '''AAA,BBB,CCC,DDD - 0.056674973,8,True,a - 2.613230982,2,False,b - 3.568935038,7,False,a - ''' - - data = { - 'AAA': { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - 'BBB': {0: 8, 1: 2, 2: 7} - } - expected = DataFrame(data) - - df = self.read_csv(StringIO(s), usecols=[u'AAA', u'BBB']) - tm.assert_frame_equal(df, expected) - - def test_usecols_with_single_byte_unicode_strings(self): - # see gh-13219 - - s = '''A,B,C,D - 0.056674973,8,True,a - 2.613230982,2,False,b - 3.568935038,7,False,a - ''' - - data = { - 'A': { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - 'B': {0: 8, 1: 2, 2: 7} - } - expected = DataFrame(data) - - df = self.read_csv(StringIO(s), usecols=[u'A', u'B']) - tm.assert_frame_equal(df, expected) - - def test_usecols_with_mixed_encoding_strings(self): - s = '''AAA,BBB,CCC,DDD - 0.056674973,8,True,a - 2.613230982,2,False,b - 3.568935038,7,False,a - ''' - - with pytest.raises(ValueError, match=self.msg_validate_usecols_arg): - self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB']) - - with pytest.raises(ValueError, match=self.msg_validate_usecols_arg): - self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB']) - - def test_usecols_with_multibyte_characters(self): - s = '''あああ,いい,ううう,ええええ - 0.056674973,8,True,a - 2.613230982,2,False,b - 3.568935038,7,False,a - ''' - data = { - 'あああ': { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - 'いい': {0: 8, 1: 2, 2: 7} - } - expected = DataFrame(data) - - df = self.read_csv(StringIO(s), usecols=['あああ', 'いい']) - tm.assert_frame_equal(df, expected) - - def test_usecols_with_multibyte_unicode_characters(self): - pytest.skip('TODO: see gh-13253') - - s = '''あああ,いい,ううう,ええええ - 0.056674973,8,True,a - 2.613230982,2,False,b - 3.568935038,7,False,a - ''' - data = { - 'あああ': { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - 'いい': {0: 8, 1: 2, 2: 7} - } - expected = DataFrame(data) - - df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい']) - tm.assert_frame_equal(df, expected) - - def test_empty_usecols(self): - # should not raise - data = 'a,b,c\n1,2,3\n4,5,6' - expected = DataFrame() - result = self.read_csv(StringIO(data), 
usecols=set()) - tm.assert_frame_equal(result, expected) - - def test_np_array_usecols(self): - # See gh-12546 - data = 'a,b,c\n1,2,3' - usecols = np.array(['a', 'b']) - - expected = DataFrame([[1, 2]], columns=usecols) - result = self.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(result, expected) - - def test_callable_usecols(self): - # See gh-14154 - s = '''AaA,bBb,CCC,ddd - 0.056674973,8,True,a - 2.613230982,2,False,b - 3.568935038,7,False,a - ''' - - data = { - 'AaA': { - 0: 0.056674972999999997, - 1: 2.6132309819999997, - 2: 3.5689350380000002 - }, - 'bBb': {0: 8, 1: 2, 2: 7}, - 'ddd': {0: 'a', 1: 'b', 2: 'a'} - } - expected = DataFrame(data) - df = self.read_csv(StringIO(s), usecols=lambda x: - x.upper() in ['AAA', 'BBB', 'DDD']) - tm.assert_frame_equal(df, expected) - - # Check that a callable returning only False returns - # an empty DataFrame - expected = DataFrame() - df = self.read_csv(StringIO(s), usecols=lambda x: False) - tm.assert_frame_equal(df, expected) - - def test_incomplete_first_row(self): - # see gh-6710 - data = '1,2\n1,2,3' - names = ['a', 'b', 'c'] - expected = DataFrame({'a': [1, 1], - 'c': [np.nan, 3]}) - - usecols = ['a', 'c'] - df = self.read_csv(StringIO(data), names=names, usecols=usecols) - tm.assert_frame_equal(df, expected) - - usecols = lambda x: x in ['a', 'c'] - df = self.read_csv(StringIO(data), names=names, usecols=usecols) - tm.assert_frame_equal(df, expected) - - def test_uneven_length_cols(self): - # see gh-8985 - usecols = [0, 1, 2] - data = '19,29,39\n' * 2 + '10,20,30,40' - expected = DataFrame([[19, 29, 39], - [19, 29, 39], - [10, 20, 30]]) - df = self.read_csv(StringIO(data), header=None, usecols=usecols) - tm.assert_frame_equal(df, expected) - - # see gh-9549 - usecols = ['A', 'B', 'C'] - data = ('A,B,C\n1,2,3\n3,4,5\n1,2,4,5,1,6\n' - '1,2,3,,,1,\n1,2,3\n5,6,7') - expected = DataFrame({'A': [1, 3, 1, 1, 1, 5], - 'B': [2, 4, 2, 2, 2, 6], - 'C': [3, 5, 4, 3, 3, 7]}) - df = self.read_csv(StringIO(data), usecols=usecols) - tm.assert_frame_equal(df, expected) - - def test_raise_on_usecols_names_mismatch(self): - # GH 14671 - data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' - - usecols = ['a', 'b', 'c', 'd'] - df = self.read_csv(StringIO(data), usecols=usecols) - expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7], - 'd': [4, 8]}) - tm.assert_frame_equal(df, expected) - - usecols = ['a', 'b', 'c', 'f'] - msg = self.msg_validate_usecols_names.format(r"\['f'\]") - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(data), usecols=usecols) - - usecols = ['a', 'b', 'f'] - msg = self.msg_validate_usecols_names.format(r"\['f'\]") - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(data), usecols=usecols) - - usecols = ['a', 'b', 'f', 'g'] - msg = self.msg_validate_usecols_names.format( - r"\[('f', 'g'|'g', 'f')\]") - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(data), usecols=usecols) - - names = ['A', 'B', 'C', 'D'] - - df = self.read_csv(StringIO(data), header=0, names=names) - expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7], - 'D': [4, 8]}) - tm.assert_frame_equal(df, expected) - - # TODO: https://github.com/pandas-dev/pandas/issues/16469 - # usecols = ['A','C'] - # df = self.read_csv(StringIO(data), header=0, names=names, - # usecols=usecols) - # expected = DataFrame({'A': [1,5], 'C': [3,7]}) - # tm.assert_frame_equal(df, expected) - # - # usecols = [0,2] - # df = self.read_csv(StringIO(data), header=0, names=names, - # usecols=usecols) - # expected = 
DataFrame({'A': [1,5], 'C': [3,7]}) - # tm.assert_frame_equal(df, expected) - - usecols = ['A', 'B', 'C', 'f'] - msg = self.msg_validate_usecols_names.format(r"\['f'\]") - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(data), header=0, names=names, - usecols=usecols) - - usecols = ['A', 'B', 'f'] - msg = self.msg_validate_usecols_names.format(r"\['f'\]") - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(data), names=names, usecols=usecols) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 9025573c8cf6f..210620f2092cf 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -625,7 +625,7 @@ def capture_stdout(f): AssertionError: assert 'foo\n' == 'bar\n' """ - @wraps(f) + @compat.wraps(f) def wrapper(*args, **kwargs): try: sys.stdout = StringIO() From 7e184f00a94ad1dcb7fa2e4621f78487c79325fc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 22 Nov 2018 18:37:23 -0800 Subject: [PATCH 08/21] move misplaced modulo test (#23827) --- pandas/tests/arithmetic/test_numeric.py | 38 +++++++++++++++++++++++++ pandas/tests/series/test_analytics.py | 36 ----------------------- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index f1023148aaf1c..c3cd9f0f43559 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -597,6 +597,44 @@ def test_operators_frame(self): tm.assert_series_equal(ts / ts, ts / df['A'], check_names=False) + # TODO: this came from tests.series.test_analytics, needs cleanup and + # de-duplication with test_modulo above + def test_modulo2(self): + with np.errstate(all='ignore'): + + # GH#3590, modulo as ints + p = pd.DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + result = p['first'] % p['second'] + expected = Series(p['first'].values % p['second'].values, + dtype='float64') + expected.iloc[0:3] = np.nan + tm.assert_series_equal(result, expected) + + result = p['first'] % 0 + expected = Series(np.nan, index=p.index, name='first') + tm.assert_series_equal(result, expected) + + p = p.astype('float64') + result = p['first'] % p['second'] + expected = Series(p['first'].values % p['second'].values) + tm.assert_series_equal(result, expected) + + p = p.astype('float64') + result = p['first'] % p['second'] + result2 = p['second'] % p['first'] + assert not result.equals(result2) + + # GH#9144 + s = Series([0, 1]) + + result = s % 0 + expected = Series([np.nan, np.nan]) + tm.assert_series_equal(result, expected) + + result = 0 % s + expected = Series([np.nan, 0.0]) + tm.assert_series_equal(result, expected) + class TestAdditionSubtraction(object): # __add__, __sub__, __radd__, __rsub__, __iadd__, __isub__ diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index a5a7cc2217864..86b471492263c 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -681,42 +681,6 @@ def test_all_any_params(self): pytest.raises(NotImplementedError, s.any, bool_only=True) pytest.raises(NotImplementedError, s.all, bool_only=True) - def test_modulo(self): - with np.errstate(all='ignore'): - - # GH3590, modulo as ints - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p['first'] % p['second'] - expected = Series(p['first'].values % p['second'].values, - dtype='float64') - expected.iloc[0:3] = np.nan - assert_series_equal(result, expected) - - result = p['first'] % 0 - expected = 
Series(np.nan, index=p.index, name='first') - assert_series_equal(result, expected) - - p = p.astype('float64') - result = p['first'] % p['second'] - expected = Series(p['first'].values % p['second'].values) - assert_series_equal(result, expected) - - p = p.astype('float64') - result = p['first'] % p['second'] - result2 = p['second'] % p['first'] - assert not result.equals(result2) - - # GH 9144 - s = Series([0, 1]) - - result = s % 0 - expected = Series([nan, nan]) - assert_series_equal(result, expected) - - result = 0 % s - expected = Series([nan, 0.0]) - assert_series_equal(result, expected) - @td.skip_if_no_scipy def test_corr(self, datetime_series): import scipy.stats as stats From 54932d00f777dbc17536e3be4b1e7cd1fbce9c75 Mon Sep 17 00:00:00 2001 From: Erik Date: Thu, 22 Nov 2018 18:38:11 -0800 Subject: [PATCH 09/21] TST: Add test cases for GH6173, appending to empty df (#23806) --- pandas/tests/indexing/test_loc.py | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index df0180c7a5bf7..21bb624790328 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -804,3 +804,35 @@ def test_loc_uint64(self): result = s.loc[[np.iinfo('uint64').max - 1, np.iinfo('uint64').max]] tm.assert_series_equal(result, s) + + def test_loc_setitem_empty_append(self): + # GH6173, various appends to an empty dataframe + + data = [1, 2, 3] + expected = DataFrame({'x': data, 'y': [None] * len(data)}) + + # appends to fit length of data + df = DataFrame(columns=['x', 'y']) + df.loc[:, 'x'] = data + tm.assert_frame_equal(df, expected) + + # only appends one value + expected = DataFrame({'x': [1.0], 'y': [np.nan]}) + df = DataFrame(columns=['x', 'y'], + dtype=np.float) + df.loc[0, 'x'] = expected.loc[0, 'x'] + tm.assert_frame_equal(df, expected) + + def test_loc_setitem_empty_append_raises(self): + # GH6173, various appends to an empty dataframe + + data = [1, 2] + df = DataFrame(columns=['x', 'y']) + msg = (r"None of \[Int64Index\(\[0, 1\], dtype='int64'\)\] " + r"are in the \[index\]") + with pytest.raises(KeyError, match=msg): + df.loc[[0, 1], 'x'] = data + + msg = "cannot copy sequence with size 2 to array axis with dimension 0" + with pytest.raises(ValueError, match=msg): + df.loc[0:2, 'x'] = data From 1e30a67329bfb0e20ef888461cab82a7cfb283b0 Mon Sep 17 00:00:00 2001 From: RomainSa Date: Fri, 23 Nov 2018 03:40:58 +0100 Subject: [PATCH 10/21] DOC/TST: doctests leaving extraneous files (#23858) --- pandas/core/frame.py | 8 +++++--- pandas/core/generic.py | 9 +++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 572bb3668caf8..5104cf815abf6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2028,8 +2028,9 @@ def to_parquet(self, fname, engine='auto', compression='snappy', Examples -------- >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) - >>> df.to_parquet('df.parquet.gzip', compression='gzip') - >>> pd.read_parquet('df.parquet.gzip') + >>> df.to_parquet('df.parquet.gzip', + ... compression='gzip') # doctest: +SKIP + >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP col1 col2 0 1 3 1 2 4 @@ -2243,7 +2244,8 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, >>> buffer = io.StringIO() >>> df.info(buf=buffer) >>> s = buffer.getvalue() - >>> with open("df_info.txt", "w", encoding="utf-8") as f: + >>> with open("df_info.txt", "w", + ... 
encoding="utf-8") as f: # doctest: +SKIP ... f.write(s) 260 diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5bb364e1d1605..dd025fb61f973 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2069,17 +2069,18 @@ def _repr_latex_(self): >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']], ... index=['row 1', 'row 2'], ... columns=['col 1', 'col 2']) - >>> df1.to_excel("output.xlsx") + >>> df1.to_excel("output.xlsx") # doctest: +SKIP To specify the sheet name: - >>> df1.to_excel("output.xlsx", sheet_name='Sheet_name_1') + >>> df1.to_excel("output.xlsx", + ... sheet_name='Sheet_name_1') # doctest: +SKIP If you wish to write to more than one sheet in the workbook, it is necessary to specify an ExcelWriter object: >>> df2 = df1.copy() - >>> with pd.ExcelWriter('output.xlsx') as writer: + >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP ... df1.to_excel(writer, sheet_name='Sheet_name_1') ... df2.to_excel(writer, sheet_name='Sheet_name_2') @@ -2087,7 +2088,7 @@ def _repr_latex_(self): you can pass the `engine` keyword (the default engine is automatically chosen depending on the file extension): - >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') + >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP """ def to_json(self, path_or_buf=None, orient=None, date_format=None, From 8edf972fa07aa3b761756aba87c08fea98499687 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Fri, 23 Nov 2018 02:55:13 +0000 Subject: [PATCH 11/21] CLN: Isort core/*.py and core/indexes/ (#23764) --- pandas/core/algorithms.py | 41 +++++++--------- pandas/core/apply.py | 14 +++--- pandas/core/base.py | 34 ++++++-------- pandas/core/categorical.py | 7 +-- pandas/core/common.py | 16 +++---- pandas/core/config.py | 6 +-- pandas/core/config_init.py | 6 ++- pandas/core/generic.py | 72 ++++++++++++---------------- pandas/core/indexes/accessors.py | 12 ++--- pandas/core/indexes/api.py | 28 +++++------ pandas/core/indexes/base.py | 81 ++++++++++++-------------------- pandas/core/indexes/category.py | 27 +++++------ pandas/core/indexes/frozen.py | 7 ++- pandas/core/indexes/interval.py | 54 ++++++++------------- pandas/core/indexes/multi.py | 48 ++++++++----------- pandas/core/indexes/numeric.py | 20 ++++---- pandas/core/indexes/period.py | 47 ++++++++---------- pandas/core/indexing.py | 19 +++----- pandas/core/missing.py | 17 ++----- pandas/core/nanops.py | 27 +++++------ pandas/core/ops.py | 47 ++++++++---------- pandas/core/panel.py | 41 ++++++++-------- pandas/core/resample.py | 46 +++++++++--------- pandas/core/sorting.py | 17 ++++--- pandas/core/strings.py | 35 ++++++-------- setup.cfg | 25 ---------- 26 files changed, 328 insertions(+), 466 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9ff8ba7bb23d2..ecdad8752113a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,40 +3,31 @@ intended for public consumption """ from __future__ import division -from warnings import warn, catch_warnings, simplefilter + from textwrap import dedent +from warnings import catch_warnings, simplefilter, warn import numpy as np +from pandas._libs import algos, hashtable as htable, lib +from pandas._libs.tslib import iNaT +from pandas.util._decorators import Appender, Substitution, deprecate_kwarg + from pandas.core.dtypes.cast import ( - maybe_promote, construct_1d_object_array_from_listlike) -from pandas.core.dtypes.generic import ( - ABCSeries, ABCIndex, - ABCIndexClass) + construct_1d_object_array_from_listlike, 
maybe_promote) from pandas.core.dtypes.common import ( - is_array_like, - is_unsigned_integer_dtype, is_signed_integer_dtype, - is_integer_dtype, is_complex_dtype, - is_object_dtype, - is_extension_array_dtype, - is_categorical_dtype, is_sparse, - is_period_dtype, - is_numeric_dtype, is_float_dtype, - is_bool_dtype, needs_i8_conversion, - is_datetimetz, - is_datetime64_any_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, is_datetimelike, - is_interval_dtype, is_scalar, is_list_like, - ensure_platform_int, ensure_object, - ensure_float64, ensure_uint64, - ensure_int64) + ensure_float64, ensure_int64, ensure_object, ensure_platform_int, + ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, + is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, + is_datetimelike, is_datetimetz, is_extension_array_dtype, is_float_dtype, + is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, + is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, + is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype, + needs_i8_conversion) +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas.core import common as com -from pandas._libs import algos, lib, hashtable as htable -from pandas._libs.tslib import iNaT -from pandas.util._decorators import (Appender, Substitution, - deprecate_kwarg) _shared_docs = {} diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 40cd952a62138..c44e64d29ed26 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,15 +1,15 @@ import warnings + import numpy as np -from pandas import compat + from pandas._libs import reduction -from pandas.core.dtypes.generic import ABCSeries -from pandas.core.dtypes.common import ( - is_extension_type, - is_dict_like, - is_list_like, - is_sequence) +import pandas.compat as compat from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.common import ( + is_dict_like, is_extension_type, is_list_like, is_sequence) +from pandas.core.dtypes.generic import ABCSeries + from pandas.io.formats.printing import pprint_thing diff --git a/pandas/core/base.py b/pandas/core/base.py index 9dc4237bdcd2d..fd303182959a5 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1,32 +1,28 @@ """ Base and utility classes for pandas objects. 
""" -import warnings import textwrap -from pandas import compat -from pandas.compat import builtins -import numpy as np +import warnings -from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass -from pandas.core.dtypes.common import ( - is_datetimelike, - is_object_dtype, - is_list_like, - is_scalar, - is_extension_type, - is_extension_array_dtype) +import numpy as np -from pandas.util._validators import validate_bool_kwarg -from pandas.errors import AbstractMethodError -from pandas.core import common as com, algorithms -import pandas.core.nanops as nanops import pandas._libs.lib as lib +import pandas.compat as compat +from pandas.compat import PYPY, OrderedDict, builtins from pandas.compat.numpy import function as nv -from pandas.compat import PYPY, OrderedDict -from pandas.util._decorators import Appender, cache_readonly, Substitution +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._validators import validate_bool_kwarg +from pandas.core.dtypes.common import ( + is_datetimelike, is_extension_array_dtype, is_extension_type, is_list_like, + is_object_dtype, is_scalar) +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import isna + +from pandas.core import algorithms, common as com from pandas.core.accessor import DirNamesMixin +import pandas.core.nanops as nanops _shared_docs = dict() _indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='', diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 530a3ecb5f378..43c35c4000bb6 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1,8 +1,9 @@ import warnings +from pandas.core.dtypes.dtypes import CategoricalDtype # noqa + +from pandas.core.arrays import Categorical # noqa + # TODO: Remove after 0.23.x warnings.warn("'pandas.core' is private. Use 'pandas.Categorical'", FutureWarning, stacklevel=2) - -from pandas.core.arrays import Categorical # noqa -from pandas.core.dtypes.dtypes import CategoricalDtype # noqa diff --git a/pandas/core/common.py b/pandas/core/common.py index 0a82dd8636888..b4de0daa13b16 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -4,25 +4,23 @@ Note: pandas.core.common is *not* part of the public API. 
""" +import collections from datetime import datetime, timedelta from functools import partial import inspect -import collections import numpy as np + from pandas._libs import lib, tslibs +import pandas.compat as compat +from pandas.compat import PY36, OrderedDict, iteritems -from pandas import compat -from pandas.compat import iteritems, PY36, OrderedDict -from pandas.core.dtypes.generic import ( - ABCSeries, ABCIndex, ABCIndexClass -) +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( - is_integer, is_bool_dtype, is_extension_array_dtype, is_array_like -) + is_array_like, is_bool_dtype, is_extension_array_dtype, is_integer) +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import _iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa -from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike class SettingWithCopyError(ValueError): diff --git a/pandas/core/config.py b/pandas/core/config.py index f178600b74626..f4757bfd8069c 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -48,13 +48,13 @@ """ -import re - from collections import namedtuple from contextlib import contextmanager +import re import warnings -from pandas.compat import map, lmap, u + import pandas.compat as compat +from pandas.compat import lmap, map, u DeprecatedOption = namedtuple('DeprecatedOption', 'key msg rkey removal_ver') RegisteredOption = namedtuple('RegisteredOption', diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index b836a35b8cf29..d42a1ab72b156 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -10,8 +10,10 @@ """ import pandas.core.config as cf -from pandas.core.config import (is_int, is_bool, is_text, is_instance_factory, - is_one_of_factory, is_callable) +from pandas.core.config import ( + is_bool, is_callable, is_instance_factory, is_int, is_one_of_factory, + is_text) + from pandas.io.formats.console import detect_console_encoding from pandas.io.formats.terminal import is_terminal diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dd025fb61f973..3a7016ce39676 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1,64 +1,52 @@ # pylint: disable=W0231,E1101 import collections import functools -import warnings -import operator -import weakref import gc import json +import operator +import warnings +import weakref import numpy as np -import pandas as pd -from pandas._libs import properties, Timestamp, iNaT +from pandas._libs import Timestamp, iNaT, properties +import pandas.compat as compat +from pandas.compat import ( + cPickle as pkl, isidentifier, lrange, lzip, map, set_function_name, + string_types, to_str, zip) +from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError +from pandas.util._decorators import ( + Appender, Substitution, rewrite_axis_style_signature) +from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs -from pandas.core.dtypes.common import ( - ensure_int64, - ensure_object, - is_scalar, - is_number, - is_integer, is_bool, - is_bool_dtype, - is_numeric_dtype, - is_datetime64_any_dtype, - is_timedelta64_dtype, - is_datetime64tz_dtype, - is_list_like, - is_dict_like, - is_re_compilable, - is_period_arraylike, - is_object_dtype, - is_extension_array_dtype, - pandas_dtype) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask +from 
pandas.core.dtypes.common import ( + ensure_int64, ensure_object, is_bool, is_bool_dtype, + is_datetime64_any_dtype, is_datetime64tz_dtype, is_dict_like, + is_extension_array_dtype, is_integer, is_list_like, is_number, + is_numeric_dtype, is_object_dtype, is_period_arraylike, is_re_compilable, + is_scalar, is_timedelta64_dtype, pandas_dtype) +from pandas.core.dtypes.generic import ABCDataFrame, ABCPanel, ABCSeries from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import isna, notna -from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame +import pandas as pd +from pandas.core import config, missing, nanops +import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin -from pandas.core.index import (Index, MultiIndex, ensure_index, - InvalidIndexError, RangeIndex) -import pandas.core.indexing as indexing +import pandas.core.common as com +from pandas.core.index import ( + Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index) from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.period import PeriodIndex, Period +from pandas.core.indexes.period import Period, PeriodIndex +import pandas.core.indexing as indexing from pandas.core.internals import BlockManager -import pandas.core.algorithms as algos -import pandas.core.common as com -import pandas.core.missing as missing +from pandas.core.ops import _align_method_FRAME + +from pandas.io.formats.format import DataFrameFormatter, format_percentiles from pandas.io.formats.printing import pprint_thing -from pandas.io.formats.format import format_percentiles, DataFrameFormatter from pandas.tseries.frequencies import to_offset -from pandas import compat -from pandas.compat.numpy import function as nv -from pandas.compat import (map, zip, lzip, lrange, string_types, to_str, - isidentifier, set_function_name, cPickle as pkl) -from pandas.core.ops import _align_method_FRAME -import pandas.core.nanops as nanops -from pandas.util._decorators import (Appender, Substitution, - rewrite_axis_style_signature) -from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs -from pandas.core import config # goal is to be able to define the docs close to function, while still being # able to share diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index c3b94c297652a..6138f73726e0a 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -3,20 +3,18 @@ """ import numpy as np -from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.common import ( - is_period_arraylike, - is_datetime_arraylike, is_integer_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, is_categorical_dtype, - is_list_like) + is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_datetime_arraylike, is_integer_dtype, is_list_like, is_period_arraylike, + is_timedelta64_dtype) +from pandas.core.dtypes.generic import ABCSeries from pandas.core.accessor import PandasDelegate, delegate_names +from pandas.core.algorithms import take_1d from pandas.core.base import NoNewAttributesMixin, PandasObject from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodArray from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.core.algorithms import take_1d class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py 
index fb090c0fd83ba..6299fc482d0df 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -1,23 +1,21 @@ import textwrap import warnings -from pandas.core.indexes.base import (Index, - _new_Index, - ensure_index, - ensure_index_from_sequences, - InvalidIndexError) # noqa -from pandas.core.indexes.category import CategoricalIndex # noqa -from pandas.core.indexes.multi import MultiIndex # noqa -from pandas.core.indexes.interval import IntervalIndex # noqa -from pandas.core.indexes.numeric import (NumericIndex, Float64Index, # noqa - Int64Index, UInt64Index) -from pandas.core.indexes.range import RangeIndex # noqa -from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexes.datetimes import DatetimeIndex +from pandas._libs import NaT, lib import pandas.core.common as com -from pandas._libs import lib, NaT +from pandas.core.indexes.base import ( + Index, _new_Index, ensure_index, ensure_index_from_sequences) +from pandas.core.indexes.base import InvalidIndexError # noqa:F401 +from pandas.core.indexes.category import CategoricalIndex # noqa:F401 +from pandas.core.indexes.datetimes import DatetimeIndex +from pandas.core.indexes.interval import IntervalIndex # noqa:F401 +from pandas.core.indexes.multi import MultiIndex # noqa:F401 +from pandas.core.indexes.numeric import ( # noqa:F401 + Float64Index, Int64Index, NumericIndex, UInt64Index) +from pandas.core.indexes.period import PeriodIndex +from pandas.core.indexes.range import RangeIndex # noqa:F401 +from pandas.core.indexes.timedeltas import TimedeltaIndex _sort_msg = textwrap.dedent("""\ Sorting because non-concatenation axis is not aligned. A future version diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 93af7b9933782..29fb541991389 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,69 +1,50 @@ from datetime import datetime, timedelta -import warnings import operator from textwrap import dedent +import warnings import numpy as np -from pandas._libs import (lib, index as libindex, tslibs, - algos as libalgos, join as libjoin, - Timedelta) -from pandas._libs.lib import is_datetime_array -from pandas.compat import range, u, set_function_name +from pandas._libs import ( + Timedelta, algos as libalgos, index as libindex, join as libjoin, lib, + tslibs) +from pandas._libs.lib import is_datetime_array +import pandas.compat as compat +from pandas.compat import range, set_function_name, u from pandas.compat.numpy import function as nv -from pandas import compat +from pandas.util._decorators import Appender, Substitution, cache_readonly -from pandas.core.accessor import CachedAccessor -from pandas.core.arrays import ExtensionArray -from pandas.core.dtypes.generic import ( - ABCSeries, ABCDataFrame, - ABCMultiIndex, - ABCPeriodIndex, ABCTimedeltaIndex, ABCDatetimeIndex, - ABCDateOffset, ABCIndexClass, ABCTimedeltaArray) -from pandas.core.dtypes.missing import isna, array_equivalent from pandas.core.dtypes.cast import maybe_cast_to_integer_array from pandas.core.dtypes.common import ( - ensure_int64, - ensure_object, - ensure_categorical, - ensure_platform_int, - is_integer, - is_float, - is_dtype_equal, - is_dtype_union_equal, - is_object_dtype, - is_categorical, - is_categorical_dtype, - is_interval_dtype, - is_period_dtype, - is_bool, - is_bool_dtype, - is_signed_integer_dtype, - is_unsigned_integer_dtype, - is_integer_dtype, is_float_dtype, - is_datetime64_any_dtype, - is_datetime64tz_dtype, - 
is_timedelta64_dtype, - is_extension_array_dtype, - is_hashable, - is_iterator, is_list_like, - is_scalar) - -from pandas.core.base import PandasObject, IndexOpsMixin -import pandas.core.common as com + ensure_categorical, ensure_int64, ensure_object, ensure_platform_int, + is_bool, is_bool_dtype, is_categorical, is_categorical_dtype, + is_datetime64_any_dtype, is_datetime64tz_dtype, is_dtype_equal, + is_dtype_union_equal, is_extension_array_dtype, is_float, is_float_dtype, + is_hashable, is_integer, is_integer_dtype, is_interval_dtype, is_iterator, + is_list_like, is_object_dtype, is_period_dtype, is_scalar, + is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype) +import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCDateOffset, ABCDatetimeIndex, ABCIndexClass, + ABCMultiIndex, ABCPeriodIndex, ABCSeries, ABCTimedeltaArray, + ABCTimedeltaIndex) +from pandas.core.dtypes.missing import array_equivalent, isna + from pandas.core import ops -from pandas.util._decorators import ( - Appender, Substitution, cache_readonly) +from pandas.core.accessor import CachedAccessor +import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray +from pandas.core.base import IndexOpsMixin, PandasObject +import pandas.core.common as com from pandas.core.indexes.frozen import FrozenList -import pandas.core.dtypes.concat as _concat import pandas.core.missing as missing -import pandas.core.algorithms as algos +from pandas.core.ops import get_op_result_name, make_invalid_op import pandas.core.sorting as sorting -from pandas.io.formats.printing import ( - pprint_thing, default_pprint, format_object_summary, format_object_attrs) -from pandas.core.ops import make_invalid_op, get_op_result_name from pandas.core.strings import StringMethods +from pandas.io.formats.printing import ( + default_pprint, format_object_attrs, format_object_summary, pprint_thing) + __all__ = ['Index'] _unsortable_types = frozenset(('mixed', 'mixed-integer')) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 6e2f0b00fcd6e..86269c7795d35 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -2,31 +2,28 @@ import warnings import numpy as np -from pandas._libs import index as libindex -from pandas import compat +from pandas._libs import index as libindex +import pandas.compat as compat from pandas.compat.numpy import function as nv -from pandas.core.dtypes.generic import ABCCategorical, ABCSeries -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.util._decorators import Appender, cache_readonly + from pandas.core.dtypes.common import ( - is_categorical_dtype, - ensure_platform_int, - is_list_like, - is_interval_dtype, + ensure_platform_int, is_categorical_dtype, is_interval_dtype, is_list_like, is_scalar) +from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.generic import ABCCategorical, ABCSeries from pandas.core.dtypes.missing import array_equivalent, isna -from pandas.core.algorithms import take_1d - -from pandas.util._decorators import Appender, cache_readonly -from pandas.core.config import get_option -from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core import accessor +from pandas.core.algorithms import take_1d +from pandas.core.arrays.categorical import Categorical, contains import pandas.core.common as com -import pandas.core.missing as missing +from pandas.core.config import get_option import 
pandas.core.indexes.base as ibase +from pandas.core.indexes.base import Index, _index_shared_docs +import pandas.core.missing as missing from pandas.core.ops import get_op_result_name -from pandas.core.arrays.categorical import Categorical, contains _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index cb145dd9eed4d..46731069d88b8 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -9,10 +9,15 @@ """ import warnings + import numpy as np -from pandas.core.base import PandasObject + from pandas.util._decorators import deprecate_kwarg + from pandas.core.dtypes.cast import coerce_indexer_dtype + +from pandas.core.base import PandasObject + from pandas.io.formats.printing import pprint_thing diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c64a179a299e9..1ebcf213ab0eb 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -4,49 +4,35 @@ import numpy as np +from pandas._libs import Timedelta, Timestamp +from pandas._libs.interval import Interval, IntervalMixin, IntervalTree from pandas.compat import add_metaclass -from pandas.core.dtypes.missing import isna +from pandas.util._decorators import Appender, cache_readonly +from pandas.util._doctools import _WritableDoc +from pandas.util._exceptions import rewrite_exception + from pandas.core.dtypes.cast import ( - find_common_type, maybe_downcast_to_dtype, infer_dtype_from_scalar) + find_common_type, infer_dtype_from_scalar, maybe_downcast_to_dtype) from pandas.core.dtypes.common import ( - ensure_platform_int, - is_list_like, - is_datetime_or_timedelta_dtype, - is_datetime64tz_dtype, - is_dtype_equal, - is_integer_dtype, - is_float_dtype, - is_interval_dtype, - is_object_dtype, - is_scalar, - is_float, - is_number, - is_integer) -from pandas.core.indexes.base import ( - Index, ensure_index, - default_pprint, _index_shared_docs) -from pandas.core.ops import get_op_result_name - -from pandas._libs import Timestamp, Timedelta -from pandas._libs.interval import ( - Interval, IntervalMixin, IntervalTree, -) + ensure_platform_int, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, + is_dtype_equal, is_float, is_float_dtype, is_integer, is_integer_dtype, + is_interval_dtype, is_list_like, is_number, is_object_dtype, is_scalar) +from pandas.core.dtypes.missing import isna -from pandas.core.indexes.datetimes import date_range, DatetimeIndex -from pandas.core.indexes.timedeltas import timedelta_range, TimedeltaIndex -from pandas.core.indexes.multi import MultiIndex +from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs import pandas.core.common as com -from pandas.util._decorators import cache_readonly, Appender -from pandas.util._doctools import _WritableDoc -from pandas.util._exceptions import rewrite_exception from pandas.core.config import get_option +import pandas.core.indexes.base as ibase +from pandas.core.indexes.base import ( + Index, _index_shared_docs, default_pprint, ensure_index) +from pandas.core.indexes.datetimes import DatetimeIndex, date_range +from pandas.core.indexes.multi import MultiIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range +from pandas.core.ops import get_op_result_name + from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import DateOffset -import pandas.core.indexes.base as ibase -from 
pandas.core.arrays.interval import (IntervalArray, - _interval_shared_docs) - _VALID_CLOSED = {'left', 'right', 'both', 'neither'} _index_doc_kwargs = dict(ibase._index_doc_kwargs) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 619e1ae866a1b..0e3fd201d8ddb 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,47 +1,37 @@ # pylint: disable=E1101,E1103,W0232 import datetime -import warnings from sys import getsizeof +import warnings import numpy as np -from pandas._libs import algos as libalgos, index as libindex, lib, Timestamp -from pandas._libs import tslibs -from pandas.compat import range, zip, lrange, lzip, map +from pandas._libs import ( + Timestamp, algos as libalgos, index as libindex, lib, tslibs) +import pandas.compat as compat +from pandas.compat import lrange, lzip, map, range, zip from pandas.compat.numpy import function as nv -from pandas import compat +from pandas.errors import PerformanceWarning, UnsortedIndexError +from pandas.util._decorators import Appender, cache_readonly -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, PandasExtensionDtype) from pandas.core.dtypes.common import ( - ensure_int64, - ensure_platform_int, - is_categorical_dtype, - is_object_dtype, - is_hashable, - is_integer, - is_iterator, - is_list_like, - pandas_dtype, - is_scalar) -from pandas.core.dtypes.missing import isna, array_equivalent -from pandas.errors import PerformanceWarning, UnsortedIndexError + ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable, + is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar, + pandas_dtype) +from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype +from pandas.core.dtypes.missing import array_equivalent, isna -from pandas.util._decorators import Appender, cache_readonly -import pandas.core.common as com -import pandas.core.missing as missing import pandas.core.algorithms as algos -from pandas.io.formats.printing import pprint_thing - +import pandas.core.common as com from pandas.core.config import get_option - +import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( - Index, ensure_index, - InvalidIndexError, - _index_shared_docs) + Index, InvalidIndexError, _index_shared_docs, ensure_index) from pandas.core.indexes.frozen import FrozenList, _ensure_frozen -import pandas.core.indexes.base as ibase +import pandas.core.missing as missing + +from pandas.io.formats.printing import pprint_thing + _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( dict(klass='MultiIndex', diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 23b2dde2e6494..491176bc586a8 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,26 +1,22 @@ import warnings import numpy as np + from pandas._libs import index as libindex +import pandas.compat as compat +from pandas.util._decorators import Appender, cache_readonly + from pandas.core.dtypes.common import ( - is_dtype_equal, - pandas_dtype, - needs_i8_conversion, - is_integer_dtype, - is_float, - is_bool, - is_bool_dtype, - is_scalar) + is_bool, is_bool_dtype, is_dtype_equal, is_float, is_integer_dtype, + is_scalar, needs_i8_conversion, pandas_dtype) +import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna -from pandas import compat from pandas.core import algorithms import pandas.core.common as com +import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, 
InvalidIndexError, _index_shared_docs) -from pandas.util._decorators import Appender, cache_readonly -import pandas.core.dtypes.concat as _concat -import pandas.core.indexes.base as ibase from pandas.core.ops import get_op_result_name _num_index_shared_docs = dict() diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f83687bacd72d..fec3a9bd24cc8 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,49 +1,40 @@ # pylint: disable=E1101,E1103,W0232 from datetime import datetime, timedelta -import numpy as np import operator import warnings -from pandas.core import common as com -from pandas.core.dtypes.common import ( - is_integer, - is_float, - is_float_dtype, - is_integer_dtype, - is_datetime64_any_dtype, - is_bool_dtype, - pandas_dtype -) -from pandas.core.ops import get_op_result_name -from pandas.core.accessor import PandasDelegate, delegate_names -from pandas.core.indexes.datetimes import DatetimeIndex, Int64Index, Index -from pandas.core.indexes.datetimelike import ( - DatelikeOps, DatetimeIndexOpsMixin, wrap_arithmetic_op -) -from pandas.core.tools.datetimes import parse_time_string, DateParseError +import numpy as np from pandas._libs import index as libindex -from pandas._libs.tslibs.period import (Period, IncompatibleFrequency, - DIFFERENT_FREQ_INDEX) +from pandas._libs.tslibs import NaT, iNaT, resolution +from pandas._libs.tslibs.period import ( + DIFFERENT_FREQ_INDEX, IncompatibleFrequency, Period) +from pandas.util._decorators import ( + Appender, Substitution, cache_readonly, deprecate_kwarg) -from pandas._libs.tslibs import resolution, NaT, iNaT +from pandas.core.dtypes.common import ( + is_bool_dtype, is_datetime64_any_dtype, is_float, is_float_dtype, + is_integer, is_integer_dtype, pandas_dtype) +from pandas import compat +from pandas.core import common as com +from pandas.core.accessor import PandasDelegate, delegate_names from pandas.core.algorithms import unique1d import pandas.core.arrays.datetimelike as dtl from pandas.core.arrays.period import PeriodArray, period_array from pandas.core.base import _shared_docs +import pandas.core.indexes.base as ibase from pandas.core.indexes.base import _index_shared_docs, ensure_index +from pandas.core.indexes.datetimelike import ( + DatelikeOps, DatetimeIndexOpsMixin, wrap_arithmetic_op) +from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index from pandas.core.missing import isna +from pandas.core.ops import get_op_result_name +from pandas.core.tools.datetimes import DateParseError, parse_time_string -from pandas import compat -from pandas.util._decorators import ( - Appender, Substitution, cache_readonly, deprecate_kwarg -) - -from pandas.tseries.offsets import Tick, DateOffset from pandas.tseries import frequencies +from pandas.tseries.offsets import DateOffset, Tick -import pandas.core.indexes.base as ibase _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( dict(target_klass='PeriodIndex or list of Periods')) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 8ed181381f668..cfa451db866be 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -3,25 +3,18 @@ import warnings import numpy as np -from pandas._libs.indexing import _NDFrameIndexerBase -from pandas.util._decorators import Appender - -from pandas.errors import AbstractMethodError +from pandas._libs.indexing import _NDFrameIndexerBase import pandas.compat as compat from pandas.compat import range, zip +from pandas.errors import 
AbstractMethodError +from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( - is_integer_dtype, - is_integer, is_float, - is_list_like, - is_sequence, - is_iterator, - is_scalar, - is_sparse, - ensure_platform_int) + ensure_platform_int, is_float, is_integer, is_integer_dtype, is_iterator, + is_list_like, is_scalar, is_sequence, is_sparse) from pandas.core.dtypes.generic import ABCDataFrame, ABCPanel, ABCSeries -from pandas.core.dtypes.missing import isna, _infer_fill_value +from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com from pandas.core.index import Index, MultiIndex diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 222873cd7f81a..1012639fe0f9d 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1,26 +1,19 @@ """ Routines for filling missing data """ +from distutils.version import LooseVersion import operator import numpy as np -from distutils.version import LooseVersion from pandas._libs import algos, lib - from pandas.compat import range, string_types -from pandas.core.dtypes.common import ( - is_numeric_v_string_like, - is_float_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, - is_integer_dtype, - is_scalar, - is_integer, - needs_i8_conversion, - ensure_float64) from pandas.core.dtypes.cast import infer_dtype_from_array +from pandas.core.dtypes.common import ( + ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, is_float_dtype, + is_integer, is_integer_dtype, is_numeric_v_string_like, is_scalar, + needs_i8_conversion) from pandas.core.dtypes.missing import isna diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index afba433f0e391..4369ac60a075e 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1,27 +1,24 @@ +from distutils.version import LooseVersion import functools import itertools import operator import warnings -from distutils.version import LooseVersion import numpy as np -import pandas.core.common as com -from pandas import compat -from pandas._libs import tslibs, lib -from pandas.core.config import get_option +from pandas._libs import lib, tslibs +import pandas.compat as compat + from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask from pandas.core.dtypes.common import ( - _get_dtype, - is_float, is_scalar, - is_integer, is_complex, is_float_dtype, - is_complex_dtype, is_integer_dtype, - is_bool_dtype, is_object_dtype, - is_numeric_dtype, - is_datetime64_dtype, is_timedelta64_dtype, - is_datetime_or_timedelta_dtype, - is_int_or_datetime_dtype, is_any_int_dtype) -from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype + _get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, is_complex_dtype, + is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_float, + is_float_dtype, is_int_or_datetime_dtype, is_integer, is_integer_dtype, + is_numeric_dtype, is_object_dtype, is_scalar, is_timedelta64_dtype) +from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna + +import pandas.core.common as com +from pandas.core.config import get_option _BOTTLENECK_INSTALLED = False _MIN_BOTTLENECK_VERSION = '1.0.0' diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 869a1d6e2fb14..2a21593fab8f5 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -5,50 +5,41 @@ """ # necessary to enforce truediv in Python 2.X from __future__ import division + import datetime import operator import textwrap import warnings import numpy as np -import pandas as pd - -from pandas._libs import lib, algos as 
libalgos, ops as libops - -from pandas import compat -from pandas.util._decorators import Appender +from pandas._libs import algos as libalgos, lib, ops as libops +import pandas.compat as compat from pandas.compat import bind_method -import pandas.core.missing as missing -import pandas.core.common as com - from pandas.errors import NullFrequencyError -from pandas.core.dtypes.missing import notna, isna -from pandas.core.dtypes.common import ( - needs_i8_conversion, - is_datetimelike_v_numeric, - is_period_dtype, - is_integer_dtype, is_categorical_dtype, - is_object_dtype, is_timedelta64_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, - is_bool_dtype, - is_list_like, - is_scalar, - is_extension_array_dtype, - ensure_object) +from pandas.util._decorators import Appender + from pandas.core.dtypes.cast import ( - maybe_upcast_putmask, find_common_type, - construct_1d_object_array_from_listlike) + construct_1d_object_array_from_listlike, find_common_type, + maybe_upcast_putmask) +from pandas.core.dtypes.common import ( + ensure_object, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, + is_datetime64tz_dtype, is_datetimelike_v_numeric, is_extension_array_dtype, + is_integer_dtype, is_list_like, is_object_dtype, is_period_dtype, + is_scalar, is_timedelta64_dtype, needs_i8_conversion) from pandas.core.dtypes.generic import ( - ABCSeries, - ABCDataFrame, ABCPanel, - ABCIndex, ABCIndexClass, - ABCSparseSeries, ABCSparseArray) + ABCDataFrame, ABCIndex, ABCIndexClass, ABCPanel, ABCSeries, ABCSparseArray, + ABCSparseSeries) +from pandas.core.dtypes.missing import isna, notna +import pandas as pd +import pandas.core.common as com +import pandas.core.missing as missing # ----------------------------------------------------------------------------- # Ops Wrapping Utilities + def get_op_result_name(left, right): """ Find the appropriate name to pin to an operation result. 
This result diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 5ae7848b5adc6..bf076b306a9ad 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -4,36 +4,37 @@ # pylint: disable=E1103,W0231,W0212,W0621 from __future__ import division -import numpy as np import warnings + +import numpy as np + +import pandas.compat as compat +from pandas.compat import OrderedDict, map, range, u, zip +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, Substitution, deprecate_kwarg +from pandas.util._validators import validate_axis_style_args + from pandas.core.dtypes.cast import ( - infer_dtype_from_scalar, - cast_scalar_to_array, - maybe_cast_item) + cast_scalar_to_array, infer_dtype_from_scalar, maybe_cast_item) from pandas.core.dtypes.common import ( - is_integer, is_list_like, - is_string_like, is_scalar) + is_integer, is_list_like, is_scalar, is_string_like) from pandas.core.dtypes.missing import notna -import pandas.core.ops as ops import pandas.core.common as com -import pandas.core.indexes.base as ibase -from pandas import compat -from pandas.compat import (map, zip, range, u, OrderedDict) -from pandas.compat.numpy import function as nv from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.index import (Index, MultiIndex, ensure_index, - _get_objs_combined_axis) -from pandas.io.formats.printing import pprint_thing +from pandas.core.index import ( + Index, MultiIndex, _get_objs_combined_axis, ensure_index) +import pandas.core.indexes.base as ibase from pandas.core.indexing import maybe_droplevels -from pandas.core.internals import (BlockManager, - create_block_manager_from_arrays, - create_block_manager_from_blocks) -from pandas.core.series import Series +from pandas.core.internals import ( + BlockManager, create_block_manager_from_arrays, + create_block_manager_from_blocks) +import pandas.core.ops as ops from pandas.core.reshape.util import cartesian_product -from pandas.util._decorators import Appender, Substitution, deprecate_kwarg -from pandas.util._validators import validate_axis_style_args +from pandas.core.series import Series + +from pandas.io.formats.printing import pprint_thing _shared_doc_kwargs = dict( axes='items, major_axis, minor_axis', diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 26b1e33e9893b..4f0669a568ae7 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1,37 +1,37 @@ -from datetime import timedelta -import numpy as np -import warnings import copy +from datetime import timedelta from textwrap import dedent +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas._libs.tslibs import NaT, Timestamp +from pandas._libs.tslibs.period import IncompatibleFrequency +import pandas.compat as compat +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, Substitution + +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries import pandas as pd +import pandas.core.algorithms as algos +from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin -from pandas.core.groupby.ops import BinGrouper +from pandas.core.groupby.generic import PanelGroupBy, SeriesGroupBy from pandas.core.groupby.groupby import ( - _GroupBy, GroupBy, groupby, _pipe_template -) + GroupBy, _GroupBy, _pipe_template, groupby) from pandas.core.groupby.grouper import Grouper -from pandas.core.groupby.generic 
import SeriesGroupBy, PanelGroupBy - -from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod +from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.datetimes import DatetimeIndex, date_range -from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas.tseries.offsets import (DateOffset, Tick, Day, - delta_to_nanoseconds, Nano) from pandas.core.indexes.period import PeriodIndex -from pandas.errors import AbstractMethodError -import pandas.core.algorithms as algos -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries - -import pandas.compat as compat -from pandas.compat.numpy import function as nv +from pandas.core.indexes.timedeltas import TimedeltaIndex -from pandas._libs import lib -from pandas._libs.tslibs import Timestamp, NaT -from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas.tseries.frequencies import is_subperiod, is_superperiod, to_offset +from pandas.tseries.offsets import ( + DateOffset, Day, Nano, Tick, delta_to_nanoseconds) -from pandas.util._decorators import Appender, Substitution -from pandas.core.generic import _shared_docs _shared_docs_kwargs = dict() diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ee1c62f3decf9..b34dfddcc66e1 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,18 +1,17 @@ """ miscellaneous sorting / groupby utilities """ import numpy as np -from pandas.compat import long, string_types, PY3 -from pandas.core.dtypes.common import ( - ensure_platform_int, - ensure_int64, - is_list_like, - is_categorical_dtype) + +from pandas._libs import algos, hashtable, lib +from pandas._libs.hashtable import unique_label_indices +from pandas.compat import PY3, long, string_types + from pandas.core.dtypes.cast import infer_dtype_from_array +from pandas.core.dtypes.common import ( + ensure_int64, ensure_platform_int, is_categorical_dtype, is_list_like) from pandas.core.dtypes.missing import isna -import pandas.core.algorithms as algorithms -from pandas._libs import lib, algos, hashtable -from pandas._libs.hashtable import unique_label_indices +import pandas.core.algorithms as algorithms _INT64_MAX = np.iinfo(np.int64).max diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6c21318c93597..0b791f6f91aa3 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,31 +1,26 @@ # -*- coding: utf-8 -*- +import codecs +import re +import textwrap +import warnings + import numpy as np +import pandas._libs.lib as lib +import pandas._libs.ops as libops +import pandas.compat as compat from pandas.compat import zip -from pandas.core.dtypes.generic import ABCSeries, ABCIndex -from pandas.core.dtypes.missing import isna +from pandas.util._decorators import Appender, deprecate_kwarg + from pandas.core.dtypes.common import ( - ensure_object, - is_bool_dtype, - is_categorical_dtype, - is_object_dtype, - is_string_like, - is_list_like, - is_scalar, - is_integer, - is_re) + ensure_object, is_bool_dtype, is_categorical_dtype, is_integer, + is_list_like, is_object_dtype, is_re, is_scalar, is_string_like) +from pandas.core.dtypes.generic import ABCIndex, ABCSeries +from pandas.core.dtypes.missing import isna -import pandas.core.common as com from pandas.core.algorithms import take_1d -import pandas.compat as compat from pandas.core.base import NoNewAttributesMixin -from pandas.util._decorators import Appender, deprecate_kwarg -import re -import pandas._libs.lib as lib -import pandas._libs.ops as libops -import warnings -import textwrap -import codecs 
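# [Editor's aside -- illustration only, not part of the quoted patch.]
# A minimal sketch of the import layout this isort pass enforces, assuming the
# setup.cfg options visible in the hunk further below (force_grid_wrap=0,
# combine_as_imports=True, force_sort_within_sections=True): imports are
# grouped into sections -- stdlib, third-party, then project-internal -- and,
# because of force_sort_within_sections, plain `import x` and `from x import y`
# lines are alphabetized together inside each section instead of being split
# into separate sub-blocks. Applied to a small module it would look like:
#
#     import codecs                        # stdlib section
#     import re
#     import warnings
#
#     import numpy as np                   # third-party section
#
#     import pandas._libs.lib as lib       # project section: `import` and
#     from pandas.compat import zip        # `from` forms sorted together
#     from pandas.core.dtypes.missing import isna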
+import pandas.core.common as com _cpython_optimized_encoders = ( "utf-8", "utf8", "latin-1", "latin1", "iso-8859-1", "mbcs", "ascii" diff --git a/setup.cfg b/setup.cfg index e8db1308741aa..4027e040cf421 100644 --- a/setup.cfg +++ b/setup.cfg @@ -111,33 +111,8 @@ force_grid_wrap=0 combine_as_imports=True force_sort_within_sections=True skip= - pandas/core/ops.py, - pandas/core/categorical.py, pandas/core/api.py, - pandas/core/indexing.py, - pandas/core/apply.py, - pandas/core/generic.py, - pandas/core/sorting.py, pandas/core/frame.py, - pandas/core/nanops.py, - pandas/core/algorithms.py, - pandas/core/strings.py, - pandas/core/panel.py, - pandas/core/config.py, - pandas/core/resample.py, - pandas/core/base.py, - pandas/core/common.py, - pandas/core/missing.py, - pandas/core/config_init.py, - pandas/core/indexes/category.py, - pandas/core/indexes/api.py, - pandas/core/indexes/numeric.py, - pandas/core/indexes/interval.py, - pandas/core/indexes/multi.py, - pandas/core/indexes/base.py, - pandas/core/indexes/accessors.py, - pandas/core/indexes/period.py, - pandas/core/indexes/frozen.py, pandas/tests/test_errors.py, pandas/tests/test_base.py, pandas/tests/test_register_accessor.py, From e5c90e5e355077c4ee5245a4a72833f4e8af72b9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 23 Nov 2018 03:07:21 +0000 Subject: [PATCH 12/21] TST: split up pandas/tests/test_multilevel.py (#23797) --- pandas/tests/indexing/test_multiindex.py | 894 ++++++++++++++++++++++- pandas/tests/test_multilevel.py | 810 +------------------- 2 files changed, 892 insertions(+), 812 deletions(-) diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index f4caf17b60d65..bf5fa0a48cfe7 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -1,19 +1,58 @@ from datetime import datetime -from warnings import catch_warnings +from warnings import catch_warnings, simplefilter import numpy as np +from numpy.random import randn import pytest +import pandas._libs.index as _index +from pandas.compat import ( + StringIO, lrange, lzip, product as cart_product, range, u, zip) from pandas.errors import PerformanceWarning, UnsortedIndexError import pandas as pd from pandas import ( - DataFrame, Index, MultiIndex, Panel, Period, Series, Timestamp, date_range, - period_range) + DataFrame, Index, MultiIndex, Panel, Period, Series, Timestamp, concat, + date_range, isna, notna, period_range, read_csv) +import pandas.core.common as com from pandas.tests.indexing.common import _mklbl from pandas.util import testing as tm +@pytest.fixture +def multiindex_dataframe_random_data(): + """DataFrame with 2 level MultiIndex with random data""" + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + return DataFrame(np.random.randn(10, 3), index=index, + columns=Index(['A', 'B', 'C'], name='exp')) + + +@pytest.fixture +def single_level_multiindex(): + """single level MultiIndex""" + return MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]], names=['first']) + + +@pytest.fixture +def multiindex_year_month_day_dataframe_random_data(): + """DataFrame with 3 level MultiIndex (year, month, day) covering + first 100 business days from 2000-01-01 with random data""" + tm.N = 100 + tdf = tm.makeTimeDataFrame() + ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, + lambda x: x.day]).sum() + # use Int64Index, 
to make sure things work + ymd.index.set_levels([lev.astype('i8') for lev in ymd.index.levels], + inplace=True) + ymd.index.set_names(['year', 'month', 'day'], inplace=True) + return ymd + + @pytest.mark.filterwarnings("ignore:\\n.ix:DeprecationWarning") class TestMultiIndexBasic(object): @@ -746,6 +785,855 @@ def test_multiindex_contains_dropped(self): def test_multiindex_is_homogeneous_type(self, data, expected): assert data._is_homogeneous_type is expected + def test_getitem_simple(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T + + col = df['foo', 'one'] + tm.assert_almost_equal(col.values, df.values[:, 0]) + with pytest.raises(KeyError): + df[('foo', 'four')] + with pytest.raises(KeyError): + df['foobar'] + + def test_series_getitem( + self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd['A'] + + result = s[2000, 3] + + # TODO(wesm): unused? + # result2 = s.loc[2000, 3] + + expected = s.reindex(s.index[42:65]) + expected.index = expected.index.droplevel(0).droplevel(0) + tm.assert_series_equal(result, expected) + + result = s[2000, 3, 10] + expected = s[49] + assert result == expected + + # fancy + expected = s.reindex(s.index[49:51]) + result = s.loc[[(2000, 3, 10), (2000, 3, 13)]] + tm.assert_series_equal(result, expected) + + with catch_warnings(record=True): + simplefilter("ignore", DeprecationWarning) + result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] + tm.assert_series_equal(result, expected) + + # key error + pytest.raises(KeyError, s.__getitem__, (2000, 3, 4)) + + def test_series_getitem_corner( + self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd['A'] + + # don't segfault, GH #495 + # out of bounds access + pytest.raises(IndexError, s.__getitem__, len(ymd)) + + # generator + result = s[(x > 0 for x in s)] + expected = s[s > 0] + tm.assert_series_equal(result, expected) + + def test_series_setitem( + self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd['A'] + + s[2000, 3] = np.nan + assert isna(s.values[42:65]).all() + assert notna(s.values[:42]).all() + assert notna(s.values[65:]).all() + + s[2000, 3, 10] = np.nan + assert isna(s[49]) + + def test_series_slice_partial(self): + pass + + def test_frame_getitem_setitem_boolean( + self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T.copy() + values = df.values + + result = df[df > 0] + expected = df.where(df > 0) + tm.assert_frame_equal(result, expected) + + df[df > 0] = 5 + values[values > 0] = 5 + tm.assert_almost_equal(df.values, values) + + df[df == 5] = 0 + values[values == 5] = 0 + tm.assert_almost_equal(df.values, values) + + # a df that needs alignment first + df[df[:-1] < 0] = 2 + np.putmask(values[:-1], values[:-1] < 0, 2) + tm.assert_almost_equal(df.values, values) + + with pytest.raises(TypeError, match='boolean values only'): + df[df * 0] = 2 + + def test_frame_getitem_setitem_slice( + self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + # getitem + result = frame.iloc[:4] + expected = frame[:4] + tm.assert_frame_equal(result, expected) + + # setitem + cp = frame.copy() + cp.iloc[:4] = 0 + + assert (cp.values[:4] == 0).all() + assert (cp.values[4:] != 0).all() + + def test_frame_getitem_setitem_multislice(self): + levels = [['t1', 't2'], ['a', 'b', 'c']] + labels = [[0, 0, 0, 1, 1], [0, 
1, 2, 0, 1]]
+        midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id'])
+        df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx)
+
+        result = df.loc[:, 'value']
+        tm.assert_series_equal(df['value'], result)
+
+        with catch_warnings(record=True):
+            simplefilter("ignore", DeprecationWarning)
+            result = df.ix[:, 'value']
+            tm.assert_series_equal(df['value'], result)
+
+        result = df.loc[df.index[1:3], 'value']
+        tm.assert_series_equal(df['value'][1:3], result)
+
+        result = df.loc[:, :]
+        tm.assert_frame_equal(df, result)
+
+        result = df
+        df.loc[:, 'value'] = 10
+        result['value'] = 10
+        tm.assert_frame_equal(df, result)
+
+        df.loc[:, :] = 10
+        tm.assert_frame_equal(df, result)
+
+    def test_frame_getitem_multicolumn_empty_level(self):
+        f = DataFrame({'a': ['1', '2', '3'], 'b': ['2', '3', '4']})
+        f.columns = [['level1 item1', 'level1 item2'], ['', 'level2 item2'],
+                     ['level3 item1', 'level3 item2']]
+
+        result = f['level1 item1']
+        expected = DataFrame([['1'], ['2'], ['3']], index=f.index,
+                             columns=['level3 item1'])
+        tm.assert_frame_equal(result, expected)
+
+    def test_frame_setitem_multi_column(self):
+        df = DataFrame(randn(10, 4), columns=[['a', 'a', 'b', 'b'],
+                                              [0, 1, 0, 1]])
+
+        cp = df.copy()
+        cp['a'] = cp['b']
+        tm.assert_frame_equal(cp['a'], cp['b'])
+
+        # set with ndarray
+        cp = df.copy()
+        cp['a'] = cp['b'].values
+        tm.assert_frame_equal(cp['a'], cp['b'])
+
+        # ---------------------------------------
+        # #1803
+        columns = MultiIndex.from_tuples([('A', '1'), ('A', '2'), ('B', '1')])
+        df = DataFrame(index=[1, 3, 5], columns=columns)
+
+        # Works, but adds a column instead of updating the two existing ones
+        df['A'] = 0.0  # Doesn't work
+        assert (df['A'].values == 0).all()
+
+        # it broadcasts
+        df['B', '1'] = [1, 2, 3]
+        df['A'] = df['B', '1']
+
+        sliced_a1 = df['A', '1']
+        sliced_a2 = df['A', '2']
+        sliced_b1 = df['B', '1']
+        tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False)
+        tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False)
+        assert sliced_a1.name == ('A', '1')
+        assert sliced_a2.name == ('A', '2')
+        assert sliced_b1.name == ('B', '1')
+
+    def test_getitem_tuple_plus_slice(self):
+        # GH #671
+        df = DataFrame({'a': lrange(10),
+                        'b': lrange(10),
+                        'c': np.random.randn(10),
+                        'd': np.random.randn(10)})
+
+        idf = df.set_index(['a', 'b'])
+
+        result = idf.loc[(0, 0), :]
+        expected = idf.loc[0, 0]
+        expected2 = idf.xs((0, 0))
+        with catch_warnings(record=True):
+            simplefilter("ignore", DeprecationWarning)
+            expected3 = idf.ix[0, 0]
+
+        tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result, expected2)
+        tm.assert_series_equal(result, expected3)
+
+    def test_getitem_setitem_tuple_plus_columns(
+            self, multiindex_year_month_day_dataframe_random_data):
+        # GH #1013
+        ymd = multiindex_year_month_day_dataframe_random_data
+        df = ymd[:5]
+
+        result = df.loc[(2000, 1, 6), ['A', 'B', 'C']]
+        expected = df.loc[2000, 1, 6][['A', 'B', 'C']]
+        tm.assert_series_equal(result, expected)
+
+    def test_xs(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+        xs = frame.xs(('bar', 'two'))
+        xs2 = frame.loc[('bar', 'two')]
+
+        tm.assert_series_equal(xs, xs2)
+        tm.assert_almost_equal(xs.values, frame.values[4])
+
+        # GH 6574
+        # missing values in returned index should be preserved
+        acc = [
+            ('a', 'abcde', 1),
+            ('b', 'bbcde', 2),
+            ('y', 'yzcde', 25),
+            ('z', 'xbcde', 24),
+            ('z', None, 26),
+            ('z', 'zbcde', 25),
+            ('z', 'ybcde', 26),
+        ]
+        df = DataFrame(acc,
+                       columns=['a1', 'a2', 'cnt']).set_index(['a1', 'a2'])
+
expected = DataFrame({'cnt': [24, 26, 25, 26]}, index=Index( + ['xbcde', np.nan, 'zbcde', 'ybcde'], name='a2')) + + result = df.xs('z', level='a1') + tm.assert_frame_equal(result, expected) + + def test_xs_partial(self, multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data): + frame = multiindex_dataframe_random_data + ymd = multiindex_year_month_day_dataframe_random_data + result = frame.xs('foo') + result2 = frame.loc['foo'] + expected = frame.T['foo'].T + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, result2) + + result = ymd.xs((2000, 4)) + expected = ymd.loc[2000, 4] + tm.assert_frame_equal(result, expected) + + # ex from #1796 + index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]], + labels=[[0, 0, 0, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, + 0, 1]]) + df = DataFrame(np.random.randn(8, 4), index=index, + columns=list('abcd')) + + result = df.xs(['foo', 'one']) + expected = df.loc['foo', 'one'] + tm.assert_frame_equal(result, expected) + + def test_xs_with_duplicates(self, multiindex_dataframe_random_data): + # Issue #13719 + frame = multiindex_dataframe_random_data + df_dup = concat([frame] * 2) + assert df_dup.index.is_unique is False + expected = concat([frame.xs('one', level='second')] * 2) + tm.assert_frame_equal(df_dup.xs('one', level='second'), expected) + tm.assert_frame_equal(df_dup.xs(['one'], level=['second']), expected) + + def test_xs_level(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + result = frame.xs('two', level='second') + expected = frame[frame.index.get_level_values(1) == 'two'] + expected.index = expected.index.droplevel(1) + + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'), ( + 'p', 'q', 'r')]) + df = DataFrame(np.random.randn(3, 5), index=index) + result = df.xs('c', level=2) + expected = df[1:2] + expected.index = expected.index.droplevel(2) + tm.assert_frame_equal(result, expected) + + # this is a copy in 0.14 + result = frame.xs('two', level='second') + + # setting this will give a SettingWithCopyError + # as we are trying to write a view + def f(x): + x[:] = 10 + + pytest.raises(com.SettingWithCopyError, f, result) + + def test_xs_level_multiple(self): + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + df = read_csv(StringIO(text), sep=r'\s+', engine='python') + + result = df.xs(('a', 4), level=['one', 'four']) + expected = df.xs('a').xs(4, level='four') + tm.assert_frame_equal(result, expected) + + # this is a copy in 0.14 + result = df.xs(('a', 4), level=['one', 'four']) + + # setting this will give a SettingWithCopyError + # as we are trying to write a view + def f(x): + x[:] = 10 + + pytest.raises(com.SettingWithCopyError, f, result) + + # GH2107 + dates = lrange(20111201, 20111205) + ids = 'abcde' + idx = MultiIndex.from_tuples([x for x in cart_product(dates, ids)]) + idx.names = ['date', 'secid'] + df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z']) + + rs = df.xs(20111201, level='date') + xp = df.loc[20111201, :] + tm.assert_frame_equal(rs, xp) + + def test_xs_level0(self): + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + df = read_csv(StringIO(text), 
sep=r'\s+', engine='python') + + result = df.xs('a', level=0) + expected = df.xs('a') + assert len(result) == 2 + tm.assert_frame_equal(result, expected) + + def test_xs_level_series(self, multiindex_dataframe_random_data, + multiindex_year_month_day_dataframe_random_data): + frame = multiindex_dataframe_random_data + ymd = multiindex_year_month_day_dataframe_random_data + s = frame['A'] + result = s[:, 'two'] + expected = frame.xs('two', level=1)['A'] + tm.assert_series_equal(result, expected) + + s = ymd['A'] + result = s[2000, 5] + expected = ymd.loc[2000, 5]['A'] + tm.assert_series_equal(result, expected) + + # not implementing this for now + + pytest.raises(TypeError, s.__getitem__, (2000, slice(3, 4))) + + # result = s[2000, 3:4] + # lv =s.index.get_level_values(1) + # expected = s[(lv == 3) | (lv == 4)] + # expected.index = expected.index.droplevel(0) + # tm.assert_series_equal(result, expected) + + # can do this though + + def test_get_loc_single_level(self, single_level_multiindex): + single_level = single_level_multiindex + s = Series(np.random.randn(len(single_level)), + index=single_level) + for k in single_level.values: + s[k] + + def test_getitem_toplevel(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T + + result = df['foo'] + expected = df.reindex(columns=df.columns[:3]) + expected.columns = expected.columns.droplevel(0) + tm.assert_frame_equal(result, expected) + + result = df['bar'] + result2 = df.loc[:, 'bar'] + + expected = df.reindex(columns=df.columns[3:5]) + expected.columns = expected.columns.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, result2) + + def test_getitem_setitem_slice_integers(self): + index = MultiIndex(levels=[[0, 1, 2], [0, 2]], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + + frame = DataFrame(np.random.randn(len(index), 4), index=index, + columns=['a', 'b', 'c', 'd']) + res = frame.loc[1:2] + exp = frame.reindex(frame.index[2:]) + tm.assert_frame_equal(res, exp) + + frame.loc[1:2] = 7 + assert (frame.loc[1:2] == 7).values.all() + + series = Series(np.random.randn(len(index)), index=index) + + res = series.loc[1:2] + exp = series.reindex(series.index[2:]) + tm.assert_series_equal(res, exp) + + series.loc[1:2] = 7 + assert (series.loc[1:2] == 7).values.all() + + def test_getitem_int(self, multiindex_dataframe_random_data): + levels = [[0, 1], [0, 1, 2]] + labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + index = MultiIndex(levels=levels, labels=labels) + + frame = DataFrame(np.random.randn(6, 2), index=index) + + result = frame.loc[1] + expected = frame[-3:] + expected.index = expected.index.droplevel(0) + tm.assert_frame_equal(result, expected) + + # raises exception + pytest.raises(KeyError, frame.loc.__getitem__, 3) + + # however this will work + frame = multiindex_dataframe_random_data + result = frame.iloc[2] + expected = frame.xs(frame.index[2]) + tm.assert_series_equal(result, expected) + + def test_getitem_partial( + self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + ymd = ymd.T + result = ymd[2000, 2] + + expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) + expected.columns = expected.columns.droplevel(0).droplevel(0) + tm.assert_frame_equal(result, expected) + + def test_setitem_change_dtype(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + dft = frame.T + s = dft['foo', 'two'] + dft['foo', 'two'] = s > s.median() + 
tm.assert_series_equal(dft['foo', 'two'], s > s.median())
+        # assert isinstance(dft._data.blocks[1].items, MultiIndex)
+
+        reindexed = dft.reindex(columns=[('foo', 'two')])
+        tm.assert_series_equal(reindexed['foo', 'two'], s > s.median())
+
+    def test_frame_setitem_ix(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+        frame.loc[('bar', 'two'), 'B'] = 5
+        assert frame.loc[('bar', 'two'), 'B'] == 5
+
+        # with integer labels
+        df = frame.copy()
+        df.columns = lrange(3)
+        df.loc[('bar', 'two'), 1] = 7
+        assert df.loc[('bar', 'two'), 1] == 7
+
+        with catch_warnings(record=True):
+            simplefilter("ignore", DeprecationWarning)
+            df = frame.copy()
+            df.columns = lrange(3)
+            df.ix[('bar', 'two'), 1] = 7
+            assert df.loc[('bar', 'two'), 1] == 7
+
+    def test_fancy_slice_partial(
+            self, multiindex_dataframe_random_data,
+            multiindex_year_month_day_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+        result = frame.loc['bar':'baz']
+        expected = frame[3:7]
+        tm.assert_frame_equal(result, expected)
+
+        ymd = multiindex_year_month_day_dataframe_random_data
+        result = ymd.loc[(2000, 2):(2000, 4)]
+        lev = ymd.index.labels[1]
+        expected = ymd[(lev >= 1) & (lev <= 3)]
+        tm.assert_frame_equal(result, expected)
+
+    def test_getitem_partial_column_select(self):
+        idx = MultiIndex(labels=[[0, 0, 0], [0, 1, 1], [1, 0, 1]],
+                         levels=[['a', 'b'], ['x', 'y'], ['p', 'q']])
+        df = DataFrame(np.random.rand(3, 2), index=idx)
+
+        result = df.loc[('a', 'y'), :]
+        expected = df.loc[('a', 'y')]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.loc[('a', 'y'), [1, 0]]
+        expected = df.loc[('a', 'y')][[1, 0]]
+        tm.assert_frame_equal(result, expected)
+
+        with catch_warnings(record=True):
+            simplefilter("ignore", DeprecationWarning)
+            result = df.ix[('a', 'y'), [1, 0]]
+            tm.assert_frame_equal(result, expected)
+
+        pytest.raises(KeyError, df.loc.__getitem__,
+                      (('a', 'foo'), slice(None, None)))
+
+    def test_frame_getitem_view(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+        df = frame.T.copy()
+
+        # this works because we are modifying the underlying array
+        # really a no-no
+        df['foo'].values[:] = 0
+        assert (df['foo'].values == 0).all()
+
+        # but not if it's mixed-type
+        df['foo', 'four'] = 'foo'
+        df = df.sort_index(level=0, axis=1)
+
+        # this will work, but will raise/warn as it's chained assignment
+        def f():
+            df['foo']['one'] = 2
+            return df
+
+        pytest.raises(com.SettingWithCopyError, f)
+
+        try:
+            df = f()
+        except ValueError:
+            pass
+        assert (df['foo', 'one'] == 0).all()
+
+    def test_partial_set(
+            self, multiindex_year_month_day_dataframe_random_data):
+        # GH #397
+        ymd = multiindex_year_month_day_dataframe_random_data
+        df = ymd.copy()
+        exp = ymd.copy()
+        df.loc[2000, 4] = 0
+        exp.loc[2000, 4].values[:] = 0
+        tm.assert_frame_equal(df, exp)
+
+        df['A'].loc[2000, 4] = 1
+        exp['A'].loc[2000, 4].values[:] = 1
+        tm.assert_frame_equal(df, exp)
+
+        df.loc[2000] = 5
+        exp.loc[2000].values[:] = 5
+        tm.assert_frame_equal(df, exp)
+
+        # this works...for now
+        df['A'].iloc[14] = 5
+        assert df['A'][14] == 5
+
+    def test_getitem_lowerdim_corner(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+        pytest.raises(KeyError, frame.loc.__getitem__,
+                      (('bar', 'three'), 'B'))
+
+        # in theory should be inserting in a sorted space????
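+        # (in practice the new key appears to be appended rather than
+        # inserted in sorted position, which is why the assertion below
+        # reads the value back through sort_index())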
+ frame.loc[('bar', 'three'), 'B'] = 0 + assert frame.sort_index().loc[('bar', 'three'), 'B'] == 0 + + # --------------------------------------------------------------------- + # AMBIGUOUS CASES! + + def test_partial_ix_missing( + self, multiindex_year_month_day_dataframe_random_data): + pytest.skip("skipping for now") + + ymd = multiindex_year_month_day_dataframe_random_data + result = ymd.loc[2000, 0] + expected = ymd.loc[2000]['A'] + tm.assert_series_equal(result, expected) + + # need to put in some work here + + # self.ymd.loc[2000, 0] = 0 + # assert (self.ymd.loc[2000]['A'] == 0).all() + + # Pretty sure the second (and maybe even the first) is already wrong. + pytest.raises(Exception, ymd.loc.__getitem__, (2000, 6)) + pytest.raises(Exception, ymd.loc.__getitem__, (2000, 6), 0) + + # --------------------------------------------------------------------- + + def test_int_series_slicing( + self, multiindex_year_month_day_dataframe_random_data): + ymd = multiindex_year_month_day_dataframe_random_data + s = ymd['A'] + result = s[5:] + expected = s.reindex(s.index[5:]) + tm.assert_series_equal(result, expected) + + exp = ymd['A'].copy() + s[5:] = 0 + exp.values[5:] = 0 + tm.assert_numpy_array_equal(s.values, exp.values) + + result = ymd[5:] + expected = ymd.reindex(s.index[5:]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize('unicode_strings', [True, False]) + def test_mixed_depth_get(self, unicode_strings): + # If unicode_strings is True, the column labels in dataframe + # construction will use unicode strings in Python 2 (pull request + # #17099). + + arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + ['', 'OD', 'OD', 'result1', 'result2', 'result1'], + ['', 'wx', 'wy', '', '', '']] + + if unicode_strings: + arrays = [[u(s) for s in arr] for arr in arrays] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.randn(4, 6), columns=index) + + result = df['a'] + expected = df['a', '', ''].rename('a') + tm.assert_series_equal(result, expected) + + result = df['routine1', 'result1'] + expected = df['routine1', 'result1', ''] + expected = expected.rename(('routine1', 'result1')) + tm.assert_series_equal(result, expected) + + def test_mixed_depth_insert(self): + arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + ['', 'OD', 'OD', 'result1', 'result2', 'result1'], + ['', 'wx', 'wy', '', '', '']] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4, 6), columns=index) + + result = df.copy() + expected = df.copy() + result['b'] = [1, 2, 3, 4] + expected['b', '', ''] = [1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + def test_setitem_multiple_partial(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + expected = frame.copy() + result = frame.copy() + result.loc[['foo', 'bar']] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 + tm.assert_frame_equal(result, expected) + + expected = frame.copy() + result = frame.copy() + result.loc['foo':'bar'] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 + tm.assert_frame_equal(result, expected) + + expected = frame['A'].copy() + result = frame['A'].copy() + result.loc[['foo', 'bar']] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 + tm.assert_series_equal(result, expected) + + expected = frame['A'].copy() + result = frame['A'].copy() + result.loc['foo':'bar'] = 0 + expected.loc['foo'] = 0 + expected.loc['bar'] = 0 + tm.assert_series_equal(result, 
expected) + + def test_dataframe_insert_column_all_na(self): + # GH #1534 + mix = MultiIndex.from_tuples([('1a', '2a'), ('1a', '2b'), ('1a', '2c') + ]) + df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mix) + s = Series({(1, 1): 1, (1, 2): 2}) + df['new'] = s + assert df['new'].isna().all() + + def test_set_column_scalar_with_ix(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + subset = frame.index[[1, 4, 5]] + + frame.loc[subset] = 99 + assert (frame.loc[subset].values == 99).all() + + col = frame['B'] + col[subset] = 97 + assert (frame.loc[subset, 'B'] == 97).all() + + def test_indexing_ambiguity_bug_1678(self): + columns = MultiIndex.from_tuples([('Ohio', 'Green'), ('Ohio', 'Red'), ( + 'Colorado', 'Green')]) + index = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2) + ]) + + frame = DataFrame(np.arange(12).reshape((4, 3)), index=index, + columns=columns) + + result = frame.iloc[:, 1] + exp = frame.loc[:, ('Ohio', 'Red')] + assert isinstance(result, Series) + tm.assert_series_equal(result, exp) + + def test_nonunique_assignment_1750(self): + df = DataFrame([[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], + columns=list("ABCD")) + + df = df.set_index(['A', 'B']) + ix = MultiIndex.from_tuples([(1, 1)]) + + df.loc[ix, "C"] = '_' + + assert (df.xs((1, 1))['C'] == '_').all() + + def test_indexing_over_hashtable_size_cutoff(self): + n = 10000 + + old_cutoff = _index._SIZE_CUTOFF + _index._SIZE_CUTOFF = 20000 + + s = Series(np.arange(n), + MultiIndex.from_arrays((["a"] * n, np.arange(n)))) + + # hai it works! + assert s[("a", 5)] == 5 + assert s[("a", 6)] == 6 + assert s[("a", 7)] == 7 + + _index._SIZE_CUTOFF = old_cutoff + + def test_iloc_mi(self): + # GH 13797 + # Test if iloc can handle integer locations in MultiIndexed DataFrame + + data = [['str00', 'str01'], ['str10', 'str11'], ['str20', 'srt21'], + ['str30', 'str31'], ['str40', 'str41']] + + mi = MultiIndex.from_tuples( + [('CC', 'A'), ('CC', 'B'), ('CC', 'B'), ('BB', 'a'), ('BB', 'b')]) + + expected = DataFrame(data) + df_mi = DataFrame(data, index=mi) + + result = DataFrame([[df_mi.iloc[r, c] for c in range(2)] + for r in range(5)]) + + tm.assert_frame_equal(result, expected) + + def test_getitem_multilevel_index_tuple_not_sorted(self): + index_columns = list("abc") + df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]], + columns=index_columns + ["data"]) + df = df.set_index(index_columns) + query_index = df.index[:1] + rs = df.loc[query_index, "data"] + + xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) + xp = Series(['x'], index=xp_idx, name='data') + tm.assert_series_equal(rs, xp) + + def test_getitem_slice_not_sorted(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.sort_index(level=1).T + + # buglet with int typechecking + result = df.iloc[:, :np.int32(3)] + expected = df.reindex(columns=df.columns[:3]) + tm.assert_frame_equal(result, expected) + + def test_frame_getitem_not_sorted2(self): + # 13431 + df = DataFrame({'col1': ['b', 'd', 'b', 'a'], + 'col2': [3, 1, 1, 2], + 'data': ['one', 'two', 'three', 'four']}) + + df2 = df.set_index(['col1', 'col2']) + df2_original = df2.copy() + + df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True) + df2.index.set_labels([0, 1, 0, 2], level='col1', inplace=True) + assert not df2.index.is_lexsorted() + assert not df2.index.is_monotonic + + assert df2_original.index.equals(df2.index) + expected = df2.sort_index() + assert expected.index.is_lexsorted() + assert 
expected.index.is_monotonic + + result = df2.sort_index(level=0) + assert result.index.is_lexsorted() + assert result.index.is_monotonic + tm.assert_frame_equal(result, expected) + + def test_frame_getitem_not_sorted(self, multiindex_dataframe_random_data): + frame = multiindex_dataframe_random_data + df = frame.T + df['foo', 'four'] = 'foo' + + arrays = [np.array(x) for x in zip(*df.columns.values)] + + result = df['foo'] + result2 = df.loc[:, 'foo'] + expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) + expected.columns = expected.columns.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + df = df.T + result = df.xs('foo') + result2 = df.loc['foo'] + expected = df.reindex(df.index[arrays[0] == 'foo']) + expected.index = expected.index.droplevel(0) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) + + def test_series_getitem_not_sorted(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = lzip(*arrays) + index = MultiIndex.from_tuples(tuples) + s = Series(randn(8), index=index) + + arrays = [np.array(x) for x in zip(*index.values)] + + result = s['qux'] + result2 = s.loc['qux'] + expected = s[arrays[0] == 'qux'] + expected.index = expected.index.droplevel(0) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result2, expected) + class TestMultiIndexSlicers(object): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 70d2c9080ab94..cc4ee7ca72343 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -10,16 +10,13 @@ import numpy as np from pandas.core.index import Index, MultiIndex -from pandas import (Panel, DataFrame, Series, notna, isna, Timestamp, concat, - read_csv) +from pandas import (Panel, DataFrame, Series, isna, Timestamp) from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype -import pandas.core.common as com import pandas.util.testing as tm from pandas.compat import (range, lrange, StringIO, lzip, u, product as cart_product, zip) import pandas as pd -import pandas._libs.index as _index AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', 'std', 'var', 'sem'] @@ -239,493 +236,6 @@ def test_repr_name_coincide(self): lines = repr(df).split('\n') assert lines[2].startswith('a 0 foo') - def test_getitem_simple(self): - df = self.frame.T - - col = df['foo', 'one'] - tm.assert_almost_equal(col.values, df.values[:, 0]) - with pytest.raises(KeyError): - df[('foo', 'four')] - with pytest.raises(KeyError): - df['foobar'] - - def test_series_getitem(self): - s = self.ymd['A'] - - result = s[2000, 3] - - # TODO(wesm): unused? 
- # result2 = s.loc[2000, 3] - - expected = s.reindex(s.index[42:65]) - expected.index = expected.index.droplevel(0).droplevel(0) - tm.assert_series_equal(result, expected) - - result = s[2000, 3, 10] - expected = s[49] - assert result == expected - - # fancy - expected = s.reindex(s.index[49:51]) - result = s.loc[[(2000, 3, 10), (2000, 3, 13)]] - tm.assert_series_equal(result, expected) - - with catch_warnings(record=True): - simplefilter("ignore", DeprecationWarning) - result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] - tm.assert_series_equal(result, expected) - - # key error - pytest.raises(KeyError, s.__getitem__, (2000, 3, 4)) - - def test_series_getitem_corner(self): - s = self.ymd['A'] - - # don't segfault, GH #495 - # out of bounds access - pytest.raises(IndexError, s.__getitem__, len(self.ymd)) - - # generator - result = s[(x > 0 for x in s)] - expected = s[s > 0] - tm.assert_series_equal(result, expected) - - def test_series_setitem(self): - s = self.ymd['A'] - - s[2000, 3] = np.nan - assert isna(s.values[42:65]).all() - assert notna(s.values[:42]).all() - assert notna(s.values[65:]).all() - - s[2000, 3, 10] = np.nan - assert isna(s[49]) - - def test_series_slice_partial(self): - pass - - def test_frame_getitem_setitem_boolean(self): - df = self.frame.T.copy() - values = df.values - - result = df[df > 0] - expected = df.where(df > 0) - tm.assert_frame_equal(result, expected) - - df[df > 0] = 5 - values[values > 0] = 5 - tm.assert_almost_equal(df.values, values) - - df[df == 5] = 0 - values[values == 5] = 0 - tm.assert_almost_equal(df.values, values) - - # a df that needs alignment first - df[df[:-1] < 0] = 2 - np.putmask(values[:-1], values[:-1] < 0, 2) - tm.assert_almost_equal(df.values, values) - - with pytest.raises(TypeError, match='boolean values only'): - df[df * 0] = 2 - - def test_frame_getitem_setitem_slice(self): - # getitem - result = self.frame.iloc[:4] - expected = self.frame[:4] - tm.assert_frame_equal(result, expected) - - # setitem - cp = self.frame.copy() - cp.iloc[:4] = 0 - - assert (cp.values[:4] == 0).all() - assert (cp.values[4:] != 0).all() - - def test_frame_getitem_setitem_multislice(self): - levels = [['t1', 't2'], ['a', 'b', 'c']] - labels = [[0, 0, 0, 1, 1], [0, 1, 2, 0, 1]] - midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id']) - df = DataFrame({'value': [1, 2, 3, 7, 8]}, index=midx) - - result = df.loc[:, 'value'] - tm.assert_series_equal(df['value'], result) - - with catch_warnings(record=True): - simplefilter("ignore", DeprecationWarning) - result = df.ix[:, 'value'] - tm.assert_series_equal(df['value'], result) - - result = df.loc[df.index[1:3], 'value'] - tm.assert_series_equal(df['value'][1:3], result) - - result = df.loc[:, :] - tm.assert_frame_equal(df, result) - - result = df - df.loc[:, 'value'] = 10 - result['value'] = 10 - tm.assert_frame_equal(df, result) - - df.loc[:, :] = 10 - tm.assert_frame_equal(df, result) - - def test_frame_getitem_multicolumn_empty_level(self): - f = DataFrame({'a': ['1', '2', '3'], 'b': ['2', '3', '4']}) - f.columns = [['level1 item1', 'level1 item2'], ['', 'level2 item2'], - ['level3 item1', 'level3 item2']] - - result = f['level1 item1'] - expected = DataFrame([['1'], ['2'], ['3']], index=f.index, - columns=['level3 item1']) - tm.assert_frame_equal(result, expected) - - def test_frame_setitem_multi_column(self): - df = DataFrame(randn(10, 4), columns=[['a', 'a', 'b', 'b'], - [0, 1, 0, 1]]) - - cp = df.copy() - cp['a'] = cp['b'] - tm.assert_frame_equal(cp['a'], cp['b']) - - # set with ndarray - cp = 
df.copy() - cp['a'] = cp['b'].values - tm.assert_frame_equal(cp['a'], cp['b']) - - # --------------------------------------- - # #1803 - columns = MultiIndex.from_tuples([('A', '1'), ('A', '2'), ('B', '1')]) - df = DataFrame(index=[1, 3, 5], columns=columns) - - # Works, but adds a column instead of updating the two existing ones - df['A'] = 0.0 # Doesn't work - assert (df['A'].values == 0).all() - - # it broadcasts - df['B', '1'] = [1, 2, 3] - df['A'] = df['B', '1'] - - sliced_a1 = df['A', '1'] - sliced_a2 = df['A', '2'] - sliced_b1 = df['B', '1'] - tm.assert_series_equal(sliced_a1, sliced_b1, check_names=False) - tm.assert_series_equal(sliced_a2, sliced_b1, check_names=False) - assert sliced_a1.name == ('A', '1') - assert sliced_a2.name == ('A', '2') - assert sliced_b1.name == ('B', '1') - - def test_getitem_tuple_plus_slice(self): - # GH #671 - df = DataFrame({'a': lrange(10), - 'b': lrange(10), - 'c': np.random.randn(10), - 'd': np.random.randn(10)}) - - idf = df.set_index(['a', 'b']) - - result = idf.loc[(0, 0), :] - expected = idf.loc[0, 0] - expected2 = idf.xs((0, 0)) - with catch_warnings(record=True): - simplefilter("ignore", DeprecationWarning) - expected3 = idf.ix[0, 0] - - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result, expected2) - tm.assert_series_equal(result, expected3) - - def test_getitem_setitem_tuple_plus_columns(self): - # GH #1013 - - df = self.ymd[:5] - - result = df.loc[(2000, 1, 6), ['A', 'B', 'C']] - expected = df.loc[2000, 1, 6][['A', 'B', 'C']] - tm.assert_series_equal(result, expected) - - def test_xs(self): - xs = self.frame.xs(('bar', 'two')) - xs2 = self.frame.loc[('bar', 'two')] - - tm.assert_series_equal(xs, xs2) - tm.assert_almost_equal(xs.values, self.frame.values[4]) - - # GH 6574 - # missing values in returned index should be preserrved - acc = [ - ('a', 'abcde', 1), - ('b', 'bbcde', 2), - ('y', 'yzcde', 25), - ('z', 'xbcde', 24), - ('z', None, 26), - ('z', 'zbcde', 25), - ('z', 'ybcde', 26), - ] - df = DataFrame(acc, - columns=['a1', 'a2', 'cnt']).set_index(['a1', 'a2']) - expected = DataFrame({'cnt': [24, 26, 25, 26]}, index=Index( - ['xbcde', np.nan, 'zbcde', 'ybcde'], name='a2')) - - result = df.xs('z', level='a1') - tm.assert_frame_equal(result, expected) - - def test_xs_partial(self): - result = self.frame.xs('foo') - result2 = self.frame.loc['foo'] - expected = self.frame.T['foo'].T - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result, result2) - - result = self.ymd.xs((2000, 4)) - expected = self.ymd.loc[2000, 4] - tm.assert_frame_equal(result, expected) - - # ex from #1796 - index = MultiIndex(levels=[['foo', 'bar'], ['one', 'two'], [-1, 1]], - labels=[[0, 0, 0, 0, 1, 1, 1, 1], - [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, - 0, 1]]) - df = DataFrame(np.random.randn(8, 4), index=index, - columns=list('abcd')) - - result = df.xs(['foo', 'one']) - expected = df.loc['foo', 'one'] - tm.assert_frame_equal(result, expected) - - def test_xs_with_duplicates(self): - # Issue #13719 - df_dup = concat([self.frame] * 2) - assert df_dup.index.is_unique is False - expected = concat([self.frame.xs('one', level='second')] * 2) - tm.assert_frame_equal(df_dup.xs('one', level='second'), expected) - tm.assert_frame_equal(df_dup.xs(['one'], level=['second']), expected) - - def test_xs_level(self): - result = self.frame.xs('two', level='second') - expected = self.frame[self.frame.index.get_level_values(1) == 'two'] - expected.index = expected.index.droplevel(1) - - tm.assert_frame_equal(result, expected) - - index = 
MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'), ( - 'p', 'q', 'r')]) - df = DataFrame(np.random.randn(3, 5), index=index) - result = df.xs('c', level=2) - expected = df[1:2] - expected.index = expected.index.droplevel(2) - tm.assert_frame_equal(result, expected) - - # this is a copy in 0.14 - result = self.frame.xs('two', level='second') - - # setting this will give a SettingWithCopyError - # as we are trying to write a view - def f(x): - x[:] = 10 - - pytest.raises(com.SettingWithCopyError, f, result) - - def test_xs_level_multiple(self): - text = """ A B C D E -one two three four -a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 -a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 -x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - - df = read_csv(StringIO(text), sep=r'\s+', engine='python') - - result = df.xs(('a', 4), level=['one', 'four']) - expected = df.xs('a').xs(4, level='four') - tm.assert_frame_equal(result, expected) - - # this is a copy in 0.14 - result = df.xs(('a', 4), level=['one', 'four']) - - # setting this will give a SettingWithCopyError - # as we are trying to write a view - def f(x): - x[:] = 10 - - pytest.raises(com.SettingWithCopyError, f, result) - - # GH2107 - dates = lrange(20111201, 20111205) - ids = 'abcde' - idx = MultiIndex.from_tuples([x for x in cart_product(dates, ids)]) - idx.names = ['date', 'secid'] - df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z']) - - rs = df.xs(20111201, level='date') - xp = df.loc[20111201, :] - tm.assert_frame_equal(rs, xp) - - def test_xs_level0(self): - text = """ A B C D E -one two three four -a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 -a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 -x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" - - df = read_csv(StringIO(text), sep=r'\s+', engine='python') - - result = df.xs('a', level=0) - expected = df.xs('a') - assert len(result) == 2 - tm.assert_frame_equal(result, expected) - - def test_xs_level_series(self): - s = self.frame['A'] - result = s[:, 'two'] - expected = self.frame.xs('two', level=1)['A'] - tm.assert_series_equal(result, expected) - - s = self.ymd['A'] - result = s[2000, 5] - expected = self.ymd.loc[2000, 5]['A'] - tm.assert_series_equal(result, expected) - - # not implementing this for now - - pytest.raises(TypeError, s.__getitem__, (2000, slice(3, 4))) - - # result = s[2000, 3:4] - # lv =s.index.get_level_values(1) - # expected = s[(lv == 3) | (lv == 4)] - # expected.index = expected.index.droplevel(0) - # tm.assert_series_equal(result, expected) - - # can do this though - - def test_get_loc_single_level(self): - s = Series(np.random.randn(len(self.single_level)), - index=self.single_level) - for k in self.single_level.values: - s[k] - - def test_getitem_toplevel(self): - df = self.frame.T - - result = df['foo'] - expected = df.reindex(columns=df.columns[:3]) - expected.columns = expected.columns.droplevel(0) - tm.assert_frame_equal(result, expected) - - result = df['bar'] - result2 = df.loc[:, 'bar'] - - expected = df.reindex(columns=df.columns[3:5]) - expected.columns = expected.columns.droplevel(0) - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result, result2) - - def test_getitem_setitem_slice_integers(self): - index = MultiIndex(levels=[[0, 1, 2], [0, 2]], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) - - frame = DataFrame(np.random.randn(len(index), 4), index=index, - columns=['a', 'b', 'c', 'd']) - res = frame.loc[1:2] - exp = frame.reindex(frame.index[2:]) - tm.assert_frame_equal(res, exp) - - 
frame.loc[1:2] = 7 - assert (frame.loc[1:2] == 7).values.all() - - series = Series(np.random.randn(len(index)), index=index) - - res = series.loc[1:2] - exp = series.reindex(series.index[2:]) - tm.assert_series_equal(res, exp) - - series.loc[1:2] = 7 - assert (series.loc[1:2] == 7).values.all() - - def test_getitem_int(self): - levels = [[0, 1], [0, 1, 2]] - labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] - index = MultiIndex(levels=levels, labels=labels) - - frame = DataFrame(np.random.randn(6, 2), index=index) - - result = frame.loc[1] - expected = frame[-3:] - expected.index = expected.index.droplevel(0) - tm.assert_frame_equal(result, expected) - - # raises exception - pytest.raises(KeyError, frame.loc.__getitem__, 3) - - # however this will work - result = self.frame.iloc[2] - expected = self.frame.xs(self.frame.index[2]) - tm.assert_series_equal(result, expected) - - def test_getitem_partial(self): - ymd = self.ymd.T - result = ymd[2000, 2] - - expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) - expected.columns = expected.columns.droplevel(0).droplevel(0) - tm.assert_frame_equal(result, expected) - - def test_setitem_change_dtype(self): - dft = self.frame.T - s = dft['foo', 'two'] - dft['foo', 'two'] = s > s.median() - tm.assert_series_equal(dft['foo', 'two'], s > s.median()) - # assert isinstance(dft._data.blocks[1].items, MultiIndex) - - reindexed = dft.reindex(columns=[('foo', 'two')]) - tm.assert_series_equal(reindexed['foo', 'two'], s > s.median()) - - def test_frame_setitem_ix(self): - self.frame.loc[('bar', 'two'), 'B'] = 5 - assert self.frame.loc[('bar', 'two'), 'B'] == 5 - - # with integer labels - df = self.frame.copy() - df.columns = lrange(3) - df.loc[('bar', 'two'), 1] = 7 - assert df.loc[('bar', 'two'), 1] == 7 - - with catch_warnings(record=True): - simplefilter("ignore", DeprecationWarning) - df = self.frame.copy() - df.columns = lrange(3) - df.ix[('bar', 'two'), 1] = 7 - assert df.loc[('bar', 'two'), 1] == 7 - - def test_fancy_slice_partial(self): - result = self.frame.loc['bar':'baz'] - expected = self.frame[3:7] - tm.assert_frame_equal(result, expected) - - result = self.ymd.loc[(2000, 2):(2000, 4)] - lev = self.ymd.index.labels[1] - expected = self.ymd[(lev >= 1) & (lev <= 3)] - tm.assert_frame_equal(result, expected) - - def test_getitem_partial_column_select(self): - idx = MultiIndex(labels=[[0, 0, 0], [0, 1, 1], [1, 0, 1]], - levels=[['a', 'b'], ['x', 'y'], ['p', 'q']]) - df = DataFrame(np.random.rand(3, 2), index=idx) - - result = df.loc[('a', 'y'), :] - expected = df.loc[('a', 'y')] - tm.assert_frame_equal(result, expected) - - result = df.loc[('a', 'y'), [1, 0]] - expected = df.loc[('a', 'y')][[1, 0]] - tm.assert_frame_equal(result, expected) - - with catch_warnings(record=True): - simplefilter("ignore", DeprecationWarning) - result = df.ix[('a', 'y'), [1, 0]] - tm.assert_frame_equal(result, expected) - - pytest.raises(KeyError, df.loc.__getitem__, - (('a', 'foo'), slice(None, None))) - def test_delevel_infer_dtype(self): tuples = [tuple for tuple in cart_product( @@ -1355,31 +865,6 @@ def test_alignment(self): exp = x.reindex(exp_index) - y.reindex(exp_index) tm.assert_series_equal(res, exp) - def test_frame_getitem_view(self): - df = self.frame.T.copy() - - # this works because we are modifying the underlying array - # really a no-no - df['foo'].values[:] = 0 - assert (df['foo'].values == 0).all() - - # but not if it's mixed-type - df['foo', 'four'] = 'foo' - df = df.sort_index(level=0, axis=1) - - # this will work, but will 
raise/warn as its chained assignment - def f(): - df['foo']['one'] = 2 - return df - - pytest.raises(com.SettingWithCopyError, f) - - try: - df = f() - except ValueError: - pass - assert (df['foo', 'one'] == 0).all() - def test_count(self): frame = self.frame.copy() frame.index.names = ['a', 'b'] @@ -1544,26 +1029,6 @@ def test_ix_preserve_names(self): assert result.index.name == self.ymd.index.names[2] assert result2.index.name == self.ymd.index.names[2] - def test_partial_set(self): - # GH #397 - df = self.ymd.copy() - exp = self.ymd.copy() - df.loc[2000, 4] = 0 - exp.loc[2000, 4].values[:] = 0 - tm.assert_frame_equal(df, exp) - - df['A'].loc[2000, 4] = 1 - exp['A'].loc[2000, 4].values[:] = 1 - tm.assert_frame_equal(df, exp) - - df.loc[2000] = 5 - exp.loc[2000].values[:] = 5 - tm.assert_frame_equal(df, exp) - - # this works...for now - df['A'].iloc[14] = 5 - assert df['A'][14] == 5 - def test_unstack_preserve_types(self): # GH #403 self.ymd['E'] = 'foo' @@ -1638,35 +1103,6 @@ def test_pyint_engine(self): result = index.get_indexer([missing] + [keys[i] for i in idces]) tm.assert_numpy_array_equal(result, expected) - def test_getitem_lowerdim_corner(self): - pytest.raises(KeyError, self.frame.loc.__getitem__, - (('bar', 'three'), 'B')) - - # in theory should be inserting in a sorted space???? - self.frame.loc[('bar', 'three'), 'B'] = 0 - assert self.frame.sort_index().loc[('bar', 'three'), 'B'] == 0 - - # --------------------------------------------------------------------- - # AMBIGUOUS CASES! - - def test_partial_ix_missing(self): - pytest.skip("skipping for now") - - result = self.ymd.loc[2000, 0] - expected = self.ymd.loc[2000]['A'] - tm.assert_series_equal(result, expected) - - # need to put in some work here - - # self.ymd.loc[2000, 0] = 0 - # assert (self.ymd.loc[2000]['A'] == 0).all() - - # Pretty sure the second (and maybe even the first) is already wrong. - pytest.raises(Exception, self.ymd.loc.__getitem__, (2000, 6)) - pytest.raises(Exception, self.ymd.loc.__getitem__, (2000, 6), 0) - - # --------------------------------------------------------------------- - def test_to_html(self): self.ymd.columns.name = 'foo' self.ymd.to_html() @@ -1717,62 +1153,6 @@ def test_level_with_tuples(self): tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - def test_int_series_slicing(self): - s = self.ymd['A'] - result = s[5:] - expected = s.reindex(s.index[5:]) - tm.assert_series_equal(result, expected) - - exp = self.ymd['A'].copy() - s[5:] = 0 - exp.values[5:] = 0 - tm.assert_numpy_array_equal(s.values, exp.values) - - result = self.ymd[5:] - expected = self.ymd.reindex(s.index[5:]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize('unicode_strings', [True, False]) - def test_mixed_depth_get(self, unicode_strings): - # If unicode_strings is True, the column labels in dataframe - # construction will use unicode strings in Python 2 (pull request - # #17099). 
- - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] - - if unicode_strings: - arrays = [[u(s) for s in arr] for arr in arrays] - - tuples = sorted(zip(*arrays)) - index = MultiIndex.from_tuples(tuples) - df = DataFrame(np.random.randn(4, 6), columns=index) - - result = df['a'] - expected = df['a', '', ''].rename('a') - tm.assert_series_equal(result, expected) - - result = df['routine1', 'result1'] - expected = df['routine1', 'result1', ''] - expected = expected.rename(('routine1', 'result1')) - tm.assert_series_equal(result, expected) - - def test_mixed_depth_insert(self): - arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], - ['', 'OD', 'OD', 'result1', 'result2', 'result1'], - ['', 'wx', 'wy', '', '', '']] - - tuples = sorted(zip(*arrays)) - index = MultiIndex.from_tuples(tuples) - df = DataFrame(randn(4, 6), columns=index) - - result = df.copy() - expected = df.copy() - result['b'] = [1, 2, 3, 4] - expected['b', '', ''] = [1, 2, 3, 4] - tm.assert_frame_equal(result, expected) - def test_mixed_depth_drop(self): arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], ['', 'OD', 'OD', 'result1', 'result2', 'result1'], @@ -1864,35 +1244,6 @@ def test_reindex_level_partial_selection(self): result = self.frame.T.loc[:, ['foo', 'qux']] tm.assert_frame_equal(result, expected.T) - def test_setitem_multiple_partial(self): - expected = self.frame.copy() - result = self.frame.copy() - result.loc[['foo', 'bar']] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 - tm.assert_frame_equal(result, expected) - - expected = self.frame.copy() - result = self.frame.copy() - result.loc['foo':'bar'] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 - tm.assert_frame_equal(result, expected) - - expected = self.frame['A'].copy() - result = self.frame['A'].copy() - result.loc[['foo', 'bar']] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 - tm.assert_series_equal(result, expected) - - expected = self.frame['A'].copy() - result = self.frame['A'].copy() - result.loc['foo':'bar'] = 0 - expected.loc['foo'] = 0 - expected.loc['bar'] = 0 - tm.assert_series_equal(result, expected) - def test_drop_level(self): result = self.frame.drop(['bar', 'qux'], level='first') expected = self.frame.iloc[[0, 1, 2, 5, 6]] @@ -1972,15 +1323,6 @@ def test_unicode_repr_level_names(self): repr(s) repr(df) - def test_dataframe_insert_column_all_na(self): - # GH #1534 - mix = MultiIndex.from_tuples([('1a', '2a'), ('1a', '2b'), ('1a', '2c') - ]) - df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mix) - s = Series({(1, 1): 1, (1, 2): 2}) - df['new'] = s - assert df['new'].isna().all() - def test_join_segfault(self): # 1532 df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]}) @@ -1991,16 +1333,6 @@ def test_join_segfault(self): for how in ['left', 'right', 'outer']: df1.join(df2, how=how) - def test_set_column_scalar_with_ix(self): - subset = self.frame.index[[1, 4, 5]] - - self.frame.loc[subset] = 99 - assert (self.frame.loc[subset].values == 99).all() - - col = self.frame['B'] - col[subset] = 97 - assert (self.frame.loc[subset, 'B'] == 97).all() - def test_frame_dict_constructor_empty_series(self): s1 = Series([ 1, 2, 3, 4 @@ -2014,47 +1346,6 @@ def test_frame_dict_constructor_empty_series(self): DataFrame({'foo': s1, 'bar': s2, 'baz': s3}) DataFrame.from_dict({'foo': s1, 'baz': s3, 'bar': s2}) - def test_indexing_ambiguity_bug_1678(self): - columns = MultiIndex.from_tuples([('Ohio', 
'Green'), ('Ohio', 'Red'), ( - 'Colorado', 'Green')]) - index = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2) - ]) - - frame = DataFrame(np.arange(12).reshape((4, 3)), index=index, - columns=columns) - - result = frame.iloc[:, 1] - exp = frame.loc[:, ('Ohio', 'Red')] - assert isinstance(result, Series) - tm.assert_series_equal(result, exp) - - def test_nonunique_assignment_1750(self): - df = DataFrame([[1, 1, "x", "X"], [1, 1, "y", "Y"], [1, 2, "z", "Z"]], - columns=list("ABCD")) - - df = df.set_index(['A', 'B']) - ix = MultiIndex.from_tuples([(1, 1)]) - - df.loc[ix, "C"] = '_' - - assert (df.xs((1, 1))['C'] == '_').all() - - def test_indexing_over_hashtable_size_cutoff(self): - n = 10000 - - old_cutoff = _index._SIZE_CUTOFF - _index._SIZE_CUTOFF = 20000 - - s = Series(np.arange(n), - MultiIndex.from_arrays((["a"] * n, np.arange(n)))) - - # hai it works! - assert s[("a", 5)] == 5 - assert s[("a", 6)] == 6 - assert s[("a", 7)] == 7 - - _index._SIZE_CUTOFF = old_cutoff - def test_multiindex_na_repr(self): # only an issue with long columns @@ -2424,24 +1715,6 @@ def test_repeat(self): m_df = Series(data, index=m_idx) assert m_df.repeat(3).shape == (3 * len(data), ) - def test_iloc_mi(self): - # GH 13797 - # Test if iloc can handle integer locations in MultiIndexed DataFrame - - data = [['str00', 'str01'], ['str10', 'str11'], ['str20', 'srt21'], - ['str30', 'str31'], ['str40', 'str41']] - - mi = MultiIndex.from_tuples( - [('CC', 'A'), ('CC', 'B'), ('CC', 'B'), ('BB', 'a'), ('BB', 'b')]) - - expected = DataFrame(data) - df_mi = DataFrame(data, index=mi) - - result = DataFrame([[df_mi.iloc[r, c] for c in range(2)] - for r in range(5)]) - - tm.assert_frame_equal(result, expected) - class TestSorted(Base): """ everything you wanted to test about sorting """ @@ -2566,87 +1839,6 @@ def test_is_lexsorted(self): assert not index.is_lexsorted() assert index.lexsort_depth == 0 - def test_getitem_multilevel_index_tuple_not_sorted(self): - index_columns = list("abc") - df = DataFrame([[0, 1, 0, "x"], [0, 0, 1, "y"]], - columns=index_columns + ["data"]) - df = df.set_index(index_columns) - query_index = df.index[:1] - rs = df.loc[query_index, "data"] - - xp_idx = MultiIndex.from_tuples([(0, 1, 0)], names=['a', 'b', 'c']) - xp = Series(['x'], index=xp_idx, name='data') - tm.assert_series_equal(rs, xp) - - def test_getitem_slice_not_sorted(self): - df = self.frame.sort_index(level=1).T - - # buglet with int typechecking - result = df.iloc[:, :np.int32(3)] - expected = df.reindex(columns=df.columns[:3]) - tm.assert_frame_equal(result, expected) - - def test_frame_getitem_not_sorted2(self): - # 13431 - df = DataFrame({'col1': ['b', 'd', 'b', 'a'], - 'col2': [3, 1, 1, 2], - 'data': ['one', 'two', 'three', 'four']}) - - df2 = df.set_index(['col1', 'col2']) - df2_original = df2.copy() - - df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True) - df2.index.set_labels([0, 1, 0, 2], level='col1', inplace=True) - assert not df2.index.is_lexsorted() - assert not df2.index.is_monotonic - - assert df2_original.index.equals(df2.index) - expected = df2.sort_index() - assert expected.index.is_lexsorted() - assert expected.index.is_monotonic - - result = df2.sort_index(level=0) - assert result.index.is_lexsorted() - assert result.index.is_monotonic - tm.assert_frame_equal(result, expected) - - def test_frame_getitem_not_sorted(self): - df = self.frame.T - df['foo', 'four'] = 'foo' - - arrays = [np.array(x) for x in zip(*df.columns.values)] - - result = df['foo'] - result2 = df.loc[:, 'foo'] - 
expected = df.reindex(columns=df.columns[arrays[0] == 'foo'])
-        expected.columns = expected.columns.droplevel(0)
-        tm.assert_frame_equal(result, expected)
-        tm.assert_frame_equal(result2, expected)
-
-        df = df.T
-        result = df.xs('foo')
-        result2 = df.loc['foo']
-        expected = df.reindex(df.index[arrays[0] == 'foo'])
-        expected.index = expected.index.droplevel(0)
-        tm.assert_frame_equal(result, expected)
-        tm.assert_frame_equal(result2, expected)
-
-    def test_series_getitem_not_sorted(self):
-        arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'],
-                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
-        tuples = lzip(*arrays)
-        index = MultiIndex.from_tuples(tuples)
-        s = Series(randn(8), index=index)
-
-        arrays = [np.array(x) for x in zip(*index.values)]
-
-        result = s['qux']
-        result2 = s.loc['qux']
-        expected = s[arrays[0] == 'qux']
-        expected.index = expected.index.droplevel(0)
-        tm.assert_series_equal(result, expected)
-        tm.assert_series_equal(result2, expected)
-
     def test_sort_index_and_reconstruction(self):

         # 15622

From 6efdedd8ed4de2a0ea66ea8b0944b61f6f96b53b Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Fri, 23 Nov 2018 08:43:33 +0100
Subject: [PATCH 13/21] add pandas-dev back at two lines

---
 ci/incremental/setup_conda_environment.cmd | 2 +-
 ci/incremental/setup_conda_environment.sh  | 6 +-----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/ci/incremental/setup_conda_environment.cmd b/ci/incremental/setup_conda_environment.cmd
index 6c1a484b2d2ef..e44e62cf42442 100644
--- a/ci/incremental/setup_conda_environment.cmd
+++ b/ci/incremental/setup_conda_environment.cmd
@@ -11,7 +11,7 @@ call deactivate
 @rem Display root environment (for debugging)
 conda list
 @rem Clean up any left-over from a previous build
-
+conda remove --all -q -y -n pandas-dev
 @rem Scipy, CFFI, jinja2 and IPython are optional dependencies, but exercised in the test suite
 conda env create --file=ci\deps\azure-windows-%CONDA_PY%.yaml

diff --git a/ci/incremental/setup_conda_environment.sh b/ci/incremental/setup_conda_environment.sh
index 6875db39125df..e2562626aea50 100755
--- a/ci/incremental/setup_conda_environment.sh
+++ b/ci/incremental/setup_conda_environment.sh
@@ -12,16 +12,12 @@ conda list
 # Clean up any left-over from a previous build
 # (note workaround for https://github.com/conda/conda/issues/2679:
 #  `conda env remove` issue)
-conda remove --all -q -y
+conda remove --all -q -y -n pandas-dev

 echo
 echo "[create env]"
 time conda env create -q --file="${ENV_FILE}" || exit 1

-# Activate first
-set +v
-source activate pandas-dev
-set -v

 # remove any installed pandas package
 # w/o removing anything else

From 24bc4f64a4b57c6279d64b0a6d5dab0f895fbe96 Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Thu, 22 Nov 2018 16:40:32 +0100
Subject: [PATCH 14/21] change pandas to pandas-dev

---
 ci/deps/azure-27-compat.yaml      | 2 +-
 ci/deps/azure-36-locale_slow.yaml | 2 +-
 ci/deps/azure-37-locale.yaml      | 2 +-
 ci/deps/azure-macos-35.yaml       | 2 +-
 ci/deps/azure-windows-27.yaml     | 2 +-
 ci/deps/azure-windows-36.yaml     | 2 +-
 ci/deps/circle-36-locale.yaml     | 2 +-
 ci/deps/travis-27-locale.yaml     | 2 +-
 ci/deps/travis-27.yaml            | 2 +-
 ci/deps/travis-36-doc.yaml        | 2 +-
 ci/deps/travis-36-slow.yaml       | 2 +-
 ci/deps/travis-36.yaml            | 2 +-
 ci/deps/travis-37-numpydev.yaml   | 2 +-
 ci/deps/travis-37.yaml            | 2 +-
 14 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/ci/deps/azure-27-compat.yaml b/ci/deps/azure-27-compat.yaml
index 44c561e9c8911..f3cc615c35243 100644
--- a/ci/deps/azure-27-compat.yaml
+++
b/ci/deps/azure-27-compat.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 7e40bd1a9979e..4bbc6a2c11f1e 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 59c8818eaef1e..2b38465c04512 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 6ccdc79d11b27..7a0c3b81ac8f9 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults dependencies: diff --git a/ci/deps/azure-windows-27.yaml b/ci/deps/azure-windows-27.yaml index dc68129a5e6d3..b1533b071fa74 100644 --- a/ci/deps/azure-windows-27.yaml +++ b/ci/deps/azure-windows-27.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index af42545af7971..817aab66c65aa 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/circle-36-locale.yaml b/ci/deps/circle-36-locale.yaml index 59c8818eaef1e..2b38465c04512 100644 --- a/ci/deps/circle-36-locale.yaml +++ b/ci/deps/circle-36-locale.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-27-locale.yaml b/ci/deps/travis-27-locale.yaml index c8d17cf190e35..0846ef5e8264e 100644 --- a/ci/deps/travis-27-locale.yaml +++ b/ci/deps/travis-27-locale.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-27.yaml b/ci/deps/travis-27.yaml index 5a9e206ec2c69..8d14673ebde6d 100644 --- a/ci/deps/travis-27.yaml +++ b/ci/deps/travis-27.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-36-doc.yaml b/ci/deps/travis-36-doc.yaml index fb54c784d6fac..ed0764fab414a 100644 --- a/ci/deps/travis-36-doc.yaml +++ b/ci/deps/travis-36-doc.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index 3157ecac3a902..a6ffdb95e5e7c 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-36.yaml b/ci/deps/travis-36.yaml index 1880fa2501581..1781f67041f44 100644 --- a/ci/deps/travis-36.yaml +++ b/ci/deps/travis-36.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - conda-forge diff --git a/ci/deps/travis-37-numpydev.yaml b/ci/deps/travis-37-numpydev.yaml index 82c75b7c91b1f..99ae228f25de3 100644 --- a/ci/deps/travis-37-numpydev.yaml +++ b/ci/deps/travis-37-numpydev.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults dependencies: diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 7dbd85ac27df6..a297786f6b14d 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -1,4 +1,4 @@ -name: pandas +name: pandas-dev channels: - defaults - 
conda-forge From 28a3ebf6e21163301d98d506047ca2bfab88b851 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Thu, 22 Nov 2018 22:48:15 +0100 Subject: [PATCH 15/21] add source activate --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 6d31adcbf8a43..9c98286f4c20f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -104,6 +104,7 @@ before_script: - ci/before_script_travis.sh script: + - source activate $CONDA_ENV - echo "script start" - ci/run_build_docs.sh - ci/script_single.sh From 9f562585f16200f0b43753a4650624b3016efd80 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Thu, 22 Nov 2018 22:49:17 +0100 Subject: [PATCH 16/21] remove source activate --- ci/build_docs.sh | 2 -- ci/incremental/build.sh | 1 - ci/incremental/setup_conda_environment.cmd | 4 ++-- ci/incremental/setup_conda_environment.sh | 5 ++--- ci/script_single.sh | 1 - ci/upload_coverage.sh | 1 - 6 files changed, 4 insertions(+), 10 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 33340a1c038dc..f89c4369dff4a 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -12,8 +12,6 @@ if [ "$DOC" ]; then echo "Will build docs" - source activate pandas - echo ############################### echo # Log file for the doc build # echo ############################### diff --git a/ci/incremental/build.sh b/ci/incremental/build.sh index 8f2301a3b7ef5..40f78e7d95d52 100755 --- a/ci/incremental/build.sh +++ b/ci/incremental/build.sh @@ -1,6 +1,5 @@ #!/bin/bash -source activate $CONDA_ENV # Make sure any error below is reported as such set -v -e diff --git a/ci/incremental/setup_conda_environment.cmd b/ci/incremental/setup_conda_environment.cmd index 35595ffb03695..bd628f3147b79 100644 --- a/ci/incremental/setup_conda_environment.cmd +++ b/ci/incremental/setup_conda_environment.cmd @@ -11,9 +11,9 @@ call deactivate @rem Display root environment (for debugging) conda list @rem Clean up any left-over from a previous build -conda remove --all -q -y -n %CONDA_ENV% +conda remove --all -q -y @rem Scipy, CFFI, jinja2 and IPython are optional dependencies, but exercised in the test suite -conda env create -n %CONDA_ENV% --file=ci\deps\azure-windows-%CONDA_PY%.yaml +conda env create --file=ci\deps\azure-windows-%CONDA_PY%.yaml call activate %CONDA_ENV% conda list diff --git a/ci/incremental/setup_conda_environment.sh b/ci/incremental/setup_conda_environment.sh index f3ac99d5e7c5a..559a9e8b802b3 100755 --- a/ci/incremental/setup_conda_environment.sh +++ b/ci/incremental/setup_conda_environment.sh @@ -12,15 +12,14 @@ conda list # Clean up any left-over from a previous build # (note workaround for https://github.com/conda/conda/issues/2679: # `conda env remove` issue) -conda remove --all -q -y -n $CONDA_ENV +conda remove --all -q -y echo echo "[create env]" -time conda env create -q -n "${CONDA_ENV}" --file="${ENV_FILE}" || exit 1 +time conda env create -q --file="${ENV_FILE}" || exit 1 # Activate first set +v -source activate $CONDA_ENV set -v # remove any installed pandas package diff --git a/ci/script_single.sh b/ci/script_single.sh index ea0d48bc2da8a..b57b643290c73 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -2,7 +2,6 @@ echo "[script_single]" -source activate pandas if [ -n "$LOCALE_OVERRIDE" ]; then echo "Setting LC_ALL and LANG to $LOCALE_OVERRIDE" diff --git a/ci/upload_coverage.sh b/ci/upload_coverage.sh index a7ef2fa908079..88aca20590505 100755 --- a/ci/upload_coverage.sh +++ b/ci/upload_coverage.sh @@ -5,7 +5,6 @@ if [ -z "$COVERAGE" ]; then exit 0 fi -source 
From bf6bca2bcd248e6f387e9d3001d8e6cfcb9238ae Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Thu, 22 Nov 2018 22:53:52 +0100
Subject: [PATCH 17/21] clean the script

---
 .travis.yml                                | 2 +-
 ci/incremental/build.cmd                   | 2 +-
 ci/incremental/setup_conda_environment.cmd | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 9c98286f4c20f..bec92204e393d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -104,7 +104,7 @@ before_script:
   - ci/before_script_travis.sh

 script:
-  - source activate $CONDA_ENV
+  - source activate pandas-dev
   - echo "script start"
   - ci/run_build_docs.sh
   - ci/script_single.sh
diff --git a/ci/incremental/build.cmd b/ci/incremental/build.cmd
index d2fd06d7d9e50..7115dea8c7eac 100644
--- a/ci/incremental/build.cmd
+++ b/ci/incremental/build.cmd
@@ -1,5 +1,5 @@
 @rem https://github.com/numba/numba/blob/master/buildscripts/incremental/build.cmd
-call activate %CONDA_ENV%
+
 @rem Build numba extensions without silencing compile errors
 python setup.py build_ext -q --inplace
diff --git a/ci/incremental/setup_conda_environment.cmd b/ci/incremental/setup_conda_environment.cmd
index bd628f3147b79..b084e45fa3db7 100644
--- a/ci/incremental/setup_conda_environment.cmd
+++ b/ci/incremental/setup_conda_environment.cmd
@@ -15,7 +15,6 @@ conda remove --all -q -y
 @rem Scipy, CFFI, jinja2 and IPython are optional dependencies, but exercised in the test suite
 conda env create --file=ci\deps\azure-windows-%CONDA_PY%.yaml

-call activate %CONDA_ENV%
 conda list

 if %errorlevel% neq 0 exit /b %errorlevel%

From 2c43b2fc42b3319d2875fca79b3603ed5c1e4bb1 Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Thu, 22 Nov 2018 23:12:42 +0100
Subject: [PATCH 18/21] debug

---
 ci/incremental/setup_conda_environment.cmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/incremental/setup_conda_environment.cmd b/ci/incremental/setup_conda_environment.cmd
index b084e45fa3db7..6c1a484b2d2ef 100644
--- a/ci/incremental/setup_conda_environment.cmd
+++ b/ci/incremental/setup_conda_environment.cmd
@@ -11,7 +11,7 @@ call deactivate
 @rem Display root environment (for debugging)
 conda list
 @rem Clean up any left-over from a previous build
-conda remove --all -q -y
+
 @rem Scipy, CFFI, jinja2 and IPython are optional dependencies, but exercised in the test suite
 conda env create --file=ci\deps\azure-windows-%CONDA_PY%.yaml
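With the explicit name gone, the debugging patch above drops the Windows conda remove call entirely; the remaining patches put the clean-slate logic back with the environment named outright. The pattern the series converges on, sketched under the assumption that the environment may or may not exist yet (${ENV_FILE} as in the scripts above):

    # Drop any environment left over from a previous build, then
    # re-create it from the pinned dependency file.
    conda remove --all -q -y -n pandas-dev || true  # ignore "no such env"
    time conda env create -q --file="${ENV_FILE}" || exit 1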
From 3a2a193a21a0c53b938eef0ec5d85b23bc202e09 Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Thu, 22 Nov 2018 23:35:49 +0100
Subject: [PATCH 19/21] debug

---
 .travis.yml                               | 2 +-
 ci/incremental/setup_conda_environment.sh | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index bec92204e393d..cd341a0af0c67 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -116,7 +116,7 @@ after_success:

 after_script:
   - echo "after_script start"
-  - source activate pandas && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
+  - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
   - if [ -e test-data-single.xml ]; then
       ci/print_skipped.py test-data-single.xml;
     fi
diff --git a/ci/incremental/setup_conda_environment.sh b/ci/incremental/setup_conda_environment.sh
index 559a9e8b802b3..6875db39125df 100755
--- a/ci/incremental/setup_conda_environment.sh
+++ b/ci/incremental/setup_conda_environment.sh
@@ -20,6 +20,7 @@ time conda env create -q --file="${ENV_FILE}" || exit 1

 # Activate first
 set +v
+source activate pandas-dev
 set -v

 # remove any installed pandas package

From bacb2303d205d13de7a1706948cd458ea5e0c250 Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Fri, 23 Nov 2018 08:43:33 +0100
Subject: [PATCH 20/21] add pandas-dev back in two places

---
 ci/incremental/setup_conda_environment.cmd | 2 +-
 ci/incremental/setup_conda_environment.sh  | 6 +-----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/ci/incremental/setup_conda_environment.cmd b/ci/incremental/setup_conda_environment.cmd
index 6c1a484b2d2ef..e44e62cf42442 100644
--- a/ci/incremental/setup_conda_environment.cmd
+++ b/ci/incremental/setup_conda_environment.cmd
@@ -11,7 +11,7 @@ call deactivate
 @rem Display root environment (for debugging)
 conda list
 @rem Clean up any left-over from a previous build
-
+conda remove --all -q -y -n pandas-dev
 @rem Scipy, CFFI, jinja2 and IPython are optional dependencies, but exercised in the test suite
 conda env create --file=ci\deps\azure-windows-%CONDA_PY%.yaml
diff --git a/ci/incremental/setup_conda_environment.sh b/ci/incremental/setup_conda_environment.sh
index 6875db39125df..e2562626aea50 100755
--- a/ci/incremental/setup_conda_environment.sh
+++ b/ci/incremental/setup_conda_environment.sh
@@ -12,16 +12,12 @@ conda list
 # Clean up any left-over from a previous build
 # (note workaround for https://github.com/conda/conda/issues/2679:
 # `conda env remove` issue)
-conda remove --all -q -y
+conda remove --all -q -y -n pandas-dev

 echo
 echo "[create env]"
 time conda env create -q --file="${ENV_FILE}" || exit 1

-# Activate first
-set +v
-source activate pandas-dev
-set -v

 # remove any installed pandas package
 # w/o removing anything else

From 393f75453dd58d9b895d93c26341ee7c7342ff8a Mon Sep 17 00:00:00 2001
From: Kaiqi Dong
Date: Fri, 23 Nov 2018 09:09:51 +0100
Subject: [PATCH 21/21] test again after catching up with the latest master

---
 ci/incremental/setup_conda_environment.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/incremental/setup_conda_environment.sh b/ci/incremental/setup_conda_environment.sh
index e2562626aea50..5e13b6a96ed07 100755
--- a/ci/incremental/setup_conda_environment.sh
+++ b/ci/incremental/setup_conda_environment.sh
@@ -5,6 +5,7 @@ set -v -e
 CONDA_INSTALL="conda install -q -y"
 PIP_INSTALL="pip install -q"

+
 # Deactivate any environment
 source deactivate
 # Display root environment (for debugging)
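After the final patch, every job creates and activates a pandas-dev environment whose name comes solely from the yaml files. A quick way to confirm the right interpreter is picked up — mirroring the after_script check in patch 19 above — would be:

    # Smoke-test the activated environment from a neutral directory,
    # so the source checkout doesn't shadow the installed package.
    source activate pandas-dev
    pushd /tmp
    python -c "import pandas; pandas.show_versions()"
    popd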